diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst
index 25f4e3b3fbd26..b503283559db4 100644
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@@ -950,7 +950,8 @@ Each builtin accesses memory according to a provided boolean mask. These are
 provided as ``__builtin_masked_load`` and ``__builtin_masked_store``. The
 first argument is always boolean mask vector. The ``__builtin_masked_load``
 builtin takes an optional third vector argument that will be used for the result of the
-masked-off lanes. These builtins assume the memory is always aligned.
+masked-off lanes. These builtins assume the memory is unaligned; use
+``__builtin_assume_aligned`` if alignment is desired.
 
 The ``__builtin_masked_expand_load`` and ``__builtin_masked_compress_store``
 builtins have the same interface but store the result in consecutive indices.
@@ -969,17 +970,17 @@ Example:
     using v8b = bool [[clang::ext_vector_type(8)]];
     using v8i = int [[clang::ext_vector_type(8)]];
 
-    v8i load(v8b mask, v8i *ptr) { return __builtin_masked_load(mask, ptr); }
+    v8i load(v8b mask, int *ptr) { return __builtin_masked_load(mask, ptr); }
 
-    v8i load_expand(v8b mask, v8i *ptr) {
+    v8i load_expand(v8b mask, int *ptr) {
       return __builtin_masked_expand_load(mask, ptr);
     }
 
-    void store(v8b mask, v8i val, v8i *ptr) {
+    void store(v8b mask, v8i val, int *ptr) {
      __builtin_masked_store(mask, val, ptr);
     }
 
-    void store_compress(v8b mask, v8i val, v8i *ptr) {
+    void store_compress(v8b mask, v8i val, int *ptr) {
      __builtin_masked_compress_store(mask, val, ptr);
     }
 
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index f7c3dea257d50..1cff1e0de71b7 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -4277,14 +4277,15 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
     llvm::Value *Ptr = EmitScalarExpr(E->getArg(1));
 
     llvm::Type *RetTy = CGM.getTypes().ConvertType(E->getType());
-    CharUnits Align = CGM.getNaturalTypeAlignment(E->getType(), nullptr);
-    llvm::Value *AlignVal =
-        llvm::ConstantInt::get(Int32Ty, Align.getQuantity());
-
     llvm::Value *PassThru = llvm::PoisonValue::get(RetTy);
     if (E->getNumArgs() > 2)
       PassThru = EmitScalarExpr(E->getArg(2));
 
+    CharUnits Align = CGM.getNaturalTypeAlignment(
+        E->getType()->getAs<VectorType>()->getElementType(), nullptr);
+    llvm::Value *AlignVal =
+        llvm::ConstantInt::get(Int32Ty, Align.getQuantity());
+
     llvm::Value *Result;
     if (BuiltinID == Builtin::BI__builtin_masked_load) {
       Function *F =
@@ -4335,7 +4336,9 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
     llvm::Type *ValLLTy = CGM.getTypes().ConvertType(ValTy);
     llvm::Type *PtrTy = Ptr->getType();
 
-    CharUnits Align = CGM.getNaturalTypeAlignment(ValTy, nullptr);
+    CharUnits Align = CGM.getNaturalTypeAlignment(
+        E->getArg(1)->getType()->getAs<VectorType>()->getElementType(),
+        nullptr);
     llvm::Value *AlignVal =
         llvm::ConstantInt::get(Int32Ty, Align.getQuantity());
 
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index b3b67230f7687..2520f06ef550e 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -2268,7 +2268,7 @@ static bool BuiltinCountZeroBitsGeneric(Sema &S, CallExpr *TheCall) {
 }
 
 static bool CheckMaskedBuiltinArgs(Sema &S, Expr *MaskArg, Expr *PtrArg,
-                                   unsigned Pos, bool Vector = true) {
+                                   unsigned Pos) {
   QualType MaskTy = MaskArg->getType();
   if (!MaskTy->isExtVectorBoolType())
     return S.Diag(MaskArg->getBeginLoc(), diag::err_builtin_invalid_arg_type)
@@ -2276,11 +2276,9 @@ static bool CheckMaskedBuiltinArgs(Sema &S, Expr *MaskArg, Expr *PtrArg,
            << MaskTy;
 
   QualType PtrTy = PtrArg->getType();
-  if (!PtrTy->isPointerType() ||
-      (Vector && !PtrTy->getPointeeType()->isVectorType()) ||
-      (!Vector && PtrTy->getPointeeType()->isVectorType()))
+  if (!PtrTy->isPointerType() || PtrTy->getPointeeType()->isVectorType())
     return S.Diag(PtrArg->getExprLoc(), diag::err_vec_masked_load_store_ptr)
-           << Pos << (Vector ? "pointer to vector" : "scalar pointer");
+           << Pos << "scalar pointer";
 
   return false;
 }
@@ -2297,24 +2295,18 @@ static ExprResult BuiltinMaskedLoad(Sema &S, CallExpr *TheCall) {
   QualType PtrTy = PtrArg->getType();
   QualType PointeeTy = PtrTy->getPointeeType();
   const VectorType *MaskVecTy = MaskTy->getAs<VectorType>();
-  const VectorType *DataVecTy = PointeeTy->getAs<VectorType>();
+  QualType RetTy =
+      S.Context.getExtVectorType(PointeeTy, MaskVecTy->getNumElements());
 
   if (TheCall->getNumArgs() == 3) {
     Expr *PassThruArg = TheCall->getArg(2);
     QualType PassThruTy = PassThruArg->getType();
-    if (!S.Context.hasSameType(PassThruTy, PointeeTy))
+    if (!S.Context.hasSameType(PassThruTy, RetTy))
       return S.Diag(PtrArg->getExprLoc(), diag::err_vec_masked_load_store_ptr)
-             << /* third argument */ 3 << PointeeTy;
+             << /* third argument */ 3 << RetTy;
   }
 
-  if (MaskVecTy->getNumElements() != DataVecTy->getNumElements())
-    return ExprError(
-        S.Diag(TheCall->getBeginLoc(), diag::err_vec_masked_load_store_size)
-        << S.getASTContext().BuiltinInfo.getQuotedName(
-               TheCall->getBuiltinCallee())
-        << MaskTy << PointeeTy);
-
-  TheCall->setType(PointeeTy);
+  TheCall->setType(RetTy);
 
   return TheCall;
 }
@@ -2339,18 +2331,10 @@ static ExprResult BuiltinMaskedStore(Sema &S, CallExpr *TheCall) {
   QualType PointeeTy = PtrTy->getPointeeType();
 
   const VectorType *MaskVecTy = MaskTy->getAs<VectorType>();
-  const VectorType *ValVecTy = ValTy->getAs<VectorType>();
-  const VectorType *PtrVecTy = PointeeTy->getAs<VectorType>();
-
-  if (MaskVecTy->getNumElements() != ValVecTy->getNumElements() ||
-      MaskVecTy->getNumElements() != PtrVecTy->getNumElements())
-    return ExprError(
-        S.Diag(TheCall->getBeginLoc(), diag::err_vec_masked_load_store_size)
-        << S.getASTContext().BuiltinInfo.getQuotedName(
-               TheCall->getBuiltinCallee())
-        << MaskTy << PointeeTy);
+  QualType RetTy =
+      S.Context.getExtVectorType(PointeeTy, MaskVecTy->getNumElements());
 
-  if (!S.Context.hasSameType(ValTy, PointeeTy))
+  if (!S.Context.hasSameType(ValTy, RetTy))
     return ExprError(S.Diag(TheCall->getBeginLoc(),
                             diag::err_vec_builtin_incompatible_vector)
                      << TheCall->getDirectCallee() << /*isMorethantwoArgs*/ 2
@@ -2368,7 +2352,7 @@ static ExprResult BuiltinMaskedGather(Sema &S, CallExpr *TheCall) {
   Expr *MaskArg = TheCall->getArg(0);
   Expr *IdxArg = TheCall->getArg(1);
   Expr *PtrArg = TheCall->getArg(2);
-  if (CheckMaskedBuiltinArgs(S, MaskArg, PtrArg, 3, /*Vector=*/false))
+  if (CheckMaskedBuiltinArgs(S, MaskArg, PtrArg, 3))
     return ExprError();
 
   QualType IdxTy = IdxArg->getType();
@@ -2413,7 +2397,7 @@ static ExprResult BuiltinMaskedScatter(Sema &S, CallExpr *TheCall) {
   Expr *ValArg = TheCall->getArg(2);
   Expr *PtrArg = TheCall->getArg(3);
 
-  if (CheckMaskedBuiltinArgs(S, MaskArg, PtrArg, 3, /*Vector=*/false))
+  if (CheckMaskedBuiltinArgs(S, MaskArg, PtrArg, 3))
     return ExprError();
 
   QualType IdxTy = IdxArg->getType();
diff --git a/clang/test/CodeGen/builtin-masked.c b/clang/test/CodeGen/builtin-masked.c
index adb1ad4b698ac..2ac05fac219fe 100644
--- a/clang/test/CodeGen/builtin-masked.c
+++ b/clang/test/CodeGen/builtin-masked.c
@@ -19,10 +19,10 @@ typedef _Bool v8b __attribute__((ext_vector_type(8)));
 // CHECK-NEXT:    [[LOAD_BITS2:%.*]] = load i8, ptr [[M_ADDR]], align 1
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[LOAD_BITS2]] to <8 x i1>
 // CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[P_ADDR]], align 8
-// CHECK-NEXT:    [[MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP2]], i32 32, <8 x i1> [[TMP1]], <8 x i32> poison)
+// CHECK-NEXT:    [[MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP2]], i32 4, <8 x i1> [[TMP1]], <8 x i32> poison)
 // CHECK-NEXT:    ret <8 x i32> [[MASKED_LOAD]]
 //
-v8i test_load(v8b m, v8i *p) {
+v8i test_load(v8b m, int *p) {
   return __builtin_masked_load(m, p);
 }
 
@@ -45,10 +45,10 @@ v8i test_load(v8b m, v8i *p) {
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[LOAD_BITS2]] to <8 x i1>
 // CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[P_ADDR]], align 8
 // CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i32>, ptr [[T_ADDR]], align 32
-// CHECK-NEXT:    [[MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP3]], i32 32, <8 x i1> [[TMP2]], <8 x i32> [[TMP4]])
+// CHECK-NEXT:    [[MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP3]], i32 4, <8 x i1> [[TMP2]], <8 x i32> [[TMP4]])
 // CHECK-NEXT:    ret <8 x i32> [[MASKED_LOAD]]
 //
-v8i test_load_passthru(v8b m, v8i *p, v8i t) {
+v8i test_load_passthru(v8b m, int *p, v8i t) {
   return __builtin_masked_load(m, p, t);
 }
 
@@ -74,7 +74,7 @@ v8i test_load_passthru(v8b m, v8i *p, v8i t) {
 // CHECK-NEXT:    [[MASKED_EXPAND_LOAD:%.*]] = call <8 x i32> @llvm.masked.expandload.v8i32(ptr [[TMP3]], <8 x i1> [[TMP2]], <8 x i32> [[TMP4]])
 // CHECK-NEXT:    ret <8 x i32> [[MASKED_EXPAND_LOAD]]
 //
-v8i test_load_expand(v8b m, v8i *p, v8i t) {
+v8i test_load_expand(v8b m, int *p, v8i t) {
   return __builtin_masked_expand_load(m, p, t);
 }
 
@@ -97,10 +97,10 @@ v8i test_load_expand(v8b m, v8i *p, v8i t) {
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[LOAD_BITS2]] to <8 x i1>
 // CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr [[V_ADDR]], align 32
 // CHECK-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[P_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP3]], ptr [[TMP4]], i32 32, <8 x i1> [[TMP2]])
+// CHECK-NEXT:    call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP3]], ptr [[TMP4]], i32 4, <8 x i1> [[TMP2]])
 // CHECK-NEXT:    ret void
 //
-void test_store(v8b m, v8i v, v8i *p) {
+void test_store(v8b m, v8i v, int *p) {
   __builtin_masked_store(m, v, p);
 }
 
@@ -126,7 +126,7 @@ void test_store(v8b m, v8i v, v8i *p) {
 // CHECK-NEXT:    call void @llvm.masked.compressstore.v8i32(<8 x i32> [[TMP3]], ptr [[TMP4]], <8 x i1> [[TMP2]])
 // CHECK-NEXT:    ret void
 //
-void test_compress_store(v8b m, v8i v, v8i *p) {
+void test_compress_store(v8b m, v8i v, int *p) {
   __builtin_masked_compress_store(m, v, p);
 }
diff --git a/clang/test/Sema/builtin-masked.c b/clang/test/Sema/builtin-masked.c
index eb0070b0276af..e24648da57fb1 100644
--- a/clang/test/Sema/builtin-masked.c
+++ b/clang/test/Sema/builtin-masked.c
@@ -5,44 +5,34 @@ typedef _Bool v8b __attribute__((ext_vector_type(8)));
 typedef _Bool v2b __attribute__((ext_vector_type(2)));
 typedef float v8f __attribute__((ext_vector_type(8)));
 
-void test_masked_load(v8i *pf, v8b mask, v2b mask2, v2b thru) {
+void test_masked_load(int *pf, v8b mask, v2b mask2, v2b thru) {
   (void)__builtin_masked_load(mask); // expected-error {{too few arguments to function call, expected 2, have 1}}
   (void)__builtin_masked_load(mask, pf, pf, pf); // expected-error {{too many arguments to function call, expected at most 3, have 4}}
-  (void)__builtin_masked_load(mask2, pf); // expected-error {{all arguments to '__builtin_masked_load' must have the same number of elements}}
-  (void)__builtin_masked_load(mask, mask); // expected-error {{2nd argument must be a pointer to vector}}
-  (void)__builtin_masked_load(mask, (void *)0); // expected-error {{2nd argument must be a pointer to vector}}
-  (void)__builtin_masked_load(mask2, pf, thru); // expected-error {{3rd argument must be a 'v8i' (vector of 8 'int' values)}}
-  (void)__builtin_masked_load(mask2, pf); // expected-error {{all arguments to '__builtin_masked_load' must have the same number of elements}}
+  (void)__builtin_masked_load(mask, mask); // expected-error {{2nd argument must be a scalar pointer}}
+  (void)__builtin_masked_load(mask2, pf, thru); // expected-error {{3rd argument must be a 'int __attribute__((ext_vector_type(2)))' (vector of 2 'int' values)}}
 }
 
-void test_masked_store(v8i *pf, v8f *pf2, v8b mask, v2b mask2) {
+void test_masked_store(int *pf, v8f *pf2, v8b mask, v2b mask2) {
   __builtin_masked_store(mask); // expected-error {{too few arguments to function call, expected 3, have 1}}
   __builtin_masked_store(mask, 0, 0, 0); // expected-error {{too many arguments to function call, expected 3, have 4}}
   __builtin_masked_store(0, 0, pf); // expected-error {{1st argument must be a vector of boolean types (was 'int')}}
   __builtin_masked_store(mask, 0, pf); // expected-error {{2nd argument must be a vector}}
-  __builtin_masked_store(mask, *pf, 0); // expected-error {{3rd argument must be a pointer to vector}}
-  __builtin_masked_store(mask2, *pf, pf); // expected-error {{all arguments to '__builtin_masked_store' must have the same number of elements}}
-  __builtin_masked_store(mask, *pf, pf2); // expected-error {{last two arguments to '__builtin_masked_store' must have the same type}}
+  __builtin_masked_store(mask, *pf, 0); // expected-error {{3rd argument must be a scalar pointer}}
 }
 
-void test_masked_expand_load(v8i *pf, v8b mask, v2b mask2, v2b thru) {
+void test_masked_expand_load(int *pf, v8b mask, v2b mask2, v2b thru) {
   (void)__builtin_masked_expand_load(mask); // expected-error {{too few arguments to function call, expected 2, have 1}}
   (void)__builtin_masked_expand_load(mask, pf, pf, pf); // expected-error {{too many arguments to function call, expected at most 3, have 4}}
-  (void)__builtin_masked_expand_load(mask2, pf); // expected-error {{all arguments to '__builtin_masked_expand_load' must have the same number of elements}}
-  (void)__builtin_masked_expand_load(mask, mask); // expected-error {{2nd argument must be a pointer to vector}}
-  (void)__builtin_masked_expand_load(mask, (void *)0); // expected-error {{2nd argument must be a pointer to vector}}
-  (void)__builtin_masked_expand_load(mask2, pf, thru); // expected-error {{3rd argument must be a 'v8i' (vector of 8 'int' values)}}
-  (void)__builtin_masked_expand_load(mask2, pf); // expected-error {{all arguments to '__builtin_masked_expand_load' must have the same number of elements}}
+  (void)__builtin_masked_expand_load(mask, mask); // expected-error {{2nd argument must be a scalar pointer}}
+  (void)__builtin_masked_expand_load(mask2, pf, thru); // expected-error {{3rd argument must be a 'int __attribute__((ext_vector_type(2)))' (vector of 2 'int' values)}}
 }
 
-void test_masked_compress_store(v8i *pf, v8f *pf2, v8b mask, v2b mask2) {
+void test_masked_compress_store(int *pf, v8f *pf2, v8b mask, v2b mask2) {
   __builtin_masked_compress_store(mask); // expected-error {{too few arguments to function call, expected 3, have 1}}
   __builtin_masked_compress_store(mask, 0, 0, 0); // expected-error {{too many arguments to function call, expected 3, have 4}}
   __builtin_masked_compress_store(0, 0, pf); // expected-error {{1st argument must be a vector of boolean types (was 'int')}}
   __builtin_masked_compress_store(mask, 0, pf); // expected-error {{2nd argument must be a vector}}
-  __builtin_masked_compress_store(mask, *pf, 0); // expected-error {{3rd argument must be a pointer to vector}}
-  __builtin_masked_compress_store(mask2, *pf, pf); // expected-error {{all arguments to '__builtin_masked_compress_store' must have the same number of elements}}
-  __builtin_masked_compress_store(mask, *pf, pf2); // expected-error {{last two arguments to '__builtin_masked_compress_store' must have the same type}}
+  __builtin_masked_compress_store(mask, *pf, 0); // expected-error {{3rd argument must be a scalar pointer}}
 }
 
 void test_masked_gather(int *p, v8i idx, v8b mask, v2b mask2, v2b thru) {
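
Usage sketch (not part of the patch above): under the new unaligned-by-default semantics, the builtins only assume the natural alignment of the element type (i32 4 in the CHECK lines above), so a caller that knows its buffer is vector-aligned can route the pointer through ``__builtin_assume_aligned`` to let later passes use the stronger alignment. The ``load_aligned`` helper below is hypothetical; the builtin signatures follow the documentation hunk.

  typedef _Bool v8b __attribute__((ext_vector_type(8)));
  typedef int v8i __attribute__((ext_vector_type(8)));

  v8i load_aligned(v8b mask, int *p) {
    /* Promise the optimizer 32-byte alignment; the masked load itself is
       emitted with only 4-byte (element) alignment, and the assumption
       allows optimizations to raise it. */
    int *ap = (int *)__builtin_assume_aligned(p, 32);
    return __builtin_masked_load(mask, ap);
  }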