From ef39145218f584bfda7fc44f9078be02272b9e89 Mon Sep 17 00:00:00 2001
From: Chuanqi Xu
Date: Fri, 8 Nov 2024 09:45:09 +0800
Subject: [PATCH] [CIR] [Lowering] [X86_64] Support VAArg in shape

---
 clang/include/clang/CIR/ABIArgInfo.h          |   2 +
 .../CIR/Dialect/Builder/CIRBaseBuilder.h      |  11 +
 clang/include/clang/CIR/Dialect/IR/CIROps.td  |  20 +
 .../Dialect/Transforms/LoweringPrepare.cpp    |   9 +-
 .../Transforms/LoweringPrepareCXXABI.h        |   1 +
 .../Transforms/LoweringPrepareX86ABI.h        |   0
 .../Transforms/TargetLowering/ABIInfoImpl.cpp |  12 +
 .../Transforms/TargetLowering/ABIInfoImpl.h   |   3 +
 .../Transforms/TargetLowering/CIRCXXABI.h     |  19 -
 .../Transforms/TargetLowering/CMakeLists.txt  |   1 +
 .../TargetLowering/ItaniumCXXABI.cpp          |   1 +
 .../Targets/LoweringPrepareX86CXXABI.cpp      | 357 ++++++++++++++++++
 .../Transforms/TargetLowering/Targets/X86.cpp |  92 +----
 .../TargetLowering/Targets/X86_64ABIInfo.h    |  96 +++++
 .../CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp |  34 +-
 clang/test/CIR/Lowering/var-arg-x86_64.c      |  78 ++++
 16 files changed, 624 insertions(+), 112 deletions(-)
 create mode 100644 clang/lib/CIR/Dialect/Transforms/LoweringPrepareX86ABI.h
 create mode 100644 clang/lib/CIR/Dialect/Transforms/TargetLowering/Targets/LoweringPrepareX86CXXABI.cpp
 create mode 100644 clang/lib/CIR/Dialect/Transforms/TargetLowering/Targets/X86_64ABIInfo.h
 create mode 100644 clang/test/CIR/Lowering/var-arg-x86_64.c

diff --git a/clang/include/clang/CIR/ABIArgInfo.h b/clang/include/clang/CIR/ABIArgInfo.h
index b3c3d68b9572..28215e7ba196 100644
--- a/clang/include/clang/CIR/ABIArgInfo.h
+++ b/clang/include/clang/CIR/ABIArgInfo.h
@@ -252,6 +252,8 @@ class ABIArgInfo {
   bool isExpand() const { return TheKind == Expand; }
   bool isCoerceAndExpand() const { return TheKind == CoerceAndExpand; }

+  bool isIgnore() const { return TheKind == Ignore; }
+
   bool isSignExt() const {
     assert(isExtend() && "Invalid kind!");
     return SignExt;
diff --git a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
index f1275a472f3c..225fa444e340 100644
--- a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
+++ b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
@@ -48,6 +48,17 @@ class CIRBaseBuilderTy : public mlir::OpBuilder {
     return create<cir::ConstantOp>(loc, ty, getAttr<cir::IntAttr>(ty, val));
   }

+  mlir::Value getSignedInt(mlir::Location loc, int64_t val, unsigned numBits) {
+    return getConstAPSInt(
+        loc, llvm::APSInt(llvm::APInt(numBits, val), /*isUnsigned=*/false));
+  }
+
+  mlir::Value getUnsignedInt(mlir::Location loc, uint64_t val,
+                             unsigned numBits) {
+    return getConstAPSInt(
+        loc, llvm::APSInt(llvm::APInt(numBits, val), /*isUnsigned=*/true));
+  }
+
   mlir::Value getConstAPInt(mlir::Location loc, mlir::Type typ,
                             const llvm::APInt &val) {
     return create<cir::ConstantOp>(loc, typ, getAttr<cir::IntAttr>(typ, val));
diff --git a/clang/include/clang/CIR/Dialect/IR/CIROps.td b/clang/include/clang/CIR/Dialect/IR/CIROps.td
index b01e9eb55517..8e43713b8fe4 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIROps.td
+++ b/clang/include/clang/CIR/Dialect/IR/CIROps.td
@@ -4485,6 +4485,26 @@ def AssumeSepStorageOp : CIR_Op<"assume.separate_storage", [SameTypeOperands]> {
   }];
 }

+//===----------------------------------------------------------------------===//
+// PtrMask Operations
+//===----------------------------------------------------------------------===//
+
+def PtrMaskOp : CIR_Op<"ptr_mask", [AllTypesMatch<["ptr", "result"]>]> {
+  let summary = "Masks out bits of the pointer according to a mask";
+  let description = [{
+    The `cir.ptr_mask` operation takes a pointer and an integer `mask` as its
+    arguments and returns a pointer whose value equals the bitwise AND of the
+    original pointer value and the `mask`.
+  }];
+
+  let arguments = (ins CIR_PointerType:$ptr,
+                       CIR_IntType:$mask);
+  let results = (outs CIR_PointerType:$result);
+
+  let assemblyFormat = [{
+    `(` $ptr `,` $mask `:` type($mask) `)` `:` qualified(type($result)) attr-dict
+  }];
+}
+
 //===----------------------------------------------------------------------===//
 // Branch Probability Operations
 //===----------------------------------------------------------------------===//
diff --git a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
index 031c3b3b4b40..6af33fd551f2 100644
--- a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
+++ b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
@@ -124,9 +124,16 @@ struct LoweringPreparePass : public LoweringPrepareBase<LoweringPreparePass> {
   void setASTContext(clang::ASTContext *c) {
     astCtx = c;
-    auto abiStr = c->getTargetInfo().getABI();
+    const clang::TargetInfo &target = c->getTargetInfo();
+    auto abiStr = target.getABI();
     switch (c->getCXXABIKind()) {
     case clang::TargetCXXABI::GenericItanium:
+      if (target.getTriple().getArch() == llvm::Triple::x86_64) {
+        cxxABI.reset(
+            cir::LoweringPrepareCXXABI::createX86ABI(/*is64bit=*/true));
+        break;
+      }
+
       cxxABI.reset(cir::LoweringPrepareCXXABI::createItaniumABI());
       break;
     case clang::TargetCXXABI::GenericAArch64:
diff --git a/clang/lib/CIR/Dialect/Transforms/LoweringPrepareCXXABI.h b/clang/lib/CIR/Dialect/Transforms/LoweringPrepareCXXABI.h
index 47c63fae7d7b..f3ae48c13574 100644
--- a/clang/lib/CIR/Dialect/Transforms/LoweringPrepareCXXABI.h
+++ b/clang/lib/CIR/Dialect/Transforms/LoweringPrepareCXXABI.h
@@ -28,6 +28,7 @@ class LoweringPrepareCXXABI {
 public:
   static LoweringPrepareCXXABI *createItaniumABI();
   static LoweringPrepareCXXABI *createAArch64ABI(cir::AArch64ABIKind k);
+  static LoweringPrepareCXXABI *createX86ABI(bool is64Bit);

   virtual mlir::Value lowerVAArg(CIRBaseBuilderTy &builder, cir::VAArgOp op,
                                  const cir::CIRDataLayout &datalayout) = 0;
diff --git a/clang/lib/CIR/Dialect/Transforms/LoweringPrepareX86ABI.h b/clang/lib/CIR/Dialect/Transforms/LoweringPrepareX86ABI.h
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/lib/CIR/Dialect/Transforms/TargetLowering/ABIInfoImpl.cpp b/clang/lib/CIR/Dialect/Transforms/TargetLowering/ABIInfoImpl.cpp
index 2c92be20bd41..e07315d54a38 100644
--- a/clang/lib/CIR/Dialect/Transforms/TargetLowering/ABIInfoImpl.cpp
+++ b/clang/lib/CIR/Dialect/Transforms/TargetLowering/ABIInfoImpl.cpp
@@ -36,6 +36,18 @@ bool isAggregateTypeForABI(mlir::Type T) {
   return !LowerFunction::hasScalarEvaluationKind(T);
 }

+mlir::Value emitRoundPointerUpToAlignment(cir::CIRBaseBuilderTy &builder,
+                                          mlir::Value ptr, unsigned alignment) {
+  // OverflowArgArea = (OverflowArgArea + Align - 1) & -Align;
+  mlir::Location loc = ptr.getLoc();
+  mlir::Value roundUp = builder.createPtrStride(
+      loc, builder.createPtrBitcast(ptr, builder.getUIntNTy(8)),
+      builder.getUnsignedInt(loc, alignment - 1, /*width=*/32));
+  return builder.create<cir::PtrMaskOp>(
+      loc, roundUp.getType(), roundUp,
+      builder.getSignedInt(loc, -alignment, /*width=*/32));
+}
+
 mlir::Type useFirstFieldIfTransparentUnion(mlir::Type Ty) {
   if (auto RT = mlir::dyn_cast<StructType>(Ty)) {
     if (RT.isUnion())
diff --git a/clang/lib/CIR/Dialect/Transforms/TargetLowering/ABIInfoImpl.h b/clang/lib/CIR/Dialect/Transforms/TargetLowering/ABIInfoImpl.h
index df1cd2d0fe0d..8005b153a544 100644
--- a/clang/lib/CIR/Dialect/Transforms/TargetLowering/ABIInfoImpl.h
+++ b/clang/lib/CIR/Dialect/Transforms/TargetLowering/ABIInfoImpl.h
@@ -25,6 +25,9 @@ bool classifyReturnType(const CIRCXXABI &CXXABI, LowerFunctionInfo &FI,

 bool isAggregateTypeForABI(mlir::Type T);

+mlir::Value emitRoundPointerUpToAlignment(cir::CIRBaseBuilderTy &builder,
+                                          mlir::Value ptr, unsigned alignment);
+
 /// Pass transparent unions as if they were the type of the first element. Sema
 /// should ensure that all elements of the union have the same "machine type".
 mlir::Type useFirstFieldIfTransparentUnion(mlir::Type Ty);
diff --git a/clang/lib/CIR/Dialect/Transforms/TargetLowering/CIRCXXABI.h b/clang/lib/CIR/Dialect/Transforms/TargetLowering/CIRCXXABI.h
index a980f76f012d..0f05ec8040f8 100644
--- a/clang/lib/CIR/Dialect/Transforms/TargetLowering/CIRCXXABI.h
+++ b/clang/lib/CIR/Dialect/Transforms/TargetLowering/CIRCXXABI.h
@@ -66,23 +66,4 @@ CIRCXXABI *CreateItaniumCXXABI(LowerModule &CGM);

 } // namespace cir

-// FIXME(cir): Merge this into the CIRCXXABI class above. To do so, this code
-// should be updated to follow some level of codegen parity.
-namespace cir {
-
-class LoweringPrepareCXXABI {
-public:
-  static LoweringPrepareCXXABI *createItaniumABI();
-  static LoweringPrepareCXXABI *createAArch64ABI(cir::AArch64ABIKind k);
-
-  virtual mlir::Value lowerVAArg(CIRBaseBuilderTy &builder, cir::VAArgOp op,
-                                 const cir::CIRDataLayout &datalayout) = 0;
-  virtual ~LoweringPrepareCXXABI() {}
-
-  virtual mlir::Value lowerDynamicCast(CIRBaseBuilderTy &builder,
-                                       clang::ASTContext &astCtx,
-                                       cir::DynamicCastOp op) = 0;
-};
-} // namespace cir
-
 #endif // LLVM_CLANG_LIB_CIR_DIALECT_TRANSFORMS_TARGETLOWERING_CIRCXXABI_H
diff --git a/clang/lib/CIR/Dialect/Transforms/TargetLowering/CMakeLists.txt b/clang/lib/CIR/Dialect/Transforms/TargetLowering/CMakeLists.txt
index 218656c3b144..d3cb9fc96f1a 100644
--- a/clang/lib/CIR/Dialect/Transforms/TargetLowering/CMakeLists.txt
+++ b/clang/lib/CIR/Dialect/Transforms/TargetLowering/CMakeLists.txt
@@ -17,6 +17,7 @@ add_clang_library(TargetLowering
   Targets/X86.cpp
   Targets/LoweringPrepareAArch64CXXABI.cpp
   Targets/LoweringPrepareItaniumCXXABI.cpp
+  Targets/LoweringPrepareX86CXXABI.cpp

 DEPENDS
   clangBasic
diff --git a/clang/lib/CIR/Dialect/Transforms/TargetLowering/ItaniumCXXABI.cpp b/clang/lib/CIR/Dialect/Transforms/TargetLowering/ItaniumCXXABI.cpp
index deb4053dc682..081db25808d1 100644
--- a/clang/lib/CIR/Dialect/Transforms/TargetLowering/ItaniumCXXABI.cpp
+++ b/clang/lib/CIR/Dialect/Transforms/TargetLowering/ItaniumCXXABI.cpp
@@ -20,6 +20,7 @@
 //
 //===----------------------------------------------------------------------===//

+#include "../LoweringPrepareCXXABI.h"
 #include "CIRCXXABI.h"
 #include "LowerModule.h"
 #include "llvm/Support/ErrorHandling.h"
diff --git a/clang/lib/CIR/Dialect/Transforms/TargetLowering/Targets/LoweringPrepareX86CXXABI.cpp b/clang/lib/CIR/Dialect/Transforms/TargetLowering/Targets/LoweringPrepareX86CXXABI.cpp
new file mode 100644
index 000000000000..ba376d26b0fc
--- /dev/null
+++ b/clang/lib/CIR/Dialect/Transforms/TargetLowering/Targets/LoweringPrepareX86CXXABI.cpp
@@ -0,0 +1,357 @@
+//====- LoweringPrepareX86CXXABI.cpp - X86 ABI specific code ----------====//
+//
+// Part of the LLVM Project,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===------------------------------------------------------------------===//
+//
+// This file provides X86{_64, _32} C++ ABI specific code that is used during
+// LLVMIR lowering prepare.
+//
+//===------------------------------------------------------------------===//
+
+#include "../LowerModule.h"
+#include "../LoweringPrepareItaniumCXXABI.h"
+#include "ABIInfoImpl.h"
+#include "X86_64ABIInfo.h"
+
+using namespace clang;
+using namespace cir;
+
+namespace {
+class LoweringPrepareX86CXXABI : public LoweringPrepareItaniumCXXABI {
+  bool is64;
+
+public:
+  LoweringPrepareX86CXXABI(bool is64) : is64(is64) {}
+  mlir::Value lowerVAArg(cir::CIRBaseBuilderTy &builder, cir::VAArgOp op,
+                         const cir::CIRDataLayout &datalayout) override {
+    if (is64)
+      return lowerVAArgX86_64(builder, op, datalayout);
+
+    return lowerVAArgX86_32(builder, op, datalayout);
+  }
+
+  mlir::Value lowerVAArgX86_64(cir::CIRBaseBuilderTy &builder, cir::VAArgOp op,
+                               const cir::CIRDataLayout &datalayout);
+  mlir::Value lowerVAArgX86_32(cir::CIRBaseBuilderTy &builder, cir::VAArgOp op,
+                               const cir::CIRDataLayout &datalayout) {
+    llvm_unreachable("lowerVAArg for X86_32 not implemented yet");
+  }
+};
+
+std::unique_ptr<cir::LowerModule> getLowerModule(cir::VAArgOp op) {
+  mlir::ModuleOp mo = op->getParentOfType<mlir::ModuleOp>();
+  if (!mo)
+    return nullptr;
+
+  mlir::PatternRewriter rewriter(mo.getContext());
+  return cir::createLowerModule(mo, rewriter);
+}
+
+mlir::Value buildX86_64VAArgFromMemory(cir::CIRBaseBuilderTy &builder,
+                                       const cir::CIRDataLayout &datalayout,
+                                       mlir::Value valist, mlir::Type Ty,
+                                       mlir::Location loc) {
+  mlir::Value overflow_arg_area_p =
+      builder.createGetMemberOp(loc, valist, "overflow_arg_area", 2);
+  mlir::Value overflow_arg_area = builder.createLoad(loc, overflow_arg_area_p);
+
+  // AMD64-ABI 3.5.7p5: Step 7. Align l->overflow_arg_area upwards to a 16
+  // byte boundary if alignment needed by type exceeds 8 byte boundary.
+  // It isn't stated explicitly in the standard, but in practice we use
+  // alignment greater than 16 where necessary.
+  unsigned alignment = datalayout.getABITypeAlign(Ty).value();
+  if (alignment > 8)
+    overflow_arg_area =
+        emitRoundPointerUpToAlignment(builder, overflow_arg_area, alignment);
+
+  // AMD64-ABI 3.5.7p5: Step 8. Fetch type from l->overflow_arg_area.
+  mlir::Value res = overflow_arg_area;
+
+  // AMD64-ABI 3.5.7p5: Step 9. Set l->overflow_arg_area to:
+  // l->overflow_arg_area + sizeof(type).
+  // AMD64-ABI 3.5.7p5: Step 10. Align l->overflow_arg_area upwards to
+  // an 8 byte boundary.
+  uint64_t sizeInBytes = datalayout.getTypeStoreSize(Ty).getFixedValue();
+  mlir::Value stride = builder.getSignedInt(loc, ((sizeInBytes + 7) & ~7), 32);
+  mlir::Value castedPtr =
+      builder.createPtrBitcast(overflow_arg_area, builder.getSIntNTy(8));
+  overflow_arg_area = builder.createPtrStride(loc, castedPtr, stride);
+  builder.createStore(loc, overflow_arg_area, overflow_arg_area_p);
+
+  return res;
+}
+
+mlir::Value LoweringPrepareX86CXXABI::lowerVAArgX86_64(
+    cir::CIRBaseBuilderTy &builder, cir::VAArgOp op,
+    const cir::CIRDataLayout &datalayout) {
+  // FIXME: return early since X86_64ABIInfo::classify can't handle these
+  // types. Let's hope LLVM's va_arg instruction can take care of it.
+  // Remove this when X86_64ABIInfo::classify can take care of every type.
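+  // Returning nullptr here keeps the cir.va_arg op as-is, so the later
+  // direct lowering to LLVM's va_arg instruction handles it instead.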
+  if (!mlir::isa<VoidType, IntType, SingleType, DoubleType, BoolType,
+                 StructType, LongDoubleType>(op.getType()))
+    return nullptr;
+
+  // Assume that va_list type is correct; should be pointer to LLVM type:
+  // struct {
+  //   i32 gp_offset;
+  //   i32 fp_offset;
+  //   i8* overflow_arg_area;
+  //   i8* reg_save_area;
+  // };
+  unsigned neededInt, neededSSE;
+
+  std::unique_ptr<cir::LowerModule> lowerModule = getLowerModule(op);
+  if (!lowerModule)
+    return nullptr;
+
+  mlir::Type ty = op.getType();
+
+  // FIXME: How should we access the X86AVXABILevel?
+  X86_64ABIInfo abiInfo(lowerModule->getTypes(), X86AVXABILevel::None);
+  ABIArgInfo ai = abiInfo.classifyArgumentType(
+      ty, 0, neededInt, neededSSE, /*isNamedArg=*/false, /*IsRegCall=*/false);
+
+  // Empty records are ignored for parameter passing purposes.
+  if (ai.isIgnore())
+    return nullptr;
+
+  mlir::Location loc = op.getLoc();
+  mlir::Value valist = op.getOperand();
+
+  // AMD64-ABI 3.5.7p5: Step 1. Determine whether type may be passed
+  // in the registers. If not go to step 7.
+  if (!neededInt && !neededSSE)
+    return builder.createLoad(
+        loc, builder.createPtrBitcast(buildX86_64VAArgFromMemory(
+                                          builder, datalayout, valist, ty, loc),
+                                      ty));
+
+  auto currentBlock = builder.getInsertionBlock();
+
+  // AMD64-ABI 3.5.7p5: Step 2. Compute num_gp to hold the number of
+  // general purpose registers needed to pass type and num_fp to hold
+  // the number of floating point registers needed.
+
+  // AMD64-ABI 3.5.7p5: Step 3. Verify whether arguments fit into
+  // registers. In the case: l->gp_offset > 48 - num_gp * 8 or
+  // l->fp_offset > 304 - num_fp * 16 go to step 7.
+  //
+  // NOTE: 304 is a typo; there are only (6 * 8 + 8 * 16) = 176 bytes of
+  // register save space.
+
+  mlir::Value inRegs;
+  mlir::Value gp_offset_p, fp_offset_p;
+  mlir::Value gp_offset, fp_offset;
+
+  if (neededInt) {
+    gp_offset_p = builder.createGetMemberOp(loc, valist, "gp_offset", 0);
+    gp_offset = builder.createLoad(loc, gp_offset_p);
+    inRegs = builder.getUnsignedInt(loc, 48 - neededInt * 8, 32);
+    inRegs = builder.createCompare(loc, cir::CmpOpKind::le, gp_offset, inRegs);
+  }
+
+  if (neededSSE) {
+    fp_offset_p = builder.createGetMemberOp(loc, valist, "fp_offset", 1);
+    fp_offset = builder.createLoad(loc, fp_offset_p);
+    mlir::Value fitsInFP =
+        builder.getUnsignedInt(loc, 176 - neededSSE * 16, 32);
+    fitsInFP =
+        builder.createCompare(loc, cir::CmpOpKind::le, fp_offset, fitsInFP);
+    inRegs = inRegs ? builder.createAnd(inRegs, fitsInFP) : fitsInFP;
+  }
+
+  mlir::Block *contBlock = currentBlock->splitBlock(op);
+  mlir::Block *inRegBlock = builder.createBlock(contBlock);
+  mlir::Block *inMemBlock = builder.createBlock(contBlock);
+
+  builder.setInsertionPointToEnd(currentBlock);
+  builder.create<cir::BrCondOp>(loc, inRegs, inRegBlock, inMemBlock);
+
+  // Emit code to load the value if it was passed in registers.
+  builder.setInsertionPointToStart(inRegBlock);
+
+  // AMD64-ABI 3.5.7p5: Step 4. Fetch type from l->reg_save_area with
+  // an offset of l->gp_offset and/or l->fp_offset. This may require
+  // copying to a temporary location in case the parameter is passed
+  // in different register classes or requires an alignment greater
+  // than 8 for general purpose registers and 16 for XMM registers.
+  //
+  // FIXME: This really results in shameful code when we end up needing to
+  // collect arguments from different places; often what should result in a
+  // simple assembling of a structure from scattered addresses has many more
+  // loads than necessary. Can we clean this up?
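+  // The register save area is laid out as the six general-purpose argument
+  // registers (6 * 8 = 48 bytes) followed by the eight SSE argument
+  // registers (8 * 16 = 128 bytes); gp_offset and fp_offset index into it.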
+  mlir::Value regSaveArea = builder.createLoad(
+      loc, builder.createGetMemberOp(loc, valist, "reg_save_area", 3));
+  mlir::Value regAddr;
+
+  uint64_t tyAlign = datalayout.getABITypeAlign(ty).value();
+  // The alignment of the result address.
+  uint64_t alignment = 0;
+  if (neededInt && neededSSE) {
+    // FIXME: Cleanup.
+    assert(ai.isDirect() && "Unexpected ABI info for mixed regs");
+    StructType structTy = mlir::cast<StructType>(ai.getCoerceToType());
+    cir::PointerType addrTy = builder.getPointerTo(ty);
+
+    mlir::Value tmp = builder.createAlloca(loc, addrTy, ty, "tmp",
+                                           CharUnits::fromQuantity(tyAlign));
+    tmp = builder.createPtrBitcast(tmp, structTy);
+    assert(structTy.getNumElements() == 2 &&
+           "Unexpected ABI info for mixed regs");
+    mlir::Type tyLo = structTy.getMembers()[0];
+    mlir::Type tyHi = structTy.getMembers()[1];
+    assert((isFPOrFPVectorTy(tyLo) ^ isFPOrFPVectorTy(tyHi)) &&
+           "Unexpected ABI info for mixed regs");
+    mlir::Value gpAddr = builder.createPtrStride(loc, regSaveArea, gp_offset);
+    mlir::Value fpAddr = builder.createPtrStride(loc, regSaveArea, fp_offset);
+    mlir::Value regLoAddr = isFPOrFPVectorTy(tyLo) ? fpAddr : gpAddr;
+    mlir::Value regHiAddr = isFPOrFPVectorTy(tyHi) ? gpAddr : fpAddr;
+
+    // Copy the first element.
+    // FIXME: Our choice of alignment here and below is probably pessimistic.
+    mlir::Value v = builder.createAlignedLoad(
+        loc, regLoAddr, datalayout.getABITypeAlign(tyLo).value());
+    builder.createStore(loc, v,
+                        builder.createGetMemberOp(loc, tmp, "gp_offset", 0));
+
+    // Copy the second element.
+    v = builder.createAlignedLoad(loc, regHiAddr,
+                                  datalayout.getABITypeAlign(tyHi).value());
+    builder.createStore(loc, v,
+                        builder.createGetMemberOp(loc, tmp, "fp_offset", 1));
+
+    tmp = builder.createPtrBitcast(tmp, ty);
+    regAddr = tmp;
+  } else if (neededInt || neededSSE == 1) {
+    uint64_t tySize = datalayout.getTypeStoreSize(ty).getFixedValue();
+
+    mlir::Type coTy;
+    if (ai.isDirect())
+      coTy = ai.getCoerceToType();
+
+    mlir::Value gpOrFpOffset = neededInt ? gp_offset : fp_offset;
+    alignment = neededInt ? 8 : 16;
+    uint64_t regSize = neededInt ? neededInt * 8 : 16;
+    // There are two cases that require special handling:
+    // 1)
+    // ```
+    // struct {
+    //   struct {} a[8];
+    //   int b;
+    // };
+    // ```
+    // The lower 8 bytes of the structure are not stored,
+    // so an 8-byte offset is needed when accessing the structure.
+    // 2)
+    // ```
+    // struct {
+    //   long long a;
+    //   struct {} b;
+    // };
+    // ```
+    // The stored size of this structure is smaller than its actual size,
+    // which may lead to reading past the end of the register save area.
+    if (coTy && (ai.getDirectOffset() == 8 || regSize < tySize)) {
+      cir::PointerType addrTy = builder.getPointerTo(ty);
+      mlir::Value tmp = builder.createAlloca(loc, addrTy, ty, "tmp",
+                                             CharUnits::fromQuantity(tyAlign));
+      mlir::Value addr =
+          builder.createPtrStride(loc, regSaveArea, gpOrFpOffset);
+      mlir::Value src = builder.createAlignedLoad(
+          loc, builder.createPtrBitcast(addr, coTy), tyAlign);
+      mlir::Value ptrOffset =
+          builder.getUnsignedInt(loc, ai.getDirectOffset(), 32);
+      mlir::Value dst = builder.createPtrStride(loc, tmp, ptrOffset);
+      builder.createStore(loc, src, dst);
+      regAddr = tmp;
+    } else {
+      regAddr = builder.createPtrStride(loc, regSaveArea, gpOrFpOffset);
+
+      // Copy into a temporary if the type is more aligned than the
+      // register save area.
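+      // (Only the general-purpose area needs this: it is only guaranteed
+      // 8-byte alignment, while the XMM slots are assumed 16-byte aligned
+      // in the neededSSE == 2 case below.)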
+      if (neededInt && tyAlign > 8) {
+        cir::PointerType addrTy = builder.getPointerTo(ty);
+        mlir::Value tmp = builder.createAlloca(
+            loc, addrTy, ty, "tmp", CharUnits::fromQuantity(tyAlign));
+        builder.createMemCpy(loc, tmp, regAddr,
+                             builder.getUnsignedInt(loc, tySize, 32));
+        regAddr = tmp;
+      }
+    }
+
+  } else {
+    assert(neededSSE == 2 && "Invalid number of needed registers!");
+    // SSE registers are spaced 16 bytes apart in the register save
+    // area, so we need to collect the two eightbytes together.
+    // The ABI isn't explicit about this, but it seems reasonable
+    // to assume that the slots are 16-byte aligned, since the stack is
+    // naturally 16-byte aligned and the prologue is expected to store
+    // all the SSE registers to the RSA.
+
+    mlir::Value regAddrLo =
+        builder.createPtrStride(loc, regSaveArea, fp_offset);
+    mlir::Value regAddrHi = builder.createPtrStride(
+        loc, regAddrLo, builder.getUnsignedInt(loc, 16, /*numBits=*/32));
+
+    mlir::MLIRContext *Context = abiInfo.getContext().getMLIRContext();
+    StructType structTy =
+        ai.canHaveCoerceToType()
+            ? mlir::cast<StructType>(ai.getCoerceToType())
+            : StructType::get(
+                  Context, {DoubleType::get(Context), DoubleType::get(Context)},
+                  /*packed=*/false, StructType::Struct);
+    cir::PointerType addrTy = builder.getPointerTo(ty);
+    mlir::Value tmp = builder.createAlloca(loc, addrTy, ty, "tmp",
+                                           CharUnits::fromQuantity(tyAlign));
+    tmp = builder.createPtrBitcast(tmp, structTy);
+    mlir::Value v = builder.createLoad(
+        loc, builder.createPtrBitcast(regAddrLo, structTy.getMembers()[0]));
+    builder.createStore(loc, v, builder.createGetMemberOp(loc, tmp, "", 0));
+    v = builder.createLoad(
+        loc, builder.createPtrBitcast(regAddrHi, structTy.getMembers()[1]));
+    builder.createStore(loc, v, builder.createGetMemberOp(loc, tmp, "", 1));
+
+    tmp = builder.createPtrBitcast(tmp, ty);
+    regAddr = tmp;
+  }
+
+  // AMD64-ABI 3.5.7p5: Step 5. Set:
+  // l->gp_offset = l->gp_offset + num_gp * 8
+  // l->fp_offset = l->fp_offset + num_fp * 16.
+  if (neededInt) {
+    mlir::Value offset = builder.getUnsignedInt(loc, neededInt * 8, 32);
+    builder.createStore(loc, builder.createAdd(gp_offset, offset), gp_offset_p);
+  }
+
+  if (neededSSE) {
+    mlir::Value offset = builder.getUnsignedInt(loc, neededSSE * 16, 32);
+    builder.createStore(loc, builder.createAdd(fp_offset, offset), fp_offset_p);
+  }
+
+  builder.create<cir::BrOp>(loc, mlir::ValueRange{regAddr}, contBlock);
+
+  // Emit code to load the value if it was passed in memory.
+  builder.setInsertionPointToStart(inMemBlock);
+  mlir::Value memAddr =
+      buildX86_64VAArgFromMemory(builder, datalayout, valist, ty, loc);
+  builder.create<cir::BrOp>(loc, mlir::ValueRange{memAddr}, contBlock);
+
+  // Return the appropriate result.
+  builder.setInsertionPointToStart(contBlock);
+  mlir::Value res_addr = contBlock->addArgument(regAddr.getType(), loc);
+
+  return alignment ?
builder.createAlignedLoad( + loc, builder.createPtrBitcast(res_addr, ty), alignment) + : builder.createLoad(loc, builder.createPtrBitcast(res_addr, ty)); +} +} // namespace + +cir::LoweringPrepareCXXABI * +cir::LoweringPrepareCXXABI::createX86ABI(bool is64Bit) { + return new LoweringPrepareX86CXXABI(is64Bit); +} diff --git a/clang/lib/CIR/Dialect/Transforms/TargetLowering/Targets/X86.cpp b/clang/lib/CIR/Dialect/Transforms/TargetLowering/Targets/X86.cpp index 3d590b3d499b..39bd1716aa3b 100644 --- a/clang/lib/CIR/Dialect/Transforms/TargetLowering/Targets/X86.cpp +++ b/clang/lib/CIR/Dialect/Transforms/TargetLowering/Targets/X86.cpp @@ -5,6 +5,7 @@ #include "LowerModule.h" #include "LowerTypes.h" #include "TargetInfo.h" +#include "X86_64ABIInfo.h" #include "clang/CIR/ABIArgInfo.h" #include "clang/CIR/Dialect/IR/CIRDataLayout.h" #include "clang/CIR/Dialect/IR/CIRTypes.h" @@ -101,97 +102,6 @@ mlir::Type getFPTypeAtOffset(mlir::Type IRType, unsigned IROffset, } // namespace -class X86_64ABIInfo : public ABIInfo { - using Class = cir::X86ArgClass; - - /// Implement the X86_64 ABI merging algorithm. - /// - /// Merge an accumulating classification \arg Accum with a field - /// classification \arg Field. - /// - /// \param Accum - The accumulating classification. This should - /// always be either NoClass or the result of a previous merge - /// call. In addition, this should never be Memory (the caller - /// should just return Memory for the aggregate). - static Class merge(Class Accum, Class Field); - - /// Implement the X86_64 ABI post merging algorithm. - /// - /// Post merger cleanup, reduces a malformed Hi and Lo pair to - /// final MEMORY or SSE classes when necessary. - /// - /// \param AggregateSize - The size of the current aggregate in - /// the classification process. - /// - /// \param Lo - The classification for the parts of the type - /// residing in the low word of the containing object. - /// - /// \param Hi - The classification for the parts of the type - /// residing in the higher words of the containing object. - /// - void postMerge(unsigned AggregateSize, Class &Lo, Class &Hi) const; - - /// Determine the x86_64 register classes in which the given type T should be - /// passed. - /// - /// \param Lo - The classification for the parts of the type - /// residing in the low word of the containing object. - /// - /// \param Hi - The classification for the parts of the type - /// residing in the high word of the containing object. - /// - /// \param OffsetBase - The bit offset of this type in the - /// containing object. Some parameters are classified different - /// depending on whether they straddle an eightbyte boundary. - /// - /// \param isNamedArg - Whether the argument in question is a "named" - /// argument, as used in AMD64-ABI 3.5.7. - /// - /// \param IsRegCall - Whether the calling conversion is regcall. - /// - /// If a word is unused its result will be NoClass; if a type should - /// be passed in Memory then at least the classification of \arg Lo - /// will be Memory. - /// - /// The \arg Lo class will be NoClass iff the argument is ignored. - /// - /// If the \arg Lo class is ComplexX87, then the \arg Hi class will - /// also be ComplexX87. 
- void classify(mlir::Type T, uint64_t OffsetBase, Class &Lo, Class &Hi, - bool isNamedArg, bool IsRegCall = false) const; - - mlir::Type GetSSETypeAtOffset(mlir::Type IRType, unsigned IROffset, - mlir::Type SourceTy, - unsigned SourceOffset) const; - - mlir::Type GetINTEGERTypeAtOffset(mlir::Type DestTy, unsigned IROffset, - mlir::Type SourceTy, - unsigned SourceOffset) const; - - /// The 0.98 ABI revision clarified a lot of ambiguities, - /// unfortunately in ways that were not always consistent with - /// certain previous compilers. In particular, platforms which - /// required strict binary compatibility with older versions of GCC - /// may need to exempt themselves. - bool honorsRevision0_98() const { - return !getTarget().getTriple().isOSDarwin(); - } - - X86AVXABILevel AVXLevel; - -public: - X86_64ABIInfo(LowerTypes &CGT, X86AVXABILevel AVXLevel) - : ABIInfo(CGT), AVXLevel(AVXLevel) {} - - cir::ABIArgInfo classifyReturnType(mlir::Type RetTy) const; - - ABIArgInfo classifyArgumentType(mlir::Type Ty, unsigned freeIntRegs, - unsigned &neededInt, unsigned &neededSSE, - bool isNamedArg, bool IsRegCall) const; - - void computeInfo(LowerFunctionInfo &FI) const override; -}; - class X86_64TargetLoweringInfo : public TargetLoweringInfo { public: X86_64TargetLoweringInfo(LowerTypes &LM, X86AVXABILevel AVXLevel) diff --git a/clang/lib/CIR/Dialect/Transforms/TargetLowering/Targets/X86_64ABIInfo.h b/clang/lib/CIR/Dialect/Transforms/TargetLowering/Targets/X86_64ABIInfo.h new file mode 100644 index 000000000000..201730519207 --- /dev/null +++ b/clang/lib/CIR/Dialect/Transforms/TargetLowering/Targets/X86_64ABIInfo.h @@ -0,0 +1,96 @@ +#include "ABIInfo.h" +#include "clang/CIR/Target/x86.h" + +namespace cir { +class X86_64ABIInfo : public cir::ABIInfo { + using Class = cir::X86ArgClass; + + /// Implement the X86_64 ABI merging algorithm. + /// + /// Merge an accumulating classification \arg Accum with a field + /// classification \arg Field. + /// + /// \param Accum - The accumulating classification. This should + /// always be either NoClass or the result of a previous merge + /// call. In addition, this should never be Memory (the caller + /// should just return Memory for the aggregate). + static Class merge(Class Accum, Class Field); + + /// Implement the X86_64 ABI post merging algorithm. + /// + /// Post merger cleanup, reduces a malformed Hi and Lo pair to + /// final MEMORY or SSE classes when necessary. + /// + /// \param AggregateSize - The size of the current aggregate in + /// the classification process. + /// + /// \param Lo - The classification for the parts of the type + /// residing in the low word of the containing object. + /// + /// \param Hi - The classification for the parts of the type + /// residing in the higher words of the containing object. + /// + void postMerge(unsigned AggregateSize, Class &Lo, Class &Hi) const; + + /// Determine the x86_64 register classes in which the given type T should be + /// passed. + /// + /// \param Lo - The classification for the parts of the type + /// residing in the low word of the containing object. + /// + /// \param Hi - The classification for the parts of the type + /// residing in the high word of the containing object. + /// + /// \param OffsetBase - The bit offset of this type in the + /// containing object. Some parameters are classified different + /// depending on whether they straddle an eightbyte boundary. + /// + /// \param isNamedArg - Whether the argument in question is a "named" + /// argument, as used in AMD64-ABI 3.5.7. 
+  ///
+  /// \param IsRegCall - Whether the calling convention is regcall.
+  ///
+  /// If a word is unused its result will be NoClass; if a type should
+  /// be passed in Memory then at least the classification of \arg Lo
+  /// will be Memory.
+  ///
+  /// The \arg Lo class will be NoClass iff the argument is ignored.
+  ///
+  /// If the \arg Lo class is ComplexX87, then the \arg Hi class will
+  /// also be ComplexX87.
+  void classify(mlir::Type T, uint64_t OffsetBase, Class &Lo, Class &Hi,
+                bool isNamedArg, bool IsRegCall = false) const;
+
+  mlir::Type GetSSETypeAtOffset(mlir::Type IRType, unsigned IROffset,
+                                mlir::Type SourceTy,
+                                unsigned SourceOffset) const;
+
+  mlir::Type GetINTEGERTypeAtOffset(mlir::Type DestTy, unsigned IROffset,
+                                    mlir::Type SourceTy,
+                                    unsigned SourceOffset) const;
+
+  /// The 0.98 ABI revision clarified a lot of ambiguities,
+  /// unfortunately in ways that were not always consistent with
+  /// certain previous compilers. In particular, platforms which
+  /// required strict binary compatibility with older versions of GCC
+  /// may need to exempt themselves.
+  bool honorsRevision0_98() const {
+    return !getTarget().getTriple().isOSDarwin();
+  }
+
+  X86AVXABILevel AVXLevel;
+
+public:
+  X86_64ABIInfo(LowerTypes &CGT, cir::X86AVXABILevel AVXLevel)
+      : ABIInfo(CGT), AVXLevel(AVXLevel) {}
+
+  cir::ABIArgInfo classifyReturnType(mlir::Type RetTy) const;
+
+  cir::ABIArgInfo classifyArgumentType(mlir::Type Ty, unsigned freeIntRegs,
+                                       unsigned &neededInt, unsigned &neededSSE,
+                                       bool isNamedArg, bool IsRegCall) const;
+
+  void computeInfo(LowerFunctionInfo &FI) const override;
+};
+
+} // namespace cir
diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
index a5d6a69693ae..f3a2eb1aa244 100644
--- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
+++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
@@ -4305,6 +4305,37 @@ class CIRIsFPClassOpLowering
   }
 };

+class CIRPtrMaskOpLowering
+    : public mlir::OpConversionPattern<cir::PtrMaskOp> {
+public:
+  using OpConversionPattern::OpConversionPattern;
+
+  mlir::LogicalResult
+  matchAndRewrite(cir::PtrMaskOp op, OpAdaptor adaptor,
+                  mlir::ConversionPatternRewriter &rewriter) const override {
+    // FIXME: Lower this to an mlir::LLVM::PtrMaskOp once such an op exists.
+    // For now we emit the equivalent sequence manually, following:
+    // https://llvm.org/docs/LangRef.html#llvm-ptrmask-intrinsic
+    auto loc = op.getLoc();
+    auto mask = op.getMask();
+
+    auto moduleOp = op->getParentOfType<mlir::ModuleOp>();
+    mlir::DataLayout layout(moduleOp);
+    auto iPtrIdxValue = layout.getTypeSizeInBits(mask.getType());
+    auto iPtrIdx = mlir::IntegerType::get(moduleOp->getContext(), iPtrIdxValue);
+
+    auto intPtr = rewriter.create<mlir::LLVM::PtrToIntOp>(
+        loc, iPtrIdx, adaptor.getPtr()); // this may truncate
+    mlir::Value masked =
+        rewriter.create<mlir::LLVM::AndOp>(loc, intPtr, adaptor.getMask());
+    // Offset from the original address to the masked address, so the
+    // byte-wise GEP below yields the masked pointer.
+    mlir::Value diff = rewriter.create<mlir::LLVM::SubOp>(loc, masked, intPtr);
+    rewriter.replaceOpWithNewOp<mlir::LLVM::GEPOp>(
+        op, getTypeConverter()->convertType(op.getType()),
+        mlir::IntegerType::get(moduleOp->getContext(), 8), adaptor.getPtr(),
+        diff);
+    return mlir::success();
+  }
+};
+
 class CIRAbsOpLowering : public mlir::OpConversionPattern<cir::AbsOp> {
 public:
   using OpConversionPattern::OpConversionPattern;
@@ -4398,7 +4429,8 @@ void populateCIRToLLVMConversionPatterns(
       CIRAssumeLowering, CIRAssumeAlignedLowering, CIRAssumeSepStorageLowering,
       CIRBaseClassAddrOpLowering, CIRDerivedClassAddrOpLowering,
       CIRVTTAddrPointOpLowering, CIRIsFPClassOpLowering, CIRAbsOpLowering,
-      CIRMemMoveOpLowering, CIRMemsetOpLowering, CIRSignBitOpLowering
+      CIRMemMoveOpLowering, CIRMemsetOpLowering, CIRSignBitOpLowering,
+      CIRPtrMaskOpLowering
 #define GET_BUILTIN_LOWERING_LIST
 #include "clang/CIR/Dialect/IR/CIRBuiltinsLowering.inc"
 #undef GET_BUILTIN_LOWERING_LIST
diff --git a/clang/test/CIR/Lowering/var-arg-x86_64.c b/clang/test/CIR/Lowering/var-arg-x86_64.c
new file mode 100644
index 000000000000..992d5e82cd98
--- /dev/null
+++ b/clang/test/CIR/Lowering/var-arg-x86_64.c
@@ -0,0 +1,78 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir -fno-clangir-call-conv-lowering %s -o %t.cir
+// RUN: FileCheck --input-file=%t.cir %s --check-prefix=CIR
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-llvm -fno-clangir-call-conv-lowering %s -o %t.ll
+// RUN: FileCheck --input-file=%t.ll %s
+
+#include <stdarg.h>
+
+double f1(int n, ...) {
+  va_list valist;
+  va_start(valist, n);
+  double res = va_arg(valist, double);
+  va_end(valist);
+  return res;
+}
+
+// CHECK: [[VA_LIST_TYPE:%.+]] = type { i32, i32, ptr, ptr }
+
+// CHECK: define {{.*}}@f1
+// CHECK: [[VA_LIST_ALLOCA:%.+]] = alloca {{.*}}[[VA_LIST_TYPE]]
+// CHECK: [[VA_LIST:%.+]] = getelementptr {{.*}} [[VA_LIST_ALLOCA]], i32 0
+// CHECK: call {{.*}}@llvm.va_start.p0(ptr [[VA_LIST]])
+// CHECK: [[VA_LIST2:%.+]] = getelementptr {{.*}} [[VA_LIST_ALLOCA]], i32 0
+// CHECK: [[FP_OFFSET_P:%.+]] = getelementptr {{.*}} [[VA_LIST2]], i32 0, i32 1
+// CHECK: [[FP_OFFSET:%.+]] = load {{.*}}, ptr [[FP_OFFSET_P]]
+// CHECK: [[COMPARED:%.+]] = icmp ule i32 {{.*}}, 160
+// CHECK: br i1 [[COMPARED]], label %[[THEN_BB:.+]], label %[[ELSE_BB:.+]],
+//
+// CHECK: [[THEN_BB]]:
+// CHECK: [[UPDATED_FP_OFFSET:%.+]] = add i32 [[FP_OFFSET]], 16
+// CHECK: store i32 [[UPDATED_FP_OFFSET]], ptr [[FP_OFFSET_P]]
+// CHECK: br label %[[CONT_BB:.+]],
+//
+// CHECK: [[ELSE_BB]]:
+// CHECK: [[OVERFLOW_ARG_AREA_ADDR:%.+]] = getelementptr {{.*}} [[VA_LIST2]], i32 0, i32 2
+// CHECK: [[OVERFLOW_ARG_AREA:%.+]] = load ptr, ptr [[OVERFLOW_ARG_AREA_ADDR]]
+// CHECK: [[OVERFLOW_ARG_AREA_OFFSET:%.+]] = getelementptr {{.*}} [[OVERFLOW_ARG_AREA]], i64 8
+// CHECK: store ptr [[OVERFLOW_ARG_AREA_OFFSET]], ptr [[OVERFLOW_ARG_AREA_ADDR]]
+// CHECK: br label %[[CONT_BB]]
+//
+// CHECK: [[CONT_BB]]:
+// CHECK: [[VA_LIST3:%.+]] = getelementptr {{.*}} [[VA_LIST_ALLOCA]], i32 0
+// CHECK: call {{.*}}@llvm.va_end.p0(ptr [[VA_LIST3]])
+
+// CIR: cir.func @f1
+// CIR: [[VA_LIST_ALLOCA:%.+]] = cir.alloca !cir.array,
+// CIR: [[RES:%.+]] = cir.alloca !cir.double, !cir.ptr<!cir.double>, ["res",
+// CIR: [[VASTED_VA_LIST:%.+]] = cir.cast(array_to_ptrdecay, [[VA_LIST_ALLOCA]]
+// CIR: cir.va.start [[VASTED_VA_LIST]]
+// CIR: [[VASTED_VA_LIST:%.+]] = cir.cast(array_to_ptrdecay, [[VA_LIST_ALLOCA]]
+// CIR: [[FP_OFFSET_P:%.+]] = cir.get_member [[VASTED_VA_LIST]][1] {name = "fp_offset"}
+// CIR: [[FP_OFFSET:%.+]] = cir.load [[FP_OFFSET_P]]
+// CIR: [[OFFSET_CONSTANT:%.+]] = cir.const #cir.int<160>
+// CIR: [[CMP:%.+]] = cir.cmp(le, [[FP_OFFSET]], [[OFFSET_CONSTANT]])
+// CIR: cir.brcond [[CMP]] ^[[InRegBlock:.+]], ^[[InMemBlock:.+]] loc
+//
+// CIR: ^[[InRegBlock]]:
+// CIR: [[REG_SAVE_AREA_P:%.+]] = cir.get_member [[VASTED_VA_LIST]][3] {name = "reg_save_area"}
+// CIR: [[REG_SAVE_AREA:%.+]] = cir.load [[REG_SAVE_AREA_P]]
+// CIR: [[UPDATED:%.+]] = cir.ptr_stride([[REG_SAVE_AREA]] {{.*}}, [[FP_OFFSET]]
+// CIR: [[CONSTANT:%.+]] = cir.const #cir.int<16>
+// CIR: [[ADDED:%.+]] = cir.binop(add, [[FP_OFFSET]], [[CONSTANT]])
+// CIR: cir.store [[ADDED]], [[FP_OFFSET_P]]
+// CIR: cir.br ^[[ContBlock:.+]]([[UPDATED]]
+//
+// CIR: ^[[InMemBlock]]:
+// CIR: [[OVERFLOW_ARG_AREA_P:%.+]] = cir.get_member [[VASTED_VA_LIST]][2] {name = "overflow_arg_area"}
+// CIR: [[OVERFLOW_ARG_AREA:%.+]] = cir.load [[OVERFLOW_ARG_AREA_P]]
+// CIR: [[OFFSET:%.+]] = cir.const #cir.int<8>
+// CIR: [[CASTED:%.+]] = cir.cast(bitcast, [[OVERFLOW_ARG_AREA]] : !cir.ptr)
+// CIR: [[NEW_VALUE:%.+]] = cir.ptr_stride([[CASTED]] : !cir.ptr, [[OFFSET]]
+// CIR: [[CASTED_P:%.+]] = cir.cast(bitcast, [[OVERFLOW_ARG_AREA_P]] : !cir.ptr>)
+// CIR: store [[NEW_VALUE]], [[CASTED_P]]
+// CIR: cir.br ^[[ContBlock]]([[OVERFLOW_ARG_AREA]]
+//
+// CIR: ^[[ContBlock]]([[ARG:.+]]: !cir.ptr
+// CIR: [[CASTED_ARG_P:%.+]] = cir.cast(bitcast, [[ARG]]
+// CIR: [[CASTED_ARG:%.+]] = cir.load align(16) [[CASTED_ARG_P]]
+// CIR: store [[CASTED_ARG]], [[RES]]
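+
+// A possible companion test (a sketch, not verified against the
+// implementation; CHECK lines are intentionally omitted): a struct whose two
+// eightbytes classify as { INTEGER, SSE } would exercise the
+// "neededInt && neededSSE" mixed-register path in lowerVAArgX86_64, which
+// reassembles the value from the GP and XMM halves of the register save area.
+struct intdouble {
+  long i;   // first eightbyte  -> INTEGER class (general-purpose register)
+  double d; // second eightbyte -> SSE class (XMM register)
+};
+
+double f2(int n, ...) {
+  va_list valist;
+  va_start(valist, n);
+  struct intdouble id = va_arg(valist, struct intdouble);
+  va_end(valist);
+  return id.i + id.d;
+}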