Fix llvm#73. Store the translation result into task args. See more.

mikaoP · mikaoP · commit 94385ebdd316 · 2020-10-05T10:14:31.000Z
For now, we have decided that address translation will be performed only for reductions and device dependencies. This is because we only support translating a single level of indirection, so `int **p; in([10](p[3]))` will not work. In the future we may add translation for those expressions that are translatable, although in SMP it is unnecessary... Also fix the reduction init function. Closes llvm#73
diff --git a/clang/lib/CodeGen/CGOmpSsRuntime.cpp b/clang/lib/CodeGen/CGOmpSsRuntime.cpp
@@ -1820,13 +1820,13 @@ static llvm::Value *emitReduceInitFunction(CodeGenModule &CGM,
       // initializer(omp_priv = ...)
       // initializer(omp_priv(...))
       CGF.EmitExprAsInit(PrivVD->getInit(), PrivVD,
-                         CGF.MakeAddrLValue(PrivLV.getPointer(CGF), PrivLV.getType(), PrivLV.getAlignment()),
+                         CGF.MakeAddrLValue(PrivCur, PrivLV.getType(), PrivLV.getAlignment()),
                          /*capturedByInit=*/false);
     }
   } else {
     assert(RHSVD->hasInit() && "RHSVD has no initializer");
     CGF.EmitExprAsInit(RHSVD->getInit(), RHSVD,
-                       CGF.MakeAddrLValue(PrivLV.getPointer(CGF), PrivLV.getType(), PrivLV.getAlignment()),
+                       CGF.MakeAddrLValue(PrivCur, PrivLV.getType(), PrivLV.getAlignment()),
                        /*capturedByInit=*/false);
   }
 
diff --git a/clang/test/OmpSs-RT/Clang_OmpSs-2/success_task_reduction_03.cpp b/clang/test/OmpSs-RT/Clang_OmpSs-2/success_task_reduction_03.cpp
@@ -0,0 +1,51 @@
+/*--------------------------------------------------------------------
+  (C) Copyright 2006-2013 Barcelona Supercomputing Center
+                          Centro Nacional de Supercomputacion
+
+  This file is part of Mercurium C/C++ source-to-source compiler.
+
+  See AUTHORS file in the top level directory for information
+  regarding developers and contributors.
+
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License as published by the Free Software Foundation; either
+  version 3 of the License, or (at your option) any later version.
+
+  Mercurium C/C++ source-to-source compiler is distributed in the hope
+  that it will be useful, but WITHOUT ANY WARRANTY; without even the
+  implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+  PURPOSE.  See the GNU Lesser General Public License for more
+  details.
+
+  You should have received a copy of the GNU Lesser General Public
+  License along with Mercurium C/C++ source-to-source compiler; if
+  not, write to the Free Software Foundation, Inc., 675 Mass Ave,
+  Cambridge, MA 02139, USA.
+--------------------------------------------------------------------*/
+
+// RUN: %oss-cxx-compile-and-run
+// RUN: %oss-cxx-O2-compile-and-run
+// UNSUPPORTED: true
+
+#include <cassert>
+
+#define N 5000
+
+int main() {
+    int size = N;
+    int vla[size];
+    // VLA init
+    for (int i = 0; i < N; ++i)
+        vla[i] = 1;
+
+    for (int i = 0; i < N; ++i) {
+        #pragma oss task reduction(+: [size]vla)
+        { vla[i]++; }
+    }
+    #pragma oss taskwait
+    // Check
+    for (int i = 0; i < N; ++i) {
+        assert(vla[i] == 2);
+    }
+}
diff --git a/clang/test/OmpSs/IR/task_reduction.c b/clang/test/OmpSs/IR/task_reduction.c
@@ -12,7 +12,7 @@ void foo(int x) {
 // CHECK: %1 = call token @llvm.directive.region.entry() [ "DIR.OSS"([5 x i8] c"TASK\00"), "QUAL.OSS.SHARED"(i32* %x.addr), "QUAL.OSS.DEP.WEAKREDUCTION"(i32 6000, i32* %x.addr, %struct._depend_unpack_t.0 (i32*)* @compute_dep.1, i32* %x.addr), "QUAL.OSS.DEP.REDUCTION.INIT"(i32* %x.addr, void (i32*, i32*, i64)* @red_init), "QUAL.OSS.DEP.REDUCTION.COMBINE"(i32* %x.addr, void (i32*, i32*, i64)* @red_comb) ]
 
 // CHECK: define internal void @red_init(i32* %0, i32* %1, i64 %2)
-// CHECK: store i32 0, i32* %3, align 4
+// CHECK: store i32 0, i32* %arrayctor.dst.cur, align 4
 
 // CHECK: define internal void @red_comb(i32* %0, i32* %1, i64 %2)
 // CHECK: %add = add nsw i32 %7, %8
diff --git a/clang/test/OmpSs/IR/task_reduction.cpp b/clang/test/OmpSs/IR/task_reduction.cpp
@@ -9,7 +9,7 @@ void foo(int &rx) {
 // CHECK: %1 = call token @llvm.directive.region.entry() [ "DIR.OSS"([5 x i8] c"TASK\00"), "QUAL.OSS.SHARED"(i32* %0), "QUAL.OSS.DEP.REDUCTION"(i32 6000, i32* %0, %struct._depend_unpack_t (i32*)* @compute_dep, i32* %0), "QUAL.OSS.DEP.REDUCTION.INIT"(i32* %0, void (i32*, i32*, i64)* @red_init), "QUAL.OSS.DEP.REDUCTION.COMBINE"(i32* %0, void (i32*, i32*, i64)* @red_comb) ]
 
 // CHECK: define internal void @red_init(i32* %0, i32* %1, i64 %2)
-// CHECK: store i32 0, i32* %3, align 4
+// CHECK: store i32 0, i32* %arrayctor.dst.cur, align 4
 
 // CHECK: define internal void @red_comb(i32* %0, i32* %1, i64 %2)
 // CHECK: %add = add nsw i32 %7, %8
diff --git a/llvm/lib/Transforms/OmpSs/OmpSsTransform.cpp b/llvm/lib/Transforms/OmpSs/OmpSsTransform.cpp
@@ -1009,22 +1009,15 @@ struct OmpSs : public ModulePass {
     return FuncVar;
   }
 
-  // Build a new storage for the translated reduction
-  // returns the storage of the translated reduction
-  void translateReductionUnpackedDSA(IRBuilder<> &IRB, const DependInfo *DepInfo,
-                                     Value *DSA, Value *&UnpackedDSA,
-                                     Value *AddrTranslationTable,
-                                     const std::map<Value *, int> &DepSymToIdx) {
+  // Rewrites task_args using address_translation
+  void translateDep(
+      IRBuilder<> &IRB, const DependInfo *DepInfo, Value *DSA,
+      Value *&UnpackedDSA, Value *AddrTranslationTable,
+      const std::map<Value *, int> &DepSymToIdx) {
+
     Function *ComputeDepFun = cast<Function>(DepInfo->ComputeDepFun);
     CallInst *CallComputeDep = IRB.CreateCall(ComputeDepFun, DepInfo->Args);
-    llvm::Value *Base = IRB.CreateExtractValue(CallComputeDep, 0);
-
-    // Save the original type since we are going to cast...
-    Type *UnpackedDSAType = UnpackedDSA->getType();
-    Type *BaseType = Base->getType();
-
-    // Storage of the translated DSA
-    AllocaInst *UnpackedDSATranslated = IRB.CreateAlloca(BaseType);
+    llvm::Value *DepBase = IRB.CreateExtractValue(CallComputeDep, 0);
 
     Value *Idx[2];
     Idx[0] = ConstantInt::get(Type::getInt32Ty(IRB.getContext()), DepSymToIdx.at(DSA));
@@ -1039,21 +1032,20 @@ struct OmpSs : public ModulePass {
     DeviceAddr = IRB.CreateLoad(DeviceAddr);
 
     // Res = device_addr + (DSA_addr - local_addr)
-    Base = IRB.CreateBitCast(Base, Type::getInt8PtrTy(IRB.getContext()));
-    UnpackedDSA = IRB.CreateGEP(Base, IRB.CreateNeg(LocalAddr));
-    UnpackedDSA = IRB.CreateGEP(UnpackedDSA, DeviceAddr);
-    UnpackedDSA = IRB.CreateBitCast(UnpackedDSA, BaseType );
-
-    IRB.CreateStore(UnpackedDSA, UnpackedDSATranslated);
-
-   // FIXME: Since we have no info about if we have to pass to unpack a load of the alloca
-   // or not, check if the type has changed after call to compute_dep.
-   // Pointers -> no load
-   // basic types/structs/arrays/vla -> load
-   if (UnpackedDSAType == BaseType)
-      UnpackedDSA = IRB.CreateLoad(UnpackedDSATranslated);
-   else
-      UnpackedDSA = UnpackedDSATranslated;
+    Value *Translation = IRB.CreateBitCast(DepBase, Type::getInt8PtrTy(IRB.getContext()));
+    Translation = IRB.CreateGEP(Translation, IRB.CreateNeg(LocalAddr));
+    Translation = IRB.CreateGEP(Translation, DeviceAddr);
+
+    // Store the translation in task_args
+    if (auto *LUnpackedDSA = dyn_cast<LoadInst>(UnpackedDSA)) {
+      Translation = IRB.CreateBitCast(Translation, LUnpackedDSA->getType());
+      IRB.CreateStore(Translation, LUnpackedDSA->getPointerOperand());
+      // Reload what we have translated
+      UnpackedDSA = IRB.CreateLoad(LUnpackedDSA->getPointerOperand());
+    } else {
+      Translation = IRB.CreateBitCast(Translation, UnpackedDSA->getType()->getPointerElementType());
+      IRB.CreateStore(Translation, UnpackedDSA);
+    }
   }
 
   // Given a Outline Function assuming that task args are the first parameter, and
@@ -1147,10 +1139,11 @@ struct OmpSs : public ModulePass {
       SmallVector<Value *, 4> UnpackParamsCopy(UnpackParams);
       for (auto &DepInfo : DependsInfo.List) {
         if (DepInfo->isReduction()) {
-          Value *DepBaseDSA = DepInfo->Args[0];
-          translateReductionUnpackedDSA(BBBuilder, DepInfo.get(), DepBaseDSA,
-                                        UnpackParams[StructToIdxMap.lookup(DepBaseDSA)],
-                                        AddrTranslationTable, DirInfo.DirEnv.DepSymToIdx);
+          Value *DepBaseDSA = DepInfo->Base;
+          translateDep(
+            BBBuilder, DepInfo.get(), DepBaseDSA,
+            UnpackParams[StructToIdxMap.lookup(DepBaseDSA)],
+            AddrTranslationTable, DirInfo.DirEnv.DepSymToIdx);
         }
       }
       for (Instruction &I : *BBBuilder.GetInsertBlock()) {
diff --git a/llvm/test/Transforms/OmpSs/task_reduction.ll b/llvm/test/Transforms/OmpSs/task_reduction.ll
@@ -51,40 +51,38 @@ entry:
 ; CHECK: define internal void @nanos6_ol_task_region_foo0(%nanos6_task_args_foo0* %task_args, i8* %device_env, %nanos6_address_translation_entry_t* %address_translation_table) {
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT:   %gep_n.addr = getelementptr %nanos6_task_args_foo0, %nanos6_task_args_foo0* %task_args, i32 0, i32 0
-; CHECK-NEXT:   %load_gep_n.addr = load i32*, i32** %gep_n.addr
+; CHECK-NEXT:   %load_gep_n.addr = load i32*, i32** %gep_n.addr, align 8
 ; CHECK-NEXT:   %gep_vla = getelementptr %nanos6_task_args_foo0, %nanos6_task_args_foo0* %task_args, i32 0, i32 1
-; CHECK-NEXT:   %load_gep_vla = load i32*, i32** %gep_vla
+; CHECK-NEXT:   %load_gep_vla = load i32*, i32** %gep_vla, align 8
 ; CHECK-NEXT:   %capt_gep = getelementptr %nanos6_task_args_foo0, %nanos6_task_args_foo0* %task_args, i32 0, i32 2
-; CHECK-NEXT:   %load_capt_gep = load i64, i64* %capt_gep
+; CHECK-NEXT:   %load_capt_gep = load i64, i64* %capt_gep, align 8
 ; CHECK-NEXT:   %0 = call %struct._depend_unpack_t @compute_dep(i32* %load_gep_n.addr)
 ; CHECK-NEXT:   %1 = extractvalue %struct._depend_unpack_t %0, 0
-; CHECK-NEXT:   %2 = alloca i32*
 ; CHECK-NEXT:   %local_lookup_n.addr = getelementptr %nanos6_address_translation_entry_t, %nanos6_address_translation_entry_t* %address_translation_table, i32 0, i32 0
-; CHECK-NEXT:   %3 = load i64, i64* %local_lookup_n.addr
+; CHECK-NEXT:   %2 = load i64, i64* %local_lookup_n.addr, align 8
 ; CHECK-NEXT:   %device_lookup_n.addr = getelementptr %nanos6_address_translation_entry_t, %nanos6_address_translation_entry_t* %address_translation_table, i32 0, i32 1
-; CHECK-NEXT:   %4 = load i64, i64* %device_lookup_n.addr
-; CHECK-NEXT:   %5 = bitcast i32* %1 to i8*
-; CHECK-NEXT:   %6 = sub i64 0, %3
-; CHECK-NEXT:   %7 = getelementptr i8, i8* %5, i64 %6
-; CHECK-NEXT:   %8 = getelementptr i8, i8* %7, i64 %4
-; CHECK-NEXT:   %9 = bitcast i8* %8 to i32*
-; CHECK-NEXT:   store i32* %9, i32** %2
-; CHECK-NEXT:   %10 = load i32*, i32** %2
-; CHECK-NEXT:   %11 = call %struct._depend_unpack_t.0 @compute_dep.1(i32* %load_gep_vla, i64 %load_capt_gep)
-; CHECK-NEXT:   %12 = extractvalue %struct._depend_unpack_t.0 %11, 0
-; CHECK-NEXT:   %13 = alloca i32*
+; CHECK-NEXT:   %3 = load i64, i64* %device_lookup_n.addr, align 8
+; CHECK-NEXT:   %4 = bitcast i32* %1 to i8*
+; CHECK-NEXT:   %5 = sub i64 0, %2
+; CHECK-NEXT:   %6 = getelementptr i8, i8* %4, i64 %5
+; CHECK-NEXT:   %7 = getelementptr i8, i8* %6, i64 %3
+; CHECK-NEXT:   %8 = bitcast i8* %7 to i32*
+; CHECK-NEXT:   store i32* %8, i32** %gep_n.addr, align 8
+; CHECK-NEXT:   %9 = load i32*, i32** %gep_n.addr, align 8
+; CHECK-NEXT:   %10 = call %struct._depend_unpack_t.0 @compute_dep.1(i32* %load_gep_vla, i64 %load_capt_gep)
+; CHECK-NEXT:   %11 = extractvalue %struct._depend_unpack_t.0 %10, 0
 ; CHECK-NEXT:   %local_lookup_vla = getelementptr %nanos6_address_translation_entry_t, %nanos6_address_translation_entry_t* %address_translation_table, i32 1, i32 0
-; CHECK-NEXT:   %14 = load i64, i64* %local_lookup_vla
+; CHECK-NEXT:   %12 = load i64, i64* %local_lookup_vla, align 8
 ; CHECK-NEXT:   %device_lookup_vla = getelementptr %nanos6_address_translation_entry_t, %nanos6_address_translation_entry_t* %address_translation_table, i32 1, i32 1
-; CHECK-NEXT:   %15 = load i64, i64* %device_lookup_vla
-; CHECK-NEXT:   %16 = bitcast i32* %12 to i8*
-; CHECK-NEXT:   %17 = sub i64 0, %14
-; CHECK-NEXT:   %18 = getelementptr i8, i8* %16, i64 %17
-; CHECK-NEXT:   %19 = getelementptr i8, i8* %18, i64 %15
-; CHECK-NEXT:   %20 = bitcast i8* %19 to i32*
-; CHECK-NEXT:   store i32* %20, i32** %13
-; CHECK-NEXT:   %21 = load i32*, i32** %13
-; CHECK-NEXT:   call void @nanos6_unpacked_task_region_foo0(i32* %10, i32* %21, i64 %load_capt_gep, i8* %device_env, %nanos6_address_translation_entry_t* %address_translation_table)
+; CHECK-NEXT:   %13 = load i64, i64* %device_lookup_vla, align 8
+; CHECK-NEXT:   %14 = bitcast i32* %11 to i8*
+; CHECK-NEXT:   %15 = sub i64 0, %12
+; CHECK-NEXT:   %16 = getelementptr i8, i8* %14, i64 %15
+; CHECK-NEXT:   %17 = getelementptr i8, i8* %16, i64 %13
+; CHECK-NEXT:   %18 = bitcast i8* %17 to i32*
+; CHECK-NEXT:   store i32* %18, i32** %gep_vla, align 8
+; CHECK-NEXT:   %19 = load i32*, i32** %gep_vla, align 8
+; CHECK-NEXT:   call void @nanos6_unpacked_task_region_foo0(i32* %9, i32* %19, i64 %load_capt_gep, i8* %device_env, %nanos6_address_translation_entry_t* %address_translation_table)
 ; CHECK-NEXT:   ret void
 ; CHECK-NEXT: }
 
@@ -137,7 +135,7 @@ entry:
 arrayctor.loop:                                   ; preds = %arrayctor.loop, %entry
   %arrayctor.dst.cur = phi i32* [ %3, %entry ], [ %arrayctor.dst.next, %arrayctor.loop ]
   %arrayctor.src.cur = phi i32* [ %4, %entry ], [ %arrayctor.src.next, %arrayctor.loop ]
-  store i32 0, i32* %3, align 4
+  store i32 0, i32* %arrayctor.dst.cur, align 4
   %arrayctor.dst.next = getelementptr inbounds i32, i32* %arrayctor.dst.cur, i64 1
   %arrayctor.src.next = getelementptr inbounds i32, i32* %arrayctor.src.cur, i64 1
   %arrayctor.done = icmp eq i32* %arrayctor.dst.next, %arrayctor.dst.end
@@ -237,40 +235,38 @@ entry:
 ; CHECK: define internal void @nanos6_ol_task_region_foo10(%nanos6_task_args_foo10* %task_args, i8* %device_env, %nanos6_address_translation_entry_t* %address_translation_table) {
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT:   %gep_n.addr = getelementptr %nanos6_task_args_foo10, %nanos6_task_args_foo10* %task_args, i32 0, i32 0
-; CHECK-NEXT:   %load_gep_n.addr = load i32*, i32** %gep_n.addr
+; CHECK-NEXT:   %load_gep_n.addr = load i32*, i32** %gep_n.addr, align 8
 ; CHECK-NEXT:   %gep_vla = getelementptr %nanos6_task_args_foo10, %nanos6_task_args_foo10* %task_args, i32 0, i32 1
-; CHECK-NEXT:   %load_gep_vla = load i32*, i32** %gep_vla
+; CHECK-NEXT:   %load_gep_vla = load i32*, i32** %gep_vla, align 8
 ; CHECK-NEXT:   %capt_gep = getelementptr %nanos6_task_args_foo10, %nanos6_task_args_foo10* %task_args, i32 0, i32 2
-; CHECK-NEXT:   %load_capt_gep = load i64, i64* %capt_gep
+; CHECK-NEXT:   %load_capt_gep = load i64, i64* %capt_gep, align 8
 ; CHECK-NEXT:   %0 = call %struct._depend_unpack_t.1 @compute_dep.4(i32* %load_gep_n.addr)
 ; CHECK-NEXT:   %1 = extractvalue %struct._depend_unpack_t.1 %0, 0
-; CHECK-NEXT:   %2 = alloca i32*
 ; CHECK-NEXT:   %local_lookup_n.addr = getelementptr %nanos6_address_translation_entry_t, %nanos6_address_translation_entry_t* %address_translation_table, i32 0, i32 0
-; CHECK-NEXT:   %3 = load i64, i64* %local_lookup_n.addr
+; CHECK-NEXT:   %2 = load i64, i64* %local_lookup_n.addr, align 8
 ; CHECK-NEXT:   %device_lookup_n.addr = getelementptr %nanos6_address_translation_entry_t, %nanos6_address_translation_entry_t* %address_translation_table, i32 0, i32 1
-; CHECK-NEXT:   %4 = load i64, i64* %device_lookup_n.addr
-; CHECK-NEXT:   %5 = bitcast i32* %1 to i8*
-; CHECK-NEXT:   %6 = sub i64 0, %3
-; CHECK-NEXT:   %7 = getelementptr i8, i8* %5, i64 %6
-; CHECK-NEXT:   %8 = getelementptr i8, i8* %7, i64 %4
-; CHECK-NEXT:   %9 = bitcast i8* %8 to i32*
-; CHECK-NEXT:   store i32* %9, i32** %2
-; CHECK-NEXT:   %10 = load i32*, i32** %2
-; CHECK-NEXT:   %11 = call %struct._depend_unpack_t.2 @compute_dep.5(i32* %load_gep_vla, i64 %load_capt_gep)
-; CHECK-NEXT:   %12 = extractvalue %struct._depend_unpack_t.2 %11, 0
-; CHECK-NEXT:   %13 = alloca i32*
+; CHECK-NEXT:   %3 = load i64, i64* %device_lookup_n.addr, align 8
+; CHECK-NEXT:   %4 = bitcast i32* %1 to i8*
+; CHECK-NEXT:   %5 = sub i64 0, %2
+; CHECK-NEXT:   %6 = getelementptr i8, i8* %4, i64 %5
+; CHECK-NEXT:   %7 = getelementptr i8, i8* %6, i64 %3
+; CHECK-NEXT:   %8 = bitcast i8* %7 to i32*
+; CHECK-NEXT:   store i32* %8, i32** %gep_n.addr, align 8
+; CHECK-NEXT:   %9 = load i32*, i32** %gep_n.addr, align 8
+; CHECK-NEXT:   %10 = call %struct._depend_unpack_t.2 @compute_dep.5(i32* %load_gep_vla, i64 %load_capt_gep)
+; CHECK-NEXT:   %11 = extractvalue %struct._depend_unpack_t.2 %10, 0
 ; CHECK-NEXT:   %local_lookup_vla = getelementptr %nanos6_address_translation_entry_t, %nanos6_address_translation_entry_t* %address_translation_table, i32 1, i32 0
-; CHECK-NEXT:   %14 = load i64, i64* %local_lookup_vla
+; CHECK-NEXT:   %12 = load i64, i64* %local_lookup_vla, align 8
 ; CHECK-NEXT:   %device_lookup_vla = getelementptr %nanos6_address_translation_entry_t, %nanos6_address_translation_entry_t* %address_translation_table, i32 1, i32 1
-; CHECK-NEXT:   %15 = load i64, i64* %device_lookup_vla
-; CHECK-NEXT:   %16 = bitcast i32* %12 to i8*
-; CHECK-NEXT:   %17 = sub i64 0, %14
-; CHECK-NEXT:   %18 = getelementptr i8, i8* %16, i64 %17
-; CHECK-NEXT:   %19 = getelementptr i8, i8* %18, i64 %15
-; CHECK-NEXT:   %20 = bitcast i8* %19 to i32*
-; CHECK-NEXT:   store i32* %20, i32** %13
-; CHECK-NEXT:   %21 = load i32*, i32** %13
-; CHECK-NEXT:   call void @nanos6_unpacked_task_region_foo10(i32* %10, i32* %21, i64 %load_capt_gep, i8* %device_env, %nanos6_address_translation_entry_t* %address_translation_table)
+; CHECK-NEXT:   %13 = load i64, i64* %device_lookup_vla, align 8
+; CHECK-NEXT:   %14 = bitcast i32* %11 to i8*
+; CHECK-NEXT:   %15 = sub i64 0, %12
+; CHECK-NEXT:   %16 = getelementptr i8, i8* %14, i64 %15
+; CHECK-NEXT:   %17 = getelementptr i8, i8* %16, i64 %13
+; CHECK-NEXT:   %18 = bitcast i8* %17 to i32*
+; CHECK-NEXT:   store i32* %18, i32** %gep_vla, align 8
+; CHECK-NEXT:   %19 = load i32*, i32** %gep_vla, align 8
+; CHECK-NEXT:   call void @nanos6_unpacked_task_region_foo10(i32* %9, i32* %19, i64 %load_capt_gep, i8* %device_env, %nanos6_address_translation_entry_t* %address_translation_table)
 ; CHECK-NEXT:   ret void
 ; CHECK-NEXT: }
 
@@ -314,7 +310,7 @@ entry:
 arrayctor.loop:                                   ; preds = %arrayctor.loop, %entry
   %arrayctor.dst.cur = phi i32* [ %3, %entry ], [ %arrayctor.dst.next, %arrayctor.loop ]
   %arrayctor.src.cur = phi i32* [ %4, %entry ], [ %arrayctor.src.next, %arrayctor.loop ]
-  store i32 0, i32* %3, align 4
+  store i32 0, i32* %arrayctor.dst.cur, align 4
   %arrayctor.dst.next = getelementptr inbounds i32, i32* %arrayctor.dst.cur, i64 1
   %arrayctor.src.next = getelementptr inbounds i32, i32* %arrayctor.src.cur, i64 1
   %arrayctor.done = icmp eq i32* %arrayctor.dst.next, %arrayctor.dst.end