Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
Original file line number Diff line number Diff line change
Expand Up @@ -2371,6 +2371,7 @@ class OpenMPIRBuilder {
BasicBlock *EntryBB, *ExitBB, *OuterAllocBB;
SmallVector<BasicBlock *> OuterDeallocBBs;
SmallVector<Value *, 2> ExcludeArgsFromAggregate;
bool FixUpNonEntryAllocas = false;

LLVM_ABI virtual ~OutlineInfo() = default;

Expand Down
36 changes: 35 additions & 1 deletion llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/PostDominators.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Bitcode/BitcodeReader.h"
Expand Down Expand Up @@ -890,6 +891,28 @@ static void raiseUserConstantDataAllocasToEntryBlock(IRBuilderBase &Builder,
}
}

/// Relocate every non-array `alloca` found in \p Block to the entry block of
/// the function that contains \p Block. The moved instructions are placed
/// immediately before the entry block's terminator, keeping the order in which
/// they appeared in \p Block.
static void hoistNonEntryAllocasToEntryBlock(llvm::BasicBlock &Block) {
  // Gather the candidates first and move them afterwards, so that the block is
  // not modified while it is being traversed.
  llvm::SmallVector<llvm::Instruction *> Hoistable;
  for (llvm::Instruction &I : Block) {
    auto *Alloca = llvm::dyn_cast<llvm::AllocaInst>(&I);
    if (!Alloca)
      continue;

    // TODO: For now, only simple static allocations are supported. Non-static
    // ones might need to be moved as well; however, that requires further
    // analysis so that the length arguments are moved along with them.
    if (Alloca->isArrayAllocation())
      continue;

    Hoistable.push_back(Alloca);
  }

  llvm::BasicBlock &EntryBlock = Block.getParent()->getEntryBlock();
  auto BeforeEntryTerminator = EntryBlock.getTerminator()->getIterator();

  for (llvm::Instruction *Alloca : Hoistable)
    Alloca->moveBefore(BeforeEntryTerminator);
}

void OpenMPIRBuilder::finalize(Function *Fn) {
SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
SmallVector<BasicBlock *, 32> Blocks;
Expand Down Expand Up @@ -990,6 +1013,13 @@ void OpenMPIRBuilder::finalize(Function *Fn) {
// Run a user callback, e.g. to add attributes.
if (OI->PostOutlineCB)
OI->PostOutlineCB(*OutlinedFn);

if (OI->FixUpNonEntryAllocas) {
PostDominatorTree PostDomTree(*OutlinedFn);
for (llvm::BasicBlock &BB : *OutlinedFn)
if (PostDomTree.properlyDominates(&BB, &OutlinedFn->getEntryBlock()))
hoistNonEntryAllocasToEntryBlock(BB);
}
}

// Remove work items that have been completed.
Expand Down Expand Up @@ -1908,18 +1938,22 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createParallel(
if (Config.isTargetDevice()) {
// Generate OpenMP target specific runtime call
OI->PostOutlineCB = [=, ToBeDeletedVec =
std::move(ToBeDeleted)](Function &OutlinedFn) {
std::move(ToBeDeleted)](Function &OutlinedFn) {
targetParallelCallback(this, OutlinedFn, OuterFn, OuterAllocaBlock, Ident,
IfCondition, NumThreads, PrivTID, PrivTIDAddrAcast,
ThreadID, ToBeDeletedVec);


};
OI->FixUpNonEntryAllocas = true;
} else {
// Generate OpenMP host runtime call
OI->PostOutlineCB = [=, ToBeDeletedVec =
std::move(ToBeDeleted)](Function &OutlinedFn) {
hostParallelCallback(this, OutlinedFn, OuterFn, Ident, IfCondition,
PrivTID, PrivTIDAddrAcast, ToBeDeletedVec);
};
// TODO: fix-up allocations on the host as well?
}

OI->OuterAllocBB = OuterAllocaBlock;
Expand Down
91 changes: 91 additions & 0 deletions mlir/test/Target/LLVMIR/openmp-private-allloca-hoisting.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
// Tests that static allocas in `omp.private ... init` regions are hoisted to
// the parent construct's alloca IP.
// RUN: mlir-translate -mlir-to-llvmir -split-input-file %s | FileCheck %s

// Test module configured as an OpenMP target-device (GPU) module: the triple is
// amdgcn-amd-amdhsa and the data layout places allocas in address space 5.
module attributes {dlti.dl_spec = #dlti.dl_spec<"dlti.alloca_memory_space" = 5 : ui64, "dlti.global_memory_space" = 1 : ui64>, llvm.target_triple = "amdgcn-amd-amdhsa", omp.is_gpu = true, omp.is_target_device = true} {
// External declarations; each one is called from a different branch of the
// privatizer init regions below.
llvm.func @foo1()
llvm.func @foo2()
llvm.func @foo3()
llvm.func @foo4()

// First privatizer. Its init region spans several blocks (^bb0..^bb3) and
// contains a single-element alloca of `!llvm.struct<(i64)>` in ^bb0.
omp.private {type = private} @multi_block.privatizer : f32 init {
^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr):
%0 = llvm.mlir.constant(1 : i32) : i32
// The alloca under test; its result is otherwise unused in this region.
%alloca1 = llvm.alloca %0 x !llvm.struct<(i64)> {alignment = 8 : i64} : (i32) -> !llvm.ptr<5>

%1 = llvm.load %arg0 : !llvm.ptr -> f32

// Compare the constants 1 and 2 and branch on the result, which gives the
// region its multi-block control flow.
%c1 = llvm.mlir.constant(1 : i32) : i32
%c2 = llvm.mlir.constant(2 : i32) : i32
%cond1 = llvm.icmp "eq" %c1, %c2 : i32
llvm.cond_br %cond1, ^bb1, ^bb2

^bb1:
llvm.call @foo1() : () -> ()
llvm.br ^bb3

^bb2:
llvm.call @foo2() : () -> ()
llvm.br ^bb3

// Join block: store the value loaded from %arg0 into %arg1 and yield %arg1.
^bb3:
llvm.store %1, %arg1 : f32, !llvm.ptr

omp.yield(%arg1 : !llvm.ptr)
}

// Second privatizer. Same shape as the first one, but it allocates
// `!llvm.struct<(ptr)>` and calls @foo3/@foo4 in its branches.
omp.private {type = private} @multi_block.privatizer2 : f32 init {
^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr):
%0 = llvm.mlir.constant(1 : i32) : i32
// The alloca under test; its result is otherwise unused in this region.
%alloca1 = llvm.alloca %0 x !llvm.struct<(ptr)> {alignment = 8 : i64} : (i32) -> !llvm.ptr<5>

%1 = llvm.load %arg0 : !llvm.ptr -> f32

// Compare the constants 1 and 2 and branch on the result, which gives the
// region its multi-block control flow.
%c1 = llvm.mlir.constant(1 : i32) : i32
%c2 = llvm.mlir.constant(2 : i32) : i32
%cond1 = llvm.icmp "eq" %c1, %c2 : i32
llvm.cond_br %cond1, ^bb1, ^bb2

^bb1:
llvm.call @foo3() : () -> ()
llvm.br ^bb3

^bb2:
llvm.call @foo4() : () -> ()
llvm.br ^bb3

// Join block: store the value loaded from %arg0 into %arg1 and yield %arg1.
^bb3:
llvm.store %1, %arg1 : f32, !llvm.ptr

omp.yield(%arg1 : !llvm.ptr)
}

// Function under test: an `omp.target` region wrapping an `omp.parallel` that
// privatizes two pointers, one with each of the privatizers defined above.
llvm.func @parallel_op_private_multi_block(%arg0: !llvm.ptr, %arg1: !llvm.ptr) {
// Both function arguments are mapped by reference with `is_device_ptr`.
%arg0_map = omp.map.info var_ptr(%arg0 : !llvm.ptr, !llvm.ptr)
map_clauses(is_device_ptr) capture(ByRef) -> !llvm.ptr {name = ""}
%arg1_map = omp.map.info var_ptr(%arg1 : !llvm.ptr, !llvm.ptr)
map_clauses(is_device_ptr) capture(ByRef) -> !llvm.ptr {name = ""}

omp.target map_entries(%arg0_map -> %arg0_arg, %arg1_map -> %arg1_arg : !llvm.ptr, !llvm.ptr) {
omp.parallel private(@multi_block.privatizer %arg0_arg -> %arg2,
@multi_block.privatizer2 %arg1_arg -> %arg3 : !llvm.ptr, !llvm.ptr) {
// Load from both private copies inside the parallel region.
%0 = llvm.load %arg2 : !llvm.ptr -> f32
%1 = llvm.load %arg3 : !llvm.ptr -> f32
omp.terminator
}
omp.terminator
}
llvm.return
}
}

// CHECK: call void @__kmpc_parallel_60(ptr addrspacecast (ptr addrspace(1) @3 to ptr), i32 %omp_global_thread_num, i32 1, i32 -1, i32 -1, ptr @[[OUTLINED_FN:.*]], ptr @{{.*}}, ptr %8, i64 1, i32 0)

// CHECK: define internal void @[[OUTLINED_FN]]({{.*}}) {{.*}} {
// CHECK: omp.par.entry:
// Verify that both allocas were hoisted to the parallel region's entry block.
// CHECK: %{{.*}} = alloca { i64 }, align 8
// CHECK-NEXT: %{{.*}} = alloca { ptr }, align 8
// CHECK-NEXT: br label %omp.region.after_alloca1
// CHECK: omp.region.after_alloca1:
// CHECK: }