diff --git a/llvm/docs/KernelInfo.rst b/llvm/docs/KernelInfo.rst
new file mode 100644
index 0000000000000..dac642f1ffc65
--- /dev/null
+++ b/llvm/docs/KernelInfo.rst
@@ -0,0 +1,63 @@
+==========
+KernelInfo
+==========
+
+.. contents::
+   :local:
+
+Introduction
+============
+
+This LLVM IR pass reports various statistics for code compiled for GPUs. The
+goal of these statistics is to help identify bad code patterns and ways to
+mitigate them. The pass operates at the LLVM IR level so that it can, in
+theory, support any LLVM-based compiler for programming languages that target
+GPUs.
+
+By default, the pass runs at the end of LTO, and options like
+``-Rpass=kernel-info`` enable its remarks. Example ``opt`` and ``clang``
+command lines appear in the next section.
+
+Remarks include summary statistics (e.g., total size of static allocas) and
+individual occurrences (e.g., source location of each alloca). Examples of the
+output appear in tests in ``llvm/test/Analysis/KernelInfo``.
+
+Example Command Lines
+=====================
+
+To analyze a C program as it appears to an LLVM GPU backend at the end of LTO:
+
+.. code-block:: shell
+
+  $ clang -O2 -g -fopenmp --offload-arch=native test.c -foffload-lto \
+      -Rpass=kernel-info
+
+To analyze specified LLVM IR, perhaps previously generated by something like
+``clang -save-temps -g -fopenmp --offload-arch=native test.c``:
+
+.. code-block:: shell
+
+  $ opt -disable-output test-openmp-nvptx64-nvidia-cuda-sm_70.bc \
+      -pass-remarks=kernel-info -passes=kernel-info
+
+When specifying an LLVM pass pipeline on the command line, ``kernel-info``
+still runs at the end of LTO by default. ``-no-kernel-info-end-lto`` disables
+that behavior so you can position ``kernel-info`` explicitly:
+
+.. code-block:: shell
+
+  $ clang -O2 -g -fopenmp --offload-arch=native test.c -foffload-lto \
+      -Rpass=kernel-info \
+      -Xoffload-linker --lto-newpm-passes='lto<O2>'
+
+  $ clang -O2 -g -fopenmp --offload-arch=native test.c -foffload-lto \
+      -Rpass=kernel-info -mllvm -no-kernel-info-end-lto \
+      -Xoffload-linker --lto-newpm-passes='module(kernel-info),lto<O2>'
+
+  $ opt -disable-output test-openmp-nvptx64-nvidia-cuda-sm_70.bc \
+      -pass-remarks=kernel-info \
+      -passes='lto<O2>'
+
+  $ opt -disable-output test-openmp-nvptx64-nvidia-cuda-sm_70.bc \
+      -pass-remarks=kernel-info -no-kernel-info-end-lto \
+      -passes='module(kernel-info),lto<O2>'
diff --git a/llvm/docs/Passes.rst b/llvm/docs/Passes.rst
index 5e436db62be3a..435e748199b87 100644
--- a/llvm/docs/Passes.rst
+++ b/llvm/docs/Passes.rst
@@ -5,6 +5,11 @@ LLVM's Analysis and Transform Passes
 .. contents::
     :local:
 
+.. toctree::
+   :hidden:
+
+   KernelInfo
+
 Introduction
 ============
 .. warning:: This document is not updated frequently, and the list of passes
@@ -148,6 +153,12 @@ This pass collects the count of all instructions and reports them.
 Bookkeeping for "interesting" users of expressions computed from induction
 variables.
 
+``kernel-info``: GPU Kernel Info
+--------------------------------
+
+Reports various statistics for code compiled for GPUs. This pass is
+:doc:`documented separately <KernelInfo>`.
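For a quick sense of what the pass reports, its remarks look like the following lines, taken verbatim from the CHECK patterns of the tests added under ``llvm/test/Analysis/KernelInfo`` later in this patch:

  remark: test.c:14:9: in artificial function 'h', alloca ('%i') for 'i' with static size of 4 bytes
  remark: test.c:16:9: in artificial function 'h', alloca ('%adyn') for 'adyn' with dynamic size
  remark: test.c:13:0: in artificial function 'h', Allocas = 7
  remark: test.c:13:0: in artificial function 'h', AllocasStaticSizeSum = 28
  remark: test.c:13:0: in artificial function 'h', AllocasDyn = 2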
+
 ``lazy-value-info``: Lazy Value Information Analysis
 ----------------------------------------------------
diff --git a/llvm/include/llvm/Analysis/KernelInfo.h b/llvm/include/llvm/Analysis/KernelInfo.h
new file mode 100644
index 0000000000000..75d92c202212b
--- /dev/null
+++ b/llvm/include/llvm/Analysis/KernelInfo.h
@@ -0,0 +1,35 @@
+//=- KernelInfo.h - Kernel Analysis -------------------------------*- C++ -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the KernelInfoPrinter class used to emit remarks about
+// function properties from a GPU kernel.
+//
+// See llvm/docs/KernelInfo.rst.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ANALYSIS_KERNELINFO_H
+#define LLVM_ANALYSIS_KERNELINFO_H
+
+#include "llvm/IR/PassManager.h"
+
+namespace llvm {
+
+class TargetMachine;
+
+class KernelInfoPrinter : public PassInfoMixin<KernelInfoPrinter> {
+  TargetMachine *TM;
+
+public:
+  explicit KernelInfoPrinter(TargetMachine *TM) : TM(TM) {}
+
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+
+  static bool isRequired() { return true; }
+};
+} // namespace llvm
+#endif // LLVM_ANALYSIS_KERNELINFO_H
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 71b204f9c3fec..bcd3de9b7a6e4 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1886,6 +1886,11 @@ class TargetTransformInfo {
 
   /// @}
 
+  /// Collect kernel launch bounds for \p F into \p LB.
+  void collectKernelLaunchBounds(
+      const Function &F,
+      SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const;
+
 private:
   /// The abstract base class used to type erase specific TTI
   /// implementations.
@@ -2324,6 +2329,9 @@ class TargetTransformInfo::Concept {
   virtual unsigned getMaxNumArgs() const = 0;
   virtual unsigned getNumBytesToPadGlobalArray(unsigned Size,
                                                Type *ArrayType) const = 0;
+  virtual void collectKernelLaunchBounds(
+      const Function &F,
+      SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const = 0;
 };
 
 template <typename T>
@@ -3169,6 +3177,12 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
                                        Type *ArrayType) const override {
     return Impl.getNumBytesToPadGlobalArray(Size, ArrayType);
   }
+
+  void collectKernelLaunchBounds(
+      const Function &F,
+      SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const override {
+    Impl.collectKernelLaunchBounds(F, LB);
+  }
 };
 
 template <typename T>
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index dcef4a1abcfa3..b51663adcd8d0 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -1049,6 +1049,10 @@ class TargetTransformInfoImplBase {
     return 0;
   }
 
+  void collectKernelLaunchBounds(
+      const Function &F,
+      SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const {}
+
 protected:
   // Obtain the minimum required size to hold the value (without the sign)
   // In case of a vector it returns the min required size for one element.
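As a reference for other targets, a minimal implementation of the new TTI hook follows the same shape as the AMDGPU and NVPTX versions later in this patch. The following is only a sketch: ``MyTTIImpl`` and the ``"my-max-threads"`` attribute are hypothetical placeholders, not part of the patch.

  // Sketch only: MyTTIImpl and "my-max-threads" are placeholders. kernel-info
  // emits one "name = value" remark per (name, value) pair pushed here.
  void MyTTIImpl::collectKernelLaunchBounds(
      const Function &F,
      SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const {
    if (F.hasFnAttribute("my-max-threads")) {
      int64_t Val = F.getFnAttributeAsParsedInteger("my-max-threads");
      LB.push_back({"my-max-threads", Val});
    }
  }

Targets with no launch-bound attributes can simply rely on the empty default in ``TargetTransformInfoImplBase`` above.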
diff --git a/llvm/include/llvm/IR/Function.h b/llvm/include/llvm/IR/Function.h index e7afcbd31420c..fcd5396ccfdbc 100644 --- a/llvm/include/llvm/IR/Function.h +++ b/llvm/include/llvm/IR/Function.h @@ -284,6 +284,18 @@ class LLVM_ABI Function : public GlobalObject, public ilist_node { setValueSubclassData((getSubclassDataFromValue() & 0xc00f) | (ID << 4)); } + /// Does it have a kernel calling convention? + bool hasKernelCallingConv() const { + switch (getCallingConv()) { + default: + return false; + case CallingConv::PTX_Kernel: + case CallingConv::AMDGPU_KERNEL: + case CallingConv::SPIR_KERNEL: + return true; + } + } + enum ProfileCountType { PCT_Real, PCT_Synthetic }; /// Class to represent profile counts. diff --git a/llvm/include/llvm/Target/TargetMachine.h b/llvm/include/llvm/Target/TargetMachine.h index 9bdb110bd3683..ec581ae5f5b8e 100644 --- a/llvm/include/llvm/Target/TargetMachine.h +++ b/llvm/include/llvm/Target/TargetMachine.h @@ -19,6 +19,7 @@ #include "llvm/MC/MCStreamer.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/CodeGen.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Error.h" #include "llvm/Support/PGOOptions.h" #include "llvm/Target/CGPassBuilderOption.h" @@ -28,6 +29,8 @@ #include #include +extern llvm::cl::opt NoKernelInfoEndLTO; + namespace llvm { class AAManager; diff --git a/llvm/lib/Analysis/CMakeLists.txt b/llvm/lib/Analysis/CMakeLists.txt index 0db5b80f336cb..a44f6c6a135ef 100644 --- a/llvm/lib/Analysis/CMakeLists.txt +++ b/llvm/lib/Analysis/CMakeLists.txt @@ -79,6 +79,7 @@ add_llvm_component_library(LLVMAnalysis InstructionPrecedenceTracking.cpp InstructionSimplify.cpp InteractiveModelRunner.cpp + KernelInfo.cpp LastRunTrackingAnalysis.cpp LazyBranchProbabilityInfo.cpp LazyBlockFrequencyInfo.cpp diff --git a/llvm/lib/Analysis/KernelInfo.cpp b/llvm/lib/Analysis/KernelInfo.cpp new file mode 100644 index 0000000000000..4a06fd5943089 --- /dev/null +++ b/llvm/lib/Analysis/KernelInfo.cpp @@ -0,0 +1,326 @@ +//===- KernelInfo.cpp - Kernel Analysis -----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the KernelInfoPrinter class used to emit remarks about +// function properties from a GPU kernel. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Analysis/KernelInfo.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/IR/DebugInfo.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Metadata.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/PassManager.h" +#include "llvm/Passes/PassBuilder.h" +#include "llvm/Target/TargetMachine.h" + +using namespace llvm; + +#define DEBUG_TYPE "kernel-info" + +namespace { + +/// Data structure holding function info for kernels. +class KernelInfo { + void updateForBB(const BasicBlock &BB, OptimizationRemarkEmitter &ORE); + +public: + static void emitKernelInfo(Function &F, FunctionAnalysisManager &FAM, + TargetMachine *TM); + + /// Whether the function has external linkage and is not a kernel function. + bool ExternalNotKernel = false; + + /// Launch bounds. 
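+  /// Each entry is a (name, value) pair, e.g. ("omp_target_num_teams", 100);
+  /// emitKernelInfo reports one remark per entry.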
+ SmallVector> LaunchBounds; + + /// The number of alloca instructions inside the function, the number of those + /// with allocation sizes that cannot be determined at compile time, and the + /// sum of the sizes that can be. + /// + /// With the current implementation for at least some GPU archs, + /// AllocasDyn > 0 might not be possible, but we report AllocasDyn anyway in + /// case the implementation changes. + int64_t Allocas = 0; + int64_t AllocasDyn = 0; + int64_t AllocasStaticSizeSum = 0; + + /// Number of direct/indirect calls (anything derived from CallBase). + int64_t DirectCalls = 0; + int64_t IndirectCalls = 0; + + /// Number of direct calls made from this function to other functions + /// defined in this module. + int64_t DirectCallsToDefinedFunctions = 0; + + /// Number of direct calls to inline assembly. + int64_t InlineAssemblyCalls = 0; + + /// Number of calls of type InvokeInst. + int64_t Invokes = 0; + + /// Target-specific flat address space. + unsigned FlatAddrspace; + + /// Number of flat address space memory accesses (via load, store, etc.). + int64_t FlatAddrspaceAccesses = 0; +}; + +} // end anonymous namespace + +static void identifyCallee(OptimizationRemark &R, const Module *M, + const Value *V, StringRef Kind = "") { + SmallString<100> Name; // might be function name or asm expression + if (const Function *F = dyn_cast(V)) { + if (auto *SubProgram = F->getSubprogram()) { + if (SubProgram->isArtificial()) + R << "artificial "; + Name = SubProgram->getName(); + } + } + if (Name.empty()) { + raw_svector_ostream OS(Name); + V->printAsOperand(OS, /*PrintType=*/false, M); + } + if (!Kind.empty()) + R << Kind << " "; + R << "'" << Name << "'"; +} + +static void identifyFunction(OptimizationRemark &R, const Function &F) { + identifyCallee(R, F.getParent(), &F, "function"); +} + +static void remarkAlloca(OptimizationRemarkEmitter &ORE, const Function &Caller, + const AllocaInst &Alloca, + TypeSize::ScalarTy StaticSize) { + ORE.emit([&] { + StringRef DbgName; + DebugLoc Loc; + bool Artificial = false; + auto DVRs = findDVRDeclares(&const_cast(Alloca)); + if (!DVRs.empty()) { + const DbgVariableRecord &DVR = **DVRs.begin(); + DbgName = DVR.getVariable()->getName(); + Loc = DVR.getDebugLoc(); + Artificial = DVR.Variable->isArtificial(); + } + OptimizationRemark R(DEBUG_TYPE, "Alloca", DiagnosticLocation(Loc), + Alloca.getParent()); + R << "in "; + identifyFunction(R, Caller); + R << ", "; + if (Artificial) + R << "artificial "; + SmallString<20> ValName; + raw_svector_ostream OS(ValName); + Alloca.printAsOperand(OS, /*PrintType=*/false, Caller.getParent()); + R << "alloca ('" << ValName << "') "; + if (!DbgName.empty()) + R << "for '" << DbgName << "' "; + else + R << "without debug info "; + R << "with "; + if (StaticSize) + R << "static size of " << itostr(StaticSize) << " bytes"; + else + R << "dynamic size"; + return R; + }); +} + +static void remarkCall(OptimizationRemarkEmitter &ORE, const Function &Caller, + const CallBase &Call, StringRef CallKind, + StringRef RemarkKind) { + ORE.emit([&] { + OptimizationRemark R(DEBUG_TYPE, RemarkKind, &Call); + R << "in "; + identifyFunction(R, Caller); + R << ", " << CallKind << ", callee is "; + identifyCallee(R, Caller.getParent(), Call.getCalledOperand()); + return R; + }); +} + +static void remarkFlatAddrspaceAccess(OptimizationRemarkEmitter &ORE, + const Function &Caller, + const Instruction &Inst) { + ORE.emit([&] { + OptimizationRemark R(DEBUG_TYPE, "FlatAddrspaceAccess", &Inst); + R << "in "; + identifyFunction(R, 
Caller); + if (const IntrinsicInst *II = dyn_cast(&Inst)) { + R << ", '" << II->getCalledFunction()->getName() << "' call"; + } else { + R << ", '" << Inst.getOpcodeName() << "' instruction"; + } + if (!Inst.getType()->isVoidTy()) { + SmallString<20> Name; + raw_svector_ostream OS(Name); + Inst.printAsOperand(OS, /*PrintType=*/false, Caller.getParent()); + R << " ('" << Name << "')"; + } + R << " accesses memory in flat address space"; + return R; + }); +} + +void KernelInfo::updateForBB(const BasicBlock &BB, + OptimizationRemarkEmitter &ORE) { + const Function &F = *BB.getParent(); + const Module &M = *F.getParent(); + const DataLayout &DL = M.getDataLayout(); + for (const Instruction &I : BB.instructionsWithoutDebug()) { + if (const AllocaInst *Alloca = dyn_cast(&I)) { + ++Allocas; + TypeSize::ScalarTy StaticSize = 0; + if (std::optional Size = Alloca->getAllocationSize(DL)) { + StaticSize = Size->getFixedValue(); + assert(StaticSize <= std::numeric_limits::max()); + AllocasStaticSizeSum += StaticSize; + } else { + ++AllocasDyn; + } + remarkAlloca(ORE, F, *Alloca, StaticSize); + } else if (const CallBase *Call = dyn_cast(&I)) { + SmallString<40> CallKind; + SmallString<40> RemarkKind; + if (Call->isIndirectCall()) { + ++IndirectCalls; + CallKind += "indirect"; + RemarkKind += "Indirect"; + } else { + ++DirectCalls; + CallKind += "direct"; + RemarkKind += "Direct"; + } + if (isa(Call)) { + ++Invokes; + CallKind += " invoke"; + RemarkKind += "Invoke"; + } else { + CallKind += " call"; + RemarkKind += "Call"; + } + if (!Call->isIndirectCall()) { + if (const Function *Callee = Call->getCalledFunction()) { + if (!Callee->isIntrinsic() && !Callee->isDeclaration()) { + ++DirectCallsToDefinedFunctions; + CallKind += " to defined function"; + RemarkKind += "ToDefinedFunction"; + } + } else if (Call->isInlineAsm()) { + ++InlineAssemblyCalls; + CallKind += " to inline assembly"; + RemarkKind += "ToInlineAssembly"; + } + } + remarkCall(ORE, F, *Call, CallKind, RemarkKind); + if (const AnyMemIntrinsic *MI = dyn_cast(Call)) { + if (MI->getDestAddressSpace() == FlatAddrspace) { + ++FlatAddrspaceAccesses; + remarkFlatAddrspaceAccess(ORE, F, I); + } else if (const AnyMemTransferInst *MT = + dyn_cast(MI)) { + if (MT->getSourceAddressSpace() == FlatAddrspace) { + ++FlatAddrspaceAccesses; + remarkFlatAddrspaceAccess(ORE, F, I); + } + } + } + } else if (const LoadInst *Load = dyn_cast(&I)) { + if (Load->getPointerAddressSpace() == FlatAddrspace) { + ++FlatAddrspaceAccesses; + remarkFlatAddrspaceAccess(ORE, F, I); + } + } else if (const StoreInst *Store = dyn_cast(&I)) { + if (Store->getPointerAddressSpace() == FlatAddrspace) { + ++FlatAddrspaceAccesses; + remarkFlatAddrspaceAccess(ORE, F, I); + } + } else if (const AtomicRMWInst *At = dyn_cast(&I)) { + if (At->getPointerAddressSpace() == FlatAddrspace) { + ++FlatAddrspaceAccesses; + remarkFlatAddrspaceAccess(ORE, F, I); + } + } else if (const AtomicCmpXchgInst *At = dyn_cast(&I)) { + if (At->getPointerAddressSpace() == FlatAddrspace) { + ++FlatAddrspaceAccesses; + remarkFlatAddrspaceAccess(ORE, F, I); + } + } + } +} + +static void remarkProperty(OptimizationRemarkEmitter &ORE, const Function &F, + StringRef Name, int64_t Value) { + ORE.emit([&] { + OptimizationRemark R(DEBUG_TYPE, Name, &F); + R << "in "; + identifyFunction(R, F); + R << ", " << Name << " = " << itostr(Value); + return R; + }); +} + +static std::optional parseFnAttrAsInteger(Function &F, + StringRef Name) { + if (!F.hasFnAttribute(Name)) + return std::nullopt; + return 
F.getFnAttributeAsParsedInteger(Name); +} + +void KernelInfo::emitKernelInfo(Function &F, FunctionAnalysisManager &FAM, + TargetMachine *TM) { + KernelInfo KI; + TargetTransformInfo &TheTTI = FAM.getResult(F); + KI.FlatAddrspace = TheTTI.getFlatAddressSpace(); + + // Record function properties. + KI.ExternalNotKernel = F.hasExternalLinkage() && !F.hasKernelCallingConv(); + for (StringRef Name : {"omp_target_num_teams", "omp_target_thread_limit"}) { + if (auto Val = parseFnAttrAsInteger(F, Name)) + KI.LaunchBounds.push_back({Name, *Val}); + } + TheTTI.collectKernelLaunchBounds(F, KI.LaunchBounds); + + auto &ORE = FAM.getResult(F); + for (const auto &BB : F) + KI.updateForBB(BB, ORE); + +#define REMARK_PROPERTY(PROP_NAME) \ + remarkProperty(ORE, F, #PROP_NAME, KI.PROP_NAME) + REMARK_PROPERTY(ExternalNotKernel); + for (auto LB : KI.LaunchBounds) + remarkProperty(ORE, F, LB.first, LB.second); + REMARK_PROPERTY(Allocas); + REMARK_PROPERTY(AllocasStaticSizeSum); + REMARK_PROPERTY(AllocasDyn); + REMARK_PROPERTY(DirectCalls); + REMARK_PROPERTY(IndirectCalls); + REMARK_PROPERTY(DirectCallsToDefinedFunctions); + REMARK_PROPERTY(InlineAssemblyCalls); + REMARK_PROPERTY(Invokes); + REMARK_PROPERTY(FlatAddrspaceAccesses); +#undef REMARK_PROPERTY + + return; +} + +PreservedAnalyses KernelInfoPrinter::run(Function &F, + FunctionAnalysisManager &AM) { + // Skip it if remarks are not enabled as it will do nothing useful. + if (F.getContext().getDiagHandlerPtr()->isPassedOptRemarkEnabled(DEBUG_TYPE)) + KernelInfo::emitKernelInfo(F, AM, TM); + return PreservedAnalyses::all(); +} diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index 8b9722d047edc..424bb7be23383 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -1441,6 +1441,12 @@ TargetTransformInfo::getNumBytesToPadGlobalArray(unsigned Size, return TTIImpl->getNumBytesToPadGlobalArray(Size, ArrayType); } +void TargetTransformInfo::collectKernelLaunchBounds( + const Function &F, + SmallVectorImpl> &LB) const { + return TTIImpl->collectKernelLaunchBounds(F, LB); +} + TargetTransformInfo::Concept::~Concept() = default; TargetIRAnalysis::TargetIRAnalysis() : TTICallback(&getDefaultTTI) {} diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 1e97cef22045d..328da2a185f02 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -46,6 +46,7 @@ #include "llvm/Analysis/InlineAdvisor.h" #include "llvm/Analysis/InlineSizeEstimatorAnalysis.h" #include "llvm/Analysis/InstCount.h" +#include "llvm/Analysis/KernelInfo.h" #include "llvm/Analysis/LastRunTrackingAnalysis.h" #include "llvm/Analysis/LazyCallGraph.h" #include "llvm/Analysis/LazyValueInfo.h" diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index 0eb050c8adb04..9300a3dfca1dc 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -391,6 +391,7 @@ FUNCTION_PASS("irce", IRCEPass()) FUNCTION_PASS("jump-threading", JumpThreadingPass()) FUNCTION_PASS("jump-table-to-switch", JumpTableToSwitchPass()); FUNCTION_PASS("kcfi", KCFIPass()) +FUNCTION_PASS("kernel-info", KernelInfoPrinter(TM)) FUNCTION_PASS("lcssa", LCSSAPass()) FUNCTION_PASS("libcalls-shrinkwrap", LibCallsShrinkWrapPass()) FUNCTION_PASS("lint", LintPass()) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 1f29589146c80..5b2081c8fa213 100644 --- 
a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -53,6 +53,7 @@ #include "Utils/AMDGPUBaseInfo.h" #include "llvm/Analysis/CGSCCPassManager.h" #include "llvm/Analysis/CallGraphSCCPass.h" +#include "llvm/Analysis/KernelInfo.h" #include "llvm/Analysis/UniformityAnalysis.h" #include "llvm/CodeGen/AtomicExpand.h" #include "llvm/CodeGen/DeadMachineInstructionElim.h" @@ -879,6 +880,11 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { PM.addPass(AMDGPUAttributorPass(*this, Opt)); } } + if (!NoKernelInfoEndLTO) { + FunctionPassManager FPM; + FPM.addPass(KernelInfoPrinter(this)); + PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); + } }); PB.registerRegClassFilterParsingCallback( diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 5160851f8c442..5bfd8914b9a46 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -1430,3 +1430,19 @@ unsigned GCNTTIImpl::getPrefetchDistance() const { bool GCNTTIImpl::shouldPrefetchAddressSpace(unsigned AS) const { return AMDGPU::isFlatGlobalAddrSpace(AS); } + +void GCNTTIImpl::collectKernelLaunchBounds( + const Function &F, + SmallVectorImpl> &LB) const { + SmallVector MaxNumWorkgroups = ST->getMaxNumWorkGroups(F); + LB.push_back({"amdgpu-max-num-workgroups[0]", MaxNumWorkgroups[0]}); + LB.push_back({"amdgpu-max-num-workgroups[1]", MaxNumWorkgroups[1]}); + LB.push_back({"amdgpu-max-num-workgroups[2]", MaxNumWorkgroups[2]}); + std::pair FlatWorkGroupSize = + ST->getFlatWorkGroupSizes(F); + LB.push_back({"amdgpu-flat-work-group-size[0]", FlatWorkGroupSize.first}); + LB.push_back({"amdgpu-flat-work-group-size[1]", FlatWorkGroupSize.second}); + std::pair WavesPerEU = ST->getWavesPerEU(F); + LB.push_back({"amdgpu-waves-per-eu[0]", WavesPerEU.first}); + LB.push_back({"amdgpu-waves-per-eu[1]", WavesPerEU.second}); +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h index 585f38fc02c29..a0d62008d9ddc 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -273,6 +273,9 @@ class GCNTTIImpl final : public BasicTTIImplBase { /// \return if target want to issue a prefetch in address space \p AS. 
bool shouldPrefetchAddressSpace(unsigned AS) const override; + void collectKernelLaunchBounds( + const Function &F, + SmallVectorImpl> &LB) const; }; } // end namespace llvm diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp index 6d4b82aa54a2b..e88027f30a03c 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -21,6 +21,7 @@ #include "NVPTXTargetObjectFile.h" #include "NVPTXTargetTransformInfo.h" #include "TargetInfo/NVPTXTargetInfo.h" +#include "llvm/Analysis/KernelInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetPassConfig.h" @@ -266,6 +267,15 @@ void NVPTXTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { FPM.addPass(NVPTXCopyByValArgsPass()); PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); }); + + if (!NoKernelInfoEndLTO) { + PB.registerFullLinkTimeOptimizationLastEPCallback( + [this](ModulePassManager &PM, OptimizationLevel Level) { + FunctionPassManager FPM; + FPM.addPass(KernelInfoPrinter(this)); + PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); + }); + } } TargetTransformInfo diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp index 4ec2ec100ab08..85e99d7fe97a2 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp @@ -562,4 +562,18 @@ Value *NVPTXTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, } } return nullptr; -} \ No newline at end of file +} + +void NVPTXTTIImpl::collectKernelLaunchBounds( + const Function &F, + SmallVectorImpl> &LB) const { + std::optional Val; + if ((Val = getMaxClusterRank(F))) + LB.push_back({"maxclusterrank", *Val}); + if ((Val = getMaxNTIDx(F))) + LB.push_back({"maxntidx", *Val}); + if ((Val = getMaxNTIDy(F))) + LB.push_back({"maxntidy", *Val}); + if ((Val = getMaxNTIDz(F))) + LB.push_back({"maxntidz", *Val}); +} diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h index 0f4fb280b2d99..b0a846a9c7f96 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h +++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h @@ -129,6 +129,10 @@ class NVPTXTTIImpl : public BasicTTIImplBase { Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV, Value *NewV) const; + + void collectKernelLaunchBounds( + const Function &F, + SmallVectorImpl> &LB) const; }; } // end namespace llvm diff --git a/llvm/lib/Target/TargetMachine.cpp b/llvm/lib/Target/TargetMachine.cpp index d5365f3c04743..027ae62007a72 100644 --- a/llvm/lib/Target/TargetMachine.cpp +++ b/llvm/lib/Target/TargetMachine.cpp @@ -26,6 +26,11 @@ #include "llvm/Target/TargetLoweringObjectFile.h" using namespace llvm; +cl::opt NoKernelInfoEndLTO( + "no-kernel-info-end-lto", + cl::desc("remove the kernel-info pass at the end of the full LTO pipeline"), + cl::init(false), cl::Hidden); + //--------------------------------------------------------------------------- // TargetMachine Class // diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp index 10008130016c3..682227916e712 100644 --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -5905,17 +5905,6 @@ bool llvm::omp::isOpenMPKernel(Function &Fn) { return Fn.hasFnAttribute("kernel"); } -static bool isKernelCC(Function &F) { - switch (F.getCallingConv()) 
{ - default: - return false; - case CallingConv::PTX_Kernel: - case CallingConv::AMDGPU_KERNEL: - case CallingConv::SPIR_KERNEL: - return true; - } -} - KernelSet llvm::omp::getDeviceKernels(Module &M) { // TODO: Create a more cross-platform way of determining device kernels. KernelSet Kernels; @@ -5948,7 +5937,7 @@ KernelSet llvm::omp::getDeviceKernels(Module &M) { } for (Function &F : M) - if (isKernelCC(F)) + if (F.hasKernelCallingConv()) ProcessKernel(F); return Kernels; diff --git a/llvm/test/Analysis/KernelInfo/allocas.ll b/llvm/test/Analysis/KernelInfo/allocas.ll new file mode 100644 index 0000000000000..94506645f7ec6 --- /dev/null +++ b/llvm/test/Analysis/KernelInfo/allocas.ll @@ -0,0 +1,117 @@ +; Check info on allocas. + +; RUN: opt -pass-remarks=kernel-info -passes=kernel-info \ +; RUN: -disable-output %s 2>&1 | \ +; RUN: FileCheck -match-full-lines %s + +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +define void @h() !dbg !100 { +entry: + ; CHECK: remark: test.c:0:0: in artificial function 'h', artificial alloca ('%dyn_ptr.addr') for 'dyn_ptr' with static size of 8 bytes + %dyn_ptr.addr = alloca ptr, align 8 + ; CHECK: remark: test.c:14:9: in artificial function 'h', alloca ('%i') for 'i' with static size of 4 bytes + %i = alloca i32, align 4 + ; CHECK: remark: test.c:15:9: in artificial function 'h', alloca ('%a') for 'a' with static size of 8 bytes + %a = alloca [2 x i32], align 4 + %size = load i32, ptr %i, align 4 + ; CHECK: remark: test.c:16:9: in artificial function 'h', alloca ('%adyn') for 'adyn' with dynamic size + %adyn = alloca i32, i32 %size, align 4 + ; CHECK: remark: :0:0: in artificial function 'h', alloca ('%nodbg') without debug info with static size of 4 bytes + %nodbg = alloca i32, align 4 + tail call void @llvm.dbg.declare(metadata ptr %dyn_ptr.addr, metadata !110, metadata !DIExpression()), !dbg !114 + tail call void @llvm.dbg.declare(metadata ptr %i, metadata !120, metadata !DIExpression()), !dbg !121 + tail call void @llvm.dbg.declare(metadata ptr %a, metadata !130, metadata !DIExpression()), !dbg !131 + tail call void @llvm.dbg.declare(metadata ptr %adyn, metadata !140, metadata !DIExpression()), !dbg !141 + br label %non-entry + +non-entry: + ; CHECK: remark: test.c:17:9: in artificial function 'h', alloca ('%i2') for 'i2' with static size of 4 bytes + %i2 = alloca i32, align 4 + %size2 = load i32, ptr %i2, align 4 + ; CHECK: remark: test.c:18:9: in artificial function 'h', alloca ('%adyn2') for 'adyn2' with dynamic size + %adyn2 = alloca i32, i32 %size, align 4 + tail call void @llvm.dbg.declare(metadata ptr %i2, metadata !150, metadata !DIExpression()), !dbg !151 + tail call void @llvm.dbg.declare(metadata ptr %adyn2, metadata !160, metadata !DIExpression()), !dbg !161 + ret void +} +; CHECK: remark: test.c:13:0: in artificial function 'h', Allocas = 7 +; CHECK: remark: test.c:13:0: in artificial function 'h', AllocasStaticSizeSum = 28 +; CHECK: remark: test.c:13:0: in artificial function 'h', AllocasDyn = 2 + +define void @g() !dbg !200 { +entry: + ; CHECK: remark: test.c:4:7: in function 'g', alloca ('%i') for 'i' with static size of 4 bytes + %i = alloca i32, align 4 + ; CHECK: remark: test.c:5:7: in function 'g', alloca ('%a') for 'a' with static size of 8 bytes + %a = alloca [2 x i32], align 4 + tail call void @llvm.dbg.declare(metadata ptr %i, metadata !210, metadata !DIExpression()), !dbg !211 + tail call void @llvm.dbg.declare(metadata ptr %a, metadata !220, metadata !DIExpression()), 
!dbg !221 + ret void +} +; CHECK: remark: test.c:3:0: in function 'g', Allocas = 2 +; CHECK: remark: test.c:3:0: in function 'g', AllocasStaticSizeSum = 12 +; CHECK: remark: test.c:3:0: in function 'g', AllocasDyn = 0 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare void @llvm.dbg.declare(metadata, metadata, metadata) #0 + +; uselistorder directives +uselistorder ptr @llvm.dbg.declare, { 7, 6, 5, 4, 3, 2, 1, 0 } + +attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } + +!llvm.module.flags = !{!0} +!llvm.dbg.cu = !{!1} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !DICompileUnit(language: DW_LANG_C11, file: !2, producer: "clang version 19.0.0git", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) +!2 = !DIFile(filename: "test.c", directory: "/tmp") +!3 = !{null} +!4 = !{} + +!10 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) + +!20 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !21) +!21 = !DIDerivedType(tag: DW_TAG_restrict_type, baseType: !22) +!22 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: null, size: 64) + +!30 = !DICompositeType(tag: DW_TAG_array_type, baseType: !10, size: 64, elements: !31) +!31 = !{!32} +!32 = !DISubrange(count: 2) + +!40 = !DICompositeType(tag: DW_TAG_array_type, baseType: !10, elements: !41) +!41 = !{!42} +!42 = !DISubrange(count: !43) +!43 = !DILocalVariable(name: "__vla_expr0", scope: !100, type: !10, flags: DIFlagArtificial) + +!100 = distinct !DISubprogram(name: "h", scope: !2, file: !2, line: 13, type: !101, scopeLine: 13, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !4) +!101 = distinct !DISubroutineType(types: !3) + +!110 = !DILocalVariable(name: "dyn_ptr", arg: 1, scope: !100, type: !20, flags: DIFlagArtificial) +!114 = !DILocation(line: 0, scope: !100) + +!120 = !DILocalVariable(name: "i", scope: !100, file: !2, line: 14, type: !10) +!121 = !DILocation(line: 14, column: 9, scope: !100) + +!130 = !DILocalVariable(name: "a", scope: !100, file: !2, line: 15, type: !30) +!131 = !DILocation(line: 15, column: 9, scope: !100) + +!140 = !DILocalVariable(name: "adyn", scope: !100, file: !2, line: 16, type: !40) +!141 = !DILocation(line: 16, column: 9, scope: !100) + +!150 = !DILocalVariable(name: "i2", scope: !100, file: !2, line: 17, type: !10) +!151 = !DILocation(line: 17, column: 9, scope: !100) + +!160 = !DILocalVariable(name: "adyn2", scope: !100, file: !2, line: 18, type: !40) +!161 = !DILocation(line: 18, column: 9, scope: !100) + +!200 = distinct !DISubprogram(name: "g", scope: !2, file: !2, line: 3, type: !201, scopeLine: 3, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !4) +!201 = !DISubroutineType(types: !3) + +!210 = !DILocalVariable(name: "i", scope: !200, file: !2, line: 4, type: !10) +!211 = !DILocation(line: 4, column: 7, scope: !200) + +!220 = !DILocalVariable(name: "a", scope: !200, file: !2, line: 5, type: !30) +!221 = !DILocation(line: 5, column: 7, scope: !200) diff --git a/llvm/test/Analysis/KernelInfo/calls.ll b/llvm/test/Analysis/KernelInfo/calls.ll new file mode 100644 index 0000000000000..6a2a5c426b78b --- /dev/null +++ b/llvm/test/Analysis/KernelInfo/calls.ll @@ -0,0 +1,139 @@ +; Check info on calls. 
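+; Each call site below is counted once as direct or indirect and once as call
+; or invoke; direct calls to functions defined in this module and calls to
+; inline assembly are additionally counted under their own properties, matching
+; the classification in KernelInfo::updateForBB.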
+ +; RUN: opt -pass-remarks=kernel-info -passes=kernel-info \ +; RUN: -disable-output %s 2>&1 | \ +; RUN: FileCheck -match-full-lines %s + +target datalayout = "e-i65:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +declare void @personality() + +define void @h() personality ptr @personality !dbg !100 { +entry: + ; CHECK: remark: test.c:16:5: in artificial function 'h_dbg', direct call, callee is '@f' + call void @f(), !dbg !102 + ; CHECK: remark: test.c:17:5: in artificial function 'h_dbg', direct call to defined function, callee is 'g_dbg' + call void @g(), !dbg !104 + ; CHECK: remark: test.c:18:5: in artificial function 'h_dbg', direct call to defined function, callee is artificial 'h_dbg' + call void @h(), !dbg !105 + ; CHECK: remark: test.c:24:5: in artificial function 'h_dbg', direct call to inline assembly, callee is 'asm sideeffect "eieio", ""' + call void asm sideeffect "eieio", ""(), !dbg !111 + %fnPtr = load ptr, ptr null, align 8 + ; CHECK: remark: test.c:19:5: in artificial function 'h_dbg', indirect call, callee is '%fnPtr' + call void %fnPtr(), !dbg !106 + ; CHECK: remark: test.c:20:5: in artificial function 'h_dbg', direct invoke, callee is '@f' + invoke void @f() to label %fcont unwind label %cleanup, !dbg !107 +fcont: + ; CHECK: remark: test.c:21:5: in artificial function 'h_dbg', direct invoke to defined function, callee is 'g_dbg' + invoke void @g() to label %gcont unwind label %cleanup, !dbg !108 +gcont: + ; CHECK: remark: test.c:22:5: in artificial function 'h_dbg', direct invoke to defined function, callee is artificial 'h_dbg' + invoke void @h() to label %hcont unwind label %cleanup, !dbg !109 +hcont: + ; CHECK: remark: test.c:25:5: in artificial function 'h_dbg', direct invoke to inline assembly, callee is 'asm sideeffect "eieio", ""' + invoke void asm sideeffect "eieio", ""() to label %asmcont unwind label %cleanup, !dbg !112 +asmcont: + ; CHECK: remark: test.c:23:5: in artificial function 'h_dbg', indirect invoke, callee is '%fnPtr' + invoke void %fnPtr() to label %end unwind label %cleanup, !dbg !110 +cleanup: + %ll = landingpad { ptr, i32 } + cleanup + br label %end +end: + ret void +} +; CHECK: remark: test.c:13:0: in artificial function 'h_dbg', DirectCalls = 8 +; CHECK: remark: test.c:13:0: in artificial function 'h_dbg', IndirectCalls = 2 +; CHECK: remark: test.c:13:0: in artificial function 'h_dbg', DirectCallsToDefinedFunctions = 4 +; CHECK: remark: test.c:13:0: in artificial function 'h_dbg', InlineAssemblyCalls = 2 +; CHECK: remark: test.c:13:0: in artificial function 'h_dbg', Invokes = 5 + +declare void @f() + +define void @g() personality ptr @personality !dbg !200 { +entry: + ; CHECK: remark: test.c:6:3: in function 'g_dbg', direct call, callee is '@f' + call void @f(), !dbg !202 + ; CHECK: remark: test.c:7:3: in function 'g_dbg', direct call to defined function, callee is 'g_dbg' + call void @g(), !dbg !203 + ; CHECK: remark: test.c:8:3: in function 'g_dbg', direct call to defined function, callee is artificial 'h_dbg' + call void @h(), !dbg !204 + ; CHECK: remark: test.c:14:3: in function 'g_dbg', direct call to inline assembly, callee is 'asm sideeffect "eieio", ""' + call void asm sideeffect "eieio", ""(), !dbg !210 + %fnPtr = load ptr, ptr null, align 8 + ; CHECK: remark: test.c:9:3: in function 'g_dbg', indirect call, callee is '%fnPtr' + call void %fnPtr(), !dbg !205 + ; CHECK: remark: test.c:10:3: in function 'g_dbg', direct invoke, callee is '@f' + invoke void @f() to label %fcont unwind label %cleanup, !dbg 
!206 +fcont: + ; CHECK: remark: test.c:11:3: in function 'g_dbg', direct invoke to defined function, callee is 'g_dbg' + invoke void @g() to label %gcont unwind label %cleanup, !dbg !207 +gcont: + ; CHECK: remark: test.c:12:3: in function 'g_dbg', direct invoke to defined function, callee is artificial 'h_dbg' + invoke void @h() to label %hcont unwind label %cleanup, !dbg !208 +hcont: + ; CHECK: remark: test.c:15:3: in function 'g_dbg', direct invoke to inline assembly, callee is 'asm sideeffect "eieio", ""' + invoke void asm sideeffect "eieio", ""() to label %asmcont unwind label %cleanup, !dbg !211 +asmcont: + ; CHECK: remark: test.c:13:3: in function 'g_dbg', indirect invoke, callee is '%fnPtr' + invoke void %fnPtr() to label %end unwind label %cleanup, !dbg !209 +cleanup: + %ll = landingpad { ptr, i32 } + cleanup + br label %end +end: + ret void +} +; CHECK: remark: test.c:3:0: in function 'g_dbg', DirectCalls = 8 +; CHECK: remark: test.c:3:0: in function 'g_dbg', IndirectCalls = 2 +; CHECK: remark: test.c:3:0: in function 'g_dbg', DirectCallsToDefinedFunctions = 4 +; CHECK: remark: test.c:3:0: in function 'g_dbg', InlineAssemblyCalls = 2 +; CHECK: remark: test.c:3:0: in function 'g_dbg', Invokes = 5 + +define void @i() { + ; CHECK: remark: :0:0: in function '@i', direct call, callee is '@f' + call void @f() + ret void +} +; CHECK: remark: :0:0: in function '@i', DirectCalls = 1 +; CHECK: remark: :0:0: in function '@i', IndirectCalls = 0 +; CHECK: remark: :0:0: in function '@i', DirectCallsToDefinedFunctions = 0 +; CHECK: remark: :0:0: in function '@i', InlineAssemblyCalls = 0 +; CHECK: remark: :0:0: in function '@i', Invokes = 0 + +!llvm.module.flags = !{!0} +!llvm.dbg.cu = !{!1} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !DICompileUnit(language: DW_LANG_C11, file: !2, producer: "clang version 19.0.0git", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) +!2 = !DIFile(filename: "test.c", directory: "/tmp") +!3 = !{null} +!4 = !{} + +!100 = distinct !DISubprogram(name: "h_dbg", scope: !2, file: !2, line: 13, type: !101, scopeLine: 13, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !4) +!101 = distinct !DISubroutineType(types: !3) +!102 = !DILocation(line: 16, column: 5, scope: !103) +!103 = distinct !DILexicalBlock(scope: !100, file: !2, line: 13, column: 3) +!104 = !DILocation(line: 17, column: 5, scope: !103) +!105 = !DILocation(line: 18, column: 5, scope: !103) +!106 = !DILocation(line: 19, column: 5, scope: !103) +!107 = !DILocation(line: 20, column: 5, scope: !103) +!108 = !DILocation(line: 21, column: 5, scope: !103) +!109 = !DILocation(line: 22, column: 5, scope: !103) +!110 = !DILocation(line: 23, column: 5, scope: !103) +!111 = !DILocation(line: 24, column: 5, scope: !103) +!112 = !DILocation(line: 25, column: 5, scope: !103) + +!200 = distinct !DISubprogram(name: "g_dbg", scope: !2, file: !2, line: 3, type: !201, scopeLine: 3, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !4) +!201 = !DISubroutineType(types: !3) +!202 = !DILocation(line: 6, column: 3, scope: !200) +!203 = !DILocation(line: 7, column: 3, scope: !200) +!204 = !DILocation(line: 8, column: 3, scope: !200) +!205 = !DILocation(line: 9, column: 3, scope: !200) +!206 = !DILocation(line: 10, column: 3, scope: !200) +!207 = !DILocation(line: 11, column: 3, scope: !200) +!208 = !DILocation(line: 12, column: 3, scope: !200) +!209 = !DILocation(line: 13, column: 
3, scope: !200) +!210 = !DILocation(line: 14, column: 3, scope: !200) +!211 = !DILocation(line: 15, column: 3, scope: !200) diff --git a/llvm/test/Analysis/KernelInfo/enable-kernel-info/Inputs/test.ll b/llvm/test/Analysis/KernelInfo/enable-kernel-info/Inputs/test.ll new file mode 100644 index 0000000000000..461544e44d538 --- /dev/null +++ b/llvm/test/Analysis/KernelInfo/enable-kernel-info/Inputs/test.ll @@ -0,0 +1,22 @@ +; CHECK: remark: test.c:10:0: in artificial function 'test', omp_target_num_teams = 100 +; NONE-NOT: remark: +define void @test() #0 !dbg !5 { +entry: + ret void +} + +attributes #0 = { + "omp_target_num_teams"="100" +} + +!llvm.module.flags = !{!0} +!llvm.dbg.cu = !{!1} +!nvvm.annotations = !{!6} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !DICompileUnit(language: DW_LANG_C11, file: !2, producer: "clang version 19.0.0git", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) +!2 = !DIFile(filename: "test.c", directory: "/tmp") +!3 = !{} +!4 = !DISubroutineType(types: !3) +!5 = distinct !DISubprogram(name: "test", scope: !2, file: !2, line: 10, type: !4, scopeLine: 10, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !3) +!6 = distinct !{ptr null, !"kernel", i32 1} diff --git a/llvm/test/Analysis/KernelInfo/enable-kernel-info/amdgpu.test b/llvm/test/Analysis/KernelInfo/enable-kernel-info/amdgpu.test new file mode 100644 index 0000000000000..e969eabfe7cd8 --- /dev/null +++ b/llvm/test/Analysis/KernelInfo/enable-kernel-info/amdgpu.test @@ -0,0 +1,18 @@ +; Check when kernel-info is enabled in the AMD GPU target backend. + +; REQUIRES: amdgpu-registered-target + +; DEFINE: %{opt} = opt -disable-output %S/Inputs/test.ll \ +; DEFINE: -mtriple="amdgcn-amd-amdhsa" 2>&1 +; DEFINE: %{fcheck-on} = FileCheck -match-full-lines %S/Inputs/test.ll +; DEFINE: %{fcheck-off} = FileCheck -allow-empty -check-prefixes=NONE \ +; DEFINE: %S/Inputs/test.ll + +; By default, kernel-info is in the LTO pipeline. To see output, the LTO +; pipeline must run, -no-kernel-info-end-lto must not be specified, and remarks +; must be enabled. +; RUN: %{opt} -passes='lto' -pass-remarks=kernel-info | %{fcheck-on} +; RUN: %{opt} -passes='default' -pass-remarks=kernel-info | %{fcheck-off} +; RUN: %{opt} -passes='lto' -pass-remarks=kernel-info \ +; RUN: -no-kernel-info-end-lto | %{fcheck-off} +; RUN: %{opt} -passes='lto' | %{fcheck-off} diff --git a/llvm/test/Analysis/KernelInfo/enable-kernel-info/nvptx.test b/llvm/test/Analysis/KernelInfo/enable-kernel-info/nvptx.test new file mode 100644 index 0000000000000..65249b4d92e34 --- /dev/null +++ b/llvm/test/Analysis/KernelInfo/enable-kernel-info/nvptx.test @@ -0,0 +1,18 @@ +; Check when kernel-info is enabled in the NVPTX target backend. + +; REQUIRES: nvptx-registered-target + +; DEFINE: %{opt} = opt -disable-output %S/Inputs/test.ll \ +; DEFINE: -mtriple="nvptx64-nvidia-cuda" 2>&1 +; DEFINE: %{fcheck-on} = FileCheck -match-full-lines %S/Inputs/test.ll +; DEFINE: %{fcheck-off} = FileCheck -allow-empty -check-prefixes=NONE \ +; DEFINE: %S/Inputs/test.ll + +; By default, kernel-info is in the LTO pipeline. To see output, the LTO +; pipeline must run, -no-kernel-info-end-lto must not be specified, and remarks +; must be enabled. 
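+; The RUN lines below exercise those conditions in turn: the LTO pipeline with
+; remarks enabled (output expected), the default pipeline (no output), the LTO
+; pipeline with -no-kernel-info-end-lto (no output), and the LTO pipeline
+; without remarks enabled (no output).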
+; RUN: %{opt} -passes='lto' -pass-remarks=kernel-info | %{fcheck-on} +; RUN: %{opt} -passes='default' -pass-remarks=kernel-info | %{fcheck-off} +; RUN: %{opt} -passes='lto' -pass-remarks=kernel-info \ +; RUN: -no-kernel-info-end-lto | %{fcheck-off} +; RUN: %{opt} -passes='lto' | %{fcheck-off} diff --git a/llvm/test/Analysis/KernelInfo/flat-addrspace/Inputs/test.ll b/llvm/test/Analysis/KernelInfo/flat-addrspace/Inputs/test.ll new file mode 100644 index 0000000000000..b54c3a18f3e70 --- /dev/null +++ b/llvm/test/Analysis/KernelInfo/flat-addrspace/Inputs/test.ll @@ -0,0 +1,143 @@ +define void @f() !dbg !3 { +entry: + ; load: check remarks for both unnamed and named values. + ; CHECK: remark: test.c:3:11: in function 'f', 'load' instruction ('%0') accesses memory in flat address space + %0 = load i32, ptr null, align 4, !dbg !6 + ; CHECK: remark: test.c:3:11: in function 'f', 'load' instruction ('%load') accesses memory in flat address space + %load = load i32, ptr null, align 4, !dbg !6 + ; CHECK: remark: test.c:3:11: in function 'f', 'load' instruction ('%load0') accesses memory in flat address space + %load0 = load i32, ptr addrspace(0) null, align 4, !dbg !6 + %load1 = load i32, ptr addrspace(1) null, align 4, !dbg !6 + %load2 = load i32, ptr addrspace(2) null, align 4, !dbg !6 + + ; store + ; CHECK: remark: test.c:4:6: in function 'f', 'store' instruction accesses memory in flat address space + store i32 0, ptr null, align 4, !dbg !7 + ; CHECK: remark: test.c:4:6: in function 'f', 'store' instruction accesses memory in flat address space + store i32 0, ptr addrspace(0) null, align 4, !dbg !7 + store i32 0, ptr addrspace(1) null, align 4, !dbg !7 + store i32 0, ptr addrspace(8) null, align 4, !dbg !7 + + ; atomicrmw + ; CHECK: remark: test.c:5:1: in function 'f', 'atomicrmw' instruction ('%[[#]]') accesses memory in flat address space + atomicrmw xchg ptr null, i32 10 seq_cst, !dbg !8 + ; CHECK: remark: test.c:5:1: in function 'f', 'atomicrmw' instruction ('%[[#]]') accesses memory in flat address space + atomicrmw add ptr addrspace(0) null, i32 10 seq_cst, !dbg !8 + atomicrmw xchg ptr addrspace(1) null, i32 10 seq_cst, !dbg !8 + atomicrmw add ptr addrspace(37) null, i32 10 seq_cst, !dbg !8 + + ; cmpxchg + ; CHECK: remark: test.c:6:2: in function 'f', 'cmpxchg' instruction ('%[[#]]') accesses memory in flat address space + cmpxchg ptr null, i32 0, i32 1 acq_rel monotonic, !dbg !9 + ; CHECK: remark: test.c:6:2: in function 'f', 'cmpxchg' instruction ('%[[#]]') accesses memory in flat address space + cmpxchg ptr addrspace(0) null, i32 0, i32 1 acq_rel monotonic, !dbg !9 + cmpxchg ptr addrspace(1) null, i32 0, i32 1 acq_rel monotonic, !dbg !9 + cmpxchg ptr addrspace(934) null, i32 0, i32 1 acq_rel monotonic, !dbg !9 + + ; llvm.memcpy + ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.p0.p1.i64' call accesses memory in flat address space + call void @llvm.memcpy.p0.p1.i64(ptr align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !10 + ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.p0.p1.i64' call accesses memory in flat address space + call void @llvm.memcpy.p0.p1.i64(ptr addrspace(0) align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !10 + call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !10 + call void @llvm.memcpy.p3.p1.i64(ptr addrspace(3) align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !10 + ; CHECK: remark: test.c:7:3: in function 'f', 
'llvm.memcpy.p1.p0.i64' call accesses memory in flat address space + call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) align 4 null, ptr align 4 null, i64 10, i1 false), !dbg !10 + ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.p1.p0.i64' call accesses memory in flat address space + call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) align 4 null, ptr addrspace(0) align 4 null, i64 10, i1 false), !dbg !10 + call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !10 + call void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) align 4 null, ptr addrspace(4) align 4 null, i64 10, i1 false), !dbg !10 + ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.p0.p0.i64' call accesses memory in flat address space + call void @llvm.memcpy.p0.p0.i64(ptr align 4 null, ptr align 4 null, i64 10, i1 false), !dbg !10 + ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.p0.p0.i64' call accesses memory in flat address space + call void @llvm.memcpy.p0.p0.i64(ptr addrspace(0) align 4 null, ptr addrspace(0) align 4 null, i64 10, i1 false), !dbg !10 + + ; llvm.memcpy.inline + ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.inline.p0.p0.i64' call accesses memory in flat address space + call void @llvm.memcpy.inline.p0.p0.i64(ptr addrspace(0) align 4 null, ptr addrspace(0) align 4 null, i64 10, i1 false), !dbg !10 + ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.inline.p0.p1.i64' call accesses memory in flat address space + call void @llvm.memcpy.inline.p0.p1.i64(ptr addrspace(0) align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !10 + ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.inline.p1.p0.i64' call accesses memory in flat address space + call void @llvm.memcpy.inline.p1.p0.i64(ptr addrspace(1) align 4 null, ptr addrspace(0) align 4 null, i64 10, i1 false), !dbg !10 + call void @llvm.memcpy.inline.p1.p1.i64(ptr addrspace(1) align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !10 + + ; llvm.memcpy.element.unordered.atomic + ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.element.unordered.atomic.p0.p0.i64' call accesses memory in flat address space + call void @llvm.memcpy.element.unordered.atomic.p0.p0.i64(ptr addrspace(0) align 4 null, ptr addrspace(0) align 4 null, i64 10, i32 4), !dbg !10 + ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.element.unordered.atomic.p0.p1.i64' call accesses memory in flat address space + call void @llvm.memcpy.element.unordered.atomic.p0.p1.i64(ptr addrspace(0) align 4 null, ptr addrspace(1) align 4 null, i64 10, i32 4), !dbg !10 + ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.element.unordered.atomic.p1.p0.i64' call accesses memory in flat address space + call void @llvm.memcpy.element.unordered.atomic.p1.p0.i64(ptr addrspace(1) align 4 null, ptr addrspace(0) align 4 null, i64 10, i32 4), !dbg !10 + call void @llvm.memcpy.element.unordered.atomic.p1.p1.i64(ptr addrspace(1) align 4 null, ptr addrspace(1) align 4 null, i64 10, i32 4), !dbg !10 + + ; llvm.memmove + ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.p0.p1.i64' call accesses memory in flat address space + call void @llvm.memmove.p0.p1.i64(ptr align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !11 + ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.p0.p1.i64' call accesses memory in flat address space + call void @llvm.memmove.p0.p1.i64(ptr addrspace(0) align 4 null, ptr addrspace(1) align 4 null, 
i64 10, i1 false), !dbg !11 + call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !11 + call void @llvm.memmove.p3.p1.i64(ptr addrspace(3) align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !11 + ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.p1.p0.i64' call accesses memory in flat address space + call void @llvm.memmove.p1.p0.i64(ptr addrspace(1) align 4 null, ptr align 4 null, i64 10, i1 false), !dbg !11 + ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.p1.p0.i64' call accesses memory in flat address space + call void @llvm.memmove.p1.p0.i64(ptr addrspace(1) align 4 null, ptr addrspace(0) align 4 null, i64 10, i1 false), !dbg !11 + call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !11 + call void @llvm.memmove.p1.p4.i64(ptr addrspace(1) align 4 null, ptr addrspace(4) align 4 null, i64 10, i1 false), !dbg !11 + ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.p0.p0.i64' call accesses memory in flat address space + call void @llvm.memmove.p0.p0.i64(ptr align 4 null, ptr align 4 null, i64 10, i1 false), !dbg !11 + ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.p0.p0.i64' call accesses memory in flat address space + call void @llvm.memmove.p0.p0.i64(ptr addrspace(0) align 4 null, ptr addrspace(0) align 4 null, i64 10, i1 false), !dbg !11 + + ; llvm.memmove.element.unordered.atomic + ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.element.unordered.atomic.p0.p0.i64' call accesses memory in flat address space + call void @llvm.memmove.element.unordered.atomic.p0.p0.i64(ptr addrspace(0) align 4 null, ptr addrspace(0) align 4 null, i64 10, i32 4), !dbg !11 + ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.element.unordered.atomic.p0.p1.i64' call accesses memory in flat address space + call void @llvm.memmove.element.unordered.atomic.p0.p1.i64(ptr addrspace(0) align 4 null, ptr addrspace(1) align 4 null, i64 10, i32 4), !dbg !11 + ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.element.unordered.atomic.p1.p0.i64' call accesses memory in flat address space + call void @llvm.memmove.element.unordered.atomic.p1.p0.i64(ptr addrspace(1) align 4 null, ptr addrspace(0) align 4 null, i64 10, i32 4), !dbg !11 + call void @llvm.memmove.element.unordered.atomic.p1.p1.i64(ptr addrspace(1) align 4 null, ptr addrspace(1) align 4 null, i64 10, i32 4), !dbg !11 + + ; llvm.memset + ; CHECK: remark: test.c:9:5: in function 'f', 'llvm.memset.p0.i64' call accesses memory in flat address space + call void @llvm.memset.p0.i64(ptr align 4 null, i8 0, i64 10, i1 false), !dbg !12 + ; CHECK: remark: test.c:9:5: in function 'f', 'llvm.memset.p0.i64' call accesses memory in flat address space + call void @llvm.memset.p0.i64(ptr addrspace(0) align 4 null, i8 0, i64 10, i1 false), !dbg !12 + call void @llvm.memset.p1.i64(ptr addrspace(1) align 4 null, i8 0, i64 10, i1 false), !dbg !12 + call void @llvm.memset.p3.i64(ptr addrspace(3) align 4 null, i8 0, i64 10, i1 false), !dbg !12 + + ; llvm.memset.inline + ; CHECK: remark: test.c:9:5: in function 'f', 'llvm.memset.inline.p0.i64' call accesses memory in flat address space + call void @llvm.memset.inline.p0.i64(ptr align 4 null, i8 0, i64 10, i1 false), !dbg !12 + ; CHECK: remark: test.c:9:5: in function 'f', 'llvm.memset.inline.p0.i64' call accesses memory in flat address space + call void @llvm.memset.inline.p0.i64(ptr addrspace(0) align 4 
null, i8 0, i64 10, i1 false), !dbg !12 + call void @llvm.memset.inline.p1.i64(ptr addrspace(1) align 4 null, i8 0, i64 10, i1 false), !dbg !12 + call void @llvm.memset.inline.p3.i64(ptr addrspace(3) align 4 null, i8 0, i64 10, i1 false), !dbg !12 + + ; llvm.memset.element.unordered.atomic + ; CHECK: remark: test.c:9:5: in function 'f', 'llvm.memset.element.unordered.atomic.p0.i64' call accesses memory in flat address space + call void @llvm.memset.element.unordered.atomic.p0.i64(ptr align 4 null, i8 0, i64 10, i32 4), !dbg !12 + ; CHECK: remark: test.c:9:5: in function 'f', 'llvm.memset.element.unordered.atomic.p0.i64' call accesses memory in flat address space + call void @llvm.memset.element.unordered.atomic.p0.i64(ptr addrspace(0) align 4 null, i8 0, i64 10, i32 4), !dbg !12 + call void @llvm.memset.element.unordered.atomic.p1.i64(ptr addrspace(1) align 4 null, i8 0, i64 10, i32 4), !dbg !12 + call void @llvm.memset.element.unordered.atomic.p3.i64(ptr addrspace(3) align 4 null, i8 0, i64 10, i32 4), !dbg !12 + + ret void +} +; CHECK: remark: test.c:2:0: in function 'f', FlatAddrspaceAccesses = 36 + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2} + +!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, producer: "clang version 19.0.0git", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) +!1 = !DIFile(filename: "test.c", directory: "/tmp") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 2, type: !4, scopeLine: 2, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !5) +!4 = !DISubroutineType(types: !5) +!5 = !{} +!6 = !DILocation(line: 3, column: 11, scope: !3) +!7 = !DILocation(line: 4, column: 6, scope: !3) +!8 = !DILocation(line: 5, column: 1, scope: !3) +!9 = !DILocation(line: 6, column: 2, scope: !3) +!10 = !DILocation(line: 7, column: 3, scope: !3) +!11 = !DILocation(line: 8, column: 4, scope: !3) +!12 = !DILocation(line: 9, column: 5, scope: !3) diff --git a/llvm/test/Analysis/KernelInfo/flat-addrspace/amdgpu.test b/llvm/test/Analysis/KernelInfo/flat-addrspace/amdgpu.test new file mode 100644 index 0000000000000..7447dcf51cc89 --- /dev/null +++ b/llvm/test/Analysis/KernelInfo/flat-addrspace/amdgpu.test @@ -0,0 +1,12 @@ +; Check info on flat address space memory accesses when the target is amdgpu. +; +; The target matters because kernel-info calls +; TargetTransformInfo::getFlatAddressSpace to select the flat address space. + +; REQUIRES: amdgpu-registered-target + +; RUN: opt -pass-remarks=kernel-info -passes=kernel-info \ +; RUN: -mtriple="amdgcn-amd-amdhsa" \ +; RUN: -disable-output %S/Inputs/test.ll 2>&1 | \ +; RUN: FileCheck -match-full-lines -implicit-check-not='flat address space' \ +; RUN: %S/Inputs/test.ll diff --git a/llvm/test/Analysis/KernelInfo/flat-addrspace/nvptx.test b/llvm/test/Analysis/KernelInfo/flat-addrspace/nvptx.test new file mode 100644 index 0000000000000..02321c19e022d --- /dev/null +++ b/llvm/test/Analysis/KernelInfo/flat-addrspace/nvptx.test @@ -0,0 +1,12 @@ +; Check info on flat address space memory accesses when the target is nvptx. +; +; The target matters because kernel-info calls +; TargetTransformInfo::getFlatAddressSpace to select the flat address space. 
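+; For nvptx, the flat (generic) address space is addrspace(0), so only the
+; unqualified ptr and addrspace(0) accesses in Inputs/test.ll should be
+; reported; accesses to other address spaces should not be.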
+ +; REQUIRES: nvptx-registered-target + +; RUN: opt -pass-remarks=kernel-info -passes=kernel-info \ +; RUN: -mtriple="nvptx64-nvidia-cuda" \ +; RUN: -disable-output %S/Inputs/test.ll 2>&1 | \ +; RUN: FileCheck -match-full-lines -implicit-check-not='flat address space' \ +; RUN: %S/Inputs/test.ll diff --git a/llvm/test/Analysis/KernelInfo/launch-bounds/amdgpu.ll b/llvm/test/Analysis/KernelInfo/launch-bounds/amdgpu.ll new file mode 100644 index 0000000000000..7fbdb923d8800 --- /dev/null +++ b/llvm/test/Analysis/KernelInfo/launch-bounds/amdgpu.ll @@ -0,0 +1,83 @@ +; Check info on launch bounds for AMD GPU. + +; REQUIRES: amdgpu-registered-target + +; RUN: opt -pass-remarks=kernel-info -passes=kernel-info \ +; RUN: -disable-output %s 2>&1 | \ +; RUN: FileCheck -match-full-lines %s + +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9" +target triple = "amdgcn-amd-amdhsa" + +; CHECK: remark: test.c:10:0: in artificial function 'all', omp_target_num_teams = 100 +; CHECK: remark: test.c:10:0: in artificial function 'all', omp_target_thread_limit = 101 +; CHECK: remark: test.c:10:0: in artificial function 'all', amdgpu-max-num-workgroups[0] = 200 +; CHECK: remark: test.c:10:0: in artificial function 'all', amdgpu-max-num-workgroups[1] = 201 +; CHECK: remark: test.c:10:0: in artificial function 'all', amdgpu-max-num-workgroups[2] = 202 +; CHECK: remark: test.c:10:0: in artificial function 'all', amdgpu-flat-work-group-size[0] = 210 +; CHECK: remark: test.c:10:0: in artificial function 'all', amdgpu-flat-work-group-size[1] = 211 +; CHECK: remark: test.c:10:0: in artificial function 'all', amdgpu-waves-per-eu[0] = 2 +; CHECK: remark: test.c:10:0: in artificial function 'all', amdgpu-waves-per-eu[1] = 9 +define void @all() #0 !dbg !5 { +entry: + ret void +} + +; CHECK-NOT: remark: test.c:11:0: in function 'none', omp_target_num_teams = {{.*}} +; CHECK-NOT: remark: test.c:11:0: in function 'none', omp_target_thread_limit = {{.*}} +; CHECK: remark: test.c:11:0: in function 'none', amdgpu-max-num-workgroups[0] = 4294967295 +; CHECK: remark: test.c:11:0: in function 'none', amdgpu-max-num-workgroups[1] = 4294967295 +; CHECK: remark: test.c:11:0: in function 'none', amdgpu-max-num-workgroups[2] = 4294967295 +; CHECK: remark: test.c:11:0: in function 'none', amdgpu-flat-work-group-size[0] = 1 +; CHECK: remark: test.c:11:0: in function 'none', amdgpu-flat-work-group-size[1] = 1024 +; CHECK: remark: test.c:11:0: in function 'none', amdgpu-waves-per-eu[0] = 4 +; CHECK: remark: test.c:11:0: in function 'none', amdgpu-waves-per-eu[1] = 10 +define void @none() !dbg !6 { +entry: + ret void +} + +; CHECK: remark: test.c:12:0: in function 'bogus', omp_target_num_teams = 987654321 +; CHECK: remark: test.c:12:0: in function 'bogus', omp_target_thread_limit = 987654321 +; CHECK: remark: test.c:12:0: in function 'bogus', amdgpu-max-num-workgroups[0] = 987654321 +; CHECK: remark: test.c:12:0: in function 'bogus', amdgpu-max-num-workgroups[1] = 987654321 +; CHECK: remark: test.c:12:0: in function 'bogus', amdgpu-max-num-workgroups[2] = 987654321 +; CHECK: remark: test.c:12:0: in function 'bogus', amdgpu-flat-work-group-size[0] = 1 +; CHECK: remark: test.c:12:0: in function 'bogus', amdgpu-flat-work-group-size[1] = 1024 +; CHECK: remark: test.c:12:0: in function 'bogus', amdgpu-waves-per-eu[0] = 4 +; CHECK: remark: 
test.c:12:0: in function 'bogus', amdgpu-waves-per-eu[1] = 10 +define void @bogus() #1 !dbg !7 { +entry: + ret void +} + +attributes #0 = { + "omp_target_num_teams"="100" + "omp_target_thread_limit"="101" + "amdgpu-max-num-workgroups"="200,201,202" + "amdgpu-flat-work-group-size"="210,211" + "amdgpu-waves-per-eu"="2,9" +} + +; We choose values that are small enough to parse successfully but that are +; impossibly large. For values that are validated, we check that they are +; overridden with realistic values. +attributes #1 = { + "omp_target_num_teams"="987654321" + "omp_target_thread_limit"="987654321" + "amdgpu-max-num-workgroups"="987654321,987654321,987654321" + "amdgpu-flat-work-group-size"="987654321,987654321" + "amdgpu-waves-per-eu"="987654321,987654321" +} + +!llvm.module.flags = !{!0} +!llvm.dbg.cu = !{!1} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !DICompileUnit(language: DW_LANG_C11, file: !2, producer: "clang version 19.0.0git", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) +!2 = !DIFile(filename: "test.c", directory: "/tmp") +!3 = !{} +!4 = !DISubroutineType(types: !3) +!5 = distinct !DISubprogram(name: "all", scope: !2, file: !2, line: 10, type: !4, scopeLine: 10, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !3) +!6 = distinct !DISubprogram(name: "none", scope: !2, file: !2, line: 11, type: !4, scopeLine: 11, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !3) +!7 = distinct !DISubprogram(name: "bogus", scope: !2, file: !2, line: 12, type: !4, scopeLine: 12, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !3) diff --git a/llvm/test/Analysis/KernelInfo/launch-bounds/nvptx.ll b/llvm/test/Analysis/KernelInfo/launch-bounds/nvptx.ll new file mode 100644 index 0000000000000..7a055c7152ec8 --- /dev/null +++ b/llvm/test/Analysis/KernelInfo/launch-bounds/nvptx.ll @@ -0,0 +1,42 @@ +; Check info on launch bounds for NVPTX. 
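+;
+; The remarks below reflect the 'omp_target_num_teams' and
+; 'omp_target_thread_limit' function attributes and the 'maxclusterrank',
+; 'maxntidx', 'maxntidy', and 'maxntidz' entries in !nvvm.annotations.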
+ +; REQUIRES: nvptx-registered-target + +; RUN: opt -pass-remarks=kernel-info -passes=kernel-info \ +; RUN: -disable-output %s 2>&1 | \ +; RUN: FileCheck -match-full-lines %s + +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +; CHECK: remark: test.c:10:0: in artificial function 'test', omp_target_num_teams = 100 +; CHECK: remark: test.c:10:0: in artificial function 'test', omp_target_thread_limit = 101 +; CHECK: remark: test.c:10:0: in artificial function 'test', maxclusterrank = 200 +; CHECK: remark: test.c:10:0: in artificial function 'test', maxntidx = 210 +; CHECK: remark: test.c:10:0: in artificial function 'test', maxntidy = 211 +; CHECK: remark: test.c:10:0: in artificial function 'test', maxntidz = 212 +define void @test() #0 !dbg !5 { +entry: + ret void +} + +attributes #0 = { + "omp_target_num_teams"="100" + "omp_target_thread_limit"="101" +} + +!llvm.module.flags = !{!0} +!llvm.dbg.cu = !{!1} +!nvvm.annotations = !{!6, !7, !8, !9, !10} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !DICompileUnit(language: DW_LANG_C11, file: !2, producer: "clang version 19.0.0git", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) +!2 = !DIFile(filename: "test.c", directory: "/tmp") +!3 = !{} +!4 = !DISubroutineType(types: !3) +!5 = distinct !DISubprogram(name: "test", scope: !2, file: !2, line: 10, type: !4, scopeLine: 10, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !3) +!6 = !{ptr @test, !"maxclusterrank", i32 200} +!7 = !{ptr @test, !"maxntidx", i32 210} +!8 = !{ptr @test, !"maxntidy", i32 211} +!9 = !{ptr @test, !"maxntidz", i32 212} +!10 = distinct !{ptr null, !"kernel", i32 1} diff --git a/llvm/test/Analysis/KernelInfo/linkage.ll b/llvm/test/Analysis/KernelInfo/linkage.ll new file mode 100644 index 0000000000000..8679d366d0cb7 --- /dev/null +++ b/llvm/test/Analysis/KernelInfo/linkage.ll @@ -0,0 +1,68 @@ +; Check info on linkage. 
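+;
+; ExternalNotKernel is expected to be 1 only when a function has external
+; linkage and is not a kernel; weak linkage or a kernel calling convention
+; (ptx_kernel, amdgpu_kernel, spir_kernel) yields 0, as the cases below check.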
+ +; RUN: opt -pass-remarks=kernel-info -passes=kernel-info \ +; RUN: -disable-output %s 2>&1 | \ +; RUN: FileCheck -match-full-lines %s + +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +; CHECK: remark: test.c:13:0: in artificial function 'extNotKer', ExternalNotKernel = 1 +define external void @extNotKer() !dbg !10 { +entry: + ret void +} + +; CHECK: remark: test.c:23:0: in function 'impNotKer', ExternalNotKernel = 1 +define void @impNotKer() !dbg !20 { +entry: + ret void +} + +; CHECK: remark: test.c:33:0: in artificial function 'weakNotKer', ExternalNotKernel = 0 +define weak void @weakNotKer() !dbg !30 { +entry: + ret void +} + +; CHECK: remark: test.c:43:0: in function 'extPtxKer', ExternalNotKernel = 0 +define external ptx_kernel void @extPtxKer() !dbg !40 { +entry: + ret void +} + +; CHECK: remark: test.c:53:0: in artificial function 'extAmdgpuKer', ExternalNotKernel = 0 +define external amdgpu_kernel void @extAmdgpuKer() !dbg !50 { +entry: + ret void +} + +; CHECK: remark: test.c:63:0: in function 'extSpirKer', ExternalNotKernel = 0 +define external spir_kernel void @extSpirKer() !dbg !60 { +entry: + ret void +} + +; CHECK: remark: test.c:73:0: in artificial function 'weakKer', ExternalNotKernel = 0 +define weak ptx_kernel void @weakKer() !dbg !70 { +entry: + ret void +} + +!llvm.module.flags = !{!0} +!llvm.dbg.cu = !{!1} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !DICompileUnit(language: DW_LANG_C11, file: !2, producer: "clang version 19.0.0git", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) +!2 = !DIFile(filename: "test.c", directory: "/tmp") +!3 = !{null} +!4 = !{} +!5 = !DISubroutineType(types: !3) + +!10 = distinct !DISubprogram(name: "extNotKer", scope: !2, file: !2, line: 13, type: !5, scopeLine: 13, flags: DIFlagArtificial, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !4) +!20 = distinct !DISubprogram(name: "impNotKer", scope: !2, file: !2, line: 23, type: !5, scopeLine: 23, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !4) +!30 = distinct !DISubprogram(name: "weakNotKer", scope: !2, file: !2, line: 33, type: !5, scopeLine: 33, flags: DIFlagArtificial, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !4) +!40 = distinct !DISubprogram(name: "extPtxKer", scope: !2, file: !2, line: 43, type: !5, scopeLine: 43, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !4) +!50 = distinct !DISubprogram(name: "extAmdgpuKer", scope: !2, file: !2, line: 53, type: !5, scopeLine: 53, flags: DIFlagArtificial, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !4) +!60 = distinct !DISubprogram(name: "extSpirKer", scope: !2, file: !2, line: 63, type: !5, scopeLine: 63, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !4) +!70 = distinct !DISubprogram(name: "weakKer", scope: !2, file: !2, line: 73, type: !5, scopeLine: 73, flags: DIFlagArtificial, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !4) diff --git a/llvm/test/Analysis/KernelInfo/openmp/README.md b/llvm/test/Analysis/KernelInfo/openmp/README.md new file mode 100644 index 0000000000000..0aeb52f83c5c7 --- /dev/null +++ b/llvm/test/Analysis/KernelInfo/openmp/README.md @@ -0,0 +1,40 @@ +The tests in this directory check that basic KernelInfoPrinter functionality +behaves reasonably for LLVM IR produced by Clang OpenMP codegen. 
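+
+For reference, each test runs the pass with an `opt` command along these lines
+(see each test's `RUN` lines; `$TEST` is the test file):
+
+```
+$ opt -pass-remarks=kernel-info -passes=kernel-info -disable-output $TEST
+```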
+ +So that these tests are straightforward to maintain and faithfully represent +Clang OpenMP codegen, do not tweak or reduce the LLVM IR in them. Other tests +more exhaustively check KernelInfoPrinter features using reduced LLVM IR. + +The LLVM IR in each test file `$TEST` can be regenerated as follows in the case +that Clang OpenMP codegen changes or it becomes desirable to adjust the source +OpenMP program below. First, remove the existing LLVM IR from `$TEST`. Then, +where `$TARGET` (e.g., `nvptx64-nvidia-cuda-sm_70` or `amdgcn-amd-amdhsa-gfx906`) +depends on `$TEST`: + +``` +$ cd /tmp +$ cat test.c +#pragma omp declare target +void f(); +void g() { + int i; + int a[2]; + f(); + g(); +} +#pragma omp end declare target + +void h(int i) { + #pragma omp target map(tofrom:i) + { + int i; + int a[2]; + f(); + g(); + } +} + +$ clang -g -fopenmp --offload-arch=native -save-temps -c test.c +$ llvm-dis test-openmp-$TARGET.bc +$ cat test-openmp-$TARGET.ll >> $TEST +``` diff --git a/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll b/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll new file mode 100644 index 0000000000000..4843408bdda49 --- /dev/null +++ b/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll @@ -0,0 +1,225 @@ +; See ./README.md for how to maintain the LLVM IR in this test. + +; REQUIRES: amdgpu-registered-target + +; RUN: opt -pass-remarks=kernel-info -passes=kernel-info \ +; RUN: -disable-output %s 2>&1 | \ +; RUN: FileCheck -match-full-lines %s + +; CHECK-NOT: remark: +; CHECK: remark: test.c:0:0: in artificial function '[[OFF_FUNC:__omp_offloading_[a-f0-9_]*_h_l12]]_debug__', artificial alloca ('%[[#]]') for 'dyn_ptr' with static size of 8 bytes +; CHECK-NEXT: remark: test.c:14:9: in artificial function '[[OFF_FUNC]]_debug__', alloca ('%[[#]]') for 'i' with static size of 4 bytes +; CHECK-NEXT: remark: test.c:15:9: in artificial function '[[OFF_FUNC]]_debug__', alloca ('%[[#]]') for 'a' with static size of 8 bytes +; CHECK-NEXT: remark: :0:0: in artificial function '[[OFF_FUNC]]_debug__', 'store' instruction accesses memory in flat address space +; CHECK-NEXT: remark: test.c:13:3: in artificial function '[[OFF_FUNC]]_debug__', direct call, callee is '@__kmpc_target_init' +; CHECK-NEXT: remark: test.c:16:5: in artificial function '[[OFF_FUNC]]_debug__', direct call, callee is '@f' +; CHECK-NEXT: remark: test.c:17:5: in artificial function '[[OFF_FUNC]]_debug__', direct call to defined function, callee is 'g' +; CHECK-NEXT: remark: test.c:18:3: in artificial function '[[OFF_FUNC]]_debug__', direct call, callee is '@__kmpc_target_deinit' +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', ExternalNotKernel = 0 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', amdgpu-max-num-workgroups[0] = 4294967295 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', amdgpu-max-num-workgroups[1] = 4294967295 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', amdgpu-max-num-workgroups[2] = 4294967295 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', amdgpu-flat-work-group-size[0] = 1 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', amdgpu-flat-work-group-size[1] = 1024 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', amdgpu-waves-per-eu[0] = 4 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', amdgpu-waves-per-eu[1] = 10 +; CHECK-NEXT: remark: 
test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', Allocas = 3 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AllocasStaticSizeSum = 20 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AllocasDyn = 0 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', DirectCalls = 4 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', IndirectCalls = 0 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', DirectCallsToDefinedFunctions = 1 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', InlineAssemblyCalls = 0 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', Invokes = 0 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', FlatAddrspaceAccesses = 1 + +; CHECK-NEXT: remark: test.c:0:0: in artificial function '[[OFF_FUNC]]', artificial alloca ('%[[#]]') for 'dyn_ptr' with static size of 8 bytes +; CHECK-NEXT: remark: :0:0: in artificial function '[[OFF_FUNC]]', 'store' instruction accesses memory in flat address space +; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', 'load' instruction ('%[[#]]') accesses memory in flat address space +; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', direct call to defined function, callee is artificial '[[OFF_FUNC]]_debug__' +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', ExternalNotKernel = 0 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', omp_target_thread_limit = 256 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', amdgpu-max-num-workgroups[0] = 4294967295 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', amdgpu-max-num-workgroups[1] = 4294967295 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', amdgpu-max-num-workgroups[2] = 4294967295 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', amdgpu-flat-work-group-size[0] = 1 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', amdgpu-flat-work-group-size[1] = 256 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', amdgpu-waves-per-eu[0] = 1 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', amdgpu-waves-per-eu[1] = 10 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', Allocas = 1 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', AllocasStaticSizeSum = 8 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', AllocasDyn = 0 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', DirectCalls = 1 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', IndirectCalls = 0 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', DirectCallsToDefinedFunctions = 1 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', InlineAssemblyCalls = 0 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', Invokes = 0 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', FlatAddrspaceAccesses = 2 + +; CHECK-NEXT: remark: test.c:4:7: in function 'g', alloca ('%[[#]]') for 'i' with static size of 4 bytes +; CHECK-NEXT: remark: test.c:5:7: in function 'g', alloca ('%[[#]]') for 'a' with static size of 8 bytes +; CHECK-NEXT: remark: 
test.c:6:3: in function 'g', direct call, callee is '@f' +; CHECK-NEXT: remark: test.c:7:3: in function 'g', direct call to defined function, callee is 'g' +; CHECK-NEXT: remark: test.c:3:0: in function 'g', ExternalNotKernel = 1 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', amdgpu-max-num-workgroups[0] = 4294967295 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', amdgpu-max-num-workgroups[1] = 4294967295 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', amdgpu-max-num-workgroups[2] = 4294967295 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', amdgpu-flat-work-group-size[0] = 1 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', amdgpu-flat-work-group-size[1] = 1024 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', amdgpu-waves-per-eu[0] = 4 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', amdgpu-waves-per-eu[1] = 10 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', Allocas = 2 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', AllocasStaticSizeSum = 12 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', AllocasDyn = 0 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', DirectCalls = 2 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', IndirectCalls = 0 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', DirectCallsToDefinedFunctions = 1 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', InlineAssemblyCalls = 0 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', Invokes = 0 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', FlatAddrspaceAccesses = 0 +; CHECK-NOT: {{.}} + +; ModuleID = 'test-openmp-amdgcn-amd-amdhsa-gfx906.bc' +source_filename = "test.c" +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9" +target triple = "amdgcn-amd-amdhsa" + +%struct.ident_t = type { i32, i32, i32, i32, ptr } +%struct.DynamicEnvironmentTy = type { i16 } +%struct.KernelEnvironmentTy = type { %struct.ConfigurationEnvironmentTy, ptr, ptr } +%struct.ConfigurationEnvironmentTy = type { i8, i8, i8, i32, i32, i32, i32, i32, i32 } + +@__omp_rtl_debug_kind = weak_odr hidden addrspace(1) constant i32 0 +@__omp_rtl_assume_teams_oversubscription = weak_odr hidden addrspace(1) constant i32 0 +@__omp_rtl_assume_threads_oversubscription = weak_odr hidden addrspace(1) constant i32 0 +@__omp_rtl_assume_no_thread_state = weak_odr hidden addrspace(1) constant i32 0 +@__omp_rtl_assume_no_nested_parallelism = weak_odr hidden addrspace(1) constant i32 0 +@0 = private unnamed_addr constant [57 x i8] c";test.c;__omp_offloading_fd02_727e9_h_l12_debug__;13;3;;\00", align 1 +@1 = private unnamed_addr addrspace(1) constant %struct.ident_t { i32 0, i32 2, i32 0, i32 56, ptr @0 }, align 8 +@__omp_offloading_fd02_727e9_h_l12_dynamic_environment = weak_odr protected addrspace(1) global %struct.DynamicEnvironmentTy zeroinitializer +@__omp_offloading_fd02_727e9_h_l12_kernel_environment = weak_odr protected addrspace(1) constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 1, i8 1, i32 1, i32 256, i32 -1, i32 -1, i32 0, i32 0 }, ptr addrspacecast (ptr addrspace(1) @1 to ptr), ptr addrspacecast (ptr addrspace(1) @__omp_offloading_fd02_727e9_h_l12_dynamic_environment to ptr) } +@__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 500 + +; Function Attrs: convergent noinline norecurse nounwind optnone +define internal void 
@__omp_offloading_fd02_727e9_h_l12_debug__(ptr noalias noundef %0) #0 !dbg !15 { + %2 = alloca ptr, align 8, addrspace(5) + %3 = alloca i32, align 4, addrspace(5) + %4 = alloca [2 x i32], align 4, addrspace(5) + %5 = addrspacecast ptr addrspace(5) %2 to ptr + %6 = addrspacecast ptr addrspace(5) %3 to ptr + %7 = addrspacecast ptr addrspace(5) %4 to ptr + store ptr %0, ptr %5, align 8 + #dbg_declare(ptr addrspace(5) %2, !23, !DIExpression(), !24) + %8 = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @__omp_offloading_fd02_727e9_h_l12_kernel_environment to ptr), ptr %0), !dbg !25 + %9 = icmp eq i32 %8, -1, !dbg !25 + br i1 %9, label %10, label %11, !dbg !25 + +10: ; preds = %1 + #dbg_declare(ptr addrspace(5) %3, !26, !DIExpression(), !29) + #dbg_declare(ptr addrspace(5) %4, !30, !DIExpression(), !34) + call void @f() #4, !dbg !35 + call void @g() #4, !dbg !36 + call void @__kmpc_target_deinit(), !dbg !37 + ret void, !dbg !38 + +11: ; preds = %1 + ret void, !dbg !25 +} + +; Function Attrs: convergent mustprogress noinline norecurse nounwind optnone +define weak_odr protected amdgpu_kernel void @__omp_offloading_fd02_727e9_h_l12(ptr noalias noundef %0) #1 !dbg !39 { + %2 = alloca ptr, align 8, addrspace(5) + %3 = addrspacecast ptr addrspace(5) %2 to ptr + store ptr %0, ptr %3, align 8 + #dbg_declare(ptr addrspace(5) %2, !40, !DIExpression(), !41) + %4 = load ptr, ptr %3, align 8, !dbg !42 + call void @__omp_offloading_fd02_727e9_h_l12_debug__(ptr %4) #5, !dbg !42 + ret void, !dbg !42 +} + +declare i32 @__kmpc_target_init(ptr, ptr) + +; Function Attrs: convergent +declare void @f(...) #2 + +declare void @__kmpc_target_deinit() + +; Function Attrs: convergent noinline nounwind optnone +define hidden void @g() #3 !dbg !43 { + %1 = alloca i32, align 4, addrspace(5) + %2 = alloca [2 x i32], align 4, addrspace(5) + %3 = addrspacecast ptr addrspace(5) %1 to ptr + %4 = addrspacecast ptr addrspace(5) %2 to ptr + #dbg_declare(ptr addrspace(5) %1, !46, !DIExpression(), !47) + #dbg_declare(ptr addrspace(5) %2, !48, !DIExpression(), !49) + call void @f() #4, !dbg !50 + call void @g() #4, !dbg !51 + ret void, !dbg !52 +} + +attributes #0 = { convergent noinline norecurse nounwind optnone "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx906" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" } +attributes #1 = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,256" "frame-pointer"="all" "kernel" "no-trapping-math"="true" "omp_target_thread_limit"="256" "stack-protector-buffer-size"="8" "target-cpu"="gfx906" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" "uniform-work-group-size"="true" } +attributes #2 = { convergent "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx906" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" } +attributes #3 = { convergent noinline nounwind optnone "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx906" 
"target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" } +attributes #4 = { convergent } +attributes #5 = { nounwind } + +!llvm.dbg.cu = !{!0} +!omp_offload.info = !{!2} +!llvm.module.flags = !{!3, !4, !5, !6, !7, !8, !9, !10, !11} +!llvm.ident = !{!12, !13, !13, !13, !13, !13, !13, !13, !13, !13, !13, !13, !13, !13, !13, !13, !13} +!opencl.ocl.version = !{!14, !14, !14, !14, !14, !14, !14, !14, !14, !14, !14, !14, !14, !14, !14, !14} + +!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, producer: "clang version 20.0.0git (/tmp/llvm/clang b9447c03a9ef2eed55b685a33511df86f7f94e89)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) +!1 = !DIFile(filename: "test.c", directory: "/tmp", checksumkind: CSK_MD5, checksum: "27a878d5e894ab6d41bfe96f997f8821") +!2 = !{i32 0, i32 64770, i32 468969, !"h", i32 12, i32 0, i32 0} +!3 = !{i32 1, !"amdhsa_code_object_version", i32 500} +!4 = !{i32 7, !"Dwarf Version", i32 5} +!5 = !{i32 2, !"Debug Info Version", i32 3} +!6 = !{i32 1, !"wchar_size", i32 4} +!7 = !{i32 7, !"openmp", i32 51} +!8 = !{i32 7, !"openmp-device", i32 51} +!9 = !{i32 8, !"PIC Level", i32 2} +!10 = !{i32 7, !"frame-pointer", i32 2} +!11 = !{i32 4, !"amdgpu_hostcall", i32 1} +!12 = !{!"clang version 20.0.0git (/tmp/llvm/clang b9447c03a9ef2eed55b685a33511df86f7f94e89)"} +!13 = !{!"AMD clang version 17.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-6.0.2 24012 af27734ed982b52a9f1be0f035ac91726fc697e4)"} +!14 = !{i32 2, i32 0} +!15 = distinct !DISubprogram(name: "__omp_offloading_fd02_727e9_h_l12_debug__", scope: !16, file: !16, line: 13, type: !17, scopeLine: 13, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !0, retainedNodes: !22) +!16 = !DIFile(filename: "test.c", directory: "/tmp") +!17 = !DISubroutineType(types: !18) +!18 = !{null, !19} +!19 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !20) +!20 = !DIDerivedType(tag: DW_TAG_restrict_type, baseType: !21) +!21 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: null, size: 64) +!22 = !{} +!23 = !DILocalVariable(name: "dyn_ptr", arg: 1, scope: !15, type: !19, flags: DIFlagArtificial) +!24 = !DILocation(line: 0, scope: !15) +!25 = !DILocation(line: 13, column: 3, scope: !15) +!26 = !DILocalVariable(name: "i", scope: !27, file: !16, line: 14, type: !28) +!27 = distinct !DILexicalBlock(scope: !15, file: !16, line: 13, column: 3) +!28 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!29 = !DILocation(line: 14, column: 9, scope: !27) +!30 = !DILocalVariable(name: "a", scope: !27, file: !16, line: 15, type: !31) +!31 = !DICompositeType(tag: DW_TAG_array_type, baseType: !28, size: 64, elements: !32) +!32 = !{!33} +!33 = !DISubrange(count: 2) +!34 = !DILocation(line: 15, column: 9, scope: !27) +!35 = !DILocation(line: 16, column: 5, scope: !27) +!36 = !DILocation(line: 17, column: 5, scope: !27) +!37 = !DILocation(line: 18, column: 3, scope: !27) +!38 = !DILocation(line: 18, column: 3, scope: !15) +!39 = distinct !DISubprogram(name: "__omp_offloading_fd02_727e9_h_l12", scope: !16, file: !16, line: 12, type: !17, scopeLine: 12, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !0, retainedNodes: !22) +!40 = !DILocalVariable(name: "dyn_ptr", arg: 1, scope: !39, type: !19, flags: DIFlagArtificial) +!41 = 
!DILocation(line: 0, scope: !39) +!42 = !DILocation(line: 12, column: 1, scope: !39) +!43 = distinct !DISubprogram(name: "g", scope: !16, file: !16, line: 3, type: !44, scopeLine: 3, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !22) +!44 = !DISubroutineType(types: !45) +!45 = !{null} +!46 = !DILocalVariable(name: "i", scope: !43, file: !16, line: 4, type: !28) +!47 = !DILocation(line: 4, column: 7, scope: !43) +!48 = !DILocalVariable(name: "a", scope: !43, file: !16, line: 5, type: !31) +!49 = !DILocation(line: 5, column: 7, scope: !43) +!50 = !DILocation(line: 6, column: 3, scope: !43) +!51 = !DILocation(line: 7, column: 3, scope: !43) +!52 = !DILocation(line: 8, column: 1, scope: !43) diff --git a/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll b/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll new file mode 100644 index 0000000000000..bd46741b24e8c --- /dev/null +++ b/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll @@ -0,0 +1,813 @@ +; See ./README.md for how to maintain the LLVM IR in this test. + +; REQUIRES: nvptx-registered-target + +; RUN: opt -pass-remarks=kernel-info -passes=kernel-info \ +; RUN: -disable-output %s 2>&1 | \ +; RUN: FileCheck -match-full-lines %s + +; CHECK-NOT: remark: +; CHECK: remark: test.c:0:0: in artificial function '[[OFF_FUNC:__omp_offloading_[a-f0-9_]*_h_l12]]_debug__', artificial alloca ('%[[#]]') for 'dyn_ptr' with static size of 8 bytes +; CHECK-NEXT: remark: test.c:14:9: in artificial function '[[OFF_FUNC]]_debug__', alloca ('%[[#]]') for 'i' with static size of 4 bytes +; CHECK-NEXT: remark: test.c:15:9: in artificial function '[[OFF_FUNC]]_debug__', alloca ('%[[#]]') for 'a' with static size of 8 bytes +; CHECK-NEXT: remark: :0:0: in artificial function '[[OFF_FUNC]]_debug__', 'store' instruction accesses memory in flat address space +; CHECK-NEXT: remark: test.c:13:3: in artificial function '[[OFF_FUNC]]_debug__', direct call to defined function, callee is '@__kmpc_target_init' +; CHECK-NEXT: remark: test.c:16:5: in artificial function '[[OFF_FUNC]]_debug__', direct call, callee is '@f' +; CHECK-NEXT: remark: test.c:17:5: in artificial function '[[OFF_FUNC]]_debug__', direct call to defined function, callee is 'g' +; CHECK-NEXT: remark: test.c:18:3: in artificial function '[[OFF_FUNC]]_debug__', direct call to defined function, callee is '@__kmpc_target_deinit' +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', ExternalNotKernel = 0 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', Allocas = 3 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AllocasStaticSizeSum = 20 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AllocasDyn = 0 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', DirectCalls = 4 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', IndirectCalls = 0 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', DirectCallsToDefinedFunctions = 3 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', InlineAssemblyCalls = 0 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', Invokes = 0 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', FlatAddrspaceAccesses = 1 + +; CHECK-NEXT: remark: test.c:0:0: in artificial function '[[OFF_FUNC]]', artificial alloca ('%[[#]]') for 'dyn_ptr' with static size of 8 bytes +; CHECK-NEXT: remark: 
:0:0: in artificial function '[[OFF_FUNC]]', 'store' instruction accesses memory in flat address space +; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', 'load' instruction ('%[[#]]') accesses memory in flat address space +; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', direct call to defined function, callee is artificial '[[OFF_FUNC]]_debug__' +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', ExternalNotKernel = 0 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', omp_target_thread_limit = 128 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', maxntidx = 128 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', Allocas = 1 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', AllocasStaticSizeSum = 8 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', AllocasDyn = 0 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', DirectCalls = 1 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', IndirectCalls = 0 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', DirectCallsToDefinedFunctions = 1 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', InlineAssemblyCalls = 0 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', Invokes = 0 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', FlatAddrspaceAccesses = 2 + +; CHECK-NEXT: remark: test.c:4:7: in function 'g', alloca ('%[[#]]') for 'i' with static size of 4 bytes +; CHECK-NEXT: remark: test.c:5:7: in function 'g', alloca ('%[[#]]') for 'a' with static size of 8 bytes +; CHECK-NEXT: remark: test.c:6:3: in function 'g', direct call, callee is '@f' +; CHECK-NEXT: remark: test.c:7:3: in function 'g', direct call to defined function, callee is 'g' +; CHECK-NEXT: remark: test.c:3:0: in function 'g', ExternalNotKernel = 1 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', Allocas = 2 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', AllocasStaticSizeSum = 12 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', AllocasDyn = 0 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', DirectCalls = 2 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', IndirectCalls = 0 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', DirectCallsToDefinedFunctions = 1 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', InlineAssemblyCalls = 0 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', Invokes = 0 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', FlatAddrspaceAccesses = 0 +; CHECK-NOT: remark: {{.*: in function 'g',.*}} + +; A lot of internal functions (e.g., __kmpc_target_init) come next, but we don't +; want to maintain a list of their allocas, calls, etc. in this test. 
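+; (The amdgpu variant of this test can end with a blanket CHECK-NOT instead
+; because its module defines no functions beyond 'g'.)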
+ +; ModuleID = 'test-openmp-nvptx64-nvidia-cuda-sm_70.bc' +source_filename = "test.c" +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +%struct.ident_t = type { i32, i32, i32, i32, ptr } +%struct.DynamicEnvironmentTy = type { i16 } +%struct.KernelEnvironmentTy = type { %struct.ConfigurationEnvironmentTy, ptr, ptr } +%struct.ConfigurationEnvironmentTy = type { i8, i8, i8, i32, i32, i32, i32, i32, i32 } +%struct.DeviceMemoryPoolTy = type { ptr, i64 } +%struct.DeviceMemoryPoolTrackingTy = type { i64, i64, i64, i64 } +%struct.DeviceEnvironmentTy = type { i32, i32, i32, i32, i64, i64, i64, i64 } +%"struct.rpc::Client" = type { %"struct.rpc::Process" } +%"struct.rpc::Process" = type { i32, ptr, ptr, ptr, ptr, [128 x i32] } +%"struct.(anonymous namespace)::SharedMemorySmartStackTy" = type { [512 x i8], [1024 x i8] } +%"struct.ompx::state::TeamStateTy" = type { %"struct.ompx::state::ICVStateTy", i32, i32, ptr } +%"struct.ompx::state::ICVStateTy" = type { i32, i32, i32, i32, i32, i32, i32 } + +@__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 0 +@__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0 +@0 = private unnamed_addr constant [58 x i8] c";test.c;__omp_offloading_fd02_1116d6_h_l12_debug__;13;3;;\00", align 1 +@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 57, ptr @0 }, align 8 +@__omp_offloading_fd02_1116d6_h_l12_dynamic_environment = weak_odr protected global %struct.DynamicEnvironmentTy zeroinitializer +@__omp_offloading_fd02_1116d6_h_l12_kernel_environment = weak_odr protected constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 1, i8 1, i32 1, i32 128, i32 -1, i32 -1, i32 0, i32 0 }, ptr @1, ptr @__omp_offloading_fd02_1116d6_h_l12_dynamic_environment } +@llvm.used = appending global [4 x ptr] [ptr @__llvm_rpc_client, ptr addrspacecast (ptr addrspace(4) @__omp_rtl_device_environment to ptr), ptr @__omp_rtl_device_memory_pool, ptr @__omp_rtl_device_memory_pool_tracker], section "llvm.metadata" +@__omp_rtl_device_memory_pool = weak protected global %struct.DeviceMemoryPoolTy zeroinitializer, align 8 +@__omp_rtl_device_memory_pool_tracker = weak protected global %struct.DeviceMemoryPoolTrackingTy zeroinitializer, align 8 +@__omp_rtl_debug_kind = weak_odr hidden constant i32 0 +@__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 0 +@__omp_rtl_assume_no_nested_parallelism = weak_odr hidden constant i32 0 +@__omp_rtl_device_environment = weak protected addrspace(4) global %struct.DeviceEnvironmentTy undef, align 8 +@.str = private unnamed_addr constant [40 x i8] c"%s:%u: %s: Assertion %s (`%s`) failed.\0A\00", align 1 +@.str1 = private unnamed_addr constant [35 x i8] c"%s:%u: %s: Assertion `%s` failed.\0A\00", align 1 +@.str15 = private unnamed_addr constant [43 x i8] c"/tmp/llvm/offload/DeviceRTL/src/Kernel.cpp\00", align 1 +@__PRETTY_FUNCTION__._ZL19genericStateMachineP7IdentTy = private unnamed_addr constant [36 x i8] c"void genericStateMachine(IdentTy *)\00", align 1 +@.str2 = private unnamed_addr constant [18 x i8] c"WorkFn == nullptr\00", align 1 +@__PRETTY_FUNCTION__.__kmpc_target_deinit = private unnamed_addr constant [28 x i8] c"void __kmpc_target_deinit()\00", align 1 +@IsSPMDMode = internal local_unnamed_addr addrspace(3) global i32 undef, align 4 +@__llvm_rpc_client = weak protected global %"struct.rpc::Client" zeroinitializer, align 8 +@.str1027 = private unnamed_addr constant [48 x i8] 
c"/tmp/llvm/offload/DeviceRTL/src/Parallelism.cpp\00", align 1 +@.str12 = private unnamed_addr constant [23 x i8] c"!mapping::isSPMDMode()\00", align 1 +@__PRETTY_FUNCTION__.__kmpc_kernel_end_parallel = private unnamed_addr constant [34 x i8] c"void __kmpc_kernel_end_parallel()\00", align 1 +@_ZL20KernelEnvironmentPtr = internal unnamed_addr addrspace(3) global ptr undef, align 8 +@_ZL26KernelLaunchEnvironmentPtr = internal unnamed_addr addrspace(3) global ptr undef, align 8 +@_ZN12_GLOBAL__N_122SharedMemorySmartStackE = internal addrspace(3) global %"struct.(anonymous namespace)::SharedMemorySmartStackTy" undef, align 16 +@.str444 = private unnamed_addr constant [42 x i8] c"/tmp/llvm/offload/DeviceRTL/src/State.cpp\00", align 1 +@.str747 = private unnamed_addr constant [33 x i8] c"NThreadsVar == Other.NThreadsVar\00", align 1 +@__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_ = private unnamed_addr constant [68 x i8] c"void ompx::state::ICVStateTy::assertEqual(const ICVStateTy &) const\00", align 1 +@.str848 = private unnamed_addr constant [27 x i8] c"LevelVar == Other.LevelVar\00", align 1 +@.str949 = private unnamed_addr constant [39 x i8] c"ActiveLevelVar == Other.ActiveLevelVar\00", align 1 +@.str1050 = private unnamed_addr constant [47 x i8] c"MaxActiveLevelsVar == Other.MaxActiveLevelsVar\00", align 1 +@.str1151 = private unnamed_addr constant [33 x i8] c"RunSchedVar == Other.RunSchedVar\00", align 1 +@.str1252 = private unnamed_addr constant [43 x i8] c"RunSchedChunkVar == Other.RunSchedChunkVar\00", align 1 +@.str13 = private unnamed_addr constant [43 x i8] c"ParallelTeamSize == Other.ParallelTeamSize\00", align 1 +@__PRETTY_FUNCTION__._ZNK4ompx5state11TeamStateTy11assertEqualERS1_ = private unnamed_addr constant [64 x i8] c"void ompx::state::TeamStateTy::assertEqual(TeamStateTy &) const\00", align 1 +@.str14 = private unnamed_addr constant [39 x i8] c"HasThreadState == Other.HasThreadState\00", align 1 +@.str23 = private unnamed_addr constant [32 x i8] c"mapping::isSPMDMode() == IsSPMD\00", align 1 +@__PRETTY_FUNCTION__._ZN4ompx5state18assumeInitialStateEb = private unnamed_addr constant [43 x i8] c"void ompx::state::assumeInitialState(bool)\00", align 1 +@_ZL9ThreadDST = internal unnamed_addr addrspace(3) global ptr undef, align 8 +@_ZN4ompx5state9TeamStateE = internal local_unnamed_addr addrspace(3) global %"struct.ompx::state::TeamStateTy" undef, align 8 +@_ZN4ompx5state12ThreadStatesE = internal addrspace(3) global ptr undef, align 8 + +; Function Attrs: convergent noinline norecurse nounwind optnone +define internal void @__omp_offloading_fd02_1116d6_h_l12_debug__(ptr noalias noundef %0) #0 !dbg !18 { + %2 = alloca ptr, align 8 + %3 = alloca i32, align 4 + %4 = alloca [2 x i32], align 4 + store ptr %0, ptr %2, align 8 + #dbg_declare(ptr %2, !25, !DIExpression(), !26) + %5 = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_1116d6_h_l12_kernel_environment, ptr %0), !dbg !27 + %6 = icmp eq i32 %5, -1, !dbg !27 + br i1 %6, label %7, label %8, !dbg !27 + +7: ; preds = %1 + #dbg_declare(ptr %3, !28, !DIExpression(), !31) + #dbg_declare(ptr %4, !32, !DIExpression(), !36) + call void @f() #19, !dbg !37 + call void @g() #19, !dbg !38 + call void @__kmpc_target_deinit(), !dbg !39 + ret void, !dbg !40 + +8: ; preds = %1 + ret void, !dbg !27 +} + +; Function Attrs: convergent mustprogress noinline norecurse nounwind optnone +define weak_odr protected ptx_kernel void @__omp_offloading_fd02_1116d6_h_l12(ptr noalias noundef %0) #1 !dbg !41 { + %2 = alloca ptr, 
align 8 + store ptr %0, ptr %2, align 8 + #dbg_declare(ptr %2, !42, !DIExpression(), !43) + %3 = load ptr, ptr %2, align 8, !dbg !44 + call void @__omp_offloading_fd02_1116d6_h_l12_debug__(ptr %3) #20, !dbg !44 + ret void, !dbg !44 +} + +; Function Attrs: convergent +declare void @f(...) #2 + +; Function Attrs: convergent noinline nounwind optnone +define hidden void @g() #3 !dbg !45 { + %1 = alloca i32, align 4 + %2 = alloca [2 x i32], align 4 + #dbg_declare(ptr %1, !48, !DIExpression(), !49) + #dbg_declare(ptr %2, !50, !DIExpression(), !51) + call void @f() #19, !dbg !52 + call void @g() #19, !dbg !53 + ret void, !dbg !54 +} + +; Function Attrs: convergent mustprogress nounwind +define internal noundef range(i32 -1, 1024) i32 @__kmpc_target_init(ptr nofree noundef nonnull align 8 dereferenceable(48) %0, ptr nofree noundef nonnull align 8 dereferenceable(16) %1) #4 { + %3 = alloca ptr, align 8 + %4 = getelementptr inbounds nuw i8, ptr %0, i64 2 + %5 = load i8, ptr %4, align 2, !tbaa !55 + %6 = and i8 %5, 2 + %7 = icmp eq i8 %6, 0 + %8 = load i8, ptr %0, align 8, !tbaa !61 + %9 = icmp ne i8 %8, 0 + br i1 %7, label %21, label %10 + +10: ; preds = %2 + %11 = tail call range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() + %12 = icmp eq i32 %11, 0 + br i1 %12, label %13, label %14 + +13: ; preds = %10 + store i32 1, ptr addrspace(3) @IsSPMDMode, align 4, !tbaa !62 + store i8 0, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN12_GLOBAL__N_122SharedMemorySmartStackE to ptr), i64 512) to ptr addrspace(3)), align 1, !tbaa !63 + tail call void @llvm.memset.p0.i64(ptr noundef nonnull align 8 dereferenceable(48) addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i8 noundef 0, i64 noundef 16, i1 noundef false) + store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 16) to ptr addrspace(3)), align 8, !tbaa !64 + store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 20) to ptr addrspace(3)), align 4, !tbaa !69 + store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 24) to ptr addrspace(3)), align 8, !tbaa !70 + store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 28) to ptr addrspace(3)), align 4, !tbaa !71 + store i32 0, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 32) to ptr addrspace(3)), align 8, !tbaa !72 + store ptr null, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 40) to ptr addrspace(3)), align 8, !tbaa !73 + store ptr null, ptr addrspace(3) @_ZN4ompx5state12ThreadStatesE, align 8, !tbaa !74 + store ptr %0, ptr addrspace(3) @_ZL20KernelEnvironmentPtr, align 8, !tbaa !76 + store ptr %1, ptr addrspace(3) @_ZL26KernelLaunchEnvironmentPtr, align 8, !tbaa !78 + br label %18 + +14: ; preds = %10 + %15 = zext nneg i32 %11 to i64 + %16 = getelementptr inbounds nuw [1024 x i8], ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN12_GLOBAL__N_122SharedMemorySmartStackE to ptr), i64 512), i64 0, i64 %15 
+ %17 = addrspacecast ptr %16 to ptr addrspace(3) + store i8 0, ptr addrspace(3) %17, align 1, !tbaa !63 + br label %18 + +18: ; preds = %14, %13 + br i1 %12, label %19, label %20 + +19: ; preds = %18 + store ptr null, ptr addrspace(3) @_ZL9ThreadDST, align 8, !tbaa !80 + br label %20 + +20: ; preds = %18, %19 + tail call void @_ZN4ompx11synchronize14threadsAlignedENS_6atomic10OrderingTyE(i32 poison) #21 + br label %37 + +21: ; preds = %2 + %22 = tail call range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range !82 + %23 = add nsw i32 %22, -1 + %24 = and i32 %23, -32 + %25 = tail call range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() + %26 = icmp eq i32 %25, %24 + br i1 %26, label %27, label %31 + +27: ; preds = %21 + store i32 0, ptr addrspace(3) @IsSPMDMode, align 4, !tbaa !62 + %28 = zext nneg i32 %25 to i64 + %29 = getelementptr inbounds nuw [1024 x i8], ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN12_GLOBAL__N_122SharedMemorySmartStackE to ptr), i64 512), i64 0, i64 %28 + %30 = addrspacecast ptr %29 to ptr addrspace(3) + store i8 0, ptr addrspace(3) %30, align 1, !tbaa !63 + tail call void @llvm.memset.p0.i64(ptr noundef nonnull align 8 dereferenceable(48) addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i8 noundef 0, i64 noundef 16, i1 noundef false) + store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 16) to ptr addrspace(3)), align 8, !tbaa !64 + store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 20) to ptr addrspace(3)), align 4, !tbaa !69 + store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 24) to ptr addrspace(3)), align 8, !tbaa !70 + store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 28) to ptr addrspace(3)), align 4, !tbaa !71 + store i32 0, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 32) to ptr addrspace(3)), align 8, !tbaa !72 + store ptr null, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 40) to ptr addrspace(3)), align 8, !tbaa !73 + store ptr null, ptr addrspace(3) @_ZN4ompx5state12ThreadStatesE, align 8, !tbaa !74 + store ptr %0, ptr addrspace(3) @_ZL20KernelEnvironmentPtr, align 8, !tbaa !76 + store ptr %1, ptr addrspace(3) @_ZL26KernelLaunchEnvironmentPtr, align 8, !tbaa !78 + br label %35 + +31: ; preds = %21 + %32 = zext nneg i32 %25 to i64 + %33 = getelementptr inbounds nuw [1024 x i8], ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN12_GLOBAL__N_122SharedMemorySmartStackE to ptr), i64 512), i64 0, i64 %32 + %34 = addrspacecast ptr %33 to ptr addrspace(3) + store i8 0, ptr addrspace(3) %34, align 1, !tbaa !63 + br label %35 + +35: ; preds = %31, %27 + br i1 %26, label %36, label %37 + +36: ; preds = %35 + store ptr null, ptr addrspace(3) @_ZL9ThreadDST, align 8, !tbaa !80 + br label %37 + +37: ; preds = %36, %35, %20 + br i1 %7, label %100, label %38 + +38: ; preds = %37 + %39 = load i32, ptr @__omp_rtl_debug_kind, align 4, !tbaa !62 + %40 = load i32, ptr 
addrspace(4) @__omp_rtl_device_environment, align 8, !tbaa !83 + %41 = and i32 %39, 1 + %42 = and i32 %41, %40 + %43 = icmp ne i32 %42, 0 + %44 = load i32, ptr addrspace(3) @_ZN4ompx5state9TeamStateE, align 8, !tbaa !86 + %45 = icmp ne i32 %44, 0 + %46 = select i1 %43, i1 %45, i1 false + br i1 %46, label %47, label %48 + +47: ; preds = %38 + tail call fastcc void @__assert_fail_internal(ptr noundef nonnull dereferenceable(33) @.str747, ptr noundef null, ptr noundef nonnull dereferenceable(66) @.str444, i32 noundef 193, ptr noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #22 + unreachable + +48: ; preds = %38 + %49 = icmp eq i32 %44, 0 + tail call void @llvm.assume(i1 noundef %49) #23 + %50 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 4) to ptr addrspace(3)), align 4, !tbaa !87 + br i1 %43, label %51, label %54 + +51: ; preds = %48 + %52 = icmp eq i32 %50, 0 + br i1 %52, label %54, label %53 + +53: ; preds = %51 + tail call fastcc void @__assert_fail_internal(ptr noundef nonnull dereferenceable(27) @.str848, ptr noundef null, ptr noundef nonnull dereferenceable(66) @.str444, i32 noundef 194, ptr noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #22 + unreachable + +54: ; preds = %51, %48 + %55 = phi i32 [ 0, %51 ], [ %50, %48 ] + %56 = icmp eq i32 %55, 0 + tail call void @llvm.assume(i1 noundef %56) #23 + %57 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 8) to ptr addrspace(3)), align 8, !tbaa !88 + br i1 %43, label %58, label %61 + +58: ; preds = %54 + %59 = icmp eq i32 %57, 0 + br i1 %59, label %61, label %60 + +60: ; preds = %58 + tail call fastcc void @__assert_fail_internal(ptr noundef nonnull dereferenceable(39) @.str949, ptr noundef null, ptr noundef nonnull dereferenceable(66) @.str444, i32 noundef 195, ptr noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #22 + unreachable + +61: ; preds = %58, %54 + %62 = phi i32 [ 0, %58 ], [ %57, %54 ] + %63 = icmp eq i32 %62, 0 + tail call void @llvm.assume(i1 noundef %63) #23 + %64 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 16) to ptr addrspace(3)), align 8, !tbaa !89 + br i1 %43, label %65, label %68 + +65: ; preds = %61 + %66 = icmp eq i32 %64, 1 + br i1 %66, label %68, label %67 + +67: ; preds = %65 + tail call fastcc void @__assert_fail_internal(ptr noundef nonnull dereferenceable(47) @.str1050, ptr noundef null, ptr noundef nonnull dereferenceable(66) @.str444, i32 noundef 196, ptr noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #22 + unreachable + +68: ; preds = %65, %61 + %69 = phi i32 [ 1, %65 ], [ %64, %61 ] + %70 = icmp eq i32 %69, 1 + tail call void @llvm.assume(i1 noundef %70) #23 + %71 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 20) to ptr addrspace(3)), align 4, !tbaa !90 + br i1 %43, label %72, label %93 + +72: ; preds = %68 + %73 = icmp eq i32 %71, 1 + br i1 %73, label %75, label %74 + +74: ; preds = %72 + tail call fastcc void @__assert_fail_internal(ptr noundef nonnull 
dereferenceable(33) @.str1151, ptr noundef null, ptr noundef nonnull dereferenceable(66) @.str444, i32 noundef 197, ptr noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #22
+  unreachable
+
+75: ; preds = %72
+  %76 = icmp eq i32 1, 1
+  tail call void @llvm.assume(i1 noundef %76) #23
+  br i1 %43, label %77, label %95
+
+77: ; preds = %75
+  %78 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 24) to ptr addrspace(3)), align 8, !tbaa !91
+  %79 = icmp eq i32 %78, 1
+  br i1 %79, label %81, label %80
+
+80: ; preds = %77
+  tail call fastcc void @__assert_fail_internal(ptr noundef nonnull dereferenceable(43) @.str1252, ptr noundef null, ptr noundef nonnull dereferenceable(66) @.str444, i32 noundef 198, ptr noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #22
+  unreachable
+
+81: ; preds = %77
+  %82 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 28) to ptr addrspace(3)), align 4, !tbaa !71
+  %83 = icmp eq i32 %82, 1
+  br i1 %83, label %85, label %84
+
+84: ; preds = %81
+  tail call fastcc void @__assert_fail_internal(ptr noundef nonnull dereferenceable(43) @.str13, ptr noundef null, ptr noundef nonnull dereferenceable(66) @.str444, i32 noundef 222, ptr noundef nonnull dereferenceable(64) @__PRETTY_FUNCTION__._ZNK4ompx5state11TeamStateTy11assertEqualERS1_) #22
+  unreachable
+
+85: ; preds = %81
+  %86 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 32) to ptr addrspace(3)), align 8, !tbaa !72
+  %87 = icmp eq i32 %86, 0
+  br i1 %87, label %89, label %88
+
+88: ; preds = %85
+  tail call fastcc void @__assert_fail_internal(ptr noundef nonnull dereferenceable(39) @.str14, ptr noundef null, ptr noundef nonnull dereferenceable(66) @.str444, i32 noundef 223, ptr noundef nonnull dereferenceable(64) @__PRETTY_FUNCTION__._ZNK4ompx5state11TeamStateTy11assertEqualERS1_) #22
+  unreachable
+
+89: ; preds = %85
+  %90 = load i32, ptr addrspace(3) @IsSPMDMode, align 4, !tbaa !62
+  %91 = icmp eq i32 %90, 0
+  br i1 %91, label %92, label %98
+
+92: ; preds = %89
+  tail call fastcc void @__assert_fail_internal(ptr noundef nonnull dereferenceable(32) @.str23, ptr noundef null, ptr noundef nonnull dereferenceable(66) @.str444, i32 noundef 326, ptr noundef nonnull dereferenceable(43) @__PRETTY_FUNCTION__._ZN4ompx5state18assumeInitialStateEb) #22
+  unreachable
+
+93: ; preds = %68
+  %94 = icmp eq i32 %71, 1
+  tail call void @llvm.assume(i1 noundef %94) #23
+  br label %95
+
+95: ; preds = %75, %93
+  %96 = load i32, ptr addrspace(3) @IsSPMDMode, align 4, !tbaa !62
+  %97 = icmp ne i32 %96, 0
+  br label %98
+
+98: ; preds = %89, %95
+  %99 = phi i1 [ %97, %95 ], [ true, %89 ]
+  tail call void @llvm.assume(i1 noundef %99) #23
+  tail call void @_ZN4ompx11synchronize14threadsAlignedENS_6atomic10OrderingTyE(i32 poison) #21
+  br label %130
+
+100: ; preds = %37
+  %101 = tail call range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range !82
+  %102 = add nsw i32 %101, -1
+  %103 = and i32 %102, -32
+  %104 = tail call range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !92
+  %105 = icmp eq i32 %104, %103
+  br i1 %105, label %130, label %106
+
+106: ; preds = %100
+  %107 = add nsw i32 %101, -32
+  %108 = icmp ult i32 %104, %107
+  %109 = select i1 %9, i1 %108, i1 false
+  br i1 %109, label %110, label %130
+
+110: ; preds = %106
+  %111 = load i32, ptr @__omp_rtl_debug_kind, align 4
+  %112 = load i32, ptr addrspace(4) @__omp_rtl_device_environment, align 8
+  %113 = and i32 %111, 1
+  %114 = and i32 %113, %112
+  %115 = icmp ne i32 %114, 0
+  br label %116
+
+116: ; preds = %110, %128
+  call void @llvm.lifetime.start.p0(i64 noundef 8, ptr noundef nonnull align 8 dereferenceable(8) %3) #20
+  tail call void @llvm.nvvm.barrier.sync(i32 noundef 8)
+  %117 = call zeroext i1 @__kmpc_kernel_parallel(ptr noalias nocapture nofree noundef nonnull writeonly align 8 dereferenceable(8) %3) #20
+  %118 = load ptr, ptr %3, align 8, !tbaa !93
+  %119 = icmp eq ptr %118, null
+  br i1 %119, label %129, label %120
+
+120: ; preds = %116
+  br i1 %117, label %121, label %128
+
+121: ; preds = %120
+  %122 = load i32, ptr addrspace(3) @IsSPMDMode, align 4
+  %123 = icmp ne i32 %122, 0
+  %124 = select i1 %115, i1 %123, i1 false
+  br i1 %124, label %125, label %126
+
+125: ; preds = %121
+  tail call fastcc void @__assert_fail_internal(ptr noundef nonnull dereferenceable(23) @.str12, ptr noundef null, ptr noundef nonnull dereferenceable(67) @.str15, i32 noundef 60, ptr noundef nonnull dereferenceable(36) @__PRETTY_FUNCTION__._ZL19genericStateMachineP7IdentTy) #22
+  unreachable
+
+126: ; preds = %121
+  %127 = icmp eq i32 %122, 0
+  tail call void @llvm.assume(i1 noundef %127) #23
+  tail call void %118(i32 noundef 0, i32 noundef %104) #24
+  tail call void @__kmpc_kernel_end_parallel() #24
+  br label %128
+
+128: ; preds = %126, %120
+  tail call void @llvm.nvvm.barrier.sync(i32 noundef 8)
+  call void @llvm.lifetime.end.p0(i64 noundef 8, ptr noundef nonnull %3) #20
+  br label %116, !llvm.loop !94
+
+129: ; preds = %116
+  call void @llvm.lifetime.end.p0(i64 noundef 8, ptr noundef nonnull %3) #20
+  br label %130
+
+130: ; preds = %106, %129, %100, %98
+  %131 = phi i32 [ -1, %98 ], [ -1, %100 ], [ %104, %129 ], [ %104, %106 ]
+  ret i32 %131
+}
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #5
+
+; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: write)
+declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #6
+
+; Function Attrs: convergent mustprogress noinline norecurse nounwind
+define internal void @_ZN4ompx11synchronize14threadsAlignedENS_6atomic10OrderingTyE(i32 %0) local_unnamed_addr #7 {
+  tail call void @llvm.nvvm.barrier0() #25
+  ret void
+}
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #5
+
+; Function Attrs: cold convergent mustprogress noreturn nounwind
+define internal fastcc void @__assert_fail_internal(ptr noundef nonnull dereferenceable(8) %0, ptr noundef %1, ptr noundef nonnull dereferenceable(66) %2, i32 noundef range(i32 60, 905) %3, ptr noundef nonnull dereferenceable(20) %4) unnamed_addr #8 {
+  %6 = icmp eq ptr %1, null
+  br i1 %6, label %9, label %7
+
+7: ; preds = %5
+  %8 = tail call noundef i32 (ptr, ...) @_ZN4ompx6printfEPKcz(ptr noundef nonnull dereferenceable(40) @.str, ptr noundef nonnull dereferenceable(66) %2, i32 noundef %3, ptr noundef nonnull dereferenceable(20) %4, ptr noundef nonnull %1, ptr noundef nonnull dereferenceable(8) %0) #24
+  br label %11
+
+9: ; preds = %5
+  %10 = tail call noundef i32 (ptr, ...) @_ZN4ompx6printfEPKcz(ptr noundef nonnull dereferenceable(35) @.str1, ptr noundef nonnull dereferenceable(66) %2, i32 noundef %3, ptr noundef nonnull dereferenceable(20) %4, ptr noundef nonnull dereferenceable(8) %0) #24
+  br label %11
+
+11: ; preds = %9, %7
+  tail call void @llvm.trap() #26
+  unreachable
+}
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write)
+declare void @llvm.assume(i1 noundef) #9
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite)
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #10
+
+; Function Attrs: convergent nocallback nounwind
+declare void @llvm.nvvm.barrier.sync(i32) #11
+
+; Function Attrs: convergent mustprogress nofree noinline norecurse nosync nounwind willreturn memory(read, argmem: write, inaccessiblemem: none)
+define internal noundef zeroext i1 @__kmpc_kernel_parallel(ptr nocapture nofree noundef nonnull writeonly align 8 dereferenceable(8) initializes((0, 8)) %0) local_unnamed_addr #12 {
+  %2 = load ptr, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 40) to ptr addrspace(3)), align 8, !tbaa !93
+  store ptr %2, ptr %0, align 8, !tbaa !93
+  %3 = icmp eq ptr %2, null
+  br i1 %3, label %15, label %4
+
+4: ; preds = %1
+  %5 = tail call noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #27, !range !92
+  %6 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 28) to ptr addrspace(3)), align 4, !tbaa !62
+  %7 = icmp eq i32 %6, 0
+  %8 = tail call range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range !82
+  %9 = load i32, ptr addrspace(3) @IsSPMDMode, align 4
+  %10 = icmp eq i32 %9, 0
+  %11 = select i1 %10, i32 -32, i32 0
+  %12 = add nsw i32 %11, %8
+  %13 = select i1 %7, i32 %12, i32 %6
+  %14 = icmp ult i32 %5, %13
+  br label %15
+
+15: ; preds = %4, %1
+  %16 = phi i1 [ %14, %4 ], [ false, %1 ]
+  ret i1 %16
+}
+
+; Function Attrs: convergent mustprogress noinline nounwind
+define internal void @__kmpc_kernel_end_parallel() local_unnamed_addr #13 {
+  %1 = load i32, ptr @__omp_rtl_debug_kind, align 4, !tbaa !62
+  %2 = load i32, ptr addrspace(4) @__omp_rtl_device_environment, align 8, !tbaa !83
+  %3 = and i32 %1, 1
+  %4 = and i32 %3, %2
+  %5 = icmp ne i32 %4, 0
+  %6 = load i32, ptr addrspace(3) @IsSPMDMode, align 4
+  %7 = icmp ne i32 %6, 0
+  %8 = select i1 %5, i1 %7, i1 false
+  br i1 %8, label %9, label %10
+
+9: ; preds = %0
+  tail call fastcc void @__assert_fail_internal(ptr noundef nonnull dereferenceable(23) @.str12, ptr noundef null, ptr noundef nonnull dereferenceable(72) @.str1027, i32 noundef 299, ptr noundef nonnull dereferenceable(34) @__PRETTY_FUNCTION__.__kmpc_kernel_end_parallel) #22
+  unreachable
+
+10: ; preds = %0
+  %11 = icmp eq i32 %6, 0
+  tail call void @llvm.assume(i1 noundef %11) #23
+  %12 = load i32, ptr @__omp_rtl_assume_no_thread_state, align 4, !tbaa !62
+  %13 = icmp eq i32 %12, 0
+  %14 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 32) to ptr addrspace(3)), align 8
+  %15 = icmp ne i32 %14, 0
+  %16 = select i1 %13, i1 %15, i1 false
+  br i1 %16, label %17, label %30
+
+17: ; preds = %10
+  %18 = tail call noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #27, !range !92
+  %19 = load ptr, ptr addrspace(3) @_ZN4ompx5state12ThreadStatesE, align 8, !tbaa !74
+  %20 = zext nneg i32 %18 to i64
+  %21 = getelementptr inbounds nuw ptr, ptr %19, i64 %20
+  %22 = load ptr, ptr %21, align 8, !tbaa !96
+  %23 = icmp eq ptr %22, null
+  br i1 %23, label %30, label %24, !prof !98
+
+24: ; preds = %17
+  %25 = getelementptr inbounds nuw i8, ptr %22, i64 32
+  %26 = load ptr, ptr %25, align 8, !tbaa !99
+  tail call void @free(ptr noundef nonnull dereferenceable(40) %22) #28
+  %27 = load ptr, ptr addrspace(3) @_ZN4ompx5state12ThreadStatesE, align 8, !tbaa !74
+  %28 = getelementptr inbounds nuw ptr, ptr %27, i64 %20
+  store ptr %26, ptr %28, align 8, !tbaa !96
+  %29 = load i32, ptr addrspace(3) @IsSPMDMode, align 4
+  br label %30
+
+30: ; preds = %10, %17, %24
+  %31 = phi i32 [ 0, %10 ], [ 0, %17 ], [ %29, %24 ]
+  %32 = icmp ne i32 %31, 0
+  %33 = select i1 %5, i1 %32, i1 false
+  br i1 %33, label %34, label %35
+
+34: ; preds = %30
+  tail call fastcc void @__assert_fail_internal(ptr noundef nonnull dereferenceable(23) @.str12, ptr noundef null, ptr noundef nonnull dereferenceable(72) @.str1027, i32 noundef 302, ptr noundef nonnull dereferenceable(34) @__PRETTY_FUNCTION__.__kmpc_kernel_end_parallel) #22
+  unreachable
+
+35: ; preds = %30
+  %36 = icmp eq i32 %31, 0
+  tail call void @llvm.assume(i1 noundef %36) #23
+  ret void
+}
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite)
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #10
+
+; Function Attrs: convergent mustprogress nounwind willreturn allockind("free") memory(argmem: readwrite, inaccessiblemem: readwrite)
+declare extern_weak void @free(ptr allocptr nocapture noundef) local_unnamed_addr #14
+
+; Function Attrs: convergent mustprogress nounwind
+define internal noundef i32 @_ZN4ompx6printfEPKcz(ptr noundef %0, ...) local_unnamed_addr #15 {
+  %2 = alloca ptr, align 8
+  call void @llvm.lifetime.start.p0(i64 noundef 8, ptr noundef nonnull align 8 %2) #29
+  call void @llvm.va_start.p0(ptr noundef nonnull align 8 %2) #27
+  %3 = load ptr, ptr %2, align 8, !tbaa !101
+  %4 = call i32 @vprintf(ptr noundef %0, ptr noundef %3) #24
+  call void @llvm.lifetime.end.p0(i64 noundef 8, ptr noundef nonnull %2) #20
+  ret i32 %4
+}
+
+; Function Attrs: cold noreturn nounwind memory(inaccessiblemem: write)
+declare void @llvm.trap() #16
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn
+declare void @llvm.va_start.p0(ptr) #17
+
+; Function Attrs: convergent nounwind
+declare i32 @vprintf(ptr noundef, ptr noundef) local_unnamed_addr #18
+
+; Function Attrs: convergent nocallback nounwind
+declare void @llvm.nvvm.barrier0() #11
+
+; Function Attrs: convergent mustprogress nounwind
+define internal void @__kmpc_target_deinit() #4 {
+  %1 = alloca ptr, align 8
+  %2 = load i32, ptr addrspace(3) @IsSPMDMode, align 4, !tbaa !62
+  %3 = icmp eq i32 %2, 0
+  br i1 %3, label %4, label %27
+
+4: ; preds = %0
+  %5 = tail call range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range !82
+  %6 = add nsw i32 %5, -1
+  %7 = and i32 %6, -32
+  %8 = tail call range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !92
+  %9 = icmp eq i32 %8, %7
+  br i1 %9, label %10, label %11
+
+10: ; preds = %4
+  store ptr null, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 40) to ptr addrspace(3)), align 8, !tbaa !93
+  br label %27
+
+11: ; preds = %4
+  %12 = load ptr, ptr addrspace(3) @_ZL20KernelEnvironmentPtr, align 8, !tbaa !76
+  %13 = load i8, ptr %12, align 8, !tbaa !103
+  %14 = icmp eq i8 %13, 0
+  br i1 %14, label %15, label %27
+
+15: ; preds = %11
+  call void @llvm.lifetime.start.p0(i64 noundef 8, ptr noundef nonnull align 8 dereferenceable(8) %1) #29
+  %16 = call zeroext i1 @__kmpc_kernel_parallel(ptr noalias nocapture nofree noundef nonnull writeonly align 8 dereferenceable(8) %1) #20
+  %17 = load i32, ptr @__omp_rtl_debug_kind, align 4, !tbaa !62
+  %18 = load i32, ptr addrspace(4) @__omp_rtl_device_environment, align 8, !tbaa !83
+  %19 = and i32 %17, 1
+  %20 = and i32 %19, %18
+  %21 = icmp eq i32 %20, 0
+  %22 = load ptr, ptr %1, align 8
+  %23 = icmp eq ptr %22, null
+  %24 = select i1 %21, i1 true, i1 %23
+  br i1 %24, label %26, label %25
+
+25: ; preds = %15
+  tail call fastcc void @__assert_fail_internal(ptr noundef nonnull dereferenceable(18) @.str2, ptr noundef null, ptr noundef nonnull dereferenceable(67) @.str15, i32 noundef 152, ptr noundef nonnull dereferenceable(28) @__PRETTY_FUNCTION__.__kmpc_target_deinit) #22
+  unreachable
+
+26: ; preds = %15
+  tail call void @llvm.assume(i1 noundef %23) #23
+  call void @llvm.lifetime.end.p0(i64 noundef 8, ptr noundef nonnull %1) #20
+  br label %27
+
+27: ; preds = %26, %11, %10, %0
+  ret void
+}
+
+attributes #0 = { convergent noinline norecurse nounwind optnone "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx83,+sm_70" }
+attributes #1 = { convergent mustprogress noinline norecurse nounwind optnone "frame-pointer"="all" "kernel" "no-trapping-math"="true" "omp_target_thread_limit"="128" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx83,+sm_70" }
+attributes #2 = { convergent "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx83,+sm_70" }
+attributes #3 = { convergent noinline nounwind optnone "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx83,+sm_70" }
+attributes #4 = { convergent mustprogress nounwind "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx63,+ptx83,+sm_70" }
+attributes #5 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #6 = { nocallback nofree nounwind willreturn memory(argmem: write) }
+attributes #7 = { convergent mustprogress noinline norecurse nounwind "frame-pointer"="all" "llvm.assume"="ompx_aligned_barrier" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx63,+ptx83,+sm_70" }
+attributes #8 = { cold convergent mustprogress noreturn nounwind "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx63,+ptx83,+sm_70" }
+attributes #9 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) }
+attributes #10 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
+attributes #11 = { convergent nocallback nounwind }
+attributes #12 = { convergent mustprogress nofree noinline norecurse nosync nounwind willreturn memory(read, argmem: write, inaccessiblemem: none) "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx63,+ptx83,+sm_70" }
+attributes #13 = { convergent mustprogress noinline nounwind "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx63,+ptx83,+sm_70" }
+attributes #14 = { convergent mustprogress nounwind willreturn allockind("free") memory(argmem: readwrite, inaccessiblemem: readwrite) "alloc-family"="malloc" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx63,+ptx83,+sm_70" }
+attributes #15 = { convergent mustprogress nounwind "frame-pointer"="all" "no-builtin-printf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx63,+ptx83,+sm_70" }
+attributes #16 = { cold noreturn nounwind memory(inaccessiblemem: write) }
+attributes #17 = { nocallback nofree nosync nounwind willreturn }
+attributes #18 = { convergent nounwind "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx63,+ptx83,+sm_70" }
+attributes #19 = { convergent }
+attributes #20 = { nounwind }
+attributes #21 = { convergent nounwind "llvm.assume"="ompx_aligned_barrier" }
+attributes #22 = { convergent noreturn nounwind }
+attributes #23 = { memory(write) }
+attributes #24 = { convergent nounwind }
+attributes #25 = { "llvm.assume"="ompx_aligned_barrier" }
+attributes #26 = { noreturn }
+attributes #27 = { nofree willreturn }
+attributes #28 = { convergent nounwind willreturn }
+attributes #29 = { nofree nounwind willreturn }
+
+!llvm.module.flags = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10}
+!llvm.dbg.cu = !{!11}
+!nvvm.annotations = !{!13}
+!omp_offload.info = !{!14}
+!llvm.ident = !{!15, !16, !15, !15, !15, !15, !15, !15, !15, !15, !15, !15, !15, !15, !15, !15, !15}
+!nvvmir.version = !{!17}
+
+!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 12, i32 3]}
+!1 = !{i32 7, !"Dwarf Version", i32 2}
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = !{i32 1, !"wchar_size", i32 4}
+!4 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
+!5 = !{i32 7, !"openmp", i32 51}
+!6 = !{i32 7, !"openmp-device", i32 51}
+!7 = !{i32 8, !"PIC Level", i32 2}
+!8 = !{i32 7, !"frame-pointer", i32 2}
+!9 = !{i32 1, !"ThinLTO", i32 0}
+!10 = !{i32 1, !"EnableSplitLTOUnit", i32 1}
+!11 = distinct !DICompileUnit(language: DW_LANG_C11, file: !12, producer: "clang version 20.0.0git (/tmp/llvm/clang b9447c03a9ef2eed55b685a33511df86f7f94e89)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None)
+!12 = !DIFile(filename: "test.c", directory: "/tmp")
+!13 = !{ptr @__omp_offloading_fd02_1116d6_h_l12, !"maxntidx", i32 128}
+!14 = !{i32 0, i32 64770, i32 1119958, !"h", i32 12, i32 0, i32 0}
+!15 = !{!"clang version 20.0.0git (/tmp/llvm/clang b9447c03a9ef2eed55b685a33511df86f7f94e89)"}
+!16 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
+!17 = !{i32 2, i32 0}
+!18 = distinct !DISubprogram(name: "__omp_offloading_fd02_1116d6_h_l12_debug__", scope: !12, file: !12, line: 13, type: !19, scopeLine: 13, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !11, retainedNodes: !24)
+!19 = !DISubroutineType(types: !20)
+!20 = !{null, !21}
+!21 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !22)
+!22 = !DIDerivedType(tag: DW_TAG_restrict_type, baseType: !23)
+!23 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: null, size: 64)
+!24 = !{}
+!25 = !DILocalVariable(name: "dyn_ptr", arg: 1, scope: !18, type: !21, flags: DIFlagArtificial)
+!26 = !DILocation(line: 0, scope: !18)
+!27 = !DILocation(line: 13, column: 3, scope: !18)
+!28 = !DILocalVariable(name: "i", scope: !29, file: !12, line: 14, type: !30)
+!29 = distinct !DILexicalBlock(scope: !18, file: !12, line: 13, column: 3)
+!30 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!31 = !DILocation(line: 14, column: 9, scope: !29)
+!32 = !DILocalVariable(name: "a", scope: !29, file: !12, line: 15, type: !33)
+!33 = !DICompositeType(tag: DW_TAG_array_type, baseType: !30, size: 64, elements: !34)
+!34 = !{!35}
+!35 = !DISubrange(count: 2)
+!36 = !DILocation(line: 15, column: 9, scope: !29)
+!37 = !DILocation(line: 16, column: 5, scope: !29)
+!38 = !DILocation(line: 17, column: 5, scope: !29)
+!39 = !DILocation(line: 18, column: 3, scope: !29)
+!40 = !DILocation(line: 18, column: 3, scope: !18)
+!41 = distinct !DISubprogram(name: "__omp_offloading_fd02_1116d6_h_l12", scope: !12, file: !12, line: 12, type: !19, scopeLine: 12, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !11, retainedNodes: !24)
+!42 = !DILocalVariable(name: "dyn_ptr", arg: 1, scope: !41, type: !21, flags: DIFlagArtificial)
+!43 = !DILocation(line: 0, scope: !41)
+!44 = !DILocation(line: 12, column: 1, scope: !41)
+!45 = distinct !DISubprogram(name: "g", scope: !12, file: !12, line: 3, type: !46, scopeLine: 3, spFlags: DISPFlagDefinition, unit: !11, retainedNodes: !24)
+!46 = !DISubroutineType(types: !47)
+!47 = !{null}
+!48 = !DILocalVariable(name: "i", scope: !45, file: !12, line: 4, type: !30)
+!49 = !DILocation(line: 4, column: 7, scope: !45)
+!50 = !DILocalVariable(name: "a", scope: !45, file: !12, line: 5, type: !33)
+!51 = !DILocation(line: 5, column: 7, scope: !45)
+!52 = !DILocation(line: 6, column: 3, scope: !45)
+!53 = !DILocation(line: 7, column: 3, scope: !45)
+!54 = !DILocation(line: 8, column: 1, scope: !45)
+!55 = !{!56, !59, i64 2}
+!56 = !{!"_ZTS26ConfigurationEnvironmentTy", !57, i64 0, !57, i64 1, !59, i64 2, !60, i64 4, !60, i64 8, !60, i64 12, !60, i64 16, !60, i64 20, !60, i64 24}
+!57 = !{!"omnipotent char", !58, i64 0}
+!58 = !{!"Simple C++ TBAA"}
+!59 = !{!"_ZTSN4llvm3omp19OMPTgtExecModeFlagsE", !57, i64 0}
+!60 = !{!"int", !57, i64 0}
+!61 = !{!56, !57, i64 0}
+!62 = !{!60, !60, i64 0}
+!63 = !{!57, !57, i64 0}
+!64 = !{!65, !60, i64 16}
+!65 = !{!"_ZTSN4ompx5state11TeamStateTyE", !66, i64 0, !60, i64 28, !60, i64 32, !67, i64 40}
+!66 = !{!"_ZTSN4ompx5state10ICVStateTyE", !60, i64 0, !60, i64 4, !60, i64 8, !60, i64 12, !60, i64 16, !60, i64 20, !60, i64 24}
+!67 = !{!"p1 void", !68, i64 0}
+!68 = !{!"any pointer", !57, i64 0}
+!69 = !{!65, !60, i64 20}
+!70 = !{!65, !60, i64 24}
+!71 = !{!65, !60, i64 28}
+!72 = !{!65, !60, i64 32}
+!73 = !{!65, !67, i64 40}
+!74 = !{!75, !75, i64 0}
+!75 = !{!"p2 _ZTSN4ompx5state13ThreadStateTyE", !68, i64 0}
+!76 = !{!77, !77, i64 0}
+!77 = !{!"p1 _ZTS19KernelEnvironmentTy", !68, i64 0}
+!78 = !{!79, !79, i64 0}
+!79 = !{!"p1 _ZTS25KernelLaunchEnvironmentTy", !68, i64 0}
+!80 = !{!81, !81, i64 0}
+!81 = !{!"p2 _ZTS22DynamicScheduleTracker", !68, i64 0}
+!82 = !{i32 1, i32 1025}
+!83 = !{!84, !60, i64 0}
+!84 = !{!"_ZTS19DeviceEnvironmentTy", !60, i64 0, !60, i64 4, !60, i64 8, !60, i64 12, !85, i64 16, !85, i64 24, !85, i64 32, !85, i64 40}
+!85 = !{!"long", !57, i64 0}
+!86 = !{!66, !60, i64 0}
+!87 = !{!66, !60, i64 4}
+!88 = !{!66, !60, i64 8}
+!89 = !{!66, !60, i64 16}
+!90 = !{!66, !60, i64 20}
+!91 = !{!66, !60, i64 24}
+!92 = !{i32 0, i32 1024}
+!93 = !{!67, !67, i64 0}
+!94 = distinct !{!94, !95}
+!95 = !{!"llvm.loop.mustprogress"}
+!96 = !{!97, !97, i64 0}
+!97 = !{!"p1 _ZTSN4ompx5state13ThreadStateTyE", !68, i64 0}
+!98 = !{!"branch_weights", !"expected", i32 2000, i32 1}
+!99 = !{!100, !97, i64 32}
+!100 = !{!"_ZTSN4ompx5state13ThreadStateTyE", !66, i64 0, !97, i64 32}
+!101 = !{!102, !102, i64 0}
+!102 = !{!"p1 omnipotent char", !68, i64 0}
+!103 = !{!104, !57, i64 0}
+!104 = !{!"_ZTS19KernelEnvironmentTy", !56, i64 0, !105, i64 32, !106, i64 40}
+!105 = !{!"p1 _ZTS7IdentTy", !68, i64 0}
+!106 = !{!"p1 _ZTS20DynamicEnvironmentTy", !68, i64 0}