diff --git a/llvm/docs/KernelInfo.rst b/llvm/docs/KernelInfo.rst
new file mode 100644
index 0000000000000..dac642f1ffc65
--- /dev/null
+++ b/llvm/docs/KernelInfo.rst
@@ -0,0 +1,63 @@
+==========
+KernelInfo
+==========
+
+.. contents::
+   :local:
+
+Introduction
+============
+
+This LLVM IR pass reports various statistics for code compiled for GPUs. The
+goal of these statistics is to help identify bad code patterns and ways to
+mitigate them. The pass operates at the LLVM IR level so that it can, in
+theory, support any LLVM-based compiler for programming languages that target
+GPUs.
+
+By default, the pass runs at the end of LTO, and options like
+``-Rpass=kernel-info`` enable its remarks. Example ``opt`` and ``clang``
+command lines appear in the next section.
+
+Remarks include summary statistics (e.g., total size of static allocas) and
+individual occurrences (e.g., source location of each alloca). Examples of the
+output appear in tests in ``llvm/test/Analysis/KernelInfo``.
+
+Example Command Lines
+=====================
+
+To analyze a C program as it appears to an LLVM GPU backend at the end of LTO:
+
+.. code-block:: shell
+
+  $ clang -O2 -g -fopenmp --offload-arch=native test.c -foffload-lto \
+      -Rpass=kernel-info
+
+To analyze specified LLVM IR, perhaps previously generated by something like
+``clang -save-temps -g -fopenmp --offload-arch=native test.c``:
+
+.. code-block:: shell
+
+  $ opt -disable-output test-openmp-nvptx64-nvidia-cuda-sm_70.bc \
+      -pass-remarks=kernel-info -passes=kernel-info
+
+When specifying an LLVM pass pipeline on the command line, ``kernel-info``
+still runs at the end of LTO by default. ``-no-kernel-info-end-lto`` disables
+that behavior so you can position ``kernel-info`` explicitly:
+
+.. code-block:: shell
+
+  $ clang -O2 -g -fopenmp --offload-arch=native test.c -foffload-lto \
+      -Rpass=kernel-info \
+      -Xoffload-linker --lto-newpm-passes='lto<O2>'
+
+  $ clang -O2 -g -fopenmp --offload-arch=native test.c -foffload-lto \
+      -Rpass=kernel-info -mllvm -no-kernel-info-end-lto \
+      -Xoffload-linker --lto-newpm-passes='module(kernel-info),lto<O2>'
+
+  $ opt -disable-output test-openmp-nvptx64-nvidia-cuda-sm_70.bc \
+      -pass-remarks=kernel-info \
+      -passes='lto<O2>'
+
+  $ opt -disable-output test-openmp-nvptx64-nvidia-cuda-sm_70.bc \
+      -pass-remarks=kernel-info -no-kernel-info-end-lto \
+      -passes='module(kernel-info),lto<O2>'
diff --git a/llvm/docs/Passes.rst b/llvm/docs/Passes.rst
index 5e436db62be3a..435e748199b87 100644
--- a/llvm/docs/Passes.rst
+++ b/llvm/docs/Passes.rst
@@ -5,6 +5,11 @@ LLVM's Analysis and Transform Passes
 .. contents::
     :local:
 
+.. toctree::
+   :hidden:
+
+   KernelInfo
+
 Introduction
 ============
 .. warning:: This document is not updated frequently, and the list of passes
@@ -148,6 +153,12 @@ This pass collects the count of all instructions and reports them.
 Bookkeeping for "interesting" users of expressions computed from induction
 variables.
 
+``kernel-info``: GPU Kernel Info
+--------------------------------
+
+Reports various statistics for code compiled for GPUs. This pass is
+:doc:`documented separately <KernelInfo>`.
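For a quick sense of what the pass reports, its remarks look like the following lines, taken verbatim from the CHECK patterns of the tests added under ``llvm/test/Analysis/KernelInfo`` later in this patch:

  remark: test.c:14:9: in artificial function 'h', alloca ('%i') for 'i' with static size of 4 bytes
  remark: test.c:16:9: in artificial function 'h', alloca ('%adyn') for 'adyn' with dynamic size
  remark: test.c:13:0: in artificial function 'h', Allocas = 7
  remark: test.c:13:0: in artificial function 'h', AllocasStaticSizeSum = 28
  remark: test.c:13:0: in artificial function 'h', AllocasDyn = 2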
+
 ``lazy-value-info``: Lazy Value Information Analysis
 ----------------------------------------------------
diff --git a/llvm/include/llvm/Analysis/KernelInfo.h b/llvm/include/llvm/Analysis/KernelInfo.h
new file mode 100644
index 0000000000000..75d92c202212b
--- /dev/null
+++ b/llvm/include/llvm/Analysis/KernelInfo.h
@@ -0,0 +1,35 @@
+//=- KernelInfo.h - Kernel Analysis -------------------------------*- C++ -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the KernelInfoPrinter class used to emit remarks about
+// function properties from a GPU kernel.
+//
+// See llvm/docs/KernelInfo.rst.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ANALYSIS_KERNELINFO_H
+#define LLVM_ANALYSIS_KERNELINFO_H
+
+#include "llvm/IR/PassManager.h"
+
+namespace llvm {
+
+class TargetMachine;
+
+class KernelInfoPrinter : public PassInfoMixin<KernelInfoPrinter> {
+  TargetMachine *TM;
+
+public:
+  explicit KernelInfoPrinter(TargetMachine *TM) : TM(TM) {}
+
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+
+  static bool isRequired() { return true; }
+};
+} // namespace llvm
+#endif // LLVM_ANALYSIS_KERNELINFO_H
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 71b204f9c3fec..bcd3de9b7a6e4 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1886,6 +1886,11 @@ class TargetTransformInfo {
 
   /// @}
 
+  /// Collect kernel launch bounds for \p F into \p LB.
+  void collectKernelLaunchBounds(
+      const Function &F,
+      SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const;
+
 private:
   /// The abstract base class used to type erase specific TTI
   /// implementations.
@@ -2324,6 +2329,9 @@ class TargetTransformInfo::Concept {
   virtual unsigned getMaxNumArgs() const = 0;
   virtual unsigned getNumBytesToPadGlobalArray(unsigned Size,
                                                Type *ArrayType) const = 0;
+  virtual void collectKernelLaunchBounds(
+      const Function &F,
+      SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const = 0;
 };
 
 template <typename T>
@@ -3169,6 +3177,12 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
                                        Type *ArrayType) const override {
     return Impl.getNumBytesToPadGlobalArray(Size, ArrayType);
   }
+
+  void collectKernelLaunchBounds(
+      const Function &F,
+      SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const override {
+    Impl.collectKernelLaunchBounds(F, LB);
+  }
 };
 
 template <typename T>
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index dcef4a1abcfa3..b51663adcd8d0 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -1049,6 +1049,10 @@ class TargetTransformInfoImplBase {
     return 0;
   }
 
+  void collectKernelLaunchBounds(
+      const Function &F,
+      SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const {}
+
 protected:
   // Obtain the minimum required size to hold the value (without the sign)
   // In case of a vector it returns the min required size for one element.
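As a reference for other targets, a minimal implementation of the new TTI hook follows the same shape as the AMDGPU and NVPTX versions later in this patch. The following is only a sketch: ``MyTTIImpl`` and the ``"my-max-threads"`` attribute are hypothetical placeholders, not part of the patch.

  // Sketch only: MyTTIImpl and "my-max-threads" are placeholders. kernel-info
  // emits one "name = value" remark per (name, value) pair pushed here.
  void MyTTIImpl::collectKernelLaunchBounds(
      const Function &F,
      SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const {
    if (F.hasFnAttribute("my-max-threads")) {
      int64_t Val = F.getFnAttributeAsParsedInteger("my-max-threads");
      LB.push_back({"my-max-threads", Val});
    }
  }

Targets with no launch-bound attributes can simply rely on the empty default in ``TargetTransformInfoImplBase`` above.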
diff --git a/llvm/include/llvm/IR/Function.h b/llvm/include/llvm/IR/Function.h index e7afcbd31420c..fcd5396ccfdbc 100644 --- a/llvm/include/llvm/IR/Function.h +++ b/llvm/include/llvm/IR/Function.h @@ -284,6 +284,18 @@ class LLVM_ABI Function : public GlobalObject, public ilist_node { setValueSubclassData((getSubclassDataFromValue() & 0xc00f) | (ID << 4)); } + /// Does it have a kernel calling convention? + bool hasKernelCallingConv() const { + switch (getCallingConv()) { + default: + return false; + case CallingConv::PTX_Kernel: + case CallingConv::AMDGPU_KERNEL: + case CallingConv::SPIR_KERNEL: + return true; + } + } + enum ProfileCountType { PCT_Real, PCT_Synthetic }; /// Class to represent profile counts. diff --git a/llvm/include/llvm/Target/TargetMachine.h b/llvm/include/llvm/Target/TargetMachine.h index 9bdb110bd3683..ec581ae5f5b8e 100644 --- a/llvm/include/llvm/Target/TargetMachine.h +++ b/llvm/include/llvm/Target/TargetMachine.h @@ -19,6 +19,7 @@ #include "llvm/MC/MCStreamer.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/CodeGen.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Error.h" #include "llvm/Support/PGOOptions.h" #include "llvm/Target/CGPassBuilderOption.h" @@ -28,6 +29,8 @@ #include #include +extern llvm::cl::opt NoKernelInfoEndLTO; + namespace llvm { class AAManager; diff --git a/llvm/lib/Analysis/CMakeLists.txt b/llvm/lib/Analysis/CMakeLists.txt index 0db5b80f336cb..a44f6c6a135ef 100644 --- a/llvm/lib/Analysis/CMakeLists.txt +++ b/llvm/lib/Analysis/CMakeLists.txt @@ -79,6 +79,7 @@ add_llvm_component_library(LLVMAnalysis InstructionPrecedenceTracking.cpp InstructionSimplify.cpp InteractiveModelRunner.cpp + KernelInfo.cpp LastRunTrackingAnalysis.cpp LazyBranchProbabilityInfo.cpp LazyBlockFrequencyInfo.cpp diff --git a/llvm/lib/Analysis/KernelInfo.cpp b/llvm/lib/Analysis/KernelInfo.cpp new file mode 100644 index 0000000000000..4a06fd5943089 --- /dev/null +++ b/llvm/lib/Analysis/KernelInfo.cpp @@ -0,0 +1,326 @@ +//===- KernelInfo.cpp - Kernel Analysis -----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the KernelInfoPrinter class used to emit remarks about +// function properties from a GPU kernel. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Analysis/KernelInfo.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/IR/DebugInfo.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Metadata.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/PassManager.h" +#include "llvm/Passes/PassBuilder.h" +#include "llvm/Target/TargetMachine.h" + +using namespace llvm; + +#define DEBUG_TYPE "kernel-info" + +namespace { + +/// Data structure holding function info for kernels. +class KernelInfo { + void updateForBB(const BasicBlock &BB, OptimizationRemarkEmitter &ORE); + +public: + static void emitKernelInfo(Function &F, FunctionAnalysisManager &FAM, + TargetMachine *TM); + + /// Whether the function has external linkage and is not a kernel function. + bool ExternalNotKernel = false; + + /// Launch bounds. 
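+  /// Each entry is a (name, value) pair, e.g. ("omp_target_num_teams", 100);
+  /// emitKernelInfo reports one remark per entry.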
+ SmallVector> LaunchBounds; + + /// The number of alloca instructions inside the function, the number of those + /// with allocation sizes that cannot be determined at compile time, and the + /// sum of the sizes that can be. + /// + /// With the current implementation for at least some GPU archs, + /// AllocasDyn > 0 might not be possible, but we report AllocasDyn anyway in + /// case the implementation changes. + int64_t Allocas = 0; + int64_t AllocasDyn = 0; + int64_t AllocasStaticSizeSum = 0; + + /// Number of direct/indirect calls (anything derived from CallBase). + int64_t DirectCalls = 0; + int64_t IndirectCalls = 0; + + /// Number of direct calls made from this function to other functions + /// defined in this module. + int64_t DirectCallsToDefinedFunctions = 0; + + /// Number of direct calls to inline assembly. + int64_t InlineAssemblyCalls = 0; + + /// Number of calls of type InvokeInst. + int64_t Invokes = 0; + + /// Target-specific flat address space. + unsigned FlatAddrspace; + + /// Number of flat address space memory accesses (via load, store, etc.). + int64_t FlatAddrspaceAccesses = 0; +}; + +} // end anonymous namespace + +static void identifyCallee(OptimizationRemark &R, const Module *M, + const Value *V, StringRef Kind = "") { + SmallString<100> Name; // might be function name or asm expression + if (const Function *F = dyn_cast(V)) { + if (auto *SubProgram = F->getSubprogram()) { + if (SubProgram->isArtificial()) + R << "artificial "; + Name = SubProgram->getName(); + } + } + if (Name.empty()) { + raw_svector_ostream OS(Name); + V->printAsOperand(OS, /*PrintType=*/false, M); + } + if (!Kind.empty()) + R << Kind << " "; + R << "'" << Name << "'"; +} + +static void identifyFunction(OptimizationRemark &R, const Function &F) { + identifyCallee(R, F.getParent(), &F, "function"); +} + +static void remarkAlloca(OptimizationRemarkEmitter &ORE, const Function &Caller, + const AllocaInst &Alloca, + TypeSize::ScalarTy StaticSize) { + ORE.emit([&] { + StringRef DbgName; + DebugLoc Loc; + bool Artificial = false; + auto DVRs = findDVRDeclares(&const_cast(Alloca)); + if (!DVRs.empty()) { + const DbgVariableRecord &DVR = **DVRs.begin(); + DbgName = DVR.getVariable()->getName(); + Loc = DVR.getDebugLoc(); + Artificial = DVR.Variable->isArtificial(); + } + OptimizationRemark R(DEBUG_TYPE, "Alloca", DiagnosticLocation(Loc), + Alloca.getParent()); + R << "in "; + identifyFunction(R, Caller); + R << ", "; + if (Artificial) + R << "artificial "; + SmallString<20> ValName; + raw_svector_ostream OS(ValName); + Alloca.printAsOperand(OS, /*PrintType=*/false, Caller.getParent()); + R << "alloca ('" << ValName << "') "; + if (!DbgName.empty()) + R << "for '" << DbgName << "' "; + else + R << "without debug info "; + R << "with "; + if (StaticSize) + R << "static size of " << itostr(StaticSize) << " bytes"; + else + R << "dynamic size"; + return R; + }); +} + +static void remarkCall(OptimizationRemarkEmitter &ORE, const Function &Caller, + const CallBase &Call, StringRef CallKind, + StringRef RemarkKind) { + ORE.emit([&] { + OptimizationRemark R(DEBUG_TYPE, RemarkKind, &Call); + R << "in "; + identifyFunction(R, Caller); + R << ", " << CallKind << ", callee is "; + identifyCallee(R, Caller.getParent(), Call.getCalledOperand()); + return R; + }); +} + +static void remarkFlatAddrspaceAccess(OptimizationRemarkEmitter &ORE, + const Function &Caller, + const Instruction &Inst) { + ORE.emit([&] { + OptimizationRemark R(DEBUG_TYPE, "FlatAddrspaceAccess", &Inst); + R << "in "; + identifyFunction(R, 
Caller); + if (const IntrinsicInst *II = dyn_cast(&Inst)) { + R << ", '" << II->getCalledFunction()->getName() << "' call"; + } else { + R << ", '" << Inst.getOpcodeName() << "' instruction"; + } + if (!Inst.getType()->isVoidTy()) { + SmallString<20> Name; + raw_svector_ostream OS(Name); + Inst.printAsOperand(OS, /*PrintType=*/false, Caller.getParent()); + R << " ('" << Name << "')"; + } + R << " accesses memory in flat address space"; + return R; + }); +} + +void KernelInfo::updateForBB(const BasicBlock &BB, + OptimizationRemarkEmitter &ORE) { + const Function &F = *BB.getParent(); + const Module &M = *F.getParent(); + const DataLayout &DL = M.getDataLayout(); + for (const Instruction &I : BB.instructionsWithoutDebug()) { + if (const AllocaInst *Alloca = dyn_cast(&I)) { + ++Allocas; + TypeSize::ScalarTy StaticSize = 0; + if (std::optional Size = Alloca->getAllocationSize(DL)) { + StaticSize = Size->getFixedValue(); + assert(StaticSize <= std::numeric_limits::max()); + AllocasStaticSizeSum += StaticSize; + } else { + ++AllocasDyn; + } + remarkAlloca(ORE, F, *Alloca, StaticSize); + } else if (const CallBase *Call = dyn_cast(&I)) { + SmallString<40> CallKind; + SmallString<40> RemarkKind; + if (Call->isIndirectCall()) { + ++IndirectCalls; + CallKind += "indirect"; + RemarkKind += "Indirect"; + } else { + ++DirectCalls; + CallKind += "direct"; + RemarkKind += "Direct"; + } + if (isa(Call)) { + ++Invokes; + CallKind += " invoke"; + RemarkKind += "Invoke"; + } else { + CallKind += " call"; + RemarkKind += "Call"; + } + if (!Call->isIndirectCall()) { + if (const Function *Callee = Call->getCalledFunction()) { + if (!Callee->isIntrinsic() && !Callee->isDeclaration()) { + ++DirectCallsToDefinedFunctions; + CallKind += " to defined function"; + RemarkKind += "ToDefinedFunction"; + } + } else if (Call->isInlineAsm()) { + ++InlineAssemblyCalls; + CallKind += " to inline assembly"; + RemarkKind += "ToInlineAssembly"; + } + } + remarkCall(ORE, F, *Call, CallKind, RemarkKind); + if (const AnyMemIntrinsic *MI = dyn_cast(Call)) { + if (MI->getDestAddressSpace() == FlatAddrspace) { + ++FlatAddrspaceAccesses; + remarkFlatAddrspaceAccess(ORE, F, I); + } else if (const AnyMemTransferInst *MT = + dyn_cast(MI)) { + if (MT->getSourceAddressSpace() == FlatAddrspace) { + ++FlatAddrspaceAccesses; + remarkFlatAddrspaceAccess(ORE, F, I); + } + } + } + } else if (const LoadInst *Load = dyn_cast(&I)) { + if (Load->getPointerAddressSpace() == FlatAddrspace) { + ++FlatAddrspaceAccesses; + remarkFlatAddrspaceAccess(ORE, F, I); + } + } else if (const StoreInst *Store = dyn_cast(&I)) { + if (Store->getPointerAddressSpace() == FlatAddrspace) { + ++FlatAddrspaceAccesses; + remarkFlatAddrspaceAccess(ORE, F, I); + } + } else if (const AtomicRMWInst *At = dyn_cast(&I)) { + if (At->getPointerAddressSpace() == FlatAddrspace) { + ++FlatAddrspaceAccesses; + remarkFlatAddrspaceAccess(ORE, F, I); + } + } else if (const AtomicCmpXchgInst *At = dyn_cast(&I)) { + if (At->getPointerAddressSpace() == FlatAddrspace) { + ++FlatAddrspaceAccesses; + remarkFlatAddrspaceAccess(ORE, F, I); + } + } + } +} + +static void remarkProperty(OptimizationRemarkEmitter &ORE, const Function &F, + StringRef Name, int64_t Value) { + ORE.emit([&] { + OptimizationRemark R(DEBUG_TYPE, Name, &F); + R << "in "; + identifyFunction(R, F); + R << ", " << Name << " = " << itostr(Value); + return R; + }); +} + +static std::optional parseFnAttrAsInteger(Function &F, + StringRef Name) { + if (!F.hasFnAttribute(Name)) + return std::nullopt; + return 
F.getFnAttributeAsParsedInteger(Name); +} + +void KernelInfo::emitKernelInfo(Function &F, FunctionAnalysisManager &FAM, + TargetMachine *TM) { + KernelInfo KI; + TargetTransformInfo &TheTTI = FAM.getResult(F); + KI.FlatAddrspace = TheTTI.getFlatAddressSpace(); + + // Record function properties. + KI.ExternalNotKernel = F.hasExternalLinkage() && !F.hasKernelCallingConv(); + for (StringRef Name : {"omp_target_num_teams", "omp_target_thread_limit"}) { + if (auto Val = parseFnAttrAsInteger(F, Name)) + KI.LaunchBounds.push_back({Name, *Val}); + } + TheTTI.collectKernelLaunchBounds(F, KI.LaunchBounds); + + auto &ORE = FAM.getResult(F); + for (const auto &BB : F) + KI.updateForBB(BB, ORE); + +#define REMARK_PROPERTY(PROP_NAME) \ + remarkProperty(ORE, F, #PROP_NAME, KI.PROP_NAME) + REMARK_PROPERTY(ExternalNotKernel); + for (auto LB : KI.LaunchBounds) + remarkProperty(ORE, F, LB.first, LB.second); + REMARK_PROPERTY(Allocas); + REMARK_PROPERTY(AllocasStaticSizeSum); + REMARK_PROPERTY(AllocasDyn); + REMARK_PROPERTY(DirectCalls); + REMARK_PROPERTY(IndirectCalls); + REMARK_PROPERTY(DirectCallsToDefinedFunctions); + REMARK_PROPERTY(InlineAssemblyCalls); + REMARK_PROPERTY(Invokes); + REMARK_PROPERTY(FlatAddrspaceAccesses); +#undef REMARK_PROPERTY + + return; +} + +PreservedAnalyses KernelInfoPrinter::run(Function &F, + FunctionAnalysisManager &AM) { + // Skip it if remarks are not enabled as it will do nothing useful. + if (F.getContext().getDiagHandlerPtr()->isPassedOptRemarkEnabled(DEBUG_TYPE)) + KernelInfo::emitKernelInfo(F, AM, TM); + return PreservedAnalyses::all(); +} diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index 8b9722d047edc..424bb7be23383 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -1441,6 +1441,12 @@ TargetTransformInfo::getNumBytesToPadGlobalArray(unsigned Size, return TTIImpl->getNumBytesToPadGlobalArray(Size, ArrayType); } +void TargetTransformInfo::collectKernelLaunchBounds( + const Function &F, + SmallVectorImpl> &LB) const { + return TTIImpl->collectKernelLaunchBounds(F, LB); +} + TargetTransformInfo::Concept::~Concept() = default; TargetIRAnalysis::TargetIRAnalysis() : TTICallback(&getDefaultTTI) {} diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 1e97cef22045d..328da2a185f02 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -46,6 +46,7 @@ #include "llvm/Analysis/InlineAdvisor.h" #include "llvm/Analysis/InlineSizeEstimatorAnalysis.h" #include "llvm/Analysis/InstCount.h" +#include "llvm/Analysis/KernelInfo.h" #include "llvm/Analysis/LastRunTrackingAnalysis.h" #include "llvm/Analysis/LazyCallGraph.h" #include "llvm/Analysis/LazyValueInfo.h" diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index 0eb050c8adb04..9300a3dfca1dc 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -391,6 +391,7 @@ FUNCTION_PASS("irce", IRCEPass()) FUNCTION_PASS("jump-threading", JumpThreadingPass()) FUNCTION_PASS("jump-table-to-switch", JumpTableToSwitchPass()); FUNCTION_PASS("kcfi", KCFIPass()) +FUNCTION_PASS("kernel-info", KernelInfoPrinter(TM)) FUNCTION_PASS("lcssa", LCSSAPass()) FUNCTION_PASS("libcalls-shrinkwrap", LibCallsShrinkWrapPass()) FUNCTION_PASS("lint", LintPass()) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 1f29589146c80..5b2081c8fa213 100644 --- 
a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -53,6 +53,7 @@ #include "Utils/AMDGPUBaseInfo.h" #include "llvm/Analysis/CGSCCPassManager.h" #include "llvm/Analysis/CallGraphSCCPass.h" +#include "llvm/Analysis/KernelInfo.h" #include "llvm/Analysis/UniformityAnalysis.h" #include "llvm/CodeGen/AtomicExpand.h" #include "llvm/CodeGen/DeadMachineInstructionElim.h" @@ -879,6 +880,11 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { PM.addPass(AMDGPUAttributorPass(*this, Opt)); } } + if (!NoKernelInfoEndLTO) { + FunctionPassManager FPM; + FPM.addPass(KernelInfoPrinter(this)); + PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); + } }); PB.registerRegClassFilterParsingCallback( diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 5160851f8c442..5bfd8914b9a46 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -1430,3 +1430,19 @@ unsigned GCNTTIImpl::getPrefetchDistance() const { bool GCNTTIImpl::shouldPrefetchAddressSpace(unsigned AS) const { return AMDGPU::isFlatGlobalAddrSpace(AS); } + +void GCNTTIImpl::collectKernelLaunchBounds( + const Function &F, + SmallVectorImpl> &LB) const { + SmallVector MaxNumWorkgroups = ST->getMaxNumWorkGroups(F); + LB.push_back({"amdgpu-max-num-workgroups[0]", MaxNumWorkgroups[0]}); + LB.push_back({"amdgpu-max-num-workgroups[1]", MaxNumWorkgroups[1]}); + LB.push_back({"amdgpu-max-num-workgroups[2]", MaxNumWorkgroups[2]}); + std::pair FlatWorkGroupSize = + ST->getFlatWorkGroupSizes(F); + LB.push_back({"amdgpu-flat-work-group-size[0]", FlatWorkGroupSize.first}); + LB.push_back({"amdgpu-flat-work-group-size[1]", FlatWorkGroupSize.second}); + std::pair WavesPerEU = ST->getWavesPerEU(F); + LB.push_back({"amdgpu-waves-per-eu[0]", WavesPerEU.first}); + LB.push_back({"amdgpu-waves-per-eu[1]", WavesPerEU.second}); +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h index 585f38fc02c29..a0d62008d9ddc 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -273,6 +273,9 @@ class GCNTTIImpl final : public BasicTTIImplBase { /// \return if target want to issue a prefetch in address space \p AS. 
bool shouldPrefetchAddressSpace(unsigned AS) const override; + void collectKernelLaunchBounds( + const Function &F, + SmallVectorImpl> &LB) const; }; } // end namespace llvm diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp index 6d4b82aa54a2b..e88027f30a03c 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -21,6 +21,7 @@ #include "NVPTXTargetObjectFile.h" #include "NVPTXTargetTransformInfo.h" #include "TargetInfo/NVPTXTargetInfo.h" +#include "llvm/Analysis/KernelInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetPassConfig.h" @@ -266,6 +267,15 @@ void NVPTXTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { FPM.addPass(NVPTXCopyByValArgsPass()); PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); }); + + if (!NoKernelInfoEndLTO) { + PB.registerFullLinkTimeOptimizationLastEPCallback( + [this](ModulePassManager &PM, OptimizationLevel Level) { + FunctionPassManager FPM; + FPM.addPass(KernelInfoPrinter(this)); + PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); + }); + } } TargetTransformInfo diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp index 4ec2ec100ab08..85e99d7fe97a2 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp @@ -562,4 +562,18 @@ Value *NVPTXTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, } } return nullptr; -} \ No newline at end of file +} + +void NVPTXTTIImpl::collectKernelLaunchBounds( + const Function &F, + SmallVectorImpl> &LB) const { + std::optional Val; + if ((Val = getMaxClusterRank(F))) + LB.push_back({"maxclusterrank", *Val}); + if ((Val = getMaxNTIDx(F))) + LB.push_back({"maxntidx", *Val}); + if ((Val = getMaxNTIDy(F))) + LB.push_back({"maxntidy", *Val}); + if ((Val = getMaxNTIDz(F))) + LB.push_back({"maxntidz", *Val}); +} diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h index 0f4fb280b2d99..b0a846a9c7f96 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h +++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h @@ -129,6 +129,10 @@ class NVPTXTTIImpl : public BasicTTIImplBase { Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV, Value *NewV) const; + + void collectKernelLaunchBounds( + const Function &F, + SmallVectorImpl> &LB) const; }; } // end namespace llvm diff --git a/llvm/lib/Target/TargetMachine.cpp b/llvm/lib/Target/TargetMachine.cpp index d5365f3c04743..027ae62007a72 100644 --- a/llvm/lib/Target/TargetMachine.cpp +++ b/llvm/lib/Target/TargetMachine.cpp @@ -26,6 +26,11 @@ #include "llvm/Target/TargetLoweringObjectFile.h" using namespace llvm; +cl::opt NoKernelInfoEndLTO( + "no-kernel-info-end-lto", + cl::desc("remove the kernel-info pass at the end of the full LTO pipeline"), + cl::init(false), cl::Hidden); + //--------------------------------------------------------------------------- // TargetMachine Class // diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp index 10008130016c3..682227916e712 100644 --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -5905,17 +5905,6 @@ bool llvm::omp::isOpenMPKernel(Function &Fn) { return Fn.hasFnAttribute("kernel"); } -static bool isKernelCC(Function &F) { - switch (F.getCallingConv()) 
{ - default: - return false; - case CallingConv::PTX_Kernel: - case CallingConv::AMDGPU_KERNEL: - case CallingConv::SPIR_KERNEL: - return true; - } -} - KernelSet llvm::omp::getDeviceKernels(Module &M) { // TODO: Create a more cross-platform way of determining device kernels. KernelSet Kernels; @@ -5948,7 +5937,7 @@ KernelSet llvm::omp::getDeviceKernels(Module &M) { } for (Function &F : M) - if (isKernelCC(F)) + if (F.hasKernelCallingConv()) ProcessKernel(F); return Kernels; diff --git a/llvm/test/Analysis/KernelInfo/allocas.ll b/llvm/test/Analysis/KernelInfo/allocas.ll new file mode 100644 index 0000000000000..94506645f7ec6 --- /dev/null +++ b/llvm/test/Analysis/KernelInfo/allocas.ll @@ -0,0 +1,117 @@ +; Check info on allocas. + +; RUN: opt -pass-remarks=kernel-info -passes=kernel-info \ +; RUN: -disable-output %s 2>&1 | \ +; RUN: FileCheck -match-full-lines %s + +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +define void @h() !dbg !100 { +entry: + ; CHECK: remark: test.c:0:0: in artificial function 'h', artificial alloca ('%dyn_ptr.addr') for 'dyn_ptr' with static size of 8 bytes + %dyn_ptr.addr = alloca ptr, align 8 + ; CHECK: remark: test.c:14:9: in artificial function 'h', alloca ('%i') for 'i' with static size of 4 bytes + %i = alloca i32, align 4 + ; CHECK: remark: test.c:15:9: in artificial function 'h', alloca ('%a') for 'a' with static size of 8 bytes + %a = alloca [2 x i32], align 4 + %size = load i32, ptr %i, align 4 + ; CHECK: remark: test.c:16:9: in artificial function 'h', alloca ('%adyn') for 'adyn' with dynamic size + %adyn = alloca i32, i32 %size, align 4 + ; CHECK: remark: :0:0: in artificial function 'h', alloca ('%nodbg') without debug info with static size of 4 bytes + %nodbg = alloca i32, align 4 + tail call void @llvm.dbg.declare(metadata ptr %dyn_ptr.addr, metadata !110, metadata !DIExpression()), !dbg !114 + tail call void @llvm.dbg.declare(metadata ptr %i, metadata !120, metadata !DIExpression()), !dbg !121 + tail call void @llvm.dbg.declare(metadata ptr %a, metadata !130, metadata !DIExpression()), !dbg !131 + tail call void @llvm.dbg.declare(metadata ptr %adyn, metadata !140, metadata !DIExpression()), !dbg !141 + br label %non-entry + +non-entry: + ; CHECK: remark: test.c:17:9: in artificial function 'h', alloca ('%i2') for 'i2' with static size of 4 bytes + %i2 = alloca i32, align 4 + %size2 = load i32, ptr %i2, align 4 + ; CHECK: remark: test.c:18:9: in artificial function 'h', alloca ('%adyn2') for 'adyn2' with dynamic size + %adyn2 = alloca i32, i32 %size, align 4 + tail call void @llvm.dbg.declare(metadata ptr %i2, metadata !150, metadata !DIExpression()), !dbg !151 + tail call void @llvm.dbg.declare(metadata ptr %adyn2, metadata !160, metadata !DIExpression()), !dbg !161 + ret void +} +; CHECK: remark: test.c:13:0: in artificial function 'h', Allocas = 7 +; CHECK: remark: test.c:13:0: in artificial function 'h', AllocasStaticSizeSum = 28 +; CHECK: remark: test.c:13:0: in artificial function 'h', AllocasDyn = 2 + +define void @g() !dbg !200 { +entry: + ; CHECK: remark: test.c:4:7: in function 'g', alloca ('%i') for 'i' with static size of 4 bytes + %i = alloca i32, align 4 + ; CHECK: remark: test.c:5:7: in function 'g', alloca ('%a') for 'a' with static size of 8 bytes + %a = alloca [2 x i32], align 4 + tail call void @llvm.dbg.declare(metadata ptr %i, metadata !210, metadata !DIExpression()), !dbg !211 + tail call void @llvm.dbg.declare(metadata ptr %a, metadata !220, metadata !DIExpression()), 
!dbg !221 + ret void +} +; CHECK: remark: test.c:3:0: in function 'g', Allocas = 2 +; CHECK: remark: test.c:3:0: in function 'g', AllocasStaticSizeSum = 12 +; CHECK: remark: test.c:3:0: in function 'g', AllocasDyn = 0 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare void @llvm.dbg.declare(metadata, metadata, metadata) #0 + +; uselistorder directives +uselistorder ptr @llvm.dbg.declare, { 7, 6, 5, 4, 3, 2, 1, 0 } + +attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } + +!llvm.module.flags = !{!0} +!llvm.dbg.cu = !{!1} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !DICompileUnit(language: DW_LANG_C11, file: !2, producer: "clang version 19.0.0git", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) +!2 = !DIFile(filename: "test.c", directory: "/tmp") +!3 = !{null} +!4 = !{} + +!10 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) + +!20 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !21) +!21 = !DIDerivedType(tag: DW_TAG_restrict_type, baseType: !22) +!22 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: null, size: 64) + +!30 = !DICompositeType(tag: DW_TAG_array_type, baseType: !10, size: 64, elements: !31) +!31 = !{!32} +!32 = !DISubrange(count: 2) + +!40 = !DICompositeType(tag: DW_TAG_array_type, baseType: !10, elements: !41) +!41 = !{!42} +!42 = !DISubrange(count: !43) +!43 = !DILocalVariable(name: "__vla_expr0", scope: !100, type: !10, flags: DIFlagArtificial) + +!100 = distinct !DISubprogram(name: "h", scope: !2, file: !2, line: 13, type: !101, scopeLine: 13, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !4) +!101 = distinct !DISubroutineType(types: !3) + +!110 = !DILocalVariable(name: "dyn_ptr", arg: 1, scope: !100, type: !20, flags: DIFlagArtificial) +!114 = !DILocation(line: 0, scope: !100) + +!120 = !DILocalVariable(name: "i", scope: !100, file: !2, line: 14, type: !10) +!121 = !DILocation(line: 14, column: 9, scope: !100) + +!130 = !DILocalVariable(name: "a", scope: !100, file: !2, line: 15, type: !30) +!131 = !DILocation(line: 15, column: 9, scope: !100) + +!140 = !DILocalVariable(name: "adyn", scope: !100, file: !2, line: 16, type: !40) +!141 = !DILocation(line: 16, column: 9, scope: !100) + +!150 = !DILocalVariable(name: "i2", scope: !100, file: !2, line: 17, type: !10) +!151 = !DILocation(line: 17, column: 9, scope: !100) + +!160 = !DILocalVariable(name: "adyn2", scope: !100, file: !2, line: 18, type: !40) +!161 = !DILocation(line: 18, column: 9, scope: !100) + +!200 = distinct !DISubprogram(name: "g", scope: !2, file: !2, line: 3, type: !201, scopeLine: 3, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !4) +!201 = !DISubroutineType(types: !3) + +!210 = !DILocalVariable(name: "i", scope: !200, file: !2, line: 4, type: !10) +!211 = !DILocation(line: 4, column: 7, scope: !200) + +!220 = !DILocalVariable(name: "a", scope: !200, file: !2, line: 5, type: !30) +!221 = !DILocation(line: 5, column: 7, scope: !200) diff --git a/llvm/test/Analysis/KernelInfo/calls.ll b/llvm/test/Analysis/KernelInfo/calls.ll new file mode 100644 index 0000000000000..6a2a5c426b78b --- /dev/null +++ b/llvm/test/Analysis/KernelInfo/calls.ll @@ -0,0 +1,139 @@ +; Check info on calls. 
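+; Each call site below is counted once as direct or indirect and once as call
+; or invoke; direct calls to functions defined in this module and calls to
+; inline assembly are additionally counted under their own properties, matching
+; the classification in KernelInfo::updateForBB.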
+ +; RUN: opt -pass-remarks=kernel-info -passes=kernel-info \ +; RUN: -disable-output %s 2>&1 | \ +; RUN: FileCheck -match-full-lines %s + +target datalayout = "e-i65:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +declare void @personality() + +define void @h() personality ptr @personality !dbg !100 { +entry: + ; CHECK: remark: test.c:16:5: in artificial function 'h_dbg', direct call, callee is '@f' + call void @f(), !dbg !102 + ; CHECK: remark: test.c:17:5: in artificial function 'h_dbg', direct call to defined function, callee is 'g_dbg' + call void @g(), !dbg !104 + ; CHECK: remark: test.c:18:5: in artificial function 'h_dbg', direct call to defined function, callee is artificial 'h_dbg' + call void @h(), !dbg !105 + ; CHECK: remark: test.c:24:5: in artificial function 'h_dbg', direct call to inline assembly, callee is 'asm sideeffect "eieio", ""' + call void asm sideeffect "eieio", ""(), !dbg !111 + %fnPtr = load ptr, ptr null, align 8 + ; CHECK: remark: test.c:19:5: in artificial function 'h_dbg', indirect call, callee is '%fnPtr' + call void %fnPtr(), !dbg !106 + ; CHECK: remark: test.c:20:5: in artificial function 'h_dbg', direct invoke, callee is '@f' + invoke void @f() to label %fcont unwind label %cleanup, !dbg !107 +fcont: + ; CHECK: remark: test.c:21:5: in artificial function 'h_dbg', direct invoke to defined function, callee is 'g_dbg' + invoke void @g() to label %gcont unwind label %cleanup, !dbg !108 +gcont: + ; CHECK: remark: test.c:22:5: in artificial function 'h_dbg', direct invoke to defined function, callee is artificial 'h_dbg' + invoke void @h() to label %hcont unwind label %cleanup, !dbg !109 +hcont: + ; CHECK: remark: test.c:25:5: in artificial function 'h_dbg', direct invoke to inline assembly, callee is 'asm sideeffect "eieio", ""' + invoke void asm sideeffect "eieio", ""() to label %asmcont unwind label %cleanup, !dbg !112 +asmcont: + ; CHECK: remark: test.c:23:5: in artificial function 'h_dbg', indirect invoke, callee is '%fnPtr' + invoke void %fnPtr() to label %end unwind label %cleanup, !dbg !110 +cleanup: + %ll = landingpad { ptr, i32 } + cleanup + br label %end +end: + ret void +} +; CHECK: remark: test.c:13:0: in artificial function 'h_dbg', DirectCalls = 8 +; CHECK: remark: test.c:13:0: in artificial function 'h_dbg', IndirectCalls = 2 +; CHECK: remark: test.c:13:0: in artificial function 'h_dbg', DirectCallsToDefinedFunctions = 4 +; CHECK: remark: test.c:13:0: in artificial function 'h_dbg', InlineAssemblyCalls = 2 +; CHECK: remark: test.c:13:0: in artificial function 'h_dbg', Invokes = 5 + +declare void @f() + +define void @g() personality ptr @personality !dbg !200 { +entry: + ; CHECK: remark: test.c:6:3: in function 'g_dbg', direct call, callee is '@f' + call void @f(), !dbg !202 + ; CHECK: remark: test.c:7:3: in function 'g_dbg', direct call to defined function, callee is 'g_dbg' + call void @g(), !dbg !203 + ; CHECK: remark: test.c:8:3: in function 'g_dbg', direct call to defined function, callee is artificial 'h_dbg' + call void @h(), !dbg !204 + ; CHECK: remark: test.c:14:3: in function 'g_dbg', direct call to inline assembly, callee is 'asm sideeffect "eieio", ""' + call void asm sideeffect "eieio", ""(), !dbg !210 + %fnPtr = load ptr, ptr null, align 8 + ; CHECK: remark: test.c:9:3: in function 'g_dbg', indirect call, callee is '%fnPtr' + call void %fnPtr(), !dbg !205 + ; CHECK: remark: test.c:10:3: in function 'g_dbg', direct invoke, callee is '@f' + invoke void @f() to label %fcont unwind label %cleanup, !dbg 
!206 +fcont: + ; CHECK: remark: test.c:11:3: in function 'g_dbg', direct invoke to defined function, callee is 'g_dbg' + invoke void @g() to label %gcont unwind label %cleanup, !dbg !207 +gcont: + ; CHECK: remark: test.c:12:3: in function 'g_dbg', direct invoke to defined function, callee is artificial 'h_dbg' + invoke void @h() to label %hcont unwind label %cleanup, !dbg !208 +hcont: + ; CHECK: remark: test.c:15:3: in function 'g_dbg', direct invoke to inline assembly, callee is 'asm sideeffect "eieio", ""' + invoke void asm sideeffect "eieio", ""() to label %asmcont unwind label %cleanup, !dbg !211 +asmcont: + ; CHECK: remark: test.c:13:3: in function 'g_dbg', indirect invoke, callee is '%fnPtr' + invoke void %fnPtr() to label %end unwind label %cleanup, !dbg !209 +cleanup: + %ll = landingpad { ptr, i32 } + cleanup + br label %end +end: + ret void +} +; CHECK: remark: test.c:3:0: in function 'g_dbg', DirectCalls = 8 +; CHECK: remark: test.c:3:0: in function 'g_dbg', IndirectCalls = 2 +; CHECK: remark: test.c:3:0: in function 'g_dbg', DirectCallsToDefinedFunctions = 4 +; CHECK: remark: test.c:3:0: in function 'g_dbg', InlineAssemblyCalls = 2 +; CHECK: remark: test.c:3:0: in function 'g_dbg', Invokes = 5 + +define void @i() { + ; CHECK: remark: :0:0: in function '@i', direct call, callee is '@f' + call void @f() + ret void +} +; CHECK: remark: :0:0: in function '@i', DirectCalls = 1 +; CHECK: remark: :0:0: in function '@i', IndirectCalls = 0 +; CHECK: remark: :0:0: in function '@i', DirectCallsToDefinedFunctions = 0 +; CHECK: remark: :0:0: in function '@i', InlineAssemblyCalls = 0 +; CHECK: remark: :0:0: in function '@i', Invokes = 0 + +!llvm.module.flags = !{!0} +!llvm.dbg.cu = !{!1} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !DICompileUnit(language: DW_LANG_C11, file: !2, producer: "clang version 19.0.0git", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) +!2 = !DIFile(filename: "test.c", directory: "/tmp") +!3 = !{null} +!4 = !{} + +!100 = distinct !DISubprogram(name: "h_dbg", scope: !2, file: !2, line: 13, type: !101, scopeLine: 13, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !4) +!101 = distinct !DISubroutineType(types: !3) +!102 = !DILocation(line: 16, column: 5, scope: !103) +!103 = distinct !DILexicalBlock(scope: !100, file: !2, line: 13, column: 3) +!104 = !DILocation(line: 17, column: 5, scope: !103) +!105 = !DILocation(line: 18, column: 5, scope: !103) +!106 = !DILocation(line: 19, column: 5, scope: !103) +!107 = !DILocation(line: 20, column: 5, scope: !103) +!108 = !DILocation(line: 21, column: 5, scope: !103) +!109 = !DILocation(line: 22, column: 5, scope: !103) +!110 = !DILocation(line: 23, column: 5, scope: !103) +!111 = !DILocation(line: 24, column: 5, scope: !103) +!112 = !DILocation(line: 25, column: 5, scope: !103) + +!200 = distinct !DISubprogram(name: "g_dbg", scope: !2, file: !2, line: 3, type: !201, scopeLine: 3, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !4) +!201 = !DISubroutineType(types: !3) +!202 = !DILocation(line: 6, column: 3, scope: !200) +!203 = !DILocation(line: 7, column: 3, scope: !200) +!204 = !DILocation(line: 8, column: 3, scope: !200) +!205 = !DILocation(line: 9, column: 3, scope: !200) +!206 = !DILocation(line: 10, column: 3, scope: !200) +!207 = !DILocation(line: 11, column: 3, scope: !200) +!208 = !DILocation(line: 12, column: 3, scope: !200) +!209 = !DILocation(line: 13, column: 
3, scope: !200) +!210 = !DILocation(line: 14, column: 3, scope: !200) +!211 = !DILocation(line: 15, column: 3, scope: !200) diff --git a/llvm/test/Analysis/KernelInfo/enable-kernel-info/Inputs/test.ll b/llvm/test/Analysis/KernelInfo/enable-kernel-info/Inputs/test.ll new file mode 100644 index 0000000000000..461544e44d538 --- /dev/null +++ b/llvm/test/Analysis/KernelInfo/enable-kernel-info/Inputs/test.ll @@ -0,0 +1,22 @@ +; CHECK: remark: test.c:10:0: in artificial function 'test', omp_target_num_teams = 100 +; NONE-NOT: remark: +define void @test() #0 !dbg !5 { +entry: + ret void +} + +attributes #0 = { + "omp_target_num_teams"="100" +} + +!llvm.module.flags = !{!0} +!llvm.dbg.cu = !{!1} +!nvvm.annotations = !{!6} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !DICompileUnit(language: DW_LANG_C11, file: !2, producer: "clang version 19.0.0git", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) +!2 = !DIFile(filename: "test.c", directory: "/tmp") +!3 = !{} +!4 = !DISubroutineType(types: !3) +!5 = distinct !DISubprogram(name: "test", scope: !2, file: !2, line: 10, type: !4, scopeLine: 10, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !3) +!6 = distinct !{ptr null, !"kernel", i32 1} diff --git a/llvm/test/Analysis/KernelInfo/enable-kernel-info/amdgpu.test b/llvm/test/Analysis/KernelInfo/enable-kernel-info/amdgpu.test new file mode 100644 index 0000000000000..e969eabfe7cd8 --- /dev/null +++ b/llvm/test/Analysis/KernelInfo/enable-kernel-info/amdgpu.test @@ -0,0 +1,18 @@ +; Check when kernel-info is enabled in the AMD GPU target backend. + +; REQUIRES: amdgpu-registered-target + +; DEFINE: %{opt} = opt -disable-output %S/Inputs/test.ll \ +; DEFINE: -mtriple="amdgcn-amd-amdhsa" 2>&1 +; DEFINE: %{fcheck-on} = FileCheck -match-full-lines %S/Inputs/test.ll +; DEFINE: %{fcheck-off} = FileCheck -allow-empty -check-prefixes=NONE \ +; DEFINE: %S/Inputs/test.ll + +; By default, kernel-info is in the LTO pipeline. To see output, the LTO +; pipeline must run, -no-kernel-info-end-lto must not be specified, and remarks +; must be enabled. +; RUN: %{opt} -passes='lto' -pass-remarks=kernel-info | %{fcheck-on} +; RUN: %{opt} -passes='default' -pass-remarks=kernel-info | %{fcheck-off} +; RUN: %{opt} -passes='lto' -pass-remarks=kernel-info \ +; RUN: -no-kernel-info-end-lto | %{fcheck-off} +; RUN: %{opt} -passes='lto' | %{fcheck-off} diff --git a/llvm/test/Analysis/KernelInfo/enable-kernel-info/nvptx.test b/llvm/test/Analysis/KernelInfo/enable-kernel-info/nvptx.test new file mode 100644 index 0000000000000..65249b4d92e34 --- /dev/null +++ b/llvm/test/Analysis/KernelInfo/enable-kernel-info/nvptx.test @@ -0,0 +1,18 @@ +; Check when kernel-info is enabled in the NVPTX target backend. + +; REQUIRES: nvptx-registered-target + +; DEFINE: %{opt} = opt -disable-output %S/Inputs/test.ll \ +; DEFINE: -mtriple="nvptx64-nvidia-cuda" 2>&1 +; DEFINE: %{fcheck-on} = FileCheck -match-full-lines %S/Inputs/test.ll +; DEFINE: %{fcheck-off} = FileCheck -allow-empty -check-prefixes=NONE \ +; DEFINE: %S/Inputs/test.ll + +; By default, kernel-info is in the LTO pipeline. To see output, the LTO +; pipeline must run, -no-kernel-info-end-lto must not be specified, and remarks +; must be enabled. 
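+; The RUN lines below exercise those conditions in turn: the LTO pipeline with
+; remarks enabled (output expected), the default pipeline (no output), the LTO
+; pipeline with -no-kernel-info-end-lto (no output), and the LTO pipeline
+; without remarks enabled (no output).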
+; RUN: %{opt} -passes='lto' -pass-remarks=kernel-info | %{fcheck-on} +; RUN: %{opt} -passes='default' -pass-remarks=kernel-info | %{fcheck-off} +; RUN: %{opt} -passes='lto' -pass-remarks=kernel-info \ +; RUN: -no-kernel-info-end-lto | %{fcheck-off} +; RUN: %{opt} -passes='lto' | %{fcheck-off} diff --git a/llvm/test/Analysis/KernelInfo/flat-addrspace/Inputs/test.ll b/llvm/test/Analysis/KernelInfo/flat-addrspace/Inputs/test.ll new file mode 100644 index 0000000000000..b54c3a18f3e70 --- /dev/null +++ b/llvm/test/Analysis/KernelInfo/flat-addrspace/Inputs/test.ll @@ -0,0 +1,143 @@ +define void @f() !dbg !3 { +entry: + ; load: check remarks for both unnamed and named values. + ; CHECK: remark: test.c:3:11: in function 'f', 'load' instruction ('%0') accesses memory in flat address space + %0 = load i32, ptr null, align 4, !dbg !6 + ; CHECK: remark: test.c:3:11: in function 'f', 'load' instruction ('%load') accesses memory in flat address space + %load = load i32, ptr null, align 4, !dbg !6 + ; CHECK: remark: test.c:3:11: in function 'f', 'load' instruction ('%load0') accesses memory in flat address space + %load0 = load i32, ptr addrspace(0) null, align 4, !dbg !6 + %load1 = load i32, ptr addrspace(1) null, align 4, !dbg !6 + %load2 = load i32, ptr addrspace(2) null, align 4, !dbg !6 + + ; store + ; CHECK: remark: test.c:4:6: in function 'f', 'store' instruction accesses memory in flat address space + store i32 0, ptr null, align 4, !dbg !7 + ; CHECK: remark: test.c:4:6: in function 'f', 'store' instruction accesses memory in flat address space + store i32 0, ptr addrspace(0) null, align 4, !dbg !7 + store i32 0, ptr addrspace(1) null, align 4, !dbg !7 + store i32 0, ptr addrspace(8) null, align 4, !dbg !7 + + ; atomicrmw + ; CHECK: remark: test.c:5:1: in function 'f', 'atomicrmw' instruction ('%[[#]]') accesses memory in flat address space + atomicrmw xchg ptr null, i32 10 seq_cst, !dbg !8 + ; CHECK: remark: test.c:5:1: in function 'f', 'atomicrmw' instruction ('%[[#]]') accesses memory in flat address space + atomicrmw add ptr addrspace(0) null, i32 10 seq_cst, !dbg !8 + atomicrmw xchg ptr addrspace(1) null, i32 10 seq_cst, !dbg !8 + atomicrmw add ptr addrspace(37) null, i32 10 seq_cst, !dbg !8 + + ; cmpxchg + ; CHECK: remark: test.c:6:2: in function 'f', 'cmpxchg' instruction ('%[[#]]') accesses memory in flat address space + cmpxchg ptr null, i32 0, i32 1 acq_rel monotonic, !dbg !9 + ; CHECK: remark: test.c:6:2: in function 'f', 'cmpxchg' instruction ('%[[#]]') accesses memory in flat address space + cmpxchg ptr addrspace(0) null, i32 0, i32 1 acq_rel monotonic, !dbg !9 + cmpxchg ptr addrspace(1) null, i32 0, i32 1 acq_rel monotonic, !dbg !9 + cmpxchg ptr addrspace(934) null, i32 0, i32 1 acq_rel monotonic, !dbg !9 + + ; llvm.memcpy + ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.p0.p1.i64' call accesses memory in flat address space + call void @llvm.memcpy.p0.p1.i64(ptr align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !10 + ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.p0.p1.i64' call accesses memory in flat address space + call void @llvm.memcpy.p0.p1.i64(ptr addrspace(0) align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !10 + call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !10 + call void @llvm.memcpy.p3.p1.i64(ptr addrspace(3) align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !10 + ; CHECK: remark: test.c:7:3: in function 'f', 
'llvm.memcpy.p1.p0.i64' call accesses memory in flat address space + call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) align 4 null, ptr align 4 null, i64 10, i1 false), !dbg !10 + ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.p1.p0.i64' call accesses memory in flat address space + call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) align 4 null, ptr addrspace(0) align 4 null, i64 10, i1 false), !dbg !10 + call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !10 + call void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) align 4 null, ptr addrspace(4) align 4 null, i64 10, i1 false), !dbg !10 + ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.p0.p0.i64' call accesses memory in flat address space + call void @llvm.memcpy.p0.p0.i64(ptr align 4 null, ptr align 4 null, i64 10, i1 false), !dbg !10 + ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.p0.p0.i64' call accesses memory in flat address space + call void @llvm.memcpy.p0.p0.i64(ptr addrspace(0) align 4 null, ptr addrspace(0) align 4 null, i64 10, i1 false), !dbg !10 + + ; llvm.memcpy.inline + ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.inline.p0.p0.i64' call accesses memory in flat address space + call void @llvm.memcpy.inline.p0.p0.i64(ptr addrspace(0) align 4 null, ptr addrspace(0) align 4 null, i64 10, i1 false), !dbg !10 + ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.inline.p0.p1.i64' call accesses memory in flat address space + call void @llvm.memcpy.inline.p0.p1.i64(ptr addrspace(0) align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !10 + ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.inline.p1.p0.i64' call accesses memory in flat address space + call void @llvm.memcpy.inline.p1.p0.i64(ptr addrspace(1) align 4 null, ptr addrspace(0) align 4 null, i64 10, i1 false), !dbg !10 + call void @llvm.memcpy.inline.p1.p1.i64(ptr addrspace(1) align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !10 + + ; llvm.memcpy.element.unordered.atomic + ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.element.unordered.atomic.p0.p0.i64' call accesses memory in flat address space + call void @llvm.memcpy.element.unordered.atomic.p0.p0.i64(ptr addrspace(0) align 4 null, ptr addrspace(0) align 4 null, i64 10, i32 4), !dbg !10 + ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.element.unordered.atomic.p0.p1.i64' call accesses memory in flat address space + call void @llvm.memcpy.element.unordered.atomic.p0.p1.i64(ptr addrspace(0) align 4 null, ptr addrspace(1) align 4 null, i64 10, i32 4), !dbg !10 + ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.element.unordered.atomic.p1.p0.i64' call accesses memory in flat address space + call void @llvm.memcpy.element.unordered.atomic.p1.p0.i64(ptr addrspace(1) align 4 null, ptr addrspace(0) align 4 null, i64 10, i32 4), !dbg !10 + call void @llvm.memcpy.element.unordered.atomic.p1.p1.i64(ptr addrspace(1) align 4 null, ptr addrspace(1) align 4 null, i64 10, i32 4), !dbg !10 + + ; llvm.memmove + ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.p0.p1.i64' call accesses memory in flat address space + call void @llvm.memmove.p0.p1.i64(ptr align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !11 + ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.p0.p1.i64' call accesses memory in flat address space + call void @llvm.memmove.p0.p1.i64(ptr addrspace(0) align 4 null, ptr addrspace(1) align 4 null, 
i64 10, i1 false), !dbg !11 + call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !11 + call void @llvm.memmove.p3.p1.i64(ptr addrspace(3) align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !11 + ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.p1.p0.i64' call accesses memory in flat address space + call void @llvm.memmove.p1.p0.i64(ptr addrspace(1) align 4 null, ptr align 4 null, i64 10, i1 false), !dbg !11 + ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.p1.p0.i64' call accesses memory in flat address space + call void @llvm.memmove.p1.p0.i64(ptr addrspace(1) align 4 null, ptr addrspace(0) align 4 null, i64 10, i1 false), !dbg !11 + call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !11 + call void @llvm.memmove.p1.p4.i64(ptr addrspace(1) align 4 null, ptr addrspace(4) align 4 null, i64 10, i1 false), !dbg !11 + ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.p0.p0.i64' call accesses memory in flat address space + call void @llvm.memmove.p0.p0.i64(ptr align 4 null, ptr align 4 null, i64 10, i1 false), !dbg !11 + ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.p0.p0.i64' call accesses memory in flat address space + call void @llvm.memmove.p0.p0.i64(ptr addrspace(0) align 4 null, ptr addrspace(0) align 4 null, i64 10, i1 false), !dbg !11 + + ; llvm.memmove.element.unordered.atomic + ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.element.unordered.atomic.p0.p0.i64' call accesses memory in flat address space + call void @llvm.memmove.element.unordered.atomic.p0.p0.i64(ptr addrspace(0) align 4 null, ptr addrspace(0) align 4 null, i64 10, i32 4), !dbg !11 + ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.element.unordered.atomic.p0.p1.i64' call accesses memory in flat address space + call void @llvm.memmove.element.unordered.atomic.p0.p1.i64(ptr addrspace(0) align 4 null, ptr addrspace(1) align 4 null, i64 10, i32 4), !dbg !11 + ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.element.unordered.atomic.p1.p0.i64' call accesses memory in flat address space + call void @llvm.memmove.element.unordered.atomic.p1.p0.i64(ptr addrspace(1) align 4 null, ptr addrspace(0) align 4 null, i64 10, i32 4), !dbg !11 + call void @llvm.memmove.element.unordered.atomic.p1.p1.i64(ptr addrspace(1) align 4 null, ptr addrspace(1) align 4 null, i64 10, i32 4), !dbg !11 + + ; llvm.memset + ; CHECK: remark: test.c:9:5: in function 'f', 'llvm.memset.p0.i64' call accesses memory in flat address space + call void @llvm.memset.p0.i64(ptr align 4 null, i8 0, i64 10, i1 false), !dbg !12 + ; CHECK: remark: test.c:9:5: in function 'f', 'llvm.memset.p0.i64' call accesses memory in flat address space + call void @llvm.memset.p0.i64(ptr addrspace(0) align 4 null, i8 0, i64 10, i1 false), !dbg !12 + call void @llvm.memset.p1.i64(ptr addrspace(1) align 4 null, i8 0, i64 10, i1 false), !dbg !12 + call void @llvm.memset.p3.i64(ptr addrspace(3) align 4 null, i8 0, i64 10, i1 false), !dbg !12 + + ; llvm.memset.inline + ; CHECK: remark: test.c:9:5: in function 'f', 'llvm.memset.inline.p0.i64' call accesses memory in flat address space + call void @llvm.memset.inline.p0.i64(ptr align 4 null, i8 0, i64 10, i1 false), !dbg !12 + ; CHECK: remark: test.c:9:5: in function 'f', 'llvm.memset.inline.p0.i64' call accesses memory in flat address space + call void @llvm.memset.inline.p0.i64(ptr addrspace(0) align 4 
null, i8 0, i64 10, i1 false), !dbg !12 + call void @llvm.memset.inline.p1.i64(ptr addrspace(1) align 4 null, i8 0, i64 10, i1 false), !dbg !12 + call void @llvm.memset.inline.p3.i64(ptr addrspace(3) align 4 null, i8 0, i64 10, i1 false), !dbg !12 + + ; llvm.memset.element.unordered.atomic + ; CHECK: remark: test.c:9:5: in function 'f', 'llvm.memset.element.unordered.atomic.p0.i64' call accesses memory in flat address space + call void @llvm.memset.element.unordered.atomic.p0.i64(ptr align 4 null, i8 0, i64 10, i32 4), !dbg !12 + ; CHECK: remark: test.c:9:5: in function 'f', 'llvm.memset.element.unordered.atomic.p0.i64' call accesses memory in flat address space + call void @llvm.memset.element.unordered.atomic.p0.i64(ptr addrspace(0) align 4 null, i8 0, i64 10, i32 4), !dbg !12 + call void @llvm.memset.element.unordered.atomic.p1.i64(ptr addrspace(1) align 4 null, i8 0, i64 10, i32 4), !dbg !12 + call void @llvm.memset.element.unordered.atomic.p3.i64(ptr addrspace(3) align 4 null, i8 0, i64 10, i32 4), !dbg !12 + + ret void +} +; CHECK: remark: test.c:2:0: in function 'f', FlatAddrspaceAccesses = 36 + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2} + +!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, producer: "clang version 19.0.0git", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) +!1 = !DIFile(filename: "test.c", directory: "/tmp") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 2, type: !4, scopeLine: 2, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !5) +!4 = !DISubroutineType(types: !5) +!5 = !{} +!6 = !DILocation(line: 3, column: 11, scope: !3) +!7 = !DILocation(line: 4, column: 6, scope: !3) +!8 = !DILocation(line: 5, column: 1, scope: !3) +!9 = !DILocation(line: 6, column: 2, scope: !3) +!10 = !DILocation(line: 7, column: 3, scope: !3) +!11 = !DILocation(line: 8, column: 4, scope: !3) +!12 = !DILocation(line: 9, column: 5, scope: !3) diff --git a/llvm/test/Analysis/KernelInfo/flat-addrspace/amdgpu.test b/llvm/test/Analysis/KernelInfo/flat-addrspace/amdgpu.test new file mode 100644 index 0000000000000..7447dcf51cc89 --- /dev/null +++ b/llvm/test/Analysis/KernelInfo/flat-addrspace/amdgpu.test @@ -0,0 +1,12 @@ +; Check info on flat address space memory accesses when the target is amdgpu. +; +; The target matters because kernel-info calls +; TargetTransformInfo::getFlatAddressSpace to select the flat address space. + +; REQUIRES: amdgpu-registered-target + +; RUN: opt -pass-remarks=kernel-info -passes=kernel-info \ +; RUN: -mtriple="amdgcn-amd-amdhsa" \ +; RUN: -disable-output %S/Inputs/test.ll 2>&1 | \ +; RUN: FileCheck -match-full-lines -implicit-check-not='flat address space' \ +; RUN: %S/Inputs/test.ll diff --git a/llvm/test/Analysis/KernelInfo/flat-addrspace/nvptx.test b/llvm/test/Analysis/KernelInfo/flat-addrspace/nvptx.test new file mode 100644 index 0000000000000..02321c19e022d --- /dev/null +++ b/llvm/test/Analysis/KernelInfo/flat-addrspace/nvptx.test @@ -0,0 +1,12 @@ +; Check info on flat address space memory accesses when the target is nvptx. +; +; The target matters because kernel-info calls +; TargetTransformInfo::getFlatAddressSpace to select the flat address space. 
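+; For nvptx, the flat (generic) address space is addrspace(0), so only the
+; unqualified ptr and addrspace(0) accesses in Inputs/test.ll should be
+; reported; accesses to other address spaces should not be.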
+ +; REQUIRES: nvptx-registered-target + +; RUN: opt -pass-remarks=kernel-info -passes=kernel-info \ +; RUN: -mtriple="nvptx64-nvidia-cuda" \ +; RUN: -disable-output %S/Inputs/test.ll 2>&1 | \ +; RUN: FileCheck -match-full-lines -implicit-check-not='flat address space' \ +; RUN: %S/Inputs/test.ll diff --git a/llvm/test/Analysis/KernelInfo/launch-bounds/amdgpu.ll b/llvm/test/Analysis/KernelInfo/launch-bounds/amdgpu.ll new file mode 100644 index 0000000000000..7fbdb923d8800 --- /dev/null +++ b/llvm/test/Analysis/KernelInfo/launch-bounds/amdgpu.ll @@ -0,0 +1,83 @@ +; Check info on launch bounds for AMD GPU. + +; REQUIRES: amdgpu-registered-target + +; RUN: opt -pass-remarks=kernel-info -passes=kernel-info \ +; RUN: -disable-output %s 2>&1 | \ +; RUN: FileCheck -match-full-lines %s + +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9" +target triple = "amdgcn-amd-amdhsa" + +; CHECK: remark: test.c:10:0: in artificial function 'all', omp_target_num_teams = 100 +; CHECK: remark: test.c:10:0: in artificial function 'all', omp_target_thread_limit = 101 +; CHECK: remark: test.c:10:0: in artificial function 'all', amdgpu-max-num-workgroups[0] = 200 +; CHECK: remark: test.c:10:0: in artificial function 'all', amdgpu-max-num-workgroups[1] = 201 +; CHECK: remark: test.c:10:0: in artificial function 'all', amdgpu-max-num-workgroups[2] = 202 +; CHECK: remark: test.c:10:0: in artificial function 'all', amdgpu-flat-work-group-size[0] = 210 +; CHECK: remark: test.c:10:0: in artificial function 'all', amdgpu-flat-work-group-size[1] = 211 +; CHECK: remark: test.c:10:0: in artificial function 'all', amdgpu-waves-per-eu[0] = 2 +; CHECK: remark: test.c:10:0: in artificial function 'all', amdgpu-waves-per-eu[1] = 9 +define void @all() #0 !dbg !5 { +entry: + ret void +} + +; CHECK-NOT: remark: test.c:11:0: in function 'none', omp_target_num_teams = {{.*}} +; CHECK-NOT: remark: test.c:11:0: in function 'none', omp_target_thread_limit = {{.*}} +; CHECK: remark: test.c:11:0: in function 'none', amdgpu-max-num-workgroups[0] = 4294967295 +; CHECK: remark: test.c:11:0: in function 'none', amdgpu-max-num-workgroups[1] = 4294967295 +; CHECK: remark: test.c:11:0: in function 'none', amdgpu-max-num-workgroups[2] = 4294967295 +; CHECK: remark: test.c:11:0: in function 'none', amdgpu-flat-work-group-size[0] = 1 +; CHECK: remark: test.c:11:0: in function 'none', amdgpu-flat-work-group-size[1] = 1024 +; CHECK: remark: test.c:11:0: in function 'none', amdgpu-waves-per-eu[0] = 4 +; CHECK: remark: test.c:11:0: in function 'none', amdgpu-waves-per-eu[1] = 10 +define void @none() !dbg !6 { +entry: + ret void +} + +; CHECK: remark: test.c:12:0: in function 'bogus', omp_target_num_teams = 987654321 +; CHECK: remark: test.c:12:0: in function 'bogus', omp_target_thread_limit = 987654321 +; CHECK: remark: test.c:12:0: in function 'bogus', amdgpu-max-num-workgroups[0] = 987654321 +; CHECK: remark: test.c:12:0: in function 'bogus', amdgpu-max-num-workgroups[1] = 987654321 +; CHECK: remark: test.c:12:0: in function 'bogus', amdgpu-max-num-workgroups[2] = 987654321 +; CHECK: remark: test.c:12:0: in function 'bogus', amdgpu-flat-work-group-size[0] = 1 +; CHECK: remark: test.c:12:0: in function 'bogus', amdgpu-flat-work-group-size[1] = 1024 +; CHECK: remark: test.c:12:0: in function 'bogus', amdgpu-waves-per-eu[0] = 4 +; CHECK: remark: 
test.c:12:0: in function 'bogus', amdgpu-waves-per-eu[1] = 10 +define void @bogus() #1 !dbg !7 { +entry: + ret void +} + +attributes #0 = { + "omp_target_num_teams"="100" + "omp_target_thread_limit"="101" + "amdgpu-max-num-workgroups"="200,201,202" + "amdgpu-flat-work-group-size"="210,211" + "amdgpu-waves-per-eu"="2,9" +} + +; We choose values that are small enough to parse successfully but that are +; impossibly large. For values that are validated, we check that they are +; overridden with realistic values. +attributes #1 = { + "omp_target_num_teams"="987654321" + "omp_target_thread_limit"="987654321" + "amdgpu-max-num-workgroups"="987654321,987654321,987654321" + "amdgpu-flat-work-group-size"="987654321,987654321" + "amdgpu-waves-per-eu"="987654321,987654321" +} + +!llvm.module.flags = !{!0} +!llvm.dbg.cu = !{!1} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !DICompileUnit(language: DW_LANG_C11, file: !2, producer: "clang version 19.0.0git", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) +!2 = !DIFile(filename: "test.c", directory: "/tmp") +!3 = !{} +!4 = !DISubroutineType(types: !3) +!5 = distinct !DISubprogram(name: "all", scope: !2, file: !2, line: 10, type: !4, scopeLine: 10, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !3) +!6 = distinct !DISubprogram(name: "none", scope: !2, file: !2, line: 11, type: !4, scopeLine: 11, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !3) +!7 = distinct !DISubprogram(name: "bogus", scope: !2, file: !2, line: 12, type: !4, scopeLine: 12, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !3) diff --git a/llvm/test/Analysis/KernelInfo/launch-bounds/nvptx.ll b/llvm/test/Analysis/KernelInfo/launch-bounds/nvptx.ll new file mode 100644 index 0000000000000..7a055c7152ec8 --- /dev/null +++ b/llvm/test/Analysis/KernelInfo/launch-bounds/nvptx.ll @@ -0,0 +1,42 @@ +; Check info on launch bounds for NVPTX. 
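+;
+; The remarks below reflect the 'omp_target_num_teams' and
+; 'omp_target_thread_limit' function attributes and the 'maxclusterrank',
+; 'maxntidx', 'maxntidy', and 'maxntidz' entries in !nvvm.annotations.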
+ +; REQUIRES: nvptx-registered-target + +; RUN: opt -pass-remarks=kernel-info -passes=kernel-info \ +; RUN: -disable-output %s 2>&1 | \ +; RUN: FileCheck -match-full-lines %s + +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +; CHECK: remark: test.c:10:0: in artificial function 'test', omp_target_num_teams = 100 +; CHECK: remark: test.c:10:0: in artificial function 'test', omp_target_thread_limit = 101 +; CHECK: remark: test.c:10:0: in artificial function 'test', maxclusterrank = 200 +; CHECK: remark: test.c:10:0: in artificial function 'test', maxntidx = 210 +; CHECK: remark: test.c:10:0: in artificial function 'test', maxntidy = 211 +; CHECK: remark: test.c:10:0: in artificial function 'test', maxntidz = 212 +define void @test() #0 !dbg !5 { +entry: + ret void +} + +attributes #0 = { + "omp_target_num_teams"="100" + "omp_target_thread_limit"="101" +} + +!llvm.module.flags = !{!0} +!llvm.dbg.cu = !{!1} +!nvvm.annotations = !{!6, !7, !8, !9, !10} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !DICompileUnit(language: DW_LANG_C11, file: !2, producer: "clang version 19.0.0git", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) +!2 = !DIFile(filename: "test.c", directory: "/tmp") +!3 = !{} +!4 = !DISubroutineType(types: !3) +!5 = distinct !DISubprogram(name: "test", scope: !2, file: !2, line: 10, type: !4, scopeLine: 10, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !3) +!6 = !{ptr @test, !"maxclusterrank", i32 200} +!7 = !{ptr @test, !"maxntidx", i32 210} +!8 = !{ptr @test, !"maxntidy", i32 211} +!9 = !{ptr @test, !"maxntidz", i32 212} +!10 = distinct !{ptr null, !"kernel", i32 1} diff --git a/llvm/test/Analysis/KernelInfo/linkage.ll b/llvm/test/Analysis/KernelInfo/linkage.ll new file mode 100644 index 0000000000000..8679d366d0cb7 --- /dev/null +++ b/llvm/test/Analysis/KernelInfo/linkage.ll @@ -0,0 +1,68 @@ +; Check info on linkage. 
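+;
+; ExternalNotKernel is expected to be 1 only when a function has external
+; linkage and is not a kernel; weak linkage or a kernel calling convention
+; (ptx_kernel, amdgpu_kernel, spir_kernel) yields 0, as the cases below check.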
+ +; RUN: opt -pass-remarks=kernel-info -passes=kernel-info \ +; RUN: -disable-output %s 2>&1 | \ +; RUN: FileCheck -match-full-lines %s + +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +; CHECK: remark: test.c:13:0: in artificial function 'extNotKer', ExternalNotKernel = 1 +define external void @extNotKer() !dbg !10 { +entry: + ret void +} + +; CHECK: remark: test.c:23:0: in function 'impNotKer', ExternalNotKernel = 1 +define void @impNotKer() !dbg !20 { +entry: + ret void +} + +; CHECK: remark: test.c:33:0: in artificial function 'weakNotKer', ExternalNotKernel = 0 +define weak void @weakNotKer() !dbg !30 { +entry: + ret void +} + +; CHECK: remark: test.c:43:0: in function 'extPtxKer', ExternalNotKernel = 0 +define external ptx_kernel void @extPtxKer() !dbg !40 { +entry: + ret void +} + +; CHECK: remark: test.c:53:0: in artificial function 'extAmdgpuKer', ExternalNotKernel = 0 +define external amdgpu_kernel void @extAmdgpuKer() !dbg !50 { +entry: + ret void +} + +; CHECK: remark: test.c:63:0: in function 'extSpirKer', ExternalNotKernel = 0 +define external spir_kernel void @extSpirKer() !dbg !60 { +entry: + ret void +} + +; CHECK: remark: test.c:73:0: in artificial function 'weakKer', ExternalNotKernel = 0 +define weak ptx_kernel void @weakKer() !dbg !70 { +entry: + ret void +} + +!llvm.module.flags = !{!0} +!llvm.dbg.cu = !{!1} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !DICompileUnit(language: DW_LANG_C11, file: !2, producer: "clang version 19.0.0git", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) +!2 = !DIFile(filename: "test.c", directory: "/tmp") +!3 = !{null} +!4 = !{} +!5 = !DISubroutineType(types: !3) + +!10 = distinct !DISubprogram(name: "extNotKer", scope: !2, file: !2, line: 13, type: !5, scopeLine: 13, flags: DIFlagArtificial, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !4) +!20 = distinct !DISubprogram(name: "impNotKer", scope: !2, file: !2, line: 23, type: !5, scopeLine: 23, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !4) +!30 = distinct !DISubprogram(name: "weakNotKer", scope: !2, file: !2, line: 33, type: !5, scopeLine: 33, flags: DIFlagArtificial, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !4) +!40 = distinct !DISubprogram(name: "extPtxKer", scope: !2, file: !2, line: 43, type: !5, scopeLine: 43, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !4) +!50 = distinct !DISubprogram(name: "extAmdgpuKer", scope: !2, file: !2, line: 53, type: !5, scopeLine: 53, flags: DIFlagArtificial, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !4) +!60 = distinct !DISubprogram(name: "extSpirKer", scope: !2, file: !2, line: 63, type: !5, scopeLine: 63, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !4) +!70 = distinct !DISubprogram(name: "weakKer", scope: !2, file: !2, line: 73, type: !5, scopeLine: 73, flags: DIFlagArtificial, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !4) diff --git a/llvm/test/Analysis/KernelInfo/openmp/README.md b/llvm/test/Analysis/KernelInfo/openmp/README.md new file mode 100644 index 0000000000000..0aeb52f83c5c7 --- /dev/null +++ b/llvm/test/Analysis/KernelInfo/openmp/README.md @@ -0,0 +1,40 @@ +The tests in this directory check that basic KernelInfoPrinter functionality +behaves reasonably for LLVM IR produced by Clang OpenMP codegen. 
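+
+For reference, each test runs the pass with an `opt` command along these lines
+(see each test's `RUN` lines; `$TEST` is the test file):
+
+```
+$ opt -pass-remarks=kernel-info -passes=kernel-info -disable-output $TEST
+```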
+ +So that these tests are straightforward to maintain and faithfully represent +Clang OpenMP codegen, do not tweak or reduce the LLVM IR in them. Other tests +more exhaustively check KernelInfoPrinter features using reduced LLVM IR. + +The LLVM IR in each test file `$TEST` can be regenerated as follows in the case +that Clang OpenMP codegen changes or it becomes desirable to adjust the source +OpenMP program below. First, remove the existing LLVM IR from `$TEST`. Then, +where `$TARGET` (e.g., `nvptx64-nvidia-cuda-sm_70` or `amdgcn-amd-amdhsa-gfx906`) +depends on `$TEST`: + +``` +$ cd /tmp +$ cat test.c +#pragma omp declare target +void f(); +void g() { + int i; + int a[2]; + f(); + g(); +} +#pragma omp end declare target + +void h(int i) { + #pragma omp target map(tofrom:i) + { + int i; + int a[2]; + f(); + g(); + } +} + +$ clang -g -fopenmp --offload-arch=native -save-temps -c test.c +$ llvm-dis test-openmp-$TARGET.bc +$ cat test-openmp-$TARGET.ll >> $TEST +``` diff --git a/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll b/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll new file mode 100644 index 0000000000000..4843408bdda49 --- /dev/null +++ b/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll @@ -0,0 +1,225 @@ +; See ./README.md for how to maintain the LLVM IR in this test. + +; REQUIRES: amdgpu-registered-target + +; RUN: opt -pass-remarks=kernel-info -passes=kernel-info \ +; RUN: -disable-output %s 2>&1 | \ +; RUN: FileCheck -match-full-lines %s + +; CHECK-NOT: remark: +; CHECK: remark: test.c:0:0: in artificial function '[[OFF_FUNC:__omp_offloading_[a-f0-9_]*_h_l12]]_debug__', artificial alloca ('%[[#]]') for 'dyn_ptr' with static size of 8 bytes +; CHECK-NEXT: remark: test.c:14:9: in artificial function '[[OFF_FUNC]]_debug__', alloca ('%[[#]]') for 'i' with static size of 4 bytes +; CHECK-NEXT: remark: test.c:15:9: in artificial function '[[OFF_FUNC]]_debug__', alloca ('%[[#]]') for 'a' with static size of 8 bytes +; CHECK-NEXT: remark: :0:0: in artificial function '[[OFF_FUNC]]_debug__', 'store' instruction accesses memory in flat address space +; CHECK-NEXT: remark: test.c:13:3: in artificial function '[[OFF_FUNC]]_debug__', direct call, callee is '@__kmpc_target_init' +; CHECK-NEXT: remark: test.c:16:5: in artificial function '[[OFF_FUNC]]_debug__', direct call, callee is '@f' +; CHECK-NEXT: remark: test.c:17:5: in artificial function '[[OFF_FUNC]]_debug__', direct call to defined function, callee is 'g' +; CHECK-NEXT: remark: test.c:18:3: in artificial function '[[OFF_FUNC]]_debug__', direct call, callee is '@__kmpc_target_deinit' +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', ExternalNotKernel = 0 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', amdgpu-max-num-workgroups[0] = 4294967295 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', amdgpu-max-num-workgroups[1] = 4294967295 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', amdgpu-max-num-workgroups[2] = 4294967295 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', amdgpu-flat-work-group-size[0] = 1 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', amdgpu-flat-work-group-size[1] = 1024 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', amdgpu-waves-per-eu[0] = 4 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', amdgpu-waves-per-eu[1] = 10 +; CHECK-NEXT: remark: 
test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', Allocas = 3 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AllocasStaticSizeSum = 20 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AllocasDyn = 0 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', DirectCalls = 4 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', IndirectCalls = 0 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', DirectCallsToDefinedFunctions = 1 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', InlineAssemblyCalls = 0 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', Invokes = 0 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', FlatAddrspaceAccesses = 1 + +; CHECK-NEXT: remark: test.c:0:0: in artificial function '[[OFF_FUNC]]', artificial alloca ('%[[#]]') for 'dyn_ptr' with static size of 8 bytes +; CHECK-NEXT: remark: :0:0: in artificial function '[[OFF_FUNC]]', 'store' instruction accesses memory in flat address space +; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', 'load' instruction ('%[[#]]') accesses memory in flat address space +; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', direct call to defined function, callee is artificial '[[OFF_FUNC]]_debug__' +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', ExternalNotKernel = 0 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', omp_target_thread_limit = 256 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', amdgpu-max-num-workgroups[0] = 4294967295 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', amdgpu-max-num-workgroups[1] = 4294967295 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', amdgpu-max-num-workgroups[2] = 4294967295 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', amdgpu-flat-work-group-size[0] = 1 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', amdgpu-flat-work-group-size[1] = 256 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', amdgpu-waves-per-eu[0] = 1 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', amdgpu-waves-per-eu[1] = 10 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', Allocas = 1 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', AllocasStaticSizeSum = 8 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', AllocasDyn = 0 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', DirectCalls = 1 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', IndirectCalls = 0 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', DirectCallsToDefinedFunctions = 1 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', InlineAssemblyCalls = 0 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', Invokes = 0 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', FlatAddrspaceAccesses = 2 + +; CHECK-NEXT: remark: test.c:4:7: in function 'g', alloca ('%[[#]]') for 'i' with static size of 4 bytes +; CHECK-NEXT: remark: test.c:5:7: in function 'g', alloca ('%[[#]]') for 'a' with static size of 8 bytes +; CHECK-NEXT: remark: 
test.c:6:3: in function 'g', direct call, callee is '@f' +; CHECK-NEXT: remark: test.c:7:3: in function 'g', direct call to defined function, callee is 'g' +; CHECK-NEXT: remark: test.c:3:0: in function 'g', ExternalNotKernel = 1 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', amdgpu-max-num-workgroups[0] = 4294967295 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', amdgpu-max-num-workgroups[1] = 4294967295 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', amdgpu-max-num-workgroups[2] = 4294967295 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', amdgpu-flat-work-group-size[0] = 1 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', amdgpu-flat-work-group-size[1] = 1024 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', amdgpu-waves-per-eu[0] = 4 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', amdgpu-waves-per-eu[1] = 10 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', Allocas = 2 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', AllocasStaticSizeSum = 12 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', AllocasDyn = 0 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', DirectCalls = 2 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', IndirectCalls = 0 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', DirectCallsToDefinedFunctions = 1 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', InlineAssemblyCalls = 0 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', Invokes = 0 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', FlatAddrspaceAccesses = 0 +; CHECK-NOT: {{.}} + +; ModuleID = 'test-openmp-amdgcn-amd-amdhsa-gfx906.bc' +source_filename = "test.c" +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9" +target triple = "amdgcn-amd-amdhsa" + +%struct.ident_t = type { i32, i32, i32, i32, ptr } +%struct.DynamicEnvironmentTy = type { i16 } +%struct.KernelEnvironmentTy = type { %struct.ConfigurationEnvironmentTy, ptr, ptr } +%struct.ConfigurationEnvironmentTy = type { i8, i8, i8, i32, i32, i32, i32, i32, i32 } + +@__omp_rtl_debug_kind = weak_odr hidden addrspace(1) constant i32 0 +@__omp_rtl_assume_teams_oversubscription = weak_odr hidden addrspace(1) constant i32 0 +@__omp_rtl_assume_threads_oversubscription = weak_odr hidden addrspace(1) constant i32 0 +@__omp_rtl_assume_no_thread_state = weak_odr hidden addrspace(1) constant i32 0 +@__omp_rtl_assume_no_nested_parallelism = weak_odr hidden addrspace(1) constant i32 0 +@0 = private unnamed_addr constant [57 x i8] c";test.c;__omp_offloading_fd02_727e9_h_l12_debug__;13;3;;\00", align 1 +@1 = private unnamed_addr addrspace(1) constant %struct.ident_t { i32 0, i32 2, i32 0, i32 56, ptr @0 }, align 8 +@__omp_offloading_fd02_727e9_h_l12_dynamic_environment = weak_odr protected addrspace(1) global %struct.DynamicEnvironmentTy zeroinitializer +@__omp_offloading_fd02_727e9_h_l12_kernel_environment = weak_odr protected addrspace(1) constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 1, i8 1, i32 1, i32 256, i32 -1, i32 -1, i32 0, i32 0 }, ptr addrspacecast (ptr addrspace(1) @1 to ptr), ptr addrspacecast (ptr addrspace(1) @__omp_offloading_fd02_727e9_h_l12_dynamic_environment to ptr) } +@__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 500 + +; Function Attrs: convergent noinline norecurse nounwind optnone +define internal void 
@__omp_offloading_fd02_727e9_h_l12_debug__(ptr noalias noundef %0) #0 !dbg !15 { + %2 = alloca ptr, align 8, addrspace(5) + %3 = alloca i32, align 4, addrspace(5) + %4 = alloca [2 x i32], align 4, addrspace(5) + %5 = addrspacecast ptr addrspace(5) %2 to ptr + %6 = addrspacecast ptr addrspace(5) %3 to ptr + %7 = addrspacecast ptr addrspace(5) %4 to ptr + store ptr %0, ptr %5, align 8 + #dbg_declare(ptr addrspace(5) %2, !23, !DIExpression(), !24) + %8 = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @__omp_offloading_fd02_727e9_h_l12_kernel_environment to ptr), ptr %0), !dbg !25 + %9 = icmp eq i32 %8, -1, !dbg !25 + br i1 %9, label %10, label %11, !dbg !25 + +10: ; preds = %1 + #dbg_declare(ptr addrspace(5) %3, !26, !DIExpression(), !29) + #dbg_declare(ptr addrspace(5) %4, !30, !DIExpression(), !34) + call void @f() #4, !dbg !35 + call void @g() #4, !dbg !36 + call void @__kmpc_target_deinit(), !dbg !37 + ret void, !dbg !38 + +11: ; preds = %1 + ret void, !dbg !25 +} + +; Function Attrs: convergent mustprogress noinline norecurse nounwind optnone +define weak_odr protected amdgpu_kernel void @__omp_offloading_fd02_727e9_h_l12(ptr noalias noundef %0) #1 !dbg !39 { + %2 = alloca ptr, align 8, addrspace(5) + %3 = addrspacecast ptr addrspace(5) %2 to ptr + store ptr %0, ptr %3, align 8 + #dbg_declare(ptr addrspace(5) %2, !40, !DIExpression(), !41) + %4 = load ptr, ptr %3, align 8, !dbg !42 + call void @__omp_offloading_fd02_727e9_h_l12_debug__(ptr %4) #5, !dbg !42 + ret void, !dbg !42 +} + +declare i32 @__kmpc_target_init(ptr, ptr) + +; Function Attrs: convergent +declare void @f(...) #2 + +declare void @__kmpc_target_deinit() + +; Function Attrs: convergent noinline nounwind optnone +define hidden void @g() #3 !dbg !43 { + %1 = alloca i32, align 4, addrspace(5) + %2 = alloca [2 x i32], align 4, addrspace(5) + %3 = addrspacecast ptr addrspace(5) %1 to ptr + %4 = addrspacecast ptr addrspace(5) %2 to ptr + #dbg_declare(ptr addrspace(5) %1, !46, !DIExpression(), !47) + #dbg_declare(ptr addrspace(5) %2, !48, !DIExpression(), !49) + call void @f() #4, !dbg !50 + call void @g() #4, !dbg !51 + ret void, !dbg !52 +} + +attributes #0 = { convergent noinline norecurse nounwind optnone "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx906" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" } +attributes #1 = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,256" "frame-pointer"="all" "kernel" "no-trapping-math"="true" "omp_target_thread_limit"="256" "stack-protector-buffer-size"="8" "target-cpu"="gfx906" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" "uniform-work-group-size"="true" } +attributes #2 = { convergent "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx906" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" } +attributes #3 = { convergent noinline nounwind optnone "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx906" 
"target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" } +attributes #4 = { convergent } +attributes #5 = { nounwind } + +!llvm.dbg.cu = !{!0} +!omp_offload.info = !{!2} +!llvm.module.flags = !{!3, !4, !5, !6, !7, !8, !9, !10, !11} +!llvm.ident = !{!12, !13, !13, !13, !13, !13, !13, !13, !13, !13, !13, !13, !13, !13, !13, !13, !13} +!opencl.ocl.version = !{!14, !14, !14, !14, !14, !14, !14, !14, !14, !14, !14, !14, !14, !14, !14, !14} + +!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, producer: "clang version 20.0.0git (/tmp/llvm/clang b9447c03a9ef2eed55b685a33511df86f7f94e89)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) +!1 = !DIFile(filename: "test.c", directory: "/tmp", checksumkind: CSK_MD5, checksum: "27a878d5e894ab6d41bfe96f997f8821") +!2 = !{i32 0, i32 64770, i32 468969, !"h", i32 12, i32 0, i32 0} +!3 = !{i32 1, !"amdhsa_code_object_version", i32 500} +!4 = !{i32 7, !"Dwarf Version", i32 5} +!5 = !{i32 2, !"Debug Info Version", i32 3} +!6 = !{i32 1, !"wchar_size", i32 4} +!7 = !{i32 7, !"openmp", i32 51} +!8 = !{i32 7, !"openmp-device", i32 51} +!9 = !{i32 8, !"PIC Level", i32 2} +!10 = !{i32 7, !"frame-pointer", i32 2} +!11 = !{i32 4, !"amdgpu_hostcall", i32 1} +!12 = !{!"clang version 20.0.0git (/tmp/llvm/clang b9447c03a9ef2eed55b685a33511df86f7f94e89)"} +!13 = !{!"AMD clang version 17.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-6.0.2 24012 af27734ed982b52a9f1be0f035ac91726fc697e4)"} +!14 = !{i32 2, i32 0} +!15 = distinct !DISubprogram(name: "__omp_offloading_fd02_727e9_h_l12_debug__", scope: !16, file: !16, line: 13, type: !17, scopeLine: 13, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !0, retainedNodes: !22) +!16 = !DIFile(filename: "test.c", directory: "/tmp") +!17 = !DISubroutineType(types: !18) +!18 = !{null, !19} +!19 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !20) +!20 = !DIDerivedType(tag: DW_TAG_restrict_type, baseType: !21) +!21 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: null, size: 64) +!22 = !{} +!23 = !DILocalVariable(name: "dyn_ptr", arg: 1, scope: !15, type: !19, flags: DIFlagArtificial) +!24 = !DILocation(line: 0, scope: !15) +!25 = !DILocation(line: 13, column: 3, scope: !15) +!26 = !DILocalVariable(name: "i", scope: !27, file: !16, line: 14, type: !28) +!27 = distinct !DILexicalBlock(scope: !15, file: !16, line: 13, column: 3) +!28 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!29 = !DILocation(line: 14, column: 9, scope: !27) +!30 = !DILocalVariable(name: "a", scope: !27, file: !16, line: 15, type: !31) +!31 = !DICompositeType(tag: DW_TAG_array_type, baseType: !28, size: 64, elements: !32) +!32 = !{!33} +!33 = !DISubrange(count: 2) +!34 = !DILocation(line: 15, column: 9, scope: !27) +!35 = !DILocation(line: 16, column: 5, scope: !27) +!36 = !DILocation(line: 17, column: 5, scope: !27) +!37 = !DILocation(line: 18, column: 3, scope: !27) +!38 = !DILocation(line: 18, column: 3, scope: !15) +!39 = distinct !DISubprogram(name: "__omp_offloading_fd02_727e9_h_l12", scope: !16, file: !16, line: 12, type: !17, scopeLine: 12, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !0, retainedNodes: !22) +!40 = !DILocalVariable(name: "dyn_ptr", arg: 1, scope: !39, type: !19, flags: DIFlagArtificial) +!41 = 
!DILocation(line: 0, scope: !39) +!42 = !DILocation(line: 12, column: 1, scope: !39) +!43 = distinct !DISubprogram(name: "g", scope: !16, file: !16, line: 3, type: !44, scopeLine: 3, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !22) +!44 = !DISubroutineType(types: !45) +!45 = !{null} +!46 = !DILocalVariable(name: "i", scope: !43, file: !16, line: 4, type: !28) +!47 = !DILocation(line: 4, column: 7, scope: !43) +!48 = !DILocalVariable(name: "a", scope: !43, file: !16, line: 5, type: !31) +!49 = !DILocation(line: 5, column: 7, scope: !43) +!50 = !DILocation(line: 6, column: 3, scope: !43) +!51 = !DILocation(line: 7, column: 3, scope: !43) +!52 = !DILocation(line: 8, column: 1, scope: !43) diff --git a/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll b/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll new file mode 100644 index 0000000000000..bd46741b24e8c --- /dev/null +++ b/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll @@ -0,0 +1,813 @@ +; See ./README.md for how to maintain the LLVM IR in this test. + +; REQUIRES: nvptx-registered-target + +; RUN: opt -pass-remarks=kernel-info -passes=kernel-info \ +; RUN: -disable-output %s 2>&1 | \ +; RUN: FileCheck -match-full-lines %s + +; CHECK-NOT: remark: +; CHECK: remark: test.c:0:0: in artificial function '[[OFF_FUNC:__omp_offloading_[a-f0-9_]*_h_l12]]_debug__', artificial alloca ('%[[#]]') for 'dyn_ptr' with static size of 8 bytes +; CHECK-NEXT: remark: test.c:14:9: in artificial function '[[OFF_FUNC]]_debug__', alloca ('%[[#]]') for 'i' with static size of 4 bytes +; CHECK-NEXT: remark: test.c:15:9: in artificial function '[[OFF_FUNC]]_debug__', alloca ('%[[#]]') for 'a' with static size of 8 bytes +; CHECK-NEXT: remark: :0:0: in artificial function '[[OFF_FUNC]]_debug__', 'store' instruction accesses memory in flat address space +; CHECK-NEXT: remark: test.c:13:3: in artificial function '[[OFF_FUNC]]_debug__', direct call to defined function, callee is '@__kmpc_target_init' +; CHECK-NEXT: remark: test.c:16:5: in artificial function '[[OFF_FUNC]]_debug__', direct call, callee is '@f' +; CHECK-NEXT: remark: test.c:17:5: in artificial function '[[OFF_FUNC]]_debug__', direct call to defined function, callee is 'g' +; CHECK-NEXT: remark: test.c:18:3: in artificial function '[[OFF_FUNC]]_debug__', direct call to defined function, callee is '@__kmpc_target_deinit' +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', ExternalNotKernel = 0 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', Allocas = 3 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AllocasStaticSizeSum = 20 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AllocasDyn = 0 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', DirectCalls = 4 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', IndirectCalls = 0 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', DirectCallsToDefinedFunctions = 3 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', InlineAssemblyCalls = 0 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', Invokes = 0 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', FlatAddrspaceAccesses = 1 + +; CHECK-NEXT: remark: test.c:0:0: in artificial function '[[OFF_FUNC]]', artificial alloca ('%[[#]]') for 'dyn_ptr' with static size of 8 bytes +; CHECK-NEXT: remark: 
:0:0: in artificial function '[[OFF_FUNC]]', 'store' instruction accesses memory in flat address space +; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', 'load' instruction ('%[[#]]') accesses memory in flat address space +; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', direct call to defined function, callee is artificial '[[OFF_FUNC]]_debug__' +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', ExternalNotKernel = 0 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', omp_target_thread_limit = 128 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', maxntidx = 128 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', Allocas = 1 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', AllocasStaticSizeSum = 8 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', AllocasDyn = 0 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', DirectCalls = 1 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', IndirectCalls = 0 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', DirectCallsToDefinedFunctions = 1 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', InlineAssemblyCalls = 0 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', Invokes = 0 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', FlatAddrspaceAccesses = 2 + +; CHECK-NEXT: remark: test.c:4:7: in function 'g', alloca ('%[[#]]') for 'i' with static size of 4 bytes +; CHECK-NEXT: remark: test.c:5:7: in function 'g', alloca ('%[[#]]') for 'a' with static size of 8 bytes +; CHECK-NEXT: remark: test.c:6:3: in function 'g', direct call, callee is '@f' +; CHECK-NEXT: remark: test.c:7:3: in function 'g', direct call to defined function, callee is 'g' +; CHECK-NEXT: remark: test.c:3:0: in function 'g', ExternalNotKernel = 1 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', Allocas = 2 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', AllocasStaticSizeSum = 12 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', AllocasDyn = 0 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', DirectCalls = 2 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', IndirectCalls = 0 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', DirectCallsToDefinedFunctions = 1 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', InlineAssemblyCalls = 0 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', Invokes = 0 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', FlatAddrspaceAccesses = 0 +; CHECK-NOT: remark: {{.*: in function 'g',.*}} + +; A lot of internal functions (e.g., __kmpc_target_init) come next, but we don't +; want to maintain a list of their allocas, calls, etc. in this test. 
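+; (The amdgpu variant of this test can end with a blanket CHECK-NOT instead
+; because its module defines no functions beyond 'g'.)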
+ +; ModuleID = 'test-openmp-nvptx64-nvidia-cuda-sm_70.bc' +source_filename = "test.c" +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +%struct.ident_t = type { i32, i32, i32, i32, ptr } +%struct.DynamicEnvironmentTy = type { i16 } +%struct.KernelEnvironmentTy = type { %struct.ConfigurationEnvironmentTy, ptr, ptr } +%struct.ConfigurationEnvironmentTy = type { i8, i8, i8, i32, i32, i32, i32, i32, i32 } +%struct.DeviceMemoryPoolTy = type { ptr, i64 } +%struct.DeviceMemoryPoolTrackingTy = type { i64, i64, i64, i64 } +%struct.DeviceEnvironmentTy = type { i32, i32, i32, i32, i64, i64, i64, i64 } +%"struct.rpc::Client" = type { %"struct.rpc::Process" } +%"struct.rpc::Process" = type { i32, ptr, ptr, ptr, ptr, [128 x i32] } +%"struct.(anonymous namespace)::SharedMemorySmartStackTy" = type { [512 x i8], [1024 x i8] } +%"struct.ompx::state::TeamStateTy" = type { %"struct.ompx::state::ICVStateTy", i32, i32, ptr } +%"struct.ompx::state::ICVStateTy" = type { i32, i32, i32, i32, i32, i32, i32 } + +@__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 0 +@__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0 +@0 = private unnamed_addr constant [58 x i8] c";test.c;__omp_offloading_fd02_1116d6_h_l12_debug__;13;3;;\00", align 1 +@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 57, ptr @0 }, align 8 +@__omp_offloading_fd02_1116d6_h_l12_dynamic_environment = weak_odr protected global %struct.DynamicEnvironmentTy zeroinitializer +@__omp_offloading_fd02_1116d6_h_l12_kernel_environment = weak_odr protected constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 1, i8 1, i32 1, i32 128, i32 -1, i32 -1, i32 0, i32 0 }, ptr @1, ptr @__omp_offloading_fd02_1116d6_h_l12_dynamic_environment } +@llvm.used = appending global [4 x ptr] [ptr @__llvm_rpc_client, ptr addrspacecast (ptr addrspace(4) @__omp_rtl_device_environment to ptr), ptr @__omp_rtl_device_memory_pool, ptr @__omp_rtl_device_memory_pool_tracker], section "llvm.metadata" +@__omp_rtl_device_memory_pool = weak protected global %struct.DeviceMemoryPoolTy zeroinitializer, align 8 +@__omp_rtl_device_memory_pool_tracker = weak protected global %struct.DeviceMemoryPoolTrackingTy zeroinitializer, align 8 +@__omp_rtl_debug_kind = weak_odr hidden constant i32 0 +@__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 0 +@__omp_rtl_assume_no_nested_parallelism = weak_odr hidden constant i32 0 +@__omp_rtl_device_environment = weak protected addrspace(4) global %struct.DeviceEnvironmentTy undef, align 8 +@.str = private unnamed_addr constant [40 x i8] c"%s:%u: %s: Assertion %s (`%s`) failed.\0A\00", align 1 +@.str1 = private unnamed_addr constant [35 x i8] c"%s:%u: %s: Assertion `%s` failed.\0A\00", align 1 +@.str15 = private unnamed_addr constant [43 x i8] c"/tmp/llvm/offload/DeviceRTL/src/Kernel.cpp\00", align 1 +@__PRETTY_FUNCTION__._ZL19genericStateMachineP7IdentTy = private unnamed_addr constant [36 x i8] c"void genericStateMachine(IdentTy *)\00", align 1 +@.str2 = private unnamed_addr constant [18 x i8] c"WorkFn == nullptr\00", align 1 +@__PRETTY_FUNCTION__.__kmpc_target_deinit = private unnamed_addr constant [28 x i8] c"void __kmpc_target_deinit()\00", align 1 +@IsSPMDMode = internal local_unnamed_addr addrspace(3) global i32 undef, align 4 +@__llvm_rpc_client = weak protected global %"struct.rpc::Client" zeroinitializer, align 8 +@.str1027 = private unnamed_addr constant [48 x i8] 
c"/tmp/llvm/offload/DeviceRTL/src/Parallelism.cpp\00", align 1 +@.str12 = private unnamed_addr constant [23 x i8] c"!mapping::isSPMDMode()\00", align 1 +@__PRETTY_FUNCTION__.__kmpc_kernel_end_parallel = private unnamed_addr constant [34 x i8] c"void __kmpc_kernel_end_parallel()\00", align 1 +@_ZL20KernelEnvironmentPtr = internal unnamed_addr addrspace(3) global ptr undef, align 8 +@_ZL26KernelLaunchEnvironmentPtr = internal unnamed_addr addrspace(3) global ptr undef, align 8 +@_ZN12_GLOBAL__N_122SharedMemorySmartStackE = internal addrspace(3) global %"struct.(anonymous namespace)::SharedMemorySmartStackTy" undef, align 16 +@.str444 = private unnamed_addr constant [42 x i8] c"/tmp/llvm/offload/DeviceRTL/src/State.cpp\00", align 1 +@.str747 = private unnamed_addr constant [33 x i8] c"NThreadsVar == Other.NThreadsVar\00", align 1 +@__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_ = private unnamed_addr constant [68 x i8] c"void ompx::state::ICVStateTy::assertEqual(const ICVStateTy &) const\00", align 1 +@.str848 = private unnamed_addr constant [27 x i8] c"LevelVar == Other.LevelVar\00", align 1 +@.str949 = private unnamed_addr constant [39 x i8] c"ActiveLevelVar == Other.ActiveLevelVar\00", align 1 +@.str1050 = private unnamed_addr constant [47 x i8] c"MaxActiveLevelsVar == Other.MaxActiveLevelsVar\00", align 1 +@.str1151 = private unnamed_addr constant [33 x i8] c"RunSchedVar == Other.RunSchedVar\00", align 1 +@.str1252 = private unnamed_addr constant [43 x i8] c"RunSchedChunkVar == Other.RunSchedChunkVar\00", align 1 +@.str13 = private unnamed_addr constant [43 x i8] c"ParallelTeamSize == Other.ParallelTeamSize\00", align 1 +@__PRETTY_FUNCTION__._ZNK4ompx5state11TeamStateTy11assertEqualERS1_ = private unnamed_addr constant [64 x i8] c"void ompx::state::TeamStateTy::assertEqual(TeamStateTy &) const\00", align 1 +@.str14 = private unnamed_addr constant [39 x i8] c"HasThreadState == Other.HasThreadState\00", align 1 +@.str23 = private unnamed_addr constant [32 x i8] c"mapping::isSPMDMode() == IsSPMD\00", align 1 +@__PRETTY_FUNCTION__._ZN4ompx5state18assumeInitialStateEb = private unnamed_addr constant [43 x i8] c"void ompx::state::assumeInitialState(bool)\00", align 1 +@_ZL9ThreadDST = internal unnamed_addr addrspace(3) global ptr undef, align 8 +@_ZN4ompx5state9TeamStateE = internal local_unnamed_addr addrspace(3) global %"struct.ompx::state::TeamStateTy" undef, align 8 +@_ZN4ompx5state12ThreadStatesE = internal addrspace(3) global ptr undef, align 8 + +; Function Attrs: convergent noinline norecurse nounwind optnone +define internal void @__omp_offloading_fd02_1116d6_h_l12_debug__(ptr noalias noundef %0) #0 !dbg !18 { + %2 = alloca ptr, align 8 + %3 = alloca i32, align 4 + %4 = alloca [2 x i32], align 4 + store ptr %0, ptr %2, align 8 + #dbg_declare(ptr %2, !25, !DIExpression(), !26) + %5 = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_1116d6_h_l12_kernel_environment, ptr %0), !dbg !27 + %6 = icmp eq i32 %5, -1, !dbg !27 + br i1 %6, label %7, label %8, !dbg !27 + +7: ; preds = %1 + #dbg_declare(ptr %3, !28, !DIExpression(), !31) + #dbg_declare(ptr %4, !32, !DIExpression(), !36) + call void @f() #19, !dbg !37 + call void @g() #19, !dbg !38 + call void @__kmpc_target_deinit(), !dbg !39 + ret void, !dbg !40 + +8: ; preds = %1 + ret void, !dbg !27 +} + +; Function Attrs: convergent mustprogress noinline norecurse nounwind optnone +define weak_odr protected ptx_kernel void @__omp_offloading_fd02_1116d6_h_l12(ptr noalias noundef %0) #1 !dbg !41 { + %2 = alloca ptr, 
align 8 + store ptr %0, ptr %2, align 8 + #dbg_declare(ptr %2, !42, !DIExpression(), !43) + %3 = load ptr, ptr %2, align 8, !dbg !44 + call void @__omp_offloading_fd02_1116d6_h_l12_debug__(ptr %3) #20, !dbg !44 + ret void, !dbg !44 +} + +; Function Attrs: convergent +declare void @f(...) #2 + +; Function Attrs: convergent noinline nounwind optnone +define hidden void @g() #3 !dbg !45 { + %1 = alloca i32, align 4 + %2 = alloca [2 x i32], align 4 + #dbg_declare(ptr %1, !48, !DIExpression(), !49) + #dbg_declare(ptr %2, !50, !DIExpression(), !51) + call void @f() #19, !dbg !52 + call void @g() #19, !dbg !53 + ret void, !dbg !54 +} + +; Function Attrs: convergent mustprogress nounwind +define internal noundef range(i32 -1, 1024) i32 @__kmpc_target_init(ptr nofree noundef nonnull align 8 dereferenceable(48) %0, ptr nofree noundef nonnull align 8 dereferenceable(16) %1) #4 { + %3 = alloca ptr, align 8 + %4 = getelementptr inbounds nuw i8, ptr %0, i64 2 + %5 = load i8, ptr %4, align 2, !tbaa !55 + %6 = and i8 %5, 2 + %7 = icmp eq i8 %6, 0 + %8 = load i8, ptr %0, align 8, !tbaa !61 + %9 = icmp ne i8 %8, 0 + br i1 %7, label %21, label %10 + +10: ; preds = %2 + %11 = tail call range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() + %12 = icmp eq i32 %11, 0 + br i1 %12, label %13, label %14 + +13: ; preds = %10 + store i32 1, ptr addrspace(3) @IsSPMDMode, align 4, !tbaa !62 + store i8 0, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN12_GLOBAL__N_122SharedMemorySmartStackE to ptr), i64 512) to ptr addrspace(3)), align 1, !tbaa !63 + tail call void @llvm.memset.p0.i64(ptr noundef nonnull align 8 dereferenceable(48) addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i8 noundef 0, i64 noundef 16, i1 noundef false) + store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 16) to ptr addrspace(3)), align 8, !tbaa !64 + store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 20) to ptr addrspace(3)), align 4, !tbaa !69 + store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 24) to ptr addrspace(3)), align 8, !tbaa !70 + store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 28) to ptr addrspace(3)), align 4, !tbaa !71 + store i32 0, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 32) to ptr addrspace(3)), align 8, !tbaa !72 + store ptr null, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 40) to ptr addrspace(3)), align 8, !tbaa !73 + store ptr null, ptr addrspace(3) @_ZN4ompx5state12ThreadStatesE, align 8, !tbaa !74 + store ptr %0, ptr addrspace(3) @_ZL20KernelEnvironmentPtr, align 8, !tbaa !76 + store ptr %1, ptr addrspace(3) @_ZL26KernelLaunchEnvironmentPtr, align 8, !tbaa !78 + br label %18 + +14: ; preds = %10 + %15 = zext nneg i32 %11 to i64 + %16 = getelementptr inbounds nuw [1024 x i8], ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN12_GLOBAL__N_122SharedMemorySmartStackE to ptr), i64 512), i64 0, i64 %15 
+ %17 = addrspacecast ptr %16 to ptr addrspace(3) + store i8 0, ptr addrspace(3) %17, align 1, !tbaa !63 + br label %18 + +18: ; preds = %14, %13 + br i1 %12, label %19, label %20 + +19: ; preds = %18 + store ptr null, ptr addrspace(3) @_ZL9ThreadDST, align 8, !tbaa !80 + br label %20 + +20: ; preds = %18, %19 + tail call void @_ZN4ompx11synchronize14threadsAlignedENS_6atomic10OrderingTyE(i32 poison) #21 + br label %37 + +21: ; preds = %2 + %22 = tail call range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range !82 + %23 = add nsw i32 %22, -1 + %24 = and i32 %23, -32 + %25 = tail call range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() + %26 = icmp eq i32 %25, %24 + br i1 %26, label %27, label %31 + +27: ; preds = %21 + store i32 0, ptr addrspace(3) @IsSPMDMode, align 4, !tbaa !62 + %28 = zext nneg i32 %25 to i64 + %29 = getelementptr inbounds nuw [1024 x i8], ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN12_GLOBAL__N_122SharedMemorySmartStackE to ptr), i64 512), i64 0, i64 %28 + %30 = addrspacecast ptr %29 to ptr addrspace(3) + store i8 0, ptr addrspace(3) %30, align 1, !tbaa !63 + tail call void @llvm.memset.p0.i64(ptr noundef nonnull align 8 dereferenceable(48) addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i8 noundef 0, i64 noundef 16, i1 noundef false) + store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 16) to ptr addrspace(3)), align 8, !tbaa !64 + store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 20) to ptr addrspace(3)), align 4, !tbaa !69 + store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 24) to ptr addrspace(3)), align 8, !tbaa !70 + store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 28) to ptr addrspace(3)), align 4, !tbaa !71 + store i32 0, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 32) to ptr addrspace(3)), align 8, !tbaa !72 + store ptr null, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 40) to ptr addrspace(3)), align 8, !tbaa !73 + store ptr null, ptr addrspace(3) @_ZN4ompx5state12ThreadStatesE, align 8, !tbaa !74 + store ptr %0, ptr addrspace(3) @_ZL20KernelEnvironmentPtr, align 8, !tbaa !76 + store ptr %1, ptr addrspace(3) @_ZL26KernelLaunchEnvironmentPtr, align 8, !tbaa !78 + br label %35 + +31: ; preds = %21 + %32 = zext nneg i32 %25 to i64 + %33 = getelementptr inbounds nuw [1024 x i8], ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN12_GLOBAL__N_122SharedMemorySmartStackE to ptr), i64 512), i64 0, i64 %32 + %34 = addrspacecast ptr %33 to ptr addrspace(3) + store i8 0, ptr addrspace(3) %34, align 1, !tbaa !63 + br label %35 + +35: ; preds = %31, %27 + br i1 %26, label %36, label %37 + +36: ; preds = %35 + store ptr null, ptr addrspace(3) @_ZL9ThreadDST, align 8, !tbaa !80 + br label %37 + +37: ; preds = %36, %35, %20 + br i1 %7, label %100, label %38 + +38: ; preds = %37 + %39 = load i32, ptr @__omp_rtl_debug_kind, align 4, !tbaa !62 + %40 = load i32, ptr 
addrspace(4) @__omp_rtl_device_environment, align 8, !tbaa !83 + %41 = and i32 %39, 1 + %42 = and i32 %41, %40 + %43 = icmp ne i32 %42, 0 + %44 = load i32, ptr addrspace(3) @_ZN4ompx5state9TeamStateE, align 8, !tbaa !86 + %45 = icmp ne i32 %44, 0 + %46 = select i1 %43, i1 %45, i1 false + br i1 %46, label %47, label %48 + +47: ; preds = %38 + tail call fastcc void @__assert_fail_internal(ptr noundef nonnull dereferenceable(33) @.str747, ptr noundef null, ptr noundef nonnull dereferenceable(66) @.str444, i32 noundef 193, ptr noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #22 + unreachable + +48: ; preds = %38 + %49 = icmp eq i32 %44, 0 + tail call void @llvm.assume(i1 noundef %49) #23 + %50 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 4) to ptr addrspace(3)), align 4, !tbaa !87 + br i1 %43, label %51, label %54 + +51: ; preds = %48 + %52 = icmp eq i32 %50, 0 + br i1 %52, label %54, label %53 + +53: ; preds = %51 + tail call fastcc void @__assert_fail_internal(ptr noundef nonnull dereferenceable(27) @.str848, ptr noundef null, ptr noundef nonnull dereferenceable(66) @.str444, i32 noundef 194, ptr noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #22 + unreachable + +54: ; preds = %51, %48 + %55 = phi i32 [ 0, %51 ], [ %50, %48 ] + %56 = icmp eq i32 %55, 0 + tail call void @llvm.assume(i1 noundef %56) #23 + %57 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 8) to ptr addrspace(3)), align 8, !tbaa !88 + br i1 %43, label %58, label %61 + +58: ; preds = %54 + %59 = icmp eq i32 %57, 0 + br i1 %59, label %61, label %60 + +60: ; preds = %58 + tail call fastcc void @__assert_fail_internal(ptr noundef nonnull dereferenceable(39) @.str949, ptr noundef null, ptr noundef nonnull dereferenceable(66) @.str444, i32 noundef 195, ptr noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #22 + unreachable + +61: ; preds = %58, %54 + %62 = phi i32 [ 0, %58 ], [ %57, %54 ] + %63 = icmp eq i32 %62, 0 + tail call void @llvm.assume(i1 noundef %63) #23 + %64 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 16) to ptr addrspace(3)), align 8, !tbaa !89 + br i1 %43, label %65, label %68 + +65: ; preds = %61 + %66 = icmp eq i32 %64, 1 + br i1 %66, label %68, label %67 + +67: ; preds = %65 + tail call fastcc void @__assert_fail_internal(ptr noundef nonnull dereferenceable(47) @.str1050, ptr noundef null, ptr noundef nonnull dereferenceable(66) @.str444, i32 noundef 196, ptr noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #22 + unreachable + +68: ; preds = %65, %61 + %69 = phi i32 [ 1, %65 ], [ %64, %61 ] + %70 = icmp eq i32 %69, 1 + tail call void @llvm.assume(i1 noundef %70) #23 + %71 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 20) to ptr addrspace(3)), align 4, !tbaa !90 + br i1 %43, label %72, label %93 + +72: ; preds = %68 + %73 = icmp eq i32 %71, 1 + br i1 %73, label %75, label %74 + +74: ; preds = %72 + tail call fastcc void @__assert_fail_internal(ptr noundef nonnull 
dereferenceable(33) @.str1151, ptr noundef null, ptr noundef nonnull dereferenceable(66) @.str444, i32 noundef 197, ptr noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #22
+  unreachable
+
+75: ; preds = %72
+  %76 = icmp eq i32 1, 1
+  tail call void @llvm.assume(i1 noundef %76) #23
+  br i1 %43, label %77, label %95
+
+77: ; preds = %75
+  %78 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 24) to ptr addrspace(3)), align 8, !tbaa !91
+  %79 = icmp eq i32 %78, 1
+  br i1 %79, label %81, label %80
+
+80: ; preds = %77
+  tail call fastcc void @__assert_fail_internal(ptr noundef nonnull dereferenceable(43) @.str1252, ptr noundef null, ptr noundef nonnull dereferenceable(66) @.str444, i32 noundef 198, ptr noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #22
+  unreachable
+
+81: ; preds = %77
+  %82 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 28) to ptr addrspace(3)), align 4, !tbaa !71
+  %83 = icmp eq i32 %82, 1
+  br i1 %83, label %85, label %84
+
+84: ; preds = %81
+  tail call fastcc void @__assert_fail_internal(ptr noundef nonnull dereferenceable(43) @.str13, ptr noundef null, ptr noundef nonnull dereferenceable(66) @.str444, i32 noundef 222, ptr noundef nonnull dereferenceable(64) @__PRETTY_FUNCTION__._ZNK4ompx5state11TeamStateTy11assertEqualERS1_) #22
+  unreachable
+
+85: ; preds = %81
+  %86 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 32) to ptr addrspace(3)), align 8, !tbaa !72
+  %87 = icmp eq i32 %86, 0
+  br i1 %87, label %89, label %88
+
+88: ; preds = %85
+  tail call fastcc void @__assert_fail_internal(ptr noundef nonnull dereferenceable(39) @.str14, ptr noundef null, ptr noundef nonnull dereferenceable(66) @.str444, i32 noundef 223, ptr noundef nonnull dereferenceable(64) @__PRETTY_FUNCTION__._ZNK4ompx5state11TeamStateTy11assertEqualERS1_) #22
+  unreachable
+
+89: ; preds = %85
+  %90 = load i32, ptr addrspace(3) @IsSPMDMode, align 4, !tbaa !62
+  %91 = icmp eq i32 %90, 0
+  br i1 %91, label %92, label %98
+
+92: ; preds = %89
+  tail call fastcc void @__assert_fail_internal(ptr noundef nonnull dereferenceable(32) @.str23, ptr noundef null, ptr noundef nonnull dereferenceable(66) @.str444, i32 noundef 326, ptr noundef nonnull dereferenceable(43) @__PRETTY_FUNCTION__._ZN4ompx5state18assumeInitialStateEb) #22
+  unreachable
+
+93: ; preds = %68
+  %94 = icmp eq i32 %71, 1
+  tail call void @llvm.assume(i1 noundef %94) #23
+  br label %95
+
+95: ; preds = %75, %93
+  %96 = load i32, ptr addrspace(3) @IsSPMDMode, align 4, !tbaa !62
+  %97 = icmp ne i32 %96, 0
+  br label %98
+
+98: ; preds = %89, %95
+  %99 = phi i1 [ %97, %95 ], [ true, %89 ]
+  tail call void @llvm.assume(i1 noundef %99) #23
+  tail call void @_ZN4ompx11synchronize14threadsAlignedENS_6atomic10OrderingTyE(i32 poison) #21
+  br label %130
+
+100: ; preds = %37
+  %101 = tail call range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range !82
+  %102 = add nsw i32 %101, -1
+  %103 = and i32 %102, -32
+  %104 = tail call range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !92
+  %105 = icmp eq i32 %104, %103
+  br i1 %105, label %130, label %106
+
+106: ; preds = %100
+  %107 = add nsw i32 %101, -32
+  %108 = icmp ult i32 %104, %107
+  %109 = select i1 %9, i1 %108, i1 false
+  br i1 %109, label %110, label %130
+
+110: ; preds = %106
+  %111 = load i32, ptr @__omp_rtl_debug_kind, align 4
+  %112 = load i32, ptr addrspace(4) @__omp_rtl_device_environment, align 8
+  %113 = and i32 %111, 1
+  %114 = and i32 %113, %112
+  %115 = icmp ne i32 %114, 0
+  br label %116
+
+116: ; preds = %110, %128
+  call void @llvm.lifetime.start.p0(i64 noundef 8, ptr noundef nonnull align 8 dereferenceable(8) %3) #20
+  tail call void @llvm.nvvm.barrier.sync(i32 noundef 8)
+  %117 = call zeroext i1 @__kmpc_kernel_parallel(ptr noalias nocapture nofree noundef nonnull writeonly align 8 dereferenceable(8) %3) #20
+  %118 = load ptr, ptr %3, align 8, !tbaa !93
+  %119 = icmp eq ptr %118, null
+  br i1 %119, label %129, label %120
+
+120: ; preds = %116
+  br i1 %117, label %121, label %128
+
+121: ; preds = %120
+  %122 = load i32, ptr addrspace(3) @IsSPMDMode, align 4
+  %123 = icmp ne i32 %122, 0
+  %124 = select i1 %115, i1 %123, i1 false
+  br i1 %124, label %125, label %126
+
+125: ; preds = %121
+  tail call fastcc void @__assert_fail_internal(ptr noundef nonnull dereferenceable(23) @.str12, ptr noundef null, ptr noundef nonnull dereferenceable(67) @.str15, i32 noundef 60, ptr noundef nonnull dereferenceable(36) @__PRETTY_FUNCTION__._ZL19genericStateMachineP7IdentTy) #22
+  unreachable
+
+126: ; preds = %121
+  %127 = icmp eq i32 %122, 0
+  tail call void @llvm.assume(i1 noundef %127) #23
+  tail call void %118(i32 noundef 0, i32 noundef %104) #24
+  tail call void @__kmpc_kernel_end_parallel() #24
+  br label %128
+
+128: ; preds = %126, %120
+  tail call void @llvm.nvvm.barrier.sync(i32 noundef 8)
+  call void @llvm.lifetime.end.p0(i64 noundef 8, ptr noundef nonnull %3) #20
+  br label %116, !llvm.loop !94
+
+129: ; preds = %116
+  call void @llvm.lifetime.end.p0(i64 noundef 8, ptr noundef nonnull %3) #20
+  br label %130
+
+130: ; preds = %106, %129, %100, %98
+  %131 = phi i32 [ -1, %98 ], [ -1, %100 ], [ %104, %129 ], [ %104, %106 ]
+  ret i32 %131
+}
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #5
+
+; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: write)
+declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #6
+
+; Function Attrs: convergent mustprogress noinline norecurse nounwind
+define internal void @_ZN4ompx11synchronize14threadsAlignedENS_6atomic10OrderingTyE(i32 %0) local_unnamed_addr #7 {
+  tail call void @llvm.nvvm.barrier0() #25
+  ret void
+}
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #5
+
+; Function Attrs: cold convergent mustprogress noreturn nounwind
+define internal fastcc void @__assert_fail_internal(ptr noundef nonnull dereferenceable(8) %0, ptr noundef %1, ptr noundef nonnull dereferenceable(66) %2, i32 noundef range(i32 60, 905) %3, ptr noundef nonnull dereferenceable(20) %4) unnamed_addr #8 {
+  %6 = icmp eq ptr %1, null
+  br i1 %6, label %9, label %7
+
+7: ; preds = %5
+  %8 = tail call noundef i32 (ptr, ...) @_ZN4ompx6printfEPKcz(ptr noundef nonnull dereferenceable(40) @.str, ptr noundef nonnull dereferenceable(66) %2, i32 noundef %3, ptr noundef nonnull dereferenceable(20) %4, ptr noundef nonnull %1, ptr noundef nonnull dereferenceable(8) %0) #24
+  br label %11
+
+9: ; preds = %5
+  %10 = tail call noundef i32 (ptr, ...) @_ZN4ompx6printfEPKcz(ptr noundef nonnull dereferenceable(35) @.str1, ptr noundef nonnull dereferenceable(66) %2, i32 noundef %3, ptr noundef nonnull dereferenceable(20) %4, ptr noundef nonnull dereferenceable(8) %0) #24
+  br label %11
+
+11: ; preds = %9, %7
+  tail call void @llvm.trap() #26
+  unreachable
+}
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write)
+declare void @llvm.assume(i1 noundef) #9
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite)
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #10
+
+; Function Attrs: convergent nocallback nounwind
+declare void @llvm.nvvm.barrier.sync(i32) #11
+
+; Function Attrs: convergent mustprogress nofree noinline norecurse nosync nounwind willreturn memory(read, argmem: write, inaccessiblemem: none)
+define internal noundef zeroext i1 @__kmpc_kernel_parallel(ptr nocapture nofree noundef nonnull writeonly align 8 dereferenceable(8) initializes((0, 8)) %0) local_unnamed_addr #12 {
+  %2 = load ptr, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 40) to ptr addrspace(3)), align 8, !tbaa !93
+  store ptr %2, ptr %0, align 8, !tbaa !93
+  %3 = icmp eq ptr %2, null
+  br i1 %3, label %15, label %4
+
+4: ; preds = %1
+  %5 = tail call noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #27, !range !92
+  %6 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 28) to ptr addrspace(3)), align 4, !tbaa !62
+  %7 = icmp eq i32 %6, 0
+  %8 = tail call range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range !82
+  %9 = load i32, ptr addrspace(3) @IsSPMDMode, align 4
+  %10 = icmp eq i32 %9, 0
+  %11 = select i1 %10, i32 -32, i32 0
+  %12 = add nsw i32 %11, %8
+  %13 = select i1 %7, i32 %12, i32 %6
+  %14 = icmp ult i32 %5, %13
+  br label %15
+
+15: ; preds = %4, %1
+  %16 = phi i1 [ %14, %4 ], [ false, %1 ]
+  ret i1 %16
+}
+
+; Function Attrs: convergent mustprogress noinline nounwind
+define internal void @__kmpc_kernel_end_parallel() local_unnamed_addr #13 {
+  %1 = load i32, ptr @__omp_rtl_debug_kind, align 4, !tbaa !62
+  %2 = load i32, ptr addrspace(4) @__omp_rtl_device_environment, align 8, !tbaa !83
+  %3 = and i32 %1, 1
+  %4 = and i32 %3, %2
+  %5 = icmp ne i32 %4, 0
+  %6 = load i32, ptr addrspace(3) @IsSPMDMode, align 4
+  %7 = icmp ne i32 %6, 0
+  %8 = select i1 %5, i1 %7, i1 false
+  br i1 %8, label %9, label %10
+
+9: ; preds = %0
+  tail call fastcc void @__assert_fail_internal(ptr noundef nonnull dereferenceable(23) @.str12, ptr noundef null, ptr noundef nonnull dereferenceable(72) @.str1027, i32 noundef 299, ptr noundef nonnull dereferenceable(34) @__PRETTY_FUNCTION__.__kmpc_kernel_end_parallel) #22
+  unreachable
+
+10: ; preds = %0
+  %11 = icmp eq i32 %6, 0
+  tail call void @llvm.assume(i1 noundef %11) #23
+  %12 = load i32, ptr @__omp_rtl_assume_no_thread_state, align 4, !tbaa !62
+  %13 = icmp eq i32 %12, 0
+  %14 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 32) to ptr addrspace(3)), align 8
+  %15 = icmp ne i32 %14, 0
+  %16 = select i1 %13, i1 %15, i1 false
+  br i1 %16, label %17, label %30
+
+17: ; preds = %10
+  %18 = tail call noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #27, !range !92
+  %19 = load ptr, ptr addrspace(3) @_ZN4ompx5state12ThreadStatesE, align 8, !tbaa !74
+  %20 = zext nneg i32 %18 to i64
+  %21 = getelementptr inbounds nuw ptr, ptr %19, i64 %20
+  %22 = load ptr, ptr %21, align 8, !tbaa !96
+  %23 = icmp eq ptr %22, null
+  br i1 %23, label %30, label %24, !prof !98
+
+24: ; preds = %17
+  %25 = getelementptr inbounds nuw i8, ptr %22, i64 32
+  %26 = load ptr, ptr %25, align 8, !tbaa !99
+  tail call void @free(ptr noundef nonnull dereferenceable(40) %22) #28
+  %27 = load ptr, ptr addrspace(3) @_ZN4ompx5state12ThreadStatesE, align 8, !tbaa !74
+  %28 = getelementptr inbounds nuw ptr, ptr %27, i64 %20
+  store ptr %26, ptr %28, align 8, !tbaa !96
+  %29 = load i32, ptr addrspace(3) @IsSPMDMode, align 4
+  br label %30
+
+30: ; preds = %10, %17, %24
+  %31 = phi i32 [ 0, %10 ], [ 0, %17 ], [ %29, %24 ]
+  %32 = icmp ne i32 %31, 0
+  %33 = select i1 %5, i1 %32, i1 false
+  br i1 %33, label %34, label %35
+
+34: ; preds = %30
+  tail call fastcc void @__assert_fail_internal(ptr noundef nonnull dereferenceable(23) @.str12, ptr noundef null, ptr noundef nonnull dereferenceable(72) @.str1027, i32 noundef 302, ptr noundef nonnull dereferenceable(34) @__PRETTY_FUNCTION__.__kmpc_kernel_end_parallel) #22
+  unreachable
+
+35: ; preds = %30
+  %36 = icmp eq i32 %31, 0
+  tail call void @llvm.assume(i1 noundef %36) #23
+  ret void
+}
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite)
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #10
+
+; Function Attrs: convergent mustprogress nounwind willreturn allockind("free") memory(argmem: readwrite, inaccessiblemem: readwrite)
+declare extern_weak void @free(ptr allocptr nocapture noundef) local_unnamed_addr #14
+
+; Function Attrs: convergent mustprogress nounwind
+define internal noundef i32 @_ZN4ompx6printfEPKcz(ptr noundef %0, ...) local_unnamed_addr #15 {
+  %2 = alloca ptr, align 8
+  call void @llvm.lifetime.start.p0(i64 noundef 8, ptr noundef nonnull align 8 %2) #29
+  call void @llvm.va_start.p0(ptr noundef nonnull align 8 %2) #27
+  %3 = load ptr, ptr %2, align 8, !tbaa !101
+  %4 = call i32 @vprintf(ptr noundef %0, ptr noundef %3) #24
+  call void @llvm.lifetime.end.p0(i64 noundef 8, ptr noundef nonnull %2) #20
+  ret i32 %4
+}
+
+; Function Attrs: cold noreturn nounwind memory(inaccessiblemem: write)
+declare void @llvm.trap() #16
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn
+declare void @llvm.va_start.p0(ptr) #17
+
+; Function Attrs: convergent nounwind
+declare i32 @vprintf(ptr noundef, ptr noundef) local_unnamed_addr #18
+
+; Function Attrs: convergent nocallback nounwind
+declare void @llvm.nvvm.barrier0() #11
+
+; Function Attrs: convergent mustprogress nounwind
+define internal void @__kmpc_target_deinit() #4 {
+  %1 = alloca ptr, align 8
+  %2 = load i32, ptr addrspace(3) @IsSPMDMode, align 4, !tbaa !62
+  %3 = icmp eq i32 %2, 0
+  br i1 %3, label %4, label %27
+
+4: ; preds = %0
+  %5 = tail call range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range !82
+  %6 = add nsw i32 %5, -1
+  %7 = and i32 %6, -32
+  %8 = tail call range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !92
+  %9 = icmp eq i32 %8, %7
+  br i1 %9, label %10, label %11
+
+10: ; preds = %4
+  store ptr null, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 40) to ptr addrspace(3)), align 8, !tbaa !93
+  br label %27
+
+11: ; preds = %4
+  %12 = load ptr, ptr addrspace(3) @_ZL20KernelEnvironmentPtr, align 8, !tbaa !76
+  %13 = load i8, ptr %12, align 8, !tbaa !103
+  %14 = icmp eq i8 %13, 0
+  br i1 %14, label %15, label %27
+
+15: ; preds = %11
+  call void @llvm.lifetime.start.p0(i64 noundef 8, ptr noundef nonnull align 8 dereferenceable(8) %1) #29
+  %16 = call zeroext i1 @__kmpc_kernel_parallel(ptr noalias nocapture nofree noundef nonnull writeonly align 8 dereferenceable(8) %1) #20
+  %17 = load i32, ptr @__omp_rtl_debug_kind, align 4, !tbaa !62
+  %18 = load i32, ptr addrspace(4) @__omp_rtl_device_environment, align 8, !tbaa !83
+  %19 = and i32 %17, 1
+  %20 = and i32 %19, %18
+  %21 = icmp eq i32 %20, 0
+  %22 = load ptr, ptr %1, align 8
+  %23 = icmp eq ptr %22, null
+  %24 = select i1 %21, i1 true, i1 %23
+  br i1 %24, label %26, label %25
+
+25: ; preds = %15
+  tail call fastcc void @__assert_fail_internal(ptr noundef nonnull dereferenceable(18) @.str2, ptr noundef null, ptr noundef nonnull dereferenceable(67) @.str15, i32 noundef 152, ptr noundef nonnull dereferenceable(28) @__PRETTY_FUNCTION__.__kmpc_target_deinit) #22
+  unreachable
+
+26: ; preds = %15
+  tail call void @llvm.assume(i1 noundef %23) #23
+  call void @llvm.lifetime.end.p0(i64 noundef 8, ptr noundef nonnull %1) #20
+  br label %27
+
+27: ; preds = %26, %11, %10, %0
+  ret void
+}
+
+attributes #0 = { convergent noinline norecurse nounwind optnone "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx83,+sm_70" }
+attributes #1 = { convergent mustprogress noinline norecurse nounwind optnone "frame-pointer"="all" "kernel" "no-trapping-math"="true" "omp_target_thread_limit"="128" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx83,+sm_70" }
+attributes #2 = { convergent "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx83,+sm_70" }
+attributes #3 = { convergent noinline nounwind optnone "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx83,+sm_70" }
+attributes #4 = { convergent mustprogress nounwind "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx63,+ptx83,+sm_70" }
+attributes #5 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #6 = { nocallback nofree nounwind willreturn memory(argmem: write) }
+attributes #7 = { convergent mustprogress noinline norecurse nounwind "frame-pointer"="all" "llvm.assume"="ompx_aligned_barrier" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx63,+ptx83,+sm_70" }
+attributes #8 = { cold convergent mustprogress noreturn nounwind "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx63,+ptx83,+sm_70" }
+attributes #9 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) }
+attributes #10 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
+attributes #11 = { convergent nocallback nounwind }
+attributes #12 = { convergent mustprogress nofree noinline norecurse nosync nounwind willreturn memory(read, argmem: write, inaccessiblemem: none) "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx63,+ptx83,+sm_70" }
+attributes #13 = { convergent mustprogress noinline nounwind "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx63,+ptx83,+sm_70" }
+attributes #14 = { convergent mustprogress nounwind willreturn allockind("free") memory(argmem: readwrite, inaccessiblemem: readwrite) "alloc-family"="malloc" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx63,+ptx83,+sm_70" }
+attributes #15 = { convergent mustprogress nounwind "frame-pointer"="all" "no-builtin-printf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx63,+ptx83,+sm_70" }
+attributes #16 = { cold noreturn nounwind memory(inaccessiblemem: write) }
+attributes #17 = { nocallback nofree nosync nounwind willreturn }
+attributes #18 = { convergent nounwind "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx63,+ptx83,+sm_70" }
+attributes #19 = { convergent }
+attributes #20 = { nounwind }
+attributes #21 = { convergent nounwind "llvm.assume"="ompx_aligned_barrier" }
+attributes #22 = { convergent noreturn nounwind }
+attributes #23 = { memory(write) }
+attributes #24 = { convergent nounwind }
+attributes #25 = { "llvm.assume"="ompx_aligned_barrier" }
+attributes #26 = { noreturn }
+attributes #27 = { nofree willreturn }
+attributes #28 = { convergent nounwind willreturn }
+attributes #29 = { nofree nounwind willreturn }
+
+!llvm.module.flags = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10}
+!llvm.dbg.cu = !{!11}
+!nvvm.annotations = !{!13}
+!omp_offload.info = !{!14}
+!llvm.ident = !{!15, !16, !15, !15, !15, !15, !15, !15, !15, !15, !15, !15, !15, !15, !15, !15, !15}
+!nvvmir.version = !{!17}
+
+!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 12, i32 3]}
+!1 = !{i32 7, !"Dwarf Version", i32 2}
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = !{i32 1, !"wchar_size", i32 4}
+!4 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
+!5 = !{i32 7, !"openmp", i32 51}
+!6 = !{i32 7, !"openmp-device", i32 51}
+!7 = !{i32 8, !"PIC Level", i32 2}
+!8 = !{i32 7, !"frame-pointer", i32 2}
+!9 = !{i32 1, !"ThinLTO", i32 0}
+!10 = !{i32 1, !"EnableSplitLTOUnit", i32 1}
+!11 = distinct !DICompileUnit(language: DW_LANG_C11, file: !12, producer: "clang version 20.0.0git (/tmp/llvm/clang b9447c03a9ef2eed55b685a33511df86f7f94e89)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None)
+!12 = !DIFile(filename: "test.c", directory: "/tmp")
+!13 = !{ptr @__omp_offloading_fd02_1116d6_h_l12, !"maxntidx", i32 128}
+!14 = !{i32 0, i32 64770, i32 1119958, !"h", i32 12, i32 0, i32 0}
+!15 = !{!"clang version 20.0.0git (/tmp/llvm/clang b9447c03a9ef2eed55b685a33511df86f7f94e89)"}
+!16 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
+!17 = !{i32 2, i32 0}
+!18 = distinct !DISubprogram(name: "__omp_offloading_fd02_1116d6_h_l12_debug__", scope: !12, file: !12, line: 13, type: !19, scopeLine: 13, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !11, retainedNodes: !24)
+!19 = !DISubroutineType(types: !20)
+!20 = !{null, !21}
+!21 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !22)
+!22 = !DIDerivedType(tag: DW_TAG_restrict_type, baseType: !23)
+!23 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: null, size: 64)
+!24 = !{}
+!25 = !DILocalVariable(name: "dyn_ptr", arg: 1, scope: !18, type: !21, flags: DIFlagArtificial)
+!26 = !DILocation(line: 0, scope: !18)
+!27 = !DILocation(line: 13, column: 3, scope: !18)
+!28 = !DILocalVariable(name: "i", scope: !29, file: !12, line: 14, type: !30)
+!29 = distinct !DILexicalBlock(scope: !18, file: !12, line: 13, column: 3)
+!30 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!31 = !DILocation(line: 14, column: 9, scope: !29)
+!32 = !DILocalVariable(name: "a", scope: !29, file: !12, line: 15, type: !33)
+!33 = !DICompositeType(tag: DW_TAG_array_type, baseType: !30, size: 64, elements: !34)
+!34 = !{!35}
+!35 = !DISubrange(count: 2)
+!36 = !DILocation(line: 15, column: 9, scope: !29)
+!37 = !DILocation(line: 16, column: 5, scope: !29)
+!38 = !DILocation(line: 17, column: 5, scope: !29)
+!39 = !DILocation(line: 18, column: 3, scope: !29)
+!40 = !DILocation(line: 18, column: 3, scope: !18)
+!41 = distinct !DISubprogram(name: "__omp_offloading_fd02_1116d6_h_l12", scope: !12, file: !12, line: 12, type: !19, scopeLine: 12, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !11, retainedNodes: !24)
+!42 = !DILocalVariable(name: "dyn_ptr", arg: 1, scope: !41, type: !21, flags: DIFlagArtificial)
+!43 = !DILocation(line: 0, scope: !41)
+!44 = !DILocation(line: 12, column: 1, scope: !41)
+!45 = distinct !DISubprogram(name: "g", scope: !12, file: !12, line: 3, type: !46, scopeLine: 3, spFlags: DISPFlagDefinition, unit: !11, retainedNodes: !24)
+!46 = !DISubroutineType(types: !47)
+!47 = !{null}
+!48 = !DILocalVariable(name: "i", scope: !45, file: !12, line: 4, type: !30)
+!49 = !DILocation(line: 4, column: 7, scope: !45)
+!50 = !DILocalVariable(name: "a", scope: !45, file: !12, line: 5, type: !33)
+!51 = !DILocation(line: 5, column: 7, scope: !45)
+!52 = !DILocation(line: 6, column: 3, scope: !45)
+!53 = !DILocation(line: 7, column: 3, scope: !45)
+!54 = !DILocation(line: 8, column: 1, scope: !45)
+!55 = !{!56, !59, i64 2}
+!56 = !{!"_ZTS26ConfigurationEnvironmentTy", !57, i64 0, !57, i64 1, !59, i64 2, !60, i64 4, !60, i64 8, !60, i64 12, !60, i64 16, !60, i64 20, !60, i64 24}
+!57 = !{!"omnipotent char", !58, i64 0}
+!58 = !{!"Simple C++ TBAA"}
+!59 = !{!"_ZTSN4llvm3omp19OMPTgtExecModeFlagsE", !57, i64 0}
+!60 = !{!"int", !57, i64 0}
+!61 = !{!56, !57, i64 0}
+!62 = !{!60, !60, i64 0}
+!63 = !{!57, !57, i64 0}
+!64 = !{!65, !60, i64 16}
+!65 = !{!"_ZTSN4ompx5state11TeamStateTyE", !66, i64 0, !60, i64 28, !60, i64 32, !67, i64 40}
+!66 = !{!"_ZTSN4ompx5state10ICVStateTyE", !60, i64 0, !60, i64 4, !60, i64 8, !60, i64 12, !60, i64 16, !60, i64 20, !60, i64 24}
+!67 = !{!"p1 void", !68, i64 0}
+!68 = !{!"any pointer", !57, i64 0}
+!69 = !{!65, !60, i64 20}
+!70 = !{!65, !60, i64 24}
+!71 = !{!65, !60, i64 28}
+!72 = !{!65, !60, i64 32}
+!73 = !{!65, !67, i64 40}
+!74 = !{!75, !75, i64 0}
+!75 = !{!"p2 _ZTSN4ompx5state13ThreadStateTyE", !68, i64 0}
+!76 = !{!77, !77, i64 0}
+!77 = !{!"p1 _ZTS19KernelEnvironmentTy", !68, i64 0}
+!78 = !{!79, !79, i64 0}
+!79 = !{!"p1 _ZTS25KernelLaunchEnvironmentTy", !68, i64 0}
+!80 = !{!81, !81, i64 0}
+!81 = !{!"p2 _ZTS22DynamicScheduleTracker", !68, i64 0}
+!82 = !{i32 1, i32 1025}
+!83 = !{!84, !60, i64 0}
+!84 = !{!"_ZTS19DeviceEnvironmentTy", !60, i64 0, !60, i64 4, !60, i64 8, !60, i64 12, !85, i64 16, !85, i64 24, !85, i64 32, !85, i64 40}
+!85 = !{!"long", !57, i64 0}
+!86 = !{!66, !60, i64 0}
+!87 = !{!66, !60, i64 4}
+!88 = !{!66, !60, i64 8}
+!89 = !{!66, !60, i64 16}
+!90 = !{!66, !60, i64 20}
+!91 = !{!66, !60, i64 24}
+!92 = !{i32 0, i32 1024}
+!93 = !{!67, !67, i64 0}
+!94 = distinct !{!94, !95}
+!95 = !{!"llvm.loop.mustprogress"}
+!96 = !{!97, !97, i64 0}
+!97 = !{!"p1 _ZTSN4ompx5state13ThreadStateTyE", !68, i64 0}
+!98 = !{!"branch_weights", !"expected", i32 2000, i32 1}
+!99 = !{!100, !97, i64 32}
+!100 = !{!"_ZTSN4ompx5state13ThreadStateTyE", !66, i64 0, !97, i64 32}
+!101 = !{!102, !102, i64 0}
+!102 = !{!"p1 omnipotent char", !68, i64 0}
+!103 = !{!104, !57, i64 0}
+!104 = !{!"_ZTS19KernelEnvironmentTy", !56, i64 0, !105, i64 32, !106, i64 40}
+!105 = !{!"p1 _ZTS7IdentTy", !68, i64 0}
+!106 = !{!"p1 _ZTS20DynamicEnvironmentTy", !68, i64 0}