From 1a2575e83c3ffdaa1d9192474ca0198589943747 Mon Sep 17 00:00:00 2001
From: Steffen Larsen
Date: Thu, 28 May 2020 15:45:52 +0100
Subject: [PATCH 1/3] [SYCL][PTX][CUDA] Implicit global offset implementation

This commit implements implicit global offset behavior for the kernels
generated for the PI CUDA backend. This includes the following changes:

* A new builtin `__builtin_ptx_implicit_offset` and intrinsic
  `llvm.nvvm.implicit.offset` for getting the global offset. For the
  `ptx-nvidiacl` target this is used to implement the
  `__spirv_GlobalOffset` builtin.
* A new pass that iterates over the uses of the
  `llvm.nvvm.implicit.offset` intrinsic, replacing it with a new function
  parameter. It then moves up the call-tree, adjusting calls to functions
  with this new parameter by adding a similar parameter to callers that
  do not have it and passing it along in the calls. The exception is
  entry points: these are cloned, the clone is given the new parameter,
  and the original uses an offset of `{0,0,0}` in all uses of the
  intrinsic or of functions with the new parameter. Entry points that
  are not cloned are invariant to the offset parameter.

Additionally, the PI CUDA backend now includes an offset parameter in
the set of arguments for kernels. PI CUDA attempts to load the
corresponding kernel both with and without the global offset parameter.
If present, the kernel with the offset parameter is used only when a
non-zero global offset is given.

Co-authored-by: David Wood
Co-authored-by: Victor Lomuller
Signed-off-by: Steffen Larsen
---
 clang/include/clang/Basic/BuiltinsNVPTX.def   |   3 +
 .../libspirv/workitem/get_global_id.cl        |   9 +-
 .../libspirv/workitem/get_global_offset.cl    |   6 +-
 llvm/include/llvm/IR/IntrinsicsNVVM.td        |   6 +
 llvm/lib/Target/NVPTX/CMakeLists.txt          |   1 +
 llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp  |   6 +-
 llvm/lib/Target/NVPTX/SYCL/GlobalOffset.cpp   | 368 ++++++++++++++++++
 llvm/lib/Target/NVPTX/SYCL/GlobalOffset.h     |  27 ++
 .../NVPTX/global-offset-invalid-triple.ll     |  35 ++
 ...offset-multiple-calls-from-one-function.ll |  67 ++++
 .../global-offset-multiple-entry-points.ll    | 108 +++++
 .../CodeGen/NVPTX/global-offset-simple.ll     |  52 +++
 .../NVPTX/global-offset-valid-triple.ll       |  43 ++
 sycl/plugins/cuda/pi_cuda.cpp                 |  33 +-
 sycl/plugins/cuda/pi_cuda.hpp                 |  47 ++-
 .../basic_tests/parallel_for_indexers.cpp     |   4 -
 sycl/unittests/pi/cuda/test_kernels.cpp       |  17 +-
 17 files changed, 804 insertions(+), 28 deletions(-)
 create mode 100644 llvm/lib/Target/NVPTX/SYCL/GlobalOffset.cpp
 create mode 100644 llvm/lib/Target/NVPTX/SYCL/GlobalOffset.h
 create mode 100644 llvm/test/CodeGen/NVPTX/global-offset-invalid-triple.ll
 create mode 100644 llvm/test/CodeGen/NVPTX/global-offset-multiple-calls-from-one-function.ll
 create mode 100644 llvm/test/CodeGen/NVPTX/global-offset-multiple-entry-points.ll
 create mode 100644 llvm/test/CodeGen/NVPTX/global-offset-simple.ll
 create mode 100644 llvm/test/CodeGen/NVPTX/global-offset-valid-triple.ll

diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def b/clang/include/clang/Basic/BuiltinsNVPTX.def
index 759c91290a60c..9a4a3ff013973 100644
--- a/clang/include/clang/Basic/BuiltinsNVPTX.def
+++ b/clang/include/clang/Basic/BuiltinsNVPTX.def
@@ -89,6 +89,9 @@ BUILTIN(__nvvm_read_ptx_sreg_pm1, "i", "n")
 BUILTIN(__nvvm_read_ptx_sreg_pm2, "i", "n")
 BUILTIN(__nvvm_read_ptx_sreg_pm3, "i", "n")
 
+// SYCL
+BUILTIN(__builtin_ptx_implicit_offset, "Ui*", "nc")
+
 // MISC
 
 BUILTIN(__nvvm_prmt, "UiUiUiUi", "")

diff --git a/libclc/ptx-nvidiacl/libspirv/workitem/get_global_id.cl b/libclc/ptx-nvidiacl/libspirv/workitem/get_global_id.cl
index da96caffb4f75..b856302625f42 100644
--- a/libclc/ptx-nvidiacl/libspirv/workitem/get_global_id.cl
+++ b/libclc/ptx-nvidiacl/libspirv/workitem/get_global_id.cl
@@ -9,13 +9,16 @@
 #include <spirv/spirv.h>
 
 _CLC_DEF _CLC_OVERLOAD size_t __spirv_GlobalInvocationId_x() {
-  return __spirv_WorkgroupId_x() * __spirv_WorkgroupSize_x() + __spirv_LocalInvocationId_x();
+  return __spirv_WorkgroupId_x() * __spirv_WorkgroupSize_x() +
+         __spirv_LocalInvocationId_x() + __spirv_GlobalOffset_x();
 }
 
 _CLC_DEF _CLC_OVERLOAD size_t __spirv_GlobalInvocationId_y() {
-  return __spirv_WorkgroupId_y() * __spirv_WorkgroupSize_y() + __spirv_LocalInvocationId_y();
+  return __spirv_WorkgroupId_y() * __spirv_WorkgroupSize_y() +
+         __spirv_LocalInvocationId_y() + __spirv_GlobalOffset_y();
 }
 
 _CLC_DEF _CLC_OVERLOAD size_t __spirv_GlobalInvocationId_z() {
-  return __spirv_WorkgroupId_z() * __spirv_WorkgroupSize_z() + __spirv_LocalInvocationId_z();
+  return __spirv_WorkgroupId_z() * __spirv_WorkgroupSize_z() +
+         __spirv_LocalInvocationId_z() + __spirv_GlobalOffset_z();
 }
diff --git a/libclc/ptx-nvidiacl/libspirv/workitem/get_global_offset.cl b/libclc/ptx-nvidiacl/libspirv/workitem/get_global_offset.cl
index de269c76602be..7eae8cf43c20e 100644
--- a/libclc/ptx-nvidiacl/libspirv/workitem/get_global_offset.cl
+++ b/libclc/ptx-nvidiacl/libspirv/workitem/get_global_offset.cl
@@ -11,13 +11,13 @@
 // Compiler support is required to provide global offset on NVPTX.
 
 _CLC_DEF _CLC_OVERLOAD size_t __spirv_GlobalOffset_x() {
-  return 0;
+  return __builtin_ptx_implicit_offset()[0];
 }
 
 _CLC_DEF _CLC_OVERLOAD size_t __spirv_GlobalOffset_y() {
-  return 0;
+  return __builtin_ptx_implicit_offset()[1];
 }
 
 _CLC_DEF _CLC_OVERLOAD size_t __spirv_GlobalOffset_z() {
-  return 0;
+  return __builtin_ptx_implicit_offset()[2];
 }
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 61293418ec41d..9de9400848709 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -4161,4 +4161,10 @@ foreach layout_a = ["row", "col"] in {
   } // layout_b
 } // layout_a
 
+// SYCL
+def int_nvvm_implicit_offset :
+  GCCBuiltin<"__builtin_ptx_implicit_offset">,
+  Intrinsic<[LLVMPointerType<llvm_i32_ty>], [],
+            [IntrNoMem, IntrSpeculatable]>;
+
 } // let TargetPrefix = "nvvm"
diff --git a/llvm/lib/Target/NVPTX/CMakeLists.txt b/llvm/lib/Target/NVPTX/CMakeLists.txt
index 097fc26cdab2c..9e9f7ebb63887 100644
--- a/llvm/lib/Target/NVPTX/CMakeLists.txt
+++ b/llvm/lib/Target/NVPTX/CMakeLists.txt
@@ -33,6 +33,7 @@ set(NVPTXCodeGen_sources
   NVVMIntrRange.cpp
   NVVMReflect.cpp
   NVPTXProxyRegErasure.cpp
+  SYCL/GlobalOffset.cpp
   SYCL/LocalAccessorToSharedMemory.cpp
   )
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
index e5c89a191cc0e..47729e957d533 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -16,8 +16,9 @@
 #include "NVPTXLowerAggrCopies.h"
 #include "NVPTXTargetObjectFile.h"
 #include "NVPTXTargetTransformInfo.h"
-#include "TargetInfo/NVPTXTargetInfo.h"
+#include "SYCL/GlobalOffset.h"
 #include "SYCL/LocalAccessorToSharedMemory.h"
+#include "TargetInfo/NVPTXTargetInfo.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/Triple.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
@@ -71,6 +72,7 @@ void initializeNVPTXLowerArgsPass(PassRegistry &);
 void initializeNVPTXLowerAllocaPass(PassRegistry &);
 void initializeNVPTXProxyRegErasurePass(PassRegistry &);
+void initializeGlobalOffsetPass(PassRegistry &);
 void initializeLocalAccessorToSharedMemoryPass(PassRegistry &);
 
 } // end namespace llvm
@@ -94,6 +96,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeNVPTXTarget() {
   initializeNVPTXProxyRegErasurePass(PR);
 
   // SYCL-specific passes, needed here to be available to `opt`.
+  initializeGlobalOffsetPass(PR);
   initializeLocalAccessorToSharedMemoryPass(PR);
 }
@@ -274,6 +277,7 @@ void NVPTXPassConfig::addIRPasses() {
 
   if (getTM().getTargetTriple().getOS() == Triple::CUDA &&
       getTM().getTargetTriple().getEnvironment() == Triple::SYCLDevice) {
+    addPass(createGlobalOffsetPass());
     addPass(createLocalAccessorToSharedMemoryPass());
   }
diff --git a/llvm/lib/Target/NVPTX/SYCL/GlobalOffset.cpp b/llvm/lib/Target/NVPTX/SYCL/GlobalOffset.cpp
new file mode 100644
index 0000000000000..3f134f2867934
--- /dev/null
+++ b/llvm/lib/Target/NVPTX/SYCL/GlobalOffset.cpp
@@ -0,0 +1,368 @@
+//===--------- GlobalOffset.cpp - Global Offset Support for CUDA --------- ===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass operates on SYCL kernels being compiled to CUDA. It looks for uses
+// of the `llvm.nvvm.implicit.offset` intrinsic and replaces it with an offset
+// parameter which will be threaded through from the kernel entry point.
+//
+//===----------------------------------------------------------------------===//
+
+#include "GlobalOffset.h"
+
+#include "../MCTargetDesc/NVPTXBaseInfo.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicsNVPTX.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/Target/TargetIntrinsicInfo.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "globaloffset"
+
+namespace llvm {
+void initializeGlobalOffsetPass(PassRegistry &);
+} // end namespace llvm
+
+namespace {
+
+class GlobalOffset : public ModulePass {
+public:
+  static char ID;
+  GlobalOffset() : ModulePass(ID) {}
+
+  bool runOnModule(Module &M) override {
+    if (skipModule(M))
+      return false;
+
+    llvm::Function *ImplicitOffsetIntrinsic =
+        M.getFunction(Intrinsic::getName(Intrinsic::nvvm_implicit_offset));
+    KernelImplicitArgumentType =
+        ArrayType::get(Type::getInt32Ty(M.getContext()), 3);
+    ImplicitOffsetPtrType = Type::getInt32Ty(M.getContext())->getPointerTo();
+    assert(
+        (!ImplicitOffsetIntrinsic ||
+         ImplicitOffsetIntrinsic->getReturnType() == ImplicitOffsetPtrType) &&
+        "Intrinsic::nvvm_implicit_offset does not return the expected "
+        "type");
+
+    if (!ImplicitOffsetIntrinsic || ImplicitOffsetIntrinsic->use_empty()) {
+      return false;
+    }
+
+    // Find all entry points.
+    EntryPointMetadata = getEntryPointMetadata(M);
+
+    // Add implicit parameters to all direct and indirect users of the offset
+    this->addImplicitParameterToCallers(M, ImplicitOffsetIntrinsic, nullptr);
+
+    // Assert that all uses of `ImplicitOffsetIntrinsic` are removed and delete
+    // it.
+    assert(ImplicitOffsetIntrinsic->use_empty() &&
+           "Not all uses of intrinsic removed");
+    ImplicitOffsetIntrinsic->eraseFromParent();
+
+    return true;
+  }
+
+  void processKernelEntryPoint(Module &M, Function *Func) {
+    assert(EntryPointMetadata.count(Func) != 0 &&
+           "Function must be an entry point");
+
+    LLVMContext &Ctx = M.getContext();
+    MDNode *FuncMetadata = EntryPointMetadata[Func];
+
+    bool AlreadyProcessed = this->ProcessedFunctions.count(Func) == 1;
+    if (AlreadyProcessed)
+      return;
+
+    // Add the new argument to all other kernel entry points, despite not
+    // using the global offset.
+    auto NvvmMetadata = M.getNamedMetadata("nvvm.annotations");
+    assert(NvvmMetadata && "IR compiled to PTX must have nvvm.annotations");
+
+    auto NewFunc = this->addOffsetArgumentToFunction(
+                           M, Func, KernelImplicitArgumentType->getPointerTo(),
+                           /*KeepOriginal=*/true)
+                       .first;
+    Argument *NewArgument = NewFunc->arg_begin() + (NewFunc->arg_size() - 1);
+    // Pass the values by value to the kernel
+    NewArgument->addAttr(
+        Attribute::getWithByValType(Ctx, KernelImplicitArgumentType));
+
+    // Add the metadata.
+    Metadata *NewMetadata[] = {ConstantAsMetadata::get(NewFunc),
+                               FuncMetadata->getOperand(1),
+                               FuncMetadata->getOperand(2)};
+    NvvmMetadata->addOperand(MDNode::get(Ctx, NewMetadata));
+
+    // Create alloca of zeros for the implicit offset in the original func.
+    BasicBlock *EntryBlock = &Func->getEntryBlock();
+    IRBuilder<> Builder(EntryBlock, EntryBlock->getFirstInsertionPt());
+    Type *ImplicitOffsetType =
+        ArrayType::get(Type::getInt32Ty(M.getContext()), 3);
+    AllocaInst *ImplicitOffset = Builder.CreateAlloca(ImplicitOffsetType);
+    uint64_t AllocByteSize =
+        ImplicitOffset->getAllocationSizeInBits(M.getDataLayout()).getValue() /
+        8;
+    CallInst *MemsetCall =
+        Builder.CreateMemSet(ImplicitOffset, Builder.getInt8(0), AllocByteSize,
+                             ImplicitOffset->getAlign());
+    MemsetCall->addParamAttr(0, Attribute::NonNull);
+    MemsetCall->addDereferenceableAttr(1, AllocByteSize);
+    this->ProcessedFunctions[Func] = Builder.CreateConstInBoundsGEP2_32(
+        ImplicitOffsetType, ImplicitOffset, 0, 0);
+  }
+
+  // This function adds an implicit parameter to the function containing a call
+  // instruction to the implicit offset intrinsic or another function (which
+  // eventually calls the intrinsic). If the call instruction is to the
+  // implicit offset intrinsic, then the intrinsic is replaced with the
+  // parameter that was added.
+  //
+  // `Callee` is either a function to which this transformation has already
+  // been applied, or the implicit offset intrinsic. `CalleeWithImplicitParam`
+  // indicates whether `Callee` is the implicit intrinsic (when `nullptr`) or
+  // another function (when not `nullptr`) - this is used to know whether calls
+  // to it need to have the implicit parameter added to them or replaced with
+  // the implicit parameter.
+  //
+  // Once the function, say `F`, containing a call to `Callee` has the implicit
+  // parameter added, callers of `F` are processed by recursively calling this
+  // function, passing `F` to `CalleeWithImplicitParam`.
+  //
+  // Since the cloning of entry points may alter the users of a function, the
+  // cloning must be done as early as possible, so as to ensure that no users
+  // are added to previous callees in the call-tree.
+  void addImplicitParameterToCallers(Module &M, Value *Callee,
+                                     Function *CalleeWithImplicitParam) {
+
+    // Make sure that all entry point callers are processed.
+    SmallVector<User *, 8> Users{Callee->users()};
+    for (User *U : Users) {
+      auto *Call = dyn_cast<CallInst>(U);
+      if (!Call)
+        continue;
+
+      Function *Caller = Call->getFunction();
+      if (EntryPointMetadata.count(Caller) != 0) {
+        processKernelEntryPoint(M, Caller);
+      }
+    }
+
+    // The user collection may have changed, so we reinitialize it.
+    Users = SmallVector<User *, 8>{Callee->users()};
+    for (User *U : Users) {
+      auto *CallToOld = dyn_cast<CallInst>(U);
+      if (!CallToOld)
+        return;
+
+      auto Caller = CallToOld->getFunction();
+
+      // Determine if `Caller` needs processing or if this is another callsite
+      // from an already-processed function.
+      Function *NewFunc;
+      Value *ImplicitOffset = this->ProcessedFunctions[Caller];
+      bool AlreadyProcessed = ImplicitOffset != nullptr;
+      if (AlreadyProcessed) {
+        NewFunc = Caller;
+      } else {
+        std::tie(NewFunc, ImplicitOffset) =
+            this->addOffsetArgumentToFunction(M, Caller);
+      }
+
+      if (!CalleeWithImplicitParam) {
+        // Replace the intrinsic call with the parameter.
+        CallToOld->replaceAllUsesWith(ImplicitOffset);
+      } else {
+        // Build up a list of arguments to call the modified function with.
+        llvm::SmallVector<Value *, 8> ImplicitOffsets;
+        for (Use &U : CallToOld->args()) {
+          ImplicitOffsets.push_back(U);
+        }
+        ImplicitOffsets.push_back(ImplicitOffset);
+
+        // Replace the call to the other function (which now has a new
+        // parameter) with a call to that same function that includes the new
+        // parameter.
+        auto NewCaller = CallInst::Create(
+            /* Ty= */ CalleeWithImplicitParam->getFunctionType(),
+            /* Func= */ CalleeWithImplicitParam,
+            /* Args= */ ImplicitOffsets,
+            /* NameStr= */ Twine(),
+            /* InsertBefore= */ CallToOld);
+        NewCaller->setTailCallKind(CallToOld->getTailCallKind());
+        CallToOld->replaceAllUsesWith(NewCaller);
+
+        if (CallToOld->hasName()) {
+          NewCaller->takeName(CallToOld);
+        }
+      }
+
+      // Remove the call now that it has been replaced.
+      CallToOld->eraseFromParent();
+
+      if (!AlreadyProcessed) {
+        // Process callers of the old function.
+        this->addImplicitParameterToCallers(M, Caller, NewFunc);
+
+        // Now that the old function is dead, delete it.
+        Caller->dropAllReferences();
+        Caller->eraseFromParent();
+      }
+    }
+  }
+
+  std::pair<Function *, Value *>
+  addOffsetArgumentToFunction(Module &M, Function *Func,
+                              Type *ImplicitArgumentType = nullptr,
+                              bool KeepOriginal = false) {
+    FunctionType *FuncTy = Func->getFunctionType();
+    const AttributeList &FuncAttrs = Func->getAttributes();
+    ImplicitArgumentType =
+        ImplicitArgumentType ? ImplicitArgumentType : ImplicitOffsetPtrType;
+
+    // Construct an argument list containing all of the previous arguments.
+    SmallVector<Type *, 8> Arguments;
+    SmallVector<AttributeSet, 8> ArgumentAttributes;
+
+    unsigned i = 0;
+    for (Function::arg_iterator FuncArg = Func->arg_begin(),
+                                FuncEnd = Func->arg_end();
+         FuncArg != FuncEnd; ++FuncArg, ++i) {
+      Arguments.push_back(FuncArg->getType());
+      ArgumentAttributes.push_back(FuncAttrs.getParamAttributes(i));
+    }
+
+    // Add the offset argument. Must be the same type as returned by
+    // `llvm.nvvm.implicit.offset`.
+
+    Arguments.push_back(ImplicitArgumentType);
+    ArgumentAttributes.push_back(AttributeSet());
+
+    // Build the new function.
+    AttributeList NAttrs =
+        AttributeList::get(Func->getContext(), FuncAttrs.getFnAttributes(),
+                           FuncAttrs.getRetAttributes(), ArgumentAttributes);
+    assert(!FuncTy->isVarArg() && "Variadic arguments prohibited in SYCL");
+    FunctionType *NewFuncTy = FunctionType::get(FuncTy->getReturnType(),
+                                                Arguments, FuncTy->isVarArg());
+
+    Function *NewFunc = Function::Create(NewFuncTy, Func->getLinkage(),
+                                         Func->getAddressSpace());
+
+    if (KeepOriginal) {
+      NewFunc->setName(Func->getName() + "_with_offset");
+
+      ValueToValueMapTy VMap;
+      for (Function::arg_iterator FuncArg = Func->arg_begin(),
+                                  FuncEnd = Func->arg_end(),
+                                  NewFuncArg = NewFunc->arg_begin();
+           FuncArg != FuncEnd; ++FuncArg, ++NewFuncArg) {
+        VMap[FuncArg] = NewFuncArg;
+      }
+
+      SmallVector<ReturnInst *, 8> Returns;
+      CloneFunctionInto(NewFunc, Func, VMap, /*ModuleLevelChanges=*/false,
+                        Returns);
+    } else {
+      NewFunc->copyAttributesFrom(Func);
+      NewFunc->setComdat(Func->getComdat());
+      NewFunc->setAttributes(NAttrs);
+      NewFunc->takeName(Func);
+
+      // Splice the body of the old function right into the new function.
+      NewFunc->getBasicBlockList().splice(NewFunc->begin(),
+                                          Func->getBasicBlockList());
+
+      for (Function::arg_iterator FuncArg = Func->arg_begin(),
+                                  FuncEnd = Func->arg_end(),
+                                  NewFuncArg = NewFunc->arg_begin();
+           FuncArg != FuncEnd; ++FuncArg, ++NewFuncArg) {
+        FuncArg->replaceAllUsesWith(NewFuncArg);
+      }
+
+      // Clone metadata of the old function, including debug info descriptor.
+      SmallVector<std::pair<unsigned, MDNode *>, 1> MDs;
+      Func->getAllMetadata(MDs);
+      for (auto MD : MDs)
+        NewFunc->addMetadata(MD.first, *MD.second);
+    }
+
+    // Keep original function ordering.
+    M.getFunctionList().insertAfter(Func->getIterator(), NewFunc);
+
+    Value *ImplicitOffset = NewFunc->arg_begin() + (NewFunc->arg_size() - 1);
+    // Add bitcast to match the return type of the intrinsic if needed.
+    if (ImplicitArgumentType != ImplicitOffsetPtrType) {
+      BasicBlock *EntryBlock = &NewFunc->getEntryBlock();
+      IRBuilder<> Builder(EntryBlock, EntryBlock->getFirstInsertionPt());
+      ImplicitOffset =
+          Builder.CreateBitCast(ImplicitOffset, ImplicitOffsetPtrType);
+    }
+
+    this->ProcessedFunctions[NewFunc] = ImplicitOffset;
+
+    // Return the new function and the offset argument.
+    return {NewFunc, ImplicitOffset};
+  }
+
+  static llvm::DenseMap<Function *, MDNode *> getEntryPointMetadata(Module &M) {
+    auto NvvmMetadata = M.getNamedMetadata("nvvm.annotations");
+    assert(NvvmMetadata && "IR compiled to PTX must have nvvm.annotations");
+
+    llvm::DenseMap<Function *, MDNode *> NvvmEntryPointMetadata;
+    for (auto MetadataNode : NvvmMetadata->operands()) {
+      if (MetadataNode->getNumOperands() != 3)
+        continue;
+
+      // NVPTX identifies kernel entry points using metadata nodes of the form:
+      //   !X = !{<function>, !"kernel", i32 1}
+      auto Type = dyn_cast<MDString>(MetadataNode->getOperand(1));
+      // Only process kernel entry points.
+      if (!Type || Type->getString() != "kernel")
+        continue;
+
+      // Get a pointer to the entry point function from the metadata.
+      auto FuncConstant =
+          dyn_cast<ConstantAsMetadata>(MetadataNode->getOperand(0));
+      if (!FuncConstant)
+        continue;
+      auto Func = dyn_cast<Function>(FuncConstant->getValue());
+      if (!Func)
+        continue;
+
+      assert(Func->use_empty() && "Kernel entry point with uses");
+      NvvmEntryPointMetadata[Func] = MetadataNode;
+    }
+    return NvvmEntryPointMetadata;
+  }
+
+  virtual llvm::StringRef getPassName() const {
+    return "Add implicit SYCL global offset";
+  }
+
+private:
+  // Keep track of which functions have been processed to avoid processing
+  // them twice.
+  llvm::DenseMap<Function *, Value *> ProcessedFunctions;
+  // Keep a map of all entry point functions with their metadata.
+  llvm::DenseMap<Function *, MDNode *> EntryPointMetadata;
+
+  llvm::Type *KernelImplicitArgumentType;
+  llvm::Type *ImplicitOffsetPtrType;
+};
+
+} // end anonymous namespace
+
+char GlobalOffset::ID = 0;
+
+INITIALIZE_PASS(GlobalOffset, "globaloffset", "SYCL Global Offset", false,
+                false)
+
+ModulePass *llvm::createGlobalOffsetPass() { return new GlobalOffset(); }
diff --git a/llvm/lib/Target/NVPTX/SYCL/GlobalOffset.h b/llvm/lib/Target/NVPTX/SYCL/GlobalOffset.h
new file mode 100644
index 0000000000000..8d17219d49a61
--- /dev/null
+++ b/llvm/lib/Target/NVPTX/SYCL/GlobalOffset.h
@@ -0,0 +1,27 @@
+//===--------- GlobalOffset.cpp - Global Offset Support for CUDA --------- ===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass operates on SYCL kernels being compiled to CUDA. It looks for uses
+// of the `llvm.nvvm.implicit.offset` intrinsic and replaces it with an offset
+// parameter which will be threaded through from the kernel entry point.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SYCL_GLOBALOFFSET_H
+#define LLVM_SYCL_GLOBALOFFSET_H
+
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+
+namespace llvm {
+
+ModulePass *createGlobalOffsetPass();
+
+} // end namespace llvm
+
+#endif
diff --git a/llvm/test/CodeGen/NVPTX/global-offset-invalid-triple.ll b/llvm/test/CodeGen/NVPTX/global-offset-invalid-triple.ll
new file mode 100644
index 0000000000000..1eadbf9c2d06b
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/global-offset-invalid-triple.ll
@@ -0,0 +1,35 @@
+; RUN: not --crash llc -march=nvptx64 -mcpu=sm_20 %s -o - 2>&1 | FileCheck %s
+; ModuleID = 'invalid-triple.bc'
+; CHECK: LLVM ERROR: Cannot select: intrinsic %llvm.nvvm.implicit.offset
+source_filename = "invalid-triple.ll"
+target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-nvcl-sycldevice"
+
+; This test checks that the pass does not run on nvcl triples.
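+; (Explanatory note added for clarity: the GlobalOffset pass is only added to
+; the NVPTX IR pipeline when the triple's OS is CUDA and its environment is
+; SYCLDevice - see NVPTXPassConfig::addIRPasses above - so for the nvcl
+; triple here the intrinsic survives to instruction selection and llc is
+; expected to crash.)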
+
+declare i32* @llvm.nvvm.implicit.offset()
+
+define weak_odr dso_local i64 @_ZTS14other_function() {
+  %1 = tail call i32* @llvm.nvvm.implicit.offset()
+  %2 = getelementptr inbounds i32, i32* %1, i64 2
+  %3 = load i32, i32* %2, align 4
+  %4 = zext i32 %3 to i64
+  ret i64 %4
+}
+
+; Function Attrs: noinline
+define weak_odr dso_local void @_ZTS14example_kernel() {
+entry:
+  %0 = call i64 @_ZTS14other_function()
+  ret void
+}
+
+!nvvm.annotations = !{!0, !1, !2, !1, !3, !3, !3, !3, !4, !4, !3}
+!nvvmir.version = !{!5}
+
+!0 = distinct !{void ()* @_ZTS14example_kernel, !"kernel", i32 1}
+!1 = !{null, !"align", i32 8}
+!2 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
+!3 = !{null, !"align", i32 16}
+!4 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
+!5 = !{i32 1, i32 4}
diff --git a/llvm/test/CodeGen/NVPTX/global-offset-multiple-calls-from-one-function.ll b/llvm/test/CodeGen/NVPTX/global-offset-multiple-calls-from-one-function.ll
new file mode 100644
index 0000000000000..79f10729b56c3
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/global-offset-multiple-calls-from-one-function.ll
@@ -0,0 +1,67 @@
+; RUN: opt -globaloffset %s -S -o - | FileCheck %s
+; ModuleID = 'multiple-calls-from-one-function.bc'
+source_filename = "multiple-calls-from-one-function.ll"
+target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-cuda-sycldevice"
+
+; This test checks that, when there are multiple calls to a function that uses
+; the intrinsic, the caller and the callee each end up with only a single
+; variant that takes the offset parameter. It also checks that all calls in a
+; clone to functions that have an offset-taking variant are redirected to the
+; corresponding variants.
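+;
+; Illustrative sketch of the expected shape after -globaloffset (added for
+; clarity; the CHECK lines below are authoritative):
+;   @_ZTS14other_function(i32* %off)        ; single rewritten callee
+;   @_ZTS14example_kernel()                 ; allocates a zeroed [3 x i32]
+;                                           ; and passes it to both calls
+;   @_ZTS14example_kernel_with_offset(...)  ; clone forwarding its byval
+;                                           ; offset argument to both calls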
+
+declare i32* @llvm.nvvm.implicit.offset()
+; CHECK-NOT: declare i32* @llvm.nvvm.implicit.offset()
+
+define weak_odr dso_local i64 @_ZTS14other_function() {
+; CHECK: define weak_odr dso_local i64 @_ZTS14other_function(i32* %0) {
+  %1 = tail call i32* @llvm.nvvm.implicit.offset()
+; CHECK-NOT: tail call i32* @llvm.nvvm.implicit.offset()
+  %2 = getelementptr inbounds i32, i32* %1, i64 2
+; CHECK: %2 = getelementptr inbounds i32, i32* %0, i64 2
+  %3 = load i32, i32* %2, align 4
+  %4 = zext i32 %3 to i64
+
+  %5 = tail call i32* @llvm.nvvm.implicit.offset()
+; CHECK-NOT: tail call i32* @llvm.nvvm.implicit.offset()
+  %6 = getelementptr inbounds i32, i32* %5, i64 2
+; CHECK: %5 = getelementptr inbounds i32, i32* %0, i64 2
+  %7 = load i32, i32* %6, align 4
+  %8 = zext i32 %7 to i64
+
+  ret i64 %4
+}
+
+; Function Attrs: noinline
+define weak_odr dso_local void @_ZTS14example_kernel() {
+entry:
+; CHECK: %0 = alloca [3 x i32], align 4
+; CHECK: %1 = bitcast [3 x i32]* %0 to i8*
+; CHECK: call void @llvm.memset.p0i8.i64(i8* nonnull align 4 dereferenceable(12) %1, i8 0, i64 12, i1 false)
+; CHECK: %2 = getelementptr inbounds [3 x i32], [3 x i32]* %0, i32 0, i32 0
+  %0 = call i64 @_ZTS14other_function()
+; CHECK: %3 = call i64 @_ZTS14other_function(i32* %2)
+  %1 = call i64 @_ZTS14other_function()
+; CHECK: %4 = call i64 @_ZTS14other_function(i32* %2)
+  ret void
+}
+
+; CHECK: define weak_odr dso_local void @_ZTS14example_kernel_with_offset([3 x i32]* byval([3 x i32]) %0) {
+; CHECK: entry:
+; CHECK: %1 = bitcast [3 x i32]* %0 to i32*
+; CHECK: %2 = call i64 @_ZTS14other_function(i32* %1)
+; CHECK: %3 = call i64 @_ZTS14other_function(i32* %1)
+; CHECK: ret void
+; CHECK: }
+
+!nvvm.annotations = !{!0, !1, !2, !1, !3, !3, !3, !3, !4, !4, !3}
+; CHECK: !nvvm.annotations = !{!0, !1, !2, !1, !3, !3, !3, !3, !4, !4, !3, !5}
+!nvvmir.version = !{!6}
+
+!0 = distinct !{void ()* @_ZTS14example_kernel, !"kernel", i32 1}
+!1 = !{null, !"align", i32 8}
+!2 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
+!3 = !{null, !"align", i32 16}
+!4 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
+; CHECK: !5 = !{void ([3 x i32]*)* @_ZTS14example_kernel_with_offset, !"kernel", i32 1}
+!6 = !{i32 1, i32 4}
diff --git a/llvm/test/CodeGen/NVPTX/global-offset-multiple-entry-points.ll b/llvm/test/CodeGen/NVPTX/global-offset-multiple-entry-points.ll
new file mode 100644
index 0000000000000..7784150f1082d
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/global-offset-multiple-entry-points.ll
@@ -0,0 +1,108 @@
+; RUN: opt -globaloffset %s -S -o - | FileCheck %s
+; ModuleID = 'multiple-entry-points.bc'
+source_filename = "multiple-entry-points.ll"
+target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-nvcl-sycldevice"
+
+; This test checks that the pass works with multiple entry points.
+
+declare i32* @llvm.nvvm.implicit.offset()
+; CHECK-NOT: declare i32* @llvm.nvvm.implicit.offset()
+
+; This function is a kernel entry point that does not use global offset. It
+; will not get a clone with a global offset parameter.
+; Function Attrs: noinline
+define weak_odr dso_local void @_ZTS12third_kernel() {
+entry:
+  ret void
+}
+
+define weak_odr dso_local i64 @_ZTS15common_function() {
+; CHECK: define weak_odr dso_local i64 @_ZTS15common_function(i32* %0) {
+  %1 = tail call i32* @llvm.nvvm.implicit.offset()
+; CHECK-NOT: tail call i32* @llvm.nvvm.implicit.offset()
+; CHECK: %2 = getelementptr inbounds i32, i32* %0, i64 2
+  %2 = getelementptr inbounds i32, i32* %1, i64 2
+  %3 = load i32, i32* %2, align 4
+  %4 = zext i32 %3 to i64
+  ret i64 %4
+}
+
+define weak_odr dso_local i64 @_ZTS14first_function() {
+; CHECK: define weak_odr dso_local i64 @_ZTS14first_function(i32* %0) {
+  %1 = call i64 @_ZTS15common_function()
+; CHECK: %2 = call i64 @_ZTS15common_function(i32* %0)
+  ret i64 %1
+}
+
+; Function Attrs: noinline
+define weak_odr dso_local void @_ZTS12first_kernel() {
+entry:
+; CHECK: %0 = alloca [3 x i32], align 4
+; CHECK: %1 = bitcast [3 x i32]* %0 to i8*
+; CHECK: call void @llvm.memset.p0i8.i64(i8* nonnull align 4 dereferenceable(12) %1, i8 0, i64 12, i1 false)
+; CHECK: %2 = getelementptr inbounds [3 x i32], [3 x i32]* %0, i32 0, i32 0
+  %0 = call i64 @_ZTS14first_function()
+; CHECK: %3 = call i64 @_ZTS14first_function(i32* %2)
+  ret void
+}
+
+; CHECK: define weak_odr dso_local void @_ZTS12first_kernel_with_offset([3 x i32]* byval([3 x i32]) %0) {
+; CHECK: entry:
+; CHECK: %1 = bitcast [3 x i32]* %0 to i32*
+; CHECK: %2 = call i64 @_ZTS14first_function(i32* %1)
+; CHECK: ret void
+; CHECK: }
+
+define weak_odr dso_local i64 @_ZTS15second_function() {
+; CHECK: define weak_odr dso_local i64 @_ZTS15second_function(i32* %0) {
+  %1 = call i64 @_ZTS15common_function()
+; CHECK: %2 = call i64 @_ZTS15common_function(i32* %0)
+  ret i64 %1
+}
+
+; Function Attrs: noinline
+define weak_odr dso_local void @_ZTS13second_kernel() {
+entry:
+; CHECK: %0 = alloca [3 x i32], align 4
+; CHECK: %1 = bitcast [3 x i32]* %0 to i8*
+; CHECK: call void @llvm.memset.p0i8.i64(i8* nonnull align 4 dereferenceable(12) %1, i8 0, i64 12, i1 false)
+; CHECK: %2 = getelementptr inbounds [3 x i32], [3 x i32]* %0, i32 0, i32 0
+  %0 = call i64 @_ZTS15second_function()
+; CHECK: %3 = call i64 @_ZTS15second_function(i32* %2)
+  ret void
+}
+
+; CHECK: define weak_odr dso_local void @_ZTS13second_kernel_with_offset([3 x i32]* byval([3 x i32]) %0) {
+; CHECK: entry:
+; CHECK: %1 = bitcast [3 x i32]* %0 to i32*
+; CHECK: %2 = call i64 @_ZTS15second_function(i32* %1)
+; CHECK: ret void
+; CHECK: }
+
+; This function doesn't get called by a kernel entry point.
+define weak_odr dso_local i64 @_ZTS15no_entry_point() {
+; CHECK: define weak_odr dso_local i64 @_ZTS15no_entry_point(i32* %0) {
+  %1 = tail call i32* @llvm.nvvm.implicit.offset()
+; CHECK-NOT: tail call i32* @llvm.nvvm.implicit.offset()
+  %2 = getelementptr inbounds i32, i32* %1, i64 2
+; CHECK: %2 = getelementptr inbounds i32, i32* %0, i64 2
+  %3 = load i32, i32* %2, align 4
+  %4 = zext i32 %3 to i64
+  ret i64 %4
+}
+
+!nvvm.annotations = !{!0, !1, !2, !1, !3, !3, !3, !3, !4, !4, !3, !5, !6}
+; CHECK: !nvvm.annotations = !{!0, !1, !2, !1, !3, !3, !3, !3, !4, !4, !3, !5, !6, !7, !8}
+!nvvmir.version = !{!9}
+
+!0 = distinct !{void ()* @_ZTS12first_kernel, !"kernel", i32 1}
+!1 = !{null, !"align", i32 8}
+!2 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
+!3 = !{null, !"align", i32 16}
+!4 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
+!5 = distinct !{void ()* @_ZTS13second_kernel, !"kernel", i32 1}
+!6 = distinct !{void ()* @_ZTS12third_kernel, !"kernel", i32 1}
+; CHECK: !7 = !{void ([3 x i32]*)* @_ZTS13second_kernel_with_offset, !"kernel", i32 1}
+; CHECK: !8 = !{void ([3 x i32]*)* @_ZTS12first_kernel_with_offset, !"kernel", i32 1}
+!9 = !{i32 1, i32 4}
diff --git a/llvm/test/CodeGen/NVPTX/global-offset-simple.ll b/llvm/test/CodeGen/NVPTX/global-offset-simple.ll
new file mode 100644
index 0000000000000..8c301d5b223cf
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/global-offset-simple.ll
@@ -0,0 +1,52 @@
+; RUN: opt -globaloffset %s -S -o - | FileCheck %s
+; ModuleID = 'simple.bc'
+source_filename = "simple.ll"
+target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-cuda-sycldevice"
+
+; This test checks that the transformation is applied in the basic case.
+
+declare i32* @llvm.nvvm.implicit.offset()
+; CHECK-NOT: llvm.nvvm.implicit.offset
+
+define weak_odr dso_local i64 @_ZTS14other_function() {
+; CHECK: define weak_odr dso_local i64 @_ZTS14other_function(i32* %0) {
+; CHECK: %2 = getelementptr inbounds i32, i32* %0, i64 2
+  %1 = tail call i32* @llvm.nvvm.implicit.offset()
+; CHECK-NOT: tail call i32* @llvm.nvvm.implicit.offset()
+  %2 = getelementptr inbounds i32, i32* %1, i64 2
+  %3 = load i32, i32* %2, align 4
+  %4 = zext i32 %3 to i64
+  ret i64 %4
+}
+
+; Function Attrs: noinline
+define weak_odr dso_local void @_ZTS14example_kernel() {
+entry:
+; CHECK: %0 = alloca [3 x i32], align 4
+; CHECK: %1 = bitcast [3 x i32]* %0 to i8*
+; CHECK: call void @llvm.memset.p0i8.i64(i8* nonnull align 4 dereferenceable(12) %1, i8 0, i64 12, i1 false)
+; CHECK: %2 = getelementptr inbounds [3 x i32], [3 x i32]* %0, i32 0, i32 0
+  %0 = call i64 @_ZTS14other_function()
+; CHECK: %3 = call i64 @_ZTS14other_function(i32* %2)
+  ret void
+}
+
+; CHECK: define weak_odr dso_local void @_ZTS14example_kernel_with_offset([3 x i32]* byval([3 x i32]) %0) {
+; CHECK: entry:
+; CHECK: %1 = bitcast [3 x i32]* %0 to i32*
+; CHECK: %2 = call i64 @_ZTS14other_function(i32* %1)
+; CHECK: ret void
+; CHECK: }
+
+!nvvm.annotations = !{!0, !1, !2, !1, !3, !3, !3, !3, !4, !4, !3}
+; CHECK: !nvvm.annotations = !{!0, !1, !2, !1, !3, !3, !3, !3, !4, !4, !3, !5}
+!nvvmir.version = !{!6}
+
+!0 = distinct !{void ()* @_ZTS14example_kernel, !"kernel", i32 1}
+!1 = !{null, !"align", i32 8}
+!2 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
+!3 = !{null, !"align", i32 16}
+!4 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
+; CHECK: !5 = !{void ([3 x i32]*)* @_ZTS14example_kernel_with_offset, !"kernel", i32 1}
+!6 = !{i32 1, i32 4}
diff --git a/llvm/test/CodeGen/NVPTX/global-offset-valid-triple.ll b/llvm/test/CodeGen/NVPTX/global-offset-valid-triple.ll
new file mode 100644
index 0000000000000..e4ec130659905
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/global-offset-valid-triple.ll
@@ -0,0 +1,43 @@
+; RUN: llc -march=nvptx64 -mcpu=sm_20 < %s | FileCheck %s
+; ModuleID = 'valid-triple.bc'
+; CHECK-NOT: LLVM ERROR: Cannot select: intrinsic %llvm.nvvm.implicit.offset
+source_filename = "valid-triple.ll"
+target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-cuda-sycldevice"
+
+; This test checks that the pass does run on cuda triples.
+
+declare i32* @llvm.nvvm.implicit.offset()
+
+define weak_odr dso_local i64 @_ZTS14other_function() {
+  %1 = tail call i32* @llvm.nvvm.implicit.offset()
+  %2 = getelementptr inbounds i32, i32* %1, i64 2
+  %3 = load i32, i32* %2, align 4
+  %4 = zext i32 %3 to i64
+  ret i64 %4
+}
+
+; Function Attrs: noinline
+define weak_odr dso_local void @_ZTS14example_kernel() {
+entry:
+  %0 = call i64 @_ZTS14other_function()
+  ret void
+}
+
+!nvvm.annotations = !{!0, !1, !2, !1, !3, !3, !3, !3, !4, !4, !3}
+!llvm.ident = !{!7, !8}
+!nvvmir.version = !{!9}
+!llvm.module.flags = !{!10, !11}
+
+!0 = distinct !{void ()* @_ZTS14example_kernel, !"kernel", i32 1}
+!1 = !{null, !"align", i32 8}
+!2 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
+!3 = !{null, !"align", i32 16}
+!4 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
+!5 = !{i32 1, i32 2}
+!6 = !{i32 4, i32 100000}
+!7 = !{!"clang version 9.0.0"}
+!8 = !{!"clang version 9.0.0"}
+!9 = !{i32 1, i32 4}
+!10 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 0]}
+!11 = !{i32 1, !"wchar_size", i32 4}
diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp
index c839ab88707df..0302643294e6c 100644
--- a/sycl/plugins/cuda/pi_cuda.cpp
+++ b/sycl/plugins/cuda/pi_cuda.cpp
@@ -1997,13 +1997,26 @@ pi_result cuda_piKernelCreate(pi_program program, const char *kernel_name,
 
   try {
     ScopedContext active(program->get_context());
+
     CUfunction cuFunc;
     retErr = PI_CHECK_ERROR(
         cuModuleGetFunction(&cuFunc, program->get(), kernel_name));
-    retKernel = std::unique_ptr<_pi_kernel>(
-        new _pi_kernel{cuFunc, kernel_name, program, program->get_context()});
+
+    std::string kernel_name_woffset = std::string(kernel_name) + "_with_offset";
+    CUfunction cuFuncWithOffsetParam;
+    CUresult offsetRes = cuModuleGetFunction(
+        &cuFuncWithOffsetParam, program->get(), kernel_name_woffset.c_str());
+
+    // If there is no kernel with global offset parameter we mark it as missing
+    if (offsetRes == CUDA_ERROR_NOT_FOUND) {
+      cuFuncWithOffsetParam = nullptr;
+    } else {
+      retErr = PI_CHECK_ERROR(offsetRes);
+    }
+
+    retKernel = std::unique_ptr<_pi_kernel>(
+        new _pi_kernel{cuFunc, cuFuncWithOffsetParam, kernel_name, program,
+                       program->get_context()});
   } catch (pi_result err) {
     retErr = err;
   } catch (...) {
@@ -2071,6 +2084,22 @@ pi_result cuda_piEnqueueKernelLaunch(
   retError = cuda_piEnqueueEventsWait(command_queue, num_events_in_wait_list,
                                       event_wait_list, nullptr);
 
+  // Set the implicit global offset parameter if kernel has offset variant
+  if (kernel->get_with_offset_parameter()) {
+    std::uint32_t cuda_implicit_offset[3] = {0, 0, 0};
+    if (global_work_offset) {
+      for (size_t i = 0; i < work_dim; i++) {
+        cuda_implicit_offset[i] =
+            static_cast<std::uint32_t>(global_work_offset[i]);
+        if (global_work_offset[i] != 0) {
+          cuFunc = kernel->get_with_offset_parameter();
+        }
+      }
+    }
+    kernel->set_implicit_offset_arg(sizeof(cuda_implicit_offset),
+                                    cuda_implicit_offset);
+  }
+
   // Set the number of threads per block to the number of threads per warp
   // by default unless user has provided a better number
   int threadsPerBlock[3] = {32, 1, 1};
diff --git a/sycl/plugins/cuda/pi_cuda.hpp b/sycl/plugins/cuda/pi_cuda.hpp
index eac59b2a724a5..0a8ef2fe2f366 100644
--- a/sycl/plugins/cuda/pi_cuda.hpp
+++ b/sycl/plugins/cuda/pi_cuda.hpp
@@ -453,6 +453,7 @@ struct _pi_kernel {
   using native_type = CUfunction;
 
   native_type function_;
+  native_type functionWithOffsetParam_;
   std::string name_;
   pi_context context_;
   pi_program program_;
@@ -475,14 +476,23 @@ struct _pi_kernel {
     args_index_t indices_;
     args_size_t offsetPerIndex_;
 
+    std::uint32_t implicitOffsetArgs_[3] = {0, 0, 0};
+
+    arguments() {
+      // Place the implicit offset index at the end of the indices collection.
+      indices_.emplace_back(&implicitOffsetArgs_);
+    }
+
     /// Adds an argument to the kernel.
     /// If the argument existed before, it is replaced.
     /// Otherwise, it is added.
    /// Gaps are filled with empty arguments.
+    /// The implicit offset argument is kept at the back of the indices
+    /// collection.
     void add_arg(size_t index, size_t size, const void *arg,
                  size_t localSize = 0) {
-      if (index + 1 > indices_.size()) {
-        indices_.resize(index + 1);
+      if (index + 2 > indices_.size()) {
+        // Keep the implicit offset argument index at the new end.
+        indices_.resize(index + 2, indices_.back());
         // Ensure enough space for the new argument
         paramSizes_.resize(index + 1);
         offsetPerIndex_.resize(index + 1);
@@ -502,6 +512,11 @@ struct _pi_kernel {
       add_arg(index, sizeof(size_t), (const void *)&(localOffset), size);
     }
 
+    void set_implicit_offset(size_t size, std::uint32_t *implicitOffset) {
+      assert(size == sizeof(std::uint32_t) * 3);
+      std::memcpy(implicitOffsetArgs_, implicitOffset, size);
+    }
+
     void clear_local_size() {
       std::fill(std::begin(offsetPerIndex_), std::end(offsetPerIndex_), 0);
     }
@@ -514,14 +529,18 @@ struct _pi_kernel {
     }
   } args_;
 
-  _pi_kernel(CUfunction func, const char *name, pi_program program,
-             pi_context ctxt)
-      : function_{func}, name_{name}, context_{ctxt}, program_{program},
-        refCount_{1} {
+  _pi_kernel(CUfunction func, CUfunction funcWithOffsetParam, const char *name,
+             pi_program program, pi_context ctxt)
+      : function_{func}, functionWithOffsetParam_{funcWithOffsetParam},
+        name_{name}, context_{ctxt}, program_{program}, refCount_{1} {
     cuda_piProgramRetain(program_);
     cuda_piContextRetain(context_);
   }
 
+  _pi_kernel(CUfunction func, const char *name, pi_program program,
+             pi_context ctxt)
+      : _pi_kernel{func, nullptr, name, program, ctxt} {}
+
   ~_pi_kernel() {
     cuda_piProgramRelease(program_);
@@ -538,15 +557,23 @@ struct _pi_kernel {
 
   native_type get() const noexcept { return function_; };
 
+  native_type get_with_offset_parameter() const noexcept {
+    return functionWithOffsetParam_;
+  };
+
+  bool has_with_offset_parameter() const noexcept {
+    return functionWithOffsetParam_ != nullptr;
+  }
+
   pi_context get_context() const noexcept { return context_; };
 
   const char *get_name() const noexcept { return name_.c_str(); }
 
-  /// Returns the number of arguments.
+  /// Returns the number of arguments, excluding the implicit global offset.
   /// Note this only returns the current known number of arguments, not the
   /// real one required by the kernel, since this cannot be queried from
   /// the CUDA Driver API
-  pi_uint32 get_num_args() const noexcept { return args_.indices_.size(); }
+  pi_uint32 get_num_args() const noexcept { return args_.indices_.size() - 1; }
 
   void set_kernel_arg(int index, size_t size, const void *arg) {
     args_.add_arg(index, size, arg);
@@ -556,6 +583,10 @@ struct _pi_kernel {
     args_.add_local_arg(index, size);
   }
 
+  void set_implicit_offset_arg(size_t size, std::uint32_t *implicitOffset) {
+    args_.set_implicit_offset(size, implicitOffset);
+  }
+
   arguments::args_index_t get_arg_indices() const {
     return args_.get_indices();
   }
diff --git a/sycl/test/basic_tests/parallel_for_indexers.cpp b/sycl/test/basic_tests/parallel_for_indexers.cpp
index ade6238b8c6c0..d02b6239472e4 100644
--- a/sycl/test/basic_tests/parallel_for_indexers.cpp
+++ b/sycl/test/basic_tests/parallel_for_indexers.cpp
@@ -6,10 +6,6 @@
 // RUN: %GPU_RUN_PLACEHOLDER %t2.out
 // RUN: %ACC_RUN_PLACEHOLDER %t2.out
 
-// TODO: Unexpected result
-// TODO: _indexers.cpp:37: int main(): Assertion `id == -1' failed.
-// XFAIL: cuda || level0
-
 #include
 
 #include
diff --git a/sycl/unittests/pi/cuda/test_kernels.cpp b/sycl/unittests/pi/cuda/test_kernels.cpp
index 02ac6549e61e3..400ddf03d84b7 100644
--- a/sycl/unittests/pi/cuda/test_kernels.cpp
+++ b/sycl/unittests/pi/cuda/test_kernels.cpp
@@ -16,6 +16,9 @@
 #include
 #include
 
+// PI CUDA kernels carry an additional argument for the implicit global offset.
+#define NUM_IMPLICIT_ARGS 1
+
 using namespace cl::sycl;
 
 struct CudaKernelsTest : public ::testing::Test {
@@ -172,7 +175,7 @@ TEST_F(CudaKernelsTest, PIKernelArgumentSimple) {
                 kern, 0, sizeof(int), &number)),
             PI_SUCCESS);
 
   const auto &kernArgs = kern->get_arg_indices();
-  ASSERT_EQ(kernArgs.size(), (size_t)1);
+  ASSERT_EQ(kernArgs.size(), (size_t)1 + NUM_IMPLICIT_ARGS);
   int storedValue = *(static_cast<const int *>(kernArgs[0]));
   ASSERT_EQ(storedValue, number);
 }
@@ -201,7 +204,7 @@ TEST_F(CudaKernelsTest, PIKernelArgumentSetTwice) {
                 kern, 0, sizeof(int), &number)),
             PI_SUCCESS);
 
   const auto &kernArgs = kern->get_arg_indices();
-  ASSERT_GT(kernArgs.size(), (size_t)0);
+  ASSERT_GT(kernArgs.size(), (size_t)0 + NUM_IMPLICIT_ARGS);
   int storedValue = *(static_cast<const int *>(kernArgs[0]));
   ASSERT_EQ(storedValue, number);
 
@@ -210,7 +213,7 @@ TEST_F(CudaKernelsTest, PIKernelArgumentSetTwice) {
                 kern, 0, sizeof(int), &otherNumber)),
             PI_SUCCESS);
 
   const auto &kernArgs2 = kern->get_arg_indices();
-  ASSERT_EQ(kernArgs2.size(), (size_t)1);
+  ASSERT_EQ(kernArgs2.size(), (size_t)1 + NUM_IMPLICIT_ARGS);
   storedValue = *(static_cast<const int *>(kernArgs2[0]));
   ASSERT_EQ(storedValue, otherNumber);
 }
@@ -244,7 +247,7 @@ TEST_F(CudaKernelsTest, PIKernelSetMemObj) {
                 kern, 0, sizeof(pi_mem), &memObj)),
             PI_SUCCESS);
 
   const auto &kernArgs = kern->get_arg_indices();
-  ASSERT_EQ(kernArgs.size(), (size_t)1);
+  ASSERT_EQ(kernArgs.size(), (size_t)1 + NUM_IMPLICIT_ARGS);
   pi_mem storedValue = *(static_cast<const pi_mem *>(kernArgs[0]));
   ASSERT_EQ(storedValue, memObj);
 }
@@ -369,7 +372,7 @@ TEST_F(CudaKernelsTest, PIKernelArgumentSetTwiceOneLocal) {
                 kern, 0, sizeof(int), &number)),
             PI_SUCCESS);
 
   const auto &kernArgs = kern->get_arg_indices();
-  ASSERT_GT(kernArgs.size(), (size_t)0);
+  ASSERT_GT(kernArgs.size(), (size_t)0 + NUM_IMPLICIT_ARGS);
   int storedValue = *(static_cast<const int *>(kernArgs[0]));
   ASSERT_EQ(storedValue, number);
 
@@ -377,7 +380,7 @@ TEST_F(CudaKernelsTest, PIKernelArgumentSetTwiceOneLocal) {
                 kern, 1, sizeof(int), nullptr)),
             PI_SUCCESS);
   const auto &kernArgs2 = kern->get_arg_indices();
-  ASSERT_EQ(kernArgs2.size(), (size_t)2);
+  ASSERT_EQ(kernArgs2.size(), (size_t)2 + NUM_IMPLICIT_ARGS);
   storedValue = *(static_cast<const int *>(kernArgs2[1]));
   ASSERT_EQ(storedValue, 0);
 
@@ -385,7 +388,7 @@ TEST_F(CudaKernelsTest, PIKernelArgumentSetTwiceOneLocal) {
                 kern, 2, sizeof(int), nullptr)),
             PI_SUCCESS);
   const auto &kernArgs3 = kern->get_arg_indices();
-  ASSERT_EQ(kernArgs3.size(), (size_t)3);
+  ASSERT_EQ(kernArgs3.size(), (size_t)3 + NUM_IMPLICIT_ARGS);
   storedValue = *(static_cast<const int *>(kernArgs3[2]));
   ASSERT_EQ(storedValue, static_cast<int>(sizeof(int)));
 }

From 85edd462a3f70ceaa0e83620e5035414cb2637ae Mon Sep 17 00:00:00 2001
From: Steffen Larsen
Date: Thu, 4 Jun 2020 15:00:13 +0100
Subject: [PATCH 2/3] [SYCL][PTX] Minor changes based on comments

Signed-off-by: Steffen Larsen
---
 llvm/lib/Target/NVPTX/SYCL/GlobalOffset.cpp | 30 +++++++++++----------
 llvm/lib/Target/NVPTX/SYCL/GlobalOffset.h   |  3 +--
 2 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/SYCL/GlobalOffset.cpp b/llvm/lib/Target/NVPTX/SYCL/GlobalOffset.cpp
index 3f134f2867934..b03298d661093 100644
--- a/llvm/lib/Target/NVPTX/SYCL/GlobalOffset.cpp
+++ b/llvm/lib/Target/NVPTX/SYCL/GlobalOffset.cpp
@@ -45,6 +45,11 @@ class GlobalOffset : public ModulePass {
 
     llvm::Function *ImplicitOffsetIntrinsic =
         M.getFunction(Intrinsic::getName(Intrinsic::nvvm_implicit_offset));
+
+    if (!ImplicitOffsetIntrinsic || ImplicitOffsetIntrinsic->use_empty()) {
+      return false;
+    }
+
     KernelImplicitArgumentType =
         ArrayType::get(Type::getInt32Ty(M.getContext()), 3);
     ImplicitOffsetPtrType = Type::getInt32Ty(M.getContext())->getPointerTo();
 
@@ -54,15 +59,11 @@ class GlobalOffset : public ModulePass {
         "Intrinsic::nvvm_implicit_offset does not return the expected "
         "type");
 
-    if (!ImplicitOffsetIntrinsic || ImplicitOffsetIntrinsic->use_empty()) {
-      return false;
-    }
-
     // Find all entry points.
     EntryPointMetadata = getEntryPointMetadata(M);
 
     // Add implicit parameters to all direct and indirect users of the offset
-    this->addImplicitParameterToCallers(M, ImplicitOffsetIntrinsic, nullptr);
+    addImplicitParameterToCallers(M, ImplicitOffsetIntrinsic, nullptr);
 
     // Assert that all uses of `ImplicitOffsetIntrinsic` are removed and delete
     // it.
@@ -80,7 +81,7 @@ class GlobalOffset : public ModulePass {
     LLVMContext &Ctx = M.getContext();
     MDNode *FuncMetadata = EntryPointMetadata[Func];
 
-    bool AlreadyProcessed = this->ProcessedFunctions.count(Func) == 1;
+    bool AlreadyProcessed = ProcessedFunctions.count(Func) == 1;
     if (AlreadyProcessed)
       return;
 
@@ -89,9 +90,9 @@ class GlobalOffset : public ModulePass {
     auto NvvmMetadata = M.getNamedMetadata("nvvm.annotations");
     assert(NvvmMetadata && "IR compiled to PTX must have nvvm.annotations");
 
-    auto NewFunc = this->addOffsetArgumentToFunction(
-                           M, Func, KernelImplicitArgumentType->getPointerTo(),
-                           /*KeepOriginal=*/true)
+    auto NewFunc = addOffsetArgumentToFunction(
+                       M, Func, KernelImplicitArgumentType->getPointerTo(),
+                       /*KeepOriginal=*/true)
                        .first;
     Argument *NewArgument = NewFunc->arg_begin() + (NewFunc->arg_size() - 1);
     // Pass the values by value to the kernel
@@ -118,7 +119,7 @@ class GlobalOffset : public ModulePass {
                              ImplicitOffset->getAlign());
     MemsetCall->addParamAttr(0, Attribute::NonNull);
     MemsetCall->addDereferenceableAttr(1, AllocByteSize);
-    this->ProcessedFunctions[Func] = Builder.CreateConstInBoundsGEP2_32(
+    ProcessedFunctions[Func] = Builder.CreateConstInBoundsGEP2_32(
         ImplicitOffsetType, ImplicitOffset, 0, 0);
   }
 
@@ -170,13 +171,13 @@ class GlobalOffset : public ModulePass {
       // Determine if `Caller` needs processing or if this is another callsite
       // from an already-processed function.
       Function *NewFunc;
-      Value *ImplicitOffset = this->ProcessedFunctions[Caller];
+      Value *ImplicitOffset = ProcessedFunctions[Caller];
       bool AlreadyProcessed = ImplicitOffset != nullptr;
       if (AlreadyProcessed) {
         NewFunc = Caller;
       } else {
         std::tie(NewFunc, ImplicitOffset) =
-            this->addOffsetArgumentToFunction(M, Caller);
+            addOffsetArgumentToFunction(M, Caller);
       }
 
       if (!CalleeWithImplicitParam) {
@@ -211,7 +212,7 @@ class GlobalOffset : public ModulePass {
 
       if (!AlreadyProcessed) {
         // Process callers of the old function.
-        this->addImplicitParameterToCallers(M, Caller, NewFunc);
+        addImplicitParameterToCallers(M, Caller, NewFunc);
 
         // Now that the old function is dead, delete it.
         Caller->dropAllReferences();
         Caller->eraseFromParent();
@@ -259,6 +260,7 @@ class GlobalOffset : public ModulePass {
                                          Func->getAddressSpace());
 
     if (KeepOriginal) {
+      // TODO: Are there better naming alternatives that allow for unmangling?
       NewFunc->setName(Func->getName() + "_with_offset");
 
       ValueToValueMapTy VMap;
@@ -308,7 +310,7 @@ class GlobalOffset : public ModulePass {
           Builder.CreateBitCast(ImplicitOffset, ImplicitOffsetPtrType);
     }
 
-    this->ProcessedFunctions[NewFunc] = ImplicitOffset;
+    ProcessedFunctions[NewFunc] = ImplicitOffset;
 
     // Return the new function and the offset argument.
     return {NewFunc, ImplicitOffset};
diff --git a/llvm/lib/Target/NVPTX/SYCL/GlobalOffset.h b/llvm/lib/Target/NVPTX/SYCL/GlobalOffset.h
index 8d17219d49a61..3284af3a65d20 100644
--- a/llvm/lib/Target/NVPTX/SYCL/GlobalOffset.h
+++ b/llvm/lib/Target/NVPTX/SYCL/GlobalOffset.h
@@ -1,4 +1,4 @@
-//===--------- GlobalOffset.cpp - Global Offset Support for CUDA --------- ===//
+//===---------- GlobalOffset.h - Global Offset Support for CUDA ---------- ===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
@@ -15,7 +15,6 @@
 #ifndef LLVM_SYCL_GLOBALOFFSET_H
 #define LLVM_SYCL_GLOBALOFFSET_H
 
-#include "llvm/IR/Module.h"
 #include "llvm/Pass.h"
 
 namespace llvm {

From 3c9ba39f33fab19faa4f1fc524eb9202a2cef736 Mon Sep 17 00:00:00 2001
From: Steffen Larsen
Date: Tue, 16 Jun 2020 18:53:10 +0100
Subject: [PATCH 3/3] [SYCL] Re-disable parallel_for_indexers test for level0

Signed-off-by: Steffen Larsen
---
 sycl/test/basic_tests/parallel_for_indexers.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/sycl/test/basic_tests/parallel_for_indexers.cpp b/sycl/test/basic_tests/parallel_for_indexers.cpp
index d02b6239472e4..3d20ec3d66903 100644
--- a/sycl/test/basic_tests/parallel_for_indexers.cpp
+++ b/sycl/test/basic_tests/parallel_for_indexers.cpp
@@ -6,6 +6,10 @@
 // RUN: %GPU_RUN_PLACEHOLDER %t2.out
 // RUN: %ACC_RUN_PLACEHOLDER %t2.out
 
+// TODO: Unexpected result
+// TODO: _indexers.cpp:37: int main(): Assertion `id == -1' failed.
+// XFAIL: level0
+
 #include
 
 #include
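
---

For reference, a minimal SYCL snippet illustrating what this series enables (an
illustration only, not part of the patches; the kernel name and data are made
up). Launching a parallel_for with a non-zero global offset makes the PI CUDA
plugin select the "_with_offset" kernel variant produced by the GlobalOffset
pass and pass the offset as the implicit [3 x i32] argument; launches with a
zero offset keep using the original kernel.

#include <CL/sycl.hpp>

int main() {
  cl::sycl::queue Queue;
  int Data[8] = {0};
  {
    cl::sycl::buffer<int, 1> Buf(Data, cl::sycl::range<1>(8));
    Queue.submit([&](cl::sycl::handler &CGH) {
      auto Acc = Buf.get_access<cl::sycl::access::mode::write>(CGH);
      // Offset of 4: with this series the offset actually reaches the device;
      // previously __spirv_GlobalOffset_* always returned 0 on CUDA.
      CGH.parallel_for<class offset_kernel>(
          cl::sycl::range<1>(4), cl::sycl::id<1>(4),
          [=](cl::sycl::item<1> Item) {
            // Item ids include the launch offset, i.e. 4..7 here.
            Acc[Item.get_id(0)] = 1;
          });
    });
  }
  // After copy-back: Data[0..3] == 0 and Data[4..7] == 1.
  return 0;
}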