diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def b/clang/include/clang/Basic/BuiltinsNVPTX.def index 759c91290a60c..9a4a3ff013973 100644 --- a/clang/include/clang/Basic/BuiltinsNVPTX.def +++ b/clang/include/clang/Basic/BuiltinsNVPTX.def @@ -89,6 +89,9 @@ BUILTIN(__nvvm_read_ptx_sreg_pm1, "i", "n") BUILTIN(__nvvm_read_ptx_sreg_pm2, "i", "n") BUILTIN(__nvvm_read_ptx_sreg_pm3, "i", "n") +// SYCL +BUILTIN(__builtin_ptx_implicit_offset, "Ui*", "nc") + // MISC BUILTIN(__nvvm_prmt, "UiUiUiUi", "") diff --git a/libclc/ptx-nvidiacl/libspirv/workitem/get_global_id.cl b/libclc/ptx-nvidiacl/libspirv/workitem/get_global_id.cl index da96caffb4f75..b856302625f42 100644 --- a/libclc/ptx-nvidiacl/libspirv/workitem/get_global_id.cl +++ b/libclc/ptx-nvidiacl/libspirv/workitem/get_global_id.cl @@ -9,13 +9,16 @@ #include _CLC_DEF _CLC_OVERLOAD size_t __spirv_GlobalInvocationId_x() { - return __spirv_WorkgroupId_x() * __spirv_WorkgroupSize_x() + __spirv_LocalInvocationId_x(); + return __spirv_WorkgroupId_x() * __spirv_WorkgroupSize_x() + + __spirv_LocalInvocationId_x() + __spirv_GlobalOffset_x(); } _CLC_DEF _CLC_OVERLOAD size_t __spirv_GlobalInvocationId_y() { - return __spirv_WorkgroupId_y() * __spirv_WorkgroupSize_y() + __spirv_LocalInvocationId_y(); + return __spirv_WorkgroupId_y() * __spirv_WorkgroupSize_y() + + __spirv_LocalInvocationId_y() + __spirv_GlobalOffset_y(); } _CLC_DEF _CLC_OVERLOAD size_t __spirv_GlobalInvocationId_z() { - return __spirv_WorkgroupId_z() * __spirv_WorkgroupSize_z() + __spirv_LocalInvocationId_z(); + return __spirv_WorkgroupId_z() * __spirv_WorkgroupSize_z() + + __spirv_LocalInvocationId_z() + __spirv_GlobalOffset_z(); } diff --git a/libclc/ptx-nvidiacl/libspirv/workitem/get_global_offset.cl b/libclc/ptx-nvidiacl/libspirv/workitem/get_global_offset.cl index de269c76602be..7eae8cf43c20e 100644 --- a/libclc/ptx-nvidiacl/libspirv/workitem/get_global_offset.cl +++ b/libclc/ptx-nvidiacl/libspirv/workitem/get_global_offset.cl @@ -11,13 +11,13 @@ // Compiler support is required to provide global offset on NVPTX. 
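// Worked example (illustration only, values are made up): with these pieces
// in place, each dimension of the global ID is computed as
//   global_id = group_id * group_size + local_id + global_offset
// so a work-item in group 2 with local id 3, a group size of 4 and a global
// offset of 100 reports global id 2 * 4 + 3 + 100 = 111; the offset itself is
// read from the array returned by __builtin_ptx_implicit_offset() below.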
_CLC_DEF _CLC_OVERLOAD size_t __spirv_GlobalOffset_x() { - return 0; + return __builtin_ptx_implicit_offset()[0]; } _CLC_DEF _CLC_OVERLOAD size_t __spirv_GlobalOffset_y() { - return 0; + return __builtin_ptx_implicit_offset()[1]; } _CLC_DEF _CLC_OVERLOAD size_t __spirv_GlobalOffset_z() { - return 0; + return __builtin_ptx_implicit_offset()[2]; } diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index 61293418ec41d..9de9400848709 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -4161,4 +4161,10 @@ foreach layout_a = ["row", "col"] in { } // layout_b } // layout_a +// SYCL +def int_nvvm_implicit_offset : + GCCBuiltin<"__builtin_ptx_implicit_offset">, + Intrinsic<[LLVMPointerType], [], + [IntrNoMem, IntrSpeculatable]>; + } // let TargetPrefix = "nvvm" diff --git a/llvm/lib/Target/NVPTX/CMakeLists.txt b/llvm/lib/Target/NVPTX/CMakeLists.txt index 097fc26cdab2c..9e9f7ebb63887 100644 --- a/llvm/lib/Target/NVPTX/CMakeLists.txt +++ b/llvm/lib/Target/NVPTX/CMakeLists.txt @@ -33,6 +33,7 @@ set(NVPTXCodeGen_sources NVVMIntrRange.cpp NVVMReflect.cpp NVPTXProxyRegErasure.cpp + SYCL/GlobalOffset.cpp SYCL/LocalAccessorToSharedMemory.cpp ) diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp index e5c89a191cc0e..47729e957d533 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -16,8 +16,9 @@ #include "NVPTXLowerAggrCopies.h" #include "NVPTXTargetObjectFile.h" #include "NVPTXTargetTransformInfo.h" -#include "TargetInfo/NVPTXTargetInfo.h" +#include "SYCL/GlobalOffset.h" #include "SYCL/LocalAccessorToSharedMemory.h" +#include "TargetInfo/NVPTXTargetInfo.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Triple.h" #include "llvm/Analysis/TargetTransformInfo.h" @@ -71,6 +72,7 @@ void initializeNVPTXLowerArgsPass(PassRegistry &); void initializeNVPTXLowerAllocaPass(PassRegistry &); void initializeNVPTXProxyRegErasurePass(PassRegistry &); +void initializeGlobalOffsetPass(PassRegistry &); void initializeLocalAccessorToSharedMemoryPass(PassRegistry &); } // end namespace llvm @@ -94,6 +96,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeNVPTXTarget() { initializeNVPTXProxyRegErasurePass(PR); // SYCL-specific passes, needed here to be available to `opt`. + initializeGlobalOffsetPass(PR); initializeLocalAccessorToSharedMemoryPass(PR); } @@ -274,6 +277,7 @@ void NVPTXPassConfig::addIRPasses() { if (getTM().getTargetTriple().getOS() == Triple::CUDA && getTM().getTargetTriple().getEnvironment() == Triple::SYCLDevice) { + addPass(createGlobalOffsetPass()); addPass(createLocalAccessorToSharedMemoryPass()); } diff --git a/llvm/lib/Target/NVPTX/SYCL/GlobalOffset.cpp b/llvm/lib/Target/NVPTX/SYCL/GlobalOffset.cpp new file mode 100644 index 0000000000000..b03298d661093 --- /dev/null +++ b/llvm/lib/Target/NVPTX/SYCL/GlobalOffset.cpp @@ -0,0 +1,370 @@ +//===--------- GlobalOffset.cpp - Global Offset Support for CUDA --------- ===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass operates on SYCL kernels being compiled to CUDA. 
It looks for uses +// of the `llvm.nvvm.implicit.offset` intrinsic and replaces it with a offset +// parameter which will be threaded through from the kernel entry point. +// +//===----------------------------------------------------------------------===// + +#include "GlobalOffset.h" + +#include "../MCTargetDesc/NVPTXBaseInfo.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/IntrinsicsNVPTX.h" +#include "llvm/IR/PassManager.h" +#include "llvm/Target/TargetIntrinsicInfo.h" +#include "llvm/Transforms/Utils/Cloning.h" + +using namespace llvm; + +#define DEBUG_TYPE "globaloffset" + +namespace llvm { +void initializeGlobalOffsetPass(PassRegistry &); +} // end namespace llvm + +namespace { + +class GlobalOffset : public ModulePass { +public: + static char ID; + GlobalOffset() : ModulePass(ID) {} + + bool runOnModule(Module &M) override { + if (skipModule(M)) + return false; + + llvm::Function *ImplicitOffsetIntrinsic = + M.getFunction(Intrinsic::getName(Intrinsic::nvvm_implicit_offset)); + + if (!ImplicitOffsetIntrinsic || ImplicitOffsetIntrinsic->use_empty()) { + return false; + } + + KernelImplicitArgumentType = + ArrayType::get(Type::getInt32Ty(M.getContext()), 3); + ImplicitOffsetPtrType = Type::getInt32Ty(M.getContext())->getPointerTo(); + assert( + (!ImplicitOffsetIntrinsic || + ImplicitOffsetIntrinsic->getReturnType() == ImplicitOffsetPtrType) && + "Intrinsic::nvvm_implicit_offset does not return the expected " + "type"); + + // Find all entry points. + EntryPointMetadata = getEntryPointMetadata(M); + + // Add implicit parameters to all direct and indirect users of the offset + addImplicitParameterToCallers(M, ImplicitOffsetIntrinsic, nullptr); + + // Assert that all uses of `ImplicitOffsetIntrinsic` are removed and delete + // it. + assert(ImplicitOffsetIntrinsic->use_empty() && + "Not all uses of intrinsic removed"); + ImplicitOffsetIntrinsic->eraseFromParent(); + + return true; + } + + void processKernelEntryPoint(Module &M, Function *Func) { + assert(EntryPointMetadata.count(Func) != 0 && + "Function must be an entry point"); + + LLVMContext &Ctx = M.getContext(); + MDNode *FuncMetadata = EntryPointMetadata[Func]; + + bool AlreadyProcessed = ProcessedFunctions.count(Func) == 1; + if (AlreadyProcessed) + return; + + // Add the new argument to all other kernel entry points, despite not + // using the global offset. + auto NvvmMetadata = M.getNamedMetadata("nvvm.annotations"); + assert(NvvmMetadata && "IR compiled to PTX must have nvvm.annotations"); + + auto NewFunc = addOffsetArgumentToFunction( + M, Func, KernelImplicitArgumentType->getPointerTo(), + /*KeepOriginal=*/true) + .first; + Argument *NewArgument = NewFunc->arg_begin() + (NewFunc->arg_size() - 1); + // Pass the values by value to the kernel + NewArgument->addAttr( + Attribute::getWithByValType(Ctx, KernelImplicitArgumentType)); + + // Add the metadata. 
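// Illustration (hypothetical kernel name): for an entry point @foo annotated
// as
//   !{void ()* @foo, !"kernel", i32 1}
// the code below appends a second annotation
//   !{void ([3 x i32]*)* @foo_with_offset, !"kernel", i32 1}
// for the clone created above, while the original @foo keeps its signature
// and is given a zero-filled [3 x i32] alloca further down so that its
// offset-free body still has a valid offset to read.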
+ Metadata *NewMetadata[] = {ConstantAsMetadata::get(NewFunc), + FuncMetadata->getOperand(1), + FuncMetadata->getOperand(2)}; + NvvmMetadata->addOperand(MDNode::get(Ctx, NewMetadata)); + + // Create a zero-initialized alloca for the implicit offset in the original + // function. + BasicBlock *EntryBlock = &Func->getEntryBlock(); + IRBuilder<> Builder(EntryBlock, EntryBlock->getFirstInsertionPt()); + Type *ImplicitOffsetType = + ArrayType::get(Type::getInt32Ty(M.getContext()), 3); + AllocaInst *ImplicitOffset = Builder.CreateAlloca(ImplicitOffsetType); + uint64_t AllocByteSize = + ImplicitOffset->getAllocationSizeInBits(M.getDataLayout()).getValue() / + 8; + CallInst *MemsetCall = + Builder.CreateMemSet(ImplicitOffset, Builder.getInt8(0), AllocByteSize, + ImplicitOffset->getAlign()); + MemsetCall->addParamAttr(0, Attribute::NonNull); + MemsetCall->addDereferenceableAttr(1, AllocByteSize); + ProcessedFunctions[Func] = Builder.CreateConstInBoundsGEP2_32( + ImplicitOffsetType, ImplicitOffset, 0, 0); + } + + // This function adds an implicit parameter to the function containing a call + // instruction to the implicit offset intrinsic or to another function (which + // eventually calls the intrinsic). If the call instruction is to the + // implicit offset intrinsic, then the intrinsic is replaced with the + // parameter that was added. + // + // `Callee` is either a function to which this transformation has already been + // applied, or the implicit offset intrinsic itself. `CalleeWithImplicitParam` + // indicates whether `Callee` is the implicit intrinsic (when `nullptr`) or + // another function (not `nullptr`); this determines whether calls to it are + // replaced with the implicit parameter or have the implicit parameter + // appended. + // + // Once the function, say `F`, containing a call to `Callee` has the implicit + // parameter added, callers of `F` are processed by recursively calling this + // function, passing `F` as `CalleeWithImplicitParam`. + // + // Since the cloning of entry points may alter the users of a function, the + // cloning must be done as early as possible, to ensure that no users are + // added to previously visited callees in the call tree. + void addImplicitParameterToCallers(Module &M, Value *Callee, + Function *CalleeWithImplicitParam) { + + // Make sure that all entry point callers are processed. + SmallVector<User *, 8> Users{Callee->users()}; + for (User *U : Users) { + auto *Call = dyn_cast<CallInst>(U); + if (!Call) + continue; + + Function *Caller = Call->getFunction(); + if (EntryPointMetadata.count(Caller) != 0) { + processKernelEntryPoint(M, Caller); + } + } + + // User collection may have changed, so we reinitialize it. + Users = SmallVector<User *, 8>{Callee->users()}; + for (User *U : Users) { + auto *CallToOld = dyn_cast<CallInst>(U); + if (!CallToOld) + return; + + auto Caller = CallToOld->getFunction(); + + // Determine if `Caller` needs to be processed or if this is another + // callsite from an already-processed function. + Function *NewFunc; + Value *ImplicitOffset = ProcessedFunctions[Caller]; + bool AlreadyProcessed = ImplicitOffset != nullptr; + if (AlreadyProcessed) { + NewFunc = Caller; + } else { + std::tie(NewFunc, ImplicitOffset) = + addOffsetArgumentToFunction(M, Caller); + } + + if (!CalleeWithImplicitParam) { + // Replace the intrinsic call with the parameter. + CallToOld->replaceAllUsesWith(ImplicitOffset); + } else { + // Build up the list of arguments used to call the modified function.
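// Illustration (hypothetical names): a call `%r = call i64 @helper()` in the
// caller is rewritten below into `%r = call i64 @helper(i32* %offset)`, where
// %offset is either the caller's own appended parameter or, for kernel entry
// points, the GEP into the zero-filled alloca created in
// processKernelEntryPoint.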
+ llvm::SmallVector ImplicitOffsets; + for (Use &U : CallToOld->args()) { + ImplicitOffsets.push_back(U); + } + ImplicitOffsets.push_back(ImplicitOffset); + + // Replace call to other function (which now has a new parameter), + // with a call including the new parameter to that same function. + auto NewCaller = CallInst::Create( + /* Ty= */ CalleeWithImplicitParam->getFunctionType(), + /* Func= */ CalleeWithImplicitParam, + /* Args= */ ImplicitOffsets, + /* NameStr= */ Twine(), + /* InsertBefore= */ CallToOld); + NewCaller->setTailCallKind(CallToOld->getTailCallKind()); + CallToOld->replaceAllUsesWith(NewCaller); + + if (CallToOld->hasName()) { + NewCaller->takeName(CallToOld); + } + } + + // Remove the caller now that it has been replaced. + CallToOld->eraseFromParent(); + + if (!AlreadyProcessed) { + // Process callers of the old function. + addImplicitParameterToCallers(M, Caller, NewFunc); + + // Now that the old function is dead, delete it. + Caller->dropAllReferences(); + Caller->eraseFromParent(); + } + } + } + + std::pair + addOffsetArgumentToFunction(Module &M, Function *Func, + Type *ImplicitArgumentType = nullptr, + bool KeepOriginal = false) { + FunctionType *FuncTy = Func->getFunctionType(); + const AttributeList &FuncAttrs = Func->getAttributes(); + ImplicitArgumentType = + ImplicitArgumentType ? ImplicitArgumentType : ImplicitOffsetPtrType; + + // Construct an argument list containing all of the previous arguments. + SmallVector Arguments; + SmallVector ArgumentAttributes; + + unsigned i = 0; + for (Function::arg_iterator FuncArg = Func->arg_begin(), + FuncEnd = Func->arg_end(); + FuncArg != FuncEnd; ++FuncArg, ++i) { + Arguments.push_back(FuncArg->getType()); + ArgumentAttributes.push_back(FuncAttrs.getParamAttributes(i)); + } + + // Add the offset argument. Must be the same type as returned by + // `llvm.nvvm.implicit.offset`. + + Arguments.push_back(ImplicitArgumentType); + ArgumentAttributes.push_back(AttributeSet()); + + // Build the new function. + AttributeList NAttrs = + AttributeList::get(Func->getContext(), FuncAttrs.getFnAttributes(), + FuncAttrs.getRetAttributes(), ArgumentAttributes); + assert(!FuncTy->isVarArg() && "Variadic arguments prohibited in SYCL"); + FunctionType *NewFuncTy = FunctionType::get(FuncTy->getReturnType(), + Arguments, FuncTy->isVarArg()); + + Function *NewFunc = Function::Create(NewFuncTy, Func->getLinkage(), + Func->getAddressSpace()); + + if (KeepOriginal) { + // TODO: Are there better naming alternatives that allow for unmangling? + NewFunc->setName(Func->getName() + "_with_offset"); + + ValueToValueMapTy VMap; + for (Function::arg_iterator FuncArg = Func->arg_begin(), + FuncEnd = Func->arg_end(), + NewFuncArg = NewFunc->arg_begin(); + FuncArg != FuncEnd; ++FuncArg, ++NewFuncArg) { + VMap[FuncArg] = NewFuncArg; + } + + SmallVector Returns; + CloneFunctionInto(NewFunc, Func, VMap, /*ModuleLevelChanges=*/false, + Returns); + } else { + NewFunc->copyAttributesFrom(Func); + NewFunc->setComdat(Func->getComdat()); + NewFunc->setAttributes(NAttrs); + NewFunc->takeName(Func); + + // Splice the body of the old function right into the new function. + NewFunc->getBasicBlockList().splice(NewFunc->begin(), + Func->getBasicBlockList()); + + for (Function::arg_iterator FuncArg = Func->arg_begin(), + FuncEnd = Func->arg_end(), + NewFuncArg = NewFunc->arg_begin(); + FuncArg != FuncEnd; ++FuncArg, ++NewFuncArg) { + FuncArg->replaceAllUsesWith(NewFuncArg); + } + + // Clone metadata of the old function, including debug info descriptor. 
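// To summarise the two modes of this helper (hypothetical name @f): with
// KeepOriginal the module ends up with both @f and a separate clone
// @f_with_offset taking the extra parameter; in this branch @f is instead
// replaced in place by a function with the same name, body and metadata
// (copied just below) plus one trailing offset parameter.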
+ SmallVector, 1> MDs; + Func->getAllMetadata(MDs); + for (auto MD : MDs) + NewFunc->addMetadata(MD.first, *MD.second); + } + + // Keep original function ordering. + M.getFunctionList().insertAfter(Func->getIterator(), NewFunc); + + Value *ImplicitOffset = NewFunc->arg_begin() + (NewFunc->arg_size() - 1); + // Add bitcast to match the return type of the intrinsic if needed. + if (ImplicitArgumentType != ImplicitOffsetPtrType) { + BasicBlock *EntryBlock = &NewFunc->getEntryBlock(); + IRBuilder<> Builder(EntryBlock, EntryBlock->getFirstInsertionPt()); + ImplicitOffset = + Builder.CreateBitCast(ImplicitOffset, ImplicitOffsetPtrType); + } + + ProcessedFunctions[NewFunc] = ImplicitOffset; + + // Return the new function and the offset argument. + return {NewFunc, ImplicitOffset}; + } + + static llvm::DenseMap getEntryPointMetadata(Module &M) { + auto NvvmMetadata = M.getNamedMetadata("nvvm.annotations"); + assert(NvvmMetadata && "IR compiled to PTX must have nvvm.annotations"); + + llvm::DenseMap NvvmEntryPointMetadata; + for (auto MetadataNode : NvvmMetadata->operands()) { + if (MetadataNode->getNumOperands() != 3) + continue; + + // NVPTX identifies kernel entry points using metadata nodes of the form: + // !X = !{, !"kernel", i32 1} + auto Type = dyn_cast(MetadataNode->getOperand(1)); + // Only process kernel entry points. + if (!Type || Type->getString() != "kernel") + continue; + + // Get a pointer to the entry point function from the metadata. + auto FuncConstant = + dyn_cast(MetadataNode->getOperand(0)); + if (!FuncConstant) + continue; + auto Func = dyn_cast(FuncConstant->getValue()); + if (!Func) + continue; + + assert(Func->use_empty() && "Kernel entry point with uses"); + NvvmEntryPointMetadata[Func] = MetadataNode; + } + return NvvmEntryPointMetadata; + } + + virtual llvm::StringRef getPassName() const { + return "Add implicit SYCL global offset"; + } + +private: + // Keep track of which functions have been processed to avoid processing twice + llvm::DenseMap ProcessedFunctions; + // Keep a map of all entry point functions with metadata + llvm::DenseMap EntryPointMetadata; + llvm::Type *KernelImplicitArgumentType; + llvm::Type *ImplicitOffsetPtrType; +}; + +} // end anonymous namespace + +char GlobalOffset::ID = 0; + +INITIALIZE_PASS(GlobalOffset, "globaloffset", "SYCL Global Offset", false, + false) + +ModulePass *llvm::createGlobalOffsetPass() { return new GlobalOffset(); } diff --git a/llvm/lib/Target/NVPTX/SYCL/GlobalOffset.h b/llvm/lib/Target/NVPTX/SYCL/GlobalOffset.h new file mode 100644 index 0000000000000..3284af3a65d20 --- /dev/null +++ b/llvm/lib/Target/NVPTX/SYCL/GlobalOffset.h @@ -0,0 +1,26 @@ +//===---------- GlobalOffset.h - Global Offset Support for CUDA ---------- ===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass operates on SYCL kernels being compiled to CUDA. It looks for uses +// of the `llvm.nvvm.implicit.offset` intrinsic and replaces it with a offset +// parameter which will be threaded through from the kernel entry point. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_SYCL_GLOBALOFFSET_H +#define LLVM_SYCL_GLOBALOFFSET_H + +#include "llvm/Pass.h" + +namespace llvm { + +ModulePass *createGlobalOffsetPass(); + +} // end namespace llvm + +#endif diff --git a/llvm/test/CodeGen/NVPTX/global-offset-invalid-triple.ll b/llvm/test/CodeGen/NVPTX/global-offset-invalid-triple.ll new file mode 100644 index 0000000000000..1eadbf9c2d06b --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/global-offset-invalid-triple.ll @@ -0,0 +1,35 @@ +; RUN: not --crash llc -march=nvptx64 -mcpu=sm_20 %s -o - 2>&1 | FileCheck %s +; ModuleID = 'invalid-triple.bc' +; CHECK: LLVM ERROR: Cannot select: intrinsic %llvm.nvvm.implicit.offset +source_filename = "invalid-triple.ll" +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-nvcl-sycldevice" + +; This test checks that the pass does not run on nvcl triples. + +declare i32* @llvm.nvvm.implicit.offset() + +define weak_odr dso_local i64 @_ZTS14other_function() { + %1 = tail call i32* @llvm.nvvm.implicit.offset() + %2 = getelementptr inbounds i32, i32* %1, i64 2 + %3 = load i32, i32* %2, align 4 + %4 = zext i32 %3 to i64 + ret i64 %4 +} + +; Function Attrs: noinline +define weak_odr dso_local void @_ZTS14example_kernel() { +entry: + %0 = call i64 @_ZTS14other_function() + ret void +} + +!nvvm.annotations = !{!0, !1, !2, !1, !3, !3, !3, !3, !4, !4, !3} +!nvvmir.version = !{!5} + +!0 = distinct !{void ()* @_ZTS14example_kernel, !"kernel", i32 1} +!1 = !{null, !"align", i32 8} +!2 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080} +!3 = !{null, !"align", i32 16} +!4 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088} +!5 = !{i32 1, i32 4} diff --git a/llvm/test/CodeGen/NVPTX/global-offset-multiple-calls-from-one-function.ll b/llvm/test/CodeGen/NVPTX/global-offset-multiple-calls-from-one-function.ll new file mode 100644 index 0000000000000..79f10729b56c3 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/global-offset-multiple-calls-from-one-function.ll @@ -0,0 +1,67 @@ +; RUN: opt -globaloffset %s -S -o - | FileCheck %s +; ModuleID = 'multiple-calls-from-one-function.bc' +source_filename = "multiple-calls-from-one-function.ll" +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda-sycldevice" + +; This test checks that when there are multiple calls to a function that uses +; the intrinsic that the caller and the callee only have a single clone each +; with the offset parameter. It also checks that the clone with multiple calls +; to other functions that has a variant that takes an offset parameter will have +; all calls redirected to the corresponding variants. 
+ +declare i32* @llvm.nvvm.implicit.offset() +; CHECK-NOT: declare i32* @llvm.nvvm.implicit.offset() + +define weak_odr dso_local i64 @_ZTS14other_function() { +; CHECK: define weak_odr dso_local i64 @_ZTS14other_function(i32* %0) { + %1 = tail call i32* @llvm.nvvm.implicit.offset() +; CHECK-NOT: tail call i32* @llvm.nvvm.implicit.offset() + %2 = getelementptr inbounds i32, i32* %1, i64 2 +; CHECK: %2 = getelementptr inbounds i32, i32* %0, i64 2 + %3 = load i32, i32* %2, align 4 + %4 = zext i32 %3 to i64 + + %5 = tail call i32* @llvm.nvvm.implicit.offset() +; CHECK-NOT: tail call i32* @llvm.nvvm.implicit.offset() + %6 = getelementptr inbounds i32, i32* %5, i64 2 +; CHECK: %5 = getelementptr inbounds i32, i32* %0, i64 2 + %7 = load i32, i32* %6, align 4 + %8 = zext i32 %7 to i64 + + ret i64 %4 +} + +; Function Attrs: noinline +define weak_odr dso_local void @_ZTS14example_kernel() { +entry: +; CHECK: %0 = alloca [3 x i32], align 4 +; CHECK: %1 = bitcast [3 x i32]* %0 to i8* +; CHECK: call void @llvm.memset.p0i8.i64(i8* nonnull align 4 dereferenceable(12) %1, i8 0, i64 12, i1 false) +; CHECK: %2 = getelementptr inbounds [3 x i32], [3 x i32]* %0, i32 0, i32 0 + %0 = call i64 @_ZTS14other_function() +; CHECK: %3 = call i64 @_ZTS14other_function(i32* %2) + %1 = call i64 @_ZTS14other_function() +; CHECK: %4 = call i64 @_ZTS14other_function(i32* %2) + ret void +} + +; CHECK: define weak_odr dso_local void @_ZTS14example_kernel_with_offset([3 x i32]* byval([3 x i32]) %0) { +; CHECK: entry: +; CHECK: %1 = bitcast [3 x i32]* %0 to i32* +; CHECK: %2 = call i64 @_ZTS14other_function(i32* %1) +; CHECK: %3 = call i64 @_ZTS14other_function(i32* %1) +; CHECK: ret void +; CHECK: } + +!nvvm.annotations = !{!0, !1, !2, !1, !3, !3, !3, !3, !4, !4, !3} +; CHECK: !nvvm.annotations = !{!0, !1, !2, !1, !3, !3, !3, !3, !4, !4, !3, !5} +!nvvmir.version = !{!6} + +!0 = distinct !{void ()* @_ZTS14example_kernel, !"kernel", i32 1} +!1 = !{null, !"align", i32 8} +!2 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080} +!3 = !{null, !"align", i32 16} +!4 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088} +; CHECK: !5 = !{void ([3 x i32]*)* @_ZTS14example_kernel_with_offset, !"kernel", i32 1} +!6 = !{i32 1, i32 4} diff --git a/llvm/test/CodeGen/NVPTX/global-offset-multiple-entry-points.ll b/llvm/test/CodeGen/NVPTX/global-offset-multiple-entry-points.ll new file mode 100644 index 0000000000000..7784150f1082d --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/global-offset-multiple-entry-points.ll @@ -0,0 +1,108 @@ +; RUN: opt -globaloffset %s -S -o - | FileCheck %s +; ModuleID = 'multiple-entry-points.bc' +source_filename = "multiple-entry-points.ll" +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-nvcl-sycldevice" + +; This test checks that the pass works with multiple entry points. + +declare i32* @llvm.nvvm.implicit.offset() +; CHECK-NOT: declare i32* @llvm.nvvm.implicit.offset() + +; This function is a kernel entry point that does not use global offset. It will +; not get a clone with a global offset parameter. 
+; Function Attrs: noinline +define weak_odr dso_local void @_ZTS12third_kernel() { +entry: + ret void +} + +define weak_odr dso_local i64 @_ZTS15common_function() { +; CHECK: define weak_odr dso_local i64 @_ZTS15common_function(i32* %0) { + %1 = tail call i32* @llvm.nvvm.implicit.offset() +; CHECK-NOT: tail call i32* @llvm.nvvm.implicit.offset() +; CHECK: %2 = getelementptr inbounds i32, i32* %0, i64 2 + %2 = getelementptr inbounds i32, i32* %1, i64 2 + %3 = load i32, i32* %2, align 4 + %4 = zext i32 %3 to i64 + ret i64 %4 +} + +define weak_odr dso_local i64 @_ZTS14first_function() { +; CHECK: define weak_odr dso_local i64 @_ZTS14first_function(i32* %0) { + %1 = call i64 @_ZTS15common_function() +; CHECK: %2 = call i64 @_ZTS15common_function(i32* %0) + ret i64 %1 +} + +; Function Attrs: noinline +define weak_odr dso_local void @_ZTS12first_kernel() { +entry: +; CHECK: %0 = alloca [3 x i32], align 4 +; CHECK: %1 = bitcast [3 x i32]* %0 to i8* +; CHECK: call void @llvm.memset.p0i8.i64(i8* nonnull align 4 dereferenceable(12) %1, i8 0, i64 12, i1 false) +; CHECK: %2 = getelementptr inbounds [3 x i32], [3 x i32]* %0, i32 0, i32 0 + %0 = call i64 @_ZTS14first_function() +; CHECK: %3 = call i64 @_ZTS14first_function(i32* %2) + ret void +} + +; CHECK: define weak_odr dso_local void @_ZTS12first_kernel_with_offset([3 x i32]* byval([3 x i32]) %0) { +; CHECK: entry: +; CHECK: %1 = bitcast [3 x i32]* %0 to i32* +; CHECK: %2 = call i64 @_ZTS14first_function(i32* %1) +; CHECK: ret void +; CHECK: } + +define weak_odr dso_local i64 @_ZTS15second_function() { +; CHECK: define weak_odr dso_local i64 @_ZTS15second_function(i32* %0) { + %1 = call i64 @_ZTS15common_function() +; CHECK: %2 = call i64 @_ZTS15common_function(i32* %0) + ret i64 %1 +} + +; Function Attrs: noinline +define weak_odr dso_local void @_ZTS13second_kernel() { +entry: +; CHECK: %0 = alloca [3 x i32], align 4 +; CHECK: %1 = bitcast [3 x i32]* %0 to i8* +; CHECK: call void @llvm.memset.p0i8.i64(i8* nonnull align 4 dereferenceable(12) %1, i8 0, i64 12, i1 false) +; CHECK: %2 = getelementptr inbounds [3 x i32], [3 x i32]* %0, i32 0, i32 0 + %0 = call i64 @_ZTS15second_function() +; CHECK: %3 = call i64 @_ZTS15second_function(i32* %2) + ret void +} + +; CHECK: define weak_odr dso_local void @_ZTS13second_kernel_with_offset([3 x i32]* byval([3 x i32]) %0) { +; CHECK: entry: +; CHECK: %1 = bitcast [3 x i32]* %0 to i32* +; CHECK: %2 = call i64 @_ZTS15second_function(i32* %1) +; CHECK: ret void +; CHECK: } + +; This function doesn't get called by a kernel entry point. 
+define weak_odr dso_local i64 @_ZTS15no_entry_point() { +; CHECK: define weak_odr dso_local i64 @_ZTS15no_entry_point(i32* %0) { + %1 = tail call i32* @llvm.nvvm.implicit.offset() +; CHECK-NOT: tail call i32* @llvm.nvvm.implicit.offset() + %2 = getelementptr inbounds i32, i32* %1, i64 2 +; CHECK: %2 = getelementptr inbounds i32, i32* %0, i64 2 + %3 = load i32, i32* %2, align 4 + %4 = zext i32 %3 to i64 + ret i64 %4 +} + +!nvvm.annotations = !{!0, !1, !2, !1, !3, !3, !3, !3, !4, !4, !3, !5, !6} +; CHECK: !nvvm.annotations = !{!0, !1, !2, !1, !3, !3, !3, !3, !4, !4, !3, !5, !6, !7, !8} +!nvvmir.version = !{!9} + +!0 = distinct !{void ()* @_ZTS12first_kernel, !"kernel", i32 1} +!1 = !{null, !"align", i32 8} +!2 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080} +!3 = !{null, !"align", i32 16} +!4 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088} +!5 = distinct !{void ()* @_ZTS13second_kernel, !"kernel", i32 1} +!6 = distinct !{void ()* @_ZTS12third_kernel, !"kernel", i32 1} +; CHECK: !7 = !{void ([3 x i32]*)* @_ZTS13second_kernel_with_offset, !"kernel", i32 1} +; CHECK: !8 = !{void ([3 x i32]*)* @_ZTS12first_kernel_with_offset, !"kernel", i32 1} +!9 = !{i32 1, i32 4} diff --git a/llvm/test/CodeGen/NVPTX/global-offset-simple.ll b/llvm/test/CodeGen/NVPTX/global-offset-simple.ll new file mode 100644 index 0000000000000..8c301d5b223cf --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/global-offset-simple.ll @@ -0,0 +1,52 @@ +; RUN: opt -globaloffset %s -S -o - | FileCheck %s +; ModuleID = 'simple.bc' +source_filename = "simple.ll" +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda-sycldevice" + +; This test checks that the transformation is applied in the basic case. + +declare i32* @llvm.nvvm.implicit.offset() +; CHECK-NOT: llvm.nvvm.implicit.offset + +define weak_odr dso_local i64 @_ZTS14other_function() { +; CHECK: define weak_odr dso_local i64 @_ZTS14other_function(i32* %0) { +; CHECK: %2 = getelementptr inbounds i32, i32* %0, i64 2 + %1 = tail call i32* @llvm.nvvm.implicit.offset() +; CHECK-NOT: tail call i32* @llvm.nvvm.implicit.offset() + %2 = getelementptr inbounds i32, i32* %1, i64 2 + %3 = load i32, i32* %2, align 4 + %4 = zext i32 %3 to i64 + ret i64 %4 +} + +; Function Attrs: noinline +define weak_odr dso_local void @_ZTS14example_kernel() { +entry: +; CHECK: %0 = alloca [3 x i32], align 4 +; CHECK: %1 = bitcast [3 x i32]* %0 to i8* +; CHECK: call void @llvm.memset.p0i8.i64(i8* nonnull align 4 dereferenceable(12) %1, i8 0, i64 12, i1 false) +; CHECK: %2 = getelementptr inbounds [3 x i32], [3 x i32]* %0, i32 0, i32 0 + %0 = call i64 @_ZTS14other_function() +; CHECK: %3 = call i64 @_ZTS14other_function(i32* %2) + ret void +} + +; CHECK: define weak_odr dso_local void @_ZTS14example_kernel_with_offset([3 x i32]* byval([3 x i32]) %0) { +; CHECK: entry: +; CHECK: %1 = bitcast [3 x i32]* %0 to i32* +; CHECK: %2 = call i64 @_ZTS14other_function(i32* %1) +; CHECK: ret void +; CHECK: } + +!nvvm.annotations = !{!0, !1, !2, !1, !3, !3, !3, !3, !4, !4, !3} +; CHECK: !nvvm.annotations = !{!0, !1, !2, !1, !3, !3, !3, !3, !4, !4, !3, !5} +!nvvmir.version = !{!6} + +!0 = distinct !{void ()* @_ZTS14example_kernel, !"kernel", i32 1} +!1 = !{null, !"align", i32 8} +!2 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080} +!3 = !{null, !"align", i32 16} +!4 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088} +; CHECK: !5 = !{void ([3 x i32]*)* @_ZTS14example_kernel_with_offset, 
!"kernel", i32 1} +!6 = !{i32 1, i32 4} diff --git a/llvm/test/CodeGen/NVPTX/global-offset-valid-triple.ll b/llvm/test/CodeGen/NVPTX/global-offset-valid-triple.ll new file mode 100644 index 0000000000000..e4ec130659905 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/global-offset-valid-triple.ll @@ -0,0 +1,43 @@ +; RUN: llc -march=nvptx64 -mcpu=sm_20 < %s | FileCheck %s +; ModuleID = 'valid-triple.bc' +; CHECK-NOT: LLVM ERROR: Cannot select: intrinsic %llvm.nvvm.implicit.offset +source_filename = "valid-triple.ll" +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda-sycldevice" + +; This test checks that the pass does run on cuda triples. + +declare i32* @llvm.nvvm.implicit.offset() + +define weak_odr dso_local i64 @_ZTS14other_function() { + %1 = tail call i32* @llvm.nvvm.implicit.offset() + %2 = getelementptr inbounds i32, i32* %1, i64 2 + %3 = load i32, i32* %2, align 4 + %4 = zext i32 %3 to i64 + ret i64 %4 +} + +; Function Attrs: noinline +define weak_odr dso_local void @_ZTS14example_kernel() { +entry: + %0 = call i64 @_ZTS14other_function() + ret void +} + +!nvvm.annotations = !{!0, !1, !2, !1, !3, !3, !3, !3, !4, !4, !3} +!llvm.ident = !{!7, !8} +!nvvmir.version = !{!9} +!llvm.module.flags = !{!10, !11} + +!0 = distinct !{void ()* @_ZTS14example_kernel, !"kernel", i32 1} +!1 = !{null, !"align", i32 8} +!2 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080} +!3 = !{null, !"align", i32 16} +!4 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088} +!5 = !{i32 1, i32 2} +!6 = !{i32 4, i32 100000} +!7 = !{!"clang version 9.0.0"} +!8 = !{!"clang version 9.0.0"} +!9 = !{i32 1, i32 4} +!10 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 0]} +!11 = !{i32 1, !"wchar_size", i32 4} diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index c839ab88707df..0302643294e6c 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -1997,13 +1997,26 @@ pi_result cuda_piKernelCreate(pi_program program, const char *kernel_name, try { ScopedContext active(program->get_context()); + CUfunction cuFunc; retErr = PI_CHECK_ERROR( cuModuleGetFunction(&cuFunc, program->get(), kernel_name)); - retKernel = std::unique_ptr<_pi_kernel>( - new _pi_kernel{cuFunc, kernel_name, program, program->get_context()}); + std::string kernel_name_woffset = std::string(kernel_name) + "_with_offset"; + CUfunction cuFuncWithOffsetParam; + CUresult offsetRes = cuModuleGetFunction( + &cuFuncWithOffsetParam, program->get(), kernel_name_woffset.c_str()); + + // If there is no kernel with global offset parameter we mark it as missing + if (offsetRes == CUDA_ERROR_NOT_FOUND) { + cuFuncWithOffsetParam = nullptr; + } else { + retErr = PI_CHECK_ERROR(offsetRes); + } + retKernel = std::unique_ptr<_pi_kernel>( + new _pi_kernel{cuFunc, cuFuncWithOffsetParam, kernel_name, program, + program->get_context()}); } catch (pi_result err) { retErr = err; } catch (...) 
{ @@ -2071,6 +2084,22 @@ pi_result cuda_piEnqueueKernelLaunch( retError = cuda_piEnqueueEventsWait(command_queue, num_events_in_wait_list, event_wait_list, nullptr); + // Set the implicit global offset parameter if kernel has offset variant + if (kernel->get_with_offset_parameter()) { + std::uint32_t cuda_implicit_offset[3] = {0, 0, 0}; + if (global_work_offset) { + for (size_t i = 0; i < work_dim; i++) { + cuda_implicit_offset[i] = + static_cast(global_work_offset[i]); + if (global_work_offset[i] != 0) { + cuFunc = kernel->get_with_offset_parameter(); + } + } + } + kernel->set_implicit_offset_arg(sizeof(cuda_implicit_offset), + cuda_implicit_offset); + } + // Set the number of threads per block to the number of threads per warp // by default unless user has provided a better number int threadsPerBlock[3] = {32, 1, 1}; diff --git a/sycl/plugins/cuda/pi_cuda.hpp b/sycl/plugins/cuda/pi_cuda.hpp index eac59b2a724a5..0a8ef2fe2f366 100644 --- a/sycl/plugins/cuda/pi_cuda.hpp +++ b/sycl/plugins/cuda/pi_cuda.hpp @@ -453,6 +453,7 @@ struct _pi_kernel { using native_type = CUfunction; native_type function_; + native_type functionWithOffsetParam_; std::string name_; pi_context context_; pi_program program_; @@ -475,14 +476,23 @@ struct _pi_kernel { args_index_t indices_; args_size_t offsetPerIndex_; + std::uint32_t implicitOffsetArgs_[3] = {0, 0, 0}; + + arguments() { + // Place the implicit offset index at the end of the indicies collection + indices_.emplace_back(&implicitOffsetArgs_); + } + /// Adds an argument to the kernel. /// If the argument existed before, it is replaced. /// Otherwise, it is added. /// Gaps are filled with empty arguments. + /// Implicit offset argument is kept at the back of the indices collection. void add_arg(size_t index, size_t size, const void *arg, size_t localSize = 0) { - if (index + 1 > indices_.size()) { - indices_.resize(index + 1); + if (index + 2 > indices_.size()) { + // Move implicit offset argument index with the end + indices_.resize(index + 2, indices_.back()); // Ensure enough space for the new argument paramSizes_.resize(index + 1); offsetPerIndex_.resize(index + 1); @@ -502,6 +512,11 @@ struct _pi_kernel { add_arg(index, sizeof(size_t), (const void *)&(localOffset), size); } + void set_implicit_offset(size_t size, std::uint32_t *implicitOffset) { + assert(size == sizeof(std::uint32_t) * 3); + std::memcpy(implicitOffsetArgs_, implicitOffset, size); + } + void clear_local_size() { std::fill(std::begin(offsetPerIndex_), std::end(offsetPerIndex_), 0); } @@ -514,14 +529,18 @@ struct _pi_kernel { } } args_; - _pi_kernel(CUfunction func, const char *name, pi_program program, - pi_context ctxt) - : function_{func}, name_{name}, context_{ctxt}, program_{program}, - refCount_{1} { + _pi_kernel(CUfunction func, CUfunction funcWithOffsetParam, const char *name, + pi_program program, pi_context ctxt) + : function_{func}, functionWithOffsetParam_{funcWithOffsetParam}, + name_{name}, context_{ctxt}, program_{program}, refCount_{1} { cuda_piProgramRetain(program_); cuda_piContextRetain(context_); } + _pi_kernel(CUfunction func, const char *name, pi_program program, + pi_context ctxt) + : _pi_kernel{func, nullptr, name, program, ctxt} {} + ~_pi_kernel() { cuda_piProgramRelease(program_); @@ -538,15 +557,23 @@ struct _pi_kernel { native_type get() const noexcept { return function_; }; + native_type get_with_offset_parameter() const noexcept { + return functionWithOffsetParam_; + }; + + bool has_with_offset_parameter() const noexcept { + return functionWithOffsetParam_ != 
nullptr; + } + pi_context get_context() const noexcept { return context_; }; const char *get_name() const noexcept { return name_.c_str(); } - /// Returns the number of arguments. + /// Returns the number of arguments, excluding the implicit global offset. /// Note this only returns the current known number of arguments, not the /// real one required by the kernel, since this cannot be queried from /// the CUDA Driver API - pi_uint32 get_num_args() const noexcept { return args_.indices_.size(); } + pi_uint32 get_num_args() const noexcept { return args_.indices_.size() - 1; } void set_kernel_arg(int index, size_t size, const void *arg) { args_.add_arg(index, size, arg); @@ -556,6 +583,10 @@ struct _pi_kernel { args_.add_local_arg(index, size); } + void set_implicit_offset_arg(size_t size, std::uint32_t *implicitOffset) { + args_.set_implicit_offset(size, implicitOffset); + } + arguments::args_index_t get_arg_indices() const { return args_.get_indices(); } diff --git a/sycl/test/basic_tests/parallel_for_indexers.cpp b/sycl/test/basic_tests/parallel_for_indexers.cpp index ade6238b8c6c0..3d20ec3d66903 100644 --- a/sycl/test/basic_tests/parallel_for_indexers.cpp +++ b/sycl/test/basic_tests/parallel_for_indexers.cpp @@ -8,7 +8,7 @@ // TODO: Unexpected result // TODO: _indexers.cpp:37: int main(): Assertion `id == -1' failed. -// XFAIL: cuda || level0 +// XFAIL: level0 #include diff --git a/sycl/unittests/pi/cuda/test_kernels.cpp b/sycl/unittests/pi/cuda/test_kernels.cpp index 02ac6549e61e3..400ddf03d84b7 100644 --- a/sycl/unittests/pi/cuda/test_kernels.cpp +++ b/sycl/unittests/pi/cuda/test_kernels.cpp @@ -16,6 +16,9 @@ #include #include +// PI CUDA kernels carry an additional argument for the implicit global offset. +#define NUM_IMPLICIT_ARGS 1 + using namespace cl::sycl; struct CudaKernelsTest : public ::testing::Test { @@ -172,7 +175,7 @@ TEST_F(CudaKernelsTest, PIKernelArgumentSimple) { kern, 0, sizeof(int), &number)), PI_SUCCESS); const auto &kernArgs = kern->get_arg_indices(); - ASSERT_EQ(kernArgs.size(), (size_t)1); + ASSERT_EQ(kernArgs.size(), (size_t)1 + NUM_IMPLICIT_ARGS); int storedValue = *(static_cast(kernArgs[0])); ASSERT_EQ(storedValue, number); } @@ -201,7 +204,7 @@ TEST_F(CudaKernelsTest, PIKernelArgumentSetTwice) { kern, 0, sizeof(int), &number)), PI_SUCCESS); const auto &kernArgs = kern->get_arg_indices(); - ASSERT_GT(kernArgs.size(), (size_t)0); + ASSERT_GT(kernArgs.size(), (size_t)0 + NUM_IMPLICIT_ARGS); int storedValue = *(static_cast(kernArgs[0])); ASSERT_EQ(storedValue, number); @@ -210,7 +213,7 @@ TEST_F(CudaKernelsTest, PIKernelArgumentSetTwice) { kern, 0, sizeof(int), &otherNumber)), PI_SUCCESS); const auto &kernArgs2 = kern->get_arg_indices(); - ASSERT_EQ(kernArgs2.size(), (size_t)1); + ASSERT_EQ(kernArgs2.size(), (size_t)1 + NUM_IMPLICIT_ARGS); storedValue = *(static_cast(kernArgs2[0])); ASSERT_EQ(storedValue, otherNumber); } @@ -244,7 +247,7 @@ TEST_F(CudaKernelsTest, PIKernelSetMemObj) { kern, 0, sizeof(pi_mem), &memObj)), PI_SUCCESS); const auto &kernArgs = kern->get_arg_indices(); - ASSERT_EQ(kernArgs.size(), (size_t)1); + ASSERT_EQ(kernArgs.size(), (size_t)1 + NUM_IMPLICIT_ARGS); pi_mem storedValue = *(static_cast(kernArgs[0])); ASSERT_EQ(storedValue, memObj); } @@ -369,7 +372,7 @@ TEST_F(CudaKernelsTest, PIKernelArgumentSetTwiceOneLocal) { kern, 0, sizeof(int), &number)), PI_SUCCESS); const auto &kernArgs = kern->get_arg_indices(); - ASSERT_GT(kernArgs.size(), (size_t)0); + ASSERT_GT(kernArgs.size(), (size_t)0 + NUM_IMPLICIT_ARGS); int storedValue = 
*(static_cast(kernArgs[0])); ASSERT_EQ(storedValue, number); @@ -377,7 +380,7 @@ TEST_F(CudaKernelsTest, PIKernelArgumentSetTwiceOneLocal) { kern, 1, sizeof(int), nullptr)), PI_SUCCESS); const auto &kernArgs2 = kern->get_arg_indices(); - ASSERT_EQ(kernArgs2.size(), (size_t)2); + ASSERT_EQ(kernArgs2.size(), (size_t)2 + NUM_IMPLICIT_ARGS); storedValue = *(static_cast(kernArgs2[1])); ASSERT_EQ(storedValue, 0); @@ -385,7 +388,7 @@ TEST_F(CudaKernelsTest, PIKernelArgumentSetTwiceOneLocal) { kern, 2, sizeof(int), nullptr)), PI_SUCCESS); const auto &kernArgs3 = kern->get_arg_indices(); - ASSERT_EQ(kernArgs3.size(), (size_t)3); + ASSERT_EQ(kernArgs3.size(), (size_t)3 + NUM_IMPLICIT_ARGS); storedValue = *(static_cast(kernArgs3[2])); ASSERT_EQ(storedValue, static_cast(sizeof(int))); }
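For context, a minimal host-side sketch of the situation this patch addresses (illustrative only; the kernel name, sizes and offset value are made up): launching a SYCL kernel with a non-zero global offset on the CUDA backend. With this change the plugin selects the `_with_offset` variant of the kernel and passes the offset by value, while a zero or absent offset keeps the original function.

#include <CL/sycl.hpp>

using namespace cl::sycl;

int main() {
  queue q{gpu_selector{}};
  buffer<int, 1> buf{range<1>{16}};

  q.submit([&](handler &cgh) {
    auto acc = buf.get_access<access::mode::write>(cgh);
    // A global offset of 4: work-items see ids 4..15 instead of 0..11. With
    // this patch the offset reaches the device through the implicit
    // [3 x i32] kernel parameter instead of reading as zero.
    cgh.parallel_for<class offset_kernel>(
        range<1>{12}, id<1>{4},
        [=](item<1> it) { acc[it.get_id(0)] = 1; });
  });
  q.wait();
}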