diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def b/clang/include/clang/Basic/BuiltinsNVPTX.def index 759c91290a60c..9a4a3ff013973 100644 --- a/clang/include/clang/Basic/BuiltinsNVPTX.def +++ b/clang/include/clang/Basic/BuiltinsNVPTX.def @@ -89,6 +89,9 @@ BUILTIN(__nvvm_read_ptx_sreg_pm1, "i", "n") BUILTIN(__nvvm_read_ptx_sreg_pm2, "i", "n") BUILTIN(__nvvm_read_ptx_sreg_pm3, "i", "n") +// SYCL +BUILTIN(__builtin_ptx_implicit_offset, "Ui*", "nc") + // MISC BUILTIN(__nvvm_prmt, "UiUiUiUi", "") diff --git a/libclc/ptx-nvidiacl/libspirv/workitem/get_global_id.cl b/libclc/ptx-nvidiacl/libspirv/workitem/get_global_id.cl index da96caffb4f75..b856302625f42 100644 --- a/libclc/ptx-nvidiacl/libspirv/workitem/get_global_id.cl +++ b/libclc/ptx-nvidiacl/libspirv/workitem/get_global_id.cl @@ -9,13 +9,16 @@ #include _CLC_DEF _CLC_OVERLOAD size_t __spirv_GlobalInvocationId_x() { - return __spirv_WorkgroupId_x() * __spirv_WorkgroupSize_x() + __spirv_LocalInvocationId_x(); + return __spirv_WorkgroupId_x() * __spirv_WorkgroupSize_x() + + __spirv_LocalInvocationId_x() + __spirv_GlobalOffset_x(); } _CLC_DEF _CLC_OVERLOAD size_t __spirv_GlobalInvocationId_y() { - return __spirv_WorkgroupId_y() * __spirv_WorkgroupSize_y() + __spirv_LocalInvocationId_y(); + return __spirv_WorkgroupId_y() * __spirv_WorkgroupSize_y() + + __spirv_LocalInvocationId_y() + __spirv_GlobalOffset_y(); } _CLC_DEF _CLC_OVERLOAD size_t __spirv_GlobalInvocationId_z() { - return __spirv_WorkgroupId_z() * __spirv_WorkgroupSize_z() + __spirv_LocalInvocationId_z(); + return __spirv_WorkgroupId_z() * __spirv_WorkgroupSize_z() + + __spirv_LocalInvocationId_z() + __spirv_GlobalOffset_z(); } diff --git a/libclc/ptx-nvidiacl/libspirv/workitem/get_global_offset.cl b/libclc/ptx-nvidiacl/libspirv/workitem/get_global_offset.cl index de269c76602be..7eae8cf43c20e 100644 --- a/libclc/ptx-nvidiacl/libspirv/workitem/get_global_offset.cl +++ b/libclc/ptx-nvidiacl/libspirv/workitem/get_global_offset.cl @@ -11,13 +11,13 @@ // Compiler support is required to provide global offset on NVPTX. 
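// Worked example (illustration only, values are made up): with these pieces
// in place, each dimension of the global ID is computed as
//   global_id = group_id * group_size + local_id + global_offset
// so a work-item in group 2 with local id 3, a group size of 4 and a global
// offset of 100 reports global id 2 * 4 + 3 + 100 = 111; the offset itself is
// read from the array returned by __builtin_ptx_implicit_offset() below.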
_CLC_DEF _CLC_OVERLOAD size_t __spirv_GlobalOffset_x() { - return 0; + return __builtin_ptx_implicit_offset()[0]; } _CLC_DEF _CLC_OVERLOAD size_t __spirv_GlobalOffset_y() { - return 0; + return __builtin_ptx_implicit_offset()[1]; } _CLC_DEF _CLC_OVERLOAD size_t __spirv_GlobalOffset_z() { - return 0; + return __builtin_ptx_implicit_offset()[2]; } diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index 61293418ec41d..9de9400848709 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -4161,4 +4161,10 @@ foreach layout_a = ["row", "col"] in { } // layout_b } // layout_a +// SYCL +def int_nvvm_implicit_offset : + GCCBuiltin<"__builtin_ptx_implicit_offset">, + Intrinsic<[LLVMPointerType], [], + [IntrNoMem, IntrSpeculatable]>; + } // let TargetPrefix = "nvvm" diff --git a/llvm/lib/Target/NVPTX/CMakeLists.txt b/llvm/lib/Target/NVPTX/CMakeLists.txt index 097fc26cdab2c..9e9f7ebb63887 100644 --- a/llvm/lib/Target/NVPTX/CMakeLists.txt +++ b/llvm/lib/Target/NVPTX/CMakeLists.txt @@ -33,6 +33,7 @@ set(NVPTXCodeGen_sources NVVMIntrRange.cpp NVVMReflect.cpp NVPTXProxyRegErasure.cpp + SYCL/GlobalOffset.cpp SYCL/LocalAccessorToSharedMemory.cpp ) diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp index e5c89a191cc0e..47729e957d533 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -16,8 +16,9 @@ #include "NVPTXLowerAggrCopies.h" #include "NVPTXTargetObjectFile.h" #include "NVPTXTargetTransformInfo.h" -#include "TargetInfo/NVPTXTargetInfo.h" +#include "SYCL/GlobalOffset.h" #include "SYCL/LocalAccessorToSharedMemory.h" +#include "TargetInfo/NVPTXTargetInfo.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Triple.h" #include "llvm/Analysis/TargetTransformInfo.h" @@ -71,6 +72,7 @@ void initializeNVPTXLowerArgsPass(PassRegistry &); void initializeNVPTXLowerAllocaPass(PassRegistry &); void initializeNVPTXProxyRegErasurePass(PassRegistry &); +void initializeGlobalOffsetPass(PassRegistry &); void initializeLocalAccessorToSharedMemoryPass(PassRegistry &); } // end namespace llvm @@ -94,6 +96,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeNVPTXTarget() { initializeNVPTXProxyRegErasurePass(PR); // SYCL-specific passes, needed here to be available to `opt`. + initializeGlobalOffsetPass(PR); initializeLocalAccessorToSharedMemoryPass(PR); } @@ -274,6 +277,7 @@ void NVPTXPassConfig::addIRPasses() { if (getTM().getTargetTriple().getOS() == Triple::CUDA && getTM().getTargetTriple().getEnvironment() == Triple::SYCLDevice) { + addPass(createGlobalOffsetPass()); addPass(createLocalAccessorToSharedMemoryPass()); } diff --git a/llvm/lib/Target/NVPTX/SYCL/GlobalOffset.cpp b/llvm/lib/Target/NVPTX/SYCL/GlobalOffset.cpp new file mode 100644 index 0000000000000..b03298d661093 --- /dev/null +++ b/llvm/lib/Target/NVPTX/SYCL/GlobalOffset.cpp @@ -0,0 +1,370 @@ +//===--------- GlobalOffset.cpp - Global Offset Support for CUDA --------- ===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass operates on SYCL kernels being compiled to CUDA. 
It looks for uses +// of the `llvm.nvvm.implicit.offset` intrinsic and replaces it with a offset +// parameter which will be threaded through from the kernel entry point. +// +//===----------------------------------------------------------------------===// + +#include "GlobalOffset.h" + +#include "../MCTargetDesc/NVPTXBaseInfo.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/IntrinsicsNVPTX.h" +#include "llvm/IR/PassManager.h" +#include "llvm/Target/TargetIntrinsicInfo.h" +#include "llvm/Transforms/Utils/Cloning.h" + +using namespace llvm; + +#define DEBUG_TYPE "globaloffset" + +namespace llvm { +void initializeGlobalOffsetPass(PassRegistry &); +} // end namespace llvm + +namespace { + +class GlobalOffset : public ModulePass { +public: + static char ID; + GlobalOffset() : ModulePass(ID) {} + + bool runOnModule(Module &M) override { + if (skipModule(M)) + return false; + + llvm::Function *ImplicitOffsetIntrinsic = + M.getFunction(Intrinsic::getName(Intrinsic::nvvm_implicit_offset)); + + if (!ImplicitOffsetIntrinsic || ImplicitOffsetIntrinsic->use_empty()) { + return false; + } + + KernelImplicitArgumentType = + ArrayType::get(Type::getInt32Ty(M.getContext()), 3); + ImplicitOffsetPtrType = Type::getInt32Ty(M.getContext())->getPointerTo(); + assert( + (!ImplicitOffsetIntrinsic || + ImplicitOffsetIntrinsic->getReturnType() == ImplicitOffsetPtrType) && + "Intrinsic::nvvm_implicit_offset does not return the expected " + "type"); + + // Find all entry points. + EntryPointMetadata = getEntryPointMetadata(M); + + // Add implicit parameters to all direct and indirect users of the offset + addImplicitParameterToCallers(M, ImplicitOffsetIntrinsic, nullptr); + + // Assert that all uses of `ImplicitOffsetIntrinsic` are removed and delete + // it. + assert(ImplicitOffsetIntrinsic->use_empty() && + "Not all uses of intrinsic removed"); + ImplicitOffsetIntrinsic->eraseFromParent(); + + return true; + } + + void processKernelEntryPoint(Module &M, Function *Func) { + assert(EntryPointMetadata.count(Func) != 0 && + "Function must be an entry point"); + + LLVMContext &Ctx = M.getContext(); + MDNode *FuncMetadata = EntryPointMetadata[Func]; + + bool AlreadyProcessed = ProcessedFunctions.count(Func) == 1; + if (AlreadyProcessed) + return; + + // Add the new argument to all other kernel entry points, despite not + // using the global offset. + auto NvvmMetadata = M.getNamedMetadata("nvvm.annotations"); + assert(NvvmMetadata && "IR compiled to PTX must have nvvm.annotations"); + + auto NewFunc = addOffsetArgumentToFunction( + M, Func, KernelImplicitArgumentType->getPointerTo(), + /*KeepOriginal=*/true) + .first; + Argument *NewArgument = NewFunc->arg_begin() + (NewFunc->arg_size() - 1); + // Pass the values by value to the kernel + NewArgument->addAttr( + Attribute::getWithByValType(Ctx, KernelImplicitArgumentType)); + + // Add the metadata. 
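// Illustration (hypothetical kernel name): for an entry point @foo annotated
// as
//   !{void ()* @foo, !"kernel", i32 1}
// the code below appends a second annotation
//   !{void ([3 x i32]*)* @foo_with_offset, !"kernel", i32 1}
// for the clone created above, while the original @foo keeps its signature
// and is given a zero-filled [3 x i32] alloca further down so that its
// offset-free body still has a valid offset to read.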
+ Metadata *NewMetadata[] = {ConstantAsMetadata::get(NewFunc), + FuncMetadata->getOperand(1), + FuncMetadata->getOperand(2)}; + NvvmMetadata->addOperand(MDNode::get(Ctx, NewMetadata)); + + // Create a zero-initialized alloca for the implicit offset in the original + // function. + BasicBlock *EntryBlock = &Func->getEntryBlock(); + IRBuilder<> Builder(EntryBlock, EntryBlock->getFirstInsertionPt()); + Type *ImplicitOffsetType = + ArrayType::get(Type::getInt32Ty(M.getContext()), 3); + AllocaInst *ImplicitOffset = Builder.CreateAlloca(ImplicitOffsetType); + uint64_t AllocByteSize = + ImplicitOffset->getAllocationSizeInBits(M.getDataLayout()).getValue() / + 8; + CallInst *MemsetCall = + Builder.CreateMemSet(ImplicitOffset, Builder.getInt8(0), AllocByteSize, + ImplicitOffset->getAlign()); + MemsetCall->addParamAttr(0, Attribute::NonNull); + MemsetCall->addDereferenceableAttr(1, AllocByteSize); + ProcessedFunctions[Func] = Builder.CreateConstInBoundsGEP2_32( + ImplicitOffsetType, ImplicitOffset, 0, 0); + } + + // This function adds an implicit parameter to the function containing a call + // instruction to the implicit offset intrinsic or to another function (which + // eventually calls the intrinsic). If the call instruction is to the + // implicit offset intrinsic, then the intrinsic is replaced with the + // parameter that was added. + // + // `Callee` is either a function to which this transformation has already been + // applied, or the implicit offset intrinsic itself. `CalleeWithImplicitParam` + // indicates whether `Callee` is the implicit intrinsic (when `nullptr`) or + // another function (not `nullptr`); this determines whether calls to it are + // replaced with the implicit parameter or have the implicit parameter + // appended. + // + // Once the function, say `F`, containing a call to `Callee` has the implicit + // parameter added, callers of `F` are processed by recursively calling this + // function, passing `F` as `CalleeWithImplicitParam`. + // + // Since the cloning of entry points may alter the users of a function, the + // cloning must be done as early as possible, to ensure that no users are + // added to previously visited callees in the call tree. + void addImplicitParameterToCallers(Module &M, Value *Callee, + Function *CalleeWithImplicitParam) { + + // Make sure that all entry point callers are processed. + SmallVector<User *, 8> Users{Callee->users()}; + for (User *U : Users) { + auto *Call = dyn_cast<CallInst>(U); + if (!Call) + continue; + + Function *Caller = Call->getFunction(); + if (EntryPointMetadata.count(Caller) != 0) { + processKernelEntryPoint(M, Caller); + } + } + + // User collection may have changed, so we reinitialize it. + Users = SmallVector<User *, 8>{Callee->users()}; + for (User *U : Users) { + auto *CallToOld = dyn_cast<CallInst>(U); + if (!CallToOld) + return; + + auto Caller = CallToOld->getFunction(); + + // Determine if `Caller` needs to be processed or if this is another + // callsite from an already-processed function. + Function *NewFunc; + Value *ImplicitOffset = ProcessedFunctions[Caller]; + bool AlreadyProcessed = ImplicitOffset != nullptr; + if (AlreadyProcessed) { + NewFunc = Caller; + } else { + std::tie(NewFunc, ImplicitOffset) = + addOffsetArgumentToFunction(M, Caller); + } + + if (!CalleeWithImplicitParam) { + // Replace the intrinsic call with the parameter. + CallToOld->replaceAllUsesWith(ImplicitOffset); + } else { + // Build up the list of arguments used to call the modified function.
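// Illustration (hypothetical names): a call `%r = call i64 @helper()` in the
// caller is rewritten below into `%r = call i64 @helper(i32* %offset)`, where
// %offset is either the caller's own appended parameter or, for kernel entry
// points, the GEP into the zero-filled alloca created in
// processKernelEntryPoint.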
+ llvm::SmallVector ImplicitOffsets; + for (Use &U : CallToOld->args()) { + ImplicitOffsets.push_back(U); + } + ImplicitOffsets.push_back(ImplicitOffset); + + // Replace call to other function (which now has a new parameter), + // with a call including the new parameter to that same function. + auto NewCaller = CallInst::Create( + /* Ty= */ CalleeWithImplicitParam->getFunctionType(), + /* Func= */ CalleeWithImplicitParam, + /* Args= */ ImplicitOffsets, + /* NameStr= */ Twine(), + /* InsertBefore= */ CallToOld); + NewCaller->setTailCallKind(CallToOld->getTailCallKind()); + CallToOld->replaceAllUsesWith(NewCaller); + + if (CallToOld->hasName()) { + NewCaller->takeName(CallToOld); + } + } + + // Remove the caller now that it has been replaced. + CallToOld->eraseFromParent(); + + if (!AlreadyProcessed) { + // Process callers of the old function. + addImplicitParameterToCallers(M, Caller, NewFunc); + + // Now that the old function is dead, delete it. + Caller->dropAllReferences(); + Caller->eraseFromParent(); + } + } + } + + std::pair + addOffsetArgumentToFunction(Module &M, Function *Func, + Type *ImplicitArgumentType = nullptr, + bool KeepOriginal = false) { + FunctionType *FuncTy = Func->getFunctionType(); + const AttributeList &FuncAttrs = Func->getAttributes(); + ImplicitArgumentType = + ImplicitArgumentType ? ImplicitArgumentType : ImplicitOffsetPtrType; + + // Construct an argument list containing all of the previous arguments. + SmallVector Arguments; + SmallVector ArgumentAttributes; + + unsigned i = 0; + for (Function::arg_iterator FuncArg = Func->arg_begin(), + FuncEnd = Func->arg_end(); + FuncArg != FuncEnd; ++FuncArg, ++i) { + Arguments.push_back(FuncArg->getType()); + ArgumentAttributes.push_back(FuncAttrs.getParamAttributes(i)); + } + + // Add the offset argument. Must be the same type as returned by + // `llvm.nvvm.implicit.offset`. + + Arguments.push_back(ImplicitArgumentType); + ArgumentAttributes.push_back(AttributeSet()); + + // Build the new function. + AttributeList NAttrs = + AttributeList::get(Func->getContext(), FuncAttrs.getFnAttributes(), + FuncAttrs.getRetAttributes(), ArgumentAttributes); + assert(!FuncTy->isVarArg() && "Variadic arguments prohibited in SYCL"); + FunctionType *NewFuncTy = FunctionType::get(FuncTy->getReturnType(), + Arguments, FuncTy->isVarArg()); + + Function *NewFunc = Function::Create(NewFuncTy, Func->getLinkage(), + Func->getAddressSpace()); + + if (KeepOriginal) { + // TODO: Are there better naming alternatives that allow for unmangling? + NewFunc->setName(Func->getName() + "_with_offset"); + + ValueToValueMapTy VMap; + for (Function::arg_iterator FuncArg = Func->arg_begin(), + FuncEnd = Func->arg_end(), + NewFuncArg = NewFunc->arg_begin(); + FuncArg != FuncEnd; ++FuncArg, ++NewFuncArg) { + VMap[FuncArg] = NewFuncArg; + } + + SmallVector Returns; + CloneFunctionInto(NewFunc, Func, VMap, /*ModuleLevelChanges=*/false, + Returns); + } else { + NewFunc->copyAttributesFrom(Func); + NewFunc->setComdat(Func->getComdat()); + NewFunc->setAttributes(NAttrs); + NewFunc->takeName(Func); + + // Splice the body of the old function right into the new function. + NewFunc->getBasicBlockList().splice(NewFunc->begin(), + Func->getBasicBlockList()); + + for (Function::arg_iterator FuncArg = Func->arg_begin(), + FuncEnd = Func->arg_end(), + NewFuncArg = NewFunc->arg_begin(); + FuncArg != FuncEnd; ++FuncArg, ++NewFuncArg) { + FuncArg->replaceAllUsesWith(NewFuncArg); + } + + // Clone metadata of the old function, including debug info descriptor. 
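// To summarise the two modes of this helper (hypothetical name @f): with
// KeepOriginal the module ends up with both @f and a separate clone
// @f_with_offset taking the extra parameter; in this branch @f is instead
// replaced in place by a function with the same name, body and metadata
// (copied just below) plus one trailing offset parameter.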
+ SmallVector, 1> MDs; + Func->getAllMetadata(MDs); + for (auto MD : MDs) + NewFunc->addMetadata(MD.first, *MD.second); + } + + // Keep original function ordering. + M.getFunctionList().insertAfter(Func->getIterator(), NewFunc); + + Value *ImplicitOffset = NewFunc->arg_begin() + (NewFunc->arg_size() - 1); + // Add bitcast to match the return type of the intrinsic if needed. + if (ImplicitArgumentType != ImplicitOffsetPtrType) { + BasicBlock *EntryBlock = &NewFunc->getEntryBlock(); + IRBuilder<> Builder(EntryBlock, EntryBlock->getFirstInsertionPt()); + ImplicitOffset = + Builder.CreateBitCast(ImplicitOffset, ImplicitOffsetPtrType); + } + + ProcessedFunctions[NewFunc] = ImplicitOffset; + + // Return the new function and the offset argument. + return {NewFunc, ImplicitOffset}; + } + + static llvm::DenseMap getEntryPointMetadata(Module &M) { + auto NvvmMetadata = M.getNamedMetadata("nvvm.annotations"); + assert(NvvmMetadata && "IR compiled to PTX must have nvvm.annotations"); + + llvm::DenseMap NvvmEntryPointMetadata; + for (auto MetadataNode : NvvmMetadata->operands()) { + if (MetadataNode->getNumOperands() != 3) + continue; + + // NVPTX identifies kernel entry points using metadata nodes of the form: + // !X = !{, !"kernel", i32 1} + auto Type = dyn_cast(MetadataNode->getOperand(1)); + // Only process kernel entry points. + if (!Type || Type->getString() != "kernel") + continue; + + // Get a pointer to the entry point function from the metadata. + auto FuncConstant = + dyn_cast(MetadataNode->getOperand(0)); + if (!FuncConstant) + continue; + auto Func = dyn_cast(FuncConstant->getValue()); + if (!Func) + continue; + + assert(Func->use_empty() && "Kernel entry point with uses"); + NvvmEntryPointMetadata[Func] = MetadataNode; + } + return NvvmEntryPointMetadata; + } + + virtual llvm::StringRef getPassName() const { + return "Add implicit SYCL global offset"; + } + +private: + // Keep track of which functions have been processed to avoid processing twice + llvm::DenseMap ProcessedFunctions; + // Keep a map of all entry point functions with metadata + llvm::DenseMap EntryPointMetadata; + llvm::Type *KernelImplicitArgumentType; + llvm::Type *ImplicitOffsetPtrType; +}; + +} // end anonymous namespace + +char GlobalOffset::ID = 0; + +INITIALIZE_PASS(GlobalOffset, "globaloffset", "SYCL Global Offset", false, + false) + +ModulePass *llvm::createGlobalOffsetPass() { return new GlobalOffset(); } diff --git a/llvm/lib/Target/NVPTX/SYCL/GlobalOffset.h b/llvm/lib/Target/NVPTX/SYCL/GlobalOffset.h new file mode 100644 index 0000000000000..3284af3a65d20 --- /dev/null +++ b/llvm/lib/Target/NVPTX/SYCL/GlobalOffset.h @@ -0,0 +1,26 @@ +//===---------- GlobalOffset.h - Global Offset Support for CUDA ---------- ===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass operates on SYCL kernels being compiled to CUDA. It looks for uses +// of the `llvm.nvvm.implicit.offset` intrinsic and replaces it with a offset +// parameter which will be threaded through from the kernel entry point. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_SYCL_GLOBALOFFSET_H +#define LLVM_SYCL_GLOBALOFFSET_H + +#include "llvm/Pass.h" + +namespace llvm { + +ModulePass *createGlobalOffsetPass(); + +} // end namespace llvm + +#endif diff --git a/llvm/test/CodeGen/NVPTX/global-offset-invalid-triple.ll b/llvm/test/CodeGen/NVPTX/global-offset-invalid-triple.ll new file mode 100644 index 0000000000000..1eadbf9c2d06b --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/global-offset-invalid-triple.ll @@ -0,0 +1,35 @@ +; RUN: not --crash llc -march=nvptx64 -mcpu=sm_20 %s -o - 2>&1 | FileCheck %s +; ModuleID = 'invalid-triple.bc' +; CHECK: LLVM ERROR: Cannot select: intrinsic %llvm.nvvm.implicit.offset +source_filename = "invalid-triple.ll" +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-nvcl-sycldevice" + +; This test checks that the pass does not run on nvcl triples. + +declare i32* @llvm.nvvm.implicit.offset() + +define weak_odr dso_local i64 @_ZTS14other_function() { + %1 = tail call i32* @llvm.nvvm.implicit.offset() + %2 = getelementptr inbounds i32, i32* %1, i64 2 + %3 = load i32, i32* %2, align 4 + %4 = zext i32 %3 to i64 + ret i64 %4 +} + +; Function Attrs: noinline +define weak_odr dso_local void @_ZTS14example_kernel() { +entry: + %0 = call i64 @_ZTS14other_function() + ret void +} + +!nvvm.annotations = !{!0, !1, !2, !1, !3, !3, !3, !3, !4, !4, !3} +!nvvmir.version = !{!5} + +!0 = distinct !{void ()* @_ZTS14example_kernel, !"kernel", i32 1} +!1 = !{null, !"align", i32 8} +!2 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080} +!3 = !{null, !"align", i32 16} +!4 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088} +!5 = !{i32 1, i32 4} diff --git a/llvm/test/CodeGen/NVPTX/global-offset-multiple-calls-from-one-function.ll b/llvm/test/CodeGen/NVPTX/global-offset-multiple-calls-from-one-function.ll new file mode 100644 index 0000000000000..79f10729b56c3 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/global-offset-multiple-calls-from-one-function.ll @@ -0,0 +1,67 @@ +; RUN: opt -globaloffset %s -S -o - | FileCheck %s +; ModuleID = 'multiple-calls-from-one-function.bc' +source_filename = "multiple-calls-from-one-function.ll" +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda-sycldevice" + +; This test checks that when there are multiple calls to a function that uses +; the intrinsic that the caller and the callee only have a single clone each +; with the offset parameter. It also checks that the clone with multiple calls +; to other functions that has a variant that takes an offset parameter will have +; all calls redirected to the corresponding variants. 
+ +declare i32* @llvm.nvvm.implicit.offset() +; CHECK-NOT: declare i32* @llvm.nvvm.implicit.offset() + +define weak_odr dso_local i64 @_ZTS14other_function() { +; CHECK: define weak_odr dso_local i64 @_ZTS14other_function(i32* %0) { + %1 = tail call i32* @llvm.nvvm.implicit.offset() +; CHECK-NOT: tail call i32* @llvm.nvvm.implicit.offset() + %2 = getelementptr inbounds i32, i32* %1, i64 2 +; CHECK: %2 = getelementptr inbounds i32, i32* %0, i64 2 + %3 = load i32, i32* %2, align 4 + %4 = zext i32 %3 to i64 + + %5 = tail call i32* @llvm.nvvm.implicit.offset() +; CHECK-NOT: tail call i32* @llvm.nvvm.implicit.offset() + %6 = getelementptr inbounds i32, i32* %5, i64 2 +; CHECK: %5 = getelementptr inbounds i32, i32* %0, i64 2 + %7 = load i32, i32* %6, align 4 + %8 = zext i32 %7 to i64 + + ret i64 %4 +} + +; Function Attrs: noinline +define weak_odr dso_local void @_ZTS14example_kernel() { +entry: +; CHECK: %0 = alloca [3 x i32], align 4 +; CHECK: %1 = bitcast [3 x i32]* %0 to i8* +; CHECK: call void @llvm.memset.p0i8.i64(i8* nonnull align 4 dereferenceable(12) %1, i8 0, i64 12, i1 false) +; CHECK: %2 = getelementptr inbounds [3 x i32], [3 x i32]* %0, i32 0, i32 0 + %0 = call i64 @_ZTS14other_function() +; CHECK: %3 = call i64 @_ZTS14other_function(i32* %2) + %1 = call i64 @_ZTS14other_function() +; CHECK: %4 = call i64 @_ZTS14other_function(i32* %2) + ret void +} + +; CHECK: define weak_odr dso_local void @_ZTS14example_kernel_with_offset([3 x i32]* byval([3 x i32]) %0) { +; CHECK: entry: +; CHECK: %1 = bitcast [3 x i32]* %0 to i32* +; CHECK: %2 = call i64 @_ZTS14other_function(i32* %1) +; CHECK: %3 = call i64 @_ZTS14other_function(i32* %1) +; CHECK: ret void +; CHECK: } + +!nvvm.annotations = !{!0, !1, !2, !1, !3, !3, !3, !3, !4, !4, !3} +; CHECK: !nvvm.annotations = !{!0, !1, !2, !1, !3, !3, !3, !3, !4, !4, !3, !5} +!nvvmir.version = !{!6} + +!0 = distinct !{void ()* @_ZTS14example_kernel, !"kernel", i32 1} +!1 = !{null, !"align", i32 8} +!2 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080} +!3 = !{null, !"align", i32 16} +!4 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088} +; CHECK: !5 = !{void ([3 x i32]*)* @_ZTS14example_kernel_with_offset, !"kernel", i32 1} +!6 = !{i32 1, i32 4} diff --git a/llvm/test/CodeGen/NVPTX/global-offset-multiple-entry-points.ll b/llvm/test/CodeGen/NVPTX/global-offset-multiple-entry-points.ll new file mode 100644 index 0000000000000..7784150f1082d --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/global-offset-multiple-entry-points.ll @@ -0,0 +1,108 @@ +; RUN: opt -globaloffset %s -S -o - | FileCheck %s +; ModuleID = 'multiple-entry-points.bc' +source_filename = "multiple-entry-points.ll" +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-nvcl-sycldevice" + +; This test checks that the pass works with multiple entry points. + +declare i32* @llvm.nvvm.implicit.offset() +; CHECK-NOT: declare i32* @llvm.nvvm.implicit.offset() + +; This function is a kernel entry point that does not use global offset. It will +; not get a clone with a global offset parameter. 
+; Function Attrs: noinline +define weak_odr dso_local void @_ZTS12third_kernel() { +entry: + ret void +} + +define weak_odr dso_local i64 @_ZTS15common_function() { +; CHECK: define weak_odr dso_local i64 @_ZTS15common_function(i32* %0) { + %1 = tail call i32* @llvm.nvvm.implicit.offset() +; CHECK-NOT: tail call i32* @llvm.nvvm.implicit.offset() +; CHECK: %2 = getelementptr inbounds i32, i32* %0, i64 2 + %2 = getelementptr inbounds i32, i32* %1, i64 2 + %3 = load i32, i32* %2, align 4 + %4 = zext i32 %3 to i64 + ret i64 %4 +} + +define weak_odr dso_local i64 @_ZTS14first_function() { +; CHECK: define weak_odr dso_local i64 @_ZTS14first_function(i32* %0) { + %1 = call i64 @_ZTS15common_function() +; CHECK: %2 = call i64 @_ZTS15common_function(i32* %0) + ret i64 %1 +} + +; Function Attrs: noinline +define weak_odr dso_local void @_ZTS12first_kernel() { +entry: +; CHECK: %0 = alloca [3 x i32], align 4 +; CHECK: %1 = bitcast [3 x i32]* %0 to i8* +; CHECK: call void @llvm.memset.p0i8.i64(i8* nonnull align 4 dereferenceable(12) %1, i8 0, i64 12, i1 false) +; CHECK: %2 = getelementptr inbounds [3 x i32], [3 x i32]* %0, i32 0, i32 0 + %0 = call i64 @_ZTS14first_function() +; CHECK: %3 = call i64 @_ZTS14first_function(i32* %2) + ret void +} + +; CHECK: define weak_odr dso_local void @_ZTS12first_kernel_with_offset([3 x i32]* byval([3 x i32]) %0) { +; CHECK: entry: +; CHECK: %1 = bitcast [3 x i32]* %0 to i32* +; CHECK: %2 = call i64 @_ZTS14first_function(i32* %1) +; CHECK: ret void +; CHECK: } + +define weak_odr dso_local i64 @_ZTS15second_function() { +; CHECK: define weak_odr dso_local i64 @_ZTS15second_function(i32* %0) { + %1 = call i64 @_ZTS15common_function() +; CHECK: %2 = call i64 @_ZTS15common_function(i32* %0) + ret i64 %1 +} + +; Function Attrs: noinline +define weak_odr dso_local void @_ZTS13second_kernel() { +entry: +; CHECK: %0 = alloca [3 x i32], align 4 +; CHECK: %1 = bitcast [3 x i32]* %0 to i8* +; CHECK: call void @llvm.memset.p0i8.i64(i8* nonnull align 4 dereferenceable(12) %1, i8 0, i64 12, i1 false) +; CHECK: %2 = getelementptr inbounds [3 x i32], [3 x i32]* %0, i32 0, i32 0 + %0 = call i64 @_ZTS15second_function() +; CHECK: %3 = call i64 @_ZTS15second_function(i32* %2) + ret void +} + +; CHECK: define weak_odr dso_local void @_ZTS13second_kernel_with_offset([3 x i32]* byval([3 x i32]) %0) { +; CHECK: entry: +; CHECK: %1 = bitcast [3 x i32]* %0 to i32* +; CHECK: %2 = call i64 @_ZTS15second_function(i32* %1) +; CHECK: ret void +; CHECK: } + +; This function doesn't get called by a kernel entry point. 
+define weak_odr dso_local i64 @_ZTS15no_entry_point() { +; CHECK: define weak_odr dso_local i64 @_ZTS15no_entry_point(i32* %0) { + %1 = tail call i32* @llvm.nvvm.implicit.offset() +; CHECK-NOT: tail call i32* @llvm.nvvm.implicit.offset() + %2 = getelementptr inbounds i32, i32* %1, i64 2 +; CHECK: %2 = getelementptr inbounds i32, i32* %0, i64 2 + %3 = load i32, i32* %2, align 4 + %4 = zext i32 %3 to i64 + ret i64 %4 +} + +!nvvm.annotations = !{!0, !1, !2, !1, !3, !3, !3, !3, !4, !4, !3, !5, !6} +; CHECK: !nvvm.annotations = !{!0, !1, !2, !1, !3, !3, !3, !3, !4, !4, !3, !5, !6, !7, !8} +!nvvmir.version = !{!9} + +!0 = distinct !{void ()* @_ZTS12first_kernel, !"kernel", i32 1} +!1 = !{null, !"align", i32 8} +!2 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080} +!3 = !{null, !"align", i32 16} +!4 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088} +!5 = distinct !{void ()* @_ZTS13second_kernel, !"kernel", i32 1} +!6 = distinct !{void ()* @_ZTS12third_kernel, !"kernel", i32 1} +; CHECK: !7 = !{void ([3 x i32]*)* @_ZTS13second_kernel_with_offset, !"kernel", i32 1} +; CHECK: !8 = !{void ([3 x i32]*)* @_ZTS12first_kernel_with_offset, !"kernel", i32 1} +!9 = !{i32 1, i32 4} diff --git a/llvm/test/CodeGen/NVPTX/global-offset-simple.ll b/llvm/test/CodeGen/NVPTX/global-offset-simple.ll new file mode 100644 index 0000000000000..8c301d5b223cf --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/global-offset-simple.ll @@ -0,0 +1,52 @@ +; RUN: opt -globaloffset %s -S -o - | FileCheck %s +; ModuleID = 'simple.bc' +source_filename = "simple.ll" +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda-sycldevice" + +; This test checks that the transformation is applied in the basic case. + +declare i32* @llvm.nvvm.implicit.offset() +; CHECK-NOT: llvm.nvvm.implicit.offset + +define weak_odr dso_local i64 @_ZTS14other_function() { +; CHECK: define weak_odr dso_local i64 @_ZTS14other_function(i32* %0) { +; CHECK: %2 = getelementptr inbounds i32, i32* %0, i64 2 + %1 = tail call i32* @llvm.nvvm.implicit.offset() +; CHECK-NOT: tail call i32* @llvm.nvvm.implicit.offset() + %2 = getelementptr inbounds i32, i32* %1, i64 2 + %3 = load i32, i32* %2, align 4 + %4 = zext i32 %3 to i64 + ret i64 %4 +} + +; Function Attrs: noinline +define weak_odr dso_local void @_ZTS14example_kernel() { +entry: +; CHECK: %0 = alloca [3 x i32], align 4 +; CHECK: %1 = bitcast [3 x i32]* %0 to i8* +; CHECK: call void @llvm.memset.p0i8.i64(i8* nonnull align 4 dereferenceable(12) %1, i8 0, i64 12, i1 false) +; CHECK: %2 = getelementptr inbounds [3 x i32], [3 x i32]* %0, i32 0, i32 0 + %0 = call i64 @_ZTS14other_function() +; CHECK: %3 = call i64 @_ZTS14other_function(i32* %2) + ret void +} + +; CHECK: define weak_odr dso_local void @_ZTS14example_kernel_with_offset([3 x i32]* byval([3 x i32]) %0) { +; CHECK: entry: +; CHECK: %1 = bitcast [3 x i32]* %0 to i32* +; CHECK: %2 = call i64 @_ZTS14other_function(i32* %1) +; CHECK: ret void +; CHECK: } + +!nvvm.annotations = !{!0, !1, !2, !1, !3, !3, !3, !3, !4, !4, !3} +; CHECK: !nvvm.annotations = !{!0, !1, !2, !1, !3, !3, !3, !3, !4, !4, !3, !5} +!nvvmir.version = !{!6} + +!0 = distinct !{void ()* @_ZTS14example_kernel, !"kernel", i32 1} +!1 = !{null, !"align", i32 8} +!2 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080} +!3 = !{null, !"align", i32 16} +!4 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088} +; CHECK: !5 = !{void ([3 x i32]*)* @_ZTS14example_kernel_with_offset, 
!"kernel", i32 1} +!6 = !{i32 1, i32 4} diff --git a/llvm/test/CodeGen/NVPTX/global-offset-valid-triple.ll b/llvm/test/CodeGen/NVPTX/global-offset-valid-triple.ll new file mode 100644 index 0000000000000..e4ec130659905 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/global-offset-valid-triple.ll @@ -0,0 +1,43 @@ +; RUN: llc -march=nvptx64 -mcpu=sm_20 < %s | FileCheck %s +; ModuleID = 'valid-triple.bc' +; CHECK-NOT: LLVM ERROR: Cannot select: intrinsic %llvm.nvvm.implicit.offset +source_filename = "valid-triple.ll" +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda-sycldevice" + +; This test checks that the pass does run on cuda triples. + +declare i32* @llvm.nvvm.implicit.offset() + +define weak_odr dso_local i64 @_ZTS14other_function() { + %1 = tail call i32* @llvm.nvvm.implicit.offset() + %2 = getelementptr inbounds i32, i32* %1, i64 2 + %3 = load i32, i32* %2, align 4 + %4 = zext i32 %3 to i64 + ret i64 %4 +} + +; Function Attrs: noinline +define weak_odr dso_local void @_ZTS14example_kernel() { +entry: + %0 = call i64 @_ZTS14other_function() + ret void +} + +!nvvm.annotations = !{!0, !1, !2, !1, !3, !3, !3, !3, !4, !4, !3} +!llvm.ident = !{!7, !8} +!nvvmir.version = !{!9} +!llvm.module.flags = !{!10, !11} + +!0 = distinct !{void ()* @_ZTS14example_kernel, !"kernel", i32 1} +!1 = !{null, !"align", i32 8} +!2 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080} +!3 = !{null, !"align", i32 16} +!4 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088} +!5 = !{i32 1, i32 2} +!6 = !{i32 4, i32 100000} +!7 = !{!"clang version 9.0.0"} +!8 = !{!"clang version 9.0.0"} +!9 = !{i32 1, i32 4} +!10 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 0]} +!11 = !{i32 1, !"wchar_size", i32 4} diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index c839ab88707df..0302643294e6c 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -1997,13 +1997,26 @@ pi_result cuda_piKernelCreate(pi_program program, const char *kernel_name, try { ScopedContext active(program->get_context()); + CUfunction cuFunc; retErr = PI_CHECK_ERROR( cuModuleGetFunction(&cuFunc, program->get(), kernel_name)); - retKernel = std::unique_ptr<_pi_kernel>( - new _pi_kernel{cuFunc, kernel_name, program, program->get_context()}); + std::string kernel_name_woffset = std::string(kernel_name) + "_with_offset"; + CUfunction cuFuncWithOffsetParam; + CUresult offsetRes = cuModuleGetFunction( + &cuFuncWithOffsetParam, program->get(), kernel_name_woffset.c_str()); + + // If there is no kernel with global offset parameter we mark it as missing + if (offsetRes == CUDA_ERROR_NOT_FOUND) { + cuFuncWithOffsetParam = nullptr; + } else { + retErr = PI_CHECK_ERROR(offsetRes); + } + retKernel = std::unique_ptr<_pi_kernel>( + new _pi_kernel{cuFunc, cuFuncWithOffsetParam, kernel_name, program, + program->get_context()}); } catch (pi_result err) { retErr = err; } catch (...) 
{ @@ -2071,6 +2084,22 @@ pi_result cuda_piEnqueueKernelLaunch( retError = cuda_piEnqueueEventsWait(command_queue, num_events_in_wait_list, event_wait_list, nullptr); + // Set the implicit global offset parameter if kernel has offset variant + if (kernel->get_with_offset_parameter()) { + std::uint32_t cuda_implicit_offset[3] = {0, 0, 0}; + if (global_work_offset) { + for (size_t i = 0; i < work_dim; i++) { + cuda_implicit_offset[i] = + static_cast(global_work_offset[i]); + if (global_work_offset[i] != 0) { + cuFunc = kernel->get_with_offset_parameter(); + } + } + } + kernel->set_implicit_offset_arg(sizeof(cuda_implicit_offset), + cuda_implicit_offset); + } + // Set the number of threads per block to the number of threads per warp // by default unless user has provided a better number int threadsPerBlock[3] = {32, 1, 1}; diff --git a/sycl/plugins/cuda/pi_cuda.hpp b/sycl/plugins/cuda/pi_cuda.hpp index eac59b2a724a5..0a8ef2fe2f366 100644 --- a/sycl/plugins/cuda/pi_cuda.hpp +++ b/sycl/plugins/cuda/pi_cuda.hpp @@ -453,6 +453,7 @@ struct _pi_kernel { using native_type = CUfunction; native_type function_; + native_type functionWithOffsetParam_; std::string name_; pi_context context_; pi_program program_; @@ -475,14 +476,23 @@ struct _pi_kernel { args_index_t indices_; args_size_t offsetPerIndex_; + std::uint32_t implicitOffsetArgs_[3] = {0, 0, 0}; + + arguments() { + // Place the implicit offset index at the end of the indicies collection + indices_.emplace_back(&implicitOffsetArgs_); + } + /// Adds an argument to the kernel. /// If the argument existed before, it is replaced. /// Otherwise, it is added. /// Gaps are filled with empty arguments. + /// Implicit offset argument is kept at the back of the indices collection. void add_arg(size_t index, size_t size, const void *arg, size_t localSize = 0) { - if (index + 1 > indices_.size()) { - indices_.resize(index + 1); + if (index + 2 > indices_.size()) { + // Move implicit offset argument index with the end + indices_.resize(index + 2, indices_.back()); // Ensure enough space for the new argument paramSizes_.resize(index + 1); offsetPerIndex_.resize(index + 1); @@ -502,6 +512,11 @@ struct _pi_kernel { add_arg(index, sizeof(size_t), (const void *)&(localOffset), size); } + void set_implicit_offset(size_t size, std::uint32_t *implicitOffset) { + assert(size == sizeof(std::uint32_t) * 3); + std::memcpy(implicitOffsetArgs_, implicitOffset, size); + } + void clear_local_size() { std::fill(std::begin(offsetPerIndex_), std::end(offsetPerIndex_), 0); } @@ -514,14 +529,18 @@ struct _pi_kernel { } } args_; - _pi_kernel(CUfunction func, const char *name, pi_program program, - pi_context ctxt) - : function_{func}, name_{name}, context_{ctxt}, program_{program}, - refCount_{1} { + _pi_kernel(CUfunction func, CUfunction funcWithOffsetParam, const char *name, + pi_program program, pi_context ctxt) + : function_{func}, functionWithOffsetParam_{funcWithOffsetParam}, + name_{name}, context_{ctxt}, program_{program}, refCount_{1} { cuda_piProgramRetain(program_); cuda_piContextRetain(context_); } + _pi_kernel(CUfunction func, const char *name, pi_program program, + pi_context ctxt) + : _pi_kernel{func, nullptr, name, program, ctxt} {} + ~_pi_kernel() { cuda_piProgramRelease(program_); @@ -538,15 +557,23 @@ struct _pi_kernel { native_type get() const noexcept { return function_; }; + native_type get_with_offset_parameter() const noexcept { + return functionWithOffsetParam_; + }; + + bool has_with_offset_parameter() const noexcept { + return functionWithOffsetParam_ != 
nullptr; + } + pi_context get_context() const noexcept { return context_; }; const char *get_name() const noexcept { return name_.c_str(); } - /// Returns the number of arguments. + /// Returns the number of arguments, excluding the implicit global offset. /// Note this only returns the current known number of arguments, not the /// real one required by the kernel, since this cannot be queried from /// the CUDA Driver API - pi_uint32 get_num_args() const noexcept { return args_.indices_.size(); } + pi_uint32 get_num_args() const noexcept { return args_.indices_.size() - 1; } void set_kernel_arg(int index, size_t size, const void *arg) { args_.add_arg(index, size, arg); @@ -556,6 +583,10 @@ struct _pi_kernel { args_.add_local_arg(index, size); } + void set_implicit_offset_arg(size_t size, std::uint32_t *implicitOffset) { + args_.set_implicit_offset(size, implicitOffset); + } + arguments::args_index_t get_arg_indices() const { return args_.get_indices(); } diff --git a/sycl/test/basic_tests/parallel_for_indexers.cpp b/sycl/test/basic_tests/parallel_for_indexers.cpp index ade6238b8c6c0..3d20ec3d66903 100644 --- a/sycl/test/basic_tests/parallel_for_indexers.cpp +++ b/sycl/test/basic_tests/parallel_for_indexers.cpp @@ -8,7 +8,7 @@ // TODO: Unexpected result // TODO: _indexers.cpp:37: int main(): Assertion `id == -1' failed. -// XFAIL: cuda || level0 +// XFAIL: level0 #include diff --git a/sycl/unittests/pi/cuda/test_kernels.cpp b/sycl/unittests/pi/cuda/test_kernels.cpp index 02ac6549e61e3..400ddf03d84b7 100644 --- a/sycl/unittests/pi/cuda/test_kernels.cpp +++ b/sycl/unittests/pi/cuda/test_kernels.cpp @@ -16,6 +16,9 @@ #include #include +// PI CUDA kernels carry an additional argument for the implicit global offset. +#define NUM_IMPLICIT_ARGS 1 + using namespace cl::sycl; struct CudaKernelsTest : public ::testing::Test { @@ -172,7 +175,7 @@ TEST_F(CudaKernelsTest, PIKernelArgumentSimple) { kern, 0, sizeof(int), &number)), PI_SUCCESS); const auto &kernArgs = kern->get_arg_indices(); - ASSERT_EQ(kernArgs.size(), (size_t)1); + ASSERT_EQ(kernArgs.size(), (size_t)1 + NUM_IMPLICIT_ARGS); int storedValue = *(static_cast(kernArgs[0])); ASSERT_EQ(storedValue, number); } @@ -201,7 +204,7 @@ TEST_F(CudaKernelsTest, PIKernelArgumentSetTwice) { kern, 0, sizeof(int), &number)), PI_SUCCESS); const auto &kernArgs = kern->get_arg_indices(); - ASSERT_GT(kernArgs.size(), (size_t)0); + ASSERT_GT(kernArgs.size(), (size_t)0 + NUM_IMPLICIT_ARGS); int storedValue = *(static_cast(kernArgs[0])); ASSERT_EQ(storedValue, number); @@ -210,7 +213,7 @@ TEST_F(CudaKernelsTest, PIKernelArgumentSetTwice) { kern, 0, sizeof(int), &otherNumber)), PI_SUCCESS); const auto &kernArgs2 = kern->get_arg_indices(); - ASSERT_EQ(kernArgs2.size(), (size_t)1); + ASSERT_EQ(kernArgs2.size(), (size_t)1 + NUM_IMPLICIT_ARGS); storedValue = *(static_cast(kernArgs2[0])); ASSERT_EQ(storedValue, otherNumber); } @@ -244,7 +247,7 @@ TEST_F(CudaKernelsTest, PIKernelSetMemObj) { kern, 0, sizeof(pi_mem), &memObj)), PI_SUCCESS); const auto &kernArgs = kern->get_arg_indices(); - ASSERT_EQ(kernArgs.size(), (size_t)1); + ASSERT_EQ(kernArgs.size(), (size_t)1 + NUM_IMPLICIT_ARGS); pi_mem storedValue = *(static_cast(kernArgs[0])); ASSERT_EQ(storedValue, memObj); } @@ -369,7 +372,7 @@ TEST_F(CudaKernelsTest, PIKernelArgumentSetTwiceOneLocal) { kern, 0, sizeof(int), &number)), PI_SUCCESS); const auto &kernArgs = kern->get_arg_indices(); - ASSERT_GT(kernArgs.size(), (size_t)0); + ASSERT_GT(kernArgs.size(), (size_t)0 + NUM_IMPLICIT_ARGS); int storedValue = 
*(static_cast(kernArgs[0])); ASSERT_EQ(storedValue, number); @@ -377,7 +380,7 @@ TEST_F(CudaKernelsTest, PIKernelArgumentSetTwiceOneLocal) { kern, 1, sizeof(int), nullptr)), PI_SUCCESS); const auto &kernArgs2 = kern->get_arg_indices(); - ASSERT_EQ(kernArgs2.size(), (size_t)2); + ASSERT_EQ(kernArgs2.size(), (size_t)2 + NUM_IMPLICIT_ARGS); storedValue = *(static_cast(kernArgs2[1])); ASSERT_EQ(storedValue, 0); @@ -385,7 +388,7 @@ TEST_F(CudaKernelsTest, PIKernelArgumentSetTwiceOneLocal) { kern, 2, sizeof(int), nullptr)), PI_SUCCESS); const auto &kernArgs3 = kern->get_arg_indices(); - ASSERT_EQ(kernArgs3.size(), (size_t)3); + ASSERT_EQ(kernArgs3.size(), (size_t)3 + NUM_IMPLICIT_ARGS); storedValue = *(static_cast(kernArgs3[2])); ASSERT_EQ(storedValue, static_cast(sizeof(int))); }
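For context, a minimal host-side sketch of the situation this patch addresses (illustrative only; the kernel name, sizes and offset value are made up): launching a SYCL kernel with a non-zero global offset on the CUDA backend. With this change the plugin selects the `_with_offset` variant of the kernel and passes the offset by value, while a zero or absent offset keeps the original function.

#include <CL/sycl.hpp>

using namespace cl::sycl;

int main() {
  queue q{gpu_selector{}};
  buffer<int, 1> buf{range<1>{16}};

  q.submit([&](handler &cgh) {
    auto acc = buf.get_access<access::mode::write>(cgh);
    // A global offset of 4: work-items see ids 4..15 instead of 0..11. With
    // this patch the offset reaches the device through the implicit
    // [3 x i32] kernel parameter instead of reading as zero.
    cgh.parallel_for<class offset_kernel>(
        range<1>{12}, id<1>{4},
        [=](item<1> it) { acc[it.get_id(0)] = 1; });
  });
  q.wait();
}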