diff --git a/sycl/source/detail/context_impl.cpp b/sycl/source/detail/context_impl.cpp index c1421c749db5d..d841b32685118 100644 --- a/sycl/source/detail/context_impl.cpp +++ b/sycl/source/detail/context_impl.cpp @@ -423,7 +423,7 @@ std::optional context_impl::getProgramForDeviceGlobal( } if (!BuildRes) return std::nullopt; - return MKernelProgramCache.waitUntilBuilt(BuildRes); + return *MKernelProgramCache.waitUntilBuilt(BuildRes); } } // namespace detail diff --git a/sycl/source/detail/jit_compiler.cpp b/sycl/source/detail/jit_compiler.cpp index 7dcc5814dd334..4e092ef12c4e3 100644 --- a/sycl/source/detail/jit_compiler.cpp +++ b/sycl/source/detail/jit_compiler.cpp @@ -578,6 +578,7 @@ jit_compiler::fuseKernels(QueueImplPtr Queue, } const RTDeviceBinaryImage *DeviceImage = nullptr; RT::PiProgram Program = nullptr; + const KernelArgMask *EliminatedArgs = nullptr; if (KernelCG->getKernelBundle() != nullptr) { // Retrieve the device image from the kernel bundle. auto KernelBundle = KernelCG->getKernelBundle(); @@ -589,10 +590,12 @@ jit_compiler::fuseKernels(QueueImplPtr Queue, DeviceImage = SyclKernel->getDeviceImage()->get_bin_image_ref(); Program = SyclKernel->getDeviceImage()->get_program_ref(); + EliminatedArgs = SyclKernel->getKernelArgMask(); } else if (KernelCG->MSyclKernel != nullptr) { DeviceImage = KernelCG->MSyclKernel->getDeviceImage()->get_bin_image_ref(); Program = KernelCG->MSyclKernel->getDeviceImage()->get_program_ref(); + EliminatedArgs = KernelCG->MSyclKernel->getKernelArgMask(); } else { auto ContextImpl = Queue->getContextImplPtr(); auto Context = detail::createSyclObjFromImpl(ContextImpl); @@ -602,18 +605,14 @@ jit_compiler::fuseKernels(QueueImplPtr Queue, KernelCG->MOSModuleHandle, KernelName, Context, Device); Program = detail::ProgramManager::getInstance().createPIProgram( *DeviceImage, Context, Device); + EliminatedArgs = + detail::ProgramManager::getInstance().getEliminatedKernelArgMask( + KernelCG->MOSModuleHandle, Program, KernelName); } if (!DeviceImage || !Program) { printPerformanceWarning("No suitable IR available for fusion"); return nullptr; } - ProgramManager::KernelArgMask EliminatedArgs; - if (Program && (KernelCG->MSyclKernel == nullptr || - !KernelCG->MSyclKernel->isCreatedFromSource())) { - EliminatedArgs = - detail::ProgramManager::getInstance().getEliminatedKernelArgMask( - KernelCG->MOSModuleHandle, Program, KernelName); - } // Collect information about the arguments of this kernel. @@ -634,7 +633,8 @@ jit_compiler::fuseKernels(QueueImplPtr Queue, // DPC++ internally uses 'true' to indicate that an argument has been // eliminated, while the JIT compiler uses 'true' to indicate an // argument is used. Translate this here. - bool Eliminated = !EliminatedArgs.empty() && EliminatedArgs[ArgIndex++]; + bool Eliminated = EliminatedArgs && !EliminatedArgs->empty() && + (*EliminatedArgs)[ArgIndex++]; ArgDescriptor.UsageMask.emplace_back(!Eliminated); // If the argument has not been eliminated, i.e., is still present on diff --git a/sycl/source/detail/kernel_arg_mask.hpp b/sycl/source/detail/kernel_arg_mask.hpp new file mode 100644 index 0000000000000..68113a969b0a4 --- /dev/null +++ b/sycl/source/detail/kernel_arg_mask.hpp @@ -0,0 +1,33 @@ +//==----------- kernel_arg_mask.hpp - SYCL KernelArgMask -------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#pragma once + +namespace sycl { +__SYCL_INLINE_VER_NAMESPACE(_V1) { +namespace detail { +using KernelArgMask = std::vector; +inline KernelArgMask createKernelArgMask(const ByteArray &Bytes) { + const int NBytesForSize = 8; + const int NBitsInElement = 8; + std::uint64_t SizeInBits = 0; + + KernelArgMask Result; + for (int I = 0; I < NBytesForSize; ++I) + SizeInBits |= static_cast(Bytes[I]) << I * NBitsInElement; + + Result.reserve(SizeInBits); + for (std::uint64_t I = 0; I < SizeInBits; ++I) { + std::uint8_t Byte = Bytes[NBytesForSize + (I / NBitsInElement)]; + Result.push_back(Byte & (1 << (I % NBitsInElement))); + } + return Result; +} +} // namespace detail +} // __SYCL_INLINE_VER_NAMESPACE(_V1) +} // namespace sycl diff --git a/sycl/source/detail/kernel_bundle_impl.hpp b/sycl/source/detail/kernel_bundle_impl.hpp index 4dab1076c3033..0e45a997f45e6 100644 --- a/sycl/source/detail/kernel_bundle_impl.hpp +++ b/sycl/source/detail/kernel_bundle_impl.hpp @@ -367,13 +367,15 @@ class kernel_bundle_impl { detail::getSyclObjImpl(*It); RT::PiKernel Kernel = nullptr; - std::tie(Kernel, std::ignore) = + const KernelArgMask *ArgMask = nullptr; + std::tie(Kernel, std::ignore, ArgMask) = detail::ProgramManager::getInstance().getOrCreateKernel( MContext, KernelID.get_name(), /*PropList=*/{}, DeviceImageImpl->get_program_ref()); - std::shared_ptr KernelImpl = std::make_shared( - Kernel, detail::getSyclObjImpl(MContext), DeviceImageImpl, Self); + std::shared_ptr KernelImpl = + std::make_shared(Kernel, detail::getSyclObjImpl(MContext), + DeviceImageImpl, Self, ArgMask); return detail::createSyclObjFromImpl(KernelImpl); } diff --git a/sycl/source/detail/kernel_impl.cpp b/sycl/source/detail/kernel_impl.cpp index cc4409b1c9dd6..8784cd967b3ce 100644 --- a/sycl/source/detail/kernel_impl.cpp +++ b/sycl/source/detail/kernel_impl.cpp @@ -18,10 +18,11 @@ __SYCL_INLINE_VER_NAMESPACE(_V1) { namespace detail { kernel_impl::kernel_impl(RT::PiKernel Kernel, ContextImplPtr Context, - KernelBundleImplPtr KernelBundleImpl) + KernelBundleImplPtr KernelBundleImpl, + const KernelArgMask *ArgMask) : kernel_impl(Kernel, Context, std::make_shared(Context, Kernel), - /*IsCreatedFromSource*/ true, KernelBundleImpl) { + /*IsCreatedFromSource*/ true, KernelBundleImpl, ArgMask) { // Enable USM indirect access for interoperability kernels. // Some PI Plugins (like OpenCL) require this call to enable USM // For others, PI will turn this into a NOP. @@ -34,11 +35,13 @@ kernel_impl::kernel_impl(RT::PiKernel Kernel, ContextImplPtr Context, kernel_impl::kernel_impl(RT::PiKernel Kernel, ContextImplPtr ContextImpl, ProgramImplPtr ProgramImpl, bool IsCreatedFromSource, - KernelBundleImplPtr KernelBundleImpl) + KernelBundleImplPtr KernelBundleImpl, + const KernelArgMask *ArgMask) : MKernel(Kernel), MContext(ContextImpl), MProgramImpl(std::move(ProgramImpl)), MCreatedFromSource(IsCreatedFromSource), - MKernelBundleImpl(std::move(KernelBundleImpl)) { + MKernelBundleImpl(std::move(KernelBundleImpl)), + MKernelArgMaskPtr{ArgMask} { RT::PiContext Context = nullptr; // Using the plugin from the passed ContextImpl @@ -54,10 +57,12 @@ kernel_impl::kernel_impl(RT::PiKernel Kernel, ContextImplPtr ContextImpl, kernel_impl::kernel_impl(RT::PiKernel Kernel, ContextImplPtr ContextImpl, DeviceImageImplPtr DeviceImageImpl, - KernelBundleImplPtr KernelBundleImpl) + KernelBundleImplPtr KernelBundleImpl, + const KernelArgMask *ArgMask) : MKernel(Kernel), MContext(std::move(ContextImpl)), MProgramImpl(nullptr), MCreatedFromSource(false), MDeviceImageImpl(std::move(DeviceImageImpl)), - MKernelBundleImpl(std::move(KernelBundleImpl)) { + MKernelBundleImpl(std::move(KernelBundleImpl)), + MKernelArgMaskPtr{ArgMask} { // kernel_impl shared ownership of kernel handle if (!is_host()) { diff --git a/sycl/source/detail/kernel_impl.hpp b/sycl/source/detail/kernel_impl.hpp index beb59234e415f..dd36c5648d06d 100644 --- a/sycl/source/detail/kernel_impl.hpp +++ b/sycl/source/detail/kernel_impl.hpp @@ -10,6 +10,7 @@ #include #include +#include #include #include #include @@ -42,7 +43,8 @@ class kernel_impl { /// \param Context is a valid SYCL context /// \param KernelBundleImpl is a valid instance of kernel_bundle_impl kernel_impl(RT::PiKernel Kernel, ContextImplPtr Context, - KernelBundleImplPtr KernelBundleImpl); + KernelBundleImplPtr KernelBundleImpl, + const KernelArgMask *ArgMask = nullptr); /// Constructs a SYCL kernel instance from a SYCL program and a PiKernel /// @@ -59,7 +61,8 @@ class kernel_impl { /// \param KernelBundleImpl is a valid instance of kernel_bundle_impl kernel_impl(RT::PiKernel Kernel, ContextImplPtr ContextImpl, ProgramImplPtr ProgramImpl, bool IsCreatedFromSource, - KernelBundleImplPtr KernelBundleImpl); + KernelBundleImplPtr KernelBundleImpl, + const KernelArgMask *ArgMask); /// Constructs a SYCL kernel_impl instance from a SYCL device_image, /// kernel_bundle and / PiKernel. @@ -69,7 +72,8 @@ class kernel_impl { /// \param KernelBundleImpl is a valid instance of kernel_bundle_impl kernel_impl(RT::PiKernel Kernel, ContextImplPtr ContextImpl, DeviceImageImplPtr DeviceImageImpl, - KernelBundleImplPtr KernelBundleImpl); + KernelBundleImplPtr KernelBundleImpl, + const KernelArgMask *ArgMask); /// Constructs a SYCL kernel for host device /// @@ -177,6 +181,8 @@ class kernel_impl { return MNoncacheableEnqueueMutex; } + const KernelArgMask *getKernelArgMask() const { return MKernelArgMaskPtr; } + private: RT::PiKernel MKernel; const ContextImplPtr MContext; @@ -186,6 +192,7 @@ class kernel_impl { const KernelBundleImplPtr MKernelBundleImpl; bool MIsInterop = false; std::mutex MNoncacheableEnqueueMutex; + const KernelArgMask *MKernelArgMaskPtr; bool isBuiltInKernel(const device &Device) const; void checkIfValidForNumArgsInfoQuery() const; diff --git a/sycl/source/detail/kernel_program_cache.cpp b/sycl/source/detail/kernel_program_cache.cpp index 47db378f65a49..f17e1f0ddfdd5 100644 --- a/sycl/source/detail/kernel_program_cache.cpp +++ b/sycl/source/detail/kernel_program_cache.cpp @@ -16,28 +16,28 @@ namespace detail { KernelProgramCache::~KernelProgramCache() { for (auto &ProgIt : MCachedPrograms.Cache) { ProgramWithBuildStateT &ProgWithState = ProgIt.second; - PiProgramT *ToBeDeleted = ProgWithState.Ptr.load(); + RT::PiProgram *ToBeDeleted = ProgWithState.Ptr.load(); if (!ToBeDeleted) continue; - auto KernIt = MKernelsPerProgramCache.find(ToBeDeleted); + auto KernIt = MKernelsPerProgramCache.find(*ToBeDeleted); if (KernIt != MKernelsPerProgramCache.end()) { for (auto &p : KernIt->second) { - KernelWithBuildStateT &KernelWithState = p.second; - PiKernelT *Kern = KernelWithState.Ptr.load(); + BuildResult &KernelWithState = p.second; + KernelArgMaskPairT *KernelArgMaskPair = KernelWithState.Ptr.load(); - if (Kern) { + if (KernelArgMaskPair) { const detail::plugin &Plugin = MParentContext->getPlugin(); - Plugin.call(Kern); + Plugin.call(KernelArgMaskPair->first); } } MKernelsPerProgramCache.erase(KernIt); } const detail::plugin &Plugin = MParentContext->getPlugin(); - Plugin.call(ToBeDeleted); + Plugin.call(*ToBeDeleted); } } } // namespace detail diff --git a/sycl/source/detail/kernel_program_cache.hpp b/sycl/source/detail/kernel_program_cache.hpp index 1cc3b3adf88ee..3e98a1764c7cd 100644 --- a/sycl/source/detail/kernel_program_cache.hpp +++ b/sycl/source/detail/kernel_program_cache.hpp @@ -48,6 +48,7 @@ class KernelProgramCache { /// Currently there is only a single user - ProgramManager class. template struct BuildResult { std::atomic Ptr; + T Val; std::atomic State; BuildError Error; @@ -68,9 +69,7 @@ class KernelProgramCache { BuildResult(T *P, BuildState S) : Ptr{P}, State{S}, Error{"", 0} {} }; - using PiProgramT = std::remove_pointer::type; - using PiProgramPtrT = std::atomic; - using ProgramWithBuildStateT = BuildResult; + using ProgramWithBuildStateT = BuildResult; using ProgramCacheKeyT = std::pair, std::pair>; using CommonProgramKeyT = std::pair; @@ -84,18 +83,15 @@ class KernelProgramCache { using ContextPtr = context_impl *; - using PiKernelT = std::remove_pointer::type; - - using PiKernelPtrT = std::atomic; - using KernelWithBuildStateT = BuildResult; - using KernelByNameT = std::map; + using KernelArgMaskPairT = std::pair; + using KernelByNameT = std::map>; using KernelCacheT = std::map; using KernelFastCacheKeyT = std::tuple; - using KernelFastCacheValT = - std::tuple; + using KernelFastCacheValT = std::tuple; using KernelFastCacheT = std::map; ~KernelProgramCache(); @@ -128,7 +124,7 @@ class KernelProgramCache { return std::make_pair(&Inserted.first->second, Inserted.second); } - std::pair + std::pair *, bool> getOrInsertKernel(RT::PiProgram Program, const std::string &KernelName) { auto LockedCache = acquireKernelsPerProgramCache(); auto &Cache = LockedCache.get()[Program]; @@ -173,7 +169,7 @@ class KernelProgramCache { if (It != MKernelFastCache.end()) { return It->second; } - return std::make_tuple(nullptr, nullptr, nullptr); + return std::make_tuple(nullptr, nullptr, nullptr, nullptr); } template diff --git a/sycl/source/detail/program_impl.cpp b/sycl/source/detail/program_impl.cpp index 59fb15178114e..7362b1a8bad71 100644 --- a/sycl/source/detail/program_impl.cpp +++ b/sycl/source/detail/program_impl.cpp @@ -351,9 +351,9 @@ kernel program_impl::get_kernel(std::string KernelName, return createSyclObjFromImpl( std::make_shared(MContext, PtrToSelf)); } - return createSyclObjFromImpl( - std::make_shared(get_pi_kernel(KernelName), MContext, - PtrToSelf, IsCreatedFromSource, nullptr)); + auto [Kernel, ArgMask] = get_pi_kernel_arg_mask_pair(KernelName); + return createSyclObjFromImpl(std::make_shared( + Kernel, MContext, PtrToSelf, IsCreatedFromSource, nullptr, ArgMask)); } std::vector> program_impl::get_binaries() const { @@ -447,19 +447,20 @@ std::vector program_impl::get_pi_devices() const { return PiDevices; } -RT::PiKernel program_impl::get_pi_kernel(const std::string &KernelName) const { - RT::PiKernel Kernel = nullptr; +std::pair +program_impl::get_pi_kernel_arg_mask_pair(const std::string &KernelName) const { + std::pair Result; if (is_cacheable()) { - std::tie(Kernel, std::ignore, std::ignore) = + std::tie(Result.first, std::ignore, Result.second, std::ignore) = ProgramManager::getInstance().getOrCreateKernel( MProgramModuleHandle, detail::getSyclObjImpl(get_context()), detail::getSyclObjImpl(get_devices()[0]), KernelName, this); - getPlugin().call(Kernel); + getPlugin().call(Result.first); } else { const detail::plugin &Plugin = getPlugin(); RT::PiResult Err = Plugin.call_nocheck( - MProgram, KernelName.c_str(), &Kernel); + MProgram, KernelName.c_str(), &Result.first); if (Err == PI_ERROR_INVALID_KERNEL_NAME) { throw invalid_object_error( "This instance of program does not contain the kernel requested", @@ -469,11 +470,11 @@ RT::PiKernel program_impl::get_pi_kernel(const std::string &KernelName) const { // Some PI Plugins (like OpenCL) require this call to enable USM // For others, PI will turn this into a NOP. - Plugin.call(Kernel, PI_USM_INDIRECT_ACCESS, - sizeof(pi_bool), &PI_TRUE); + Plugin.call( + Result.first, PI_USM_INDIRECT_ACCESS, sizeof(pi_bool), &PI_TRUE); } - return Kernel; + return Result; } std::vector diff --git a/sycl/source/detail/program_impl.hpp b/sycl/source/detail/program_impl.hpp index 04a785d261102..b35b6baf4295a 100644 --- a/sycl/source/detail/program_impl.hpp +++ b/sycl/source/detail/program_impl.hpp @@ -405,7 +405,8 @@ class program_impl { /// \param KernelName is a string containing PI kernel name. /// \return an instance of PI kernel with specific name. If kernel is /// unavailable, an invalid_object_error exception is thrown. - RT::PiKernel get_pi_kernel(const std::string &KernelName) const; + std::pair + get_pi_kernel_arg_mask_pair(const std::string &KernelName) const; /// \return a vector of sorted in ascending order SYCL devices. std::vector sort_devices_by_cl_device_id(std::vector Devices); diff --git a/sycl/source/detail/program_manager/program_manager.cpp b/sycl/source/detail/program_manager/program_manager.cpp index a84bc290430ea..05c8b0feb0de5 100644 --- a/sycl/source/detail/program_manager/program_manager.cpp +++ b/sycl/source/detail/program_manager/program_manager.cpp @@ -169,7 +169,8 @@ getOrBuild(KernelProgramCache &KPCache, GetCachedBuildFT &&GetCachedBuild, // only the building thread will run this try { - RetT *Desired = Build(); + BuildResult->Val = Build(); + RetT *Desired = &BuildResult->Val; #ifndef NDEBUG RetT *Expected = nullptr; @@ -532,8 +533,6 @@ RT::PiProgram ProgramManager::getBuiltPIProgram( // kernel built with different options is present in the fat binary. KernelSetId KSId = getKernelSetId(M, KernelName); - using PiProgramT = KernelProgramCache::PiProgramT; - KernelProgramCache &Cache = ContextImpl->getKernelProgramCache(); std::string CompileOpts; @@ -676,14 +675,14 @@ RT::PiProgram ProgramManager::getBuiltPIProgram( return Cache.getOrInsertProgram(CacheKey); }; - auto BuildResult = getOrBuild( + auto BuildResult = getOrBuild( Cache, GetCachedBuildF, BuildF); // getOrBuild is not supposed to return nullptr assert(BuildResult != nullptr && "Invalid build result"); - return BuildResult->Ptr.load(); + return *BuildResult->Ptr.load(); } -std::tuple +std::tuple ProgramManager::getOrCreateKernel(OSModuleHandle M, const ContextImplPtr &ContextImpl, const DeviceImplPtr &DeviceImpl, @@ -695,7 +694,7 @@ ProgramManager::getOrCreateKernel(OSModuleHandle M, << KernelName << ")\n"; } - using PiKernelT = KernelProgramCache::PiKernelT; + using KernelArgMaskPairT = KernelProgramCache::KernelArgMaskPairT; KernelProgramCache &Cache = ContextImpl->getKernelProgramCache(); @@ -717,31 +716,35 @@ ProgramManager::getOrCreateKernel(OSModuleHandle M, RT::PiProgram Program = getBuiltPIProgram(M, ContextImpl, DeviceImpl, KernelName, Prg); - auto BuildF = [&Program, &KernelName, &ContextImpl] { - PiKernelT *Result = nullptr; + auto BuildF = [this, &Program, &KernelName, &ContextImpl, M] { + RT::PiKernel Kernel = nullptr; const detail::plugin &Plugin = ContextImpl->getPlugin(); Plugin.call( - Program, KernelName.c_str(), &Result); + Program, KernelName.c_str(), &Kernel); // Some PI Plugins (like OpenCL) require this call to enable USM // For others, PI will turn this into a NOP. - Plugin.call(Result, PI_USM_INDIRECT_ACCESS, + Plugin.call(Kernel, PI_USM_INDIRECT_ACCESS, sizeof(pi_bool), &PI_TRUE); - return Result; + const KernelArgMask *ArgMask = + getEliminatedKernelArgMask(M, Program, KernelName); + return std::make_pair(Kernel, ArgMask); }; auto GetCachedBuildF = [&Cache, &KernelName, Program]() { return Cache.getOrInsertKernel(Program, KernelName); }; - auto BuildResult = getOrBuild( + auto BuildResult = getOrBuild( Cache, GetCachedBuildF, BuildF); // getOrBuild is not supposed to return nullptr assert(BuildResult != nullptr && "Invalid build result"); - auto ret_val = std::make_tuple(BuildResult->Ptr.load(), - &(BuildResult->MBuildResultMutex), Program); + const KernelArgMaskPairT &KernelArgMaskPair = *BuildResult->Ptr.load(); + auto ret_val = std::make_tuple(KernelArgMaskPair.first, + &(BuildResult->MBuildResultMutex), + KernelArgMaskPair.second, Program); Cache.saveKernel(key, ret_val); return ret_val; } @@ -1181,23 +1184,6 @@ ProgramManager::build(ProgramPtr Program, const ContextImplPtr Context, return Program; } -static ProgramManager::KernelArgMask -createKernelArgMask(const ByteArray &Bytes) { - const int NBytesForSize = 8; - const int NBitsInElement = 8; - std::uint64_t SizeInBits = 0; - for (int I = 0; I < NBytesForSize; ++I) - SizeInBits |= static_cast(Bytes[I]) << I * NBitsInElement; - - ProgramManager::KernelArgMask Result; - for (std::uint64_t I = 0; I < SizeInBits; ++I) { - std::uint8_t Byte = Bytes[NBytesForSize + (I / NBitsInElement)]; - Result.push_back(Byte & (1 << (I % NBitsInElement))); - } - - return Result; -} - void ProgramManager::cacheKernelUsesAssertInfo(OSModuleHandle M, RTDeviceBinaryImage &Img) { const RTDeviceBinaryImage::PropertyRange &AssertUsedRange = @@ -1538,26 +1524,28 @@ uint32_t ProgramManager::getDeviceLibReqMask(const RTDeviceBinaryImage &Img) { return 0xFFFFFFFF; } -// TODO consider another approach with storing the masks in the integration -// header instead. -ProgramManager::KernelArgMask ProgramManager::getEliminatedKernelArgMask( - OSModuleHandle M, pi::PiProgram NativePrg, const std::string &KernelName) { - // If instructed to use a spv file, assume no eliminated arguments. - if (m_UseSpvFile && M == OSUtil::ExeModuleHandle) - return {}; - +// This version does not check m_UseSpvFile, but it's used in the kernel_bundle +// path, which does not currently check it and always uses images from the fat +// binary anyway. +// TODO consider making m_UseSpvFile interact with kernel bundles as well. +const KernelArgMask * +ProgramManager::getEliminatedKernelArgMask(pi::PiProgram NativePrg, + const std::string &KernelName) { // Bail out if there are no eliminated kernel arg masks in our images if (m_EliminatedKernelArgMasks.empty()) - return {}; + return nullptr; { std::lock_guard Lock(MNativeProgramsMutex); auto ImgIt = NativePrograms.find(NativePrg); if (ImgIt != NativePrograms.end()) { auto MapIt = m_EliminatedKernelArgMasks.find(ImgIt->second); - if (MapIt != m_EliminatedKernelArgMasks.end()) - return MapIt->second[KernelName]; - return {}; + if (MapIt != m_EliminatedKernelArgMasks.end()) { + auto ArgMaskMapIt = MapIt->second.find(KernelName); + if (ArgMaskMapIt != MapIt->second.end()) + return &MapIt->second[KernelName]; + } + return nullptr; } } @@ -1566,11 +1554,21 @@ ProgramManager::KernelArgMask ProgramManager::getEliminatedKernelArgMask( for (auto &Elem : m_EliminatedKernelArgMasks) { auto ArgMask = Elem.second.find(KernelName); if (ArgMask != Elem.second.end()) - return ArgMask->second; + return &ArgMask->second; } // The kernel is not generated by DPCPP stack, so a mask doesn't exist for it - return {}; + return nullptr; +} + +// TODO consider another approach with storing the masks in the integration +// header instead. +const KernelArgMask *ProgramManager::getEliminatedKernelArgMask( + OSModuleHandle M, pi::PiProgram NativePrg, const std::string &KernelName) { + // If instructed to use a spv file, assume no eliminated arguments. + if (m_UseSpvFile && M == OSUtil::ExeModuleHandle) + return nullptr; + return getEliminatedKernelArgMask(NativePrg, KernelName); } static bundle_state getBinImageState(const RTDeviceBinaryImage *BinImage) { @@ -2225,8 +2223,6 @@ device_image_plain ProgramManager::build(const device_image_plain &DeviceImage, const ContextImplPtr ContextImpl = getSyclObjImpl(Context); - using PiProgramT = KernelProgramCache::PiProgramT; - KernelProgramCache &Cache = ContextImpl->getKernelProgramCache(); std::string CompileOpts; @@ -2310,12 +2306,12 @@ device_image_plain ProgramManager::build(const device_image_plain &DeviceImage, }; // TODO: Throw SYCL2020 style exception - auto BuildResult = getOrBuild( + auto BuildResult = getOrBuild( Cache, GetCachedBuildF, BuildF); // getOrBuild is not supposed to return nullptr assert(BuildResult != nullptr && "Invalid build result"); - RT::PiProgram ResProgram = BuildResult->Ptr.load(); + RT::PiProgram ResProgram = *BuildResult->Ptr.load(); // Cache supports key with once device only, but here we have multiple // devices a program is built for, so add the program to the cache for all @@ -2334,8 +2330,8 @@ device_image_plain ProgramManager::build(const device_image_plain &DeviceImage, // Change device in the cache key to reduce copying of spec const data. CacheKey.second.first = PiDeviceAdd; - getOrBuild(Cache, GetCachedBuildF, - CacheOtherDevices); + getOrBuild(Cache, GetCachedBuildF, + CacheOtherDevices); // getOrBuild is not supposed to return nullptr assert(BuildResult != nullptr && "Invalid build result"); } @@ -2354,41 +2350,45 @@ device_image_plain ProgramManager::build(const device_image_plain &DeviceImage, return createSyclObjFromImpl(ExecImpl); } -std::pair ProgramManager::getOrCreateKernel( - const context &Context, const std::string &KernelName, - const property_list &PropList, RT::PiProgram Program) { +std::tuple +ProgramManager::getOrCreateKernel(const context &Context, + const std::string &KernelName, + const property_list &PropList, + RT::PiProgram Program) { (void)PropList; const ContextImplPtr Ctx = getSyclObjImpl(Context); - using PiKernelT = KernelProgramCache::PiKernelT; - KernelProgramCache &Cache = Ctx->getKernelProgramCache(); - auto BuildF = [&Program, &KernelName, &Ctx] { - PiKernelT *Result = nullptr; + auto BuildF = [this, &Program, &KernelName, &Ctx] { + RT::PiKernel Kernel = nullptr; const detail::plugin &Plugin = Ctx->getPlugin(); Plugin.call(Program, KernelName.c_str(), - &Result); + &Kernel); - Plugin.call(Result, PI_USM_INDIRECT_ACCESS, + Plugin.call(Kernel, PI_USM_INDIRECT_ACCESS, sizeof(pi_bool), &PI_TRUE); - return Result; + const KernelArgMask *KernelArgMask = + getEliminatedKernelArgMask(Program, KernelName); + return std::make_pair(Kernel, KernelArgMask); }; auto GetCachedBuildF = [&Cache, &KernelName, Program]() { return Cache.getOrInsertKernel(Program, KernelName); }; - auto BuildResult = getOrBuild( - Cache, GetCachedBuildF, BuildF); + auto BuildResult = + getOrBuild( + Cache, GetCachedBuildF, BuildF); // getOrBuild is not supposed to return nullptr assert(BuildResult != nullptr && "Invalid build result"); - return std::make_pair(BuildResult->Ptr.load(), - &(BuildResult->MBuildResultMutex)); + return std::make_tuple(BuildResult->Ptr.load()->first, + &(BuildResult->MBuildResultMutex), + BuildResult->Ptr.load()->second); } bool doesDevSupportDeviceRequirements(const device &Dev, diff --git a/sycl/source/detail/program_manager/program_manager.hpp b/sycl/source/detail/program_manager/program_manager.hpp index 3dba0a499ab61..9050e81e8a2bb 100644 --- a/sycl/source/detail/program_manager/program_manager.hpp +++ b/sycl/source/detail/program_manager/program_manager.hpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -82,9 +83,6 @@ enum class DeviceLibExt : std::uint32_t { // that is necessary for no interoperability cases with lambda. class ProgramManager { public: - // TODO use a custom dynamic bitset instead to make initialization simpler. - using KernelArgMask = std::vector; - // Returns the single instance of the program manager for the entire // process. Can only be called after staticInit is done. static ProgramManager &getInstance(); @@ -148,7 +146,7 @@ class ProgramManager { const property_list &PropList, bool JITCompilationIsRequired = false); - std::tuple + std::tuple getOrCreateKernel(OSModuleHandle M, const ContextImplPtr &ContextImpl, const DeviceImplPtr &DeviceImpl, const std::string &KernelName, const program_impl *Prg); @@ -177,15 +175,23 @@ class ProgramManager { const RTDeviceBinaryImage *Img = nullptr); uint32_t getDeviceLibReqMask(const RTDeviceBinaryImage &Img); + /// Returns the mask for eliminated kernel arguments for the requested kernel + /// within the native program. + /// \param NativePrg the PI program associated with the kernel. + /// \param KernelName the name of the kernel. + const KernelArgMask * + getEliminatedKernelArgMask(pi::PiProgram NativePrg, + const std::string &KernelName); + /// Returns the mask for eliminated kernel arguments for the requested kernel /// within the native program. /// \param M identifies the OS module the kernel comes from (multiple OS /// modules may have kernels with the same name). /// \param NativePrg the PI program associated with the kernel. /// \param KernelName the name of the kernel. - KernelArgMask getEliminatedKernelArgMask(OSModuleHandle M, - pi::PiProgram NativePrg, - const std::string &KernelName); + const KernelArgMask * + getEliminatedKernelArgMask(OSModuleHandle M, pi::PiProgram NativePrg, + const std::string &KernelName); // The function returns the unique SYCL kernel identifier associated with a // kernel name. @@ -283,7 +289,7 @@ class ProgramManager { const std::vector &Devs, const property_list &PropList); - std::pair + std::tuple getOrCreateKernel(const context &Context, const std::string &KernelName, const property_list &PropList, RT::PiProgram Program); diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 4ab7c6bdce954..306f2af2af00a 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -95,10 +95,9 @@ static std::string deviceToString(device Device) { } static void applyFuncOnFilteredArgs( - const ProgramManager::KernelArgMask &EliminatedArgMask, - std::vector &Args, + const KernelArgMask *EliminatedArgMask, std::vector &Args, std::function Func) { - if (EliminatedArgMask.empty()) { + if (!EliminatedArgMask) { for (ArgDesc &Arg : Args) { Func(Arg, Arg.MIndex); } @@ -116,11 +115,11 @@ static void applyFuncOnFilteredArgs( // Handle potential gaps in set arguments (e. g. if some of them are // set on the user side). for (int Idx = LastIndex + 1; Idx < Arg.MIndex; ++Idx) - if (!EliminatedArgMask[Idx]) + if (!(*EliminatedArgMask)[Idx]) ++NextTrueIndex; LastIndex = Arg.MIndex; - if (EliminatedArgMask[Arg.MIndex]) + if ((*EliminatedArgMask)[Arg.MIndex]) continue; Func(Arg, NextTrueIndex); @@ -1948,6 +1947,7 @@ void ExecCGCommand::emitInstrumentationData() { RT::PiProgram Program = nullptr; RT::PiKernel Kernel = nullptr; std::mutex *KernelMutex = nullptr; + const KernelArgMask *EliminatedArgMask = nullptr; std::shared_ptr SyclKernelImpl; std::shared_ptr DeviceImageImpl; @@ -1964,27 +1964,23 @@ void ExecCGCommand::emitInstrumentationData() { KernelCG->MKernelName); kernel SyclKernel = KernelBundleImplPtr->get_kernel(KernelID, KernelBundleImplPtr); - Program = detail::getSyclObjImpl(SyclKernel) - ->getDeviceImage() - ->get_program_ref(); + std::shared_ptr KernelImpl = + detail::getSyclObjImpl(SyclKernel); + + EliminatedArgMask = KernelImpl->getKernelArgMask(); + Program = KernelImpl->getDeviceImage()->get_program_ref(); } else if (nullptr != KernelCG->MSyclKernel) { auto SyclProg = KernelCG->MSyclKernel->getProgramImpl(); Program = SyclProg->getHandleRef(); + if (!KernelCG->MSyclKernel->isCreatedFromSource()) + EliminatedArgMask = KernelCG->MSyclKernel->getKernelArgMask(); } else { - std::tie(Kernel, KernelMutex, Program) = + std::tie(Kernel, KernelMutex, EliminatedArgMask, Program) = detail::ProgramManager::getInstance().getOrCreateKernel( KernelCG->MOSModuleHandle, MQueue->getContextImplPtr(), MQueue->getDeviceImplPtr(), KernelCG->MKernelName, nullptr); } - ProgramManager::KernelArgMask EliminatedArgMask; - if (nullptr == KernelCG->MSyclKernel || - !KernelCG->MSyclKernel->isCreatedFromSource()) { - EliminatedArgMask = - detail::ProgramManager::getInstance().getEliminatedKernelArgMask( - KernelCG->MOSModuleHandle, Program, KernelCG->MKernelName); - } - applyFuncOnFilteredArgs(EliminatedArgMask, KernelCG->MArgs, FilterArgs); xpti::offload_kernel_enqueue_data_t KernelData{ @@ -2093,8 +2089,7 @@ static pi_result SetKernelParamsAndLaunch( const QueueImplPtr &Queue, std::vector &Args, const std::shared_ptr &DeviceImageImpl, RT::PiKernel Kernel, NDRDescT &NDRDesc, std::vector &RawEvents, - RT::PiEvent *OutEvent, - const ProgramManager::KernelArgMask &EliminatedArgMask, + RT::PiEvent *OutEvent, const KernelArgMask *EliminatedArgMask, const std::function &getMemAllocationFunc) { const detail::plugin &Plugin = Queue->getPlugin(); @@ -2236,6 +2231,7 @@ pi_int32 enqueueImpKernel( RT::PiKernel Kernel = nullptr; std::mutex *KernelMutex = nullptr; RT::PiProgram Program = nullptr; + const KernelArgMask *EliminatedArgMask; std::shared_ptr SyclKernelImpl; std::shared_ptr DeviceImageImpl; @@ -2258,7 +2254,7 @@ pi_int32 enqueueImpKernel( Program = DeviceImageImpl->get_program_ref(); - std::tie(Kernel, KernelMutex) = + std::tie(Kernel, KernelMutex, EliminatedArgMask) = detail::ProgramManager::getInstance().getOrCreateKernel( KernelBundleImplPtr->get_context(), KernelName, /*PropList=*/{}, Program); @@ -2270,7 +2266,7 @@ pi_int32 enqueueImpKernel( Program = SyclProg->getHandleRef(); if (SyclProg->is_cacheable()) { RT::PiKernel FoundKernel = nullptr; - std::tie(FoundKernel, KernelMutex, std::ignore) = + std::tie(FoundKernel, KernelMutex, EliminatedArgMask, std::ignore) = detail::ProgramManager::getInstance().getOrCreateKernel( OSModuleHandle, ContextImpl, DeviceImpl, KernelName, SyclProg.get()); @@ -2283,9 +2279,10 @@ pi_int32 enqueueImpKernel( // reuse and return existing SYCL kernels from make_native to avoid // their duplication in such cases. KernelMutex = &MSyclKernel->getNoncacheableEnqueueMutex(); + EliminatedArgMask = MSyclKernel->getKernelArgMask(); } } else { - std::tie(Kernel, KernelMutex, Program) = + std::tie(Kernel, KernelMutex, EliminatedArgMask, Program) = detail::ProgramManager::getInstance().getOrCreateKernel( OSModuleHandle, ContextImpl, DeviceImpl, KernelName, nullptr); } @@ -2309,12 +2306,6 @@ pi_int32 enqueueImpKernel( } pi_result Error = PI_SUCCESS; - ProgramManager::KernelArgMask EliminatedArgMask; - if (nullptr == MSyclKernel || !MSyclKernel->isCreatedFromSource()) { - EliminatedArgMask = - detail::ProgramManager::getInstance().getEliminatedKernelArgMask( - OSModuleHandle, Program, KernelName); - } { assert(KernelMutex); std::lock_guard Lock(*KernelMutex); diff --git a/sycl/source/kernel.cpp b/sycl/source/kernel.cpp index 5b659581c9317..69f6f2a09b7f4 100644 --- a/sycl/source/kernel.cpp +++ b/sycl/source/kernel.cpp @@ -19,7 +19,7 @@ __SYCL_INLINE_VER_NAMESPACE(_V1) { kernel::kernel(cl_kernel ClKernel, const context &SyclContext) : impl(std::make_shared( detail::pi::cast(ClKernel), - detail::getSyclObjImpl(SyclContext), nullptr)) { + detail::getSyclObjImpl(SyclContext), nullptr, nullptr)) { // This is a special interop constructor for OpenCL, so the kernel must be // retained. impl->getPlugin().call( diff --git a/sycl/unittests/program_manager/EliminatedArgMask.cpp b/sycl/unittests/program_manager/EliminatedArgMask.cpp index c59d592e07659..3284cc8c37a48 100644 --- a/sycl/unittests/program_manager/EliminatedArgMask.cpp +++ b/sycl/unittests/program_manager/EliminatedArgMask.cpp @@ -138,7 +138,7 @@ class MockHandler : public sycl::handler { } }; -sycl::detail::ProgramManager::KernelArgMask getKernelArgMaskFromBundle( +const sycl::detail::KernelArgMask *getKernelArgMaskFromBundle( const sycl::kernel_bundle &KernelBundle, std::shared_ptr QueueImpl) { @@ -193,13 +193,12 @@ TEST(EliminatedArgMask, KernelBundleWith2Kernels) { {sycl::get_kernel_id(), sycl::get_kernel_id()}); - sycl::detail::ProgramManager::KernelArgMask EliminatedArgMask = + const sycl::detail::KernelArgMask *EliminatedArgMask = getKernelArgMaskFromBundle(KernelBundle, sycl::detail::getSyclObjImpl(Queue)); - sycl::detail::ProgramManager::KernelArgMask ExpElimArgMask( - EAMTestKernelNumArgs); + sycl::detail::KernelArgMask ExpElimArgMask(EAMTestKernelNumArgs); ExpElimArgMask[0] = ExpElimArgMask[2] = true; - EXPECT_EQ(EliminatedArgMask, ExpElimArgMask); + EXPECT_EQ(*EliminatedArgMask, ExpElimArgMask); }