Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion sycl/source/detail/context_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -423,7 +423,7 @@ std::optional<RT::PiProgram> context_impl::getProgramForDeviceGlobal(
}
if (!BuildRes)
return std::nullopt;
return MKernelProgramCache.waitUntilBuilt<compile_program_error>(BuildRes);
return *MKernelProgramCache.waitUntilBuilt<compile_program_error>(BuildRes);
}

} // namespace detail
Expand Down
16 changes: 8 additions & 8 deletions sycl/source/detail/jit_compiler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -578,6 +578,7 @@ jit_compiler::fuseKernels(QueueImplPtr Queue,
}
const RTDeviceBinaryImage *DeviceImage = nullptr;
RT::PiProgram Program = nullptr;
const KernelArgMask *EliminatedArgs = nullptr;
if (KernelCG->getKernelBundle() != nullptr) {
// Retrieve the device image from the kernel bundle.
auto KernelBundle = KernelCG->getKernelBundle();
Expand All @@ -589,10 +590,12 @@ jit_compiler::fuseKernels(QueueImplPtr Queue,

DeviceImage = SyclKernel->getDeviceImage()->get_bin_image_ref();
Program = SyclKernel->getDeviceImage()->get_program_ref();
EliminatedArgs = SyclKernel->getKernelArgMask();
} else if (KernelCG->MSyclKernel != nullptr) {
DeviceImage =
KernelCG->MSyclKernel->getDeviceImage()->get_bin_image_ref();
Program = KernelCG->MSyclKernel->getDeviceImage()->get_program_ref();
EliminatedArgs = KernelCG->MSyclKernel->getKernelArgMask();
} else {
auto ContextImpl = Queue->getContextImplPtr();
auto Context = detail::createSyclObjFromImpl<context>(ContextImpl);
Expand All @@ -602,18 +605,14 @@ jit_compiler::fuseKernels(QueueImplPtr Queue,
KernelCG->MOSModuleHandle, KernelName, Context, Device);
Program = detail::ProgramManager::getInstance().createPIProgram(
*DeviceImage, Context, Device);
EliminatedArgs =
detail::ProgramManager::getInstance().getEliminatedKernelArgMask(
KernelCG->MOSModuleHandle, Program, KernelName);
}
if (!DeviceImage || !Program) {
printPerformanceWarning("No suitable IR available for fusion");
return nullptr;
}
ProgramManager::KernelArgMask EliminatedArgs;
if (Program && (KernelCG->MSyclKernel == nullptr ||
!KernelCG->MSyclKernel->isCreatedFromSource())) {
EliminatedArgs =
detail::ProgramManager::getInstance().getEliminatedKernelArgMask(
KernelCG->MOSModuleHandle, Program, KernelName);
}

// Collect information about the arguments of this kernel.

Expand All @@ -634,7 +633,8 @@ jit_compiler::fuseKernels(QueueImplPtr Queue,
// DPC++ internally uses 'true' to indicate that an argument has been
// eliminated, while the JIT compiler uses 'true' to indicate an
// argument is used. Translate this here.
bool Eliminated = !EliminatedArgs.empty() && EliminatedArgs[ArgIndex++];
bool Eliminated = EliminatedArgs && !EliminatedArgs->empty() &&
(*EliminatedArgs)[ArgIndex++];
ArgDescriptor.UsageMask.emplace_back(!Eliminated);

// If the argument has not been eliminated, i.e., is still present on
Expand Down
33 changes: 33 additions & 0 deletions sycl/source/detail/kernel_arg_mask.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
//==----------- kernel_arg_mask.hpp - SYCL KernelArgMask -------------------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#pragma once

namespace sycl {
__SYCL_INLINE_VER_NAMESPACE(_V1) {
namespace detail {
using KernelArgMask = std::vector<bool>;
inline KernelArgMask createKernelArgMask(const ByteArray &Bytes) {
const int NBytesForSize = 8;
const int NBitsInElement = 8;
std::uint64_t SizeInBits = 0;

KernelArgMask Result;
for (int I = 0; I < NBytesForSize; ++I)
SizeInBits |= static_cast<std::uint64_t>(Bytes[I]) << I * NBitsInElement;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This op is fairly simple, but a quick glance stumbles on a number of red flags.

What if the ByteArray Bytes has less than 8 elements? Or more? I'm assuming that doesn't happen in practice, but the ByteArray class itself is constructed with a Ptr and a Size.
I know the multiplication takes precedence but it parentheses around I * NBitsInElement might make it clearer.
And the static_cast is just so we can do the I= with no compiler warnings, I presume. But with all these things occurring in one line, a helpful comment might help put the reader at ease.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Makes sense, I'll follow up on this in another PR.


Result.reserve(SizeInBits);
for (std::uint64_t I = 0; I < SizeInBits; ++I) {
std::uint8_t Byte = Bytes[NBytesForSize + (I / NBitsInElement)];
Result.push_back(Byte & (1 << (I % NBitsInElement)));
}
return Result;
}
} // namespace detail
} // __SYCL_INLINE_VER_NAMESPACE(_V1)
} // namespace sycl
8 changes: 5 additions & 3 deletions sycl/source/detail/kernel_bundle_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -367,13 +367,15 @@ class kernel_bundle_impl {
detail::getSyclObjImpl(*It);

RT::PiKernel Kernel = nullptr;
std::tie(Kernel, std::ignore) =
const KernelArgMask *ArgMask = nullptr;
std::tie(Kernel, std::ignore, ArgMask) =
Comment on lines 369 to +371
Copy link
Contributor

@aelovikov-intel aelovikov-intel Apr 6, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

auto [Kernel, NameToIgnore, ArgMask] = ...

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

On second thought, I prefer the std::tie version since it allows to ignore the value without creating an unused variable.

detail::ProgramManager::getInstance().getOrCreateKernel(
MContext, KernelID.get_name(), /*PropList=*/{},
DeviceImageImpl->get_program_ref());

std::shared_ptr<kernel_impl> KernelImpl = std::make_shared<kernel_impl>(
Kernel, detail::getSyclObjImpl(MContext), DeviceImageImpl, Self);
std::shared_ptr<kernel_impl> KernelImpl =
std::make_shared<kernel_impl>(Kernel, detail::getSyclObjImpl(MContext),
DeviceImageImpl, Self, ArgMask);

return detail::createSyclObjFromImpl<kernel>(KernelImpl);
}
Expand Down
17 changes: 11 additions & 6 deletions sycl/source/detail/kernel_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,11 @@ __SYCL_INLINE_VER_NAMESPACE(_V1) {
namespace detail {

kernel_impl::kernel_impl(RT::PiKernel Kernel, ContextImplPtr Context,
KernelBundleImplPtr KernelBundleImpl)
KernelBundleImplPtr KernelBundleImpl,
const KernelArgMask *ArgMask)
: kernel_impl(Kernel, Context,
std::make_shared<program_impl>(Context, Kernel),
/*IsCreatedFromSource*/ true, KernelBundleImpl) {
/*IsCreatedFromSource*/ true, KernelBundleImpl, ArgMask) {
// Enable USM indirect access for interoperability kernels.
// Some PI Plugins (like OpenCL) require this call to enable USM
// For others, PI will turn this into a NOP.
Expand All @@ -34,11 +35,13 @@ kernel_impl::kernel_impl(RT::PiKernel Kernel, ContextImplPtr Context,

kernel_impl::kernel_impl(RT::PiKernel Kernel, ContextImplPtr ContextImpl,
ProgramImplPtr ProgramImpl, bool IsCreatedFromSource,
KernelBundleImplPtr KernelBundleImpl)
KernelBundleImplPtr KernelBundleImpl,
const KernelArgMask *ArgMask)
: MKernel(Kernel), MContext(ContextImpl),
MProgramImpl(std::move(ProgramImpl)),
MCreatedFromSource(IsCreatedFromSource),
MKernelBundleImpl(std::move(KernelBundleImpl)) {
MKernelBundleImpl(std::move(KernelBundleImpl)),
MKernelArgMaskPtr{ArgMask} {

RT::PiContext Context = nullptr;
// Using the plugin from the passed ContextImpl
Expand All @@ -54,10 +57,12 @@ kernel_impl::kernel_impl(RT::PiKernel Kernel, ContextImplPtr ContextImpl,

kernel_impl::kernel_impl(RT::PiKernel Kernel, ContextImplPtr ContextImpl,
DeviceImageImplPtr DeviceImageImpl,
KernelBundleImplPtr KernelBundleImpl)
KernelBundleImplPtr KernelBundleImpl,
const KernelArgMask *ArgMask)
: MKernel(Kernel), MContext(std::move(ContextImpl)), MProgramImpl(nullptr),
MCreatedFromSource(false), MDeviceImageImpl(std::move(DeviceImageImpl)),
MKernelBundleImpl(std::move(KernelBundleImpl)) {
MKernelBundleImpl(std::move(KernelBundleImpl)),
MKernelArgMaskPtr{ArgMask} {

// kernel_impl shared ownership of kernel handle
if (!is_host()) {
Expand Down
13 changes: 10 additions & 3 deletions sycl/source/detail/kernel_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

#include <detail/context_impl.hpp>
#include <detail/device_impl.hpp>
#include <detail/kernel_arg_mask.hpp>
#include <detail/kernel_info.hpp>
#include <sycl/detail/common.hpp>
#include <sycl/detail/pi.h>
Expand Down Expand Up @@ -42,7 +43,8 @@ class kernel_impl {
/// \param Context is a valid SYCL context
/// \param KernelBundleImpl is a valid instance of kernel_bundle_impl
kernel_impl(RT::PiKernel Kernel, ContextImplPtr Context,
KernelBundleImplPtr KernelBundleImpl);
KernelBundleImplPtr KernelBundleImpl,
const KernelArgMask *ArgMask = nullptr);

/// Constructs a SYCL kernel instance from a SYCL program and a PiKernel
///
Expand All @@ -59,7 +61,8 @@ class kernel_impl {
/// \param KernelBundleImpl is a valid instance of kernel_bundle_impl
kernel_impl(RT::PiKernel Kernel, ContextImplPtr ContextImpl,
ProgramImplPtr ProgramImpl, bool IsCreatedFromSource,
KernelBundleImplPtr KernelBundleImpl);
KernelBundleImplPtr KernelBundleImpl,
const KernelArgMask *ArgMask);

/// Constructs a SYCL kernel_impl instance from a SYCL device_image,
/// kernel_bundle and / PiKernel.
Expand All @@ -69,7 +72,8 @@ class kernel_impl {
/// \param KernelBundleImpl is a valid instance of kernel_bundle_impl
kernel_impl(RT::PiKernel Kernel, ContextImplPtr ContextImpl,
DeviceImageImplPtr DeviceImageImpl,
KernelBundleImplPtr KernelBundleImpl);
KernelBundleImplPtr KernelBundleImpl,
const KernelArgMask *ArgMask);

/// Constructs a SYCL kernel for host device
///
Expand Down Expand Up @@ -177,6 +181,8 @@ class kernel_impl {
return MNoncacheableEnqueueMutex;
}

const KernelArgMask *getKernelArgMask() const { return MKernelArgMaskPtr; }

private:
RT::PiKernel MKernel;
const ContextImplPtr MContext;
Expand All @@ -186,6 +192,7 @@ class kernel_impl {
const KernelBundleImplPtr MKernelBundleImpl;
bool MIsInterop = false;
std::mutex MNoncacheableEnqueueMutex;
const KernelArgMask *MKernelArgMaskPtr;

bool isBuiltInKernel(const device &Device) const;
void checkIfValidForNumArgsInfoQuery() const;
Expand Down
14 changes: 7 additions & 7 deletions sycl/source/detail/kernel_program_cache.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,28 +16,28 @@ namespace detail {
KernelProgramCache::~KernelProgramCache() {
for (auto &ProgIt : MCachedPrograms.Cache) {
ProgramWithBuildStateT &ProgWithState = ProgIt.second;
PiProgramT *ToBeDeleted = ProgWithState.Ptr.load();
RT::PiProgram *ToBeDeleted = ProgWithState.Ptr.load();

if (!ToBeDeleted)
continue;

auto KernIt = MKernelsPerProgramCache.find(ToBeDeleted);
auto KernIt = MKernelsPerProgramCache.find(*ToBeDeleted);

if (KernIt != MKernelsPerProgramCache.end()) {
for (auto &p : KernIt->second) {
KernelWithBuildStateT &KernelWithState = p.second;
PiKernelT *Kern = KernelWithState.Ptr.load();
BuildResult<KernelArgMaskPairT> &KernelWithState = p.second;
KernelArgMaskPairT *KernelArgMaskPair = KernelWithState.Ptr.load();

if (Kern) {
if (KernelArgMaskPair) {
const detail::plugin &Plugin = MParentContext->getPlugin();
Plugin.call<PiApiKind::piKernelRelease>(Kern);
Plugin.call<PiApiKind::piKernelRelease>(KernelArgMaskPair->first);
}
}
MKernelsPerProgramCache.erase(KernIt);
}

const detail::plugin &Plugin = MParentContext->getPlugin();
Plugin.call<PiApiKind::piProgramRelease>(ToBeDeleted);
Plugin.call<PiApiKind::piProgramRelease>(*ToBeDeleted);
}
}
} // namespace detail
Expand Down
20 changes: 8 additions & 12 deletions sycl/source/detail/kernel_program_cache.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ class KernelProgramCache {
/// Currently there is only a single user - ProgramManager class.
template <typename T> struct BuildResult {
std::atomic<T *> Ptr;
T Val;
std::atomic<BuildState> State;
BuildError Error;

Expand All @@ -68,9 +69,7 @@ class KernelProgramCache {
BuildResult(T *P, BuildState S) : Ptr{P}, State{S}, Error{"", 0} {}
};

using PiProgramT = std::remove_pointer<RT::PiProgram>::type;
using PiProgramPtrT = std::atomic<PiProgramT *>;
using ProgramWithBuildStateT = BuildResult<PiProgramT>;
using ProgramWithBuildStateT = BuildResult<RT::PiProgram>;
using ProgramCacheKeyT = std::pair<std::pair<SerializedObj, std::uintptr_t>,
std::pair<RT::PiDevice, std::string>>;
using CommonProgramKeyT = std::pair<std::uintptr_t, RT::PiDevice>;
Expand All @@ -84,18 +83,15 @@ class KernelProgramCache {

using ContextPtr = context_impl *;

using PiKernelT = std::remove_pointer<RT::PiKernel>::type;

using PiKernelPtrT = std::atomic<PiKernelT *>;
using KernelWithBuildStateT = BuildResult<PiKernelT>;
using KernelByNameT = std::map<std::string, KernelWithBuildStateT>;
using KernelArgMaskPairT = std::pair<RT::PiKernel, const KernelArgMask *>;
using KernelByNameT = std::map<std::string, BuildResult<KernelArgMaskPairT>>;
using KernelCacheT = std::map<RT::PiProgram, KernelByNameT>;

using KernelFastCacheKeyT =
std::tuple<SerializedObj, OSModuleHandle, RT::PiDevice, std::string,
std::string>;
using KernelFastCacheValT =
std::tuple<RT::PiKernel, std::mutex *, RT::PiProgram>;
using KernelFastCacheValT = std::tuple<RT::PiKernel, std::mutex *,
const KernelArgMask *, RT::PiProgram>;
using KernelFastCacheT = std::map<KernelFastCacheKeyT, KernelFastCacheValT>;

~KernelProgramCache();
Expand Down Expand Up @@ -128,7 +124,7 @@ class KernelProgramCache {
return std::make_pair(&Inserted.first->second, Inserted.second);
}

std::pair<KernelWithBuildStateT *, bool>
std::pair<BuildResult<KernelArgMaskPairT> *, bool>
getOrInsertKernel(RT::PiProgram Program, const std::string &KernelName) {
auto LockedCache = acquireKernelsPerProgramCache();
auto &Cache = LockedCache.get()[Program];
Expand Down Expand Up @@ -173,7 +169,7 @@ class KernelProgramCache {
if (It != MKernelFastCache.end()) {
return It->second;
}
return std::make_tuple(nullptr, nullptr, nullptr);
return std::make_tuple(nullptr, nullptr, nullptr, nullptr);
}

template <typename KeyT, typename ValT>
Expand Down
23 changes: 12 additions & 11 deletions sycl/source/detail/program_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -351,9 +351,9 @@ kernel program_impl::get_kernel(std::string KernelName,
return createSyclObjFromImpl<kernel>(
std::make_shared<kernel_impl>(MContext, PtrToSelf));
}
return createSyclObjFromImpl<kernel>(
std::make_shared<kernel_impl>(get_pi_kernel(KernelName), MContext,
PtrToSelf, IsCreatedFromSource, nullptr));
auto [Kernel, ArgMask] = get_pi_kernel_arg_mask_pair(KernelName);
return createSyclObjFromImpl<kernel>(std::make_shared<kernel_impl>(
Kernel, MContext, PtrToSelf, IsCreatedFromSource, nullptr, ArgMask));
}

std::vector<std::vector<char>> program_impl::get_binaries() const {
Expand Down Expand Up @@ -447,19 +447,20 @@ std::vector<RT::PiDevice> program_impl::get_pi_devices() const {
return PiDevices;
}

RT::PiKernel program_impl::get_pi_kernel(const std::string &KernelName) const {
RT::PiKernel Kernel = nullptr;
std::pair<RT::PiKernel, const KernelArgMask *>
program_impl::get_pi_kernel_arg_mask_pair(const std::string &KernelName) const {
std::pair<RT::PiKernel, const KernelArgMask *> Result;

if (is_cacheable()) {
std::tie(Kernel, std::ignore, std::ignore) =
std::tie(Result.first, std::ignore, Result.second, std::ignore) =
ProgramManager::getInstance().getOrCreateKernel(
MProgramModuleHandle, detail::getSyclObjImpl(get_context()),
detail::getSyclObjImpl(get_devices()[0]), KernelName, this);
getPlugin().call<PiApiKind::piKernelRetain>(Kernel);
getPlugin().call<PiApiKind::piKernelRetain>(Result.first);
} else {
const detail::plugin &Plugin = getPlugin();
RT::PiResult Err = Plugin.call_nocheck<PiApiKind::piKernelCreate>(
MProgram, KernelName.c_str(), &Kernel);
MProgram, KernelName.c_str(), &Result.first);
if (Err == PI_ERROR_INVALID_KERNEL_NAME) {
throw invalid_object_error(
"This instance of program does not contain the kernel requested",
Expand All @@ -469,11 +470,11 @@ RT::PiKernel program_impl::get_pi_kernel(const std::string &KernelName) const {

// Some PI Plugins (like OpenCL) require this call to enable USM
// For others, PI will turn this into a NOP.
Plugin.call<PiApiKind::piKernelSetExecInfo>(Kernel, PI_USM_INDIRECT_ACCESS,
sizeof(pi_bool), &PI_TRUE);
Plugin.call<PiApiKind::piKernelSetExecInfo>(
Result.first, PI_USM_INDIRECT_ACCESS, sizeof(pi_bool), &PI_TRUE);
}

return Kernel;
return Result;
}

std::vector<device>
Expand Down
3 changes: 2 additions & 1 deletion sycl/source/detail/program_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -405,7 +405,8 @@ class program_impl {
/// \param KernelName is a string containing PI kernel name.
/// \return an instance of PI kernel with specific name. If kernel is
/// unavailable, an invalid_object_error exception is thrown.
RT::PiKernel get_pi_kernel(const std::string &KernelName) const;
std::pair<RT::PiKernel, const KernelArgMask *>
get_pi_kernel_arg_mask_pair(const std::string &KernelName) const;

/// \return a vector of sorted in ascending order SYCL devices.
std::vector<device> sort_devices_by_cl_device_id(std::vector<device> Devices);
Expand Down
Loading