2 changes: 1 addition & 1 deletion .github/intel-llvm-mirror-base-commit
@@ -1 +1 @@
05538e008ad1e1b348e79bbb40888387288a2428
a50acd0244276fb9efb231abae5ce9d71495768b
269 changes: 7 additions & 262 deletions source/adapters/level_zero/kernel.cpp
@@ -56,263 +56,6 @@ ur_result_t urKernelGetSuggestedLocalWorkSize(
return UR_RESULT_SUCCESS;
}

inline ur_result_t EnqueueCooperativeKernelLaunchHelper(
/// [in] handle of the queue object
ur_queue_handle_t Queue,
/// [in] handle of the kernel object
ur_kernel_handle_t Kernel,
/// [in] number of dimensions, from 1 to 3, to specify the global and
/// work-group work-items
uint32_t WorkDim,
/// [in][optional] pointer to an array of workDim unsigned values that
/// specify the offset used to calculate the global ID of a work-item
const size_t *GlobalWorkOffset,
/// [in] pointer to an array of workDim unsigned values that specify the
/// number of global work-items in workDim that will execute the kernel
/// function
const size_t *GlobalWorkSize,
/// [in][optional] pointer to an array of workDim unsigned values that
/// specify the number of local work-items forming a work-group that
/// will execute the kernel function. If nullptr, the runtime
/// implementation will choose the work-group size.
const size_t *LocalWorkSize,
/// [in] size of the event wait list
uint32_t NumEventsInWaitList,
/// [in][optional][range(0, numEventsInWaitList)] pointer to a list of
/// events that must be complete before the kernel execution. If
/// nullptr, numEventsInWaitList must be 0, indicating that there are no
/// wait events.
const ur_event_handle_t *EventWaitList,
/// [in,out][optional] return an event object that identifies this
/// particular kernel execution instance.
ur_event_handle_t *OutEvent) {
UR_ASSERT(WorkDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION);
UR_ASSERT(WorkDim < 4, UR_RESULT_ERROR_INVALID_WORK_DIMENSION);

auto ZeDevice = Queue->Device->ZeDevice;

ze_kernel_handle_t ZeKernel{};
if (Kernel->ZeKernelMap.empty()) {
ZeKernel = Kernel->ZeKernel;
} else {
auto It = Kernel->ZeKernelMap.find(ZeDevice);
if (It == Kernel->ZeKernelMap.end()) {
/* kernel and queue don't match */
return UR_RESULT_ERROR_INVALID_QUEUE;
}
ZeKernel = It->second;
}
// Lock automatically releases when this goes out of scope.
std::scoped_lock<ur_shared_mutex, ur_shared_mutex, ur_shared_mutex> Lock(
Queue->Mutex, Kernel->Mutex, Kernel->Program->Mutex);
if (GlobalWorkOffset != NULL) {
UR_CALL(setKernelGlobalOffset(Queue->Context, ZeKernel, WorkDim,
GlobalWorkOffset));
}

// If there are any pending arguments set them now.
for (auto &Arg : Kernel->PendingArguments) {
// The ArgValue may be a NULL pointer in which case a NULL value is used for
// the kernel argument declared as a pointer to global or constant memory.
char **ZeHandlePtr = nullptr;
if (Arg.Value) {
UR_CALL(Arg.Value->getZeHandlePtr(ZeHandlePtr, Arg.AccessMode,
Queue->Device, EventWaitList,
NumEventsInWaitList));
}
ZE2UR_CALL(zeKernelSetArgumentValue,
(ZeKernel, Arg.Index, Arg.Size, ZeHandlePtr));
}
Kernel->PendingArguments.clear();

ze_group_count_t ZeThreadGroupDimensions{1, 1, 1};
uint32_t WG[3]{};

// New variable needed because GlobalWorkSize parameter might not be of size 3
size_t GlobalWorkSize3D[3]{1, 1, 1};
std::copy(GlobalWorkSize, GlobalWorkSize + WorkDim, GlobalWorkSize3D);

if (LocalWorkSize) {
// Level Zero group sizes are 32-bit; validate before narrowing.
for (uint32_t I = 0; I < WorkDim; I++) {
UR_ASSERT(LocalWorkSize[I] < (std::numeric_limits<uint32_t>::max)(),
UR_RESULT_ERROR_INVALID_VALUE);
WG[I] = static_cast<uint32_t>(LocalWorkSize[I]);
}
} else {
// We can't call zeKernelSuggestGroupSize if the 64-bit GlobalWorkSize
// values do not fit into the 32-bit values that the API currently supports.
bool SuggestGroupSize = true;
for (int I : {0, 1, 2}) {
if (GlobalWorkSize3D[I] > UINT32_MAX) {
SuggestGroupSize = false;
}
}
if (SuggestGroupSize) {
ZE2UR_CALL(zeKernelSuggestGroupSize,
(ZeKernel, GlobalWorkSize3D[0], GlobalWorkSize3D[1],
GlobalWorkSize3D[2], &WG[0], &WG[1], &WG[2]));
} else {
for (int I : {0, 1, 2}) {
// Try to find an I-dimension WG size that evenly divides
// GlobalWorkSize[I]. Start with the max possible size in each
// dimension.
uint32_t GroupSize[] = {
Queue->Device->ZeDeviceComputeProperties->maxGroupSizeX,
Queue->Device->ZeDeviceComputeProperties->maxGroupSizeY,
Queue->Device->ZeDeviceComputeProperties->maxGroupSizeZ};
GroupSize[I] = (std::min)(size_t(GroupSize[I]), GlobalWorkSize3D[I]);
while (GlobalWorkSize3D[I] % GroupSize[I]) {
--GroupSize[I];
}

if (GlobalWorkSize3D[I] / GroupSize[I] > UINT32_MAX) {
UR_LOG(ERR,
"urEnqueueCooperativeKernelLaunchExp: can't find a WG size "
"suitable for global work size > UINT32_MAX");
return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
}
WG[I] = GroupSize[I];
}
UR_LOG(DEBUG,
"urEnqueueCooperativeKernelLaunchExp: using computed WG "
"size = {{{}, {}, {}}}",
WG[0], WG[1], WG[2]);
}
}

// TODO: assert if sizes do not fit into 32-bit?

switch (WorkDim) {
case 3:
ZeThreadGroupDimensions.groupCountX =
static_cast<uint32_t>(GlobalWorkSize3D[0] / WG[0]);
ZeThreadGroupDimensions.groupCountY =
static_cast<uint32_t>(GlobalWorkSize3D[1] / WG[1]);
ZeThreadGroupDimensions.groupCountZ =
static_cast<uint32_t>(GlobalWorkSize3D[2] / WG[2]);
break;
case 2:
ZeThreadGroupDimensions.groupCountX =
static_cast<uint32_t>(GlobalWorkSize3D[0] / WG[0]);
ZeThreadGroupDimensions.groupCountY =
static_cast<uint32_t>(GlobalWorkSize3D[1] / WG[1]);
WG[2] = 1;
break;
case 1:
ZeThreadGroupDimensions.groupCountX =
static_cast<uint32_t>(GlobalWorkSize3D[0] / WG[0]);
WG[1] = WG[2] = 1;
break;

default:
UR_LOG(ERR, "urEnqueueCooperativeKernelLaunchExp: unsupported work_dim");
return UR_RESULT_ERROR_INVALID_VALUE;
}

// Error handling for non-uniform group size case
if (GlobalWorkSize3D[0] !=
size_t(ZeThreadGroupDimensions.groupCountX) * WG[0]) {
UR_LOG(ERR,
"urEnqueueCooperativeKernelLaunchExp: invalid work_dim. The "
"range is not a multiple of the group size in the 1st dimension");
return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
}
if (GlobalWorkSize3D[1] !=
size_t(ZeThreadGroupDimensions.groupCountY) * WG[1]) {
UR_LOG(ERR,
"urEnqueueCooperativeKernelLaunchExp: invalid work_dim. The "
"range is not a multiple of the group size in the 2nd dimension");
return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
}
if (GlobalWorkSize3D[2] !=
size_t(ZeThreadGroupDimensions.groupCountZ) * WG[2]) {
UR_LOG(ERR,
"urEnqueueCooperativeKernelLaunchExp: invalid work_dim. The "
"range is not a multiple of the group size in the 3rd dimension");
return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
}

ZE2UR_CALL(zeKernelSetGroupSize, (ZeKernel, WG[0], WG[1], WG[2]));

bool UseCopyEngine = false;
ur_ze_event_list_t TmpWaitList;
UR_CALL(TmpWaitList.createAndRetainUrZeEventList(
NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine));

// Get a new command list to be used on this call
ur_command_list_ptr_t CommandList{};
UR_CALL(Queue->Context->getAvailableCommandList(
Queue, CommandList, UseCopyEngine, NumEventsInWaitList, EventWaitList,
true /* AllowBatching */, nullptr /*ForcedCmdQueue*/));

ze_event_handle_t ZeEvent = nullptr;
ur_event_handle_t InternalEvent{};
bool IsInternal = OutEvent == nullptr;
ur_event_handle_t *Event = OutEvent ? OutEvent : &InternalEvent;

UR_CALL(createEventAndAssociateQueue(Queue, Event, UR_COMMAND_KERNEL_LAUNCH,
CommandList, IsInternal, false));
UR_CALL(setSignalEvent(Queue, UseCopyEngine, &ZeEvent, Event,
NumEventsInWaitList, EventWaitList,
CommandList->second.ZeQueue));
(*Event)->WaitList = TmpWaitList;

// Save the kernel in the event, so that when the event is signalled
// the code can do a urKernelRelease on this kernel.
(*Event)->CommandData = (void *)Kernel;

// Increment the reference count of the Kernel and indicate that the Kernel
// is in use. Once the event has been signalled, the code in
// CleanupCompletedEvent(Event) will do a urKernelRelease to update the
// reference count on the kernel, using the kernel saved in CommandData.
UR_CALL(ur::level_zero::urKernelRetain(Kernel));

// Add to list of kernels to be submitted
if (IndirectAccessTrackingEnabled)
Queue->KernelsToBeSubmitted.push_back(Kernel);

if (Queue->UsingImmCmdLists && IndirectAccessTrackingEnabled) {
// If using immediate commandlists then gathering of indirect
// references and appending to the queue (which means submission)
// must be done together.
std::unique_lock<ur_shared_mutex> ContextsLock(
Queue->Device->Platform->ContextsMutex, std::defer_lock);
// We are going to submit kernels for execution. If indirect access flag is
// set for a kernel then we need to make a snapshot of existing memory
// allocations in all contexts in the platform. We need to lock the mutex
// guarding the list of contexts in the platform to prevent creation of new
memory allocations in any context before we submit the kernel for
// execution.
ContextsLock.lock();
Queue->CaptureIndirectAccesses();
// Add the command to the command list, which implies submission.
ZE2UR_CALL(zeCommandListAppendLaunchCooperativeKernel,
(CommandList->first, ZeKernel, &ZeThreadGroupDimensions, ZeEvent,
(*Event)->WaitList.Length, (*Event)->WaitList.ZeEventList));
} else {
// Add the command to the command list for later submission.
// No lock is needed here, unlike the immediate commandlist case above,
// because the kernels are not actually submitted yet. Kernels will be
// submitted only when the command list is closed. Then, a lock is held.
ZE2UR_CALL(zeCommandListAppendLaunchCooperativeKernel,
(CommandList->first, ZeKernel, &ZeThreadGroupDimensions, ZeEvent,
(*Event)->WaitList.Length, (*Event)->WaitList.ZeEventList));
}

UR_LOG(DEBUG,
"calling zeCommandListAppendLaunchCooperativeKernel() with ZeEvent {}",
ur_cast<std::uintptr_t>(ZeEvent));
printZeEventList((*Event)->WaitList);

// Execute command list asynchronously, as the event will be used
// to track down its completion.
UR_CALL(Queue->executeCommandList(CommandList, false /*IsBlocking*/,
true /*OKToBatchCommand*/));

return UR_RESULT_SUCCESS;
}
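
For reference, the group-size fallback in the helper above amounts to a largest-divisor search per dimension. A minimal standalone sketch of that idea (hypothetical helper name, no Level Zero dependency, assuming every GlobalSize[I] is at least 1):

#include <algorithm>
#include <cstddef>
#include <cstdint>

// Pick the largest group size <= MaxGroupSize[I] that evenly divides
// GlobalSize[I]; the loop always terminates because 1 divides everything.
// Returns false if the resulting group count would not fit the 32-bit API.
bool pickUniformGroupSize(const size_t GlobalSize[3],
                          const uint32_t MaxGroupSize[3], uint32_t WG[3]) {
  for (int I : {0, 1, 2}) {
    size_t Candidate = std::min<size_t>(MaxGroupSize[I], GlobalSize[I]);
    while (GlobalSize[I] % Candidate)
      --Candidate;
    if (GlobalSize[I] / Candidate > UINT32_MAX)
      return false;
    WG[I] = static_cast<uint32_t>(Candidate);
  }
  return true;
}

The group counts then follow as GlobalSize[I] / WG[I], which is what the switch over WorkDim above computes.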

ur_result_t urEnqueueKernelLaunch(
/// [in] handle of the queue object
ur_queue_handle_t Queue,
@@ -348,14 +91,16 @@ ur_result_t urEnqueueKernelLaunch(
/// [in,out][optional] return an event object that identifies this
/// particular kernel execution instance.
ur_event_handle_t *OutEvent) {
using ZeKernelLaunchFuncT = ze_result_t (*)(
ze_command_list_handle_t, ze_kernel_handle_t, const ze_group_count_t *,
ze_event_handle_t, uint32_t, ze_event_handle_t *);
ZeKernelLaunchFuncT ZeKernelLaunchFunc = &zeCommandListAppendLaunchKernel;
for (uint32_t PropIndex = 0; PropIndex < NumPropsInLaunchPropList;
PropIndex++) {
if (LaunchPropList[PropIndex].id ==
UR_KERNEL_LAUNCH_PROPERTY_ID_COOPERATIVE &&
LaunchPropList[PropIndex].value.cooperative) {
return EnqueueCooperativeKernelLaunchHelper(
Queue, Kernel, WorkDim, GlobalWorkOffset, GlobalWorkSize,
LocalWorkSize, NumEventsInWaitList, EventWaitList, OutEvent);
ZeKernelLaunchFunc = &zeCommandListAppendLaunchCooperativeKernel;
}
if (LaunchPropList[PropIndex].id != UR_KERNEL_LAUNCH_PROPERTY_ID_IGNORE &&
LaunchPropList[PropIndex].id !=
@@ -454,15 +199,15 @@ ContextsLock.lock();
ContextsLock.lock();
Queue->CaptureIndirectAccesses();
// Add the command to the command list, which implies submission.
ZE2UR_CALL(zeCommandListAppendLaunchKernel,
ZE2UR_CALL(ZeKernelLaunchFunc,
(CommandList->first, ZeKernel, &ZeThreadGroupDimensions, ZeEvent,
(*Event)->WaitList.Length, (*Event)->WaitList.ZeEventList));
} else {
// Add the command to the command list for later submission.
// No lock is needed here, unlike the immediate commandlist case above,
// because the kernels are not actually submitted yet. Kernels will be
// submitted only when the command list is closed. Then, a lock is held.
ZE2UR_CALL(zeCommandListAppendLaunchKernel,
ZE2UR_CALL(ZeKernelLaunchFunc,
(CommandList->first, ZeKernel, &ZeThreadGroupDimensions, ZeEvent,
(*Event)->WaitList.Length, (*Event)->WaitList.ZeEventList));
}
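With the dedicated helper removed, cooperative launches share the regular code path and only the appended Level Zero function differs. A caller-side sketch of requesting one through the launch-property list (the parameter order is assumed to follow the adapter signature shown above; queue and kernel creation omitted):

#include <ur_api.h>

// Request a cooperative launch via a launch property instead of a dedicated
// urEnqueueCooperativeKernelLaunchExp entry point.
ur_result_t launchCooperative(ur_queue_handle_t Queue,
                              ur_kernel_handle_t Kernel) {
  ur_kernel_launch_property_t Prop{};
  Prop.id = UR_KERNEL_LAUNCH_PROPERTY_ID_COOPERATIVE;
  Prop.value.cooperative = 1;

  const size_t GlobalSize[1] = {1024};
  const size_t LocalSize[1] = {64};
  ur_event_handle_t Event = nullptr;

  // The adapter scans the property list and swaps ZeKernelLaunchFunc to
  // zeCommandListAppendLaunchCooperativeKernel when it sees this property.
  return urEnqueueKernelLaunch(Queue, Kernel, /*workDim=*/1,
                               /*pGlobalWorkOffset=*/nullptr, GlobalSize,
                               LocalSize, /*numPropsInLaunchPropList=*/1, &Prop,
                               /*numEventsInWaitList=*/0,
                               /*phEventWaitList=*/nullptr, &Event);
}
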
1 change: 1 addition & 0 deletions source/adapters/offload/context.cpp
@@ -59,6 +59,7 @@ urContextRelease(ur_context_handle_t hContext) {
return UR_RESULT_SUCCESS;
}

// Offload currently has no equivalent of native context handles
UR_APIEXPORT ur_result_t UR_APICALL
urContextGetNativeHandle(ur_context_handle_t, ur_native_handle_t *) {
return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
29 changes: 24 additions & 5 deletions source/adapters/offload/device.cpp
@@ -12,6 +12,7 @@
#include <ur/ur.hpp>
#include <ur_api.h>

#include "adapters/offload/adapter.hpp"
#include "device.hpp"
#include "platform.hpp"
#include "ur2offload.hpp"
@@ -211,14 +212,32 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceSelectBinary(
return UR_RESULT_ERROR_INVALID_BINARY;
}

UR_APIEXPORT ur_result_t UR_APICALL
urDeviceGetNativeHandle(ur_device_handle_t, ur_native_handle_t *) {
return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetNativeHandle(
ur_device_handle_t UrDevice, ur_native_handle_t *Handle) {
*Handle = reinterpret_cast<ur_native_handle_t>(UrDevice->OffloadDevice);
return UR_RESULT_SUCCESS;
}

UR_APIEXPORT ur_result_t UR_APICALL urDeviceCreateWithNativeHandle(
ur_native_handle_t, ur_adapter_handle_t,
const ur_device_native_properties_t *, ur_device_handle_t *) {
ur_native_handle_t hNativeDevice, ur_adapter_handle_t hAdapter,
const ur_device_native_properties_t *, ur_device_handle_t *phDevice) {
ol_device_handle_t OlDevice =
reinterpret_cast<ol_device_handle_t>(hNativeDevice);

// Currently, all devices are found at initialization; there is no way to
// create sub-devices yet.
for (auto &P : hAdapter->Platforms) {
auto Found =
std::find_if(P->Devices.begin(), P->Devices.end(),
[&](std::unique_ptr<ur_device_handle_t_> &PDevice) {
return PDevice->OffloadDevice == OlDevice;
});
if (Found != P->Devices.end()) {
*phDevice = Found->get();
return UR_RESULT_SUCCESS;
}
}

return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
}

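The two new entry points are inverses: GetNativeHandle exposes the underlying ol_device_handle_t, and CreateWithNativeHandle looks it up among the devices discovered at adapter initialization. A short round-trip sketch (hypothetical function name, error handling abbreviated):

#include <ur_api.h>

// Round-trip an offload device through its native handle. Because the
// create call is a lookup rather than an allocation, Found compares equal
// to the original Device handle.
ur_result_t roundTripDevice(ur_adapter_handle_t Adapter,
                            ur_device_handle_t Device) {
  ur_native_handle_t Native = 0;
  if (auto Res = urDeviceGetNativeHandle(Device, &Native))
    return Res;

  ur_device_handle_t Found = nullptr;
  if (auto Res = urDeviceCreateWithNativeHandle(Native, Adapter,
                                                /*pProperties=*/nullptr,
                                                &Found))
    return Res;

  return Found == Device ? UR_RESULT_SUCCESS : UR_RESULT_ERROR_INVALID_DEVICE;
}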