diff --git a/.github/intel-llvm-mirror-base-commit b/.github/intel-llvm-mirror-base-commit
index 9d93df2036..dd0312365f 100644
--- a/.github/intel-llvm-mirror-base-commit
+++ b/.github/intel-llvm-mirror-base-commit
@@ -1 +1 @@
-05538e008ad1e1b348e79bbb40888387288a2428
+a50acd0244276fb9efb231abae5ce9d71495768b
diff --git a/source/adapters/level_zero/kernel.cpp b/source/adapters/level_zero/kernel.cpp
index 838cb96dc5..b6d3d2e64c 100644
--- a/source/adapters/level_zero/kernel.cpp
+++ b/source/adapters/level_zero/kernel.cpp
@@ -56,263 +56,6 @@ ur_result_t urKernelGetSuggestedLocalWorkSize(
   return UR_RESULT_SUCCESS;
 }
 
-inline ur_result_t EnqueueCooperativeKernelLaunchHelper(
-    /// [in] handle of the queue object
-    ur_queue_handle_t Queue,
-    /// [in] handle of the kernel object
-    ur_kernel_handle_t Kernel,
-    /// [in] number of dimensions, from 1 to 3, to specify the global and
-    /// work-group work-items
-    uint32_t WorkDim,
-    /// [in][optional] pointer to an array of workDim unsigned values that
-    /// specify the offset used to calculate the global ID of a work-item
-    const size_t *GlobalWorkOffset,
-    /// [in] pointer to an array of workDim unsigned values that specify the
-    /// number of global work-items in workDim that will execute the kernel
-    /// function
-    const size_t *GlobalWorkSize,
-    /// [in][optional] pointer to an array of workDim unsigned values that
-    /// specify the number of local work-items forming a work-group that
-    /// will execute the kernel function. If nullptr, the runtime
-    /// implementation will choose the work-group size.
-    const size_t *LocalWorkSize,
-    /// [in] size of the event wait list
-    uint32_t NumEventsInWaitList,
-    /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of
-    /// events that must be complete before the kernel execution. If
-    /// nullptr, then numEventsInWaitList must be 0, indicating that no wait
-    /// events are required.
-    const ur_event_handle_t *EventWaitList,
-    /// [in,out][optional] return an event object that identifies this
-    /// particular kernel execution instance.
-    ur_event_handle_t *OutEvent) {
-  UR_ASSERT(WorkDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION);
-  UR_ASSERT(WorkDim < 4, UR_RESULT_ERROR_INVALID_WORK_DIMENSION);
-
-  auto ZeDevice = Queue->Device->ZeDevice;
-
-  ze_kernel_handle_t ZeKernel{};
-  if (Kernel->ZeKernelMap.empty()) {
-    ZeKernel = Kernel->ZeKernel;
-  } else {
-    auto It = Kernel->ZeKernelMap.find(ZeDevice);
-    if (It == Kernel->ZeKernelMap.end()) {
-      /* kernel and queue don't match */
-      return UR_RESULT_ERROR_INVALID_QUEUE;
-    }
-    ZeKernel = It->second;
-  }
-  // Lock automatically releases when this goes out of scope.
-  std::scoped_lock<ur_shared_mutex, ur_shared_mutex, ur_shared_mutex> Lock(
-      Queue->Mutex, Kernel->Mutex, Kernel->Program->Mutex);
-  if (GlobalWorkOffset != NULL) {
-    UR_CALL(setKernelGlobalOffset(Queue->Context, ZeKernel, WorkDim,
-                                  GlobalWorkOffset));
-  }
-
-  // If there are any pending arguments, set them now.
-  for (auto &Arg : Kernel->PendingArguments) {
-    // The ArgValue may be a NULL pointer, in which case a NULL value is used
-    // for the kernel argument declared as a pointer to global or constant
-    // memory.
-    char **ZeHandlePtr = nullptr;
-    if (Arg.Value) {
-      UR_CALL(Arg.Value->getZeHandlePtr(ZeHandlePtr, Arg.AccessMode,
-                                        Queue->Device, EventWaitList,
-                                        NumEventsInWaitList));
-    }
-    ZE2UR_CALL(zeKernelSetArgumentValue,
-               (ZeKernel, Arg.Index, Arg.Size, ZeHandlePtr));
-  }
-  Kernel->PendingArguments.clear();
-
-  ze_group_count_t ZeThreadGroupDimensions{1, 1, 1};
-  uint32_t WG[3]{};
-
-  // A new variable is needed because the GlobalWorkSize parameter might not
-  // be of size 3.
-  size_t GlobalWorkSize3D[3]{1, 1, 1};
-  std::copy(GlobalWorkSize, GlobalWorkSize + WorkDim, GlobalWorkSize3D);
-
-  if (LocalWorkSize) {
-    // L0
-    for (uint32_t I = 0; I < WorkDim; I++) {
-      UR_ASSERT(LocalWorkSize[I] < (std::numeric_limits<uint32_t>::max)(),
-                UR_RESULT_ERROR_INVALID_VALUE);
-      WG[I] = static_cast<uint32_t>(LocalWorkSize[I]);
-    }
-  } else {
-    // We can't call zeKernelSuggestGroupSize if the 64-bit GlobalWorkSize
-    // values do not fit into the 32-bit values that the API currently
-    // supports.
-    bool SuggestGroupSize = true;
-    for (int I : {0, 1, 2}) {
-      if (GlobalWorkSize3D[I] > UINT32_MAX) {
-        SuggestGroupSize = false;
-      }
-    }
-    if (SuggestGroupSize) {
-      ZE2UR_CALL(zeKernelSuggestGroupSize,
-                 (ZeKernel, GlobalWorkSize3D[0], GlobalWorkSize3D[1],
-                  GlobalWorkSize3D[2], &WG[0], &WG[1], &WG[2]));
-    } else {
-      for (int I : {0, 1, 2}) {
-        // Try to find an I-dimension WG size that GlobalWorkSize[I] is
-        // fully divisible by. Start with the max possible size in
-        // each dimension.
-        uint32_t GroupSize[] = {
-            Queue->Device->ZeDeviceComputeProperties->maxGroupSizeX,
-            Queue->Device->ZeDeviceComputeProperties->maxGroupSizeY,
-            Queue->Device->ZeDeviceComputeProperties->maxGroupSizeZ};
-        GroupSize[I] = (std::min)(size_t(GroupSize[I]), GlobalWorkSize3D[I]);
-        while (GlobalWorkSize3D[I] % GroupSize[I]) {
-          --GroupSize[I];
-        }
-
-        if (GlobalWorkSize3D[I] / GroupSize[I] > UINT32_MAX) {
-          UR_LOG(ERR,
-                 "urEnqueueCooperativeKernelLaunchExp: can't find a WG size "
-                 "suitable for global work size > UINT32_MAX");
-          return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
-        }
-        WG[I] = GroupSize[I];
-      }
-      UR_LOG(DEBUG,
-             "urEnqueueCooperativeKernelLaunchExp: using computed WG "
-             "size = {{{}, {}, {}}}",
-             WG[0], WG[1], WG[2]);
-    }
-  }
-
-  // TODO: assert if sizes do not fit into 32-bit?
-
-  switch (WorkDim) {
-  case 3:
-    ZeThreadGroupDimensions.groupCountX =
-        static_cast<uint32_t>(GlobalWorkSize3D[0] / WG[0]);
-    ZeThreadGroupDimensions.groupCountY =
-        static_cast<uint32_t>(GlobalWorkSize3D[1] / WG[1]);
-    ZeThreadGroupDimensions.groupCountZ =
-        static_cast<uint32_t>(GlobalWorkSize3D[2] / WG[2]);
-    break;
-  case 2:
-    ZeThreadGroupDimensions.groupCountX =
-        static_cast<uint32_t>(GlobalWorkSize3D[0] / WG[0]);
-    ZeThreadGroupDimensions.groupCountY =
-        static_cast<uint32_t>(GlobalWorkSize3D[1] / WG[1]);
-    WG[2] = 1;
-    break;
-  case 1:
-    ZeThreadGroupDimensions.groupCountX =
-        static_cast<uint32_t>(GlobalWorkSize3D[0] / WG[0]);
-    WG[1] = WG[2] = 1;
-    break;
-
-  default:
-    UR_LOG(ERR, "urEnqueueCooperativeKernelLaunchExp: unsupported work_dim");
-    return UR_RESULT_ERROR_INVALID_VALUE;
-  }
-
-  // Error handling for the non-uniform group size case.
-  if (GlobalWorkSize3D[0] !=
-      size_t(ZeThreadGroupDimensions.groupCountX) * WG[0]) {
-    UR_LOG(ERR,
-           "urEnqueueCooperativeKernelLaunchExp: invalid work_dim. The "
-           "range is not a multiple of the group size in the 1st dimension");
-    return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
-  }
-  if (GlobalWorkSize3D[1] !=
-      size_t(ZeThreadGroupDimensions.groupCountY) * WG[1]) {
-    UR_LOG(ERR,
-           "urEnqueueCooperativeKernelLaunchExp: invalid work_dim. The "
The " - "range is not a multiple of the group size in the 2nd dimension"); - return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; - } - if (GlobalWorkSize3D[2] != - size_t(ZeThreadGroupDimensions.groupCountZ) * WG[2]) { - UR_LOG(DEBUG, - "urEnqueueCooperativeKernelLaunchExp: invalid work_dim. The " - "range is not a multiple of the group size in the 3rd dimension"); - return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; - } - - ZE2UR_CALL(zeKernelSetGroupSize, (ZeKernel, WG[0], WG[1], WG[2])); - - bool UseCopyEngine = false; - ur_ze_event_list_t TmpWaitList; - UR_CALL(TmpWaitList.createAndRetainUrZeEventList( - NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine)); - - // Get a new command list to be used on this call - ur_command_list_ptr_t CommandList{}; - UR_CALL(Queue->Context->getAvailableCommandList( - Queue, CommandList, UseCopyEngine, NumEventsInWaitList, EventWaitList, - true /* AllowBatching */, nullptr /*ForcedCmdQueue*/)); - - ze_event_handle_t ZeEvent = nullptr; - ur_event_handle_t InternalEvent{}; - bool IsInternal = OutEvent == nullptr; - ur_event_handle_t *Event = OutEvent ? OutEvent : &InternalEvent; - - UR_CALL(createEventAndAssociateQueue(Queue, Event, UR_COMMAND_KERNEL_LAUNCH, - CommandList, IsInternal, false)); - UR_CALL(setSignalEvent(Queue, UseCopyEngine, &ZeEvent, Event, - NumEventsInWaitList, EventWaitList, - CommandList->second.ZeQueue)); - (*Event)->WaitList = TmpWaitList; - - // Save the kernel in the event, so that when the event is signalled - // the code can do a urKernelRelease on this kernel. - (*Event)->CommandData = (void *)Kernel; - - // Increment the reference count of the Kernel and indicate that the Kernel - // is in use. Once the event has been signalled, the code in - // CleanupCompletedEvent(Event) will do a urKernelRelease to update the - // reference count on the kernel, using the kernel saved in CommandData. - UR_CALL(ur::level_zero::urKernelRetain(Kernel)); - - // Add to list of kernels to be submitted - if (IndirectAccessTrackingEnabled) - Queue->KernelsToBeSubmitted.push_back(Kernel); - - if (Queue->UsingImmCmdLists && IndirectAccessTrackingEnabled) { - // If using immediate commandlists then gathering of indirect - // references and appending to the queue (which means submission) - // must be done together. - std::unique_lock ContextsLock( - Queue->Device->Platform->ContextsMutex, std::defer_lock); - // We are going to submit kernels for execution. If indirect access flag is - // set for a kernel then we need to make a snapshot of existing memory - // allocations in all contexts in the platform. We need to lock the mutex - // guarding the list of contexts in the platform to prevent creation of new - // memory alocations in any context before we submit the kernel for - // execution. - ContextsLock.lock(); - Queue->CaptureIndirectAccesses(); - // Add the command to the command list, which implies submission. - ZE2UR_CALL(zeCommandListAppendLaunchCooperativeKernel, - (CommandList->first, ZeKernel, &ZeThreadGroupDimensions, ZeEvent, - (*Event)->WaitList.Length, (*Event)->WaitList.ZeEventList)); - } else { - // Add the command to the command list for later submission. - // No lock is needed here, unlike the immediate commandlist case above, - // because the kernels are not actually submitted yet. Kernels will be - // submitted only when the comamndlist is closed. Then, a lock is held. 
-    ZE2UR_CALL(zeCommandListAppendLaunchCooperativeKernel,
-               (CommandList->first, ZeKernel, &ZeThreadGroupDimensions, ZeEvent,
-                (*Event)->WaitList.Length, (*Event)->WaitList.ZeEventList));
-  }
-
-  UR_LOG(DEBUG,
-         "calling zeCommandListAppendLaunchCooperativeKernel() with ZeEvent {}",
-         ur_cast<std::uintptr_t>(ZeEvent));
-  printZeEventList((*Event)->WaitList);
-
-  // Execute the command list asynchronously, as the event will be used
-  // to track its completion.
-  UR_CALL(Queue->executeCommandList(CommandList, false /*IsBlocking*/,
-                                    true /*OKToBatchCommand*/));
-
-  return UR_RESULT_SUCCESS;
-}
-
 ur_result_t urEnqueueKernelLaunch(
     /// [in] handle of the queue object
     ur_queue_handle_t Queue,
@@ -348,14 +91,16 @@ ur_result_t urEnqueueKernelLaunch(
     /// [in,out][optional] return an event object that identifies this
     /// particular kernel execution instance.
     ur_event_handle_t *OutEvent) {
+  using ZeKernelLaunchFuncT = ze_result_t (*)(
+      ze_command_list_handle_t, ze_kernel_handle_t, const ze_group_count_t *,
+      ze_event_handle_t, uint32_t, ze_event_handle_t *);
+  ZeKernelLaunchFuncT ZeKernelLaunchFunc = &zeCommandListAppendLaunchKernel;
   for (uint32_t PropIndex = 0; PropIndex < NumPropsInLaunchPropList;
        PropIndex++) {
     if (LaunchPropList[PropIndex].id ==
             UR_KERNEL_LAUNCH_PROPERTY_ID_COOPERATIVE &&
         LaunchPropList[PropIndex].value.cooperative) {
-      return EnqueueCooperativeKernelLaunchHelper(
-          Queue, Kernel, WorkDim, GlobalWorkOffset, GlobalWorkSize,
-          LocalWorkSize, NumEventsInWaitList, EventWaitList, OutEvent);
+      ZeKernelLaunchFunc = &zeCommandListAppendLaunchCooperativeKernel;
     }
     if (LaunchPropList[PropIndex].id != UR_KERNEL_LAUNCH_PROPERTY_ID_IGNORE &&
         LaunchPropList[PropIndex].id !=
@@ -454,7 +199,7 @@ ur_result_t urEnqueueKernelLaunch(
     ContextsLock.lock();
     Queue->CaptureIndirectAccesses();
     // Add the command to the command list, which implies submission.
-    ZE2UR_CALL(zeCommandListAppendLaunchKernel,
+    ZE2UR_CALL(ZeKernelLaunchFunc,
                (CommandList->first, ZeKernel, &ZeThreadGroupDimensions, ZeEvent,
                 (*Event)->WaitList.Length, (*Event)->WaitList.ZeEventList));
   } else {
@@ -462,7 +207,7 @@ ur_result_t urEnqueueKernelLaunch(
     // No lock is needed here, unlike the immediate command list case above,
     // because the kernels are not actually submitted yet. Kernels will be
     // submitted only when the command list is closed. Then, a lock is held.
-    ZE2UR_CALL(zeCommandListAppendLaunchKernel,
+    ZE2UR_CALL(ZeKernelLaunchFunc,
                (CommandList->first, ZeKernel, &ZeThreadGroupDimensions, ZeEvent,
                 (*Event)->WaitList.Length, (*Event)->WaitList.ZeEventList));
   }
diff --git a/source/adapters/offload/context.cpp b/source/adapters/offload/context.cpp
index 2dcbcd4da8..f8772de801 100644
--- a/source/adapters/offload/context.cpp
+++ b/source/adapters/offload/context.cpp
@@ -59,6 +59,7 @@ urContextRelease(ur_context_handle_t hContext) {
   return UR_RESULT_SUCCESS;
 }
 
+// Offload currently doesn't have an equivalent to context handles
 UR_APIEXPORT ur_result_t UR_APICALL
 urContextGetNativeHandle(ur_context_handle_t, ur_native_handle_t *) {
   return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
diff --git a/source/adapters/offload/device.cpp b/source/adapters/offload/device.cpp
index ece73a199c..6ca2ddf24b 100644
--- a/source/adapters/offload/device.cpp
+++ b/source/adapters/offload/device.cpp
@@ -12,6 +12,7 @@
 #include <OffloadAPI.h>
 #include <ur_api.h>
 
+#include "adapters/offload/adapter.hpp"
 #include "device.hpp"
 #include "platform.hpp"
 #include "ur2offload.hpp"
@@ -211,14 +212,32 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceSelectBinary(
   return UR_RESULT_ERROR_INVALID_BINARY;
 }
 
-UR_APIEXPORT ur_result_t UR_APICALL
-urDeviceGetNativeHandle(ur_device_handle_t, ur_native_handle_t *) {
-  return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetNativeHandle(
+    ur_device_handle_t UrDevice, ur_native_handle_t *Handle) {
+  *Handle = reinterpret_cast<ur_native_handle_t>(UrDevice->OffloadDevice);
+  return UR_RESULT_SUCCESS;
 }
 
 UR_APIEXPORT ur_result_t UR_APICALL urDeviceCreateWithNativeHandle(
-    ur_native_handle_t, ur_adapter_handle_t,
-    const ur_device_native_properties_t *, ur_device_handle_t *) {
+    ur_native_handle_t hNativeDevice, ur_adapter_handle_t hAdapter,
+    const ur_device_native_properties_t *, ur_device_handle_t *phDevice) {
+  ol_device_handle_t OlDevice =
+      reinterpret_cast<ol_device_handle_t>(hNativeDevice);
+
+  // Currently, all devices are found at initialization; there is no way to
+  // create sub-devices yet.
+  for (auto &P : hAdapter->Platforms) {
+    auto Found =
+        std::find_if(P->Devices.begin(), P->Devices.end(),
+                     [&](std::unique_ptr<ur_device_handle_t_> &PDevice) {
+                       return PDevice->OffloadDevice == OlDevice;
+                     });
+    if (Found != P->Devices.end()) {
+      *phDevice = Found->get();
+      return UR_RESULT_SUCCESS;
+    }
+  }
+  return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
 }
diff --git a/source/adapters/offload/enqueue.cpp b/source/adapters/offload/enqueue.cpp
index 7dd6754b92..b1a1edac52 100644
--- a/source/adapters/offload/enqueue.cpp
+++ b/source/adapters/offload/enqueue.cpp
@@ -67,17 +67,18 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
   LaunchArgs.GroupSize.z = GroupSize[2];
   LaunchArgs.DynSharedMemory = 0;
 
-  ol_event_handle_t EventOut;
   ol_queue_handle_t Queue;
   OL_RETURN_ON_ERR(hQueue->nextQueue(Queue));
-  OL_RETURN_ON_ERR(
-      olLaunchKernel(Queue, hQueue->OffloadDevice, hKernel->OffloadKernel,
-                     hKernel->Args.getStorage(), hKernel->Args.getStorageSize(),
-                     &LaunchArgs, &EventOut));
+  OL_RETURN_ON_ERR(olLaunchKernel(
+      Queue, hQueue->OffloadDevice, hKernel->OffloadKernel,
+      hKernel->Args.getStorage(), hKernel->Args.getStorageSize(), &LaunchArgs));
 
   if (phEvent) {
     auto *Event = new ur_event_handle_t_(UR_COMMAND_KERNEL_LAUNCH, hQueue);
-    Event->OffloadEvent = EventOut;
+    if (auto Res = olCreateEvent(Queue, &Event->OffloadEvent)) {
+      delete Event;
+      return offloadResultToUR(Res);
+    }
     *phEvent = Event;
   }
   return UR_RESULT_SUCCESS;
@@ -108,22 +109,24 @@ ur_result_t doMemcpy(ur_command_t Command, ur_queue_handle_t hQueue,
   //
   if (blocking) {
-    OL_RETURN_ON_ERR(olMemcpy(nullptr, DestPtr, DestDevice, SrcPtr, SrcDevice,
-                              size, nullptr));
+    OL_RETURN_ON_ERR(
+        olMemcpy(nullptr, DestPtr, DestDevice, SrcPtr, SrcDevice, size));
     if (phEvent) {
       *phEvent = ur_event_handle_t_::createEmptyEvent(Command, hQueue);
     }
     return UR_RESULT_SUCCESS;
   }
 
-  ol_event_handle_t EventOut = nullptr;
   ol_queue_handle_t Queue;
   OL_RETURN_ON_ERR(hQueue->nextQueue(Queue));
-  OL_RETURN_ON_ERR(olMemcpy(Queue, DestPtr, DestDevice, SrcPtr, SrcDevice, size,
-                            phEvent ? &EventOut : nullptr));
+  OL_RETURN_ON_ERR(
+      olMemcpy(Queue, DestPtr, DestDevice, SrcPtr, SrcDevice, size));
 
   if (phEvent) {
     auto *Event = new ur_event_handle_t_(Command, hQueue);
-    Event->OffloadEvent = EventOut;
+    if (auto Res = olCreateEvent(Queue, &Event->OffloadEvent)) {
+      delete Event;
+      return offloadResultToUR(Res);
+    }
     *phEvent = Event;
   }
diff --git a/source/adapters/offload/memory.cpp b/source/adapters/offload/memory.cpp
index 564e616a97..5b12407d0d 100644
--- a/source/adapters/offload/memory.cpp
+++ b/source/adapters/offload/memory.cpp
@@ -57,7 +57,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemBufferCreate(
 
   if (PerformInitialCopy) {
     OL_RETURN_ON_ERR(olMemcpy(nullptr, Ptr, OffloadDevice, HostPtr,
-                              Adapter->HostDevice, size, nullptr));
+                              Adapter->HostDevice, size));
   }
 
   *phBuffer = URMemObj.release();
diff --git a/source/adapters/offload/platform.cpp b/source/adapters/offload/platform.cpp
index 671dd98fbe..8558f19585 100644
--- a/source/adapters/offload/platform.cpp
+++ b/source/adapters/offload/platform.cpp
@@ -95,14 +95,27 @@ urPlatformGetBackendOption(ur_platform_handle_t, const char *pFrontendOption,
   return UR_RESULT_ERROR_INVALID_VALUE;
 }
 
-UR_APIEXPORT ur_result_t UR_APICALL
-urPlatformGetNativeHandle(ur_platform_handle_t, ur_native_handle_t *) {
-  return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+UR_APIEXPORT ur_result_t UR_APICALL urPlatformGetNativeHandle(
+    ur_platform_handle_t hPlatform, ur_native_handle_t *phNativePlatform) {
+  *phNativePlatform =
+      reinterpret_cast<ur_native_handle_t>(hPlatform->OffloadPlatform);
+  return UR_RESULT_SUCCESS;
 }
 
 UR_APIEXPORT ur_result_t UR_APICALL urPlatformCreateWithNativeHandle(
-    ur_native_handle_t, ur_adapter_handle_t,
-    const ur_platform_native_properties_t *, ur_platform_handle_t *) {
+    ur_native_handle_t hNativePlatform, ur_adapter_handle_t hAdapter,
+    const ur_platform_native_properties_t *, ur_platform_handle_t *phPlatform) {
+
+  auto Found = std::find_if(
+      hAdapter->Platforms.begin(), hAdapter->Platforms.end(), [&](auto &P) {
+        return P->OffloadPlatform ==
+               reinterpret_cast<ol_platform_handle_t>(hNativePlatform);
+      });
+  if (Found != hAdapter->Platforms.end()) {
+    *phPlatform = Found->get();
+    return UR_RESULT_SUCCESS;
+  }
+  return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
 }
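
Note on the kernel.cpp change above: instead of routing cooperative launches through a separate helper, urEnqueueKernelLaunch now selects the Level Zero append function through a function pointer, which works because zeCommandListAppendLaunchKernel and zeCommandListAppendLaunchCooperativeKernel share a signature. The standalone sketch below shows only that dispatch pattern; the ze_*-style types and both launch functions are hypothetical stubs, not the real driver API.

#include <cstdint>
#include <cstdio>

// Hypothetical stand-ins for the Level Zero handle types.
using ze_result_t = std::int32_t;
using ze_command_list_handle_t = void *;
using ze_kernel_handle_t = void *;
using ze_event_handle_t = void *;
struct ze_group_count_t {
  std::uint32_t groupCountX, groupCountY, groupCountZ;
};

// Stubs with the shared signature of zeCommandListAppendLaunchKernel and
// zeCommandListAppendLaunchCooperativeKernel.
ze_result_t appendLaunchKernelStub(ze_command_list_handle_t,
                                   ze_kernel_handle_t, const ze_group_count_t *,
                                   ze_event_handle_t, std::uint32_t,
                                   ze_event_handle_t *) {
  std::puts("regular launch");
  return 0;
}
ze_result_t appendLaunchCooperativeKernelStub(
    ze_command_list_handle_t, ze_kernel_handle_t, const ze_group_count_t *,
    ze_event_handle_t, std::uint32_t, ze_event_handle_t *) {
  std::puts("cooperative launch");
  return 0;
}

int main() {
  // Same alias shape as ZeKernelLaunchFuncT in the patch.
  using LaunchFuncT = ze_result_t (*)(ze_command_list_handle_t,
                                      ze_kernel_handle_t,
                                      const ze_group_count_t *,
                                      ze_event_handle_t, std::uint32_t,
                                      ze_event_handle_t *);
  // In the patch this flag comes from the UR kernel launch property list.
  bool Cooperative = true;
  LaunchFuncT Launch = Cooperative ? &appendLaunchCooperativeKernelStub
                                   : &appendLaunchKernelStub;
  ze_group_count_t Groups{1, 1, 1};
  return Launch(nullptr, nullptr, &Groups, nullptr, 0, nullptr);
}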
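
Note on the offload adapter hunks: olLaunchKernel and olMemcpy no longer take an output-event parameter in this patch; when the caller wants a UR event, the adapter records one afterwards with olCreateEvent on the same queue and destroys the half-built wrapper if that fails. A sketch of that shape follows, with every liboffload type and function replaced by a hypothetical stand-in; only the control flow mirrors the diff.

#include <cstdint>

using ol_result_t = std::int32_t; // 0 means success, as OL_RETURN_ON_ERR assumes
struct OlQueueTag;
using ol_queue_handle_t = OlQueueTag *;
struct OlEventTag;
using ol_event_handle_t = OlEventTag *;
using ur_result_t = std::int32_t;
constexpr ur_result_t UR_RESULT_SUCCESS = 0;
constexpr ur_result_t UR_RESULT_ERROR_UNKNOWN = 1;

// Hypothetical stand-in; the real olCreateEvent records an event on a queue.
ol_result_t olCreateEventStub(ol_queue_handle_t, ol_event_handle_t *Out) {
  *Out = nullptr;
  return 0;
}

struct UrEvent {
  ol_event_handle_t OffloadEvent = nullptr;
};

// Mirrors the patch's shape: create the backend event only when the caller
// asked for one, and free the partially built wrapper on failure.
ur_result_t attachEvent(ol_queue_handle_t Queue, UrEvent **phEvent) {
  if (!phEvent)
    return UR_RESULT_SUCCESS;
  auto *Event = new UrEvent();
  if (ol_result_t Res = olCreateEventStub(Queue, &Event->OffloadEvent)) {
    delete Event;
    (void)Res; // the adapter converts this with offloadResultToUR(Res)
    return UR_RESULT_ERROR_UNKNOWN;
  }
  *phEvent = Event;
  return UR_RESULT_SUCCESS;
}

int main() {
  UrEvent *Event = nullptr;
  ur_result_t Res = attachEvent(nullptr, &Event);
  delete Event;
  return Res == UR_RESULT_SUCCESS ? 0 : 1;
}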
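
Note on the native-handle support in device.cpp and platform.cpp: because all offload devices are enumerated up front, urDeviceCreateWithNativeHandle resolves a native handle back to the existing UR wrapper rather than allocating a new one. A self-contained sketch of that reverse lookup; the container layout and type names here are assumptions for illustration, not the adapter's actual data structures.

#include <algorithm>
#include <cstdint>
#include <memory>
#include <vector>

struct OlDeviceTag; // opaque stand-in for ol_device_handle_t
using ol_device_handle_t = OlDeviceTag *;
using ur_native_handle_t = std::uintptr_t;

struct UrDevice {
  ol_device_handle_t OffloadDevice = nullptr;
};
struct UrPlatform {
  std::vector<std::unique_ptr<UrDevice>> Devices;
};

// Search every platform's device list for the wrapper owning this handle.
UrDevice *findByNative(std::vector<std::unique_ptr<UrPlatform>> &Platforms,
                       ur_native_handle_t Native) {
  auto *Ol = reinterpret_cast<ol_device_handle_t>(Native);
  for (auto &P : Platforms) {
    auto It = std::find_if(
        P->Devices.begin(), P->Devices.end(),
        [&](std::unique_ptr<UrDevice> &D) { return D->OffloadDevice == Ol; });
    if (It != P->Devices.end())
      return It->get();
  }
  return nullptr; // caller maps this to UR_RESULT_ERROR_UNSUPPORTED_FEATURE
}

int main() {
  std::vector<std::unique_ptr<UrPlatform>> Platforms;
  Platforms.push_back(std::make_unique<UrPlatform>());
  Platforms[0]->Devices.push_back(std::make_unique<UrDevice>());

  auto Native = reinterpret_cast<ur_native_handle_t>(
      Platforms[0]->Devices[0]->OffloadDevice);
  return findByNative(Platforms, Native) == Platforms[0]->Devices[0].get()
             ? 0
             : 1;
}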