diff --git a/source/adapters/level_zero/device.cpp b/source/adapters/level_zero/device.cpp
index 9e832bbb9a..7322d84f61 100644
--- a/source/adapters/level_zero/device.cpp
+++ b/source/adapters/level_zero/device.cpp
@@ -877,7 +877,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(
   }
   case UR_DEVICE_INFO_ENQUEUE_NATIVE_COMMAND_SUPPORT_EXP: {
-    // L0 doesn't support enqueueing native work through the urNativeEnqueueExp
-    return ReturnValue(static_cast<ur_bool_t>(false));
+    // L0 supports enqueueing native work through urEnqueueNativeCommandExp
+    return ReturnValue(static_cast<ur_bool_t>(true));
   }
 
   case UR_DEVICE_INFO_ESIMD_SUPPORT: {
diff --git a/source/adapters/level_zero/enqueue_native.cpp b/source/adapters/level_zero/enqueue_native.cpp
index b67cccc4f1..cc646a2cc2 100644
--- a/source/adapters/level_zero/enqueue_native.cpp
+++ b/source/adapters/level_zero/enqueue_native.cpp
@@ -10,11 +10,82 @@
 
 #include <ur_api.h>
 
+#include "logger/ur_logger.hpp"
 #include "queue.hpp"
+#include "ur_level_zero.hpp"
 
+// Enqueues user-provided native Level Zero work on this queue.
+//
+// The work is split over three command-list submissions: one that waits on
+// phEventList, one that pfnNativeEnqueue can obtain through
+// queueGetNativeHandle() while a ScopedCommandList is active, and one that
+// signals the resulting event. The memory-list and properties arguments are
+// unnamed and therefore ignored.
 ur_result_t ur_queue_handle_legacy_t_::enqueueNativeCommandExp(
-    ur_exp_enqueue_native_command_function_t, void *, uint32_t,
-    const ur_mem_handle_t *, const ur_exp_enqueue_native_command_properties_t *,
-    uint32_t, const ur_event_handle_t *, ur_event_handle_t *) {
-  return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+    ur_exp_enqueue_native_command_function_t pfnNativeEnqueue, void *data,
+    uint32_t, const ur_mem_handle_t *,
+    const ur_exp_enqueue_native_command_properties_t *,
+    uint32_t NumEventsInWaitList, const ur_event_handle_t *phEventList,
+    ur_event_handle_t *phEvent) {
+  auto Queue = this;
+  std::scoped_lock lock(Queue->Mutex);
+
+  bool UseCopyEngine = false;
+
+  // Please note that the following code should be run before the
+  // subsequent getAvailableCommandList() call so that there is no
+  // dead-lock from waiting unsubmitted events in an open batch.
+  // The createAndRetainUrZeEventList() has the proper side-effect
+  // of submitting batches with dependent events.
+  //
+  _ur_ze_event_list_t TmpWaitList;
+  UR_CALL(TmpWaitList.createAndRetainUrZeEventList(
+      NumEventsInWaitList, phEventList, Queue, UseCopyEngine));
+
+  // Get a new command list to be used on this call
+  ur_command_list_ptr_t CommandList{};
+  // TODO: Change UseCopyEngine argument to 'true' once L0 backend
+  // support is added
+  UR_CALL(Queue->Context->getAvailableCommandList(
+      Queue, CommandList, UseCopyEngine, NumEventsInWaitList, phEventList));
+
+  // TODO: do we need to create a unique command type for this?
+  ze_event_handle_t ZeEvent = nullptr;
+  ur_event_handle_t InternalEvent{};
+  bool IsInternal = phEvent == nullptr;
+  ur_event_handle_t *Event = phEvent ? phEvent : &InternalEvent;
+  UR_CALL(createEventAndAssociateQueue(Queue, Event,
+                                       UR_COMMAND_ENQUEUE_NATIVE_EXP,
+                                       CommandList, IsInternal, false));
+  ZeEvent = (*Event)->ZeEvent;
+  (*Event)->WaitList = TmpWaitList;
+
+  const auto &WaitList = (*Event)->WaitList;
+  if (WaitList.Length) {
+    ZE2UR_CALL(zeCommandListAppendWaitOnEvents,
+               (CommandList->first, WaitList.Length, WaitList.ZeEventList));
+  }
+
+  UR_CALL(Queue->executeCommandList(CommandList, false, false));
+  UR_CALL(Queue->Context->getAvailableCommandList(Queue, CommandList,
+                                                  UseCopyEngine, 0, nullptr));
+
+  {
+    // While Active is alive, queueGetNativeHandle() on this thread returns
+    // CommandList->first without taking Queue->Mutex, which is already held
+    // by this function.
+    ScopedCommandList Active{Queue, CommandList->first};
+
+    // Call interop func which enqueues native async work
+    pfnNativeEnqueue(Queue, data);
+  }
+
+  UR_CALL(Queue->executeCommandList(CommandList, false, false));
+  UR_CALL(Queue->Context->getAvailableCommandList(Queue, CommandList,
+                                                  UseCopyEngine, 0, nullptr));
+
+  ZE2UR_CALL(zeCommandListAppendSignalEvent, (CommandList->first, ZeEvent));
+
+  UR_CALL(Queue->executeCommandList(CommandList, false));
+  return UR_RESULT_SUCCESS;
 }
diff --git a/source/adapters/level_zero/queue.cpp b/source/adapters/level_zero/queue.cpp
index f467447753..8eccada361 100644
--- a/source/adapters/level_zero/queue.cpp
+++ b/source/adapters/level_zero/queue.cpp
@@ -705,6 +705,15 @@ ur_result_t
ur_queue_handle_legacy_t_::queueGetNativeHandle(
 ) {
   auto Queue = this;
 
+  // Needed for EnqueueNativeCommandExp, so that the native queue 'got' in the
+  // interop func is the same as the native queue used to manage dependencies
+  // before the interop func invocation.
+  if (Queue->getThreadLocalCommandList() != ze_command_list_handle_t{0}) {
+    auto ZeCmdList = ur_cast<ze_command_list_handle_t *>(NativeQueue);
+    *ZeCmdList = Queue->getThreadLocalCommandList();
+    return UR_RESULT_SUCCESS;
+  }
+
   // Lock automatically releases when this goes out of scope.
   std::shared_lock lock(Queue->Mutex);
 
diff --git a/source/adapters/level_zero/queue.hpp b/source/adapters/level_zero/queue.hpp
index 97ddcf014c..4ac85d285d 100644
--- a/source/adapters/level_zero/queue.hpp
+++ b/source/adapters/level_zero/queue.hpp
@@ -423,6 +423,14 @@ struct ur_queue_handle_legacy_t_ : _ur_object, public ur_queue_handle_t_ {
       uint32_t, const ur_event_handle_t *, ur_event_handle_t *) override;
 
+  // Per-thread command list override: when non-null, queueGetNativeHandle()
+  // returns it directly. ScopedCommandList sets it for its own lifetime.
+  // Being static, the slot is shared by all queues used on the same thread.
+  static ze_command_list_handle_t &getThreadLocalCommandList() {
+    static thread_local ze_command_list_handle_t CommandList{0};
+    return CommandList;
+  }
+
   using queue_type = ur_device_handle_t_::queue_group_info_t::type;
   // PI queue is in general a one to many mapping to L0 native queues.
   struct ur_queue_group_t {
@@ -941,3 +949,23 @@ ur_result_t setSignalEvent(ur_queue_handle_legacy_t Queue, bool UseCopyEngine,
 ur_result_t CleanupEventListFromResetCmdList(
     std::vector<ur_event_handle_t> &EventListToCleanup,
     bool QueueLocked = false);
+
+// RAII object to make hQueue command list getter methods all return the same
+// command list within the lifetime of this object.
+//
+// This is useful for urEnqueueNativeCommandExp where we want guarantees that
+// the user submitted native calls will be dispatched to a known command list,
+// which must be "got" within the user submitted function.
+class ScopedCommandList {
+  ur_queue_handle_legacy_t hQueue;
+
+public:
+  ScopedCommandList(ur_queue_handle_legacy_t hQueue,
+                    ze_command_list_handle_t CommandList)
+      : hQueue{hQueue} {
+    hQueue->getThreadLocalCommandList() = CommandList;
+  }
+  ScopedCommandList(const ScopedCommandList &) = delete;
+  ScopedCommandList &operator=(const ScopedCommandList &) = delete;
+  ~ScopedCommandList() { hQueue->getThreadLocalCommandList() = nullptr; }
+};
diff --git a/test/conformance/exp_enqueue_native/CMakeLists.txt b/test/conformance/exp_enqueue_native/CMakeLists.txt
index 8638fa1349..8769cf716b 100644
--- a/test/conformance/exp_enqueue_native/CMakeLists.txt
+++ b/test/conformance/exp_enqueue_native/CMakeLists.txt
@@ -15,4 +15,21 @@ if (UR_BUILD_ADAPTER_CUDA)
     target_link_libraries(test-exp_enqueue_native PRIVATE cudadrv)
 endif()
 
+if (UR_BUILD_ADAPTER_L0)
+    add_conformance_test_with_kernels_environment(
+        exp_enqueue_native
+        enqueue_native_level_zero.cpp
+    )
+    target_link_libraries(test-exp_enqueue_native PRIVATE
+        LevelZeroLoader
+        LevelZeroLoader-Headers
+    )
+
+    target_include_directories(test-exp_enqueue_native PRIVATE
+        ${PROJECT_SOURCE_DIR}/source
+        ${PROJECT_SOURCE_DIR}/source/adapters/level_zero
+        LevelZeroLoader-Headers
+    )
+endif()
+
 # TODO: Add more tests for different triples
diff --git a/test/conformance/exp_enqueue_native/enqueue_native_level_zero.cpp b/test/conformance/exp_enqueue_native/enqueue_native_level_zero.cpp
new file mode 100644
index 0000000000..75dacebddb
--- /dev/null
+++ b/test/conformance/exp_enqueue_native/enqueue_native_level_zero.cpp
@@ -0,0 +1,149 @@
+// Copyright (C) 2024 Intel Corporation
+// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions.
+// See LICENSE.TXT
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "ze_api.h"
+
+#include <uur/fixtures.h>
+#include <vector>
+
+using T = uint32_t;
+
+struct urLevelZeroEnqueueNativeCommandTest : uur::urQueueTest {
+    void SetUp() override {
+        UUR_RETURN_ON_FATAL_FAILURE(uur::urQueueTest::SetUp());
+
+        host_vec = std::vector<T>(global_size, 0);
+        ASSERT_EQ(host_vec.size(), global_size);
+        ASSERT_SUCCESS(urUSMDeviceAlloc(context, device, nullptr, nullptr,
+                                        allocation_size, &device_ptr));
+        ASSERT_NE(device_ptr, nullptr);
+    }
+
+    void TearDown() override {
+        if (device_ptr) {
+            // Drain the queue first so that no enqueued command is still using
+            // the allocation when it is freed.
+            EXPECT_SUCCESS(urQueueFinish(queue));
+            EXPECT_SUCCESS(urUSMFree(context, device_ptr));
+        }
+        UUR_RETURN_ON_FATAL_FAILURE(uur::urQueueTest::TearDown());
+    }
+
+    static constexpr T val = 42;
+    static constexpr uint32_t global_size = 10'000'000;
+    std::vector<T> host_vec;
+    void *device_ptr = nullptr;
+    static constexpr size_t allocation_size = sizeof(val) * global_size;
+};
+
+UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urLevelZeroEnqueueNativeCommandTest);
+
+struct InteropData1 {
+    void *fill_ptr;
+};
+
+// Fill a device ptr with the pattern val
+void interop_func_1(ur_queue_handle_t hQueue, void *data) {
+    ze_command_list_handle_t CommandList = nullptr;
+    ASSERT_SUCCESS(urQueueGetNativeHandle(
+        hQueue, nullptr, reinterpret_cast<ur_native_handle_t *>(&CommandList)));
+    auto *func_data = static_cast<InteropData1 *>(data);
+
+    // If L0 interop becomes a real use case we should make a new UR entry point
+    // to propagate events into and out of the interop func.
+    ASSERT_EQ(zeCommandListAppendMemoryFill(
+                  CommandList, func_data->fill_ptr,
+                  &urLevelZeroEnqueueNativeCommandTest::val,
+                  sizeof(urLevelZeroEnqueueNativeCommandTest::val),
+                  urLevelZeroEnqueueNativeCommandTest::allocation_size,
+                  nullptr, 0, nullptr),
+              ZE_RESULT_SUCCESS);
+}
+
+struct InteropData2 {
+    void *from, *to;
+};
+
+// Read from device ptr to host ptr
+void interop_func_2(ur_queue_handle_t hQueue, void *data) {
+    ze_command_list_handle_t CommandList = nullptr;
+    ASSERT_SUCCESS(urQueueGetNativeHandle(
+        hQueue, nullptr, reinterpret_cast<ur_native_handle_t *>(&CommandList)));
+    auto *func_data = static_cast<InteropData2 *>(data);
+
+    // If L0 interop becomes a real use case we should make a new UR entry point
+    // to propagate events into and out of the interop func.
+    ASSERT_EQ(zeCommandListAppendMemoryCopy(
+                  CommandList, func_data->to, func_data->from,
+                  urLevelZeroEnqueueNativeCommandTest::allocation_size,
+                  nullptr, 0, nullptr),
+              ZE_RESULT_SUCCESS);
+}
+
+TEST_P(urLevelZeroEnqueueNativeCommandTest, Success) {
+    InteropData1 data_1{device_ptr};
+    ur_event_handle_t event_1 = nullptr;
+    ASSERT_SUCCESS(urEnqueueNativeCommandExp(
+        queue, &interop_func_1, &data_1, 0, nullptr /*phMemList=*/,
+        nullptr /*pProperties=*/, 0, nullptr /*phEventWaitList=*/, &event_1));
+    ASSERT_SUCCESS(urQueueFinish(queue));
+    ASSERT_SUCCESS(urEventRelease(event_1));
+}
+
+TEST_P(urLevelZeroEnqueueNativeCommandTest, Dependencies) {
+    ur_event_handle_t event_1 = nullptr, event_2 = nullptr;
+
+    InteropData1 data_1{device_ptr};
+    ASSERT_SUCCESS(urEnqueueNativeCommandExp(
+        queue, &interop_func_1, &data_1, 0, nullptr /*phMemList=*/,
+        nullptr /*pProperties=*/, 0, nullptr /*phEventWaitList=*/, &event_1));
+
+    InteropData2 data_2{device_ptr, host_vec.data()};
+    ASSERT_SUCCESS(urEnqueueNativeCommandExp(
+        queue, &interop_func_2, &data_2, 0, nullptr /*phMemList=*/,
+        nullptr /*pProperties=*/, 1, &event_1, &event_2));
+    ASSERT_SUCCESS(urQueueFinish(queue));
+    ASSERT_SUCCESS(urEventRelease(event_1));
+    ASSERT_SUCCESS(urEventRelease(event_2));
+    for (auto &i : host_vec) {
+        ASSERT_EQ(i, val);
+    }
+}
+
+TEST_P(urLevelZeroEnqueueNativeCommandTest, DependenciesURBefore) {
+    ur_event_handle_t event_1 = nullptr, event_2 = nullptr;
+
+    ASSERT_SUCCESS(urEnqueueUSMFill(queue, device_ptr, sizeof(val), &val,
+                                    allocation_size, 0,
+                                    nullptr /*phEventWaitList=*/, &event_1));
+
+    InteropData2 data_2{device_ptr, host_vec.data()};
+    ASSERT_SUCCESS(urEnqueueNativeCommandExp(
+        queue, &interop_func_2, &data_2, 0, nullptr /*phMemList=*/,
+        nullptr /*pProperties=*/, 1, &event_1, &event_2));
+    ASSERT_SUCCESS(urQueueFinish(queue));
+    ASSERT_SUCCESS(urEventRelease(event_1));
+    ASSERT_SUCCESS(urEventRelease(event_2));
+    for (auto &i : host_vec) {
+        ASSERT_EQ(i, val);
+    }
+}
+
+TEST_P(urLevelZeroEnqueueNativeCommandTest, DependenciesURAfter) {
+    ur_event_handle_t event_1 = nullptr;
+
+    InteropData1 data_1{device_ptr};
+    ASSERT_SUCCESS(urEnqueueNativeCommandExp(
+        queue, &interop_func_1, &data_1, 0, nullptr /*phMemList=*/,
+        nullptr /*pProperties=*/, 0, nullptr /*phEventWaitList=*/, &event_1));
+
+    ASSERT_SUCCESS(urEnqueueUSMMemcpy(queue, /*blocking*/ true,
+                                      host_vec.data(), device_ptr,
+                                      allocation_size, 1, &event_1, nullptr));
+    ASSERT_SUCCESS(urEventRelease(event_1));
+    for (auto &i : host_vec) {
+        ASSERT_EQ(i, val);
+    }
+}
diff --git a/test/conformance/exp_enqueue_native/exp_enqueue_native_adapter_level_zero-v2.match b/test/conformance/exp_enqueue_native/exp_enqueue_native_adapter_level_zero-v2.match
new file mode 100644
index 0000000000..2c9b3a0f8d
--- /dev/null
+++ b/test/conformance/exp_enqueue_native/exp_enqueue_native_adapter_level_zero-v2.match
@@ -0,0 +1,4 @@
+urLevelZeroEnqueueNativeCommandTest.Success{{.*}}
+urLevelZeroEnqueueNativeCommandTest.Dependencies{{.*}}
+urLevelZeroEnqueueNativeCommandTest.DependenciesURBefore{{.*}}
+urLevelZeroEnqueueNativeCommandTest.DependenciesURAfter{{.*}}
diff --git a/test/conformance/exp_enqueue_native/exp_enqueue_native_adapter_level_zero.match b/test/conformance/exp_enqueue_native/exp_enqueue_native_adapter_level_zero.match
new file mode 100644
index 0000000000..e69de29bb2