diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index 81092d7db24f9..a7f16e9a2fdae 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -6042,7 +6042,7 @@ pi_result _pi_queue::synchronize() { // Otherwise sync all L0 queues/immediate command-lists. for (auto &QueueMap : {ComputeQueueGroupsByTID, CopyQueueGroupsByTID}) { for (auto &QueueGroup : QueueMap) { - if (Device->ImmCommandListUsed) { + if (UsingImmCmdLists) { for (auto ImmCmdList : QueueGroup.second.ImmCmdLists) syncImmCmdList(this, ImmCmdList); } else { diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero.cpp index 7c8e39522711f..cc17a17b6901c 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero.cpp @@ -1040,7 +1040,7 @@ getRangeOfAllowedCopyEngines(const ur_device_handle_t &Device) { // used. if (!EnvVar) { if (Device->useImmediateCommandLists()) - return std::pair(-1, -1); // No copy engines can be used. + return std::pair(0, 0); // Only main copy engine will be used. return std::pair(0, INT_MAX); // All copy engines will be used. } std::string CopyEngineRange = EnvVar; @@ -1089,8 +1089,13 @@ _ur_device_handle_t::useImmediateCommandLists() { }(); if (ImmediateCommandlistsSetting == -1) - // Change this to PerQueue as default after more testing. + // Immediate command lists will be used by default only on Linux PVC. +#ifdef _WIN32 return NotUsed; +#else + return isPVC() ? 
PerQueue : NotUsed; +#endif + switch (ImmediateCommandlistsSetting) { case 0: return NotUsed; @@ -1128,75 +1133,6 @@ static const EventsScope DeviceEventsSetting = [] { ur_result_t _ur_device_handle_t::initialize(int SubSubDeviceOrdinal, int SubSubDeviceIndex) { - uint32_t numQueueGroups = 0; - ZE_CALL(zeDeviceGetCommandQueueGroupProperties, - (ZeDevice, &numQueueGroups, nullptr)); - if (numQueueGroups == 0) { - return UR_RESULT_ERROR_UNKNOWN; - } - urPrint("NOTE: Number of queue groups = %d\n", numQueueGroups); - std::vector<ZeStruct<ze_command_queue_group_properties_t>> - QueueGroupProperties(numQueueGroups); - ZE_CALL(zeDeviceGetCommandQueueGroupProperties, - (ZeDevice, &numQueueGroups, QueueGroupProperties.data())); - - // Initialize ordinal and compute queue group properties - for (uint32_t i = 0; i < numQueueGroups; i++) { - if (QueueGroupProperties[i].flags & - ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) { - QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute].ZeOrdinal = - i; - QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute] - .ZeProperties = QueueGroupProperties[i]; - break; - } - } - - // Reinitialize a sub-sub-device with its own ordinal, index. - // Our sub-sub-device representation is currently [Level-Zero sub-device - // handle + Level-Zero compute group/engine index]. Only the specified - // index queue will be used to submit work to the sub-sub-device. - if (SubSubDeviceOrdinal >= 0) { - QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute].ZeOrdinal = - SubSubDeviceOrdinal; - QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute].ZeIndex = - SubSubDeviceIndex; - } else { // Proceed with initialization for root and sub-device - // How is it possible that there are no "compute" capabilities? 
- if (QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute].ZeOrdinal < - 0) { - return UR_RESULT_ERROR_UNKNOWN; - } - - if (CopyEngineRequested((ur_device_handle_t)this)) { - for (uint32_t i = 0; i < numQueueGroups; i++) { - if (((QueueGroupProperties[i].flags & - ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) == 0) && - (QueueGroupProperties[i].flags & - ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COPY)) { - if (QueueGroupProperties[i].numQueues == 1) { - QueueGroup[queue_group_info_t::MainCopy].ZeOrdinal = i; - QueueGroup[queue_group_info_t::MainCopy].ZeProperties = - QueueGroupProperties[i]; - } else { - QueueGroup[queue_group_info_t::LinkCopy].ZeOrdinal = i; - QueueGroup[queue_group_info_t::LinkCopy].ZeProperties = - QueueGroupProperties[i]; - break; - } - } - } - if (QueueGroup[queue_group_info_t::MainCopy].ZeOrdinal < 0) - urPrint("NOTE: main blitter/copy engine is not available\n"); - else - urPrint("NOTE: main blitter/copy engine is available\n"); - - if (QueueGroup[queue_group_info_t::LinkCopy].ZeOrdinal < 0) - urPrint("NOTE: link blitter/copy engines are not available\n"); - else - urPrint("NOTE: link blitter/copy engines are available\n"); - } - } // Maintain various device properties cache. // Note that we just describe here how to compute the data. 
@@ -1269,6 +1205,76 @@ ur_result_t _ur_device_handle_t::initialize(int SubSubDeviceOrdinal, ZeEventsScope = DeviceEventsSetting; } + uint32_t numQueueGroups = 0; + ZE_CALL(zeDeviceGetCommandQueueGroupProperties, + (ZeDevice, &numQueueGroups, nullptr)); + if (numQueueGroups == 0) { + return UR_RESULT_ERROR_UNKNOWN; + } + urPrint("NOTE: Number of queue groups = %d\n", numQueueGroups); + std::vector<ZeStruct<ze_command_queue_group_properties_t>> + QueueGroupProperties(numQueueGroups); + ZE_CALL(zeDeviceGetCommandQueueGroupProperties, + (ZeDevice, &numQueueGroups, QueueGroupProperties.data())); + + // Initialize ordinal and compute queue group properties + for (uint32_t i = 0; i < numQueueGroups; i++) { + if (QueueGroupProperties[i].flags & + ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) { + QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute].ZeOrdinal = + i; + QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute] + .ZeProperties = QueueGroupProperties[i]; + break; + } + } + + // Reinitialize a sub-sub-device with its own ordinal, index. + // Our sub-sub-device representation is currently [Level-Zero sub-device + // handle + Level-Zero compute group/engine index]. Only the specified + // index queue will be used to submit work to the sub-sub-device. + if (SubSubDeviceOrdinal >= 0) { + QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute].ZeOrdinal = + SubSubDeviceOrdinal; + QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute].ZeIndex = + SubSubDeviceIndex; + } else { // Proceed with initialization for root and sub-device + // How is it possible that there are no "compute" capabilities? 
+ if (QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute].ZeOrdinal < + 0) { + return UR_RESULT_ERROR_UNKNOWN; + } + + if (CopyEngineRequested((ur_device_handle_t)this)) { + for (uint32_t i = 0; i < numQueueGroups; i++) { + if (((QueueGroupProperties[i].flags & + ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) == 0) && + (QueueGroupProperties[i].flags & + ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COPY)) { + if (QueueGroupProperties[i].numQueues == 1) { + QueueGroup[queue_group_info_t::MainCopy].ZeOrdinal = i; + QueueGroup[queue_group_info_t::MainCopy].ZeProperties = + QueueGroupProperties[i]; + } else { + QueueGroup[queue_group_info_t::LinkCopy].ZeOrdinal = i; + QueueGroup[queue_group_info_t::LinkCopy].ZeProperties = + QueueGroupProperties[i]; + break; + } + } + } + if (QueueGroup[queue_group_info_t::MainCopy].ZeOrdinal < 0) + urPrint("NOTE: main blitter/copy engine is not available\n"); + else + urPrint("NOTE: main blitter/copy engine is available\n"); + + if (QueueGroup[queue_group_info_t::LinkCopy].ZeOrdinal < 0) + urPrint("NOTE: link blitter/copy engines are not available\n"); + else + urPrint("NOTE: link blitter/copy engines are available\n"); + } + } + return UR_RESULT_SUCCESS; } diff --git a/sycl/test-e2e/DiscardEvents/discard_events_l0_inorder.cpp b/sycl/test-e2e/DiscardEvents/discard_events_l0_inorder.cpp index 2cafa76be6f7b..ce47e2011954c 100644 --- a/sycl/test-e2e/DiscardEvents/discard_events_l0_inorder.cpp +++ b/sycl/test-e2e/DiscardEvents/discard_events_l0_inorder.cpp @@ -2,10 +2,10 @@ // // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out // -// RUN: env SYCL_PI_LEVEL_ZERO_BATCH_SIZE=0 ONEAPI_DEVICE_SELECTOR="level_zero:*" %GPU_RUN_PLACEHOLDER %t.out -// RUN: env SYCL_PI_LEVEL_ZERO_BATCH_SIZE=1 ONEAPI_DEVICE_SELECTOR="level_zero:*" %GPU_RUN_PLACEHOLDER %t.out -// RUN: env SYCL_PI_LEVEL_ZERO_BATCH_SIZE=2 ONEAPI_DEVICE_SELECTOR="level_zero:*" %GPU_RUN_PLACEHOLDER %t.out -// RUN: env SYCL_PI_LEVEL_ZERO_BATCH_SIZE=3 
ONEAPI_DEVICE_SELECTOR="level_zero:*" %GPU_RUN_PLACEHOLDER %t.out +// RUN: env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 SYCL_PI_LEVEL_ZERO_BATCH_SIZE=0 ONEAPI_DEVICE_SELECTOR="level_zero:*" %GPU_RUN_PLACEHOLDER %t.out +// RUN: env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 SYCL_PI_LEVEL_ZERO_BATCH_SIZE=1 ONEAPI_DEVICE_SELECTOR="level_zero:*" %GPU_RUN_PLACEHOLDER %t.out +// RUN: env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 SYCL_PI_LEVEL_ZERO_BATCH_SIZE=2 ONEAPI_DEVICE_SELECTOR="level_zero:*" %GPU_RUN_PLACEHOLDER %t.out +// RUN: env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 SYCL_PI_LEVEL_ZERO_BATCH_SIZE=3 ONEAPI_DEVICE_SELECTOR="level_zero:*" %GPU_RUN_PLACEHOLDER %t.out // // The test is to check the execution of different queue operations has in-order // semantics regardless of batching. diff --git a/sycl/test-e2e/DiscardEvents/discard_events_l0_leak.cpp b/sycl/test-e2e/DiscardEvents/discard_events_l0_leak.cpp index fcf705e77d454..a5aaefdcd6cff 100644 --- a/sycl/test-e2e/DiscardEvents/discard_events_l0_leak.cpp +++ b/sycl/test-e2e/DiscardEvents/discard_events_l0_leak.cpp @@ -2,8 +2,8 @@ // // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out // -// RUN: env SYCL_PI_LEVEL_ZERO_BATCH_SIZE=4 ONEAPI_DEVICE_SELECTOR='level_zero:*' ZE_DEBUG=4 %GPU_RUN_PLACEHOLDER %t.out wait 2>&1 %GPU_CHECK_PLACEHOLDER -// RUN: env SYCL_PI_LEVEL_ZERO_BATCH_SIZE=4 ONEAPI_DEVICE_SELECTOR='level_zero:*' ZE_DEBUG=4 %GPU_RUN_PLACEHOLDER %t.out nowait 2>&1 %GPU_CHECK_PLACEHOLDER +// RUN: env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 SYCL_PI_LEVEL_ZERO_BATCH_SIZE=4 ONEAPI_DEVICE_SELECTOR='level_zero:*' ZE_DEBUG=4 %GPU_RUN_PLACEHOLDER %t.out wait 2>&1 %GPU_CHECK_PLACEHOLDER +// RUN: env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 SYCL_PI_LEVEL_ZERO_BATCH_SIZE=4 ONEAPI_DEVICE_SELECTOR='level_zero:*' ZE_DEBUG=4 %GPU_RUN_PLACEHOLDER %t.out nowait 2>&1 %GPU_CHECK_PLACEHOLDER // // CHECK-NOT: LEAK // diff --git 
a/sycl/test-e2e/Plugin/level_zero_imm_cmdlist.cpp b/sycl/test-e2e/Plugin/level_zero_imm_cmdlist.cpp index 07c77e57c44a6..56905e699b444 100755 --- a/sycl/test-e2e/Plugin/level_zero_imm_cmdlist.cpp +++ b/sycl/test-e2e/Plugin/level_zero_imm_cmdlist.cpp @@ -1,12 +1,13 @@ -// REQUIRES: level_zero, level_zero_dev_kit +// REQUIRES: linux, gpu-intel-pvc, level_zero, level_zero_dev_kit // // RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %level_zero_options %s -o %t.out -// RUN: env ONEAPI_DEVICE_SELECTOR="level_zero:*" ZE_DEBUG=1 SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=1 SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1 %GPU_CHECK_PLACEHOLDER +// RUN: env ONEAPI_DEVICE_SELECTOR="level_zero:*" ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1 %GPU_CHECK_PLACEHOLDER // // CHECK-NOT: zeCommandListCreate( // CHECK: zeCommandListCreateImmediate( -// The test checks that immediate commandlists are used and not regular ones. +// This test checks that immediate commandlists are used and not regular ones on +// PVC Linux. #include