@@ -184,6 +184,14 @@ static void zePrint(const char *Format, ...) {
184184 }
185185}
186186
187+ // Controls whether device-scope events are used.
// ZeAllHostVisibleEvents is true when the environment variable read below is
// unset, or when its value converts to 0 with std::atoi (which also yields 0
// for non-numeric text); any other value makes it false. When it is false,
// piEventCreate gives events device scope and host-side waits go through
// host-visible proxy events instead.
// The lambda is invoked immediately, so the environment is read once, when
// this variable is initialized.
188+ static const bool ZeAllHostVisibleEvents = [] {
189+ const auto DeviceEventsStr =
190+ std::getenv (" SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS" );
// Unset variable => default to making all events host-visible.
191+ bool result = (DeviceEventsStr ? (std::atoi (DeviceEventsStr) == 0 ) : true );
192+ return result;
193+ }();
194+
187195// Helper function to implement zeHostSynchronize.
188196// The behavior is to avoid infinite wait during host sync under ZE_DEBUG.
189197// This allows for a much more responsive debugging of hangs.
@@ -382,8 +390,8 @@ pi_result _pi_mem::removeMapping(void *MappedTo, Mapping &MapInfo) {
382390}
383391
384392pi_result
385- _pi_context::getFreeSlotInExistingOrNewPool (ze_event_pool_handle_t &ZePool ,
386- size_t &Index) {
393+ _pi_context::getFreeSlotInExistingOrNewPool (ze_event_pool_handle_t &Pool ,
394+ size_t &Index, bool HostVisible ) {
387395 // Maximum number of events that can be present in an event ZePool is captured
388396 // here. Setting it to 256 gave best possible performance for several
389397 // benchmarks.
@@ -399,10 +407,23 @@ _pi_context::getFreeSlotInExistingOrNewPool(ze_event_pool_handle_t &ZePool,
399407 return PI_INVALID_VALUE;
400408 }
401409
410+ // Setup for host-visible pool as needed.
411+ ze_event_pool_flag_t ZePoolFlag = {};
412+ ze_event_pool_handle_t *ZePool = [&] {
413+ if (ZeAllHostVisibleEvents) {
414+ ZePoolFlag = ZE_EVENT_POOL_FLAG_HOST_VISIBLE;
415+ return &ZeEventPool;
416+ } else if (HostVisible) {
417+ ZePoolFlag = ZE_EVENT_POOL_FLAG_HOST_VISIBLE;
418+ return &ZeHostVisibleEventPool;
419+ } else {
420+ return &ZeEventPool;
421+ }
422+ }();
423+
402424 Index = 0 ;
403425 // Create one event ZePool per MaxNumEventsPerPool events
404- if ((ZeEventPool == nullptr ) ||
405- (NumEventsAvailableInEventPool[ZeEventPool] == 0 )) {
426+ if ((*ZePool == nullptr ) || (NumEventsAvailableInEventPool[*ZePool] == 0 )) {
406427 // Creation of the new ZePool with record in NumEventsAvailableInEventPool
407428 // and initialization of the record in NumEventsUnreleasedInEventPool must
408429 // be done atomically. Otherwise it is possible that
@@ -417,34 +438,28 @@ _pi_context::getFreeSlotInExistingOrNewPool(ze_event_pool_handle_t &ZePool,
417438
418439 ZeStruct<ze_event_pool_desc_t > ZeEventPoolDesc;
419440 ZeEventPoolDesc.count = MaxNumEventsPerPool;
420-
421- // Make all events visible on the host.
422- // TODO: events that are used only on device side APIs can be optimized
423- // to not be from the host-visible pool.
424- //
425- ZeEventPoolDesc.flags =
426- ZE_EVENT_POOL_FLAG_HOST_VISIBLE | ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP;
441+ ZeEventPoolDesc.flags = ZePoolFlag | ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP;
427442
428443 std::vector<ze_device_handle_t > ZeDevices;
429444 std::for_each (Devices.begin (), Devices.end (),
430445 [&](pi_device &D) { ZeDevices.push_back (D->ZeDevice ); });
431446
432447 ZE_CALL (zeEventPoolCreate, (ZeContext, &ZeEventPoolDesc, ZeDevices.size (),
433- &ZeDevices[0 ], &ZeEventPool ));
434- NumEventsAvailableInEventPool[ZeEventPool ] = MaxNumEventsPerPool - 1 ;
435- NumEventsUnreleasedInEventPool[ZeEventPool ] = MaxNumEventsPerPool;
448+ &ZeDevices[0 ], ZePool ));
449+ NumEventsAvailableInEventPool[*ZePool ] = MaxNumEventsPerPool - 1 ;
450+ NumEventsUnreleasedInEventPool[*ZePool ] = MaxNumEventsPerPool;
436451 } else {
437452 std::lock_guard<std::mutex> NumEventsAvailableInEventPoolGuard (
438453 NumEventsAvailableInEventPoolMutex);
439- Index = MaxNumEventsPerPool - NumEventsAvailableInEventPool[ZeEventPool ];
440- --NumEventsAvailableInEventPool[ZeEventPool ];
454+ Index = MaxNumEventsPerPool - NumEventsAvailableInEventPool[*ZePool ];
455+ --NumEventsAvailableInEventPool[*ZePool ];
441456 }
442- ZePool = ZeEventPool ;
457+ Pool = *ZePool ;
443458 return PI_SUCCESS;
444459}
445460
446- pi_result _pi_context::decrementUnreleasedEventsInPool (pi_event Event) {
447- ze_event_pool_handle_t ZePool = Event-> ZeEventPool ;
461+ pi_result
462+ _pi_context::decrementUnreleasedEventsInPool ( ze_event_pool_handle_t & ZePool) {
448463 if (!ZePool) {
449464 // This must be an interop event created on a user's pool.
450465 // Do nothing.
@@ -463,9 +478,9 @@ pi_result _pi_context::decrementUnreleasedEventsInPool(pi_event Event) {
463478 // multiple pi_context::ZeEventPool can be created if all slots in the pool
464479 // are already used up. So pi_context::ZeEventPool may point to a
465480 // different EventPool than the pool being released here.
466- if (ZeEventPool == Event-> ZeEventPool )
481+ if (ZeEventPool == ZePool )
467482 ZeEventPool = nullptr ;
468- Event-> ZeEventPool = nullptr ;
483+ ZePool = nullptr ;
469484 }
470485 return PI_SUCCESS;
471486}
@@ -764,6 +779,8 @@ pi_result _pi_context::finalize() {
764779 NumEventsUnreleasedInEventPoolMutex);
765780 if (ZeEventPool)
766781 ZE_CALL (zeEventPoolDestroy, (ZeEventPool));
782+ if (ZeHostVisibleEventPool)
783+ ZE_CALL (zeEventPoolDestroy, (ZeHostVisibleEventPool));
767784
768785 // Destroy the command list used for initializations
769786 ZE_CALL (zeCommandListDestroy, (ZeCommandListInit));
@@ -1134,7 +1151,10 @@ pi_result _pi_queue::executeCommandList(pi_command_list_ptr_t CommandList,
11341151 // therefore that this Queue is idle.
11351152 bool CurrentlyEmpty = this ->LastCommandEvent == nullptr ;
11361153
1137- this ->LastCommandEvent = CommandList->second .EventList .back ();
1154+ // The list can be empty if command-list only contains signals of proxy
1155+ // events.
1156+ if (!CommandList->second .EventList .empty ())
1157+ this ->LastCommandEvent = CommandList->second .EventList .back ();
11381158
11391159 // Batch if allowed to, but don't batch if we know there are no kernels
11401160 // from this queue that are currently executing. This is intended to get
@@ -1329,7 +1349,9 @@ pi_result _pi_ze_event_list_t::createAndRetainPiZeEventList(
13291349 PI_ASSERT (EventList[I] != nullptr , PI_INVALID_VALUE);
13301350 auto ZeEvent = EventList[I]->ZeEvent ;
13311351
1332- if (FilterEventWaitList) {
1352+ // Avoid polling of the device-scope events.
1353+ // TODO: be more fine-grained and check individual events.
1354+ if (FilterEventWaitList && ZeAllHostVisibleEvents) {
13331355 auto Res = ZE_CALL_NOCHECK (zeEventQueryStatus, (ZeEvent));
13341356 if (Res == ZE_RESULT_SUCCESS) {
13351357 // Event has already completed, don't put it into the list
@@ -1629,6 +1651,8 @@ pi_result piPlatformsGet(pi_uint32 NumEntries, pi_platform *Platforms,
16291651 if (NumPlatforms)
16301652 *NumPlatforms = PiPlatformsCache->size ();
16311653
1654+ zePrint (" Using %s events\n " ,
1655+ ZeAllHostVisibleEvents ? " all host-visible" : " device-only" );
16321656 return PI_SUCCESS;
16331657}
16341658
@@ -4477,6 +4501,74 @@ pi_result piextKernelGetNativeHandle(pi_kernel Kernel,
44774501//
44784502// Events
44794503//
// Returns the Level Zero event handle the host can use for this event:
//  - ZeEvent itself when all events are host-visible;
//  - otherwise the already-created proxy ZeHostVisibleEvent.
// This accessor never creates the proxy. If the proxy does not exist yet,
// die() is called with an error message.
// NOTE(review): the last branch has no return statement, so die() is
// presumably non-returning — confirm.
4504+ ze_event_handle_t _pi_event::getHostVisibleEvent () const {
4505+ if (ZeAllHostVisibleEvents) {
4506+ return ZeEvent;
4507+ } else if (ZeHostVisibleEvent) {
4508+ return ZeHostVisibleEvent;
4509+ } else {
4510+ die (" The host-visible proxy event missing" );
4511+ }
4512+ }
4513+
// Stores in HostVisibleEvent a Level Zero event handle that the host can use
// for this event, creating a host-visible "proxy" event on first use.
//
//  - When all events are host-visible, ZeEvent itself is returned.
//  - When a proxy already exists, it is returned unchanged.
//  - Otherwise a proxy is created from the context's host-visible pool and a
//    wait-on-ZeEvent / signal-proxy command pair is submitted to this event's
//    queue.
//
// Returns PI_SUCCESS, or the error reported by one of the calls made along
// the way.
4514+ pi_result
4515+ _pi_event::getOrCreateHostVisibleEvent (ze_event_handle_t &HostVisibleEvent) {
4516+
4517+ if (ZeAllHostVisibleEvents) {
4518+ HostVisibleEvent = ZeEvent;
4519+ } else if (ZeHostVisibleEvent) {
4520+ HostVisibleEvent = ZeHostVisibleEvent;
4521+ } else {
4522+ size_t Index;
// Local handle for the pool the proxy is allocated from; the name is also
// used by this event's own ZeEventPool field, which this local shadows here.
4523+ ze_event_pool_handle_t ZeEventPool = {};
// The trailing `true` is the HostVisible argument: request a slot in a
// host-visible pool.
4524+ if (auto Res =
4525+ Context->getFreeSlotInExistingOrNewPool (ZeEventPool, Index, true ))
4526+ return Res;
4527+
4528+ // Create a "proxy" host-visible event.
4529+ //
4530+ // TODO: consider creating just single host-visible proxy event to
4531+ // represent multiple device-scope events. E.g. have a host-visible
4532+ // event at the end of each command-list to represent device-scope
4533+ // events from every command in that command-list.
4534+ //
4535+ ZeStruct<ze_event_desc_t > ZeEventDesc;
4536+ ZeEventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST;
4537+ ZeEventDesc.wait = 0 ;
4538+ ZeEventDesc.index = Index;
4539+
4540+ ZE_CALL (zeEventCreate, (ZeEventPool, &ZeEventDesc, &ZeHostVisibleEvent));
// Remember which pool the proxy came from, alongside the proxy handle.
4541+ ZeHostVisibleEventPool = ZeEventPool;
4542+ HostVisibleEvent = ZeHostVisibleEvent;
// NOTE(review): ZeHostVisibleEvent is already set at this point. If any of
// the early returns below is taken, a later call takes the "proxy exists"
// branch above and returns a proxy whose signal was never submitted —
// confirm this is acceptable.
4543+
4544+ // Submit the command(s) signalling the proxy event to the queue.
4545+ // We have to first submit a wait for the device-only event for which this
4546+ // proxy is created.
4547+ //
4548+ // Get a new command list to be used on this call
4549+ {
// NOTE(review): Queue is dereferenced without a null check — confirm that
// every event reaching this branch has an associated queue.
4550+ std::lock_guard<std::mutex> Lock (Queue->PiQueueMutex );
4551+
4552+ // We want to batch these commands to avoid extra submissions (costly)
4553+ bool OkToBatch = true ;
4554+
4555+ pi_command_list_ptr_t CommandList{};
4556+ if (auto Res = Queue->Context ->getAvailableCommandList (Queue, CommandList,
4557+ false , OkToBatch))
4558+ return Res;
4559+
// Order matters: the wait on the device-scope ZeEvent is appended before the
// signal of the proxy.
4560+ ZE_CALL (zeCommandListAppendWaitOnEvents,
4561+ (CommandList->first , 1 , &ZeEvent));
4562+ ZE_CALL (zeCommandListAppendSignalEvent,
4563+ (CommandList->first , ZeHostVisibleEvent));
4564+
4565+ if (auto Res = Queue->executeCommandList (CommandList, false , OkToBatch))
4566+ return Res;
4567+ }
4568+ }
4569+ return PI_SUCCESS;
4570+ }
4571+
44804572pi_result piEventCreate (pi_context Context, pi_event *RetEvent) {
44814573 size_t Index = 0 ;
44824574 ze_event_pool_handle_t ZeEventPool = {};
@@ -4485,12 +4577,21 @@ pi_result piEventCreate(pi_context Context, pi_event *RetEvent) {
44854577
44864578 ze_event_handle_t ZeEvent;
44874579 ZeStruct<ze_event_desc_t > ZeEventDesc;
4488- // We have to set the SIGNAL flag as HOST scope because the
4489- // Level-Zero plugin implementation waits for the events to complete
4490- // on the host.
4491- ZeEventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST;
4492- ZeEventDesc.wait = 0 ;
44934580 ZeEventDesc.index = Index;
4581+ ZeEventDesc.wait = 0 ;
4582+ //
4583+ // Unless all events are host-visible, set the scope to "device". This is
4584+ // sufficient for global device access and peer device access. Events that
4585+ // are waited on the host get special handling, see piEventsWait.
4586+ //
4587+ // TODO: see if "sub-device" (ZE_EVENT_SCOPE_FLAG_SUBDEVICE) can better be
4588+ // used in some circumstances.
4589+ //
4590+ if (ZeAllHostVisibleEvents) {
4591+ ZeEventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST;
4592+ } else {
4593+ ZeEventDesc.signal = ZE_EVENT_SCOPE_FLAG_DEVICE;
4594+ }
44944595
44954596 ZE_CALL (zeEventCreate, (ZeEventPool, &ZeEventDesc, &ZeEvent));
44964597
@@ -4541,13 +4642,18 @@ pi_result piEventGetInfo(pi_event Event, pi_event_info ParamName,
45414642 }
45424643 }
45434644
4645+ // Make sure that we query the host-visible event.
4646+ ze_event_handle_t ZeHostVisibleEvent;
4647+ if (auto Res = Event->getOrCreateHostVisibleEvent (ZeHostVisibleEvent))
4648+ return Res;
4649+
45444650 ze_result_t ZeResult;
4545- ZeResult = ZE_CALL_NOCHECK (zeEventQueryStatus, (Event-> ZeEvent ));
4651+ ZeResult = ZE_CALL_NOCHECK (zeEventQueryStatus, (ZeHostVisibleEvent ));
45464652 if (ZeResult == ZE_RESULT_SUCCESS) {
45474653 return getInfo (ParamValueSize, ParamValue, ParamValueSizeRet,
45484654 pi_int32{CL_COMPLETE}); // Untie from OpenCL
45494655 }
4550- // TODO: We don't know if the status is queueed , submitted or running.
4656+ // TODO: We don't know if the status is queued , submitted or running.
45514657 // For now return "running", as others are unlikely to be of
45524658 // interest.
45534659 return getInfo (ParamValueSize, ParamValue, ParamValueSizeRet,
@@ -4750,6 +4856,17 @@ pi_result piEventsWait(pi_uint32 NumEvents, const pi_event *EventList) {
47504856 return PI_INVALID_EVENT;
47514857 }
47524858
4859+ // Make sure to add all host-visible "proxy" event signals if needed.
4860+ // This ensures that all signalling commands are submitted below and
4861+ // thus proxy events can be waited on without a deadlock.
4862+ //
4863+ for (uint32_t I = 0 ; I < NumEvents; I++) {
4864+ ze_event_handle_t ZeHostVisibleEvent;
4865+ if (auto Res =
4866+ EventList[I]->getOrCreateHostVisibleEvent (ZeHostVisibleEvent))
4867+ return Res;
4868+ }
4869+
47534870 // Submit dependent open command lists for execution, if any
47544871 for (uint32_t I = 0 ; I < NumEvents; I++) {
47554872 auto Queue = EventList[I]->Queue ;
@@ -4765,7 +4882,7 @@ pi_result piEventsWait(pi_uint32 NumEvents, const pi_event *EventList) {
47654882 }
47664883
47674884 for (uint32_t I = 0 ; I < NumEvents; I++) {
4768- ze_event_handle_t ZeEvent = EventList[I]->ZeEvent ;
4885+ ze_event_handle_t ZeEvent = EventList[I]->getHostVisibleEvent () ;
47694886 zePrint (" ZeEvent = %#lx\n " , pi_cast<std::uintptr_t >(ZeEvent));
47704887 ZE_CALL (zeHostSynchronize, (ZeEvent));
47714888
@@ -4831,11 +4948,20 @@ static pi_result EventRelease(pi_event Event, pi_queue LockedQueue) {
48314948 if (Event->OwnZeEvent ) {
48324949 ZE_CALL (zeEventDestroy, (Event->ZeEvent ));
48334950 }
4951+ if (Event->ZeHostVisibleEvent ) {
4952+ ZE_CALL (zeEventDestroy, (Event->ZeHostVisibleEvent ));
4953+ }
48344954
48354955 auto Context = Event->Context ;
4836- if (auto Res = Context->decrementUnreleasedEventsInPool (Event))
4956+ if (auto Res = Context->decrementUnreleasedEventsInPool (Event-> ZeEventPool ))
48374957 return Res;
48384958
4959+ if (Event->ZeHostVisibleEvent ) {
4960+ if (auto Res = Context->decrementUnreleasedEventsInPool (
4961+ Event->ZeHostVisibleEventPool ))
4962+ return Res;
4963+ }
4964+
48394965 // We intentionally incremented the reference counter when an event is
48404966 // created so that we can avoid pi_queue is released before the associated
48414967 // pi_event is released. Here we have to decrement it so pi_queue
0 commit comments