@@ -318,12 +318,13 @@ pi_result _pi_context::decrementUnreleasedEventsInPool(pi_event Event) {
318318}
319319
320320// Forward declarations
321- static pi_result
322- enqueueMemCopyHelper (pi_command_type CommandType, pi_queue Queue, void *Dst,
323- pi_bool BlockingWrite, size_t Size, const void *Src,
324- pi_uint32 NumEventsInWaitList,
325- const pi_event *EventWaitList, pi_event *Event,
326- bool PreferCopyEngine = false );
321+ static pi_result enqueueMemCopyHelper (pi_command_type CommandType,
322+ pi_queue Queue, void *Dst,
323+ pi_bool BlockingWrite, size_t Size,
324+ const void *Src,
325+ pi_uint32 NumEventsInWaitList,
326+ const pi_event *EventWaitList,
327+ pi_event *Event, bool PreferCopyEngine);
327328
328329static pi_result enqueueMemCopyRectHelper (
329330 pi_command_type CommandType, pi_queue Queue, const void *SrcBuffer,
@@ -577,19 +578,30 @@ pi_result _pi_context::initialize() {
577578 createUSMAllocators (SingleRootDevice);
578579 }
579580
580- // Create the immediate command list to be used for initializations
581+ // Create the immediate command list to be used for initializations.
581582 // Created as synchronous so level-zero performs implicit synchronization and
582583 // there is no need to query for completion in the plugin
583584 //
584- // TODO: get rid of using Devices[0] for the context with multiple
585- // root-devices. We should somehow make the data initialized on all devices.
585+ // TODO: we use Devices[0] here as the single immediate command-list
586+ // for buffer creation and migration. Initialization is
587+ // in sync and is always performed to Devices[0] as well, but
588+ // D2D migration, if no P2P, is broken since it should use the
589+ // immediate command-list for the specific devices, and not this single one.
590+ //
586591 pi_device Device = SingleRootDevice ? SingleRootDevice : Devices[0 ];
587592
588- // NOTE: we always submit to the "0" index compute engine with immediate
589- // command list since this is one for context .
593+ // Prefer to use copy engine for initialization copies,
594+ // if available and allowed (main copy engine with index 0) .
590595 ZeStruct<ze_command_queue_desc_t > ZeCommandQueueDesc;
596+ const auto &Range = getRangeOfAllowedCopyEngines ((zer_device_handle_t )Device);
591597 ZeCommandQueueDesc.ordinal =
592598 Device->QueueGroup [_pi_device::queue_group_info_t ::Compute].ZeOrdinal ;
599+ if (Range.first >= 0 &&
600+ Device->QueueGroup [_pi_device::queue_group_info_t ::MainCopy].ZeOrdinal !=
601+ -1 )
602+ ZeCommandQueueDesc.ordinal =
603+ Device->QueueGroup [_pi_device::queue_group_info_t ::MainCopy].ZeOrdinal ;
604+
593605 ZeCommandQueueDesc.index = 0 ;
594606 ZeCommandQueueDesc.mode = ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS;
595607 ZE_CALL (
@@ -5646,7 +5658,8 @@ pi_result piEnqueueMemBufferRead(pi_queue Queue, pi_mem Src,
56465658 PI_CALL (Src->getZeHandle (ZeHandleSrc, _pi_mem::read_only, Queue->Device ));
56475659 return enqueueMemCopyHelper (PI_COMMAND_TYPE_MEM_BUFFER_READ, Queue, Dst,
56485660 BlockingRead, Size, ZeHandleSrc + Offset,
5649- NumEventsInWaitList, EventWaitList, Event);
5661+ NumEventsInWaitList, EventWaitList, Event,
5662+ /* PreferCopyEngine */ true );
56505663}
56515664
56525665pi_result piEnqueueMemBufferReadRect (
@@ -5913,7 +5926,8 @@ pi_result piEnqueueMemBufferWrite(pi_queue Queue, pi_mem Buffer,
59135926 ZeHandleDst + Offset, // dst
59145927 BlockingWrite, Size,
59155928 Ptr, // src
5916- NumEventsInWaitList, EventWaitList, Event);
5929+ NumEventsInWaitList, EventWaitList, Event,
5930+ /* PreferCopyEngine */ true );
59175931}
59185932
59195933pi_result piEnqueueMemBufferWriteRect (
0 commit comments