@@ -1916,42 +1916,59 @@ pi_result piMemBufferCreate(pi_context Context, pi_mem_flags Flags, size_t Size,
19161916 assert (RetMem);
19171917
19181918 void *Ptr;
1919+ ze_device_handle_t ZeDevice = Context->Devices [0 ]->ZeDevice ;
1920+
1921+ // We treat integrated devices (physical memory shared with the CPU)
1922+ // differently from discrete devices (those with distinct memories).
1923+ // For integrated devices, allocating the buffer in host shared memory
1924+ // enables automatic access from the device, and makes copying
1925+ // unnecessary in the map/unmap operations. This improves performance.
1926+ bool DeviceIsIntegrated = Context->Devices .size () == 1 &&
1927+ Context->Devices [0 ]->ZeDeviceProperties .flags &
1928+ ZE_DEVICE_PROPERTY_FLAG_INTEGRATED;
1929+
1930+ if (DeviceIsIntegrated) {
1931+ ze_host_mem_alloc_desc_t ZeDesc = {};
1932+ ZeDesc.flags = 0 ;
19191933
1920- ze_device_mem_alloc_desc_t ZeDeviceMemDesc = {};
1921- ZeDeviceMemDesc.flags = 0 ;
1922- ZeDeviceMemDesc.ordinal = 0 ;
1934+ ZE_CALL (zeMemAllocHost (Context->ZeContext , &ZeDesc, Size, 1 , &Ptr));
19231935
1924- if (Context->Devices .size () == 1 ) {
1925- ZE_CALL (zeMemAllocDevice (Context->ZeContext , &ZeDeviceMemDesc, Size,
1926- 1 , // TODO: alignment
1927- Context->Devices [0 ]->ZeDevice , &Ptr));
1928- } else {
1929- ze_host_mem_alloc_desc_t ZeHostMemDesc = {};
1930- ZeHostMemDesc.flags = 0 ;
1931- ZE_CALL (zeMemAllocShared (Context->ZeContext , &ZeDeviceMemDesc,
1932- &ZeHostMemDesc, Size,
1933- 1 , // TODO: alignment
1934- nullptr , // not bound to any device
1935- &Ptr));
1936- }
1937-
1938- if ((Flags & PI_MEM_FLAGS_HOST_PTR_USE) != 0 ||
1939- (Flags & PI_MEM_FLAGS_HOST_PTR_COPY) != 0 ) {
1940- // Initialize the buffer synchronously with immediate offload
1941- ZE_CALL (zeCommandListAppendMemoryCopy (Context->ZeCommandListInit , Ptr,
1942- HostPtr, Size, nullptr , 0 , nullptr ));
1943- } else if (Flags == 0 || (Flags == PI_MEM_FLAGS_ACCESS_RW)) {
1944- // Nothing more to do.
19451936 } else {
1946- die (" piMemBufferCreate: not implemented" );
1937+ ze_device_mem_alloc_desc_t ZeDesc = {};
1938+ ZeDesc.flags = 0 ;
1939+ ZeDesc.ordinal = 0 ;
1940+
1941+ ZE_CALL (
1942+ zeMemAllocDevice (Context->ZeContext , &ZeDesc, Size, 1 , ZeDevice, &Ptr));
1943+ }
1944+ if (HostPtr) {
1945+ if ((Flags & PI_MEM_FLAGS_HOST_PTR_USE) != 0 ||
1946+ (Flags & PI_MEM_FLAGS_HOST_PTR_COPY) != 0 ) {
1947+ // Initialize the buffer with user data
1948+ if (DeviceIsIntegrated) {
1949+ // Do a host to host copy
1950+ memcpy (Ptr, HostPtr, Size);
1951+ } else {
1952+
1953+ // Initialize the buffer synchronously with immediate offload
1954+ ZE_CALL (zeCommandListAppendMemoryCopy (Context->ZeCommandListInit , Ptr,
1955+ HostPtr, Size, nullptr , 0 ,
1956+ nullptr ));
1957+ }
1958+ } else if (Flags == 0 || (Flags == PI_MEM_FLAGS_ACCESS_RW)) {
1959+ // Nothing more to do.
1960+ } else {
1961+ die (" piMemBufferCreate: not implemented" );
1962+ }
19471963 }
19481964
19491965 auto HostPtrOrNull =
19501966 (Flags & PI_MEM_FLAGS_HOST_PTR_USE) ? pi_cast<char *>(HostPtr) : nullptr ;
19511967 try {
19521968 *RetMem = new _pi_buffer (
19531969 Context, pi_cast<char *>(Ptr) /* Level Zero Memory Handle */ ,
1954- HostPtrOrNull);
1970+ HostPtrOrNull, nullptr , 0 , 0 ,
1971+ DeviceIsIntegrated /* Flag indicating allocation in host memory */ );
19551972 } catch (const std::bad_alloc &) {
19561973 return PI_OUT_OF_HOST_MEMORY;
19571974 } catch (...) {
@@ -4031,17 +4048,11 @@ piEnqueueMemBufferMap(pi_queue Queue, pi_mem Buffer, pi_bool BlockingMap,
40314048 assert (Buffer);
40324049 assert (Queue);
40334050
4034- // Lock automatically releases when this goes out of scope.
4035- std::lock_guard<std::mutex> lock (Queue->PiQueueMutex );
4036-
4037- // Get a new command list to be used on this call
4051+ // For discrete devices we don't need a commandlist
40384052 ze_command_list_handle_t ZeCommandList = nullptr ;
40394053 ze_fence_handle_t ZeFence = nullptr ;
4040- if (auto Res = Queue->Device ->getAvailableCommandList (Queue, &ZeCommandList,
4041- &ZeFence))
4042- return Res;
4043-
40444054 ze_event_handle_t ZeEvent = nullptr ;
4055+
40454056 if (Event) {
40464057 auto Res = piEventCreate (Queue->Context , Event);
40474058 if (Res != PI_SUCCESS)
@@ -4054,38 +4065,64 @@ piEnqueueMemBufferMap(pi_queue Queue, pi_mem Buffer, pi_bool BlockingMap,
40544065 ZeEvent = (*Event)->ZeEvent ;
40554066 }
40564067
4068+ // TODO: Level Zero is missing the memory "mapping" capabilities, so we are
4069+ // left to doing new memory allocation and a copy (read) on discrete devices.
4070+ // On integrated devices we have allocated the buffer in host memory
4071+ // so no actions are needed here except for synchronizing on incoming events
4072+ // and doing a host-to-host copy if a host pointer had been supplied
4073+ // during buffer creation.
4074+ //
4075+ // TODO: for discrete, check if the input buffer is already allocated
4076+ // in shared memory and thus is accessible from the host as is.
4077+ // Can we get SYCL RT to predict/allocate in shared memory
4078+ // from the beginning?
4079+ //
4080+ // On integrated devices the buffer has been allocated in host memory.
4081+ if (Buffer->OnHost ) {
4082+ // Wait on incoming events before doing the copy
4083+ piEventsWait (NumEventsInWaitList, EventWaitList);
4084+ if (Buffer->MapHostPtr ) {
4085+ *RetMap = Buffer->MapHostPtr + Offset;
4086+ memcpy (*RetMap, pi_cast<char *>(Buffer->getZeHandle ()) + Offset, Size);
4087+ } else {
4088+ *RetMap = pi_cast<char *>(Buffer->getZeHandle ()) + Offset;
4089+ }
4090+
4091+ // Signal this event
4092+ ZE_CALL (zeEventHostSignal (ZeEvent));
4093+
4094+ return Buffer->addMapping (*RetMap, Offset, Size);
4095+ }
4096+
4097+ // Lock automatically releases when this goes out of scope.
4098+ std::lock_guard<std::mutex> lock (Queue->PiQueueMutex );
4099+
4100+ // For discrete devices we need a command list
4101+ if (auto Res = Queue->Device ->getAvailableCommandList (Queue, &ZeCommandList,
4102+ &ZeFence))
4103+ return Res;
4104+
4105+ // Set the commandlist in the event
4106+ if (Event) {
4107+ (*Event)->ZeCommandList = ZeCommandList;
4108+ }
4109+
40574110 ze_event_handle_t *ZeEventWaitList =
40584111 _pi_event::createZeEventList (NumEventsInWaitList, EventWaitList);
40594112
4060- ZE_CALL (zeCommandListAppendWaitOnEvents (ZeCommandList, NumEventsInWaitList,
4061- ZeEventWaitList));
4062-
4063- // TODO: Level Zero is missing the memory "mapping" capabilities, so we are
4064- // left to doing new memory allocation and a copy (read).
4065- //
4066- // TODO: check if the input buffer is already allocated in shared
4067- // memory and thus is accessible from the host as is. Can we get SYCL RT
4068- // to predict/allocate in shared memory from the beginning?
40694113 if (Buffer->MapHostPtr ) {
4070- // NOTE: borrowing below semantics from OpenCL as SYCL RT relies on it.
4071- // It is also better for performance.
4072- //
4073- // "If the buffer object is created with CL_MEM_USE_HOST_PTR set in
4074- // mem_flags, the following will be true:
4075- // - The host_ptr specified in clCreateBuffer is guaranteed to contain the
4076- // latest bits in the region being mapped when the clEnqueueMapBuffer
4077- // command has completed.
4078- // - The pointer value returned by clEnqueueMapBuffer will be derived from
4079- // the host_ptr specified when the buffer object is created."
40804114 *RetMap = Buffer->MapHostPtr + Offset;
40814115 } else {
40824116 ze_host_mem_alloc_desc_t ZeDesc = {};
40834117 ZeDesc.flags = 0 ;
4084- ZE_CALL ( zeMemAllocHost (Queue-> Context -> ZeContext , &ZeDesc, Size,
4085- 1 , // TODO: alignment
4086- RetMap));
4118+
4119+ ZE_CALL (
4120+ zeMemAllocHost (Queue-> Context -> ZeContext , &ZeDesc, Size, 1 , RetMap));
40874121 }
40884122
4123+ ZE_CALL (zeCommandListAppendWaitOnEvents (ZeCommandList, NumEventsInWaitList,
4124+ ZeEventWaitList));
4125+
40894126 ZE_CALL (zeCommandListAppendMemoryCopy (
40904127 ZeCommandList, *RetMap, pi_cast<char *>(Buffer->getZeHandle ()) + Offset,
40914128 Size, ZeEvent, 0 , nullptr ));
@@ -4103,15 +4140,10 @@ pi_result piEnqueueMemUnmap(pi_queue Queue, pi_mem MemObj, void *MappedPtr,
41034140 const pi_event *EventWaitList, pi_event *Event) {
41044141 assert (Queue);
41054142
4106- // Lock automatically releases when this goes out of scope.
4107- std::lock_guard<std::mutex> lock (Queue->PiQueueMutex );
4108-
4109- // Get a new command list to be used on this call
4143+ // Integrated devices don't need a command list.
4144+ // If discrete we will get a commandlist later.
41104145 ze_command_list_handle_t ZeCommandList = nullptr ;
41114146 ze_fence_handle_t ZeFence = nullptr ;
4112- if (auto Res = Queue->Device ->getAvailableCommandList (Queue, &ZeCommandList,
4113- &ZeFence))
4114- return Res;
41154147
41164148 // TODO: handle the case when user does not care to follow the event
41174149 // of unmap completion.
@@ -4130,6 +4162,46 @@ pi_result piEnqueueMemUnmap(pi_queue Queue, pi_mem MemObj, void *MappedPtr,
41304162 ZeEvent = (*Event)->ZeEvent ;
41314163 }
41324164
4165+ _pi_mem::Mapping MapInfo = {};
4166+ if (pi_result Res = MemObj->removeMapping (MappedPtr, MapInfo))
4167+ return Res;
4168+
4169+ // NOTE: we still have to free the host memory allocated/returned by
4170+ // piEnqueueMemBufferMap, but can only do so after the above copy
4171+ // is completed. Instead of waiting for it here (blocking), we shall
4172+ // do so in piEventRelease called for the pi_event tracking the unmap.
4173+ // In the case of an integrated device, the map operation does not allocate
4174+ // any memory, so there is nothing to free. This is indicated by a nullptr.
4175+ if (Event)
4176+ (*Event)->CommandData =
4177+ (MemObj->OnHost ? nullptr : (MemObj->MapHostPtr ? nullptr : MappedPtr));
4178+
4179+ // On integrated devices the buffer is allocated in host memory.
4180+ if (MemObj->OnHost ) {
4181+ // Wait on incoming events before doing the copy
4182+ piEventsWait (NumEventsInWaitList, EventWaitList);
4183+ if (MemObj->MapHostPtr )
4184+ memcpy (pi_cast<char *>(MemObj->getZeHandle ()) + MapInfo.Offset , MappedPtr,
4185+ MapInfo.Size );
4186+
4187+ // Signal this event
4188+ ZE_CALL (zeEventHostSignal (ZeEvent));
4189+
4190+ return PI_SUCCESS;
4191+ }
4192+
4193+ // Lock automatically releases when this goes out of scope.
4194+ std::lock_guard<std::mutex> lock (Queue->PiQueueMutex );
4195+
4196+ if (auto Res = Queue->Device ->getAvailableCommandList (Queue, &ZeCommandList,
4197+ &ZeFence))
4198+ return Res;
4199+
4200+ // Set the commandlist in the event
4201+ if (Event) {
4202+ (*Event)->ZeCommandList = ZeCommandList;
4203+ }
4204+
41334205 ze_event_handle_t *ZeEventWaitList =
41344206 _pi_event::createZeEventList (NumEventsInWaitList, EventWaitList);
41354207
@@ -4141,21 +4213,11 @@ pi_result piEnqueueMemUnmap(pi_queue Queue, pi_mem MemObj, void *MappedPtr,
41414213 //
41424214 // NOTE: Keep this in sync with the implementation of
41434215 // piEnqueueMemBufferMap/piEnqueueMemImageMap.
4144- _pi_mem::Mapping MapInfo = {};
4145- if (pi_result Res = MemObj->removeMapping (MappedPtr, MapInfo))
4146- return Res;
41474216
41484217 ZE_CALL (zeCommandListAppendMemoryCopy (
41494218 ZeCommandList, pi_cast<char *>(MemObj->getZeHandle ()) + MapInfo.Offset ,
41504219 MappedPtr, MapInfo.Size , ZeEvent, 0 , nullptr ));
41514220
4152- // NOTE: we still have to free the host memory allocated/returned by
4153- // piEnqueueMemBufferMap, but can only do so after the above copy
4154- // is completed. Instead of waiting for It here (blocking), we shall
4155- // do so in piEventRelease called for the pi_event tracking the unmap.
4156- if (Event)
4157- (*Event)->CommandData = MemObj->MapHostPtr ? nullptr : MappedPtr;
4158-
41594221 // Execute command list asynchronously, as the event will be used
41604222 // to track down its completion.
41614223 if (auto Res = Queue->executeCommandList (ZeCommandList, ZeFence))
0 commit comments