diff --git a/sycl/include/sycl/detail/pi.def b/sycl/include/sycl/detail/pi.def index 2e71783720def..68a6fbc10a707 100644 --- a/sycl/include/sycl/detail/pi.def +++ b/sycl/include/sycl/detail/pi.def @@ -135,6 +135,8 @@ _PI_API(piextUSMGetMemAllocInfo) _PI_API(piextEnqueueReadHostPipe) _PI_API(piextEnqueueWriteHostPipe) +_PI_API(piextGetMemoryConnection) + _PI_API(piextKernelSetArgMemObj) _PI_API(piextKernelSetArgSampler) diff --git a/sycl/include/sycl/detail/pi.h b/sycl/include/sycl/detail/pi.h index 01ece111742d8..4d4c36df1fa02 100644 --- a/sycl/include/sycl/detail/pi.h +++ b/sycl/include/sycl/detail/pi.h @@ -1000,6 +1000,16 @@ typedef enum { PI_MEM_CONTEXT = 0x1106, PI_MEM_SIZE = 0x1102 } _pi_mem_info; using pi_mem_info = _pi_mem_info; +// Represent different memory connections between devices, see +// piextGetMemoryConnection. +typedef enum { + PI_MEMORY_CONNECTION_NONE, // copies must go through host + PI_MEMORY_CONNECTION_MIGRATABLE, // copies must be explicit + PI_MEMORY_CONNECTION_UNIFIED // copies are not needed +} _pi_memory_connection; + +using pi_memory_connection = _pi_memory_connection; + // // Following section contains SYCL RT Plugin Interface (PI) functions. // They are 3 distinct categories: @@ -1112,6 +1122,21 @@ __SYCL_EXPORT pi_result piextGetDeviceFunctionPointer( pi_device device, pi_program program, const char *function_name, pi_uint64 *function_pointer_ret); +/// Returns the type of memory connection between the two devices in the two +/// respective contexts. +/// +/// \param dev1 First device. +/// \param ctx1 First context, must contain dev1. +/// \param dev2 Second device. +/// \param ctx2 Second context, must contain dev2. +/// \return res Type of memory connection supported between the pairs (dev1, +/// ctx1) and (dev2, ctx2). +__SYCL_EXPORT pi_result piextGetMemoryConnection(pi_device dev1, + pi_context ctx1, + pi_device dev2, + pi_context ctx2, + _pi_memory_connection *res); + // // Context // @@ -1226,11 +1251,13 @@ __SYCL_EXPORT pi_result piextQueueCreateWithNativeHandle( // // Memory // -__SYCL_EXPORT pi_result piMemBufferCreate( - pi_context context, pi_mem_flags flags, size_t size, void *host_ptr, - pi_mem *ret_mem, const pi_mem_properties *properties = nullptr); +__SYCL_EXPORT pi_result +piMemBufferCreate(pi_context context, pi_device device, pi_mem_flags flags, + size_t size, void *host_ptr, pi_mem *ret_mem, + const pi_mem_properties *properties = nullptr); -__SYCL_EXPORT pi_result piMemImageCreate(pi_context context, pi_mem_flags flags, +__SYCL_EXPORT pi_result piMemImageCreate(pi_context context, pi_device device, + pi_mem_flags flags, const pi_image_format *image_format, const pi_image_desc *image_desc, void *host_ptr, pi_mem *ret_mem); diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index 1c7a6be525beb..2d885ea9ed33e 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -188,14 +188,6 @@ pi_result check_error(CUresult result, const char *function, int line, /// contexts to be restored by SYCL. 
class ScopedContext { public: - ScopedContext(pi_context ctxt) { - if (!ctxt) { - throw PI_ERROR_INVALID_CONTEXT; - } - - set_context(ctxt->get()); - } - ScopedContext(CUcontext ctxt) { set_context(ctxt); } ~ScopedContext() {} @@ -296,7 +288,7 @@ void guessLocalWorkSize(_pi_device *device, size_t *threadsPerBlock, threadsPerBlock[2] = ((global_work_size[2] - 1) / gridDim[2]) + 1; PI_CHECK_ERROR(cuOccupancyMaxPotentialBlockSize( - &minGrid, &maxBlockSize, kernel->get(), NULL, local_size, + &minGrid, &maxBlockSize, kernel->get(device), NULL, local_size, maxThreadsPerBlock[0])); gridDim[0] = maxBlockSize / (threadsPerBlock[1] * threadsPerBlock[2]); @@ -319,7 +311,7 @@ pi_result enqueueEventsWait(pi_queue command_queue, CUstream stream, return PI_SUCCESS; } try { - ScopedContext active(command_queue->get_context()); + ScopedContext active(command_queue->get_native_context()); auto result = forLatestEvents( event_wait_list, num_events_in_wait_list, @@ -520,12 +512,14 @@ CUstream _pi_queue::get_next_transfer_stream() { return res; } -_pi_event::_pi_event(pi_command_type type, pi_context context, pi_queue queue, - CUstream stream, pi_uint32 stream_token) +_pi_event::_pi_event(pi_command_type type, pi_context context, + CUcontext native_context, pi_queue queue, CUstream stream, + pi_uint32 stream_token) : commandType_{type}, refCount_{1}, has_ownership_{true}, hasBeenWaitedOn_{false}, isRecorded_{false}, isStarted_{false}, streamToken_{stream_token}, evEnd_{nullptr}, evStart_{nullptr}, - evQueued_{nullptr}, queue_{queue}, stream_{stream}, context_{context} { + evQueued_{nullptr}, queue_{queue}, stream_{stream}, context_{context}, + native_context_{native_context} { bool profilingEnabled = queue_->properties_ & PI_QUEUE_FLAG_PROFILING_ENABLE; @@ -687,8 +681,10 @@ pi_result enqueueEventWait(pi_queue queue, pi_event event) { } _pi_program::_pi_program(pi_context ctxt) - : module_{nullptr}, binary_{}, binarySizeInBytes_{0}, refCount_{1}, - context_{ctxt}, kernelReqdWorkGroupSizeMD_{} { + : modules_{ctxt->get_devices().size(), nullptr}, + build_results_{ctxt->get_devices().size(), CUDA_ERROR_UNKNOWN}, + binary_{0}, binarySizeInBytes_{0}, refCount_{1}, context_{ctxt}, + kernelReqdWorkGroupSizeMD_{} { cuda_piContextRetain(context_); } @@ -773,17 +769,41 @@ pi_result _pi_program::build_program(const char *build_options) { options[3] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES; optionVals[3] = (void *)(long)MAX_LOG_SIZE; - auto result = PI_CHECK_ERROR( - cuModuleLoadDataEx(&module_, static_cast(binary_), - numberOfOptions, options, optionVals)); - - const auto success = (result == PI_SUCCESS); + auto devs = get_context()->get_devices(); - buildStatus_ = - success ? PI_PROGRAM_BUILD_STATUS_SUCCESS : PI_PROGRAM_BUILD_STATUS_ERROR; + // we count the build as successful if it succeeds on at least one device + bool success = false; + CUresult res1 = CUDA_ERROR_NO_DEVICE; + for (size_t i = 0; i < devs.size(); i++) { + ScopedContext ctx(devs[i]->get_context()); + res1 = cuModuleLoadDataEx(&modules_[i], static_cast(binary_), + numberOfOptions, options, optionVals); + build_results_[i] = res1; + success |= CUDA_SUCCESS == res1; + } // If no exception, result is correct - return success ? 
PI_SUCCESS : PI_ERROR_BUILD_PROGRAM_FAILURE; + if (success) { + buildStatus_ = PI_PROGRAM_BUILD_STATUS_SUCCESS; + return PI_SUCCESS; + } else { + PI_CHECK_ERROR(res1); + buildStatus_ = PI_PROGRAM_BUILD_STATUS_ERROR; + return PI_ERROR_BUILD_PROGRAM_FAILURE; + } +} + +CUfunction _pi_kernel::get(pi_device device) const noexcept { + size_t i = context_->device_index(device); + PI_CHECK_ERROR(program_->build_results_[i]); + return functions_[i]; +} + +CUfunction +_pi_kernel::get_with_offset_parameter(pi_device device) const noexcept { + size_t i = context_->device_index(device); + PI_CHECK_ERROR(program_->build_results_[i]); + return functionsWithOffsetParam_[i]; } /// Finds kernel names by searching for entry points in the PTX source, as the @@ -810,16 +830,13 @@ pi_result cuda_piDeviceGetInfo(pi_device device, pi_device_info param_name, /// Triggers the CUDA Driver initialization (cuInit) the first time, so this /// must be the first PI API called. /// -/// However because multiple devices in a context is not currently supported, -/// place each device in a separate platform. -/// pi_result cuda_piPlatformsGet(pi_uint32 num_entries, pi_platform *platforms, pi_uint32 *num_platforms) { try { static std::once_flag initFlag; static pi_uint32 numPlatforms = 1; - static std::vector<_pi_platform> platformIds; + static _pi_platform platformId; if (num_entries == 0 && platforms != nullptr) { return PI_ERROR_INVALID_VALUE; @@ -844,13 +861,11 @@ pi_result cuda_piPlatformsGet(pi_uint32 num_entries, pi_platform *platforms, return; } try { - // make one platform per device - numPlatforms = numDevices; - platformIds.resize(numDevices); - + platformId.devices_.reserve(numDevices); for (int i = 0; i < numDevices; ++i) { CUdevice device; err = PI_CHECK_ERROR(cuDeviceGet(&device, i)); + CUcontext context; err = PI_CHECK_ERROR(cuDevicePrimaryCtxRetain(&context, device)); @@ -861,11 +876,11 @@ pi_result cuda_piPlatformsGet(pi_uint32 num_entries, pi_platform *platforms, // Use default stream to record base event counter err = PI_CHECK_ERROR(cuEventRecord(evBase, 0)); - platformIds[i].devices_.emplace_back( - new _pi_device{device, context, evBase, &platformIds[i]}); + platformId.devices_.emplace_back( + new _pi_device{device, context, evBase, &platformId}); { - const auto &dev = platformIds[i].devices_.back().get(); + const auto &dev = platformId.devices_.back().get(); size_t maxWorkGroupSize = 0u; size_t maxThreadsPerBlock[3] = {}; pi_result retError = cuda_piDeviceGetInfo( @@ -886,17 +901,11 @@ pi_result cuda_piPlatformsGet(pi_uint32 num_entries, pi_platform *platforms, } } catch (const std::bad_alloc &) { // Signal out-of-memory situation - for (int i = 0; i < numDevices; ++i) { - platformIds[i].devices_.clear(); - } - platformIds.clear(); + platformId.devices_.clear(); err = PI_ERROR_OUT_OF_HOST_MEMORY; } catch (...) 
{ // Clear and rethrow to allow retry - for (int i = 0; i < numDevices; ++i) { - platformIds[i].devices_.clear(); - } - platformIds.clear(); + platformId.devices_.clear(); throw; } }, @@ -907,9 +916,7 @@ pi_result cuda_piPlatformsGet(pi_uint32 num_entries, pi_platform *platforms, } if (platforms != nullptr) { - for (unsigned i = 0; i < std::min(num_entries, numPlatforms); ++i) { - platforms[i] = &platformIds[i]; - } + *platforms = &platformId; } return err; @@ -998,8 +1005,9 @@ pi_result cuda_piContextGetInfo(pi_context context, pi_context_info param_name, case PI_CONTEXT_INFO_NUM_DEVICES: return getInfo(param_value_size, param_value, param_value_size_ret, 1); case PI_CONTEXT_INFO_DEVICES: - return getInfo(param_value_size, param_value, param_value_size_ret, - context->get_device()); + return getInfoArray(context->get_devices().size(), param_value_size, + param_value, param_value_size_ret, + &context->get_devices()[0]); case PI_CONTEXT_INFO_REFERENCE_COUNT: return getInfo(param_value_size, param_value, param_value_size_ret, context->get_reference_count()); @@ -1083,12 +1091,12 @@ pi_result cuda_piextGetDeviceFunctionPointer(pi_device device, pi_program program, const char *func_name, pi_uint64 *func_pointer_ret) { - // Check if device passed is the same the device bound to the context - assert(device == program->get_context()->get_device()); assert(func_pointer_ret != nullptr); + CUmodule module = + program->get()[program->get_context()->device_index(device)]; CUfunction func; - CUresult ret = cuModuleGetFunction(&func, program->get(), func_name); + CUresult ret = cuModuleGetFunction(&func, module, func_name); *func_pointer_ret = reinterpret_cast(func); pi_result retError = PI_SUCCESS; @@ -2143,19 +2151,18 @@ pi_result cuda_piContextCreate(const pi_context_properties *properties, const void *private_info, size_t cb, void *user_data), void *user_data, pi_context *retcontext) { - assert(devices != nullptr); // TODO: How to implement context callback? assert(pfn_notify == nullptr); assert(user_data == nullptr); - assert(num_devices == 1); // Need input context assert(retcontext != nullptr); pi_result errcode_ret = PI_SUCCESS; std::unique_ptr<_pi_context> piContextPtr{nullptr}; try { - piContextPtr = std::unique_ptr<_pi_context>(new _pi_context{*devices}); + piContextPtr = std::unique_ptr<_pi_context>(new _pi_context( + std::vector(devices, devices + num_devices))); *retcontext = piContextPtr.release(); } catch (pi_result err) { errcode_ret = err; @@ -2186,7 +2193,13 @@ pi_result cuda_piContextRelease(pi_context ctxt) { /// \return PI_SUCCESS pi_result cuda_piextContextGetNativeHandle(pi_context context, pi_native_handle *nativeHandle) { - *nativeHandle = reinterpret_cast(context->get()); + // Currently only support context interop with one device + if (context->get_devices().size() != 1) { + return PI_ERROR_INVALID_CONTEXT; + } + + *nativeHandle = reinterpret_cast( + context->get_devices()[0]->get_context()); return PI_SUCCESS; } @@ -2217,8 +2230,9 @@ pi_result cuda_piextContextCreateWithNativeHandle(pi_native_handle nativeHandle, /// Can trigger a manual copy depending on the mode. 
/// \TODO Implement USE_HOST_PTR using cuHostRegister /// -pi_result cuda_piMemBufferCreate(pi_context context, pi_mem_flags flags, - size_t size, void *host_ptr, pi_mem *ret_mem, +pi_result cuda_piMemBufferCreate(pi_context context, pi_device device, + pi_mem_flags flags, size_t size, + void *host_ptr, pi_mem *ret_mem, const pi_mem_properties *properties) { // Need input memory object assert(ret_mem != nullptr); @@ -2235,7 +2249,7 @@ pi_result cuda_piMemBufferCreate(pi_context context, pi_mem_flags flags, pi_mem retMemObj = nullptr; try { - ScopedContext active(context); + ScopedContext active(context->get(device)); CUdeviceptr ptr; _pi_mem::mem_::buffer_mem_::alloc_mode allocMode = _pi_mem::mem_::buffer_mem_::alloc_mode::classic; @@ -2259,8 +2273,8 @@ pi_result cuda_piMemBufferCreate(pi_context context, pi_mem_flags flags, if (retErr == PI_SUCCESS) { pi_mem parentBuffer = nullptr; - auto piMemObj = std::unique_ptr<_pi_mem>( - new _pi_mem{context, parentBuffer, allocMode, ptr, host_ptr, size}); + auto piMemObj = std::unique_ptr<_pi_mem>(new _pi_mem{ + context, device, parentBuffer, allocMode, ptr, host_ptr, size}); if (piMemObj != nullptr) { retMemObj = piMemObj.release(); if (performInitialCopy) { @@ -2312,7 +2326,7 @@ pi_result cuda_piMemRelease(pi_mem memObj) { return PI_SUCCESS; } - ScopedContext active(uniqueMemObj->get_context()); + ScopedContext active(uniqueMemObj->get_native_context()); if (memObj->mem_type_ == _pi_mem::mem_type::buffer) { switch (uniqueMemObj->mem_.buffer_mem_.allocMode_) { @@ -2385,7 +2399,6 @@ pi_result cuda_piMemBufferPartition(pi_mem parent_buffer, pi_mem_flags flags, parent_buffer->mem_.buffer_mem_.get_size()) && "PI_ERROR_INVALID_BUFFER_SIZE"); // Retained indirectly due to retaining parent buffer below. - pi_context context = parent_buffer->context_; _pi_mem::mem_::buffer_mem_::alloc_mode allocMode = _pi_mem::mem_::buffer_mem_::alloc_mode::classic; @@ -2402,8 +2415,9 @@ pi_result cuda_piMemBufferPartition(pi_mem parent_buffer, pi_mem_flags flags, std::unique_ptr<_pi_mem> retMemObj{nullptr}; try { - retMemObj = std::unique_ptr<_pi_mem>{new _pi_mem{ - context, parent_buffer, allocMode, ptr, hostPtr, bufferRegion.size}}; + retMemObj = std::unique_ptr<_pi_mem>{ + new _pi_mem{parent_buffer->get_context(), parent_buffer->get_device(), + parent_buffer, allocMode, ptr, hostPtr, bufferRegion.size}}; } catch (pi_result err) { *memObj = nullptr; return err; @@ -2463,7 +2477,9 @@ pi_result cuda_piQueueCreate(pi_context context, pi_device device, try { std::unique_ptr<_pi_queue> queueImpl{nullptr}; - if (context->get_device() != device) { + CUcontext native_context = context->get(device); + + if (native_context == nullptr) { *queue = nullptr; return PI_ERROR_INVALID_DEVICE; } @@ -2584,7 +2600,7 @@ pi_result cuda_piQueueRelease(pi_queue command_queue) { if (!command_queue->backend_has_ownership()) return PI_SUCCESS; - ScopedContext active(command_queue->get_context()); + ScopedContext active(command_queue->get_native_context()); command_queue->for_each_stream([](CUstream s) { PI_CHECK_ERROR(cuStreamSynchronize(s)); @@ -2606,8 +2622,8 @@ pi_result cuda_piQueueFinish(pi_queue command_queue) { assert(command_queue != nullptr); // need PI_ERROR_INVALID_EXTERNAL_HANDLE error code - ScopedContext active(command_queue->get_context()); + ScopedContext active(command_queue->get_native_context()); command_queue->sync_streams([&result](CUstream s) { result = PI_CHECK_ERROR(cuStreamSynchronize(s)); }); @@ -2640,7 +2656,7 @@ pi_result cuda_piQueueFlush(pi_queue command_queue) { /// 
\return PI_SUCCESS pi_result cuda_piextQueueGetNativeHandle(pi_queue queue, pi_native_handle *nativeHandle) { - ScopedContext active(queue->get_context()); + ScopedContext active(queue->get_native_context()); *nativeHandle = reinterpret_cast(queue->get_next_compute_stream()); return PI_SUCCESS; @@ -2661,7 +2677,6 @@ pi_result cuda_piextQueueCreateWithNativeHandle(pi_native_handle nativeHandle, pi_device device, bool ownNativeHandle, pi_queue *queue) { - (void)device; (void)ownNativeHandle; assert(ownNativeHandle == false); @@ -2686,7 +2701,7 @@ pi_result cuda_piextQueueCreateWithNativeHandle(pi_native_handle nativeHandle, *queue = new _pi_queue{std::move(computeCuStreams), std::move(transferCuStreams), context, - context->get_device(), + device, properties, flags, /*backend_owns*/ false}; @@ -2709,7 +2724,7 @@ pi_result cuda_piEnqueueMemBufferWrite(pi_queue command_queue, pi_mem buffer, std::unique_ptr<_pi_event> retImplEv{nullptr}; try { - ScopedContext active(command_queue->get_context()); + ScopedContext active(command_queue->get_native_context()); CUstream cuStream = command_queue->get_next_transfer_stream(); retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, @@ -2755,7 +2770,7 @@ pi_result cuda_piEnqueueMemBufferRead(pi_queue command_queue, pi_mem buffer, std::unique_ptr<_pi_event> retImplEv{nullptr}; try { - ScopedContext active(command_queue->get_context()); + ScopedContext active(command_queue->get_native_context()); CUstream cuStream = command_queue->get_next_transfer_stream(); retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, @@ -2801,7 +2816,8 @@ pi_result cuda_piEventsWait(pi_uint32 num_events, const pi_event *event_list) { return PI_ERROR_INVALID_EVENT; } - auto context = event_list[0]->get_context(); + // all events must be in the same context + auto context = event_list[0]->get_native_context(); ScopedContext active(context); auto waitFunc = [context](pi_event event) -> pi_result { @@ -2809,7 +2825,7 @@ pi_result cuda_piEventsWait(pi_uint32 num_events, const pi_event *event_list) { return PI_ERROR_INVALID_EVENT; } - if (event->get_context() != context) { + if (event->get_native_context() != context) { return PI_ERROR_INVALID_CONTEXT; } @@ -2832,26 +2848,40 @@ pi_result cuda_piKernelCreate(pi_program program, const char *kernel_name, std::unique_ptr<_pi_kernel> retKernel{nullptr}; try { - ScopedContext active(program->get_context()); + const auto &modules = program->get(); + std::vector cuFuncs(modules.size()); + std::vector cuFuncsWithOffsetParam(modules.size()); + + for (size_t i = 0; i < modules.size(); i++) { + CUmodule module = modules[i]; + ScopedContext active( + program->get_context()->get_devices()[i]->get_context()); + + if (program->build_results_[i] != CUDA_SUCCESS) { + cuFuncs[i] = nullptr; + cuFuncsWithOffsetParam[i] = nullptr; + continue; + } - CUfunction cuFunc; - retErr = PI_CHECK_ERROR( - cuModuleGetFunction(&cuFunc, program->get(), kernel_name)); + retErr = + PI_CHECK_ERROR(cuModuleGetFunction(&cuFuncs[i], module, kernel_name)); - std::string kernel_name_woffset = std::string(kernel_name) + "_with_offset"; - CUfunction cuFuncWithOffsetParam; - CUresult offsetRes = cuModuleGetFunction( - &cuFuncWithOffsetParam, program->get(), kernel_name_woffset.c_str()); + std::string kernel_name_woffset = + std::string(kernel_name) + "_with_offset"; + CUresult offsetRes = cuModuleGetFunction( + &cuFuncsWithOffsetParam[i], module, kernel_name_woffset.c_str()); - // If there is no kernel with global offset parameter we 
mark it as missing - if (offsetRes == CUDA_ERROR_NOT_FOUND) { - cuFuncWithOffsetParam = nullptr; - } else { - retErr = PI_CHECK_ERROR(offsetRes); + // If there is no kernel with global offset parameter we mark it as + // missing + if (offsetRes == CUDA_ERROR_NOT_FOUND) { + cuFuncsWithOffsetParam[i] = nullptr; + } else { + retErr = PI_CHECK_ERROR(offsetRes); + } } retKernel = std::unique_ptr<_pi_kernel>( - new _pi_kernel{cuFunc, cuFuncWithOffsetParam, kernel_name, program, + new _pi_kernel{cuFuncs, cuFuncsWithOffsetParam, kernel_name, program, program->get_context()}); } catch (pi_result err) { retErr = err; @@ -2945,7 +2975,7 @@ pi_result cuda_piKernelGetGroupInfo(pi_kernel kernel, pi_device device, sycl::detail::pi::assertion( cuFuncGetAttribute(&max_threads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, - kernel->get()) == CUDA_SUCCESS); + kernel->get(device)) == CUDA_SUCCESS); return getInfo(param_value_size, param_value, param_value_size_ret, size_t(max_threads)); } @@ -2968,7 +2998,7 @@ pi_result cuda_piKernelGetGroupInfo(pi_kernel kernel, pi_device device, int bytes = 0; sycl::detail::pi::assertion( cuFuncGetAttribute(&bytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, - kernel->get()) == CUDA_SUCCESS); + kernel->get(device)) == CUDA_SUCCESS); return getInfo(param_value_size, param_value, param_value_size_ret, pi_uint64(bytes)); } @@ -2986,7 +3016,7 @@ pi_result cuda_piKernelGetGroupInfo(pi_kernel kernel, pi_device device, int bytes = 0; sycl::detail::pi::assertion( cuFuncGetAttribute(&bytes, CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, - kernel->get()) == CUDA_SUCCESS); + kernel->get(device)) == CUDA_SUCCESS); return getInfo(param_value_size, param_value, param_value_size_ret, pi_uint64(bytes)); } @@ -2994,7 +3024,7 @@ pi_result cuda_piKernelGetGroupInfo(pi_kernel kernel, pi_device device, int numRegs = 0; sycl::detail::pi::assertion( cuFuncGetAttribute(&numRegs, CU_FUNC_ATTRIBUTE_NUM_REGS, - kernel->get()) == CUDA_SUCCESS); + kernel->get(device)) == CUDA_SUCCESS); return getInfo(param_value_size, param_value, param_value_size_ret, pi_uint32(numRegs)); } @@ -3036,9 +3066,17 @@ pi_result cuda_piEnqueueKernelLaunch( try { // Set the active context here as guessLocalWorkSize needs an active context - ScopedContext active(command_queue->get_context()); + ScopedContext active(command_queue->get_native_context()); { - size_t *reqdThreadsPerBlock = kernel->reqdThreadsPerBlock_; + int dev_idx = 0; + auto devices = command_queue->context_->get_devices(); + for (size_t i = 0; i < devices.size(); i++) { + if (devices[i] == command_queue->device_) { + dev_idx = i; + break; + } + } + size_t *reqdThreadsPerBlock = &kernel->reqdThreadsPerBlock_[dev_idx][0]; maxWorkGroupSize = command_queue->device_->get_max_work_group_size(); command_queue->device_->get_max_work_item_sizes( sizeof(maxThreadsPerBlock), maxThreadsPerBlock); @@ -3068,7 +3106,7 @@ pi_result cuda_piEnqueueKernelLaunch( return err; } } else { - guessLocalWorkSize(command_queue->device_, threadsPerBlock, + guessLocalWorkSize(command_queue->get_device(), threadsPerBlock, global_work_size, maxThreadsPerBlock, kernel, local_size); } @@ -3092,20 +3130,20 @@ pi_result cuda_piEnqueueKernelLaunch( _pi_stream_guard guard; CUstream cuStream = command_queue->get_next_compute_stream( num_events_in_wait_list, event_wait_list, guard, &stream_token); - CUfunction cuFunc = kernel->get(); + CUfunction cuFunc = kernel->get(command_queue->device_); retError = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, event_wait_list); // Set the implicit global 
offset parameter if kernel has offset variant - if (kernel->get_with_offset_parameter()) { + if (kernel->get_with_offset_parameter(command_queue->device_)) { std::uint32_t cuda_implicit_offset[3] = {0, 0, 0}; if (global_work_offset) { for (size_t i = 0; i < work_dim; i++) { cuda_implicit_offset[i] = static_cast(global_work_offset[i]); if (global_work_offset[i] != 0) { - cuFunc = kernel->get_with_offset_parameter(); + cuFunc = kernel->get_with_offset_parameter(command_queue->device_); } } } @@ -3148,6 +3186,7 @@ pi_result cuda_piEnqueueKernelLaunch( cuFunc, blocksPerGrid[0], blocksPerGrid[1], blocksPerGrid[2], threadsPerBlock[0], threadsPerBlock[1], threadsPerBlock[2], local_size, cuStream, const_cast(argIndices.data()), nullptr)); + if (local_size != 0) kernel->clear_local_size(); @@ -3177,7 +3216,8 @@ pi_result cuda_piextKernelCreateWithNativeHandle(pi_native_handle, pi_context, } /// \TODO Not implemented -pi_result cuda_piMemImageCreate(pi_context context, pi_mem_flags flags, +pi_result cuda_piMemImageCreate(pi_context context, pi_device device, + pi_mem_flags flags, const pi_image_format *image_format, const pi_image_desc *image_desc, void *host_ptr, pi_mem *ret_mem) { @@ -3262,7 +3302,7 @@ pi_result cuda_piMemImageCreate(pi_context context, pi_mem_flags flags, size_t image_size_bytes = pixel_size_bytes * image_desc->image_width * image_desc->image_height * image_desc->image_depth; - ScopedContext active(context); + ScopedContext active(context->get(device)); CUarray image_array; retErr = PI_CHECK_ERROR(cuArray3DCreate(&image_array, &array_desc)); @@ -3313,8 +3353,9 @@ pi_result cuda_piMemImageCreate(pi_context context, pi_mem_flags flags, CUsurfObject surface; retErr = PI_CHECK_ERROR(cuSurfObjectCreate(&surface, &image_res_desc)); - auto piMemObj = std::unique_ptr<_pi_mem>(new _pi_mem{ - context, image_array, surface, image_desc->image_type, host_ptr}); + auto piMemObj = std::unique_ptr<_pi_mem>( + new _pi_mem{context, device, image_array, surface, + image_desc->image_type, host_ptr}); if (piMemObj == nullptr) { return PI_ERROR_OUT_OF_HOST_MEMORY; @@ -3373,10 +3414,7 @@ pi_result cuda_piProgramBuild(pi_program program, pi_uint32 num_devices, pi_result retError = PI_SUCCESS; try { - ScopedContext active(program->get_context()); - program->build_program(options); - } catch (pi_result err) { retError = err; } @@ -3393,8 +3431,6 @@ pi_result cuda_piProgramCreate(pi_context, const void *, size_t, pi_program *) { /// Note: No calls to CUDA driver API in this function, only store binaries /// for later. 
/// -/// Note: Only supports one device -/// pi_result cuda_piProgramCreateWithBinary( pi_context context, pi_uint32 num_devices, const pi_device *device_list, const size_t *lengths, const unsigned char **binaries, @@ -3407,10 +3443,18 @@ pi_result cuda_piProgramCreateWithBinary( assert(binaries != nullptr); assert(program != nullptr); assert(device_list != nullptr); - assert(num_devices == 1 && "CUDA contexts are for a single device"); - assert((context->get_device()->get() == device_list[0]->get()) && - "Mismatch between devices context and passed context when creating " - "program from binary"); + for (size_t i = 0; i < num_devices; i++) { + bool found_device = false; + for (pi_device context_device : context->get_devices()) { + if (device_list[i] == context_device) { + found_device = true; + break; + } + } + assert(found_device && + "Mismatch between device's context and passed context when creating " + "program from binary"); + } pi_result retError = PI_SUCCESS; @@ -3447,8 +3491,9 @@ pi_result cuda_piProgramGetInfo(pi_program program, pi_program_info param_name, case PI_PROGRAM_INFO_NUM_DEVICES: return getInfo(param_value_size, param_value, param_value_size_ret, 1u); case PI_PROGRAM_INFO_DEVICES: - return getInfoArray(1, param_value_size, param_value, param_value_size_ret, - &program->context_->deviceId_); + return getInfoArray(program->context_->get_devices().size(), + param_value_size, param_value, param_value_size_ret, + &program->context_->get_devices()[0]); case PI_PROGRAM_INFO_SOURCE: return getInfo(param_value_size, param_value, param_value_size_ret, program->binary_); @@ -3482,14 +3527,15 @@ pi_result cuda_piProgramLink(pi_context context, pi_uint32 num_devices, void *user_data, pi_program *ret_program) { assert(ret_program != nullptr); - assert(num_devices == 1 || num_devices == 0); assert(device_list != nullptr || num_devices == 0); assert(pfn_notify == nullptr); assert(user_data == nullptr); pi_result retError = PI_SUCCESS; try { - ScopedContext active(context); + // We need a context for the linking operations but the result can later be + // used in any other context, so just use the first one. + ScopedContext active(context->get_devices()[0]->get_context()); CUlinkState state; std::unique_ptr<_pi_program> retProgram{new _pi_program{context}}; @@ -3547,7 +3593,6 @@ pi_result cuda_piProgramCompile( (void)input_headers; assert(program != nullptr); - assert(num_devices == 1 || num_devices == 0); assert(device_list != nullptr || num_devices == 0); assert(pfn_notify == nullptr); assert(user_data == nullptr); @@ -3555,10 +3600,7 @@ pi_result cuda_piProgramCompile( pi_result retError = PI_SUCCESS; try { - ScopedContext active(program->get_context()); - program->build_program(options); - } catch (pi_result err) { retError = err; } @@ -3618,9 +3660,14 @@ pi_result cuda_piProgramRelease(pi_program program) { pi_result result = PI_ERROR_INVALID_PROGRAM; try { - ScopedContext active(program->get_context()); - auto cuModule = program->get(); - result = PI_CHECK_ERROR(cuModuleUnload(cuModule)); + const auto &modules = program->get(); + for (size_t i = 0; i < modules.size(); i++) { + if (program->build_results_[i] == CUDA_SUCCESS) { + ScopedContext active( + program->get_context()->get_devices()[i]->get_context()); + result = PI_CHECK_ERROR(cuModuleUnload(modules[i])); + } + } } catch (...) 
{ result = PI_ERROR_OUT_OF_RESOURCES; } @@ -3639,7 +3686,7 @@ pi_result cuda_piProgramRelease(pi_program program) { /// \return TBD pi_result cuda_piextProgramGetNativeHandle(pi_program program, pi_native_handle *nativeHandle) { - *nativeHandle = reinterpret_cast(program->get()); + *nativeHandle = reinterpret_cast(program->get()[0]); return PI_SUCCESS; } @@ -3720,7 +3767,7 @@ pi_result cuda_piKernelGetSubGroupInfo( sycl::detail::pi::assertion( cuFuncGetAttribute(&max_threads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, - kernel->get()) == CUDA_SUCCESS); + kernel->get(device)) == CUDA_SUCCESS); int warpSize = 0; cuda_piKernelGetSubGroupInfo(kernel, device, PI_KERNEL_MAX_SUB_GROUP_SIZE, 0, nullptr, sizeof(uint32_t), &warpSize, @@ -3902,7 +3949,8 @@ pi_result cuda_piEventRelease(pi_event event) { std::unique_ptr<_pi_event> event_ptr{event}; pi_result result = PI_ERROR_INVALID_EVENT; try { - ScopedContext active(event->get_context()); + ScopedContext active( + event->get_context()->get_devices()[0]->get_context()); result = event->release(); } catch (...) { result = PI_ERROR_OUT_OF_RESOURCES; @@ -3951,7 +3999,7 @@ pi_result cuda_piEnqueueEventsWaitWithBarrier(pi_queue command_queue, pi_result result; try { - ScopedContext active(command_queue->get_context()); + ScopedContext active(command_queue->get_native_context()); pi_uint32 stream_token; _pi_stream_guard guard; CUstream cuStream = command_queue->get_next_compute_stream( @@ -4272,7 +4320,7 @@ pi_result cuda_piEnqueueMemBufferReadRect( std::unique_ptr<_pi_event> retImplEv{nullptr}; try { - ScopedContext active(command_queue->get_context()); + ScopedContext active(command_queue->get_native_context()); CUstream cuStream = command_queue->get_next_transfer_stream(); retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, @@ -4323,7 +4371,7 @@ pi_result cuda_piEnqueueMemBufferWriteRect( std::unique_ptr<_pi_event> retImplEv{nullptr}; try { - ScopedContext active(command_queue->get_context()); + ScopedContext active(command_queue->get_native_context()); CUstream cuStream = command_queue->get_next_transfer_stream(); retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, event_wait_list); @@ -4370,7 +4418,7 @@ pi_result cuda_piEnqueueMemBufferCopy(pi_queue command_queue, pi_mem src_buffer, std::unique_ptr<_pi_event> retImplEv{nullptr}; try { - ScopedContext active(command_queue->get_context()); + ScopedContext active(command_queue->get_native_context()); pi_result result; auto stream = command_queue->get_next_transfer_stream(); @@ -4419,7 +4467,7 @@ pi_result cuda_piEnqueueMemBufferCopyRect( std::unique_ptr<_pi_event> retImplEv{nullptr}; try { - ScopedContext active(command_queue->get_context()); + ScopedContext active(command_queue->get_native_context()); CUstream cuStream = command_queue->get_next_transfer_stream(); retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, event_wait_list); @@ -4472,7 +4520,7 @@ pi_result cuda_piEnqueueMemBufferFill(pi_queue command_queue, pi_mem buffer, std::unique_ptr<_pi_event> retImplEv{nullptr}; try { - ScopedContext active(command_queue->get_context()); + ScopedContext active(command_queue->get_native_context()); auto stream = command_queue->get_next_transfer_stream(); pi_result result; @@ -4648,7 +4696,7 @@ pi_result cuda_piEnqueueMemImageRead( pi_result retErr = PI_SUCCESS; try { - ScopedContext active(command_queue->get_context()); + ScopedContext active(command_queue->get_native_context()); CUstream cuStream = 
command_queue->get_next_transfer_stream(); retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, event_wait_list); @@ -4718,7 +4766,7 @@ cuda_piEnqueueMemImageWrite(pi_queue command_queue, pi_mem image, pi_result retErr = PI_SUCCESS; try { - ScopedContext active(command_queue->get_context()); + ScopedContext active(command_queue->get_native_context()); CUstream cuStream = command_queue->get_next_transfer_stream(); retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, event_wait_list); @@ -4780,7 +4828,7 @@ pi_result cuda_piEnqueueMemImageCopy(pi_queue command_queue, pi_mem src_image, pi_result retErr = PI_SUCCESS; try { - ScopedContext active(command_queue->get_context()); + ScopedContext active(command_queue->get_native_context()); CUstream cuStream = command_queue->get_next_transfer_stream(); retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, event_wait_list); @@ -4884,7 +4932,7 @@ pi_result cuda_piEnqueueMemBufferMap(pi_queue command_queue, pi_mem buffer, command_queue, buffer, blocking_map, offset, size, hostPtr, num_events_in_wait_list, event_wait_list, event); } else { - ScopedContext active(command_queue->get_context()); + ScopedContext active(command_queue->get_native_context()); if (is_pinned) { ret_err = cuda_piEnqueueEventsWait(command_queue, num_events_in_wait_list, @@ -4939,7 +4987,7 @@ pi_result cuda_piEnqueueMemUnmap(pi_queue command_queue, pi_mem memobj, memobj->mem_.buffer_mem_.get_size(), mapped_ptr, num_events_in_wait_list, event_wait_list, event); } else { - ScopedContext active(command_queue->get_context()); + ScopedContext active(command_queue->get_native_context()); if (is_pinned) { ret_err = cuda_piEnqueueEventsWait(command_queue, num_events_in_wait_list, @@ -4973,7 +5021,10 @@ pi_result cuda_piextUSMHostAlloc(void **result_ptr, pi_context context, assert(properties == nullptr || *properties == 0); pi_result result = PI_SUCCESS; try { - ScopedContext active(context); + // cuMemAllocHost requires an active context but the allocation is then + // available on all the USM compatible contexts and devices so we can + // simply use the first one. 
+ ScopedContext active(context->get_devices()[0]->get_context()); result = PI_CHECK_ERROR(cuMemAllocHost(result_ptr, size)); } catch (pi_result error) { result = error; @@ -4997,7 +5048,7 @@ pi_result cuda_piextUSMDeviceAlloc(void **result_ptr, pi_context context, assert(properties == nullptr || *properties == 0); pi_result result = PI_SUCCESS; try { - ScopedContext active(context); + ScopedContext active(context->get(device)); result = PI_CHECK_ERROR(cuMemAlloc((CUdeviceptr *)result_ptr, size)); } catch (pi_result error) { result = error; @@ -5021,7 +5072,7 @@ pi_result cuda_piextUSMSharedAlloc(void **result_ptr, pi_context context, assert(properties == nullptr || *properties == 0); pi_result result = PI_SUCCESS; try { - ScopedContext active(context); + ScopedContext active(context->get(device)); result = PI_CHECK_ERROR(cuMemAllocManaged((CUdeviceptr *)result_ptr, size, CU_MEM_ATTACH_GLOBAL)); } catch (pi_result error) { @@ -5040,7 +5091,7 @@ pi_result cuda_piextUSMFree(pi_context context, void *ptr) { assert(context != nullptr); pi_result result = PI_SUCCESS; try { - ScopedContext active(context); + ScopedContext active(context->get_devices()[0]->get_context()); bool is_managed; unsigned int type; void *attribute_values[2] = {&is_managed, &type}; @@ -5048,6 +5099,7 @@ pi_result cuda_piextUSMFree(pi_context context, void *ptr) { CU_POINTER_ATTRIBUTE_MEMORY_TYPE}; result = PI_CHECK_ERROR(cuPointerGetAttributes( 2, attributes, attribute_values, (CUdeviceptr)ptr)); + assert(type == CU_MEMORYTYPE_DEVICE || type == CU_MEMORYTYPE_HOST); if (is_managed || type == CU_MEMORYTYPE_DEVICE) { // Memory allocated with cuMemAlloc and cuMemAllocManaged must be freed @@ -5074,7 +5126,7 @@ pi_result cuda_piextUSMEnqueueMemset(pi_queue queue, void *ptr, pi_int32 value, std::unique_ptr<_pi_event> event_ptr{nullptr}; try { - ScopedContext active(queue->get_context()); + ScopedContext active(queue->get_native_context()); pi_uint32 stream_token; _pi_stream_guard guard; CUstream cuStream = queue->get_next_compute_stream( @@ -5112,7 +5164,7 @@ pi_result cuda_piextUSMEnqueueMemcpy(pi_queue queue, pi_bool blocking, std::unique_ptr<_pi_event> event_ptr{nullptr}; try { - ScopedContext active(queue->get_context()); + ScopedContext active(queue->get_native_context()); CUstream cuStream = queue->get_next_transfer_stream(); result = enqueueEventsWait(queue, cuStream, num_events_in_waitlist, events_waitlist); @@ -5144,13 +5196,17 @@ pi_result cuda_piextUSMEnqueuePrefetch(pi_queue queue, const void *ptr, pi_uint32 num_events_in_waitlist, const pi_event *events_waitlist, pi_event *event) { - pi_device device = queue->get_context()->get_device(); + pi_device device = queue->get_device(); // Certain cuda devices and Windows do not have support for some Unified // Memory features. cuMemPrefetchAsync requires concurrent memory access // for managed memory. Therfore, ignore prefetch hint if concurrent managed // memory access is not available. 
- if (!getAttribute(device, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) { + int isConcurrentManagedAccessAvailable = 0; + cuDeviceGetAttribute(&isConcurrentManagedAccessAvailable, + CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, + device->get()); + if (!isConcurrentManagedAccessAvailable) { setErrorMessage("Prefetch hint ignored as device does not support " "concurrent managed access", PI_SUCCESS); @@ -5175,7 +5231,7 @@ pi_result cuda_piextUSMEnqueuePrefetch(pi_queue queue, const void *ptr, std::unique_ptr<_pi_event> event_ptr{nullptr}; try { - ScopedContext active(queue->get_context()); + ScopedContext active(queue->get_native_context()); CUstream cuStream = queue->get_next_transfer_stream(); result = enqueueEventsWait(queue, cuStream, num_events_in_waitlist, events_waitlist); @@ -5214,7 +5270,7 @@ pi_result cuda_piextUSMEnqueueMemAdvise(pi_queue queue, const void *ptr, advice == PI_MEM_ADVICE_CUDA_SET_ACCESSED_BY || advice == PI_MEM_ADVICE_CUDA_UNSET_ACCESSED_BY || advice == PI_MEM_ADVICE_RESET) { - pi_device device = queue->get_context()->get_device(); + pi_device device = queue->get_device(); if (!getAttribute(device, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) { setErrorMessage("Mem advise ignored as device does not support " "concurrent managed access", @@ -5241,7 +5297,7 @@ pi_result cuda_piextUSMEnqueueMemAdvise(pi_queue queue, const void *ptr, std::unique_ptr<_pi_event> event_ptr{nullptr}; try { - ScopedContext active(queue->get_context()); + ScopedContext active(queue->get_native_context()); if (event) { event_ptr = std::unique_ptr<_pi_event>(_pi_event::make_native( @@ -5259,7 +5315,7 @@ pi_result cuda_piextUSMEnqueueMemAdvise(pi_queue queue, const void *ptr, result = PI_CHECK_ERROR(cuMemAdvise( (CUdeviceptr)ptr, length, (CUmem_advise)(advice - PI_MEM_ADVICE_CUDA_SET_READ_MOSTLY + 1), - queue->get_context()->get_device()->get())); + queue->device_->get())); break; case PI_MEM_ADVICE_CUDA_SET_PREFERRED_LOCATION_HOST: case PI_MEM_ADVICE_CUDA_UNSET_PREFERRED_LOCATION_HOST: @@ -5275,13 +5331,13 @@ pi_result cuda_piextUSMEnqueueMemAdvise(pi_queue queue, const void *ptr, case PI_MEM_ADVICE_RESET: PI_CHECK_ERROR(cuMemAdvise((CUdeviceptr)ptr, length, CU_MEM_ADVISE_UNSET_READ_MOSTLY, - queue->get_context()->get_device()->get())); + queue->get_device()->get())); PI_CHECK_ERROR(cuMemAdvise((CUdeviceptr)ptr, length, CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION, - queue->get_context()->get_device()->get())); + queue->get_device()->get())); PI_CHECK_ERROR(cuMemAdvise((CUdeviceptr)ptr, length, CU_MEM_ADVISE_UNSET_ACCESSED_BY, - queue->get_context()->get_device()->get())); + queue->get_device()->get())); break; default: sycl::detail::pi::die("Unknown advice"); @@ -5343,7 +5399,7 @@ pi_result cuda_piextUSMEnqueueMemcpy2D(pi_queue queue, pi_bool blocking, pi_result result = PI_SUCCESS; try { - ScopedContext active(queue->get_context()); + ScopedContext active(queue->get_native_context()); CUstream cuStream = queue->get_next_transfer_stream(); result = enqueueEventsWait(queue, cuStream, num_events_in_wait_list, event_wait_list); @@ -5407,7 +5463,7 @@ pi_result cuda_piextUSMGetMemAllocInfo(pi_context context, const void *ptr, pi_result result = PI_SUCCESS; try { - ScopedContext active(context); + ScopedContext active(context->get_devices()[0]->get_context()); switch (param_name) { case PI_MEM_ALLOC_TYPE: { unsigned int value; @@ -5495,6 +5551,19 @@ pi_result cuda_piextUSMGetMemAllocInfo(pi_context context, const void *ptr, return result; } +__SYCL_EXPORT pi_result cuda_piextGetMemoryConnection( + 
pi_device device1, pi_context context1, pi_device device2, + pi_context context2, _pi_memory_connection *res) { + (void)device1; + (void)context1; + (void)device2; + (void)context2; + + // We can always migrate memory between two devices using the CUDA APIs. + *res = PI_MEMORY_CONNECTION_MIGRATABLE; + return PI_SUCCESS; +} + pi_result cuda_piextEnqueueDeviceGlobalVariableWrite( pi_queue queue, pi_program program, const char *name, pi_bool blocking_write, size_t count, size_t offset, const void *src, @@ -5517,9 +5586,11 @@ pi_result cuda_piextEnqueueDeviceGlobalVariableWrite( try { CUdeviceptr device_global = 0; size_t device_global_size = 0; - result = PI_CHECK_ERROR( - cuModuleGetGlobal(&device_global, &device_global_size, program->get(), - device_global_name.c_str())); + result = PI_CHECK_ERROR(cuModuleGetGlobal( + &device_global, &device_global_size, + program + ->get()[program->get_context()->device_index(queue->get_device())], + device_global_name.c_str())); if (offset + count > device_global_size) return PI_ERROR_INVALID_VALUE; @@ -5554,9 +5625,11 @@ pi_result cuda_piextEnqueueDeviceGlobalVariableRead( try { CUdeviceptr device_global = 0; size_t device_global_size = 0; - result = PI_CHECK_ERROR( - cuModuleGetGlobal(&device_global, &device_global_size, program->get(), - device_global_name.c_str())); + result = PI_CHECK_ERROR(cuModuleGetGlobal( + &device_global, &device_global_size, + program + ->get()[program->get_context()->device_index(queue->get_device())], + device_global_name.c_str())); if (offset + count > device_global_size) return PI_ERROR_INVALID_VALUE; @@ -5671,6 +5744,7 @@ pi_result piPluginInit(pi_plugin *PluginInit) { // Platform _PI_CL(piPlatformsGet, cuda_piPlatformsGet) _PI_CL(piPlatformGetInfo, cuda_piPlatformGetInfo) + // Device _PI_CL(piDevicesGet, cuda_piDevicesGet) _PI_CL(piDeviceGetInfo, cuda_piDeviceGetInfo) @@ -5794,6 +5868,8 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piextEnqueueDeviceGlobalVariableRead, cuda_piextEnqueueDeviceGlobalVariableRead) + _PI_CL(piextGetMemoryConnection, cuda_piextGetMemoryConnection) + // Host Pipe _PI_CL(piextEnqueueReadHostPipe, cuda_piextEnqueueReadHostPipe) _PI_CL(piextEnqueueWriteHostPipe, cuda_piextEnqueueWriteHostPipe) diff --git a/sycl/plugins/cuda/pi_cuda.hpp b/sycl/plugins/cuda/pi_cuda.hpp index f6baeab0a4445..21c84e5cba843 100644 --- a/sycl/plugins/cuda/pi_cuda.hpp +++ b/sycl/plugins/cuda/pi_cuda.hpp @@ -179,15 +179,21 @@ struct _pi_context { using native_type = CUcontext; native_type cuContext_; - _pi_device *deviceId_; + std::vector<pi_device> deviceIds_; std::atomic_uint32_t refCount_; - _pi_context(_pi_device *devId) - : cuContext_{devId->get_context()}, deviceId_{devId}, refCount_{1} { - cuda_piDeviceRetain(deviceId_); + _pi_context(std::vector<pi_device> &&devIds) + : deviceIds_{std::move(devIds)}, refCount_{1} { + for (pi_device dev : deviceIds_) { + cuda_piDeviceRetain(dev); + } }; - ~_pi_context() { cuda_piDeviceRelease(deviceId_); } + ~_pi_context() { + for (pi_device dev : deviceIds_) { + cuda_piDeviceRelease(dev); + } + } void invoke_extended_deleters() { std::lock_guard<std::mutex> guard(mutex_); @@ -202,9 +208,28 @@ struct _pi_context { extended_deleters_.emplace_back(deleter_data{function, user_data}); } - pi_device get_device() const noexcept { return deviceId_; } + const std::vector<pi_device> &get_devices() const noexcept { + return deviceIds_; + } - native_type get() const noexcept { return cuContext_; } + size_t device_index(pi_device device) { + for (size_t i = 0; i < deviceIds_.size(); i++) { + if (deviceIds_[i] == device) {
return i; + } + } + assert(false && "No index for device"); + return 0; + } + + CUcontext get(pi_device device) { + for (size_t i = 0; i < deviceIds_.size(); i++) { + if (deviceIds_[i] == device) { + return device->get_context(); + } + } + return nullptr; + } pi_uint32 increment_reference_count() noexcept { return ++refCount_; } @@ -229,6 +254,9 @@ struct _pi_mem { // Context where the memory object is accessibles pi_context context_; + // Device where the memory is located + pi_device device_; + /// Reference counting of the handler std::atomic_uint32_t refCount_; enum class mem_type { buffer, surface } mem_type_; @@ -331,9 +359,11 @@ struct _pi_mem { } mem_; /// Constructs the PI MEM handler for a non-typed allocation ("buffer") - _pi_mem(pi_context ctxt, pi_mem parent, mem_::buffer_mem_::alloc_mode mode, - CUdeviceptr ptr, void *host_ptr, size_t size) - : context_{ctxt}, refCount_{1}, mem_type_{mem_type::buffer} { + _pi_mem(pi_context ctxt, pi_device dev, pi_mem parent, + mem_::buffer_mem_::alloc_mode mode, CUdeviceptr ptr, void *host_ptr, + size_t size) + : context_{ctxt}, device_{dev}, refCount_{1}, + mem_type_{mem_type::buffer} { mem_.buffer_mem_.ptr_ = ptr; mem_.buffer_mem_.parent_ = parent; mem_.buffer_mem_.hostPtr_ = host_ptr; @@ -350,9 +380,10 @@ struct _pi_mem { }; /// Constructs the PI allocation for an Image object (surface in CUDA) - _pi_mem(pi_context ctxt, CUarray array, CUsurfObject surf, + _pi_mem(pi_context ctxt, pi_device dev, CUarray array, CUsurfObject surf, pi_mem_type image_type, void *host_ptr) - : context_{ctxt}, refCount_{1}, mem_type_{mem_type::surface} { + : context_{ctxt}, device_{dev}, refCount_{1}, + mem_type_{mem_type::surface} { // Ignore unused parameter (void)host_ptr; @@ -382,6 +413,10 @@ struct _pi_mem { bool is_image() const noexcept { return mem_type_ == mem_type::surface; } pi_context get_context() const noexcept { return context_; } + pi_device get_device() const noexcept { return device_; } + CUcontext get_native_context() const noexcept { + return context_->get(device_); + } pi_uint32 increment_reference_count() noexcept { return ++refCount_; } @@ -544,6 +579,12 @@ struct _pi_queue { } } + CUcontext get_native_context() { + CUcontext res = context_->get(device_); + assert(res != nullptr); + return res; + } + template <typename T> void sync_streams(T &&f) { auto sync_compute = [&f, &streams = compute_streams_, &delay = delay_compute_](unsigned int start, @@ -671,6 +712,8 @@ struct _pi_event { pi_context get_context() const noexcept { return context_; }; + CUcontext get_native_context() const noexcept { return native_context_; }; + pi_uint32 increment_reference_count() { return ++refCount_; } pi_uint32 decrement_reference_count() { return --refCount_; } @@ -695,7 +738,8 @@ struct _pi_event { static pi_event make_native(pi_command_type type, pi_queue queue, CUstream stream, pi_uint32 stream_token = std::numeric_limits<pi_uint32>::max()) { - return new _pi_event(type, queue->get_context(), queue, stream, + return new _pi_event(type, queue->get_context(), + queue->get_native_context(), queue, stream, stream_token); } @@ -710,7 +754,7 @@ struct _pi_event { private: // This constructor is private to force programmers to use the make_native / // make_user static members in order to create a pi_event for CUDA.
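// The unnamed CUcontext parameter is the event's native CUDA context; the constructor definition in pi_cuda.cpp stores it in native_context_ so that ScopedContext can be activated straight from the event.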
- _pi_event(pi_command_type type, pi_context context, pi_queue queue, + _pi_event(pi_command_type type, pi_context context, CUcontext, pi_queue queue, CUstream stream, pi_uint32 stream_token); // This constructor is private to force programmers to use the @@ -753,13 +797,16 @@ struct _pi_event { pi_context context_; // pi_context associated with the event. If this is a // native event, this will be the same context associated // with the queue_ member. + CUcontext native_context_; // CUcontext associated with the event. If this is + // not a native event, this will be nullptr. }; /// Implementation of PI Program on CUDA Module object /// struct _pi_program { using native_type = CUmodule; - native_type module_; + std::vector<native_type> modules_; + std::vector<CUresult> build_results_; const char *binary_; size_t binarySizeInBytes_; std::atomic_uint32_t refCount_; @@ -788,7 +835,7 @@ struct _pi_program { pi_context get_context() const { return context_; }; - native_type get() const noexcept { return module_; }; + std::vector<native_type> get() const noexcept { return modules_; }; pi_uint32 increment_reference_count() noexcept { return ++refCount_; } @@ -816,15 +863,16 @@ struct _pi_program { struct _pi_kernel { using native_type = CUfunction; - native_type function_; - native_type functionWithOffsetParam_; + std::vector<native_type> functions_; + std::vector<native_type> functionsWithOffsetParam_; std::string name_; pi_context context_; pi_program program_; std::atomic_uint32_t refCount_; static constexpr pi_uint32 REQD_THREADS_PER_BLOCK_DIMENSIONS = 3u; - size_t reqdThreadsPerBlock_[REQD_THREADS_PER_BLOCK_DIMENSIONS]; + std::vector<std::array<size_t, REQD_THREADS_PER_BLOCK_DIMENSIONS>> + reqdThreadsPerBlock_; /// Structure that holds the arguments to the kernel. /// Note earch argument size is known, since it comes @@ -911,18 +959,23 @@ struct _pi_kernel { } } args_; - _pi_kernel(CUfunction func, CUfunction funcWithOffsetParam, const char *name, + _pi_kernel(std::vector<CUfunction> funcs, + std::vector<CUfunction> funcsWithOffsetParam, const char *name, pi_program program, pi_context ctxt) - : function_{func}, functionWithOffsetParam_{funcWithOffsetParam}, - name_{name}, context_{ctxt}, program_{program}, refCount_{1} { + : functions_{funcs}, functionsWithOffsetParam_{funcsWithOffsetParam}, - name_{name}, context_{ctxt}, program_{program}, refCount_{1}, + reqdThreadsPerBlock_{ctxt->get_devices().size()} { cuda_piProgramRetain(program_); cuda_piContextRetain(context_); - /// Note: this code assumes that there is only one device per context - pi_result retError = cuda_piKernelGetGroupInfo( - this, ctxt->get_device(), PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE, - sizeof(reqdThreadsPerBlock_), reqdThreadsPerBlock_, nullptr); - (void)retError; - assert(retError == PI_SUCCESS); + int device_num = 0; + for (pi_device device : ctxt->get_devices()) { + pi_result retError = cuda_piKernelGetGroupInfo( + this, device, PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE, + sizeof(reqdThreadsPerBlock_[device_num]), + &reqdThreadsPerBlock_[device_num++][0], nullptr); + (void)retError; + assert(retError == PI_SUCCESS); + } } ~_pi_kernel() { @@ -938,15 +991,13 @@ struct _pi_kernel { pi_uint32 get_reference_count() const noexcept { return refCount_; } - native_type get() const noexcept { return function_; }; - - native_type get_with_offset_parameter() const noexcept { - return functionWithOffsetParam_; - }; + std::vector<native_type> get() const noexcept { return functions_; } + native_type get(pi_device device) const noexcept; - bool has_with_offset_parameter() const noexcept { - return functionWithOffsetParam_ != nullptr; + std::vector<native_type>
get_with_offset_parameter() const noexcept { + return functionsWithOffsetParam_; } + native_type get_with_offset_parameter(pi_device device) const noexcept; pi_context get_context() const noexcept { return context_; }; diff --git a/sycl/plugins/esimd_emulator/pi_esimd_emulator.cpp b/sycl/plugins/esimd_emulator/pi_esimd_emulator.cpp index 8eeb2432bcf08..4747d721bf600 100644 --- a/sycl/plugins/esimd_emulator/pi_esimd_emulator.cpp +++ b/sycl/plugins/esimd_emulator/pi_esimd_emulator.cpp @@ -481,6 +481,23 @@ pi_result piextPlatformCreateWithNativeHandle(pi_native_handle, pi_platform *) { DIE_NO_IMPLEMENTATION; } +__SYCL_EXPORT pi_result piextGetMemoryConnection(pi_device device1, + pi_context context1, + pi_device device2, + pi_context context2, + _pi_memory_connection *res) { + ARG_UNUSED(device1); + ARG_UNUSED(device2); + + if (context1 == context2) { + *res = PI_MEMORY_CONNECTION_UNIFIED; + } else { + *res = PI_MEMORY_CONNECTION_NONE; + } + + return PI_SUCCESS; +} + pi_result piDevicesGet(pi_platform Platform, pi_device_type DeviceType, pi_uint32 NumEntries, pi_device *Devices, pi_uint32 *NumDevices) { @@ -1020,9 +1037,11 @@ pi_result piextQueueCreateWithNativeHandle(pi_native_handle, pi_context, DIE_NO_IMPLEMENTATION; } -pi_result piMemBufferCreate(pi_context Context, pi_mem_flags Flags, size_t Size, - void *HostPtr, pi_mem *RetMem, +pi_result piMemBufferCreate(pi_context Context, pi_device Device, + pi_mem_flags Flags, size_t Size, void *HostPtr, + pi_mem *RetMem, const pi_mem_properties *properties) { + ARG_UNUSED(Device); ARG_UNUSED(properties); if ((Flags & PI_MEM_FLAGS_ACCESS_RW) == 0) { @@ -1172,10 +1191,12 @@ ConvertPiImageFormatToCmFormat(const pi_image_format *PiFormat) { return cm_support::CM_SURFACE_FORMAT_UNKNOWN; } -pi_result piMemImageCreate(pi_context Context, pi_mem_flags Flags, +pi_result piMemImageCreate(pi_context Context, pi_device Device, + pi_mem_flags Flags, const pi_image_format *ImageFormat, const pi_image_desc *ImageDesc, void *HostPtr, pi_mem *RetImage) { + ARG_UNUSED(Device); if ((Flags & PI_MEM_FLAGS_ACCESS_RW) == 0) { PiTrace("Invalid memory attribute for piMemImageCreate"); return PI_ERROR_INVALID_OPERATION; diff --git a/sycl/plugins/hip/pi_hip.cpp b/sycl/plugins/hip/pi_hip.cpp index f216bc4565edf..69ca058ef4688 100644 --- a/sycl/plugins/hip/pi_hip.cpp +++ b/sycl/plugins/hip/pi_hip.cpp @@ -2173,9 +2173,11 @@ pi_result hip_piextContextCreateWithNativeHandle(pi_native_handle nativeHandle, /// Can trigger a manual copy depending on the mode. 
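/// The device argument follows the updated piMemBufferCreate signature; the HIP
/// plugin does not use it yet and simply casts it to void.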
/// \TODO Implement USE_HOST_PTR using cuHostRegister /// -pi_result hip_piMemBufferCreate(pi_context context, pi_mem_flags flags, - size_t size, void *host_ptr, pi_mem *ret_mem, +pi_result hip_piMemBufferCreate(pi_context context, pi_device device, + pi_mem_flags flags, size_t size, void *host_ptr, + pi_mem *ret_mem, const pi_mem_properties *properties) { + (void)device; // Need input memory object assert(ret_mem != nullptr); assert((properties == nullptr || *properties == 0) && @@ -3073,10 +3075,12 @@ hip_piEnqueueNativeKernel(pi_queue queue, void (*user_func)(void *), void *args, /// \TODO Not implemented -pi_result hip_piMemImageCreate(pi_context context, pi_mem_flags flags, +pi_result hip_piMemImageCreate(pi_context context, pi_device device, + pi_mem_flags flags, const pi_image_format *image_format, const pi_image_desc *image_desc, void *host_ptr, pi_mem *ret_mem) { + (void)device; // Need input memory object assert(ret_mem != nullptr); @@ -5348,6 +5352,17 @@ pi_result hip_piextUSMGetMemAllocInfo(pi_context context, const void *ptr, return result; } +__SYCL_EXPORT pi_result hip_piextGetMemoryConnection( + pi_device device1, pi_context context1, pi_device device2, + pi_context context2, _pi_memory_connection *res) { + (void)device1; + (void)context1; + (void)device2; + (void)context2; + *res = PI_MEMORY_CONNECTION_MIGRATABLE; + return PI_SUCCESS; +} + pi_result hip_piextEnqueueDeviceGlobalVariableWrite( pi_queue queue, pi_program program, const char *name, pi_bool blocking_write, size_t count, size_t offset, const void *src, @@ -5613,6 +5628,8 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piextEnqueueDeviceGlobalVariableRead, hip_piextEnqueueDeviceGlobalVariableRead) + _PI_CL(piextGetMemoryConnection, hip_piextGetMemoryConnection) + // Host Pipe _PI_CL(piextEnqueueReadHostPipe, hip_piextEnqueueReadHostPipe) _PI_CL(piextEnqueueWriteHostPipe, hip_piextEnqueueWriteHostPipe) diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index c25c8624ea00d..a4d102933d0fb 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -2125,6 +2125,21 @@ pi_result piextPlatformCreateWithNativeHandle(pi_native_handle NativeHandle, return PI_ERROR_INVALID_VALUE; } +__SYCL_EXPORT pi_result piextGetMemoryConnection(pi_device device1, + pi_context context1, + pi_device device2, + pi_context context2, + _pi_memory_connection *res) { + (void)device1; + (void)device2; + if (context1 == context2) { + *res = PI_MEMORY_CONNECTION_UNIFIED; + } else { + *res = PI_MEMORY_CONNECTION_NONE; + } + return PI_SUCCESS; +} + pi_result piPluginGetLastError(char **message) { return pi2ur::piPluginGetLastError(message); } @@ -2934,9 +2949,11 @@ static pi_result ZeHostMemAllocHelper(void **ResultPtr, pi_context Context, return PI_SUCCESS; } -pi_result piMemBufferCreate(pi_context Context, pi_mem_flags Flags, size_t Size, - void *HostPtr, pi_mem *RetMem, +pi_result piMemBufferCreate(pi_context Context, pi_device Device, + pi_mem_flags Flags, size_t Size, void *HostPtr, + pi_mem *RetMem, const pi_mem_properties *properties) { + (void)Device; // TODO: implement support for more access modes if (!((Flags & PI_MEM_FLAGS_ACCESS_RW) || @@ -3117,10 +3134,12 @@ pi_result piMemRelease(pi_mem Mem) { return PI_SUCCESS; } -pi_result piMemImageCreate(pi_context Context, pi_mem_flags Flags, +pi_result piMemImageCreate(pi_context Context, pi_device Dev, + pi_mem_flags Flags, const pi_image_format *ImageFormat, const pi_image_desc *ImageDesc, 
void *HostPtr, pi_mem *RetImage) { + (void)Dev; // TODO: implement read-only, write-only if ((Flags & PI_MEM_FLAGS_ACCESS_RW) == 0) { diff --git a/sycl/plugins/opencl/pi_opencl.cpp b/sycl/plugins/opencl/pi_opencl.cpp index 1061209447b6d..db91a8b56d5c3 100644 --- a/sycl/plugins/opencl/pi_opencl.cpp +++ b/sycl/plugins/opencl/pi_opencl.cpp @@ -1140,9 +1140,11 @@ pi_result piContextGetInfo(pi_context context, pi_context_info paramName, } } -pi_result piMemBufferCreate(pi_context context, pi_mem_flags flags, size_t size, - void *host_ptr, pi_mem *ret_mem, +pi_result piMemBufferCreate(pi_context context, pi_device device, + pi_mem_flags flags, size_t size, void *host_ptr, + pi_mem *ret_mem, const pi_mem_properties *properties) { + (void)device; pi_result ret_err = PI_ERROR_INVALID_OPERATION; if (properties) { // TODO: need to check if all properties are supported by OpenCL RT and @@ -1166,10 +1168,12 @@ pi_result piMemBufferCreate(pi_context context, pi_mem_flags flags, size_t size, return ret_err; } -pi_result piMemImageCreate(pi_context context, pi_mem_flags flags, +pi_result piMemImageCreate(pi_context context, pi_device device, + pi_mem_flags flags, const pi_image_format *image_format, const pi_image_desc *image_desc, void *host_ptr, pi_mem *ret_mem) { + (void)device; pi_result ret_err = PI_ERROR_INVALID_OPERATION; *ret_mem = cast( clCreateImage(cast(context), cast(flags), @@ -2111,6 +2115,21 @@ pi_result piextKernelGetNativeHandle(pi_kernel kernel, return piextGetNativeHandle(kernel, nativeHandle); } +__SYCL_EXPORT pi_result piextGetMemoryConnection(pi_device device1, + pi_context context1, + pi_device device2, + pi_context context2, + _pi_memory_connection *res) { + (void)device1; + (void)device2; + if (context1 == context2) { + *res = PI_MEMORY_CONNECTION_UNIFIED; + } else { + *res = PI_MEMORY_CONNECTION_NONE; + } + return PI_SUCCESS; +} + // This API is called by Sycl RT to notify the end of the plugin lifetime. // Windows: dynamically loaded plugins might have been unloaded already // when this is called. 
Sycl RT holds onto the PI plugin so it can be @@ -2304,6 +2323,7 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piextEnqueueReadHostPipe, piextEnqueueReadHostPipe) _PI_CL(piextEnqueueWriteHostPipe, piextEnqueueWriteHostPipe) + _PI_CL(piextGetMemoryConnection, piextGetMemoryConnection) _PI_CL(piextKernelSetArgMemObj, piextKernelSetArgMemObj) _PI_CL(piextKernelSetArgSampler, piextKernelSetArgSampler) _PI_CL(piPluginGetLastError, piPluginGetLastError) diff --git a/sycl/source/detail/buffer_impl.cpp b/sycl/source/detail/buffer_impl.cpp index 550d803f5086e..340b126128ea1 100644 --- a/sycl/source/detail/buffer_impl.cpp +++ b/sycl/source/detail/buffer_impl.cpp @@ -19,8 +19,20 @@ namespace detail { #ifdef XPTI_ENABLE_INSTRUMENTATION uint8_t GBufferStreamID; #endif + +// FIXME: Remove this overload when the class is removed from __SYCL_EXPORT void *buffer_impl::allocateMem(ContextImplPtr Context, bool InitFromUserData, - void *HostPtr, RT::PiEvent &OutEventToWait) { + void *HostPtr, RT::PiEvent &InteropEvent) { + (void)Context; + (void)InitFromUserData; + (void)HostPtr; + (void)InteropEvent; + assert(false && "Deprecated: use the overload with the device parameter"); +} + +void *buffer_impl::allocateMem(ContextImplPtr Context, DeviceImplPtr Device, + bool InitFromUserData, void *HostPtr, + RT::PiEvent &OutEventToWait) { bool HostPtrReadOnly = false; BaseT::determineHostPtr(Context, InitFromUserData, HostPtr, HostPtrReadOnly); @@ -28,7 +40,7 @@ void *buffer_impl::allocateMem(ContextImplPtr Context, bool InitFromUserData, "Internal error. Allocating memory on the host " "while having use_host_ptr property"); return MemoryManager::allocateMemBuffer( - std::move(Context), this, HostPtr, HostPtrReadOnly, + std::move(Context), std::move(Device), this, HostPtr, HostPtrReadOnly, BaseT::getSizeInBytes(), BaseT::MInteropEvent, BaseT::MInteropContext, MProps, OutEventToWait); } diff --git a/sycl/source/detail/buffer_impl.hpp b/sycl/source/detail/buffer_impl.hpp index 1d34ce3e5c95c..01e58505185b3 100644 --- a/sycl/source/detail/buffer_impl.hpp +++ b/sycl/source/detail/buffer_impl.hpp @@ -146,7 +146,10 @@ class __SYCL_EXPORT buffer_impl final : public SYCLMemObjT { std::move(Allocator)) {} void *allocateMem(ContextImplPtr Context, bool InitFromUserData, - void *HostPtr, RT::PiEvent &OutEventToWait) override; + void *HostPtr, RT::PiEvent &InteropEvent) override; + void *allocateMem(ContextImplPtr Context, DeviceImplPtr Device, + bool InitFromUserData, void *HostPtr, + RT::PiEvent &OutEventToWait) override; void constructorNotification(const detail::code_location &CodeLoc, void *UserObj, const void *HostObj, const void *Type, uint32_t Dim, diff --git a/sycl/source/detail/context_impl.hpp b/sycl/source/detail/context_impl.hpp index ba6401a7da54c..0bc35a1b61a12 100644 --- a/sycl/source/detail/context_impl.hpp +++ b/sycl/source/detail/context_impl.hpp @@ -230,6 +230,11 @@ class context_impl { mutable KernelProgramCache MKernelProgramCache; mutable PropertySupport MSupportBufferLocationByDevices; + friend pi_memory_connection + getMemoryConnection(const std::shared_ptr &Dev1, + const std::shared_ptr &Ctx1, + const std::shared_ptr &Dev2, + const std::shared_ptr &Ctx2); std::set MAssociatedDeviceGlobals; std::mutex MAssociatedDeviceGlobalsMutex; diff --git a/sycl/source/detail/device_image_impl.hpp b/sycl/source/detail/device_image_impl.hpp index ea23b6828986c..dbf24e52de46a 100644 --- a/sycl/source/detail/device_image_impl.hpp +++ b/sycl/source/detail/device_image_impl.hpp @@ -61,7 +61,7 @@ class 
device_image_impl { RT::PiProgram Program) : MBinImage(BinImage), MContext(std::move(Context)), MDevices(std::move(Devices)), MState(State), MProgram(Program), - MKernelIDs(std::move(KernelIDs)) { + MKernelIDs(std::move(KernelIDs)), MSpecConstsBuffers(MDevices.size()) { updateSpecConstSymMap(); } @@ -73,7 +73,7 @@ class device_image_impl { : MBinImage(BinImage), MContext(std::move(Context)), MDevices(std::move(Devices)), MState(State), MProgram(Program), MKernelIDs(std::move(KernelIDs)), MSpecConstsBlob(SpecConstsBlob), - MSpecConstSymMap(SpecConstMap) {} + MSpecConstsBuffers(MDevices.size()), MSpecConstSymMap(SpecConstMap) {} bool has_kernel(const kernel_id &KernelIDCand) const noexcept { return std::binary_search(MKernelIDs->begin(), MKernelIDs->end(), @@ -206,22 +206,30 @@ class device_image_impl { return MSpecConstsBlob; } - RT::PiMem &get_spec_const_buffer_ref() noexcept { + RT::PiMem &get_spec_const_buffer_ref(device dev) noexcept { std::lock_guard Lock{MSpecConstAccessMtx}; - if (nullptr == MSpecConstsBuffer && !MSpecConstsBlob.empty()) { + RT::PiMem *mem = nullptr; + for (size_t i = 0; i < MDevices.size(); i++) { + if (MDevices[i] == dev) { + mem = &MSpecConstsBuffers[i]; + break; + } + } + assert(mem); + if (nullptr == *mem && !MSpecConstsBlob.empty()) { const detail::plugin &Plugin = getSyclObjImpl(MContext)->getPlugin(); // Uses PI_MEM_FLAGS_HOST_PTR_COPY instead of PI_MEM_FLAGS_HOST_PTR_USE // since post-enqueue cleanup might trigger destruction of // device_image_impl and, as a result, destruction of MSpecConstsBlob // while MSpecConstsBuffer is still in use. // TODO consider changing the lifetime of device_image_impl instead - memBufferCreateHelper(Plugin, - detail::getSyclObjImpl(MContext)->getHandleRef(), - PI_MEM_FLAGS_ACCESS_RW | PI_MEM_FLAGS_HOST_PTR_COPY, - MSpecConstsBlob.size(), MSpecConstsBlob.data(), - &MSpecConstsBuffer, nullptr); + memBufferCreateHelper( + Plugin, detail::getSyclObjImpl(MContext)->getHandleRef(), + detail::getSyclObjImpl(dev)->getHandleRef(), + PI_MEM_FLAGS_ACCESS_RW | PI_MEM_FLAGS_HOST_PTR_COPY, + MSpecConstsBlob.size(), MSpecConstsBlob.data(), mem, nullptr); } - return MSpecConstsBuffer; + return *mem; } const SpecConstMapT &get_spec_const_data_ref() const noexcept { @@ -252,10 +260,12 @@ class device_image_impl { const detail::plugin &Plugin = getSyclObjImpl(MContext)->getPlugin(); Plugin.call(MProgram); } - if (MSpecConstsBuffer) { - std::lock_guard Lock{MSpecConstAccessMtx}; - const detail::plugin &Plugin = getSyclObjImpl(MContext)->getPlugin(); - memReleaseHelper(Plugin, MSpecConstsBuffer); + for (auto mem : MSpecConstsBuffers) { + if (mem) { + std::lock_guard Lock{MSpecConstAccessMtx}; + const detail::plugin &Plugin = getSyclObjImpl(MContext)->getPlugin(); + memReleaseHelper(Plugin, mem); + } } } @@ -339,10 +349,10 @@ class device_image_impl { // Binary blob which can have values of all specialization constants in the // image std::vector MSpecConstsBlob; - // Buffer containing binary blob which can have values of all specialization - // constants in the image, it is using for storing non-native specialization - // constants - RT::PiMem MSpecConstsBuffer = nullptr; + // A vector of buffers containing binary blobs which can have values of all + // specialization constants in the image; they are used for storing non-native + // specialization constants + std::vector MSpecConstsBuffers; // Contains map of spec const names to their descriptions + offsets in // the MSpecConstsBlob std::map> MSpecConstSymMap; diff --git 
a/sycl/source/detail/device_impl.hpp b/sycl/source/detail/device_impl.hpp index 41cb1c9fdb62c..9dd3c28ef81e1 100644 --- a/sycl/source/detail/device_impl.hpp +++ b/sycl/source/detail/device_impl.hpp @@ -30,6 +30,7 @@ namespace detail { // Forward declaration class platform_impl; +class context_impl; using PlatformImplPtr = std::shared_ptr; // TODO: Make code thread-safe @@ -253,6 +254,14 @@ class device_impl { bool MIsAssertFailSupported = false; mutable std::string MDeviceName; mutable std::once_flag MDeviceNameFlag; + + friend bool sameDev(const std::shared_ptr &LHS, + const std::shared_ptr &RHS); + friend pi_memory_connection + getMemoryConnection(const std::shared_ptr &Dev1, + const std::shared_ptr &Ctx1, + const std::shared_ptr &Dev2, + const std::shared_ptr &Ctx2); std::pair MDeviceHostBaseTime; }; // class device_impl diff --git a/sycl/source/detail/image_impl.cpp b/sycl/source/detail/image_impl.cpp index 7ff987686496d..f639945c44a73 100644 --- a/sycl/source/detail/image_impl.cpp +++ b/sycl/source/detail/image_impl.cpp @@ -297,8 +297,19 @@ image_impl::image_impl(cl_mem MemObject, const context &SyclContext, } } +// FIXME: Remove this overload when the class is removed from __SYCL_EXPORT void *image_impl::allocateMem(ContextImplPtr Context, bool InitFromUserData, - void *HostPtr, RT::PiEvent &OutEventToWait) { + void *HostPtr, RT::PiEvent &InteropEvent) { + (void)Context; + (void)InitFromUserData; + (void)HostPtr; + (void)InteropEvent; + assert(false && "Deprecated: use the overload with the device parameter"); +} + +void *image_impl::allocateMem(ContextImplPtr Context, DeviceImplPtr Device, + bool InitFromUserData, void *HostPtr, + RT::PiEvent &OutEventToWait) { bool HostPtrReadOnly = false; BaseT::determineHostPtr(Context, InitFromUserData, HostPtr, HostPtrReadOnly); @@ -311,7 +322,7 @@ void *image_impl::allocateMem(ContextImplPtr Context, bool InitFromUserData, "The check an image format failed."); return MemoryManager::allocateMemImage( - std::move(Context), this, HostPtr, HostPtrReadOnly, + std::move(Context), std::move(Device), this, HostPtr, HostPtrReadOnly, BaseT::getSizeInBytes(), Desc, Format, BaseT::MInteropEvent, BaseT::MInteropContext, MProps, OutEventToWait); } diff --git a/sycl/source/detail/image_impl.hpp b/sycl/source/detail/image_impl.hpp index 9474fae5d895e..dda2803a00ecc 100644 --- a/sycl/source/detail/image_impl.hpp +++ b/sycl/source/detail/image_impl.hpp @@ -179,7 +179,10 @@ class __SYCL_EXPORT image_impl final : public SYCLMemObjT { size_t size() const noexcept { return MRange.size(); } void *allocateMem(ContextImplPtr Context, bool InitFromUserData, - void *HostPtr, RT::PiEvent &OutEventToWait) override; + void *HostPtr, RT::PiEvent &InteropEvent) override; + void *allocateMem(ContextImplPtr Context, DeviceImplPtr Device, + bool InitFromUserData, void *HostPtr, + RT::PiEvent &OutEventToWait) override; MemObjType getType() const override { return MemObjType::Image; } diff --git a/sycl/source/detail/mem_alloc_helper.hpp b/sycl/source/detail/mem_alloc_helper.hpp index c2c6dd0a3d5a9..988eed6e2e223 100644 --- a/sycl/source/detail/mem_alloc_helper.hpp +++ b/sycl/source/detail/mem_alloc_helper.hpp @@ -13,7 +13,7 @@ namespace sycl { __SYCL_INLINE_VER_NAMESPACE(_V1) { namespace detail { -void memBufferCreateHelper(const plugin &Plugin, pi_context Ctx, +void memBufferCreateHelper(const plugin &Plugin, pi_context Ctx, pi_device Dev, pi_mem_flags Flags, size_t Size, void *HostPtr, pi_mem *RetMem, const pi_mem_properties *Props = nullptr); diff --git 
a/sycl/source/detail/memory_manager.cpp b/sycl/source/detail/memory_manager.cpp index 9985bee6ba426..124142e5c0f0d 100644 --- a/sycl/source/detail/memory_manager.cpp +++ b/sycl/source/detail/memory_manager.cpp @@ -126,7 +126,7 @@ static void waitForEvents(const std::vector &Events) { } } -void memBufferCreateHelper(const plugin &Plugin, pi_context Ctx, +void memBufferCreateHelper(const plugin &Plugin, pi_context Ctx, pi_device Dev, pi_mem_flags Flags, size_t Size, void *HostPtr, pi_mem *RetMem, const pi_mem_properties *Props) { #ifdef XPTI_ENABLE_INSTRUMENTATION @@ -149,8 +149,8 @@ void memBufferCreateHelper(const plugin &Plugin, pi_context Ctx, CorrID); }}; #endif - Plugin.call(Ctx, Flags, Size, HostPtr, RetMem, - Props); + Plugin.call(Ctx, Dev, Flags, Size, HostPtr, + RetMem, Props); } } @@ -262,7 +262,8 @@ void MemoryManager::releaseMemObj(ContextImplPtr TargetContext, memReleaseHelper(Plugin, pi::cast(MemAllocation)); } -void *MemoryManager::allocate(ContextImplPtr TargetContext, SYCLMemObjI *MemObj, +void *MemoryManager::allocate(ContextImplPtr TargetContext, + DeviceImplPtr TargetDevice, SYCLMemObjI *MemObj, bool InitFromUserData, void *HostPtr, std::vector DepEvents, RT::PiEvent &OutEvent) { @@ -271,8 +272,8 @@ void *MemoryManager::allocate(ContextImplPtr TargetContext, SYCLMemObjI *MemObj, waitForEvents(DepEvents); OutEvent = nullptr; - return MemObj->allocateMem(TargetContext, InitFromUserData, HostPtr, - OutEvent); + return MemObj->allocateMem(TargetContext, TargetDevice, InitFromUserData, + HostPtr, OutEvent); } void *MemoryManager::allocateHostMemory(SYCLMemObjI *MemObj, void *UserPtr, @@ -318,6 +319,7 @@ static RT::PiMemFlags getMemObjCreationFlags(void *UserPtr, } void *MemoryManager::allocateImageObject(ContextImplPtr TargetContext, + DeviceImplPtr TargetDevice, void *UserPtr, bool HostPtrReadOnly, const RT::PiMemImageDesc &Desc, const RT::PiMemImageFormat &Format, @@ -327,14 +329,15 @@ void *MemoryManager::allocateImageObject(ContextImplPtr TargetContext, RT::PiMem NewMem; const detail::plugin &Plugin = TargetContext->getPlugin(); - Plugin.call(TargetContext->getHandleRef(), - CreationFlags, &Format, &Desc, - UserPtr, &NewMem); + Plugin.call( + TargetContext->getHandleRef(), TargetDevice->getHandleRef(), + CreationFlags, &Format, &Desc, UserPtr, &NewMem); return NewMem; } void * -MemoryManager::allocateBufferObject(ContextImplPtr TargetContext, void *UserPtr, +MemoryManager::allocateBufferObject(ContextImplPtr TargetContext, + DeviceImplPtr TargetDevice, void *UserPtr, bool HostPtrReadOnly, const size_t Size, const sycl::property_list &PropsList) { RT::PiMemFlags CreationFlags = @@ -354,21 +357,21 @@ MemoryManager::allocateBufferObject(ContextImplPtr TargetContext, void *UserPtr, pi_mem_properties props[3] = {PI_MEM_PROPERTIES_ALLOC_BUFFER_LOCATION, location, 0}; memBufferCreateHelper(Plugin, TargetContext->getHandleRef(), - CreationFlags, Size, UserPtr, &NewMem, props); + TargetDevice->getHandleRef(), CreationFlags, Size, + UserPtr, &NewMem, props); return NewMem; } - memBufferCreateHelper(Plugin, TargetContext->getHandleRef(), CreationFlags, - Size, UserPtr, &NewMem, nullptr); + memBufferCreateHelper(Plugin, TargetContext->getHandleRef(), + TargetDevice->getHandleRef(), CreationFlags, Size, + UserPtr, &NewMem, nullptr); return NewMem; } -void *MemoryManager::allocateMemBuffer(ContextImplPtr TargetContext, - SYCLMemObjI *MemObj, void *UserPtr, - bool HostPtrReadOnly, size_t Size, - const EventImplPtr &InteropEvent, - const ContextImplPtr &InteropContext, - const 
sycl::property_list &PropsList, - RT::PiEvent &OutEventToWait) { +void *MemoryManager::allocateMemBuffer( + ContextImplPtr TargetContext, DeviceImplPtr TargetDevice, + SYCLMemObjI *MemObj, void *UserPtr, bool HostPtrReadOnly, size_t Size, + const EventImplPtr &InteropEvent, const ContextImplPtr &InteropContext, + const sycl::property_list &PropsList, RT::PiEvent &OutEventToWait) { void *MemPtr; if (TargetContext->is_host()) MemPtr = @@ -378,26 +381,26 @@ void *MemoryManager::allocateMemBuffer(ContextImplPtr TargetContext, allocateInteropMemObject(TargetContext, UserPtr, InteropEvent, InteropContext, PropsList, OutEventToWait); else - MemPtr = allocateBufferObject(TargetContext, UserPtr, HostPtrReadOnly, Size, - PropsList); + MemPtr = allocateBufferObject(TargetContext, TargetDevice, UserPtr, + HostPtrReadOnly, Size, PropsList); XPTIRegistry::bufferAssociateNotification(MemObj, MemPtr); return MemPtr; } void *MemoryManager::allocateMemImage( - ContextImplPtr TargetContext, SYCLMemObjI *MemObj, void *UserPtr, - bool HostPtrReadOnly, size_t Size, const RT::PiMemImageDesc &Desc, - const RT::PiMemImageFormat &Format, const EventImplPtr &InteropEvent, - const ContextImplPtr &InteropContext, const sycl::property_list &PropsList, - RT::PiEvent &OutEventToWait) { + ContextImplPtr TargetContext, DeviceImplPtr TargetDevice, + SYCLMemObjI *MemObj, void *UserPtr, bool HostPtrReadOnly, size_t Size, + const RT::PiMemImageDesc &Desc, const RT::PiMemImageFormat &Format, + const EventImplPtr &InteropEvent, const ContextImplPtr &InteropContext, + const sycl::property_list &PropsList, RT::PiEvent &OutEventToWait) { if (TargetContext->is_host()) return allocateHostMemory(MemObj, UserPtr, HostPtrReadOnly, Size, PropsList); if (UserPtr && InteropContext) return allocateInteropMemObject(TargetContext, UserPtr, InteropEvent, InteropContext, PropsList, OutEventToWait); - return allocateImageObject(TargetContext, UserPtr, HostPtrReadOnly, Desc, - Format, PropsList); + return allocateImageObject(TargetContext, TargetDevice, UserPtr, + HostPtrReadOnly, Desc, Format, PropsList); } void *MemoryManager::allocateMemSubBuffer(ContextImplPtr TargetContext, @@ -435,6 +438,43 @@ void *MemoryManager::allocateMemSubBuffer(ContextImplPtr TargetContext, return NewMem; } +// Deprecated allocation functions +void *MemoryManager::allocate(ContextImplPtr, SYCLMemObjI *, bool, void *, + std::vector, RT::PiEvent &) { + assert(false && "Deprecated: use the overload with the device parameter"); +} + +void *MemoryManager::allocateMemBuffer(ContextImplPtr, SYCLMemObjI *, void *, + bool, size_t, const EventImplPtr &, + const ContextImplPtr &, + const sycl::property_list &, + RT::PiEvent &) { + assert(false && "Deprecated: use the overload with the device parameter"); +} + +void *MemoryManager::allocateMemImage(ContextImplPtr, SYCLMemObjI *, void *, + bool, size_t, const RT::PiMemImageDesc &, + const RT::PiMemImageFormat &, + const EventImplPtr &, + const ContextImplPtr &, + const sycl::property_list &, + RT::PiEvent &) { + assert(false && "Deprecated: use the overload with the device parameter"); +} + +void *MemoryManager::allocateImageObject(ContextImplPtr, void *, bool, + const RT::PiMemImageDesc &, + const RT::PiMemImageFormat &, + const sycl::property_list &) { + assert(false && "Deprecated: use the overload with the device parameter"); +} + +void *MemoryManager::allocateBufferObject(ContextImplPtr, void *, bool, + const size_t, + const sycl::property_list &) { + assert(false && "Deprecated: use the overload with the device parameter"); 
+} + struct TermPositions { int XTerm; int YTerm; diff --git a/sycl/source/detail/memory_manager.hpp b/sycl/source/detail/memory_manager.hpp index c9d1738e942ad..124bb698a994b 100644 --- a/sycl/source/detail/memory_manager.hpp +++ b/sycl/source/detail/memory_manager.hpp @@ -25,6 +25,7 @@ namespace detail { class queue_impl; class event_impl; class context_impl; +class device_impl; using QueueImplPtr = std::shared_ptr; using EventImplPtr = std::shared_ptr; @@ -43,7 +44,8 @@ class __SYCL_EXPORT MemoryManager { // The following method allocates memory allocation of memory object. // Depending on the context it allocates memory on host or on device. - static void *allocate(ContextImplPtr TargetContext, SYCLMemObjI *MemObj, + static void *allocate(ContextImplPtr TargetContext, + DeviceImplPtr TargetDevice, SYCLMemObjI *MemObj, bool InitFromUserData, void *HostPtr, std::vector DepEvents, RT::PiEvent &OutEvent); @@ -59,22 +61,20 @@ class __SYCL_EXPORT MemoryManager { // Allocates buffer in specified context taking into account situations such // as host ptr or cl_mem provided by user. TargetContext should be device // one(not host). - static void *allocateMemBuffer(ContextImplPtr TargetContext, - SYCLMemObjI *MemObj, void *UserPtr, - bool HostPtrReadOnly, size_t Size, - const EventImplPtr &InteropEvent, - const ContextImplPtr &InteropContext, - const sycl::property_list &PropsList, - RT::PiEvent &OutEventToWait); + static void *allocateMemBuffer( + ContextImplPtr TargetContext, DeviceImplPtr TargetDevice, + SYCLMemObjI *MemObj, void *UserPtr, bool HostPtrReadOnly, size_t Size, + const EventImplPtr &InteropEvent, const ContextImplPtr &InteropContext, + const sycl::property_list &PropsList, RT::PiEvent &OutEventToWait); // Allocates images in specified context taking into account situations such // as host ptr or cl_mem provided by user. TargetContext should be device // one(not host). static void *allocateMemImage( - ContextImplPtr TargetContext, SYCLMemObjI *MemObj, void *UserPtr, - bool HostPtrReadOnly, size_t Size, const RT::PiMemImageDesc &Desc, - const RT::PiMemImageFormat &Format, const EventImplPtr &InteropEvent, - const ContextImplPtr &InteropContext, + ContextImplPtr TargetContext, DeviceImplPtr TargetDevice, + SYCLMemObjI *MemObj, void *UserPtr, bool HostPtrReadOnly, size_t Size, + const RT::PiMemImageDesc &Desc, const RT::PiMemImageFormat &Format, + const EventImplPtr &InteropEvent, const ContextImplPtr &InteropContext, const sycl::property_list &PropsList, RT::PiEvent &OutEventToWait); // Releases memory object(buffer or image). 
TargetContext should be device @@ -93,12 +93,42 @@ class __SYCL_EXPORT MemoryManager { const sycl::property_list &PropsList, RT::PiEvent &OutEventToWait); - static void *allocateImageObject(ContextImplPtr TargetContext, void *UserPtr, + static void *allocateImageObject(ContextImplPtr TargetContext, + DeviceImplPtr TargetDevice, void *UserPtr, bool HostPtrReadOnly, const RT::PiMemImageDesc &Desc, const RT::PiMemImageFormat &Format, const sycl::property_list &PropsList); + static void *allocateBufferObject(ContextImplPtr TargetContext, + DeviceImplPtr TargetDevice, void *UserPtr, + bool HostPtrReadOnly, const size_t Size, + const sycl::property_list &PropsList); + + // FIXME: Deprecated allocation methods, maintaining for ABI compatibility, + // to remove when the class is removed from __SYCL_EXPORT + static void *allocate(ContextImplPtr TargetContext, SYCLMemObjI *MemObj, + bool InitFromUserData, void *HostPtr, + std::vector DepEvents, + RT::PiEvent &OutEvent); + static void *allocateMemBuffer(ContextImplPtr TargetContext, + SYCLMemObjI *MemObj, void *UserPtr, + bool HostPtrReadOnly, size_t Size, + const EventImplPtr &InteropEvent, + const ContextImplPtr &InteropContext, + const sycl::property_list &PropsList, + RT::PiEvent &OutEventToWait); + static void *allocateMemImage( + ContextImplPtr TargetContext, SYCLMemObjI *MemObj, void *UserPtr, + bool HostPtrReadOnly, size_t Size, const RT::PiMemImageDesc &Desc, + const RT::PiMemImageFormat &Format, const EventImplPtr &InteropEvent, + const ContextImplPtr &InteropContext, + const sycl::property_list &PropsList, RT::PiEvent &OutEventToWait); + static void *allocateImageObject(ContextImplPtr TargetContext, void *UserPtr, + bool HostPtrReadOnly, + const RT::PiMemImageDesc &Desc, + const RT::PiMemImageFormat &Format, + const sycl::property_list &PropsList); static void *allocateBufferObject(ContextImplPtr TargetContext, void *UserPtr, bool HostPtrReadOnly, const size_t Size, const sycl::property_list &PropsList); diff --git a/sycl/source/detail/program_impl.cpp b/sycl/source/detail/program_impl.cpp index 59fb15178114e..f73e1f8a420ea 100644 --- a/sycl/source/detail/program_impl.cpp +++ b/sycl/source/detail/program_impl.cpp @@ -35,14 +35,7 @@ program_impl::program_impl(ContextImplPtr Context, program_impl::program_impl(ContextImplPtr Context, std::vector DeviceList, const property_list &PropList) - : MContext(Context), MDevices(DeviceList), MPropList(PropList) { - if (Context->getDevices().size() > 1) { - throw feature_not_supported( - "multiple devices within a context are not supported with " - "sycl::program and sycl::kernel", - PI_ERROR_INVALID_OPERATION); - } -} + : MContext(Context), MDevices(DeviceList), MPropList(PropList) {} program_impl::program_impl( std::vector> ProgramList, diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 4ab7c6bdce954..bc17f3cf5f53d 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -1035,8 +1035,8 @@ pi_int32 AllocaCommand::enqueueImp() { // TODO: Check if it is correct to use std::move on stack variable and // delete it RawEvents below. 
MMemAllocation = MemoryManager::allocate( - MQueue->getContextImplPtr(), getSYCLMemObj(), MInitFromUserData, HostPtr, - std::move(EventImpls), Event); + MQueue->getContextImplPtr(), MQueue->getDeviceImplPtr(), getSYCLMemObj(), + MInitFromUserData, HostPtr, std::move(EventImpls), Event); return PI_SUCCESS; } @@ -2146,7 +2146,8 @@ static pi_result SetKernelParamsAndLaunch( PI_ERROR_INVALID_OPERATION); } assert(DeviceImageImpl != nullptr); - RT::PiMem SpecConstsBuffer = DeviceImageImpl->get_spec_const_buffer_ref(); + RT::PiMem SpecConstsBuffer = + DeviceImageImpl->get_spec_const_buffer_ref(Queue->get_device()); // Avoid taking an address of nullptr RT::PiMem *SpecConstsBufferArg = SpecConstsBuffer ? &SpecConstsBuffer : nullptr; diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp index 5dee68c6e69d2..e17115e8b7865 100644 --- a/sycl/source/detail/scheduler/graph_builder.cpp +++ b/sycl/source/detail/scheduler/graph_builder.cpp @@ -52,6 +52,39 @@ static bool sameCtx(const ContextImplPtr &LHS, const ContextImplPtr &RHS) { // allocation on the host return LHS == RHS || (LHS->is_host() && RHS->is_host()); } +bool sameDev(const DeviceImplPtr &LHS, const DeviceImplPtr &RHS) { + RT::PiDevice LHSroot = + LHS->MRootDevice == nullptr ? LHS->MDevice : LHS->MRootDevice; + RT::PiDevice RHSroot = + RHS->MRootDevice == nullptr ? RHS->MDevice : RHS->MRootDevice; + return LHSroot == RHSroot; +} + +pi_memory_connection getMemoryConnection(const DeviceImplPtr &Dev1, + const ContextImplPtr &Ctx1, + const DeviceImplPtr &Dev2, + const ContextImplPtr &Ctx2) { + if ((sameCtx(Ctx1, Ctx2) && sameDev(Dev1, Dev2)) || + (Dev1->MIsHostDevice && Dev2->MIsHostDevice)) { + return PI_MEMORY_CONNECTION_UNIFIED; + } + + if (Dev1->MIsHostDevice ^ Dev2->MIsHostDevice) { + return PI_MEMORY_CONNECTION_NONE; + } + + auto plugin1 = Dev1->getPlugin(); + auto plugin2 = Dev2->getPlugin(); + + if (plugin1.getBackend() != plugin2.getBackend()) { + return PI_MEMORY_CONNECTION_NONE; + } + + _pi_memory_connection conn; + plugin1.call( + Dev1->MDevice, Ctx1->MContext, Dev2->MDevice, Ctx2->MContext, &conn); + return conn; +} /// Checks if current requirement is requirement for sub buffer. 
static bool IsSuitableSubReq(const Requirement *Req) { @@ -239,11 +272,12 @@ MemObjRecord *Scheduler::GraphBuilder::getOrInsertMemObjRecord( Dev, InteropCtxPtr, /*AsyncHandler=*/{}, /*PropertyList=*/{}}}; MemObject->MRecord.reset( - new MemObjRecord{InteropCtxPtr, LeafLimit, AllocateDependency}); + new MemObjRecord{InteropCtxPtr, Dev, LeafLimit, AllocateDependency}); getOrCreateAllocaForReq(MemObject->MRecord.get(), Req, InteropQueuePtr, ToEnqueue); } else MemObject->MRecord.reset(new MemObjRecord{Queue->getContextImplPtr(), + Queue->getDeviceImplPtr(), LeafLimit, AllocateDependency}); MMemObjs.push_back(MemObject); @@ -282,8 +316,8 @@ void Scheduler::GraphBuilder::addNodeToLeaves( UpdateHostRequirementCommand *Scheduler::GraphBuilder::insertUpdateHostReqCmd( MemObjRecord *Record, Requirement *Req, const QueueImplPtr &Queue, std::vector &ToEnqueue) { - AllocaCommandBase *AllocaCmd = - findAllocaForReq(Record, Req, Queue->getContextImplPtr()); + AllocaCommandBase *AllocaCmd = findAllocaForReq( + Record, Req, Queue->getContextImplPtr(), Queue->getDeviceImplPtr()); assert(AllocaCmd && "There must be alloca for requirement!"); UpdateHostRequirementCommand *UpdateCommand = new UpdateHostRequirementCommand(Queue, *Req, AllocaCmd, &Req->MData); @@ -291,8 +325,8 @@ UpdateHostRequirementCommand *Scheduler::GraphBuilder::insertUpdateHostReqCmd( // dependencies become invalid if requirement is stored by pointer. const Requirement *StoredReq = UpdateCommand->getRequirement(); - std::set Deps = - findDepsForReq(Record, Req, Queue->getContextImplPtr()); + std::set Deps = findDepsForReq( + Record, Req, Queue->getContextImplPtr(), Queue->getDeviceImplPtr()); std::vector ToCleanUp; for (Command *Dep : Deps) { Command *ConnCmd = @@ -345,8 +379,8 @@ Command *Scheduler::GraphBuilder::insertMemoryMove( if (!AllocaCmdDst) throw runtime_error("Out of host memory", PI_ERROR_OUT_OF_HOST_MEMORY); - std::set Deps = - findDepsForReq(Record, Req, Queue->getContextImplPtr()); + std::set Deps = findDepsForReq( + Record, Req, Queue->getContextImplPtr(), Queue->getDeviceImplPtr()); Deps.insert(AllocaCmdDst); // Get parent allocation of sub buffer to perform full copy of whole buffer if (IsSuitableSubReq(Req)) { @@ -356,14 +390,16 @@ Command *Scheduler::GraphBuilder::insertMemoryMove( } AllocaCommandBase *AllocaCmdSrc = - findAllocaForReq(Record, Req, Record->MCurContext); + findAllocaForReq(Record, Req, Record->MCurContext, Record->MCurDevice); if (!AllocaCmdSrc && IsSuitableSubReq(Req)) { // Since no alloca command for the sub buffer requirement was found in the // current context, need to find a parent alloca command for it (it must be // there) auto IsSuitableAlloca = [Record](AllocaCommandBase *AllocaCmd) { - bool Res = sameCtx(AllocaCmd->getQueue()->getContextImplPtr(), - Record->MCurContext) && + bool Res = getMemoryConnection(AllocaCmd->getQueue()->getDeviceImplPtr(), + AllocaCmd->getQueue()->getContextImplPtr(), + Record->MCurDevice, Record->MCurContext) == + PI_MEMORY_CONNECTION_UNIFIED && // Looking for a parent buffer alloca command AllocaCmd->getType() == Command::CommandType::ALLOCA; return Res; @@ -399,6 +435,7 @@ Command *Scheduler::GraphBuilder::insertMemoryMove( if ((Req->MAccessMode == access::mode::discard_write) || (Req->MAccessMode == access::mode::discard_read_write)) { Record->MCurContext = Queue->getContextImplPtr(); + Record->MCurDevice = Queue->getDeviceImplPtr(); return nullptr; } else { // Full copy of buffer is needed to avoid loss of data that may be caused @@ -421,6 +458,7 @@ Command 
*Scheduler::GraphBuilder::insertMemoryMove( for (Command *Cmd : ToCleanUp) cleanupCommand(Cmd); Record->MCurContext = Queue->getContextImplPtr(); + Record->MCurDevice = Queue->getDeviceImplPtr(); return NewCmd; } @@ -434,7 +472,8 @@ Command *Scheduler::GraphBuilder::remapMemoryObject( AllocaCommandBase *LinkedAllocaCmd = HostAllocaCmd->MLinkedAllocaCmd; assert(LinkedAllocaCmd && "Linked alloca command expected"); - std::set Deps = findDepsForReq(Record, Req, Record->MCurContext); + std::set Deps = + findDepsForReq(Record, Req, Record->MCurContext, Record->MCurDevice); UnMapMemObject *UnMapCmd = new UnMapMemObject( LinkedAllocaCmd, *LinkedAllocaCmd->getRequirement(), @@ -485,9 +524,10 @@ Scheduler::GraphBuilder::addCopyBack(Requirement *Req, return nullptr; std::set Deps = - findDepsForReq(Record, Req, HostQueue->getContextImplPtr()); + findDepsForReq(Record, Req, HostQueue->getContextImplPtr(), + HostQueue->getDeviceImplPtr()); AllocaCommandBase *SrcAllocaCmd = - findAllocaForReq(Record, Req, Record->MCurContext); + findAllocaForReq(Record, Req, Record->MCurContext, Record->MCurDevice); auto MemCpyCmdUniquePtr = std::make_unique( *SrcAllocaCmd->getRequirement(), SrcAllocaCmd, *Req, &Req->MData, @@ -572,11 +612,10 @@ Command *Scheduler::GraphBuilder::addCGUpdateHost( /// /// 1. New and examined commands only read -> can bypass /// 2. New and examined commands has non-overlapping requirements -> can bypass -/// 3. New and examined commands have different contexts -> cannot bypass -std::set -Scheduler::GraphBuilder::findDepsForReq(MemObjRecord *Record, - const Requirement *Req, - const ContextImplPtr &Context) { +/// 3. New and examined commands have different devices -> cannot bypass +std::set Scheduler::GraphBuilder::findDepsForReq( + MemObjRecord *Record, const Requirement *Req, const ContextImplPtr &Context, + const DeviceImplPtr &Device) { std::set RetDeps; std::vector Visited; const bool ReadOnlyReq = Req->MAccessMode == access::mode::read; @@ -607,10 +646,13 @@ Scheduler::GraphBuilder::findDepsForReq(MemObjRecord *Record, // If not overlap CanBypassDep |= !doOverlap(Dep.MDepRequirement, Req); - // Going through copying memory between contexts is not supported. + // Going through copying memory between devices is not supported. if (Dep.MDepCommand) - CanBypassDep &= - sameCtx(Context, Dep.MDepCommand->getQueue()->getContextImplPtr()); + CanBypassDep &= getMemoryConnection( + Device, Context, + Dep.MDepCommand->getQueue()->getDeviceImplPtr(), + Dep.MDepCommand->getQueue()->getContextImplPtr()) == + PI_MEMORY_CONNECTION_UNIFIED; if (!CanBypassDep) { RetDeps.insert(DepCmd); @@ -646,10 +688,13 @@ DepDesc Scheduler::GraphBuilder::findDepForRecord(Command *Cmd, // requirement. 
AllocaCommandBase *Scheduler::GraphBuilder::findAllocaForReq( MemObjRecord *Record, const Requirement *Req, const ContextImplPtr &Context, - bool AllowConst) { - auto IsSuitableAlloca = [&Context, Req, + const DeviceImplPtr &Device, bool AllowConst) { + auto IsSuitableAlloca = [&Device, &Context, Req, AllowConst](AllocaCommandBase *AllocaCmd) { - bool Res = sameCtx(AllocaCmd->getQueue()->getContextImplPtr(), Context); + auto &Queue = AllocaCmd->getQueue(); + bool Res = getMemoryConnection(Queue->getDeviceImplPtr(), + Queue->getContextImplPtr(), Device, + Context) == PI_MEMORY_CONNECTION_UNIFIED; if (IsSuitableSubReq(Req)) { const Requirement *TmpReq = AllocaCmd->getRequirement(); Res &= AllocaCmd->getType() == Command::CommandType::ALLOCA_SUB_BUF; @@ -687,8 +732,9 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq( MemObjRecord *Record, const Requirement *Req, const QueueImplPtr &Queue, std::vector &ToEnqueue) { - AllocaCommandBase *AllocaCmd = findAllocaForReq( - Record, Req, Queue->getContextImplPtr(), /*AllowConst=*/false); + AllocaCommandBase *AllocaCmd = + findAllocaForReq(Record, Req, Queue->getContextImplPtr(), + Queue->getDeviceImplPtr(), /*AllowConst=*/false); if (!AllocaCmd) { std::vector ToCleanUp; @@ -747,6 +793,7 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq( Record->MWriteLeaves.push_back(HostAllocaCmd, ToEnqueue); ++(HostAllocaCmd->MLeafCounter); Record->MCurContext = DefaultHostQueue->getContextImplPtr(); + Record->MCurDevice = DefaultHostQueue->getDeviceImplPtr(); } } } else { @@ -774,8 +821,9 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq( Queue->is_host() ? checkHostUnifiedMemory(Record->MCurContext) : HostUnifiedMemory; if (PinnedHostMemory || HostUnifiedMemoryOnNonHostDevice) { - AllocaCommandBase *LinkedAllocaCmdCand = findAllocaForReq( - Record, Req, Record->MCurContext, /*AllowConst=*/false); + AllocaCommandBase *LinkedAllocaCmdCand = + findAllocaForReq(Record, Req, Record->MCurContext, + Record->MCurDevice, /*AllowConst=*/false); // Cannot setup link if candidate is linked already if (LinkedAllocaCmdCand && @@ -816,9 +864,11 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq( } else { LinkedAllocaCmd->MIsActive = false; Record->MCurContext = Queue->getContextImplPtr(); + Record->MCurDevice = Queue->getDeviceImplPtr(); std::set Deps = - findDepsForReq(Record, Req, Queue->getContextImplPtr()); + findDepsForReq(Record, Req, Queue->getContextImplPtr(), + Queue->getDeviceImplPtr()); for (Command *Dep : Deps) { Command *ConnCmd = AllocaCmd->addDep( DepDesc{Dep, Req, LinkedAllocaCmd}, ToCleanUp); @@ -1025,7 +1075,7 @@ void Scheduler::GraphBuilder::createGraphForCommand( MemObjRecord *Record = nullptr; AllocaCommandBase *AllocaCmd = nullptr; - bool isSameCtx = false; + pi_memory_connection memoryConnection = PI_MEMORY_CONNECTION_NONE; { const QueueImplPtr &QueueForAlloca = @@ -1037,13 +1087,15 @@ void Scheduler::GraphBuilder::createGraphForCommand( AllocaCmd = getOrCreateAllocaForReq(Record, Req, QueueForAlloca, ToEnqueue); - isSameCtx = - sameCtx(QueueForAlloca->getContextImplPtr(), Record->MCurContext); + memoryConnection = + getMemoryConnection(QueueForAlloca->getDeviceImplPtr(), + QueueForAlloca->getContextImplPtr(), + Record->MCurDevice, Record->MCurContext); } // If there is alloca command we need to check if the latest memory is in // required context. 
- if (isSameCtx) { + if (memoryConnection == PI_MEMORY_CONNECTION_UNIFIED) { // If the memory is already in the required host context, check if the // required access mode is valid, remap if not. if (Record->MCurContext->is_host() && @@ -1052,13 +1104,16 @@ void Scheduler::GraphBuilder::createGraphForCommand( } else { // Cannot directly copy memory from OpenCL device to OpenCL device - // create two copies: device->host and host->device. - bool NeedMemMoveToHost = false; + bool NeedMemMoveToHost = memoryConnection == PI_MEMORY_CONNECTION_NONE; auto MemMoveTargetQueue = Queue; if (isInteropTask) { const detail::CGHostTask &HT = static_cast(CG); - if (HT.MQueue->getContextImplPtr() != Record->MCurContext) { + if (getMemoryConnection(HT.MQueue->getDeviceImplPtr(), + HT.MQueue->getContextImplPtr(), + Record->MCurDevice, Record->MCurContext) != + PI_MEMORY_CONNECTION_UNIFIED) { NeedMemMoveToHost = true; MemMoveTargetQueue = HT.MQueue; } @@ -1071,8 +1126,8 @@ void Scheduler::GraphBuilder::createGraphForCommand( ToEnqueue); insertMemoryMove(Record, Req, MemMoveTargetQueue, ToEnqueue); } - std::set Deps = - findDepsForReq(Record, Req, Queue->getContextImplPtr()); + std::set Deps = findDepsForReq( + Record, Req, Queue->getContextImplPtr(), Queue->getDeviceImplPtr()); for (Command *Dep : Deps) { if (Dep != NewCmd) { diff --git a/sycl/source/detail/scheduler/scheduler.hpp b/sycl/source/detail/scheduler/scheduler.hpp index fe7c5f7fbbf8e..78a968f4ab05b 100644 --- a/sycl/source/detail/scheduler/scheduler.hpp +++ b/sycl/source/detail/scheduler/scheduler.hpp @@ -174,11 +174,14 @@ class MockScheduler; namespace sycl { __SYCL_INLINE_VER_NAMESPACE(_V1) { namespace detail { + +class device_impl; class queue_impl; class event_impl; class context_impl; class DispatchHostTask; +using DeviceImplPtr = std::shared_ptr; using ContextImplPtr = std::shared_ptr; using EventImplPtr = std::shared_ptr; using QueueImplPtr = std::shared_ptr; @@ -196,10 +199,11 @@ using FusionMap = std::unordered_map; /// /// \ingroup sycl_graph struct MemObjRecord { - MemObjRecord(ContextImplPtr Ctx, std::size_t LeafLimit, + MemObjRecord(ContextImplPtr Ctx, DeviceImplPtr Dev, std::size_t LeafLimit, LeavesCollection::AllocateDependencyF AllocateDependency) : MReadLeaves{this, LeafLimit, AllocateDependency}, - MWriteLeaves{this, LeafLimit, AllocateDependency}, MCurContext{Ctx} {} + MWriteLeaves{this, LeafLimit, AllocateDependency}, MCurContext{Ctx}, + MCurDevice{Dev} {} // Contains all allocation commands for the memory object. std::vector MAllocaCommands; @@ -213,6 +217,9 @@ struct MemObjRecord { // The context which has the latest state of the memory object. ContextImplPtr MCurContext; + // The device which has the latest state of the memory object. + DeviceImplPtr MCurDevice; + // The mode this object can be accessed with from the host context. // Valid only if the current context is host. access::mode MHostAccess = access::mode::read_write; @@ -652,7 +659,8 @@ class Scheduler { /// Finds dependencies for the requirement. 
std::set findDepsForReq(MemObjRecord *Record, const Requirement *Req, - const ContextImplPtr &Context); + const ContextImplPtr &Context, + const DeviceImplPtr &Device); EmptyCommand *addEmptyCmd(Command *Cmd, const std::vector &Req, @@ -675,6 +683,7 @@ class Scheduler { AllocaCommandBase *findAllocaForReq(MemObjRecord *Record, const Requirement *Req, const ContextImplPtr &Context, + const DeviceImplPtr &Device, bool AllowConst = true); friend class Command; diff --git a/sycl/source/detail/sycl_mem_obj_i.hpp b/sycl/source/detail/sycl_mem_obj_i.hpp index e3dfa11fe35a1..0fe320c99c55e 100644 --- a/sycl/source/detail/sycl_mem_obj_i.hpp +++ b/sycl/source/detail/sycl_mem_obj_i.hpp @@ -18,10 +18,12 @@ namespace detail { class event_impl; class context_impl; +class device_impl; struct MemObjRecord; using EventImplPtr = std::shared_ptr; using ContextImplPtr = std::shared_ptr; +using DeviceImplPtr = std::shared_ptr; // The class serves as an interface in the scheduler for all SYCL memory // objects. @@ -43,8 +45,9 @@ class SYCLMemObjI { // Non null HostPtr requires allocation to be made with USE_HOST_PTR property. // Method returns a pointer to host allocation if Context is host one and // cl_mem obect if not. - virtual void *allocateMem(ContextImplPtr Context, bool InitFromUserData, - void *HostPtr, RT::PiEvent &InteropEvent) = 0; + virtual void *allocateMem(ContextImplPtr Context, DeviceImplPtr Device, + bool InitFromUserData, void *HostPtr, + RT::PiEvent &InteropEvent) = 0; // Should be used for memory object created without use_host_ptr property. virtual void *allocateHostMem() = 0; diff --git a/sycl/source/detail/sycl_mem_obj_t.hpp b/sycl/source/detail/sycl_mem_obj_t.hpp index dfd01b88c5a5a..abc9198ca5d12 100644 --- a/sycl/source/detail/sycl_mem_obj_t.hpp +++ b/sycl/source/detail/sycl_mem_obj_t.hpp @@ -238,12 +238,24 @@ class __SYCL_EXPORT SYCLMemObjT : public SYCLMemObjI { static size_t getBufSizeForContext(const ContextImplPtr &Context, pi_native_handle MemObject); - void *allocateMem(ContextImplPtr Context, bool InitFromUserData, - void *HostPtr, RT::PiEvent &InteropEvent) override { + // FIXME: Remove this when the class is removed from __SYCL_EXPORT + virtual void *allocateMem(ContextImplPtr Context, bool InitFromUserData, + void *HostPtr, RT::PiEvent &InteropEvent) { (void)Context; (void)InitFromUserData; (void)HostPtr; (void)InteropEvent; + assert(false && "Deprecated: use the overload with the device parameter"); + } + + void *allocateMem(ContextImplPtr Context, DeviceImplPtr Device, + bool InitFromUserData, void *HostPtr, + RT::PiEvent &InteropEvent) override { + (void)Context; + (void)Device; + (void)InitFromUserData; + (void)HostPtr; + (void)InteropEvent; throw runtime_error("Not implemented", PI_ERROR_INVALID_OPERATION); } diff --git a/sycl/test-e2e/Basic/alloc_pinned_host_memory.cpp b/sycl/test-e2e/Basic/alloc_pinned_host_memory.cpp index 0d8a0267504c0..16e64c3eb8aad 100644 --- a/sycl/test-e2e/Basic/alloc_pinned_host_memory.cpp +++ b/sycl/test-e2e/Basic/alloc_pinned_host_memory.cpp @@ -39,4 +39,5 @@ int main() { // CHECK:---> piMemBufferCreate // CHECK:---> piMemBufferCreate // CHECK-NEXT: {{.*}} : {{.*}} +// CHECK-NEXT: {{.*}} : {{.*}} // CHECK-NEXT: {{.*}} : 17 diff --git a/sycl/test-e2e/Basic/buffer/native_buffer_creation_flags.cpp b/sycl/test-e2e/Basic/buffer/native_buffer_creation_flags.cpp index bb29cef6bcc42..3b2655cf3df21 100644 --- a/sycl/test-e2e/Basic/buffer/native_buffer_creation_flags.cpp +++ b/sycl/test-e2e/Basic/buffer/native_buffer_creation_flags.cpp @@ -21,6 +21,7 
@@ int main() { // buffer is created with the PI_MEM_FLAGS_HOST_PTR_USE flag. // CHECK: piMemBufferCreate // CHECK-NEXT: {{.*}} : {{.*}} + // CHECK-NEXT: {{.*}} : {{.*}} // CHECK-NEXT: {{.*}} : 9 auto BufAcc = Buf.get_access(Cgh); Cgh.single_task([=]() { int A = BufAcc[0]; }); diff --git a/sycl/test-e2e/Basic/use_pinned_host_memory.cpp b/sycl/test-e2e/Basic/use_pinned_host_memory.cpp index 9efb9115b9c10..18378945ede65 100644 --- a/sycl/test-e2e/Basic/use_pinned_host_memory.cpp +++ b/sycl/test-e2e/Basic/use_pinned_host_memory.cpp @@ -43,4 +43,5 @@ int main() { // CHECK:---> piMemBufferCreate // CHECK-NEXT: {{.*}} : {{.*}} +// CHECK-NEXT: {{.*}} : {{.*}} // CHECK-NEXT: {{.*}} : 17 diff --git a/sycl/test-e2e/Tracing/pi_tracing_test.cpp b/sycl/test-e2e/Tracing/pi_tracing_test.cpp index c91e93cd21856..869a612921892 100644 --- a/sycl/test-e2e/Tracing/pi_tracing_test.cpp +++ b/sycl/test-e2e/Tracing/pi_tracing_test.cpp @@ -13,6 +13,7 @@ // CHECK: // CHECK: ---> piMemBufferCreate( // CHECK-NEXT: : {{0[xX]?[0-9a-fA-F]*}} +// CHECK-NEXT: : {{0[xX]?[0-9a-fA-F]*}} // CHECK-NEXT: : 1 // CHECK-NEXT: : 40 // CHECK-NEXT: : 0 diff --git a/sycl/test/abi/pi_level_zero_symbol_check.dump b/sycl/test/abi/pi_level_zero_symbol_check.dump index 4363024cda8a5..3c3ca21eee156 100644 --- a/sycl/test/abi/pi_level_zero_symbol_check.dump +++ b/sycl/test/abi/pi_level_zero_symbol_check.dump @@ -94,6 +94,7 @@ piextEnqueueWriteHostPipe piextEventCreateWithNativeHandle piextEventGetNativeHandle piextGetDeviceFunctionPointer +piextGetMemoryConnection piextKernelCreateWithNativeHandle piextKernelGetNativeHandle piextKernelSetArgMemObj diff --git a/sycl/test/abi/pi_opencl_symbol_check.dump b/sycl/test/abi/pi_opencl_symbol_check.dump index 9707c21163b95..79bfb8e299d0e 100644 --- a/sycl/test/abi/pi_opencl_symbol_check.dump +++ b/sycl/test/abi/pi_opencl_symbol_check.dump @@ -42,6 +42,7 @@ piextEnqueueReadHostPipe piextEnqueueWriteHostPipe piextEventCreateWithNativeHandle piextGetDeviceFunctionPointer +piextGetMemoryConnection piextKernelCreateWithNativeHandle piextKernelGetNativeHandle piextKernelSetArgMemObj diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump index d96b589b9a846..f1b5fc606bca8 100644 --- a/sycl/test/abi/sycl_symbols_linux.dump +++ b/sycl/test/abi/sycl_symbols_linux.dump @@ -3719,6 +3719,7 @@ _ZN4sycl3_V16ONEAPI15filter_selectorC1ERKNSt7__cxx1112basic_stringIcSt11char_tra _ZN4sycl3_V16ONEAPI15filter_selectorC2ERKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEE _ZN4sycl3_V16detail10build_implERKNS0_13kernel_bundleILNS0_12bundle_stateE0EEERKSt6vectorINS0_6deviceESaIS8_EERKNS0_13property_listE _ZN4sycl3_V16detail10image_impl10getDevicesESt10shared_ptrINS1_12context_implEE +_ZN4sycl3_V16detail10image_impl11allocateMemESt10shared_ptrINS1_12context_implEES3_INS1_11device_implEEbPvRP9_pi_event _ZN4sycl3_V16detail10image_impl11allocateMemESt10shared_ptrINS1_12context_implEEbPvRP9_pi_event _ZN4sycl3_V16detail10image_impl14checkImageDescERK14_pi_image_descSt10shared_ptrINS1_12context_implEEPv _ZN4sycl3_V16detail10image_impl16checkImageFormatERK16_pi_image_formatSt10shared_ptrINS1_12context_implEE @@ -3737,6 +3738,7 @@ _ZN4sycl3_V16detail11SYCLMemObjTC1EmRKNS0_7contextEbNS0_5eventESt10unique_ptrINS _ZN4sycl3_V16detail11SYCLMemObjTC1EmRKNS0_7contextEmNS0_5eventESt10unique_ptrINS1_19SYCLMemObjAllocatorESt14default_deleteIS8_EE _ZN4sycl3_V16detail11SYCLMemObjTC2EmRKNS0_7contextEbNS0_5eventESt10unique_ptrINS1_19SYCLMemObjAllocatorESt14default_deleteIS8_EE 
_ZN4sycl3_V16detail11SYCLMemObjTC2EmRKNS0_7contextEmNS0_5eventESt10unique_ptrINS1_19SYCLMemObjAllocatorESt14default_deleteIS8_EE +_ZN4sycl3_V16detail11buffer_impl11allocateMemESt10shared_ptrINS1_12context_implEES3_INS1_11device_implEEbPvRP9_pi_event _ZN4sycl3_V16detail11buffer_impl11allocateMemESt10shared_ptrINS1_12context_implEEbPvRP9_pi_event _ZN4sycl3_V16detail11buffer_impl22destructorNotificationEPv _ZN4sycl3_V16detail11buffer_impl23constructorNotificationERKNS1_13code_locationEPvPKvS8_jjPm @@ -3808,10 +3810,14 @@ _ZN4sycl3_V16detail13MemoryManager12prefetch_usmEPvSt10shared_ptrINS1_10queue_im _ZN4sycl3_V16detail13MemoryManager13memset_2d_usmEPvSt10shared_ptrINS1_10queue_implEEmmmcSt6vectorIP9_pi_eventSaIS9_EEPS9_ _ZN4sycl3_V16detail13MemoryManager13releaseMemObjESt10shared_ptrINS1_12context_implEEPNS1_11SYCLMemObjIEPvS8_ _ZN4sycl3_V16detail13MemoryManager16allocateMemImageESt10shared_ptrINS1_12context_implEEPNS1_11SYCLMemObjIEPvbmRK14_pi_image_descRK16_pi_image_formatRKS3_INS1_10event_implEERKS5_RKNS0_13property_listERP9_pi_event +_ZN4sycl3_V16detail13MemoryManager16allocateMemImageESt10shared_ptrINS1_12context_implEES3_INS1_11device_implEEPNS1_11SYCLMemObjIEPvbmRK14_pi_image_descRK16_pi_image_formatRKS3_INS1_10event_implEERKS5_RKNS0_13property_listERP9_pi_event _ZN4sycl3_V16detail13MemoryManager17allocateMemBufferESt10shared_ptrINS1_12context_implEEPNS1_11SYCLMemObjIEPvbmRKS3_INS1_10event_implEERKS5_RKNS0_13property_listERP9_pi_event +_ZN4sycl3_V16detail13MemoryManager17allocateMemBufferESt10shared_ptrINS1_12context_implEES3_INS1_11device_implEEPNS1_11SYCLMemObjIEPvbmRKS3_INS1_10event_implEERKS5_RKNS0_13property_listERP9_pi_event _ZN4sycl3_V16detail13MemoryManager18allocateHostMemoryEPNS1_11SYCLMemObjIEPvbmRKNS0_13property_listE _ZN4sycl3_V16detail13MemoryManager19allocateImageObjectESt10shared_ptrINS1_12context_implEEPvbRK14_pi_image_descRK16_pi_image_formatRKNS0_13property_listE +_ZN4sycl3_V16detail13MemoryManager19allocateImageObjectESt10shared_ptrINS1_12context_implEES3_INS1_11device_implEEPvbRK14_pi_image_descRK16_pi_image_formatRKNS0_13property_listE _ZN4sycl3_V16detail13MemoryManager20allocateBufferObjectESt10shared_ptrINS1_12context_implEEPvbmRKNS0_13property_listE +_ZN4sycl3_V16detail13MemoryManager20allocateBufferObjectESt10shared_ptrINS1_12context_implEES3_INS1_11device_implEEPvbmRKNS0_13property_listE _ZN4sycl3_V16detail13MemoryManager20allocateMemSubBufferESt10shared_ptrINS1_12context_implEEPvmmNS0_5rangeILi3EEESt6vectorIS3_INS1_10event_implEESaISB_EERP9_pi_event _ZN4sycl3_V16detail13MemoryManager21copy_to_device_globalEPKvbSt10shared_ptrINS1_10queue_implEEmmS4_lRKSt6vectorIP9_pi_eventSaISA_EEPSA_ _ZN4sycl3_V16detail13MemoryManager23copy_from_device_globalEPKvbSt10shared_ptrINS1_10queue_implEEmmPvlRKSt6vectorIP9_pi_eventSaISB_EEPSB_ @@ -3822,6 +3828,7 @@ _ZN4sycl3_V16detail13MemoryManager4fillEPNS1_11SYCLMemObjIEPvSt10shared_ptrINS1_ _ZN4sycl3_V16detail13MemoryManager5unmapEPNS1_11SYCLMemObjIEPvSt10shared_ptrINS1_10queue_implEES5_St6vectorIP9_pi_eventSaISB_EERSB_ _ZN4sycl3_V16detail13MemoryManager7releaseESt10shared_ptrINS1_12context_implEEPNS1_11SYCLMemObjIEPvSt6vectorIS3_INS1_10event_implEESaISB_EERP9_pi_event _ZN4sycl3_V16detail13MemoryManager8allocateESt10shared_ptrINS1_12context_implEEPNS1_11SYCLMemObjIEbPvSt6vectorIS3_INS1_10event_implEESaISB_EERP9_pi_event +_ZN4sycl3_V16detail13MemoryManager8allocateESt10shared_ptrINS1_12context_implEES3_INS1_11device_implEEPNS1_11SYCLMemObjIEbPvSt6vectorIS3_INS1_10event_implEESaISD_EERP9_pi_event 
_ZN4sycl3_V16detail13MemoryManager8copy_usmEPKvSt10shared_ptrINS1_10queue_implEEmPvSt6vectorIP9_pi_eventSaISB_EEPSB_ _ZN4sycl3_V16detail13MemoryManager8fill_usmEPvSt10shared_ptrINS1_10queue_implEEmiSt6vectorIP9_pi_eventSaIS9_EEPS9_ _ZN4sycl3_V16detail13host_pipe_map3addEPKvPKc diff --git a/sycl/tools/sycl-sanitize/collector.cpp b/sycl/tools/sycl-sanitize/collector.cpp index 86b5bb74b6939..9f4b11a718937 100644 --- a/sycl/tools/sycl-sanitize/collector.cpp +++ b/sycl/tools/sycl-sanitize/collector.cpp @@ -94,8 +94,8 @@ static void handleUSMFree(const pi_plugin &, std::optional, } static void handleMemBufferCreate(const pi_plugin &, std::optional, - pi_context, pi_mem_flags, size_t Size, - void *HostPtr, pi_mem *, + pi_context, pi_device, pi_mem_flags, + size_t Size, void *HostPtr, pi_mem *, const pi_mem_properties *) { for (const auto &Alloc : GS->ActivePointers) { const void *Begin = Alloc.first; diff --git a/sycl/unittests/SYCL2020/GetNativeOpenCL.cpp b/sycl/unittests/SYCL2020/GetNativeOpenCL.cpp index e89989c8d9b0d..3ea27f72457da 100644 --- a/sycl/unittests/SYCL2020/GetNativeOpenCL.cpp +++ b/sycl/unittests/SYCL2020/GetNativeOpenCL.cpp @@ -57,8 +57,8 @@ static pi_result redefinedMemRetain(pi_mem c) { return PI_SUCCESS; } -pi_result redefinedMemBufferCreate(pi_context, pi_mem_flags, size_t size, - void *, pi_mem *, +pi_result redefinedMemBufferCreate(pi_context, pi_device, pi_mem_flags, + size_t size, void *, pi_mem *, const pi_mem_properties *) { return PI_SUCCESS; } diff --git a/sycl/unittests/assert/assert.cpp b/sycl/unittests/assert/assert.cpp index 3a9482193fbba..90c522c2b30a8 100644 --- a/sycl/unittests/assert/assert.cpp +++ b/sycl/unittests/assert/assert.cpp @@ -205,15 +205,15 @@ redefinedEnqueueKernelLaunchAfter(pi_queue, pi_kernel, pi_uint32, static pi_result redefinedEventsWaitPositive(pi_uint32 num_events, const pi_event *event_list) { - // there should be two events: one is for memory map and the other is for - // copier kernel - assert(num_events == 2); + std::stringstream s("Waiting for events:"); + for (pi_uint32 i = 0; i < num_events; ++i) { + s << " " << reinterpret_cast(event_list[i])[0]; + } + s << "\n"; - int EventIdx1 = reinterpret_cast(event_list[0])[0]; - int EventIdx2 = reinterpret_cast(event_list[1])[0]; // This output here is to reduce amount of time requried to debug/reproduce // a failing test upon feature break - printf("Waiting for events %i, %i\n", EventIdx1, EventIdx2); + printf("%s", s.str().c_str()); return PI_SUCCESS; } diff --git a/sycl/unittests/buffer/BufferLocation.cpp b/sycl/unittests/buffer/BufferLocation.cpp index 77d674f602641..540342116e81d 100644 --- a/sycl/unittests/buffer/BufferLocation.cpp +++ b/sycl/unittests/buffer/BufferLocation.cpp @@ -20,8 +20,8 @@ const uint64_t DEFAULT_VALUE = 7777; static uint64_t PassedLocation = DEFAULT_VALUE; -pi_result redefinedMemBufferCreate(pi_context, pi_mem_flags, size_t size, - void *, pi_mem *, +pi_result redefinedMemBufferCreate(pi_context, pi_device, pi_mem_flags, + size_t size, void *, pi_mem *, const pi_mem_properties *properties) { PassedLocation = DEFAULT_VALUE; if (!properties) diff --git a/sycl/unittests/event/EventDestruction.cpp b/sycl/unittests/event/EventDestruction.cpp index 20427b7623cf8..f1b3377510966 100644 --- a/sycl/unittests/event/EventDestruction.cpp +++ b/sycl/unittests/event/EventDestruction.cpp @@ -21,8 +21,8 @@ static pi_result redefinedEventRelease(pi_event event) { return PI_SUCCESS; } -pi_result redefinedMemBufferCreate(pi_context, pi_mem_flags, size_t size, - void *, pi_mem *, 
+pi_result redefinedMemBufferCreate(pi_context, pi_device, pi_mem_flags, + size_t size, void *, pi_mem *, const pi_mem_properties *) { return PI_SUCCESS; } diff --git a/sycl/unittests/helpers/PiMockPlugin.hpp b/sycl/unittests/helpers/PiMockPlugin.hpp index 0faa30e9b7407..1121f28fee7d2 100644 --- a/sycl/unittests/helpers/PiMockPlugin.hpp +++ b/sycl/unittests/helpers/PiMockPlugin.hpp @@ -399,8 +399,8 @@ inline pi_result mock_piextQueueCreateWithNativeHandle( // Memory // inline pi_result -mock_piMemBufferCreate(pi_context context, pi_mem_flags flags, size_t size, - void *host_ptr, pi_mem *ret_mem, +mock_piMemBufferCreate(pi_context context, pi_device device, pi_mem_flags flags, + size_t size, void *host_ptr, pi_mem *ret_mem, const pi_mem_properties *properties = nullptr) { if (host_ptr && flags & PI_MEM_FLAGS_HOST_PTR_USE) *ret_mem = createDummyHandleWithData( @@ -410,7 +410,8 @@ mock_piMemBufferCreate(pi_context context, pi_mem_flags flags, size_t size, return PI_SUCCESS; } -inline pi_result mock_piMemImageCreate(pi_context context, pi_mem_flags flags, +inline pi_result mock_piMemImageCreate(pi_context context, pi_device device, + pi_mem_flags flags, const pi_image_format *image_format, const pi_image_desc *image_desc, void *host_ptr, pi_mem *ret_mem) { @@ -476,6 +477,13 @@ mock_piextMemCreateWithNativeHandle(pi_native_handle nativeHandle, return PI_SUCCESS; } +inline pi_result mock_piextGetMemoryConnection(pi_device dev1, pi_context ctx1, + pi_device dev2, pi_context ctx2, + pi_memory_connection *res) { + *res = PI_MEMORY_CONNECTION_NONE; + return PI_SUCCESS; +} + // // Program // diff --git a/sycl/unittests/pi/EnqueueMemTest.cpp b/sycl/unittests/pi/EnqueueMemTest.cpp index 52f52e52ea15d..7ae5cd9dbdd62 100644 --- a/sycl/unittests/pi/EnqueueMemTest.cpp +++ b/sycl/unittests/pi/EnqueueMemTest.cpp @@ -51,7 +51,7 @@ class EnqueueMemTest : public testing::TestWithParam { PI_SUCCESS); ASSERT_EQ((plugin.call_nocheck( - _context, PI_MEM_FLAGS_ACCESS_RW, + _context, _device, PI_MEM_FLAGS_ACCESS_RW, _numElementsX * _numElementsY * sizeof(pi_int32), nullptr, &_mem, nullptr)), PI_SUCCESS); diff --git a/sycl/unittests/pi/cuda/test_base_objects.cpp b/sycl/unittests/pi/cuda/test_base_objects.cpp index 9bcc9e9f24d56..b710ff5bbcced 100644 --- a/sycl/unittests/pi/cuda/test_base_objects.cpp +++ b/sycl/unittests/pi/cuda/test_base_objects.cpp @@ -71,13 +71,7 @@ TEST_F(CudaBaseObjectsTest, piContextCreate) { << "piContextCreate failed.\n"; EXPECT_NE(ctxt, nullptr); - EXPECT_EQ(ctxt->get_device(), device); - - // Retrieve the cuCtxt to check information is correct - CUcontext cudaContext = ctxt->get(); - unsigned int version = 0; - cuCtxGetApiVersion(cudaContext, &version); - EXPECT_EQ(version, LATEST_KNOWN_CUDA_DRIVER_API_VERSION); + EXPECT_EQ(ctxt->get_devices()[0], device); ASSERT_EQ((plugin->call_nocheck(ctxt)), PI_SUCCESS); @@ -110,7 +104,7 @@ TEST_F(CudaBaseObjectsTest, piContextCreateChildThread) { // Retrieve the cuCtxt to check information is correct auto checkValue = [=]() { - CUcontext cudaContext = ctxt->get(); + CUcontext cudaContext = device->get_context(); unsigned int version = 0; auto cuErr = cuCtxGetApiVersion(cudaContext, &version); EXPECT_EQ(cuErr, CUDA_SUCCESS); diff --git a/sycl/unittests/pi/cuda/test_commands.cpp b/sycl/unittests/pi/cuda/test_commands.cpp index 5a57aa8471f13..93553052a14eb 100644 --- a/sycl/unittests/pi/cuda/test_commands.cpp +++ b/sycl/unittests/pi/cuda/test_commands.cpp @@ -85,10 +85,10 @@ TEST_F(CudaCommandsTest, PIEnqueueReadBufferBlocking) { int output[memSize] = {}; 
diff --git a/sycl/unittests/pi/cuda/test_commands.cpp b/sycl/unittests/pi/cuda/test_commands.cpp
index 5a57aa8471f13..93553052a14eb 100644
--- a/sycl/unittests/pi/cuda/test_commands.cpp
+++ b/sycl/unittests/pi/cuda/test_commands.cpp
@@ -85,10 +85,10 @@ TEST_F(CudaCommandsTest, PIEnqueueReadBufferBlocking) {
   int output[memSize] = {};
 
   pi_mem memObj;
-  ASSERT_EQ(
-      (plugin->call_nocheck(
-          context_, PI_MEM_FLAGS_ACCESS_RW, bytes, nullptr, &memObj, nullptr)),
-      PI_SUCCESS);
+  ASSERT_EQ((plugin->call_nocheck(
+                context_, device_, PI_MEM_FLAGS_ACCESS_RW, bytes, nullptr,
+                &memObj, nullptr)),
+            PI_SUCCESS);
 
   ASSERT_EQ((plugin->call_nocheck(
                 queue_, memObj, true, 0, bytes, data, 0, nullptr, nullptr)),
@@ -115,10 +115,10 @@ TEST_F(CudaCommandsTest, PIEnqueueReadBufferNonBlocking) {
   int output[memSize] = {};
 
   pi_mem memObj;
-  ASSERT_EQ(
-      (plugin->call_nocheck(
-          context_, PI_MEM_FLAGS_ACCESS_RW, bytes, nullptr, &memObj, nullptr)),
-      PI_SUCCESS);
+  ASSERT_EQ((plugin->call_nocheck(
+                context_, device_, PI_MEM_FLAGS_ACCESS_RW, bytes, nullptr,
+                &memObj, nullptr)),
+            PI_SUCCESS);
 
   pi_event cpIn, cpOut;
   ASSERT_EQ((plugin->call_nocheck(
diff --git a/sycl/unittests/pi/cuda/test_contexts.cpp b/sycl/unittests/pi/cuda/test_contexts.cpp
index d021081716b9a..7411507e25343 100644
--- a/sycl/unittests/pi/cuda/test_contexts.cpp
+++ b/sycl/unittests/pi/cuda/test_contexts.cpp
@@ -87,14 +87,10 @@ TEST_F(CudaContextsTest, ContextLifetime) {
   // check that the context is now the active CUDA context
   CUcontext cudaCtxt = nullptr;
   cuCtxGetCurrent(&cudaCtxt);
-  ASSERT_EQ(cudaCtxt, context->get());
+  ASSERT_EQ(cudaCtxt, queue->get_native_context());
 
   plugin->call(queue);
   plugin->call(context);
-
-  // check that the context was cleaned up properly by the destructor
-  cuCtxGetCurrent(&cudaCtxt);
-  ASSERT_EQ(cudaCtxt, nullptr);
 }
 
 TEST_F(CudaContextsTest, ContextLifetimeExisting) {
@@ -126,124 +122,11 @@ TEST_F(CudaContextsTest, ContextLifetimeExisting) {
   // check that the context is now the active CUDA context
   cuCtxGetCurrent(&current);
-  ASSERT_EQ(current, context->get());
+  ASSERT_EQ(current, queue->get_native_context());
 
   plugin->call(queue);
   plugin->call(context);
 
-  // check that the context was cleaned up, the old context will be restored
-  // automatically by cuCtxDestroy in piContextRelease, as it was pushed on the
-  // stack bu cuCtxCreate
-  cuCtxGetCurrent(&current);
-  ASSERT_EQ(current, original);
-
   // release original context
   cuCtxDestroy(original);
 }
-
-// In some cases (for host_task), the SYCL runtime may call PI API functions
-// from threads of the thread pool, this can cause issues because with the CUDA
-// plugin these functions will set an active CUDA context on these threads, but
-// never clean it up, as it will only get cleaned up in the main thread.
-//
-// So the following test aims to reproduce the scenario where there is a
-// dangling deleted context in a separate thread and seeing if the PI calls are
-// still able to work correctly in that thread.
-TEST_F(CudaContextsTest, ContextThread) {
-  // start with no active context
-  pi::clearCudaContext();
-
-  // create two PI contexts
-  pi_context context1;
-  ASSERT_EQ((plugin->call_nocheck(
-                nullptr, 1, &device_, nullptr, nullptr, &context1)),
-            PI_SUCCESS);
-  ASSERT_NE(context1, nullptr);
-
-  pi_context context2;
-  ASSERT_EQ((plugin->call_nocheck(
-                nullptr, 1, &device_, nullptr, nullptr, &context2)),
-            PI_SUCCESS);
-  ASSERT_NE(context2, nullptr);
-
-  // setup synchronization variables between the main thread and the testing
-  // thread
-  std::mutex m;
-  std::condition_variable cv;
-  bool released = false;
-  bool thread_done = false;
-
-  // create a testing thread that will create a queue with the first context,
-  // release the queue, then wait for the main thread to release the first
-  // context, and then create and release another queue with the second context
-  // this time
-  auto test_thread = std::thread([&] {
-    CUcontext current = nullptr;
-
-    // create a queue with the first context
-    pi_queue queue;
-    ASSERT_EQ((plugin->call_nocheck(
-                  context1, device_, 0, &queue)),
-              PI_SUCCESS);
-    ASSERT_NE(queue, nullptr);
-
-    // ensure the queue has the correct context
-    ASSERT_EQ(context1, queue->get_context());
-
-    // check that the first context is now the active CUDA context
-    cuCtxGetCurrent(&current);
-    ASSERT_EQ(current, context1->get());
-
-    plugin->call(queue);
-
-    // mark the first set of processing as done and notify the main thread
-    std::unique_lock lock(m);
-    thread_done = true;
-    lock.unlock();
-    cv.notify_one();
-
-    // wait for the main thread to release the first context
-    lock.lock();
-    cv.wait(lock, [&] { return released; });
-
-    // check that the first context is still active, this is because deleting a
-    // context only cleans up the current thread
-    cuCtxGetCurrent(&current);
-    ASSERT_EQ(current, context1->get());
-
-    // create a queue with the second context
-    ASSERT_EQ((plugin->call_nocheck(
-                  context2, device_, 0, &queue)),
-              PI_SUCCESS);
-    ASSERT_NE(queue, nullptr);
-
-    // ensure the queue has the correct context
-    ASSERT_EQ(context2, queue->get_context());
-
-    // check that the second context is now the active CUDA context
-    cuCtxGetCurrent(&current);
-    ASSERT_EQ(current, context2->get());
-
-    plugin->call(queue);
-  });
-
-  // wait for the thread to be done with the first queue to release the first context
-  std::unique_lock lock(m);
-  cv.wait(lock, [&] { return thread_done; });
-  plugin->call(context1);
-
-  // notify the other thread that the context was released
-  released = true;
-  lock.unlock();
-  cv.notify_one();
-
-  // wait for the thread to finish
-  test_thread.join();
-
-  plugin->call(context2);
-
-  // check that there is no context set on the main thread
-  CUcontext current = nullptr;
-  cuCtxGetCurrent(&current);
-  ASSERT_EQ(current, nullptr);
-}
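With the native CUDA context now reachable through the device and the queue (device->get_context() and queue->get_native_context() above), the remaining lifetime tests no longer assert that releasing a pi_context clears the thread's active CUcontext; they only compare the driver's current context against what the queue reports. The essence of that check, condensed into a sketch (same local names as the updated ContextLifetime test, plus a check of the CUresult that the test itself ignores):

  CUcontext cudaCtxt = nullptr;
  ASSERT_EQ(cuCtxGetCurrent(&cudaCtxt), CUDA_SUCCESS);
  // The active driver context should be the queue's native context.
  ASSERT_EQ(cudaCtxt, queue->get_native_context());
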
diff --git a/sycl/unittests/pi/cuda/test_kernels.cpp b/sycl/unittests/pi/cuda/test_kernels.cpp
index d485bb218a1dc..e9c1a6b788655 100644
--- a/sycl/unittests/pi/cuda/test_kernels.cpp
+++ b/sycl/unittests/pi/cuda/test_kernels.cpp
@@ -306,8 +306,8 @@ TEST_F(CudaKernelsTest, PIKernelSetMemObj) {
   size_t memSize = 1024u;
   pi_mem memObj;
   ASSERT_EQ((plugin->call_nocheck(
-                context_, PI_MEM_FLAGS_ACCESS_RW, memSize, nullptr, &memObj,
-                nullptr)),
+                context_, device_, PI_MEM_FLAGS_ACCESS_RW, memSize, nullptr,
+                &memObj, nullptr)),
             PI_SUCCESS);
 
   ASSERT_EQ((plugin->call_nocheck(
@@ -342,8 +342,8 @@ TEST_F(CudaKernelsTest, PIkerneldispatch) {
   size_t memSize = 1024u;
   pi_mem memObj;
   ASSERT_EQ((plugin->call_nocheck(
-                context_, PI_MEM_FLAGS_ACCESS_RW, memSize, nullptr, &memObj,
-                nullptr)),
+                context_, device_, PI_MEM_FLAGS_ACCESS_RW, memSize, nullptr,
+                &memObj, nullptr)),
             PI_SUCCESS);
 
   ASSERT_EQ((plugin->call_nocheck(
@@ -386,14 +386,14 @@ TEST_F(CudaKernelsTest, PIkerneldispatchTwo) {
   size_t memSize = 1024u;
   pi_mem memObj;
   ASSERT_EQ((plugin->call_nocheck(
-                context_, PI_MEM_FLAGS_ACCESS_RW, memSize, nullptr, &memObj,
-                nullptr)),
+                context_, device_, PI_MEM_FLAGS_ACCESS_RW, memSize, nullptr,
+                &memObj, nullptr)),
             PI_SUCCESS);
 
   pi_mem memObj2;
   ASSERT_EQ((plugin->call_nocheck(
-                context_, PI_MEM_FLAGS_ACCESS_RW, memSize, nullptr, &memObj2,
-                nullptr)),
+                context_, device_, PI_MEM_FLAGS_ACCESS_RW, memSize, nullptr,
+                &memObj2, nullptr)),
             PI_SUCCESS);
 
   ASSERT_EQ((plugin->call_nocheck(
diff --git a/sycl/unittests/pi/cuda/test_mem_obj.cpp b/sycl/unittests/pi/cuda/test_mem_obj.cpp
index 46fc4a007526d..ba76c4a8bd400 100644
--- a/sycl/unittests/pi/cuda/test_mem_obj.cpp
+++ b/sycl/unittests/pi/cuda/test_mem_obj.cpp
@@ -75,8 +75,8 @@ TEST_F(CudaTestMemObj, piMemBufferCreateSimple) {
   const size_t memSize = 1024u;
   pi_mem memObj;
   ASSERT_EQ((plugin->call_nocheck(
-                context_, PI_MEM_FLAGS_ACCESS_RW, memSize, nullptr, &memObj,
-                nullptr)),
+                context_, device_, PI_MEM_FLAGS_ACCESS_RW, memSize, nullptr,
+                &memObj, nullptr)),
             PI_SUCCESS);
 
   ASSERT_EQ((plugin->call_nocheck(memObj)),
@@ -87,8 +87,9 @@ TEST_F(CudaTestMemObj, piMemBufferAllocHost) {
   const size_t memSize = 1024u;
   pi_mem memObj;
   ASSERT_EQ((plugin->call_nocheck(
-                context_, PI_MEM_FLAGS_ACCESS_RW | PI_MEM_FLAGS_HOST_PTR_ALLOC,
-                memSize, nullptr, &memObj, nullptr)),
+                context_, device_,
+                PI_MEM_FLAGS_ACCESS_RW | PI_MEM_FLAGS_HOST_PTR_ALLOC, memSize,
+                nullptr, &memObj, nullptr)),
             PI_SUCCESS);
 
   ASSERT_EQ((plugin->call_nocheck(memObj)),
@@ -116,8 +117,8 @@ TEST_F(CudaTestMemObj, piMemBufferCreateNoActiveContext) {
   // to allocate the memory object
   pi_mem memObj;
   ASSERT_EQ((plugin->call_nocheck(
-                context_, PI_MEM_FLAGS_ACCESS_RW, memSize, nullptr, &memObj,
-                nullptr)),
+                context_, device_, PI_MEM_FLAGS_ACCESS_RW, memSize, nullptr,
+                &memObj, nullptr)),
             PI_SUCCESS);
 
   ASSERT_NE(memObj, nullptr);
@@ -138,8 +139,9 @@ TEST_F(CudaTestMemObj, piMemBufferPinnedMappedRead) {
 
   pi_mem memObj;
   ASSERT_EQ((plugin->call_nocheck(
-                context_, PI_MEM_FLAGS_ACCESS_RW | PI_MEM_FLAGS_HOST_PTR_ALLOC,
-                memSize, nullptr, &memObj, nullptr)),
+                context_, device_,
+                PI_MEM_FLAGS_ACCESS_RW | PI_MEM_FLAGS_HOST_PTR_ALLOC, memSize,
+                nullptr, &memObj, nullptr)),
             PI_SUCCESS);
 
   ASSERT_EQ(
@@ -177,8 +179,9 @@ TEST_F(CudaTestMemObj, piMemBufferPinnedMappedWrite) {
 
   pi_mem memObj;
   ASSERT_EQ((plugin->call_nocheck(
-                context_, PI_MEM_FLAGS_ACCESS_RW | PI_MEM_FLAGS_HOST_PTR_ALLOC,
-                memSize, nullptr, &memObj, nullptr)),
+                context_, device_,
+                PI_MEM_FLAGS_ACCESS_RW | PI_MEM_FLAGS_HOST_PTR_ALLOC, memSize,
+                nullptr, &memObj, nullptr)),
             PI_SUCCESS);
 
   int *host_ptr = nullptr;
diff --git a/sycl/unittests/scheduler/InOrderQueueDeps.cpp b/sycl/unittests/scheduler/InOrderQueueDeps.cpp
index be5e8d874da17..52d807d9a6773 100644
--- a/sycl/unittests/scheduler/InOrderQueueDeps.cpp
+++ b/sycl/unittests/scheduler/InOrderQueueDeps.cpp
@@ -17,8 +17,8 @@ using namespace sycl;
 
 static pi_result
-redefinedMemBufferCreate(pi_context context, pi_mem_flags flags, size_t size,
-                         void *host_ptr, pi_mem *ret_mem,
+redefinedMemBufferCreate(pi_context context, pi_device, pi_mem_flags flags,
+                         size_t size, void *host_ptr, pi_mem *ret_mem,
                          const pi_mem_properties *properties = nullptr) {
   return PI_SUCCESS;
 }
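The scheduler-side hunks that follow thread a device through the code paths that previously carried only a context: allocateMem takes an extra DeviceImplPtr and MemObjRecord is now constructed with the queue's device as well as its context. A sketch of the updated record construction (mirroring the LinkedAllocaDependencies change below; using std::make_shared instead of the raw new expression is the only liberty taken):

  // Build a record keyed on both the context and the device of the queue.
  auto Record = std::make_shared<sycl::detail::MemObjRecord>(
      DefaultHostQueue->getContextImplPtr(),
      DefaultHostQueue->getDeviceImplPtr(), 10, AllocaDep);
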
diff --git a/sycl/unittests/scheduler/LinkedAllocaDependencies.cpp b/sycl/unittests/scheduler/LinkedAllocaDependencies.cpp
index b9a4726acb7fd..0a7a90c78767c 100644
--- a/sycl/unittests/scheduler/LinkedAllocaDependencies.cpp
+++ b/sycl/unittests/scheduler/LinkedAllocaDependencies.cpp
@@ -16,6 +16,7 @@ using namespace sycl;
 class MemObjMock : public sycl::detail::SYCLMemObjI {
 public:
   using ContextImplPtr = std::shared_ptr;
+  using DeviceImplPtr = std::shared_ptr;
 
   MemObjMock(const std::shared_ptr &Record)
       : SYCLMemObjI() {
@@ -26,7 +27,8 @@ class MemObjMock : public sycl::detail::SYCLMemObjI {
 
   MemObjType getType() const override { return MemObjType::Buffer; }
 
-  void *allocateMem(ContextImplPtr, bool, void *, sycl::detail::pi::PiEvent &) {
+  void *allocateMem(ContextImplPtr, DeviceImplPtr, bool, void *,
+                    sycl::detail::pi::PiEvent &) {
     return nullptr;
   }
 
@@ -70,7 +72,8 @@ TEST_F(SchedulerTest, LinkedAllocaDependencies) {
                        std::vector &) {};
 
   std::shared_ptr Record{
-      new sycl::detail::MemObjRecord(DefaultHostQueue->getContextImplPtr(), 10,
+      new sycl::detail::MemObjRecord(DefaultHostQueue->getContextImplPtr(),
+                                     DefaultHostQueue->getDeviceImplPtr(), 10,
                                      AllocaDep)};
 
   MemObjMock MemObj(Record);
diff --git a/sycl/unittests/scheduler/NoHostUnifiedMemory.cpp b/sycl/unittests/scheduler/NoHostUnifiedMemory.cpp
index f555c44cfc5a3..d3b1405c0ab2f 100644
--- a/sycl/unittests/scheduler/NoHostUnifiedMemory.cpp
+++ b/sycl/unittests/scheduler/NoHostUnifiedMemory.cpp
@@ -47,8 +47,8 @@ static pi_result redefinedDeviceGetInfoAfter(pi_device Device,
 }
 
 static pi_result
-redefinedMemBufferCreate(pi_context context, pi_mem_flags flags, size_t size,
-                         void *host_ptr, pi_mem *ret_mem,
+redefinedMemBufferCreate(pi_context context, pi_device, pi_mem_flags flags,
+                         size_t size, void *host_ptr, pi_mem *ret_mem,
                          const pi_mem_properties *properties = nullptr) {
   EXPECT_EQ(flags, PI_MEM_FLAGS_ACCESS_RW);
   return PI_SUCCESS;
@@ -203,7 +203,8 @@ TEST_F(SchedulerTest, NoHostUnifiedMemory) {
   {
     pi_mem MockInteropBuffer = nullptr;
    pi_result PIRes = mock_piMemBufferCreate(
-        /*pi_context=*/0x0, /*pi_mem_flags=*/PI_MEM_FLAGS_ACCESS_RW, /*size=*/1,
+        /*pi_context=*/0x0, /*pi_device=*/0x0,
+        /*pi_mem_flags=*/PI_MEM_FLAGS_ACCESS_RW, /*size=*/1,
         /*host_ptr=*/nullptr, &MockInteropBuffer);
 
     EXPECT_TRUE(PI_SUCCESS == PIRes);
diff --git a/sycl/unittests/scheduler/QueueFlushing.cpp b/sycl/unittests/scheduler/QueueFlushing.cpp
index 269b93b3ee01a..bc5f507616a2e 100644
--- a/sycl/unittests/scheduler/QueueFlushing.cpp
+++ b/sycl/unittests/scheduler/QueueFlushing.cpp
@@ -101,7 +101,7 @@ TEST_F(SchedulerTest, QueueFlushing) {
   detail::Requirement MockReq = getMockRequirement(Buf);
   pi_mem PIBuf = nullptr;
-  pi_result Ret = mock_piMemBufferCreate(/*pi_context=*/0x0,
+  pi_result Ret = mock_piMemBufferCreate(/*pi_context=*/0x0, /*pi_device=*/0x0,
                                          PI_MEM_FLAGS_ACCESS_RW, /*size=*/1,
                                          /*host_ptr=*/nullptr, &PIBuf);
   EXPECT_TRUE(Ret == PI_SUCCESS);
diff --git a/sycl/unittests/stream/stream.cpp b/sycl/unittests/stream/stream.cpp
index 36afde6e06750..009300da9958a 100644
--- a/sycl/unittests/stream/stream.cpp
+++ b/sycl/unittests/stream/stream.cpp
@@ -20,8 +20,8 @@ size_t GBufferCreateCounter = 0;
 
 static pi_result
-redefinedMemBufferCreate(pi_context context, pi_mem_flags flags, size_t size,
-                         void *host_ptr, pi_mem *ret_mem,
+redefinedMemBufferCreate(pi_context context, pi_device, pi_mem_flags flags,
+                         size_t size, void *host_ptr, pi_mem *ret_mem,
                          const pi_mem_properties *properties = nullptr) {
   ++GBufferCreateCounter;
   *ret_mem = nullptr;