diff --git a/sycl/include/sycl/detail/pi.def b/sycl/include/sycl/detail/pi.def index 2e71783720def..68a6fbc10a707 100644 --- a/sycl/include/sycl/detail/pi.def +++ b/sycl/include/sycl/detail/pi.def @@ -135,6 +135,8 @@ _PI_API(piextUSMGetMemAllocInfo) _PI_API(piextEnqueueReadHostPipe) _PI_API(piextEnqueueWriteHostPipe) +_PI_API(piextGetMemoryConnection) + _PI_API(piextKernelSetArgMemObj) _PI_API(piextKernelSetArgSampler) diff --git a/sycl/include/sycl/detail/pi.h b/sycl/include/sycl/detail/pi.h index 01ece111742d8..4d4c36df1fa02 100644 --- a/sycl/include/sycl/detail/pi.h +++ b/sycl/include/sycl/detail/pi.h @@ -1000,6 +1000,16 @@ typedef enum { PI_MEM_CONTEXT = 0x1106, PI_MEM_SIZE = 0x1102 } _pi_mem_info; using pi_mem_info = _pi_mem_info; +// Represent different memory connections between devices, see +// piextGetMemoryConnection. +typedef enum { + PI_MEMORY_CONNECTION_NONE, // copies must go through host + PI_MEMORY_CONNECTION_MIGRATABLE, // copies must be explicit + PI_MEMORY_CONNECTION_UNIFIED // copies are not needed +} _pi_memory_connection; + +using pi_memory_connection = _pi_memory_connection; + // // Following section contains SYCL RT Plugin Interface (PI) functions. // They are 3 distinct categories: @@ -1112,6 +1122,21 @@ __SYCL_EXPORT pi_result piextGetDeviceFunctionPointer( pi_device device, pi_program program, const char *function_name, pi_uint64 *function_pointer_ret); +/// Returns the type of memory connection between the two devices in the two +/// respective contexts. +/// +/// \param dev1 First device. +/// \param ctx1 First context, must contain dev1. +/// \param dev2 Second device. +/// \param ctx2 Second context, must contain dev2. +/// \return res Type of memory connection supported between the pairs (dev1, +/// ctx1) and (dev2, ctx2). +__SYCL_EXPORT pi_result piextGetMemoryConnection(pi_device dev1, + pi_context ctx1, + pi_device dev2, + pi_context ctx2, + _pi_memory_connection *res); + // // Context // @@ -1226,11 +1251,13 @@ __SYCL_EXPORT pi_result piextQueueCreateWithNativeHandle( // // Memory // -__SYCL_EXPORT pi_result piMemBufferCreate( - pi_context context, pi_mem_flags flags, size_t size, void *host_ptr, - pi_mem *ret_mem, const pi_mem_properties *properties = nullptr); +__SYCL_EXPORT pi_result +piMemBufferCreate(pi_context context, pi_device device, pi_mem_flags flags, + size_t size, void *host_ptr, pi_mem *ret_mem, + const pi_mem_properties *properties = nullptr); -__SYCL_EXPORT pi_result piMemImageCreate(pi_context context, pi_mem_flags flags, +__SYCL_EXPORT pi_result piMemImageCreate(pi_context context, pi_device device, + pi_mem_flags flags, const pi_image_format *image_format, const pi_image_desc *image_desc, void *host_ptr, pi_mem *ret_mem); diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index 1c7a6be525beb..2d885ea9ed33e 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -188,14 +188,6 @@ pi_result check_error(CUresult result, const char *function, int line, /// contexts to be restored by SYCL. 
class ScopedContext { public: - ScopedContext(pi_context ctxt) { - if (!ctxt) { - throw PI_ERROR_INVALID_CONTEXT; - } - - set_context(ctxt->get()); - } - ScopedContext(CUcontext ctxt) { set_context(ctxt); } ~ScopedContext() {} @@ -296,7 +288,7 @@ void guessLocalWorkSize(_pi_device *device, size_t *threadsPerBlock, threadsPerBlock[2] = ((global_work_size[2] - 1) / gridDim[2]) + 1; PI_CHECK_ERROR(cuOccupancyMaxPotentialBlockSize( - &minGrid, &maxBlockSize, kernel->get(), NULL, local_size, + &minGrid, &maxBlockSize, kernel->get(device), NULL, local_size, maxThreadsPerBlock[0])); gridDim[0] = maxBlockSize / (threadsPerBlock[1] * threadsPerBlock[2]); @@ -319,7 +311,7 @@ pi_result enqueueEventsWait(pi_queue command_queue, CUstream stream, return PI_SUCCESS; } try { - ScopedContext active(command_queue->get_context()); + ScopedContext active(command_queue->get_native_context()); auto result = forLatestEvents( event_wait_list, num_events_in_wait_list, @@ -520,12 +512,14 @@ CUstream _pi_queue::get_next_transfer_stream() { return res; } -_pi_event::_pi_event(pi_command_type type, pi_context context, pi_queue queue, - CUstream stream, pi_uint32 stream_token) +_pi_event::_pi_event(pi_command_type type, pi_context context, + CUcontext native_context, pi_queue queue, CUstream stream, + pi_uint32 stream_token) : commandType_{type}, refCount_{1}, has_ownership_{true}, hasBeenWaitedOn_{false}, isRecorded_{false}, isStarted_{false}, streamToken_{stream_token}, evEnd_{nullptr}, evStart_{nullptr}, - evQueued_{nullptr}, queue_{queue}, stream_{stream}, context_{context} { + evQueued_{nullptr}, queue_{queue}, stream_{stream}, context_{context}, + native_context_{native_context} { bool profilingEnabled = queue_->properties_ & PI_QUEUE_FLAG_PROFILING_ENABLE; @@ -687,8 +681,10 @@ pi_result enqueueEventWait(pi_queue queue, pi_event event) { } _pi_program::_pi_program(pi_context ctxt) - : module_{nullptr}, binary_{}, binarySizeInBytes_{0}, refCount_{1}, - context_{ctxt}, kernelReqdWorkGroupSizeMD_{} { + : modules_{ctxt->get_devices().size(), nullptr}, + build_results_{ctxt->get_devices().size(), CUDA_ERROR_UNKNOWN}, + binary_{0}, binarySizeInBytes_{0}, refCount_{1}, context_{ctxt}, + kernelReqdWorkGroupSizeMD_{} { cuda_piContextRetain(context_); } @@ -773,17 +769,41 @@ pi_result _pi_program::build_program(const char *build_options) { options[3] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES; optionVals[3] = (void *)(long)MAX_LOG_SIZE; - auto result = PI_CHECK_ERROR( - cuModuleLoadDataEx(&module_, static_cast(binary_), - numberOfOptions, options, optionVals)); - - const auto success = (result == PI_SUCCESS); + auto devs = get_context()->get_devices(); - buildStatus_ = - success ? PI_PROGRAM_BUILD_STATUS_SUCCESS : PI_PROGRAM_BUILD_STATUS_ERROR; + // we count the build as successful if it succeeds on at least one device + bool success = false; + CUresult res1 = CUDA_ERROR_NO_DEVICE; + for (size_t i = 0; i < devs.size(); i++) { + ScopedContext ctx(devs[i]->get_context()); + res1 = cuModuleLoadDataEx(&modules_[i], static_cast(binary_), + numberOfOptions, options, optionVals); + build_results_[i] = res1; + success |= CUDA_SUCCESS == res1; + } // If no exception, result is correct - return success ? 
PI_SUCCESS : PI_ERROR_BUILD_PROGRAM_FAILURE; + if (success) { + buildStatus_ = PI_PROGRAM_BUILD_STATUS_SUCCESS; + return PI_SUCCESS; + } else { + PI_CHECK_ERROR(res1); + buildStatus_ = PI_PROGRAM_BUILD_STATUS_ERROR; + return PI_ERROR_BUILD_PROGRAM_FAILURE; + } +} + +CUfunction _pi_kernel::get(pi_device device) const noexcept { + size_t i = context_->device_index(device); + PI_CHECK_ERROR(program_->build_results_[i]); + return functions_[i]; +} + +CUfunction +_pi_kernel::get_with_offset_parameter(pi_device device) const noexcept { + size_t i = context_->device_index(device); + PI_CHECK_ERROR(program_->build_results_[i]); + return functionsWithOffsetParam_[i]; } /// Finds kernel names by searching for entry points in the PTX source, as the @@ -810,16 +830,13 @@ pi_result cuda_piDeviceGetInfo(pi_device device, pi_device_info param_name, /// Triggers the CUDA Driver initialization (cuInit) the first time, so this /// must be the first PI API called. /// -/// However because multiple devices in a context is not currently supported, -/// place each device in a separate platform. -/// pi_result cuda_piPlatformsGet(pi_uint32 num_entries, pi_platform *platforms, pi_uint32 *num_platforms) { try { static std::once_flag initFlag; static pi_uint32 numPlatforms = 1; - static std::vector<_pi_platform> platformIds; + static _pi_platform platformId; if (num_entries == 0 && platforms != nullptr) { return PI_ERROR_INVALID_VALUE; @@ -844,13 +861,11 @@ pi_result cuda_piPlatformsGet(pi_uint32 num_entries, pi_platform *platforms, return; } try { - // make one platform per device - numPlatforms = numDevices; - platformIds.resize(numDevices); - + platformId.devices_.reserve(numDevices); for (int i = 0; i < numDevices; ++i) { CUdevice device; err = PI_CHECK_ERROR(cuDeviceGet(&device, i)); + CUcontext context; err = PI_CHECK_ERROR(cuDevicePrimaryCtxRetain(&context, device)); @@ -861,11 +876,11 @@ pi_result cuda_piPlatformsGet(pi_uint32 num_entries, pi_platform *platforms, // Use default stream to record base event counter err = PI_CHECK_ERROR(cuEventRecord(evBase, 0)); - platformIds[i].devices_.emplace_back( - new _pi_device{device, context, evBase, &platformIds[i]}); + platformId.devices_.emplace_back( + new _pi_device{device, context, evBase, &platformId}); { - const auto &dev = platformIds[i].devices_.back().get(); + const auto &dev = platformId.devices_.back().get(); size_t maxWorkGroupSize = 0u; size_t maxThreadsPerBlock[3] = {}; pi_result retError = cuda_piDeviceGetInfo( @@ -886,17 +901,11 @@ pi_result cuda_piPlatformsGet(pi_uint32 num_entries, pi_platform *platforms, } } catch (const std::bad_alloc &) { // Signal out-of-memory situation - for (int i = 0; i < numDevices; ++i) { - platformIds[i].devices_.clear(); - } - platformIds.clear(); + platformId.devices_.clear(); err = PI_ERROR_OUT_OF_HOST_MEMORY; } catch (...) 
{ // Clear and rethrow to allow retry - for (int i = 0; i < numDevices; ++i) { - platformIds[i].devices_.clear(); - } - platformIds.clear(); + platformId.devices_.clear(); throw; } }, @@ -907,9 +916,7 @@ pi_result cuda_piPlatformsGet(pi_uint32 num_entries, pi_platform *platforms, } if (platforms != nullptr) { - for (unsigned i = 0; i < std::min(num_entries, numPlatforms); ++i) { - platforms[i] = &platformIds[i]; - } + *platforms = &platformId; } return err; @@ -998,8 +1005,9 @@ pi_result cuda_piContextGetInfo(pi_context context, pi_context_info param_name, case PI_CONTEXT_INFO_NUM_DEVICES: return getInfo(param_value_size, param_value, param_value_size_ret, 1); case PI_CONTEXT_INFO_DEVICES: - return getInfo(param_value_size, param_value, param_value_size_ret, - context->get_device()); + return getInfoArray(context->get_devices().size(), param_value_size, + param_value, param_value_size_ret, + &context->get_devices()[0]); case PI_CONTEXT_INFO_REFERENCE_COUNT: return getInfo(param_value_size, param_value, param_value_size_ret, context->get_reference_count()); @@ -1083,12 +1091,12 @@ pi_result cuda_piextGetDeviceFunctionPointer(pi_device device, pi_program program, const char *func_name, pi_uint64 *func_pointer_ret) { - // Check if device passed is the same the device bound to the context - assert(device == program->get_context()->get_device()); assert(func_pointer_ret != nullptr); + CUmodule module = + program->get()[program->get_context()->device_index(device)]; CUfunction func; - CUresult ret = cuModuleGetFunction(&func, program->get(), func_name); + CUresult ret = cuModuleGetFunction(&func, module, func_name); *func_pointer_ret = reinterpret_cast(func); pi_result retError = PI_SUCCESS; @@ -2143,19 +2151,18 @@ pi_result cuda_piContextCreate(const pi_context_properties *properties, const void *private_info, size_t cb, void *user_data), void *user_data, pi_context *retcontext) { - assert(devices != nullptr); // TODO: How to implement context callback? assert(pfn_notify == nullptr); assert(user_data == nullptr); - assert(num_devices == 1); // Need input context assert(retcontext != nullptr); pi_result errcode_ret = PI_SUCCESS; std::unique_ptr<_pi_context> piContextPtr{nullptr}; try { - piContextPtr = std::unique_ptr<_pi_context>(new _pi_context{*devices}); + piContextPtr = std::unique_ptr<_pi_context>(new _pi_context( + std::vector(devices, devices + num_devices))); *retcontext = piContextPtr.release(); } catch (pi_result err) { errcode_ret = err; @@ -2186,7 +2193,13 @@ pi_result cuda_piContextRelease(pi_context ctxt) { /// \return PI_SUCCESS pi_result cuda_piextContextGetNativeHandle(pi_context context, pi_native_handle *nativeHandle) { - *nativeHandle = reinterpret_cast(context->get()); + // Currently only support context interop with one device + if (context->get_devices().size() != 1) { + return PI_ERROR_INVALID_CONTEXT; + } + + *nativeHandle = reinterpret_cast( + context->get_devices()[0]->get_context()); return PI_SUCCESS; } @@ -2217,8 +2230,9 @@ pi_result cuda_piextContextCreateWithNativeHandle(pi_native_handle nativeHandle, /// Can trigger a manual copy depending on the mode. 
/// \TODO Implement USE_HOST_PTR using cuHostRegister /// -pi_result cuda_piMemBufferCreate(pi_context context, pi_mem_flags flags, - size_t size, void *host_ptr, pi_mem *ret_mem, +pi_result cuda_piMemBufferCreate(pi_context context, pi_device device, + pi_mem_flags flags, size_t size, + void *host_ptr, pi_mem *ret_mem, const pi_mem_properties *properties) { // Need input memory object assert(ret_mem != nullptr); @@ -2235,7 +2249,7 @@ pi_result cuda_piMemBufferCreate(pi_context context, pi_mem_flags flags, pi_mem retMemObj = nullptr; try { - ScopedContext active(context); + ScopedContext active(context->get(device)); CUdeviceptr ptr; _pi_mem::mem_::buffer_mem_::alloc_mode allocMode = _pi_mem::mem_::buffer_mem_::alloc_mode::classic; @@ -2259,8 +2273,8 @@ pi_result cuda_piMemBufferCreate(pi_context context, pi_mem_flags flags, if (retErr == PI_SUCCESS) { pi_mem parentBuffer = nullptr; - auto piMemObj = std::unique_ptr<_pi_mem>( - new _pi_mem{context, parentBuffer, allocMode, ptr, host_ptr, size}); + auto piMemObj = std::unique_ptr<_pi_mem>(new _pi_mem{ + context, device, parentBuffer, allocMode, ptr, host_ptr, size}); if (piMemObj != nullptr) { retMemObj = piMemObj.release(); if (performInitialCopy) { @@ -2312,7 +2326,7 @@ pi_result cuda_piMemRelease(pi_mem memObj) { return PI_SUCCESS; } - ScopedContext active(uniqueMemObj->get_context()); + ScopedContext active(uniqueMemObj->get_native_context()); if (memObj->mem_type_ == _pi_mem::mem_type::buffer) { switch (uniqueMemObj->mem_.buffer_mem_.allocMode_) { @@ -2385,7 +2399,6 @@ pi_result cuda_piMemBufferPartition(pi_mem parent_buffer, pi_mem_flags flags, parent_buffer->mem_.buffer_mem_.get_size()) && "PI_ERROR_INVALID_BUFFER_SIZE"); // Retained indirectly due to retaining parent buffer below. - pi_context context = parent_buffer->context_; _pi_mem::mem_::buffer_mem_::alloc_mode allocMode = _pi_mem::mem_::buffer_mem_::alloc_mode::classic; @@ -2402,8 +2415,9 @@ pi_result cuda_piMemBufferPartition(pi_mem parent_buffer, pi_mem_flags flags, std::unique_ptr<_pi_mem> retMemObj{nullptr}; try { - retMemObj = std::unique_ptr<_pi_mem>{new _pi_mem{ - context, parent_buffer, allocMode, ptr, hostPtr, bufferRegion.size}}; + retMemObj = std::unique_ptr<_pi_mem>{ + new _pi_mem{parent_buffer->get_context(), parent_buffer->get_device(), + parent_buffer, allocMode, ptr, hostPtr, bufferRegion.size}}; } catch (pi_result err) { *memObj = nullptr; return err; @@ -2463,7 +2477,9 @@ pi_result cuda_piQueueCreate(pi_context context, pi_device device, try { std::unique_ptr<_pi_queue> queueImpl{nullptr}; - if (context->get_device() != device) { + CUcontext native_context = context->get(device); + + if (native_context == nullptr) { *queue = nullptr; return PI_ERROR_INVALID_DEVICE; } @@ -2584,7 +2600,7 @@ pi_result cuda_piQueueRelease(pi_queue command_queue) { if (!command_queue->backend_has_ownership()) return PI_SUCCESS; - ScopedContext active(command_queue->get_context()); + ScopedContext active(command_queue->get_native_context()); command_queue->for_each_stream([](CUstream s) { PI_CHECK_ERROR(cuStreamSynchronize(s)); @@ -2606,8 +2622,8 @@ pi_result cuda_piQueueFinish(pi_queue command_queue) { assert(command_queue != nullptr); // need PI_ERROR_INVALID_EXTERNAL_HANDLE error code - ScopedContext active(command_queue->get_context()); + ScopedContext active(command_queue->get_native_context()); command_queue->sync_streams([&result](CUstream s) { result = PI_CHECK_ERROR(cuStreamSynchronize(s)); }); @@ -2640,7 +2656,7 @@ pi_result cuda_piQueueFlush(pi_queue command_queue) { /// 
\return PI_SUCCESS pi_result cuda_piextQueueGetNativeHandle(pi_queue queue, pi_native_handle *nativeHandle) { - ScopedContext active(queue->get_context()); + ScopedContext active(queue->get_native_context()); *nativeHandle = reinterpret_cast(queue->get_next_compute_stream()); return PI_SUCCESS; @@ -2661,7 +2677,6 @@ pi_result cuda_piextQueueCreateWithNativeHandle(pi_native_handle nativeHandle, pi_device device, bool ownNativeHandle, pi_queue *queue) { - (void)device; (void)ownNativeHandle; assert(ownNativeHandle == false); @@ -2686,7 +2701,7 @@ pi_result cuda_piextQueueCreateWithNativeHandle(pi_native_handle nativeHandle, *queue = new _pi_queue{std::move(computeCuStreams), std::move(transferCuStreams), context, - context->get_device(), + device, properties, flags, /*backend_owns*/ false}; @@ -2709,7 +2724,7 @@ pi_result cuda_piEnqueueMemBufferWrite(pi_queue command_queue, pi_mem buffer, std::unique_ptr<_pi_event> retImplEv{nullptr}; try { - ScopedContext active(command_queue->get_context()); + ScopedContext active(command_queue->get_native_context()); CUstream cuStream = command_queue->get_next_transfer_stream(); retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, @@ -2755,7 +2770,7 @@ pi_result cuda_piEnqueueMemBufferRead(pi_queue command_queue, pi_mem buffer, std::unique_ptr<_pi_event> retImplEv{nullptr}; try { - ScopedContext active(command_queue->get_context()); + ScopedContext active(command_queue->get_native_context()); CUstream cuStream = command_queue->get_next_transfer_stream(); retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, @@ -2801,7 +2816,8 @@ pi_result cuda_piEventsWait(pi_uint32 num_events, const pi_event *event_list) { return PI_ERROR_INVALID_EVENT; } - auto context = event_list[0]->get_context(); + // all events must be in the same context + auto context = event_list[0]->get_native_context(); ScopedContext active(context); auto waitFunc = [context](pi_event event) -> pi_result { @@ -2809,7 +2825,7 @@ pi_result cuda_piEventsWait(pi_uint32 num_events, const pi_event *event_list) { return PI_ERROR_INVALID_EVENT; } - if (event->get_context() != context) { + if (event->get_native_context() != context) { return PI_ERROR_INVALID_CONTEXT; } @@ -2832,26 +2848,40 @@ pi_result cuda_piKernelCreate(pi_program program, const char *kernel_name, std::unique_ptr<_pi_kernel> retKernel{nullptr}; try { - ScopedContext active(program->get_context()); + const auto &modules = program->get(); + std::vector cuFuncs(modules.size()); + std::vector cuFuncsWithOffsetParam(modules.size()); + + for (size_t i = 0; i < modules.size(); i++) { + CUmodule module = modules[i]; + ScopedContext active( + program->get_context()->get_devices()[i]->get_context()); + + if (program->build_results_[i] != CUDA_SUCCESS) { + cuFuncs[i] = nullptr; + cuFuncsWithOffsetParam[i] = nullptr; + continue; + } - CUfunction cuFunc; - retErr = PI_CHECK_ERROR( - cuModuleGetFunction(&cuFunc, program->get(), kernel_name)); + retErr = + PI_CHECK_ERROR(cuModuleGetFunction(&cuFuncs[i], module, kernel_name)); - std::string kernel_name_woffset = std::string(kernel_name) + "_with_offset"; - CUfunction cuFuncWithOffsetParam; - CUresult offsetRes = cuModuleGetFunction( - &cuFuncWithOffsetParam, program->get(), kernel_name_woffset.c_str()); + std::string kernel_name_woffset = + std::string(kernel_name) + "_with_offset"; + CUresult offsetRes = cuModuleGetFunction( + &cuFuncsWithOffsetParam[i], module, kernel_name_woffset.c_str()); - // If there is no kernel with global offset parameter we 
mark it as missing - if (offsetRes == CUDA_ERROR_NOT_FOUND) { - cuFuncWithOffsetParam = nullptr; - } else { - retErr = PI_CHECK_ERROR(offsetRes); + // If there is no kernel with global offset parameter we mark it as + // missing + if (offsetRes == CUDA_ERROR_NOT_FOUND) { + cuFuncsWithOffsetParam[i] = nullptr; + } else { + retErr = PI_CHECK_ERROR(offsetRes); + } } retKernel = std::unique_ptr<_pi_kernel>( - new _pi_kernel{cuFunc, cuFuncWithOffsetParam, kernel_name, program, + new _pi_kernel{cuFuncs, cuFuncsWithOffsetParam, kernel_name, program, program->get_context()}); } catch (pi_result err) { retErr = err; @@ -2945,7 +2975,7 @@ pi_result cuda_piKernelGetGroupInfo(pi_kernel kernel, pi_device device, sycl::detail::pi::assertion( cuFuncGetAttribute(&max_threads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, - kernel->get()) == CUDA_SUCCESS); + kernel->get(device)) == CUDA_SUCCESS); return getInfo(param_value_size, param_value, param_value_size_ret, size_t(max_threads)); } @@ -2968,7 +2998,7 @@ pi_result cuda_piKernelGetGroupInfo(pi_kernel kernel, pi_device device, int bytes = 0; sycl::detail::pi::assertion( cuFuncGetAttribute(&bytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, - kernel->get()) == CUDA_SUCCESS); + kernel->get(device)) == CUDA_SUCCESS); return getInfo(param_value_size, param_value, param_value_size_ret, pi_uint64(bytes)); } @@ -2986,7 +3016,7 @@ pi_result cuda_piKernelGetGroupInfo(pi_kernel kernel, pi_device device, int bytes = 0; sycl::detail::pi::assertion( cuFuncGetAttribute(&bytes, CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, - kernel->get()) == CUDA_SUCCESS); + kernel->get(device)) == CUDA_SUCCESS); return getInfo(param_value_size, param_value, param_value_size_ret, pi_uint64(bytes)); } @@ -2994,7 +3024,7 @@ pi_result cuda_piKernelGetGroupInfo(pi_kernel kernel, pi_device device, int numRegs = 0; sycl::detail::pi::assertion( cuFuncGetAttribute(&numRegs, CU_FUNC_ATTRIBUTE_NUM_REGS, - kernel->get()) == CUDA_SUCCESS); + kernel->get(device)) == CUDA_SUCCESS); return getInfo(param_value_size, param_value, param_value_size_ret, pi_uint32(numRegs)); } @@ -3036,9 +3066,17 @@ pi_result cuda_piEnqueueKernelLaunch( try { // Set the active context here as guessLocalWorkSize needs an active context - ScopedContext active(command_queue->get_context()); + ScopedContext active(command_queue->get_native_context()); { - size_t *reqdThreadsPerBlock = kernel->reqdThreadsPerBlock_; + int dev_idx = 0; + auto devices = command_queue->context_->get_devices(); + for (size_t i = 0; i < devices.size(); i++) { + if (devices[i] == command_queue->device_) { + dev_idx = i; + break; + } + } + size_t *reqdThreadsPerBlock = &kernel->reqdThreadsPerBlock_[dev_idx][0]; maxWorkGroupSize = command_queue->device_->get_max_work_group_size(); command_queue->device_->get_max_work_item_sizes( sizeof(maxThreadsPerBlock), maxThreadsPerBlock); @@ -3068,7 +3106,7 @@ pi_result cuda_piEnqueueKernelLaunch( return err; } } else { - guessLocalWorkSize(command_queue->device_, threadsPerBlock, + guessLocalWorkSize(command_queue->get_device(), threadsPerBlock, global_work_size, maxThreadsPerBlock, kernel, local_size); } @@ -3092,20 +3130,20 @@ pi_result cuda_piEnqueueKernelLaunch( _pi_stream_guard guard; CUstream cuStream = command_queue->get_next_compute_stream( num_events_in_wait_list, event_wait_list, guard, &stream_token); - CUfunction cuFunc = kernel->get(); + CUfunction cuFunc = kernel->get(command_queue->device_); retError = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, event_wait_list); // Set the implicit global 
offset parameter if kernel has offset variant - if (kernel->get_with_offset_parameter()) { + if (kernel->get_with_offset_parameter(command_queue->device_)) { std::uint32_t cuda_implicit_offset[3] = {0, 0, 0}; if (global_work_offset) { for (size_t i = 0; i < work_dim; i++) { cuda_implicit_offset[i] = static_cast(global_work_offset[i]); if (global_work_offset[i] != 0) { - cuFunc = kernel->get_with_offset_parameter(); + cuFunc = kernel->get_with_offset_parameter(command_queue->device_); } } } @@ -3148,6 +3186,7 @@ pi_result cuda_piEnqueueKernelLaunch( cuFunc, blocksPerGrid[0], blocksPerGrid[1], blocksPerGrid[2], threadsPerBlock[0], threadsPerBlock[1], threadsPerBlock[2], local_size, cuStream, const_cast(argIndices.data()), nullptr)); + if (local_size != 0) kernel->clear_local_size(); @@ -3177,7 +3216,8 @@ pi_result cuda_piextKernelCreateWithNativeHandle(pi_native_handle, pi_context, } /// \TODO Not implemented -pi_result cuda_piMemImageCreate(pi_context context, pi_mem_flags flags, +pi_result cuda_piMemImageCreate(pi_context context, pi_device device, + pi_mem_flags flags, const pi_image_format *image_format, const pi_image_desc *image_desc, void *host_ptr, pi_mem *ret_mem) { @@ -3262,7 +3302,7 @@ pi_result cuda_piMemImageCreate(pi_context context, pi_mem_flags flags, size_t image_size_bytes = pixel_size_bytes * image_desc->image_width * image_desc->image_height * image_desc->image_depth; - ScopedContext active(context); + ScopedContext active(context->get(device)); CUarray image_array; retErr = PI_CHECK_ERROR(cuArray3DCreate(&image_array, &array_desc)); @@ -3313,8 +3353,9 @@ pi_result cuda_piMemImageCreate(pi_context context, pi_mem_flags flags, CUsurfObject surface; retErr = PI_CHECK_ERROR(cuSurfObjectCreate(&surface, &image_res_desc)); - auto piMemObj = std::unique_ptr<_pi_mem>(new _pi_mem{ - context, image_array, surface, image_desc->image_type, host_ptr}); + auto piMemObj = std::unique_ptr<_pi_mem>( + new _pi_mem{context, device, image_array, surface, + image_desc->image_type, host_ptr}); if (piMemObj == nullptr) { return PI_ERROR_OUT_OF_HOST_MEMORY; @@ -3373,10 +3414,7 @@ pi_result cuda_piProgramBuild(pi_program program, pi_uint32 num_devices, pi_result retError = PI_SUCCESS; try { - ScopedContext active(program->get_context()); - program->build_program(options); - } catch (pi_result err) { retError = err; } @@ -3393,8 +3431,6 @@ pi_result cuda_piProgramCreate(pi_context, const void *, size_t, pi_program *) { /// Note: No calls to CUDA driver API in this function, only store binaries /// for later. 
/// -/// Note: Only supports one device -/// pi_result cuda_piProgramCreateWithBinary( pi_context context, pi_uint32 num_devices, const pi_device *device_list, const size_t *lengths, const unsigned char **binaries, @@ -3407,10 +3443,18 @@ pi_result cuda_piProgramCreateWithBinary( assert(binaries != nullptr); assert(program != nullptr); assert(device_list != nullptr); - assert(num_devices == 1 && "CUDA contexts are for a single device"); - assert((context->get_device()->get() == device_list[0]->get()) && - "Mismatch between devices context and passed context when creating " - "program from binary"); + for (size_t i = 0; i < num_devices; i++) { + bool found_device = false; + for (pi_device context_device : context->get_devices()) { + if (device_list[i] == context_device) { + found_device = true; + break; + } + } + assert(found_device && + "Mismatch between device's context and passed context when creating " + "program from binary"); + } pi_result retError = PI_SUCCESS; @@ -3447,8 +3491,9 @@ pi_result cuda_piProgramGetInfo(pi_program program, pi_program_info param_name, case PI_PROGRAM_INFO_NUM_DEVICES: return getInfo(param_value_size, param_value, param_value_size_ret, 1u); case PI_PROGRAM_INFO_DEVICES: - return getInfoArray(1, param_value_size, param_value, param_value_size_ret, - &program->context_->deviceId_); + return getInfoArray(program->context_->get_devices().size(), + param_value_size, param_value, param_value_size_ret, + &program->context_->get_devices()[0]); case PI_PROGRAM_INFO_SOURCE: return getInfo(param_value_size, param_value, param_value_size_ret, program->binary_); @@ -3482,14 +3527,15 @@ pi_result cuda_piProgramLink(pi_context context, pi_uint32 num_devices, void *user_data, pi_program *ret_program) { assert(ret_program != nullptr); - assert(num_devices == 1 || num_devices == 0); assert(device_list != nullptr || num_devices == 0); assert(pfn_notify == nullptr); assert(user_data == nullptr); pi_result retError = PI_SUCCESS; try { - ScopedContext active(context); + // We need a context for the linking operations but the result can later be + // used in any other context, so just use the first one. + ScopedContext active(context->get_devices()[0]->get_context()); CUlinkState state; std::unique_ptr<_pi_program> retProgram{new _pi_program{context}}; @@ -3547,7 +3593,6 @@ pi_result cuda_piProgramCompile( (void)input_headers; assert(program != nullptr); - assert(num_devices == 1 || num_devices == 0); assert(device_list != nullptr || num_devices == 0); assert(pfn_notify == nullptr); assert(user_data == nullptr); @@ -3555,10 +3600,7 @@ pi_result cuda_piProgramCompile( pi_result retError = PI_SUCCESS; try { - ScopedContext active(program->get_context()); - program->build_program(options); - } catch (pi_result err) { retError = err; } @@ -3618,9 +3660,14 @@ pi_result cuda_piProgramRelease(pi_program program) { pi_result result = PI_ERROR_INVALID_PROGRAM; try { - ScopedContext active(program->get_context()); - auto cuModule = program->get(); - result = PI_CHECK_ERROR(cuModuleUnload(cuModule)); + const auto &modules = program->get(); + for (size_t i = 0; i < modules.size(); i++) { + if (program->build_results_[i] == CUDA_SUCCESS) { + ScopedContext active( + program->get_context()->get_devices()[i]->get_context()); + result = PI_CHECK_ERROR(cuModuleUnload(modules[i])); + } + } } catch (...) 
{ result = PI_ERROR_OUT_OF_RESOURCES; } @@ -3639,7 +3686,7 @@ pi_result cuda_piProgramRelease(pi_program program) { /// \return TBD pi_result cuda_piextProgramGetNativeHandle(pi_program program, pi_native_handle *nativeHandle) { - *nativeHandle = reinterpret_cast(program->get()); + *nativeHandle = reinterpret_cast(program->get()[0]); return PI_SUCCESS; } @@ -3720,7 +3767,7 @@ pi_result cuda_piKernelGetSubGroupInfo( sycl::detail::pi::assertion( cuFuncGetAttribute(&max_threads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, - kernel->get()) == CUDA_SUCCESS); + kernel->get(device)) == CUDA_SUCCESS); int warpSize = 0; cuda_piKernelGetSubGroupInfo(kernel, device, PI_KERNEL_MAX_SUB_GROUP_SIZE, 0, nullptr, sizeof(uint32_t), &warpSize, @@ -3902,7 +3949,8 @@ pi_result cuda_piEventRelease(pi_event event) { std::unique_ptr<_pi_event> event_ptr{event}; pi_result result = PI_ERROR_INVALID_EVENT; try { - ScopedContext active(event->get_context()); + ScopedContext active( + event->get_context()->get_devices()[0]->get_context()); result = event->release(); } catch (...) { result = PI_ERROR_OUT_OF_RESOURCES; @@ -3951,7 +3999,7 @@ pi_result cuda_piEnqueueEventsWaitWithBarrier(pi_queue command_queue, pi_result result; try { - ScopedContext active(command_queue->get_context()); + ScopedContext active(command_queue->get_native_context()); pi_uint32 stream_token; _pi_stream_guard guard; CUstream cuStream = command_queue->get_next_compute_stream( @@ -4272,7 +4320,7 @@ pi_result cuda_piEnqueueMemBufferReadRect( std::unique_ptr<_pi_event> retImplEv{nullptr}; try { - ScopedContext active(command_queue->get_context()); + ScopedContext active(command_queue->get_native_context()); CUstream cuStream = command_queue->get_next_transfer_stream(); retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, @@ -4323,7 +4371,7 @@ pi_result cuda_piEnqueueMemBufferWriteRect( std::unique_ptr<_pi_event> retImplEv{nullptr}; try { - ScopedContext active(command_queue->get_context()); + ScopedContext active(command_queue->get_native_context()); CUstream cuStream = command_queue->get_next_transfer_stream(); retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, event_wait_list); @@ -4370,7 +4418,7 @@ pi_result cuda_piEnqueueMemBufferCopy(pi_queue command_queue, pi_mem src_buffer, std::unique_ptr<_pi_event> retImplEv{nullptr}; try { - ScopedContext active(command_queue->get_context()); + ScopedContext active(command_queue->get_native_context()); pi_result result; auto stream = command_queue->get_next_transfer_stream(); @@ -4419,7 +4467,7 @@ pi_result cuda_piEnqueueMemBufferCopyRect( std::unique_ptr<_pi_event> retImplEv{nullptr}; try { - ScopedContext active(command_queue->get_context()); + ScopedContext active(command_queue->get_native_context()); CUstream cuStream = command_queue->get_next_transfer_stream(); retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, event_wait_list); @@ -4472,7 +4520,7 @@ pi_result cuda_piEnqueueMemBufferFill(pi_queue command_queue, pi_mem buffer, std::unique_ptr<_pi_event> retImplEv{nullptr}; try { - ScopedContext active(command_queue->get_context()); + ScopedContext active(command_queue->get_native_context()); auto stream = command_queue->get_next_transfer_stream(); pi_result result; @@ -4648,7 +4696,7 @@ pi_result cuda_piEnqueueMemImageRead( pi_result retErr = PI_SUCCESS; try { - ScopedContext active(command_queue->get_context()); + ScopedContext active(command_queue->get_native_context()); CUstream cuStream = 
command_queue->get_next_transfer_stream(); retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, event_wait_list); @@ -4718,7 +4766,7 @@ cuda_piEnqueueMemImageWrite(pi_queue command_queue, pi_mem image, pi_result retErr = PI_SUCCESS; try { - ScopedContext active(command_queue->get_context()); + ScopedContext active(command_queue->get_native_context()); CUstream cuStream = command_queue->get_next_transfer_stream(); retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, event_wait_list); @@ -4780,7 +4828,7 @@ pi_result cuda_piEnqueueMemImageCopy(pi_queue command_queue, pi_mem src_image, pi_result retErr = PI_SUCCESS; try { - ScopedContext active(command_queue->get_context()); + ScopedContext active(command_queue->get_native_context()); CUstream cuStream = command_queue->get_next_transfer_stream(); retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, event_wait_list); @@ -4884,7 +4932,7 @@ pi_result cuda_piEnqueueMemBufferMap(pi_queue command_queue, pi_mem buffer, command_queue, buffer, blocking_map, offset, size, hostPtr, num_events_in_wait_list, event_wait_list, event); } else { - ScopedContext active(command_queue->get_context()); + ScopedContext active(command_queue->get_native_context()); if (is_pinned) { ret_err = cuda_piEnqueueEventsWait(command_queue, num_events_in_wait_list, @@ -4939,7 +4987,7 @@ pi_result cuda_piEnqueueMemUnmap(pi_queue command_queue, pi_mem memobj, memobj->mem_.buffer_mem_.get_size(), mapped_ptr, num_events_in_wait_list, event_wait_list, event); } else { - ScopedContext active(command_queue->get_context()); + ScopedContext active(command_queue->get_native_context()); if (is_pinned) { ret_err = cuda_piEnqueueEventsWait(command_queue, num_events_in_wait_list, @@ -4973,7 +5021,10 @@ pi_result cuda_piextUSMHostAlloc(void **result_ptr, pi_context context, assert(properties == nullptr || *properties == 0); pi_result result = PI_SUCCESS; try { - ScopedContext active(context); + // cuMemAllocHost requires an active context but the allocation is then + // available on all the USM compatible contexts and devices so we can + // simply use the first one. 
+ ScopedContext active(context->get_devices()[0]->get_context()); result = PI_CHECK_ERROR(cuMemAllocHost(result_ptr, size)); } catch (pi_result error) { result = error; @@ -4997,7 +5048,7 @@ pi_result cuda_piextUSMDeviceAlloc(void **result_ptr, pi_context context, assert(properties == nullptr || *properties == 0); pi_result result = PI_SUCCESS; try { - ScopedContext active(context); + ScopedContext active(context->get(device)); result = PI_CHECK_ERROR(cuMemAlloc((CUdeviceptr *)result_ptr, size)); } catch (pi_result error) { result = error; @@ -5021,7 +5072,7 @@ pi_result cuda_piextUSMSharedAlloc(void **result_ptr, pi_context context, assert(properties == nullptr || *properties == 0); pi_result result = PI_SUCCESS; try { - ScopedContext active(context); + ScopedContext active(context->get(device)); result = PI_CHECK_ERROR(cuMemAllocManaged((CUdeviceptr *)result_ptr, size, CU_MEM_ATTACH_GLOBAL)); } catch (pi_result error) { @@ -5040,7 +5091,7 @@ pi_result cuda_piextUSMFree(pi_context context, void *ptr) { assert(context != nullptr); pi_result result = PI_SUCCESS; try { - ScopedContext active(context); + ScopedContext active(context->get_devices()[0]->get_context()); bool is_managed; unsigned int type; void *attribute_values[2] = {&is_managed, &type}; @@ -5048,6 +5099,7 @@ pi_result cuda_piextUSMFree(pi_context context, void *ptr) { CU_POINTER_ATTRIBUTE_MEMORY_TYPE}; result = PI_CHECK_ERROR(cuPointerGetAttributes( 2, attributes, attribute_values, (CUdeviceptr)ptr)); + assert(type == CU_MEMORYTYPE_DEVICE || type == CU_MEMORYTYPE_HOST); if (is_managed || type == CU_MEMORYTYPE_DEVICE) { // Memory allocated with cuMemAlloc and cuMemAllocManaged must be freed @@ -5074,7 +5126,7 @@ pi_result cuda_piextUSMEnqueueMemset(pi_queue queue, void *ptr, pi_int32 value, std::unique_ptr<_pi_event> event_ptr{nullptr}; try { - ScopedContext active(queue->get_context()); + ScopedContext active(queue->get_native_context()); pi_uint32 stream_token; _pi_stream_guard guard; CUstream cuStream = queue->get_next_compute_stream( @@ -5112,7 +5164,7 @@ pi_result cuda_piextUSMEnqueueMemcpy(pi_queue queue, pi_bool blocking, std::unique_ptr<_pi_event> event_ptr{nullptr}; try { - ScopedContext active(queue->get_context()); + ScopedContext active(queue->get_native_context()); CUstream cuStream = queue->get_next_transfer_stream(); result = enqueueEventsWait(queue, cuStream, num_events_in_waitlist, events_waitlist); @@ -5144,13 +5196,17 @@ pi_result cuda_piextUSMEnqueuePrefetch(pi_queue queue, const void *ptr, pi_uint32 num_events_in_waitlist, const pi_event *events_waitlist, pi_event *event) { - pi_device device = queue->get_context()->get_device(); + pi_device device = queue->get_device(); // Certain cuda devices and Windows do not have support for some Unified // Memory features. cuMemPrefetchAsync requires concurrent memory access // for managed memory. Therfore, ignore prefetch hint if concurrent managed // memory access is not available. 
- if (!getAttribute(device, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) { + int isConcurrentManagedAccessAvailable = 0; + cuDeviceGetAttribute(&isConcurrentManagedAccessAvailable, + CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, + device->get()); + if (!isConcurrentManagedAccessAvailable) { setErrorMessage("Prefetch hint ignored as device does not support " "concurrent managed access", PI_SUCCESS); @@ -5175,7 +5231,7 @@ pi_result cuda_piextUSMEnqueuePrefetch(pi_queue queue, const void *ptr, std::unique_ptr<_pi_event> event_ptr{nullptr}; try { - ScopedContext active(queue->get_context()); + ScopedContext active(queue->get_native_context()); CUstream cuStream = queue->get_next_transfer_stream(); result = enqueueEventsWait(queue, cuStream, num_events_in_waitlist, events_waitlist); @@ -5214,7 +5270,7 @@ pi_result cuda_piextUSMEnqueueMemAdvise(pi_queue queue, const void *ptr, advice == PI_MEM_ADVICE_CUDA_SET_ACCESSED_BY || advice == PI_MEM_ADVICE_CUDA_UNSET_ACCESSED_BY || advice == PI_MEM_ADVICE_RESET) { - pi_device device = queue->get_context()->get_device(); + pi_device device = queue->get_device(); if (!getAttribute(device, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) { setErrorMessage("Mem advise ignored as device does not support " "concurrent managed access", @@ -5241,7 +5297,7 @@ pi_result cuda_piextUSMEnqueueMemAdvise(pi_queue queue, const void *ptr, std::unique_ptr<_pi_event> event_ptr{nullptr}; try { - ScopedContext active(queue->get_context()); + ScopedContext active(queue->get_native_context()); if (event) { event_ptr = std::unique_ptr<_pi_event>(_pi_event::make_native( @@ -5259,7 +5315,7 @@ pi_result cuda_piextUSMEnqueueMemAdvise(pi_queue queue, const void *ptr, result = PI_CHECK_ERROR(cuMemAdvise( (CUdeviceptr)ptr, length, (CUmem_advise)(advice - PI_MEM_ADVICE_CUDA_SET_READ_MOSTLY + 1), - queue->get_context()->get_device()->get())); + queue->device_->get())); break; case PI_MEM_ADVICE_CUDA_SET_PREFERRED_LOCATION_HOST: case PI_MEM_ADVICE_CUDA_UNSET_PREFERRED_LOCATION_HOST: @@ -5275,13 +5331,13 @@ pi_result cuda_piextUSMEnqueueMemAdvise(pi_queue queue, const void *ptr, case PI_MEM_ADVICE_RESET: PI_CHECK_ERROR(cuMemAdvise((CUdeviceptr)ptr, length, CU_MEM_ADVISE_UNSET_READ_MOSTLY, - queue->get_context()->get_device()->get())); + queue->get_device()->get())); PI_CHECK_ERROR(cuMemAdvise((CUdeviceptr)ptr, length, CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION, - queue->get_context()->get_device()->get())); + queue->get_device()->get())); PI_CHECK_ERROR(cuMemAdvise((CUdeviceptr)ptr, length, CU_MEM_ADVISE_UNSET_ACCESSED_BY, - queue->get_context()->get_device()->get())); + queue->get_device()->get())); break; default: sycl::detail::pi::die("Unknown advice"); @@ -5343,7 +5399,7 @@ pi_result cuda_piextUSMEnqueueMemcpy2D(pi_queue queue, pi_bool blocking, pi_result result = PI_SUCCESS; try { - ScopedContext active(queue->get_context()); + ScopedContext active(queue->get_native_context()); CUstream cuStream = queue->get_next_transfer_stream(); result = enqueueEventsWait(queue, cuStream, num_events_in_wait_list, event_wait_list); @@ -5407,7 +5463,7 @@ pi_result cuda_piextUSMGetMemAllocInfo(pi_context context, const void *ptr, pi_result result = PI_SUCCESS; try { - ScopedContext active(context); + ScopedContext active(context->get_devices()[0]->get_context()); switch (param_name) { case PI_MEM_ALLOC_TYPE: { unsigned int value; @@ -5495,6 +5551,19 @@ pi_result cuda_piextUSMGetMemAllocInfo(pi_context context, const void *ptr, return result; } +__SYCL_EXPORT pi_result cuda_piextGetMemoryConnection( + 
pi_device device1, pi_context context1, pi_device device2, + pi_context context2, _pi_memory_connection *res) { + (void)device1; + (void)context1; + (void)device2; + (void)context2; + + // We can always migrate memory between two devices using the CUDA APIs. + *res = PI_MEMORY_CONNECTION_MIGRATABLE; + return PI_SUCCESS; +} + pi_result cuda_piextEnqueueDeviceGlobalVariableWrite( pi_queue queue, pi_program program, const char *name, pi_bool blocking_write, size_t count, size_t offset, const void *src, @@ -5517,9 +5586,11 @@ pi_result cuda_piextEnqueueDeviceGlobalVariableWrite( try { CUdeviceptr device_global = 0; size_t device_global_size = 0; - result = PI_CHECK_ERROR( - cuModuleGetGlobal(&device_global, &device_global_size, program->get(), - device_global_name.c_str())); + result = PI_CHECK_ERROR(cuModuleGetGlobal( + &device_global, &device_global_size, + program + ->get()[program->get_context()->device_index(queue->get_device())], + device_global_name.c_str())); if (offset + count > device_global_size) return PI_ERROR_INVALID_VALUE; @@ -5554,9 +5625,11 @@ pi_result cuda_piextEnqueueDeviceGlobalVariableRead( try { CUdeviceptr device_global = 0; size_t device_global_size = 0; - result = PI_CHECK_ERROR( - cuModuleGetGlobal(&device_global, &device_global_size, program->get(), - device_global_name.c_str())); + result = PI_CHECK_ERROR(cuModuleGetGlobal( + &device_global, &device_global_size, + program + ->get()[program->get_context()->device_index(queue->get_device())], + device_global_name.c_str())); if (offset + count > device_global_size) return PI_ERROR_INVALID_VALUE; @@ -5671,6 +5744,7 @@ pi_result piPluginInit(pi_plugin *PluginInit) { // Platform _PI_CL(piPlatformsGet, cuda_piPlatformsGet) _PI_CL(piPlatformGetInfo, cuda_piPlatformGetInfo) + // Device _PI_CL(piDevicesGet, cuda_piDevicesGet) _PI_CL(piDeviceGetInfo, cuda_piDeviceGetInfo) @@ -5794,6 +5868,8 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piextEnqueueDeviceGlobalVariableRead, cuda_piextEnqueueDeviceGlobalVariableRead) + _PI_CL(piextGetMemoryConnection, cuda_piextGetMemoryConnection) + // Host Pipe _PI_CL(piextEnqueueReadHostPipe, cuda_piextEnqueueReadHostPipe) _PI_CL(piextEnqueueWriteHostPipe, cuda_piextEnqueueWriteHostPipe) diff --git a/sycl/plugins/cuda/pi_cuda.hpp b/sycl/plugins/cuda/pi_cuda.hpp index f6baeab0a4445..21c84e5cba843 100644 --- a/sycl/plugins/cuda/pi_cuda.hpp +++ b/sycl/plugins/cuda/pi_cuda.hpp @@ -179,15 +179,21 @@ struct _pi_context { using native_type = CUcontext; native_type cuContext_; - _pi_device *deviceId_; + std::vector<pi_device> deviceIds_; std::atomic_uint32_t refCount_; - _pi_context(_pi_device *devId) - : cuContext_{devId->get_context()}, deviceId_{devId}, refCount_{1} { - cuda_piDeviceRetain(deviceId_); + _pi_context(std::vector<pi_device> &&devIds) + : deviceIds_{std::move(devIds)}, refCount_{1} { + for (pi_device dev : deviceIds_) { + cuda_piDeviceRetain(dev); + } }; - ~_pi_context() { cuda_piDeviceRelease(deviceId_); } + ~_pi_context() { + for (pi_device dev : deviceIds_) { + cuda_piDeviceRelease(dev); + } + } void invoke_extended_deleters() { std::lock_guard<std::mutex> guard(mutex_); @@ -202,9 +208,28 @@ struct _pi_context { extended_deleters_.emplace_back(deleter_data{function, user_data}); } - pi_device get_device() const noexcept { return deviceId_; } + const std::vector<pi_device> &get_devices() const noexcept { + return deviceIds_; + } - native_type get() const noexcept { return cuContext_; } + size_t device_index(pi_device device) { + for (size_t i = 0; i < deviceIds_.size(); i++) { + if (deviceIds_[i] == device) {
return i; + } + } + assert(false && "No index for device"); + return 0; + } + + CUcontext get(pi_device device) { + for (size_t i = 0; i < deviceIds_.size(); i++) { + if (deviceIds_[i] == device) { + return device->get_context(); + } + } + return nullptr; + } pi_uint32 increment_reference_count() noexcept { return ++refCount_; } @@ -229,6 +254,9 @@ struct _pi_mem { // Context where the memory object is accessibles pi_context context_; + // Device where the memory is located + pi_device device_; + /// Reference counting of the handler std::atomic_uint32_t refCount_; enum class mem_type { buffer, surface } mem_type_; @@ -331,9 +359,11 @@ struct _pi_mem { } mem_; /// Constructs the PI MEM handler for a non-typed allocation ("buffer") - _pi_mem(pi_context ctxt, pi_mem parent, mem_::buffer_mem_::alloc_mode mode, - CUdeviceptr ptr, void *host_ptr, size_t size) - : context_{ctxt}, refCount_{1}, mem_type_{mem_type::buffer} { + _pi_mem(pi_context ctxt, pi_device dev, pi_mem parent, + mem_::buffer_mem_::alloc_mode mode, CUdeviceptr ptr, void *host_ptr, + size_t size) + : context_{ctxt}, device_{dev}, refCount_{1}, + mem_type_{mem_type::buffer} { mem_.buffer_mem_.ptr_ = ptr; mem_.buffer_mem_.parent_ = parent; mem_.buffer_mem_.hostPtr_ = host_ptr; @@ -350,9 +380,10 @@ struct _pi_mem { }; /// Constructs the PI allocation for an Image object (surface in CUDA) - _pi_mem(pi_context ctxt, CUarray array, CUsurfObject surf, + _pi_mem(pi_context ctxt, pi_device dev, CUarray array, CUsurfObject surf, pi_mem_type image_type, void *host_ptr) - : context_{ctxt}, refCount_{1}, mem_type_{mem_type::surface} { + : context_{ctxt}, device_{dev}, refCount_{1}, + mem_type_{mem_type::surface} { // Ignore unused parameter (void)host_ptr; @@ -382,6 +413,10 @@ struct _pi_mem { bool is_image() const noexcept { return mem_type_ == mem_type::surface; } pi_context get_context() const noexcept { return context_; } + pi_device get_device() const noexcept { return device_; } + CUcontext get_native_context() const noexcept { + return context_->get(device_); + } pi_uint32 increment_reference_count() noexcept { return ++refCount_; } @@ -544,6 +579,12 @@ struct _pi_queue { } } + CUcontext get_native_context() { + CUcontext res = context_->get(device_); + assert(res != nullptr); + return res; + } + template <typename T> void sync_streams(T &&f) { auto sync_compute = [&f, &streams = compute_streams_, &delay = delay_compute_](unsigned int start, @@ -671,6 +712,8 @@ struct _pi_event { pi_context get_context() const noexcept { return context_; }; + CUcontext get_native_context() const noexcept { return native_context_; }; + pi_uint32 increment_reference_count() { return ++refCount_; } pi_uint32 decrement_reference_count() { return --refCount_; } @@ -695,7 +738,8 @@ struct _pi_event { static pi_event make_native(pi_command_type type, pi_queue queue, CUstream stream, pi_uint32 stream_token = std::numeric_limits<pi_uint32>::max()) { - return new _pi_event(type, queue->get_context(), queue, stream, + return new _pi_event(type, queue->get_context(), + queue->get_native_context(), queue, stream, stream_token); } @@ -710,7 +754,7 @@ struct _pi_event { private: // This constructor is private to force programmers to use the make_native / // make_user static members in order to create a pi_event for CUDA.
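// The unnamed CUcontext parameter is the event's native CUDA context; the constructor definition in pi_cuda.cpp stores it in native_context_ so that ScopedContext can be activated straight from the event.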
- _pi_event(pi_command_type type, pi_context context, pi_queue queue, + _pi_event(pi_command_type type, pi_context context, CUcontext, pi_queue queue, CUstream stream, pi_uint32 stream_token); // This constructor is private to force programmers to use the @@ -753,13 +797,16 @@ struct _pi_event { pi_context context_; // pi_context associated with the event. If this is a // native event, this will be the same context associated // with the queue_ member. + CUcontext native_context_; // CUcontext associated with the event. If this is + // not a native event, this will be nullptr. }; /// Implementation of PI Program on CUDA Module object /// struct _pi_program { using native_type = CUmodule; - native_type module_; + std::vector<native_type> modules_; + std::vector<CUresult> build_results_; const char *binary_; size_t binarySizeInBytes_; std::atomic_uint32_t refCount_; @@ -788,7 +835,7 @@ struct _pi_program { pi_context get_context() const { return context_; }; - native_type get() const noexcept { return module_; }; + std::vector<native_type> get() const noexcept { return modules_; }; pi_uint32 increment_reference_count() noexcept { return ++refCount_; } @@ -816,15 +863,16 @@ struct _pi_program { struct _pi_kernel { using native_type = CUfunction; - native_type function_; - native_type functionWithOffsetParam_; + std::vector<native_type> functions_; + std::vector<native_type> functionsWithOffsetParam_; std::string name_; pi_context context_; pi_program program_; std::atomic_uint32_t refCount_; static constexpr pi_uint32 REQD_THREADS_PER_BLOCK_DIMENSIONS = 3u; - size_t reqdThreadsPerBlock_[REQD_THREADS_PER_BLOCK_DIMENSIONS]; + std::vector<std::array<size_t, REQD_THREADS_PER_BLOCK_DIMENSIONS>> + reqdThreadsPerBlock_; /// Structure that holds the arguments to the kernel. /// Note earch argument size is known, since it comes @@ -911,18 +959,23 @@ struct _pi_kernel { } } args_; - _pi_kernel(CUfunction func, CUfunction funcWithOffsetParam, const char *name, + _pi_kernel(std::vector<CUfunction> funcs, + std::vector<CUfunction> funcsWithOffsetParam, const char *name, pi_program program, pi_context ctxt) - : function_{func}, functionWithOffsetParam_{funcWithOffsetParam}, - name_{name}, context_{ctxt}, program_{program}, refCount_{1} { + : functions_{funcs}, functionsWithOffsetParam_{funcsWithOffsetParam}, - name_{name}, context_{ctxt}, program_{program}, refCount_{1}, + reqdThreadsPerBlock_{ctxt->get_devices().size()} { cuda_piProgramRetain(program_); cuda_piContextRetain(context_); - /// Note: this code assumes that there is only one device per context - pi_result retError = cuda_piKernelGetGroupInfo( - this, ctxt->get_device(), PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE, - sizeof(reqdThreadsPerBlock_), reqdThreadsPerBlock_, nullptr); - (void)retError; - assert(retError == PI_SUCCESS); + int device_num = 0; + for (pi_device device : ctxt->get_devices()) { + pi_result retError = cuda_piKernelGetGroupInfo( + this, device, PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE, + sizeof(reqdThreadsPerBlock_[device_num]), + &reqdThreadsPerBlock_[device_num++][0], nullptr); + (void)retError; + assert(retError == PI_SUCCESS); + } } ~_pi_kernel() { @@ -938,15 +991,13 @@ struct _pi_kernel { pi_uint32 get_reference_count() const noexcept { return refCount_; } - native_type get() const noexcept { return function_; }; - - native_type get_with_offset_parameter() const noexcept { - return functionWithOffsetParam_; - }; + std::vector<native_type> get() const noexcept { return functions_; } + native_type get(pi_device device) const noexcept; - bool has_with_offset_parameter() const noexcept { - return functionWithOffsetParam_ != nullptr; + std::vector<native_type>
get_with_offset_parameter() const noexcept { + return functionsWithOffsetParam_; } + native_type get_with_offset_parameter(pi_device device) const noexcept; pi_context get_context() const noexcept { return context_; }; diff --git a/sycl/plugins/esimd_emulator/pi_esimd_emulator.cpp b/sycl/plugins/esimd_emulator/pi_esimd_emulator.cpp index 8eeb2432bcf08..4747d721bf600 100644 --- a/sycl/plugins/esimd_emulator/pi_esimd_emulator.cpp +++ b/sycl/plugins/esimd_emulator/pi_esimd_emulator.cpp @@ -481,6 +481,23 @@ pi_result piextPlatformCreateWithNativeHandle(pi_native_handle, pi_platform *) { DIE_NO_IMPLEMENTATION; } +__SYCL_EXPORT pi_result piextGetMemoryConnection(pi_device device1, + pi_context context1, + pi_device device2, + pi_context context2, + _pi_memory_connection *res) { + ARG_UNUSED(device1); + ARG_UNUSED(device2); + + if (context1 == context2) { + *res = PI_MEMORY_CONNECTION_UNIFIED; + } else { + *res = PI_MEMORY_CONNECTION_NONE; + } + + return PI_SUCCESS; +} + pi_result piDevicesGet(pi_platform Platform, pi_device_type DeviceType, pi_uint32 NumEntries, pi_device *Devices, pi_uint32 *NumDevices) { @@ -1020,9 +1037,11 @@ pi_result piextQueueCreateWithNativeHandle(pi_native_handle, pi_context, DIE_NO_IMPLEMENTATION; } -pi_result piMemBufferCreate(pi_context Context, pi_mem_flags Flags, size_t Size, - void *HostPtr, pi_mem *RetMem, +pi_result piMemBufferCreate(pi_context Context, pi_device Device, + pi_mem_flags Flags, size_t Size, void *HostPtr, + pi_mem *RetMem, const pi_mem_properties *properties) { + ARG_UNUSED(Device); ARG_UNUSED(properties); if ((Flags & PI_MEM_FLAGS_ACCESS_RW) == 0) { @@ -1172,10 +1191,12 @@ ConvertPiImageFormatToCmFormat(const pi_image_format *PiFormat) { return cm_support::CM_SURFACE_FORMAT_UNKNOWN; } -pi_result piMemImageCreate(pi_context Context, pi_mem_flags Flags, +pi_result piMemImageCreate(pi_context Context, pi_device Device, + pi_mem_flags Flags, const pi_image_format *ImageFormat, const pi_image_desc *ImageDesc, void *HostPtr, pi_mem *RetImage) { + ARG_UNUSED(Device); if ((Flags & PI_MEM_FLAGS_ACCESS_RW) == 0) { PiTrace("Invalid memory attribute for piMemImageCreate"); return PI_ERROR_INVALID_OPERATION; diff --git a/sycl/plugins/hip/pi_hip.cpp b/sycl/plugins/hip/pi_hip.cpp index f216bc4565edf..69ca058ef4688 100644 --- a/sycl/plugins/hip/pi_hip.cpp +++ b/sycl/plugins/hip/pi_hip.cpp @@ -2173,9 +2173,11 @@ pi_result hip_piextContextCreateWithNativeHandle(pi_native_handle nativeHandle, /// Can trigger a manual copy depending on the mode. 
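/// The device argument follows the updated piMemBufferCreate signature; the HIP
/// plugin does not use it yet and simply casts it to void.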
/// \TODO Implement USE_HOST_PTR using cuHostRegister /// -pi_result hip_piMemBufferCreate(pi_context context, pi_mem_flags flags, - size_t size, void *host_ptr, pi_mem *ret_mem, +pi_result hip_piMemBufferCreate(pi_context context, pi_device device, + pi_mem_flags flags, size_t size, void *host_ptr, + pi_mem *ret_mem, const pi_mem_properties *properties) { + (void)device; // Need input memory object assert(ret_mem != nullptr); assert((properties == nullptr || *properties == 0) && @@ -3073,10 +3075,12 @@ hip_piEnqueueNativeKernel(pi_queue queue, void (*user_func)(void *), void *args, /// \TODO Not implemented -pi_result hip_piMemImageCreate(pi_context context, pi_mem_flags flags, +pi_result hip_piMemImageCreate(pi_context context, pi_device device, + pi_mem_flags flags, const pi_image_format *image_format, const pi_image_desc *image_desc, void *host_ptr, pi_mem *ret_mem) { + (void)device; // Need input memory object assert(ret_mem != nullptr); @@ -5348,6 +5352,17 @@ pi_result hip_piextUSMGetMemAllocInfo(pi_context context, const void *ptr, return result; } +__SYCL_EXPORT pi_result hip_piextGetMemoryConnection( + pi_device device1, pi_context context1, pi_device device2, + pi_context context2, _pi_memory_connection *res) { + (void)device1; + (void)context1; + (void)device2; + (void)context2; + *res = PI_MEMORY_CONNECTION_MIGRATABLE; + return PI_SUCCESS; +} + pi_result hip_piextEnqueueDeviceGlobalVariableWrite( pi_queue queue, pi_program program, const char *name, pi_bool blocking_write, size_t count, size_t offset, const void *src, @@ -5613,6 +5628,8 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piextEnqueueDeviceGlobalVariableRead, hip_piextEnqueueDeviceGlobalVariableRead) + _PI_CL(piextGetMemoryConnection, hip_piextGetMemoryConnection) + // Host Pipe _PI_CL(piextEnqueueReadHostPipe, hip_piextEnqueueReadHostPipe) _PI_CL(piextEnqueueWriteHostPipe, hip_piextEnqueueWriteHostPipe) diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index c25c8624ea00d..a4d102933d0fb 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -2125,6 +2125,21 @@ pi_result piextPlatformCreateWithNativeHandle(pi_native_handle NativeHandle, return PI_ERROR_INVALID_VALUE; } +__SYCL_EXPORT pi_result piextGetMemoryConnection(pi_device device1, + pi_context context1, + pi_device device2, + pi_context context2, + _pi_memory_connection *res) { + (void)device1; + (void)device2; + if (context1 == context2) { + *res = PI_MEMORY_CONNECTION_UNIFIED; + } else { + *res = PI_MEMORY_CONNECTION_NONE; + } + return PI_SUCCESS; +} + pi_result piPluginGetLastError(char **message) { return pi2ur::piPluginGetLastError(message); } @@ -2934,9 +2949,11 @@ static pi_result ZeHostMemAllocHelper(void **ResultPtr, pi_context Context, return PI_SUCCESS; } -pi_result piMemBufferCreate(pi_context Context, pi_mem_flags Flags, size_t Size, - void *HostPtr, pi_mem *RetMem, +pi_result piMemBufferCreate(pi_context Context, pi_device Device, + pi_mem_flags Flags, size_t Size, void *HostPtr, + pi_mem *RetMem, const pi_mem_properties *properties) { + (void)Device; // TODO: implement support for more access modes if (!((Flags & PI_MEM_FLAGS_ACCESS_RW) || @@ -3117,10 +3134,12 @@ pi_result piMemRelease(pi_mem Mem) { return PI_SUCCESS; } -pi_result piMemImageCreate(pi_context Context, pi_mem_flags Flags, +pi_result piMemImageCreate(pi_context Context, pi_device Dev, + pi_mem_flags Flags, const pi_image_format *ImageFormat, const pi_image_desc *ImageDesc, 
void *HostPtr, pi_mem *RetImage) { + (void)Dev; // TODO: implement read-only, write-only if ((Flags & PI_MEM_FLAGS_ACCESS_RW) == 0) { diff --git a/sycl/plugins/opencl/pi_opencl.cpp b/sycl/plugins/opencl/pi_opencl.cpp index 1061209447b6d..db91a8b56d5c3 100644 --- a/sycl/plugins/opencl/pi_opencl.cpp +++ b/sycl/plugins/opencl/pi_opencl.cpp @@ -1140,9 +1140,11 @@ pi_result piContextGetInfo(pi_context context, pi_context_info paramName, } } -pi_result piMemBufferCreate(pi_context context, pi_mem_flags flags, size_t size, - void *host_ptr, pi_mem *ret_mem, +pi_result piMemBufferCreate(pi_context context, pi_device device, + pi_mem_flags flags, size_t size, void *host_ptr, + pi_mem *ret_mem, const pi_mem_properties *properties) { + (void)device; pi_result ret_err = PI_ERROR_INVALID_OPERATION; if (properties) { // TODO: need to check if all properties are supported by OpenCL RT and @@ -1166,10 +1168,12 @@ pi_result piMemBufferCreate(pi_context context, pi_mem_flags flags, size_t size, return ret_err; } -pi_result piMemImageCreate(pi_context context, pi_mem_flags flags, +pi_result piMemImageCreate(pi_context context, pi_device device, + pi_mem_flags flags, const pi_image_format *image_format, const pi_image_desc *image_desc, void *host_ptr, pi_mem *ret_mem) { + (void)device; pi_result ret_err = PI_ERROR_INVALID_OPERATION; *ret_mem = cast( clCreateImage(cast(context), cast(flags), @@ -2111,6 +2115,21 @@ pi_result piextKernelGetNativeHandle(pi_kernel kernel, return piextGetNativeHandle(kernel, nativeHandle); } +__SYCL_EXPORT pi_result piextGetMemoryConnection(pi_device device1, + pi_context context1, + pi_device device2, + pi_context context2, + _pi_memory_connection *res) { + (void)device1; + (void)device2; + if (context1 == context2) { + *res = PI_MEMORY_CONNECTION_UNIFIED; + } else { + *res = PI_MEMORY_CONNECTION_NONE; + } + return PI_SUCCESS; +} + // This API is called by Sycl RT to notify the end of the plugin lifetime. // Windows: dynamically loaded plugins might have been unloaded already // when this is called. 
Sycl RT holds onto the PI plugin so it can be @@ -2304,6 +2323,7 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piextEnqueueReadHostPipe, piextEnqueueReadHostPipe) _PI_CL(piextEnqueueWriteHostPipe, piextEnqueueWriteHostPipe) + _PI_CL(piextGetMemoryConnection, piextGetMemoryConnection) _PI_CL(piextKernelSetArgMemObj, piextKernelSetArgMemObj) _PI_CL(piextKernelSetArgSampler, piextKernelSetArgSampler) _PI_CL(piPluginGetLastError, piPluginGetLastError) diff --git a/sycl/source/detail/buffer_impl.cpp b/sycl/source/detail/buffer_impl.cpp index 550d803f5086e..340b126128ea1 100644 --- a/sycl/source/detail/buffer_impl.cpp +++ b/sycl/source/detail/buffer_impl.cpp @@ -19,8 +19,20 @@ namespace detail { #ifdef XPTI_ENABLE_INSTRUMENTATION uint8_t GBufferStreamID; #endif + +// FIXME: Remove this overload when the class is removed from __SYCL_EXPORT void *buffer_impl::allocateMem(ContextImplPtr Context, bool InitFromUserData, - void *HostPtr, RT::PiEvent &OutEventToWait) { + void *HostPtr, RT::PiEvent &InteropEvent) { + (void)Context; + (void)InitFromUserData; + (void)HostPtr; + (void)InteropEvent; + assert(false && "Deprecated: use the overload with the device parameter"); +} + +void *buffer_impl::allocateMem(ContextImplPtr Context, DeviceImplPtr Device, + bool InitFromUserData, void *HostPtr, + RT::PiEvent &OutEventToWait) { bool HostPtrReadOnly = false; BaseT::determineHostPtr(Context, InitFromUserData, HostPtr, HostPtrReadOnly); @@ -28,7 +40,7 @@ void *buffer_impl::allocateMem(ContextImplPtr Context, bool InitFromUserData, "Internal error. Allocating memory on the host " "while having use_host_ptr property"); return MemoryManager::allocateMemBuffer( - std::move(Context), this, HostPtr, HostPtrReadOnly, + std::move(Context), std::move(Device), this, HostPtr, HostPtrReadOnly, BaseT::getSizeInBytes(), BaseT::MInteropEvent, BaseT::MInteropContext, MProps, OutEventToWait); } diff --git a/sycl/source/detail/buffer_impl.hpp b/sycl/source/detail/buffer_impl.hpp index 1d34ce3e5c95c..01e58505185b3 100644 --- a/sycl/source/detail/buffer_impl.hpp +++ b/sycl/source/detail/buffer_impl.hpp @@ -146,7 +146,10 @@ class __SYCL_EXPORT buffer_impl final : public SYCLMemObjT { std::move(Allocator)) {} void *allocateMem(ContextImplPtr Context, bool InitFromUserData, - void *HostPtr, RT::PiEvent &OutEventToWait) override; + void *HostPtr, RT::PiEvent &InteropEvent) override; + void *allocateMem(ContextImplPtr Context, DeviceImplPtr Device, + bool InitFromUserData, void *HostPtr, + RT::PiEvent &OutEventToWait) override; void constructorNotification(const detail::code_location &CodeLoc, void *UserObj, const void *HostObj, const void *Type, uint32_t Dim, diff --git a/sycl/source/detail/context_impl.hpp b/sycl/source/detail/context_impl.hpp index ba6401a7da54c..0bc35a1b61a12 100644 --- a/sycl/source/detail/context_impl.hpp +++ b/sycl/source/detail/context_impl.hpp @@ -230,6 +230,11 @@ class context_impl { mutable KernelProgramCache MKernelProgramCache; mutable PropertySupport MSupportBufferLocationByDevices; + friend pi_memory_connection + getMemoryConnection(const std::shared_ptr &Dev1, + const std::shared_ptr &Ctx1, + const std::shared_ptr &Dev2, + const std::shared_ptr &Ctx2); std::set MAssociatedDeviceGlobals; std::mutex MAssociatedDeviceGlobalsMutex; diff --git a/sycl/source/detail/device_image_impl.hpp b/sycl/source/detail/device_image_impl.hpp index ea23b6828986c..dbf24e52de46a 100644 --- a/sycl/source/detail/device_image_impl.hpp +++ b/sycl/source/detail/device_image_impl.hpp @@ -61,7 +61,7 @@ class 
device_image_impl { RT::PiProgram Program) : MBinImage(BinImage), MContext(std::move(Context)), MDevices(std::move(Devices)), MState(State), MProgram(Program), - MKernelIDs(std::move(KernelIDs)) { + MKernelIDs(std::move(KernelIDs)), MSpecConstsBuffers(MDevices.size()) { updateSpecConstSymMap(); } @@ -73,7 +73,7 @@ class device_image_impl { : MBinImage(BinImage), MContext(std::move(Context)), MDevices(std::move(Devices)), MState(State), MProgram(Program), MKernelIDs(std::move(KernelIDs)), MSpecConstsBlob(SpecConstsBlob), - MSpecConstSymMap(SpecConstMap) {} + MSpecConstsBuffers(MDevices.size()), MSpecConstSymMap(SpecConstMap) {} bool has_kernel(const kernel_id &KernelIDCand) const noexcept { return std::binary_search(MKernelIDs->begin(), MKernelIDs->end(), @@ -206,22 +206,30 @@ class device_image_impl { return MSpecConstsBlob; } - RT::PiMem &get_spec_const_buffer_ref() noexcept { + RT::PiMem &get_spec_const_buffer_ref(device dev) noexcept { std::lock_guard Lock{MSpecConstAccessMtx}; - if (nullptr == MSpecConstsBuffer && !MSpecConstsBlob.empty()) { + RT::PiMem *mem = nullptr; + for (size_t i = 0; i < MDevices.size(); i++) { + if (MDevices[i] == dev) { + mem = &MSpecConstsBuffers[i]; + break; + } + } + assert(mem); + if (nullptr == *mem && !MSpecConstsBlob.empty()) { const detail::plugin &Plugin = getSyclObjImpl(MContext)->getPlugin(); // Uses PI_MEM_FLAGS_HOST_PTR_COPY instead of PI_MEM_FLAGS_HOST_PTR_USE // since post-enqueue cleanup might trigger destruction of // device_image_impl and, as a result, destruction of MSpecConstsBlob // while MSpecConstsBuffer is still in use. // TODO consider changing the lifetime of device_image_impl instead - memBufferCreateHelper(Plugin, - detail::getSyclObjImpl(MContext)->getHandleRef(), - PI_MEM_FLAGS_ACCESS_RW | PI_MEM_FLAGS_HOST_PTR_COPY, - MSpecConstsBlob.size(), MSpecConstsBlob.data(), - &MSpecConstsBuffer, nullptr); + memBufferCreateHelper( + Plugin, detail::getSyclObjImpl(MContext)->getHandleRef(), + detail::getSyclObjImpl(dev)->getHandleRef(), + PI_MEM_FLAGS_ACCESS_RW | PI_MEM_FLAGS_HOST_PTR_COPY, + MSpecConstsBlob.size(), MSpecConstsBlob.data(), mem, nullptr); } - return MSpecConstsBuffer; + return *mem; } const SpecConstMapT &get_spec_const_data_ref() const noexcept { @@ -252,10 +260,12 @@ class device_image_impl { const detail::plugin &Plugin = getSyclObjImpl(MContext)->getPlugin(); Plugin.call(MProgram); } - if (MSpecConstsBuffer) { - std::lock_guard Lock{MSpecConstAccessMtx}; - const detail::plugin &Plugin = getSyclObjImpl(MContext)->getPlugin(); - memReleaseHelper(Plugin, MSpecConstsBuffer); + for (auto mem : MSpecConstsBuffers) { + if (mem) { + std::lock_guard Lock{MSpecConstAccessMtx}; + const detail::plugin &Plugin = getSyclObjImpl(MContext)->getPlugin(); + memReleaseHelper(Plugin, mem); + } } } @@ -339,10 +349,10 @@ class device_image_impl { // Binary blob which can have values of all specialization constants in the // image std::vector MSpecConstsBlob; - // Buffer containing binary blob which can have values of all specialization - // constants in the image, it is using for storing non-native specialization - // constants - RT::PiMem MSpecConstsBuffer = nullptr; + // A vector of buffers containing binary blobs which can have values of all + // specialization constants in the image; they are used for storing non-native + // specialization constants + std::vector MSpecConstsBuffers; // Contains map of spec const names to their descriptions + offsets in // the MSpecConstsBlob std::map> MSpecConstSymMap; diff --git 
a/sycl/source/detail/device_impl.hpp b/sycl/source/detail/device_impl.hpp index 41cb1c9fdb62c..9dd3c28ef81e1 100644 --- a/sycl/source/detail/device_impl.hpp +++ b/sycl/source/detail/device_impl.hpp @@ -30,6 +30,7 @@ namespace detail { // Forward declaration class platform_impl; +class context_impl; using PlatformImplPtr = std::shared_ptr; // TODO: Make code thread-safe @@ -253,6 +254,14 @@ class device_impl { bool MIsAssertFailSupported = false; mutable std::string MDeviceName; mutable std::once_flag MDeviceNameFlag; + + friend bool sameDev(const std::shared_ptr &LHS, + const std::shared_ptr &RHS); + friend pi_memory_connection + getMemoryConnection(const std::shared_ptr &Dev1, + const std::shared_ptr &Ctx1, + const std::shared_ptr &Dev2, + const std::shared_ptr &Ctx2); std::pair MDeviceHostBaseTime; }; // class device_impl diff --git a/sycl/source/detail/image_impl.cpp b/sycl/source/detail/image_impl.cpp index 7ff987686496d..f639945c44a73 100644 --- a/sycl/source/detail/image_impl.cpp +++ b/sycl/source/detail/image_impl.cpp @@ -297,8 +297,19 @@ image_impl::image_impl(cl_mem MemObject, const context &SyclContext, } } +// FIXME: Remove this overload when the class is removed from __SYCL_EXPORT void *image_impl::allocateMem(ContextImplPtr Context, bool InitFromUserData, - void *HostPtr, RT::PiEvent &OutEventToWait) { + void *HostPtr, RT::PiEvent &InteropEvent) { + (void)Context; + (void)InitFromUserData; + (void)HostPtr; + (void)InteropEvent; + assert(false && "Deprecated: use the overload with the device parameter"); +} + +void *image_impl::allocateMem(ContextImplPtr Context, DeviceImplPtr Device, + bool InitFromUserData, void *HostPtr, + RT::PiEvent &OutEventToWait) { bool HostPtrReadOnly = false; BaseT::determineHostPtr(Context, InitFromUserData, HostPtr, HostPtrReadOnly); @@ -311,7 +322,7 @@ void *image_impl::allocateMem(ContextImplPtr Context, bool InitFromUserData, "The check an image format failed."); return MemoryManager::allocateMemImage( - std::move(Context), this, HostPtr, HostPtrReadOnly, + std::move(Context), std::move(Device), this, HostPtr, HostPtrReadOnly, BaseT::getSizeInBytes(), Desc, Format, BaseT::MInteropEvent, BaseT::MInteropContext, MProps, OutEventToWait); } diff --git a/sycl/source/detail/image_impl.hpp b/sycl/source/detail/image_impl.hpp index 9474fae5d895e..dda2803a00ecc 100644 --- a/sycl/source/detail/image_impl.hpp +++ b/sycl/source/detail/image_impl.hpp @@ -179,7 +179,10 @@ class __SYCL_EXPORT image_impl final : public SYCLMemObjT { size_t size() const noexcept { return MRange.size(); } void *allocateMem(ContextImplPtr Context, bool InitFromUserData, - void *HostPtr, RT::PiEvent &OutEventToWait) override; + void *HostPtr, RT::PiEvent &InteropEvent) override; + void *allocateMem(ContextImplPtr Context, DeviceImplPtr Device, + bool InitFromUserData, void *HostPtr, + RT::PiEvent &OutEventToWait) override; MemObjType getType() const override { return MemObjType::Image; } diff --git a/sycl/source/detail/mem_alloc_helper.hpp b/sycl/source/detail/mem_alloc_helper.hpp index c2c6dd0a3d5a9..988eed6e2e223 100644 --- a/sycl/source/detail/mem_alloc_helper.hpp +++ b/sycl/source/detail/mem_alloc_helper.hpp @@ -13,7 +13,7 @@ namespace sycl { __SYCL_INLINE_VER_NAMESPACE(_V1) { namespace detail { -void memBufferCreateHelper(const plugin &Plugin, pi_context Ctx, +void memBufferCreateHelper(const plugin &Plugin, pi_context Ctx, pi_device Dev, pi_mem_flags Flags, size_t Size, void *HostPtr, pi_mem *RetMem, const pi_mem_properties *Props = nullptr); diff --git 
a/sycl/source/detail/memory_manager.cpp b/sycl/source/detail/memory_manager.cpp index 9985bee6ba426..124142e5c0f0d 100644 --- a/sycl/source/detail/memory_manager.cpp +++ b/sycl/source/detail/memory_manager.cpp @@ -126,7 +126,7 @@ static void waitForEvents(const std::vector &Events) { } } -void memBufferCreateHelper(const plugin &Plugin, pi_context Ctx, +void memBufferCreateHelper(const plugin &Plugin, pi_context Ctx, pi_device Dev, pi_mem_flags Flags, size_t Size, void *HostPtr, pi_mem *RetMem, const pi_mem_properties *Props) { #ifdef XPTI_ENABLE_INSTRUMENTATION @@ -149,8 +149,8 @@ void memBufferCreateHelper(const plugin &Plugin, pi_context Ctx, CorrID); }}; #endif - Plugin.call(Ctx, Flags, Size, HostPtr, RetMem, - Props); + Plugin.call(Ctx, Dev, Flags, Size, HostPtr, + RetMem, Props); } } @@ -262,7 +262,8 @@ void MemoryManager::releaseMemObj(ContextImplPtr TargetContext, memReleaseHelper(Plugin, pi::cast(MemAllocation)); } -void *MemoryManager::allocate(ContextImplPtr TargetContext, SYCLMemObjI *MemObj, +void *MemoryManager::allocate(ContextImplPtr TargetContext, + DeviceImplPtr TargetDevice, SYCLMemObjI *MemObj, bool InitFromUserData, void *HostPtr, std::vector DepEvents, RT::PiEvent &OutEvent) { @@ -271,8 +272,8 @@ void *MemoryManager::allocate(ContextImplPtr TargetContext, SYCLMemObjI *MemObj, waitForEvents(DepEvents); OutEvent = nullptr; - return MemObj->allocateMem(TargetContext, InitFromUserData, HostPtr, - OutEvent); + return MemObj->allocateMem(TargetContext, TargetDevice, InitFromUserData, + HostPtr, OutEvent); } void *MemoryManager::allocateHostMemory(SYCLMemObjI *MemObj, void *UserPtr, @@ -318,6 +319,7 @@ static RT::PiMemFlags getMemObjCreationFlags(void *UserPtr, } void *MemoryManager::allocateImageObject(ContextImplPtr TargetContext, + DeviceImplPtr TargetDevice, void *UserPtr, bool HostPtrReadOnly, const RT::PiMemImageDesc &Desc, const RT::PiMemImageFormat &Format, @@ -327,14 +329,15 @@ void *MemoryManager::allocateImageObject(ContextImplPtr TargetContext, RT::PiMem NewMem; const detail::plugin &Plugin = TargetContext->getPlugin(); - Plugin.call(TargetContext->getHandleRef(), - CreationFlags, &Format, &Desc, - UserPtr, &NewMem); + Plugin.call( + TargetContext->getHandleRef(), TargetDevice->getHandleRef(), + CreationFlags, &Format, &Desc, UserPtr, &NewMem); return NewMem; } void * -MemoryManager::allocateBufferObject(ContextImplPtr TargetContext, void *UserPtr, +MemoryManager::allocateBufferObject(ContextImplPtr TargetContext, + DeviceImplPtr TargetDevice, void *UserPtr, bool HostPtrReadOnly, const size_t Size, const sycl::property_list &PropsList) { RT::PiMemFlags CreationFlags = @@ -354,21 +357,21 @@ MemoryManager::allocateBufferObject(ContextImplPtr TargetContext, void *UserPtr, pi_mem_properties props[3] = {PI_MEM_PROPERTIES_ALLOC_BUFFER_LOCATION, location, 0}; memBufferCreateHelper(Plugin, TargetContext->getHandleRef(), - CreationFlags, Size, UserPtr, &NewMem, props); + TargetDevice->getHandleRef(), CreationFlags, Size, + UserPtr, &NewMem, props); return NewMem; } - memBufferCreateHelper(Plugin, TargetContext->getHandleRef(), CreationFlags, - Size, UserPtr, &NewMem, nullptr); + memBufferCreateHelper(Plugin, TargetContext->getHandleRef(), + TargetDevice->getHandleRef(), CreationFlags, Size, + UserPtr, &NewMem, nullptr); return NewMem; } -void *MemoryManager::allocateMemBuffer(ContextImplPtr TargetContext, - SYCLMemObjI *MemObj, void *UserPtr, - bool HostPtrReadOnly, size_t Size, - const EventImplPtr &InteropEvent, - const ContextImplPtr &InteropContext, - const 
sycl::property_list &PropsList, - RT::PiEvent &OutEventToWait) { +void *MemoryManager::allocateMemBuffer( + ContextImplPtr TargetContext, DeviceImplPtr TargetDevice, + SYCLMemObjI *MemObj, void *UserPtr, bool HostPtrReadOnly, size_t Size, + const EventImplPtr &InteropEvent, const ContextImplPtr &InteropContext, + const sycl::property_list &PropsList, RT::PiEvent &OutEventToWait) { void *MemPtr; if (TargetContext->is_host()) MemPtr = @@ -378,26 +381,26 @@ void *MemoryManager::allocateMemBuffer(ContextImplPtr TargetContext, allocateInteropMemObject(TargetContext, UserPtr, InteropEvent, InteropContext, PropsList, OutEventToWait); else - MemPtr = allocateBufferObject(TargetContext, UserPtr, HostPtrReadOnly, Size, - PropsList); + MemPtr = allocateBufferObject(TargetContext, TargetDevice, UserPtr, + HostPtrReadOnly, Size, PropsList); XPTIRegistry::bufferAssociateNotification(MemObj, MemPtr); return MemPtr; } void *MemoryManager::allocateMemImage( - ContextImplPtr TargetContext, SYCLMemObjI *MemObj, void *UserPtr, - bool HostPtrReadOnly, size_t Size, const RT::PiMemImageDesc &Desc, - const RT::PiMemImageFormat &Format, const EventImplPtr &InteropEvent, - const ContextImplPtr &InteropContext, const sycl::property_list &PropsList, - RT::PiEvent &OutEventToWait) { + ContextImplPtr TargetContext, DeviceImplPtr TargetDevice, + SYCLMemObjI *MemObj, void *UserPtr, bool HostPtrReadOnly, size_t Size, + const RT::PiMemImageDesc &Desc, const RT::PiMemImageFormat &Format, + const EventImplPtr &InteropEvent, const ContextImplPtr &InteropContext, + const sycl::property_list &PropsList, RT::PiEvent &OutEventToWait) { if (TargetContext->is_host()) return allocateHostMemory(MemObj, UserPtr, HostPtrReadOnly, Size, PropsList); if (UserPtr && InteropContext) return allocateInteropMemObject(TargetContext, UserPtr, InteropEvent, InteropContext, PropsList, OutEventToWait); - return allocateImageObject(TargetContext, UserPtr, HostPtrReadOnly, Desc, - Format, PropsList); + return allocateImageObject(TargetContext, TargetDevice, UserPtr, + HostPtrReadOnly, Desc, Format, PropsList); } void *MemoryManager::allocateMemSubBuffer(ContextImplPtr TargetContext, @@ -435,6 +438,43 @@ void *MemoryManager::allocateMemSubBuffer(ContextImplPtr TargetContext, return NewMem; } +// Deprecated allocation functions +void *MemoryManager::allocate(ContextImplPtr, SYCLMemObjI *, bool, void *, + std::vector, RT::PiEvent &) { + assert(false && "Deprecated: use the overload with the device parameter"); +} + +void *MemoryManager::allocateMemBuffer(ContextImplPtr, SYCLMemObjI *, void *, + bool, size_t, const EventImplPtr &, + const ContextImplPtr &, + const sycl::property_list &, + RT::PiEvent &) { + assert(false && "Deprecated: use the overload with the device parameter"); +} + +void *MemoryManager::allocateMemImage(ContextImplPtr, SYCLMemObjI *, void *, + bool, size_t, const RT::PiMemImageDesc &, + const RT::PiMemImageFormat &, + const EventImplPtr &, + const ContextImplPtr &, + const sycl::property_list &, + RT::PiEvent &) { + assert(false && "Deprecated: use the overload with the device parameter"); +} + +void *MemoryManager::allocateImageObject(ContextImplPtr, void *, bool, + const RT::PiMemImageDesc &, + const RT::PiMemImageFormat &, + const sycl::property_list &) { + assert(false && "Deprecated: use the overload with the device parameter"); +} + +void *MemoryManager::allocateBufferObject(ContextImplPtr, void *, bool, + const size_t, + const sycl::property_list &) { + assert(false && "Deprecated: use the overload with the device parameter"); 
+} + struct TermPositions { int XTerm; int YTerm; diff --git a/sycl/source/detail/memory_manager.hpp b/sycl/source/detail/memory_manager.hpp index c9d1738e942ad..124bb698a994b 100644 --- a/sycl/source/detail/memory_manager.hpp +++ b/sycl/source/detail/memory_manager.hpp @@ -25,6 +25,7 @@ namespace detail { class queue_impl; class event_impl; class context_impl; +class device_impl; using QueueImplPtr = std::shared_ptr; using EventImplPtr = std::shared_ptr; @@ -43,7 +44,8 @@ class __SYCL_EXPORT MemoryManager { // The following method allocates memory allocation of memory object. // Depending on the context it allocates memory on host or on device. - static void *allocate(ContextImplPtr TargetContext, SYCLMemObjI *MemObj, + static void *allocate(ContextImplPtr TargetContext, + DeviceImplPtr TargetDevice, SYCLMemObjI *MemObj, bool InitFromUserData, void *HostPtr, std::vector DepEvents, RT::PiEvent &OutEvent); @@ -59,22 +61,20 @@ class __SYCL_EXPORT MemoryManager { // Allocates buffer in specified context taking into account situations such // as host ptr or cl_mem provided by user. TargetContext should be device // one(not host). - static void *allocateMemBuffer(ContextImplPtr TargetContext, - SYCLMemObjI *MemObj, void *UserPtr, - bool HostPtrReadOnly, size_t Size, - const EventImplPtr &InteropEvent, - const ContextImplPtr &InteropContext, - const sycl::property_list &PropsList, - RT::PiEvent &OutEventToWait); + static void *allocateMemBuffer( + ContextImplPtr TargetContext, DeviceImplPtr TargetDevice, + SYCLMemObjI *MemObj, void *UserPtr, bool HostPtrReadOnly, size_t Size, + const EventImplPtr &InteropEvent, const ContextImplPtr &InteropContext, + const sycl::property_list &PropsList, RT::PiEvent &OutEventToWait); // Allocates images in specified context taking into account situations such // as host ptr or cl_mem provided by user. TargetContext should be device // one(not host). static void *allocateMemImage( - ContextImplPtr TargetContext, SYCLMemObjI *MemObj, void *UserPtr, - bool HostPtrReadOnly, size_t Size, const RT::PiMemImageDesc &Desc, - const RT::PiMemImageFormat &Format, const EventImplPtr &InteropEvent, - const ContextImplPtr &InteropContext, + ContextImplPtr TargetContext, DeviceImplPtr TargetDevice, + SYCLMemObjI *MemObj, void *UserPtr, bool HostPtrReadOnly, size_t Size, + const RT::PiMemImageDesc &Desc, const RT::PiMemImageFormat &Format, + const EventImplPtr &InteropEvent, const ContextImplPtr &InteropContext, const sycl::property_list &PropsList, RT::PiEvent &OutEventToWait); // Releases memory object(buffer or image). 
TargetContext should be device @@ -93,12 +93,42 @@ class __SYCL_EXPORT MemoryManager { const sycl::property_list &PropsList, RT::PiEvent &OutEventToWait); - static void *allocateImageObject(ContextImplPtr TargetContext, void *UserPtr, + static void *allocateImageObject(ContextImplPtr TargetContext, + DeviceImplPtr TargetDevice, void *UserPtr, bool HostPtrReadOnly, const RT::PiMemImageDesc &Desc, const RT::PiMemImageFormat &Format, const sycl::property_list &PropsList); + static void *allocateBufferObject(ContextImplPtr TargetContext, + DeviceImplPtr TargetDevice, void *UserPtr, + bool HostPtrReadOnly, const size_t Size, + const sycl::property_list &PropsList); + + // FIXME: Deprecated allocation methods, maintaining for ABI compatibility, + // to remove when the class is removed from __SYCL_EXPORT + static void *allocate(ContextImplPtr TargetContext, SYCLMemObjI *MemObj, + bool InitFromUserData, void *HostPtr, + std::vector DepEvents, + RT::PiEvent &OutEvent); + static void *allocateMemBuffer(ContextImplPtr TargetContext, + SYCLMemObjI *MemObj, void *UserPtr, + bool HostPtrReadOnly, size_t Size, + const EventImplPtr &InteropEvent, + const ContextImplPtr &InteropContext, + const sycl::property_list &PropsList, + RT::PiEvent &OutEventToWait); + static void *allocateMemImage( + ContextImplPtr TargetContext, SYCLMemObjI *MemObj, void *UserPtr, + bool HostPtrReadOnly, size_t Size, const RT::PiMemImageDesc &Desc, + const RT::PiMemImageFormat &Format, const EventImplPtr &InteropEvent, + const ContextImplPtr &InteropContext, + const sycl::property_list &PropsList, RT::PiEvent &OutEventToWait); + static void *allocateImageObject(ContextImplPtr TargetContext, void *UserPtr, + bool HostPtrReadOnly, + const RT::PiMemImageDesc &Desc, + const RT::PiMemImageFormat &Format, + const sycl::property_list &PropsList); static void *allocateBufferObject(ContextImplPtr TargetContext, void *UserPtr, bool HostPtrReadOnly, const size_t Size, const sycl::property_list &PropsList); diff --git a/sycl/source/detail/program_impl.cpp b/sycl/source/detail/program_impl.cpp index 59fb15178114e..f73e1f8a420ea 100644 --- a/sycl/source/detail/program_impl.cpp +++ b/sycl/source/detail/program_impl.cpp @@ -35,14 +35,7 @@ program_impl::program_impl(ContextImplPtr Context, program_impl::program_impl(ContextImplPtr Context, std::vector DeviceList, const property_list &PropList) - : MContext(Context), MDevices(DeviceList), MPropList(PropList) { - if (Context->getDevices().size() > 1) { - throw feature_not_supported( - "multiple devices within a context are not supported with " - "sycl::program and sycl::kernel", - PI_ERROR_INVALID_OPERATION); - } -} + : MContext(Context), MDevices(DeviceList), MPropList(PropList) {} program_impl::program_impl( std::vector> ProgramList, diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 4ab7c6bdce954..bc17f3cf5f53d 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -1035,8 +1035,8 @@ pi_int32 AllocaCommand::enqueueImp() { // TODO: Check if it is correct to use std::move on stack variable and // delete it RawEvents below. 
MMemAllocation = MemoryManager::allocate( - MQueue->getContextImplPtr(), getSYCLMemObj(), MInitFromUserData, HostPtr, - std::move(EventImpls), Event); + MQueue->getContextImplPtr(), MQueue->getDeviceImplPtr(), getSYCLMemObj(), + MInitFromUserData, HostPtr, std::move(EventImpls), Event); return PI_SUCCESS; } @@ -2146,7 +2146,8 @@ static pi_result SetKernelParamsAndLaunch( PI_ERROR_INVALID_OPERATION); } assert(DeviceImageImpl != nullptr); - RT::PiMem SpecConstsBuffer = DeviceImageImpl->get_spec_const_buffer_ref(); + RT::PiMem SpecConstsBuffer = + DeviceImageImpl->get_spec_const_buffer_ref(Queue->get_device()); // Avoid taking an address of nullptr RT::PiMem *SpecConstsBufferArg = SpecConstsBuffer ? &SpecConstsBuffer : nullptr; diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp index 5dee68c6e69d2..e17115e8b7865 100644 --- a/sycl/source/detail/scheduler/graph_builder.cpp +++ b/sycl/source/detail/scheduler/graph_builder.cpp @@ -52,6 +52,39 @@ static bool sameCtx(const ContextImplPtr &LHS, const ContextImplPtr &RHS) { // allocation on the host return LHS == RHS || (LHS->is_host() && RHS->is_host()); } +bool sameDev(const DeviceImplPtr &LHS, const DeviceImplPtr &RHS) { + RT::PiDevice LHSroot = + LHS->MRootDevice == nullptr ? LHS->MDevice : LHS->MRootDevice; + RT::PiDevice RHSroot = + RHS->MRootDevice == nullptr ? RHS->MDevice : RHS->MRootDevice; + return LHSroot == RHSroot; +} + +pi_memory_connection getMemoryConnection(const DeviceImplPtr &Dev1, + const ContextImplPtr &Ctx1, + const DeviceImplPtr &Dev2, + const ContextImplPtr &Ctx2) { + if ((sameCtx(Ctx1, Ctx2) && sameDev(Dev1, Dev2)) || + (Dev1->MIsHostDevice && Dev2->MIsHostDevice)) { + return PI_MEMORY_CONNECTION_UNIFIED; + } + + if (Dev1->MIsHostDevice ^ Dev2->MIsHostDevice) { + return PI_MEMORY_CONNECTION_NONE; + } + + auto plugin1 = Dev1->getPlugin(); + auto plugin2 = Dev2->getPlugin(); + + if (plugin1.getBackend() != plugin2.getBackend()) { + return PI_MEMORY_CONNECTION_NONE; + } + + _pi_memory_connection conn; + plugin1.call( + Dev1->MDevice, Ctx1->MContext, Dev2->MDevice, Ctx2->MContext, &conn); + return conn; +} /// Checks if current requirement is requirement for sub buffer. 
static bool IsSuitableSubReq(const Requirement *Req) { @@ -239,11 +272,12 @@ MemObjRecord *Scheduler::GraphBuilder::getOrInsertMemObjRecord( Dev, InteropCtxPtr, /*AsyncHandler=*/{}, /*PropertyList=*/{}}}; MemObject->MRecord.reset( - new MemObjRecord{InteropCtxPtr, LeafLimit, AllocateDependency}); + new MemObjRecord{InteropCtxPtr, Dev, LeafLimit, AllocateDependency}); getOrCreateAllocaForReq(MemObject->MRecord.get(), Req, InteropQueuePtr, ToEnqueue); } else MemObject->MRecord.reset(new MemObjRecord{Queue->getContextImplPtr(), + Queue->getDeviceImplPtr(), LeafLimit, AllocateDependency}); MMemObjs.push_back(MemObject); @@ -282,8 +316,8 @@ void Scheduler::GraphBuilder::addNodeToLeaves( UpdateHostRequirementCommand *Scheduler::GraphBuilder::insertUpdateHostReqCmd( MemObjRecord *Record, Requirement *Req, const QueueImplPtr &Queue, std::vector &ToEnqueue) { - AllocaCommandBase *AllocaCmd = - findAllocaForReq(Record, Req, Queue->getContextImplPtr()); + AllocaCommandBase *AllocaCmd = findAllocaForReq( + Record, Req, Queue->getContextImplPtr(), Queue->getDeviceImplPtr()); assert(AllocaCmd && "There must be alloca for requirement!"); UpdateHostRequirementCommand *UpdateCommand = new UpdateHostRequirementCommand(Queue, *Req, AllocaCmd, &Req->MData); @@ -291,8 +325,8 @@ UpdateHostRequirementCommand *Scheduler::GraphBuilder::insertUpdateHostReqCmd( // dependencies become invalid if requirement is stored by pointer. const Requirement *StoredReq = UpdateCommand->getRequirement(); - std::set Deps = - findDepsForReq(Record, Req, Queue->getContextImplPtr()); + std::set Deps = findDepsForReq( + Record, Req, Queue->getContextImplPtr(), Queue->getDeviceImplPtr()); std::vector ToCleanUp; for (Command *Dep : Deps) { Command *ConnCmd = @@ -345,8 +379,8 @@ Command *Scheduler::GraphBuilder::insertMemoryMove( if (!AllocaCmdDst) throw runtime_error("Out of host memory", PI_ERROR_OUT_OF_HOST_MEMORY); - std::set Deps = - findDepsForReq(Record, Req, Queue->getContextImplPtr()); + std::set Deps = findDepsForReq( + Record, Req, Queue->getContextImplPtr(), Queue->getDeviceImplPtr()); Deps.insert(AllocaCmdDst); // Get parent allocation of sub buffer to perform full copy of whole buffer if (IsSuitableSubReq(Req)) { @@ -356,14 +390,16 @@ Command *Scheduler::GraphBuilder::insertMemoryMove( } AllocaCommandBase *AllocaCmdSrc = - findAllocaForReq(Record, Req, Record->MCurContext); + findAllocaForReq(Record, Req, Record->MCurContext, Record->MCurDevice); if (!AllocaCmdSrc && IsSuitableSubReq(Req)) { // Since no alloca command for the sub buffer requirement was found in the // current context, need to find a parent alloca command for it (it must be // there) auto IsSuitableAlloca = [Record](AllocaCommandBase *AllocaCmd) { - bool Res = sameCtx(AllocaCmd->getQueue()->getContextImplPtr(), - Record->MCurContext) && + bool Res = getMemoryConnection(AllocaCmd->getQueue()->getDeviceImplPtr(), + AllocaCmd->getQueue()->getContextImplPtr(), + Record->MCurDevice, Record->MCurContext) == + PI_MEMORY_CONNECTION_UNIFIED && // Looking for a parent buffer alloca command AllocaCmd->getType() == Command::CommandType::ALLOCA; return Res; @@ -399,6 +435,7 @@ Command *Scheduler::GraphBuilder::insertMemoryMove( if ((Req->MAccessMode == access::mode::discard_write) || (Req->MAccessMode == access::mode::discard_read_write)) { Record->MCurContext = Queue->getContextImplPtr(); + Record->MCurDevice = Queue->getDeviceImplPtr(); return nullptr; } else { // Full copy of buffer is needed to avoid loss of data that may be caused @@ -421,6 +458,7 @@ Command 
*Scheduler::GraphBuilder::insertMemoryMove( for (Command *Cmd : ToCleanUp) cleanupCommand(Cmd); Record->MCurContext = Queue->getContextImplPtr(); + Record->MCurDevice = Queue->getDeviceImplPtr(); return NewCmd; } @@ -434,7 +472,8 @@ Command *Scheduler::GraphBuilder::remapMemoryObject( AllocaCommandBase *LinkedAllocaCmd = HostAllocaCmd->MLinkedAllocaCmd; assert(LinkedAllocaCmd && "Linked alloca command expected"); - std::set Deps = findDepsForReq(Record, Req, Record->MCurContext); + std::set Deps = + findDepsForReq(Record, Req, Record->MCurContext, Record->MCurDevice); UnMapMemObject *UnMapCmd = new UnMapMemObject( LinkedAllocaCmd, *LinkedAllocaCmd->getRequirement(), @@ -485,9 +524,10 @@ Scheduler::GraphBuilder::addCopyBack(Requirement *Req, return nullptr; std::set Deps = - findDepsForReq(Record, Req, HostQueue->getContextImplPtr()); + findDepsForReq(Record, Req, HostQueue->getContextImplPtr(), + HostQueue->getDeviceImplPtr()); AllocaCommandBase *SrcAllocaCmd = - findAllocaForReq(Record, Req, Record->MCurContext); + findAllocaForReq(Record, Req, Record->MCurContext, Record->MCurDevice); auto MemCpyCmdUniquePtr = std::make_unique( *SrcAllocaCmd->getRequirement(), SrcAllocaCmd, *Req, &Req->MData, @@ -572,11 +612,10 @@ Command *Scheduler::GraphBuilder::addCGUpdateHost( /// /// 1. New and examined commands only read -> can bypass /// 2. New and examined commands has non-overlapping requirements -> can bypass -/// 3. New and examined commands have different contexts -> cannot bypass -std::set -Scheduler::GraphBuilder::findDepsForReq(MemObjRecord *Record, - const Requirement *Req, - const ContextImplPtr &Context) { +/// 3. New and examined commands have different devices -> cannot bypass +std::set Scheduler::GraphBuilder::findDepsForReq( + MemObjRecord *Record, const Requirement *Req, const ContextImplPtr &Context, + const DeviceImplPtr &Device) { std::set RetDeps; std::vector Visited; const bool ReadOnlyReq = Req->MAccessMode == access::mode::read; @@ -607,10 +646,13 @@ Scheduler::GraphBuilder::findDepsForReq(MemObjRecord *Record, // If not overlap CanBypassDep |= !doOverlap(Dep.MDepRequirement, Req); - // Going through copying memory between contexts is not supported. + // Going through copying memory between devices is not supported. if (Dep.MDepCommand) - CanBypassDep &= - sameCtx(Context, Dep.MDepCommand->getQueue()->getContextImplPtr()); + CanBypassDep &= getMemoryConnection( + Device, Context, + Dep.MDepCommand->getQueue()->getDeviceImplPtr(), + Dep.MDepCommand->getQueue()->getContextImplPtr()) == + PI_MEMORY_CONNECTION_UNIFIED; if (!CanBypassDep) { RetDeps.insert(DepCmd); @@ -646,10 +688,13 @@ DepDesc Scheduler::GraphBuilder::findDepForRecord(Command *Cmd, // requirement. 
AllocaCommandBase *Scheduler::GraphBuilder::findAllocaForReq( MemObjRecord *Record, const Requirement *Req, const ContextImplPtr &Context, - bool AllowConst) { - auto IsSuitableAlloca = [&Context, Req, + const DeviceImplPtr &Device, bool AllowConst) { + auto IsSuitableAlloca = [&Device, &Context, Req, AllowConst](AllocaCommandBase *AllocaCmd) { - bool Res = sameCtx(AllocaCmd->getQueue()->getContextImplPtr(), Context); + auto &Queue = AllocaCmd->getQueue(); + bool Res = getMemoryConnection(Queue->getDeviceImplPtr(), + Queue->getContextImplPtr(), Device, + Context) == PI_MEMORY_CONNECTION_UNIFIED; if (IsSuitableSubReq(Req)) { const Requirement *TmpReq = AllocaCmd->getRequirement(); Res &= AllocaCmd->getType() == Command::CommandType::ALLOCA_SUB_BUF; @@ -687,8 +732,9 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq( MemObjRecord *Record, const Requirement *Req, const QueueImplPtr &Queue, std::vector &ToEnqueue) { - AllocaCommandBase *AllocaCmd = findAllocaForReq( - Record, Req, Queue->getContextImplPtr(), /*AllowConst=*/false); + AllocaCommandBase *AllocaCmd = + findAllocaForReq(Record, Req, Queue->getContextImplPtr(), + Queue->getDeviceImplPtr(), /*AllowConst=*/false); if (!AllocaCmd) { std::vector ToCleanUp; @@ -747,6 +793,7 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq( Record->MWriteLeaves.push_back(HostAllocaCmd, ToEnqueue); ++(HostAllocaCmd->MLeafCounter); Record->MCurContext = DefaultHostQueue->getContextImplPtr(); + Record->MCurDevice = DefaultHostQueue->getDeviceImplPtr(); } } } else { @@ -774,8 +821,9 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq( Queue->is_host() ? checkHostUnifiedMemory(Record->MCurContext) : HostUnifiedMemory; if (PinnedHostMemory || HostUnifiedMemoryOnNonHostDevice) { - AllocaCommandBase *LinkedAllocaCmdCand = findAllocaForReq( - Record, Req, Record->MCurContext, /*AllowConst=*/false); + AllocaCommandBase *LinkedAllocaCmdCand = + findAllocaForReq(Record, Req, Record->MCurContext, + Record->MCurDevice, /*AllowConst=*/false); // Cannot setup link if candidate is linked already if (LinkedAllocaCmdCand && @@ -816,9 +864,11 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq( } else { LinkedAllocaCmd->MIsActive = false; Record->MCurContext = Queue->getContextImplPtr(); + Record->MCurDevice = Queue->getDeviceImplPtr(); std::set Deps = - findDepsForReq(Record, Req, Queue->getContextImplPtr()); + findDepsForReq(Record, Req, Queue->getContextImplPtr(), + Queue->getDeviceImplPtr()); for (Command *Dep : Deps) { Command *ConnCmd = AllocaCmd->addDep( DepDesc{Dep, Req, LinkedAllocaCmd}, ToCleanUp); @@ -1025,7 +1075,7 @@ void Scheduler::GraphBuilder::createGraphForCommand( MemObjRecord *Record = nullptr; AllocaCommandBase *AllocaCmd = nullptr; - bool isSameCtx = false; + pi_memory_connection memoryConnection = PI_MEMORY_CONNECTION_NONE; { const QueueImplPtr &QueueForAlloca = @@ -1037,13 +1087,15 @@ void Scheduler::GraphBuilder::createGraphForCommand( AllocaCmd = getOrCreateAllocaForReq(Record, Req, QueueForAlloca, ToEnqueue); - isSameCtx = - sameCtx(QueueForAlloca->getContextImplPtr(), Record->MCurContext); + memoryConnection = + getMemoryConnection(QueueForAlloca->getDeviceImplPtr(), + QueueForAlloca->getContextImplPtr(), + Record->MCurDevice, Record->MCurContext); } // If there is alloca command we need to check if the latest memory is in // required context. 
- if (isSameCtx) { + if (memoryConnection == PI_MEMORY_CONNECTION_UNIFIED) { // If the memory is already in the required host context, check if the // required access mode is valid, remap if not. if (Record->MCurContext->is_host() && @@ -1052,13 +1104,16 @@ void Scheduler::GraphBuilder::createGraphForCommand( } else { // Cannot directly copy memory from OpenCL device to OpenCL device - // create two copies: device->host and host->device. - bool NeedMemMoveToHost = false; + bool NeedMemMoveToHost = memoryConnection == PI_MEMORY_CONNECTION_NONE; auto MemMoveTargetQueue = Queue; if (isInteropTask) { const detail::CGHostTask &HT = static_cast(CG); - if (HT.MQueue->getContextImplPtr() != Record->MCurContext) { + if (getMemoryConnection(HT.MQueue->getDeviceImplPtr(), + HT.MQueue->getContextImplPtr(), + Record->MCurDevice, Record->MCurContext) != + PI_MEMORY_CONNECTION_UNIFIED) { NeedMemMoveToHost = true; MemMoveTargetQueue = HT.MQueue; } @@ -1071,8 +1126,8 @@ void Scheduler::GraphBuilder::createGraphForCommand( ToEnqueue); insertMemoryMove(Record, Req, MemMoveTargetQueue, ToEnqueue); } - std::set Deps = - findDepsForReq(Record, Req, Queue->getContextImplPtr()); + std::set Deps = findDepsForReq( + Record, Req, Queue->getContextImplPtr(), Queue->getDeviceImplPtr()); for (Command *Dep : Deps) { if (Dep != NewCmd) { diff --git a/sycl/source/detail/scheduler/scheduler.hpp b/sycl/source/detail/scheduler/scheduler.hpp index fe7c5f7fbbf8e..78a968f4ab05b 100644 --- a/sycl/source/detail/scheduler/scheduler.hpp +++ b/sycl/source/detail/scheduler/scheduler.hpp @@ -174,11 +174,14 @@ class MockScheduler; namespace sycl { __SYCL_INLINE_VER_NAMESPACE(_V1) { namespace detail { + +class device_impl; class queue_impl; class event_impl; class context_impl; class DispatchHostTask; +using DeviceImplPtr = std::shared_ptr; using ContextImplPtr = std::shared_ptr; using EventImplPtr = std::shared_ptr; using QueueImplPtr = std::shared_ptr; @@ -196,10 +199,11 @@ using FusionMap = std::unordered_map; /// /// \ingroup sycl_graph struct MemObjRecord { - MemObjRecord(ContextImplPtr Ctx, std::size_t LeafLimit, + MemObjRecord(ContextImplPtr Ctx, DeviceImplPtr Dev, std::size_t LeafLimit, LeavesCollection::AllocateDependencyF AllocateDependency) : MReadLeaves{this, LeafLimit, AllocateDependency}, - MWriteLeaves{this, LeafLimit, AllocateDependency}, MCurContext{Ctx} {} + MWriteLeaves{this, LeafLimit, AllocateDependency}, MCurContext{Ctx}, + MCurDevice{Dev} {} // Contains all allocation commands for the memory object. std::vector MAllocaCommands; @@ -213,6 +217,9 @@ struct MemObjRecord { // The context which has the latest state of the memory object. ContextImplPtr MCurContext; + // The device which has the latest state of the memory object. + DeviceImplPtr MCurDevice; + // The mode this object can be accessed with from the host context. // Valid only if the current context is host. access::mode MHostAccess = access::mode::read_write; @@ -652,7 +659,8 @@ class Scheduler { /// Finds dependencies for the requirement. 
std::set findDepsForReq(MemObjRecord *Record, const Requirement *Req, - const ContextImplPtr &Context); + const ContextImplPtr &Context, + const DeviceImplPtr &Device); EmptyCommand *addEmptyCmd(Command *Cmd, const std::vector &Req, @@ -675,6 +683,7 @@ class Scheduler { AllocaCommandBase *findAllocaForReq(MemObjRecord *Record, const Requirement *Req, const ContextImplPtr &Context, + const DeviceImplPtr &Device, bool AllowConst = true); friend class Command; diff --git a/sycl/source/detail/sycl_mem_obj_i.hpp b/sycl/source/detail/sycl_mem_obj_i.hpp index e3dfa11fe35a1..0fe320c99c55e 100644 --- a/sycl/source/detail/sycl_mem_obj_i.hpp +++ b/sycl/source/detail/sycl_mem_obj_i.hpp @@ -18,10 +18,12 @@ namespace detail { class event_impl; class context_impl; +class device_impl; struct MemObjRecord; using EventImplPtr = std::shared_ptr; using ContextImplPtr = std::shared_ptr; +using DeviceImplPtr = std::shared_ptr; // The class serves as an interface in the scheduler for all SYCL memory // objects. @@ -43,8 +45,9 @@ class SYCLMemObjI { // Non null HostPtr requires allocation to be made with USE_HOST_PTR property. // Method returns a pointer to host allocation if Context is host one and // cl_mem obect if not. - virtual void *allocateMem(ContextImplPtr Context, bool InitFromUserData, - void *HostPtr, RT::PiEvent &InteropEvent) = 0; + virtual void *allocateMem(ContextImplPtr Context, DeviceImplPtr Device, + bool InitFromUserData, void *HostPtr, + RT::PiEvent &InteropEvent) = 0; // Should be used for memory object created without use_host_ptr property. virtual void *allocateHostMem() = 0; diff --git a/sycl/source/detail/sycl_mem_obj_t.hpp b/sycl/source/detail/sycl_mem_obj_t.hpp index dfd01b88c5a5a..abc9198ca5d12 100644 --- a/sycl/source/detail/sycl_mem_obj_t.hpp +++ b/sycl/source/detail/sycl_mem_obj_t.hpp @@ -238,12 +238,24 @@ class __SYCL_EXPORT SYCLMemObjT : public SYCLMemObjI { static size_t getBufSizeForContext(const ContextImplPtr &Context, pi_native_handle MemObject); - void *allocateMem(ContextImplPtr Context, bool InitFromUserData, - void *HostPtr, RT::PiEvent &InteropEvent) override { + // FIXME: Remove this when the class is removed from __SYCL_EXPORT + virtual void *allocateMem(ContextImplPtr Context, bool InitFromUserData, + void *HostPtr, RT::PiEvent &InteropEvent) { (void)Context; (void)InitFromUserData; (void)HostPtr; (void)InteropEvent; + assert(false && "Deprecated: use the overload with the device parameter"); + } + + void *allocateMem(ContextImplPtr Context, DeviceImplPtr Device, + bool InitFromUserData, void *HostPtr, + RT::PiEvent &InteropEvent) override { + (void)Context; + (void)Device; + (void)InitFromUserData; + (void)HostPtr; + (void)InteropEvent; throw runtime_error("Not implemented", PI_ERROR_INVALID_OPERATION); } diff --git a/sycl/test-e2e/Basic/alloc_pinned_host_memory.cpp b/sycl/test-e2e/Basic/alloc_pinned_host_memory.cpp index 0d8a0267504c0..16e64c3eb8aad 100644 --- a/sycl/test-e2e/Basic/alloc_pinned_host_memory.cpp +++ b/sycl/test-e2e/Basic/alloc_pinned_host_memory.cpp @@ -39,4 +39,5 @@ int main() { // CHECK:---> piMemBufferCreate // CHECK:---> piMemBufferCreate // CHECK-NEXT: {{.*}} : {{.*}} +// CHECK-NEXT: {{.*}} : {{.*}} // CHECK-NEXT: {{.*}} : 17 diff --git a/sycl/test-e2e/Basic/buffer/native_buffer_creation_flags.cpp b/sycl/test-e2e/Basic/buffer/native_buffer_creation_flags.cpp index bb29cef6bcc42..3b2655cf3df21 100644 --- a/sycl/test-e2e/Basic/buffer/native_buffer_creation_flags.cpp +++ b/sycl/test-e2e/Basic/buffer/native_buffer_creation_flags.cpp @@ -21,6 +21,7 
@@ int main() { // buffer is created with the PI_MEM_FLAGS_HOST_PTR_USE flag. // CHECK: piMemBufferCreate // CHECK-NEXT: {{.*}} : {{.*}} + // CHECK-NEXT: {{.*}} : {{.*}} // CHECK-NEXT: {{.*}} : 9 auto BufAcc = Buf.get_access(Cgh); Cgh.single_task([=]() { int A = BufAcc[0]; }); diff --git a/sycl/test-e2e/Basic/use_pinned_host_memory.cpp b/sycl/test-e2e/Basic/use_pinned_host_memory.cpp index 9efb9115b9c10..18378945ede65 100644 --- a/sycl/test-e2e/Basic/use_pinned_host_memory.cpp +++ b/sycl/test-e2e/Basic/use_pinned_host_memory.cpp @@ -43,4 +43,5 @@ int main() { // CHECK:---> piMemBufferCreate // CHECK-NEXT: {{.*}} : {{.*}} +// CHECK-NEXT: {{.*}} : {{.*}} // CHECK-NEXT: {{.*}} : 17 diff --git a/sycl/test-e2e/Tracing/pi_tracing_test.cpp b/sycl/test-e2e/Tracing/pi_tracing_test.cpp index c91e93cd21856..869a612921892 100644 --- a/sycl/test-e2e/Tracing/pi_tracing_test.cpp +++ b/sycl/test-e2e/Tracing/pi_tracing_test.cpp @@ -13,6 +13,7 @@ // CHECK: // CHECK: ---> piMemBufferCreate( // CHECK-NEXT: : {{0[xX]?[0-9a-fA-F]*}} +// CHECK-NEXT: : {{0[xX]?[0-9a-fA-F]*}} // CHECK-NEXT: : 1 // CHECK-NEXT: : 40 // CHECK-NEXT: : 0 diff --git a/sycl/test/abi/pi_level_zero_symbol_check.dump b/sycl/test/abi/pi_level_zero_symbol_check.dump index 4363024cda8a5..3c3ca21eee156 100644 --- a/sycl/test/abi/pi_level_zero_symbol_check.dump +++ b/sycl/test/abi/pi_level_zero_symbol_check.dump @@ -94,6 +94,7 @@ piextEnqueueWriteHostPipe piextEventCreateWithNativeHandle piextEventGetNativeHandle piextGetDeviceFunctionPointer +piextGetMemoryConnection piextKernelCreateWithNativeHandle piextKernelGetNativeHandle piextKernelSetArgMemObj diff --git a/sycl/test/abi/pi_opencl_symbol_check.dump b/sycl/test/abi/pi_opencl_symbol_check.dump index 9707c21163b95..79bfb8e299d0e 100644 --- a/sycl/test/abi/pi_opencl_symbol_check.dump +++ b/sycl/test/abi/pi_opencl_symbol_check.dump @@ -42,6 +42,7 @@ piextEnqueueReadHostPipe piextEnqueueWriteHostPipe piextEventCreateWithNativeHandle piextGetDeviceFunctionPointer +piextGetMemoryConnection piextKernelCreateWithNativeHandle piextKernelGetNativeHandle piextKernelSetArgMemObj diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump index d96b589b9a846..f1b5fc606bca8 100644 --- a/sycl/test/abi/sycl_symbols_linux.dump +++ b/sycl/test/abi/sycl_symbols_linux.dump @@ -3719,6 +3719,7 @@ _ZN4sycl3_V16ONEAPI15filter_selectorC1ERKNSt7__cxx1112basic_stringIcSt11char_tra _ZN4sycl3_V16ONEAPI15filter_selectorC2ERKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEE _ZN4sycl3_V16detail10build_implERKNS0_13kernel_bundleILNS0_12bundle_stateE0EEERKSt6vectorINS0_6deviceESaIS8_EERKNS0_13property_listE _ZN4sycl3_V16detail10image_impl10getDevicesESt10shared_ptrINS1_12context_implEE +_ZN4sycl3_V16detail10image_impl11allocateMemESt10shared_ptrINS1_12context_implEES3_INS1_11device_implEEbPvRP9_pi_event _ZN4sycl3_V16detail10image_impl11allocateMemESt10shared_ptrINS1_12context_implEEbPvRP9_pi_event _ZN4sycl3_V16detail10image_impl14checkImageDescERK14_pi_image_descSt10shared_ptrINS1_12context_implEEPv _ZN4sycl3_V16detail10image_impl16checkImageFormatERK16_pi_image_formatSt10shared_ptrINS1_12context_implEE @@ -3737,6 +3738,7 @@ _ZN4sycl3_V16detail11SYCLMemObjTC1EmRKNS0_7contextEbNS0_5eventESt10unique_ptrINS _ZN4sycl3_V16detail11SYCLMemObjTC1EmRKNS0_7contextEmNS0_5eventESt10unique_ptrINS1_19SYCLMemObjAllocatorESt14default_deleteIS8_EE _ZN4sycl3_V16detail11SYCLMemObjTC2EmRKNS0_7contextEbNS0_5eventESt10unique_ptrINS1_19SYCLMemObjAllocatorESt14default_deleteIS8_EE 
_ZN4sycl3_V16detail11SYCLMemObjTC2EmRKNS0_7contextEmNS0_5eventESt10unique_ptrINS1_19SYCLMemObjAllocatorESt14default_deleteIS8_EE +_ZN4sycl3_V16detail11buffer_impl11allocateMemESt10shared_ptrINS1_12context_implEES3_INS1_11device_implEEbPvRP9_pi_event _ZN4sycl3_V16detail11buffer_impl11allocateMemESt10shared_ptrINS1_12context_implEEbPvRP9_pi_event _ZN4sycl3_V16detail11buffer_impl22destructorNotificationEPv _ZN4sycl3_V16detail11buffer_impl23constructorNotificationERKNS1_13code_locationEPvPKvS8_jjPm @@ -3808,10 +3810,14 @@ _ZN4sycl3_V16detail13MemoryManager12prefetch_usmEPvSt10shared_ptrINS1_10queue_im _ZN4sycl3_V16detail13MemoryManager13memset_2d_usmEPvSt10shared_ptrINS1_10queue_implEEmmmcSt6vectorIP9_pi_eventSaIS9_EEPS9_ _ZN4sycl3_V16detail13MemoryManager13releaseMemObjESt10shared_ptrINS1_12context_implEEPNS1_11SYCLMemObjIEPvS8_ _ZN4sycl3_V16detail13MemoryManager16allocateMemImageESt10shared_ptrINS1_12context_implEEPNS1_11SYCLMemObjIEPvbmRK14_pi_image_descRK16_pi_image_formatRKS3_INS1_10event_implEERKS5_RKNS0_13property_listERP9_pi_event +_ZN4sycl3_V16detail13MemoryManager16allocateMemImageESt10shared_ptrINS1_12context_implEES3_INS1_11device_implEEPNS1_11SYCLMemObjIEPvbmRK14_pi_image_descRK16_pi_image_formatRKS3_INS1_10event_implEERKS5_RKNS0_13property_listERP9_pi_event _ZN4sycl3_V16detail13MemoryManager17allocateMemBufferESt10shared_ptrINS1_12context_implEEPNS1_11SYCLMemObjIEPvbmRKS3_INS1_10event_implEERKS5_RKNS0_13property_listERP9_pi_event +_ZN4sycl3_V16detail13MemoryManager17allocateMemBufferESt10shared_ptrINS1_12context_implEES3_INS1_11device_implEEPNS1_11SYCLMemObjIEPvbmRKS3_INS1_10event_implEERKS5_RKNS0_13property_listERP9_pi_event _ZN4sycl3_V16detail13MemoryManager18allocateHostMemoryEPNS1_11SYCLMemObjIEPvbmRKNS0_13property_listE _ZN4sycl3_V16detail13MemoryManager19allocateImageObjectESt10shared_ptrINS1_12context_implEEPvbRK14_pi_image_descRK16_pi_image_formatRKNS0_13property_listE +_ZN4sycl3_V16detail13MemoryManager19allocateImageObjectESt10shared_ptrINS1_12context_implEES3_INS1_11device_implEEPvbRK14_pi_image_descRK16_pi_image_formatRKNS0_13property_listE _ZN4sycl3_V16detail13MemoryManager20allocateBufferObjectESt10shared_ptrINS1_12context_implEEPvbmRKNS0_13property_listE +_ZN4sycl3_V16detail13MemoryManager20allocateBufferObjectESt10shared_ptrINS1_12context_implEES3_INS1_11device_implEEPvbmRKNS0_13property_listE _ZN4sycl3_V16detail13MemoryManager20allocateMemSubBufferESt10shared_ptrINS1_12context_implEEPvmmNS0_5rangeILi3EEESt6vectorIS3_INS1_10event_implEESaISB_EERP9_pi_event _ZN4sycl3_V16detail13MemoryManager21copy_to_device_globalEPKvbSt10shared_ptrINS1_10queue_implEEmmS4_lRKSt6vectorIP9_pi_eventSaISA_EEPSA_ _ZN4sycl3_V16detail13MemoryManager23copy_from_device_globalEPKvbSt10shared_ptrINS1_10queue_implEEmmPvlRKSt6vectorIP9_pi_eventSaISB_EEPSB_ @@ -3822,6 +3828,7 @@ _ZN4sycl3_V16detail13MemoryManager4fillEPNS1_11SYCLMemObjIEPvSt10shared_ptrINS1_ _ZN4sycl3_V16detail13MemoryManager5unmapEPNS1_11SYCLMemObjIEPvSt10shared_ptrINS1_10queue_implEES5_St6vectorIP9_pi_eventSaISB_EERSB_ _ZN4sycl3_V16detail13MemoryManager7releaseESt10shared_ptrINS1_12context_implEEPNS1_11SYCLMemObjIEPvSt6vectorIS3_INS1_10event_implEESaISB_EERP9_pi_event _ZN4sycl3_V16detail13MemoryManager8allocateESt10shared_ptrINS1_12context_implEEPNS1_11SYCLMemObjIEbPvSt6vectorIS3_INS1_10event_implEESaISB_EERP9_pi_event +_ZN4sycl3_V16detail13MemoryManager8allocateESt10shared_ptrINS1_12context_implEES3_INS1_11device_implEEPNS1_11SYCLMemObjIEbPvSt6vectorIS3_INS1_10event_implEESaISD_EERP9_pi_event 
_ZN4sycl3_V16detail13MemoryManager8copy_usmEPKvSt10shared_ptrINS1_10queue_implEEmPvSt6vectorIP9_pi_eventSaISB_EEPSB_ _ZN4sycl3_V16detail13MemoryManager8fill_usmEPvSt10shared_ptrINS1_10queue_implEEmiSt6vectorIP9_pi_eventSaIS9_EEPS9_ _ZN4sycl3_V16detail13host_pipe_map3addEPKvPKc diff --git a/sycl/tools/sycl-sanitize/collector.cpp b/sycl/tools/sycl-sanitize/collector.cpp index 86b5bb74b6939..9f4b11a718937 100644 --- a/sycl/tools/sycl-sanitize/collector.cpp +++ b/sycl/tools/sycl-sanitize/collector.cpp @@ -94,8 +94,8 @@ static void handleUSMFree(const pi_plugin &, std::optional, } static void handleMemBufferCreate(const pi_plugin &, std::optional, - pi_context, pi_mem_flags, size_t Size, - void *HostPtr, pi_mem *, + pi_context, pi_device, pi_mem_flags, + size_t Size, void *HostPtr, pi_mem *, const pi_mem_properties *) { for (const auto &Alloc : GS->ActivePointers) { const void *Begin = Alloc.first; diff --git a/sycl/unittests/SYCL2020/GetNativeOpenCL.cpp b/sycl/unittests/SYCL2020/GetNativeOpenCL.cpp index e89989c8d9b0d..3ea27f72457da 100644 --- a/sycl/unittests/SYCL2020/GetNativeOpenCL.cpp +++ b/sycl/unittests/SYCL2020/GetNativeOpenCL.cpp @@ -57,8 +57,8 @@ static pi_result redefinedMemRetain(pi_mem c) { return PI_SUCCESS; } -pi_result redefinedMemBufferCreate(pi_context, pi_mem_flags, size_t size, - void *, pi_mem *, +pi_result redefinedMemBufferCreate(pi_context, pi_device, pi_mem_flags, + size_t size, void *, pi_mem *, const pi_mem_properties *) { return PI_SUCCESS; } diff --git a/sycl/unittests/assert/assert.cpp b/sycl/unittests/assert/assert.cpp index 3a9482193fbba..90c522c2b30a8 100644 --- a/sycl/unittests/assert/assert.cpp +++ b/sycl/unittests/assert/assert.cpp @@ -205,15 +205,15 @@ redefinedEnqueueKernelLaunchAfter(pi_queue, pi_kernel, pi_uint32, static pi_result redefinedEventsWaitPositive(pi_uint32 num_events, const pi_event *event_list) { - // there should be two events: one is for memory map and the other is for - // copier kernel - assert(num_events == 2); + std::stringstream s("Waiting for events:"); + for (pi_uint32 i = 0; i < num_events; ++i) { + s << " " << reinterpret_cast(event_list[i])[0]; + } + s << "\n"; - int EventIdx1 = reinterpret_cast(event_list[0])[0]; - int EventIdx2 = reinterpret_cast(event_list[1])[0]; // This output here is to reduce amount of time requried to debug/reproduce // a failing test upon feature break - printf("Waiting for events %i, %i\n", EventIdx1, EventIdx2); + printf("%s", s.str().c_str()); return PI_SUCCESS; } diff --git a/sycl/unittests/buffer/BufferLocation.cpp b/sycl/unittests/buffer/BufferLocation.cpp index 77d674f602641..540342116e81d 100644 --- a/sycl/unittests/buffer/BufferLocation.cpp +++ b/sycl/unittests/buffer/BufferLocation.cpp @@ -20,8 +20,8 @@ const uint64_t DEFAULT_VALUE = 7777; static uint64_t PassedLocation = DEFAULT_VALUE; -pi_result redefinedMemBufferCreate(pi_context, pi_mem_flags, size_t size, - void *, pi_mem *, +pi_result redefinedMemBufferCreate(pi_context, pi_device, pi_mem_flags, + size_t size, void *, pi_mem *, const pi_mem_properties *properties) { PassedLocation = DEFAULT_VALUE; if (!properties) diff --git a/sycl/unittests/event/EventDestruction.cpp b/sycl/unittests/event/EventDestruction.cpp index 20427b7623cf8..f1b3377510966 100644 --- a/sycl/unittests/event/EventDestruction.cpp +++ b/sycl/unittests/event/EventDestruction.cpp @@ -21,8 +21,8 @@ static pi_result redefinedEventRelease(pi_event event) { return PI_SUCCESS; } -pi_result redefinedMemBufferCreate(pi_context, pi_mem_flags, size_t size, - void *, pi_mem *, 
+pi_result redefinedMemBufferCreate(pi_context, pi_device, pi_mem_flags, + size_t size, void *, pi_mem *, const pi_mem_properties *) { return PI_SUCCESS; } diff --git a/sycl/unittests/helpers/PiMockPlugin.hpp b/sycl/unittests/helpers/PiMockPlugin.hpp index 0faa30e9b7407..1121f28fee7d2 100644 --- a/sycl/unittests/helpers/PiMockPlugin.hpp +++ b/sycl/unittests/helpers/PiMockPlugin.hpp @@ -399,8 +399,8 @@ inline pi_result mock_piextQueueCreateWithNativeHandle( // Memory // inline pi_result -mock_piMemBufferCreate(pi_context context, pi_mem_flags flags, size_t size, - void *host_ptr, pi_mem *ret_mem, +mock_piMemBufferCreate(pi_context context, pi_device device, pi_mem_flags flags, + size_t size, void *host_ptr, pi_mem *ret_mem, const pi_mem_properties *properties = nullptr) { if (host_ptr && flags & PI_MEM_FLAGS_HOST_PTR_USE) *ret_mem = createDummyHandleWithData( @@ -410,7 +410,8 @@ mock_piMemBufferCreate(pi_context context, pi_mem_flags flags, size_t size, return PI_SUCCESS; } -inline pi_result mock_piMemImageCreate(pi_context context, pi_mem_flags flags, +inline pi_result mock_piMemImageCreate(pi_context context, pi_device device, + pi_mem_flags flags, const pi_image_format *image_format, const pi_image_desc *image_desc, void *host_ptr, pi_mem *ret_mem) { @@ -476,6 +477,13 @@ mock_piextMemCreateWithNativeHandle(pi_native_handle nativeHandle, return PI_SUCCESS; } +inline pi_result mock_piextGetMemoryConnection(pi_device dev1, pi_context ctx1, + pi_device dev2, pi_context ctx2, + pi_memory_connection *res) { + *res = PI_MEMORY_CONNECTION_NONE; + return PI_SUCCESS; +} + // // Program // diff --git a/sycl/unittests/pi/EnqueueMemTest.cpp b/sycl/unittests/pi/EnqueueMemTest.cpp index 52f52e52ea15d..7ae5cd9dbdd62 100644 --- a/sycl/unittests/pi/EnqueueMemTest.cpp +++ b/sycl/unittests/pi/EnqueueMemTest.cpp @@ -51,7 +51,7 @@ class EnqueueMemTest : public testing::TestWithParam { PI_SUCCESS); ASSERT_EQ((plugin.call_nocheck( - _context, PI_MEM_FLAGS_ACCESS_RW, + _context, _device, PI_MEM_FLAGS_ACCESS_RW, _numElementsX * _numElementsY * sizeof(pi_int32), nullptr, &_mem, nullptr)), PI_SUCCESS); diff --git a/sycl/unittests/pi/cuda/test_base_objects.cpp b/sycl/unittests/pi/cuda/test_base_objects.cpp index 9bcc9e9f24d56..b710ff5bbcced 100644 --- a/sycl/unittests/pi/cuda/test_base_objects.cpp +++ b/sycl/unittests/pi/cuda/test_base_objects.cpp @@ -71,13 +71,7 @@ TEST_F(CudaBaseObjectsTest, piContextCreate) { << "piContextCreate failed.\n"; EXPECT_NE(ctxt, nullptr); - EXPECT_EQ(ctxt->get_device(), device); - - // Retrieve the cuCtxt to check information is correct - CUcontext cudaContext = ctxt->get(); - unsigned int version = 0; - cuCtxGetApiVersion(cudaContext, &version); - EXPECT_EQ(version, LATEST_KNOWN_CUDA_DRIVER_API_VERSION); + EXPECT_EQ(ctxt->get_devices()[0], device); ASSERT_EQ((plugin->call_nocheck(ctxt)), PI_SUCCESS); @@ -110,7 +104,7 @@ TEST_F(CudaBaseObjectsTest, piContextCreateChildThread) { // Retrieve the cuCtxt to check information is correct auto checkValue = [=]() { - CUcontext cudaContext = ctxt->get(); + CUcontext cudaContext = device->get_context(); unsigned int version = 0; auto cuErr = cuCtxGetApiVersion(cudaContext, &version); EXPECT_EQ(cuErr, CUDA_SUCCESS); diff --git a/sycl/unittests/pi/cuda/test_commands.cpp b/sycl/unittests/pi/cuda/test_commands.cpp index 5a57aa8471f13..93553052a14eb 100644 --- a/sycl/unittests/pi/cuda/test_commands.cpp +++ b/sycl/unittests/pi/cuda/test_commands.cpp @@ -85,10 +85,10 @@ TEST_F(CudaCommandsTest, PIEnqueueReadBufferBlocking) { int output[memSize] = {}; 
diff --git a/sycl/unittests/pi/cuda/test_commands.cpp b/sycl/unittests/pi/cuda/test_commands.cpp
index 5a57aa8471f13..93553052a14eb 100644
--- a/sycl/unittests/pi/cuda/test_commands.cpp
+++ b/sycl/unittests/pi/cuda/test_commands.cpp
@@ -85,10 +85,10 @@ TEST_F(CudaCommandsTest, PIEnqueueReadBufferBlocking) {
   int output[memSize] = {};
 
   pi_mem memObj;
-  ASSERT_EQ(
-      (plugin->call_nocheck(
-          context_, PI_MEM_FLAGS_ACCESS_RW, bytes, nullptr, &memObj, nullptr)),
-      PI_SUCCESS);
+  ASSERT_EQ((plugin->call_nocheck(
+                context_, device_, PI_MEM_FLAGS_ACCESS_RW, bytes, nullptr,
+                &memObj, nullptr)),
+            PI_SUCCESS);
 
   ASSERT_EQ((plugin->call_nocheck(
                 queue_, memObj, true, 0, bytes, data, 0, nullptr, nullptr)),
@@ -115,10 +115,10 @@ TEST_F(CudaCommandsTest, PIEnqueueReadBufferNonBlocking) {
   int output[memSize] = {};
 
   pi_mem memObj;
-  ASSERT_EQ(
-      (plugin->call_nocheck(
-          context_, PI_MEM_FLAGS_ACCESS_RW, bytes, nullptr, &memObj, nullptr)),
-      PI_SUCCESS);
+  ASSERT_EQ((plugin->call_nocheck(
+                context_, device_, PI_MEM_FLAGS_ACCESS_RW, bytes, nullptr,
+                &memObj, nullptr)),
+            PI_SUCCESS);
 
   pi_event cpIn, cpOut;
   ASSERT_EQ((plugin->call_nocheck(
diff --git a/sycl/unittests/pi/cuda/test_contexts.cpp b/sycl/unittests/pi/cuda/test_contexts.cpp
index d021081716b9a..7411507e25343 100644
--- a/sycl/unittests/pi/cuda/test_contexts.cpp
+++ b/sycl/unittests/pi/cuda/test_contexts.cpp
@@ -87,14 +87,10 @@ TEST_F(CudaContextsTest, ContextLifetime) {
   // check that the context is now the active CUDA context
   CUcontext cudaCtxt = nullptr;
   cuCtxGetCurrent(&cudaCtxt);
-  ASSERT_EQ(cudaCtxt, context->get());
+  ASSERT_EQ(cudaCtxt, queue->get_native_context());
 
   plugin->call(queue);
   plugin->call(context);
-
-  // check that the context was cleaned up properly by the destructor
-  cuCtxGetCurrent(&cudaCtxt);
-  ASSERT_EQ(cudaCtxt, nullptr);
 }
 
 TEST_F(CudaContextsTest, ContextLifetimeExisting) {
@@ -126,124 +122,11 @@ TEST_F(CudaContextsTest, ContextLifetimeExisting) {
   // check that the context is now the active CUDA context
   cuCtxGetCurrent(&current);
-  ASSERT_EQ(current, context->get());
+  ASSERT_EQ(current, queue->get_native_context());
 
   plugin->call(queue);
   plugin->call(context);
 
-  // check that the context was cleaned up, the old context will be restored
-  // automatically by cuCtxDestroy in piContextRelease, as it was pushed on the
-  // stack bu cuCtxCreate
-  cuCtxGetCurrent(&current);
-  ASSERT_EQ(current, original);
-
   // release original context
   cuCtxDestroy(original);
 }
-
-// In some cases (for host_task), the SYCL runtime may call PI API functions
-// from threads of the thread pool, this can cause issues because with the CUDA
-// plugin these functions will set an active CUDA context on these threads, but
-// never clean it up, as it will only get cleaned up in the main thread.
-//
-// So the following test aims to reproduce the scenario where there is a
-// dangling deleted context in a separate thread and seeing if the PI calls are
-// still able to work correctly in that thread.
-TEST_F(CudaContextsTest, ContextThread) {
-  // start with no active context
-  pi::clearCudaContext();
-
-  // create two PI contexts
-  pi_context context1;
-  ASSERT_EQ((plugin->call_nocheck(
-                nullptr, 1, &device_, nullptr, nullptr, &context1)),
-            PI_SUCCESS);
-  ASSERT_NE(context1, nullptr);
-
-  pi_context context2;
-  ASSERT_EQ((plugin->call_nocheck(
-                nullptr, 1, &device_, nullptr, nullptr, &context2)),
-            PI_SUCCESS);
-  ASSERT_NE(context2, nullptr);
-
-  // setup synchronization variables between the main thread and the testing
-  // thread
-  std::mutex m;
-  std::condition_variable cv;
-  bool released = false;
-  bool thread_done = false;
-
-  // create a testing thread that will create a queue with the first context,
-  // release the queue, then wait for the main thread to release the first
-  // context, and then create and release another queue with the second context
-  // this time
-  auto test_thread = std::thread([&] {
-    CUcontext current = nullptr;
-
-    // create a queue with the first context
-    pi_queue queue;
-    ASSERT_EQ((plugin->call_nocheck(
-                  context1, device_, 0, &queue)),
-              PI_SUCCESS);
-    ASSERT_NE(queue, nullptr);
-
-    // ensure the queue has the correct context
-    ASSERT_EQ(context1, queue->get_context());
-
-    // check that the first context is now the active CUDA context
-    cuCtxGetCurrent(&current);
-    ASSERT_EQ(current, context1->get());
-
-    plugin->call(queue);
-
-    // mark the first set of processing as done and notify the main thread
-    std::unique_lock lock(m);
-    thread_done = true;
-    lock.unlock();
-    cv.notify_one();
-
-    // wait for the main thread to release the first context
-    lock.lock();
-    cv.wait(lock, [&] { return released; });
-
-    // check that the first context is still active, this is because deleting a
-    // context only cleans up the current thread
-    cuCtxGetCurrent(&current);
-    ASSERT_EQ(current, context1->get());
-
-    // create a queue with the second context
-    ASSERT_EQ((plugin->call_nocheck(
-                  context2, device_, 0, &queue)),
-              PI_SUCCESS);
-    ASSERT_NE(queue, nullptr);
-
-    // ensure the queue has the correct context
-    ASSERT_EQ(context2, queue->get_context());
-
-    // check that the second context is now the active CUDA context
-    cuCtxGetCurrent(&current);
-    ASSERT_EQ(current, context2->get());
-
-    plugin->call(queue);
-  });
-
-  // wait for the thread to be done with the first queue to release the first context
-  std::unique_lock lock(m);
-  cv.wait(lock, [&] { return thread_done; });
-  plugin->call(context1);
-
-  // notify the other thread that the context was released
-  released = true;
-  lock.unlock();
-  cv.notify_one();
-
-  // wait for the thread to finish
-  test_thread.join();
-
-  plugin->call(context2);
-
-  // check that there is no context set on the main thread
-  CUcontext current = nullptr;
-  cuCtxGetCurrent(&current);
-  ASSERT_EQ(current, nullptr);
-}
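With the native CUDA context now reachable through the device and the queue (device->get_context() and queue->get_native_context() above), the remaining lifetime tests no longer assert that releasing a pi_context clears the thread's active CUcontext; they only compare the driver's current context against what the queue reports. The essence of that check, condensed into a sketch (same local names as the updated ContextLifetime test, plus a check of the CUresult that the test itself ignores):

  CUcontext cudaCtxt = nullptr;
  ASSERT_EQ(cuCtxGetCurrent(&cudaCtxt), CUDA_SUCCESS);
  // The active driver context should be the queue's native context.
  ASSERT_EQ(cudaCtxt, queue->get_native_context());
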
diff --git a/sycl/unittests/pi/cuda/test_kernels.cpp b/sycl/unittests/pi/cuda/test_kernels.cpp
index d485bb218a1dc..e9c1a6b788655 100644
--- a/sycl/unittests/pi/cuda/test_kernels.cpp
+++ b/sycl/unittests/pi/cuda/test_kernels.cpp
@@ -306,8 +306,8 @@ TEST_F(CudaKernelsTest, PIKernelSetMemObj) {
   size_t memSize = 1024u;
   pi_mem memObj;
   ASSERT_EQ((plugin->call_nocheck(
-                context_, PI_MEM_FLAGS_ACCESS_RW, memSize, nullptr, &memObj,
-                nullptr)),
+                context_, device_, PI_MEM_FLAGS_ACCESS_RW, memSize, nullptr,
+                &memObj, nullptr)),
             PI_SUCCESS);
 
   ASSERT_EQ((plugin->call_nocheck(
@@ -342,8 +342,8 @@ TEST_F(CudaKernelsTest, PIkerneldispatch) {
   size_t memSize = 1024u;
   pi_mem memObj;
   ASSERT_EQ((plugin->call_nocheck(
-                context_, PI_MEM_FLAGS_ACCESS_RW, memSize, nullptr, &memObj,
-                nullptr)),
+                context_, device_, PI_MEM_FLAGS_ACCESS_RW, memSize, nullptr,
+                &memObj, nullptr)),
             PI_SUCCESS);
 
   ASSERT_EQ((plugin->call_nocheck(
@@ -386,14 +386,14 @@ TEST_F(CudaKernelsTest, PIkerneldispatchTwo) {
   size_t memSize = 1024u;
   pi_mem memObj;
   ASSERT_EQ((plugin->call_nocheck(
-                context_, PI_MEM_FLAGS_ACCESS_RW, memSize, nullptr, &memObj,
-                nullptr)),
+                context_, device_, PI_MEM_FLAGS_ACCESS_RW, memSize, nullptr,
+                &memObj, nullptr)),
             PI_SUCCESS);
 
   pi_mem memObj2;
   ASSERT_EQ((plugin->call_nocheck(
-                context_, PI_MEM_FLAGS_ACCESS_RW, memSize, nullptr, &memObj2,
-                nullptr)),
+                context_, device_, PI_MEM_FLAGS_ACCESS_RW, memSize, nullptr,
+                &memObj2, nullptr)),
             PI_SUCCESS);
 
   ASSERT_EQ((plugin->call_nocheck(
diff --git a/sycl/unittests/pi/cuda/test_mem_obj.cpp b/sycl/unittests/pi/cuda/test_mem_obj.cpp
index 46fc4a007526d..ba76c4a8bd400 100644
--- a/sycl/unittests/pi/cuda/test_mem_obj.cpp
+++ b/sycl/unittests/pi/cuda/test_mem_obj.cpp
@@ -75,8 +75,8 @@ TEST_F(CudaTestMemObj, piMemBufferCreateSimple) {
   const size_t memSize = 1024u;
   pi_mem memObj;
   ASSERT_EQ((plugin->call_nocheck(
-                context_, PI_MEM_FLAGS_ACCESS_RW, memSize, nullptr, &memObj,
-                nullptr)),
+                context_, device_, PI_MEM_FLAGS_ACCESS_RW, memSize, nullptr,
+                &memObj, nullptr)),
             PI_SUCCESS);
 
   ASSERT_EQ((plugin->call_nocheck(memObj)),
@@ -87,8 +87,9 @@ TEST_F(CudaTestMemObj, piMemBufferAllocHost) {
   const size_t memSize = 1024u;
   pi_mem memObj;
   ASSERT_EQ((plugin->call_nocheck(
-                context_, PI_MEM_FLAGS_ACCESS_RW | PI_MEM_FLAGS_HOST_PTR_ALLOC,
-                memSize, nullptr, &memObj, nullptr)),
+                context_, device_,
+                PI_MEM_FLAGS_ACCESS_RW | PI_MEM_FLAGS_HOST_PTR_ALLOC, memSize,
+                nullptr, &memObj, nullptr)),
             PI_SUCCESS);
 
   ASSERT_EQ((plugin->call_nocheck(memObj)),
@@ -116,8 +117,8 @@ TEST_F(CudaTestMemObj, piMemBufferCreateNoActiveContext) {
   // to allocate the memory object
   pi_mem memObj;
   ASSERT_EQ((plugin->call_nocheck(
-                context_, PI_MEM_FLAGS_ACCESS_RW, memSize, nullptr, &memObj,
-                nullptr)),
+                context_, device_, PI_MEM_FLAGS_ACCESS_RW, memSize, nullptr,
+                &memObj, nullptr)),
             PI_SUCCESS);
 
   ASSERT_NE(memObj, nullptr);
@@ -138,8 +139,9 @@ TEST_F(CudaTestMemObj, piMemBufferPinnedMappedRead) {
 
   pi_mem memObj;
   ASSERT_EQ((plugin->call_nocheck(
-                context_, PI_MEM_FLAGS_ACCESS_RW | PI_MEM_FLAGS_HOST_PTR_ALLOC,
-                memSize, nullptr, &memObj, nullptr)),
+                context_, device_,
+                PI_MEM_FLAGS_ACCESS_RW | PI_MEM_FLAGS_HOST_PTR_ALLOC, memSize,
+                nullptr, &memObj, nullptr)),
             PI_SUCCESS);
 
   ASSERT_EQ(
@@ -177,8 +179,9 @@ TEST_F(CudaTestMemObj, piMemBufferPinnedMappedWrite) {
 
   pi_mem memObj;
   ASSERT_EQ((plugin->call_nocheck(
-                context_, PI_MEM_FLAGS_ACCESS_RW | PI_MEM_FLAGS_HOST_PTR_ALLOC,
-                memSize, nullptr, &memObj, nullptr)),
+                context_, device_,
+                PI_MEM_FLAGS_ACCESS_RW | PI_MEM_FLAGS_HOST_PTR_ALLOC, memSize,
+                nullptr, &memObj, nullptr)),
             PI_SUCCESS);
 
   int *host_ptr = nullptr;
diff --git a/sycl/unittests/scheduler/InOrderQueueDeps.cpp b/sycl/unittests/scheduler/InOrderQueueDeps.cpp
index be5e8d874da17..52d807d9a6773 100644
--- a/sycl/unittests/scheduler/InOrderQueueDeps.cpp
+++ b/sycl/unittests/scheduler/InOrderQueueDeps.cpp
@@ -17,8 +17,8 @@ using namespace sycl;
 
 static pi_result
-redefinedMemBufferCreate(pi_context context, pi_mem_flags flags, size_t size,
-                         void *host_ptr, pi_mem *ret_mem,
+redefinedMemBufferCreate(pi_context context, pi_device, pi_mem_flags flags,
+                         size_t size, void *host_ptr, pi_mem *ret_mem,
                          const pi_mem_properties *properties = nullptr) {
   return PI_SUCCESS;
 }
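The scheduler-side hunks that follow thread a device through the code paths that previously carried only a context: allocateMem takes an extra DeviceImplPtr and MemObjRecord is now constructed with the queue's device as well as its context. A sketch of the updated record construction (mirroring the LinkedAllocaDependencies change below; using std::make_shared instead of the raw new expression is the only liberty taken):

  // Build a record keyed on both the context and the device of the queue.
  auto Record = std::make_shared<sycl::detail::MemObjRecord>(
      DefaultHostQueue->getContextImplPtr(),
      DefaultHostQueue->getDeviceImplPtr(), 10, AllocaDep);
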
diff --git a/sycl/unittests/scheduler/LinkedAllocaDependencies.cpp b/sycl/unittests/scheduler/LinkedAllocaDependencies.cpp
index b9a4726acb7fd..0a7a90c78767c 100644
--- a/sycl/unittests/scheduler/LinkedAllocaDependencies.cpp
+++ b/sycl/unittests/scheduler/LinkedAllocaDependencies.cpp
@@ -16,6 +16,7 @@ using namespace sycl;
 class MemObjMock : public sycl::detail::SYCLMemObjI {
 public:
   using ContextImplPtr = std::shared_ptr;
+  using DeviceImplPtr = std::shared_ptr;
 
   MemObjMock(const std::shared_ptr &Record)
       : SYCLMemObjI() {
@@ -26,7 +27,8 @@ class MemObjMock : public sycl::detail::SYCLMemObjI {
 
   MemObjType getType() const override { return MemObjType::Buffer; }
 
-  void *allocateMem(ContextImplPtr, bool, void *, sycl::detail::pi::PiEvent &) {
+  void *allocateMem(ContextImplPtr, DeviceImplPtr, bool, void *,
+                    sycl::detail::pi::PiEvent &) {
     return nullptr;
   }
 
@@ -70,7 +72,8 @@ TEST_F(SchedulerTest, LinkedAllocaDependencies) {
                        std::vector &) {};
 
   std::shared_ptr Record{
-      new sycl::detail::MemObjRecord(DefaultHostQueue->getContextImplPtr(), 10,
+      new sycl::detail::MemObjRecord(DefaultHostQueue->getContextImplPtr(),
+                                     DefaultHostQueue->getDeviceImplPtr(), 10,
                                      AllocaDep)};
 
   MemObjMock MemObj(Record);
diff --git a/sycl/unittests/scheduler/NoHostUnifiedMemory.cpp b/sycl/unittests/scheduler/NoHostUnifiedMemory.cpp
index f555c44cfc5a3..d3b1405c0ab2f 100644
--- a/sycl/unittests/scheduler/NoHostUnifiedMemory.cpp
+++ b/sycl/unittests/scheduler/NoHostUnifiedMemory.cpp
@@ -47,8 +47,8 @@ static pi_result redefinedDeviceGetInfoAfter(pi_device Device,
 }
 
 static pi_result
-redefinedMemBufferCreate(pi_context context, pi_mem_flags flags, size_t size,
-                         void *host_ptr, pi_mem *ret_mem,
+redefinedMemBufferCreate(pi_context context, pi_device, pi_mem_flags flags,
+                         size_t size, void *host_ptr, pi_mem *ret_mem,
                          const pi_mem_properties *properties = nullptr) {
   EXPECT_EQ(flags, PI_MEM_FLAGS_ACCESS_RW);
   return PI_SUCCESS;
@@ -203,7 +203,8 @@ TEST_F(SchedulerTest, NoHostUnifiedMemory) {
   {
     pi_mem MockInteropBuffer = nullptr;
    pi_result PIRes = mock_piMemBufferCreate(
-        /*pi_context=*/0x0, /*pi_mem_flags=*/PI_MEM_FLAGS_ACCESS_RW, /*size=*/1,
+        /*pi_context=*/0x0, /*pi_device=*/0x0,
+        /*pi_mem_flags=*/PI_MEM_FLAGS_ACCESS_RW, /*size=*/1,
         /*host_ptr=*/nullptr, &MockInteropBuffer);
 
     EXPECT_TRUE(PI_SUCCESS == PIRes);
diff --git a/sycl/unittests/scheduler/QueueFlushing.cpp b/sycl/unittests/scheduler/QueueFlushing.cpp
index 269b93b3ee01a..bc5f507616a2e 100644
--- a/sycl/unittests/scheduler/QueueFlushing.cpp
+++ b/sycl/unittests/scheduler/QueueFlushing.cpp
@@ -101,7 +101,7 @@ TEST_F(SchedulerTest, QueueFlushing) {
   detail::Requirement MockReq = getMockRequirement(Buf);
   pi_mem PIBuf = nullptr;
-  pi_result Ret = mock_piMemBufferCreate(/*pi_context=*/0x0,
+  pi_result Ret = mock_piMemBufferCreate(/*pi_context=*/0x0, /*pi_device=*/0x0,
                                          PI_MEM_FLAGS_ACCESS_RW, /*size=*/1,
                                          /*host_ptr=*/nullptr, &PIBuf);
   EXPECT_TRUE(Ret == PI_SUCCESS);
diff --git a/sycl/unittests/stream/stream.cpp b/sycl/unittests/stream/stream.cpp
index 36afde6e06750..009300da9958a 100644
--- a/sycl/unittests/stream/stream.cpp
+++ b/sycl/unittests/stream/stream.cpp
@@ -20,8 +20,8 @@ size_t GBufferCreateCounter = 0;
 
 static pi_result
-redefinedMemBufferCreate(pi_context context, pi_mem_flags flags, size_t size,
-                         void *host_ptr, pi_mem *ret_mem,
+redefinedMemBufferCreate(pi_context context, pi_device, pi_mem_flags flags,
+                         size_t size, void *host_ptr, pi_mem *ret_mem,
                          const pi_mem_properties *properties = nullptr) {
   ++GBufferCreateCounter;
   *ret_mem = nullptr;