@@ -1067,6 +1067,9 @@ pi_result cuda_piContextGetInfo(pi_context context, pi_context_info param_name,
10671067 capabilities);
10681068 }
10691069 case PI_EXT_ONEAPI_CONTEXT_INFO_USM_MEMCPY2D_SUPPORT:
1070+ // 2D USM memcpy is supported.
1071+ return getInfo<pi_bool>(param_value_size, param_value, param_value_size_ret,
1072+ true );
10701073 case PI_EXT_ONEAPI_CONTEXT_INFO_USM_FILL2D_SUPPORT:
10711074 case PI_EXT_ONEAPI_CONTEXT_INFO_USM_MEMSET2D_SUPPORT:
10721075 // 2D USM operations currently not supported.
@@ -1949,10 +1952,12 @@ pi_result cuda_piDeviceGetInfo(pi_device device, pi_device_info param_name,
19491952 CUresult current_ctx_device_ret = cuCtxGetDevice (¤t_ctx_device);
19501953 if (current_ctx_device_ret != CUDA_ERROR_INVALID_CONTEXT)
19511954 PI_CHECK_ERROR (current_ctx_device_ret);
1952- bool need_primary_ctx = current_ctx_device_ret == CUDA_ERROR_INVALID_CONTEXT ||
1953- current_ctx_device != device->get ();
1955+ bool need_primary_ctx =
1956+ current_ctx_device_ret == CUDA_ERROR_INVALID_CONTEXT ||
1957+ current_ctx_device != device->get ();
19541958 if (need_primary_ctx) {
1955- // Use the primary context for the device if no context with the device is set.
1959+ // Use the primary context for the device if no context with the device is
1960+ // set.
19561961 CUcontext primary_context;
19571962 PI_CHECK_ERROR (cuDevicePrimaryCtxRetain (&primary_context, device->get ()));
19581963 PI_CHECK_ERROR (cuCtxSetCurrent (primary_context));
@@ -5383,14 +5388,91 @@ pi_result cuda_piextUSMEnqueueMemset2D(pi_queue, void *, size_t, int, size_t,
53835388 return {};
53845389}
53855390
5386- // TODO: Implement this. Remember to return true for
5387- // PI_EXT_ONEAPI_CONTEXT_INFO_USM_MEMCPY2D_SUPPORT when it is implemented.
5388- pi_result cuda_piextUSMEnqueueMemcpy2D (pi_queue, pi_bool, void *, size_t ,
5389- const void *, size_t , size_t , size_t ,
5390- pi_uint32, const pi_event *,
5391- pi_event *) {
5392- sycl::detail::pi::die (" piextUSMEnqueueMemcpy2D not implemented" );
5393- return {};
5391+ // / 2D Memcpy API
5392+ // /
5393+ // / \param queue is the queue to submit to
5394+ // / \param blocking is whether this operation should block the host
5395+ // / \param dst_ptr is the location the data will be copied
5396+ // / \param dst_pitch is the total width of the destination memory including
5397+ // / padding
5398+ // / \param src_ptr is the data to be copied
5399+ // / \param dst_pitch is the total width of the source memory including padding
5400+ // / \param width is width in bytes of each row to be copied
5401+ // / \param height is height the columns to be copied
5402+ // / \param num_events_in_waitlist is the number of events to wait on
5403+ // / \param events_waitlist is an array of events to wait on
5404+ // / \param event is the event that represents this operation
5405+ pi_result cuda_piextUSMEnqueueMemcpy2D (pi_queue queue, pi_bool blocking,
5406+ void *dst_ptr, size_t dst_pitch,
5407+ const void *src_ptr, size_t src_pitch,
5408+ size_t width, size_t height,
5409+ pi_uint32 num_events_in_wait_list,
5410+ const pi_event *event_wait_list,
5411+ pi_event *event) {
5412+
5413+ assert (queue != nullptr );
5414+
5415+ pi_result result = PI_SUCCESS;
5416+
5417+ try {
5418+ ScopedContext active (queue->get_context ());
5419+ CUstream cuStream = queue->get_next_transfer_stream ();
5420+ result = enqueueEventsWait (queue, cuStream, num_events_in_wait_list,
5421+ event_wait_list);
5422+ if (event) {
5423+ (*event) = _pi_event::make_native (PI_COMMAND_TYPE_MEM_BUFFER_COPY_RECT,
5424+ queue, cuStream);
5425+ (*event)->start ();
5426+ }
5427+
5428+ // Determine the direction of Copy using cuPointerGetAttributes
5429+ // for both the src_ptr and dst_ptr
5430+ // TODO: Doesn't yet support CU_MEMORYTYPE_UNIFIED
5431+ CUpointer_attribute attributes = {CU_POINTER_ATTRIBUTE_MEMORY_TYPE};
5432+
5433+ CUmemorytype src_type = static_cast <CUmemorytype>(0 );
5434+ void *src_attribute_values[] = {(void *)(&src_type)};
5435+ result = PI_CHECK_ERROR (cuPointerGetAttributes (
5436+ 1 , &attributes, src_attribute_values, (CUdeviceptr)src_ptr));
5437+ assert (src_type == CU_MEMORYTYPE_DEVICE || src_type == CU_MEMORYTYPE_HOST);
5438+
5439+ CUmemorytype dst_type = static_cast <CUmemorytype>(0 );
5440+ void *dst_attribute_values[] = {(void *)(&dst_type)};
5441+ result = PI_CHECK_ERROR (cuPointerGetAttributes (
5442+ 1 , &attributes, dst_attribute_values, (CUdeviceptr)dst_ptr));
5443+ assert (dst_type == CU_MEMORYTYPE_DEVICE || dst_type == CU_MEMORYTYPE_HOST);
5444+
5445+ CUDA_MEMCPY2D cpyDesc = {0 };
5446+
5447+ cpyDesc.srcMemoryType = src_type;
5448+ cpyDesc.srcDevice = (src_type == CU_MEMORYTYPE_DEVICE)
5449+ ? reinterpret_cast <CUdeviceptr>(src_ptr)
5450+ : 0 ;
5451+ cpyDesc.srcHost = (src_type == CU_MEMORYTYPE_HOST) ? src_ptr : nullptr ;
5452+ cpyDesc.srcPitch = src_pitch;
5453+
5454+ cpyDesc.dstMemoryType = dst_type;
5455+ cpyDesc.dstDevice = (dst_type == CU_MEMORYTYPE_DEVICE)
5456+ ? reinterpret_cast <CUdeviceptr>(dst_ptr)
5457+ : 0 ;
5458+ cpyDesc.dstHost = (dst_type == CU_MEMORYTYPE_HOST) ? dst_ptr : nullptr ;
5459+ cpyDesc.dstPitch = dst_pitch;
5460+
5461+ cpyDesc.WidthInBytes = width;
5462+ cpyDesc.Height = height;
5463+
5464+ result = PI_CHECK_ERROR (cuMemcpy2DAsync (&cpyDesc, cuStream));
5465+
5466+ if (event) {
5467+ (*event)->record ();
5468+ }
5469+ if (blocking) {
5470+ result = PI_CHECK_ERROR (cuStreamSynchronize (cuStream));
5471+ }
5472+ } catch (pi_result err) {
5473+ result = err;
5474+ }
5475+ return result;
53945476}
53955477
53965478// / API to query information about USM allocated pointers
0 commit comments