From 36ad296b49067ef3e24797e01897fd6137e8e4ab Mon Sep 17 00:00:00 2001
From: Anton Volkov
Date: Sun, 5 Mar 2023 04:32:03 -0600
Subject: [PATCH] Excess memcpy to shared memory in elementwise and bitwise
 functions

---
 .github/workflows/conda-package.yml         |   1 +
 dpnp/backend/kernels/dpnp_krnl_bitwise.cpp  |  97 ++++++++-------
 dpnp/backend/kernels/dpnp_krnl_elemwise.cpp | 131 ++++++++++++--------
 3 files changed, 130 insertions(+), 99 deletions(-)

diff --git a/.github/workflows/conda-package.yml b/.github/workflows/conda-package.yml
index 64a14a85be54..52ac769b7fe8 100644
--- a/.github/workflows/conda-package.yml
+++ b/.github/workflows/conda-package.yml
@@ -12,6 +12,7 @@ env:
   CHANNELS: '-c dppy/label/dev -c intel -c main --override-channels'
   TEST_SCOPE: >-
     test_arraycreation.py
+    test_dot.py
     test_dparray.py
     test_fft.py
     test_linalg.py
diff --git a/dpnp/backend/kernels/dpnp_krnl_bitwise.cpp b/dpnp/backend/kernels/dpnp_krnl_bitwise.cpp
index b64670be4e09..c082bd636bf9 100644
--- a/dpnp/backend/kernels/dpnp_krnl_bitwise.cpp
+++ b/dpnp/backend/kernels/dpnp_krnl_bitwise.cpp
@@ -1,5 +1,5 @@
 //*****************************************************************************
-// Copyright (c) 2016-2020, Intel Corporation
+// Copyright (c) 2016-2023, Intel Corporation
 // All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
@@ -148,53 +148,62 @@ static void func_map_init_bitwise_1arg_1type(func_map_t& fmap)
         \
         sycl::queue q = *(reinterpret_cast<sycl::queue*>(q_ref)); \
         \
-        DPNPC_ptr_adapter<_DataType> input1_ptr(q_ref, input1_in, input1_size); \
-        DPNPC_ptr_adapter<shape_elem_type> input1_shape_ptr(q_ref, input1_shape, input1_ndim, true); \
-        DPNPC_ptr_adapter<shape_elem_type> input1_strides_ptr(q_ref, input1_strides, input1_ndim, true); \
+        _DataType* input1_data = static_cast<_DataType*>(const_cast<void*>(input1_in)); \
+        _DataType* input2_data = static_cast<_DataType*>(const_cast<void*>(input2_in)); \
+        _DataType* result = static_cast<_DataType*>(result_out); \
         \
-        DPNPC_ptr_adapter<_DataType> input2_ptr(q_ref, input2_in, input2_size); \
-        DPNPC_ptr_adapter<shape_elem_type> input2_shape_ptr(q_ref, input2_shape, input2_ndim, true); \
-        DPNPC_ptr_adapter<shape_elem_type> input2_strides_ptr(q_ref, input2_strides, input2_ndim, true); \
+        shape_elem_type* input1_shape_offsets = new shape_elem_type[input1_ndim]; \
         \
-        DPNPC_ptr_adapter<_DataType> result_ptr(q_ref, result_out, result_size, false, true); \
-        DPNPC_ptr_adapter<shape_elem_type> result_strides_ptr(q_ref, result_strides, result_ndim); \
+        get_shape_offsets_inkernel(input1_shape, input1_ndim, input1_shape_offsets); \
+        bool use_strides = !array_equal(input1_strides, input1_ndim, input1_shape_offsets, input1_ndim); \
+        delete[] input1_shape_offsets; \
         \
-        _DataType* input1_data = input1_ptr.get_ptr(); \
-        shape_elem_type* input1_shape_data = input1_shape_ptr.get_ptr(); \
-        shape_elem_type* input1_strides_data = input1_strides_ptr.get_ptr(); \
+        shape_elem_type* input2_shape_offsets = new shape_elem_type[input2_ndim]; \
         \
-        _DataType* input2_data = input2_ptr.get_ptr(); \
-        shape_elem_type* input2_shape_data = input2_shape_ptr.get_ptr(); \
-        shape_elem_type* input2_strides_data = input2_strides_ptr.get_ptr(); \
-        \
-        _DataType* result = result_ptr.get_ptr(); \
-        shape_elem_type* result_strides_data = result_strides_ptr.get_ptr(); \
-        \
-        const size_t input1_shape_size_in_bytes = input1_ndim * sizeof(shape_elem_type); \
-        shape_elem_type* input1_shape_offsets = \
-            reinterpret_cast<shape_elem_type*>(sycl::malloc_shared(input1_shape_size_in_bytes, q)); \
-        get_shape_offsets_inkernel(input1_shape_data, input1_ndim, input1_shape_offsets); \
-        bool use_strides = !array_equal(input1_strides_data, input1_ndim, input1_shape_offsets, input1_ndim); \
-        sycl::free(input1_shape_offsets, q); \
-        \
-        const size_t input2_shape_size_in_bytes = input2_ndim * sizeof(shape_elem_type); \
-        shape_elem_type* input2_shape_offsets = \
-            reinterpret_cast<shape_elem_type*>(sycl::malloc_shared(input2_shape_size_in_bytes, q)); \
-        get_shape_offsets_inkernel(input2_shape_data, input2_ndim, input2_shape_offsets); \
-        use_strides = \
-            use_strides || !array_equal(input2_strides_data, input2_ndim, input2_shape_offsets, input2_ndim); \
-        sycl::free(input2_shape_offsets, q); \
+        get_shape_offsets_inkernel(input2_shape, input2_ndim, input2_shape_offsets); \
+        use_strides = use_strides || !array_equal(input2_strides, input2_ndim, input2_shape_offsets, input2_ndim); \
+        delete[] input2_shape_offsets; \
         \
         sycl::event event; \
         sycl::range<1> gws(result_size); \
         \
         if (use_strides) \
         { \
+            if ((result_ndim != input1_ndim) || (result_ndim != input2_ndim)) \
+            { \
+                throw std::runtime_error("Result ndim=" + std::to_string(result_ndim) + \
+                                         " mismatches with either input1 ndim=" + std::to_string(input1_ndim) + \
+                                         " or input2 ndim=" + std::to_string(input2_ndim)); \
+            } \
+            \
+            /* memory transfer optimization: a USM-host temporary speeds up the transfer to the device */ \
+            using usm_host_allocatorT = sycl::usm_allocator<shape_elem_type, sycl::usm::alloc::host>; \
+            \
+            size_t strides_size = 3 * result_ndim; \
+            shape_elem_type* dev_strides_data = sycl::malloc_device<shape_elem_type>(strides_size, q); \
+            \
+            /* create a USM-host temporary for the packed strides */ \
+            auto strides_host_packed = \
+                std::vector<shape_elem_type, usm_host_allocatorT>(strides_size, usm_host_allocatorT(q)); \
+            \
+            /* packed vector is concatenation of result_strides, input1_strides and input2_strides */ \
+            std::copy(result_strides, result_strides + result_ndim, strides_host_packed.begin()); \
+            std::copy(input1_strides, input1_strides + result_ndim, strides_host_packed.begin() + result_ndim); \
+            std::copy(input2_strides, input2_strides + result_ndim, strides_host_packed.begin() + 2 * result_ndim); \
+            \
+            auto copy_strides_ev = \
+                q.copy(strides_host_packed.data(), dev_strides_data, strides_host_packed.size()); \
+            \
             auto kernel_parallel_for_func = [=](sycl::id<1> global_id) { \
-                const size_t output_id = global_id[0]; /*for (size_t i = 0; i < result_size; ++i)*/ \
+                const size_t output_id = global_id[0]; /* for (size_t i = 0; i < result_size; ++i) */ \
                 { \
+                    const shape_elem_type* result_strides_data = &dev_strides_data[0]; \
+                    const shape_elem_type* input1_strides_data = &dev_strides_data[result_ndim]; \
+                    const shape_elem_type* input2_strides_data = &dev_strides_data[2 * result_ndim]; \
+                    \
                     size_t input1_id = 0; \
                     size_t input2_id = 0; \
+                    \
                     for (size_t i = 0; i < result_ndim; ++i) \
                     { \
                         const size_t output_xyz_id = \
@@ -209,14 +218,19 @@ static void func_map_init_bitwise_1arg_1type(func_map_t& fmap)
                 } \
             }; \
             auto kernel_func = [&](sycl::handler& cgh) { \
+                cgh.depends_on(copy_strides_ev); \
                 cgh.parallel_for<class __name__##_strides_kernel<_DataType>>(gws, kernel_parallel_for_func); \
             }; \
-            event = q.submit(kernel_func); \
+            \
+            q.submit(kernel_func).wait(); \
+            \
+            sycl::free(dev_strides_data, q); \
+            return event_ref; \
         } \
         else \
         { \
             auto kernel_parallel_for_func = [=](sycl::id<1> global_id) { \
-                size_t i = global_id[0]; /*for (size_t i = 0; i < result_size; ++i)*/ \
+                size_t i = global_id[0]; /* for (size_t i = 0; i < result_size; ++i) */ \
                 const _DataType input1_elem = (input1_size == 1) ? input1_data[0] : input1_data[i]; \
                 const _DataType input2_elem = (input2_size == 1) ? input2_data[0] : input2_data[i]; \
                result[i] = __operation__; \
@@ -226,16 +240,8 @@ static void func_map_init_bitwise_1arg_1type(func_map_t& fmap)
             }; \
             event = q.submit(kernel_func); \
         } \
-        input1_ptr.depends_on(event); \
-        input1_shape_ptr.depends_on(event); \
-        input1_strides_ptr.depends_on(event); \
-        input2_ptr.depends_on(event); \
-        input2_shape_ptr.depends_on(event); \
-        input2_strides_ptr.depends_on(event); \
-        result_ptr.depends_on(event); \
-        result_strides_ptr.depends_on(event); \
-        event_ref = reinterpret_cast<DPCTLSyclEventRef>(&event); \
         \
+        event_ref = reinterpret_cast<DPCTLSyclEventRef>(&event); \
         return DPCTLEvent_Copy(event_ref); \
     } \
     \
@@ -278,6 +284,7 @@ static void func_map_init_bitwise_1arg_1type(func_map_t& fmap)
                                               where, \
                                               dep_event_vec_ref); \
         DPCTLEvent_WaitAndThrow(event_ref); \
+        DPCTLEvent_Delete(event_ref); \
     } \
     \
     template <typename _DataType> \
diff --git a/dpnp/backend/kernels/dpnp_krnl_elemwise.cpp b/dpnp/backend/kernels/dpnp_krnl_elemwise.cpp
index c8b32fa98094..741a945fb099 100644
--- a/dpnp/backend/kernels/dpnp_krnl_elemwise.cpp
+++ b/dpnp/backend/kernels/dpnp_krnl_elemwise.cpp
@@ -70,35 +70,49 @@
         \
         sycl::queue q = *(reinterpret_cast<sycl::queue*>(q_ref)); \
         \
-        DPNPC_ptr_adapter<_DataType_input> input1_ptr(q_ref, input1_in, input1_size); \
-        DPNPC_ptr_adapter<shape_elem_type> input1_shape_ptr(q_ref, input1_shape, input1_ndim, true); \
-        DPNPC_ptr_adapter<shape_elem_type> input1_strides_ptr(q_ref, input1_strides, input1_ndim, true); \
-        \
-        DPNPC_ptr_adapter<_DataType_output> result_ptr(q_ref, result_out, result_size, false, true); \
-        DPNPC_ptr_adapter<shape_elem_type> result_strides_ptr(q_ref, result_strides, result_ndim); \
-        \
-        _DataType_input* input1_data = input1_ptr.get_ptr(); \
-        shape_elem_type* input1_shape_data = input1_shape_ptr.get_ptr(); \
-        shape_elem_type* input1_strides_data = input1_strides_ptr.get_ptr(); \
+        _DataType_input* input1_data = static_cast<_DataType_input*>(const_cast<void*>(input1_in)); \
+        _DataType_output* result = static_cast<_DataType_output*>(result_out); \
         \
-        _DataType_output* result = result_ptr.get_ptr(); \
-        shape_elem_type* result_strides_data = result_strides_ptr.get_ptr(); \
+        shape_elem_type* input1_shape_offsets = new shape_elem_type[input1_ndim]; \
         \
-        const size_t input1_shape_size_in_bytes = input1_ndim * sizeof(shape_elem_type); \
-        shape_elem_type* input1_shape_offsets = \
-            reinterpret_cast<shape_elem_type*>(sycl::malloc_shared(input1_shape_size_in_bytes, q)); \
-        get_shape_offsets_inkernel(input1_shape_data, input1_ndim, input1_shape_offsets); \
-        bool use_strides = !array_equal(input1_strides_data, input1_ndim, input1_shape_offsets, input1_ndim); \
-        sycl::free(input1_shape_offsets, q); \
+        get_shape_offsets_inkernel(input1_shape, input1_ndim, input1_shape_offsets); \
+        bool use_strides = !array_equal(input1_strides, input1_ndim, input1_shape_offsets, input1_ndim); \
+        delete[] input1_shape_offsets; \
         \
         sycl::event event; \
         sycl::range<1> gws(result_size); \
         \
         if (use_strides) \
         { \
+            if (result_ndim != input1_ndim) \
+            { \
+                throw std::runtime_error("Result ndim=" + std::to_string(result_ndim) + \
+                                         " mismatches with input1 ndim=" + std::to_string(input1_ndim)); \
+            } \
+            \
+            /* memory transfer optimization: a USM-host temporary speeds up the transfer to the device */ \
+            using usm_host_allocatorT = sycl::usm_allocator<shape_elem_type, sycl::usm::alloc::host>; \
+            \
+            size_t strides_size = 2 * result_ndim; \
+            shape_elem_type* dev_strides_data = sycl::malloc_device<shape_elem_type>(strides_size, q); \
+            \
+            /* create a USM-host temporary for the packed strides */ \
+            auto strides_host_packed = \
+                std::vector<shape_elem_type, usm_host_allocatorT>(strides_size, usm_host_allocatorT(q)); \
+            \
+            /* packed vector is concatenation of result_strides and input1_strides */ \
+            std::copy(result_strides, result_strides + result_ndim, strides_host_packed.begin()); \
+            std::copy(input1_strides, input1_strides + result_ndim, strides_host_packed.begin() + result_ndim); \
+            \
+            auto copy_strides_ev = \
+                q.copy(strides_host_packed.data(), dev_strides_data, strides_host_packed.size()); \
+            \
             auto kernel_parallel_for_func = [=](sycl::id<1> global_id) { \
-                size_t output_id = global_id[0]; /*for (size_t i = 0; i < result_size; ++i)*/ \
+                size_t output_id = global_id[0]; /* for (size_t i = 0; i < result_size; ++i) */ \
                 { \
+                    const shape_elem_type* result_strides_data = &dev_strides_data[0]; \
+                    const shape_elem_type* input1_strides_data = &dev_strides_data[result_ndim]; \
+                    \
                     size_t input_id = 0; \
                     for (size_t i = 0; i < input1_ndim; ++i) \
                     { \
@@ -115,7 +129,11 @@
                 cgh.parallel_for<class __name__##_strides_kernel<_DataType_input, _DataType_output>>( \
                     gws, kernel_parallel_for_func); \
             }; \
-            event = q.submit(kernel_func); \
+            \
+            q.submit(kernel_func).wait(); \
+            \
+            sycl::free(dev_strides_data, q); \
+            return event_ref; \
         } \
         else \
         { \
@@ -143,14 +161,7 @@
             } \
         } \
         \
-        input1_ptr.depends_on(event); \
-        input1_shape_ptr.depends_on(event); \
-        input1_strides_ptr.depends_on(event); \
-        result_ptr.depends_on(event); \
-        result_strides_ptr.depends_on(event); \
-        \
         event_ref = reinterpret_cast<DPCTLSyclEventRef>(&event); \
-        \
         return DPCTLEvent_Copy(event_ref); \
     } \
     \
@@ -583,34 +594,49 @@ static void func_map_init_elemwise_1arg_2type(func_map_t& fmap)
         \
         sycl::queue q = *(reinterpret_cast<sycl::queue*>(q_ref)); \
         \
-        DPNPC_ptr_adapter<_DataType> input1_ptr(q_ref, input1_in, input1_size); \
-        DPNPC_ptr_adapter<shape_elem_type> input1_shape_ptr(q_ref, input1_shape, input1_ndim, true); \
-        DPNPC_ptr_adapter<shape_elem_type> input1_strides_ptr(q_ref, input1_strides, input1_ndim, true); \
-        DPNPC_ptr_adapter<_DataType> result_ptr(q_ref, result_out, result_size, false, true); \
-        DPNPC_ptr_adapter<shape_elem_type> result_strides_ptr(q_ref, result_strides, result_ndim); \
-        \
-        _DataType* input1_data = input1_ptr.get_ptr(); \
-        shape_elem_type* input1_shape_data = input1_shape_ptr.get_ptr(); \
-        shape_elem_type* input1_strides_data = input1_strides_ptr.get_ptr(); \
+        _DataType* input1_data = static_cast<_DataType*>(const_cast<void*>(input1_in)); \
+        _DataType* result = static_cast<_DataType*>(result_out); \
         \
-        _DataType* result = result_ptr.get_ptr(); \
-        shape_elem_type* result_strides_data = result_strides_ptr.get_ptr(); \
+        shape_elem_type* input1_shape_offsets = new shape_elem_type[input1_ndim]; \
         \
-        const size_t input1_shape_size_in_bytes = input1_ndim * sizeof(shape_elem_type); \
-        shape_elem_type* input1_shape_offsets = \
-            reinterpret_cast<shape_elem_type*>(sycl::malloc_shared(input1_shape_size_in_bytes, q)); \
-        get_shape_offsets_inkernel(input1_shape_data, input1_ndim, input1_shape_offsets); \
-        bool use_strides = !array_equal(input1_strides_data, input1_ndim, input1_shape_offsets, input1_ndim); \
-        sycl::free(input1_shape_offsets, q); \
+        get_shape_offsets_inkernel(input1_shape, input1_ndim, input1_shape_offsets); \
+        bool use_strides = !array_equal(input1_strides, input1_ndim, input1_shape_offsets, input1_ndim); \
+        delete[] input1_shape_offsets; \
         \
         sycl::event event; \
         sycl::range<1> gws(result_size); \
         \
         if (use_strides) \
         { \
+            if (result_ndim != input1_ndim) \
+            { \
+                throw std::runtime_error("Result ndim=" + std::to_string(result_ndim) + \
+                                         " mismatches with input1 ndim=" + std::to_string(input1_ndim)); \
+            } \
+            \
+            /* memory transfer optimization: a USM-host temporary speeds up the transfer to the device */ \
+            using usm_host_allocatorT = sycl::usm_allocator<shape_elem_type, sycl::usm::alloc::host>; \
+            \
+            size_t strides_size = 2 * result_ndim; \
+            shape_elem_type* dev_strides_data = sycl::malloc_device<shape_elem_type>(strides_size, q); \
+            \
+            /* create a USM-host temporary for the packed strides */ \
+            auto strides_host_packed = \
+                std::vector<shape_elem_type, usm_host_allocatorT>(strides_size, usm_host_allocatorT(q)); \
+            \
+            /* packed vector is concatenation of result_strides and input1_strides */ \
+            std::copy(result_strides, result_strides + result_ndim, strides_host_packed.begin()); \
+            std::copy(input1_strides, input1_strides + result_ndim, strides_host_packed.begin() + result_ndim); \
+            \
+            auto copy_strides_ev = \
+                q.copy(strides_host_packed.data(), dev_strides_data, strides_host_packed.size()); \
+            \
             auto kernel_parallel_for_func = [=](sycl::id<1> global_id) { \
-                size_t output_id = global_id[0]; /*for (size_t i = 0; i < result_size; ++i)*/ \
+                size_t output_id = global_id[0]; /* for (size_t i = 0; i < result_size; ++i) */ \
                 { \
+                    const shape_elem_type* result_strides_data = &dev_strides_data[0]; \
+                    const shape_elem_type* input1_strides_data = &dev_strides_data[result_ndim]; \
+                    \
                     size_t input_id = 0; \
                     for (size_t i = 0; i < input1_ndim; ++i) \
                     { \
@@ -626,12 +652,16 @@ static void func_map_init_elemwise_1arg_2type(func_map_t& fmap)
             auto kernel_func = [&](sycl::handler& cgh) { \
                 cgh.parallel_for<class __name__##_strides_kernel<_DataType>>(gws, kernel_parallel_for_func); \
             }; \
-            event = q.submit(kernel_func); \
+            \
+            q.submit(kernel_func).wait(); \
+            \
+            sycl::free(dev_strides_data, q); \
+            return event_ref; \
         } \
         else \
         { \
             auto kernel_parallel_for_func = [=](sycl::id<1> global_id) { \
-                size_t i = global_id[0]; /*for (size_t i = 0; i < result_size; ++i)*/ \
+                size_t i = global_id[0]; /* for (size_t i = 0; i < result_size; ++i) */ \
                 { \
                     const _DataType input_elem = input1_data[i]; \
                     result[i] = __operation1__; \
@@ -651,14 +681,7 @@ static void func_map_init_elemwise_1arg_2type(func_map_t& fmap)
             } \
         } \
         \
-        input1_ptr.depends_on(event); \
-        input1_shape_ptr.depends_on(event); \
-        input1_strides_ptr.depends_on(event); \
-        result_ptr.depends_on(event); \
-        result_strides_ptr.depends_on(event); \
-        \
         event_ref = reinterpret_cast<DPCTLSyclEventRef>(&event); \
-        \
         return DPCTLEvent_Copy(event_ref); \
     } \
     \
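
The hunks above all apply one and the same pattern: pack the result/input strides into a USM-host staging vector, copy it once to USM-device memory with q.copy(), let the strided kernel read the packed device copy, and free the device buffer after the kernel has finished, instead of materializing every shape/strides array in shared memory through DPNPC_ptr_adapter. Below is a minimal self-contained sketch of that pattern. It is illustrative only, not dpnp code; it assumes a SYCL 2020 compiler such as icpx, and every name in it (packed, dev_meta, the negation kernel) is hypothetical.

// packed_strides_demo.cpp: sketch of the USM-host staging pattern,
// assuming a SYCL 2020 compiler (icpx -fsycl packed_strides_demo.cpp).
// All names here are illustrative; this is not dpnp code.
#include <sycl/sycl.hpp>

#include <cstdio>
#include <vector>

int main()
{
    sycl::queue q;

    // A 2x3 C-contiguous result computed from a strided (column-major) input view.
    constexpr size_t ndim = 2;
    constexpr size_t size = 6;
    const long result_shape[ndim] = {2, 3};
    const long input_strides[ndim] = {1, 2};

    std::vector<float> in_host = {0, 1, 2, 3, 4, 5};
    float* in = sycl::malloc_device<float>(size, q);
    float* out = sycl::malloc_device<float>(size, q);
    q.copy(in_host.data(), in, size).wait();

    // Stage the kernel metadata (shape + strides) in USM-host memory:
    // it is host-resident but device-accessible, so the q.copy() below is
    // a single direct transfer rather than the implicit migration of
    // malloc_shared pages that the patch eliminates.
    using host_alloc_t = sycl::usm_allocator<long, sycl::usm::alloc::host>;
    std::vector<long, host_alloc_t> packed(2 * ndim, host_alloc_t(q));
    std::copy(result_shape, result_shape + ndim, packed.begin());
    std::copy(input_strides, input_strides + ndim, packed.begin() + ndim);

    long* dev_meta = sycl::malloc_device<long>(packed.size(), q);
    auto copy_ev = q.copy(packed.data(), dev_meta, packed.size());

    q.submit([&](sycl::handler& cgh) {
         // The kernel reads dev_meta, so it must wait for the copy.
         cgh.depends_on(copy_ev);
         cgh.parallel_for(sycl::range<1>(size), [=](sycl::id<1> gid) {
             const long* shape = &dev_meta[0];
             const long* strides = &dev_meta[ndim];
             size_t rem = gid[0];
             size_t input_id = 0;
             for (size_t i = ndim; i-- > 0;) // row-major de-linearization
             {
                 input_id += (rem % shape[i]) * strides[i];
                 rem /= shape[i];
             }
             out[gid[0]] = -in[input_id]; // sample elementwise operation
         });
     }).wait();

    // Safe to free only after the kernel that reads it has completed.
    sycl::free(dev_meta, q);

    std::vector<float> out_host(size);
    q.copy(out, out_host.data(), size).wait();
    for (float v : out_host)
        std::printf("%g ", v); // prints: -0 -2 -4 -1 -3 -5
    std::printf("\n");

    sycl::free(out, q);
    sycl::free(in, q);
    return 0;
}

On an in-order queue the cgh.depends_on(copy_ev) call would be redundant, but on an out-of-order queue it is what keeps the kernel from racing the metadata transfer; the bitwise hunk above establishes the same dependency with cgh.depends_on(copy_strides_ev).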