From beb456fff59f2286c4b0a76c82dcdd664d159ab5 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 6 Dec 2024 14:24:40 -0600 Subject: [PATCH 1/8] Improvement in boolean index extract 1. Use shared local memory to optimize access to neighboring elements of cumulative sums. 2. Introduce contig variant for masked_extract code 3. Removed unused orthog_nelems functor argument, and added local_accessor argument instead. The example ``` import dpctl.tensor as dpt x = dpt.ones(20241024, dtype='f4') m = dpt.ones(x.size, dtype='b1') %time x[m] ``` decreased from 41ms on Iris Xe WSL box to 37 ms. --- .../kernels/boolean_advanced_indexing.hpp | 248 +++++++++++++----- .../source/boolean_advanced_indexing.cpp | 119 ++++++--- 2 files changed, 269 insertions(+), 98 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp b/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp index 4e99b26f53..5fd6102bbb 100644 --- a/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp +++ b/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp @@ -48,51 +48,65 @@ template + typename indT, + typename LocalAccessorT> struct MaskedExtractStridedFunctor { MaskedExtractStridedFunctor(const char *src_data_p, const char *cumsum_data_p, char *dst_data_p, - size_t orthog_iter_size, size_t masked_iter_size, const OrthogIndexerT &orthog_src_dst_indexer_, const MaskedSrcIndexerT &masked_src_indexer_, - const MaskedDstIndexerT &masked_dst_indexer_) + const MaskedDstIndexerT &masked_dst_indexer_, + const LocalAccessorT &lacc_) : src_cp(src_data_p), cumsum_cp(cumsum_data_p), dst_cp(dst_data_p), - orthog_nelems(orthog_iter_size), masked_nelems(masked_iter_size), + masked_nelems(masked_iter_size), orthog_src_dst_indexer(orthog_src_dst_indexer_), masked_src_indexer(masked_src_indexer_), - masked_dst_indexer(masked_dst_indexer_) + masked_dst_indexer(masked_dst_indexer_), lacc(lacc_) { + static_assert( + std::is_same_v); } - void operator()(sycl::id<1> idx) const + void operator()(sycl::nd_item<2> ndit) const { const dataT *src_data = reinterpret_cast(src_cp); dataT *dst_data = reinterpret_cast(dst_cp); const indT *cumsum_data = reinterpret_cast(cumsum_cp); - size_t global_i = idx[0]; - size_t orthog_i = global_i / masked_nelems; - size_t masked_i = global_i - masked_nelems * orthog_i; + const size_t orthog_i = ndit.get_global_id(0); + const size_t group_i = ndit.get_group(1); + const std::uint32_t l_i = ndit.get_local_id(1); + const std::uint32_t lws = ndit.get_local_range(1); - indT current_running_count = cumsum_data[masked_i]; - bool mask_set = - (masked_i == 0) - ? (current_running_count == 1) - : (current_running_count == cumsum_data[masked_i - 1] + 1); + const size_t masked_block_start = group_i * lws; + const size_t masked_i = masked_block_start + l_i; - // dst[cumsum[i], j] - 1 = src[i, j] if cumsum[i] == ((i > 0) ? - // cumsum[i-1] - // + 1 : 1) - if (mask_set) { - auto orthog_offsets = - orthog_src_dst_indexer(static_cast(orthog_i)); + for (std::uint32_t i = l_i; i < lacc.size(); i += lws) { + const size_t offset = masked_block_start + i; + lacc[i] = (offset == 0) ? indT(0) + : (offset - 1 < masked_nelems) + ? cumsum_data[offset - 1] + : cumsum_data[masked_nelems - 1] + 1; + } - size_t total_src_offset = masked_src_indexer(masked_i) + - orthog_offsets.get_first_offset(); - size_t total_dst_offset = + sycl::group_barrier(ndit.get_group()); + + const indT current_running_count = lacc[l_i + 1]; + const bool mask_set = (masked_i == 0) + ? 
(current_running_count == 1) + : (current_running_count == lacc[l_i] + 1); + + // dst[cumsum[i] - 1, j] = src[i, j] + // if cumsum[i] == ((i > 0) ? cumsum[i-1] + 1 : 1) + if (mask_set && (masked_i < masked_nelems)) { + const auto &orthog_offsets = orthog_src_dst_indexer(orthog_i); + + const size_t total_src_offset = masked_src_indexer(masked_i) + + orthog_offsets.get_first_offset(); + const size_t total_dst_offset = masked_dst_indexer(current_running_count - 1) + orthog_offsets.get_second_offset(); @@ -104,8 +118,7 @@ struct MaskedExtractStridedFunctor const char *src_cp = nullptr; const char *cumsum_cp = nullptr; char *dst_cp = nullptr; - size_t orthog_nelems = 0; - size_t masked_nelems = 0; + const size_t masked_nelems = 0; // has nd, shape, src_strides, dst_strides for // dimensions that ARE NOT masked const OrthogIndexerT orthog_src_dst_indexer; @@ -114,6 +127,7 @@ struct MaskedExtractStridedFunctor const MaskedSrcIndexerT masked_src_indexer; // has 1, dst_strides for dimensions that ARE masked const MaskedDstIndexerT masked_dst_indexer; + LocalAccessorT lacc; }; template +class masked_extract_all_slices_contig_impl_krn; + +typedef sycl::event (*masked_extract_all_slices_contig_impl_fn_ptr_t)( + sycl::queue &, + ssize_t, + const char *, + const char *, + char *, + ssize_t, + ssize_t, + const std::vector &); + +template +sycl::event masked_extract_all_slices_contig_impl( + sycl::queue &exec_q, + ssize_t iteration_size, + const char *src_p, + const char *cumsum_p, + char *dst_p, + ssize_t dst_size, // dst is 1D + ssize_t dst_stride, + const std::vector &depends = {}) +{ + constexpr TwoZeroOffsets_Indexer orthog_src_dst_indexer{}; + + constexpr NoOpIndexer masked_src_indexer{}; + const Strided1DIndexer masked_dst_indexer(/* size */ dst_size, + /* step */ dst_stride); + + using KernelName = + class masked_extract_all_slices_contig_impl_krn; + + using LocalAccessorT = sycl::local_accessor; + using Impl = + struct MaskedExtractStridedFunctor; + + constexpr std::size_t nominal_lws = 256; + const std::size_t masked_extent = iteration_size; + const std::size_t lws = std::min(masked_extent, nominal_lws); + const std::size_t n_groups = (iteration_size + lws - 1) / lws; + + sycl::range<2> gRange{1, n_groups * lws}; + sycl::range<2> lRange{1, lws}; + + sycl::nd_range<2> ndRange(gRange, lRange); + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + const std::size_t lacc_size = std::min(lws, masked_extent) + 1; + LocalAccessorT lacc(lacc_size, cgh); + + cgh.parallel_for( + ndRange, + Impl(src_p, cumsum_p, dst_p, masked_extent, orthog_src_dst_indexer, + masked_src_indexer, masked_dst_indexer, lacc)); + }); + + return comp_ev; +} + +template @@ -223,11 +301,6 @@ sycl::event masked_extract_all_slices_strided_impl( ssize_t dst_stride, const std::vector &depends = {}) { - // using MaskedExtractStridedFunctor; - // using Strided1DIndexer; - // using StridedIndexer; - // using TwoZeroOffsets_Indexer; - constexpr TwoZeroOffsets_Indexer orthog_src_dst_indexer{}; /* StridedIndexer(int _nd, ssize_t _offset, ssize_t const @@ -236,18 +309,35 @@ sycl::event masked_extract_all_slices_strided_impl( const Strided1DIndexer masked_dst_indexer(/* size */ dst_size, /* step */ dst_stride); + using KernelName = class masked_extract_all_slices_strided_impl_krn< + StridedIndexer, Strided1DIndexer, dataT, indT>; + + using LocalAccessorT = sycl::local_accessor; + using Impl = + struct MaskedExtractStridedFunctor; + + constexpr std::size_t nominal_lws = 256; + const std::size_t 
masked_nelems = iteration_size; + const std::size_t lws = std::min(masked_nelems, nominal_lws); + const std::size_t n_groups = (masked_nelems + lws - 1) / lws; + + sycl::range<2> gRange{1, n_groups * lws}; + sycl::range<2> lRange{1, lws}; + + sycl::nd_range<2> ndRange(gRange, lRange); + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); - cgh.parallel_for>( - sycl::range<1>(static_cast(iteration_size)), - MaskedExtractStridedFunctor( - src_p, cumsum_p, dst_p, 1, iteration_size, - orthog_src_dst_indexer, masked_src_indexer, - masked_dst_indexer)); + const std::size_t lacc_size = std::min(lws, masked_nelems) + 1; + LocalAccessorT lacc(lacc_size, cgh); + + cgh.parallel_for( + ndRange, + Impl(src_p, cumsum_p, dst_p, iteration_size, orthog_src_dst_indexer, + masked_src_indexer, masked_dst_indexer, lacc)); }); return comp_ev; @@ -299,11 +389,6 @@ sycl::event masked_extract_some_slices_strided_impl( ssize_t masked_dst_stride, const std::vector &depends = {}) { - // using MaskedExtractStridedFunctor; - // using Strided1DIndexer; - // using StridedIndexer; - // using TwoOffsets_StridedIndexer; - const TwoOffsets_StridedIndexer orthog_src_dst_indexer{ orthog_nd, ortho_src_offset, ortho_dst_offset, packed_ortho_src_dst_shape_strides}; @@ -313,24 +398,63 @@ sycl::event masked_extract_some_slices_strided_impl( const Strided1DIndexer masked_dst_indexer{/* size */ masked_dst_size, /* step */ masked_dst_stride}; + using KernelName = class masked_extract_some_slices_strided_impl_krn< + TwoOffsets_StridedIndexer, StridedIndexer, Strided1DIndexer, dataT, + indT>; + + using LocalAccessorT = sycl::local_accessor; + using Impl = + struct MaskedExtractStridedFunctor; + + const size_t nominal_lws = 256; + const std::size_t masked_extent = masked_nelems; + const size_t lws = std::min(masked_extent, nominal_lws); + const size_t n_groups = ((masked_extent + lws - 1) / lws); + const size_t orthog_extent = static_cast(orthog_nelems); + + sycl::range<2> gRange{orthog_extent, n_groups * lws}; + sycl::range<2> lRange{1, lws}; + + sycl::nd_range<2> ndRange(gRange, lRange); + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); - cgh.parallel_for>( - sycl::range<1>(static_cast(orthog_nelems * masked_nelems)), - MaskedExtractStridedFunctor( - src_p, cumsum_p, dst_p, orthog_nelems, masked_nelems, - orthog_src_dst_indexer, masked_src_indexer, - masked_dst_indexer)); + const std::size_t lacc_size = + std::min(lws, masked_extent) + 1; + LocalAccessorT lacc(lacc_size, cgh); + + cgh.parallel_for( + ndRange, + Impl(src_p, cumsum_p, dst_p, masked_nelems, orthog_src_dst_indexer, + masked_src_indexer, masked_dst_indexer, lacc)); }); return comp_ev; } +template +struct MaskExtractAllSlicesContigFactoryForInt32 +{ + fnT get() + { + fnT fn = masked_extract_all_slices_contig_impl; + return fn; + } +}; + +template +struct MaskExtractAllSlicesContigFactoryForInt64 +{ + fnT get() + { + fnT fn = masked_extract_all_slices_contig_impl; + return fn; + } +}; + template struct MaskExtractAllSlicesStridedFactoryForInt32 { @@ -487,13 +611,17 @@ sycl::event masked_place_some_slices_strided_impl( const Strided1DCyclicIndexer masked_rhs_indexer{0, masked_rhs_size, masked_rhs_stride}; + using KernelName = class masked_place_some_slices_strided_impl_krn< + TwoOffsets_StridedIndexer, StridedIndexer, Strided1DCyclicIndexer, + dataT, indT>; + + sycl::range<1> gRange(static_cast(orthog_nelems * masked_nelems)); + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { 
cgh.depends_on(depends); - cgh.parallel_for>( - sycl::range<1>(static_cast(orthog_nelems * masked_nelems)), + cgh.parallel_for( + gRange, MaskedPlaceStridedFunctor( dst_p, cumsum_p, rhs_p, orthog_nelems, masked_nelems, diff --git a/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp b/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp index 905a65a9a6..e6d91082d3 100644 --- a/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp +++ b/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp @@ -64,6 +64,14 @@ static masked_extract_all_slices_strided_impl_fn_ptr_t masked_extract_all_slices_strided_i64_impl_dispatch_vector [td_ns::num_types]; +using dpctl::tensor::kernels::indexing:: + masked_extract_all_slices_contig_impl_fn_ptr_t; + +static masked_extract_all_slices_contig_impl_fn_ptr_t + masked_extract_all_slices_contig_i32_impl_dispatch_vector[td_ns::num_types]; +static masked_extract_all_slices_contig_impl_fn_ptr_t + masked_extract_all_slices_contig_i64_impl_dispatch_vector[td_ns::num_types]; + using dpctl::tensor::kernels::indexing:: masked_extract_some_slices_strided_impl_fn_ptr_t; @@ -111,6 +119,24 @@ void populate_masked_extract_dispatch_vectors(void) dvb4; dvb4.populate_dispatch_vector( masked_extract_some_slices_strided_i64_impl_dispatch_vector); + + using dpctl::tensor::kernels::indexing:: + MaskExtractAllSlicesContigFactoryForInt32; + td_ns::DispatchVectorBuilder + dvb5; + dvb5.populate_dispatch_vector( + masked_extract_all_slices_contig_i32_impl_dispatch_vector); + + using dpctl::tensor::kernels::indexing:: + MaskExtractAllSlicesContigFactoryForInt64; + td_ns::DispatchVectorBuilder + dvb6; + dvb6.populate_dispatch_vector( + masked_extract_all_slices_contig_i64_impl_dispatch_vector); } std::pair @@ -223,50 +249,67 @@ py_extract(const dpctl::tensor::usm_ndarray &src, sycl::event extract_ev; std::vector host_task_events{}; if (axis_start == 0 && axis_end == src_nd) { - // empty orthogonal directions - auto fn = - (use_i32) - ? masked_extract_all_slices_strided_i32_impl_dispatch_vector - [src_typeid] - : masked_extract_all_slices_strided_i64_impl_dispatch_vector - [src_typeid]; - assert(dst_shape_vec.size() == 1); assert(dst_strides_vec.size() == 1); - using dpctl::tensor::offset_utils::device_allocate_and_pack; - const auto &ptr_size_event_tuple1 = - device_allocate_and_pack( - exec_q, host_task_events, src_shape_vec, src_strides_vec); - py::ssize_t *packed_src_shape_strides = - std::get<0>(ptr_size_event_tuple1); - if (packed_src_shape_strides == nullptr) { - throw std::runtime_error("Unable to allocated device memory"); - } - sycl::event copy_src_shape_strides_ev = - std::get<2>(ptr_size_event_tuple1); - - std::vector all_deps; - all_deps.reserve(depends.size() + 1); - all_deps.insert(all_deps.end(), depends.begin(), depends.end()); - all_deps.push_back(copy_src_shape_strides_ev); - - assert(all_deps.size() == depends.size() + 1); + if (src.is_c_contiguous()) { + auto fn = + (use_i32) + ? 
masked_extract_all_slices_contig_i32_impl_dispatch_vector + [src_typeid] + : masked_extract_all_slices_contig_i64_impl_dispatch_vector + [src_typeid]; - extract_ev = fn(exec_q, cumsum_sz, src_data_p, cumsum_data_p, - dst_data_p, src_nd, packed_src_shape_strides, - dst_shape_vec[0], dst_strides_vec[0], all_deps); + extract_ev = + fn(exec_q, cumsum_sz, src_data_p, cumsum_data_p, dst_data_p, + dst_shape_vec[0], dst_strides_vec[0], depends); - sycl::event cleanup_tmp_allocations_ev = - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(extract_ev); - const auto &ctx = exec_q.get_context(); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([ctx, packed_src_shape_strides] { - sycl_free_noexcept(packed_src_shape_strides, ctx); + // + host_task_events.push_back(extract_ev); + } + else { + // empty orthogonal directions + auto fn = + (use_i32) + ? masked_extract_all_slices_strided_i32_impl_dispatch_vector + [src_typeid] + : masked_extract_all_slices_strided_i64_impl_dispatch_vector + [src_typeid]; + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + const auto &ptr_size_event_tuple1 = + device_allocate_and_pack( + exec_q, host_task_events, src_shape_vec, src_strides_vec); + py::ssize_t *packed_src_shape_strides = + std::get<0>(ptr_size_event_tuple1); + if (packed_src_shape_strides == nullptr) { + throw std::runtime_error("Unable to allocated device memory"); + } + sycl::event copy_src_shape_strides_ev = + std::get<2>(ptr_size_event_tuple1); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_src_shape_strides_ev); + + assert(all_deps.size() == depends.size() + 1); + + extract_ev = fn(exec_q, cumsum_sz, src_data_p, cumsum_data_p, + dst_data_p, src_nd, packed_src_shape_strides, + dst_shape_vec[0], dst_strides_vec[0], all_deps); + + sycl::event cleanup_tmp_allocations_ev = + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(extract_ev); + const auto &ctx = exec_q.get_context(); + using dpctl::tensor::alloc_utils::sycl_free_noexcept; + cgh.host_task([ctx, packed_src_shape_strides] { + sycl_free_noexcept(packed_src_shape_strides, ctx); + }); }); - }); - host_task_events.push_back(cleanup_tmp_allocations_ev); + host_task_events.push_back(cleanup_tmp_allocations_ev); + } } else { // non-empty othogonal directions From 5161b8e30b7ab5f99b8fbb0b3d58a5e85b748adb Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 6 Dec 2024 15:37:28 -0600 Subject: [PATCH 2/8] MaskedPlaced kernel optimized Use local_accessor to improve memory bandwidth of the work-group. 
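For reference, a minimal standalone sketch of the local-memory pattern applied here (illustrative names and sizes, not the actual MaskedPlaceStridedFunctor): each work-group cooperatively loads lws + 1 consecutive cumulative-sum values into a sycl::local_accessor, so every work-item can read both cumsum[i - 1] and cumsum[i] from shared local memory instead of issuing two global loads.

```
#include <sycl/sycl.hpp>
#include <cstdint>
#include <vector>

int main()
{
    sycl::queue q;
    constexpr std::size_t n = 1024;

    // stand-in for the inclusive scan of a boolean mask
    std::vector<std::int64_t> cumsum_host(n);
    for (std::size_t i = 0; i < n; ++i)
        cumsum_host[i] = (i + 1) / 2;

    std::int64_t *cumsum = sycl::malloc_device<std::int64_t>(n, q);
    std::int32_t *mask_set = sycl::malloc_shared<std::int32_t>(n, q);
    q.memcpy(cumsum, cumsum_host.data(), n * sizeof(std::int64_t)).wait();

    constexpr std::size_t lws = 256;
    const std::size_t n_groups = (n + lws - 1) / lws;

    q.submit([&](sycl::handler &cgh) {
         // one extra slot holds the value preceding the block (0 for block 0)
         sycl::local_accessor<std::int64_t, 1> lacc(lws + 1, cgh);
         cgh.parallel_for(
             sycl::nd_range<1>{sycl::range<1>{n_groups * lws},
                               sycl::range<1>{lws}},
             [=](sycl::nd_item<1> ndit) {
                 const std::size_t block_start = ndit.get_group(0) * lws;
                 const std::size_t l_i = ndit.get_local_id(0);

                 // cooperative load of cumsum[block_start - 1 .. block_start + lws - 1]
                 for (std::size_t i = l_i; i < lws + 1; i += lws) {
                     const std::size_t offset = block_start + i;
                     lacc[i] = (offset == 0)   ? std::int64_t(0)
                               : (offset <= n) ? cumsum[offset - 1]
                                               : cumsum[n - 1] + 1;
                 }
                 sycl::group_barrier(ndit.get_group());

                 const std::size_t gid = block_start + l_i;
                 if (gid < n) {
                     // element gid is selected iff the running count grows here
                     mask_set[gid] = (lacc[l_i + 1] == lacc[l_i] + 1);
                 }
             });
     }).wait();

    sycl::free(cumsum, q);
    sycl::free(mask_set, q);
    return 0;
}
```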
--- .../kernels/boolean_advanced_indexing.hpp | 129 ++++++++++++------ 1 file changed, 87 insertions(+), 42 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp b/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp index 5fd6102bbb..6c96ea4a43 100644 --- a/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp +++ b/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp @@ -134,51 +134,65 @@ template + typename indT, + typename LocalAccessorT> struct MaskedPlaceStridedFunctor { MaskedPlaceStridedFunctor(char *dst_data_p, const char *cumsum_data_p, const char *rhs_data_p, - size_t orthog_iter_size, size_t masked_iter_size, const OrthogIndexerT &orthog_dst_rhs_indexer_, const MaskedDstIndexerT &masked_dst_indexer_, - const MaskedRhsIndexerT &masked_rhs_indexer_) + const MaskedRhsIndexerT &masked_rhs_indexer_, + const LocalAccessorT &lacc_) : dst_cp(dst_data_p), cumsum_cp(cumsum_data_p), rhs_cp(rhs_data_p), - orthog_nelems(orthog_iter_size), masked_nelems(masked_iter_size), + masked_nelems(masked_iter_size), orthog_dst_rhs_indexer(orthog_dst_rhs_indexer_), masked_dst_indexer(masked_dst_indexer_), - masked_rhs_indexer(masked_rhs_indexer_) + masked_rhs_indexer(masked_rhs_indexer_), lacc(lacc_) { + static_assert( + std::is_same_v); } - void operator()(sycl::id<1> idx) const + void operator()(sycl::nd_item<2> ndit) const { dataT *dst_data = reinterpret_cast(dst_cp); const indT *cumsum_data = reinterpret_cast(cumsum_cp); const dataT *rhs_data = reinterpret_cast(rhs_cp); - size_t global_i = idx[0]; - size_t orthog_i = global_i / masked_nelems; - size_t masked_i = global_i - masked_nelems * orthog_i; - - indT current_running_count = cumsum_data[masked_i]; - bool mask_set = - (masked_i == 0) - ? (current_running_count == 1) - : (current_running_count == cumsum_data[masked_i - 1] + 1); - - // src[i, j] = rhs[cumsum[i] - 1, j] if cumsum[i] == ((i > 0) ? - // cumsum[i-1] - // + 1 : 1) - if (mask_set) { - auto orthog_offsets = - orthog_dst_rhs_indexer(static_cast(orthog_i)); - - size_t total_dst_offset = masked_dst_indexer(masked_i) + - orthog_offsets.get_first_offset(); - size_t total_rhs_offset = + const std::size_t orthog_i = ndit.get_global_id(0); + const std::size_t group_i = ndit.get_group(1); + const std::uint32_t l_i = ndit.get_local_id(1); + const std::uint32_t lws = ndit.get_local_range(1); + + const size_t masked_block_start = group_i * lws; + const size_t masked_i = masked_block_start + l_i; + + for (std::uint32_t i = l_i; i < lacc.size(); i += lws) { + const size_t offset = masked_block_start + i; + lacc[i] = (offset == 0) ? indT(0) + : (offset - 1 < masked_nelems) + ? cumsum_data[offset - 1] + : cumsum_data[masked_nelems - 1] + 1; + } + + sycl::group_barrier(ndit.get_group()); + + const indT current_running_count = lacc[l_i + 1]; + const bool mask_set = (masked_i == 0) + ? (current_running_count == 1) + : (current_running_count == lacc[l_i] + 1); + + // src[i, j] = rhs[cumsum[i] - 1, j] + // if cumsum[i] == ((i > 0) ? 
cumsum[i-1] + 1 : 1) + if (mask_set && (masked_i < masked_nelems)) { + const auto &orthog_offsets = orthog_dst_rhs_indexer(orthog_i); + + const size_t total_dst_offset = masked_dst_indexer(masked_i) + + orthog_offsets.get_first_offset(); + const size_t total_rhs_offset = masked_rhs_indexer(current_running_count - 1) + orthog_offsets.get_second_offset(); @@ -190,8 +204,7 @@ struct MaskedPlaceStridedFunctor char *dst_cp = nullptr; const char *cumsum_cp = nullptr; const char *rhs_cp = nullptr; - size_t orthog_nelems = 0; - size_t masked_nelems = 0; + const size_t masked_nelems = 0; // has nd, shape, dst_strides, rhs_strides for // dimensions that ARE NOT masked const OrthogIndexerT orthog_dst_rhs_indexer; @@ -200,6 +213,7 @@ struct MaskedPlaceStridedFunctor const MaskedDstIndexerT masked_dst_indexer; // has 1, rhs_strides for dimensions that ARE masked const MaskedRhsIndexerT masked_rhs_indexer; + LocalAccessorT lacc; }; // ======= Masked extraction ================================ @@ -537,18 +551,35 @@ sycl::event masked_place_all_slices_strided_impl( const StridedIndexer masked_dst_indexer(nd, 0, packed_dst_shape_strides); const Strided1DCyclicIndexer masked_rhs_indexer(0, rhs_size, rhs_stride); + using KernelName = class masked_place_all_slices_strided_impl_krn< + TwoZeroOffsets_Indexer, StridedIndexer, Strided1DCyclicIndexer, dataT, + indT>; + + constexpr std::size_t nominal_lws = 256; + const std::size_t masked_extent = iteration_size; + const std::size_t lws = std::min(masked_extent, nominal_lws); + + const std::size_t n_groups = (masked_extent + lws - 1) / lws; + + sycl::range<2> gRange{1, n_groups * lws}; + sycl::range<2> lRange{1, lws}; + sycl::nd_range<2> ndRange{gRange, lRange}; + + using LocalAccessorT = sycl::local_accessor; + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); - cgh.parallel_for>( - sycl::range<1>(static_cast(iteration_size)), + const std::size_t lacc_size = std::min(masked_extent, lws) + 1; + LocalAccessorT lacc(lacc_size, cgh); + + cgh.parallel_for( + ndRange, MaskedPlaceStridedFunctor( - dst_p, cumsum_p, rhs_p, 1, iteration_size, - orthog_dst_rhs_indexer, masked_dst_indexer, - masked_rhs_indexer)); + Strided1DCyclicIndexer, dataT, indT, + LocalAccessorT>( + dst_p, cumsum_p, rhs_p, iteration_size, orthog_dst_rhs_indexer, + masked_dst_indexer, masked_rhs_indexer, lacc)); }); return comp_ev; @@ -615,18 +646,32 @@ sycl::event masked_place_some_slices_strided_impl( TwoOffsets_StridedIndexer, StridedIndexer, Strided1DCyclicIndexer, dataT, indT>; - sycl::range<1> gRange(static_cast(orthog_nelems * masked_nelems)); + constexpr std::size_t nominal_lws = 256; + const std::size_t orthog_extent = orthog_nelems; + const std::size_t masked_extent = masked_nelems; + const std::size_t lws = std::min(masked_extent, nominal_lws); + + const std::size_t n_groups = (masked_extent + lws - 1) / lws; + + sycl::range<2> gRange{orthog_extent, n_groups * lws}; + sycl::range<2> lRange{1, lws}; + sycl::nd_range<2> ndRange{gRange, lRange}; + + using LocalAccessorT = sycl::local_accessor; sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); + const std::size_t lacc_size = std::min(masked_extent, lws) + 1; + LocalAccessorT lacc(lacc_size, cgh); + cgh.parallel_for( - gRange, + ndRange, MaskedPlaceStridedFunctor( - dst_p, cumsum_p, rhs_p, orthog_nelems, masked_nelems, - orthog_dst_rhs_indexer, masked_dst_indexer, - masked_rhs_indexer)); + Strided1DCyclicIndexer, dataT, indT, + LocalAccessorT>( + dst_p, cumsum_p, rhs_p, 
masked_nelems, orthog_dst_rhs_indexer, + masked_dst_indexer, masked_rhs_indexer, lacc)); }); return comp_ev; From 137c309b1a51b834b7bcef5ef96725f5f76c6b1e Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 6 Dec 2024 15:38:52 -0600 Subject: [PATCH 3/8] Optimization to kernel of tensor.nonzero Use shared local memory to improve global memory bandwidth. --- .../kernels/boolean_advanced_indexing.hpp | 61 +++++++++++++------ 1 file changed, 44 insertions(+), 17 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp b/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp index 6c96ea4a43..059b491cc8 100644 --- a/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp +++ b/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp @@ -744,31 +744,58 @@ sycl::event non_zero_indexes_impl(sycl::queue &exec_q, const indT1 *cumsum_data = reinterpret_cast(cumsum_cp); indT2 *indexes_data = reinterpret_cast(indexes_cp); + constexpr std::size_t nominal_lws = 256u; + const std::size_t masked_extent = iter_size; + const std::size_t lws = std::min(masked_extent, nominal_lws); + + const std::size_t n_groups = (masked_extent + lws - 1) / lws; + sycl::range<1> gRange{n_groups * lws}; + sycl::range<1> lRange{lws}; + + sycl::nd_range<1> ndRange{gRange, lRange}; + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); - cgh.parallel_for>( - sycl::range<1>(iter_size), [=](sycl::id<1> idx) { - auto i = idx[0]; - auto cs_curr_val = cumsum_data[i] - 1; - auto cs_prev_val = (i > 0) ? cumsum_data[i - 1] : indT1(0); - bool cond = (cs_curr_val == cs_prev_val); + const std::size_t lacc_size = std::min(lws, masked_extent) + 1; + sycl::local_accessor lacc(lacc_size, cgh); + + using KernelName = class non_zero_indexes_krn; + cgh.parallel_for(ndRange, [=](sycl::nd_item<1> ndit) { + const std::size_t group_i = ndit.get_group(0); + const std::uint32_t l_i = ndit.get_local_id(0); + const std::uint32_t lws = ndit.get_local_range(0); + + const std::size_t masked_block_start = group_i * lws; + + for (std::uint32_t i = l_i; i < lacc.size(); i += lws) { + const size_t offset = masked_block_start + i; + lacc[i] = (offset == 0) ? indT1(0) + : (offset - 1 < masked_extent) + ? cumsum_data[offset - 1] + : cumsum_data[masked_extent - 1] + 1; + } + + sycl::group_barrier(ndit.get_group()); + + const std::size_t i = masked_block_start + l_i; + const auto cs_val = lacc[l_i]; + const bool cond = (lacc[l_i + 1] == cs_val + 1); + + if (cond && (i < masked_extent)) { ssize_t i_ = static_cast(i); for (int dim = nd; --dim > 0;) { - auto sd = mask_shape[dim]; - ssize_t q = i_ / sd; - ssize_t r = (i_ - q * sd); - if (cond) { - indexes_data[cs_curr_val + dim * nz_elems] = - static_cast(r); - } + const auto sd = mask_shape[dim]; + const ssize_t q = i_ / sd; + const ssize_t r = (i_ - q * sd); + indexes_data[cs_val + dim * nz_elems] = + static_cast(r); i_ = q; } - if (cond) { - indexes_data[cs_curr_val] = static_cast(i_); - } - }); + indexes_data[cs_val] = static_cast(i_); + } + }); }); return comp_ev; From ec924c39478dd1ede127eb285ea2ced025b3d4ba Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Sat, 7 Dec 2024 14:29:44 -0600 Subject: [PATCH 4/8] Functors for masked extract/place changed to store typed pointers Also implement get_lws to choose local-work-group-size from given choices I0 > I1 > I2 > ..., if n > I0, use I0, if n > I1 use I1, and so on. 
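For illustration, the selection rule as a small standalone sketch (the helper name below is made up; the in-tree helpers are the get_lws/_get_lws_impl pair added in this patch, with choices 256 > 128 > 64):

```
#include <cstddef>
#include <cstdio>

// use the first (largest) choice I with n >= I;
// if n is smaller than every choice, fall back to the last (smallest) one
template <std::size_t I, std::size_t... IR>
std::size_t get_lws_sketch(std::size_t n)
{
    if constexpr (sizeof...(IR) == 0) {
        return I;
    }
    else {
        return (n < I) ? get_lws_sketch<IR...>(n) : I;
    }
}

int main()
{
    std::printf("%zu\n", get_lws_sketch<256, 128, 64>(10000)); // 256
    std::printf("%zu\n", get_lws_sketch<256, 128, 64>(200));   // 128
    std::printf("%zu\n", get_lws_sketch<256, 128, 64>(100));   // 64
    std::printf("%zu\n", get_lws_sketch<256, 128, 64>(10));    // 64
    return 0;
}
```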
--- .../kernels/boolean_advanced_indexing.hpp | 155 +++++++++++------- 1 file changed, 97 insertions(+), 58 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp b/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp index 059b491cc8..f20d269bd0 100644 --- a/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp +++ b/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp @@ -52,15 +52,15 @@ template struct MaskedExtractStridedFunctor { - MaskedExtractStridedFunctor(const char *src_data_p, - const char *cumsum_data_p, - char *dst_data_p, + MaskedExtractStridedFunctor(const dataT *src_data_p, + const indT *cumsum_data_p, + dataT *dst_data_p, size_t masked_iter_size, const OrthogIndexerT &orthog_src_dst_indexer_, const MaskedSrcIndexerT &masked_src_indexer_, const MaskedDstIndexerT &masked_dst_indexer_, const LocalAccessorT &lacc_) - : src_cp(src_data_p), cumsum_cp(cumsum_data_p), dst_cp(dst_data_p), + : src(src_data_p), cumsum(cumsum_data_p), dst(dst_data_p), masked_nelems(masked_iter_size), orthog_src_dst_indexer(orthog_src_dst_indexer_), masked_src_indexer(masked_src_indexer_), @@ -72,24 +72,19 @@ struct MaskedExtractStridedFunctor void operator()(sycl::nd_item<2> ndit) const { - const dataT *src_data = reinterpret_cast(src_cp); - dataT *dst_data = reinterpret_cast(dst_cp); - const indT *cumsum_data = reinterpret_cast(cumsum_cp); - - const size_t orthog_i = ndit.get_global_id(0); - const size_t group_i = ndit.get_group(1); + const std::size_t orthog_i = ndit.get_global_id(0); const std::uint32_t l_i = ndit.get_local_id(1); const std::uint32_t lws = ndit.get_local_range(1); - const size_t masked_block_start = group_i * lws; - const size_t masked_i = masked_block_start + l_i; + const std::size_t masked_i = ndit.get_global_id(1); + const std::size_t masked_block_start = masked_i - l_i; + const std::size_t max_offset = masked_nelems + 1; for (std::uint32_t i = l_i; i < lacc.size(); i += lws) { const size_t offset = masked_block_start + i; - lacc[i] = (offset == 0) ? indT(0) - : (offset - 1 < masked_nelems) - ? cumsum_data[offset - 1] - : cumsum_data[masked_nelems - 1] + 1; + lacc[i] = (offset == 0) ? indT(0) + : (offset < max_offset) ? 
cumsum[offset - 1] + : cumsum[masked_nelems - 1] + 1; } sycl::group_barrier(ndit.get_group()); @@ -110,14 +105,14 @@ struct MaskedExtractStridedFunctor masked_dst_indexer(current_running_count - 1) + orthog_offsets.get_second_offset(); - dst_data[total_dst_offset] = src_data[total_src_offset]; + dst[total_dst_offset] = src[total_src_offset]; } } private: - const char *src_cp = nullptr; - const char *cumsum_cp = nullptr; - char *dst_cp = nullptr; + const dataT *src = nullptr; + const indT *cumsum = nullptr; + dataT *dst = nullptr; const size_t masked_nelems = 0; // has nd, shape, src_strides, dst_strides for // dimensions that ARE NOT masked @@ -138,15 +133,15 @@ template struct MaskedPlaceStridedFunctor { - MaskedPlaceStridedFunctor(char *dst_data_p, - const char *cumsum_data_p, - const char *rhs_data_p, + MaskedPlaceStridedFunctor(dataT *dst_data_p, + const indT *cumsum_data_p, + const dataT *rhs_data_p, size_t masked_iter_size, const OrthogIndexerT &orthog_dst_rhs_indexer_, const MaskedDstIndexerT &masked_dst_indexer_, const MaskedRhsIndexerT &masked_rhs_indexer_, const LocalAccessorT &lacc_) - : dst_cp(dst_data_p), cumsum_cp(cumsum_data_p), rhs_cp(rhs_data_p), + : dst(dst_data_p), cumsum(cumsum_data_p), rhs(rhs_data_p), masked_nelems(masked_iter_size), orthog_dst_rhs_indexer(orthog_dst_rhs_indexer_), masked_dst_indexer(masked_dst_indexer_), @@ -158,24 +153,19 @@ struct MaskedPlaceStridedFunctor void operator()(sycl::nd_item<2> ndit) const { - dataT *dst_data = reinterpret_cast(dst_cp); - const indT *cumsum_data = reinterpret_cast(cumsum_cp); - const dataT *rhs_data = reinterpret_cast(rhs_cp); - const std::size_t orthog_i = ndit.get_global_id(0); - const std::size_t group_i = ndit.get_group(1); const std::uint32_t l_i = ndit.get_local_id(1); const std::uint32_t lws = ndit.get_local_range(1); - const size_t masked_block_start = group_i * lws; - const size_t masked_i = masked_block_start + l_i; + const size_t masked_i = ndit.get_global_id(1); + const size_t masked_block_start = masked_i - l_i; + const std::size_t max_offset = masked_nelems + 1; for (std::uint32_t i = l_i; i < lacc.size(); i += lws) { const size_t offset = masked_block_start + i; - lacc[i] = (offset == 0) ? indT(0) - : (offset - 1 < masked_nelems) - ? cumsum_data[offset - 1] - : cumsum_data[masked_nelems - 1] + 1; + lacc[i] = (offset == 0) ? indT(0) + : (offset < max_offset) ? cumsum[offset - 1] + : cumsum[masked_nelems - 1] + 1; } sycl::group_barrier(ndit.get_group()); @@ -196,14 +186,14 @@ struct MaskedPlaceStridedFunctor masked_rhs_indexer(current_running_count - 1) + orthog_offsets.get_second_offset(); - dst_data[total_dst_offset] = rhs_data[total_rhs_offset]; + dst[total_dst_offset] = rhs[total_rhs_offset]; } } private: - char *dst_cp = nullptr; - const char *cumsum_cp = nullptr; - const char *rhs_cp = nullptr; + dataT *dst = nullptr; + const indT *cumsum = nullptr; + const dataT *rhs = nullptr; const size_t masked_nelems = 0; // has nd, shape, dst_strides, rhs_strides for // dimensions that ARE NOT masked @@ -218,6 +208,30 @@ struct MaskedPlaceStridedFunctor // ======= Masked extraction ================================ +namespace +{ + +template +std::size_t _get_lws_impl(std::size_t n) +{ + if constexpr (sizeof...(IR) == 0) { + return I; + } + else { + return (n < I) ? 
_get_lws_impl(n) : I; + } +} + +std::size_t get_lws(std::size_t n) +{ + constexpr std::size_t lws0 = 256u; + constexpr std::size_t lws1 = 128u; + constexpr std::size_t lws2 = 64u; + return _get_lws_impl(n); +} + +} // end of anonymous namespace + template class masked_extract_all_slices_contig_impl_krn; @@ -258,9 +272,10 @@ sycl::event masked_extract_all_slices_contig_impl( Strided1DIndexer, dataT, indT, LocalAccessorT>; - constexpr std::size_t nominal_lws = 256; const std::size_t masked_extent = iteration_size; - const std::size_t lws = std::min(masked_extent, nominal_lws); + + const std::size_t lws = get_lws(masked_extent); + const std::size_t n_groups = (iteration_size + lws - 1) / lws; sycl::range<2> gRange{1, n_groups * lws}; @@ -268,6 +283,10 @@ sycl::event masked_extract_all_slices_contig_impl( sycl::nd_range<2> ndRange(gRange, lRange); + const dataT *src_tp = reinterpret_cast(src_p); + const indT *cumsum_tp = reinterpret_cast(cumsum_p); + dataT *dst_tp = reinterpret_cast(dst_p); + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); @@ -275,9 +294,9 @@ sycl::event masked_extract_all_slices_contig_impl( LocalAccessorT lacc(lacc_size, cgh); cgh.parallel_for( - ndRange, - Impl(src_p, cumsum_p, dst_p, masked_extent, orthog_src_dst_indexer, - masked_src_indexer, masked_dst_indexer, lacc)); + ndRange, Impl(src_tp, cumsum_tp, dst_tp, masked_extent, + orthog_src_dst_indexer, masked_src_indexer, + masked_dst_indexer, lacc)); }); return comp_ev; @@ -332,9 +351,10 @@ sycl::event masked_extract_all_slices_strided_impl( StridedIndexer, Strided1DIndexer, dataT, indT, LocalAccessorT>; - constexpr std::size_t nominal_lws = 256; const std::size_t masked_nelems = iteration_size; - const std::size_t lws = std::min(masked_nelems, nominal_lws); + + const std::size_t lws = get_lws(masked_nelems); + const std::size_t n_groups = (masked_nelems + lws - 1) / lws; sycl::range<2> gRange{1, n_groups * lws}; @@ -342,6 +362,10 @@ sycl::event masked_extract_all_slices_strided_impl( sycl::nd_range<2> ndRange(gRange, lRange); + const dataT *src_tp = reinterpret_cast(src_p); + const indT *cumsum_tp = reinterpret_cast(cumsum_p); + dataT *dst_tp = reinterpret_cast(dst_p); + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); @@ -349,9 +373,9 @@ sycl::event masked_extract_all_slices_strided_impl( LocalAccessorT lacc(lacc_size, cgh); cgh.parallel_for( - ndRange, - Impl(src_p, cumsum_p, dst_p, iteration_size, orthog_src_dst_indexer, - masked_src_indexer, masked_dst_indexer, lacc)); + ndRange, Impl(src_tp, cumsum_tp, dst_tp, iteration_size, + orthog_src_dst_indexer, masked_src_indexer, + masked_dst_indexer, lacc)); }); return comp_ev; @@ -422,9 +446,10 @@ sycl::event masked_extract_some_slices_strided_impl( StridedIndexer, Strided1DIndexer, dataT, indT, LocalAccessorT>; - const size_t nominal_lws = 256; const std::size_t masked_extent = masked_nelems; - const size_t lws = std::min(masked_extent, nominal_lws); + + const std::size_t lws = get_lws(masked_extent); + const size_t n_groups = ((masked_extent + lws - 1) / lws); const size_t orthog_extent = static_cast(orthog_nelems); @@ -433,6 +458,10 @@ sycl::event masked_extract_some_slices_strided_impl( sycl::nd_range<2> ndRange(gRange, lRange); + const dataT *src_tp = reinterpret_cast(src_p); + const indT *cumsum_tp = reinterpret_cast(cumsum_p); + dataT *dst_tp = reinterpret_cast(dst_p); + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); @@ -441,9 +470,9 @@ sycl::event 
masked_extract_some_slices_strided_impl( LocalAccessorT lacc(lacc_size, cgh); cgh.parallel_for( - ndRange, - Impl(src_p, cumsum_p, dst_p, masked_nelems, orthog_src_dst_indexer, - masked_src_indexer, masked_dst_indexer, lacc)); + ndRange, Impl(src_tp, cumsum_tp, dst_tp, masked_nelems, + orthog_src_dst_indexer, masked_src_indexer, + masked_dst_indexer, lacc)); }); return comp_ev; @@ -567,6 +596,10 @@ sycl::event masked_place_all_slices_strided_impl( using LocalAccessorT = sycl::local_accessor; + dataT *dst_tp = reinterpret_cast(dst_p); + const dataT *rhs_tp = reinterpret_cast(rhs_p); + const indT *cumsum_tp = reinterpret_cast(cumsum_p); + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); @@ -578,8 +611,9 @@ sycl::event masked_place_all_slices_strided_impl( MaskedPlaceStridedFunctor( - dst_p, cumsum_p, rhs_p, iteration_size, orthog_dst_rhs_indexer, - masked_dst_indexer, masked_rhs_indexer, lacc)); + dst_tp, cumsum_tp, rhs_tp, iteration_size, + orthog_dst_rhs_indexer, masked_dst_indexer, masked_rhs_indexer, + lacc)); }); return comp_ev; @@ -659,6 +693,10 @@ sycl::event masked_place_some_slices_strided_impl( using LocalAccessorT = sycl::local_accessor; + dataT *dst_tp = reinterpret_cast(dst_p); + const dataT *rhs_tp = reinterpret_cast(rhs_p); + const indT *cumsum_tp = reinterpret_cast(cumsum_p); + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); @@ -670,8 +708,9 @@ sycl::event masked_place_some_slices_strided_impl( MaskedPlaceStridedFunctor( - dst_p, cumsum_p, rhs_p, masked_nelems, orthog_dst_rhs_indexer, - masked_dst_indexer, masked_rhs_indexer, lacc)); + dst_tp, cumsum_tp, rhs_tp, masked_nelems, + orthog_dst_rhs_indexer, masked_dst_indexer, masked_rhs_indexer, + lacc)); }); return comp_ev; From 3eb956e2252a9907991cd6e718363ba290371ca8 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Mon, 9 Dec 2024 10:25:50 -0600 Subject: [PATCH 5/8] Inclusive scan iter chunk update kernel (generic and 1d) improved The chunk update kernels processed consecutive elements in contiguous memory, hence sub-group memory access pattern was sub-optimal (no coalescing). This PR changes these kernels to process n_wi elements which are sub-group size apart, improving memory access patern. 
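Schematically (with made-up sizes, block offset omitted), the addresses one sub-group touches at a single unroll step change from an n_wi-strided pattern to consecutive ones:

```
#include <cstddef>
#include <cstdio>

int main()
{
    constexpr std::size_t n_wi = 4; // elements updated per work-item
    constexpr std::size_t lws = 8;  // work-items per sub-group
    constexpr std::size_t i = 1;    // one fixed unroll step

    std::printf("before:");
    for (std::size_t lid = 0; lid < lws; ++lid)
        std::printf(" %zu", lid * n_wi + i); // stride n_wi -> no coalescing
    std::printf("\nafter: ");
    for (std::size_t lid = 0; lid < lws; ++lid)
        std::printf(" %zu", i * lws + lid); // consecutive -> coalesced
    std::printf("\n");
    return 0;
}
```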
Running a micro-benchmark based on code from gh-1249 (for shape =(n, n,) where n = 4096) with this change: ``` (dev_dpctl) opavlyk@mtl-world:~/repos/dpctl$ ONEAPI_DEVICE_SELECTOR=cuda:gpu python index.py 0.010703916665753004 (dev_dpctl) opavlyk@mtl-world:~/repos/dpctl$ ONEAPI_DEVICE_SELECTOR=cuda:gpu python index.py 0.01079747307597211 (dev_dpctl) opavlyk@mtl-world:~/repos/dpctl$ ONEAPI_DEVICE_SELECTOR=cuda:gpu python index.py 0.010864820314088353 (dev_dpctl) opavlyk@mtl-world:~/repos/dpctl$ python index.py 0.023878061203975922 (dev_dpctl) opavlyk@mtl-world:~/repos/dpctl$ python index.py 0.023666468500677083 ``` while before: ``` (dev_dpctl) opavlyk@mtl-world:~/repos/dpctl$ ONEAPI_DEVICE_SELECTOR=cuda:gpu python index.py 0.011415911812542213 (dev_dpctl) opavlyk@mtl-world:~/repos/dpctl$ ONEAPI_DEVICE_SELECTOR=cuda:gpu python index.py 0.011722088705196424 (dev_dpctl) opavlyk@mtl-world:~/repos/dpctl$ ONEAPI_DEVICE_SELECTOR=level_zero:gpu python index.py 0.030126182353813893 (dev_dpctl) opavlyk@mtl-world:~/repos/dpctl$ ONEAPI_DEVICE_SELECTOR=level_zero:gpu python index.py 0.030459783371986338 ``` Running the same code using NumPy (same size): ``` (dev_dpctl) opavlyk@mtl-world:~/repos/dpctl$ python index_np.py 0.01416253090698134 (dev_dpctl) opavlyk@mtl-world:~/repos/dpctl$ python index_np.py 0.014979530811413296 ``` The reason Level-Zero device is slower has to do with slow allocation/deallocation bug. OpenCL device has better timing. With this change: ``` (dev_dpctl) opavlyk@mtl-world:~/repos/dpctl$ ONEAPI_DEVICE_SELECTOR=opencl:gpu python index.py 0.015038836885381627 (dev_dpctl) opavlyk@mtl-world:~/repos/dpctl$ ONEAPI_DEVICE_SELECTOR=opencl:gpu python index.py 0.01527448468496678 ``` before: ``` (dev_dpctl) opavlyk@mtl-world:~/repos/dpctl$ ONEAPI_DEVICE_SELECTOR=opencl:gpu python index.py 0.01758851639115838 (dev_dpctl) opavlyk@mtl-world:~/repos/dpctl$ ONEAPI_DEVICE_SELECTOR=opencl:gpu python index.py 0.017089676241286926 ``` --- .../include/kernels/accumulators.hpp | 88 ++++++++++++++----- 1 file changed, 66 insertions(+), 22 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/accumulators.hpp b/dpctl/tensor/libtensor/include/kernels/accumulators.hpp index 64ed566151..f31547c87d 100644 --- a/dpctl/tensor/libtensor/include/kernels/accumulators.hpp +++ b/dpctl/tensor/libtensor/include/kernels/accumulators.hpp @@ -404,25 +404,47 @@ sycl::event inclusive_scan_iter_1d(sycl::queue &exec_q, const size_t src_size = stack_elem.get_size(); outputT *local_scans = stack_elem.get_local_scans_ptr(); + using UpdateKernelName = + class inclusive_scan_1d_iter_chunk_update_krn< + inputT, outputT, n_wi, IndexerT, TransformerT, + NoOpTransformerT, ScanOpT, include_initial>; + + const auto &kernel_id = sycl::get_kernel_id(); + + auto const &ctx = exec_q.get_context(); + auto const &dev = exec_q.get_device(); + auto kb = sycl::get_kernel_bundle( + ctx, {dev}, {kernel_id}); + + auto krn = kb.get_kernel(kernel_id); + + const std::uint32_t sg_size = krn.template get_info< + sycl::info::kernel_device_specific::max_sub_group_size>(dev); + // output[ chunk_size * (i + 1) + j] += temp[i] dependent_event = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(dependent_event); + cgh.use_kernel_bundle(kb); constexpr nwiT updates_per_wi = n_wi; - const size_t n_items = ceiling_quotient(src_size, n_wi); + const size_t n_items = + ceiling_quotient(src_size, sg_size * n_wi) * + sg_size; - using UpdateKernelName = - class inclusive_scan_1d_iter_chunk_update_krn< - inputT, outputT, n_wi, IndexerT, TransformerT, 
- NoOpTransformerT, ScanOpT, include_initial>; + sycl::range<1> gRange{n_items}; + sycl::range<1> lRange{sg_size}; + sycl::nd_range<1> ndRange{gRange, lRange}; cgh.parallel_for( - {n_items}, [chunk_size, src, src_size, local_scans, scan_op, - identity](auto wiid) { - const size_t gid = n_wi * wiid[0]; + ndRange, [chunk_size, src, src_size, local_scans, scan_op, + identity](sycl::nd_item<1> ndit) { + const std::uint32_t lws = ndit.get_local_range(0); + const size_t block_offset = + ndit.get_group(0) * n_wi * lws; #pragma unroll for (size_t i = 0; i < updates_per_wi; ++i) { - const size_t src_id = gid + i; + const size_t src_id = + block_offset + ndit.get_local_id(0) + i * lws; if (src_id < src_size) { const size_t scan_id = (src_id / chunk_size); src[src_id] = @@ -661,33 +683,55 @@ sycl::event inclusive_scan_iter(sycl::queue &exec_q, outputT *local_scans = stack_elem.get_local_scans_ptr(); size_t local_stride = stack_elem.get_local_stride(); + using UpdateKernelName = class inclusive_scan_iter_chunk_update_krn< + inputT, outputT, n_wi, TransformerT, NoOpTransformerT, ScanOpT, + include_initial>; + + const auto &kernel_id = sycl::get_kernel_id(); + + auto const &ctx = exec_q.get_context(); + auto const &dev = exec_q.get_device(); + auto kb = sycl::get_kernel_bundle( + ctx, {dev}, {kernel_id}); + + auto krn = kb.get_kernel(kernel_id); + + const std::uint32_t sg_size = krn.template get_info< + sycl::info::kernel_device_specific::max_sub_group_size>(dev); + constexpr nwiT updates_per_wi = n_wi; const size_t update_nelems = - ceiling_quotient(src_size, updates_per_wi); + ceiling_quotient(src_size, sg_size * updates_per_wi) * + sg_size; dependent_event = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(dependent_event); + cgh.use_kernel_bundle(kb); - using UpdateKernelName = - class inclusive_scan_iter_chunk_update_krn< - inputT, outputT, n_wi, TransformerT, NoOpTransformerT, - ScanOpT, include_initial>; + sycl::range<1> gRange{iter_nelems * update_nelems}; + sycl::range<1> lRange{sg_size}; + + sycl::nd_range<1> ndRange{gRange, lRange}; cgh.parallel_for( - {iter_nelems * update_nelems}, + ndRange, [chunk_size, update_nelems, src_size, local_stride, src, - local_scans, scan_op, identity](auto wiid) { - const size_t gid = wiid[0]; + local_scans, scan_op, identity](sycl::nd_item<1> ndit) { + const size_t gr_id = ndit.get_group(0); - const size_t iter_gid = gid / update_nelems; - const size_t axis_gid = - gid - (iter_gid * update_nelems); + const size_t iter_gid = gr_id / update_nelems; + const size_t axis_gr_id = + gr_id - (iter_gid * update_nelems); - const size_t src_axis_id0 = axis_gid * updates_per_wi; + const std::uint32_t lws = ndit.get_local_range(0); + + const size_t src_axis_id0 = + axis_gr_id * updates_per_wi * lws; const size_t src_iter_id = iter_gid * src_size; #pragma unroll for (nwiT i = 0; i < updates_per_wi; ++i) { - const size_t src_axis_id = src_axis_id0 + i; + const size_t src_axis_id = + src_axis_id0 + ndit.get_local_id(0) + i * lws; const size_t src_id = src_axis_id + src_iter_id; if (src_axis_id < src_size) { From e0a59f58760d7c18c6131acaeca0e7271d5aeebb Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Tue, 10 Dec 2024 13:27:11 -0600 Subject: [PATCH 6/8] Chunk update kernels use nd_range<2> Changed left-over update kernel to use coalesceed memory access. 
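A stripped-down sketch of the new launch geometry (sizes are made up): the iteration index comes straight from the first group coordinate of an nd_range<2>, so the division/remainder used to decode the old flat 1D index is no longer needed, while the second dimension keeps the sub-group-strided layout.

```
#include <sycl/sycl.hpp>
#include <cstdio>

int main()
{
    sycl::queue q;

    // illustrative sizes; update_nelems is already a multiple of the local size
    constexpr std::size_t iter_nelems = 3;
    constexpr std::size_t sg_size = 8;
    constexpr std::size_t update_nelems = 2 * sg_size;

    std::size_t *out =
        sycl::malloc_shared<std::size_t>(iter_nelems * update_nelems, q);

    sycl::range<2> gRange{iter_nelems, update_nelems};
    sycl::range<2> lRange{1, sg_size};

    q.parallel_for(sycl::nd_range<2>{gRange, lRange},
                   [=](sycl::nd_item<2> ndit) {
                       // no `gid / update_nelems` here: the iteration index is
                       // simply the first group coordinate
                       const std::size_t iter_gid = ndit.get_group(0);
                       const std::size_t axis_gid = ndit.get_global_id(1);
                       out[iter_gid * update_nelems + axis_gid] = iter_gid;
                   })
        .wait();

    std::printf("first tag: %zu, last tag: %zu\n", out[0],
                out[iter_nelems * update_nelems - 1]);

    sycl::free(out, q);
    return 0;
}
```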
--- .../include/kernels/accumulators.hpp | 119 +++++++++++------- 1 file changed, 72 insertions(+), 47 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/accumulators.hpp b/dpctl/tensor/libtensor/include/kernels/accumulators.hpp index f31547c87d..90d77e0ca9 100644 --- a/dpctl/tensor/libtensor/include/kernels/accumulators.hpp +++ b/dpctl/tensor/libtensor/include/kernels/accumulators.hpp @@ -326,7 +326,7 @@ sycl::event inclusive_scan_iter_1d(sycl::queue &exec_q, std::vector &host_tasks, const std::vector &depends = {}) { - ScanOpT scan_op{}; + constexpr ScanOpT scan_op{}; constexpr outputT identity = su_ns::Identity::value; constexpr size_t _iter_nelems = 1; @@ -436,8 +436,12 @@ sycl::event inclusive_scan_iter_1d(sycl::queue &exec_q, sycl::nd_range<1> ndRange{gRange, lRange}; cgh.parallel_for( - ndRange, [chunk_size, src, src_size, local_scans, scan_op, - identity](sycl::nd_item<1> ndit) { + ndRange, [chunk_size, src, src_size, + local_scans](sycl::nd_item<1> ndit) { + constexpr ScanOpT scan_op{}; + constexpr outputT identity = + su_ns::Identity::value; + const std::uint32_t lws = ndit.get_local_range(0); const size_t block_offset = ndit.get_group(0) * n_wi * lws; @@ -447,11 +451,10 @@ sycl::event inclusive_scan_iter_1d(sycl::queue &exec_q, block_offset + ndit.get_local_id(0) + i * lws; if (src_id < src_size) { const size_t scan_id = (src_id / chunk_size); - src[src_id] = - (scan_id > 0) - ? scan_op(src[src_id], - local_scans[scan_id - 1]) - : scan_op(src[src_id], identity); + const outputT modifier = + (scan_id > 0) ? local_scans[scan_id - 1] + : identity; + src[src_id] = scan_op(src[src_id], modifier); } } }); @@ -561,7 +564,7 @@ sycl::event inclusive_scan_iter(sycl::queue &exec_q, std::vector &host_tasks, const std::vector &depends = {}) { - ScanOpT scan_op = ScanOpT(); + constexpr ScanOpT scan_op{}; constexpr outputT identity = su_ns::Identity::value; using IterIndexerT = @@ -708,43 +711,44 @@ sycl::event inclusive_scan_iter(sycl::queue &exec_q, cgh.depends_on(dependent_event); cgh.use_kernel_bundle(kb); - sycl::range<1> gRange{iter_nelems * update_nelems}; - sycl::range<1> lRange{sg_size}; + sycl::range<2> gRange{iter_nelems, update_nelems}; + sycl::range<2> lRange{1, sg_size}; - sycl::nd_range<1> ndRange{gRange, lRange}; + sycl::nd_range<2> ndRange{gRange, lRange}; cgh.parallel_for( - ndRange, - [chunk_size, update_nelems, src_size, local_stride, src, - local_scans, scan_op, identity](sycl::nd_item<1> ndit) { - const size_t gr_id = ndit.get_group(0); + ndRange, [chunk_size, src_size, local_stride, src, + local_scans](sycl::nd_item<2> ndit) { + constexpr ScanOpT scan_op{}; + constexpr outputT identity = + su_ns::Identity::value; - const size_t iter_gid = gr_id / update_nelems; - const size_t axis_gr_id = - gr_id - (iter_gid * update_nelems); + const size_t iter_gid = ndit.get_group(0); + const size_t axis_gr_id = ndit.get_group(1); const std::uint32_t lws = ndit.get_local_range(0); const size_t src_axis_id0 = axis_gr_id * updates_per_wi * lws; const size_t src_iter_id = iter_gid * src_size; + const size_t scan_id0 = iter_gid * local_stride; #pragma unroll for (nwiT i = 0; i < updates_per_wi; ++i) { const size_t src_axis_id = src_axis_id0 + ndit.get_local_id(0) + i * lws; - const size_t src_id = src_axis_id + src_iter_id; if (src_axis_id < src_size) { const size_t scan_axis_id = src_axis_id / chunk_size; - const size_t scan_id = - scan_axis_id + iter_gid * local_stride; + const size_t scan_id = scan_axis_id + scan_id0; - src[src_id] = + const outputT modifier = 
(scan_axis_id > 0) - ? scan_op(src[src_id], - local_scans[scan_id - 1]) - : scan_op(src[src_id], identity); + ? local_scans[scan_id - 1] + : identity; + + const size_t src_id = src_axis_id + src_iter_id; + src[src_id] = scan_op(src[src_id], modifier); } } }); @@ -759,35 +763,55 @@ sycl::event inclusive_scan_iter(sycl::queue &exec_q, outputT *local_scans = stack_elem.get_local_scans_ptr(); const size_t local_stride = stack_elem.get_local_stride(); + using UpdateKernelName = + class inclusive_scan_final_chunk_update_krn< + inputT, outputT, n_wi, OutIterIndexerT, OutIndexerT, + TransformerT, NoOpTransformerT, ScanOpT, include_initial>; + + const auto &kernel_id = sycl::get_kernel_id(); + + auto const &ctx = exec_q.get_context(); + auto const &dev = exec_q.get_device(); + auto kb = sycl::get_kernel_bundle( + ctx, {dev}, {kernel_id}); + + auto krn = kb.get_kernel(kernel_id); + + const std::uint32_t sg_size = krn.template get_info< + sycl::info::kernel_device_specific::max_sub_group_size>(dev); + constexpr nwiT updates_per_wi = n_wi; const size_t update_nelems = - ceiling_quotient(src_size, updates_per_wi); + ceiling_quotient(src_size, sg_size * updates_per_wi) * + sg_size; + + sycl::range<2> gRange{iter_nelems, update_nelems}; + sycl::range<2> lRange{1, sg_size}; + + sycl::nd_range<2> ndRange{gRange, lRange}; dependent_event = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(dependent_event); - using UpdateKernelName = - class inclusive_scan_final_chunk_update_krn< - inputT, outputT, n_wi, OutIterIndexerT, OutIndexerT, - TransformerT, NoOpTransformerT, ScanOpT, - include_initial>; - cgh.parallel_for( - {iter_nelems * update_nelems}, - [chunk_size, update_nelems, src_size, local_stride, src, - local_scans, scan_op, identity, out_iter_indexer, - out_indexer](auto wiid) { - const size_t gid = wiid[0]; + ndRange, + [chunk_size, src_size, local_stride, src, local_scans, + out_iter_indexer, out_indexer](sycl::nd_item<2> ndit) { + constexpr ScanOpT scan_op{}; + constexpr outputT identity = + su_ns::Identity::value; - const size_t iter_gid = gid / update_nelems; - const size_t axis_gid = - gid - (iter_gid * update_nelems); + const std::uint32_t lws = ndit.get_local_range(1); - const size_t src_axis_id0 = axis_gid * updates_per_wi; + const size_t iter_gid = ndit.get_group(0); + + const size_t src_axis_id0 = + ndit.get_group(1) * updates_per_wi * lws + + ndit.get_local_id(1); const size_t src_iter_id = out_iter_indexer(iter_gid); #pragma unroll for (nwiT i = 0; i < updates_per_wi; ++i) { - const size_t src_axis_id = src_axis_id0 + i; + const size_t src_axis_id = src_axis_id0 + i * lws; const size_t src_id = out_indexer(src_axis_id) + src_iter_id; @@ -797,11 +821,12 @@ sycl::event inclusive_scan_iter(sycl::queue &exec_q, const size_t scan_id = scan_axis_id + iter_gid * local_stride; - src[src_id] = + const outputT modifier = (scan_axis_id > 0) - ? scan_op(src[src_id], - local_scans[scan_id - 1]) - : scan_op(src[src_id], identity); + ? 
local_scans[scan_id - 1] + : identity; + + src[src_id] = scan_op(src[src_id], modifier); } } }); From b0c8c5e130c7ba07033ac1fe0beb71cb176ed1fe Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Tue, 10 Dec 2024 13:49:36 -0600 Subject: [PATCH 7/8] Add change-log entry for performance improvement in scanning and masked extract/place code --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index a053b5a8e0..ae78312038 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,8 +16,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 * Improved performance of `tensor.sort` and `tensor.argsort` for short arrays in the range [16, 64] elements [gh-1866](https://github.com/IntelPython/dpctl/pull/1866) * Implement radix sort algorithm to be used in `dpt.sort` and `dpt.argsort` [gh-1867](https://github.com/IntelPython/dpctl/pull/1867) * Extended `dpctl.SyclTimer` with `device_timer` keyword, implementing different methods of collecting device times [gh-1872](https://github.com/IntelPython/dpctl/pull/1872) +* Improved performance of `tensor.cumulative_sum`, `tensor.cumulative_prod`, `tensor.cumulative_logsumexp` as well as performance of boolean indexing [gh-1923](https://github.com/IntelPython/dpctl/pull/1923) ### Fixed + * Fix for `tensor.result_type` when all inputs are Python built-in scalars [gh-1877](https://github.com/IntelPython/dpctl/pull/1877) * Improved error in constructors `tensor.full` and `tensor.full_like` when provided a non-numeric fill value [gh-1878](https://github.com/IntelPython/dpctl/pull/1878) * Added a check for pointer alignment when copying to C-contiguous memory [gh-1890](https://github.com/IntelPython/dpctl/pull/1890) From a8e7600618a2e23a9f0968fd10bc48d17eed5f5a Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Tue, 10 Dec 2024 14:14:17 -0600 Subject: [PATCH 8/8] size_t -> std::size_t --- .../include/kernels/accumulators.hpp | 267 +++++++++--------- 1 file changed, 139 insertions(+), 128 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/accumulators.hpp b/dpctl/tensor/libtensor/include/kernels/accumulators.hpp index 90d77e0ca9..69283cf54f 100644 --- a/dpctl/tensor/libtensor/include/kernels/accumulators.hpp +++ b/dpctl/tensor/libtensor/include/kernels/accumulators.hpp @@ -93,12 +93,12 @@ namespace template class stack_t { T *src_; - size_t size_; + std::size_t size_; T *local_scans_; public: stack_t() : src_{}, size_{}, local_scans_{} {} - stack_t(T *src, size_t sz, T *local_scans) + stack_t(T *src, std::size_t sz, T *local_scans) : src_(src), size_(sz), local_scans_(local_scans) { } @@ -106,7 +106,7 @@ template class stack_t T *get_src_ptr() const { return src_; } - size_t get_size() const { return size_; } + std::size_t get_size() const { return size_; } T *get_local_scans_ptr() const { return local_scans_; } }; @@ -114,13 +114,16 @@ template class stack_t template class stack_strided_t { T *src_; - size_t size_; + std::size_t size_; T *local_scans_; - size_t local_stride_; + std::size_t local_stride_; public: stack_strided_t() : src_{}, size_{}, local_scans_{}, local_stride_{} {} - stack_strided_t(T *src, size_t sz, T *local_scans, size_t local_stride) + stack_strided_t(T *src, + std::size_t sz, + T *local_scans, + std::size_t local_stride) : src_(src), size_(sz), local_scans_(local_scans), local_stride_(local_stride) { @@ -129,11 +132,11 @@ template class stack_strided_t T *get_src_ptr() const { return src_; } - size_t get_size() const { return size_; } + std::size_t 
get_size() const { return size_; } T *get_local_scans_ptr() const { return local_scans_; } - size_t get_local_stride() const { return local_stride_; } + std::size_t get_local_stride() const { return local_stride_; } }; } // end of anonymous namespace @@ -175,23 +178,23 @@ template sycl::event inclusive_scan_base_step(sycl::queue &exec_q, - const size_t wg_size, - const size_t iter_nelems, - const size_t acc_nelems, + const std::size_t wg_size, + const std::size_t iter_nelems, + const std::size_t acc_nelems, const inputT *input, outputT *output, - const size_t s0, - const size_t s1, + const std::size_t s0, + const std::size_t s1, const IterIndexerT &iter_indexer, const InpIndexerT &inp_indexer, const OutIndexerT &out_indexer, TransformerT transformer, const ScanOpT &scan_op, outputT identity, - size_t &acc_groups, + std::size_t &acc_groups, const std::vector &depends = {}) { - acc_groups = ceiling_quotient(acc_nelems, n_wi * wg_size); + acc_groups = ceiling_quotient(acc_nelems, n_wi * wg_size); sycl::event inc_scan_phase1_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); @@ -212,14 +215,14 @@ inclusive_scan_base_step(sycl::queue &exec_q, cgh.parallel_for(ndRange, [=, slm_iscan_tmp = std::move(slm_iscan_tmp)]( sycl::nd_item<1> it) { - const size_t gid = it.get_global_id(0); - const size_t lid = it.get_local_id(0); + const std::size_t gid = it.get_global_id(0); + const std::size_t lid = it.get_local_id(0); - const size_t reduce_chunks = acc_groups * wg_size; - const size_t iter_gid = gid / reduce_chunks; - const size_t chunk_gid = gid - (iter_gid * reduce_chunks); + const std::size_t reduce_chunks = acc_groups * wg_size; + const std::size_t iter_gid = gid / reduce_chunks; + const std::size_t chunk_gid = gid - (iter_gid * reduce_chunks); - const size_t i = chunk_gid * n_wi; + const std::size_t i = chunk_gid * n_wi; const auto &iter_offsets = iter_indexer(iter_gid); const auto &inp_iter_offset = iter_offsets.get_first_offset(); const auto &out_iter_offset = iter_offsets.get_second_offset(); @@ -228,7 +231,7 @@ inclusive_scan_base_step(sycl::queue &exec_q, #pragma unroll for (nwiT m_wi = 0; m_wi < n_wi; ++m_wi) { - const size_t i_m_wi = i + m_wi; + const std::size_t i_m_wi = i + m_wi; if constexpr (!include_initial) { local_iscan[m_wi] = (i_m_wi < acc_nelems) @@ -280,8 +283,8 @@ inclusive_scan_base_step(sycl::queue &exec_q, local_iscan[m_wi] = scan_op(local_iscan[m_wi], addand); } - const size_t start = std::min(i, acc_nelems); - const size_t end = std::min(i + n_wi, acc_nelems); + const std::size_t start = std::min(i, acc_nelems); + const std::size_t end = std::min(i + n_wi, acc_nelems); const nwiT m_max = static_cast(end - start); for (nwiT m_wi = 0; m_wi < m_max; ++m_wi) { output[out_iter_offset + out_indexer(i + m_wi)] = @@ -315,12 +318,12 @@ template sycl::event inclusive_scan_iter_1d(sycl::queue &exec_q, - const size_t wg_size, - const size_t n_elems, + const std::size_t wg_size, + const std::size_t n_elems, const inputT *input, outputT *output, - const size_t s0, - const size_t s1, + const std::size_t s0, + const std::size_t s1, const IndexerT &indexer, const TransformerT &transformer, std::vector &host_tasks, @@ -329,7 +332,7 @@ sycl::event inclusive_scan_iter_1d(sycl::queue &exec_q, constexpr ScanOpT scan_op{}; constexpr outputT identity = su_ns::Identity::value; - constexpr size_t _iter_nelems = 1; + constexpr std::size_t _iter_nelems = 1; using IterIndexerT = dpctl::tensor::offset_utils::TwoZeroOffsets_Indexer; constexpr IterIndexerT _no_op_iter_indexer{}; @@ 
-337,7 +340,7 @@ sycl::event inclusive_scan_iter_1d(sycl::queue &exec_q, using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; constexpr NoOpIndexerT _no_op_indexer{}; - size_t n_groups; + std::size_t n_groups; sycl::event inc_scan_phase1_ev = inclusive_scan_base_step 1) { - const size_t chunk_size = wg_size * n_wi; + const std::size_t chunk_size = wg_size * n_wi; // how much of temporary allocation do we need - size_t n_groups_ = n_groups; - size_t temp_size = 0; + std::size_t n_groups_ = n_groups; + std::size_t temp_size = 0; while (n_groups_ > 1) { - const size_t this_size = (n_groups_ - 1); + const std::size_t this_size = (n_groups_ - 1); temp_size += this_size; n_groups_ = ceiling_quotient(this_size, chunk_size); } @@ -375,10 +378,10 @@ sycl::event inclusive_scan_iter_1d(sycl::queue &exec_q, using NoOpTransformerT = NoOpTransformer; constexpr NoOpTransformerT _no_op_transformer{}; - size_t size_to_update = n_elems; + std::size_t size_to_update = n_elems; while (n_groups_ > 1) { - const size_t src_size = n_groups_ - 1; + const std::size_t src_size = n_groups_ - 1; dependent_event = inclusive_scan_base_step(src_size, sg_size * n_wi) * + const std::size_t n_items = + ceiling_quotient(src_size, sg_size * n_wi) * sg_size; sycl::range<1> gRange{n_items}; @@ -443,14 +446,15 @@ sycl::event inclusive_scan_iter_1d(sycl::queue &exec_q, su_ns::Identity::value; const std::uint32_t lws = ndit.get_local_range(0); - const size_t block_offset = + const std::size_t block_offset = ndit.get_group(0) * n_wi * lws; #pragma unroll - for (size_t i = 0; i < updates_per_wi; ++i) { - const size_t src_id = + for (std::size_t i = 0; i < updates_per_wi; ++i) { + const std::size_t src_id = block_offset + ndit.get_local_id(0) + i * lws; if (src_id < src_size) { - const size_t scan_id = (src_id / chunk_size); + const std::size_t scan_id = + (src_id / chunk_size); const outputT modifier = (scan_id > 0) ? 
local_scans[scan_id - 1] : identity; @@ -475,7 +479,7 @@ sycl::event inclusive_scan_iter_1d(sycl::queue &exec_q, typedef sycl::event (*accumulate_1d_contig_impl_fn_ptr_t)( sycl::queue &, - size_t, + std::size_t, const char *, char *, std::vector &, @@ -488,7 +492,7 @@ template sycl::event accumulate_1d_contig_impl(sycl::queue &q, - size_t n_elems, + std::size_t n_elems, const char *src, char *dst, std::vector &host_tasks, @@ -501,14 +505,14 @@ accumulate_1d_contig_impl(sycl::queue &q, constexpr NoOpIndexerT flat_indexer{}; constexpr transformerT transformer{}; - constexpr size_t s0 = 0; - constexpr size_t s1 = 1; + constexpr std::size_t s0 = 0; + constexpr std::size_t s1 = 1; sycl::event comp_ev; const sycl::device &dev = q.get_device(); if (dev.has(sycl::aspect::cpu)) { constexpr nwiT n_wi_for_cpu = 8; - const size_t wg_size = 256; + const std::size_t wg_size = 256; comp_ev = inclusive_scan_iter_1d( @@ -517,7 +521,7 @@ accumulate_1d_contig_impl(sycl::queue &q, } else { constexpr nwiT n_wi_for_gpu = 4; - const size_t wg_size = 256; + const std::size_t wg_size = 256; comp_ev = inclusive_scan_iter_1d( @@ -549,13 +553,13 @@ template sycl::event inclusive_scan_iter(sycl::queue &exec_q, - const size_t wg_size, - const size_t iter_nelems, - const size_t acc_nelems, + const std::size_t wg_size, + const std::size_t iter_nelems, + const std::size_t acc_nelems, const inputT *input, outputT *output, - const size_t s0, - const size_t s1, + const std::size_t s0, + const std::size_t s1, const InpIterIndexerT &inp_iter_indexer, const OutIterIndexerT &out_iter_indexer, const InpIndexerT &inp_indexer, @@ -572,7 +576,7 @@ sycl::event inclusive_scan_iter(sycl::queue &exec_q, InpIterIndexerT, OutIterIndexerT>; const IterIndexerT iter_indexer{inp_iter_indexer, out_iter_indexer}; - size_t acc_groups; + std::size_t acc_groups; sycl::event inc_scan_phase1_ev = inclusive_scan_base_step 1) { - const size_t chunk_size = wg_size * n_wi; + const std::size_t chunk_size = wg_size * n_wi; // how much of temporary allocation do we need - size_t acc_groups_ = acc_groups; - size_t temp_size = 0; + std::size_t acc_groups_ = acc_groups; + std::size_t temp_size = 0; while (acc_groups_ > 1) { - const size_t this_size = (acc_groups_ - 1); + const std::size_t this_size = (acc_groups_ - 1); temp_size += this_size; - acc_groups_ = ceiling_quotient(this_size, chunk_size); + acc_groups_ = ceiling_quotient(this_size, chunk_size); } // allocate @@ -613,10 +617,10 @@ sycl::event inclusive_scan_iter(sycl::queue &exec_q, constexpr NoOpIndexerT _no_op_indexer{}; using NoOpTransformerT = NoOpTransformer; constexpr NoOpTransformerT _no_op_transformer{}; - size_t size_to_update = acc_nelems; + std::size_t size_to_update = acc_nelems; { - size_t src_size = acc_groups - 1; + std::size_t src_size = acc_groups - 1; using LocalScanIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; const LocalScanIndexerT scan_iter_indexer{/* size */ iter_nelems, @@ -644,7 +648,7 @@ sycl::event inclusive_scan_iter(sycl::queue &exec_q, } while (acc_groups_ > 1) { - size_t src_size = acc_groups_ - 1; + std::size_t src_size = acc_groups_ - 1; using LocalScanIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; @@ -675,16 +679,16 @@ sycl::event inclusive_scan_iter(sycl::queue &exec_q, size_to_update = src_size; } - for (size_t reverse_stack_id = 0; reverse_stack_id < stack.size() - 1; - ++reverse_stack_id) + for (std::size_t reverse_stack_id = 0; + reverse_stack_id < stack.size() - 1; ++reverse_stack_id) { - const size_t stack_id = stack.size() - 1 - 
reverse_stack_id; + const std::size_t stack_id = stack.size() - 1 - reverse_stack_id; const auto &stack_elem = stack[stack_id]; outputT *src = stack_elem.get_src_ptr(); - size_t src_size = stack_elem.get_size(); + std::size_t src_size = stack_elem.get_size(); outputT *local_scans = stack_elem.get_local_scans_ptr(); - size_t local_stride = stack_elem.get_local_stride(); + std::size_t local_stride = stack_elem.get_local_stride(); using UpdateKernelName = class inclusive_scan_iter_chunk_update_krn< inputT, outputT, n_wi, TransformerT, NoOpTransformerT, ScanOpT, @@ -703,8 +707,9 @@ sycl::event inclusive_scan_iter(sycl::queue &exec_q, sycl::info::kernel_device_specific::max_sub_group_size>(dev); constexpr nwiT updates_per_wi = n_wi; - const size_t update_nelems = - ceiling_quotient(src_size, sg_size * updates_per_wi) * + const std::size_t update_nelems = + ceiling_quotient(src_size, + sg_size * updates_per_wi) * sg_size; dependent_event = exec_q.submit([&](sycl::handler &cgh) { @@ -723,31 +728,33 @@ sycl::event inclusive_scan_iter(sycl::queue &exec_q, constexpr outputT identity = su_ns::Identity::value; - const size_t iter_gid = ndit.get_group(0); - const size_t axis_gr_id = ndit.get_group(1); + const std::size_t iter_gid = ndit.get_group(0); + const std::size_t axis_gr_id = ndit.get_group(1); const std::uint32_t lws = ndit.get_local_range(0); - const size_t src_axis_id0 = + const std::size_t src_axis_id0 = axis_gr_id * updates_per_wi * lws; - const size_t src_iter_id = iter_gid * src_size; - const size_t scan_id0 = iter_gid * local_stride; + const std::size_t src_iter_id = iter_gid * src_size; + const std::size_t scan_id0 = iter_gid * local_stride; #pragma unroll for (nwiT i = 0; i < updates_per_wi; ++i) { - const size_t src_axis_id = + const std::size_t src_axis_id = src_axis_id0 + ndit.get_local_id(0) + i * lws; if (src_axis_id < src_size) { - const size_t scan_axis_id = + const std::size_t scan_axis_id = src_axis_id / chunk_size; - const size_t scan_id = scan_axis_id + scan_id0; + const std::size_t scan_id = + scan_axis_id + scan_id0; const outputT modifier = (scan_axis_id > 0) ? 
local_scans[scan_id - 1] : identity; - const size_t src_id = src_axis_id + src_iter_id; + const std::size_t src_id = + src_axis_id + src_iter_id; src[src_id] = scan_op(src[src_id], modifier); } } @@ -759,9 +766,9 @@ sycl::event inclusive_scan_iter(sycl::queue &exec_q, { const auto &stack_elem = stack[0]; outputT *src = stack_elem.get_src_ptr(); - const size_t src_size = stack_elem.get_size(); + const std::size_t src_size = stack_elem.get_size(); outputT *local_scans = stack_elem.get_local_scans_ptr(); - const size_t local_stride = stack_elem.get_local_stride(); + const std::size_t local_stride = stack_elem.get_local_stride(); using UpdateKernelName = class inclusive_scan_final_chunk_update_krn< @@ -781,8 +788,9 @@ sycl::event inclusive_scan_iter(sycl::queue &exec_q, sycl::info::kernel_device_specific::max_sub_group_size>(dev); constexpr nwiT updates_per_wi = n_wi; - const size_t update_nelems = - ceiling_quotient(src_size, sg_size * updates_per_wi) * + const std::size_t update_nelems = + ceiling_quotient(src_size, + sg_size * updates_per_wi) * sg_size; sycl::range<2> gRange{iter_nelems, update_nelems}; @@ -803,22 +811,24 @@ sycl::event inclusive_scan_iter(sycl::queue &exec_q, const std::uint32_t lws = ndit.get_local_range(1); - const size_t iter_gid = ndit.get_group(0); + const std::size_t iter_gid = ndit.get_group(0); - const size_t src_axis_id0 = + const std::size_t src_axis_id0 = ndit.get_group(1) * updates_per_wi * lws + ndit.get_local_id(1); - const size_t src_iter_id = out_iter_indexer(iter_gid); + const std::size_t src_iter_id = + out_iter_indexer(iter_gid); #pragma unroll for (nwiT i = 0; i < updates_per_wi; ++i) { - const size_t src_axis_id = src_axis_id0 + i * lws; - const size_t src_id = + const std::size_t src_axis_id = + src_axis_id0 + i * lws; + const std::size_t src_id = out_indexer(src_axis_id) + src_iter_id; if (src_axis_id < src_size) { - const size_t scan_axis_id = + const std::size_t scan_axis_id = src_axis_id / chunk_size; - const size_t scan_id = + const std::size_t scan_id = scan_axis_id + iter_gid * local_stride; const outputT modifier = @@ -847,8 +857,8 @@ sycl::event inclusive_scan_iter(sycl::queue &exec_q, typedef sycl::event (*accumulate_strided_impl_fn_ptr_t)( sycl::queue &, - size_t, - size_t, + std::size_t, + std::size_t, const char *, int, const ssize_t *, @@ -867,8 +877,8 @@ template sycl::event accumulate_strided_impl(sycl::queue &q, - size_t iter_nelems, - size_t acc_nelems, + std::size_t iter_nelems, + std::size_t acc_nelems, const char *src, int iter_nd, const ssize_t *iter_shape_strides, @@ -897,14 +907,14 @@ accumulate_strided_impl(sycl::queue &q, constexpr transformerT transformer{}; - constexpr size_t s0 = 0; - constexpr size_t s1 = 1; + constexpr std::size_t s0 = 0; + constexpr std::size_t s1 = 1; const sycl::device &dev = q.get_device(); sycl::event comp_ev; if (dev.has(sycl::aspect::cpu)) { constexpr nwiT n_wi_for_cpu = 8; - const size_t wg_size = 256; + const std::size_t wg_size = 256; comp_ev = inclusive_scan_iter &, const std::vector &); template -size_t cumsum_val_contig_impl(sycl::queue &q, - size_t n_elems, - const char *mask, - char *cumsum, - std::vector &host_tasks, - const std::vector &depends = {}) +std::size_t cumsum_val_contig_impl(sycl::queue &q, + std::size_t n_elems, + const char *mask, + char *cumsum, + std::vector &host_tasks, + const std::vector &depends = {}) { const maskT *mask_data_ptr = reinterpret_cast(mask); cumsumT *cumsum_data_ptr = reinterpret_cast(cumsum); @@ -951,8 +961,8 @@ size_t cumsum_val_contig_impl(sycl::queue 
&q, constexpr NoOpIndexerT flat_indexer{}; constexpr transformerT transformer{}; - constexpr size_t s0 = 0; - constexpr size_t s1 = 1; + constexpr std::size_t s0 = 0; + constexpr std::size_t s1 = 1; constexpr bool include_initial = false; using AccumulateOpT = sycl::plus; @@ -960,7 +970,7 @@ size_t cumsum_val_contig_impl(sycl::queue &q, const sycl::device &dev = q.get_device(); if (dev.has(sycl::aspect::cpu)) { constexpr nwiT n_wi_for_cpu = 8; - const size_t wg_size = 256; + const std::size_t wg_size = 256; comp_ev = inclusive_scan_iter_1d( @@ -969,7 +979,7 @@ size_t cumsum_val_contig_impl(sycl::queue &q, } else { constexpr nwiT n_wi_for_gpu = 4; - const size_t wg_size = 256; + const std::size_t wg_size = 256; comp_ev = inclusive_scan_iter_1d( @@ -988,7 +998,7 @@ size_t cumsum_val_contig_impl(sycl::queue &q, cgh.copy(last_elem, last_elem_host_usm, 1); }); copy_e.wait(); - size_t return_val = static_cast(*last_elem_host_usm); + std::size_t return_val = static_cast(*last_elem_host_usm); using dpctl::tensor::alloc_utils::sycl_free_noexcept; sycl_free_noexcept(last_elem_host_usm, q); @@ -1034,9 +1044,9 @@ template struct Cumsum1DContigFactory } }; -typedef size_t (*cumsum_val_strided_impl_fn_ptr_t)( +typedef std::size_t (*cumsum_val_strided_impl_fn_ptr_t)( sycl::queue &, - size_t, + std::size_t, const char *, int, const ssize_t *, @@ -1045,14 +1055,15 @@ typedef size_t (*cumsum_val_strided_impl_fn_ptr_t)( const std::vector &); template -size_t cumsum_val_strided_impl(sycl::queue &q, - size_t n_elems, - const char *mask, - int nd, - const ssize_t *shape_strides, - char *cumsum, - std::vector &host_tasks, - const std::vector &depends = {}) +std::size_t +cumsum_val_strided_impl(sycl::queue &q, + std::size_t n_elems, + const char *mask, + int nd, + const ssize_t *shape_strides, + char *cumsum, + std::vector &host_tasks, + const std::vector &depends = {}) { const maskT *mask_data_ptr = reinterpret_cast(mask); cumsumT *cumsum_data_ptr = reinterpret_cast(cumsum); @@ -1061,8 +1072,8 @@ size_t cumsum_val_strided_impl(sycl::queue &q, const StridedIndexerT strided_indexer{nd, 0, shape_strides}; constexpr transformerT transformer{}; - constexpr size_t s0 = 0; - constexpr size_t s1 = 1; + constexpr std::size_t s0 = 0; + constexpr std::size_t s1 = 1; constexpr bool include_initial = false; using AccumulateOpT = sycl::plus; @@ -1070,7 +1081,7 @@ size_t cumsum_val_strided_impl(sycl::queue &q, sycl::event comp_ev; if (dev.has(sycl::aspect::cpu)) { constexpr nwiT n_wi_for_cpu = 8; - const size_t wg_size = 256; + const std::size_t wg_size = 256; comp_ev = inclusive_scan_iter_1d( @@ -1079,7 +1090,7 @@ size_t cumsum_val_strided_impl(sycl::queue &q, } else { constexpr nwiT n_wi_for_gpu = 4; - const size_t wg_size = 256; + const std::size_t wg_size = 256; comp_ev = inclusive_scan_iter_1d( @@ -1099,7 +1110,7 @@ size_t cumsum_val_strided_impl(sycl::queue &q, cgh.copy(last_elem, last_elem_host_usm, 1); }); copy_e.wait(); - size_t return_val = static_cast(*last_elem_host_usm); + std::size_t return_val = static_cast(*last_elem_host_usm); using dpctl::tensor::alloc_utils::sycl_free_noexcept; sycl_free_noexcept(last_elem_host_usm, q);
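Editorial note on the scan machinery in this patch: `inclusive_scan_iter_1d` / `inclusive_scan_iter` first scan each chunk of `n_wi * wg_size` elements independently, then scan the per-chunk totals, and finally the chunk-update kernels add the preceding chunk's running total (`local_scans[scan_id - 1]`, or the identity for the first chunk) to every element of later chunks. The snippet below is a deliberately simplified, single-level pure-Python analogue for illustration only; the helper name `chunked_inclusive_scan` is hypothetical and not part of the patch.
```
def chunked_inclusive_scan(values, chunk_size):
    # Pass 1: inclusive scan within each chunk, recording per-chunk totals.
    out = []
    chunk_totals = []
    for start in range(0, len(values), chunk_size):
        running = 0
        for v in values[start:start + chunk_size]:
            running += v
            out.append(running)
        chunk_totals.append(running)
    # Pass 2: inclusive scan of the chunk totals (the role of local_scans).
    local_scans = []
    running = 0
    for total in chunk_totals:
        running += total
        local_scans.append(running)
    # Pass 3: add the previous chunk's running total to every later element,
    # mirroring the "modifier" applied by the chunk-update kernels.
    for i in range(len(out)):
        scan_id = i // chunk_size
        if scan_id > 0:
            out[i] += local_scans[scan_id - 1]
    return out


assert chunked_inclusive_scan([1, 0, 1, 1, 0, 1, 1], 3) == [1, 1, 2, 3, 3, 4, 5]
```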
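For context on what `cumsum_val_contig_impl` / `cumsum_val_strided_impl` return: they scan the boolean mask and hand back the last value of the inclusive cumulative sum, i.e. the number of selected elements, which boolean indexing uses to size its output. A minimal sketch of that relationship through the public `dpctl.tensor` API; the concrete array and threshold below are illustrative only, not taken from the patch.
```
import dpctl.tensor as dpt

x = dpt.arange(10, dtype="f4")
mask = x > 4.5

# Inclusive cumulative sum of the mask; its last entry is the number of
# True elements, the same count the cumsum_val_*_impl routines report.
cs = dpt.cumulative_sum(dpt.astype(mask, "i8"))
n_selected = int(cs[-1])

y = x[mask]  # boolean extraction allocates exactly n_selected elements
assert y.shape == (n_selected,)
```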