From c2d21d04a1ffb8e52b19661b636a2fa17446980a Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Wed, 6 Dec 2023 08:29:06 -0600 Subject: [PATCH 01/34] Initial check in of sort/argsort per array API spec Implemented data-parallel merge-sort algorithm. --- dpctl/tensor/CMakeLists.txt | 14 + dpctl/tensor/__init__.py | 3 + dpctl/tensor/_sorting.py | 107 ++ .../libtensor/include/kernels/sorting.hpp | 970 ++++++++++++++++++ .../libtensor/source/sorting/argsort.cpp | 238 +++++ .../libtensor/source/sorting/argsort.hpp | 18 + .../tensor/libtensor/source/sorting/sort.cpp | 212 ++++ .../tensor/libtensor/source/sorting/sort.hpp | 18 + .../source/sorting/sorting_common.hpp | 54 + .../libtensor/source/tensor_sorting.cpp | 37 + dpctl/tests/test_usm_ndarray_sorting.py | 128 +++ 11 files changed, 1799 insertions(+) create mode 100644 dpctl/tensor/_sorting.py create mode 100644 dpctl/tensor/libtensor/include/kernels/sorting.hpp create mode 100644 dpctl/tensor/libtensor/source/sorting/argsort.cpp create mode 100644 dpctl/tensor/libtensor/source/sorting/argsort.hpp create mode 100644 dpctl/tensor/libtensor/source/sorting/sort.cpp create mode 100644 dpctl/tensor/libtensor/source/sorting/sort.hpp create mode 100644 dpctl/tensor/libtensor/source/sorting/sorting_common.hpp create mode 100644 dpctl/tensor/libtensor/source/tensor_sorting.cpp create mode 100644 dpctl/tests/test_usm_ndarray_sorting.py diff --git a/dpctl/tensor/CMakeLists.txt b/dpctl/tensor/CMakeLists.txt index de8be6fae0..d2947aa772 100644 --- a/dpctl/tensor/CMakeLists.txt +++ b/dpctl/tensor/CMakeLists.txt @@ -118,6 +118,10 @@ set(_reduction_sources set(_boolean_reduction_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_reductions.cpp ) +set(_sorting_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/sort.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/argsort.cpp +) set(_tensor_impl_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_ctors.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/simplify_iteration_space.cpp @@ -148,6 +152,10 @@ set(_tensor_reductions_impl_sources ${_boolean_reduction_sources} ${_reduction_sources} ) +set(_tensor_sorting_impl_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_sorting.cpp + ${_sorting_sources} +) set(_py_trgts) @@ -166,6 +174,11 @@ pybind11_add_module(${python_module_name} MODULE ${_tensor_reductions_impl_sourc add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_reductions_impl_sources}) list(APPEND _py_trgts ${python_module_name}) +set(python_module_name _tensor_sorting_impl) +pybind11_add_module(${python_module_name} MODULE ${_tensor_sorting_impl_sources}) +add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_sorting_impl_sources}) +list(APPEND _py_trgts ${python_module_name}) + set(_clang_prefix "") if (WIN32) set(_clang_prefix "/clang:") @@ -179,6 +192,7 @@ set(_no_fast_math_sources list(APPEND _no_fast_math_sources ${_elementwise_sources} ${_reduction_sources} + ${_sorting_sources} ) foreach(_src_fn ${_no_fast_math_sources}) diff --git a/dpctl/tensor/__init__.py b/dpctl/tensor/__init__.py index cdb701e1cb..15be87588e 100644 --- a/dpctl/tensor/__init__.py +++ b/dpctl/tensor/__init__.py @@ -175,6 +175,7 @@ reduce_hypot, sum, ) +from ._sorting import argsort, sort from ._testing import allclose from ._type_utils import can_cast, finfo, iinfo, result_type @@ -343,4 +344,6 @@ "__array_namespace_info__", "reciprocal", "angle", + "sort", + "argsort", ] diff --git a/dpctl/tensor/_sorting.py b/dpctl/tensor/_sorting.py new file 
mode 100644 index 0000000000..8ad496d0a4 --- /dev/null +++ b/dpctl/tensor/_sorting.py @@ -0,0 +1,107 @@ +from numpy.core.numeric import normalize_axis_index + +import dpctl +import dpctl.tensor as dpt +import dpctl.tensor._tensor_impl as ti + +from ._tensor_sorting_impl import ( + _argsort_ascending, + _argsort_descending, + _sort_ascending, + _sort_descending, +) + + +def sort(x, axis=-1, descending=False, stable=False): + if not isinstance(x, dpt.usm_ndarray): + raise TypeError( + f"Expected type dpctl.tensor.usm_ndarray, got {type(x)}" + ) + nd = x.ndim + axis = normalize_axis_index(axis, ndim=nd, msg_prefix="axis") + a1 = axis + 1 + if a1 == nd: + perm = list(range(nd)) + arr = x + else: + perm = [i for i in range(nd) if i != axis] + [ + axis, + ] + arr = dpt.permute_dims(x, perm) + exec_q = x.sycl_queue + host_tasks_list = [] + impl_fn = _sort_descending if descending else _sort_ascending + if arr.flags.c_contiguous: + res = dpt.empty_like(arr, order="C") + ht_ev, _ = impl_fn( + src=arr, trailing_dims_to_sort=1, dst=res, sycl_queue=exec_q + ) + host_tasks_list.append(ht_ev) + else: + tmp = dpt.empty_like(arr, order="C") + ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=arr, dst=tmp, sycl_queue=exec_q + ) + host_tasks_list.append(ht_ev) + res = dpt.empty_like(arr, order="C") + ht_ev, _ = impl_fn( + src=tmp, + trailing_dims_to_sort=1, + dst=res, + sycl_queue=exec_q, + depends=[copy_ev], + ) + host_tasks_list.append(ht_ev) + if a1 != nd: + inv_perm = sorted(range(nd), key=lambda d: perm[d]) + res = dpt.permute_dims(res, inv_perm) + dpctl.SyclEvent.wait_for(host_tasks_list) + return res + + +def argsort(x, axis=-1, descending=False, stable=False): + if not isinstance(x, dpt.usm_ndarray): + raise TypeError( + f"Expected type dpctl.tensor.usm_ndarray, got {type(x)}" + ) + nd = x.ndim + axis = normalize_axis_index(axis, ndim=nd, msg_prefix="axis") + a1 = axis + 1 + if a1 == nd: + perm = list(range(nd)) + arr = x + else: + perm = [i for i in range(nd) if i != axis] + [ + axis, + ] + arr = dpt.permute_dims(x, perm) + exec_q = x.sycl_queue + host_tasks_list = [] + impl_fn = _argsort_descending if descending else _argsort_ascending + index_dt = ti.default_device_index_type(exec_q) + if arr.flags.c_contiguous: + res = dpt.empty_like(arr, dtype=index_dt, order="C") + ht_ev, _ = impl_fn( + src=arr, trailing_dims_to_sort=1, dst=res, sycl_queue=exec_q + ) + host_tasks_list.append(ht_ev) + else: + tmp = dpt.empty_like(arr, order="C") + ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=arr, dst=tmp, sycl_queue=exec_q + ) + host_tasks_list.append(ht_ev) + res = dpt.empty_like(arr, dtype=index_dt, order="C") + ht_ev, _ = impl_fn( + src=tmp, + trailing_dims_to_sort=1, + dst=res, + sycl_queue=exec_q, + depends=[copy_ev], + ) + host_tasks_list.append(ht_ev) + if a1 != nd: + inv_perm = sorted(range(nd), key=lambda d: perm[d]) + res = dpt.permute_dims(res, inv_perm) + dpctl.SyclEvent.wait_for(host_tasks_list) + return res diff --git a/dpctl/tensor/libtensor/include/kernels/sorting.hpp b/dpctl/tensor/libtensor/include/kernels/sorting.hpp new file mode 100644 index 0000000000..cda9ce770d --- /dev/null +++ b/dpctl/tensor/libtensor/include/kernels/sorting.hpp @@ -0,0 +1,970 @@ +#pragma once + +#include "pybind11/pybind11.h" + +#include +#include +#include +#include +#include + +namespace dpctl +{ +namespace tensor +{ +namespace kernels +{ + +namespace sort_detail +{ + +template T quotient_ceil(T n, T m) +{ + return (n + m - 1) / m; +} + +template +std::size_t lower_bound_impl(const Acc 
acc, + std::size_t first, + std::size_t last, + const Value &value, + Compare comp) +{ + std::size_t n = last - first; + std::size_t cur = n; + std::size_t it; + while (n > 0) { + it = first; + cur = n / 2; + it += cur; + if (comp(acc[it], value)) { + n -= cur + 1, first = ++it; + } + else + n = cur; + } + return first; +} + +template +std::size_t upper_bound_impl(const Acc acc, + const std::size_t first, + const std::size_t last, + const Value &value, + Compare comp) +{ + const auto &op_comp = [comp](auto x, auto y) { return !comp(y, x); }; + return lower_bound_impl(acc, first, last, value, op_comp); +} + +/*! @brief Merge two contiguous sorted segments */ +template +void merge_impl(const std::size_t offset, + const InAcc in_acc, + OutAcc out_acc, + const std::size_t start_1, + const std::size_t end_1, + const std::size_t end_2, + const std::size_t start_out, + Compare comp, + const std::size_t chunk) +{ + const std::size_t start_2 = end_1; + // Borders of the sequences to merge within this call + const std::size_t local_start_1 = sycl::min(offset + start_1, end_1); + const std::size_t local_end_1 = sycl::min(local_start_1 + chunk, end_1); + const std::size_t local_start_2 = sycl::min(offset + start_2, end_2); + const std::size_t local_end_2 = sycl::min(local_start_2 + chunk, end_2); + + const std::size_t local_size_1 = local_end_1 - local_start_1; + const std::size_t local_size_2 = local_end_2 - local_start_2; + + const auto r_item_1 = in_acc[end_1 - 1]; + const auto l_item_2 = in_acc[start_2]; + + // Copy if the sequences are sorted with respect to each other or merge + // otherwise + if (!comp(l_item_2, r_item_1)) { + const std::size_t out_shift_1 = start_out + local_start_1 - start_1; + const std::size_t out_shift_2 = + start_out + end_1 - start_1 + local_start_2 - start_2; + + for (std::size_t i = 0; i < local_size_1; ++i) { + out_acc[out_shift_1 + i] = in_acc[local_start_1 + i]; + } + for (std::size_t i = 0; i < local_size_2; ++i) { + out_acc[out_shift_2 + i] = in_acc[local_start_2 + i]; + } + } + else if (comp(r_item_1, l_item_2)) { + const std::size_t out_shift_1 = + start_out + end_2 - start_2 + local_start_1 - start_1; + const std::size_t out_shift_2 = start_out + local_start_2 - start_2; + for (std::size_t i = 0; i < local_size_1; ++i) { + out_acc[out_shift_1 + i] = in_acc[local_start_1 + i]; + } + for (std::size_t i = 0; i < local_size_2; ++i) { + out_acc[out_shift_2 + i] = in_acc[local_start_2 + i]; + } + } + // Perform merging + else { + + // Process 1st sequence + if (local_start_1 < local_end_1) { + // Reduce the range for searching within the 2nd sequence and handle + // bound items find left border in 2nd sequence + const auto local_l_item_1 = in_acc[local_start_1]; + std::size_t l_search_bound_2 = + lower_bound_impl(in_acc, start_2, end_2, local_l_item_1, comp); + const std::size_t l_shift_1 = local_start_1 - start_1; + const std::size_t l_shift_2 = l_search_bound_2 - start_2; + + out_acc[start_out + l_shift_1 + l_shift_2] = local_l_item_1; + + std::size_t r_search_bound_2{}; + // find right border in 2nd sequence + if (local_size_1 > 1) { + const auto local_r_item_1 = in_acc[local_end_1 - 1]; + r_search_bound_2 = lower_bound_impl( + in_acc, l_search_bound_2, end_2, local_r_item_1, comp); + const auto r_shift_1 = local_end_1 - 1 - start_1; + const auto r_shift_2 = r_search_bound_2 - start_2; + + out_acc[start_out + r_shift_1 + r_shift_2] = local_r_item_1; + } + + // Handle intermediate items + if (r_search_bound_2 == l_search_bound_2) { + const std::size_t shift_2 = 
l_search_bound_2 - start_2; + for (std::size_t idx = local_start_1 + 1; idx < local_end_1 - 1; + ++idx) { + const auto intermediate_item_1 = in_acc[idx]; + const std::size_t shift_1 = idx - start_1; + out_acc[start_out + shift_1 + shift_2] = + intermediate_item_1; + } + } + else { + for (std::size_t idx = local_start_1 + 1; idx < local_end_1 - 1; + ++idx) { + const auto intermediate_item_1 = in_acc[idx]; + // we shouldn't seek in whole 2nd sequence. Just for the + // part where the 1st sequence should be + l_search_bound_2 = lower_bound_impl( + in_acc, l_search_bound_2, r_search_bound_2, + intermediate_item_1, comp); + const std::size_t shift_1 = idx - start_1; + const std::size_t shift_2 = l_search_bound_2 - start_2; + + out_acc[start_out + shift_1 + shift_2] = + intermediate_item_1; + } + } + } + // Process 2nd sequence + if (local_start_2 < local_end_2) { + // Reduce the range for searching within the 1st sequence and handle + // bound items find left border in 1st sequence + const auto local_l_item_2 = in_acc[local_start_2]; + std::size_t l_search_bound_1 = + upper_bound_impl(in_acc, start_1, end_1, local_l_item_2, comp); + const std::size_t l_shift_1 = l_search_bound_1 - start_1; + const std::size_t l_shift_2 = local_start_2 - start_2; + + out_acc[start_out + l_shift_1 + l_shift_2] = local_l_item_2; + + std::size_t r_search_bound_1{}; + // find right border in 1st sequence + if (local_size_2 > 1) { + const auto local_r_item_2 = in_acc[local_end_2 - 1]; + r_search_bound_1 = upper_bound_impl( + in_acc, l_search_bound_1, end_1, local_r_item_2, comp); + const std::size_t r_shift_1 = r_search_bound_1 - start_1; + const std::size_t r_shift_2 = local_end_2 - 1 - start_2; + + out_acc[start_out + r_shift_1 + r_shift_2] = local_r_item_2; + } + + // Handle intermediate items + if (l_search_bound_1 == r_search_bound_1) { + const std::size_t shift_1 = l_search_bound_1 - start_1; + for (auto idx = local_start_2 + 1; idx < local_end_2 - 1; ++idx) + { + const auto intermediate_item_2 = in_acc[idx]; + const std::size_t shift_2 = idx - start_2; + out_acc[start_out + shift_1 + shift_2] = + intermediate_item_2; + } + } + else { + for (auto idx = local_start_2 + 1; idx < local_end_2 - 1; ++idx) + { + const auto intermediate_item_2 = in_acc[idx]; + // we shouldn't seek in whole 1st sequence. 
Just for the + // part where the 2nd sequence should be + l_search_bound_1 = upper_bound_impl( + in_acc, l_search_bound_1, r_search_bound_1, + intermediate_item_2, comp); + const std::size_t shift_1 = l_search_bound_1 - start_1; + const std::size_t shift_2 = idx - start_2; + + out_acc[start_out + shift_1 + shift_2] = + intermediate_item_2; + } + } + } + } +} + +namespace +{ +template +void insertion_sort_impl(Iter first, + const size_t begin, + const size_t end, + Compare comp) +{ + for (size_t i = begin + 1; i < end; ++i) { + const auto val_i = first[i]; + size_t j = i - 1; + while ((j + 1 > begin) && (comp(val_i, first[j]))) { + first[j + 1] = first[j]; + --j; + } + if (j + 1 < i) { + first[j + 1] = val_i; + } + } +} + +template +void bubble_sort_impl(Iter first, + const size_t begin, + const size_t end, + Compare comp) +{ + if (begin < end) { + for (size_t i = begin; i < end; ++i) { + // Handle intermediate items + for (size_t idx = i + 1; idx < end; ++idx) { + if (comp(first[idx], first[i])) { + std::swap(first[i], first[idx]); + } + } + } + } +} + +template +void leaf_sort_impl(Iter first, + const size_t begin, + const size_t end, + Compare comp) +{ + return insertion_sort_impl(first, begin, end, comp); +} +} // namespace + +template struct GetValueType +{ + using value_type = typename std::iterator_traits::value_type; +}; + +template +struct GetValueType> +{ + using value_type = ElementType; +}; + +template +struct GetValueType< + sycl::accessor> +{ + using value_type = ElementType; +}; + +template +struct GetValueType> +{ + using value_type = ElementType; +}; + +template struct GetReadOnlyAccess +{ + Iter operator()(Iter it, sycl::handler &) + { + return it; + } +}; + +template +struct GetReadOnlyAccess> +{ + auto operator()(sycl::buffer buf, + sycl::handler &cgh) + { + sycl::accessor acc(buf, cgh, sycl::read_only); + return acc; + } +}; + +template struct GetWriteDiscardAccess +{ + Iter operator()(Iter it, sycl::handler &) + { + return it; + } +}; + +template +struct GetWriteDiscardAccess> +{ + auto operator()(sycl::buffer buf, + sycl::handler &cgh) + { + sycl::accessor acc(buf, cgh, sycl::write_only, sycl::no_init); + return acc; + } +}; + +template struct GetReadWriteAccess +{ + Iter operator()(Iter it, sycl::handler &) + { + return it; + } +}; + +template +struct GetReadWriteAccess> +{ + auto operator()(sycl::buffer buf, + sycl::handler &cgh) + { + sycl::accessor acc(buf, cgh, sycl::read_write); + return acc; + } +}; + +template +class sort_over_work_group_contig_krn; + +template +sycl::event +sort_over_work_group_contig_impl(sycl::queue &q, + size_t iter_nelems, + size_t sort_nelems, + const InpAcc input, + OutAcc output, + const Comp &comp, + size_t &nelems_wg_sorts, + const std::vector &depends = {}) +{ + using inpT = typename GetValueType::value_type; + using outT = typename GetValueType::value_type; + using KernelName = sort_over_work_group_contig_krn; + + const auto &kernel_id = sycl::get_kernel_id(); + + auto const &ctx = q.get_context(); + auto const &dev = q.get_device(); + auto kb = sycl::get_kernel_bundle( + ctx, {dev}, {kernel_id}); + + auto krn = kb.template get_kernel(kernel_id); + + std::uint32_t max_sg_size = krn.template get_info< + sycl::info::kernel_device_specific::max_sub_group_size>(dev); + + const size_t lws = 4 * max_sg_size; + const std::uint32_t chunk_size = dev.has(sycl::aspect::cpu) ? 
8 : 2; + nelems_wg_sorts = chunk_size * lws; + + // This assumption permits doing away with using a loop + assert(nelems_wg_sorts % lws == 0); + + const size_t n_segments = + quotient_ceil(sort_nelems, nelems_wg_sorts); + + sycl::event base_sort_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + sycl::range<1> global_range{iter_nelems * n_segments * lws}; + sycl::range<1> local_range{lws}; + + sycl::range<1> slm_range{nelems_wg_sorts}; + using T = typename GetValueType::value_type; + sycl::local_accessor work_space(slm_range, cgh); + sycl::local_accessor scratch_space(slm_range, cgh); + + auto input_acc = GetReadOnlyAccess{}(input, cgh); + auto output_acc = GetWriteDiscardAccess{}(output, cgh); + + sycl::nd_range<1> ndRange(global_range, local_range); + + cgh.parallel_for(ndRange, [=](sycl::nd_item<1> it) { + const size_t group_id = it.get_group_linear_id(); + const size_t iter_id = group_id / n_segments; + const size_t segment_id = group_id - iter_id * n_segments; + const size_t lid = it.get_local_linear_id(); + + const size_t segment_start_idx = segment_id * nelems_wg_sorts; + const size_t segment_end_idx = std::min( + segment_start_idx + nelems_wg_sorts, sort_nelems); + const size_t wg_chunk_size = segment_end_idx - segment_start_idx; + + // load input into SLM + for (size_t array_id = segment_start_idx + lid; + array_id < segment_end_idx; array_id += lws) + { + T v = (array_id < sort_nelems) + ? input_acc[iter_id * sort_nelems + array_id] + : T{}; + work_space[array_id - segment_start_idx] = v; + } + sycl::group_barrier(it.get_group()); + + const size_t chunk = quotient_ceil(nelems_wg_sorts, lws); + + const size_t chunk_start_idx = lid * chunk; + const size_t chunk_end_idx = + sycl::min(chunk_start_idx + chunk, wg_chunk_size); + + leaf_sort_impl(work_space, chunk_start_idx, chunk_end_idx, comp); + + sycl::group_barrier(it.get_group()); + + bool data_in_temp = false; + size_t sorted_size = 1; + while (true) { + const size_t nelems_sorted_so_far = sorted_size * chunk; + if (nelems_sorted_so_far < wg_chunk_size) { + const size_t q = (lid / sorted_size); + const size_t start_1 = + sycl::min(2 * nelems_sorted_so_far * q, wg_chunk_size); + const size_t end_1 = sycl::min( + start_1 + nelems_sorted_so_far, wg_chunk_size); + const size_t end_2 = + sycl::min(end_1 + nelems_sorted_so_far, wg_chunk_size); + const size_t offset = chunk * (lid - q * sorted_size); + + if (data_in_temp) { + merge_impl(offset, scratch_space, work_space, start_1, + end_1, end_2, start_1, comp, chunk); + } + else { + merge_impl(offset, work_space, scratch_space, start_1, + end_1, end_2, start_1, comp, chunk); + } + sycl::group_barrier(it.get_group()); + + data_in_temp = !data_in_temp; + sorted_size *= 2; + } + else + break; + } + + const auto &out_src = (data_in_temp) ? 
scratch_space : work_space; + for (size_t array_id = segment_start_idx + lid; + array_id < segment_end_idx; array_id += lws) + { + if (array_id < sort_nelems) { + output_acc[iter_id * sort_nelems + array_id] = + out_src[array_id - segment_start_idx]; + } + } + }); + }); + + return base_sort_ev; +} + +template +class sort_base_step_contig_krn; + +template +sycl::event +sort_base_step_contig_impl(sycl::queue &q, + size_t iter_nelems, + size_t sort_nelems, + const InpAcc input, + OutAcc output, + const Comp &comp, + size_t &conseq_nelems_sorted, + const std::vector &depends = {}) +{ + + using inpT = typename GetValueType::value_type; + using outT = typename GetValueType::value_type; + using KernelName = sort_base_step_contig_krn; + + conseq_nelems_sorted = (q.get_device().has(sycl::aspect::cpu) ? 16 : 4); + + size_t n_segments = + quotient_ceil(sort_nelems, conseq_nelems_sorted); + + sycl::event base_sort = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + sycl::range<1> gRange{iter_nelems * n_segments}; + cgh.parallel_for(gRange, [=](sycl::id<1> id) { + const size_t iter_id = id[0] / n_segments; + const size_t segment_id = id[0] - iter_id * n_segments; + + const size_t iter_offset = iter_id * sort_nelems; + const size_t beg_id = + iter_offset + segment_id * conseq_nelems_sorted; + const size_t end_id = + iter_offset + + std::min((segment_id + 1) * conseq_nelems_sorted, + sort_nelems); + for (size_t i = beg_id; i < end_id; ++i) { + output[i] = input[i]; + } + + leaf_sort_impl(output, beg_id, end_id, comp); + }); + }); + + return base_sort; +} + +template +class exp_sort_over_work_group_contig_krn; + +template +sycl::event exp_sort_over_work_group_contig_impl( + sycl::queue &q, + size_t iter_nelems, + size_t sort_nelems, + const InpAcc input, + OutAcc output, + const Comp &comp, + size_t &conseq_nelems_sorted, + const std::vector &depends = {}) +{ + + using inpT = typename GetValueType::value_type; + using outT = typename GetValueType::value_type; + using KernelName = exp_sort_over_work_group_contig_krn; + + const auto &kernel_id = sycl::get_kernel_id(); + + auto const &ctx = q.get_context(); + auto const &dev = q.get_device(); + auto kb = sycl::get_kernel_bundle( + ctx, {dev}, {kernel_id}); + + auto krn = kb.template get_kernel(kernel_id); + + std::uint32_t max_sg_size = krn.template get_info< + sycl::info::kernel_device_specific::max_sub_group_size>(dev); + + const size_t lws = 4 * max_sg_size; + const std::uint32_t chunk_size = dev.has(sycl::aspect::cpu) ? 8 : 2; + conseq_nelems_sorted = chunk_size * lws; + + // This assumption permits doing away with using a loop + assert(nelems_wg_sorts % lws == 0); + + sycl::event exp_default_sort_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + const size_t n_chunks = + quotient_ceil(sort_nelems, conseq_nelems_sorted); + + sycl::range<1> global_range{iter_nelems * n_chunks * lws}; + sycl::range<1> local_range{lws}; + + sycl::range<1> slm_size_range{conseq_nelems_sorted}; + + using Sorter = sycl::ext::oneapi::experimental::default_sorter; + + // calculate required local memory size + // MUST pass local_range, not integer. 
+ // Have different meanings + const size_t temp_memory_size = Sorter::template memory_required( + sycl::memory_scope::work_group, conseq_nelems_sorted); + + sycl::local_accessor workspace(slm_size_range, cgh); + sycl::local_accessor scratch({temp_memory_size}, cgh); + + cgh.parallel_for( + sycl::nd_range<1>(global_range, local_range), + [=](sycl::nd_item<1> it) { + auto sorter_op = Sorter(sycl::span{ + scratch + .template get_multi_ptr() + .get(), + temp_memory_size}); + + const size_t gr_id = it.get_group_linear_id(); + const size_t iter_id = gr_id / n_chunks; + const size_t sort_chunk_id = gr_id - iter_id * n_chunks; + + const std::uint32_t lid = it.get_local_linear_id(); + + const size_t iter_offset = iter_id * sort_nelems; + const size_t chunk_offset = + sort_chunk_id * conseq_nelems_sorted; + const size_t global_start_offset = iter_offset + chunk_offset; + const size_t workspace_size = + std::min(sort_nelems, + chunk_offset + conseq_nelems_sorted) - + chunk_offset; + for (std::uint32_t i = lid; i < workspace_size; i += lws) { + workspace[i] = input[global_start_offset + i]; + } + sycl::group_barrier(it.get_group()); + + sycl::ext::oneapi::experimental::joint_sort( + it.get_group(), + workspace + .template get_multi_ptr() + .get(), + workspace + .template get_multi_ptr< + sycl::access::decorated::no>() + .get() + + workspace_size, + sorter_op); + + for (std::uint32_t i = lid; i < workspace_size; i += lws) { + output[global_start_offset + i] = workspace[i]; + } + }); + }); + + return exp_default_sort_ev; +} + +class vacuous_krn; + +inline sycl::event tie_events(sycl::queue &q, + const std::vector depends) +{ + if (depends.empty()) + return sycl::event(); + if (depends.size() == 1) + return depends[0]; + + sycl::event e = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + using KernelName = vacuous_krn; + cgh.single_task([]() {}); + }); + + return e; +} + +template class merge_adjacent_blocks_to_temp_krn; + +template class merge_adjacent_blocks_from_temp_krn; + +template +sycl::event +merge_sorted_block_contig_impl(sycl::queue &q, + size_t iter_nelems, + size_t sort_nelems, + Acc output, + const Comp comp, + size_t sorted_block_size, + const std::vector &depends = {}) +{ + + if (sorted_block_size >= sort_nelems) + return tie_events(q, depends); + + // experimentally determined value + // size of segments worked upon by each work-item during merging + const sycl::device &dev = q.get_device(); + const size_t segment_size = (dev.has(sycl::aspect::cpu)) ? 32 : 4; + + const size_t chunk_size = + (sorted_block_size < segment_size) ? 
sorted_block_size : segment_size; + + assert(sorted_block_size % chunk_size == 0); + + using T = typename GetValueType::value_type; + + sycl::buffer temp_buf(sycl::range<1>{iter_nelems * sort_nelems}); + // T *allocated_mem = sycl::malloc_device(iter_nelems * sort_nelems, q); + + bool needs_copy = true; + bool used_depends = false; + + sycl::event dep_ev; + size_t chunks_merged = sorted_block_size / chunk_size; + + assert(!(chunks_merged & (chunks_merged - 1))); + + using ToTempKernelName = class merge_adjacent_blocks_to_temp_krn; + using FromTempKernelName = + class merge_adjacent_blocks_from_temp_krn; + + while (chunks_merged * chunk_size < sort_nelems) { + sycl::event local_dep = dep_ev; + + sycl::event merge_ev = q.submit([&](sycl::handler &cgh) { + if (used_depends) { + cgh.depends_on(local_dep); + } + else { + cgh.depends_on(depends); + used_depends = true; + } + + const size_t n_chunks = + quotient_ceil(sort_nelems, chunk_size); + + if (needs_copy) { + sycl::accessor temp_acc{temp_buf, cgh, sycl::write_only, + sycl::no_init}; + auto output_acc = GetReadOnlyAccess{}(output, cgh); + cgh.parallel_for( + {iter_nelems * n_chunks}, [=](sycl::id<1> wid) { + auto flat_idx = wid[0]; + auto iter_idx = flat_idx / n_chunks; + auto idx = flat_idx - n_chunks * iter_idx; + + const std::size_t idx_mult = + (idx / chunks_merged) * chunks_merged; + const std::size_t idx_rem = (idx - idx_mult); + const std::size_t start_1 = + sycl::min(2 * idx_mult * chunk_size, sort_nelems); + const std::size_t end_1 = sycl::min( + start_1 + chunks_merged * chunk_size, sort_nelems); + const std::size_t end_2 = sycl::min( + end_1 + chunks_merged * chunk_size, sort_nelems); + const std::size_t offset = chunk_size * idx_rem; + + const std::size_t iter_offset = iter_idx * sort_nelems; + + merge_impl(offset, output_acc, temp_acc, + iter_offset + start_1, iter_offset + end_1, + iter_offset + end_2, iter_offset + start_1, + comp, chunk_size); + }); + } + else { + sycl::accessor temp_acc{temp_buf, cgh, sycl::read_only}; + auto output_acc = GetWriteDiscardAccess{}(output, cgh); + cgh.parallel_for( + {iter_nelems * n_chunks}, [=](sycl::id<1> wid) { + auto flat_idx = wid[0]; + auto iter_idx = flat_idx / n_chunks; + auto idx = flat_idx - n_chunks * iter_idx; + + const std::size_t idx_mult = + (idx / chunks_merged) * chunks_merged; + const std::size_t idx_rem = (idx - idx_mult); + const std::size_t start_1 = + sycl::min(2 * idx_mult * chunk_size, sort_nelems); + const std::size_t end_1 = sycl::min( + start_1 + chunks_merged * chunk_size, sort_nelems); + const std::size_t end_2 = sycl::min( + end_1 + chunks_merged * chunk_size, sort_nelems); + const std::size_t offset = chunk_size * idx_rem; + + const std::size_t iter_offset = iter_idx * sort_nelems; + + merge_impl(offset, temp_acc, output_acc, + iter_offset + start_1, iter_offset + end_1, + iter_offset + end_2, iter_offset + start_1, + comp, chunk_size); + }); + } + }); + + chunks_merged *= 2; + dep_ev = merge_ev; + + if (chunks_merged * chunk_size < sort_nelems) { + needs_copy = !needs_copy; + } + } + + if (needs_copy) { + sycl::event copy_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(dep_ev); + + sycl::accessor temp_acc{temp_buf, cgh, sycl::read_only}; + auto output_acc = GetWriteDiscardAccess{}(output, cgh); + + cgh.copy(temp_acc, output_acc); + }); + dep_ev = copy_ev; + } + + return dep_ev; +} + +} // end of namespace sort_detail + +typedef sycl::event (*sort_contig_fn_ptr_t)(sycl::queue &, + size_t, + size_t, + const char *, + char *, + py::ssize_t, + py::ssize_t, 
+ py::ssize_t, + py::ssize_t, + const std::vector &); + +template > +sycl::event stable_sort_axis1_contig_impl( + sycl::queue &exec_q, + size_t iter_nelems, // number of sub-arrays to sort (num. of rows in a + // matrix when sorting over rows) + size_t sort_nelems, // size of each array to sort (length of rows, i.e. + // number of columns) + const char *arg_cp, + char *res_cp, + py::ssize_t iter_arg_offset, + py::ssize_t iter_res_offset, + py::ssize_t sort_arg_offset, + py::ssize_t sort_res_offset, + const std::vector &depends) +{ + const argTy *arg_tp = reinterpret_cast(arg_cp) + + iter_arg_offset + sort_arg_offset; + argTy *res_tp = + reinterpret_cast(res_cp) + iter_res_offset + sort_res_offset; + + auto comp = Comp{}; + + static constexpr size_t determine_automatically = 0; + size_t sorted_block_size = + (sort_nelems >= 512) ? 512 : determine_automatically; + + // Sort segments of the array +#if 1 + sycl::event base_sort_ev = sort_detail::sort_over_work_group_contig_impl< + const argTy *, argTy *, Comp>( + exec_q, iter_nelems, sort_nelems, arg_tp, res_tp, comp, + sorted_block_size, // modified in place with size of sorted block size + depends); +#else + sycl::event base_sort_ev = + sort_detail::sort_base_step_contig_impl( + exec_q, iter_nelems, sort_nelems, arg_tp, res_tp, comp, + sorted_block_size, // modified in place with size of sorted block + // size + depends); +#endif + + // Merge segments in parallel until all elements are sorted + sycl::event merges_ev = + sort_detail::merge_sorted_block_contig_impl( + exec_q, iter_nelems, sort_nelems, res_tp, comp, sorted_block_size, + {base_sort_ev}); + + return merges_ev; +} + +template +class populate_indexed_data_krn; + +template class index_write_out_krn; + +template struct TupleComp +{ + bool operator()(const pairT &p1, const pairT &p2) const + { + const ValueComp value_comp{}; + return value_comp(std::get<0>(p1), std::get<0>(p2)); + } +}; + +template > +sycl::event stable_argsort_axis1_contig_impl( + sycl::queue &exec_q, + size_t iter_nelems, // number of sub-arrays to sort (num. of rows in a + // matrix when sorting over rows) + size_t sort_nelems, // size of each array to sort (length of rows, i.e. + // number of columns) + const char *arg_cp, + char *res_cp, + py::ssize_t iter_arg_offset, + py::ssize_t iter_res_offset, + py::ssize_t sort_arg_offset, + py::ssize_t sort_res_offset, + const std::vector &depends) +{ + const argTy *arg_tp = reinterpret_cast(arg_cp) + + iter_arg_offset + sort_arg_offset; + IndexTy *res_tp = + reinterpret_cast(res_cp) + iter_res_offset + sort_res_offset; + + using ValueIndexT = std::pair; + const TupleComp tuple_comp{}; + + static constexpr size_t determine_automatically = 0; + size_t sorted_block_size = + (sort_nelems >= 512) ? 
512 : determine_automatically; + + sycl::buffer indexed_data( + sycl::range<1>(iter_nelems * sort_nelems)); + sycl::buffer temp_buf( + sycl::range<1>(iter_nelems * sort_nelems)); + + sycl::event populate_indexed_data_ev = + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + sycl::accessor acc(indexed_data, cgh, sycl::write_only, + sycl::no_init); + + auto const &range = indexed_data.get_range(); + + using KernelName = + populate_indexed_data_krn; + + cgh.parallel_for(range, [=](sycl::id<1> id) { + size_t i = id[0]; + size_t sort_id = i % sort_nelems; + acc[i] = + std::make_pair(arg_tp[i], static_cast(sort_id)); + }); + }); + + // Sort segments of the array + sycl::event base_sort_ev = sort_detail::sort_over_work_group_contig_impl( + exec_q, iter_nelems, sort_nelems, indexed_data, temp_buf, tuple_comp, + sorted_block_size, // modified in place with size of sorted block size + {populate_indexed_data_ev}); + + // Merge segments in parallel until all elements are sorted + sycl::event merges_ev = sort_detail::merge_sorted_block_contig_impl( + exec_q, iter_nelems, sort_nelems, temp_buf, tuple_comp, + sorted_block_size, {base_sort_ev}); + + sycl::event write_out_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(merges_ev); + + auto temp_acc = + sort_detail::GetReadOnlyAccess{}(temp_buf, cgh); + + using KernelName = index_write_out_krn; + + cgh.parallel_for(temp_buf.get_range(), [=](sycl::id<1> id) { + res_tp[id] = std::get<1>(temp_acc[id]); + }); + }); + + return write_out_ev; +} + +} // end of namespace kernels +} // end of namespace tensor +} // end of namespace dpctl diff --git a/dpctl/tensor/libtensor/source/sorting/argsort.cpp b/dpctl/tensor/libtensor/source/sorting/argsort.cpp new file mode 100644 index 0000000000..753d516e7b --- /dev/null +++ b/dpctl/tensor/libtensor/source/sorting/argsort.cpp @@ -0,0 +1,238 @@ + +#include "dpctl4pybind11.hpp" +#include +#include +#include + +#include "utils/math_utils.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/type_dispatch.hpp" + +#include "argsort.hpp" +#include "kernels/sorting.hpp" +#include "sorting_common.hpp" + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +template +std::pair +py_argsort(const dpctl::tensor::usm_ndarray &src, + const int trailing_dims_to_sort, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends, + const sorting_contig_impl_fnT &stable_sort_contig_fns) +{ + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + if (src_nd != dst_nd) { + throw py::value_error("The input and output arrays must have " + "the same array ranks"); + } + int iteration_nd = src_nd - trailing_dims_to_sort; + if (trailing_dims_to_sort <= 0 || iteration_nd < 0) { + throw py::value_error("Trailing_dim_to_sort must be positive, but no " + "greater than rank of the array being sorted"); + } + + const py::ssize_t *src_shape_ptr = src.get_shape_raw(); + const py::ssize_t *dst_shape_ptr = dst.get_shape_raw(); + + bool same_shapes = true; + size_t iter_nelems(1); + + for (int i = 0; same_shapes && (i < iteration_nd); ++i) { + auto src_shape_i = src_shape_ptr[i]; + same_shapes = same_shapes && (src_shape_i == dst_shape_ptr[i]); + iter_nelems *= static_cast(src_shape_i); + } + + size_t sort_nelems(1); + for (int i = iteration_nd; same_shapes && (i < src_nd); ++i) { + auto src_shape_i = src_shape_ptr[i]; + same_shapes = same_shapes && (src_shape_i == dst_shape_ptr[i]); + sort_nelems *= 
static_cast(src_shape_i); + } + + if (!same_shapes) { + throw py::value_error( + "Destination shape does not match the input shape"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + if ((iter_nelems == 0) || (sort_nelems == 0)) { + // Nothing to do + return std::make_pair(sycl::event(), sycl::event()); + } + + // check that dst and src do not overlap + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + throw py::value_error("Arrays index overlapping segments of memory"); + } + + // destination must be ample enough to accommodate all elements + { + auto dst_offsets = dst.get_minmax_offsets(); + size_t range = + static_cast(dst_offsets.second - dst_offsets.first); + if (range + 1 < sort_nelems * iter_nelems) { + throw py::value_error( + "Destination array can not accommodate all the " + "elements of source array."); + } + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + const auto &array_types = td_ns::usm_ndarray_types(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + if ((dst_typeid != static_cast(td_ns::typenum_t::INT64)) && + (dst_typeid != static_cast(td_ns::typenum_t::INT32))) + { + throw py::value_error( + "Output index array must have data type int32 or int64"); + } + + // handle special case when both reduction and iteration are 1D contiguous + bool is_src_c_contig = src.is_c_contiguous(); + bool is_dst_c_contig = dst.is_c_contiguous(); + + if (is_src_c_contig && is_dst_c_contig) { + using dpctl::tensor::kernels::stable_argsort_axis1_contig_impl; + + static constexpr py::ssize_t zero_offset = py::ssize_t(0); + + auto fn = stable_sort_contig_fns[src_typeid][dst_typeid]; + + if (fn == nullptr) { + throw py::value_error("Not implemented for given index type"); + } + + sycl::event comp_ev = + fn(exec_q, iter_nelems, sort_nelems, src.get_data(), dst.get_data(), + zero_offset, zero_offset, zero_offset, zero_offset, depends); + + sycl::event keep_args_alive_ev = + dpctl::utils::keep_args_alive(exec_q, {src, dst}, {comp_ev}); + + return std::make_pair(keep_args_alive_ev, comp_ev); + } + + return std::make_pair(sycl::event(), sycl::event()); +} + +using dpctl::tensor::kernels::sort_contig_fn_ptr_t; +static sort_contig_fn_ptr_t + ascending_argsort_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; +static sort_contig_fn_ptr_t + descending_argsort_contig_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +template +struct AscendingArgSortContigFactory +{ + fnT get() + { + if constexpr (std::is_same_v || + std::is_same_v) + { + using Comp = typename AscendingSorter::type; + + using dpctl::tensor::kernels::stable_argsort_axis1_contig_impl; + return stable_argsort_axis1_contig_impl; + } + else { + return nullptr; + } + } +}; + +template +struct DescendingArgSortContigFactory +{ + fnT get() + { + if constexpr (std::is_same_v || + std::is_same_v) + { + using Comp = typename DescendingSorter::type; + + using dpctl::tensor::kernels::stable_argsort_axis1_contig_impl; + return stable_argsort_axis1_contig_impl; + } + else { + return nullptr; + } + } +}; + +void init_argsort_dispatch_tables(void) +{ + using dpctl::tensor::kernels::sort_contig_fn_ptr_t; + + td_ns::DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(ascending_argsort_contig_dispatch_table); + + td_ns::DispatchTableBuilder< + 
sort_contig_fn_ptr_t, DescendingArgSortContigFactory, td_ns::num_types> + dtb2; + dtb2.populate_dispatch_table(descending_argsort_contig_dispatch_table); +} + +void init_argsort_functions(py::module_ m) +{ + dpctl::tensor::py_internal::init_argsort_dispatch_tables(); + + auto py_argsort_ascending = [](const dpctl::tensor::usm_ndarray &src, + const int trailing_dims_to_sort, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) + -> std::pair { + return dpctl::tensor::py_internal::py_argsort( + src, trailing_dims_to_sort, dst, exec_q, depends, + dpctl::tensor::py_internal:: + ascending_argsort_contig_dispatch_table); + }; + m.def("_argsort_ascending", py_argsort_ascending, py::arg("src"), + py::arg("trailing_dims_to_sort"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto py_argsort_descending = [](const dpctl::tensor::usm_ndarray &src, + const int trailing_dims_to_sort, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) + -> std::pair { + return dpctl::tensor::py_internal::py_argsort( + src, trailing_dims_to_sort, dst, exec_q, depends, + dpctl::tensor::py_internal:: + descending_argsort_contig_dispatch_table); + }; + m.def("_argsort_descending", py_argsort_descending, py::arg("src"), + py::arg("trailing_dims_to_sort"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + return; +} + +} // end of namespace py_internal +} // end of namespace tensor +} // end of namespace dpctl diff --git a/dpctl/tensor/libtensor/source/sorting/argsort.hpp b/dpctl/tensor/libtensor/source/sorting/argsort.hpp new file mode 100644 index 0000000000..70548a492c --- /dev/null +++ b/dpctl/tensor/libtensor/source/sorting/argsort.hpp @@ -0,0 +1,18 @@ +#pragma once + +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_argsort_functions(py::module_); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/sorting/sort.cpp b/dpctl/tensor/libtensor/source/sorting/sort.cpp new file mode 100644 index 0000000000..e163fd1494 --- /dev/null +++ b/dpctl/tensor/libtensor/source/sorting/sort.cpp @@ -0,0 +1,212 @@ + +#include "dpctl4pybind11.hpp" +#include +#include +#include + +#include "utils/math_utils.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/sorting.hpp" +#include "sort.hpp" +#include "sorting_common.hpp" + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +template +std::pair +py_sort(const dpctl::tensor::usm_ndarray &src, + const int trailing_dims_to_sort, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends, + const sorting_contig_impl_fnT &stable_sort_contig_fns) +{ + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + if (src_nd != dst_nd) { + throw py::value_error("The input and output arrays must have " + "the same array ranks"); + } + int iteration_nd = src_nd - trailing_dims_to_sort; + if (trailing_dims_to_sort <= 0 || iteration_nd < 0) { + throw py::value_error("Trailing_dim_to_sort must be positive, but no " + "greater than rank of the array being sorted"); + } + + const py::ssize_t *src_shape_ptr = src.get_shape_raw(); + const py::ssize_t *dst_shape_ptr = dst.get_shape_raw(); + + bool same_shapes = true; + size_t iter_nelems(1); + + for (int i = 0; same_shapes && 
(i < iteration_nd); ++i) { + auto src_shape_i = src_shape_ptr[i]; + same_shapes = same_shapes && (src_shape_i == dst_shape_ptr[i]); + iter_nelems *= static_cast(src_shape_i); + } + + size_t sort_nelems(1); + for (int i = iteration_nd; same_shapes && (i < src_nd); ++i) { + auto src_shape_i = src_shape_ptr[i]; + same_shapes = same_shapes && (src_shape_i == dst_shape_ptr[i]); + sort_nelems *= static_cast(src_shape_i); + } + + if (!same_shapes) { + throw py::value_error( + "Destination shape does not match the input shape"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + if ((iter_nelems == 0) || (sort_nelems == 0)) { + // Nothing to do + return std::make_pair(sycl::event(), sycl::event()); + } + + // check that dst and src do not overlap + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + throw py::value_error("Arrays index overlapping segments of memory"); + } + + // destination must be ample enough to accommodate all elements + { + auto dst_offsets = dst.get_minmax_offsets(); + size_t range = + static_cast(dst_offsets.second - dst_offsets.first); + if (range + 1 < sort_nelems * iter_nelems) { + throw py::value_error( + "Destination array can not accommodate all the " + "elements of source array."); + } + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + const auto &array_types = td_ns::usm_ndarray_types(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + if (src_typeid != dst_typeid) { + throw py::value_error("Both input arrays must have " + "the same value data type"); + } + + // handle special case when both reduction and iteration are 1D contiguous + bool is_src_c_contig = src.is_c_contiguous(); + bool is_dst_c_contig = dst.is_c_contiguous(); + + if (is_src_c_contig && is_dst_c_contig) { + using dpctl::tensor::kernels::stable_sort_axis1_contig_impl; + + static constexpr py::ssize_t zero_offset = py::ssize_t(0); + + auto fn = stable_sort_contig_fns[src_typeid]; + + sycl::event comp_ev = + fn(exec_q, iter_nelems, sort_nelems, src.get_data(), dst.get_data(), + zero_offset, zero_offset, zero_offset, zero_offset, depends); + + sycl::event keep_args_alive_ev = + dpctl::utils::keep_args_alive(exec_q, {src, dst}, {comp_ev}); + + return std::make_pair(keep_args_alive_ev, comp_ev); + } + + return std::make_pair(sycl::event(), sycl::event()); +} + +using dpctl::tensor::kernels::sort_contig_fn_ptr_t; +static sort_contig_fn_ptr_t + ascending_sort_contig_dispatch_vector[td_ns::num_types]; +static sort_contig_fn_ptr_t + descending_sort_contig_dispatch_vector[td_ns::num_types]; + +template struct AscendingSortContigFactory +{ + fnT get() + { + using Comp = typename AscendingSorter::type; + + using dpctl::tensor::kernels::stable_sort_axis1_contig_impl; + return stable_sort_axis1_contig_impl; + } +}; + +template struct DescendingSortContigFactory +{ + fnT get() + { + using Comp = typename DescendingSorter::type; + using dpctl::tensor::kernels::stable_sort_axis1_contig_impl; + return stable_sort_axis1_contig_impl; + } +}; + +void init_sort_dispatch_vectors(void) +{ + using dpctl::tensor::kernels::sort_contig_fn_ptr_t; + + td_ns::DispatchVectorBuilder + dtv1; + dtv1.populate_dispatch_vector(ascending_sort_contig_dispatch_vector); + + td_ns::DispatchVectorBuilder + dtv2; + 
dtv2.populate_dispatch_vector(descending_sort_contig_dispatch_vector); +} + +void init_sort_functions(py::module_ m) +{ + dpctl::tensor::py_internal::init_sort_dispatch_vectors(); + + auto py_sort_ascending = [](const dpctl::tensor::usm_ndarray &src, + const int trailing_dims_to_sort, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) + -> std::pair { + return dpctl::tensor::py_internal::py_sort( + src, trailing_dims_to_sort, dst, exec_q, depends, + dpctl::tensor::py_internal::ascending_sort_contig_dispatch_vector); + }; + m.def("_sort_ascending", py_sort_ascending, py::arg("src"), + py::arg("trailing_dims_to_sort"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto py_sort_descending = [](const dpctl::tensor::usm_ndarray &src, + const int trailing_dims_to_sort, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) + -> std::pair { + return dpctl::tensor::py_internal::py_sort( + src, trailing_dims_to_sort, dst, exec_q, depends, + dpctl::tensor::py_internal::descending_sort_contig_dispatch_vector); + }; + m.def("_sort_descending", py_sort_descending, py::arg("src"), + py::arg("trailing_dims_to_sort"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + return; +} + +} // end of namespace py_internal +} // end of namespace tensor +} // end of namespace dpctl diff --git a/dpctl/tensor/libtensor/source/sorting/sort.hpp b/dpctl/tensor/libtensor/source/sorting/sort.hpp new file mode 100644 index 0000000000..f47f8f4173 --- /dev/null +++ b/dpctl/tensor/libtensor/source/sorting/sort.hpp @@ -0,0 +1,18 @@ +#pragma once + +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_sort_functions(py::module_); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/sorting/sorting_common.hpp b/dpctl/tensor/libtensor/source/sorting/sorting_common.hpp new file mode 100644 index 0000000000..d6e1e4e71a --- /dev/null +++ b/dpctl/tensor/libtensor/source/sorting/sorting_common.hpp @@ -0,0 +1,54 @@ +#pragma once + +#include "utils/math_utils.hpp" + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +template struct ComplexLess +{ + bool operator()(const cT &v1, const cT &v2) const + { + using dpctl::tensor::math_utils::less_complex; + + return less_complex(v1, v2); + } +}; + +template struct ComplexGreater +{ + bool operator()(const cT &v1, const cT &v2) const + { + using dpctl::tensor::math_utils::greater_complex; + + return greater_complex(v1, v2); + } +}; + +template struct AscendingSorter +{ + using type = std::less; +}; + +template struct AscendingSorter> +{ + using type = ComplexLess>; +}; + +template struct DescendingSorter +{ + using type = std::greater; +}; + +template struct DescendingSorter> +{ + using type = ComplexGreater>; +}; + +} // end of namespace py_internal +} // end of namespace tensor +} // end of namespace dpctl diff --git a/dpctl/tensor/libtensor/source/tensor_sorting.cpp b/dpctl/tensor/libtensor/source/tensor_sorting.cpp new file mode 100644 index 0000000000..5d534d85cd --- /dev/null +++ b/dpctl/tensor/libtensor/source/tensor_sorting.cpp @@ -0,0 +1,37 @@ +//===-- tensor_sorting.cpp - -----*-C++-*-/===// +// Implementation of _tensor_reductions_impl module +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the 
"License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#include + +#include "sorting/argsort.hpp" +#include "sorting/sort.hpp" + +namespace py = pybind11; + +PYBIND11_MODULE(_tensor_sorting_impl, m) +{ + dpctl::tensor::py_internal::init_sort_functions(m); + dpctl::tensor::py_internal::init_argsort_functions(m); +} diff --git a/dpctl/tests/test_usm_ndarray_sorting.py b/dpctl/tests/test_usm_ndarray_sorting.py new file mode 100644 index 0000000000..f0b57129d9 --- /dev/null +++ b/dpctl/tests/test_usm_ndarray_sorting.py @@ -0,0 +1,128 @@ +import pytest + +import dpctl.tensor as dpt +from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported + + +@pytest.mark.parametrize( + "dtype", + [ + "i1", + "u1", + "i2", + "u2", + "i4", + "u4", + "i8", + "u8", + "f2", + "f4", + "f8", + "c8", + "c16", + ], +) +def test_sort_1d(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + inp = dpt.roll( + dpt.concat( + (dpt.ones(10000, dtype=dtype), dpt.zeros(10000, dtype=dtype)) + ), + 734, + ) + + s = dpt.sort(inp, descending=False) + assert dpt.all(s[:-1] <= s[1:]) + + s1 = dpt.sort(inp, descending=True) + assert dpt.all(s1[:-1] >= s1[1:]) + + +@pytest.mark.parametrize( + "dtype", + [ + "i1", + "u1", + "i2", + "u2", + "i4", + "u4", + "i8", + "u8", + "f2", + "f4", + "f8", + "c8", + "c16", + ], +) +def test_sort_2d(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + fl = dpt.roll( + dpt.concat( + (dpt.ones(10000, dtype=dtype), dpt.zeros(10000, dtype=dtype)) + ), + 734, + ) + inp = dpt.reshape(fl, (20, -1)) + + s = dpt.sort(inp, axis=1, descending=False) + assert dpt.all(s[:, :-1] <= s[:, 1:]) + + s1 = dpt.sort(inp, axis=1, descending=True) + assert dpt.all(s1[:, :-1] >= s1[:, 1:]) + + +def test_sort_strides(): + + fl = dpt.roll( + dpt.concat((dpt.ones(10000, dtype="i4"), dpt.zeros(10000, dtype="i4"))), + 734, + ) + inp = dpt.reshape(fl, (-1, 20)) + + s = dpt.sort(inp, axis=0, descending=False) + assert dpt.all(s[:-1, :] <= s[1:, :]) + + s1 = dpt.sort(inp, axis=0, descending=True) + assert dpt.all(s1[:-1, :] >= s1[1:, :]) + + +@pytest.mark.parametrize( + "dtype", + [ + "i1", + "u1", + "i2", + "u2", + "i4", + "u4", + "i8", + "u8", + "f2", + "f4", + "f8", + "c8", + "c16", + ], +) +def test_argsort_1d(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + inp = dpt.roll( + dpt.concat( + (dpt.ones(10000, dtype=dtype), dpt.zeros(10000, dtype=dtype)) + ), + 734, + ) + + s_idx = dpt.argsort(inp, descending=False) + assert dpt.all(inp[s_idx[:-1]] <= inp[s_idx[1:]]) + + s1_idx = dpt.argsort(inp, descending=True) + assert dpt.all(inp[s1_idx[:-1]] >= inp[s1_idx[1:]]) From 84437f1938a594dfe92add9264bb02ffa063d034 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Wed, 6 Dec 2023 10:52:44 -0600 Subject: [PATCH 02/34] Added docstrings --- 
dpctl/tensor/_sorting.py | 69 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 68 insertions(+), 1 deletion(-) diff --git a/dpctl/tensor/_sorting.py b/dpctl/tensor/_sorting.py index 8ad496d0a4..ba1590ac32 100644 --- a/dpctl/tensor/_sorting.py +++ b/dpctl/tensor/_sorting.py @@ -1,3 +1,19 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from numpy.core.numeric import normalize_axis_index import dpctl @@ -12,7 +28,32 @@ ) -def sort(x, axis=-1, descending=False, stable=False): +def sort(x, /, *, axis=-1, descending=False, stable=False): + """sort(x, axis=-1, descending=False, stable=False) + + Returns a sorted copy of an input array `x`. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int]): + axis along which to sort. If set to `-1`, the function + must sort along the last axis. Default: `-1`. + descending (Optional[bool]): + sort order. If `True`, the array must be sorted in descending + order (by value). If `False`, the array must be sorted in + ascending order (by value). Default: `False`. + stable (Optional[bool]): + sort stability. If `True`, the returned array must maintain the + relative order of `x` values which compare as equal. If `False`, + the returned array may or may not maintain the relative order of + `x` values which compare as equal. Default: `True`. + + Returns: + usm_ndarray: + a sorted array. The returned array has the same data type and + the same shape as the input array `x`. + """ if not isinstance(x, dpt.usm_ndarray): raise TypeError( f"Expected type dpctl.tensor.usm_ndarray, got {type(x)}" @@ -60,6 +101,32 @@ def sort(x, axis=-1, descending=False, stable=False): def argsort(x, axis=-1, descending=False, stable=False): + """argsort(x, axis=-1, descending=False, stable=False) + + Returns the indices that sort an array `x` along a specified axis. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int]): + axis along which to sort. If set to `-1`, the function + must sort along the last axis. Default: `-1`. + descending (Optional[bool]): + sort order. If `True`, the array must be sorted in descending + order (by value). If `False`, the array must be sorted in + ascending order (by value). Default: `False`. + stable (Optional[bool]): + sort stability. If `True`, the returned array must maintain the + relative order of `x` values which compare as equal. If `False`, + the returned array may or may not maintain the relative order of + `x` values which compare as equal. Default: `True`. + + Returns: + usm_ndarray: + an array of indices. The returned array has the same shape as + the input array `x`. The return array has default array index + data type. 
+ """ if not isinstance(x, dpt.usm_ndarray): raise TypeError( f"Expected type dpctl.tensor.usm_ndarray, got {type(x)}" From 8076adfb38da7b855f34316c4c4bd89c9f37532e Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Wed, 6 Dec 2023 10:53:45 -0600 Subject: [PATCH 03/34] Added license header comments --- .../libtensor/source/sorting/argsort.cpp | 23 ++++++++++++++++++ .../libtensor/source/sorting/argsort.hpp | 24 +++++++++++++++++++ .../tensor/libtensor/source/sorting/sort.cpp | 23 ++++++++++++++++++ .../tensor/libtensor/source/sorting/sort.hpp | 24 +++++++++++++++++++ .../source/sorting/sorting_common.hpp | 24 +++++++++++++++++++ 5 files changed, 118 insertions(+) diff --git a/dpctl/tensor/libtensor/source/sorting/argsort.cpp b/dpctl/tensor/libtensor/source/sorting/argsort.cpp index 753d516e7b..f23084fe21 100644 --- a/dpctl/tensor/libtensor/source/sorting/argsort.cpp +++ b/dpctl/tensor/libtensor/source/sorting/argsort.cpp @@ -1,3 +1,26 @@ +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. +//===--------------------------------------------------------------------===// #include "dpctl4pybind11.hpp" #include diff --git a/dpctl/tensor/libtensor/source/sorting/argsort.hpp b/dpctl/tensor/libtensor/source/sorting/argsort.hpp index 70548a492c..7637f2df4c 100644 --- a/dpctl/tensor/libtensor/source/sorting/argsort.hpp +++ b/dpctl/tensor/libtensor/source/sorting/argsort.hpp @@ -1,3 +1,27 @@ +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. 
+//===--------------------------------------------------------------------===// + #pragma once #include diff --git a/dpctl/tensor/libtensor/source/sorting/sort.cpp b/dpctl/tensor/libtensor/source/sorting/sort.cpp index e163fd1494..8a4a710354 100644 --- a/dpctl/tensor/libtensor/source/sorting/sort.cpp +++ b/dpctl/tensor/libtensor/source/sorting/sort.cpp @@ -1,3 +1,26 @@ +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. +//===--------------------------------------------------------------------===// #include "dpctl4pybind11.hpp" #include diff --git a/dpctl/tensor/libtensor/source/sorting/sort.hpp b/dpctl/tensor/libtensor/source/sorting/sort.hpp index f47f8f4173..2ad5bdb623 100644 --- a/dpctl/tensor/libtensor/source/sorting/sort.hpp +++ b/dpctl/tensor/libtensor/source/sorting/sort.hpp @@ -1,3 +1,27 @@ +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. +//===--------------------------------------------------------------------===// + #pragma once #include diff --git a/dpctl/tensor/libtensor/source/sorting/sorting_common.hpp b/dpctl/tensor/libtensor/source/sorting/sorting_common.hpp index d6e1e4e71a..6e8f273ea5 100644 --- a/dpctl/tensor/libtensor/source/sorting/sorting_common.hpp +++ b/dpctl/tensor/libtensor/source/sorting/sorting_common.hpp @@ -1,3 +1,27 @@ +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. +//===--------------------------------------------------------------------===// + #pragma once #include "utils/math_utils.hpp" From 3cb7b150586ffcd5a56ba691642fb9ccf71a9f95 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Wed, 6 Dec 2023 10:53:59 -0600 Subject: [PATCH 04/34] Corrected variable name used in assertion Removed dead code, added include of cassert, added lincese header comment. --- .../libtensor/include/kernels/sorting.hpp | 143 +++--------------- 1 file changed, 25 insertions(+), 118 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/sorting.hpp b/dpctl/tensor/libtensor/include/kernels/sorting.hpp index cda9ce770d..8b0532918e 100644 --- a/dpctl/tensor/libtensor/include/kernels/sorting.hpp +++ b/dpctl/tensor/libtensor/include/kernels/sorting.hpp @@ -1,7 +1,32 @@ +//=== sorting.hpp - Implementation of sorting kernels ---*-C++-*--/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for tensor sort/argsort operations. +//===----------------------------------------------------------------------===// + #pragma once #include "pybind11/pybind11.h" +#include #include #include #include @@ -537,115 +562,6 @@ sort_base_step_contig_impl(sycl::queue &q, return base_sort; } -template -class exp_sort_over_work_group_contig_krn; - -template -sycl::event exp_sort_over_work_group_contig_impl( - sycl::queue &q, - size_t iter_nelems, - size_t sort_nelems, - const InpAcc input, - OutAcc output, - const Comp &comp, - size_t &conseq_nelems_sorted, - const std::vector &depends = {}) -{ - - using inpT = typename GetValueType::value_type; - using outT = typename GetValueType::value_type; - using KernelName = exp_sort_over_work_group_contig_krn; - - const auto &kernel_id = sycl::get_kernel_id(); - - auto const &ctx = q.get_context(); - auto const &dev = q.get_device(); - auto kb = sycl::get_kernel_bundle( - ctx, {dev}, {kernel_id}); - - auto krn = kb.template get_kernel(kernel_id); - - std::uint32_t max_sg_size = krn.template get_info< - sycl::info::kernel_device_specific::max_sub_group_size>(dev); - - const size_t lws = 4 * max_sg_size; - const std::uint32_t chunk_size = dev.has(sycl::aspect::cpu) ? 
8 : 2; - conseq_nelems_sorted = chunk_size * lws; - - // This assumption permits doing away with using a loop - assert(nelems_wg_sorts % lws == 0); - - sycl::event exp_default_sort_ev = q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - - const size_t n_chunks = - quotient_ceil(sort_nelems, conseq_nelems_sorted); - - sycl::range<1> global_range{iter_nelems * n_chunks * lws}; - sycl::range<1> local_range{lws}; - - sycl::range<1> slm_size_range{conseq_nelems_sorted}; - - using Sorter = sycl::ext::oneapi::experimental::default_sorter; - - // calculate required local memory size - // MUST pass local_range, not integer. - // Have different meanings - const size_t temp_memory_size = Sorter::template memory_required( - sycl::memory_scope::work_group, conseq_nelems_sorted); - - sycl::local_accessor workspace(slm_size_range, cgh); - sycl::local_accessor scratch({temp_memory_size}, cgh); - - cgh.parallel_for( - sycl::nd_range<1>(global_range, local_range), - [=](sycl::nd_item<1> it) { - auto sorter_op = Sorter(sycl::span{ - scratch - .template get_multi_ptr() - .get(), - temp_memory_size}); - - const size_t gr_id = it.get_group_linear_id(); - const size_t iter_id = gr_id / n_chunks; - const size_t sort_chunk_id = gr_id - iter_id * n_chunks; - - const std::uint32_t lid = it.get_local_linear_id(); - - const size_t iter_offset = iter_id * sort_nelems; - const size_t chunk_offset = - sort_chunk_id * conseq_nelems_sorted; - const size_t global_start_offset = iter_offset + chunk_offset; - const size_t workspace_size = - std::min(sort_nelems, - chunk_offset + conseq_nelems_sorted) - - chunk_offset; - for (std::uint32_t i = lid; i < workspace_size; i += lws) { - workspace[i] = input[global_start_offset + i]; - } - sycl::group_barrier(it.get_group()); - - sycl::ext::oneapi::experimental::joint_sort( - it.get_group(), - workspace - .template get_multi_ptr() - .get(), - workspace - .template get_multi_ptr< - sycl::access::decorated::no>() - .get() + - workspace_size, - sorter_op); - - for (std::uint32_t i = lid; i < workspace_size; i += lws) { - output[global_start_offset + i] = workspace[i]; - } - }); - }); - - return exp_default_sort_ev; -} - class vacuous_krn; inline sycl::event tie_events(sycl::queue &q, @@ -847,20 +763,11 @@ sycl::event stable_sort_axis1_contig_impl( (sort_nelems >= 512) ? 
512 : determine_automatically; // Sort segments of the array -#if 1 sycl::event base_sort_ev = sort_detail::sort_over_work_group_contig_impl< const argTy *, argTy *, Comp>( exec_q, iter_nelems, sort_nelems, arg_tp, res_tp, comp, sorted_block_size, // modified in place with size of sorted block size depends); -#else - sycl::event base_sort_ev = - sort_detail::sort_base_step_contig_impl( - exec_q, iter_nelems, sort_nelems, arg_tp, res_tp, comp, - sorted_block_size, // modified in place with size of sorted block - // size - depends); -#endif // Merge segments in parallel until all elements are sorted sycl::event merges_ev = From d79962d10ee68d1e4deb0cb86a1f1b6bdbf0ad79 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Thu, 7 Dec 2023 11:02:12 -0600 Subject: [PATCH 05/34] Ensure that we use no more SLM than is available --- .../libtensor/include/kernels/sorting.hpp | 137 ++++++++++-------- 1 file changed, 80 insertions(+), 57 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/sorting.hpp b/dpctl/tensor/libtensor/include/kernels/sorting.hpp index 8b0532918e..02c9a551f9 100644 --- a/dpctl/tensor/libtensor/include/kernels/sorting.hpp +++ b/dpctl/tensor/libtensor/include/kernels/sorting.hpp @@ -378,6 +378,60 @@ struct GetReadWriteAccess> } }; +template +class sort_base_step_contig_krn; + +template +sycl::event +sort_base_step_contig_impl(sycl::queue &q, + size_t iter_nelems, + size_t sort_nelems, + const InpAcc input, + OutAcc output, + const Comp &comp, + size_t &conseq_nelems_sorted, + const std::vector &depends = {}) +{ + + using inpT = typename GetValueType::value_type; + using outT = typename GetValueType::value_type; + using KernelName = sort_base_step_contig_krn; + + conseq_nelems_sorted = (q.get_device().has(sycl::aspect::cpu) ? 16 : 4); + + size_t n_segments = + quotient_ceil(sort_nelems, conseq_nelems_sorted); + + sycl::event base_sort = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + sycl::range<1> gRange{iter_nelems * n_segments}; + + auto input_acc = GetReadOnlyAccess{}(input, cgh); + auto output_acc = GetWriteDiscardAccess{}(output, cgh); + + cgh.parallel_for(gRange, [=](sycl::id<1> id) { + const size_t iter_id = id[0] / n_segments; + const size_t segment_id = id[0] - iter_id * n_segments; + + const size_t iter_offset = iter_id * sort_nelems; + const size_t beg_id = + iter_offset + segment_id * conseq_nelems_sorted; + const size_t end_id = + iter_offset + + std::min((segment_id + 1) * conseq_nelems_sorted, + sort_nelems); + for (size_t i = beg_id; i < end_id; ++i) { + output_acc[i] = input_acc[i]; + } + + leaf_sort_impl(output_acc, beg_id, end_id, comp); + }); + }); + + return base_sort; +} + template class sort_over_work_group_contig_krn; @@ -393,8 +447,8 @@ sort_over_work_group_contig_impl(sycl::queue &q, const std::vector &depends = {}) { using inpT = typename GetValueType::value_type; - using outT = typename GetValueType::value_type; - using KernelName = sort_over_work_group_contig_krn; + using T = typename GetValueType::value_type; + using KernelName = sort_over_work_group_contig_krn; const auto &kernel_id = sycl::get_kernel_id(); @@ -405,12 +459,30 @@ sort_over_work_group_contig_impl(sycl::queue &q, auto krn = kb.template get_kernel(kernel_id); - std::uint32_t max_sg_size = krn.template get_info< + const std::uint32_t max_sg_size = krn.template get_info< sycl::info::kernel_device_specific::max_sub_group_size>(dev); + const std::uint64_t device_local_memory_size = + dev.get_info(); + + // leave 512 bytes of local memory for RT + const 
std::uint64_t safety_margin = 512; + + const std::uint64_t nelems_per_slm = + (device_local_memory_size - safety_margin) / (2 * sizeof(T)); - const size_t lws = 4 * max_sg_size; - const std::uint32_t chunk_size = dev.has(sycl::aspect::cpu) ? 8 : 2; - nelems_wg_sorts = chunk_size * lws; + constexpr std::uint32_t sub_groups_per_work_group = 4; + const std::uint32_t elems_per_wi = dev.has(sycl::aspect::cpu) ? 8 : 2; + + const size_t lws = sub_groups_per_work_group * max_sg_size; + + nelems_wg_sorts = elems_per_wi * lws; + + if (nelems_wg_sorts > nelems_per_slm) { + nelems_wg_sorts = 0; + return sort_base_step_contig_impl( + q, iter_nelems, sort_nelems, input, output, comp, nelems_wg_sorts, + depends); + } // This assumption permits doing away with using a loop assert(nelems_wg_sorts % lws == 0); @@ -421,11 +493,12 @@ sort_over_work_group_contig_impl(sycl::queue &q, sycl::event base_sort_ev = q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); + cgh.use_kernel_bundle(kb); + sycl::range<1> global_range{iter_nelems * n_segments * lws}; sycl::range<1> local_range{lws}; sycl::range<1> slm_range{nelems_wg_sorts}; - using T = typename GetValueType::value_type; sycl::local_accessor work_space(slm_range, cgh); sycl::local_accessor scratch_space(slm_range, cgh); @@ -512,56 +585,6 @@ sort_over_work_group_contig_impl(sycl::queue &q, return base_sort_ev; } -template -class sort_base_step_contig_krn; - -template -sycl::event -sort_base_step_contig_impl(sycl::queue &q, - size_t iter_nelems, - size_t sort_nelems, - const InpAcc input, - OutAcc output, - const Comp &comp, - size_t &conseq_nelems_sorted, - const std::vector &depends = {}) -{ - - using inpT = typename GetValueType::value_type; - using outT = typename GetValueType::value_type; - using KernelName = sort_base_step_contig_krn; - - conseq_nelems_sorted = (q.get_device().has(sycl::aspect::cpu) ? 16 : 4); - - size_t n_segments = - quotient_ceil(sort_nelems, conseq_nelems_sorted); - - sycl::event base_sort = q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - - sycl::range<1> gRange{iter_nelems * n_segments}; - cgh.parallel_for(gRange, [=](sycl::id<1> id) { - const size_t iter_id = id[0] / n_segments; - const size_t segment_id = id[0] - iter_id * n_segments; - - const size_t iter_offset = iter_id * sort_nelems; - const size_t beg_id = - iter_offset + segment_id * conseq_nelems_sorted; - const size_t end_id = - iter_offset + - std::min((segment_id + 1) * conseq_nelems_sorted, - sort_nelems); - for (size_t i = beg_id; i < end_id; ++i) { - output[i] = input[i]; - } - - leaf_sort_impl(output, beg_id, end_id, comp); - }); - }); - - return base_sort; -} - class vacuous_krn; inline sycl::event tie_events(sycl::queue &q, From aa1df8b7c8eb55c60aea7ff67df0ffc6b4fd62bf Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 8 Dec 2023 09:46:00 -0600 Subject: [PATCH 06/34] Implementation of set functions Implements unique_all, unique_values, unique_counts, unique_inverse based on sort and argsort. Implementation is asynchronous, but can be improved ince searchsorted becomes available. 
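For illustration, a short usage sketch of the new set functions (assuming a SYCL device is
available and that the functions are exposed in the dpctl.tensor namespace, which a later
patch in this series does):

    import dpctl.tensor as dpt

    # an array with repeated values in arbitrary order
    x = dpt.asarray([3, 1, 3, 2, 1, 1], dtype="i4")

    # unique_values returns the sorted distinct values: [1, 2, 3]
    vals = dpt.unique_values(x)

    # unique_counts also reports how many times each value occurs
    vals, counts = dpt.unique_counts(x)   # counts == [3, 1, 2]

    # unique_inverse returns indices such that vals[inv] reconstructs x
    vals, inv = dpt.unique_inverse(x)
    assert dpt.all(vals[inv] == x)

    # unique_all combines the above and adds first-occurrence indices
    r = dpt.unique_all(x)
    assert dpt.all(x[r.indices] == r.values)
    assert dpt.all(r.values[r.inverse_indices] == x)
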
--- dpctl/tensor/_set_functions.py | 547 +++++++++++++++++++++++++++++++++ 1 file changed, 547 insertions(+) create mode 100644 dpctl/tensor/_set_functions.py diff --git a/dpctl/tensor/_set_functions.py b/dpctl/tensor/_set_functions.py new file mode 100644 index 0000000000..c26fbc2848 --- /dev/null +++ b/dpctl/tensor/_set_functions.py @@ -0,0 +1,547 @@ +from typing import NamedTuple + +import dpctl +import dpctl.tensor as dpt + +from ._tensor_elementwise_impl import _not_equal, _subtract +from ._tensor_impl import ( + _copy_usm_ndarray_into_usm_ndarray, + _extract, + _full_usm_ndarray, + _linspace_step, + _take, + default_device_index_type, + mask_positions, +) +from ._tensor_sorting_impl import _argsort_ascending, _sort_ascending + + +class UniqueAllResult(NamedTuple): + values: dpt.usm_ndarray + indices: dpt.usm_ndarray + inverse_indices: dpt.usm_ndarray + counts: dpt.usm_ndarray + + +class UniqueCountsResult(NamedTuple): + values: dpt.usm_ndarray + counts: dpt.usm_ndarray + + +class UniqueInverseResult(NamedTuple): + values: dpt.usm_ndarray + inverse_indices: dpt.usm_ndarray + + +def unique_values(x: dpt.usm_ndarray) -> dpt.usm_ndarray: + """unique_values(x) + + Returns the unique elements of an input array x. + + Args: + x (usm_ndarray): + input array. The input with more than one dimension is flattened. + Returns: + usm_ndarray + an array containing the set of unique elements in `x`. The + returned array has the same data type as `x`. + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") + array_api_dev = x.device + exec_q = array_api_dev.sycl_queue + if x.ndim == 1: + fx = x + else: + fx = dpt.reshape(x, (x.size,), order="C", copy=False) + s = dpt.empty_like(fx, order="C") + host_tasks = [] + if fx.flags.c_contiguous: + ht_ev, sort_ev = _sort_ascending( + src=fx, trailing_dims_to_sort=1, dst=s, sycl_queue=exec_q + ) + host_tasks.append(ht_ev) + else: + tmp = dpt.empty_like(fx, order="C") + ht_ev, copy_ev = _copy_usm_ndarray_into_usm_ndarray( + src=fx, dst=tmp, sycl_queue=exec_q + ) + host_tasks.append(ht_ev) + ht_ev, sort_ev = _sort_ascending( + src=fx, + trailing_dims_to_sort=1, + dst=s, + sycl_queue=exec_q, + depends=[copy_ev], + ) + host_tasks.append(ht_ev) + unique_mask = dpt.empty(fx.shape, dtype="?", sycl_queue=exec_q) + ht_ev, uneq_ev = _not_equal( + src1=s[:-1], + src2=s[1:], + dst=unique_mask[1:], + sycl_queue=exec_q, + depends=[sort_ev], + ) + host_tasks.append(ht_ev) + ht_ev, one_ev = _full_usm_ndarray( + fill_value=True, dst=unique_mask[0], sycl_queue=exec_q + ) + host_tasks.append(ht_ev) + cumsum = dpt.empty(s.shape, dtype=dpt.int64) + # synchronizing call + n_uniques = mask_positions( + unique_mask, cumsum, sycl_queue=exec_q, depends=[one_ev, uneq_ev] + ) + if n_uniques == fx.size: + dpctl.SyclEvent.wait_for(host_tasks) + return s + unique_vals = dpt.empty( + n_uniques, dtype=x.dtype, usm_type=x.usm_type, sycl_queue=exec_q + ) + ht_ev, _ = _extract( + src=s, + cumsum=cumsum, + axis_start=0, + axis_end=1, + dst=unique_vals, + sycl_queue=exec_q, + ) + host_tasks.append(ht_ev) + dpctl.SyclEvent.wait_for(host_tasks) + return unique_vals + + +def unique_counts(x: dpt.usm_ndarray) -> UniqueCountsResult: + """unique_counts(x) + + Returns the unique elements of an input array `x` and the corresponding + counts for each unique element in `x`. + + Args: + x (usm_ndarray): + input array. The input with more than one dimension is flattened. 
+ Returns: + tuple[usm_ndarray, usm_ndarray] + a namedtuple `(values, counts)` whose + + * first element is the field name `values` and is an array + containing the unique elements of `x`. This array has the + same data type as `x`. + * second element has the field name `counts` and is an array + containing the number of times each unique element occurs in `x`. + This array has the same shape as `values` and has the default + array index data type. + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") + array_api_dev = x.device + exec_q = array_api_dev.sycl_queue + if x.ndim == 1: + fx = x + else: + fx = dpt.reshape(x, (x.size,), order="C", copy=False) + s = dpt.empty_like(fx, order="C") + host_tasks = [] + if fx.flags.c_contiguous: + ht_ev, sort_ev = _sort_ascending( + src=fx, trailing_dims_to_sort=1, dst=s, sycl_queue=exec_q + ) + host_tasks.append(ht_ev) + else: + tmp = dpt.empty_like(fx, order="C") + ht_ev, copy_ev = _copy_usm_ndarray_into_usm_ndarray( + src=fx, dst=tmp, sycl_queue=exec_q + ) + host_tasks.append(ht_ev) + ht_ev, sort_ev = _sort_ascending( + src=fx, + dst=s, + trailing_dims_to_sort=1, + sycl_queue=exec_q, + depends=[copy_ev], + ) + host_tasks.append(ht_ev) + unique_mask = dpt.empty(s.shape, dtype="?", sycl_queue=exec_q) + ht_ev, uneq_ev = _not_equal( + src1=s[:-1], + src2=s[1:], + dst=unique_mask[1:], + sycl_queue=exec_q, + depends=[sort_ev], + ) + host_tasks.append(ht_ev) + ht_ev, one_ev = _full_usm_ndarray( + fill_value=True, dst=unique_mask[0], sycl_queue=exec_q + ) + host_tasks.append(ht_ev) + ind_dt = default_device_index_type(exec_q) + cumsum = dpt.empty(unique_mask.shape, dtype=dpt.int64) + # synchronizing call + n_uniques = mask_positions( + unique_mask, cumsum, sycl_queue=exec_q, depends=[one_ev, uneq_ev] + ) + if n_uniques == fx.size: + dpctl.SyclEvent.wait_for(host_tasks) + return UniqueCountsResult(s, dpt.ones(n_uniques, dtype=ind_dt)) + unique_vals = dpt.empty( + n_uniques, dtype=x.dtype, usm_type=x.usm_type, sycl_queue=exec_q + ) + # populate unique values + ht_ev, _ = _extract( + src=s, + cumsum=cumsum, + axis_start=0, + axis_end=1, + dst=unique_vals, + sycl_queue=exec_q, + ) + host_tasks.append(ht_ev) + unique_counts = dpt.empty( + n_uniques + 1, dtype=ind_dt, usm_type=x.usm_type, sycl_queue=exec_q + ) + idx = dpt.empty(x.size, dtype=ind_dt, sycl_queue=exec_q) + ht_ev, id_ev = _linspace_step(start=0, dt=1, dst=idx, sycl_queue=exec_q) + host_tasks.append(ht_ev) + ht_ev, extr_ev = _extract( + src=idx, + cumsum=cumsum, + axis_start=0, + axis_end=1, + dst=unique_counts[:-1], + sycl_queue=exec_q, + depends=[id_ev], + ) + host_tasks.append(ht_ev) + ht_ev, set_ev = _full_usm_ndarray( + x.size, dst=unique_counts[-1], sycl_queue=exec_q + ) + host_tasks.append(ht_ev) + _counts = dpt.empty_like(unique_counts[1:]) + ht_ev, _ = _subtract( + src1=unique_counts[1:], + src2=unique_counts[:-1], + dst=_counts, + sycl_queue=exec_q, + depends=[set_ev, extr_ev], + ) + host_tasks.append(ht_ev) + dpctl.SyclEvent.wait_for(host_tasks) + return UniqueCountsResult(unique_vals, _counts) + + +def unique_inverse(x): + """unique_inverse + + Returns the unique elements of an input array x and the indices from the + set of unique elements that reconstruct `x`. + + Args: + x (usm_ndarray): + input array. The input with more than one dimension is flattened. 
+ Returns: + tuple[usm_ndarray, usm_ndarray] + a namedtuple `(values, inverse_indices)` whose + + * first element has the field name `values` and is an array + containing the unique elements of `x`. The array has the same + data type as `x`. + * second element has the field name `inverse_indices` and is an + array containing the indices of values that reconstruct `x`. + The array has the same shape as `x` and has the default array + index data type. + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") + array_api_dev = x.device + exec_q = array_api_dev.sycl_queue + if x.ndim == 1: + fx = x + else: + fx = dpt.reshape(x, (x.size,), order="C", copy=False) + ind_dt = default_device_index_type(exec_q) + sorting_ids = dpt.empty_like(fx, dtype=ind_dt, order="C") + unsorting_ids = dpt.empty_like(sorting_ids, dtype=ind_dt, order="C") + host_tasks = [] + if fx.flags.c_contiguous: + ht_ev, sort_ev = _argsort_ascending( + src=fx, trailing_dims_to_sort=1, dst=sorting_ids, sycl_queue=exec_q + ) + host_tasks.append(ht_ev) + else: + tmp = dpt.empty_like(fx, order="C") + ht_ev, copy_ev = _copy_usm_ndarray_into_usm_ndarray( + src=fx, dst=tmp, sycl_queue=exec_q + ) + host_tasks.append(ht_ev) + ht_ev, sort_ev = _argsort_ascending( + src=fx, + trailing_dims_to_sort=1, + dst=sorting_ids, + sycl_queue=exec_q, + depends=[copy_ev], + ) + host_tasks.append(ht_ev) + ht_ev, _ = _argsort_ascending( + src=sorting_ids, + trailing_dims_to_sort=1, + dst=unsorting_ids, + sycl_queue=exec_q, + depends=[sort_ev], + ) + host_tasks.append(ht_ev) + s = dpt.empty_like(fx) + # s = fx[sorting_ids] + ht_ev, take_ev = _take( + src=fx, + ind=(sorting_ids,), + dst=s, + axis_start=0, + mode=0, + sycl_queue=exec_q, + depends=[sort_ev], + ) + host_tasks.append(ht_ev) + unique_mask = dpt.empty(fx.shape, dtype="?", sycl_queue=exec_q) + ht_ev, uneq_ev = _not_equal( + src1=s[:-1], + src2=s[1:], + dst=unique_mask[1:], + sycl_queue=exec_q, + depends=[take_ev], + ) + host_tasks.append(ht_ev) + ht_ev, one_ev = _full_usm_ndarray( + fill_value=True, dst=unique_mask[0], sycl_queue=exec_q + ) + host_tasks.append(ht_ev) + cumsum = dpt.empty(unique_mask.shape, dtype=dpt.int64) + # synchronizing call + n_uniques = mask_positions( + unique_mask, cumsum, sycl_queue=exec_q, depends=[uneq_ev, one_ev] + ) + if n_uniques == fx.size: + dpctl.SyclEvent.wait_for(host_tasks) + return UniqueInverseResult(s, unsorting_ids) + unique_vals = dpt.empty( + n_uniques, dtype=x.dtype, usm_type=x.usm_type, sycl_queue=exec_q + ) + ht_ev, _ = _extract( + src=s, + cumsum=cumsum, + axis_start=0, + axis_end=1, + dst=unique_vals, + sycl_queue=exec_q, + ) + host_tasks.append(ht_ev) + cum_unique_counts = dpt.empty( + n_uniques + 1, dtype=ind_dt, usm_type=x.usm_type, sycl_queue=exec_q + ) + idx = dpt.empty(x.size, dtype=ind_dt, sycl_queue=exec_q) + ht_ev, id_ev = _linspace_step(start=0, dt=1, dst=idx, sycl_queue=exec_q) + host_tasks.append(ht_ev) + ht_ev, extr_ev = _extract( + src=idx, + cumsum=cumsum, + axis_start=0, + axis_end=1, + dst=cum_unique_counts[:-1], + sycl_queue=exec_q, + depends=[id_ev], + ) + host_tasks.append(ht_ev) + ht_ev, set_ev = _full_usm_ndarray( + x.size, dst=cum_unique_counts[-1], sycl_queue=exec_q + ) + host_tasks.append(ht_ev) + _counts = dpt.empty_like(cum_unique_counts[1:]) + ht_ev, _ = _subtract( + src1=cum_unique_counts[1:], + src2=cum_unique_counts[:-1], + dst=_counts, + sycl_queue=exec_q, + depends=[set_ev, extr_ev], + ) + host_tasks.append(ht_ev) + # TODO: when searchsorted is available, 
+ # inv = searchsorted(unique_vals, fx) + dpctl.SyclEvent.wait_for(host_tasks) + counts = dpt.asnumpy(_counts).tolist() + inv = dpt.empty_like(fx, dtype=ind_dt) + pos = 0 + host_tasks = [] + for i in range(len(counts)): + pos_next = pos + counts[i] + _dst = inv[pos:pos_next] + ht_ev, _ = _full_usm_ndarray(fill_value=i, dst=_dst, sycl_queue=exec_q) + pos = pos_next + host_tasks.append(ht_ev) + dpctl.SyclEvent.wait_for(host_tasks) + return UniqueInverseResult(unique_vals, inv[unsorting_ids]) + + +def unique_all(x: dpt.usm_ndarray) -> UniqueAllResult: + """unique_all(x) + + Returns the unique elements of an input array `x`, the first occurring + indices for each unique element in `x`, the indices from the set of unique + elements that reconstruct `x`, and the corresponding counts for each + unique element in `x`. + + Args: + x (usm_ndarray): + input array. The input with more than one dimension is flattened. + Returns: + tuple[usm_ndarray, usm_ndarray, usm_ndarray, usm_ndarray] + a namedtuple `(values, indices, inverse_indices, counts)` whose + + * first element has the field name `values` and is an array + containing the unique elements of `x`. The array has the same + data type as `x`. + * second element has the field name `indices` and is an array + the indices (of first occurrences) of `x` that result in + `values`. The array has the same shape as `values` and has the + default array index data type. + * third element has the field name `inverse_indices` and is an + array containing the indices of values that reconstruct `x`. + The array has the same shape as `x` and has the default array + index data type. + * fourth element has the field name `counts` and is an array + containing the number of times each unique element occurs in `x`. + This array has the same shape as `values` and has the default + array index data type. 
+ """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") + array_api_dev = x.device + exec_q = array_api_dev.sycl_queue + if x.ndim == 1: + fx = x + else: + fx = dpt.reshape(x, (x.size,), order="C", copy=False) + ind_dt = default_device_index_type(exec_q) + sorting_ids = dpt.empty_like(fx, dtype=ind_dt, order="C") + unsorting_ids = dpt.empty_like(sorting_ids, dtype=ind_dt, order="C") + host_tasks = [] + if fx.flags.c_contiguous: + ht_ev, sort_ev = _argsort_ascending( + src=fx, trailing_dims_to_sort=1, dst=sorting_ids, sycl_queue=exec_q + ) + host_tasks.append(ht_ev) + else: + tmp = dpt.empty_like(fx, order="C") + ht_ev, copy_ev = _copy_usm_ndarray_into_usm_ndarray( + src=fx, dst=tmp, sycl_queue=exec_q + ) + host_tasks.append(ht_ev) + ht_ev, sort_ev = _argsort_ascending( + src=fx, + trailing_dims_to_sort=1, + dst=sorting_ids, + sycl_queue=exec_q, + depends=[copy_ev], + ) + host_tasks.append(ht_ev) + ht_ev, _ = _argsort_ascending( + src=sorting_ids, + trailing_dims_to_sort=1, + dst=unsorting_ids, + sycl_queue=exec_q, + depends=[sort_ev], + ) + host_tasks.append(ht_ev) + s = dpt.empty_like(fx) + # s = fx[sorting_ids] + ht_ev, take_ev = _take( + src=fx, + ind=(sorting_ids,), + dst=s, + axis_start=0, + mode=0, + sycl_queue=exec_q, + depends=[sort_ev], + ) + host_tasks.append(ht_ev) + unique_mask = dpt.empty(fx.shape, dtype="?", sycl_queue=exec_q) + ht_ev, uneq_ev = _not_equal( + src1=s[:-1], + src2=s[1:], + dst=unique_mask[1:], + sycl_queue=exec_q, + depends=[take_ev], + ) + host_tasks.append(ht_ev) + ht_ev, one_ev = _full_usm_ndarray( + fill_value=True, dst=unique_mask[0], sycl_queue=exec_q + ) + host_tasks.append(ht_ev) + cumsum = dpt.empty(unique_mask.shape, dtype=dpt.int64) + # synchronizing call + n_uniques = mask_positions( + unique_mask, cumsum, sycl_queue=exec_q, depends=[uneq_ev, one_ev] + ) + if n_uniques == fx.size: + dpctl.SyclEvent.wait_for(host_tasks) + return UniqueInverseResult(s, unsorting_ids) + unique_vals = dpt.empty( + n_uniques, dtype=x.dtype, usm_type=x.usm_type, sycl_queue=exec_q + ) + ht_ev, _ = _extract( + src=s, + cumsum=cumsum, + axis_start=0, + axis_end=1, + dst=unique_vals, + sycl_queue=exec_q, + ) + host_tasks.append(ht_ev) + cum_unique_counts = dpt.empty( + n_uniques + 1, dtype=ind_dt, usm_type=x.usm_type, sycl_queue=exec_q + ) + idx = dpt.empty(x.size, dtype=ind_dt, sycl_queue=exec_q) + ht_ev, id_ev = _linspace_step(start=0, dt=1, dst=idx, sycl_queue=exec_q) + host_tasks.append(ht_ev) + ht_ev, extr_ev = _extract( + src=idx, + cumsum=cumsum, + axis_start=0, + axis_end=1, + dst=cum_unique_counts[:-1], + sycl_queue=exec_q, + depends=[id_ev], + ) + host_tasks.append(ht_ev) + ht_ev, set_ev = _full_usm_ndarray( + x.size, dst=cum_unique_counts[-1], sycl_queue=exec_q + ) + host_tasks.append(ht_ev) + _counts = dpt.empty_like(cum_unique_counts[1:]) + ht_ev, _ = _subtract( + src1=cum_unique_counts[1:], + src2=cum_unique_counts[:-1], + dst=_counts, + sycl_queue=exec_q, + depends=[set_ev, extr_ev], + ) + host_tasks.append(ht_ev) + # TODO: when searchsorted is available, + # inv = searchsorted(unique_vals, fx) + dpctl.SyclEvent.wait_for(host_tasks) + counts = dpt.asnumpy(_counts).tolist() + inv = dpt.empty_like(fx, dtype=ind_dt) + pos = 0 + host_tasks = [] + for i in range(len(counts)): + pos_next = pos + counts[i] + _dst = inv[pos:pos_next] + ht_ev, _ = _full_usm_ndarray(fill_value=i, dst=_dst, sycl_queue=exec_q) + pos = pos_next + host_tasks.append(ht_ev) + dpctl.SyclEvent.wait_for(host_tasks) + return 
UniqueAllResult( + unique_vals, + sorting_ids[cum_unique_counts[:-1]], + inv[unsorting_ids], + _counts, + ) From 709ed1edbc7668a7b5b51e770cab3041b48603f0 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 8 Dec 2023 09:51:13 -0600 Subject: [PATCH 07/34] Exposed set functions to dpctl.tensor namespace --- dpctl/tensor/__init__.py | 10 ++++++++++ dpctl/tensor/_set_functions.py | 10 ++++++++++ 2 files changed, 20 insertions(+) diff --git a/dpctl/tensor/__init__.py b/dpctl/tensor/__init__.py index 15be87588e..81fc152e7a 100644 --- a/dpctl/tensor/__init__.py +++ b/dpctl/tensor/__init__.py @@ -175,6 +175,12 @@ reduce_hypot, sum, ) +from ._set_functions import ( + unique_all, + unique_counts, + unique_inverse, + unique_values, +) from ._sorting import argsort, sort from ._testing import allclose from ._type_utils import can_cast, finfo, iinfo, result_type @@ -346,4 +352,8 @@ "angle", "sort", "argsort", + "unique_all", + "unique_counts", + "unique_inverse", + "unique_values", ] diff --git a/dpctl/tensor/_set_functions.py b/dpctl/tensor/_set_functions.py index c26fbc2848..72eec556ae 100644 --- a/dpctl/tensor/_set_functions.py +++ b/dpctl/tensor/_set_functions.py @@ -15,6 +15,16 @@ ) from ._tensor_sorting_impl import _argsort_ascending, _sort_ascending +__all__ = [ + "unique_values", + "unique_counts", + "unique_inverse", + "unique_all", + "UniqueAllResult", + "UniqueCountsResult", + "UniqueInverseResult", +] + class UniqueAllResult(NamedTuple): values: dpt.usm_ndarray From 5635d852109dab3c3164f794b9d7adfc991ad807 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 8 Dec 2023 09:51:37 -0600 Subject: [PATCH 08/34] Added __all__ for _sorting.py --- dpctl/tensor/_sorting.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dpctl/tensor/_sorting.py b/dpctl/tensor/_sorting.py index ba1590ac32..7158d981c1 100644 --- a/dpctl/tensor/_sorting.py +++ b/dpctl/tensor/_sorting.py @@ -27,6 +27,8 @@ _sort_descending, ) +__all__ = ["sort", "argsort"] + def sort(x, /, *, axis=-1, descending=False, stable=False): """sort(x, axis=-1, descending=False, stable=False) From 88ad12f5747ed0d6b2524ab57b00f1dfb98eb432 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 8 Dec 2023 10:02:32 -0600 Subject: [PATCH 09/34] Expand tests for sorting to improve coverage --- dpctl/tests/test_usm_ndarray_sorting.py | 54 +++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/dpctl/tests/test_usm_ndarray_sorting.py b/dpctl/tests/test_usm_ndarray_sorting.py index f0b57129d9..e582ad2c63 100644 --- a/dpctl/tests/test_usm_ndarray_sorting.py +++ b/dpctl/tests/test_usm_ndarray_sorting.py @@ -126,3 +126,57 @@ def test_argsort_1d(dtype): s1_idx = dpt.argsort(inp, descending=True) assert dpt.all(inp[s1_idx[:-1]] >= inp[s1_idx[1:]]) + + +def test_sort_validation(): + with pytest.raises(TypeError): + dpt.sort(dict()) + + +def test_argsort_validation(): + with pytest.raises(TypeError): + dpt.argsort(dict()) + + +def test_sort_axis0(): + get_queue_or_skip() + + n, m = 200, 30 + xf = dpt.arange(n * m, 0, step=-1, dtype="i4") + x = dpt.reshape(xf, (n, m)) + s = dpt.sort(x, axis=0) + + assert dpt.all(s[:-1, :] <= s[1:, :]) + + +def test_argsort_axis0(): + get_queue_or_skip() + + n, m = 200, 30 + xf = dpt.arange(n * m, 0, step=-1, dtype="i4") + x = dpt.reshape(xf, (n, m)) + idx = dpt.argsort(x, axis=0) + + s = x[idx, dpt.arange(m)[dpt.newaxis, :]] + + assert dpt.all(s[:-1, :] <= s[1:, :]) + + +def test_sort_strided(): + get_queue_or_skip() + + x_orig = dpt.arange(100, dtype="i4") + x_flipped = dpt.flip(x_orig, 
axis=0) + s = dpt.sort(x_flipped) + + assert dpt.all(s == x_orig) + + +def test_argsort_strided(): + get_queue_or_skip() + + x_orig = dpt.arange(100, dtype="i4") + x_flipped = dpt.flip(x_orig, axis=0) + idx = dpt.argsort(x_flipped) + + assert dpt.all(x_flipped[idx] == x_orig) From 734cf77abfe57fdbb8ca577f44548ccb5b2001a5 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 8 Dec 2023 10:24:32 -0600 Subject: [PATCH 10/34] Tests for set functions --- dpctl/tests/test_usm_ndarray_unique.py | 153 +++++++++++++++++++++++++ 1 file changed, 153 insertions(+) create mode 100644 dpctl/tests/test_usm_ndarray_unique.py diff --git a/dpctl/tests/test_usm_ndarray_unique.py b/dpctl/tests/test_usm_ndarray_unique.py new file mode 100644 index 0000000000..012a6091cc --- /dev/null +++ b/dpctl/tests/test_usm_ndarray_unique.py @@ -0,0 +1,153 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +import dpctl.tensor as dpt +from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported + + +@pytest.mark.parametrize( + "dtype", + [ + "i1", + "u1", + "i2", + "u2", + "i4", + "u4", + "i8", + "u8", + "f2", + "f4", + "f8", + "c8", + "c16", + ], +) +def test_unique_values(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n, roll = 10000, 734 + inp = dpt.roll( + dpt.concat((dpt.ones(n, dtype=dtype), dpt.zeros(n, dtype=dtype))), + roll, + ) + + uv = dpt.unique_values(inp) + assert dpt.all(uv == dpt.arange(2, dtype=dtype)) + + +@pytest.mark.parametrize( + "dtype", + [ + "i1", + "u1", + "i2", + "u2", + "i4", + "u4", + "i8", + "u8", + "f2", + "f4", + "f8", + "c8", + "c16", + ], +) +def test_unique_counts(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n, roll = 10000, 734 + inp = dpt.roll( + dpt.concat((dpt.ones(n, dtype=dtype), dpt.zeros(n, dtype=dtype))), + roll, + ) + + uv, uv_counts = dpt.unique_counts(inp) + assert dpt.all(uv == dpt.arange(2, dtype=dtype)) + assert dpt.all(uv_counts == dpt.full(2, n, dtype=uv_counts.dtype)) + + +@pytest.mark.parametrize( + "dtype", + [ + "i1", + "u1", + "i2", + "u2", + "i4", + "u4", + "i8", + "u8", + "f2", + "f4", + "f8", + "c8", + "c16", + ], +) +def test_unique_inverse(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n, roll = 10000, 734 + inp = dpt.roll( + dpt.concat((dpt.ones(n, dtype=dtype), dpt.zeros(n, dtype=dtype))), + roll, + ) + + uv, inv = dpt.unique_inverse(inp) + assert dpt.all(uv == dpt.arange(2, dtype=dtype)) + assert dpt.all(inp == uv[inv]) + + +@pytest.mark.parametrize( + "dtype", + [ + "i1", + "u1", + "i2", + "u2", + "i4", + "u4", + "i8", + "u8", + "f2", + "f4", + "f8", + "c8", + "c16", + ], +) +def test_unique_all(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n, roll = 10000, 734 + inp = dpt.roll( + dpt.concat((dpt.ones(n, dtype=dtype), dpt.zeros(n, dtype=dtype))), + roll, + ) + + uv, ind, inv, uv_counts = dpt.unique_all(inp) + assert 
dpt.all(uv == dpt.arange(2, dtype=dtype)) + assert dpt.all(uv == inp[ind]) + assert dpt.all(inp == uv[inv]) + assert dpt.all(uv_counts == dpt.full(2, n, dtype=uv_counts.dtype)) From 11344b16637a23f65dd9c88410d6f88011752e69 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 8 Dec 2023 14:08:22 -0600 Subject: [PATCH 11/34] Added license header for test file --- dpctl/tests/test_usm_ndarray_sorting.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/dpctl/tests/test_usm_ndarray_sorting.py b/dpctl/tests/test_usm_ndarray_sorting.py index e582ad2c63..ef86d3a561 100644 --- a/dpctl/tests/test_usm_ndarray_sorting.py +++ b/dpctl/tests/test_usm_ndarray_sorting.py @@ -1,3 +1,19 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import pytest import dpctl.tensor as dpt From b1d10064afe68a95b1e9c122c89be5268db9f9e3 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 8 Dec 2023 14:23:39 -0600 Subject: [PATCH 12/34] Do not use shorcut API for copy with dependency --- dpctl/tensor/libtensor/include/kernels/accumulators.hpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/accumulators.hpp b/dpctl/tensor/libtensor/include/kernels/accumulators.hpp index bd6ad20b6d..505bb6d8f1 100644 --- a/dpctl/tensor/libtensor/include/kernels/accumulators.hpp +++ b/dpctl/tensor/libtensor/include/kernels/accumulators.hpp @@ -251,8 +251,10 @@ size_t accumulate_contig_impl(sycl::queue &q, if (last_elem_host_usm == nullptr) { throw std::bad_alloc(); } - sycl::event copy_e = - q.copy(last_elem, last_elem_host_usm, 1, {comp_ev}); + sycl::event copy_e = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(comp_ev); + cgh.copy(last_elem, last_elem_host_usm, 1); + }); copy_e.wait(); size_t return_val = static_cast(*last_elem_host_usm); sycl::free(last_elem_host_usm, q); From 34b50af5bb57a9daa750a2124ea17ab7e4989293 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 8 Dec 2023 17:35:44 -0600 Subject: [PATCH 13/34] Implement set functions using synchronizing ops Use synchronizing operation as implementation in dpctl.tensor Fixed tests. 
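The gist of the synchronizing approach can be expressed with public dpctl.tensor
operations alone. Below is a simplified sketch of unique_values under that assumption
(for brevity it assumes a non-empty input; the actual implementation uses the internal
mask_positions and _extract primitives so that it also learns the number of unique
elements, but the logic is the same):

    import dpctl.tensor as dpt

    def unique_values_sketch(x):
        # flatten, then sort so that equal elements become adjacent
        s = dpt.sort(dpt.reshape(x, (x.size,)))
        # an element opens a new group when it differs from its predecessor
        mask = dpt.empty(s.shape, dtype="?", sycl_queue=s.sycl_queue)
        mask[0] = True
        dpt.not_equal(s[:-1], s[1:], out=mask[1:])
        # keep only the group leaders; boolean extraction is synchronizing
        return s[mask]
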
--- dpctl/tensor/__init__.py | 2 +- dpctl/tensor/_set_functions.py | 16 ++ dpctl/tensor/_set_functions_sync.py | 351 ++++++++++++++++++++++++ dpctl/tests/test_usm_ndarray_sorting.py | 3 +- 4 files changed, 370 insertions(+), 2 deletions(-) create mode 100644 dpctl/tensor/_set_functions_sync.py diff --git a/dpctl/tensor/__init__.py b/dpctl/tensor/__init__.py index 81fc152e7a..fb41c1240a 100644 --- a/dpctl/tensor/__init__.py +++ b/dpctl/tensor/__init__.py @@ -175,7 +175,7 @@ reduce_hypot, sum, ) -from ._set_functions import ( +from ._set_functions_sync import ( unique_all, unique_counts, unique_inverse, diff --git a/dpctl/tensor/_set_functions.py b/dpctl/tensor/_set_functions.py index 72eec556ae..631a12a592 100644 --- a/dpctl/tensor/_set_functions.py +++ b/dpctl/tensor/_set_functions.py @@ -1,3 +1,19 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from typing import NamedTuple import dpctl diff --git a/dpctl/tensor/_set_functions_sync.py b/dpctl/tensor/_set_functions_sync.py new file mode 100644 index 0000000000..7f16373c78 --- /dev/null +++ b/dpctl/tensor/_set_functions_sync.py @@ -0,0 +1,351 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import NamedTuple + +import dpctl.tensor as dpt + +from ._tensor_impl import ( + _extract, + _full_usm_ndarray, + _linspace_step, + default_device_index_type, + mask_positions, +) + +__all__ = [ + "unique_values", + "unique_counts", + "unique_inverse", + "unique_all", + "UniqueAllResult", + "UniqueCountsResult", + "UniqueInverseResult", +] + + +class UniqueAllResult(NamedTuple): + values: dpt.usm_ndarray + indices: dpt.usm_ndarray + inverse_indices: dpt.usm_ndarray + counts: dpt.usm_ndarray + + +class UniqueCountsResult(NamedTuple): + values: dpt.usm_ndarray + counts: dpt.usm_ndarray + + +class UniqueInverseResult(NamedTuple): + values: dpt.usm_ndarray + inverse_indices: dpt.usm_ndarray + + +def unique_values(x: dpt.usm_ndarray) -> dpt.usm_ndarray: + """unique_values(x) + + Returns the unique elements of an input array x. + + Args: + x (usm_ndarray): + input array. The input with more than one dimension is flattened. + Returns: + usm_ndarray + an array containing the set of unique elements in `x`. The + returned array has the same data type as `x`. 
+ """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") + array_api_dev = x.device + exec_q = array_api_dev.sycl_queue + if x.ndim == 1: + fx = x + else: + fx = dpt.reshape(x, (x.size,), order="C", copy=False) + s = dpt.sort(fx) + unique_mask = dpt.empty(fx.shape, dtype="?", sycl_queue=exec_q) + dpt.not_equal(s[:-1], s[1:], out=unique_mask[1:]) + unique_mask[0] = True + cumsum = dpt.empty(s.shape, dtype=dpt.int64) + n_uniques = mask_positions(unique_mask, cumsum, sycl_queue=exec_q) + if n_uniques == fx.size: + return s + unique_vals = dpt.empty( + n_uniques, dtype=x.dtype, usm_type=x.usm_type, sycl_queue=exec_q + ) + ht_ev, _ = _extract( + src=s, + cumsum=cumsum, + axis_start=0, + axis_end=1, + dst=unique_vals, + sycl_queue=exec_q, + ) + ht_ev.wait() + return unique_vals + + +def unique_counts(x: dpt.usm_ndarray) -> UniqueCountsResult: + """unique_counts(x) + + Returns the unique elements of an input array `x` and the corresponding + counts for each unique element in `x`. + + Args: + x (usm_ndarray): + input array. The input with more than one dimension is flattened. + Returns: + tuple[usm_ndarray, usm_ndarray] + a namedtuple `(values, counts)` whose + + * first element is the field name `values` and is an array + containing the unique elements of `x`. This array has the + same data type as `x`. + * second element has the field name `counts` and is an array + containing the number of times each unique element occurs in `x`. + This array has the same shape as `values` and has the default + array index data type. + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") + array_api_dev = x.device + exec_q = array_api_dev.sycl_queue + if x.ndim == 1: + fx = x + else: + fx = dpt.reshape(x, (x.size,), order="C", copy=False) + s = dpt.sort(x) + unique_mask = dpt.empty(s.shape, dtype="?", sycl_queue=exec_q) + dpt.not_equal(s[:-1], s[1:], out=unique_mask[1:]) + unique_mask[0] = True + ind_dt = default_device_index_type(exec_q) + cumsum = dpt.empty(unique_mask.shape, dtype=dpt.int64) + # synchronizing call + n_uniques = mask_positions(unique_mask, cumsum, sycl_queue=exec_q) + if n_uniques == fx.size: + return UniqueCountsResult(s, dpt.ones(n_uniques, dtype=ind_dt)) + unique_vals = dpt.empty( + n_uniques, dtype=x.dtype, usm_type=x.usm_type, sycl_queue=exec_q + ) + # populate unique values + ht_ev, _ = _extract( + src=s, + cumsum=cumsum, + axis_start=0, + axis_end=1, + dst=unique_vals, + sycl_queue=exec_q, + ) + ht_ev.wait() + unique_counts = dpt.empty( + n_uniques + 1, dtype=ind_dt, usm_type=x.usm_type, sycl_queue=exec_q + ) + idx = dpt.arange(x.size, dtype=ind_dt, sycl_queue=exec_q) + ht_ev, _ = _extract( + src=idx, + cumsum=cumsum, + axis_start=0, + axis_end=1, + dst=unique_counts[:-1], + sycl_queue=exec_q, + ) + unique_counts[-1] = fx.size + ht_ev.wait() + _counts = dpt.empty_like(unique_counts[1:]) + dpt.subtract(unique_counts[1:], unique_counts[:-1], out=_counts) + return UniqueCountsResult(unique_vals, _counts) + + +def unique_inverse(x): + """unique_inverse + + Returns the unique elements of an input array x and the indices from the + set of unique elements that reconstruct `x`. + + Args: + x (usm_ndarray): + input array. The input with more than one dimension is flattened. + Returns: + tuple[usm_ndarray, usm_ndarray] + a namedtuple `(values, inverse_indices)` whose + + * first element has the field name `values` and is an array + containing the unique elements of `x`. 
The array has the same + data type as `x`. + * second element has the field name `inverse_indices` and is an + array containing the indices of values that reconstruct `x`. + The array has the same shape as `x` and has the default array + index data type. + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") + array_api_dev = x.device + exec_q = array_api_dev.sycl_queue + if x.ndim == 1: + fx = x + else: + fx = dpt.reshape(x, (x.size,), order="C", copy=False) + ind_dt = default_device_index_type(exec_q) + sorting_ids = dpt.argsort(fx) + unsorting_ids = dpt.argsort(sorting_ids) + s = fx[sorting_ids] + unique_mask = dpt.empty(fx.shape, dtype="?", sycl_queue=exec_q) + unique_mask[0] = True + dpt.not_equal(s[:-1], s[1:], out=unique_mask[1:]) + cumsum = dpt.empty(unique_mask.shape, dtype=dpt.int64) + # synchronizing call + n_uniques = mask_positions(unique_mask, cumsum, sycl_queue=exec_q) + if n_uniques == fx.size: + return UniqueInverseResult(s, unsorting_ids) + unique_vals = dpt.empty( + n_uniques, dtype=x.dtype, usm_type=x.usm_type, sycl_queue=exec_q + ) + ht_ev, _ = _extract( + src=s, + cumsum=cumsum, + axis_start=0, + axis_end=1, + dst=unique_vals, + sycl_queue=exec_q, + ) + ht_ev.wait() + cum_unique_counts = dpt.empty( + n_uniques + 1, dtype=ind_dt, usm_type=x.usm_type, sycl_queue=exec_q + ) + idx = dpt.empty(x.size, dtype=ind_dt, sycl_queue=exec_q) + ht_ev, id_ev = _linspace_step(start=0, dt=1, dst=idx, sycl_queue=exec_q) + ht_ev.wait() + ht_ev, _ = _extract( + src=idx, + cumsum=cumsum, + axis_start=0, + axis_end=1, + dst=cum_unique_counts[:-1], + sycl_queue=exec_q, + ) + ht_ev.wait() + cum_unique_counts[-1] = fx.size + _counts = dpt.subtract(cum_unique_counts[1:], cum_unique_counts[:-1]) + # TODO: when searchsorted is available, + # inv = searchsorted(unique_vals, fx) + counts = dpt.asnumpy(_counts).tolist() + inv = dpt.empty_like(fx, dtype=ind_dt) + pos = 0 + for i in range(len(counts)): + pos_next = pos + counts[i] + _dst = inv[pos:pos_next] + ht_ev, _ = _full_usm_ndarray(fill_value=i, dst=_dst, sycl_queue=exec_q) + ht_ev.wait() + pos = pos_next + return UniqueInverseResult(unique_vals, inv[unsorting_ids]) + + +def unique_all(x: dpt.usm_ndarray) -> UniqueAllResult: + """unique_all(x) + + Returns the unique elements of an input array `x`, the first occurring + indices for each unique element in `x`, the indices from the set of unique + elements that reconstruct `x`, and the corresponding counts for each + unique element in `x`. + + Args: + x (usm_ndarray): + input array. The input with more than one dimension is flattened. + Returns: + tuple[usm_ndarray, usm_ndarray, usm_ndarray, usm_ndarray] + a namedtuple `(values, indices, inverse_indices, counts)` whose + + * first element has the field name `values` and is an array + containing the unique elements of `x`. The array has the same + data type as `x`. + * second element has the field name `indices` and is an array + the indices (of first occurrences) of `x` that result in + `values`. The array has the same shape as `values` and has the + default array index data type. + * third element has the field name `inverse_indices` and is an + array containing the indices of values that reconstruct `x`. + The array has the same shape as `x` and has the default array + index data type. + * fourth element has the field name `counts` and is an array + containing the number of times each unique element occurs in `x`. 
+ This array has the same shape as `values` and has the default + array index data type. + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") + array_api_dev = x.device + exec_q = array_api_dev.sycl_queue + if x.ndim == 1: + fx = x + else: + fx = dpt.reshape(x, (x.size,), order="C", copy=False) + ind_dt = default_device_index_type(exec_q) + sorting_ids = dpt.argsort(fx) + unsorting_ids = dpt.argsort(sorting_ids) + s = fx[sorting_ids] + unique_mask = dpt.empty(fx.shape, dtype="?", sycl_queue=exec_q) + dpt.not_equal(s[:-1], s[1:], out=unique_mask[1:]) + unique_mask[0] = True + cumsum = dpt.empty(unique_mask.shape, dtype=dpt.int64) + # synchronizing call + n_uniques = mask_positions(unique_mask, cumsum, sycl_queue=exec_q) + if n_uniques == fx.size: + return UniqueInverseResult(s, unsorting_ids) + unique_vals = dpt.empty( + n_uniques, dtype=x.dtype, usm_type=x.usm_type, sycl_queue=exec_q + ) + ht_ev, _ = _extract( + src=s, + cumsum=cumsum, + axis_start=0, + axis_end=1, + dst=unique_vals, + sycl_queue=exec_q, + ) + ht_ev.wait() + cum_unique_counts = dpt.empty( + n_uniques + 1, dtype=ind_dt, usm_type=x.usm_type, sycl_queue=exec_q + ) + idx = dpt.arange(fx.size, dtype=ind_dt, sycl_queue=exec_q) + ht_ev, extr_ev = _extract( + src=idx, + cumsum=cumsum, + axis_start=0, + axis_end=1, + dst=cum_unique_counts[:-1], + sycl_queue=exec_q, + ) + ht_ev.wait() + cum_unique_counts[-1] = fx.size + _counts = cum_unique_counts[1:] - cum_unique_counts[:-1] + # TODO: when searchsorted is available, + # inv = searchsorted(unique_vals, fx) + counts = dpt.asnumpy(_counts).tolist() + inv = dpt.empty_like(fx, dtype=ind_dt) + pos = 0 + for i in range(len(counts)): + pos_next = pos + counts[i] + _dst = inv[pos:pos_next] + ht_ev, _ = _full_usm_ndarray(fill_value=i, dst=_dst, sycl_queue=exec_q) + ht_ev.wait() + pos = pos_next + return UniqueAllResult( + unique_vals, + sorting_ids[cum_unique_counts[:-1]], + inv[unsorting_ids], + _counts, + ) diff --git a/dpctl/tests/test_usm_ndarray_sorting.py b/dpctl/tests/test_usm_ndarray_sorting.py index ef86d3a561..84cf760a0c 100644 --- a/dpctl/tests/test_usm_ndarray_sorting.py +++ b/dpctl/tests/test_usm_ndarray_sorting.py @@ -173,7 +173,8 @@ def test_argsort_axis0(): x = dpt.reshape(xf, (n, m)) idx = dpt.argsort(x, axis=0) - s = x[idx, dpt.arange(m)[dpt.newaxis, :]] + conseq_idx = dpt.arange(m, dtype=idx.dtype) + s = x[idx, conseq_idx[dpt.newaxis, :]] assert dpt.all(s[:-1, :] <= s[1:, :]) From 1c7efc7b4b63b3066cde7d9ebdc26d95f1f318e3 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 8 Dec 2023 22:35:33 -0600 Subject: [PATCH 14/34] Renamed unused _set_functions.py and added it to omit list for coverage --- dpctl/tensor/{_set_functions.py => _set_functions_debug.py} | 0 pyproject.toml | 2 ++ 2 files changed, 2 insertions(+) rename dpctl/tensor/{_set_functions.py => _set_functions_debug.py} (100%) diff --git a/dpctl/tensor/_set_functions.py b/dpctl/tensor/_set_functions_debug.py similarity index 100% rename from dpctl/tensor/_set_functions.py rename to dpctl/tensor/_set_functions_debug.py diff --git a/pyproject.toml b/pyproject.toml index 5882128dab..074b27257d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,6 +22,7 @@ source = [ omit = [ "dpctl/tests/*", "dpctl/_version.py", + "dpctl/tensor/_set_functions_debug.py", "*/_cython_api*/stringsource", ] @@ -29,6 +30,7 @@ omit = [ omit = [ "dpctl/tests/*", "dpctl/_version.py", + "dpctl/tensor/_set_functions_debug.py", "*/_cython_api*/stringsource", ] From 
fad8470f1cbe1564a0cceef70f89ef97239ae70e Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Sat, 9 Dec 2023 06:48:36 -0600 Subject: [PATCH 15/34] Use sequential sorting for small arrays --- .../libtensor/include/kernels/sorting.hpp | 61 +++++++++++-------- 1 file changed, 37 insertions(+), 24 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/sorting.hpp b/dpctl/tensor/libtensor/include/kernels/sorting.hpp index 02c9a551f9..e7a024259e 100644 --- a/dpctl/tensor/libtensor/include/kernels/sorting.hpp +++ b/dpctl/tensor/libtensor/include/kernels/sorting.hpp @@ -384,12 +384,12 @@ class sort_base_step_contig_krn; template sycl::event sort_base_step_contig_impl(sycl::queue &q, - size_t iter_nelems, - size_t sort_nelems, + const size_t iter_nelems, + const size_t sort_nelems, const InpAcc input, OutAcc output, const Comp &comp, - size_t &conseq_nelems_sorted, + const size_t conseq_nelems_sorted, const std::vector &depends = {}) { @@ -397,15 +397,13 @@ sort_base_step_contig_impl(sycl::queue &q, using outT = typename GetValueType::value_type; using KernelName = sort_base_step_contig_krn; - conseq_nelems_sorted = (q.get_device().has(sycl::aspect::cpu) ? 16 : 4); - - size_t n_segments = + const size_t n_segments = quotient_ceil(sort_nelems, conseq_nelems_sorted); sycl::event base_sort = q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); - sycl::range<1> gRange{iter_nelems * n_segments}; + const sycl::range<1> gRange{iter_nelems * n_segments}; auto input_acc = GetReadOnlyAccess{}(input, cgh); auto output_acc = GetWriteDiscardAccess{}(output, cgh); @@ -478,7 +476,8 @@ sort_over_work_group_contig_impl(sycl::queue &q, nelems_wg_sorts = elems_per_wi * lws; if (nelems_wg_sorts > nelems_per_slm) { - nelems_wg_sorts = 0; + nelems_wg_sorts = (q.get_device().has(sycl::aspect::cpu) ? 16 : 4); + return sort_base_step_contig_impl( q, iter_nelems, sort_nelems, input, output, comp, nelems_wg_sorts, depends); @@ -781,24 +780,38 @@ sycl::event stable_sort_axis1_contig_impl( auto comp = Comp{}; - static constexpr size_t determine_automatically = 0; - size_t sorted_block_size = - (sort_nelems >= 512) ? 
512 : determine_automatically; + constexpr size_t sequential_sorting_threshold = 64; - // Sort segments of the array - sycl::event base_sort_ev = sort_detail::sort_over_work_group_contig_impl< - const argTy *, argTy *, Comp>( - exec_q, iter_nelems, sort_nelems, arg_tp, res_tp, comp, - sorted_block_size, // modified in place with size of sorted block size - depends); - - // Merge segments in parallel until all elements are sorted - sycl::event merges_ev = - sort_detail::merge_sorted_block_contig_impl( - exec_q, iter_nelems, sort_nelems, res_tp, comp, sorted_block_size, - {base_sort_ev}); + if (sort_nelems < sequential_sorting_threshold) { + // equal work-item sorts entire row + sycl::event sequential_sorting_ev = + sort_detail::sort_base_step_contig_impl( + exec_q, iter_nelems, sort_nelems, arg_tp, res_tp, comp, + sort_nelems, depends); - return merges_ev; + return sequential_sorting_ev; + } + else { + size_t sorted_block_size{}; + + // Sort segments of the array + sycl::event base_sort_ev = + sort_detail::sort_over_work_group_contig_impl( + exec_q, iter_nelems, sort_nelems, arg_tp, res_tp, comp, + sorted_block_size, // modified in place with size of sorted + // block size + depends); + + // Merge segments in parallel until all elements are sorted + sycl::event merges_ev = + sort_detail::merge_sorted_block_contig_impl( + exec_q, iter_nelems, sort_nelems, res_tp, comp, + sorted_block_size, {base_sort_ev}); + + return merges_ev; + } } template From fa3d8fec9d704f3ceed4a11839b6e01b4ab066f3 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Mon, 11 Dec 2023 13:46:04 -0600 Subject: [PATCH 16/34] Do not use shorthand queue::copy --- dpctl/tensor/libtensor/include/kernels/accumulators.hpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/accumulators.hpp b/dpctl/tensor/libtensor/include/kernels/accumulators.hpp index 505bb6d8f1..327f1eaca0 100644 --- a/dpctl/tensor/libtensor/include/kernels/accumulators.hpp +++ b/dpctl/tensor/libtensor/include/kernels/accumulators.hpp @@ -339,8 +339,10 @@ size_t accumulate_strided_impl(sycl::queue &q, if (last_elem_host_usm == nullptr) { throw std::bad_alloc(); } - sycl::event copy_e = - q.copy(last_elem, last_elem_host_usm, 1, {comp_ev}); + sycl::event copy_e = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(comp_ev); + cgh.copy(last_elem, last_elem_host_usm, 1); + }); copy_e.wait(); size_t return_val = static_cast(*last_elem_host_usm); sycl::free(last_elem_host_usm, q); From abfc28000786ede7e697c924327cbc3be9571967 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Mon, 11 Dec 2023 13:46:49 -0600 Subject: [PATCH 17/34] Do not use queue::fill convenience shortcut functions --- .../libtensor/include/kernels/boolean_reductions.hpp | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/boolean_reductions.hpp b/dpctl/tensor/libtensor/include/kernels/boolean_reductions.hpp index 61fb0f6ba0..877680c8bf 100644 --- a/dpctl/tensor/libtensor/include/kernels/boolean_reductions.hpp +++ b/dpctl/tensor/libtensor/include/kernels/boolean_reductions.hpp @@ -315,8 +315,10 @@ boolean_reduction_axis1_contig_impl(sycl::queue &exec_q, }); } else { - sycl::event init_ev = exec_q.fill(res_tp, resTy(identity_val), - iter_nelems, depends); + sycl::event init_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.fill(res_tp, resTy(identity_val), iter_nelems); + }); red_ev = exec_q.submit([&](sycl::handler &cgh) { 
cgh.depends_on(init_ev); @@ -484,8 +486,10 @@ boolean_reduction_axis0_contig_impl(sycl::queue &exec_q, size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); { - sycl::event init_ev = exec_q.fill(res_tp, resTy(identity_val), - iter_nelems, depends); + sycl::event init_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.fill(res_tp, resTy(identity_val), iter_nelems); + }); sycl::event red_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(init_ev); From 5e0e5eb2456a6d063f21558223ea62cb6151f11f Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Mon, 11 Dec 2023 14:58:06 -0600 Subject: [PATCH 18/34] Accumulate implementations API acquired host_task_events argument This is `std::vector` passed by reference to collect events associated with host_task submissions. The synchronizing call `mask_positions` is releasing GIL before wait on these events is called. It is either this, or accumulated host tasks must be returned to the user. --- .../include/kernels/accumulators.hpp | 24 ++++++---- .../tensor/libtensor/source/accumulators.cpp | 44 ++++++++++++++----- 2 files changed, 49 insertions(+), 19 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/accumulators.hpp b/dpctl/tensor/libtensor/include/kernels/accumulators.hpp index 327f1eaca0..da461d56b0 100644 --- a/dpctl/tensor/libtensor/include/kernels/accumulators.hpp +++ b/dpctl/tensor/libtensor/include/kernels/accumulators.hpp @@ -112,7 +112,8 @@ sycl::event inclusive_scan_rec(sycl::queue &exec_q, size_t s1, IndexerT indexer, TransformerT transformer, - std::vector const &depends = {}) + std::vector &host_tasks, + const std::vector &depends = {}) { size_t n_groups = ceiling_quotient(n_elems, n_wi * wg_size); @@ -188,7 +189,7 @@ sycl::event inclusive_scan_rec(sycl::queue &exec_q, auto e2 = inclusive_scan_rec( exec_q, n_groups - 1, wg_size, output, temp, chunk_size - 1, - chunk_size, _no_op_indexer, _no_op_transformer, + chunk_size, _no_op_indexer, _no_op_transformer, host_tasks, {inc_scan_phase1_ev}); // output[ chunk_size * (i + 1) + j] += temp[i] @@ -209,8 +210,9 @@ sycl::event inclusive_scan_rec(sycl::queue &exec_q, const auto &ctx = exec_q.get_context(); cgh.host_task([ctx, temp]() { sycl::free(temp, ctx); }); }); + host_tasks.push_back(e4); - out_event = std::move(e4); + out_event = std::move(e3); } return out_event; @@ -221,14 +223,16 @@ typedef size_t (*accumulate_contig_impl_fn_ptr_t)( size_t, const char *, char *, - std::vector const &); + std::vector &, + const std::vector &); template size_t accumulate_contig_impl(sycl::queue &q, size_t n_elems, const char *mask, char *cumsum, - std::vector const &depends = {}) + std::vector &host_tasks, + const std::vector &depends = {}) { constexpr int n_wi = 8; const maskT *mask_data_ptr = reinterpret_cast(mask); @@ -242,7 +246,7 @@ size_t accumulate_contig_impl(sycl::queue &q, inclusive_scan_rec( q, n_elems, wg_size, mask_data_ptr, cumsum_data_ptr, 0, 1, - flat_indexer, non_zero_indicator, depends); + flat_indexer, non_zero_indicator, host_tasks, depends); cumsumT *last_elem = cumsum_data_ptr + (n_elems - 1); @@ -307,7 +311,8 @@ typedef size_t (*accumulate_strided_impl_fn_ptr_t)( int, const py::ssize_t *, char *, - std::vector const &); + std::vector &, + const std::vector &); template size_t accumulate_strided_impl(sycl::queue &q, @@ -316,7 +321,8 @@ size_t accumulate_strided_impl(sycl::queue &q, int nd, const py::ssize_t *shape_strides, char *cumsum, - std::vector const &depends = {}) + std::vector &host_tasks, + const std::vector 
&depends = {}) { constexpr int n_wi = 8; const maskT *mask_data_ptr = reinterpret_cast(mask); @@ -330,7 +336,7 @@ size_t accumulate_strided_impl(sycl::queue &q, inclusive_scan_rec( q, n_elems, wg_size, mask_data_ptr, cumsum_data_ptr, 0, 1, - strided_indexer, non_zero_indicator, depends); + strided_indexer, non_zero_indicator, host_tasks, depends); cumsumT *last_elem = cumsum_data_ptr + (n_elems - 1); diff --git a/dpctl/tensor/libtensor/source/accumulators.cpp b/dpctl/tensor/libtensor/source/accumulators.cpp index 0a2ce69f69..094aa4c284 100644 --- a/dpctl/tensor/libtensor/source/accumulators.cpp +++ b/dpctl/tensor/libtensor/source/accumulators.cpp @@ -150,12 +150,20 @@ size_t py_mask_positions(const dpctl::tensor::usm_ndarray &mask, const bool use_i32 = (cumsum_typeid == int32_typeid); + std::vector host_task_events; + if (mask.is_c_contiguous()) { auto fn = (use_i32) ? mask_positions_contig_i32_dispatch_vector[mask_typeid] : mask_positions_contig_i64_dispatch_vector[mask_typeid]; - return fn(exec_q, mask_size, mask_data, cumsum_data, depends); + size_t total_set = fn(exec_q, mask_size, mask_data, cumsum_data, + host_task_events, depends); + { + py::gil_scoped_release release; + sycl::event::wait(host_task_events); + } + return total_set; } const py::ssize_t *shape = mask.get_shape_raw(); @@ -175,7 +183,6 @@ size_t py_mask_positions(const dpctl::tensor::usm_ndarray &mask, auto strided_fn = (use_i32) ? mask_positions_strided_i32_dispatch_vector[mask_typeid] : mask_positions_strided_i64_dispatch_vector[mask_typeid]; - std::vector host_task_events; using dpctl::tensor::offset_utils::device_allocate_and_pack; const auto &ptr_size_event_tuple = device_allocate_and_pack( @@ -189,7 +196,10 @@ size_t py_mask_positions(const dpctl::tensor::usm_ndarray &mask, if (2 * static_cast(nd) != std::get<1>(ptr_size_event_tuple)) { copy_shape_ev.wait(); - sycl::event::wait(host_task_events); + { + py::gil_scoped_release release; + sycl::event::wait(host_task_events); + } sycl::free(shape_strides, exec_q); throw std::runtime_error("Unexpected error"); } @@ -200,10 +210,14 @@ size_t py_mask_positions(const dpctl::tensor::usm_ndarray &mask, dependent_events.insert(dependent_events.end(), depends.begin(), depends.end()); - size_t total_set = strided_fn(exec_q, mask_size, mask_data, nd, - shape_strides, cumsum_data, dependent_events); + size_t total_set = + strided_fn(exec_q, mask_size, mask_data, nd, shape_strides, cumsum_data, + host_task_events, dependent_events); - sycl::event::wait(host_task_events); + { + py::gil_scoped_release release; + sycl::event::wait(host_task_events); + } sycl::free(shape_strides, exec_q); return total_set; @@ -283,6 +297,8 @@ size_t py_cumsum_1d(const dpctl::tensor::usm_ndarray &src, "Cumulative sum array must have int64 data-type."); } + std::vector host_task_events; + if (src.is_c_contiguous()) { auto fn = cumsum_1d_contig_dispatch_vector[src_typeid]; if (fn == nullptr) { @@ -290,7 +306,13 @@ size_t py_cumsum_1d(const dpctl::tensor::usm_ndarray &src, "this cumsum requires integer type, got src_typeid=" + std::to_string(src_typeid)); } - return fn(exec_q, src_size, src_data, cumsum_data, depends); + size_t total = fn(exec_q, src_size, src_data, cumsum_data, + host_task_events, depends); + { + py::gil_scoped_release release; + sycl::event::wait(host_task_events); + } + return total; } const py::ssize_t *shape = src.get_shape_raw(); @@ -313,7 +335,6 @@ size_t py_cumsum_1d(const dpctl::tensor::usm_ndarray &src, "this cumsum requires integer type, got src_typeid=" + 
std::to_string(src_typeid)); } - std::vector host_task_events; using dpctl::tensor::offset_utils::device_allocate_and_pack; const auto &ptr_size_event_tuple = device_allocate_and_pack( @@ -339,9 +360,12 @@ size_t py_cumsum_1d(const dpctl::tensor::usm_ndarray &src, depends.end()); size_t total = strided_fn(exec_q, src_size, src_data, nd, shape_strides, - cumsum_data, dependent_events); + cumsum_data, host_task_events, dependent_events); - sycl::event::wait(host_task_events); + { + py::gil_scoped_release release; + sycl::event::wait(host_task_events); + } sycl::free(shape_strides, exec_q); return total; From 6abb9fe1592739515e0e8577158c3ee691cef892 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 15 Dec 2023 09:43:24 -0600 Subject: [PATCH 19/34] Changed inclusive sum implementation from recursive to iterative MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Changed hyperparameter choices to be different for CPU and GPU, resulting in 20% performance gain on GPU. The non-recursive implementation allows to avoid repeated USM allocations, resulting in performance gains for large arrays. Furthermore, corrected base step kernel to accumulate in outputT rather than in size_t, which additionally realizes savings when int32 is used as accumulator type. Using example from gh-1249, previously, on my Iris Xe laptop: ``` In [1]: import dpctl.tensor as dpt ...: ag = dpt.ones((8192, 8192), device='gpu', dtype='f4') ...: bg = dpt.ones((8192, 8192), device='gpu', dtype=bool) In [2]: cg = ag[bg] In [3]: dpt.all(cg == dpt.reshape(ag, -1)) Out[3]: usm_ndarray(True) In [4]: %timeit -n 10 -r 3 cg = ag[bg] 212 ms ± 56 ms per loop (mean ± std. dev. of 3 runs, 10 loops each) ``` while with this change: ``` In [4]: %timeit -n 10 -r 3 cg = ag[bg] 178 ms ± 24.2 ms per loop (mean ± std. dev. 
of 3 runs, 10 loops each) ``` --- .../include/kernels/accumulators.hpp | 375 ++++++++++++------ 1 file changed, 258 insertions(+), 117 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/accumulators.hpp b/dpctl/tensor/libtensor/include/kernels/accumulators.hpp index da461d56b0..491fb12126 100644 --- a/dpctl/tensor/libtensor/include/kernels/accumulators.hpp +++ b/dpctl/tensor/libtensor/include/kernels/accumulators.hpp @@ -51,23 +51,6 @@ template T ceiling_quotient(T n, T m) { return (n + m - 1) / m; } -template T1 ceiling_quotient(T1 n, T2 m) -{ - return ceiling_quotient(n, static_cast(m)); -} - -template -class inclusive_scan_rec_local_scan_krn; - -template -class inclusive_scan_rec_chunk_update_krn; template struct NonZeroIndicator { @@ -93,129 +76,261 @@ template struct NoOpTransformer } }; -/* - * for integer type maskT, - * output[j] = sum( input[s0 + i * s1], 0 <= i <= j) - * for 0 <= j < n_elems - */ +// Iterative cumulative summation + +using nwiT = std::uint16_t; + template -sycl::event inclusive_scan_rec(sycl::queue &exec_q, - size_t n_elems, - size_t wg_size, - const inputT *input, - outputT *output, - size_t s0, - size_t s1, - IndexerT indexer, - TransformerT transformer, - std::vector &host_tasks, - const std::vector &depends = {}) +class inclusive_scan_iter_local_scan_krn; + +template +class inclusive_scan_iter_chunk_update_krn; + +template +sycl::event +inclusive_scan_base_step(sycl::queue &exec_q, + const size_t wg_size, + const size_t n_elems, + const inputT *input, + outputT *output, + const size_t s0, + const size_t s1, + IndexerT indexer, + TransformerT transformer, + size_t &n_groups, + const std::vector &depends = {}) { - size_t n_groups = ceiling_quotient(n_elems, n_wi * wg_size); + n_groups = ceiling_quotient(n_elems, n_wi * wg_size); - const sycl::event &inc_scan_phase1_ev = - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); + sycl::event inc_scan_phase1_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); - using slmT = sycl::local_accessor; + using slmT = sycl::local_accessor; - auto lws = sycl::range<1>(wg_size); - auto gws = sycl::range<1>(n_groups * wg_size); + auto lws = sycl::range<1>(wg_size); + auto gws = sycl::range<1>(n_groups * wg_size); - auto ndRange = sycl::nd_range<1>(gws, lws); + auto ndRange = sycl::nd_range<1>(gws, lws); - slmT slm_iscan_tmp(lws, cgh); + slmT slm_iscan_tmp(lws, cgh); - using KernelName = inclusive_scan_rec_local_scan_krn< - inputT, outputT, n_wi, IndexerT, decltype(transformer)>; + using KernelName = + inclusive_scan_iter_local_scan_krn; - cgh.parallel_for(ndRange, [=, slm_iscan_tmp = std::move( - slm_iscan_tmp)]( - sycl::nd_item<1> it) { - auto chunk_gid = it.get_global_id(0); - auto lid = it.get_local_id(0); + cgh.parallel_for(ndRange, [=, slm_iscan_tmp = + std::move(slm_iscan_tmp)]( + sycl::nd_item<1> it) { + auto chunk_gid = it.get_global_id(0); + auto lid = it.get_local_id(0); - std::array local_isum; + std::array local_isum; - size_t i = chunk_gid * n_wi; - for (size_t m_wi = 0; m_wi < n_wi; ++m_wi) { - constexpr outputT out_zero(0); + size_t i = chunk_gid * n_wi; + +#pragma unroll + for (nwiT m_wi = 0; m_wi < n_wi; ++m_wi) { + constexpr outputT out_zero(0); - local_isum[m_wi] = - (i + m_wi < n_elems) - ? transformer(input[indexer(s0 + s1 * (i + m_wi))]) - : out_zero; - } + local_isum[m_wi] = + (i + m_wi < n_elems) + ? 
transformer(input[indexer(s0 + s1 * (i + m_wi))]) + : out_zero; + } #pragma unroll - for (size_t m_wi = 1; m_wi < n_wi; ++m_wi) { - local_isum[m_wi] += local_isum[m_wi - 1]; - } - // local_isum is now result of - // inclusive scan of locally stored inputs + for (nwiT m_wi = 1; m_wi < n_wi; ++m_wi) { + local_isum[m_wi] += local_isum[m_wi - 1]; + } + // local_isum is now result of + // inclusive scan of locally stored inputs - size_t wg_iscan_val = sycl::inclusive_scan_over_group( - it.get_group(), local_isum.back(), sycl::plus(), - size_t(0)); + outputT wg_iscan_val = sycl::inclusive_scan_over_group( + it.get_group(), local_isum.back(), sycl::plus(), + outputT(0)); - slm_iscan_tmp[(lid + 1) % wg_size] = wg_iscan_val; - it.barrier(sycl::access::fence_space::local_space); - size_t addand = (lid == 0) ? 0 : slm_iscan_tmp[lid]; + slm_iscan_tmp[(lid + 1) % wg_size] = wg_iscan_val; + it.barrier(sycl::access::fence_space::local_space); + outputT addand = (lid == 0) ? outputT(0) : slm_iscan_tmp[lid]; #pragma unroll - for (size_t m_wi = 0; m_wi < n_wi; ++m_wi) { - local_isum[m_wi] += addand; - } - - for (size_t m_wi = 0; m_wi < n_wi && i + m_wi < n_elems; ++m_wi) - { - output[i + m_wi] = local_isum[m_wi]; - } - }); + for (nwiT m_wi = 0; m_wi < n_wi; ++m_wi) { + local_isum[m_wi] += addand; + } + + for (nwiT m_wi = 0; (m_wi < n_wi) && (i + m_wi < n_elems); ++m_wi) { + output[i + m_wi] = local_isum[m_wi]; + } }); + }); - sycl::event out_event = inc_scan_phase1_ev; - if (n_groups > 1) { - outputT *temp = sycl::malloc_device(n_groups - 1, exec_q); + return inc_scan_phase1_ev; +} + +namespace +{ +template class stack_t +{ + T *src_; + size_t size_; + T *local_scans_; + +public: + stack_t() : src_{}, size_{}, local_scans_{} {} + stack_t(T *src, size_t sz, T *local_scans) + : src_(src), size_(sz), local_scans_(local_scans) + { + } + ~stack_t(){}; + + T *get_src_ptr() const + { + return src_; + } + + const T *get_src_const_ptr() const + { + return src_; + } + + size_t get_size() const + { + return size_; + } + + T *get_local_scans_ptr() const + { + return local_scans_; + } +}; +} // end of anonymous namespace + +/* + * for integer type maskT, + * output[j] = sum( input[s0 + i * s1], 0 <= i <= j) + * for 0 <= j < n_elems + */ +template +sycl::event inclusive_scan_iter(sycl::queue &exec_q, + const size_t wg_size, + const size_t n_elems, + const inputT *input, + outputT *output, + const size_t s0, + const size_t s1, + IndexerT indexer, + TransformerT transformer, + std::vector &host_tasks, + const std::vector &depends = {}) +{ + size_t n_groups; + sycl::event inc_scan_phase1_ev = + inclusive_scan_base_step( + exec_q, wg_size, n_elems, input, output, s0, s1, indexer, + transformer, n_groups, depends); + sycl::event dependent_event = inc_scan_phase1_ev; + if (n_groups > 1) { auto chunk_size = wg_size * n_wi; + // how much of temporary allocation do we need + size_t n_groups_ = n_groups; + size_t temp_size = 0; + while (n_groups_ > 1) { + const auto this_size = (n_groups_ - 1); + temp_size += this_size; + n_groups_ = ceiling_quotient(this_size, chunk_size); + } + + // allocate + outputT *temp = sycl::malloc_device(temp_size, exec_q); + + if (!temp) { + throw std::bad_alloc(); + } + + std::vector> stack{}; + + // inclusive scans over blocks + n_groups_ = n_groups; + outputT *src = output; + outputT *local_scans = temp; + NoOpIndexer _no_op_indexer{}; - NoOpTransformer _no_op_transformer{}; - auto e2 = inclusive_scan_rec( - exec_q, n_groups - 1, wg_size, output, temp, chunk_size - 1, - chunk_size, _no_op_indexer, 
_no_op_transformer, host_tasks, - {inc_scan_phase1_ev}); - - // output[ chunk_size * (i + 1) + j] += temp[i] - auto e3 = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(e2); - cgh.parallel_for>( - {n_elems}, [=](auto wiid) - { - auto gid = wiid[0]; - auto i = (gid / chunk_size); - output[gid] += (i > 0) ? temp[i - 1] : 0; + using NoOpTransformerT = NoOpTransformer; + NoOpTransformerT _no_op_transformer{}; + size_t size_to_update = n_elems; + while (n_groups_ > 1) { + + size_t src_size = n_groups_ - 1; + dependent_event = + inclusive_scan_base_step( + exec_q, wg_size, src_size, src, local_scans, chunk_size - 1, + chunk_size, _no_op_indexer, _no_op_transformer, + n_groups_, // n_groups_ is modified in place + {dependent_event}); + stack.push_back({src, size_to_update, local_scans}); + src = local_scans; + local_scans += src_size; + size_to_update = src_size; + } + + for (size_t reverse_stack_id = 0; reverse_stack_id < stack.size(); + ++reverse_stack_id) + { + auto stack_id = stack.size() - 1 - reverse_stack_id; + + auto stack_elem = stack[stack_id]; + outputT *src = stack_elem.get_src_ptr(); + size_t src_size = stack_elem.get_size(); + outputT *local_scans = stack_elem.get_local_scans_ptr(); + + // output[ chunk_size * (i + 1) + j] += temp[i] + dependent_event = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(dependent_event); + + using UpdateKernelName = + class inclusive_scan_iter_chunk_update_krn< + inputT, outputT, n_wi, IndexerT, TransformerT, + NoOpIndexer, NoOpTransformerT>; + + cgh.parallel_for( + {src_size}, [chunk_size, src, local_scans](auto wiid) { + auto gid = wiid[0]; + auto i = (gid / chunk_size); + src[gid] += (i > 0) ? local_scans[i - 1] : outputT(0); + }); }); - }); + } sycl::event e4 = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(e3); + cgh.depends_on(dependent_event); const auto &ctx = exec_q.get_context(); cgh.host_task([ctx, temp]() { sycl::free(temp, ctx); }); }); host_tasks.push_back(e4); - - out_event = std::move(e3); } - return out_event; + return dependent_event; } typedef size_t (*accumulate_contig_impl_fn_ptr_t)( @@ -234,20 +349,33 @@ size_t accumulate_contig_impl(sycl::queue &q, std::vector &host_tasks, const std::vector &depends = {}) { - constexpr int n_wi = 8; const maskT *mask_data_ptr = reinterpret_cast(mask); cumsumT *cumsum_data_ptr = reinterpret_cast(cumsum); - size_t wg_size = 128; NoOpIndexer flat_indexer{}; transformerT non_zero_indicator{}; - const sycl::event &comp_ev = - inclusive_scan_rec( - q, n_elems, wg_size, mask_data_ptr, cumsum_data_ptr, 0, 1, - flat_indexer, non_zero_indicator, host_tasks, depends); + sycl::event comp_ev; + const sycl::device &dev = q.get_device(); + if (dev.has(sycl::aspect::cpu)) { + constexpr nwiT n_wi_for_cpu = 8; + size_t wg_size = 256; + comp_ev = inclusive_scan_iter( + q, wg_size, n_elems, mask_data_ptr, cumsum_data_ptr, 0, 1, + flat_indexer, non_zero_indicator, host_tasks, depends); + } + else { + constexpr nwiT n_wi_for_gpu = 4; + size_t wg_size = 256; + comp_ev = inclusive_scan_iter( + q, wg_size, n_elems, mask_data_ptr, cumsum_data_ptr, 0, 1, + flat_indexer, non_zero_indicator, host_tasks, depends); + } cumsumT *last_elem = cumsum_data_ptr + (n_elems - 1); cumsumT *last_elem_host_usm = sycl::malloc_host(1, q); @@ -324,19 +452,32 @@ size_t accumulate_strided_impl(sycl::queue &q, std::vector &host_tasks, const std::vector &depends = {}) { - constexpr int n_wi = 8; const maskT *mask_data_ptr = reinterpret_cast(mask); cumsumT *cumsum_data_ptr = reinterpret_cast(cumsum); - size_t wg_size 
= 128; StridedIndexer strided_indexer{nd, 0, shape_strides}; transformerT non_zero_indicator{}; - const sycl::event &comp_ev = - inclusive_scan_rec( - q, n_elems, wg_size, mask_data_ptr, cumsum_data_ptr, 0, 1, + const sycl::device &dev = q.get_device(); + sycl::event comp_ev; + if (dev.has(sycl::aspect::cpu)) { + constexpr nwiT n_wi_for_cpu = 8; + size_t wg_size = 256; + comp_ev = inclusive_scan_iter( + q, wg_size, n_elems, mask_data_ptr, cumsum_data_ptr, 0, 1, strided_indexer, non_zero_indicator, host_tasks, depends); + } + else { + constexpr nwiT n_wi_for_gpu = 4; + size_t wg_size = 256; + comp_ev = inclusive_scan_iter( + q, wg_size, n_elems, mask_data_ptr, cumsum_data_ptr, 0, 1, + strided_indexer, non_zero_indicator, host_tasks, depends); + } cumsumT *last_elem = cumsum_data_ptr + (n_elems - 1); From b1490968d08594b5acaacf4f3d0054ec8a16abe3 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Mon, 18 Dec 2023 20:48:52 -0600 Subject: [PATCH 20/34] Use async implementation of set functions --- dpctl/tensor/__init__.py | 2 +- .../{_set_functions_debug.py => _set_functions_async.py} | 0 pyproject.toml | 4 ++-- 3 files changed, 3 insertions(+), 3 deletions(-) rename dpctl/tensor/{_set_functions_debug.py => _set_functions_async.py} (100%) diff --git a/dpctl/tensor/__init__.py b/dpctl/tensor/__init__.py index fb41c1240a..5b88c1f33e 100644 --- a/dpctl/tensor/__init__.py +++ b/dpctl/tensor/__init__.py @@ -175,7 +175,7 @@ reduce_hypot, sum, ) -from ._set_functions_sync import ( +from ._set_functions_async import ( unique_all, unique_counts, unique_inverse, diff --git a/dpctl/tensor/_set_functions_debug.py b/dpctl/tensor/_set_functions_async.py similarity index 100% rename from dpctl/tensor/_set_functions_debug.py rename to dpctl/tensor/_set_functions_async.py diff --git a/pyproject.toml b/pyproject.toml index 074b27257d..f791411f6c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,7 +22,7 @@ source = [ omit = [ "dpctl/tests/*", "dpctl/_version.py", - "dpctl/tensor/_set_functions_debug.py", + "dpctl/tensor/_set_functions_sync.py", "*/_cython_api*/stringsource", ] @@ -30,7 +30,7 @@ omit = [ omit = [ "dpctl/tests/*", "dpctl/_version.py", - "dpctl/tensor/_set_functions_debug.py", + "dpctl/tensor/_set_functions_sync.py", "*/_cython_api*/stringsource", ] From eeb30f6988b2b21c56cba8287028bb92b8f7afee Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Tue, 19 Dec 2023 01:36:36 -0800 Subject: [PATCH 21/34] Permit copying when flattening for `unique` funcs As the array will be copied if it isn't C-contiguous regardless, this change does not impact performance and permits calls to set functions with strided, ND data --- dpctl/tensor/_set_functions_async.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dpctl/tensor/_set_functions_async.py b/dpctl/tensor/_set_functions_async.py index 631a12a592..a0ea188634 100644 --- a/dpctl/tensor/_set_functions_async.py +++ b/dpctl/tensor/_set_functions_async.py @@ -79,7 +79,7 @@ def unique_values(x: dpt.usm_ndarray) -> dpt.usm_ndarray: if x.ndim == 1: fx = x else: - fx = dpt.reshape(x, (x.size,), order="C", copy=False) + fx = dpt.reshape(x, (x.size,), order="C") s = dpt.empty_like(fx, order="C") host_tasks = [] if fx.flags.c_contiguous: @@ -166,7 +166,7 @@ def unique_counts(x: dpt.usm_ndarray) -> UniqueCountsResult: if x.ndim == 1: fx = x else: - fx = dpt.reshape(x, (x.size,), order="C", copy=False) + fx = dpt.reshape(x, (x.size,), order="C") s = dpt.empty_like(fx, order="C") host_tasks = [] if fx.flags.c_contiguous: @@ -284,7 +284,7 @@ 
def unique_inverse(x): if x.ndim == 1: fx = x else: - fx = dpt.reshape(x, (x.size,), order="C", copy=False) + fx = dpt.reshape(x, (x.size,), order="C") ind_dt = default_device_index_type(exec_q) sorting_ids = dpt.empty_like(fx, dtype=ind_dt, order="C") unsorting_ids = dpt.empty_like(sorting_ids, dtype=ind_dt, order="C") From 0035558a1aa311b58cea9e94c1fb5ec1a9829614 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Tue, 19 Dec 2023 01:39:17 -0800 Subject: [PATCH 22/34] Fix typos in set functions Set functions were making copies and then sorting the original data, resulting in incorrect results. --- dpctl/tensor/_set_functions_async.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dpctl/tensor/_set_functions_async.py b/dpctl/tensor/_set_functions_async.py index a0ea188634..49679d0c32 100644 --- a/dpctl/tensor/_set_functions_async.py +++ b/dpctl/tensor/_set_functions_async.py @@ -94,7 +94,7 @@ def unique_values(x: dpt.usm_ndarray) -> dpt.usm_ndarray: ) host_tasks.append(ht_ev) ht_ev, sort_ev = _sort_ascending( - src=fx, + src=tmp, trailing_dims_to_sort=1, dst=s, sycl_queue=exec_q, @@ -181,7 +181,7 @@ def unique_counts(x: dpt.usm_ndarray) -> UniqueCountsResult: ) host_tasks.append(ht_ev) ht_ev, sort_ev = _sort_ascending( - src=fx, + src=tmp, dst=s, trailing_dims_to_sort=1, sycl_queue=exec_q, @@ -301,7 +301,7 @@ def unique_inverse(x): ) host_tasks.append(ht_ev) ht_ev, sort_ev = _argsort_ascending( - src=fx, + src=tmp, trailing_dims_to_sort=1, dst=sorting_ids, sycl_queue=exec_q, @@ -462,7 +462,7 @@ def unique_all(x: dpt.usm_ndarray) -> UniqueAllResult: ) host_tasks.append(ht_ev) ht_ev, sort_ev = _argsort_ascending( - src=fx, + src=tmp, trailing_dims_to_sort=1, dst=sorting_ids, sycl_queue=exec_q, From a7e76255345dbda6b26d7708f8e578f094041857 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Tue, 19 Dec 2023 12:17:07 -0800 Subject: [PATCH 23/34] Fixes to unique functions, adds tests `unique_all` would not return a `UniqueAllResult` tuple when early exiting for an array of all unique elements Fixed problems through `unique` functions where the cumulative sum was being allocated on a separate (default) queue, causing inputs with a non-default queue to fail `unique` functions now behave correctly for 0-size array inputs --- dpctl/tensor/_set_functions_async.py | 56 +++++++++++++++------ dpctl/tests/test_usm_ndarray_unique.py | 70 ++++++++++++++++++++++++++ 2 files changed, 112 insertions(+), 14 deletions(-) diff --git a/dpctl/tensor/_set_functions_async.py b/dpctl/tensor/_set_functions_async.py index 49679d0c32..5b1291ea82 100644 --- a/dpctl/tensor/_set_functions_async.py +++ b/dpctl/tensor/_set_functions_async.py @@ -80,6 +80,8 @@ def unique_values(x: dpt.usm_ndarray) -> dpt.usm_ndarray: fx = x else: fx = dpt.reshape(x, (x.size,), order="C") + if fx.size == 0: + return fx s = dpt.empty_like(fx, order="C") host_tasks = [] if fx.flags.c_contiguous: @@ -114,7 +116,7 @@ def unique_values(x: dpt.usm_ndarray) -> dpt.usm_ndarray: fill_value=True, dst=unique_mask[0], sycl_queue=exec_q ) host_tasks.append(ht_ev) - cumsum = dpt.empty(s.shape, dtype=dpt.int64) + cumsum = dpt.empty(s.shape, dtype=dpt.int64, sycl_queue=exec_q) # synchronizing call n_uniques = mask_positions( unique_mask, cumsum, sycl_queue=exec_q, depends=[one_ev, uneq_ev] @@ -163,10 +165,14 @@ def unique_counts(x: dpt.usm_ndarray) -> UniqueCountsResult: raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") array_api_dev = x.device exec_q = array_api_dev.sycl_queue + x_usm_type = x.usm_type if 
x.ndim == 1: fx = x else: fx = dpt.reshape(x, (x.size,), order="C") + ind_dt = default_device_index_type(exec_q) + if fx.size == 0: + return UniqueCountsResult(fx, dpt.empty_like(fx, dtype=ind_dt)) s = dpt.empty_like(fx, order="C") host_tasks = [] if fx.flags.c_contiguous: @@ -201,17 +207,21 @@ def unique_counts(x: dpt.usm_ndarray) -> UniqueCountsResult: fill_value=True, dst=unique_mask[0], sycl_queue=exec_q ) host_tasks.append(ht_ev) - ind_dt = default_device_index_type(exec_q) - cumsum = dpt.empty(unique_mask.shape, dtype=dpt.int64) + cumsum = dpt.empty(unique_mask.shape, dtype=dpt.int64, sycl_queue=exec_q) # synchronizing call n_uniques = mask_positions( unique_mask, cumsum, sycl_queue=exec_q, depends=[one_ev, uneq_ev] ) if n_uniques == fx.size: dpctl.SyclEvent.wait_for(host_tasks) - return UniqueCountsResult(s, dpt.ones(n_uniques, dtype=ind_dt)) + return UniqueCountsResult( + s, + dpt.ones( + n_uniques, dtype=ind_dt, usm_type=x_usm_type, sycl_queue=exec_q + ), + ) unique_vals = dpt.empty( - n_uniques, dtype=x.dtype, usm_type=x.usm_type, sycl_queue=exec_q + n_uniques, dtype=x.dtype, usm_type=x_usm_type, sycl_queue=exec_q ) # populate unique values ht_ev, _ = _extract( @@ -224,7 +234,7 @@ def unique_counts(x: dpt.usm_ndarray) -> UniqueCountsResult: ) host_tasks.append(ht_ev) unique_counts = dpt.empty( - n_uniques + 1, dtype=ind_dt, usm_type=x.usm_type, sycl_queue=exec_q + n_uniques + 1, dtype=ind_dt, usm_type=x_usm_type, sycl_queue=exec_q ) idx = dpt.empty(x.size, dtype=ind_dt, sycl_queue=exec_q) ht_ev, id_ev = _linspace_step(start=0, dt=1, dst=idx, sycl_queue=exec_q) @@ -281,6 +291,7 @@ def unique_inverse(x): raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") array_api_dev = x.device exec_q = array_api_dev.sycl_queue + x_usm_type = x.usm_type if x.ndim == 1: fx = x else: @@ -288,6 +299,8 @@ def unique_inverse(x): ind_dt = default_device_index_type(exec_q) sorting_ids = dpt.empty_like(fx, dtype=ind_dt, order="C") unsorting_ids = dpt.empty_like(sorting_ids, dtype=ind_dt, order="C") + if fx.size == 0: + return UniqueInverseResult(fx, unsorting_ids) host_tasks = [] if fx.flags.c_contiguous: ht_ev, sort_ev = _argsort_ascending( @@ -341,7 +354,7 @@ def unique_inverse(x): fill_value=True, dst=unique_mask[0], sycl_queue=exec_q ) host_tasks.append(ht_ev) - cumsum = dpt.empty(unique_mask.shape, dtype=dpt.int64) + cumsum = dpt.empty(unique_mask.shape, dtype=dpt.int64, sycl_queue=exec_q) # synchronizing call n_uniques = mask_positions( unique_mask, cumsum, sycl_queue=exec_q, depends=[uneq_ev, one_ev] @@ -350,7 +363,7 @@ def unique_inverse(x): dpctl.SyclEvent.wait_for(host_tasks) return UniqueInverseResult(s, unsorting_ids) unique_vals = dpt.empty( - n_uniques, dtype=x.dtype, usm_type=x.usm_type, sycl_queue=exec_q + n_uniques, dtype=x.dtype, usm_type=x_usm_type, sycl_queue=exec_q ) ht_ev, _ = _extract( src=s, @@ -362,7 +375,7 @@ def unique_inverse(x): ) host_tasks.append(ht_ev) cum_unique_counts = dpt.empty( - n_uniques + 1, dtype=ind_dt, usm_type=x.usm_type, sycl_queue=exec_q + n_uniques + 1, dtype=ind_dt, usm_type=x_usm_type, sycl_queue=exec_q ) idx = dpt.empty(x.size, dtype=ind_dt, sycl_queue=exec_q) ht_ev, id_ev = _linspace_step(start=0, dt=1, dst=idx, sycl_queue=exec_q) @@ -442,13 +455,20 @@ def unique_all(x: dpt.usm_ndarray) -> UniqueAllResult: raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") array_api_dev = x.device exec_q = array_api_dev.sycl_queue + x_usm_type = x.usm_type if x.ndim == 1: fx = x else: - fx = dpt.reshape(x, (x.size,), order="C", 
copy=False) + fx = dpt.reshape(x, (x.size,), order="C") ind_dt = default_device_index_type(exec_q) sorting_ids = dpt.empty_like(fx, dtype=ind_dt, order="C") unsorting_ids = dpt.empty_like(sorting_ids, dtype=ind_dt, order="C") + if fx.size == 0: + # original array contains no data + # so it can be safely returned as values + return UniqueAllResult( + fx, sorting_ids, unsorting_ids, dpt.empty_like(fx, dtype=ind_dt) + ) host_tasks = [] if fx.flags.c_contiguous: ht_ev, sort_ev = _argsort_ascending( @@ -502,16 +522,24 @@ def unique_all(x: dpt.usm_ndarray) -> UniqueAllResult: fill_value=True, dst=unique_mask[0], sycl_queue=exec_q ) host_tasks.append(ht_ev) - cumsum = dpt.empty(unique_mask.shape, dtype=dpt.int64) + cumsum = dpt.empty(unique_mask.shape, dtype=dpt.int64, sycl_queue=exec_q) # synchronizing call n_uniques = mask_positions( unique_mask, cumsum, sycl_queue=exec_q, depends=[uneq_ev, one_ev] ) if n_uniques == fx.size: dpctl.SyclEvent.wait_for(host_tasks) - return UniqueInverseResult(s, unsorting_ids) + _counts = dpt.ones( + n_uniques, dtype=ind_dt, usm_type=x_usm_type, sycl_queue=exec_q + ) + return UniqueAllResult( + s, + sorting_ids, + unsorting_ids, + _counts, + ) unique_vals = dpt.empty( - n_uniques, dtype=x.dtype, usm_type=x.usm_type, sycl_queue=exec_q + n_uniques, dtype=x.dtype, usm_type=x_usm_type, sycl_queue=exec_q ) ht_ev, _ = _extract( src=s, @@ -523,7 +551,7 @@ def unique_all(x: dpt.usm_ndarray) -> UniqueAllResult: ) host_tasks.append(ht_ev) cum_unique_counts = dpt.empty( - n_uniques + 1, dtype=ind_dt, usm_type=x.usm_type, sycl_queue=exec_q + n_uniques + 1, dtype=ind_dt, usm_type=x_usm_type, sycl_queue=exec_q ) idx = dpt.empty(x.size, dtype=ind_dt, sycl_queue=exec_q) ht_ev, id_ev = _linspace_step(start=0, dt=1, dst=idx, sycl_queue=exec_q) diff --git a/dpctl/tests/test_usm_ndarray_unique.py b/dpctl/tests/test_usm_ndarray_unique.py index 012a6091cc..4a0af6361f 100644 --- a/dpctl/tests/test_usm_ndarray_unique.py +++ b/dpctl/tests/test_usm_ndarray_unique.py @@ -16,6 +16,7 @@ import pytest +import dpctl import dpctl.tensor as dpt from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported @@ -151,3 +152,72 @@ def test_unique_all(dtype): assert dpt.all(uv == inp[ind]) assert dpt.all(inp == uv[inv]) assert dpt.all(uv_counts == dpt.full(2, n, dtype=uv_counts.dtype)) + + +def test_set_functions_empty_input(): + get_queue_or_skip() + x = dpt.ones((10, 0, 1), dtype="i4") + + res = dpt.unique_values(x) + assert isinstance(res, dpctl.tensor.usm_ndarray) + assert res.size == 0 + assert res.dtype == x.dtype + + res = dpt.unique_inverse(x) + assert type(res).__name__ == "UniqueInverseResult" + uv, inv = res + assert isinstance(uv, dpctl.tensor.usm_ndarray) + assert uv.size == 0 + assert isinstance(inv, dpctl.tensor.usm_ndarray) + assert inv.size == 0 + + res = dpt.unique_counts(x) + assert type(res).__name__ == "UniqueCountsResult" + uv, uv_counts = res + assert isinstance(uv, dpctl.tensor.usm_ndarray) + assert uv.size == 0 + assert isinstance(uv_counts, dpctl.tensor.usm_ndarray) + assert uv_counts.size == 0 + + res = dpt.unique_all(x) + assert type(res).__name__ == "UniqueAllResult" + uv, ind, inv, uv_counts = res + assert isinstance(uv, dpctl.tensor.usm_ndarray) + assert uv.size == 0 + assert isinstance(ind, dpctl.tensor.usm_ndarray) + assert ind.size == 0 + assert isinstance(inv, dpctl.tensor.usm_ndarray) + assert inv.size == 0 + assert isinstance(uv_counts, dpctl.tensor.usm_ndarray) + assert uv_counts.size == 0 + + +def test_set_function_outputs(): + get_queue_or_skip() 
+ # check standard and early exit paths + x1 = dpt.arange(10, dtype="i4") + x2 = dpt.ones((10, 10), dtype="i4") + + assert isinstance(dpt.unique_values(x1), dpctl.tensor.usm_ndarray) + assert isinstance(dpt.unique_values(x2), dpctl.tensor.usm_ndarray) + + assert type(dpt.unique_inverse(x1)).__name__ == "UniqueInverseResult" + assert type(dpt.unique_inverse(x2)).__name__ == "UniqueInverseResult" + + assert type(dpt.unique_counts(x1)).__name__ == "UniqueCountsResult" + assert type(dpt.unique_counts(x2)).__name__ == "UniqueCountsResult" + + assert type(dpt.unique_all(x1)).__name__ == "UniqueAllResult" + assert type(dpt.unique_all(x2)).__name__ == "UniqueAllResult" + + +def test_set_functions_compute_follows_data(): + # tests that all intermediate calls and allocations + # are compatible with an input with an arbitrary queue + q = dpctl.SyclQueue() + x = dpt.arange(10, dtype="i4", sycl_queue=q) + + assert isinstance(dpt.unique_values(x), dpctl.tensor.usm_ndarray) + assert dpt.unique_counts(x) + assert dpt.unique_inverse(x) + assert dpt.unique_all(x) From 2fe8620f2282c9ff39091e5ecca8bf290b0a54d2 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Wed, 20 Dec 2023 14:21:40 -0800 Subject: [PATCH 24/34] Sorting, unique functions properly handle 0D arrays --- dpctl/tensor/_set_functions_async.py | 23 +++++++++++++++++++---- dpctl/tensor/_sorting.py | 14 ++++++++++++-- 2 files changed, 31 insertions(+), 6 deletions(-) diff --git a/dpctl/tensor/_set_functions_async.py b/dpctl/tensor/_set_functions_async.py index 5b1291ea82..58c1d4ea71 100644 --- a/dpctl/tensor/_set_functions_async.py +++ b/dpctl/tensor/_set_functions_async.py @@ -292,11 +292,16 @@ def unique_inverse(x): array_api_dev = x.device exec_q = array_api_dev.sycl_queue x_usm_type = x.usm_type - if x.ndim == 1: + ind_dt = default_device_index_type(exec_q) + if x.ndim == 0: + return UniqueInverseResult( + dpt.reshape(x, (1,), order="C", copy=True), + dpt.zeros_like(x, ind_dt, usm_type=x_usm_type, sycl_queue=exec_q), + ) + elif x.ndim == 1: fx = x else: fx = dpt.reshape(x, (x.size,), order="C") - ind_dt = default_device_index_type(exec_q) sorting_ids = dpt.empty_like(fx, dtype=ind_dt, order="C") unsorting_ids = dpt.empty_like(sorting_ids, dtype=ind_dt, order="C") if fx.size == 0: @@ -456,11 +461,21 @@ def unique_all(x: dpt.usm_ndarray) -> UniqueAllResult: array_api_dev = x.device exec_q = array_api_dev.sycl_queue x_usm_type = x.usm_type - if x.ndim == 1: + ind_dt = default_device_index_type(exec_q) + if x.ndim == 0: + uv = dpt.reshape(x, (1,), order="C", copy=True) + return UniqueAllResult( + uv, + dpt.zeros_like(uv, ind_dt, usm_type=x_usm_type, sycl_queue=exec_q), + dpt.zeros_like(x, ind_dt, usm_type=x_usm_type, sycl_queue=exec_q), + dpt.ones_like( + uv, dtype=ind_dt, usm_type=x_usm_type, sycl_queue=exec_q + ), + ) + elif x.ndim == 1: fx = x else: fx = dpt.reshape(x, (x.size,), order="C") - ind_dt = default_device_index_type(exec_q) sorting_ids = dpt.empty_like(fx, dtype=ind_dt, order="C") unsorting_ids = dpt.empty_like(sorting_ids, dtype=ind_dt, order="C") if fx.size == 0: diff --git a/dpctl/tensor/_sorting.py b/dpctl/tensor/_sorting.py index 7158d981c1..cc46da9c97 100644 --- a/dpctl/tensor/_sorting.py +++ b/dpctl/tensor/_sorting.py @@ -61,7 +61,11 @@ def sort(x, /, *, axis=-1, descending=False, stable=False): f"Expected type dpctl.tensor.usm_ndarray, got {type(x)}" ) nd = x.ndim - axis = normalize_axis_index(axis, ndim=nd, msg_prefix="axis") + if nd == 0: + axis = normalize_axis_index(axis, ndim=1, msg_prefix="axis") + return dpt.copy(x, 
order="C") + else: + axis = normalize_axis_index(axis, ndim=nd, msg_prefix="axis") a1 = axis + 1 if a1 == nd: perm = list(range(nd)) @@ -134,7 +138,13 @@ def argsort(x, axis=-1, descending=False, stable=False): f"Expected type dpctl.tensor.usm_ndarray, got {type(x)}" ) nd = x.ndim - axis = normalize_axis_index(axis, ndim=nd, msg_prefix="axis") + if nd == 0: + axis = normalize_axis_index(axis, ndim=1, msg_prefix="axis") + return dpt.zeros_like( + x, dtype=ti.default_device_index_type(x.sycl_queue), order="C" + ) + else: + axis = normalize_axis_index(axis, ndim=nd, msg_prefix="axis") a1 = axis + 1 if a1 == nd: perm = list(range(nd)) From 8a6581adc350e2021f855ee7517dfbe765cf902c Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Wed, 20 Dec 2023 14:45:26 -0800 Subject: [PATCH 25/34] `unique_all` and `unique_inverse` inverse indices shape fixed Were previously returning a 1D array of indices rather than an array with the same shape as input `x` --- dpctl/tensor/_set_functions_async.py | 36 ++++++++++---------------- dpctl/tests/test_usm_ndarray_unique.py | 2 ++ 2 files changed, 15 insertions(+), 23 deletions(-) diff --git a/dpctl/tensor/_set_functions_async.py b/dpctl/tensor/_set_functions_async.py index 58c1d4ea71..f18d64d715 100644 --- a/dpctl/tensor/_set_functions_async.py +++ b/dpctl/tensor/_set_functions_async.py @@ -293,19 +293,14 @@ def unique_inverse(x): exec_q = array_api_dev.sycl_queue x_usm_type = x.usm_type ind_dt = default_device_index_type(exec_q) - if x.ndim == 0: - return UniqueInverseResult( - dpt.reshape(x, (1,), order="C", copy=True), - dpt.zeros_like(x, ind_dt, usm_type=x_usm_type, sycl_queue=exec_q), - ) - elif x.ndim == 1: + if x.ndim == 1: fx = x else: fx = dpt.reshape(x, (x.size,), order="C") sorting_ids = dpt.empty_like(fx, dtype=ind_dt, order="C") unsorting_ids = dpt.empty_like(sorting_ids, dtype=ind_dt, order="C") if fx.size == 0: - return UniqueInverseResult(fx, unsorting_ids) + return UniqueInverseResult(fx, dpt.reshape(unsorting_ids, x.shape)) host_tasks = [] if fx.flags.c_contiguous: ht_ev, sort_ev = _argsort_ascending( @@ -366,7 +361,7 @@ def unique_inverse(x): ) if n_uniques == fx.size: dpctl.SyclEvent.wait_for(host_tasks) - return UniqueInverseResult(s, unsorting_ids) + return UniqueInverseResult(s, dpt.reshape(unsorting_ids, x.shape)) unique_vals = dpt.empty( n_uniques, dtype=x.dtype, usm_type=x_usm_type, sycl_queue=exec_q ) @@ -422,7 +417,9 @@ def unique_inverse(x): pos = pos_next host_tasks.append(ht_ev) dpctl.SyclEvent.wait_for(host_tasks) - return UniqueInverseResult(unique_vals, inv[unsorting_ids]) + return UniqueInverseResult( + unique_vals, dpt.reshape(inv[unsorting_ids], x.shape) + ) def unique_all(x: dpt.usm_ndarray) -> UniqueAllResult: @@ -462,17 +459,7 @@ def unique_all(x: dpt.usm_ndarray) -> UniqueAllResult: exec_q = array_api_dev.sycl_queue x_usm_type = x.usm_type ind_dt = default_device_index_type(exec_q) - if x.ndim == 0: - uv = dpt.reshape(x, (1,), order="C", copy=True) - return UniqueAllResult( - uv, - dpt.zeros_like(uv, ind_dt, usm_type=x_usm_type, sycl_queue=exec_q), - dpt.zeros_like(x, ind_dt, usm_type=x_usm_type, sycl_queue=exec_q), - dpt.ones_like( - uv, dtype=ind_dt, usm_type=x_usm_type, sycl_queue=exec_q - ), - ) - elif x.ndim == 1: + if x.ndim == 1: fx = x else: fx = dpt.reshape(x, (x.size,), order="C") @@ -482,7 +469,10 @@ def unique_all(x: dpt.usm_ndarray) -> UniqueAllResult: # original array contains no data # so it can be safely returned as values return UniqueAllResult( - fx, sorting_ids, unsorting_ids, dpt.empty_like(fx, 
dtype=ind_dt) + fx, + sorting_ids, + dpt.reshape(unsorting_ids, x.shape), + dpt.empty_like(fx, dtype=ind_dt), ) host_tasks = [] if fx.flags.c_contiguous: @@ -550,7 +540,7 @@ def unique_all(x: dpt.usm_ndarray) -> UniqueAllResult: return UniqueAllResult( s, sorting_ids, - unsorting_ids, + dpt.reshape(unsorting_ids, x.shape), _counts, ) unique_vals = dpt.empty( @@ -611,6 +601,6 @@ def unique_all(x: dpt.usm_ndarray) -> UniqueAllResult: return UniqueAllResult( unique_vals, sorting_ids[cum_unique_counts[:-1]], - inv[unsorting_ids], + dpt.reshape(inv[unsorting_ids], x.shape), _counts, ) diff --git a/dpctl/tests/test_usm_ndarray_unique.py b/dpctl/tests/test_usm_ndarray_unique.py index 4a0af6361f..6fff952dd3 100644 --- a/dpctl/tests/test_usm_ndarray_unique.py +++ b/dpctl/tests/test_usm_ndarray_unique.py @@ -117,6 +117,7 @@ def test_unique_inverse(dtype): uv, inv = dpt.unique_inverse(inp) assert dpt.all(uv == dpt.arange(2, dtype=dtype)) assert dpt.all(inp == uv[inv]) + assert inp.shape == inv.shape @pytest.mark.parametrize( @@ -151,6 +152,7 @@ def test_unique_all(dtype): assert dpt.all(uv == dpt.arange(2, dtype=dtype)) assert dpt.all(uv == inp[ind]) assert dpt.all(inp == uv[inv]) + assert inp.shape == inv.shape assert dpt.all(uv_counts == dpt.full(2, n, dtype=uv_counts.dtype)) From 82be69a9582568bfcd09ca3ffc7700d999f84a0d Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Thu, 21 Dec 2023 11:41:13 -0800 Subject: [PATCH 26/34] Set `stable` to True for sorting functions by default As mandated by the array API spec --- dpctl/tensor/_sorting.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dpctl/tensor/_sorting.py b/dpctl/tensor/_sorting.py index cc46da9c97..661c3bc1c4 100644 --- a/dpctl/tensor/_sorting.py +++ b/dpctl/tensor/_sorting.py @@ -30,8 +30,8 @@ __all__ = ["sort", "argsort"] -def sort(x, /, *, axis=-1, descending=False, stable=False): - """sort(x, axis=-1, descending=False, stable=False) +def sort(x, /, *, axis=-1, descending=False, stable=True): + """sort(x, axis=-1, descending=False, stable=True) Returns a sorted copy of an input array `x`. @@ -106,8 +106,8 @@ def sort(x, /, *, axis=-1, descending=False, stable=False): return res -def argsort(x, axis=-1, descending=False, stable=False): - """argsort(x, axis=-1, descending=False, stable=False) +def argsort(x, axis=-1, descending=False, stable=True): + """argsort(x, axis=-1, descending=False, stable=True) Returns the indices that sort an array `x` along a specified axis. 
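
For context on the new `stable=True` default: a stable sort preserves the relative order of equal elements, so ties in `argsort` resolve to the lower original index first. A minimal illustration of that behavior (the expected indices follow from stability itself, not from any implementation detail of this series):

```
import dpctl.tensor as dpt

x = dpt.asarray([2, 1, 2, 1], dtype="i4")
idx = dpt.argsort(x)   # stable=True is now the default
# equal elements keep their original order: the two 1s map to indices
# [1, 3] and the two 2s to [0, 2]
assert dpt.all(idx == dpt.asarray([1, 3, 0, 2]))
```
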
From 81610d60a398af029c445cb60939e1f0eb6709df Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Thu, 21 Dec 2023 11:46:57 -0800 Subject: [PATCH 27/34] Adds tests for 0D array cases for sorting --- dpctl/tests/test_usm_ndarray_sorting.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/dpctl/tests/test_usm_ndarray_sorting.py b/dpctl/tests/test_usm_ndarray_sorting.py index 84cf760a0c..69f2c62a04 100644 --- a/dpctl/tests/test_usm_ndarray_sorting.py +++ b/dpctl/tests/test_usm_ndarray_sorting.py @@ -197,3 +197,17 @@ def test_argsort_strided(): idx = dpt.argsort(x_flipped) assert dpt.all(x_flipped[idx] == x_orig) + + +def test_sort_0d_array(): + get_queue_or_skip() + + x = dpt.asarray(1, dtype="i4") + assert dpt.sort(x) == 1 + + +def test_argsort_0d_array(): + get_queue_or_skip() + + x = dpt.asarray(1, dtype="i4") + assert dpt.argsort(x) == 0 From b0cc135becf14b620bdecd590205b44b74dbbf23 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Thu, 21 Dec 2023 12:09:55 -0800 Subject: [PATCH 28/34] Adds tests for `unique` functions on strided inputs --- dpctl/tests/test_usm_ndarray_unique.py | 51 ++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/dpctl/tests/test_usm_ndarray_unique.py b/dpctl/tests/test_usm_ndarray_unique.py index 6fff952dd3..3081fc1e9d 100644 --- a/dpctl/tests/test_usm_ndarray_unique.py +++ b/dpctl/tests/test_usm_ndarray_unique.py @@ -53,6 +53,17 @@ def test_unique_values(dtype): assert dpt.all(uv == dpt.arange(2, dtype=dtype)) +def test_unique_values_strided(): + get_queue_or_skip() + + n, m = 1000, 20 + inp = dpt.ones((n, m), dtype="i4", order="F") + inp[:, ::2] = 0 + + uv = dpt.unique_values(inp) + assert dpt.all(uv == dpt.arange(2, dtype="i4")) + + @pytest.mark.parametrize( "dtype", [ @@ -86,6 +97,18 @@ def test_unique_counts(dtype): assert dpt.all(uv_counts == dpt.full(2, n, dtype=uv_counts.dtype)) +def test_unique_counts_strided(): + get_queue_or_skip() + + n, m = 1000, 20 + inp = dpt.ones((n, m), dtype="i4", order="F") + inp[:, ::2] = 0 + + uv, uv_counts = dpt.unique_counts(inp) + assert dpt.all(uv == dpt.arange(2, dtype="i4")) + assert dpt.all(uv_counts == dpt.full(2, n / 2 * m, dtype=uv_counts.dtype)) + + @pytest.mark.parametrize( "dtype", [ @@ -120,6 +143,19 @@ def test_unique_inverse(dtype): assert inp.shape == inv.shape +def test_unique_inverse_strided(): + get_queue_or_skip() + + n, m = 1000, 20 + inp = dpt.ones((n, m), dtype="i4", order="F") + inp[:, ::2] = 0 + + uv, inv = dpt.unique_inverse(inp) + assert dpt.all(uv == dpt.arange(2, dtype="i4")) + assert dpt.all(inp == uv[inv]) + assert inp.shape == inv.shape + + @pytest.mark.parametrize( "dtype", [ @@ -156,6 +192,21 @@ def test_unique_all(dtype): assert dpt.all(uv_counts == dpt.full(2, n, dtype=uv_counts.dtype)) +def test_unique_all_strided(): + get_queue_or_skip() + + n, m = 1000, 20 + inp = dpt.ones((n, m), dtype="i4", order="F") + inp[:, ::2] = 0 + + uv, ind, inv, uv_counts = dpt.unique_all(inp) + assert dpt.all(uv == dpt.arange(2, dtype="i4")) + assert dpt.all(uv == dpt.reshape(inp, -1)[ind]) + assert dpt.all(inp == uv[inv]) + assert inp.shape == inv.shape + assert dpt.all(uv_counts == dpt.full(2, n / 2 * m, dtype=uv_counts.dtype)) + + def test_set_functions_empty_input(): get_queue_or_skip() x = dpt.ones((10, 0, 1), dtype="i4") From 8a6282fdf1e4e5090b9bee99320a9eab052353bf Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Thu, 21 Dec 2023 12:52:32 -0800 Subject: [PATCH 29/34] Tests for strided 1D data input for `unique` functions --- 
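
The new cases exercise the non-contiguous branch of the `unique_*` functions: a reversed 1-D view has a negative stride, so the implementation must first copy it into a C-contiguous temporary before sorting. A minimal sketch of the input pattern these tests construct (the actual tests are in the diff below):

```
import dpctl.tensor as dpt

x = dpt.ones((1000, 20), dtype="i4")
x[:, ::2] = 0                          # interleave zeros and ones
flat = dpt.flip(dpt.reshape(x, -1))    # 1-D view with a negative stride
assert not flat.flags.c_contiguous
assert dpt.all(dpt.unique_values(flat) == dpt.arange(2, dtype="i4"))
```
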
dpctl/tests/test_usm_ndarray_unique.py | 28 ++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/dpctl/tests/test_usm_ndarray_unique.py b/dpctl/tests/test_usm_ndarray_unique.py index 3081fc1e9d..0d4247def5 100644 --- a/dpctl/tests/test_usm_ndarray_unique.py +++ b/dpctl/tests/test_usm_ndarray_unique.py @@ -63,6 +63,12 @@ def test_unique_values_strided(): uv = dpt.unique_values(inp) assert dpt.all(uv == dpt.arange(2, dtype="i4")) + inp = dpt.reshape(inp, -1) + inp = dpt.flip(dpt.reshape(inp, -1)) + + uv = dpt.unique_values(inp) + assert dpt.all(uv == dpt.arange(2, dtype="i4")) + @pytest.mark.parametrize( "dtype", @@ -108,6 +114,12 @@ def test_unique_counts_strided(): assert dpt.all(uv == dpt.arange(2, dtype="i4")) assert dpt.all(uv_counts == dpt.full(2, n / 2 * m, dtype=uv_counts.dtype)) + inp = dpt.flip(dpt.reshape(inp, -1)) + + uv, uv_counts = dpt.unique_counts(inp) + assert dpt.all(uv == dpt.arange(2, dtype="i4")) + assert dpt.all(uv_counts == dpt.full(2, n / 2 * m, dtype=uv_counts.dtype)) + @pytest.mark.parametrize( "dtype", @@ -155,6 +167,13 @@ def test_unique_inverse_strided(): assert dpt.all(inp == uv[inv]) assert inp.shape == inv.shape + inp = dpt.flip(dpt.reshape(inp, -1)) + + uv, inv = dpt.unique_inverse(inp) + assert dpt.all(uv == dpt.arange(2, dtype="i4")) + assert dpt.all(inp == uv[inv]) + assert inp.shape == inv.shape + @pytest.mark.parametrize( "dtype", @@ -206,6 +225,15 @@ def test_unique_all_strided(): assert inp.shape == inv.shape assert dpt.all(uv_counts == dpt.full(2, n / 2 * m, dtype=uv_counts.dtype)) + inp = dpt.flip(dpt.reshape(inp, -1)) + + uv, ind, inv, uv_counts = dpt.unique_all(inp) + assert dpt.all(uv == dpt.arange(2, dtype="i4")) + assert dpt.all(uv == inp[ind]) + assert dpt.all(inp == uv[inv]) + assert inp.shape == inv.shape + assert dpt.all(uv_counts == dpt.full(2, n / 2 * m, dtype=uv_counts.dtype)) + def test_set_functions_empty_input(): get_queue_or_skip() From dd71e4eb3a8966e28f9e8aaa9f03e70ab2eff602 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Mon, 8 Jan 2024 08:01:59 -0600 Subject: [PATCH 30/34] Changes to _set_functions_sync.py for test suite to pass Changes to sync implementation of set functions so that test suite passes with sync implementation as it does for async implementation. 
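
Concretely, the sync versions now early-exit for 0-size inputs and allocate every intermediate array on the input's queue. A short sketch of the target behavior, mirroring assertions already present in dpctl/tests/test_usm_ndarray_unique.py:

```
import dpctl
import dpctl.tensor as dpt

# 0-size input: nothing is sorted or scanned, an empty array comes back
x0 = dpt.ones((10, 0, 1), dtype="i4")
uv = dpt.unique_values(x0)
assert isinstance(uv, dpctl.tensor.usm_ndarray) and uv.size == 0

# input on an arbitrary (non-default) queue: all intermediates follow x's
# queue, so the call succeeds under compute-follows-data
q = dpctl.SyclQueue()
x = dpt.arange(10, dtype="i4", sycl_queue=q)
assert dpt.all(dpt.unique_values(x) == x)
```
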
--- dpctl/tensor/_set_functions_sync.py | 55 +++++++++++++++++++++-------- 1 file changed, 40 insertions(+), 15 deletions(-) diff --git a/dpctl/tensor/_set_functions_sync.py b/dpctl/tensor/_set_functions_sync.py index 7f16373c78..f1e0332b42 100644 --- a/dpctl/tensor/_set_functions_sync.py +++ b/dpctl/tensor/_set_functions_sync.py @@ -75,12 +75,14 @@ def unique_values(x: dpt.usm_ndarray) -> dpt.usm_ndarray: if x.ndim == 1: fx = x else: - fx = dpt.reshape(x, (x.size,), order="C", copy=False) + fx = dpt.reshape(x, (x.size,), order="C") + if fx.size == 0: + return fx s = dpt.sort(fx) unique_mask = dpt.empty(fx.shape, dtype="?", sycl_queue=exec_q) dpt.not_equal(s[:-1], s[1:], out=unique_mask[1:]) unique_mask[0] = True - cumsum = dpt.empty(s.shape, dtype=dpt.int64) + cumsum = dpt.empty(s.shape, dtype=dpt.int64, sycl_queue=exec_q) n_uniques = mask_positions(unique_mask, cumsum, sycl_queue=exec_q) if n_uniques == fx.size: return s @@ -127,13 +129,15 @@ def unique_counts(x: dpt.usm_ndarray) -> UniqueCountsResult: if x.ndim == 1: fx = x else: - fx = dpt.reshape(x, (x.size,), order="C", copy=False) - s = dpt.sort(x) + fx = dpt.reshape(x, (x.size,), order="C") + ind_dt = default_device_index_type(exec_q) + if fx.size == 0: + return UniqueCountsResult(fx, dpt.empty_like(fx, dtype=ind_dt)) + s = dpt.sort(fx) unique_mask = dpt.empty(s.shape, dtype="?", sycl_queue=exec_q) dpt.not_equal(s[:-1], s[1:], out=unique_mask[1:]) unique_mask[0] = True - ind_dt = default_device_index_type(exec_q) - cumsum = dpt.empty(unique_mask.shape, dtype=dpt.int64) + cumsum = dpt.empty(unique_mask.shape, dtype=dpt.int64, sycl_queue=exec_q) # synchronizing call n_uniques = mask_positions(unique_mask, cumsum, sycl_queue=exec_q) if n_uniques == fx.size: @@ -195,18 +199,20 @@ def unique_inverse(x): raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") array_api_dev = x.device exec_q = array_api_dev.sycl_queue + ind_dt = default_device_index_type(exec_q) if x.ndim == 1: fx = x else: - fx = dpt.reshape(x, (x.size,), order="C", copy=False) - ind_dt = default_device_index_type(exec_q) + fx = dpt.reshape(x, (x.size,), order="C") sorting_ids = dpt.argsort(fx) unsorting_ids = dpt.argsort(sorting_ids) + if fx.size == 0: + return UniqueInverseResult(fx, dpt.reshape(unsorting_ids, x.shape)) s = fx[sorting_ids] unique_mask = dpt.empty(fx.shape, dtype="?", sycl_queue=exec_q) unique_mask[0] = True dpt.not_equal(s[:-1], s[1:], out=unique_mask[1:]) - cumsum = dpt.empty(unique_mask.shape, dtype=dpt.int64) + cumsum = dpt.empty(unique_mask.shape, dtype=dpt.int64, sycl_queue=exec_q) # synchronizing call n_uniques = mask_positions(unique_mask, cumsum, sycl_queue=exec_q) if n_uniques == fx.size: @@ -251,7 +257,9 @@ def unique_inverse(x): ht_ev, _ = _full_usm_ndarray(fill_value=i, dst=_dst, sycl_queue=exec_q) ht_ev.wait() pos = pos_next - return UniqueInverseResult(unique_vals, inv[unsorting_ids]) + return UniqueInverseResult( + unique_vals, dpt.reshape(inv[unsorting_ids], x.shape) + ) def unique_all(x: dpt.usm_ndarray) -> UniqueAllResult: @@ -289,22 +297,39 @@ def unique_all(x: dpt.usm_ndarray) -> UniqueAllResult: raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") array_api_dev = x.device exec_q = array_api_dev.sycl_queue + ind_dt = default_device_index_type(exec_q) if x.ndim == 1: fx = x else: - fx = dpt.reshape(x, (x.size,), order="C", copy=False) - ind_dt = default_device_index_type(exec_q) + fx = dpt.reshape(x, (x.size,), order="C") sorting_ids = dpt.argsort(fx) unsorting_ids = dpt.argsort(sorting_ids) + if fx.size 
== 0: + # original array contains no data + # so it can be safely returned as values + return UniqueAllResult( + fx, + sorting_ids, + dpt.reshape(unsorting_ids, x.shape), + dpt.empty_like(fx, dtype=ind_dt), + ) s = fx[sorting_ids] unique_mask = dpt.empty(fx.shape, dtype="?", sycl_queue=exec_q) dpt.not_equal(s[:-1], s[1:], out=unique_mask[1:]) unique_mask[0] = True - cumsum = dpt.empty(unique_mask.shape, dtype=dpt.int64) + cumsum = dpt.empty(unique_mask.shape, dtype=dpt.int64, sycl_queue=exec_q) # synchronizing call n_uniques = mask_positions(unique_mask, cumsum, sycl_queue=exec_q) if n_uniques == fx.size: - return UniqueInverseResult(s, unsorting_ids) + _counts = dpt.ones( + n_uniques, dtype=ind_dt, usm_type=x.usm_type, sycl_queue=exec_q + ) + return UniqueAllResult( + s, + sorting_ids, + dpt.reshape(unsorting_ids, x.shape), + _counts, + ) unique_vals = dpt.empty( n_uniques, dtype=x.dtype, usm_type=x.usm_type, sycl_queue=exec_q ) @@ -346,6 +371,6 @@ def unique_all(x: dpt.usm_ndarray) -> UniqueAllResult: return UniqueAllResult( unique_vals, sorting_ids[cum_unique_counts[:-1]], - inv[unsorting_ids], + dpt.reshape(inv[unsorting_ids], x.shape), _counts, ) From b1b0af1769b29c7fd43e87d034555cbe43ea4366 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Mon, 8 Jan 2024 08:06:10 -0600 Subject: [PATCH 31/34] Removed sync implementation of set functions --- dpctl/tensor/_set_functions_sync.py | 376 ---------------------------- pyproject.toml | 2 - 2 files changed, 378 deletions(-) delete mode 100644 dpctl/tensor/_set_functions_sync.py diff --git a/dpctl/tensor/_set_functions_sync.py b/dpctl/tensor/_set_functions_sync.py deleted file mode 100644 index f1e0332b42..0000000000 --- a/dpctl/tensor/_set_functions_sync.py +++ /dev/null @@ -1,376 +0,0 @@ -# Data Parallel Control (dpctl) -# -# Copyright 2020-2023 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -from typing import NamedTuple - -import dpctl.tensor as dpt - -from ._tensor_impl import ( - _extract, - _full_usm_ndarray, - _linspace_step, - default_device_index_type, - mask_positions, -) - -__all__ = [ - "unique_values", - "unique_counts", - "unique_inverse", - "unique_all", - "UniqueAllResult", - "UniqueCountsResult", - "UniqueInverseResult", -] - - -class UniqueAllResult(NamedTuple): - values: dpt.usm_ndarray - indices: dpt.usm_ndarray - inverse_indices: dpt.usm_ndarray - counts: dpt.usm_ndarray - - -class UniqueCountsResult(NamedTuple): - values: dpt.usm_ndarray - counts: dpt.usm_ndarray - - -class UniqueInverseResult(NamedTuple): - values: dpt.usm_ndarray - inverse_indices: dpt.usm_ndarray - - -def unique_values(x: dpt.usm_ndarray) -> dpt.usm_ndarray: - """unique_values(x) - - Returns the unique elements of an input array x. - - Args: - x (usm_ndarray): - input array. The input with more than one dimension is flattened. - Returns: - usm_ndarray - an array containing the set of unique elements in `x`. The - returned array has the same data type as `x`. 
- """ - if not isinstance(x, dpt.usm_ndarray): - raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") - array_api_dev = x.device - exec_q = array_api_dev.sycl_queue - if x.ndim == 1: - fx = x - else: - fx = dpt.reshape(x, (x.size,), order="C") - if fx.size == 0: - return fx - s = dpt.sort(fx) - unique_mask = dpt.empty(fx.shape, dtype="?", sycl_queue=exec_q) - dpt.not_equal(s[:-1], s[1:], out=unique_mask[1:]) - unique_mask[0] = True - cumsum = dpt.empty(s.shape, dtype=dpt.int64, sycl_queue=exec_q) - n_uniques = mask_positions(unique_mask, cumsum, sycl_queue=exec_q) - if n_uniques == fx.size: - return s - unique_vals = dpt.empty( - n_uniques, dtype=x.dtype, usm_type=x.usm_type, sycl_queue=exec_q - ) - ht_ev, _ = _extract( - src=s, - cumsum=cumsum, - axis_start=0, - axis_end=1, - dst=unique_vals, - sycl_queue=exec_q, - ) - ht_ev.wait() - return unique_vals - - -def unique_counts(x: dpt.usm_ndarray) -> UniqueCountsResult: - """unique_counts(x) - - Returns the unique elements of an input array `x` and the corresponding - counts for each unique element in `x`. - - Args: - x (usm_ndarray): - input array. The input with more than one dimension is flattened. - Returns: - tuple[usm_ndarray, usm_ndarray] - a namedtuple `(values, counts)` whose - - * first element is the field name `values` and is an array - containing the unique elements of `x`. This array has the - same data type as `x`. - * second element has the field name `counts` and is an array - containing the number of times each unique element occurs in `x`. - This array has the same shape as `values` and has the default - array index data type. - """ - if not isinstance(x, dpt.usm_ndarray): - raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") - array_api_dev = x.device - exec_q = array_api_dev.sycl_queue - if x.ndim == 1: - fx = x - else: - fx = dpt.reshape(x, (x.size,), order="C") - ind_dt = default_device_index_type(exec_q) - if fx.size == 0: - return UniqueCountsResult(fx, dpt.empty_like(fx, dtype=ind_dt)) - s = dpt.sort(fx) - unique_mask = dpt.empty(s.shape, dtype="?", sycl_queue=exec_q) - dpt.not_equal(s[:-1], s[1:], out=unique_mask[1:]) - unique_mask[0] = True - cumsum = dpt.empty(unique_mask.shape, dtype=dpt.int64, sycl_queue=exec_q) - # synchronizing call - n_uniques = mask_positions(unique_mask, cumsum, sycl_queue=exec_q) - if n_uniques == fx.size: - return UniqueCountsResult(s, dpt.ones(n_uniques, dtype=ind_dt)) - unique_vals = dpt.empty( - n_uniques, dtype=x.dtype, usm_type=x.usm_type, sycl_queue=exec_q - ) - # populate unique values - ht_ev, _ = _extract( - src=s, - cumsum=cumsum, - axis_start=0, - axis_end=1, - dst=unique_vals, - sycl_queue=exec_q, - ) - ht_ev.wait() - unique_counts = dpt.empty( - n_uniques + 1, dtype=ind_dt, usm_type=x.usm_type, sycl_queue=exec_q - ) - idx = dpt.arange(x.size, dtype=ind_dt, sycl_queue=exec_q) - ht_ev, _ = _extract( - src=idx, - cumsum=cumsum, - axis_start=0, - axis_end=1, - dst=unique_counts[:-1], - sycl_queue=exec_q, - ) - unique_counts[-1] = fx.size - ht_ev.wait() - _counts = dpt.empty_like(unique_counts[1:]) - dpt.subtract(unique_counts[1:], unique_counts[:-1], out=_counts) - return UniqueCountsResult(unique_vals, _counts) - - -def unique_inverse(x): - """unique_inverse - - Returns the unique elements of an input array x and the indices from the - set of unique elements that reconstruct `x`. - - Args: - x (usm_ndarray): - input array. The input with more than one dimension is flattened. 
- Returns: - tuple[usm_ndarray, usm_ndarray] - a namedtuple `(values, inverse_indices)` whose - - * first element has the field name `values` and is an array - containing the unique elements of `x`. The array has the same - data type as `x`. - * second element has the field name `inverse_indices` and is an - array containing the indices of values that reconstruct `x`. - The array has the same shape as `x` and has the default array - index data type. - """ - if not isinstance(x, dpt.usm_ndarray): - raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") - array_api_dev = x.device - exec_q = array_api_dev.sycl_queue - ind_dt = default_device_index_type(exec_q) - if x.ndim == 1: - fx = x - else: - fx = dpt.reshape(x, (x.size,), order="C") - sorting_ids = dpt.argsort(fx) - unsorting_ids = dpt.argsort(sorting_ids) - if fx.size == 0: - return UniqueInverseResult(fx, dpt.reshape(unsorting_ids, x.shape)) - s = fx[sorting_ids] - unique_mask = dpt.empty(fx.shape, dtype="?", sycl_queue=exec_q) - unique_mask[0] = True - dpt.not_equal(s[:-1], s[1:], out=unique_mask[1:]) - cumsum = dpt.empty(unique_mask.shape, dtype=dpt.int64, sycl_queue=exec_q) - # synchronizing call - n_uniques = mask_positions(unique_mask, cumsum, sycl_queue=exec_q) - if n_uniques == fx.size: - return UniqueInverseResult(s, unsorting_ids) - unique_vals = dpt.empty( - n_uniques, dtype=x.dtype, usm_type=x.usm_type, sycl_queue=exec_q - ) - ht_ev, _ = _extract( - src=s, - cumsum=cumsum, - axis_start=0, - axis_end=1, - dst=unique_vals, - sycl_queue=exec_q, - ) - ht_ev.wait() - cum_unique_counts = dpt.empty( - n_uniques + 1, dtype=ind_dt, usm_type=x.usm_type, sycl_queue=exec_q - ) - idx = dpt.empty(x.size, dtype=ind_dt, sycl_queue=exec_q) - ht_ev, id_ev = _linspace_step(start=0, dt=1, dst=idx, sycl_queue=exec_q) - ht_ev.wait() - ht_ev, _ = _extract( - src=idx, - cumsum=cumsum, - axis_start=0, - axis_end=1, - dst=cum_unique_counts[:-1], - sycl_queue=exec_q, - ) - ht_ev.wait() - cum_unique_counts[-1] = fx.size - _counts = dpt.subtract(cum_unique_counts[1:], cum_unique_counts[:-1]) - # TODO: when searchsorted is available, - # inv = searchsorted(unique_vals, fx) - counts = dpt.asnumpy(_counts).tolist() - inv = dpt.empty_like(fx, dtype=ind_dt) - pos = 0 - for i in range(len(counts)): - pos_next = pos + counts[i] - _dst = inv[pos:pos_next] - ht_ev, _ = _full_usm_ndarray(fill_value=i, dst=_dst, sycl_queue=exec_q) - ht_ev.wait() - pos = pos_next - return UniqueInverseResult( - unique_vals, dpt.reshape(inv[unsorting_ids], x.shape) - ) - - -def unique_all(x: dpt.usm_ndarray) -> UniqueAllResult: - """unique_all(x) - - Returns the unique elements of an input array `x`, the first occurring - indices for each unique element in `x`, the indices from the set of unique - elements that reconstruct `x`, and the corresponding counts for each - unique element in `x`. - - Args: - x (usm_ndarray): - input array. The input with more than one dimension is flattened. - Returns: - tuple[usm_ndarray, usm_ndarray, usm_ndarray, usm_ndarray] - a namedtuple `(values, indices, inverse_indices, counts)` whose - - * first element has the field name `values` and is an array - containing the unique elements of `x`. The array has the same - data type as `x`. - * second element has the field name `indices` and is an array - the indices (of first occurrences) of `x` that result in - `values`. The array has the same shape as `values` and has the - default array index data type. 
- * third element has the field name `inverse_indices` and is an - array containing the indices of values that reconstruct `x`. - The array has the same shape as `x` and has the default array - index data type. - * fourth element has the field name `counts` and is an array - containing the number of times each unique element occurs in `x`. - This array has the same shape as `values` and has the default - array index data type. - """ - if not isinstance(x, dpt.usm_ndarray): - raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") - array_api_dev = x.device - exec_q = array_api_dev.sycl_queue - ind_dt = default_device_index_type(exec_q) - if x.ndim == 1: - fx = x - else: - fx = dpt.reshape(x, (x.size,), order="C") - sorting_ids = dpt.argsort(fx) - unsorting_ids = dpt.argsort(sorting_ids) - if fx.size == 0: - # original array contains no data - # so it can be safely returned as values - return UniqueAllResult( - fx, - sorting_ids, - dpt.reshape(unsorting_ids, x.shape), - dpt.empty_like(fx, dtype=ind_dt), - ) - s = fx[sorting_ids] - unique_mask = dpt.empty(fx.shape, dtype="?", sycl_queue=exec_q) - dpt.not_equal(s[:-1], s[1:], out=unique_mask[1:]) - unique_mask[0] = True - cumsum = dpt.empty(unique_mask.shape, dtype=dpt.int64, sycl_queue=exec_q) - # synchronizing call - n_uniques = mask_positions(unique_mask, cumsum, sycl_queue=exec_q) - if n_uniques == fx.size: - _counts = dpt.ones( - n_uniques, dtype=ind_dt, usm_type=x.usm_type, sycl_queue=exec_q - ) - return UniqueAllResult( - s, - sorting_ids, - dpt.reshape(unsorting_ids, x.shape), - _counts, - ) - unique_vals = dpt.empty( - n_uniques, dtype=x.dtype, usm_type=x.usm_type, sycl_queue=exec_q - ) - ht_ev, _ = _extract( - src=s, - cumsum=cumsum, - axis_start=0, - axis_end=1, - dst=unique_vals, - sycl_queue=exec_q, - ) - ht_ev.wait() - cum_unique_counts = dpt.empty( - n_uniques + 1, dtype=ind_dt, usm_type=x.usm_type, sycl_queue=exec_q - ) - idx = dpt.arange(fx.size, dtype=ind_dt, sycl_queue=exec_q) - ht_ev, extr_ev = _extract( - src=idx, - cumsum=cumsum, - axis_start=0, - axis_end=1, - dst=cum_unique_counts[:-1], - sycl_queue=exec_q, - ) - ht_ev.wait() - cum_unique_counts[-1] = fx.size - _counts = cum_unique_counts[1:] - cum_unique_counts[:-1] - # TODO: when searchsorted is available, - # inv = searchsorted(unique_vals, fx) - counts = dpt.asnumpy(_counts).tolist() - inv = dpt.empty_like(fx, dtype=ind_dt) - pos = 0 - for i in range(len(counts)): - pos_next = pos + counts[i] - _dst = inv[pos:pos_next] - ht_ev, _ = _full_usm_ndarray(fill_value=i, dst=_dst, sycl_queue=exec_q) - ht_ev.wait() - pos = pos_next - return UniqueAllResult( - unique_vals, - sorting_ids[cum_unique_counts[:-1]], - dpt.reshape(inv[unsorting_ids], x.shape), - _counts, - ) diff --git a/pyproject.toml b/pyproject.toml index f791411f6c..5882128dab 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,7 +22,6 @@ source = [ omit = [ "dpctl/tests/*", "dpctl/_version.py", - "dpctl/tensor/_set_functions_sync.py", "*/_cython_api*/stringsource", ] @@ -30,7 +29,6 @@ omit = [ omit = [ "dpctl/tests/*", "dpctl/_version.py", - "dpctl/tensor/_set_functions_sync.py", "*/_cython_api*/stringsource", ] From 08e5dacbb8e3e845f5bb0b0e1871483a816a5640 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Mon, 8 Jan 2024 13:30:03 -0600 Subject: [PATCH 32/34] Renamed _set_functions_async -> _set_functions --- dpctl/tensor/__init__.py | 2 +- dpctl/tensor/{_set_functions_async.py => _set_functions.py} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename 
dpctl/tensor/{_set_functions_async.py => _set_functions.py} (100%) diff --git a/dpctl/tensor/__init__.py b/dpctl/tensor/__init__.py index 5b88c1f33e..81fc152e7a 100644 --- a/dpctl/tensor/__init__.py +++ b/dpctl/tensor/__init__.py @@ -175,7 +175,7 @@ reduce_hypot, sum, ) -from ._set_functions_async import ( +from ._set_functions import ( unique_all, unique_counts, unique_inverse, diff --git a/dpctl/tensor/_set_functions_async.py b/dpctl/tensor/_set_functions.py similarity index 100% rename from dpctl/tensor/_set_functions_async.py rename to dpctl/tensor/_set_functions.py From a3d0d087b3a6b3521502d67c097f4f5a13216375 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Tue, 9 Jan 2024 12:54:19 -0600 Subject: [PATCH 33/34] Use extended comparison operators to define weak order on real/complex FP types We use extended comparison operators compatible with NumPy's behavior: https://numpy.org/devdocs/reference/generated/numpy.sort.html Specifically, we use [R, nan] block ordering for reals, and [(R, R), (R, nan), (nan, R), (nan, nan)] for complexes. --- .../source/sorting/sorting_common.hpp | 80 ++++++++++++++++--- 1 file changed, 68 insertions(+), 12 deletions(-) diff --git a/dpctl/tensor/libtensor/source/sorting/sorting_common.hpp b/dpctl/tensor/libtensor/source/sorting/sorting_common.hpp index 6e8f273ea5..62b30ccb8f 100644 --- a/dpctl/tensor/libtensor/source/sorting/sorting_common.hpp +++ b/dpctl/tensor/libtensor/source/sorting/sorting_common.hpp @@ -24,7 +24,8 @@ #pragma once -#include "utils/math_utils.hpp" +#include "sycl/sycl.hpp" +#include namespace dpctl { @@ -33,44 +34,99 @@ namespace tensor namespace py_internal { -template struct ComplexLess +namespace { - bool operator()(const cT &v1, const cT &v2) const +template struct ExtendedRealFPLess +{ + /* [R, nan] */ + bool operator()(const fpT v1, const fpT v2) const { - using dpctl::tensor::math_utils::less_complex; + return (!sycl::isnan(v1) && (sycl::isnan(v2) || (v1 < v2))); + } +}; - return less_complex(v1, v2); +template struct ExtendedRealFPGreater +{ + bool operator()(const fpT v1, const fpT v2) const + { + return (!sycl::isnan(v2) && (sycl::isnan(v1) || (v2 < v1))); } }; -template struct ComplexGreater +template struct ExtendedComplexFPLess { + /* [(R, R), (R, nan), (nan, R), (nan, nan)] */ + bool operator()(const cT &v1, const cT &v2) const { - using dpctl::tensor::math_utils::greater_complex; + using realT = typename cT::value_type; + + const realT real1 = std::real(v1); + const realT real2 = std::real(v2); + + const bool r1_nan = sycl::isnan(real1); + const bool r2_nan = sycl::isnan(real2); + + const realT imag1 = std::imag(v1); + const realT imag2 = std::imag(v2); + + const bool i1_nan = sycl::isnan(imag1); + const bool i2_nan = sycl::isnan(imag2); - return greater_complex(v1, v2); + const int idx1 = ((r1_nan) ? 2 : 0) + ((i1_nan) ? 1 : 0); + const int idx2 = ((r2_nan) ? 2 : 0) + ((i2_nan) ? 
1 : 0); + + const bool res = + !(r1_nan && i1_nan) && + ((idx1 < idx2) || + ((idx1 == idx2) && + ((r1_nan && !i1_nan && (imag1 < imag2)) || + (!r1_nan && i1_nan && (real1 < real2)) || + (!r1_nan && !i1_nan && + ((real1 < real2) || (!(real2 < real1) && (imag1 < imag2))))))); + + return res; + } +}; + +template struct ExtendedComplexFPGreater +{ + bool operator()(const cT &v1, const cT &v2) const + { + auto less_ = ExtendedComplexFPLess{}; + return less_(v2, v1); } }; +template +inline constexpr bool is_fp_v = (std::is_same_v || + std::is_same_v || + std::is_same_v); + +} // end of anonymous namespace + template struct AscendingSorter { - using type = std::less; + using type = std::conditional_t, + ExtendedRealFPLess, + std::less>; }; template struct AscendingSorter> { - using type = ComplexLess>; + using type = ExtendedComplexFPLess>; }; template struct DescendingSorter { - using type = std::greater; + using type = std::conditional_t, + ExtendedRealFPGreater, + std::greater>; }; template struct DescendingSorter> { - using type = ComplexGreater>; + using type = ExtendedComplexFPGreater>; }; } // end of namespace py_internal From 54698321303b8708e3ac7a5c23188fe6527f8b66 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Tue, 9 Jan 2024 12:58:42 -0600 Subject: [PATCH 34/34] Adding tests for sorting of FP arrays with NaNs --- dpctl/tests/test_usm_ndarray_sorting.py | 69 +++++++++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/dpctl/tests/test_usm_ndarray_sorting.py b/dpctl/tests/test_usm_ndarray_sorting.py index 69f2c62a04..e76ac667e1 100644 --- a/dpctl/tests/test_usm_ndarray_sorting.py +++ b/dpctl/tests/test_usm_ndarray_sorting.py @@ -14,6 +14,9 @@ # See the License for the specific language governing permissions and # limitations under the License. +import itertools + +import numpy as np import pytest import dpctl.tensor as dpt @@ -211,3 +214,69 @@ def test_argsort_0d_array(): x = dpt.asarray(1, dtype="i4") assert dpt.argsort(x) == 0 + + +@pytest.mark.parametrize( + "dtype", + [ + "f2", + "f4", + "f8", + ], +) +def test_sort_real_fp_nan(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + x = dpt.asarray( + [-0.0, 0.1, dpt.nan, 0.0, -0.1, dpt.nan, 0.2, -0.3], dtype=dtype + ) + s = dpt.sort(x) + + expected = dpt.asarray( + [-0.3, -0.1, -0.0, 0.0, 0.1, 0.2, dpt.nan, dpt.nan], dtype=dtype + ) + + assert dpt.allclose(s, expected, equal_nan=True) + + s = dpt.sort(x, descending=True) + + expected = dpt.asarray( + [dpt.nan, dpt.nan, 0.2, 0.1, -0.0, 0.0, -0.1, -0.3], dtype=dtype + ) + + assert dpt.allclose(s, expected, equal_nan=True) + + +@pytest.mark.parametrize( + "dtype", + [ + "c8", + "c16", + ], +) +def test_sort_complex_fp_nan(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + rvs = [-0.0, 0.1, 0.0, 0.2, -0.3, dpt.nan] + ivs = [-0.0, 0.1, 0.0, 0.2, -0.3, dpt.nan] + + cv = [] + for rv in rvs: + for iv in ivs: + cv.append(complex(rv, iv)) + + inp = dpt.asarray(cv, dtype=dtype) + s = dpt.sort(inp) + + expected = np.sort(dpt.asnumpy(inp)) + + assert np.allclose(dpt.asnumpy(s), expected, equal_nan=True) + + for i, j in itertools.permutations(range(inp.shape[0]), 2): + r1 = dpt.asnumpy(dpt.sort(inp[dpt.asarray([i, j])])) + r2 = np.sort(dpt.asnumpy(inp[dpt.asarray([i, j])])) + assert np.array_equal( + r1.view(np.int64), r2.view(np.int64) + ), f"Failed for {i} and {j}"
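
The extended ordering introduced in PATCH 33 follows NumPy's documented sort order for floating-point and complex arrays: finite reals precede NaNs, and complex values are ordered lexicographically by (real, imaginary) within the (R, R) block, followed by the (R, nan), (nan, R) and (nan, nan) blocks. A minimal NumPy-based sketch of that expected behavior (array values chosen arbitrarily for illustration):

import numpy as np

# Real floating-point: finite values first, NaNs last ([R, nan] block order).
r = np.array([0.2, np.nan, -0.3, 0.1, np.nan, -0.0], dtype=np.float64)
print(np.sort(r))  # -0.3, -0.0, 0.1, 0.2, nan, nan

# Complex floating-point: lexicographic (real, then imaginary) order within
# the (R, R) block, then the (R, nan), (nan, R) and (nan, nan) blocks.
c = np.array(
    [complex(np.nan, 0.0), complex(0.0, np.nan), complex(1.0, -1.0),
     complex(0.0, 2.0), complex(np.nan, np.nan)],
    dtype=np.complex128,
)
print(np.sort(c))  # 0+2j, 1-1j, 0+nanj, nan+0j, nan+nanj

dpctl.tensor.sort and dpctl.tensor.argsort are intended to produce this same ordering for supported device data types, which is what the PATCH 34 tests verify by comparing against np.sort.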