From c2d21d04a1ffb8e52b19661b636a2fa17446980a Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Wed, 6 Dec 2023 08:29:06 -0600 Subject: [PATCH 01/34] Initial check in of sort/argsort per array API spec Implemented data-parallel merge-sort algorithm. --- dpctl/tensor/CMakeLists.txt | 14 + dpctl/tensor/__init__.py | 3 + dpctl/tensor/_sorting.py | 107 ++ .../libtensor/include/kernels/sorting.hpp | 970 ++++++++++++++++++ .../libtensor/source/sorting/argsort.cpp | 238 +++++ .../libtensor/source/sorting/argsort.hpp | 18 + .../tensor/libtensor/source/sorting/sort.cpp | 212 ++++ .../tensor/libtensor/source/sorting/sort.hpp | 18 + .../source/sorting/sorting_common.hpp | 54 + .../libtensor/source/tensor_sorting.cpp | 37 + dpctl/tests/test_usm_ndarray_sorting.py | 128 +++ 11 files changed, 1799 insertions(+) create mode 100644 dpctl/tensor/_sorting.py create mode 100644 dpctl/tensor/libtensor/include/kernels/sorting.hpp create mode 100644 dpctl/tensor/libtensor/source/sorting/argsort.cpp create mode 100644 dpctl/tensor/libtensor/source/sorting/argsort.hpp create mode 100644 dpctl/tensor/libtensor/source/sorting/sort.cpp create mode 100644 dpctl/tensor/libtensor/source/sorting/sort.hpp create mode 100644 dpctl/tensor/libtensor/source/sorting/sorting_common.hpp create mode 100644 dpctl/tensor/libtensor/source/tensor_sorting.cpp create mode 100644 dpctl/tests/test_usm_ndarray_sorting.py diff --git a/dpctl/tensor/CMakeLists.txt b/dpctl/tensor/CMakeLists.txt index de8be6fae0..d2947aa772 100644 --- a/dpctl/tensor/CMakeLists.txt +++ b/dpctl/tensor/CMakeLists.txt @@ -118,6 +118,10 @@ set(_reduction_sources set(_boolean_reduction_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_reductions.cpp ) +set(_sorting_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/sort.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/argsort.cpp +) set(_tensor_impl_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_ctors.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/simplify_iteration_space.cpp @@ -148,6 +152,10 @@ set(_tensor_reductions_impl_sources ${_boolean_reduction_sources} ${_reduction_sources} ) +set(_tensor_sorting_impl_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_sorting.cpp + ${_sorting_sources} +) set(_py_trgts) @@ -166,6 +174,11 @@ pybind11_add_module(${python_module_name} MODULE ${_tensor_reductions_impl_sourc add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_reductions_impl_sources}) list(APPEND _py_trgts ${python_module_name}) +set(python_module_name _tensor_sorting_impl) +pybind11_add_module(${python_module_name} MODULE ${_tensor_sorting_impl_sources}) +add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_sorting_impl_sources}) +list(APPEND _py_trgts ${python_module_name}) + set(_clang_prefix "") if (WIN32) set(_clang_prefix "/clang:") @@ -179,6 +192,7 @@ set(_no_fast_math_sources list(APPEND _no_fast_math_sources ${_elementwise_sources} ${_reduction_sources} + ${_sorting_sources} ) foreach(_src_fn ${_no_fast_math_sources}) diff --git a/dpctl/tensor/__init__.py b/dpctl/tensor/__init__.py index cdb701e1cb..15be87588e 100644 --- a/dpctl/tensor/__init__.py +++ b/dpctl/tensor/__init__.py @@ -175,6 +175,7 @@ reduce_hypot, sum, ) +from ._sorting import argsort, sort from ._testing import allclose from ._type_utils import can_cast, finfo, iinfo, result_type @@ -343,4 +344,6 @@ "__array_namespace_info__", "reciprocal", "angle", + "sort", + "argsort", ] diff --git a/dpctl/tensor/_sorting.py b/dpctl/tensor/_sorting.py new file 
mode 100644 index 0000000000..8ad496d0a4 --- /dev/null +++ b/dpctl/tensor/_sorting.py @@ -0,0 +1,107 @@ +from numpy.core.numeric import normalize_axis_index + +import dpctl +import dpctl.tensor as dpt +import dpctl.tensor._tensor_impl as ti + +from ._tensor_sorting_impl import ( + _argsort_ascending, + _argsort_descending, + _sort_ascending, + _sort_descending, +) + + +def sort(x, axis=-1, descending=False, stable=False): + if not isinstance(x, dpt.usm_ndarray): + raise TypeError( + f"Expected type dpctl.tensor.usm_ndarray, got {type(x)}" + ) + nd = x.ndim + axis = normalize_axis_index(axis, ndim=nd, msg_prefix="axis") + a1 = axis + 1 + if a1 == nd: + perm = list(range(nd)) + arr = x + else: + perm = [i for i in range(nd) if i != axis] + [ + axis, + ] + arr = dpt.permute_dims(x, perm) + exec_q = x.sycl_queue + host_tasks_list = [] + impl_fn = _sort_descending if descending else _sort_ascending + if arr.flags.c_contiguous: + res = dpt.empty_like(arr, order="C") + ht_ev, _ = impl_fn( + src=arr, trailing_dims_to_sort=1, dst=res, sycl_queue=exec_q + ) + host_tasks_list.append(ht_ev) + else: + tmp = dpt.empty_like(arr, order="C") + ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=arr, dst=tmp, sycl_queue=exec_q + ) + host_tasks_list.append(ht_ev) + res = dpt.empty_like(arr, order="C") + ht_ev, _ = impl_fn( + src=tmp, + trailing_dims_to_sort=1, + dst=res, + sycl_queue=exec_q, + depends=[copy_ev], + ) + host_tasks_list.append(ht_ev) + if a1 != nd: + inv_perm = sorted(range(nd), key=lambda d: perm[d]) + res = dpt.permute_dims(res, inv_perm) + dpctl.SyclEvent.wait_for(host_tasks_list) + return res + + +def argsort(x, axis=-1, descending=False, stable=False): + if not isinstance(x, dpt.usm_ndarray): + raise TypeError( + f"Expected type dpctl.tensor.usm_ndarray, got {type(x)}" + ) + nd = x.ndim + axis = normalize_axis_index(axis, ndim=nd, msg_prefix="axis") + a1 = axis + 1 + if a1 == nd: + perm = list(range(nd)) + arr = x + else: + perm = [i for i in range(nd) if i != axis] + [ + axis, + ] + arr = dpt.permute_dims(x, perm) + exec_q = x.sycl_queue + host_tasks_list = [] + impl_fn = _argsort_descending if descending else _argsort_ascending + index_dt = ti.default_device_index_type(exec_q) + if arr.flags.c_contiguous: + res = dpt.empty_like(arr, dtype=index_dt, order="C") + ht_ev, _ = impl_fn( + src=arr, trailing_dims_to_sort=1, dst=res, sycl_queue=exec_q + ) + host_tasks_list.append(ht_ev) + else: + tmp = dpt.empty_like(arr, order="C") + ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=arr, dst=tmp, sycl_queue=exec_q + ) + host_tasks_list.append(ht_ev) + res = dpt.empty_like(arr, dtype=index_dt, order="C") + ht_ev, _ = impl_fn( + src=tmp, + trailing_dims_to_sort=1, + dst=res, + sycl_queue=exec_q, + depends=[copy_ev], + ) + host_tasks_list.append(ht_ev) + if a1 != nd: + inv_perm = sorted(range(nd), key=lambda d: perm[d]) + res = dpt.permute_dims(res, inv_perm) + dpctl.SyclEvent.wait_for(host_tasks_list) + return res diff --git a/dpctl/tensor/libtensor/include/kernels/sorting.hpp b/dpctl/tensor/libtensor/include/kernels/sorting.hpp new file mode 100644 index 0000000000..cda9ce770d --- /dev/null +++ b/dpctl/tensor/libtensor/include/kernels/sorting.hpp @@ -0,0 +1,970 @@ +#pragma once + +#include "pybind11/pybind11.h" + +#include +#include +#include +#include +#include + +namespace dpctl +{ +namespace tensor +{ +namespace kernels +{ + +namespace sort_detail +{ + +template T quotient_ceil(T n, T m) +{ + return (n + m - 1) / m; +} + +template +std::size_t lower_bound_impl(const Acc 
acc, + std::size_t first, + std::size_t last, + const Value &value, + Compare comp) +{ + std::size_t n = last - first; + std::size_t cur = n; + std::size_t it; + while (n > 0) { + it = first; + cur = n / 2; + it += cur; + if (comp(acc[it], value)) { + n -= cur + 1, first = ++it; + } + else + n = cur; + } + return first; +} + +template +std::size_t upper_bound_impl(const Acc acc, + const std::size_t first, + const std::size_t last, + const Value &value, + Compare comp) +{ + const auto &op_comp = [comp](auto x, auto y) { return !comp(y, x); }; + return lower_bound_impl(acc, first, last, value, op_comp); +} + +/*! @brief Merge two contiguous sorted segments */ +template +void merge_impl(const std::size_t offset, + const InAcc in_acc, + OutAcc out_acc, + const std::size_t start_1, + const std::size_t end_1, + const std::size_t end_2, + const std::size_t start_out, + Compare comp, + const std::size_t chunk) +{ + const std::size_t start_2 = end_1; + // Borders of the sequences to merge within this call + const std::size_t local_start_1 = sycl::min(offset + start_1, end_1); + const std::size_t local_end_1 = sycl::min(local_start_1 + chunk, end_1); + const std::size_t local_start_2 = sycl::min(offset + start_2, end_2); + const std::size_t local_end_2 = sycl::min(local_start_2 + chunk, end_2); + + const std::size_t local_size_1 = local_end_1 - local_start_1; + const std::size_t local_size_2 = local_end_2 - local_start_2; + + const auto r_item_1 = in_acc[end_1 - 1]; + const auto l_item_2 = in_acc[start_2]; + + // Copy if the sequences are sorted with respect to each other or merge + // otherwise + if (!comp(l_item_2, r_item_1)) { + const std::size_t out_shift_1 = start_out + local_start_1 - start_1; + const std::size_t out_shift_2 = + start_out + end_1 - start_1 + local_start_2 - start_2; + + for (std::size_t i = 0; i < local_size_1; ++i) { + out_acc[out_shift_1 + i] = in_acc[local_start_1 + i]; + } + for (std::size_t i = 0; i < local_size_2; ++i) { + out_acc[out_shift_2 + i] = in_acc[local_start_2 + i]; + } + } + else if (comp(r_item_1, l_item_2)) { + const std::size_t out_shift_1 = + start_out + end_2 - start_2 + local_start_1 - start_1; + const std::size_t out_shift_2 = start_out + local_start_2 - start_2; + for (std::size_t i = 0; i < local_size_1; ++i) { + out_acc[out_shift_1 + i] = in_acc[local_start_1 + i]; + } + for (std::size_t i = 0; i < local_size_2; ++i) { + out_acc[out_shift_2 + i] = in_acc[local_start_2 + i]; + } + } + // Perform merging + else { + + // Process 1st sequence + if (local_start_1 < local_end_1) { + // Reduce the range for searching within the 2nd sequence and handle + // bound items find left border in 2nd sequence + const auto local_l_item_1 = in_acc[local_start_1]; + std::size_t l_search_bound_2 = + lower_bound_impl(in_acc, start_2, end_2, local_l_item_1, comp); + const std::size_t l_shift_1 = local_start_1 - start_1; + const std::size_t l_shift_2 = l_search_bound_2 - start_2; + + out_acc[start_out + l_shift_1 + l_shift_2] = local_l_item_1; + + std::size_t r_search_bound_2{}; + // find right border in 2nd sequence + if (local_size_1 > 1) { + const auto local_r_item_1 = in_acc[local_end_1 - 1]; + r_search_bound_2 = lower_bound_impl( + in_acc, l_search_bound_2, end_2, local_r_item_1, comp); + const auto r_shift_1 = local_end_1 - 1 - start_1; + const auto r_shift_2 = r_search_bound_2 - start_2; + + out_acc[start_out + r_shift_1 + r_shift_2] = local_r_item_1; + } + + // Handle intermediate items + if (r_search_bound_2 == l_search_bound_2) { + const std::size_t shift_2 = 
l_search_bound_2 - start_2; + for (std::size_t idx = local_start_1 + 1; idx < local_end_1 - 1; + ++idx) { + const auto intermediate_item_1 = in_acc[idx]; + const std::size_t shift_1 = idx - start_1; + out_acc[start_out + shift_1 + shift_2] = + intermediate_item_1; + } + } + else { + for (std::size_t idx = local_start_1 + 1; idx < local_end_1 - 1; + ++idx) { + const auto intermediate_item_1 = in_acc[idx]; + // we shouldn't seek in whole 2nd sequence. Just for the + // part where the 1st sequence should be + l_search_bound_2 = lower_bound_impl( + in_acc, l_search_bound_2, r_search_bound_2, + intermediate_item_1, comp); + const std::size_t shift_1 = idx - start_1; + const std::size_t shift_2 = l_search_bound_2 - start_2; + + out_acc[start_out + shift_1 + shift_2] = + intermediate_item_1; + } + } + } + // Process 2nd sequence + if (local_start_2 < local_end_2) { + // Reduce the range for searching within the 1st sequence and handle + // bound items find left border in 1st sequence + const auto local_l_item_2 = in_acc[local_start_2]; + std::size_t l_search_bound_1 = + upper_bound_impl(in_acc, start_1, end_1, local_l_item_2, comp); + const std::size_t l_shift_1 = l_search_bound_1 - start_1; + const std::size_t l_shift_2 = local_start_2 - start_2; + + out_acc[start_out + l_shift_1 + l_shift_2] = local_l_item_2; + + std::size_t r_search_bound_1{}; + // find right border in 1st sequence + if (local_size_2 > 1) { + const auto local_r_item_2 = in_acc[local_end_2 - 1]; + r_search_bound_1 = upper_bound_impl( + in_acc, l_search_bound_1, end_1, local_r_item_2, comp); + const std::size_t r_shift_1 = r_search_bound_1 - start_1; + const std::size_t r_shift_2 = local_end_2 - 1 - start_2; + + out_acc[start_out + r_shift_1 + r_shift_2] = local_r_item_2; + } + + // Handle intermediate items + if (l_search_bound_1 == r_search_bound_1) { + const std::size_t shift_1 = l_search_bound_1 - start_1; + for (auto idx = local_start_2 + 1; idx < local_end_2 - 1; ++idx) + { + const auto intermediate_item_2 = in_acc[idx]; + const std::size_t shift_2 = idx - start_2; + out_acc[start_out + shift_1 + shift_2] = + intermediate_item_2; + } + } + else { + for (auto idx = local_start_2 + 1; idx < local_end_2 - 1; ++idx) + { + const auto intermediate_item_2 = in_acc[idx]; + // we shouldn't seek in whole 1st sequence. 
Just for the + // part where the 2nd sequence should be + l_search_bound_1 = upper_bound_impl( + in_acc, l_search_bound_1, r_search_bound_1, + intermediate_item_2, comp); + const std::size_t shift_1 = l_search_bound_1 - start_1; + const std::size_t shift_2 = idx - start_2; + + out_acc[start_out + shift_1 + shift_2] = + intermediate_item_2; + } + } + } + } +} + +namespace +{ +template +void insertion_sort_impl(Iter first, + const size_t begin, + const size_t end, + Compare comp) +{ + for (size_t i = begin + 1; i < end; ++i) { + const auto val_i = first[i]; + size_t j = i - 1; + while ((j + 1 > begin) && (comp(val_i, first[j]))) { + first[j + 1] = first[j]; + --j; + } + if (j + 1 < i) { + first[j + 1] = val_i; + } + } +} + +template +void bubble_sort_impl(Iter first, + const size_t begin, + const size_t end, + Compare comp) +{ + if (begin < end) { + for (size_t i = begin; i < end; ++i) { + // Handle intermediate items + for (size_t idx = i + 1; idx < end; ++idx) { + if (comp(first[idx], first[i])) { + std::swap(first[i], first[idx]); + } + } + } + } +} + +template +void leaf_sort_impl(Iter first, + const size_t begin, + const size_t end, + Compare comp) +{ + return insertion_sort_impl(first, begin, end, comp); +} +} // namespace + +template struct GetValueType +{ + using value_type = typename std::iterator_traits::value_type; +}; + +template +struct GetValueType> +{ + using value_type = ElementType; +}; + +template +struct GetValueType< + sycl::accessor> +{ + using value_type = ElementType; +}; + +template +struct GetValueType> +{ + using value_type = ElementType; +}; + +template struct GetReadOnlyAccess +{ + Iter operator()(Iter it, sycl::handler &) + { + return it; + } +}; + +template +struct GetReadOnlyAccess> +{ + auto operator()(sycl::buffer buf, + sycl::handler &cgh) + { + sycl::accessor acc(buf, cgh, sycl::read_only); + return acc; + } +}; + +template struct GetWriteDiscardAccess +{ + Iter operator()(Iter it, sycl::handler &) + { + return it; + } +}; + +template +struct GetWriteDiscardAccess> +{ + auto operator()(sycl::buffer buf, + sycl::handler &cgh) + { + sycl::accessor acc(buf, cgh, sycl::write_only, sycl::no_init); + return acc; + } +}; + +template struct GetReadWriteAccess +{ + Iter operator()(Iter it, sycl::handler &) + { + return it; + } +}; + +template +struct GetReadWriteAccess> +{ + auto operator()(sycl::buffer buf, + sycl::handler &cgh) + { + sycl::accessor acc(buf, cgh, sycl::read_write); + return acc; + } +}; + +template +class sort_over_work_group_contig_krn; + +template +sycl::event +sort_over_work_group_contig_impl(sycl::queue &q, + size_t iter_nelems, + size_t sort_nelems, + const InpAcc input, + OutAcc output, + const Comp &comp, + size_t &nelems_wg_sorts, + const std::vector &depends = {}) +{ + using inpT = typename GetValueType::value_type; + using outT = typename GetValueType::value_type; + using KernelName = sort_over_work_group_contig_krn; + + const auto &kernel_id = sycl::get_kernel_id(); + + auto const &ctx = q.get_context(); + auto const &dev = q.get_device(); + auto kb = sycl::get_kernel_bundle( + ctx, {dev}, {kernel_id}); + + auto krn = kb.template get_kernel(kernel_id); + + std::uint32_t max_sg_size = krn.template get_info< + sycl::info::kernel_device_specific::max_sub_group_size>(dev); + + const size_t lws = 4 * max_sg_size; + const std::uint32_t chunk_size = dev.has(sycl::aspect::cpu) ? 
8 : 2; + nelems_wg_sorts = chunk_size * lws; + + // This assumption permits doing away with using a loop + assert(nelems_wg_sorts % lws == 0); + + const size_t n_segments = + quotient_ceil(sort_nelems, nelems_wg_sorts); + + sycl::event base_sort_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + sycl::range<1> global_range{iter_nelems * n_segments * lws}; + sycl::range<1> local_range{lws}; + + sycl::range<1> slm_range{nelems_wg_sorts}; + using T = typename GetValueType::value_type; + sycl::local_accessor work_space(slm_range, cgh); + sycl::local_accessor scratch_space(slm_range, cgh); + + auto input_acc = GetReadOnlyAccess{}(input, cgh); + auto output_acc = GetWriteDiscardAccess{}(output, cgh); + + sycl::nd_range<1> ndRange(global_range, local_range); + + cgh.parallel_for(ndRange, [=](sycl::nd_item<1> it) { + const size_t group_id = it.get_group_linear_id(); + const size_t iter_id = group_id / n_segments; + const size_t segment_id = group_id - iter_id * n_segments; + const size_t lid = it.get_local_linear_id(); + + const size_t segment_start_idx = segment_id * nelems_wg_sorts; + const size_t segment_end_idx = std::min( + segment_start_idx + nelems_wg_sorts, sort_nelems); + const size_t wg_chunk_size = segment_end_idx - segment_start_idx; + + // load input into SLM + for (size_t array_id = segment_start_idx + lid; + array_id < segment_end_idx; array_id += lws) + { + T v = (array_id < sort_nelems) + ? input_acc[iter_id * sort_nelems + array_id] + : T{}; + work_space[array_id - segment_start_idx] = v; + } + sycl::group_barrier(it.get_group()); + + const size_t chunk = quotient_ceil(nelems_wg_sorts, lws); + + const size_t chunk_start_idx = lid * chunk; + const size_t chunk_end_idx = + sycl::min(chunk_start_idx + chunk, wg_chunk_size); + + leaf_sort_impl(work_space, chunk_start_idx, chunk_end_idx, comp); + + sycl::group_barrier(it.get_group()); + + bool data_in_temp = false; + size_t sorted_size = 1; + while (true) { + const size_t nelems_sorted_so_far = sorted_size * chunk; + if (nelems_sorted_so_far < wg_chunk_size) { + const size_t q = (lid / sorted_size); + const size_t start_1 = + sycl::min(2 * nelems_sorted_so_far * q, wg_chunk_size); + const size_t end_1 = sycl::min( + start_1 + nelems_sorted_so_far, wg_chunk_size); + const size_t end_2 = + sycl::min(end_1 + nelems_sorted_so_far, wg_chunk_size); + const size_t offset = chunk * (lid - q * sorted_size); + + if (data_in_temp) { + merge_impl(offset, scratch_space, work_space, start_1, + end_1, end_2, start_1, comp, chunk); + } + else { + merge_impl(offset, work_space, scratch_space, start_1, + end_1, end_2, start_1, comp, chunk); + } + sycl::group_barrier(it.get_group()); + + data_in_temp = !data_in_temp; + sorted_size *= 2; + } + else + break; + } + + const auto &out_src = (data_in_temp) ? 
scratch_space : work_space; + for (size_t array_id = segment_start_idx + lid; + array_id < segment_end_idx; array_id += lws) + { + if (array_id < sort_nelems) { + output_acc[iter_id * sort_nelems + array_id] = + out_src[array_id - segment_start_idx]; + } + } + }); + }); + + return base_sort_ev; +} + +template +class sort_base_step_contig_krn; + +template +sycl::event +sort_base_step_contig_impl(sycl::queue &q, + size_t iter_nelems, + size_t sort_nelems, + const InpAcc input, + OutAcc output, + const Comp &comp, + size_t &conseq_nelems_sorted, + const std::vector &depends = {}) +{ + + using inpT = typename GetValueType::value_type; + using outT = typename GetValueType::value_type; + using KernelName = sort_base_step_contig_krn; + + conseq_nelems_sorted = (q.get_device().has(sycl::aspect::cpu) ? 16 : 4); + + size_t n_segments = + quotient_ceil(sort_nelems, conseq_nelems_sorted); + + sycl::event base_sort = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + sycl::range<1> gRange{iter_nelems * n_segments}; + cgh.parallel_for(gRange, [=](sycl::id<1> id) { + const size_t iter_id = id[0] / n_segments; + const size_t segment_id = id[0] - iter_id * n_segments; + + const size_t iter_offset = iter_id * sort_nelems; + const size_t beg_id = + iter_offset + segment_id * conseq_nelems_sorted; + const size_t end_id = + iter_offset + + std::min((segment_id + 1) * conseq_nelems_sorted, + sort_nelems); + for (size_t i = beg_id; i < end_id; ++i) { + output[i] = input[i]; + } + + leaf_sort_impl(output, beg_id, end_id, comp); + }); + }); + + return base_sort; +} + +template +class exp_sort_over_work_group_contig_krn; + +template +sycl::event exp_sort_over_work_group_contig_impl( + sycl::queue &q, + size_t iter_nelems, + size_t sort_nelems, + const InpAcc input, + OutAcc output, + const Comp &comp, + size_t &conseq_nelems_sorted, + const std::vector &depends = {}) +{ + + using inpT = typename GetValueType::value_type; + using outT = typename GetValueType::value_type; + using KernelName = exp_sort_over_work_group_contig_krn; + + const auto &kernel_id = sycl::get_kernel_id(); + + auto const &ctx = q.get_context(); + auto const &dev = q.get_device(); + auto kb = sycl::get_kernel_bundle( + ctx, {dev}, {kernel_id}); + + auto krn = kb.template get_kernel(kernel_id); + + std::uint32_t max_sg_size = krn.template get_info< + sycl::info::kernel_device_specific::max_sub_group_size>(dev); + + const size_t lws = 4 * max_sg_size; + const std::uint32_t chunk_size = dev.has(sycl::aspect::cpu) ? 8 : 2; + conseq_nelems_sorted = chunk_size * lws; + + // This assumption permits doing away with using a loop + assert(nelems_wg_sorts % lws == 0); + + sycl::event exp_default_sort_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + const size_t n_chunks = + quotient_ceil(sort_nelems, conseq_nelems_sorted); + + sycl::range<1> global_range{iter_nelems * n_chunks * lws}; + sycl::range<1> local_range{lws}; + + sycl::range<1> slm_size_range{conseq_nelems_sorted}; + + using Sorter = sycl::ext::oneapi::experimental::default_sorter; + + // calculate required local memory size + // MUST pass local_range, not integer. 
+ // Have different meanings + const size_t temp_memory_size = Sorter::template memory_required( + sycl::memory_scope::work_group, conseq_nelems_sorted); + + sycl::local_accessor workspace(slm_size_range, cgh); + sycl::local_accessor scratch({temp_memory_size}, cgh); + + cgh.parallel_for( + sycl::nd_range<1>(global_range, local_range), + [=](sycl::nd_item<1> it) { + auto sorter_op = Sorter(sycl::span{ + scratch + .template get_multi_ptr() + .get(), + temp_memory_size}); + + const size_t gr_id = it.get_group_linear_id(); + const size_t iter_id = gr_id / n_chunks; + const size_t sort_chunk_id = gr_id - iter_id * n_chunks; + + const std::uint32_t lid = it.get_local_linear_id(); + + const size_t iter_offset = iter_id * sort_nelems; + const size_t chunk_offset = + sort_chunk_id * conseq_nelems_sorted; + const size_t global_start_offset = iter_offset + chunk_offset; + const size_t workspace_size = + std::min(sort_nelems, + chunk_offset + conseq_nelems_sorted) - + chunk_offset; + for (std::uint32_t i = lid; i < workspace_size; i += lws) { + workspace[i] = input[global_start_offset + i]; + } + sycl::group_barrier(it.get_group()); + + sycl::ext::oneapi::experimental::joint_sort( + it.get_group(), + workspace + .template get_multi_ptr() + .get(), + workspace + .template get_multi_ptr< + sycl::access::decorated::no>() + .get() + + workspace_size, + sorter_op); + + for (std::uint32_t i = lid; i < workspace_size; i += lws) { + output[global_start_offset + i] = workspace[i]; + } + }); + }); + + return exp_default_sort_ev; +} + +class vacuous_krn; + +inline sycl::event tie_events(sycl::queue &q, + const std::vector depends) +{ + if (depends.empty()) + return sycl::event(); + if (depends.size() == 1) + return depends[0]; + + sycl::event e = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + using KernelName = vacuous_krn; + cgh.single_task([]() {}); + }); + + return e; +} + +template class merge_adjacent_blocks_to_temp_krn; + +template class merge_adjacent_blocks_from_temp_krn; + +template +sycl::event +merge_sorted_block_contig_impl(sycl::queue &q, + size_t iter_nelems, + size_t sort_nelems, + Acc output, + const Comp comp, + size_t sorted_block_size, + const std::vector &depends = {}) +{ + + if (sorted_block_size >= sort_nelems) + return tie_events(q, depends); + + // experimentally determined value + // size of segments worked upon by each work-item during merging + const sycl::device &dev = q.get_device(); + const size_t segment_size = (dev.has(sycl::aspect::cpu)) ? 32 : 4; + + const size_t chunk_size = + (sorted_block_size < segment_size) ? 
sorted_block_size : segment_size; + + assert(sorted_block_size % chunk_size == 0); + + using T = typename GetValueType::value_type; + + sycl::buffer temp_buf(sycl::range<1>{iter_nelems * sort_nelems}); + // T *allocated_mem = sycl::malloc_device(iter_nelems * sort_nelems, q); + + bool needs_copy = true; + bool used_depends = false; + + sycl::event dep_ev; + size_t chunks_merged = sorted_block_size / chunk_size; + + assert(!(chunks_merged & (chunks_merged - 1))); + + using ToTempKernelName = class merge_adjacent_blocks_to_temp_krn; + using FromTempKernelName = + class merge_adjacent_blocks_from_temp_krn; + + while (chunks_merged * chunk_size < sort_nelems) { + sycl::event local_dep = dep_ev; + + sycl::event merge_ev = q.submit([&](sycl::handler &cgh) { + if (used_depends) { + cgh.depends_on(local_dep); + } + else { + cgh.depends_on(depends); + used_depends = true; + } + + const size_t n_chunks = + quotient_ceil(sort_nelems, chunk_size); + + if (needs_copy) { + sycl::accessor temp_acc{temp_buf, cgh, sycl::write_only, + sycl::no_init}; + auto output_acc = GetReadOnlyAccess{}(output, cgh); + cgh.parallel_for( + {iter_nelems * n_chunks}, [=](sycl::id<1> wid) { + auto flat_idx = wid[0]; + auto iter_idx = flat_idx / n_chunks; + auto idx = flat_idx - n_chunks * iter_idx; + + const std::size_t idx_mult = + (idx / chunks_merged) * chunks_merged; + const std::size_t idx_rem = (idx - idx_mult); + const std::size_t start_1 = + sycl::min(2 * idx_mult * chunk_size, sort_nelems); + const std::size_t end_1 = sycl::min( + start_1 + chunks_merged * chunk_size, sort_nelems); + const std::size_t end_2 = sycl::min( + end_1 + chunks_merged * chunk_size, sort_nelems); + const std::size_t offset = chunk_size * idx_rem; + + const std::size_t iter_offset = iter_idx * sort_nelems; + + merge_impl(offset, output_acc, temp_acc, + iter_offset + start_1, iter_offset + end_1, + iter_offset + end_2, iter_offset + start_1, + comp, chunk_size); + }); + } + else { + sycl::accessor temp_acc{temp_buf, cgh, sycl::read_only}; + auto output_acc = GetWriteDiscardAccess{}(output, cgh); + cgh.parallel_for( + {iter_nelems * n_chunks}, [=](sycl::id<1> wid) { + auto flat_idx = wid[0]; + auto iter_idx = flat_idx / n_chunks; + auto idx = flat_idx - n_chunks * iter_idx; + + const std::size_t idx_mult = + (idx / chunks_merged) * chunks_merged; + const std::size_t idx_rem = (idx - idx_mult); + const std::size_t start_1 = + sycl::min(2 * idx_mult * chunk_size, sort_nelems); + const std::size_t end_1 = sycl::min( + start_1 + chunks_merged * chunk_size, sort_nelems); + const std::size_t end_2 = sycl::min( + end_1 + chunks_merged * chunk_size, sort_nelems); + const std::size_t offset = chunk_size * idx_rem; + + const std::size_t iter_offset = iter_idx * sort_nelems; + + merge_impl(offset, temp_acc, output_acc, + iter_offset + start_1, iter_offset + end_1, + iter_offset + end_2, iter_offset + start_1, + comp, chunk_size); + }); + } + }); + + chunks_merged *= 2; + dep_ev = merge_ev; + + if (chunks_merged * chunk_size < sort_nelems) { + needs_copy = !needs_copy; + } + } + + if (needs_copy) { + sycl::event copy_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(dep_ev); + + sycl::accessor temp_acc{temp_buf, cgh, sycl::read_only}; + auto output_acc = GetWriteDiscardAccess{}(output, cgh); + + cgh.copy(temp_acc, output_acc); + }); + dep_ev = copy_ev; + } + + return dep_ev; +} + +} // end of namespace sort_detail + +typedef sycl::event (*sort_contig_fn_ptr_t)(sycl::queue &, + size_t, + size_t, + const char *, + char *, + py::ssize_t, + py::ssize_t, 
+ py::ssize_t, + py::ssize_t, + const std::vector &); + +template > +sycl::event stable_sort_axis1_contig_impl( + sycl::queue &exec_q, + size_t iter_nelems, // number of sub-arrays to sort (num. of rows in a + // matrix when sorting over rows) + size_t sort_nelems, // size of each array to sort (length of rows, i.e. + // number of columns) + const char *arg_cp, + char *res_cp, + py::ssize_t iter_arg_offset, + py::ssize_t iter_res_offset, + py::ssize_t sort_arg_offset, + py::ssize_t sort_res_offset, + const std::vector &depends) +{ + const argTy *arg_tp = reinterpret_cast(arg_cp) + + iter_arg_offset + sort_arg_offset; + argTy *res_tp = + reinterpret_cast(res_cp) + iter_res_offset + sort_res_offset; + + auto comp = Comp{}; + + static constexpr size_t determine_automatically = 0; + size_t sorted_block_size = + (sort_nelems >= 512) ? 512 : determine_automatically; + + // Sort segments of the array +#if 1 + sycl::event base_sort_ev = sort_detail::sort_over_work_group_contig_impl< + const argTy *, argTy *, Comp>( + exec_q, iter_nelems, sort_nelems, arg_tp, res_tp, comp, + sorted_block_size, // modified in place with size of sorted block size + depends); +#else + sycl::event base_sort_ev = + sort_detail::sort_base_step_contig_impl( + exec_q, iter_nelems, sort_nelems, arg_tp, res_tp, comp, + sorted_block_size, // modified in place with size of sorted block + // size + depends); +#endif + + // Merge segments in parallel until all elements are sorted + sycl::event merges_ev = + sort_detail::merge_sorted_block_contig_impl( + exec_q, iter_nelems, sort_nelems, res_tp, comp, sorted_block_size, + {base_sort_ev}); + + return merges_ev; +} + +template +class populate_indexed_data_krn; + +template class index_write_out_krn; + +template struct TupleComp +{ + bool operator()(const pairT &p1, const pairT &p2) const + { + const ValueComp value_comp{}; + return value_comp(std::get<0>(p1), std::get<0>(p2)); + } +}; + +template > +sycl::event stable_argsort_axis1_contig_impl( + sycl::queue &exec_q, + size_t iter_nelems, // number of sub-arrays to sort (num. of rows in a + // matrix when sorting over rows) + size_t sort_nelems, // size of each array to sort (length of rows, i.e. + // number of columns) + const char *arg_cp, + char *res_cp, + py::ssize_t iter_arg_offset, + py::ssize_t iter_res_offset, + py::ssize_t sort_arg_offset, + py::ssize_t sort_res_offset, + const std::vector &depends) +{ + const argTy *arg_tp = reinterpret_cast(arg_cp) + + iter_arg_offset + sort_arg_offset; + IndexTy *res_tp = + reinterpret_cast(res_cp) + iter_res_offset + sort_res_offset; + + using ValueIndexT = std::pair; + const TupleComp tuple_comp{}; + + static constexpr size_t determine_automatically = 0; + size_t sorted_block_size = + (sort_nelems >= 512) ? 
512 : determine_automatically; + + sycl::buffer indexed_data( + sycl::range<1>(iter_nelems * sort_nelems)); + sycl::buffer temp_buf( + sycl::range<1>(iter_nelems * sort_nelems)); + + sycl::event populate_indexed_data_ev = + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + sycl::accessor acc(indexed_data, cgh, sycl::write_only, + sycl::no_init); + + auto const &range = indexed_data.get_range(); + + using KernelName = + populate_indexed_data_krn; + + cgh.parallel_for(range, [=](sycl::id<1> id) { + size_t i = id[0]; + size_t sort_id = i % sort_nelems; + acc[i] = + std::make_pair(arg_tp[i], static_cast(sort_id)); + }); + }); + + // Sort segments of the array + sycl::event base_sort_ev = sort_detail::sort_over_work_group_contig_impl( + exec_q, iter_nelems, sort_nelems, indexed_data, temp_buf, tuple_comp, + sorted_block_size, // modified in place with size of sorted block size + {populate_indexed_data_ev}); + + // Merge segments in parallel until all elements are sorted + sycl::event merges_ev = sort_detail::merge_sorted_block_contig_impl( + exec_q, iter_nelems, sort_nelems, temp_buf, tuple_comp, + sorted_block_size, {base_sort_ev}); + + sycl::event write_out_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(merges_ev); + + auto temp_acc = + sort_detail::GetReadOnlyAccess{}(temp_buf, cgh); + + using KernelName = index_write_out_krn; + + cgh.parallel_for(temp_buf.get_range(), [=](sycl::id<1> id) { + res_tp[id] = std::get<1>(temp_acc[id]); + }); + }); + + return write_out_ev; +} + +} // end of namespace kernels +} // end of namespace tensor +} // end of namespace dpctl diff --git a/dpctl/tensor/libtensor/source/sorting/argsort.cpp b/dpctl/tensor/libtensor/source/sorting/argsort.cpp new file mode 100644 index 0000000000..753d516e7b --- /dev/null +++ b/dpctl/tensor/libtensor/source/sorting/argsort.cpp @@ -0,0 +1,238 @@ + +#include "dpctl4pybind11.hpp" +#include +#include +#include + +#include "utils/math_utils.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/type_dispatch.hpp" + +#include "argsort.hpp" +#include "kernels/sorting.hpp" +#include "sorting_common.hpp" + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +template +std::pair +py_argsort(const dpctl::tensor::usm_ndarray &src, + const int trailing_dims_to_sort, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends, + const sorting_contig_impl_fnT &stable_sort_contig_fns) +{ + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + if (src_nd != dst_nd) { + throw py::value_error("The input and output arrays must have " + "the same array ranks"); + } + int iteration_nd = src_nd - trailing_dims_to_sort; + if (trailing_dims_to_sort <= 0 || iteration_nd < 0) { + throw py::value_error("Trailing_dim_to_sort must be positive, but no " + "greater than rank of the array being sorted"); + } + + const py::ssize_t *src_shape_ptr = src.get_shape_raw(); + const py::ssize_t *dst_shape_ptr = dst.get_shape_raw(); + + bool same_shapes = true; + size_t iter_nelems(1); + + for (int i = 0; same_shapes && (i < iteration_nd); ++i) { + auto src_shape_i = src_shape_ptr[i]; + same_shapes = same_shapes && (src_shape_i == dst_shape_ptr[i]); + iter_nelems *= static_cast(src_shape_i); + } + + size_t sort_nelems(1); + for (int i = iteration_nd; same_shapes && (i < src_nd); ++i) { + auto src_shape_i = src_shape_ptr[i]; + same_shapes = same_shapes && (src_shape_i == dst_shape_ptr[i]); + sort_nelems *= 
static_cast(src_shape_i); + } + + if (!same_shapes) { + throw py::value_error( + "Destination shape does not match the input shape"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + if ((iter_nelems == 0) || (sort_nelems == 0)) { + // Nothing to do + return std::make_pair(sycl::event(), sycl::event()); + } + + // check that dst and src do not overlap + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + throw py::value_error("Arrays index overlapping segments of memory"); + } + + // destination must be ample enough to accommodate all elements + { + auto dst_offsets = dst.get_minmax_offsets(); + size_t range = + static_cast(dst_offsets.second - dst_offsets.first); + if (range + 1 < sort_nelems * iter_nelems) { + throw py::value_error( + "Destination array can not accommodate all the " + "elements of source array."); + } + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + const auto &array_types = td_ns::usm_ndarray_types(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + if ((dst_typeid != static_cast(td_ns::typenum_t::INT64)) && + (dst_typeid != static_cast(td_ns::typenum_t::INT32))) + { + throw py::value_error( + "Output index array must have data type int32 or int64"); + } + + // handle special case when both reduction and iteration are 1D contiguous + bool is_src_c_contig = src.is_c_contiguous(); + bool is_dst_c_contig = dst.is_c_contiguous(); + + if (is_src_c_contig && is_dst_c_contig) { + using dpctl::tensor::kernels::stable_argsort_axis1_contig_impl; + + static constexpr py::ssize_t zero_offset = py::ssize_t(0); + + auto fn = stable_sort_contig_fns[src_typeid][dst_typeid]; + + if (fn == nullptr) { + throw py::value_error("Not implemented for given index type"); + } + + sycl::event comp_ev = + fn(exec_q, iter_nelems, sort_nelems, src.get_data(), dst.get_data(), + zero_offset, zero_offset, zero_offset, zero_offset, depends); + + sycl::event keep_args_alive_ev = + dpctl::utils::keep_args_alive(exec_q, {src, dst}, {comp_ev}); + + return std::make_pair(keep_args_alive_ev, comp_ev); + } + + return std::make_pair(sycl::event(), sycl::event()); +} + +using dpctl::tensor::kernels::sort_contig_fn_ptr_t; +static sort_contig_fn_ptr_t + ascending_argsort_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; +static sort_contig_fn_ptr_t + descending_argsort_contig_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +template +struct AscendingArgSortContigFactory +{ + fnT get() + { + if constexpr (std::is_same_v || + std::is_same_v) + { + using Comp = typename AscendingSorter::type; + + using dpctl::tensor::kernels::stable_argsort_axis1_contig_impl; + return stable_argsort_axis1_contig_impl; + } + else { + return nullptr; + } + } +}; + +template +struct DescendingArgSortContigFactory +{ + fnT get() + { + if constexpr (std::is_same_v || + std::is_same_v) + { + using Comp = typename DescendingSorter::type; + + using dpctl::tensor::kernels::stable_argsort_axis1_contig_impl; + return stable_argsort_axis1_contig_impl; + } + else { + return nullptr; + } + } +}; + +void init_argsort_dispatch_tables(void) +{ + using dpctl::tensor::kernels::sort_contig_fn_ptr_t; + + td_ns::DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(ascending_argsort_contig_dispatch_table); + + td_ns::DispatchTableBuilder< + 
sort_contig_fn_ptr_t, DescendingArgSortContigFactory, td_ns::num_types> + dtb2; + dtb2.populate_dispatch_table(descending_argsort_contig_dispatch_table); +} + +void init_argsort_functions(py::module_ m) +{ + dpctl::tensor::py_internal::init_argsort_dispatch_tables(); + + auto py_argsort_ascending = [](const dpctl::tensor::usm_ndarray &src, + const int trailing_dims_to_sort, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) + -> std::pair { + return dpctl::tensor::py_internal::py_argsort( + src, trailing_dims_to_sort, dst, exec_q, depends, + dpctl::tensor::py_internal:: + ascending_argsort_contig_dispatch_table); + }; + m.def("_argsort_ascending", py_argsort_ascending, py::arg("src"), + py::arg("trailing_dims_to_sort"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto py_argsort_descending = [](const dpctl::tensor::usm_ndarray &src, + const int trailing_dims_to_sort, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) + -> std::pair { + return dpctl::tensor::py_internal::py_argsort( + src, trailing_dims_to_sort, dst, exec_q, depends, + dpctl::tensor::py_internal:: + descending_argsort_contig_dispatch_table); + }; + m.def("_argsort_descending", py_argsort_descending, py::arg("src"), + py::arg("trailing_dims_to_sort"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + return; +} + +} // end of namespace py_internal +} // end of namespace tensor +} // end of namespace dpctl diff --git a/dpctl/tensor/libtensor/source/sorting/argsort.hpp b/dpctl/tensor/libtensor/source/sorting/argsort.hpp new file mode 100644 index 0000000000..70548a492c --- /dev/null +++ b/dpctl/tensor/libtensor/source/sorting/argsort.hpp @@ -0,0 +1,18 @@ +#pragma once + +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_argsort_functions(py::module_); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/sorting/sort.cpp b/dpctl/tensor/libtensor/source/sorting/sort.cpp new file mode 100644 index 0000000000..e163fd1494 --- /dev/null +++ b/dpctl/tensor/libtensor/source/sorting/sort.cpp @@ -0,0 +1,212 @@ + +#include "dpctl4pybind11.hpp" +#include +#include +#include + +#include "utils/math_utils.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/sorting.hpp" +#include "sort.hpp" +#include "sorting_common.hpp" + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +template +std::pair +py_sort(const dpctl::tensor::usm_ndarray &src, + const int trailing_dims_to_sort, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends, + const sorting_contig_impl_fnT &stable_sort_contig_fns) +{ + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + if (src_nd != dst_nd) { + throw py::value_error("The input and output arrays must have " + "the same array ranks"); + } + int iteration_nd = src_nd - trailing_dims_to_sort; + if (trailing_dims_to_sort <= 0 || iteration_nd < 0) { + throw py::value_error("Trailing_dim_to_sort must be positive, but no " + "greater than rank of the array being sorted"); + } + + const py::ssize_t *src_shape_ptr = src.get_shape_raw(); + const py::ssize_t *dst_shape_ptr = dst.get_shape_raw(); + + bool same_shapes = true; + size_t iter_nelems(1); + + for (int i = 0; same_shapes && 
(i < iteration_nd); ++i) { + auto src_shape_i = src_shape_ptr[i]; + same_shapes = same_shapes && (src_shape_i == dst_shape_ptr[i]); + iter_nelems *= static_cast(src_shape_i); + } + + size_t sort_nelems(1); + for (int i = iteration_nd; same_shapes && (i < src_nd); ++i) { + auto src_shape_i = src_shape_ptr[i]; + same_shapes = same_shapes && (src_shape_i == dst_shape_ptr[i]); + sort_nelems *= static_cast(src_shape_i); + } + + if (!same_shapes) { + throw py::value_error( + "Destination shape does not match the input shape"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + if ((iter_nelems == 0) || (sort_nelems == 0)) { + // Nothing to do + return std::make_pair(sycl::event(), sycl::event()); + } + + // check that dst and src do not overlap + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + throw py::value_error("Arrays index overlapping segments of memory"); + } + + // destination must be ample enough to accommodate all elements + { + auto dst_offsets = dst.get_minmax_offsets(); + size_t range = + static_cast(dst_offsets.second - dst_offsets.first); + if (range + 1 < sort_nelems * iter_nelems) { + throw py::value_error( + "Destination array can not accommodate all the " + "elements of source array."); + } + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + const auto &array_types = td_ns::usm_ndarray_types(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + if (src_typeid != dst_typeid) { + throw py::value_error("Both input arrays must have " + "the same value data type"); + } + + // handle special case when both reduction and iteration are 1D contiguous + bool is_src_c_contig = src.is_c_contiguous(); + bool is_dst_c_contig = dst.is_c_contiguous(); + + if (is_src_c_contig && is_dst_c_contig) { + using dpctl::tensor::kernels::stable_sort_axis1_contig_impl; + + static constexpr py::ssize_t zero_offset = py::ssize_t(0); + + auto fn = stable_sort_contig_fns[src_typeid]; + + sycl::event comp_ev = + fn(exec_q, iter_nelems, sort_nelems, src.get_data(), dst.get_data(), + zero_offset, zero_offset, zero_offset, zero_offset, depends); + + sycl::event keep_args_alive_ev = + dpctl::utils::keep_args_alive(exec_q, {src, dst}, {comp_ev}); + + return std::make_pair(keep_args_alive_ev, comp_ev); + } + + return std::make_pair(sycl::event(), sycl::event()); +} + +using dpctl::tensor::kernels::sort_contig_fn_ptr_t; +static sort_contig_fn_ptr_t + ascending_sort_contig_dispatch_vector[td_ns::num_types]; +static sort_contig_fn_ptr_t + descending_sort_contig_dispatch_vector[td_ns::num_types]; + +template struct AscendingSortContigFactory +{ + fnT get() + { + using Comp = typename AscendingSorter::type; + + using dpctl::tensor::kernels::stable_sort_axis1_contig_impl; + return stable_sort_axis1_contig_impl; + } +}; + +template struct DescendingSortContigFactory +{ + fnT get() + { + using Comp = typename DescendingSorter::type; + using dpctl::tensor::kernels::stable_sort_axis1_contig_impl; + return stable_sort_axis1_contig_impl; + } +}; + +void init_sort_dispatch_vectors(void) +{ + using dpctl::tensor::kernels::sort_contig_fn_ptr_t; + + td_ns::DispatchVectorBuilder + dtv1; + dtv1.populate_dispatch_vector(ascending_sort_contig_dispatch_vector); + + td_ns::DispatchVectorBuilder + dtv2; + 
dtv2.populate_dispatch_vector(descending_sort_contig_dispatch_vector); +} + +void init_sort_functions(py::module_ m) +{ + dpctl::tensor::py_internal::init_sort_dispatch_vectors(); + + auto py_sort_ascending = [](const dpctl::tensor::usm_ndarray &src, + const int trailing_dims_to_sort, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) + -> std::pair { + return dpctl::tensor::py_internal::py_sort( + src, trailing_dims_to_sort, dst, exec_q, depends, + dpctl::tensor::py_internal::ascending_sort_contig_dispatch_vector); + }; + m.def("_sort_ascending", py_sort_ascending, py::arg("src"), + py::arg("trailing_dims_to_sort"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto py_sort_descending = [](const dpctl::tensor::usm_ndarray &src, + const int trailing_dims_to_sort, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) + -> std::pair { + return dpctl::tensor::py_internal::py_sort( + src, trailing_dims_to_sort, dst, exec_q, depends, + dpctl::tensor::py_internal::descending_sort_contig_dispatch_vector); + }; + m.def("_sort_descending", py_sort_descending, py::arg("src"), + py::arg("trailing_dims_to_sort"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + return; +} + +} // end of namespace py_internal +} // end of namespace tensor +} // end of namespace dpctl diff --git a/dpctl/tensor/libtensor/source/sorting/sort.hpp b/dpctl/tensor/libtensor/source/sorting/sort.hpp new file mode 100644 index 0000000000..f47f8f4173 --- /dev/null +++ b/dpctl/tensor/libtensor/source/sorting/sort.hpp @@ -0,0 +1,18 @@ +#pragma once + +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_sort_functions(py::module_); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/sorting/sorting_common.hpp b/dpctl/tensor/libtensor/source/sorting/sorting_common.hpp new file mode 100644 index 0000000000..d6e1e4e71a --- /dev/null +++ b/dpctl/tensor/libtensor/source/sorting/sorting_common.hpp @@ -0,0 +1,54 @@ +#pragma once + +#include "utils/math_utils.hpp" + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +template struct ComplexLess +{ + bool operator()(const cT &v1, const cT &v2) const + { + using dpctl::tensor::math_utils::less_complex; + + return less_complex(v1, v2); + } +}; + +template struct ComplexGreater +{ + bool operator()(const cT &v1, const cT &v2) const + { + using dpctl::tensor::math_utils::greater_complex; + + return greater_complex(v1, v2); + } +}; + +template struct AscendingSorter +{ + using type = std::less; +}; + +template struct AscendingSorter> +{ + using type = ComplexLess>; +}; + +template struct DescendingSorter +{ + using type = std::greater; +}; + +template struct DescendingSorter> +{ + using type = ComplexGreater>; +}; + +} // end of namespace py_internal +} // end of namespace tensor +} // end of namespace dpctl diff --git a/dpctl/tensor/libtensor/source/tensor_sorting.cpp b/dpctl/tensor/libtensor/source/tensor_sorting.cpp new file mode 100644 index 0000000000..5d534d85cd --- /dev/null +++ b/dpctl/tensor/libtensor/source/tensor_sorting.cpp @@ -0,0 +1,37 @@ +//===-- tensor_sorting.cpp - -----*-C++-*-/===// +// Implementation of _tensor_reductions_impl module +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the 
"License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#include + +#include "sorting/argsort.hpp" +#include "sorting/sort.hpp" + +namespace py = pybind11; + +PYBIND11_MODULE(_tensor_sorting_impl, m) +{ + dpctl::tensor::py_internal::init_sort_functions(m); + dpctl::tensor::py_internal::init_argsort_functions(m); +} diff --git a/dpctl/tests/test_usm_ndarray_sorting.py b/dpctl/tests/test_usm_ndarray_sorting.py new file mode 100644 index 0000000000..f0b57129d9 --- /dev/null +++ b/dpctl/tests/test_usm_ndarray_sorting.py @@ -0,0 +1,128 @@ +import pytest + +import dpctl.tensor as dpt +from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported + + +@pytest.mark.parametrize( + "dtype", + [ + "i1", + "u1", + "i2", + "u2", + "i4", + "u4", + "i8", + "u8", + "f2", + "f4", + "f8", + "c8", + "c16", + ], +) +def test_sort_1d(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + inp = dpt.roll( + dpt.concat( + (dpt.ones(10000, dtype=dtype), dpt.zeros(10000, dtype=dtype)) + ), + 734, + ) + + s = dpt.sort(inp, descending=False) + assert dpt.all(s[:-1] <= s[1:]) + + s1 = dpt.sort(inp, descending=True) + assert dpt.all(s1[:-1] >= s1[1:]) + + +@pytest.mark.parametrize( + "dtype", + [ + "i1", + "u1", + "i2", + "u2", + "i4", + "u4", + "i8", + "u8", + "f2", + "f4", + "f8", + "c8", + "c16", + ], +) +def test_sort_2d(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + fl = dpt.roll( + dpt.concat( + (dpt.ones(10000, dtype=dtype), dpt.zeros(10000, dtype=dtype)) + ), + 734, + ) + inp = dpt.reshape(fl, (20, -1)) + + s = dpt.sort(inp, axis=1, descending=False) + assert dpt.all(s[:, :-1] <= s[:, 1:]) + + s1 = dpt.sort(inp, axis=1, descending=True) + assert dpt.all(s1[:, :-1] >= s1[:, 1:]) + + +def test_sort_strides(): + + fl = dpt.roll( + dpt.concat((dpt.ones(10000, dtype="i4"), dpt.zeros(10000, dtype="i4"))), + 734, + ) + inp = dpt.reshape(fl, (-1, 20)) + + s = dpt.sort(inp, axis=0, descending=False) + assert dpt.all(s[:-1, :] <= s[1:, :]) + + s1 = dpt.sort(inp, axis=0, descending=True) + assert dpt.all(s1[:-1, :] >= s1[1:, :]) + + +@pytest.mark.parametrize( + "dtype", + [ + "i1", + "u1", + "i2", + "u2", + "i4", + "u4", + "i8", + "u8", + "f2", + "f4", + "f8", + "c8", + "c16", + ], +) +def test_argsort_1d(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + inp = dpt.roll( + dpt.concat( + (dpt.ones(10000, dtype=dtype), dpt.zeros(10000, dtype=dtype)) + ), + 734, + ) + + s_idx = dpt.argsort(inp, descending=False) + assert dpt.all(inp[s_idx[:-1]] <= inp[s_idx[1:]]) + + s1_idx = dpt.argsort(inp, descending=True) + assert dpt.all(inp[s1_idx[:-1]] >= inp[s1_idx[1:]]) From 84437f1938a594dfe92add9264bb02ffa063d034 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Wed, 6 Dec 2023 10:52:44 -0600 Subject: [PATCH 02/34] Added docstrings --- 
dpctl/tensor/_sorting.py | 69 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 68 insertions(+), 1 deletion(-) diff --git a/dpctl/tensor/_sorting.py b/dpctl/tensor/_sorting.py index 8ad496d0a4..ba1590ac32 100644 --- a/dpctl/tensor/_sorting.py +++ b/dpctl/tensor/_sorting.py @@ -1,3 +1,19 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from numpy.core.numeric import normalize_axis_index import dpctl @@ -12,7 +28,32 @@ ) -def sort(x, axis=-1, descending=False, stable=False): +def sort(x, /, *, axis=-1, descending=False, stable=False): + """sort(x, axis=-1, descending=False, stable=False) + + Returns a sorted copy of an input array `x`. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int]): + axis along which to sort. If set to `-1`, the function + must sort along the last axis. Default: `-1`. + descending (Optional[bool]): + sort order. If `True`, the array must be sorted in descending + order (by value). If `False`, the array must be sorted in + ascending order (by value). Default: `False`. + stable (Optional[bool]): + sort stability. If `True`, the returned array must maintain the + relative order of `x` values which compare as equal. If `False`, + the returned array may or may not maintain the relative order of + `x` values which compare as equal. Default: `True`. + + Returns: + usm_ndarray: + a sorted array. The returned array has the same data type and + the same shape as the input array `x`. + """ if not isinstance(x, dpt.usm_ndarray): raise TypeError( f"Expected type dpctl.tensor.usm_ndarray, got {type(x)}" @@ -60,6 +101,32 @@ def sort(x, axis=-1, descending=False, stable=False): def argsort(x, axis=-1, descending=False, stable=False): + """argsort(x, axis=-1, descending=False, stable=False) + + Returns the indices that sort an array `x` along a specified axis. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int]): + axis along which to sort. If set to `-1`, the function + must sort along the last axis. Default: `-1`. + descending (Optional[bool]): + sort order. If `True`, the array must be sorted in descending + order (by value). If `False`, the array must be sorted in + ascending order (by value). Default: `False`. + stable (Optional[bool]): + sort stability. If `True`, the returned array must maintain the + relative order of `x` values which compare as equal. If `False`, + the returned array may or may not maintain the relative order of + `x` values which compare as equal. Default: `True`. + + Returns: + usm_ndarray: + an array of indices. The returned array has the same shape as + the input array `x`. The return array has default array index + data type. 
+ """ if not isinstance(x, dpt.usm_ndarray): raise TypeError( f"Expected type dpctl.tensor.usm_ndarray, got {type(x)}" From 8076adfb38da7b855f34316c4c4bd89c9f37532e Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Wed, 6 Dec 2023 10:53:45 -0600 Subject: [PATCH 03/34] Added license header comments --- .../libtensor/source/sorting/argsort.cpp | 23 ++++++++++++++++++ .../libtensor/source/sorting/argsort.hpp | 24 +++++++++++++++++++ .../tensor/libtensor/source/sorting/sort.cpp | 23 ++++++++++++++++++ .../tensor/libtensor/source/sorting/sort.hpp | 24 +++++++++++++++++++ .../source/sorting/sorting_common.hpp | 24 +++++++++++++++++++ 5 files changed, 118 insertions(+) diff --git a/dpctl/tensor/libtensor/source/sorting/argsort.cpp b/dpctl/tensor/libtensor/source/sorting/argsort.cpp index 753d516e7b..f23084fe21 100644 --- a/dpctl/tensor/libtensor/source/sorting/argsort.cpp +++ b/dpctl/tensor/libtensor/source/sorting/argsort.cpp @@ -1,3 +1,26 @@ +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. +//===--------------------------------------------------------------------===// #include "dpctl4pybind11.hpp" #include diff --git a/dpctl/tensor/libtensor/source/sorting/argsort.hpp b/dpctl/tensor/libtensor/source/sorting/argsort.hpp index 70548a492c..7637f2df4c 100644 --- a/dpctl/tensor/libtensor/source/sorting/argsort.hpp +++ b/dpctl/tensor/libtensor/source/sorting/argsort.hpp @@ -1,3 +1,27 @@ +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. 
+//===--------------------------------------------------------------------===// + #pragma once #include diff --git a/dpctl/tensor/libtensor/source/sorting/sort.cpp b/dpctl/tensor/libtensor/source/sorting/sort.cpp index e163fd1494..8a4a710354 100644 --- a/dpctl/tensor/libtensor/source/sorting/sort.cpp +++ b/dpctl/tensor/libtensor/source/sorting/sort.cpp @@ -1,3 +1,26 @@ +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. +//===--------------------------------------------------------------------===// #include "dpctl4pybind11.hpp" #include diff --git a/dpctl/tensor/libtensor/source/sorting/sort.hpp b/dpctl/tensor/libtensor/source/sorting/sort.hpp index f47f8f4173..2ad5bdb623 100644 --- a/dpctl/tensor/libtensor/source/sorting/sort.hpp +++ b/dpctl/tensor/libtensor/source/sorting/sort.hpp @@ -1,3 +1,27 @@ +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. +//===--------------------------------------------------------------------===// + #pragma once #include diff --git a/dpctl/tensor/libtensor/source/sorting/sorting_common.hpp b/dpctl/tensor/libtensor/source/sorting/sorting_common.hpp index d6e1e4e71a..6e8f273ea5 100644 --- a/dpctl/tensor/libtensor/source/sorting/sorting_common.hpp +++ b/dpctl/tensor/libtensor/source/sorting/sorting_common.hpp @@ -1,3 +1,27 @@ +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. +//===--------------------------------------------------------------------===// + #pragma once #include "utils/math_utils.hpp" From 3cb7b150586ffcd5a56ba691642fb9ccf71a9f95 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Wed, 6 Dec 2023 10:53:59 -0600 Subject: [PATCH 04/34] Corrected variable name used in assertion Removed dead code, added include of cassert, added lincese header comment. --- .../libtensor/include/kernels/sorting.hpp | 143 +++--------------- 1 file changed, 25 insertions(+), 118 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/sorting.hpp b/dpctl/tensor/libtensor/include/kernels/sorting.hpp index cda9ce770d..8b0532918e 100644 --- a/dpctl/tensor/libtensor/include/kernels/sorting.hpp +++ b/dpctl/tensor/libtensor/include/kernels/sorting.hpp @@ -1,7 +1,32 @@ +//=== sorting.hpp - Implementation of sorting kernels ---*-C++-*--/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for tensor sort/argsort operations. +//===----------------------------------------------------------------------===// + #pragma once #include "pybind11/pybind11.h" +#include #include #include #include @@ -537,115 +562,6 @@ sort_base_step_contig_impl(sycl::queue &q, return base_sort; } -template -class exp_sort_over_work_group_contig_krn; - -template -sycl::event exp_sort_over_work_group_contig_impl( - sycl::queue &q, - size_t iter_nelems, - size_t sort_nelems, - const InpAcc input, - OutAcc output, - const Comp &comp, - size_t &conseq_nelems_sorted, - const std::vector &depends = {}) -{ - - using inpT = typename GetValueType::value_type; - using outT = typename GetValueType::value_type; - using KernelName = exp_sort_over_work_group_contig_krn; - - const auto &kernel_id = sycl::get_kernel_id(); - - auto const &ctx = q.get_context(); - auto const &dev = q.get_device(); - auto kb = sycl::get_kernel_bundle( - ctx, {dev}, {kernel_id}); - - auto krn = kb.template get_kernel(kernel_id); - - std::uint32_t max_sg_size = krn.template get_info< - sycl::info::kernel_device_specific::max_sub_group_size>(dev); - - const size_t lws = 4 * max_sg_size; - const std::uint32_t chunk_size = dev.has(sycl::aspect::cpu) ? 
8 : 2; - conseq_nelems_sorted = chunk_size * lws; - - // This assumption permits doing away with using a loop - assert(nelems_wg_sorts % lws == 0); - - sycl::event exp_default_sort_ev = q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - - const size_t n_chunks = - quotient_ceil(sort_nelems, conseq_nelems_sorted); - - sycl::range<1> global_range{iter_nelems * n_chunks * lws}; - sycl::range<1> local_range{lws}; - - sycl::range<1> slm_size_range{conseq_nelems_sorted}; - - using Sorter = sycl::ext::oneapi::experimental::default_sorter; - - // calculate required local memory size - // MUST pass local_range, not integer. - // Have different meanings - const size_t temp_memory_size = Sorter::template memory_required( - sycl::memory_scope::work_group, conseq_nelems_sorted); - - sycl::local_accessor workspace(slm_size_range, cgh); - sycl::local_accessor scratch({temp_memory_size}, cgh); - - cgh.parallel_for( - sycl::nd_range<1>(global_range, local_range), - [=](sycl::nd_item<1> it) { - auto sorter_op = Sorter(sycl::span{ - scratch - .template get_multi_ptr() - .get(), - temp_memory_size}); - - const size_t gr_id = it.get_group_linear_id(); - const size_t iter_id = gr_id / n_chunks; - const size_t sort_chunk_id = gr_id - iter_id * n_chunks; - - const std::uint32_t lid = it.get_local_linear_id(); - - const size_t iter_offset = iter_id * sort_nelems; - const size_t chunk_offset = - sort_chunk_id * conseq_nelems_sorted; - const size_t global_start_offset = iter_offset + chunk_offset; - const size_t workspace_size = - std::min(sort_nelems, - chunk_offset + conseq_nelems_sorted) - - chunk_offset; - for (std::uint32_t i = lid; i < workspace_size; i += lws) { - workspace[i] = input[global_start_offset + i]; - } - sycl::group_barrier(it.get_group()); - - sycl::ext::oneapi::experimental::joint_sort( - it.get_group(), - workspace - .template get_multi_ptr() - .get(), - workspace - .template get_multi_ptr< - sycl::access::decorated::no>() - .get() + - workspace_size, - sorter_op); - - for (std::uint32_t i = lid; i < workspace_size; i += lws) { - output[global_start_offset + i] = workspace[i]; - } - }); - }); - - return exp_default_sort_ev; -} - class vacuous_krn; inline sycl::event tie_events(sycl::queue &q, @@ -847,20 +763,11 @@ sycl::event stable_sort_axis1_contig_impl( (sort_nelems >= 512) ? 
512 : determine_automatically; // Sort segments of the array -#if 1 sycl::event base_sort_ev = sort_detail::sort_over_work_group_contig_impl< const argTy *, argTy *, Comp>( exec_q, iter_nelems, sort_nelems, arg_tp, res_tp, comp, sorted_block_size, // modified in place with size of sorted block size depends); -#else - sycl::event base_sort_ev = - sort_detail::sort_base_step_contig_impl( - exec_q, iter_nelems, sort_nelems, arg_tp, res_tp, comp, - sorted_block_size, // modified in place with size of sorted block - // size - depends); -#endif // Merge segments in parallel until all elements are sorted sycl::event merges_ev = From d79962d10ee68d1e4deb0cb86a1f1b6bdbf0ad79 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Thu, 7 Dec 2023 11:02:12 -0600 Subject: [PATCH 05/34] Ensure that we use no more SLM than is available --- .../libtensor/include/kernels/sorting.hpp | 137 ++++++++++-------- 1 file changed, 80 insertions(+), 57 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/sorting.hpp b/dpctl/tensor/libtensor/include/kernels/sorting.hpp index 8b0532918e..02c9a551f9 100644 --- a/dpctl/tensor/libtensor/include/kernels/sorting.hpp +++ b/dpctl/tensor/libtensor/include/kernels/sorting.hpp @@ -378,6 +378,60 @@ struct GetReadWriteAccess> } }; +template +class sort_base_step_contig_krn; + +template +sycl::event +sort_base_step_contig_impl(sycl::queue &q, + size_t iter_nelems, + size_t sort_nelems, + const InpAcc input, + OutAcc output, + const Comp &comp, + size_t &conseq_nelems_sorted, + const std::vector &depends = {}) +{ + + using inpT = typename GetValueType::value_type; + using outT = typename GetValueType::value_type; + using KernelName = sort_base_step_contig_krn; + + conseq_nelems_sorted = (q.get_device().has(sycl::aspect::cpu) ? 16 : 4); + + size_t n_segments = + quotient_ceil(sort_nelems, conseq_nelems_sorted); + + sycl::event base_sort = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + sycl::range<1> gRange{iter_nelems * n_segments}; + + auto input_acc = GetReadOnlyAccess{}(input, cgh); + auto output_acc = GetWriteDiscardAccess{}(output, cgh); + + cgh.parallel_for(gRange, [=](sycl::id<1> id) { + const size_t iter_id = id[0] / n_segments; + const size_t segment_id = id[0] - iter_id * n_segments; + + const size_t iter_offset = iter_id * sort_nelems; + const size_t beg_id = + iter_offset + segment_id * conseq_nelems_sorted; + const size_t end_id = + iter_offset + + std::min((segment_id + 1) * conseq_nelems_sorted, + sort_nelems); + for (size_t i = beg_id; i < end_id; ++i) { + output_acc[i] = input_acc[i]; + } + + leaf_sort_impl(output_acc, beg_id, end_id, comp); + }); + }); + + return base_sort; +} + template class sort_over_work_group_contig_krn; @@ -393,8 +447,8 @@ sort_over_work_group_contig_impl(sycl::queue &q, const std::vector &depends = {}) { using inpT = typename GetValueType::value_type; - using outT = typename GetValueType::value_type; - using KernelName = sort_over_work_group_contig_krn; + using T = typename GetValueType::value_type; + using KernelName = sort_over_work_group_contig_krn; const auto &kernel_id = sycl::get_kernel_id(); @@ -405,12 +459,30 @@ sort_over_work_group_contig_impl(sycl::queue &q, auto krn = kb.template get_kernel(kernel_id); - std::uint32_t max_sg_size = krn.template get_info< + const std::uint32_t max_sg_size = krn.template get_info< sycl::info::kernel_device_specific::max_sub_group_size>(dev); + const std::uint64_t device_local_memory_size = + dev.get_info(); + + // leave 512 bytes of local memory for RT + const 
std::uint64_t safety_margin = 512; + + const std::uint64_t nelems_per_slm = + (device_local_memory_size - safety_margin) / (2 * sizeof(T)); - const size_t lws = 4 * max_sg_size; - const std::uint32_t chunk_size = dev.has(sycl::aspect::cpu) ? 8 : 2; - nelems_wg_sorts = chunk_size * lws; + constexpr std::uint32_t sub_groups_per_work_group = 4; + const std::uint32_t elems_per_wi = dev.has(sycl::aspect::cpu) ? 8 : 2; + + const size_t lws = sub_groups_per_work_group * max_sg_size; + + nelems_wg_sorts = elems_per_wi * lws; + + if (nelems_wg_sorts > nelems_per_slm) { + nelems_wg_sorts = 0; + return sort_base_step_contig_impl( + q, iter_nelems, sort_nelems, input, output, comp, nelems_wg_sorts, + depends); + } // This assumption permits doing away with using a loop assert(nelems_wg_sorts % lws == 0); @@ -421,11 +493,12 @@ sort_over_work_group_contig_impl(sycl::queue &q, sycl::event base_sort_ev = q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); + cgh.use_kernel_bundle(kb); + sycl::range<1> global_range{iter_nelems * n_segments * lws}; sycl::range<1> local_range{lws}; sycl::range<1> slm_range{nelems_wg_sorts}; - using T = typename GetValueType::value_type; sycl::local_accessor work_space(slm_range, cgh); sycl::local_accessor scratch_space(slm_range, cgh); @@ -512,56 +585,6 @@ sort_over_work_group_contig_impl(sycl::queue &q, return base_sort_ev; } -template -class sort_base_step_contig_krn; - -template -sycl::event -sort_base_step_contig_impl(sycl::queue &q, - size_t iter_nelems, - size_t sort_nelems, - const InpAcc input, - OutAcc output, - const Comp &comp, - size_t &conseq_nelems_sorted, - const std::vector &depends = {}) -{ - - using inpT = typename GetValueType::value_type; - using outT = typename GetValueType::value_type; - using KernelName = sort_base_step_contig_krn; - - conseq_nelems_sorted = (q.get_device().has(sycl::aspect::cpu) ? 16 : 4); - - size_t n_segments = - quotient_ceil(sort_nelems, conseq_nelems_sorted); - - sycl::event base_sort = q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - - sycl::range<1> gRange{iter_nelems * n_segments}; - cgh.parallel_for(gRange, [=](sycl::id<1> id) { - const size_t iter_id = id[0] / n_segments; - const size_t segment_id = id[0] - iter_id * n_segments; - - const size_t iter_offset = iter_id * sort_nelems; - const size_t beg_id = - iter_offset + segment_id * conseq_nelems_sorted; - const size_t end_id = - iter_offset + - std::min((segment_id + 1) * conseq_nelems_sorted, - sort_nelems); - for (size_t i = beg_id; i < end_id; ++i) { - output[i] = input[i]; - } - - leaf_sort_impl(output, beg_id, end_id, comp); - }); - }); - - return base_sort; -} - class vacuous_krn; inline sycl::event tie_events(sycl::queue &q, From aa1df8b7c8eb55c60aea7ff67df0ffc6b4fd62bf Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 8 Dec 2023 09:46:00 -0600 Subject: [PATCH 06/34] Implementation of set functions Implements unique_all, unique_values, unique_counts, unique_inverse based on sort and argsort. Implementation is asynchronous, but can be improved ince searchsorted becomes available. 
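For illustration, a short usage sketch of the new set functions (assuming a SYCL device is
available and that the functions are exposed in the dpctl.tensor namespace, which a later
patch in this series does):

    import dpctl.tensor as dpt

    # an array with repeated values in arbitrary order
    x = dpt.asarray([3, 1, 3, 2, 1, 1], dtype="i4")

    # unique_values returns the sorted distinct values: [1, 2, 3]
    vals = dpt.unique_values(x)

    # unique_counts also reports how many times each value occurs
    vals, counts = dpt.unique_counts(x)   # counts == [3, 1, 2]

    # unique_inverse returns indices such that vals[inv] reconstructs x
    vals, inv = dpt.unique_inverse(x)
    assert dpt.all(vals[inv] == x)

    # unique_all combines the above and adds first-occurrence indices
    r = dpt.unique_all(x)
    assert dpt.all(x[r.indices] == r.values)
    assert dpt.all(r.values[r.inverse_indices] == x)
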
--- dpctl/tensor/_set_functions.py | 547 +++++++++++++++++++++++++++++++++ 1 file changed, 547 insertions(+) create mode 100644 dpctl/tensor/_set_functions.py diff --git a/dpctl/tensor/_set_functions.py b/dpctl/tensor/_set_functions.py new file mode 100644 index 0000000000..c26fbc2848 --- /dev/null +++ b/dpctl/tensor/_set_functions.py @@ -0,0 +1,547 @@ +from typing import NamedTuple + +import dpctl +import dpctl.tensor as dpt + +from ._tensor_elementwise_impl import _not_equal, _subtract +from ._tensor_impl import ( + _copy_usm_ndarray_into_usm_ndarray, + _extract, + _full_usm_ndarray, + _linspace_step, + _take, + default_device_index_type, + mask_positions, +) +from ._tensor_sorting_impl import _argsort_ascending, _sort_ascending + + +class UniqueAllResult(NamedTuple): + values: dpt.usm_ndarray + indices: dpt.usm_ndarray + inverse_indices: dpt.usm_ndarray + counts: dpt.usm_ndarray + + +class UniqueCountsResult(NamedTuple): + values: dpt.usm_ndarray + counts: dpt.usm_ndarray + + +class UniqueInverseResult(NamedTuple): + values: dpt.usm_ndarray + inverse_indices: dpt.usm_ndarray + + +def unique_values(x: dpt.usm_ndarray) -> dpt.usm_ndarray: + """unique_values(x) + + Returns the unique elements of an input array x. + + Args: + x (usm_ndarray): + input array. The input with more than one dimension is flattened. + Returns: + usm_ndarray + an array containing the set of unique elements in `x`. The + returned array has the same data type as `x`. + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") + array_api_dev = x.device + exec_q = array_api_dev.sycl_queue + if x.ndim == 1: + fx = x + else: + fx = dpt.reshape(x, (x.size,), order="C", copy=False) + s = dpt.empty_like(fx, order="C") + host_tasks = [] + if fx.flags.c_contiguous: + ht_ev, sort_ev = _sort_ascending( + src=fx, trailing_dims_to_sort=1, dst=s, sycl_queue=exec_q + ) + host_tasks.append(ht_ev) + else: + tmp = dpt.empty_like(fx, order="C") + ht_ev, copy_ev = _copy_usm_ndarray_into_usm_ndarray( + src=fx, dst=tmp, sycl_queue=exec_q + ) + host_tasks.append(ht_ev) + ht_ev, sort_ev = _sort_ascending( + src=fx, + trailing_dims_to_sort=1, + dst=s, + sycl_queue=exec_q, + depends=[copy_ev], + ) + host_tasks.append(ht_ev) + unique_mask = dpt.empty(fx.shape, dtype="?", sycl_queue=exec_q) + ht_ev, uneq_ev = _not_equal( + src1=s[:-1], + src2=s[1:], + dst=unique_mask[1:], + sycl_queue=exec_q, + depends=[sort_ev], + ) + host_tasks.append(ht_ev) + ht_ev, one_ev = _full_usm_ndarray( + fill_value=True, dst=unique_mask[0], sycl_queue=exec_q + ) + host_tasks.append(ht_ev) + cumsum = dpt.empty(s.shape, dtype=dpt.int64) + # synchronizing call + n_uniques = mask_positions( + unique_mask, cumsum, sycl_queue=exec_q, depends=[one_ev, uneq_ev] + ) + if n_uniques == fx.size: + dpctl.SyclEvent.wait_for(host_tasks) + return s + unique_vals = dpt.empty( + n_uniques, dtype=x.dtype, usm_type=x.usm_type, sycl_queue=exec_q + ) + ht_ev, _ = _extract( + src=s, + cumsum=cumsum, + axis_start=0, + axis_end=1, + dst=unique_vals, + sycl_queue=exec_q, + ) + host_tasks.append(ht_ev) + dpctl.SyclEvent.wait_for(host_tasks) + return unique_vals + + +def unique_counts(x: dpt.usm_ndarray) -> UniqueCountsResult: + """unique_counts(x) + + Returns the unique elements of an input array `x` and the corresponding + counts for each unique element in `x`. + + Args: + x (usm_ndarray): + input array. The input with more than one dimension is flattened. 
+ Returns: + tuple[usm_ndarray, usm_ndarray] + a namedtuple `(values, counts)` whose + + * first element is the field name `values` and is an array + containing the unique elements of `x`. This array has the + same data type as `x`. + * second element has the field name `counts` and is an array + containing the number of times each unique element occurs in `x`. + This array has the same shape as `values` and has the default + array index data type. + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") + array_api_dev = x.device + exec_q = array_api_dev.sycl_queue + if x.ndim == 1: + fx = x + else: + fx = dpt.reshape(x, (x.size,), order="C", copy=False) + s = dpt.empty_like(fx, order="C") + host_tasks = [] + if fx.flags.c_contiguous: + ht_ev, sort_ev = _sort_ascending( + src=fx, trailing_dims_to_sort=1, dst=s, sycl_queue=exec_q + ) + host_tasks.append(ht_ev) + else: + tmp = dpt.empty_like(fx, order="C") + ht_ev, copy_ev = _copy_usm_ndarray_into_usm_ndarray( + src=fx, dst=tmp, sycl_queue=exec_q + ) + host_tasks.append(ht_ev) + ht_ev, sort_ev = _sort_ascending( + src=fx, + dst=s, + trailing_dims_to_sort=1, + sycl_queue=exec_q, + depends=[copy_ev], + ) + host_tasks.append(ht_ev) + unique_mask = dpt.empty(s.shape, dtype="?", sycl_queue=exec_q) + ht_ev, uneq_ev = _not_equal( + src1=s[:-1], + src2=s[1:], + dst=unique_mask[1:], + sycl_queue=exec_q, + depends=[sort_ev], + ) + host_tasks.append(ht_ev) + ht_ev, one_ev = _full_usm_ndarray( + fill_value=True, dst=unique_mask[0], sycl_queue=exec_q + ) + host_tasks.append(ht_ev) + ind_dt = default_device_index_type(exec_q) + cumsum = dpt.empty(unique_mask.shape, dtype=dpt.int64) + # synchronizing call + n_uniques = mask_positions( + unique_mask, cumsum, sycl_queue=exec_q, depends=[one_ev, uneq_ev] + ) + if n_uniques == fx.size: + dpctl.SyclEvent.wait_for(host_tasks) + return UniqueCountsResult(s, dpt.ones(n_uniques, dtype=ind_dt)) + unique_vals = dpt.empty( + n_uniques, dtype=x.dtype, usm_type=x.usm_type, sycl_queue=exec_q + ) + # populate unique values + ht_ev, _ = _extract( + src=s, + cumsum=cumsum, + axis_start=0, + axis_end=1, + dst=unique_vals, + sycl_queue=exec_q, + ) + host_tasks.append(ht_ev) + unique_counts = dpt.empty( + n_uniques + 1, dtype=ind_dt, usm_type=x.usm_type, sycl_queue=exec_q + ) + idx = dpt.empty(x.size, dtype=ind_dt, sycl_queue=exec_q) + ht_ev, id_ev = _linspace_step(start=0, dt=1, dst=idx, sycl_queue=exec_q) + host_tasks.append(ht_ev) + ht_ev, extr_ev = _extract( + src=idx, + cumsum=cumsum, + axis_start=0, + axis_end=1, + dst=unique_counts[:-1], + sycl_queue=exec_q, + depends=[id_ev], + ) + host_tasks.append(ht_ev) + ht_ev, set_ev = _full_usm_ndarray( + x.size, dst=unique_counts[-1], sycl_queue=exec_q + ) + host_tasks.append(ht_ev) + _counts = dpt.empty_like(unique_counts[1:]) + ht_ev, _ = _subtract( + src1=unique_counts[1:], + src2=unique_counts[:-1], + dst=_counts, + sycl_queue=exec_q, + depends=[set_ev, extr_ev], + ) + host_tasks.append(ht_ev) + dpctl.SyclEvent.wait_for(host_tasks) + return UniqueCountsResult(unique_vals, _counts) + + +def unique_inverse(x): + """unique_inverse + + Returns the unique elements of an input array x and the indices from the + set of unique elements that reconstruct `x`. + + Args: + x (usm_ndarray): + input array. The input with more than one dimension is flattened. 
+ Returns: + tuple[usm_ndarray, usm_ndarray] + a namedtuple `(values, inverse_indices)` whose + + * first element has the field name `values` and is an array + containing the unique elements of `x`. The array has the same + data type as `x`. + * second element has the field name `inverse_indices` and is an + array containing the indices of values that reconstruct `x`. + The array has the same shape as `x` and has the default array + index data type. + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") + array_api_dev = x.device + exec_q = array_api_dev.sycl_queue + if x.ndim == 1: + fx = x + else: + fx = dpt.reshape(x, (x.size,), order="C", copy=False) + ind_dt = default_device_index_type(exec_q) + sorting_ids = dpt.empty_like(fx, dtype=ind_dt, order="C") + unsorting_ids = dpt.empty_like(sorting_ids, dtype=ind_dt, order="C") + host_tasks = [] + if fx.flags.c_contiguous: + ht_ev, sort_ev = _argsort_ascending( + src=fx, trailing_dims_to_sort=1, dst=sorting_ids, sycl_queue=exec_q + ) + host_tasks.append(ht_ev) + else: + tmp = dpt.empty_like(fx, order="C") + ht_ev, copy_ev = _copy_usm_ndarray_into_usm_ndarray( + src=fx, dst=tmp, sycl_queue=exec_q + ) + host_tasks.append(ht_ev) + ht_ev, sort_ev = _argsort_ascending( + src=fx, + trailing_dims_to_sort=1, + dst=sorting_ids, + sycl_queue=exec_q, + depends=[copy_ev], + ) + host_tasks.append(ht_ev) + ht_ev, _ = _argsort_ascending( + src=sorting_ids, + trailing_dims_to_sort=1, + dst=unsorting_ids, + sycl_queue=exec_q, + depends=[sort_ev], + ) + host_tasks.append(ht_ev) + s = dpt.empty_like(fx) + # s = fx[sorting_ids] + ht_ev, take_ev = _take( + src=fx, + ind=(sorting_ids,), + dst=s, + axis_start=0, + mode=0, + sycl_queue=exec_q, + depends=[sort_ev], + ) + host_tasks.append(ht_ev) + unique_mask = dpt.empty(fx.shape, dtype="?", sycl_queue=exec_q) + ht_ev, uneq_ev = _not_equal( + src1=s[:-1], + src2=s[1:], + dst=unique_mask[1:], + sycl_queue=exec_q, + depends=[take_ev], + ) + host_tasks.append(ht_ev) + ht_ev, one_ev = _full_usm_ndarray( + fill_value=True, dst=unique_mask[0], sycl_queue=exec_q + ) + host_tasks.append(ht_ev) + cumsum = dpt.empty(unique_mask.shape, dtype=dpt.int64) + # synchronizing call + n_uniques = mask_positions( + unique_mask, cumsum, sycl_queue=exec_q, depends=[uneq_ev, one_ev] + ) + if n_uniques == fx.size: + dpctl.SyclEvent.wait_for(host_tasks) + return UniqueInverseResult(s, unsorting_ids) + unique_vals = dpt.empty( + n_uniques, dtype=x.dtype, usm_type=x.usm_type, sycl_queue=exec_q + ) + ht_ev, _ = _extract( + src=s, + cumsum=cumsum, + axis_start=0, + axis_end=1, + dst=unique_vals, + sycl_queue=exec_q, + ) + host_tasks.append(ht_ev) + cum_unique_counts = dpt.empty( + n_uniques + 1, dtype=ind_dt, usm_type=x.usm_type, sycl_queue=exec_q + ) + idx = dpt.empty(x.size, dtype=ind_dt, sycl_queue=exec_q) + ht_ev, id_ev = _linspace_step(start=0, dt=1, dst=idx, sycl_queue=exec_q) + host_tasks.append(ht_ev) + ht_ev, extr_ev = _extract( + src=idx, + cumsum=cumsum, + axis_start=0, + axis_end=1, + dst=cum_unique_counts[:-1], + sycl_queue=exec_q, + depends=[id_ev], + ) + host_tasks.append(ht_ev) + ht_ev, set_ev = _full_usm_ndarray( + x.size, dst=cum_unique_counts[-1], sycl_queue=exec_q + ) + host_tasks.append(ht_ev) + _counts = dpt.empty_like(cum_unique_counts[1:]) + ht_ev, _ = _subtract( + src1=cum_unique_counts[1:], + src2=cum_unique_counts[:-1], + dst=_counts, + sycl_queue=exec_q, + depends=[set_ev, extr_ev], + ) + host_tasks.append(ht_ev) + # TODO: when searchsorted is available, 
+ # inv = searchsorted(unique_vals, fx) + dpctl.SyclEvent.wait_for(host_tasks) + counts = dpt.asnumpy(_counts).tolist() + inv = dpt.empty_like(fx, dtype=ind_dt) + pos = 0 + host_tasks = [] + for i in range(len(counts)): + pos_next = pos + counts[i] + _dst = inv[pos:pos_next] + ht_ev, _ = _full_usm_ndarray(fill_value=i, dst=_dst, sycl_queue=exec_q) + pos = pos_next + host_tasks.append(ht_ev) + dpctl.SyclEvent.wait_for(host_tasks) + return UniqueInverseResult(unique_vals, inv[unsorting_ids]) + + +def unique_all(x: dpt.usm_ndarray) -> UniqueAllResult: + """unique_all(x) + + Returns the unique elements of an input array `x`, the first occurring + indices for each unique element in `x`, the indices from the set of unique + elements that reconstruct `x`, and the corresponding counts for each + unique element in `x`. + + Args: + x (usm_ndarray): + input array. The input with more than one dimension is flattened. + Returns: + tuple[usm_ndarray, usm_ndarray, usm_ndarray, usm_ndarray] + a namedtuple `(values, indices, inverse_indices, counts)` whose + + * first element has the field name `values` and is an array + containing the unique elements of `x`. The array has the same + data type as `x`. + * second element has the field name `indices` and is an array + the indices (of first occurrences) of `x` that result in + `values`. The array has the same shape as `values` and has the + default array index data type. + * third element has the field name `inverse_indices` and is an + array containing the indices of values that reconstruct `x`. + The array has the same shape as `x` and has the default array + index data type. + * fourth element has the field name `counts` and is an array + containing the number of times each unique element occurs in `x`. + This array has the same shape as `values` and has the default + array index data type. 
+ """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") + array_api_dev = x.device + exec_q = array_api_dev.sycl_queue + if x.ndim == 1: + fx = x + else: + fx = dpt.reshape(x, (x.size,), order="C", copy=False) + ind_dt = default_device_index_type(exec_q) + sorting_ids = dpt.empty_like(fx, dtype=ind_dt, order="C") + unsorting_ids = dpt.empty_like(sorting_ids, dtype=ind_dt, order="C") + host_tasks = [] + if fx.flags.c_contiguous: + ht_ev, sort_ev = _argsort_ascending( + src=fx, trailing_dims_to_sort=1, dst=sorting_ids, sycl_queue=exec_q + ) + host_tasks.append(ht_ev) + else: + tmp = dpt.empty_like(fx, order="C") + ht_ev, copy_ev = _copy_usm_ndarray_into_usm_ndarray( + src=fx, dst=tmp, sycl_queue=exec_q + ) + host_tasks.append(ht_ev) + ht_ev, sort_ev = _argsort_ascending( + src=fx, + trailing_dims_to_sort=1, + dst=sorting_ids, + sycl_queue=exec_q, + depends=[copy_ev], + ) + host_tasks.append(ht_ev) + ht_ev, _ = _argsort_ascending( + src=sorting_ids, + trailing_dims_to_sort=1, + dst=unsorting_ids, + sycl_queue=exec_q, + depends=[sort_ev], + ) + host_tasks.append(ht_ev) + s = dpt.empty_like(fx) + # s = fx[sorting_ids] + ht_ev, take_ev = _take( + src=fx, + ind=(sorting_ids,), + dst=s, + axis_start=0, + mode=0, + sycl_queue=exec_q, + depends=[sort_ev], + ) + host_tasks.append(ht_ev) + unique_mask = dpt.empty(fx.shape, dtype="?", sycl_queue=exec_q) + ht_ev, uneq_ev = _not_equal( + src1=s[:-1], + src2=s[1:], + dst=unique_mask[1:], + sycl_queue=exec_q, + depends=[take_ev], + ) + host_tasks.append(ht_ev) + ht_ev, one_ev = _full_usm_ndarray( + fill_value=True, dst=unique_mask[0], sycl_queue=exec_q + ) + host_tasks.append(ht_ev) + cumsum = dpt.empty(unique_mask.shape, dtype=dpt.int64) + # synchronizing call + n_uniques = mask_positions( + unique_mask, cumsum, sycl_queue=exec_q, depends=[uneq_ev, one_ev] + ) + if n_uniques == fx.size: + dpctl.SyclEvent.wait_for(host_tasks) + return UniqueInverseResult(s, unsorting_ids) + unique_vals = dpt.empty( + n_uniques, dtype=x.dtype, usm_type=x.usm_type, sycl_queue=exec_q + ) + ht_ev, _ = _extract( + src=s, + cumsum=cumsum, + axis_start=0, + axis_end=1, + dst=unique_vals, + sycl_queue=exec_q, + ) + host_tasks.append(ht_ev) + cum_unique_counts = dpt.empty( + n_uniques + 1, dtype=ind_dt, usm_type=x.usm_type, sycl_queue=exec_q + ) + idx = dpt.empty(x.size, dtype=ind_dt, sycl_queue=exec_q) + ht_ev, id_ev = _linspace_step(start=0, dt=1, dst=idx, sycl_queue=exec_q) + host_tasks.append(ht_ev) + ht_ev, extr_ev = _extract( + src=idx, + cumsum=cumsum, + axis_start=0, + axis_end=1, + dst=cum_unique_counts[:-1], + sycl_queue=exec_q, + depends=[id_ev], + ) + host_tasks.append(ht_ev) + ht_ev, set_ev = _full_usm_ndarray( + x.size, dst=cum_unique_counts[-1], sycl_queue=exec_q + ) + host_tasks.append(ht_ev) + _counts = dpt.empty_like(cum_unique_counts[1:]) + ht_ev, _ = _subtract( + src1=cum_unique_counts[1:], + src2=cum_unique_counts[:-1], + dst=_counts, + sycl_queue=exec_q, + depends=[set_ev, extr_ev], + ) + host_tasks.append(ht_ev) + # TODO: when searchsorted is available, + # inv = searchsorted(unique_vals, fx) + dpctl.SyclEvent.wait_for(host_tasks) + counts = dpt.asnumpy(_counts).tolist() + inv = dpt.empty_like(fx, dtype=ind_dt) + pos = 0 + host_tasks = [] + for i in range(len(counts)): + pos_next = pos + counts[i] + _dst = inv[pos:pos_next] + ht_ev, _ = _full_usm_ndarray(fill_value=i, dst=_dst, sycl_queue=exec_q) + pos = pos_next + host_tasks.append(ht_ev) + dpctl.SyclEvent.wait_for(host_tasks) + return 
UniqueAllResult( + unique_vals, + sorting_ids[cum_unique_counts[:-1]], + inv[unsorting_ids], + _counts, + ) From 709ed1edbc7668a7b5b51e770cab3041b48603f0 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 8 Dec 2023 09:51:13 -0600 Subject: [PATCH 07/34] Exposed set functions to dpctl.tensor namespace --- dpctl/tensor/__init__.py | 10 ++++++++++ dpctl/tensor/_set_functions.py | 10 ++++++++++ 2 files changed, 20 insertions(+) diff --git a/dpctl/tensor/__init__.py b/dpctl/tensor/__init__.py index 15be87588e..81fc152e7a 100644 --- a/dpctl/tensor/__init__.py +++ b/dpctl/tensor/__init__.py @@ -175,6 +175,12 @@ reduce_hypot, sum, ) +from ._set_functions import ( + unique_all, + unique_counts, + unique_inverse, + unique_values, +) from ._sorting import argsort, sort from ._testing import allclose from ._type_utils import can_cast, finfo, iinfo, result_type @@ -346,4 +352,8 @@ "angle", "sort", "argsort", + "unique_all", + "unique_counts", + "unique_inverse", + "unique_values", ] diff --git a/dpctl/tensor/_set_functions.py b/dpctl/tensor/_set_functions.py index c26fbc2848..72eec556ae 100644 --- a/dpctl/tensor/_set_functions.py +++ b/dpctl/tensor/_set_functions.py @@ -15,6 +15,16 @@ ) from ._tensor_sorting_impl import _argsort_ascending, _sort_ascending +__all__ = [ + "unique_values", + "unique_counts", + "unique_inverse", + "unique_all", + "UniqueAllResult", + "UniqueCountsResult", + "UniqueInverseResult", +] + class UniqueAllResult(NamedTuple): values: dpt.usm_ndarray From 5635d852109dab3c3164f794b9d7adfc991ad807 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 8 Dec 2023 09:51:37 -0600 Subject: [PATCH 08/34] Added __all__ for _sorting.py --- dpctl/tensor/_sorting.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dpctl/tensor/_sorting.py b/dpctl/tensor/_sorting.py index ba1590ac32..7158d981c1 100644 --- a/dpctl/tensor/_sorting.py +++ b/dpctl/tensor/_sorting.py @@ -27,6 +27,8 @@ _sort_descending, ) +__all__ = ["sort", "argsort"] + def sort(x, /, *, axis=-1, descending=False, stable=False): """sort(x, axis=-1, descending=False, stable=False) From 88ad12f5747ed0d6b2524ab57b00f1dfb98eb432 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 8 Dec 2023 10:02:32 -0600 Subject: [PATCH 09/34] Expand tests for sorting to improve coverage --- dpctl/tests/test_usm_ndarray_sorting.py | 54 +++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/dpctl/tests/test_usm_ndarray_sorting.py b/dpctl/tests/test_usm_ndarray_sorting.py index f0b57129d9..e582ad2c63 100644 --- a/dpctl/tests/test_usm_ndarray_sorting.py +++ b/dpctl/tests/test_usm_ndarray_sorting.py @@ -126,3 +126,57 @@ def test_argsort_1d(dtype): s1_idx = dpt.argsort(inp, descending=True) assert dpt.all(inp[s1_idx[:-1]] >= inp[s1_idx[1:]]) + + +def test_sort_validation(): + with pytest.raises(TypeError): + dpt.sort(dict()) + + +def test_argsort_validation(): + with pytest.raises(TypeError): + dpt.argsort(dict()) + + +def test_sort_axis0(): + get_queue_or_skip() + + n, m = 200, 30 + xf = dpt.arange(n * m, 0, step=-1, dtype="i4") + x = dpt.reshape(xf, (n, m)) + s = dpt.sort(x, axis=0) + + assert dpt.all(s[:-1, :] <= s[1:, :]) + + +def test_argsort_axis0(): + get_queue_or_skip() + + n, m = 200, 30 + xf = dpt.arange(n * m, 0, step=-1, dtype="i4") + x = dpt.reshape(xf, (n, m)) + idx = dpt.argsort(x, axis=0) + + s = x[idx, dpt.arange(m)[dpt.newaxis, :]] + + assert dpt.all(s[:-1, :] <= s[1:, :]) + + +def test_sort_strided(): + get_queue_or_skip() + + x_orig = dpt.arange(100, dtype="i4") + x_flipped = dpt.flip(x_orig, 
axis=0) + s = dpt.sort(x_flipped) + + assert dpt.all(s == x_orig) + + +def test_argsort_strided(): + get_queue_or_skip() + + x_orig = dpt.arange(100, dtype="i4") + x_flipped = dpt.flip(x_orig, axis=0) + idx = dpt.argsort(x_flipped) + + assert dpt.all(x_flipped[idx] == x_orig) From 734cf77abfe57fdbb8ca577f44548ccb5b2001a5 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 8 Dec 2023 10:24:32 -0600 Subject: [PATCH 10/34] Tests for set functions --- dpctl/tests/test_usm_ndarray_unique.py | 153 +++++++++++++++++++++++++ 1 file changed, 153 insertions(+) create mode 100644 dpctl/tests/test_usm_ndarray_unique.py diff --git a/dpctl/tests/test_usm_ndarray_unique.py b/dpctl/tests/test_usm_ndarray_unique.py new file mode 100644 index 0000000000..012a6091cc --- /dev/null +++ b/dpctl/tests/test_usm_ndarray_unique.py @@ -0,0 +1,153 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +import dpctl.tensor as dpt +from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported + + +@pytest.mark.parametrize( + "dtype", + [ + "i1", + "u1", + "i2", + "u2", + "i4", + "u4", + "i8", + "u8", + "f2", + "f4", + "f8", + "c8", + "c16", + ], +) +def test_unique_values(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n, roll = 10000, 734 + inp = dpt.roll( + dpt.concat((dpt.ones(n, dtype=dtype), dpt.zeros(n, dtype=dtype))), + roll, + ) + + uv = dpt.unique_values(inp) + assert dpt.all(uv == dpt.arange(2, dtype=dtype)) + + +@pytest.mark.parametrize( + "dtype", + [ + "i1", + "u1", + "i2", + "u2", + "i4", + "u4", + "i8", + "u8", + "f2", + "f4", + "f8", + "c8", + "c16", + ], +) +def test_unique_counts(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n, roll = 10000, 734 + inp = dpt.roll( + dpt.concat((dpt.ones(n, dtype=dtype), dpt.zeros(n, dtype=dtype))), + roll, + ) + + uv, uv_counts = dpt.unique_counts(inp) + assert dpt.all(uv == dpt.arange(2, dtype=dtype)) + assert dpt.all(uv_counts == dpt.full(2, n, dtype=uv_counts.dtype)) + + +@pytest.mark.parametrize( + "dtype", + [ + "i1", + "u1", + "i2", + "u2", + "i4", + "u4", + "i8", + "u8", + "f2", + "f4", + "f8", + "c8", + "c16", + ], +) +def test_unique_inverse(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n, roll = 10000, 734 + inp = dpt.roll( + dpt.concat((dpt.ones(n, dtype=dtype), dpt.zeros(n, dtype=dtype))), + roll, + ) + + uv, inv = dpt.unique_inverse(inp) + assert dpt.all(uv == dpt.arange(2, dtype=dtype)) + assert dpt.all(inp == uv[inv]) + + +@pytest.mark.parametrize( + "dtype", + [ + "i1", + "u1", + "i2", + "u2", + "i4", + "u4", + "i8", + "u8", + "f2", + "f4", + "f8", + "c8", + "c16", + ], +) +def test_unique_all(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n, roll = 10000, 734 + inp = dpt.roll( + dpt.concat((dpt.ones(n, dtype=dtype), dpt.zeros(n, dtype=dtype))), + roll, + ) + + uv, ind, inv, uv_counts = dpt.unique_all(inp) + assert 
dpt.all(uv == dpt.arange(2, dtype=dtype)) + assert dpt.all(uv == inp[ind]) + assert dpt.all(inp == uv[inv]) + assert dpt.all(uv_counts == dpt.full(2, n, dtype=uv_counts.dtype)) From 11344b16637a23f65dd9c88410d6f88011752e69 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 8 Dec 2023 14:08:22 -0600 Subject: [PATCH 11/34] Added license header for test file --- dpctl/tests/test_usm_ndarray_sorting.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/dpctl/tests/test_usm_ndarray_sorting.py b/dpctl/tests/test_usm_ndarray_sorting.py index e582ad2c63..ef86d3a561 100644 --- a/dpctl/tests/test_usm_ndarray_sorting.py +++ b/dpctl/tests/test_usm_ndarray_sorting.py @@ -1,3 +1,19 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import pytest import dpctl.tensor as dpt From b1d10064afe68a95b1e9c122c89be5268db9f9e3 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 8 Dec 2023 14:23:39 -0600 Subject: [PATCH 12/34] Do not use shorcut API for copy with dependency --- dpctl/tensor/libtensor/include/kernels/accumulators.hpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/accumulators.hpp b/dpctl/tensor/libtensor/include/kernels/accumulators.hpp index bd6ad20b6d..505bb6d8f1 100644 --- a/dpctl/tensor/libtensor/include/kernels/accumulators.hpp +++ b/dpctl/tensor/libtensor/include/kernels/accumulators.hpp @@ -251,8 +251,10 @@ size_t accumulate_contig_impl(sycl::queue &q, if (last_elem_host_usm == nullptr) { throw std::bad_alloc(); } - sycl::event copy_e = - q.copy(last_elem, last_elem_host_usm, 1, {comp_ev}); + sycl::event copy_e = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(comp_ev); + cgh.copy(last_elem, last_elem_host_usm, 1); + }); copy_e.wait(); size_t return_val = static_cast(*last_elem_host_usm); sycl::free(last_elem_host_usm, q); From 34b50af5bb57a9daa750a2124ea17ab7e4989293 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 8 Dec 2023 17:35:44 -0600 Subject: [PATCH 13/34] Implement set functions using synchronizing ops Use synchronizing operation as implementation in dpctl.tensor Fixed tests. 
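The gist of the synchronizing approach can be expressed with public dpctl.tensor
operations alone. Below is a simplified sketch of unique_values under that assumption
(for brevity it assumes a non-empty input; the actual implementation uses the internal
mask_positions and _extract primitives so that it also learns the number of unique
elements, but the logic is the same):

    import dpctl.tensor as dpt

    def unique_values_sketch(x):
        # flatten, then sort so that equal elements become adjacent
        s = dpt.sort(dpt.reshape(x, (x.size,)))
        # an element opens a new group when it differs from its predecessor
        mask = dpt.empty(s.shape, dtype="?", sycl_queue=s.sycl_queue)
        mask[0] = True
        dpt.not_equal(s[:-1], s[1:], out=mask[1:])
        # keep only the group leaders; boolean extraction is synchronizing
        return s[mask]
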
--- dpctl/tensor/__init__.py | 2 +- dpctl/tensor/_set_functions.py | 16 ++ dpctl/tensor/_set_functions_sync.py | 351 ++++++++++++++++++++++++ dpctl/tests/test_usm_ndarray_sorting.py | 3 +- 4 files changed, 370 insertions(+), 2 deletions(-) create mode 100644 dpctl/tensor/_set_functions_sync.py diff --git a/dpctl/tensor/__init__.py b/dpctl/tensor/__init__.py index 81fc152e7a..fb41c1240a 100644 --- a/dpctl/tensor/__init__.py +++ b/dpctl/tensor/__init__.py @@ -175,7 +175,7 @@ reduce_hypot, sum, ) -from ._set_functions import ( +from ._set_functions_sync import ( unique_all, unique_counts, unique_inverse, diff --git a/dpctl/tensor/_set_functions.py b/dpctl/tensor/_set_functions.py index 72eec556ae..631a12a592 100644 --- a/dpctl/tensor/_set_functions.py +++ b/dpctl/tensor/_set_functions.py @@ -1,3 +1,19 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from typing import NamedTuple import dpctl diff --git a/dpctl/tensor/_set_functions_sync.py b/dpctl/tensor/_set_functions_sync.py new file mode 100644 index 0000000000..7f16373c78 --- /dev/null +++ b/dpctl/tensor/_set_functions_sync.py @@ -0,0 +1,351 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import NamedTuple + +import dpctl.tensor as dpt + +from ._tensor_impl import ( + _extract, + _full_usm_ndarray, + _linspace_step, + default_device_index_type, + mask_positions, +) + +__all__ = [ + "unique_values", + "unique_counts", + "unique_inverse", + "unique_all", + "UniqueAllResult", + "UniqueCountsResult", + "UniqueInverseResult", +] + + +class UniqueAllResult(NamedTuple): + values: dpt.usm_ndarray + indices: dpt.usm_ndarray + inverse_indices: dpt.usm_ndarray + counts: dpt.usm_ndarray + + +class UniqueCountsResult(NamedTuple): + values: dpt.usm_ndarray + counts: dpt.usm_ndarray + + +class UniqueInverseResult(NamedTuple): + values: dpt.usm_ndarray + inverse_indices: dpt.usm_ndarray + + +def unique_values(x: dpt.usm_ndarray) -> dpt.usm_ndarray: + """unique_values(x) + + Returns the unique elements of an input array x. + + Args: + x (usm_ndarray): + input array. The input with more than one dimension is flattened. + Returns: + usm_ndarray + an array containing the set of unique elements in `x`. The + returned array has the same data type as `x`. 
+ """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") + array_api_dev = x.device + exec_q = array_api_dev.sycl_queue + if x.ndim == 1: + fx = x + else: + fx = dpt.reshape(x, (x.size,), order="C", copy=False) + s = dpt.sort(fx) + unique_mask = dpt.empty(fx.shape, dtype="?", sycl_queue=exec_q) + dpt.not_equal(s[:-1], s[1:], out=unique_mask[1:]) + unique_mask[0] = True + cumsum = dpt.empty(s.shape, dtype=dpt.int64) + n_uniques = mask_positions(unique_mask, cumsum, sycl_queue=exec_q) + if n_uniques == fx.size: + return s + unique_vals = dpt.empty( + n_uniques, dtype=x.dtype, usm_type=x.usm_type, sycl_queue=exec_q + ) + ht_ev, _ = _extract( + src=s, + cumsum=cumsum, + axis_start=0, + axis_end=1, + dst=unique_vals, + sycl_queue=exec_q, + ) + ht_ev.wait() + return unique_vals + + +def unique_counts(x: dpt.usm_ndarray) -> UniqueCountsResult: + """unique_counts(x) + + Returns the unique elements of an input array `x` and the corresponding + counts for each unique element in `x`. + + Args: + x (usm_ndarray): + input array. The input with more than one dimension is flattened. + Returns: + tuple[usm_ndarray, usm_ndarray] + a namedtuple `(values, counts)` whose + + * first element is the field name `values` and is an array + containing the unique elements of `x`. This array has the + same data type as `x`. + * second element has the field name `counts` and is an array + containing the number of times each unique element occurs in `x`. + This array has the same shape as `values` and has the default + array index data type. + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") + array_api_dev = x.device + exec_q = array_api_dev.sycl_queue + if x.ndim == 1: + fx = x + else: + fx = dpt.reshape(x, (x.size,), order="C", copy=False) + s = dpt.sort(x) + unique_mask = dpt.empty(s.shape, dtype="?", sycl_queue=exec_q) + dpt.not_equal(s[:-1], s[1:], out=unique_mask[1:]) + unique_mask[0] = True + ind_dt = default_device_index_type(exec_q) + cumsum = dpt.empty(unique_mask.shape, dtype=dpt.int64) + # synchronizing call + n_uniques = mask_positions(unique_mask, cumsum, sycl_queue=exec_q) + if n_uniques == fx.size: + return UniqueCountsResult(s, dpt.ones(n_uniques, dtype=ind_dt)) + unique_vals = dpt.empty( + n_uniques, dtype=x.dtype, usm_type=x.usm_type, sycl_queue=exec_q + ) + # populate unique values + ht_ev, _ = _extract( + src=s, + cumsum=cumsum, + axis_start=0, + axis_end=1, + dst=unique_vals, + sycl_queue=exec_q, + ) + ht_ev.wait() + unique_counts = dpt.empty( + n_uniques + 1, dtype=ind_dt, usm_type=x.usm_type, sycl_queue=exec_q + ) + idx = dpt.arange(x.size, dtype=ind_dt, sycl_queue=exec_q) + ht_ev, _ = _extract( + src=idx, + cumsum=cumsum, + axis_start=0, + axis_end=1, + dst=unique_counts[:-1], + sycl_queue=exec_q, + ) + unique_counts[-1] = fx.size + ht_ev.wait() + _counts = dpt.empty_like(unique_counts[1:]) + dpt.subtract(unique_counts[1:], unique_counts[:-1], out=_counts) + return UniqueCountsResult(unique_vals, _counts) + + +def unique_inverse(x): + """unique_inverse + + Returns the unique elements of an input array x and the indices from the + set of unique elements that reconstruct `x`. + + Args: + x (usm_ndarray): + input array. The input with more than one dimension is flattened. + Returns: + tuple[usm_ndarray, usm_ndarray] + a namedtuple `(values, inverse_indices)` whose + + * first element has the field name `values` and is an array + containing the unique elements of `x`. 
The array has the same + data type as `x`. + * second element has the field name `inverse_indices` and is an + array containing the indices of values that reconstruct `x`. + The array has the same shape as `x` and has the default array + index data type. + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") + array_api_dev = x.device + exec_q = array_api_dev.sycl_queue + if x.ndim == 1: + fx = x + else: + fx = dpt.reshape(x, (x.size,), order="C", copy=False) + ind_dt = default_device_index_type(exec_q) + sorting_ids = dpt.argsort(fx) + unsorting_ids = dpt.argsort(sorting_ids) + s = fx[sorting_ids] + unique_mask = dpt.empty(fx.shape, dtype="?", sycl_queue=exec_q) + unique_mask[0] = True + dpt.not_equal(s[:-1], s[1:], out=unique_mask[1:]) + cumsum = dpt.empty(unique_mask.shape, dtype=dpt.int64) + # synchronizing call + n_uniques = mask_positions(unique_mask, cumsum, sycl_queue=exec_q) + if n_uniques == fx.size: + return UniqueInverseResult(s, unsorting_ids) + unique_vals = dpt.empty( + n_uniques, dtype=x.dtype, usm_type=x.usm_type, sycl_queue=exec_q + ) + ht_ev, _ = _extract( + src=s, + cumsum=cumsum, + axis_start=0, + axis_end=1, + dst=unique_vals, + sycl_queue=exec_q, + ) + ht_ev.wait() + cum_unique_counts = dpt.empty( + n_uniques + 1, dtype=ind_dt, usm_type=x.usm_type, sycl_queue=exec_q + ) + idx = dpt.empty(x.size, dtype=ind_dt, sycl_queue=exec_q) + ht_ev, id_ev = _linspace_step(start=0, dt=1, dst=idx, sycl_queue=exec_q) + ht_ev.wait() + ht_ev, _ = _extract( + src=idx, + cumsum=cumsum, + axis_start=0, + axis_end=1, + dst=cum_unique_counts[:-1], + sycl_queue=exec_q, + ) + ht_ev.wait() + cum_unique_counts[-1] = fx.size + _counts = dpt.subtract(cum_unique_counts[1:], cum_unique_counts[:-1]) + # TODO: when searchsorted is available, + # inv = searchsorted(unique_vals, fx) + counts = dpt.asnumpy(_counts).tolist() + inv = dpt.empty_like(fx, dtype=ind_dt) + pos = 0 + for i in range(len(counts)): + pos_next = pos + counts[i] + _dst = inv[pos:pos_next] + ht_ev, _ = _full_usm_ndarray(fill_value=i, dst=_dst, sycl_queue=exec_q) + ht_ev.wait() + pos = pos_next + return UniqueInverseResult(unique_vals, inv[unsorting_ids]) + + +def unique_all(x: dpt.usm_ndarray) -> UniqueAllResult: + """unique_all(x) + + Returns the unique elements of an input array `x`, the first occurring + indices for each unique element in `x`, the indices from the set of unique + elements that reconstruct `x`, and the corresponding counts for each + unique element in `x`. + + Args: + x (usm_ndarray): + input array. The input with more than one dimension is flattened. + Returns: + tuple[usm_ndarray, usm_ndarray, usm_ndarray, usm_ndarray] + a namedtuple `(values, indices, inverse_indices, counts)` whose + + * first element has the field name `values` and is an array + containing the unique elements of `x`. The array has the same + data type as `x`. + * second element has the field name `indices` and is an array + the indices (of first occurrences) of `x` that result in + `values`. The array has the same shape as `values` and has the + default array index data type. + * third element has the field name `inverse_indices` and is an + array containing the indices of values that reconstruct `x`. + The array has the same shape as `x` and has the default array + index data type. + * fourth element has the field name `counts` and is an array + containing the number of times each unique element occurs in `x`. 
+ This array has the same shape as `values` and has the default + array index data type. + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") + array_api_dev = x.device + exec_q = array_api_dev.sycl_queue + if x.ndim == 1: + fx = x + else: + fx = dpt.reshape(x, (x.size,), order="C", copy=False) + ind_dt = default_device_index_type(exec_q) + sorting_ids = dpt.argsort(fx) + unsorting_ids = dpt.argsort(sorting_ids) + s = fx[sorting_ids] + unique_mask = dpt.empty(fx.shape, dtype="?", sycl_queue=exec_q) + dpt.not_equal(s[:-1], s[1:], out=unique_mask[1:]) + unique_mask[0] = True + cumsum = dpt.empty(unique_mask.shape, dtype=dpt.int64) + # synchronizing call + n_uniques = mask_positions(unique_mask, cumsum, sycl_queue=exec_q) + if n_uniques == fx.size: + return UniqueInverseResult(s, unsorting_ids) + unique_vals = dpt.empty( + n_uniques, dtype=x.dtype, usm_type=x.usm_type, sycl_queue=exec_q + ) + ht_ev, _ = _extract( + src=s, + cumsum=cumsum, + axis_start=0, + axis_end=1, + dst=unique_vals, + sycl_queue=exec_q, + ) + ht_ev.wait() + cum_unique_counts = dpt.empty( + n_uniques + 1, dtype=ind_dt, usm_type=x.usm_type, sycl_queue=exec_q + ) + idx = dpt.arange(fx.size, dtype=ind_dt, sycl_queue=exec_q) + ht_ev, extr_ev = _extract( + src=idx, + cumsum=cumsum, + axis_start=0, + axis_end=1, + dst=cum_unique_counts[:-1], + sycl_queue=exec_q, + ) + ht_ev.wait() + cum_unique_counts[-1] = fx.size + _counts = cum_unique_counts[1:] - cum_unique_counts[:-1] + # TODO: when searchsorted is available, + # inv = searchsorted(unique_vals, fx) + counts = dpt.asnumpy(_counts).tolist() + inv = dpt.empty_like(fx, dtype=ind_dt) + pos = 0 + for i in range(len(counts)): + pos_next = pos + counts[i] + _dst = inv[pos:pos_next] + ht_ev, _ = _full_usm_ndarray(fill_value=i, dst=_dst, sycl_queue=exec_q) + ht_ev.wait() + pos = pos_next + return UniqueAllResult( + unique_vals, + sorting_ids[cum_unique_counts[:-1]], + inv[unsorting_ids], + _counts, + ) diff --git a/dpctl/tests/test_usm_ndarray_sorting.py b/dpctl/tests/test_usm_ndarray_sorting.py index ef86d3a561..84cf760a0c 100644 --- a/dpctl/tests/test_usm_ndarray_sorting.py +++ b/dpctl/tests/test_usm_ndarray_sorting.py @@ -173,7 +173,8 @@ def test_argsort_axis0(): x = dpt.reshape(xf, (n, m)) idx = dpt.argsort(x, axis=0) - s = x[idx, dpt.arange(m)[dpt.newaxis, :]] + conseq_idx = dpt.arange(m, dtype=idx.dtype) + s = x[idx, conseq_idx[dpt.newaxis, :]] assert dpt.all(s[:-1, :] <= s[1:, :]) From 1c7efc7b4b63b3066cde7d9ebdc26d95f1f318e3 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 8 Dec 2023 22:35:33 -0600 Subject: [PATCH 14/34] Renamed unused _set_functions.py and added it to omit list for coverage --- dpctl/tensor/{_set_functions.py => _set_functions_debug.py} | 0 pyproject.toml | 2 ++ 2 files changed, 2 insertions(+) rename dpctl/tensor/{_set_functions.py => _set_functions_debug.py} (100%) diff --git a/dpctl/tensor/_set_functions.py b/dpctl/tensor/_set_functions_debug.py similarity index 100% rename from dpctl/tensor/_set_functions.py rename to dpctl/tensor/_set_functions_debug.py diff --git a/pyproject.toml b/pyproject.toml index 5882128dab..074b27257d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,6 +22,7 @@ source = [ omit = [ "dpctl/tests/*", "dpctl/_version.py", + "dpctl/tensor/_set_functions_debug.py", "*/_cython_api*/stringsource", ] @@ -29,6 +30,7 @@ omit = [ omit = [ "dpctl/tests/*", "dpctl/_version.py", + "dpctl/tensor/_set_functions_debug.py", "*/_cython_api*/stringsource", ] From 
fad8470f1cbe1564a0cceef70f89ef97239ae70e Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Sat, 9 Dec 2023 06:48:36 -0600 Subject: [PATCH 15/34] Use sequential sorting for small arrays --- .../libtensor/include/kernels/sorting.hpp | 61 +++++++++++-------- 1 file changed, 37 insertions(+), 24 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/sorting.hpp b/dpctl/tensor/libtensor/include/kernels/sorting.hpp index 02c9a551f9..e7a024259e 100644 --- a/dpctl/tensor/libtensor/include/kernels/sorting.hpp +++ b/dpctl/tensor/libtensor/include/kernels/sorting.hpp @@ -384,12 +384,12 @@ class sort_base_step_contig_krn; template sycl::event sort_base_step_contig_impl(sycl::queue &q, - size_t iter_nelems, - size_t sort_nelems, + const size_t iter_nelems, + const size_t sort_nelems, const InpAcc input, OutAcc output, const Comp &comp, - size_t &conseq_nelems_sorted, + const size_t conseq_nelems_sorted, const std::vector &depends = {}) { @@ -397,15 +397,13 @@ sort_base_step_contig_impl(sycl::queue &q, using outT = typename GetValueType::value_type; using KernelName = sort_base_step_contig_krn; - conseq_nelems_sorted = (q.get_device().has(sycl::aspect::cpu) ? 16 : 4); - - size_t n_segments = + const size_t n_segments = quotient_ceil(sort_nelems, conseq_nelems_sorted); sycl::event base_sort = q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); - sycl::range<1> gRange{iter_nelems * n_segments}; + const sycl::range<1> gRange{iter_nelems * n_segments}; auto input_acc = GetReadOnlyAccess{}(input, cgh); auto output_acc = GetWriteDiscardAccess{}(output, cgh); @@ -478,7 +476,8 @@ sort_over_work_group_contig_impl(sycl::queue &q, nelems_wg_sorts = elems_per_wi * lws; if (nelems_wg_sorts > nelems_per_slm) { - nelems_wg_sorts = 0; + nelems_wg_sorts = (q.get_device().has(sycl::aspect::cpu) ? 16 : 4); + return sort_base_step_contig_impl( q, iter_nelems, sort_nelems, input, output, comp, nelems_wg_sorts, depends); @@ -781,24 +780,38 @@ sycl::event stable_sort_axis1_contig_impl( auto comp = Comp{}; - static constexpr size_t determine_automatically = 0; - size_t sorted_block_size = - (sort_nelems >= 512) ? 
512 : determine_automatically; + constexpr size_t sequential_sorting_threshold = 64; - // Sort segments of the array - sycl::event base_sort_ev = sort_detail::sort_over_work_group_contig_impl< - const argTy *, argTy *, Comp>( - exec_q, iter_nelems, sort_nelems, arg_tp, res_tp, comp, - sorted_block_size, // modified in place with size of sorted block size - depends); - - // Merge segments in parallel until all elements are sorted - sycl::event merges_ev = - sort_detail::merge_sorted_block_contig_impl( - exec_q, iter_nelems, sort_nelems, res_tp, comp, sorted_block_size, - {base_sort_ev}); + if (sort_nelems < sequential_sorting_threshold) { + // equal work-item sorts entire row + sycl::event sequential_sorting_ev = + sort_detail::sort_base_step_contig_impl( + exec_q, iter_nelems, sort_nelems, arg_tp, res_tp, comp, + sort_nelems, depends); - return merges_ev; + return sequential_sorting_ev; + } + else { + size_t sorted_block_size{}; + + // Sort segments of the array + sycl::event base_sort_ev = + sort_detail::sort_over_work_group_contig_impl( + exec_q, iter_nelems, sort_nelems, arg_tp, res_tp, comp, + sorted_block_size, // modified in place with size of sorted + // block size + depends); + + // Merge segments in parallel until all elements are sorted + sycl::event merges_ev = + sort_detail::merge_sorted_block_contig_impl( + exec_q, iter_nelems, sort_nelems, res_tp, comp, + sorted_block_size, {base_sort_ev}); + + return merges_ev; + } } template From fa3d8fec9d704f3ceed4a11839b6e01b4ab066f3 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Mon, 11 Dec 2023 13:46:04 -0600 Subject: [PATCH 16/34] Do not use shorthand queue::copy --- dpctl/tensor/libtensor/include/kernels/accumulators.hpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/accumulators.hpp b/dpctl/tensor/libtensor/include/kernels/accumulators.hpp index 505bb6d8f1..327f1eaca0 100644 --- a/dpctl/tensor/libtensor/include/kernels/accumulators.hpp +++ b/dpctl/tensor/libtensor/include/kernels/accumulators.hpp @@ -339,8 +339,10 @@ size_t accumulate_strided_impl(sycl::queue &q, if (last_elem_host_usm == nullptr) { throw std::bad_alloc(); } - sycl::event copy_e = - q.copy(last_elem, last_elem_host_usm, 1, {comp_ev}); + sycl::event copy_e = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(comp_ev); + cgh.copy(last_elem, last_elem_host_usm, 1); + }); copy_e.wait(); size_t return_val = static_cast(*last_elem_host_usm); sycl::free(last_elem_host_usm, q); From abfc28000786ede7e697c924327cbc3be9571967 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Mon, 11 Dec 2023 13:46:49 -0600 Subject: [PATCH 17/34] Do not use queue::fill convenience shortcut functions --- .../libtensor/include/kernels/boolean_reductions.hpp | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/boolean_reductions.hpp b/dpctl/tensor/libtensor/include/kernels/boolean_reductions.hpp index 61fb0f6ba0..877680c8bf 100644 --- a/dpctl/tensor/libtensor/include/kernels/boolean_reductions.hpp +++ b/dpctl/tensor/libtensor/include/kernels/boolean_reductions.hpp @@ -315,8 +315,10 @@ boolean_reduction_axis1_contig_impl(sycl::queue &exec_q, }); } else { - sycl::event init_ev = exec_q.fill(res_tp, resTy(identity_val), - iter_nelems, depends); + sycl::event init_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.fill(res_tp, resTy(identity_val), iter_nelems); + }); red_ev = exec_q.submit([&](sycl::handler &cgh) { 
cgh.depends_on(init_ev); @@ -484,8 +486,10 @@ boolean_reduction_axis0_contig_impl(sycl::queue &exec_q, size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); { - sycl::event init_ev = exec_q.fill(res_tp, resTy(identity_val), - iter_nelems, depends); + sycl::event init_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.fill(res_tp, resTy(identity_val), iter_nelems); + }); sycl::event red_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(init_ev); From 5e0e5eb2456a6d063f21558223ea62cb6151f11f Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Mon, 11 Dec 2023 14:58:06 -0600 Subject: [PATCH 18/34] Accumulate implementations API acquired host_task_events argument This is `std::vector` passed by reference to collect events associated with host_task submissions. The synchronizing call `mask_positions` is releasing GIL before wait on these events is called. It is either this, or accumulated host tasks must be returned to the user. --- .../include/kernels/accumulators.hpp | 24 ++++++---- .../tensor/libtensor/source/accumulators.cpp | 44 ++++++++++++++----- 2 files changed, 49 insertions(+), 19 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/accumulators.hpp b/dpctl/tensor/libtensor/include/kernels/accumulators.hpp index 327f1eaca0..da461d56b0 100644 --- a/dpctl/tensor/libtensor/include/kernels/accumulators.hpp +++ b/dpctl/tensor/libtensor/include/kernels/accumulators.hpp @@ -112,7 +112,8 @@ sycl::event inclusive_scan_rec(sycl::queue &exec_q, size_t s1, IndexerT indexer, TransformerT transformer, - std::vector const &depends = {}) + std::vector &host_tasks, + const std::vector &depends = {}) { size_t n_groups = ceiling_quotient(n_elems, n_wi * wg_size); @@ -188,7 +189,7 @@ sycl::event inclusive_scan_rec(sycl::queue &exec_q, auto e2 = inclusive_scan_rec( exec_q, n_groups - 1, wg_size, output, temp, chunk_size - 1, - chunk_size, _no_op_indexer, _no_op_transformer, + chunk_size, _no_op_indexer, _no_op_transformer, host_tasks, {inc_scan_phase1_ev}); // output[ chunk_size * (i + 1) + j] += temp[i] @@ -209,8 +210,9 @@ sycl::event inclusive_scan_rec(sycl::queue &exec_q, const auto &ctx = exec_q.get_context(); cgh.host_task([ctx, temp]() { sycl::free(temp, ctx); }); }); + host_tasks.push_back(e4); - out_event = std::move(e4); + out_event = std::move(e3); } return out_event; @@ -221,14 +223,16 @@ typedef size_t (*accumulate_contig_impl_fn_ptr_t)( size_t, const char *, char *, - std::vector const &); + std::vector &, + const std::vector &); template size_t accumulate_contig_impl(sycl::queue &q, size_t n_elems, const char *mask, char *cumsum, - std::vector const &depends = {}) + std::vector &host_tasks, + const std::vector &depends = {}) { constexpr int n_wi = 8; const maskT *mask_data_ptr = reinterpret_cast(mask); @@ -242,7 +246,7 @@ size_t accumulate_contig_impl(sycl::queue &q, inclusive_scan_rec( q, n_elems, wg_size, mask_data_ptr, cumsum_data_ptr, 0, 1, - flat_indexer, non_zero_indicator, depends); + flat_indexer, non_zero_indicator, host_tasks, depends); cumsumT *last_elem = cumsum_data_ptr + (n_elems - 1); @@ -307,7 +311,8 @@ typedef size_t (*accumulate_strided_impl_fn_ptr_t)( int, const py::ssize_t *, char *, - std::vector const &); + std::vector &, + const std::vector &); template size_t accumulate_strided_impl(sycl::queue &q, @@ -316,7 +321,8 @@ size_t accumulate_strided_impl(sycl::queue &q, int nd, const py::ssize_t *shape_strides, char *cumsum, - std::vector const &depends = {}) + std::vector &host_tasks, + const std::vector 
&depends = {}) { constexpr int n_wi = 8; const maskT *mask_data_ptr = reinterpret_cast(mask); @@ -330,7 +336,7 @@ size_t accumulate_strided_impl(sycl::queue &q, inclusive_scan_rec( q, n_elems, wg_size, mask_data_ptr, cumsum_data_ptr, 0, 1, - strided_indexer, non_zero_indicator, depends); + strided_indexer, non_zero_indicator, host_tasks, depends); cumsumT *last_elem = cumsum_data_ptr + (n_elems - 1); diff --git a/dpctl/tensor/libtensor/source/accumulators.cpp b/dpctl/tensor/libtensor/source/accumulators.cpp index 0a2ce69f69..094aa4c284 100644 --- a/dpctl/tensor/libtensor/source/accumulators.cpp +++ b/dpctl/tensor/libtensor/source/accumulators.cpp @@ -150,12 +150,20 @@ size_t py_mask_positions(const dpctl::tensor::usm_ndarray &mask, const bool use_i32 = (cumsum_typeid == int32_typeid); + std::vector host_task_events; + if (mask.is_c_contiguous()) { auto fn = (use_i32) ? mask_positions_contig_i32_dispatch_vector[mask_typeid] : mask_positions_contig_i64_dispatch_vector[mask_typeid]; - return fn(exec_q, mask_size, mask_data, cumsum_data, depends); + size_t total_set = fn(exec_q, mask_size, mask_data, cumsum_data, + host_task_events, depends); + { + py::gil_scoped_release release; + sycl::event::wait(host_task_events); + } + return total_set; } const py::ssize_t *shape = mask.get_shape_raw(); @@ -175,7 +183,6 @@ size_t py_mask_positions(const dpctl::tensor::usm_ndarray &mask, auto strided_fn = (use_i32) ? mask_positions_strided_i32_dispatch_vector[mask_typeid] : mask_positions_strided_i64_dispatch_vector[mask_typeid]; - std::vector host_task_events; using dpctl::tensor::offset_utils::device_allocate_and_pack; const auto &ptr_size_event_tuple = device_allocate_and_pack( @@ -189,7 +196,10 @@ size_t py_mask_positions(const dpctl::tensor::usm_ndarray &mask, if (2 * static_cast(nd) != std::get<1>(ptr_size_event_tuple)) { copy_shape_ev.wait(); - sycl::event::wait(host_task_events); + { + py::gil_scoped_release release; + sycl::event::wait(host_task_events); + } sycl::free(shape_strides, exec_q); throw std::runtime_error("Unexpected error"); } @@ -200,10 +210,14 @@ size_t py_mask_positions(const dpctl::tensor::usm_ndarray &mask, dependent_events.insert(dependent_events.end(), depends.begin(), depends.end()); - size_t total_set = strided_fn(exec_q, mask_size, mask_data, nd, - shape_strides, cumsum_data, dependent_events); + size_t total_set = + strided_fn(exec_q, mask_size, mask_data, nd, shape_strides, cumsum_data, + host_task_events, dependent_events); - sycl::event::wait(host_task_events); + { + py::gil_scoped_release release; + sycl::event::wait(host_task_events); + } sycl::free(shape_strides, exec_q); return total_set; @@ -283,6 +297,8 @@ size_t py_cumsum_1d(const dpctl::tensor::usm_ndarray &src, "Cumulative sum array must have int64 data-type."); } + std::vector host_task_events; + if (src.is_c_contiguous()) { auto fn = cumsum_1d_contig_dispatch_vector[src_typeid]; if (fn == nullptr) { @@ -290,7 +306,13 @@ size_t py_cumsum_1d(const dpctl::tensor::usm_ndarray &src, "this cumsum requires integer type, got src_typeid=" + std::to_string(src_typeid)); } - return fn(exec_q, src_size, src_data, cumsum_data, depends); + size_t total = fn(exec_q, src_size, src_data, cumsum_data, + host_task_events, depends); + { + py::gil_scoped_release release; + sycl::event::wait(host_task_events); + } + return total; } const py::ssize_t *shape = src.get_shape_raw(); @@ -313,7 +335,6 @@ size_t py_cumsum_1d(const dpctl::tensor::usm_ndarray &src, "this cumsum requires integer type, got src_typeid=" + 
std::to_string(src_typeid)); } - std::vector host_task_events; using dpctl::tensor::offset_utils::device_allocate_and_pack; const auto &ptr_size_event_tuple = device_allocate_and_pack( @@ -339,9 +360,12 @@ size_t py_cumsum_1d(const dpctl::tensor::usm_ndarray &src, depends.end()); size_t total = strided_fn(exec_q, src_size, src_data, nd, shape_strides, - cumsum_data, dependent_events); + cumsum_data, host_task_events, dependent_events); - sycl::event::wait(host_task_events); + { + py::gil_scoped_release release; + sycl::event::wait(host_task_events); + } sycl::free(shape_strides, exec_q); return total; From 6abb9fe1592739515e0e8577158c3ee691cef892 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 15 Dec 2023 09:43:24 -0600 Subject: [PATCH 19/34] Changed inclusive sum implementation from recursive to iterative MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Changed hyperparameter choices to be different for CPU and GPU, resulting in 20% performance gain on GPU. The non-recursive implementation allows to avoid repeated USM allocations, resulting in performance gains for large arrays. Furthermore, corrected base step kernel to accumulate in outputT rather than in size_t, which additionally realizes savings when int32 is used as accumulator type. Using example from gh-1249, previously, on my Iris Xe laptop: ``` In [1]: import dpctl.tensor as dpt ...: ag = dpt.ones((8192, 8192), device='gpu', dtype='f4') ...: bg = dpt.ones((8192, 8192), device='gpu', dtype=bool) In [2]: cg = ag[bg] In [3]: dpt.all(cg == dpt.reshape(ag, -1)) Out[3]: usm_ndarray(True) In [4]: %timeit -n 10 -r 3 cg = ag[bg] 212 ms ± 56 ms per loop (mean ± std. dev. of 3 runs, 10 loops each) ``` while with this change: ``` In [4]: %timeit -n 10 -r 3 cg = ag[bg] 178 ms ± 24.2 ms per loop (mean ± std. dev. 
of 3 runs, 10 loops each) ``` --- .../include/kernels/accumulators.hpp | 375 ++++++++++++------ 1 file changed, 258 insertions(+), 117 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/accumulators.hpp b/dpctl/tensor/libtensor/include/kernels/accumulators.hpp index da461d56b0..491fb12126 100644 --- a/dpctl/tensor/libtensor/include/kernels/accumulators.hpp +++ b/dpctl/tensor/libtensor/include/kernels/accumulators.hpp @@ -51,23 +51,6 @@ template T ceiling_quotient(T n, T m) { return (n + m - 1) / m; } -template T1 ceiling_quotient(T1 n, T2 m) -{ - return ceiling_quotient(n, static_cast(m)); -} - -template -class inclusive_scan_rec_local_scan_krn; - -template -class inclusive_scan_rec_chunk_update_krn; template struct NonZeroIndicator { @@ -93,129 +76,261 @@ template struct NoOpTransformer } }; -/* - * for integer type maskT, - * output[j] = sum( input[s0 + i * s1], 0 <= i <= j) - * for 0 <= j < n_elems - */ +// Iterative cumulative summation + +using nwiT = std::uint16_t; + template -sycl::event inclusive_scan_rec(sycl::queue &exec_q, - size_t n_elems, - size_t wg_size, - const inputT *input, - outputT *output, - size_t s0, - size_t s1, - IndexerT indexer, - TransformerT transformer, - std::vector &host_tasks, - const std::vector &depends = {}) +class inclusive_scan_iter_local_scan_krn; + +template +class inclusive_scan_iter_chunk_update_krn; + +template +sycl::event +inclusive_scan_base_step(sycl::queue &exec_q, + const size_t wg_size, + const size_t n_elems, + const inputT *input, + outputT *output, + const size_t s0, + const size_t s1, + IndexerT indexer, + TransformerT transformer, + size_t &n_groups, + const std::vector &depends = {}) { - size_t n_groups = ceiling_quotient(n_elems, n_wi * wg_size); + n_groups = ceiling_quotient(n_elems, n_wi * wg_size); - const sycl::event &inc_scan_phase1_ev = - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); + sycl::event inc_scan_phase1_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); - using slmT = sycl::local_accessor; + using slmT = sycl::local_accessor; - auto lws = sycl::range<1>(wg_size); - auto gws = sycl::range<1>(n_groups * wg_size); + auto lws = sycl::range<1>(wg_size); + auto gws = sycl::range<1>(n_groups * wg_size); - auto ndRange = sycl::nd_range<1>(gws, lws); + auto ndRange = sycl::nd_range<1>(gws, lws); - slmT slm_iscan_tmp(lws, cgh); + slmT slm_iscan_tmp(lws, cgh); - using KernelName = inclusive_scan_rec_local_scan_krn< - inputT, outputT, n_wi, IndexerT, decltype(transformer)>; + using KernelName = + inclusive_scan_iter_local_scan_krn; - cgh.parallel_for(ndRange, [=, slm_iscan_tmp = std::move( - slm_iscan_tmp)]( - sycl::nd_item<1> it) { - auto chunk_gid = it.get_global_id(0); - auto lid = it.get_local_id(0); + cgh.parallel_for(ndRange, [=, slm_iscan_tmp = + std::move(slm_iscan_tmp)]( + sycl::nd_item<1> it) { + auto chunk_gid = it.get_global_id(0); + auto lid = it.get_local_id(0); - std::array local_isum; + std::array local_isum; - size_t i = chunk_gid * n_wi; - for (size_t m_wi = 0; m_wi < n_wi; ++m_wi) { - constexpr outputT out_zero(0); + size_t i = chunk_gid * n_wi; + +#pragma unroll + for (nwiT m_wi = 0; m_wi < n_wi; ++m_wi) { + constexpr outputT out_zero(0); - local_isum[m_wi] = - (i + m_wi < n_elems) - ? transformer(input[indexer(s0 + s1 * (i + m_wi))]) - : out_zero; - } + local_isum[m_wi] = + (i + m_wi < n_elems) + ? 
transformer(input[indexer(s0 + s1 * (i + m_wi))]) + : out_zero; + } #pragma unroll - for (size_t m_wi = 1; m_wi < n_wi; ++m_wi) { - local_isum[m_wi] += local_isum[m_wi - 1]; - } - // local_isum is now result of - // inclusive scan of locally stored inputs + for (nwiT m_wi = 1; m_wi < n_wi; ++m_wi) { + local_isum[m_wi] += local_isum[m_wi - 1]; + } + // local_isum is now result of + // inclusive scan of locally stored inputs - size_t wg_iscan_val = sycl::inclusive_scan_over_group( - it.get_group(), local_isum.back(), sycl::plus(), - size_t(0)); + outputT wg_iscan_val = sycl::inclusive_scan_over_group( + it.get_group(), local_isum.back(), sycl::plus(), + outputT(0)); - slm_iscan_tmp[(lid + 1) % wg_size] = wg_iscan_val; - it.barrier(sycl::access::fence_space::local_space); - size_t addand = (lid == 0) ? 0 : slm_iscan_tmp[lid]; + slm_iscan_tmp[(lid + 1) % wg_size] = wg_iscan_val; + it.barrier(sycl::access::fence_space::local_space); + outputT addand = (lid == 0) ? outputT(0) : slm_iscan_tmp[lid]; #pragma unroll - for (size_t m_wi = 0; m_wi < n_wi; ++m_wi) { - local_isum[m_wi] += addand; - } - - for (size_t m_wi = 0; m_wi < n_wi && i + m_wi < n_elems; ++m_wi) - { - output[i + m_wi] = local_isum[m_wi]; - } - }); + for (nwiT m_wi = 0; m_wi < n_wi; ++m_wi) { + local_isum[m_wi] += addand; + } + + for (nwiT m_wi = 0; (m_wi < n_wi) && (i + m_wi < n_elems); ++m_wi) { + output[i + m_wi] = local_isum[m_wi]; + } }); + }); - sycl::event out_event = inc_scan_phase1_ev; - if (n_groups > 1) { - outputT *temp = sycl::malloc_device(n_groups - 1, exec_q); + return inc_scan_phase1_ev; +} + +namespace +{ +template class stack_t +{ + T *src_; + size_t size_; + T *local_scans_; + +public: + stack_t() : src_{}, size_{}, local_scans_{} {} + stack_t(T *src, size_t sz, T *local_scans) + : src_(src), size_(sz), local_scans_(local_scans) + { + } + ~stack_t(){}; + + T *get_src_ptr() const + { + return src_; + } + + const T *get_src_const_ptr() const + { + return src_; + } + + size_t get_size() const + { + return size_; + } + + T *get_local_scans_ptr() const + { + return local_scans_; + } +}; +} // end of anonymous namespace + +/* + * for integer type maskT, + * output[j] = sum( input[s0 + i * s1], 0 <= i <= j) + * for 0 <= j < n_elems + */ +template +sycl::event inclusive_scan_iter(sycl::queue &exec_q, + const size_t wg_size, + const size_t n_elems, + const inputT *input, + outputT *output, + const size_t s0, + const size_t s1, + IndexerT indexer, + TransformerT transformer, + std::vector &host_tasks, + const std::vector &depends = {}) +{ + size_t n_groups; + sycl::event inc_scan_phase1_ev = + inclusive_scan_base_step( + exec_q, wg_size, n_elems, input, output, s0, s1, indexer, + transformer, n_groups, depends); + sycl::event dependent_event = inc_scan_phase1_ev; + if (n_groups > 1) { auto chunk_size = wg_size * n_wi; + // how much of temporary allocation do we need + size_t n_groups_ = n_groups; + size_t temp_size = 0; + while (n_groups_ > 1) { + const auto this_size = (n_groups_ - 1); + temp_size += this_size; + n_groups_ = ceiling_quotient(this_size, chunk_size); + } + + // allocate + outputT *temp = sycl::malloc_device(temp_size, exec_q); + + if (!temp) { + throw std::bad_alloc(); + } + + std::vector> stack{}; + + // inclusive scans over blocks + n_groups_ = n_groups; + outputT *src = output; + outputT *local_scans = temp; + NoOpIndexer _no_op_indexer{}; - NoOpTransformer _no_op_transformer{}; - auto e2 = inclusive_scan_rec( - exec_q, n_groups - 1, wg_size, output, temp, chunk_size - 1, - chunk_size, _no_op_indexer, 
_no_op_transformer, host_tasks, - {inc_scan_phase1_ev}); - - // output[ chunk_size * (i + 1) + j] += temp[i] - auto e3 = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(e2); - cgh.parallel_for>( - {n_elems}, [=](auto wiid) - { - auto gid = wiid[0]; - auto i = (gid / chunk_size); - output[gid] += (i > 0) ? temp[i - 1] : 0; + using NoOpTransformerT = NoOpTransformer; + NoOpTransformerT _no_op_transformer{}; + size_t size_to_update = n_elems; + while (n_groups_ > 1) { + + size_t src_size = n_groups_ - 1; + dependent_event = + inclusive_scan_base_step( + exec_q, wg_size, src_size, src, local_scans, chunk_size - 1, + chunk_size, _no_op_indexer, _no_op_transformer, + n_groups_, // n_groups_ is modified in place + {dependent_event}); + stack.push_back({src, size_to_update, local_scans}); + src = local_scans; + local_scans += src_size; + size_to_update = src_size; + } + + for (size_t reverse_stack_id = 0; reverse_stack_id < stack.size(); + ++reverse_stack_id) + { + auto stack_id = stack.size() - 1 - reverse_stack_id; + + auto stack_elem = stack[stack_id]; + outputT *src = stack_elem.get_src_ptr(); + size_t src_size = stack_elem.get_size(); + outputT *local_scans = stack_elem.get_local_scans_ptr(); + + // output[ chunk_size * (i + 1) + j] += temp[i] + dependent_event = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(dependent_event); + + using UpdateKernelName = + class inclusive_scan_iter_chunk_update_krn< + inputT, outputT, n_wi, IndexerT, TransformerT, + NoOpIndexer, NoOpTransformerT>; + + cgh.parallel_for( + {src_size}, [chunk_size, src, local_scans](auto wiid) { + auto gid = wiid[0]; + auto i = (gid / chunk_size); + src[gid] += (i > 0) ? local_scans[i - 1] : outputT(0); + }); }); - }); + } sycl::event e4 = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(e3); + cgh.depends_on(dependent_event); const auto &ctx = exec_q.get_context(); cgh.host_task([ctx, temp]() { sycl::free(temp, ctx); }); }); host_tasks.push_back(e4); - - out_event = std::move(e3); } - return out_event; + return dependent_event; } typedef size_t (*accumulate_contig_impl_fn_ptr_t)( @@ -234,20 +349,33 @@ size_t accumulate_contig_impl(sycl::queue &q, std::vector &host_tasks, const std::vector &depends = {}) { - constexpr int n_wi = 8; const maskT *mask_data_ptr = reinterpret_cast(mask); cumsumT *cumsum_data_ptr = reinterpret_cast(cumsum); - size_t wg_size = 128; NoOpIndexer flat_indexer{}; transformerT non_zero_indicator{}; - const sycl::event &comp_ev = - inclusive_scan_rec( - q, n_elems, wg_size, mask_data_ptr, cumsum_data_ptr, 0, 1, - flat_indexer, non_zero_indicator, host_tasks, depends); + sycl::event comp_ev; + const sycl::device &dev = q.get_device(); + if (dev.has(sycl::aspect::cpu)) { + constexpr nwiT n_wi_for_cpu = 8; + size_t wg_size = 256; + comp_ev = inclusive_scan_iter( + q, wg_size, n_elems, mask_data_ptr, cumsum_data_ptr, 0, 1, + flat_indexer, non_zero_indicator, host_tasks, depends); + } + else { + constexpr nwiT n_wi_for_gpu = 4; + size_t wg_size = 256; + comp_ev = inclusive_scan_iter( + q, wg_size, n_elems, mask_data_ptr, cumsum_data_ptr, 0, 1, + flat_indexer, non_zero_indicator, host_tasks, depends); + } cumsumT *last_elem = cumsum_data_ptr + (n_elems - 1); cumsumT *last_elem_host_usm = sycl::malloc_host(1, q); @@ -324,19 +452,32 @@ size_t accumulate_strided_impl(sycl::queue &q, std::vector &host_tasks, const std::vector &depends = {}) { - constexpr int n_wi = 8; const maskT *mask_data_ptr = reinterpret_cast(mask); cumsumT *cumsum_data_ptr = reinterpret_cast(cumsum); - size_t wg_size 
= 128; StridedIndexer strided_indexer{nd, 0, shape_strides}; transformerT non_zero_indicator{}; - const sycl::event &comp_ev = - inclusive_scan_rec( - q, n_elems, wg_size, mask_data_ptr, cumsum_data_ptr, 0, 1, + const sycl::device &dev = q.get_device(); + sycl::event comp_ev; + if (dev.has(sycl::aspect::cpu)) { + constexpr nwiT n_wi_for_cpu = 8; + size_t wg_size = 256; + comp_ev = inclusive_scan_iter( + q, wg_size, n_elems, mask_data_ptr, cumsum_data_ptr, 0, 1, strided_indexer, non_zero_indicator, host_tasks, depends); + } + else { + constexpr nwiT n_wi_for_gpu = 4; + size_t wg_size = 256; + comp_ev = inclusive_scan_iter( + q, wg_size, n_elems, mask_data_ptr, cumsum_data_ptr, 0, 1, + strided_indexer, non_zero_indicator, host_tasks, depends); + } cumsumT *last_elem = cumsum_data_ptr + (n_elems - 1); From b1490968d08594b5acaacf4f3d0054ec8a16abe3 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Mon, 18 Dec 2023 20:48:52 -0600 Subject: [PATCH 20/34] Use async implementation of set functions --- dpctl/tensor/__init__.py | 2 +- .../{_set_functions_debug.py => _set_functions_async.py} | 0 pyproject.toml | 4 ++-- 3 files changed, 3 insertions(+), 3 deletions(-) rename dpctl/tensor/{_set_functions_debug.py => _set_functions_async.py} (100%) diff --git a/dpctl/tensor/__init__.py b/dpctl/tensor/__init__.py index fb41c1240a..5b88c1f33e 100644 --- a/dpctl/tensor/__init__.py +++ b/dpctl/tensor/__init__.py @@ -175,7 +175,7 @@ reduce_hypot, sum, ) -from ._set_functions_sync import ( +from ._set_functions_async import ( unique_all, unique_counts, unique_inverse, diff --git a/dpctl/tensor/_set_functions_debug.py b/dpctl/tensor/_set_functions_async.py similarity index 100% rename from dpctl/tensor/_set_functions_debug.py rename to dpctl/tensor/_set_functions_async.py diff --git a/pyproject.toml b/pyproject.toml index 074b27257d..f791411f6c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,7 +22,7 @@ source = [ omit = [ "dpctl/tests/*", "dpctl/_version.py", - "dpctl/tensor/_set_functions_debug.py", + "dpctl/tensor/_set_functions_sync.py", "*/_cython_api*/stringsource", ] @@ -30,7 +30,7 @@ omit = [ omit = [ "dpctl/tests/*", "dpctl/_version.py", - "dpctl/tensor/_set_functions_debug.py", + "dpctl/tensor/_set_functions_sync.py", "*/_cython_api*/stringsource", ] From eeb30f6988b2b21c56cba8287028bb92b8f7afee Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Tue, 19 Dec 2023 01:36:36 -0800 Subject: [PATCH 21/34] Permit copying when flattening for `unique` funcs As the array will be copied if it isn't C-contiguous regardless, this change does not impact performance and permits calls to set functions with strided, ND data --- dpctl/tensor/_set_functions_async.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dpctl/tensor/_set_functions_async.py b/dpctl/tensor/_set_functions_async.py index 631a12a592..a0ea188634 100644 --- a/dpctl/tensor/_set_functions_async.py +++ b/dpctl/tensor/_set_functions_async.py @@ -79,7 +79,7 @@ def unique_values(x: dpt.usm_ndarray) -> dpt.usm_ndarray: if x.ndim == 1: fx = x else: - fx = dpt.reshape(x, (x.size,), order="C", copy=False) + fx = dpt.reshape(x, (x.size,), order="C") s = dpt.empty_like(fx, order="C") host_tasks = [] if fx.flags.c_contiguous: @@ -166,7 +166,7 @@ def unique_counts(x: dpt.usm_ndarray) -> UniqueCountsResult: if x.ndim == 1: fx = x else: - fx = dpt.reshape(x, (x.size,), order="C", copy=False) + fx = dpt.reshape(x, (x.size,), order="C") s = dpt.empty_like(fx, order="C") host_tasks = [] if fx.flags.c_contiguous: @@ -284,7 +284,7 @@ 
def unique_inverse(x): if x.ndim == 1: fx = x else: - fx = dpt.reshape(x, (x.size,), order="C", copy=False) + fx = dpt.reshape(x, (x.size,), order="C") ind_dt = default_device_index_type(exec_q) sorting_ids = dpt.empty_like(fx, dtype=ind_dt, order="C") unsorting_ids = dpt.empty_like(sorting_ids, dtype=ind_dt, order="C") From 0035558a1aa311b58cea9e94c1fb5ec1a9829614 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Tue, 19 Dec 2023 01:39:17 -0800 Subject: [PATCH 22/34] Fix typos in set functions Set functions were making copies and then sorting the original data, resulting in incorrect results. --- dpctl/tensor/_set_functions_async.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dpctl/tensor/_set_functions_async.py b/dpctl/tensor/_set_functions_async.py index a0ea188634..49679d0c32 100644 --- a/dpctl/tensor/_set_functions_async.py +++ b/dpctl/tensor/_set_functions_async.py @@ -94,7 +94,7 @@ def unique_values(x: dpt.usm_ndarray) -> dpt.usm_ndarray: ) host_tasks.append(ht_ev) ht_ev, sort_ev = _sort_ascending( - src=fx, + src=tmp, trailing_dims_to_sort=1, dst=s, sycl_queue=exec_q, @@ -181,7 +181,7 @@ def unique_counts(x: dpt.usm_ndarray) -> UniqueCountsResult: ) host_tasks.append(ht_ev) ht_ev, sort_ev = _sort_ascending( - src=fx, + src=tmp, dst=s, trailing_dims_to_sort=1, sycl_queue=exec_q, @@ -301,7 +301,7 @@ def unique_inverse(x): ) host_tasks.append(ht_ev) ht_ev, sort_ev = _argsort_ascending( - src=fx, + src=tmp, trailing_dims_to_sort=1, dst=sorting_ids, sycl_queue=exec_q, @@ -462,7 +462,7 @@ def unique_all(x: dpt.usm_ndarray) -> UniqueAllResult: ) host_tasks.append(ht_ev) ht_ev, sort_ev = _argsort_ascending( - src=fx, + src=tmp, trailing_dims_to_sort=1, dst=sorting_ids, sycl_queue=exec_q, From a7e76255345dbda6b26d7708f8e578f094041857 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Tue, 19 Dec 2023 12:17:07 -0800 Subject: [PATCH 23/34] Fixes to unique functions, adds tests `unique_all` would not return a `UniqueAllResult` tuple when early exiting for an array of all unique elements Fixed problems through `unique` functions where the cumulative sum was being allocated on a separate (default) queue, causing inputs with a non-default queue to fail `unique` functions now behave correctly for 0-size array inputs --- dpctl/tensor/_set_functions_async.py | 56 +++++++++++++++------ dpctl/tests/test_usm_ndarray_unique.py | 70 ++++++++++++++++++++++++++ 2 files changed, 112 insertions(+), 14 deletions(-) diff --git a/dpctl/tensor/_set_functions_async.py b/dpctl/tensor/_set_functions_async.py index 49679d0c32..5b1291ea82 100644 --- a/dpctl/tensor/_set_functions_async.py +++ b/dpctl/tensor/_set_functions_async.py @@ -80,6 +80,8 @@ def unique_values(x: dpt.usm_ndarray) -> dpt.usm_ndarray: fx = x else: fx = dpt.reshape(x, (x.size,), order="C") + if fx.size == 0: + return fx s = dpt.empty_like(fx, order="C") host_tasks = [] if fx.flags.c_contiguous: @@ -114,7 +116,7 @@ def unique_values(x: dpt.usm_ndarray) -> dpt.usm_ndarray: fill_value=True, dst=unique_mask[0], sycl_queue=exec_q ) host_tasks.append(ht_ev) - cumsum = dpt.empty(s.shape, dtype=dpt.int64) + cumsum = dpt.empty(s.shape, dtype=dpt.int64, sycl_queue=exec_q) # synchronizing call n_uniques = mask_positions( unique_mask, cumsum, sycl_queue=exec_q, depends=[one_ev, uneq_ev] @@ -163,10 +165,14 @@ def unique_counts(x: dpt.usm_ndarray) -> UniqueCountsResult: raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") array_api_dev = x.device exec_q = array_api_dev.sycl_queue + x_usm_type = x.usm_type if 
x.ndim == 1: fx = x else: fx = dpt.reshape(x, (x.size,), order="C") + ind_dt = default_device_index_type(exec_q) + if fx.size == 0: + return UniqueCountsResult(fx, dpt.empty_like(fx, dtype=ind_dt)) s = dpt.empty_like(fx, order="C") host_tasks = [] if fx.flags.c_contiguous: @@ -201,17 +207,21 @@ def unique_counts(x: dpt.usm_ndarray) -> UniqueCountsResult: fill_value=True, dst=unique_mask[0], sycl_queue=exec_q ) host_tasks.append(ht_ev) - ind_dt = default_device_index_type(exec_q) - cumsum = dpt.empty(unique_mask.shape, dtype=dpt.int64) + cumsum = dpt.empty(unique_mask.shape, dtype=dpt.int64, sycl_queue=exec_q) # synchronizing call n_uniques = mask_positions( unique_mask, cumsum, sycl_queue=exec_q, depends=[one_ev, uneq_ev] ) if n_uniques == fx.size: dpctl.SyclEvent.wait_for(host_tasks) - return UniqueCountsResult(s, dpt.ones(n_uniques, dtype=ind_dt)) + return UniqueCountsResult( + s, + dpt.ones( + n_uniques, dtype=ind_dt, usm_type=x_usm_type, sycl_queue=exec_q + ), + ) unique_vals = dpt.empty( - n_uniques, dtype=x.dtype, usm_type=x.usm_type, sycl_queue=exec_q + n_uniques, dtype=x.dtype, usm_type=x_usm_type, sycl_queue=exec_q ) # populate unique values ht_ev, _ = _extract( @@ -224,7 +234,7 @@ def unique_counts(x: dpt.usm_ndarray) -> UniqueCountsResult: ) host_tasks.append(ht_ev) unique_counts = dpt.empty( - n_uniques + 1, dtype=ind_dt, usm_type=x.usm_type, sycl_queue=exec_q + n_uniques + 1, dtype=ind_dt, usm_type=x_usm_type, sycl_queue=exec_q ) idx = dpt.empty(x.size, dtype=ind_dt, sycl_queue=exec_q) ht_ev, id_ev = _linspace_step(start=0, dt=1, dst=idx, sycl_queue=exec_q) @@ -281,6 +291,7 @@ def unique_inverse(x): raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") array_api_dev = x.device exec_q = array_api_dev.sycl_queue + x_usm_type = x.usm_type if x.ndim == 1: fx = x else: @@ -288,6 +299,8 @@ def unique_inverse(x): ind_dt = default_device_index_type(exec_q) sorting_ids = dpt.empty_like(fx, dtype=ind_dt, order="C") unsorting_ids = dpt.empty_like(sorting_ids, dtype=ind_dt, order="C") + if fx.size == 0: + return UniqueInverseResult(fx, unsorting_ids) host_tasks = [] if fx.flags.c_contiguous: ht_ev, sort_ev = _argsort_ascending( @@ -341,7 +354,7 @@ def unique_inverse(x): fill_value=True, dst=unique_mask[0], sycl_queue=exec_q ) host_tasks.append(ht_ev) - cumsum = dpt.empty(unique_mask.shape, dtype=dpt.int64) + cumsum = dpt.empty(unique_mask.shape, dtype=dpt.int64, sycl_queue=exec_q) # synchronizing call n_uniques = mask_positions( unique_mask, cumsum, sycl_queue=exec_q, depends=[uneq_ev, one_ev] @@ -350,7 +363,7 @@ def unique_inverse(x): dpctl.SyclEvent.wait_for(host_tasks) return UniqueInverseResult(s, unsorting_ids) unique_vals = dpt.empty( - n_uniques, dtype=x.dtype, usm_type=x.usm_type, sycl_queue=exec_q + n_uniques, dtype=x.dtype, usm_type=x_usm_type, sycl_queue=exec_q ) ht_ev, _ = _extract( src=s, @@ -362,7 +375,7 @@ def unique_inverse(x): ) host_tasks.append(ht_ev) cum_unique_counts = dpt.empty( - n_uniques + 1, dtype=ind_dt, usm_type=x.usm_type, sycl_queue=exec_q + n_uniques + 1, dtype=ind_dt, usm_type=x_usm_type, sycl_queue=exec_q ) idx = dpt.empty(x.size, dtype=ind_dt, sycl_queue=exec_q) ht_ev, id_ev = _linspace_step(start=0, dt=1, dst=idx, sycl_queue=exec_q) @@ -442,13 +455,20 @@ def unique_all(x: dpt.usm_ndarray) -> UniqueAllResult: raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") array_api_dev = x.device exec_q = array_api_dev.sycl_queue + x_usm_type = x.usm_type if x.ndim == 1: fx = x else: - fx = dpt.reshape(x, (x.size,), order="C", 
copy=False) + fx = dpt.reshape(x, (x.size,), order="C") ind_dt = default_device_index_type(exec_q) sorting_ids = dpt.empty_like(fx, dtype=ind_dt, order="C") unsorting_ids = dpt.empty_like(sorting_ids, dtype=ind_dt, order="C") + if fx.size == 0: + # original array contains no data + # so it can be safely returned as values + return UniqueAllResult( + fx, sorting_ids, unsorting_ids, dpt.empty_like(fx, dtype=ind_dt) + ) host_tasks = [] if fx.flags.c_contiguous: ht_ev, sort_ev = _argsort_ascending( @@ -502,16 +522,24 @@ def unique_all(x: dpt.usm_ndarray) -> UniqueAllResult: fill_value=True, dst=unique_mask[0], sycl_queue=exec_q ) host_tasks.append(ht_ev) - cumsum = dpt.empty(unique_mask.shape, dtype=dpt.int64) + cumsum = dpt.empty(unique_mask.shape, dtype=dpt.int64, sycl_queue=exec_q) # synchronizing call n_uniques = mask_positions( unique_mask, cumsum, sycl_queue=exec_q, depends=[uneq_ev, one_ev] ) if n_uniques == fx.size: dpctl.SyclEvent.wait_for(host_tasks) - return UniqueInverseResult(s, unsorting_ids) + _counts = dpt.ones( + n_uniques, dtype=ind_dt, usm_type=x_usm_type, sycl_queue=exec_q + ) + return UniqueAllResult( + s, + sorting_ids, + unsorting_ids, + _counts, + ) unique_vals = dpt.empty( - n_uniques, dtype=x.dtype, usm_type=x.usm_type, sycl_queue=exec_q + n_uniques, dtype=x.dtype, usm_type=x_usm_type, sycl_queue=exec_q ) ht_ev, _ = _extract( src=s, @@ -523,7 +551,7 @@ def unique_all(x: dpt.usm_ndarray) -> UniqueAllResult: ) host_tasks.append(ht_ev) cum_unique_counts = dpt.empty( - n_uniques + 1, dtype=ind_dt, usm_type=x.usm_type, sycl_queue=exec_q + n_uniques + 1, dtype=ind_dt, usm_type=x_usm_type, sycl_queue=exec_q ) idx = dpt.empty(x.size, dtype=ind_dt, sycl_queue=exec_q) ht_ev, id_ev = _linspace_step(start=0, dt=1, dst=idx, sycl_queue=exec_q) diff --git a/dpctl/tests/test_usm_ndarray_unique.py b/dpctl/tests/test_usm_ndarray_unique.py index 012a6091cc..4a0af6361f 100644 --- a/dpctl/tests/test_usm_ndarray_unique.py +++ b/dpctl/tests/test_usm_ndarray_unique.py @@ -16,6 +16,7 @@ import pytest +import dpctl import dpctl.tensor as dpt from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported @@ -151,3 +152,72 @@ def test_unique_all(dtype): assert dpt.all(uv == inp[ind]) assert dpt.all(inp == uv[inv]) assert dpt.all(uv_counts == dpt.full(2, n, dtype=uv_counts.dtype)) + + +def test_set_functions_empty_input(): + get_queue_or_skip() + x = dpt.ones((10, 0, 1), dtype="i4") + + res = dpt.unique_values(x) + assert isinstance(res, dpctl.tensor.usm_ndarray) + assert res.size == 0 + assert res.dtype == x.dtype + + res = dpt.unique_inverse(x) + assert type(res).__name__ == "UniqueInverseResult" + uv, inv = res + assert isinstance(uv, dpctl.tensor.usm_ndarray) + assert uv.size == 0 + assert isinstance(inv, dpctl.tensor.usm_ndarray) + assert inv.size == 0 + + res = dpt.unique_counts(x) + assert type(res).__name__ == "UniqueCountsResult" + uv, uv_counts = res + assert isinstance(uv, dpctl.tensor.usm_ndarray) + assert uv.size == 0 + assert isinstance(uv_counts, dpctl.tensor.usm_ndarray) + assert uv_counts.size == 0 + + res = dpt.unique_all(x) + assert type(res).__name__ == "UniqueAllResult" + uv, ind, inv, uv_counts = res + assert isinstance(uv, dpctl.tensor.usm_ndarray) + assert uv.size == 0 + assert isinstance(ind, dpctl.tensor.usm_ndarray) + assert ind.size == 0 + assert isinstance(inv, dpctl.tensor.usm_ndarray) + assert inv.size == 0 + assert isinstance(uv_counts, dpctl.tensor.usm_ndarray) + assert uv_counts.size == 0 + + +def test_set_function_outputs(): + get_queue_or_skip() 
+ # check standard and early exit paths + x1 = dpt.arange(10, dtype="i4") + x2 = dpt.ones((10, 10), dtype="i4") + + assert isinstance(dpt.unique_values(x1), dpctl.tensor.usm_ndarray) + assert isinstance(dpt.unique_values(x2), dpctl.tensor.usm_ndarray) + + assert type(dpt.unique_inverse(x1)).__name__ == "UniqueInverseResult" + assert type(dpt.unique_inverse(x2)).__name__ == "UniqueInverseResult" + + assert type(dpt.unique_counts(x1)).__name__ == "UniqueCountsResult" + assert type(dpt.unique_counts(x2)).__name__ == "UniqueCountsResult" + + assert type(dpt.unique_all(x1)).__name__ == "UniqueAllResult" + assert type(dpt.unique_all(x2)).__name__ == "UniqueAllResult" + + +def test_set_functions_compute_follows_data(): + # tests that all intermediate calls and allocations + # are compatible with an input with an arbitrary queue + q = dpctl.SyclQueue() + x = dpt.arange(10, dtype="i4", sycl_queue=q) + + assert isinstance(dpt.unique_values(x), dpctl.tensor.usm_ndarray) + assert dpt.unique_counts(x) + assert dpt.unique_inverse(x) + assert dpt.unique_all(x) From 2fe8620f2282c9ff39091e5ecca8bf290b0a54d2 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Wed, 20 Dec 2023 14:21:40 -0800 Subject: [PATCH 24/34] Sorting, unique functions properly handle 0D arrays --- dpctl/tensor/_set_functions_async.py | 23 +++++++++++++++++++---- dpctl/tensor/_sorting.py | 14 ++++++++++++-- 2 files changed, 31 insertions(+), 6 deletions(-) diff --git a/dpctl/tensor/_set_functions_async.py b/dpctl/tensor/_set_functions_async.py index 5b1291ea82..58c1d4ea71 100644 --- a/dpctl/tensor/_set_functions_async.py +++ b/dpctl/tensor/_set_functions_async.py @@ -292,11 +292,16 @@ def unique_inverse(x): array_api_dev = x.device exec_q = array_api_dev.sycl_queue x_usm_type = x.usm_type - if x.ndim == 1: + ind_dt = default_device_index_type(exec_q) + if x.ndim == 0: + return UniqueInverseResult( + dpt.reshape(x, (1,), order="C", copy=True), + dpt.zeros_like(x, ind_dt, usm_type=x_usm_type, sycl_queue=exec_q), + ) + elif x.ndim == 1: fx = x else: fx = dpt.reshape(x, (x.size,), order="C") - ind_dt = default_device_index_type(exec_q) sorting_ids = dpt.empty_like(fx, dtype=ind_dt, order="C") unsorting_ids = dpt.empty_like(sorting_ids, dtype=ind_dt, order="C") if fx.size == 0: @@ -456,11 +461,21 @@ def unique_all(x: dpt.usm_ndarray) -> UniqueAllResult: array_api_dev = x.device exec_q = array_api_dev.sycl_queue x_usm_type = x.usm_type - if x.ndim == 1: + ind_dt = default_device_index_type(exec_q) + if x.ndim == 0: + uv = dpt.reshape(x, (1,), order="C", copy=True) + return UniqueAllResult( + uv, + dpt.zeros_like(uv, ind_dt, usm_type=x_usm_type, sycl_queue=exec_q), + dpt.zeros_like(x, ind_dt, usm_type=x_usm_type, sycl_queue=exec_q), + dpt.ones_like( + uv, dtype=ind_dt, usm_type=x_usm_type, sycl_queue=exec_q + ), + ) + elif x.ndim == 1: fx = x else: fx = dpt.reshape(x, (x.size,), order="C") - ind_dt = default_device_index_type(exec_q) sorting_ids = dpt.empty_like(fx, dtype=ind_dt, order="C") unsorting_ids = dpt.empty_like(sorting_ids, dtype=ind_dt, order="C") if fx.size == 0: diff --git a/dpctl/tensor/_sorting.py b/dpctl/tensor/_sorting.py index 7158d981c1..cc46da9c97 100644 --- a/dpctl/tensor/_sorting.py +++ b/dpctl/tensor/_sorting.py @@ -61,7 +61,11 @@ def sort(x, /, *, axis=-1, descending=False, stable=False): f"Expected type dpctl.tensor.usm_ndarray, got {type(x)}" ) nd = x.ndim - axis = normalize_axis_index(axis, ndim=nd, msg_prefix="axis") + if nd == 0: + axis = normalize_axis_index(axis, ndim=1, msg_prefix="axis") + return dpt.copy(x, 
order="C") + else: + axis = normalize_axis_index(axis, ndim=nd, msg_prefix="axis") a1 = axis + 1 if a1 == nd: perm = list(range(nd)) @@ -134,7 +138,13 @@ def argsort(x, axis=-1, descending=False, stable=False): f"Expected type dpctl.tensor.usm_ndarray, got {type(x)}" ) nd = x.ndim - axis = normalize_axis_index(axis, ndim=nd, msg_prefix="axis") + if nd == 0: + axis = normalize_axis_index(axis, ndim=1, msg_prefix="axis") + return dpt.zeros_like( + x, dtype=ti.default_device_index_type(x.sycl_queue), order="C" + ) + else: + axis = normalize_axis_index(axis, ndim=nd, msg_prefix="axis") a1 = axis + 1 if a1 == nd: perm = list(range(nd)) From 8a6581adc350e2021f855ee7517dfbe765cf902c Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Wed, 20 Dec 2023 14:45:26 -0800 Subject: [PATCH 25/34] `unique_all` and `unique_inverse` inverse indices shape fixed Were previously returning a 1D array of indices rather than an array with the same shape as input `x` --- dpctl/tensor/_set_functions_async.py | 36 ++++++++++---------------- dpctl/tests/test_usm_ndarray_unique.py | 2 ++ 2 files changed, 15 insertions(+), 23 deletions(-) diff --git a/dpctl/tensor/_set_functions_async.py b/dpctl/tensor/_set_functions_async.py index 58c1d4ea71..f18d64d715 100644 --- a/dpctl/tensor/_set_functions_async.py +++ b/dpctl/tensor/_set_functions_async.py @@ -293,19 +293,14 @@ def unique_inverse(x): exec_q = array_api_dev.sycl_queue x_usm_type = x.usm_type ind_dt = default_device_index_type(exec_q) - if x.ndim == 0: - return UniqueInverseResult( - dpt.reshape(x, (1,), order="C", copy=True), - dpt.zeros_like(x, ind_dt, usm_type=x_usm_type, sycl_queue=exec_q), - ) - elif x.ndim == 1: + if x.ndim == 1: fx = x else: fx = dpt.reshape(x, (x.size,), order="C") sorting_ids = dpt.empty_like(fx, dtype=ind_dt, order="C") unsorting_ids = dpt.empty_like(sorting_ids, dtype=ind_dt, order="C") if fx.size == 0: - return UniqueInverseResult(fx, unsorting_ids) + return UniqueInverseResult(fx, dpt.reshape(unsorting_ids, x.shape)) host_tasks = [] if fx.flags.c_contiguous: ht_ev, sort_ev = _argsort_ascending( @@ -366,7 +361,7 @@ def unique_inverse(x): ) if n_uniques == fx.size: dpctl.SyclEvent.wait_for(host_tasks) - return UniqueInverseResult(s, unsorting_ids) + return UniqueInverseResult(s, dpt.reshape(unsorting_ids, x.shape)) unique_vals = dpt.empty( n_uniques, dtype=x.dtype, usm_type=x_usm_type, sycl_queue=exec_q ) @@ -422,7 +417,9 @@ def unique_inverse(x): pos = pos_next host_tasks.append(ht_ev) dpctl.SyclEvent.wait_for(host_tasks) - return UniqueInverseResult(unique_vals, inv[unsorting_ids]) + return UniqueInverseResult( + unique_vals, dpt.reshape(inv[unsorting_ids], x.shape) + ) def unique_all(x: dpt.usm_ndarray) -> UniqueAllResult: @@ -462,17 +459,7 @@ def unique_all(x: dpt.usm_ndarray) -> UniqueAllResult: exec_q = array_api_dev.sycl_queue x_usm_type = x.usm_type ind_dt = default_device_index_type(exec_q) - if x.ndim == 0: - uv = dpt.reshape(x, (1,), order="C", copy=True) - return UniqueAllResult( - uv, - dpt.zeros_like(uv, ind_dt, usm_type=x_usm_type, sycl_queue=exec_q), - dpt.zeros_like(x, ind_dt, usm_type=x_usm_type, sycl_queue=exec_q), - dpt.ones_like( - uv, dtype=ind_dt, usm_type=x_usm_type, sycl_queue=exec_q - ), - ) - elif x.ndim == 1: + if x.ndim == 1: fx = x else: fx = dpt.reshape(x, (x.size,), order="C") @@ -482,7 +469,10 @@ def unique_all(x: dpt.usm_ndarray) -> UniqueAllResult: # original array contains no data # so it can be safely returned as values return UniqueAllResult( - fx, sorting_ids, unsorting_ids, dpt.empty_like(fx, 
dtype=ind_dt) + fx, + sorting_ids, + dpt.reshape(unsorting_ids, x.shape), + dpt.empty_like(fx, dtype=ind_dt), ) host_tasks = [] if fx.flags.c_contiguous: @@ -550,7 +540,7 @@ def unique_all(x: dpt.usm_ndarray) -> UniqueAllResult: return UniqueAllResult( s, sorting_ids, - unsorting_ids, + dpt.reshape(unsorting_ids, x.shape), _counts, ) unique_vals = dpt.empty( @@ -611,6 +601,6 @@ def unique_all(x: dpt.usm_ndarray) -> UniqueAllResult: return UniqueAllResult( unique_vals, sorting_ids[cum_unique_counts[:-1]], - inv[unsorting_ids], + dpt.reshape(inv[unsorting_ids], x.shape), _counts, ) diff --git a/dpctl/tests/test_usm_ndarray_unique.py b/dpctl/tests/test_usm_ndarray_unique.py index 4a0af6361f..6fff952dd3 100644 --- a/dpctl/tests/test_usm_ndarray_unique.py +++ b/dpctl/tests/test_usm_ndarray_unique.py @@ -117,6 +117,7 @@ def test_unique_inverse(dtype): uv, inv = dpt.unique_inverse(inp) assert dpt.all(uv == dpt.arange(2, dtype=dtype)) assert dpt.all(inp == uv[inv]) + assert inp.shape == inv.shape @pytest.mark.parametrize( @@ -151,6 +152,7 @@ def test_unique_all(dtype): assert dpt.all(uv == dpt.arange(2, dtype=dtype)) assert dpt.all(uv == inp[ind]) assert dpt.all(inp == uv[inv]) + assert inp.shape == inv.shape assert dpt.all(uv_counts == dpt.full(2, n, dtype=uv_counts.dtype)) From 82be69a9582568bfcd09ca3ffc7700d999f84a0d Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Thu, 21 Dec 2023 11:41:13 -0800 Subject: [PATCH 26/34] Set `stable` to True for sorting functions by default As mandated by the array API spec --- dpctl/tensor/_sorting.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dpctl/tensor/_sorting.py b/dpctl/tensor/_sorting.py index cc46da9c97..661c3bc1c4 100644 --- a/dpctl/tensor/_sorting.py +++ b/dpctl/tensor/_sorting.py @@ -30,8 +30,8 @@ __all__ = ["sort", "argsort"] -def sort(x, /, *, axis=-1, descending=False, stable=False): - """sort(x, axis=-1, descending=False, stable=False) +def sort(x, /, *, axis=-1, descending=False, stable=True): + """sort(x, axis=-1, descending=False, stable=True) Returns a sorted copy of an input array `x`. @@ -106,8 +106,8 @@ def sort(x, /, *, axis=-1, descending=False, stable=False): return res -def argsort(x, axis=-1, descending=False, stable=False): - """argsort(x, axis=-1, descending=False, stable=False) +def argsort(x, axis=-1, descending=False, stable=True): + """argsort(x, axis=-1, descending=False, stable=True) Returns the indices that sort an array `x` along a specified axis. 
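
For context on the new `stable=True` default: a stable sort preserves the relative order of equal elements, so ties in `argsort` resolve to the lower original index first. A minimal illustration of that behavior (the expected indices follow from stability itself, not from any implementation detail of this series):

```
import dpctl.tensor as dpt

x = dpt.asarray([2, 1, 2, 1], dtype="i4")
idx = dpt.argsort(x)   # stable=True is now the default
# equal elements keep their original order: the two 1s map to indices
# [1, 3] and the two 2s to [0, 2]
assert dpt.all(idx == dpt.asarray([1, 3, 0, 2]))
```
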
From 81610d60a398af029c445cb60939e1f0eb6709df Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Thu, 21 Dec 2023 11:46:57 -0800 Subject: [PATCH 27/34] Adds tests for 0D array cases for sorting --- dpctl/tests/test_usm_ndarray_sorting.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/dpctl/tests/test_usm_ndarray_sorting.py b/dpctl/tests/test_usm_ndarray_sorting.py index 84cf760a0c..69f2c62a04 100644 --- a/dpctl/tests/test_usm_ndarray_sorting.py +++ b/dpctl/tests/test_usm_ndarray_sorting.py @@ -197,3 +197,17 @@ def test_argsort_strided(): idx = dpt.argsort(x_flipped) assert dpt.all(x_flipped[idx] == x_orig) + + +def test_sort_0d_array(): + get_queue_or_skip() + + x = dpt.asarray(1, dtype="i4") + assert dpt.sort(x) == 1 + + +def test_argsort_0d_array(): + get_queue_or_skip() + + x = dpt.asarray(1, dtype="i4") + assert dpt.argsort(x) == 0 From b0cc135becf14b620bdecd590205b44b74dbbf23 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Thu, 21 Dec 2023 12:09:55 -0800 Subject: [PATCH 28/34] Adds tests for `unique` functions on strided inputs --- dpctl/tests/test_usm_ndarray_unique.py | 51 ++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/dpctl/tests/test_usm_ndarray_unique.py b/dpctl/tests/test_usm_ndarray_unique.py index 6fff952dd3..3081fc1e9d 100644 --- a/dpctl/tests/test_usm_ndarray_unique.py +++ b/dpctl/tests/test_usm_ndarray_unique.py @@ -53,6 +53,17 @@ def test_unique_values(dtype): assert dpt.all(uv == dpt.arange(2, dtype=dtype)) +def test_unique_values_strided(): + get_queue_or_skip() + + n, m = 1000, 20 + inp = dpt.ones((n, m), dtype="i4", order="F") + inp[:, ::2] = 0 + + uv = dpt.unique_values(inp) + assert dpt.all(uv == dpt.arange(2, dtype="i4")) + + @pytest.mark.parametrize( "dtype", [ @@ -86,6 +97,18 @@ def test_unique_counts(dtype): assert dpt.all(uv_counts == dpt.full(2, n, dtype=uv_counts.dtype)) +def test_unique_counts_strided(): + get_queue_or_skip() + + n, m = 1000, 20 + inp = dpt.ones((n, m), dtype="i4", order="F") + inp[:, ::2] = 0 + + uv, uv_counts = dpt.unique_counts(inp) + assert dpt.all(uv == dpt.arange(2, dtype="i4")) + assert dpt.all(uv_counts == dpt.full(2, n / 2 * m, dtype=uv_counts.dtype)) + + @pytest.mark.parametrize( "dtype", [ @@ -120,6 +143,19 @@ def test_unique_inverse(dtype): assert inp.shape == inv.shape +def test_unique_inverse_strided(): + get_queue_or_skip() + + n, m = 1000, 20 + inp = dpt.ones((n, m), dtype="i4", order="F") + inp[:, ::2] = 0 + + uv, inv = dpt.unique_inverse(inp) + assert dpt.all(uv == dpt.arange(2, dtype="i4")) + assert dpt.all(inp == uv[inv]) + assert inp.shape == inv.shape + + @pytest.mark.parametrize( "dtype", [ @@ -156,6 +192,21 @@ def test_unique_all(dtype): assert dpt.all(uv_counts == dpt.full(2, n, dtype=uv_counts.dtype)) +def test_unique_all_strided(): + get_queue_or_skip() + + n, m = 1000, 20 + inp = dpt.ones((n, m), dtype="i4", order="F") + inp[:, ::2] = 0 + + uv, ind, inv, uv_counts = dpt.unique_all(inp) + assert dpt.all(uv == dpt.arange(2, dtype="i4")) + assert dpt.all(uv == dpt.reshape(inp, -1)[ind]) + assert dpt.all(inp == uv[inv]) + assert inp.shape == inv.shape + assert dpt.all(uv_counts == dpt.full(2, n / 2 * m, dtype=uv_counts.dtype)) + + def test_set_functions_empty_input(): get_queue_or_skip() x = dpt.ones((10, 0, 1), dtype="i4") From 8a6282fdf1e4e5090b9bee99320a9eab052353bf Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Thu, 21 Dec 2023 12:52:32 -0800 Subject: [PATCH 29/34] Tests for strided 1D data input for `unique` functions --- 
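
The new cases exercise the non-contiguous branch of the `unique_*` functions: a reversed 1-D view has a negative stride, so the implementation must first copy it into a C-contiguous temporary before sorting. A minimal sketch of the input pattern these tests construct (the actual tests are in the diff below):

```
import dpctl.tensor as dpt

x = dpt.ones((1000, 20), dtype="i4")
x[:, ::2] = 0                          # interleave zeros and ones
flat = dpt.flip(dpt.reshape(x, -1))    # 1-D view with a negative stride
assert not flat.flags.c_contiguous
assert dpt.all(dpt.unique_values(flat) == dpt.arange(2, dtype="i4"))
```
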
dpctl/tests/test_usm_ndarray_unique.py | 28 ++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/dpctl/tests/test_usm_ndarray_unique.py b/dpctl/tests/test_usm_ndarray_unique.py index 3081fc1e9d..0d4247def5 100644 --- a/dpctl/tests/test_usm_ndarray_unique.py +++ b/dpctl/tests/test_usm_ndarray_unique.py @@ -63,6 +63,12 @@ def test_unique_values_strided(): uv = dpt.unique_values(inp) assert dpt.all(uv == dpt.arange(2, dtype="i4")) + inp = dpt.reshape(inp, -1) + inp = dpt.flip(dpt.reshape(inp, -1)) + + uv = dpt.unique_values(inp) + assert dpt.all(uv == dpt.arange(2, dtype="i4")) + @pytest.mark.parametrize( "dtype", @@ -108,6 +114,12 @@ def test_unique_counts_strided(): assert dpt.all(uv == dpt.arange(2, dtype="i4")) assert dpt.all(uv_counts == dpt.full(2, n / 2 * m, dtype=uv_counts.dtype)) + inp = dpt.flip(dpt.reshape(inp, -1)) + + uv, uv_counts = dpt.unique_counts(inp) + assert dpt.all(uv == dpt.arange(2, dtype="i4")) + assert dpt.all(uv_counts == dpt.full(2, n / 2 * m, dtype=uv_counts.dtype)) + @pytest.mark.parametrize( "dtype", @@ -155,6 +167,13 @@ def test_unique_inverse_strided(): assert dpt.all(inp == uv[inv]) assert inp.shape == inv.shape + inp = dpt.flip(dpt.reshape(inp, -1)) + + uv, inv = dpt.unique_inverse(inp) + assert dpt.all(uv == dpt.arange(2, dtype="i4")) + assert dpt.all(inp == uv[inv]) + assert inp.shape == inv.shape + @pytest.mark.parametrize( "dtype", @@ -206,6 +225,15 @@ def test_unique_all_strided(): assert inp.shape == inv.shape assert dpt.all(uv_counts == dpt.full(2, n / 2 * m, dtype=uv_counts.dtype)) + inp = dpt.flip(dpt.reshape(inp, -1)) + + uv, ind, inv, uv_counts = dpt.unique_all(inp) + assert dpt.all(uv == dpt.arange(2, dtype="i4")) + assert dpt.all(uv == inp[ind]) + assert dpt.all(inp == uv[inv]) + assert inp.shape == inv.shape + assert dpt.all(uv_counts == dpt.full(2, n / 2 * m, dtype=uv_counts.dtype)) + def test_set_functions_empty_input(): get_queue_or_skip() From dd71e4eb3a8966e28f9e8aaa9f03e70ab2eff602 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Mon, 8 Jan 2024 08:01:59 -0600 Subject: [PATCH 30/34] Changes to _set_functions_sync.py for test suite to pass Changes to sync implementation of set functions so that test suite passes with sync implementation as it does for async implementation. 
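
Concretely, the sync versions now early-exit for 0-size inputs and allocate every intermediate array on the input's queue. A short sketch of the target behavior, mirroring assertions already present in dpctl/tests/test_usm_ndarray_unique.py:

```
import dpctl
import dpctl.tensor as dpt

# 0-size input: nothing is sorted or scanned, an empty array comes back
x0 = dpt.ones((10, 0, 1), dtype="i4")
uv = dpt.unique_values(x0)
assert isinstance(uv, dpctl.tensor.usm_ndarray) and uv.size == 0

# input on an arbitrary (non-default) queue: all intermediates follow x's
# queue, so the call succeeds under compute-follows-data
q = dpctl.SyclQueue()
x = dpt.arange(10, dtype="i4", sycl_queue=q)
assert dpt.all(dpt.unique_values(x) == x)
```
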
--- dpctl/tensor/_set_functions_sync.py | 55 +++++++++++++++++++++-------- 1 file changed, 40 insertions(+), 15 deletions(-) diff --git a/dpctl/tensor/_set_functions_sync.py b/dpctl/tensor/_set_functions_sync.py index 7f16373c78..f1e0332b42 100644 --- a/dpctl/tensor/_set_functions_sync.py +++ b/dpctl/tensor/_set_functions_sync.py @@ -75,12 +75,14 @@ def unique_values(x: dpt.usm_ndarray) -> dpt.usm_ndarray: if x.ndim == 1: fx = x else: - fx = dpt.reshape(x, (x.size,), order="C", copy=False) + fx = dpt.reshape(x, (x.size,), order="C") + if fx.size == 0: + return fx s = dpt.sort(fx) unique_mask = dpt.empty(fx.shape, dtype="?", sycl_queue=exec_q) dpt.not_equal(s[:-1], s[1:], out=unique_mask[1:]) unique_mask[0] = True - cumsum = dpt.empty(s.shape, dtype=dpt.int64) + cumsum = dpt.empty(s.shape, dtype=dpt.int64, sycl_queue=exec_q) n_uniques = mask_positions(unique_mask, cumsum, sycl_queue=exec_q) if n_uniques == fx.size: return s @@ -127,13 +129,15 @@ def unique_counts(x: dpt.usm_ndarray) -> UniqueCountsResult: if x.ndim == 1: fx = x else: - fx = dpt.reshape(x, (x.size,), order="C", copy=False) - s = dpt.sort(x) + fx = dpt.reshape(x, (x.size,), order="C") + ind_dt = default_device_index_type(exec_q) + if fx.size == 0: + return UniqueCountsResult(fx, dpt.empty_like(fx, dtype=ind_dt)) + s = dpt.sort(fx) unique_mask = dpt.empty(s.shape, dtype="?", sycl_queue=exec_q) dpt.not_equal(s[:-1], s[1:], out=unique_mask[1:]) unique_mask[0] = True - ind_dt = default_device_index_type(exec_q) - cumsum = dpt.empty(unique_mask.shape, dtype=dpt.int64) + cumsum = dpt.empty(unique_mask.shape, dtype=dpt.int64, sycl_queue=exec_q) # synchronizing call n_uniques = mask_positions(unique_mask, cumsum, sycl_queue=exec_q) if n_uniques == fx.size: @@ -195,18 +199,20 @@ def unique_inverse(x): raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") array_api_dev = x.device exec_q = array_api_dev.sycl_queue + ind_dt = default_device_index_type(exec_q) if x.ndim == 1: fx = x else: - fx = dpt.reshape(x, (x.size,), order="C", copy=False) - ind_dt = default_device_index_type(exec_q) + fx = dpt.reshape(x, (x.size,), order="C") sorting_ids = dpt.argsort(fx) unsorting_ids = dpt.argsort(sorting_ids) + if fx.size == 0: + return UniqueInverseResult(fx, dpt.reshape(unsorting_ids, x.shape)) s = fx[sorting_ids] unique_mask = dpt.empty(fx.shape, dtype="?", sycl_queue=exec_q) unique_mask[0] = True dpt.not_equal(s[:-1], s[1:], out=unique_mask[1:]) - cumsum = dpt.empty(unique_mask.shape, dtype=dpt.int64) + cumsum = dpt.empty(unique_mask.shape, dtype=dpt.int64, sycl_queue=exec_q) # synchronizing call n_uniques = mask_positions(unique_mask, cumsum, sycl_queue=exec_q) if n_uniques == fx.size: @@ -251,7 +257,9 @@ def unique_inverse(x): ht_ev, _ = _full_usm_ndarray(fill_value=i, dst=_dst, sycl_queue=exec_q) ht_ev.wait() pos = pos_next - return UniqueInverseResult(unique_vals, inv[unsorting_ids]) + return UniqueInverseResult( + unique_vals, dpt.reshape(inv[unsorting_ids], x.shape) + ) def unique_all(x: dpt.usm_ndarray) -> UniqueAllResult: @@ -289,22 +297,39 @@ def unique_all(x: dpt.usm_ndarray) -> UniqueAllResult: raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") array_api_dev = x.device exec_q = array_api_dev.sycl_queue + ind_dt = default_device_index_type(exec_q) if x.ndim == 1: fx = x else: - fx = dpt.reshape(x, (x.size,), order="C", copy=False) - ind_dt = default_device_index_type(exec_q) + fx = dpt.reshape(x, (x.size,), order="C") sorting_ids = dpt.argsort(fx) unsorting_ids = dpt.argsort(sorting_ids) + if fx.size 
== 0: + # original array contains no data + # so it can be safely returned as values + return UniqueAllResult( + fx, + sorting_ids, + dpt.reshape(unsorting_ids, x.shape), + dpt.empty_like(fx, dtype=ind_dt), + ) s = fx[sorting_ids] unique_mask = dpt.empty(fx.shape, dtype="?", sycl_queue=exec_q) dpt.not_equal(s[:-1], s[1:], out=unique_mask[1:]) unique_mask[0] = True - cumsum = dpt.empty(unique_mask.shape, dtype=dpt.int64) + cumsum = dpt.empty(unique_mask.shape, dtype=dpt.int64, sycl_queue=exec_q) # synchronizing call n_uniques = mask_positions(unique_mask, cumsum, sycl_queue=exec_q) if n_uniques == fx.size: - return UniqueInverseResult(s, unsorting_ids) + _counts = dpt.ones( + n_uniques, dtype=ind_dt, usm_type=x.usm_type, sycl_queue=exec_q + ) + return UniqueAllResult( + s, + sorting_ids, + dpt.reshape(unsorting_ids, x.shape), + _counts, + ) unique_vals = dpt.empty( n_uniques, dtype=x.dtype, usm_type=x.usm_type, sycl_queue=exec_q ) @@ -346,6 +371,6 @@ def unique_all(x: dpt.usm_ndarray) -> UniqueAllResult: return UniqueAllResult( unique_vals, sorting_ids[cum_unique_counts[:-1]], - inv[unsorting_ids], + dpt.reshape(inv[unsorting_ids], x.shape), _counts, ) From b1b0af1769b29c7fd43e87d034555cbe43ea4366 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Mon, 8 Jan 2024 08:06:10 -0600 Subject: [PATCH 31/34] Removed sync implementation of set functions --- dpctl/tensor/_set_functions_sync.py | 376 ---------------------------- pyproject.toml | 2 - 2 files changed, 378 deletions(-) delete mode 100644 dpctl/tensor/_set_functions_sync.py diff --git a/dpctl/tensor/_set_functions_sync.py b/dpctl/tensor/_set_functions_sync.py deleted file mode 100644 index f1e0332b42..0000000000 --- a/dpctl/tensor/_set_functions_sync.py +++ /dev/null @@ -1,376 +0,0 @@ -# Data Parallel Control (dpctl) -# -# Copyright 2020-2023 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -from typing import NamedTuple - -import dpctl.tensor as dpt - -from ._tensor_impl import ( - _extract, - _full_usm_ndarray, - _linspace_step, - default_device_index_type, - mask_positions, -) - -__all__ = [ - "unique_values", - "unique_counts", - "unique_inverse", - "unique_all", - "UniqueAllResult", - "UniqueCountsResult", - "UniqueInverseResult", -] - - -class UniqueAllResult(NamedTuple): - values: dpt.usm_ndarray - indices: dpt.usm_ndarray - inverse_indices: dpt.usm_ndarray - counts: dpt.usm_ndarray - - -class UniqueCountsResult(NamedTuple): - values: dpt.usm_ndarray - counts: dpt.usm_ndarray - - -class UniqueInverseResult(NamedTuple): - values: dpt.usm_ndarray - inverse_indices: dpt.usm_ndarray - - -def unique_values(x: dpt.usm_ndarray) -> dpt.usm_ndarray: - """unique_values(x) - - Returns the unique elements of an input array x. - - Args: - x (usm_ndarray): - input array. The input with more than one dimension is flattened. - Returns: - usm_ndarray - an array containing the set of unique elements in `x`. The - returned array has the same data type as `x`. 
- """ - if not isinstance(x, dpt.usm_ndarray): - raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") - array_api_dev = x.device - exec_q = array_api_dev.sycl_queue - if x.ndim == 1: - fx = x - else: - fx = dpt.reshape(x, (x.size,), order="C") - if fx.size == 0: - return fx - s = dpt.sort(fx) - unique_mask = dpt.empty(fx.shape, dtype="?", sycl_queue=exec_q) - dpt.not_equal(s[:-1], s[1:], out=unique_mask[1:]) - unique_mask[0] = True - cumsum = dpt.empty(s.shape, dtype=dpt.int64, sycl_queue=exec_q) - n_uniques = mask_positions(unique_mask, cumsum, sycl_queue=exec_q) - if n_uniques == fx.size: - return s - unique_vals = dpt.empty( - n_uniques, dtype=x.dtype, usm_type=x.usm_type, sycl_queue=exec_q - ) - ht_ev, _ = _extract( - src=s, - cumsum=cumsum, - axis_start=0, - axis_end=1, - dst=unique_vals, - sycl_queue=exec_q, - ) - ht_ev.wait() - return unique_vals - - -def unique_counts(x: dpt.usm_ndarray) -> UniqueCountsResult: - """unique_counts(x) - - Returns the unique elements of an input array `x` and the corresponding - counts for each unique element in `x`. - - Args: - x (usm_ndarray): - input array. The input with more than one dimension is flattened. - Returns: - tuple[usm_ndarray, usm_ndarray] - a namedtuple `(values, counts)` whose - - * first element is the field name `values` and is an array - containing the unique elements of `x`. This array has the - same data type as `x`. - * second element has the field name `counts` and is an array - containing the number of times each unique element occurs in `x`. - This array has the same shape as `values` and has the default - array index data type. - """ - if not isinstance(x, dpt.usm_ndarray): - raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") - array_api_dev = x.device - exec_q = array_api_dev.sycl_queue - if x.ndim == 1: - fx = x - else: - fx = dpt.reshape(x, (x.size,), order="C") - ind_dt = default_device_index_type(exec_q) - if fx.size == 0: - return UniqueCountsResult(fx, dpt.empty_like(fx, dtype=ind_dt)) - s = dpt.sort(fx) - unique_mask = dpt.empty(s.shape, dtype="?", sycl_queue=exec_q) - dpt.not_equal(s[:-1], s[1:], out=unique_mask[1:]) - unique_mask[0] = True - cumsum = dpt.empty(unique_mask.shape, dtype=dpt.int64, sycl_queue=exec_q) - # synchronizing call - n_uniques = mask_positions(unique_mask, cumsum, sycl_queue=exec_q) - if n_uniques == fx.size: - return UniqueCountsResult(s, dpt.ones(n_uniques, dtype=ind_dt)) - unique_vals = dpt.empty( - n_uniques, dtype=x.dtype, usm_type=x.usm_type, sycl_queue=exec_q - ) - # populate unique values - ht_ev, _ = _extract( - src=s, - cumsum=cumsum, - axis_start=0, - axis_end=1, - dst=unique_vals, - sycl_queue=exec_q, - ) - ht_ev.wait() - unique_counts = dpt.empty( - n_uniques + 1, dtype=ind_dt, usm_type=x.usm_type, sycl_queue=exec_q - ) - idx = dpt.arange(x.size, dtype=ind_dt, sycl_queue=exec_q) - ht_ev, _ = _extract( - src=idx, - cumsum=cumsum, - axis_start=0, - axis_end=1, - dst=unique_counts[:-1], - sycl_queue=exec_q, - ) - unique_counts[-1] = fx.size - ht_ev.wait() - _counts = dpt.empty_like(unique_counts[1:]) - dpt.subtract(unique_counts[1:], unique_counts[:-1], out=_counts) - return UniqueCountsResult(unique_vals, _counts) - - -def unique_inverse(x): - """unique_inverse - - Returns the unique elements of an input array x and the indices from the - set of unique elements that reconstruct `x`. - - Args: - x (usm_ndarray): - input array. The input with more than one dimension is flattened. 
- Returns: - tuple[usm_ndarray, usm_ndarray] - a namedtuple `(values, inverse_indices)` whose - - * first element has the field name `values` and is an array - containing the unique elements of `x`. The array has the same - data type as `x`. - * second element has the field name `inverse_indices` and is an - array containing the indices of values that reconstruct `x`. - The array has the same shape as `x` and has the default array - index data type. - """ - if not isinstance(x, dpt.usm_ndarray): - raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") - array_api_dev = x.device - exec_q = array_api_dev.sycl_queue - ind_dt = default_device_index_type(exec_q) - if x.ndim == 1: - fx = x - else: - fx = dpt.reshape(x, (x.size,), order="C") - sorting_ids = dpt.argsort(fx) - unsorting_ids = dpt.argsort(sorting_ids) - if fx.size == 0: - return UniqueInverseResult(fx, dpt.reshape(unsorting_ids, x.shape)) - s = fx[sorting_ids] - unique_mask = dpt.empty(fx.shape, dtype="?", sycl_queue=exec_q) - unique_mask[0] = True - dpt.not_equal(s[:-1], s[1:], out=unique_mask[1:]) - cumsum = dpt.empty(unique_mask.shape, dtype=dpt.int64, sycl_queue=exec_q) - # synchronizing call - n_uniques = mask_positions(unique_mask, cumsum, sycl_queue=exec_q) - if n_uniques == fx.size: - return UniqueInverseResult(s, unsorting_ids) - unique_vals = dpt.empty( - n_uniques, dtype=x.dtype, usm_type=x.usm_type, sycl_queue=exec_q - ) - ht_ev, _ = _extract( - src=s, - cumsum=cumsum, - axis_start=0, - axis_end=1, - dst=unique_vals, - sycl_queue=exec_q, - ) - ht_ev.wait() - cum_unique_counts = dpt.empty( - n_uniques + 1, dtype=ind_dt, usm_type=x.usm_type, sycl_queue=exec_q - ) - idx = dpt.empty(x.size, dtype=ind_dt, sycl_queue=exec_q) - ht_ev, id_ev = _linspace_step(start=0, dt=1, dst=idx, sycl_queue=exec_q) - ht_ev.wait() - ht_ev, _ = _extract( - src=idx, - cumsum=cumsum, - axis_start=0, - axis_end=1, - dst=cum_unique_counts[:-1], - sycl_queue=exec_q, - ) - ht_ev.wait() - cum_unique_counts[-1] = fx.size - _counts = dpt.subtract(cum_unique_counts[1:], cum_unique_counts[:-1]) - # TODO: when searchsorted is available, - # inv = searchsorted(unique_vals, fx) - counts = dpt.asnumpy(_counts).tolist() - inv = dpt.empty_like(fx, dtype=ind_dt) - pos = 0 - for i in range(len(counts)): - pos_next = pos + counts[i] - _dst = inv[pos:pos_next] - ht_ev, _ = _full_usm_ndarray(fill_value=i, dst=_dst, sycl_queue=exec_q) - ht_ev.wait() - pos = pos_next - return UniqueInverseResult( - unique_vals, dpt.reshape(inv[unsorting_ids], x.shape) - ) - - -def unique_all(x: dpt.usm_ndarray) -> UniqueAllResult: - """unique_all(x) - - Returns the unique elements of an input array `x`, the first occurring - indices for each unique element in `x`, the indices from the set of unique - elements that reconstruct `x`, and the corresponding counts for each - unique element in `x`. - - Args: - x (usm_ndarray): - input array. The input with more than one dimension is flattened. - Returns: - tuple[usm_ndarray, usm_ndarray, usm_ndarray, usm_ndarray] - a namedtuple `(values, indices, inverse_indices, counts)` whose - - * first element has the field name `values` and is an array - containing the unique elements of `x`. The array has the same - data type as `x`. - * second element has the field name `indices` and is an array - the indices (of first occurrences) of `x` that result in - `values`. The array has the same shape as `values` and has the - default array index data type. 
- * third element has the field name `inverse_indices` and is an - array containing the indices of values that reconstruct `x`. - The array has the same shape as `x` and has the default array - index data type. - * fourth element has the field name `counts` and is an array - containing the number of times each unique element occurs in `x`. - This array has the same shape as `values` and has the default - array index data type. - """ - if not isinstance(x, dpt.usm_ndarray): - raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") - array_api_dev = x.device - exec_q = array_api_dev.sycl_queue - ind_dt = default_device_index_type(exec_q) - if x.ndim == 1: - fx = x - else: - fx = dpt.reshape(x, (x.size,), order="C") - sorting_ids = dpt.argsort(fx) - unsorting_ids = dpt.argsort(sorting_ids) - if fx.size == 0: - # original array contains no data - # so it can be safely returned as values - return UniqueAllResult( - fx, - sorting_ids, - dpt.reshape(unsorting_ids, x.shape), - dpt.empty_like(fx, dtype=ind_dt), - ) - s = fx[sorting_ids] - unique_mask = dpt.empty(fx.shape, dtype="?", sycl_queue=exec_q) - dpt.not_equal(s[:-1], s[1:], out=unique_mask[1:]) - unique_mask[0] = True - cumsum = dpt.empty(unique_mask.shape, dtype=dpt.int64, sycl_queue=exec_q) - # synchronizing call - n_uniques = mask_positions(unique_mask, cumsum, sycl_queue=exec_q) - if n_uniques == fx.size: - _counts = dpt.ones( - n_uniques, dtype=ind_dt, usm_type=x.usm_type, sycl_queue=exec_q - ) - return UniqueAllResult( - s, - sorting_ids, - dpt.reshape(unsorting_ids, x.shape), - _counts, - ) - unique_vals = dpt.empty( - n_uniques, dtype=x.dtype, usm_type=x.usm_type, sycl_queue=exec_q - ) - ht_ev, _ = _extract( - src=s, - cumsum=cumsum, - axis_start=0, - axis_end=1, - dst=unique_vals, - sycl_queue=exec_q, - ) - ht_ev.wait() - cum_unique_counts = dpt.empty( - n_uniques + 1, dtype=ind_dt, usm_type=x.usm_type, sycl_queue=exec_q - ) - idx = dpt.arange(fx.size, dtype=ind_dt, sycl_queue=exec_q) - ht_ev, extr_ev = _extract( - src=idx, - cumsum=cumsum, - axis_start=0, - axis_end=1, - dst=cum_unique_counts[:-1], - sycl_queue=exec_q, - ) - ht_ev.wait() - cum_unique_counts[-1] = fx.size - _counts = cum_unique_counts[1:] - cum_unique_counts[:-1] - # TODO: when searchsorted is available, - # inv = searchsorted(unique_vals, fx) - counts = dpt.asnumpy(_counts).tolist() - inv = dpt.empty_like(fx, dtype=ind_dt) - pos = 0 - for i in range(len(counts)): - pos_next = pos + counts[i] - _dst = inv[pos:pos_next] - ht_ev, _ = _full_usm_ndarray(fill_value=i, dst=_dst, sycl_queue=exec_q) - ht_ev.wait() - pos = pos_next - return UniqueAllResult( - unique_vals, - sorting_ids[cum_unique_counts[:-1]], - dpt.reshape(inv[unsorting_ids], x.shape), - _counts, - ) diff --git a/pyproject.toml b/pyproject.toml index f791411f6c..5882128dab 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,7 +22,6 @@ source = [ omit = [ "dpctl/tests/*", "dpctl/_version.py", - "dpctl/tensor/_set_functions_sync.py", "*/_cython_api*/stringsource", ] @@ -30,7 +29,6 @@ omit = [ omit = [ "dpctl/tests/*", "dpctl/_version.py", - "dpctl/tensor/_set_functions_sync.py", "*/_cython_api*/stringsource", ] From 08e5dacbb8e3e845f5bb0b0e1871483a816a5640 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Mon, 8 Jan 2024 13:30:03 -0600 Subject: [PATCH 32/34] Renamed _set_functions_async -> _set_functions --- dpctl/tensor/__init__.py | 2 +- dpctl/tensor/{_set_functions_async.py => _set_functions.py} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename 
dpctl/tensor/{_set_functions_async.py => _set_functions.py} (100%) diff --git a/dpctl/tensor/__init__.py b/dpctl/tensor/__init__.py index 5b88c1f33e..81fc152e7a 100644 --- a/dpctl/tensor/__init__.py +++ b/dpctl/tensor/__init__.py @@ -175,7 +175,7 @@ reduce_hypot, sum, ) -from ._set_functions_async import ( +from ._set_functions import ( unique_all, unique_counts, unique_inverse, diff --git a/dpctl/tensor/_set_functions_async.py b/dpctl/tensor/_set_functions.py similarity index 100% rename from dpctl/tensor/_set_functions_async.py rename to dpctl/tensor/_set_functions.py From a3d0d087b3a6b3521502d67c097f4f5a13216375 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Tue, 9 Jan 2024 12:54:19 -0600 Subject: [PATCH 33/34] Use extended comparison operators to define weak order on real/complex FP types We use extended comparison operators compatible with NumPy's behavior: https://numpy.org/devdocs/reference/generated/numpy.sort.html Specifically, we use [R, nan] block ordering for reals, and [(R, R), (R, nan), (nan, R), (nan, nan)] for complexes. --- .../source/sorting/sorting_common.hpp | 80 ++++++++++++++++--- 1 file changed, 68 insertions(+), 12 deletions(-) diff --git a/dpctl/tensor/libtensor/source/sorting/sorting_common.hpp b/dpctl/tensor/libtensor/source/sorting/sorting_common.hpp index 6e8f273ea5..62b30ccb8f 100644 --- a/dpctl/tensor/libtensor/source/sorting/sorting_common.hpp +++ b/dpctl/tensor/libtensor/source/sorting/sorting_common.hpp @@ -24,7 +24,8 @@ #pragma once -#include "utils/math_utils.hpp" +#include "sycl/sycl.hpp" +#include namespace dpctl { @@ -33,44 +34,99 @@ namespace tensor namespace py_internal { -template struct ComplexLess +namespace { - bool operator()(const cT &v1, const cT &v2) const +template struct ExtendedRealFPLess +{ + /* [R, nan] */ + bool operator()(const fpT v1, const fpT v2) const { - using dpctl::tensor::math_utils::less_complex; + return (!sycl::isnan(v1) && (sycl::isnan(v2) || (v1 < v2))); + } +}; - return less_complex(v1, v2); +template struct ExtendedRealFPGreater +{ + bool operator()(const fpT v1, const fpT v2) const + { + return (!sycl::isnan(v2) && (sycl::isnan(v1) || (v2 < v1))); } }; -template struct ComplexGreater +template struct ExtendedComplexFPLess { + /* [(R, R), (R, nan), (nan, R), (nan, nan)] */ + bool operator()(const cT &v1, const cT &v2) const { - using dpctl::tensor::math_utils::greater_complex; + using realT = typename cT::value_type; + + const realT real1 = std::real(v1); + const realT real2 = std::real(v2); + + const bool r1_nan = sycl::isnan(real1); + const bool r2_nan = sycl::isnan(real2); + + const realT imag1 = std::imag(v1); + const realT imag2 = std::imag(v2); + + const bool i1_nan = sycl::isnan(imag1); + const bool i2_nan = sycl::isnan(imag2); - return greater_complex(v1, v2); + const int idx1 = ((r1_nan) ? 2 : 0) + ((i1_nan) ? 1 : 0); + const int idx2 = ((r2_nan) ? 2 : 0) + ((i2_nan) ? 
1 : 0); + + const bool res = + !(r1_nan && i1_nan) && + ((idx1 < idx2) || + ((idx1 == idx2) && + ((r1_nan && !i1_nan && (imag1 < imag2)) || + (!r1_nan && i1_nan && (real1 < real2)) || + (!r1_nan && !i1_nan && + ((real1 < real2) || (!(real2 < real1) && (imag1 < imag2))))))); + + return res; + } +}; + +template struct ExtendedComplexFPGreater +{ + bool operator()(const cT &v1, const cT &v2) const + { + auto less_ = ExtendedComplexFPLess{}; + return less_(v2, v1); } }; +template +inline constexpr bool is_fp_v = (std::is_same_v || + std::is_same_v || + std::is_same_v); + +} // end of anonymous namespace + template struct AscendingSorter { - using type = std::less; + using type = std::conditional_t, + ExtendedRealFPLess, + std::less>; }; template struct AscendingSorter> { - using type = ComplexLess>; + using type = ExtendedComplexFPLess>; }; template struct DescendingSorter { - using type = std::greater; + using type = std::conditional_t, + ExtendedRealFPGreater, + std::greater>; }; template struct DescendingSorter> { - using type = ComplexGreater>; + using type = ExtendedComplexFPGreater>; }; } // end of namespace py_internal From 54698321303b8708e3ac7a5c23188fe6527f8b66 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Tue, 9 Jan 2024 12:58:42 -0600 Subject: [PATCH 34/34] Adding tests for sorting of FP arrays with NaNs --- dpctl/tests/test_usm_ndarray_sorting.py | 69 +++++++++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/dpctl/tests/test_usm_ndarray_sorting.py b/dpctl/tests/test_usm_ndarray_sorting.py index 69f2c62a04..e76ac667e1 100644 --- a/dpctl/tests/test_usm_ndarray_sorting.py +++ b/dpctl/tests/test_usm_ndarray_sorting.py @@ -14,6 +14,9 @@ # See the License for the specific language governing permissions and # limitations under the License. +import itertools + +import numpy as np import pytest import dpctl.tensor as dpt @@ -211,3 +214,69 @@ def test_argsort_0d_array(): x = dpt.asarray(1, dtype="i4") assert dpt.argsort(x) == 0 + + +@pytest.mark.parametrize( + "dtype", + [ + "f2", + "f4", + "f8", + ], +) +def test_sort_real_fp_nan(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + x = dpt.asarray( + [-0.0, 0.1, dpt.nan, 0.0, -0.1, dpt.nan, 0.2, -0.3], dtype=dtype + ) + s = dpt.sort(x) + + expected = dpt.asarray( + [-0.3, -0.1, -0.0, 0.0, 0.1, 0.2, dpt.nan, dpt.nan], dtype=dtype + ) + + assert dpt.allclose(s, expected, equal_nan=True) + + s = dpt.sort(x, descending=True) + + expected = dpt.asarray( + [dpt.nan, dpt.nan, 0.2, 0.1, -0.0, 0.0, -0.1, -0.3], dtype=dtype + ) + + assert dpt.allclose(s, expected, equal_nan=True) + + +@pytest.mark.parametrize( + "dtype", + [ + "c8", + "c16", + ], +) +def test_sort_complex_fp_nan(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + rvs = [-0.0, 0.1, 0.0, 0.2, -0.3, dpt.nan] + ivs = [-0.0, 0.1, 0.0, 0.2, -0.3, dpt.nan] + + cv = [] + for rv in rvs: + for iv in ivs: + cv.append(complex(rv, iv)) + + inp = dpt.asarray(cv, dtype=dtype) + s = dpt.sort(inp) + + expected = np.sort(dpt.asnumpy(inp)) + + assert np.allclose(dpt.asnumpy(s), expected, equal_nan=True) + + for i, j in itertools.permutations(range(inp.shape[0]), 2): + r1 = dpt.asnumpy(dpt.sort(inp[dpt.asarray([i, j])])) + r2 = np.sort(dpt.asnumpy(inp[dpt.asarray([i, j])])) + assert np.array_equal( + r1.view(np.int64), r2.view(np.int64) + ), f"Failed for {i} and {j}"
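
The extended ordering introduced in PATCH 33 follows NumPy's documented sort order for floating-point and complex arrays: finite reals precede NaNs, and complex values are ordered lexicographically by (real, imaginary) within the (R, R) block, followed by the (R, nan), (nan, R) and (nan, nan) blocks. A minimal NumPy-based sketch of that expected behavior (array values chosen arbitrarily for illustration):

import numpy as np

# Real floating-point: finite values first, NaNs last ([R, nan] block order).
r = np.array([0.2, np.nan, -0.3, 0.1, np.nan, -0.0], dtype=np.float64)
print(np.sort(r))  # -0.3, -0.0, 0.1, 0.2, nan, nan

# Complex floating-point: lexicographic (real, then imaginary) order within
# the (R, R) block, then the (R, nan), (nan, R) and (nan, nan) blocks.
c = np.array(
    [complex(np.nan, 0.0), complex(0.0, np.nan), complex(1.0, -1.0),
     complex(0.0, 2.0), complex(np.nan, np.nan)],
    dtype=np.complex128,
)
print(np.sort(c))  # 0+2j, 1-1j, 0+nanj, nan+0j, nan+nanj

dpctl.tensor.sort and dpctl.tensor.argsort are intended to produce this same ordering for supported device data types, which is what the PATCH 34 tests verify by comparing against np.sort.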