diff --git a/dpctl/tensor/CMakeLists.txt b/dpctl/tensor/CMakeLists.txt index de8be6fae0..d2947aa772 100644 --- a/dpctl/tensor/CMakeLists.txt +++ b/dpctl/tensor/CMakeLists.txt @@ -118,6 +118,10 @@ set(_reduction_sources set(_boolean_reduction_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_reductions.cpp ) +set(_sorting_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/sort.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/argsort.cpp +) set(_tensor_impl_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_ctors.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/simplify_iteration_space.cpp @@ -148,6 +152,10 @@ set(_tensor_reductions_impl_sources ${_boolean_reduction_sources} ${_reduction_sources} ) +set(_tensor_sorting_impl_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_sorting.cpp + ${_sorting_sources} +) set(_py_trgts) @@ -166,6 +174,11 @@ pybind11_add_module(${python_module_name} MODULE ${_tensor_reductions_impl_sourc add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_reductions_impl_sources}) list(APPEND _py_trgts ${python_module_name}) +set(python_module_name _tensor_sorting_impl) +pybind11_add_module(${python_module_name} MODULE ${_tensor_sorting_impl_sources}) +add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_sorting_impl_sources}) +list(APPEND _py_trgts ${python_module_name}) + set(_clang_prefix "") if (WIN32) set(_clang_prefix "/clang:") @@ -179,6 +192,7 @@ set(_no_fast_math_sources list(APPEND _no_fast_math_sources ${_elementwise_sources} ${_reduction_sources} + ${_sorting_sources} ) foreach(_src_fn ${_no_fast_math_sources}) diff --git a/dpctl/tensor/__init__.py b/dpctl/tensor/__init__.py index cdb701e1cb..81fc152e7a 100644 --- a/dpctl/tensor/__init__.py +++ b/dpctl/tensor/__init__.py @@ -175,6 +175,13 @@ reduce_hypot, sum, ) +from ._set_functions import ( + unique_all, + unique_counts, + unique_inverse, + unique_values, +) +from ._sorting import argsort, sort from ._testing import allclose from ._type_utils import can_cast, finfo, iinfo, result_type @@ -343,4 +350,10 @@ "__array_namespace_info__", "reciprocal", "angle", + "sort", + "argsort", + "unique_all", + "unique_counts", + "unique_inverse", + "unique_values", ] diff --git a/dpctl/tensor/_set_functions.py b/dpctl/tensor/_set_functions.py new file mode 100644 index 0000000000..f18d64d715 --- /dev/null +++ b/dpctl/tensor/_set_functions.py @@ -0,0 +1,606 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
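+#
+# This module provides the array API set functions (unique_values,
+# unique_counts, unique_inverse, unique_all), implemented on top of the
+# sorting, mask-accumulation and extraction kernels exposed by the
+# _tensor_sorting_impl, _tensor_impl and _tensor_elementwise_impl
+# extensions.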
+ +from typing import NamedTuple + +import dpctl +import dpctl.tensor as dpt + +from ._tensor_elementwise_impl import _not_equal, _subtract +from ._tensor_impl import ( + _copy_usm_ndarray_into_usm_ndarray, + _extract, + _full_usm_ndarray, + _linspace_step, + _take, + default_device_index_type, + mask_positions, +) +from ._tensor_sorting_impl import _argsort_ascending, _sort_ascending + +__all__ = [ + "unique_values", + "unique_counts", + "unique_inverse", + "unique_all", + "UniqueAllResult", + "UniqueCountsResult", + "UniqueInverseResult", +] + + +class UniqueAllResult(NamedTuple): + values: dpt.usm_ndarray + indices: dpt.usm_ndarray + inverse_indices: dpt.usm_ndarray + counts: dpt.usm_ndarray + + +class UniqueCountsResult(NamedTuple): + values: dpt.usm_ndarray + counts: dpt.usm_ndarray + + +class UniqueInverseResult(NamedTuple): + values: dpt.usm_ndarray + inverse_indices: dpt.usm_ndarray + + +def unique_values(x: dpt.usm_ndarray) -> dpt.usm_ndarray: + """unique_values(x) + + Returns the unique elements of an input array x. + + Args: + x (usm_ndarray): + input array. The input with more than one dimension is flattened. + Returns: + usm_ndarray + an array containing the set of unique elements in `x`. The + returned array has the same data type as `x`. + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") + array_api_dev = x.device + exec_q = array_api_dev.sycl_queue + if x.ndim == 1: + fx = x + else: + fx = dpt.reshape(x, (x.size,), order="C") + if fx.size == 0: + return fx + s = dpt.empty_like(fx, order="C") + host_tasks = [] + if fx.flags.c_contiguous: + ht_ev, sort_ev = _sort_ascending( + src=fx, trailing_dims_to_sort=1, dst=s, sycl_queue=exec_q + ) + host_tasks.append(ht_ev) + else: + tmp = dpt.empty_like(fx, order="C") + ht_ev, copy_ev = _copy_usm_ndarray_into_usm_ndarray( + src=fx, dst=tmp, sycl_queue=exec_q + ) + host_tasks.append(ht_ev) + ht_ev, sort_ev = _sort_ascending( + src=tmp, + trailing_dims_to_sort=1, + dst=s, + sycl_queue=exec_q, + depends=[copy_ev], + ) + host_tasks.append(ht_ev) + unique_mask = dpt.empty(fx.shape, dtype="?", sycl_queue=exec_q) + ht_ev, uneq_ev = _not_equal( + src1=s[:-1], + src2=s[1:], + dst=unique_mask[1:], + sycl_queue=exec_q, + depends=[sort_ev], + ) + host_tasks.append(ht_ev) + ht_ev, one_ev = _full_usm_ndarray( + fill_value=True, dst=unique_mask[0], sycl_queue=exec_q + ) + host_tasks.append(ht_ev) + cumsum = dpt.empty(s.shape, dtype=dpt.int64, sycl_queue=exec_q) + # synchronizing call + n_uniques = mask_positions( + unique_mask, cumsum, sycl_queue=exec_q, depends=[one_ev, uneq_ev] + ) + if n_uniques == fx.size: + dpctl.SyclEvent.wait_for(host_tasks) + return s + unique_vals = dpt.empty( + n_uniques, dtype=x.dtype, usm_type=x.usm_type, sycl_queue=exec_q + ) + ht_ev, _ = _extract( + src=s, + cumsum=cumsum, + axis_start=0, + axis_end=1, + dst=unique_vals, + sycl_queue=exec_q, + ) + host_tasks.append(ht_ev) + dpctl.SyclEvent.wait_for(host_tasks) + return unique_vals + + +def unique_counts(x: dpt.usm_ndarray) -> UniqueCountsResult: + """unique_counts(x) + + Returns the unique elements of an input array `x` and the corresponding + counts for each unique element in `x`. + + Args: + x (usm_ndarray): + input array. The input with more than one dimension is flattened. + Returns: + tuple[usm_ndarray, usm_ndarray] + a namedtuple `(values, counts)` whose + + * first element is the field name `values` and is an array + containing the unique elements of `x`. 
This array has the + same data type as `x`. + * second element has the field name `counts` and is an array + containing the number of times each unique element occurs in `x`. + This array has the same shape as `values` and has the default + array index data type. + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") + array_api_dev = x.device + exec_q = array_api_dev.sycl_queue + x_usm_type = x.usm_type + if x.ndim == 1: + fx = x + else: + fx = dpt.reshape(x, (x.size,), order="C") + ind_dt = default_device_index_type(exec_q) + if fx.size == 0: + return UniqueCountsResult(fx, dpt.empty_like(fx, dtype=ind_dt)) + s = dpt.empty_like(fx, order="C") + host_tasks = [] + if fx.flags.c_contiguous: + ht_ev, sort_ev = _sort_ascending( + src=fx, trailing_dims_to_sort=1, dst=s, sycl_queue=exec_q + ) + host_tasks.append(ht_ev) + else: + tmp = dpt.empty_like(fx, order="C") + ht_ev, copy_ev = _copy_usm_ndarray_into_usm_ndarray( + src=fx, dst=tmp, sycl_queue=exec_q + ) + host_tasks.append(ht_ev) + ht_ev, sort_ev = _sort_ascending( + src=tmp, + dst=s, + trailing_dims_to_sort=1, + sycl_queue=exec_q, + depends=[copy_ev], + ) + host_tasks.append(ht_ev) + unique_mask = dpt.empty(s.shape, dtype="?", sycl_queue=exec_q) + ht_ev, uneq_ev = _not_equal( + src1=s[:-1], + src2=s[1:], + dst=unique_mask[1:], + sycl_queue=exec_q, + depends=[sort_ev], + ) + host_tasks.append(ht_ev) + ht_ev, one_ev = _full_usm_ndarray( + fill_value=True, dst=unique_mask[0], sycl_queue=exec_q + ) + host_tasks.append(ht_ev) + cumsum = dpt.empty(unique_mask.shape, dtype=dpt.int64, sycl_queue=exec_q) + # synchronizing call + n_uniques = mask_positions( + unique_mask, cumsum, sycl_queue=exec_q, depends=[one_ev, uneq_ev] + ) + if n_uniques == fx.size: + dpctl.SyclEvent.wait_for(host_tasks) + return UniqueCountsResult( + s, + dpt.ones( + n_uniques, dtype=ind_dt, usm_type=x_usm_type, sycl_queue=exec_q + ), + ) + unique_vals = dpt.empty( + n_uniques, dtype=x.dtype, usm_type=x_usm_type, sycl_queue=exec_q + ) + # populate unique values + ht_ev, _ = _extract( + src=s, + cumsum=cumsum, + axis_start=0, + axis_end=1, + dst=unique_vals, + sycl_queue=exec_q, + ) + host_tasks.append(ht_ev) + unique_counts = dpt.empty( + n_uniques + 1, dtype=ind_dt, usm_type=x_usm_type, sycl_queue=exec_q + ) + idx = dpt.empty(x.size, dtype=ind_dt, sycl_queue=exec_q) + ht_ev, id_ev = _linspace_step(start=0, dt=1, dst=idx, sycl_queue=exec_q) + host_tasks.append(ht_ev) + ht_ev, extr_ev = _extract( + src=idx, + cumsum=cumsum, + axis_start=0, + axis_end=1, + dst=unique_counts[:-1], + sycl_queue=exec_q, + depends=[id_ev], + ) + host_tasks.append(ht_ev) + ht_ev, set_ev = _full_usm_ndarray( + x.size, dst=unique_counts[-1], sycl_queue=exec_q + ) + host_tasks.append(ht_ev) + _counts = dpt.empty_like(unique_counts[1:]) + ht_ev, _ = _subtract( + src1=unique_counts[1:], + src2=unique_counts[:-1], + dst=_counts, + sycl_queue=exec_q, + depends=[set_ev, extr_ev], + ) + host_tasks.append(ht_ev) + dpctl.SyclEvent.wait_for(host_tasks) + return UniqueCountsResult(unique_vals, _counts) + + +def unique_inverse(x): + """unique_inverse + + Returns the unique elements of an input array x and the indices from the + set of unique elements that reconstruct `x`. + + Args: + x (usm_ndarray): + input array. The input with more than one dimension is flattened. 
+ Returns: + tuple[usm_ndarray, usm_ndarray] + a namedtuple `(values, inverse_indices)` whose + + * first element has the field name `values` and is an array + containing the unique elements of `x`. The array has the same + data type as `x`. + * second element has the field name `inverse_indices` and is an + array containing the indices of values that reconstruct `x`. + The array has the same shape as `x` and has the default array + index data type. + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") + array_api_dev = x.device + exec_q = array_api_dev.sycl_queue + x_usm_type = x.usm_type + ind_dt = default_device_index_type(exec_q) + if x.ndim == 1: + fx = x + else: + fx = dpt.reshape(x, (x.size,), order="C") + sorting_ids = dpt.empty_like(fx, dtype=ind_dt, order="C") + unsorting_ids = dpt.empty_like(sorting_ids, dtype=ind_dt, order="C") + if fx.size == 0: + return UniqueInverseResult(fx, dpt.reshape(unsorting_ids, x.shape)) + host_tasks = [] + if fx.flags.c_contiguous: + ht_ev, sort_ev = _argsort_ascending( + src=fx, trailing_dims_to_sort=1, dst=sorting_ids, sycl_queue=exec_q + ) + host_tasks.append(ht_ev) + else: + tmp = dpt.empty_like(fx, order="C") + ht_ev, copy_ev = _copy_usm_ndarray_into_usm_ndarray( + src=fx, dst=tmp, sycl_queue=exec_q + ) + host_tasks.append(ht_ev) + ht_ev, sort_ev = _argsort_ascending( + src=tmp, + trailing_dims_to_sort=1, + dst=sorting_ids, + sycl_queue=exec_q, + depends=[copy_ev], + ) + host_tasks.append(ht_ev) + ht_ev, _ = _argsort_ascending( + src=sorting_ids, + trailing_dims_to_sort=1, + dst=unsorting_ids, + sycl_queue=exec_q, + depends=[sort_ev], + ) + host_tasks.append(ht_ev) + s = dpt.empty_like(fx) + # s = fx[sorting_ids] + ht_ev, take_ev = _take( + src=fx, + ind=(sorting_ids,), + dst=s, + axis_start=0, + mode=0, + sycl_queue=exec_q, + depends=[sort_ev], + ) + host_tasks.append(ht_ev) + unique_mask = dpt.empty(fx.shape, dtype="?", sycl_queue=exec_q) + ht_ev, uneq_ev = _not_equal( + src1=s[:-1], + src2=s[1:], + dst=unique_mask[1:], + sycl_queue=exec_q, + depends=[take_ev], + ) + host_tasks.append(ht_ev) + ht_ev, one_ev = _full_usm_ndarray( + fill_value=True, dst=unique_mask[0], sycl_queue=exec_q + ) + host_tasks.append(ht_ev) + cumsum = dpt.empty(unique_mask.shape, dtype=dpt.int64, sycl_queue=exec_q) + # synchronizing call + n_uniques = mask_positions( + unique_mask, cumsum, sycl_queue=exec_q, depends=[uneq_ev, one_ev] + ) + if n_uniques == fx.size: + dpctl.SyclEvent.wait_for(host_tasks) + return UniqueInverseResult(s, dpt.reshape(unsorting_ids, x.shape)) + unique_vals = dpt.empty( + n_uniques, dtype=x.dtype, usm_type=x_usm_type, sycl_queue=exec_q + ) + ht_ev, _ = _extract( + src=s, + cumsum=cumsum, + axis_start=0, + axis_end=1, + dst=unique_vals, + sycl_queue=exec_q, + ) + host_tasks.append(ht_ev) + cum_unique_counts = dpt.empty( + n_uniques + 1, dtype=ind_dt, usm_type=x_usm_type, sycl_queue=exec_q + ) + idx = dpt.empty(x.size, dtype=ind_dt, sycl_queue=exec_q) + ht_ev, id_ev = _linspace_step(start=0, dt=1, dst=idx, sycl_queue=exec_q) + host_tasks.append(ht_ev) + ht_ev, extr_ev = _extract( + src=idx, + cumsum=cumsum, + axis_start=0, + axis_end=1, + dst=cum_unique_counts[:-1], + sycl_queue=exec_q, + depends=[id_ev], + ) + host_tasks.append(ht_ev) + ht_ev, set_ev = _full_usm_ndarray( + x.size, dst=cum_unique_counts[-1], sycl_queue=exec_q + ) + host_tasks.append(ht_ev) + _counts = dpt.empty_like(cum_unique_counts[1:]) + ht_ev, _ = _subtract( + src1=cum_unique_counts[1:], + 
src2=cum_unique_counts[:-1], + dst=_counts, + sycl_queue=exec_q, + depends=[set_ev, extr_ev], + ) + host_tasks.append(ht_ev) + # TODO: when searchsorted is available, + # inv = searchsorted(unique_vals, fx) + dpctl.SyclEvent.wait_for(host_tasks) + counts = dpt.asnumpy(_counts).tolist() + inv = dpt.empty_like(fx, dtype=ind_dt) + pos = 0 + host_tasks = [] + for i in range(len(counts)): + pos_next = pos + counts[i] + _dst = inv[pos:pos_next] + ht_ev, _ = _full_usm_ndarray(fill_value=i, dst=_dst, sycl_queue=exec_q) + pos = pos_next + host_tasks.append(ht_ev) + dpctl.SyclEvent.wait_for(host_tasks) + return UniqueInverseResult( + unique_vals, dpt.reshape(inv[unsorting_ids], x.shape) + ) + + +def unique_all(x: dpt.usm_ndarray) -> UniqueAllResult: + """unique_all(x) + + Returns the unique elements of an input array `x`, the first occurring + indices for each unique element in `x`, the indices from the set of unique + elements that reconstruct `x`, and the corresponding counts for each + unique element in `x`. + + Args: + x (usm_ndarray): + input array. The input with more than one dimension is flattened. + Returns: + tuple[usm_ndarray, usm_ndarray, usm_ndarray, usm_ndarray] + a namedtuple `(values, indices, inverse_indices, counts)` whose + + * first element has the field name `values` and is an array + containing the unique elements of `x`. The array has the same + data type as `x`. + * second element has the field name `indices` and is an array + the indices (of first occurrences) of `x` that result in + `values`. The array has the same shape as `values` and has the + default array index data type. + * third element has the field name `inverse_indices` and is an + array containing the indices of values that reconstruct `x`. + The array has the same shape as `x` and has the default array + index data type. + * fourth element has the field name `counts` and is an array + containing the number of times each unique element occurs in `x`. + This array has the same shape as `values` and has the default + array index data type. 
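+
+    A minimal illustrative sketch (array values are hypothetical):
+
+        import dpctl.tensor as dpt
+
+        x = dpt.asarray([1, 2, 1, 3, 2])
+        r = dpt.unique_all(x)
+        # r.values           -> unique elements of x
+        # r.indices          -> index of first occurrence of each unique value
+        # r.inverse_indices  -> indices reconstructing x from r.values
+        # r.counts           -> number of occurrences of each unique value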
+ """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") + array_api_dev = x.device + exec_q = array_api_dev.sycl_queue + x_usm_type = x.usm_type + ind_dt = default_device_index_type(exec_q) + if x.ndim == 1: + fx = x + else: + fx = dpt.reshape(x, (x.size,), order="C") + sorting_ids = dpt.empty_like(fx, dtype=ind_dt, order="C") + unsorting_ids = dpt.empty_like(sorting_ids, dtype=ind_dt, order="C") + if fx.size == 0: + # original array contains no data + # so it can be safely returned as values + return UniqueAllResult( + fx, + sorting_ids, + dpt.reshape(unsorting_ids, x.shape), + dpt.empty_like(fx, dtype=ind_dt), + ) + host_tasks = [] + if fx.flags.c_contiguous: + ht_ev, sort_ev = _argsort_ascending( + src=fx, trailing_dims_to_sort=1, dst=sorting_ids, sycl_queue=exec_q + ) + host_tasks.append(ht_ev) + else: + tmp = dpt.empty_like(fx, order="C") + ht_ev, copy_ev = _copy_usm_ndarray_into_usm_ndarray( + src=fx, dst=tmp, sycl_queue=exec_q + ) + host_tasks.append(ht_ev) + ht_ev, sort_ev = _argsort_ascending( + src=tmp, + trailing_dims_to_sort=1, + dst=sorting_ids, + sycl_queue=exec_q, + depends=[copy_ev], + ) + host_tasks.append(ht_ev) + ht_ev, _ = _argsort_ascending( + src=sorting_ids, + trailing_dims_to_sort=1, + dst=unsorting_ids, + sycl_queue=exec_q, + depends=[sort_ev], + ) + host_tasks.append(ht_ev) + s = dpt.empty_like(fx) + # s = fx[sorting_ids] + ht_ev, take_ev = _take( + src=fx, + ind=(sorting_ids,), + dst=s, + axis_start=0, + mode=0, + sycl_queue=exec_q, + depends=[sort_ev], + ) + host_tasks.append(ht_ev) + unique_mask = dpt.empty(fx.shape, dtype="?", sycl_queue=exec_q) + ht_ev, uneq_ev = _not_equal( + src1=s[:-1], + src2=s[1:], + dst=unique_mask[1:], + sycl_queue=exec_q, + depends=[take_ev], + ) + host_tasks.append(ht_ev) + ht_ev, one_ev = _full_usm_ndarray( + fill_value=True, dst=unique_mask[0], sycl_queue=exec_q + ) + host_tasks.append(ht_ev) + cumsum = dpt.empty(unique_mask.shape, dtype=dpt.int64, sycl_queue=exec_q) + # synchronizing call + n_uniques = mask_positions( + unique_mask, cumsum, sycl_queue=exec_q, depends=[uneq_ev, one_ev] + ) + if n_uniques == fx.size: + dpctl.SyclEvent.wait_for(host_tasks) + _counts = dpt.ones( + n_uniques, dtype=ind_dt, usm_type=x_usm_type, sycl_queue=exec_q + ) + return UniqueAllResult( + s, + sorting_ids, + dpt.reshape(unsorting_ids, x.shape), + _counts, + ) + unique_vals = dpt.empty( + n_uniques, dtype=x.dtype, usm_type=x_usm_type, sycl_queue=exec_q + ) + ht_ev, _ = _extract( + src=s, + cumsum=cumsum, + axis_start=0, + axis_end=1, + dst=unique_vals, + sycl_queue=exec_q, + ) + host_tasks.append(ht_ev) + cum_unique_counts = dpt.empty( + n_uniques + 1, dtype=ind_dt, usm_type=x_usm_type, sycl_queue=exec_q + ) + idx = dpt.empty(x.size, dtype=ind_dt, sycl_queue=exec_q) + ht_ev, id_ev = _linspace_step(start=0, dt=1, dst=idx, sycl_queue=exec_q) + host_tasks.append(ht_ev) + ht_ev, extr_ev = _extract( + src=idx, + cumsum=cumsum, + axis_start=0, + axis_end=1, + dst=cum_unique_counts[:-1], + sycl_queue=exec_q, + depends=[id_ev], + ) + host_tasks.append(ht_ev) + ht_ev, set_ev = _full_usm_ndarray( + x.size, dst=cum_unique_counts[-1], sycl_queue=exec_q + ) + host_tasks.append(ht_ev) + _counts = dpt.empty_like(cum_unique_counts[1:]) + ht_ev, _ = _subtract( + src1=cum_unique_counts[1:], + src2=cum_unique_counts[:-1], + dst=_counts, + sycl_queue=exec_q, + depends=[set_ev, extr_ev], + ) + host_tasks.append(ht_ev) + # TODO: when searchsorted is available, + # inv = searchsorted(unique_vals, fx) + 
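+    # Until searchsorted is available, bring the per-value counts to the
+    # host and fill the sorted inverse map segment by segment: positions
+    # [pos, pos + counts[i]) receive index i. Indexing with unsorting_ids
+    # below restores the original element order.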
dpctl.SyclEvent.wait_for(host_tasks) + counts = dpt.asnumpy(_counts).tolist() + inv = dpt.empty_like(fx, dtype=ind_dt) + pos = 0 + host_tasks = [] + for i in range(len(counts)): + pos_next = pos + counts[i] + _dst = inv[pos:pos_next] + ht_ev, _ = _full_usm_ndarray(fill_value=i, dst=_dst, sycl_queue=exec_q) + pos = pos_next + host_tasks.append(ht_ev) + dpctl.SyclEvent.wait_for(host_tasks) + return UniqueAllResult( + unique_vals, + sorting_ids[cum_unique_counts[:-1]], + dpt.reshape(inv[unsorting_ids], x.shape), + _counts, + ) diff --git a/dpctl/tensor/_sorting.py b/dpctl/tensor/_sorting.py new file mode 100644 index 0000000000..661c3bc1c4 --- /dev/null +++ b/dpctl/tensor/_sorting.py @@ -0,0 +1,186 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from numpy.core.numeric import normalize_axis_index + +import dpctl +import dpctl.tensor as dpt +import dpctl.tensor._tensor_impl as ti + +from ._tensor_sorting_impl import ( + _argsort_ascending, + _argsort_descending, + _sort_ascending, + _sort_descending, +) + +__all__ = ["sort", "argsort"] + + +def sort(x, /, *, axis=-1, descending=False, stable=True): + """sort(x, axis=-1, descending=False, stable=True) + + Returns a sorted copy of an input array `x`. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int]): + axis along which to sort. If set to `-1`, the function + must sort along the last axis. Default: `-1`. + descending (Optional[bool]): + sort order. If `True`, the array must be sorted in descending + order (by value). If `False`, the array must be sorted in + ascending order (by value). Default: `False`. + stable (Optional[bool]): + sort stability. If `True`, the returned array must maintain the + relative order of `x` values which compare as equal. If `False`, + the returned array may or may not maintain the relative order of + `x` values which compare as equal. Default: `True`. + + Returns: + usm_ndarray: + a sorted array. The returned array has the same data type and + the same shape as the input array `x`. 
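+
+    A minimal illustrative sketch (array values are hypothetical):
+
+        import dpctl.tensor as dpt
+
+        x = dpt.asarray([[3, 1], [2, 0]])
+        y = dpt.sort(x, axis=0)           # sorts each column independently
+        z = dpt.sort(x, descending=True)  # sorts each row in descending order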
+ """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError( + f"Expected type dpctl.tensor.usm_ndarray, got {type(x)}" + ) + nd = x.ndim + if nd == 0: + axis = normalize_axis_index(axis, ndim=1, msg_prefix="axis") + return dpt.copy(x, order="C") + else: + axis = normalize_axis_index(axis, ndim=nd, msg_prefix="axis") + a1 = axis + 1 + if a1 == nd: + perm = list(range(nd)) + arr = x + else: + perm = [i for i in range(nd) if i != axis] + [ + axis, + ] + arr = dpt.permute_dims(x, perm) + exec_q = x.sycl_queue + host_tasks_list = [] + impl_fn = _sort_descending if descending else _sort_ascending + if arr.flags.c_contiguous: + res = dpt.empty_like(arr, order="C") + ht_ev, _ = impl_fn( + src=arr, trailing_dims_to_sort=1, dst=res, sycl_queue=exec_q + ) + host_tasks_list.append(ht_ev) + else: + tmp = dpt.empty_like(arr, order="C") + ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=arr, dst=tmp, sycl_queue=exec_q + ) + host_tasks_list.append(ht_ev) + res = dpt.empty_like(arr, order="C") + ht_ev, _ = impl_fn( + src=tmp, + trailing_dims_to_sort=1, + dst=res, + sycl_queue=exec_q, + depends=[copy_ev], + ) + host_tasks_list.append(ht_ev) + if a1 != nd: + inv_perm = sorted(range(nd), key=lambda d: perm[d]) + res = dpt.permute_dims(res, inv_perm) + dpctl.SyclEvent.wait_for(host_tasks_list) + return res + + +def argsort(x, axis=-1, descending=False, stable=True): + """argsort(x, axis=-1, descending=False, stable=True) + + Returns the indices that sort an array `x` along a specified axis. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int]): + axis along which to sort. If set to `-1`, the function + must sort along the last axis. Default: `-1`. + descending (Optional[bool]): + sort order. If `True`, the array must be sorted in descending + order (by value). If `False`, the array must be sorted in + ascending order (by value). Default: `False`. + stable (Optional[bool]): + sort stability. If `True`, the returned array must maintain the + relative order of `x` values which compare as equal. If `False`, + the returned array may or may not maintain the relative order of + `x` values which compare as equal. Default: `True`. + + Returns: + usm_ndarray: + an array of indices. The returned array has the same shape as + the input array `x`. The return array has default array index + data type. 
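+
+    A minimal illustrative sketch (array values are hypothetical):
+
+        import dpctl.tensor as dpt
+
+        x = dpt.asarray([30, 10, 20])
+        ind = dpt.argsort(x)  # indices such that dpt.take(x, ind) is sorted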
+ """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError( + f"Expected type dpctl.tensor.usm_ndarray, got {type(x)}" + ) + nd = x.ndim + if nd == 0: + axis = normalize_axis_index(axis, ndim=1, msg_prefix="axis") + return dpt.zeros_like( + x, dtype=ti.default_device_index_type(x.sycl_queue), order="C" + ) + else: + axis = normalize_axis_index(axis, ndim=nd, msg_prefix="axis") + a1 = axis + 1 + if a1 == nd: + perm = list(range(nd)) + arr = x + else: + perm = [i for i in range(nd) if i != axis] + [ + axis, + ] + arr = dpt.permute_dims(x, perm) + exec_q = x.sycl_queue + host_tasks_list = [] + impl_fn = _argsort_descending if descending else _argsort_ascending + index_dt = ti.default_device_index_type(exec_q) + if arr.flags.c_contiguous: + res = dpt.empty_like(arr, dtype=index_dt, order="C") + ht_ev, _ = impl_fn( + src=arr, trailing_dims_to_sort=1, dst=res, sycl_queue=exec_q + ) + host_tasks_list.append(ht_ev) + else: + tmp = dpt.empty_like(arr, order="C") + ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=arr, dst=tmp, sycl_queue=exec_q + ) + host_tasks_list.append(ht_ev) + res = dpt.empty_like(arr, dtype=index_dt, order="C") + ht_ev, _ = impl_fn( + src=tmp, + trailing_dims_to_sort=1, + dst=res, + sycl_queue=exec_q, + depends=[copy_ev], + ) + host_tasks_list.append(ht_ev) + if a1 != nd: + inv_perm = sorted(range(nd), key=lambda d: perm[d]) + res = dpt.permute_dims(res, inv_perm) + dpctl.SyclEvent.wait_for(host_tasks_list) + return res diff --git a/dpctl/tensor/libtensor/include/kernels/accumulators.hpp b/dpctl/tensor/libtensor/include/kernels/accumulators.hpp index bd6ad20b6d..491fb12126 100644 --- a/dpctl/tensor/libtensor/include/kernels/accumulators.hpp +++ b/dpctl/tensor/libtensor/include/kernels/accumulators.hpp @@ -51,23 +51,6 @@ template T ceiling_quotient(T n, T m) { return (n + m - 1) / m; } -template T1 ceiling_quotient(T1 n, T2 m) -{ - return ceiling_quotient(n, static_cast(m)); -} - -template -class inclusive_scan_rec_local_scan_krn; - -template -class inclusive_scan_rec_chunk_update_krn; template struct NonZeroIndicator { @@ -93,127 +76,261 @@ template struct NoOpTransformer } }; -/* - * for integer type maskT, - * output[j] = sum( input[s0 + i * s1], 0 <= i <= j) - * for 0 <= j < n_elems - */ +// Iterative cumulative summation + +using nwiT = std::uint16_t; + template -sycl::event inclusive_scan_rec(sycl::queue &exec_q, - size_t n_elems, - size_t wg_size, - const inputT *input, - outputT *output, - size_t s0, - size_t s1, - IndexerT indexer, - TransformerT transformer, - std::vector const &depends = {}) +class inclusive_scan_iter_local_scan_krn; + +template +class inclusive_scan_iter_chunk_update_krn; + +template +sycl::event +inclusive_scan_base_step(sycl::queue &exec_q, + const size_t wg_size, + const size_t n_elems, + const inputT *input, + outputT *output, + const size_t s0, + const size_t s1, + IndexerT indexer, + TransformerT transformer, + size_t &n_groups, + const std::vector &depends = {}) { - size_t n_groups = ceiling_quotient(n_elems, n_wi * wg_size); + n_groups = ceiling_quotient(n_elems, n_wi * wg_size); - const sycl::event &inc_scan_phase1_ev = - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); + sycl::event inc_scan_phase1_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); - using slmT = sycl::local_accessor; + using slmT = sycl::local_accessor; - auto lws = sycl::range<1>(wg_size); - auto gws = sycl::range<1>(n_groups * wg_size); + auto lws = sycl::range<1>(wg_size); + auto gws = sycl::range<1>(n_groups * 
wg_size); - auto ndRange = sycl::nd_range<1>(gws, lws); + auto ndRange = sycl::nd_range<1>(gws, lws); - slmT slm_iscan_tmp(lws, cgh); + slmT slm_iscan_tmp(lws, cgh); - using KernelName = inclusive_scan_rec_local_scan_krn< - inputT, outputT, n_wi, IndexerT, decltype(transformer)>; + using KernelName = + inclusive_scan_iter_local_scan_krn; - cgh.parallel_for(ndRange, [=, slm_iscan_tmp = std::move( - slm_iscan_tmp)]( - sycl::nd_item<1> it) { - auto chunk_gid = it.get_global_id(0); - auto lid = it.get_local_id(0); + cgh.parallel_for(ndRange, [=, slm_iscan_tmp = + std::move(slm_iscan_tmp)]( + sycl::nd_item<1> it) { + auto chunk_gid = it.get_global_id(0); + auto lid = it.get_local_id(0); - std::array local_isum; + std::array local_isum; - size_t i = chunk_gid * n_wi; - for (size_t m_wi = 0; m_wi < n_wi; ++m_wi) { - constexpr outputT out_zero(0); + size_t i = chunk_gid * n_wi; + +#pragma unroll + for (nwiT m_wi = 0; m_wi < n_wi; ++m_wi) { + constexpr outputT out_zero(0); - local_isum[m_wi] = - (i + m_wi < n_elems) - ? transformer(input[indexer(s0 + s1 * (i + m_wi))]) - : out_zero; - } + local_isum[m_wi] = + (i + m_wi < n_elems) + ? transformer(input[indexer(s0 + s1 * (i + m_wi))]) + : out_zero; + } #pragma unroll - for (size_t m_wi = 1; m_wi < n_wi; ++m_wi) { - local_isum[m_wi] += local_isum[m_wi - 1]; - } - // local_isum is now result of - // inclusive scan of locally stored inputs + for (nwiT m_wi = 1; m_wi < n_wi; ++m_wi) { + local_isum[m_wi] += local_isum[m_wi - 1]; + } + // local_isum is now result of + // inclusive scan of locally stored inputs - size_t wg_iscan_val = sycl::inclusive_scan_over_group( - it.get_group(), local_isum.back(), sycl::plus(), - size_t(0)); + outputT wg_iscan_val = sycl::inclusive_scan_over_group( + it.get_group(), local_isum.back(), sycl::plus(), + outputT(0)); - slm_iscan_tmp[(lid + 1) % wg_size] = wg_iscan_val; - it.barrier(sycl::access::fence_space::local_space); - size_t addand = (lid == 0) ? 0 : slm_iscan_tmp[lid]; + slm_iscan_tmp[(lid + 1) % wg_size] = wg_iscan_val; + it.barrier(sycl::access::fence_space::local_space); + outputT addand = (lid == 0) ? 
outputT(0) : slm_iscan_tmp[lid]; #pragma unroll - for (size_t m_wi = 0; m_wi < n_wi; ++m_wi) { - local_isum[m_wi] += addand; - } - - for (size_t m_wi = 0; m_wi < n_wi && i + m_wi < n_elems; ++m_wi) - { - output[i + m_wi] = local_isum[m_wi]; - } - }); + for (nwiT m_wi = 0; m_wi < n_wi; ++m_wi) { + local_isum[m_wi] += addand; + } + + for (nwiT m_wi = 0; (m_wi < n_wi) && (i + m_wi < n_elems); ++m_wi) { + output[i + m_wi] = local_isum[m_wi]; + } }); + }); - sycl::event out_event = inc_scan_phase1_ev; - if (n_groups > 1) { - outputT *temp = sycl::malloc_device(n_groups - 1, exec_q); + return inc_scan_phase1_ev; +} + +namespace +{ +template class stack_t +{ + T *src_; + size_t size_; + T *local_scans_; + +public: + stack_t() : src_{}, size_{}, local_scans_{} {} + stack_t(T *src, size_t sz, T *local_scans) + : src_(src), size_(sz), local_scans_(local_scans) + { + } + ~stack_t(){}; + + T *get_src_ptr() const + { + return src_; + } + + const T *get_src_const_ptr() const + { + return src_; + } + size_t get_size() const + { + return size_; + } + + T *get_local_scans_ptr() const + { + return local_scans_; + } +}; +} // end of anonymous namespace + +/* + * for integer type maskT, + * output[j] = sum( input[s0 + i * s1], 0 <= i <= j) + * for 0 <= j < n_elems + */ +template +sycl::event inclusive_scan_iter(sycl::queue &exec_q, + const size_t wg_size, + const size_t n_elems, + const inputT *input, + outputT *output, + const size_t s0, + const size_t s1, + IndexerT indexer, + TransformerT transformer, + std::vector &host_tasks, + const std::vector &depends = {}) +{ + size_t n_groups; + sycl::event inc_scan_phase1_ev = + inclusive_scan_base_step( + exec_q, wg_size, n_elems, input, output, s0, s1, indexer, + transformer, n_groups, depends); + + sycl::event dependent_event = inc_scan_phase1_ev; + if (n_groups > 1) { auto chunk_size = wg_size * n_wi; + // how much of temporary allocation do we need + size_t n_groups_ = n_groups; + size_t temp_size = 0; + while (n_groups_ > 1) { + const auto this_size = (n_groups_ - 1); + temp_size += this_size; + n_groups_ = ceiling_quotient(this_size, chunk_size); + } + + // allocate + outputT *temp = sycl::malloc_device(temp_size, exec_q); + + if (!temp) { + throw std::bad_alloc(); + } + + std::vector> stack{}; + + // inclusive scans over blocks + n_groups_ = n_groups; + outputT *src = output; + outputT *local_scans = temp; + NoOpIndexer _no_op_indexer{}; - NoOpTransformer _no_op_transformer{}; - auto e2 = inclusive_scan_rec( - exec_q, n_groups - 1, wg_size, output, temp, chunk_size - 1, - chunk_size, _no_op_indexer, _no_op_transformer, - {inc_scan_phase1_ev}); - - // output[ chunk_size * (i + 1) + j] += temp[i] - auto e3 = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(e2); - cgh.parallel_for>( - {n_elems}, [=](auto wiid) - { - auto gid = wiid[0]; - auto i = (gid / chunk_size); - output[gid] += (i > 0) ? 
temp[i - 1] : 0; + using NoOpTransformerT = NoOpTransformer; + NoOpTransformerT _no_op_transformer{}; + size_t size_to_update = n_elems; + while (n_groups_ > 1) { + + size_t src_size = n_groups_ - 1; + dependent_event = + inclusive_scan_base_step( + exec_q, wg_size, src_size, src, local_scans, chunk_size - 1, + chunk_size, _no_op_indexer, _no_op_transformer, + n_groups_, // n_groups_ is modified in place + {dependent_event}); + stack.push_back({src, size_to_update, local_scans}); + src = local_scans; + local_scans += src_size; + size_to_update = src_size; + } + + for (size_t reverse_stack_id = 0; reverse_stack_id < stack.size(); + ++reverse_stack_id) + { + auto stack_id = stack.size() - 1 - reverse_stack_id; + + auto stack_elem = stack[stack_id]; + outputT *src = stack_elem.get_src_ptr(); + size_t src_size = stack_elem.get_size(); + outputT *local_scans = stack_elem.get_local_scans_ptr(); + + // output[ chunk_size * (i + 1) + j] += temp[i] + dependent_event = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(dependent_event); + + using UpdateKernelName = + class inclusive_scan_iter_chunk_update_krn< + inputT, outputT, n_wi, IndexerT, TransformerT, + NoOpIndexer, NoOpTransformerT>; + + cgh.parallel_for( + {src_size}, [chunk_size, src, local_scans](auto wiid) { + auto gid = wiid[0]; + auto i = (gid / chunk_size); + src[gid] += (i > 0) ? local_scans[i - 1] : outputT(0); + }); }); - }); + } sycl::event e4 = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(e3); + cgh.depends_on(dependent_event); const auto &ctx = exec_q.get_context(); cgh.host_task([ctx, temp]() { sycl::free(temp, ctx); }); }); - - out_event = std::move(e4); + host_tasks.push_back(e4); } - return out_event; + return dependent_event; } typedef size_t (*accumulate_contig_impl_fn_ptr_t)( @@ -221,29 +338,44 @@ typedef size_t (*accumulate_contig_impl_fn_ptr_t)( size_t, const char *, char *, - std::vector const &); + std::vector &, + const std::vector &); template size_t accumulate_contig_impl(sycl::queue &q, size_t n_elems, const char *mask, char *cumsum, - std::vector const &depends = {}) + std::vector &host_tasks, + const std::vector &depends = {}) { - constexpr int n_wi = 8; const maskT *mask_data_ptr = reinterpret_cast(mask); cumsumT *cumsum_data_ptr = reinterpret_cast(cumsum); - size_t wg_size = 128; NoOpIndexer flat_indexer{}; transformerT non_zero_indicator{}; - const sycl::event &comp_ev = - inclusive_scan_rec( - q, n_elems, wg_size, mask_data_ptr, cumsum_data_ptr, 0, 1, - flat_indexer, non_zero_indicator, depends); - + sycl::event comp_ev; + + const sycl::device &dev = q.get_device(); + if (dev.has(sycl::aspect::cpu)) { + constexpr nwiT n_wi_for_cpu = 8; + size_t wg_size = 256; + comp_ev = inclusive_scan_iter( + q, wg_size, n_elems, mask_data_ptr, cumsum_data_ptr, 0, 1, + flat_indexer, non_zero_indicator, host_tasks, depends); + } + else { + constexpr nwiT n_wi_for_gpu = 4; + size_t wg_size = 256; + comp_ev = inclusive_scan_iter( + q, wg_size, n_elems, mask_data_ptr, cumsum_data_ptr, 0, 1, + flat_indexer, non_zero_indicator, host_tasks, depends); + } cumsumT *last_elem = cumsum_data_ptr + (n_elems - 1); cumsumT *last_elem_host_usm = sycl::malloc_host(1, q); @@ -251,8 +383,10 @@ size_t accumulate_contig_impl(sycl::queue &q, if (last_elem_host_usm == nullptr) { throw std::bad_alloc(); } - sycl::event copy_e = - q.copy(last_elem, last_elem_host_usm, 1, {comp_ev}); + sycl::event copy_e = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(comp_ev); + cgh.copy(last_elem, last_elem_host_usm, 1); + }); 
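+    // Wait for the device-to-host copy and read the last element of the
+    // cumulative sum, i.e. the total accumulated value returned to the
+    // caller.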
copy_e.wait(); size_t return_val = static_cast(*last_elem_host_usm); sycl::free(last_elem_host_usm, q); @@ -305,7 +439,8 @@ typedef size_t (*accumulate_strided_impl_fn_ptr_t)( int, const py::ssize_t *, char *, - std::vector const &); + std::vector &, + const std::vector &); template size_t accumulate_strided_impl(sycl::queue &q, @@ -314,21 +449,35 @@ size_t accumulate_strided_impl(sycl::queue &q, int nd, const py::ssize_t *shape_strides, char *cumsum, - std::vector const &depends = {}) + std::vector &host_tasks, + const std::vector &depends = {}) { - constexpr int n_wi = 8; const maskT *mask_data_ptr = reinterpret_cast(mask); cumsumT *cumsum_data_ptr = reinterpret_cast(cumsum); - size_t wg_size = 128; StridedIndexer strided_indexer{nd, 0, shape_strides}; transformerT non_zero_indicator{}; - const sycl::event &comp_ev = - inclusive_scan_rec( - q, n_elems, wg_size, mask_data_ptr, cumsum_data_ptr, 0, 1, - strided_indexer, non_zero_indicator, depends); + const sycl::device &dev = q.get_device(); + sycl::event comp_ev; + if (dev.has(sycl::aspect::cpu)) { + constexpr nwiT n_wi_for_cpu = 8; + size_t wg_size = 256; + comp_ev = inclusive_scan_iter( + q, wg_size, n_elems, mask_data_ptr, cumsum_data_ptr, 0, 1, + strided_indexer, non_zero_indicator, host_tasks, depends); + } + else { + constexpr nwiT n_wi_for_gpu = 4; + size_t wg_size = 256; + comp_ev = inclusive_scan_iter( + q, wg_size, n_elems, mask_data_ptr, cumsum_data_ptr, 0, 1, + strided_indexer, non_zero_indicator, host_tasks, depends); + } cumsumT *last_elem = cumsum_data_ptr + (n_elems - 1); @@ -337,8 +486,10 @@ size_t accumulate_strided_impl(sycl::queue &q, if (last_elem_host_usm == nullptr) { throw std::bad_alloc(); } - sycl::event copy_e = - q.copy(last_elem, last_elem_host_usm, 1, {comp_ev}); + sycl::event copy_e = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(comp_ev); + cgh.copy(last_elem, last_elem_host_usm, 1); + }); copy_e.wait(); size_t return_val = static_cast(*last_elem_host_usm); sycl::free(last_elem_host_usm, q); diff --git a/dpctl/tensor/libtensor/include/kernels/boolean_reductions.hpp b/dpctl/tensor/libtensor/include/kernels/boolean_reductions.hpp index 61fb0f6ba0..877680c8bf 100644 --- a/dpctl/tensor/libtensor/include/kernels/boolean_reductions.hpp +++ b/dpctl/tensor/libtensor/include/kernels/boolean_reductions.hpp @@ -315,8 +315,10 @@ boolean_reduction_axis1_contig_impl(sycl::queue &exec_q, }); } else { - sycl::event init_ev = exec_q.fill(res_tp, resTy(identity_val), - iter_nelems, depends); + sycl::event init_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.fill(res_tp, resTy(identity_val), iter_nelems); + }); red_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(init_ev); @@ -484,8 +486,10 @@ boolean_reduction_axis0_contig_impl(sycl::queue &exec_q, size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); { - sycl::event init_ev = exec_q.fill(res_tp, resTy(identity_val), - iter_nelems, depends); + sycl::event init_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.fill(res_tp, resTy(identity_val), iter_nelems); + }); sycl::event red_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(init_ev); diff --git a/dpctl/tensor/libtensor/include/kernels/sorting.hpp b/dpctl/tensor/libtensor/include/kernels/sorting.hpp new file mode 100644 index 0000000000..e7a024259e --- /dev/null +++ b/dpctl/tensor/libtensor/include/kernels/sorting.hpp @@ -0,0 +1,913 @@ +//=== sorting.hpp - Implementation of sorting kernels ---*-C++-*--/===// +// +// Data 
Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for tensor sort/argsort operations. +//===----------------------------------------------------------------------===// + +#pragma once + +#include "pybind11/pybind11.h" + +#include +#include +#include +#include +#include +#include + +namespace dpctl +{ +namespace tensor +{ +namespace kernels +{ + +namespace sort_detail +{ + +template T quotient_ceil(T n, T m) +{ + return (n + m - 1) / m; +} + +template +std::size_t lower_bound_impl(const Acc acc, + std::size_t first, + std::size_t last, + const Value &value, + Compare comp) +{ + std::size_t n = last - first; + std::size_t cur = n; + std::size_t it; + while (n > 0) { + it = first; + cur = n / 2; + it += cur; + if (comp(acc[it], value)) { + n -= cur + 1, first = ++it; + } + else + n = cur; + } + return first; +} + +template +std::size_t upper_bound_impl(const Acc acc, + const std::size_t first, + const std::size_t last, + const Value &value, + Compare comp) +{ + const auto &op_comp = [comp](auto x, auto y) { return !comp(y, x); }; + return lower_bound_impl(acc, first, last, value, op_comp); +} + +/*! 
@brief Merge two contiguous sorted segments */ +template +void merge_impl(const std::size_t offset, + const InAcc in_acc, + OutAcc out_acc, + const std::size_t start_1, + const std::size_t end_1, + const std::size_t end_2, + const std::size_t start_out, + Compare comp, + const std::size_t chunk) +{ + const std::size_t start_2 = end_1; + // Borders of the sequences to merge within this call + const std::size_t local_start_1 = sycl::min(offset + start_1, end_1); + const std::size_t local_end_1 = sycl::min(local_start_1 + chunk, end_1); + const std::size_t local_start_2 = sycl::min(offset + start_2, end_2); + const std::size_t local_end_2 = sycl::min(local_start_2 + chunk, end_2); + + const std::size_t local_size_1 = local_end_1 - local_start_1; + const std::size_t local_size_2 = local_end_2 - local_start_2; + + const auto r_item_1 = in_acc[end_1 - 1]; + const auto l_item_2 = in_acc[start_2]; + + // Copy if the sequences are sorted with respect to each other or merge + // otherwise + if (!comp(l_item_2, r_item_1)) { + const std::size_t out_shift_1 = start_out + local_start_1 - start_1; + const std::size_t out_shift_2 = + start_out + end_1 - start_1 + local_start_2 - start_2; + + for (std::size_t i = 0; i < local_size_1; ++i) { + out_acc[out_shift_1 + i] = in_acc[local_start_1 + i]; + } + for (std::size_t i = 0; i < local_size_2; ++i) { + out_acc[out_shift_2 + i] = in_acc[local_start_2 + i]; + } + } + else if (comp(r_item_1, l_item_2)) { + const std::size_t out_shift_1 = + start_out + end_2 - start_2 + local_start_1 - start_1; + const std::size_t out_shift_2 = start_out + local_start_2 - start_2; + for (std::size_t i = 0; i < local_size_1; ++i) { + out_acc[out_shift_1 + i] = in_acc[local_start_1 + i]; + } + for (std::size_t i = 0; i < local_size_2; ++i) { + out_acc[out_shift_2 + i] = in_acc[local_start_2 + i]; + } + } + // Perform merging + else { + + // Process 1st sequence + if (local_start_1 < local_end_1) { + // Reduce the range for searching within the 2nd sequence and handle + // bound items find left border in 2nd sequence + const auto local_l_item_1 = in_acc[local_start_1]; + std::size_t l_search_bound_2 = + lower_bound_impl(in_acc, start_2, end_2, local_l_item_1, comp); + const std::size_t l_shift_1 = local_start_1 - start_1; + const std::size_t l_shift_2 = l_search_bound_2 - start_2; + + out_acc[start_out + l_shift_1 + l_shift_2] = local_l_item_1; + + std::size_t r_search_bound_2{}; + // find right border in 2nd sequence + if (local_size_1 > 1) { + const auto local_r_item_1 = in_acc[local_end_1 - 1]; + r_search_bound_2 = lower_bound_impl( + in_acc, l_search_bound_2, end_2, local_r_item_1, comp); + const auto r_shift_1 = local_end_1 - 1 - start_1; + const auto r_shift_2 = r_search_bound_2 - start_2; + + out_acc[start_out + r_shift_1 + r_shift_2] = local_r_item_1; + } + + // Handle intermediate items + if (r_search_bound_2 == l_search_bound_2) { + const std::size_t shift_2 = l_search_bound_2 - start_2; + for (std::size_t idx = local_start_1 + 1; idx < local_end_1 - 1; + ++idx) { + const auto intermediate_item_1 = in_acc[idx]; + const std::size_t shift_1 = idx - start_1; + out_acc[start_out + shift_1 + shift_2] = + intermediate_item_1; + } + } + else { + for (std::size_t idx = local_start_1 + 1; idx < local_end_1 - 1; + ++idx) { + const auto intermediate_item_1 = in_acc[idx]; + // we shouldn't seek in whole 2nd sequence. 
Just for the + // part where the 1st sequence should be + l_search_bound_2 = lower_bound_impl( + in_acc, l_search_bound_2, r_search_bound_2, + intermediate_item_1, comp); + const std::size_t shift_1 = idx - start_1; + const std::size_t shift_2 = l_search_bound_2 - start_2; + + out_acc[start_out + shift_1 + shift_2] = + intermediate_item_1; + } + } + } + // Process 2nd sequence + if (local_start_2 < local_end_2) { + // Reduce the range for searching within the 1st sequence and handle + // bound items find left border in 1st sequence + const auto local_l_item_2 = in_acc[local_start_2]; + std::size_t l_search_bound_1 = + upper_bound_impl(in_acc, start_1, end_1, local_l_item_2, comp); + const std::size_t l_shift_1 = l_search_bound_1 - start_1; + const std::size_t l_shift_2 = local_start_2 - start_2; + + out_acc[start_out + l_shift_1 + l_shift_2] = local_l_item_2; + + std::size_t r_search_bound_1{}; + // find right border in 1st sequence + if (local_size_2 > 1) { + const auto local_r_item_2 = in_acc[local_end_2 - 1]; + r_search_bound_1 = upper_bound_impl( + in_acc, l_search_bound_1, end_1, local_r_item_2, comp); + const std::size_t r_shift_1 = r_search_bound_1 - start_1; + const std::size_t r_shift_2 = local_end_2 - 1 - start_2; + + out_acc[start_out + r_shift_1 + r_shift_2] = local_r_item_2; + } + + // Handle intermediate items + if (l_search_bound_1 == r_search_bound_1) { + const std::size_t shift_1 = l_search_bound_1 - start_1; + for (auto idx = local_start_2 + 1; idx < local_end_2 - 1; ++idx) + { + const auto intermediate_item_2 = in_acc[idx]; + const std::size_t shift_2 = idx - start_2; + out_acc[start_out + shift_1 + shift_2] = + intermediate_item_2; + } + } + else { + for (auto idx = local_start_2 + 1; idx < local_end_2 - 1; ++idx) + { + const auto intermediate_item_2 = in_acc[idx]; + // we shouldn't seek in whole 1st sequence. 
Just for the + // part where the 2nd sequence should be + l_search_bound_1 = upper_bound_impl( + in_acc, l_search_bound_1, r_search_bound_1, + intermediate_item_2, comp); + const std::size_t shift_1 = l_search_bound_1 - start_1; + const std::size_t shift_2 = idx - start_2; + + out_acc[start_out + shift_1 + shift_2] = + intermediate_item_2; + } + } + } + } +} + +namespace +{ +template +void insertion_sort_impl(Iter first, + const size_t begin, + const size_t end, + Compare comp) +{ + for (size_t i = begin + 1; i < end; ++i) { + const auto val_i = first[i]; + size_t j = i - 1; + while ((j + 1 > begin) && (comp(val_i, first[j]))) { + first[j + 1] = first[j]; + --j; + } + if (j + 1 < i) { + first[j + 1] = val_i; + } + } +} + +template +void bubble_sort_impl(Iter first, + const size_t begin, + const size_t end, + Compare comp) +{ + if (begin < end) { + for (size_t i = begin; i < end; ++i) { + // Handle intermediate items + for (size_t idx = i + 1; idx < end; ++idx) { + if (comp(first[idx], first[i])) { + std::swap(first[i], first[idx]); + } + } + } + } +} + +template +void leaf_sort_impl(Iter first, + const size_t begin, + const size_t end, + Compare comp) +{ + return insertion_sort_impl(first, begin, end, comp); +} +} // namespace + +template struct GetValueType +{ + using value_type = typename std::iterator_traits::value_type; +}; + +template +struct GetValueType> +{ + using value_type = ElementType; +}; + +template +struct GetValueType< + sycl::accessor> +{ + using value_type = ElementType; +}; + +template +struct GetValueType> +{ + using value_type = ElementType; +}; + +template struct GetReadOnlyAccess +{ + Iter operator()(Iter it, sycl::handler &) + { + return it; + } +}; + +template +struct GetReadOnlyAccess> +{ + auto operator()(sycl::buffer buf, + sycl::handler &cgh) + { + sycl::accessor acc(buf, cgh, sycl::read_only); + return acc; + } +}; + +template struct GetWriteDiscardAccess +{ + Iter operator()(Iter it, sycl::handler &) + { + return it; + } +}; + +template +struct GetWriteDiscardAccess> +{ + auto operator()(sycl::buffer buf, + sycl::handler &cgh) + { + sycl::accessor acc(buf, cgh, sycl::write_only, sycl::no_init); + return acc; + } +}; + +template struct GetReadWriteAccess +{ + Iter operator()(Iter it, sycl::handler &) + { + return it; + } +}; + +template +struct GetReadWriteAccess> +{ + auto operator()(sycl::buffer buf, + sycl::handler &cgh) + { + sycl::accessor acc(buf, cgh, sycl::read_write); + return acc; + } +}; + +template +class sort_base_step_contig_krn; + +template +sycl::event +sort_base_step_contig_impl(sycl::queue &q, + const size_t iter_nelems, + const size_t sort_nelems, + const InpAcc input, + OutAcc output, + const Comp &comp, + const size_t conseq_nelems_sorted, + const std::vector &depends = {}) +{ + + using inpT = typename GetValueType::value_type; + using outT = typename GetValueType::value_type; + using KernelName = sort_base_step_contig_krn; + + const size_t n_segments = + quotient_ceil(sort_nelems, conseq_nelems_sorted); + + sycl::event base_sort = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + const sycl::range<1> gRange{iter_nelems * n_segments}; + + auto input_acc = GetReadOnlyAccess{}(input, cgh); + auto output_acc = GetWriteDiscardAccess{}(output, cgh); + + cgh.parallel_for(gRange, [=](sycl::id<1> id) { + const size_t iter_id = id[0] / n_segments; + const size_t segment_id = id[0] - iter_id * n_segments; + + const size_t iter_offset = iter_id * sort_nelems; + const size_t beg_id = + iter_offset + segment_id * conseq_nelems_sorted; + 
const size_t end_id = + iter_offset + + std::min((segment_id + 1) * conseq_nelems_sorted, + sort_nelems); + for (size_t i = beg_id; i < end_id; ++i) { + output_acc[i] = input_acc[i]; + } + + leaf_sort_impl(output_acc, beg_id, end_id, comp); + }); + }); + + return base_sort; +} + +template +class sort_over_work_group_contig_krn; + +template +sycl::event +sort_over_work_group_contig_impl(sycl::queue &q, + size_t iter_nelems, + size_t sort_nelems, + const InpAcc input, + OutAcc output, + const Comp &comp, + size_t &nelems_wg_sorts, + const std::vector &depends = {}) +{ + using inpT = typename GetValueType::value_type; + using T = typename GetValueType::value_type; + using KernelName = sort_over_work_group_contig_krn; + + const auto &kernel_id = sycl::get_kernel_id(); + + auto const &ctx = q.get_context(); + auto const &dev = q.get_device(); + auto kb = sycl::get_kernel_bundle( + ctx, {dev}, {kernel_id}); + + auto krn = kb.template get_kernel(kernel_id); + + const std::uint32_t max_sg_size = krn.template get_info< + sycl::info::kernel_device_specific::max_sub_group_size>(dev); + const std::uint64_t device_local_memory_size = + dev.get_info(); + + // leave 512 bytes of local memory for RT + const std::uint64_t safety_margin = 512; + + const std::uint64_t nelems_per_slm = + (device_local_memory_size - safety_margin) / (2 * sizeof(T)); + + constexpr std::uint32_t sub_groups_per_work_group = 4; + const std::uint32_t elems_per_wi = dev.has(sycl::aspect::cpu) ? 8 : 2; + + const size_t lws = sub_groups_per_work_group * max_sg_size; + + nelems_wg_sorts = elems_per_wi * lws; + + if (nelems_wg_sorts > nelems_per_slm) { + nelems_wg_sorts = (q.get_device().has(sycl::aspect::cpu) ? 16 : 4); + + return sort_base_step_contig_impl( + q, iter_nelems, sort_nelems, input, output, comp, nelems_wg_sorts, + depends); + } + + // This assumption permits doing away with using a loop + assert(nelems_wg_sorts % lws == 0); + + const size_t n_segments = + quotient_ceil(sort_nelems, nelems_wg_sorts); + + sycl::event base_sort_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + cgh.use_kernel_bundle(kb); + + sycl::range<1> global_range{iter_nelems * n_segments * lws}; + sycl::range<1> local_range{lws}; + + sycl::range<1> slm_range{nelems_wg_sorts}; + sycl::local_accessor work_space(slm_range, cgh); + sycl::local_accessor scratch_space(slm_range, cgh); + + auto input_acc = GetReadOnlyAccess{}(input, cgh); + auto output_acc = GetWriteDiscardAccess{}(output, cgh); + + sycl::nd_range<1> ndRange(global_range, local_range); + + cgh.parallel_for(ndRange, [=](sycl::nd_item<1> it) { + const size_t group_id = it.get_group_linear_id(); + const size_t iter_id = group_id / n_segments; + const size_t segment_id = group_id - iter_id * n_segments; + const size_t lid = it.get_local_linear_id(); + + const size_t segment_start_idx = segment_id * nelems_wg_sorts; + const size_t segment_end_idx = std::min( + segment_start_idx + nelems_wg_sorts, sort_nelems); + const size_t wg_chunk_size = segment_end_idx - segment_start_idx; + + // load input into SLM + for (size_t array_id = segment_start_idx + lid; + array_id < segment_end_idx; array_id += lws) + { + T v = (array_id < sort_nelems) + ? 
input_acc[iter_id * sort_nelems + array_id] + : T{}; + work_space[array_id - segment_start_idx] = v; + } + sycl::group_barrier(it.get_group()); + + const size_t chunk = quotient_ceil(nelems_wg_sorts, lws); + + const size_t chunk_start_idx = lid * chunk; + const size_t chunk_end_idx = + sycl::min(chunk_start_idx + chunk, wg_chunk_size); + + leaf_sort_impl(work_space, chunk_start_idx, chunk_end_idx, comp); + + sycl::group_barrier(it.get_group()); + + bool data_in_temp = false; + size_t sorted_size = 1; + while (true) { + const size_t nelems_sorted_so_far = sorted_size * chunk; + if (nelems_sorted_so_far < wg_chunk_size) { + const size_t q = (lid / sorted_size); + const size_t start_1 = + sycl::min(2 * nelems_sorted_so_far * q, wg_chunk_size); + const size_t end_1 = sycl::min( + start_1 + nelems_sorted_so_far, wg_chunk_size); + const size_t end_2 = + sycl::min(end_1 + nelems_sorted_so_far, wg_chunk_size); + const size_t offset = chunk * (lid - q * sorted_size); + + if (data_in_temp) { + merge_impl(offset, scratch_space, work_space, start_1, + end_1, end_2, start_1, comp, chunk); + } + else { + merge_impl(offset, work_space, scratch_space, start_1, + end_1, end_2, start_1, comp, chunk); + } + sycl::group_barrier(it.get_group()); + + data_in_temp = !data_in_temp; + sorted_size *= 2; + } + else + break; + } + + const auto &out_src = (data_in_temp) ? scratch_space : work_space; + for (size_t array_id = segment_start_idx + lid; + array_id < segment_end_idx; array_id += lws) + { + if (array_id < sort_nelems) { + output_acc[iter_id * sort_nelems + array_id] = + out_src[array_id - segment_start_idx]; + } + } + }); + }); + + return base_sort_ev; +} + +class vacuous_krn; + +inline sycl::event tie_events(sycl::queue &q, + const std::vector depends) +{ + if (depends.empty()) + return sycl::event(); + if (depends.size() == 1) + return depends[0]; + + sycl::event e = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + using KernelName = vacuous_krn; + cgh.single_task([]() {}); + }); + + return e; +} + +template class merge_adjacent_blocks_to_temp_krn; + +template class merge_adjacent_blocks_from_temp_krn; + +template +sycl::event +merge_sorted_block_contig_impl(sycl::queue &q, + size_t iter_nelems, + size_t sort_nelems, + Acc output, + const Comp comp, + size_t sorted_block_size, + const std::vector &depends = {}) +{ + + if (sorted_block_size >= sort_nelems) + return tie_events(q, depends); + + // experimentally determined value + // size of segments worked upon by each work-item during merging + const sycl::device &dev = q.get_device(); + const size_t segment_size = (dev.has(sycl::aspect::cpu)) ? 32 : 4; + + const size_t chunk_size = + (sorted_block_size < segment_size) ? 
sorted_block_size : segment_size; + + assert(sorted_block_size % chunk_size == 0); + + using T = typename GetValueType::value_type; + + sycl::buffer temp_buf(sycl::range<1>{iter_nelems * sort_nelems}); + // T *allocated_mem = sycl::malloc_device(iter_nelems * sort_nelems, q); + + bool needs_copy = true; + bool used_depends = false; + + sycl::event dep_ev; + size_t chunks_merged = sorted_block_size / chunk_size; + + assert(!(chunks_merged & (chunks_merged - 1))); + + using ToTempKernelName = class merge_adjacent_blocks_to_temp_krn; + using FromTempKernelName = + class merge_adjacent_blocks_from_temp_krn; + + while (chunks_merged * chunk_size < sort_nelems) { + sycl::event local_dep = dep_ev; + + sycl::event merge_ev = q.submit([&](sycl::handler &cgh) { + if (used_depends) { + cgh.depends_on(local_dep); + } + else { + cgh.depends_on(depends); + used_depends = true; + } + + const size_t n_chunks = + quotient_ceil(sort_nelems, chunk_size); + + if (needs_copy) { + sycl::accessor temp_acc{temp_buf, cgh, sycl::write_only, + sycl::no_init}; + auto output_acc = GetReadOnlyAccess{}(output, cgh); + cgh.parallel_for( + {iter_nelems * n_chunks}, [=](sycl::id<1> wid) { + auto flat_idx = wid[0]; + auto iter_idx = flat_idx / n_chunks; + auto idx = flat_idx - n_chunks * iter_idx; + + const std::size_t idx_mult = + (idx / chunks_merged) * chunks_merged; + const std::size_t idx_rem = (idx - idx_mult); + const std::size_t start_1 = + sycl::min(2 * idx_mult * chunk_size, sort_nelems); + const std::size_t end_1 = sycl::min( + start_1 + chunks_merged * chunk_size, sort_nelems); + const std::size_t end_2 = sycl::min( + end_1 + chunks_merged * chunk_size, sort_nelems); + const std::size_t offset = chunk_size * idx_rem; + + const std::size_t iter_offset = iter_idx * sort_nelems; + + merge_impl(offset, output_acc, temp_acc, + iter_offset + start_1, iter_offset + end_1, + iter_offset + end_2, iter_offset + start_1, + comp, chunk_size); + }); + } + else { + sycl::accessor temp_acc{temp_buf, cgh, sycl::read_only}; + auto output_acc = GetWriteDiscardAccess{}(output, cgh); + cgh.parallel_for( + {iter_nelems * n_chunks}, [=](sycl::id<1> wid) { + auto flat_idx = wid[0]; + auto iter_idx = flat_idx / n_chunks; + auto idx = flat_idx - n_chunks * iter_idx; + + const std::size_t idx_mult = + (idx / chunks_merged) * chunks_merged; + const std::size_t idx_rem = (idx - idx_mult); + const std::size_t start_1 = + sycl::min(2 * idx_mult * chunk_size, sort_nelems); + const std::size_t end_1 = sycl::min( + start_1 + chunks_merged * chunk_size, sort_nelems); + const std::size_t end_2 = sycl::min( + end_1 + chunks_merged * chunk_size, sort_nelems); + const std::size_t offset = chunk_size * idx_rem; + + const std::size_t iter_offset = iter_idx * sort_nelems; + + merge_impl(offset, temp_acc, output_acc, + iter_offset + start_1, iter_offset + end_1, + iter_offset + end_2, iter_offset + start_1, + comp, chunk_size); + }); + } + }); + + chunks_merged *= 2; + dep_ev = merge_ev; + + if (chunks_merged * chunk_size < sort_nelems) { + needs_copy = !needs_copy; + } + } + + if (needs_copy) { + sycl::event copy_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(dep_ev); + + sycl::accessor temp_acc{temp_buf, cgh, sycl::read_only}; + auto output_acc = GetWriteDiscardAccess{}(output, cgh); + + cgh.copy(temp_acc, output_acc); + }); + dep_ev = copy_ev; + } + + return dep_ev; +} + +} // end of namespace sort_detail + +typedef sycl::event (*sort_contig_fn_ptr_t)(sycl::queue &, + size_t, + size_t, + const char *, + char *, + py::ssize_t, + py::ssize_t, 
+ py::ssize_t, + py::ssize_t, + const std::vector &); + +template > +sycl::event stable_sort_axis1_contig_impl( + sycl::queue &exec_q, + size_t iter_nelems, // number of sub-arrays to sort (num. of rows in a + // matrix when sorting over rows) + size_t sort_nelems, // size of each array to sort (length of rows, i.e. + // number of columns) + const char *arg_cp, + char *res_cp, + py::ssize_t iter_arg_offset, + py::ssize_t iter_res_offset, + py::ssize_t sort_arg_offset, + py::ssize_t sort_res_offset, + const std::vector &depends) +{ + const argTy *arg_tp = reinterpret_cast(arg_cp) + + iter_arg_offset + sort_arg_offset; + argTy *res_tp = + reinterpret_cast(res_cp) + iter_res_offset + sort_res_offset; + + auto comp = Comp{}; + + constexpr size_t sequential_sorting_threshold = 64; + + if (sort_nelems < sequential_sorting_threshold) { + // equal work-item sorts entire row + sycl::event sequential_sorting_ev = + sort_detail::sort_base_step_contig_impl( + exec_q, iter_nelems, sort_nelems, arg_tp, res_tp, comp, + sort_nelems, depends); + + return sequential_sorting_ev; + } + else { + size_t sorted_block_size{}; + + // Sort segments of the array + sycl::event base_sort_ev = + sort_detail::sort_over_work_group_contig_impl( + exec_q, iter_nelems, sort_nelems, arg_tp, res_tp, comp, + sorted_block_size, // modified in place with size of sorted + // block size + depends); + + // Merge segments in parallel until all elements are sorted + sycl::event merges_ev = + sort_detail::merge_sorted_block_contig_impl( + exec_q, iter_nelems, sort_nelems, res_tp, comp, + sorted_block_size, {base_sort_ev}); + + return merges_ev; + } +} + +template +class populate_indexed_data_krn; + +template class index_write_out_krn; + +template struct TupleComp +{ + bool operator()(const pairT &p1, const pairT &p2) const + { + const ValueComp value_comp{}; + return value_comp(std::get<0>(p1), std::get<0>(p2)); + } +}; + +template > +sycl::event stable_argsort_axis1_contig_impl( + sycl::queue &exec_q, + size_t iter_nelems, // number of sub-arrays to sort (num. of rows in a + // matrix when sorting over rows) + size_t sort_nelems, // size of each array to sort (length of rows, i.e. + // number of columns) + const char *arg_cp, + char *res_cp, + py::ssize_t iter_arg_offset, + py::ssize_t iter_res_offset, + py::ssize_t sort_arg_offset, + py::ssize_t sort_res_offset, + const std::vector &depends) +{ + const argTy *arg_tp = reinterpret_cast(arg_cp) + + iter_arg_offset + sort_arg_offset; + IndexTy *res_tp = + reinterpret_cast(res_cp) + iter_res_offset + sort_res_offset; + + using ValueIndexT = std::pair; + const TupleComp tuple_comp{}; + + static constexpr size_t determine_automatically = 0; + size_t sorted_block_size = + (sort_nelems >= 512) ? 
512 : determine_automatically; + + sycl::buffer indexed_data( + sycl::range<1>(iter_nelems * sort_nelems)); + sycl::buffer temp_buf( + sycl::range<1>(iter_nelems * sort_nelems)); + + sycl::event populate_indexed_data_ev = + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + sycl::accessor acc(indexed_data, cgh, sycl::write_only, + sycl::no_init); + + auto const &range = indexed_data.get_range(); + + using KernelName = + populate_indexed_data_krn; + + cgh.parallel_for(range, [=](sycl::id<1> id) { + size_t i = id[0]; + size_t sort_id = i % sort_nelems; + acc[i] = + std::make_pair(arg_tp[i], static_cast(sort_id)); + }); + }); + + // Sort segments of the array + sycl::event base_sort_ev = sort_detail::sort_over_work_group_contig_impl( + exec_q, iter_nelems, sort_nelems, indexed_data, temp_buf, tuple_comp, + sorted_block_size, // modified in place with size of sorted block size + {populate_indexed_data_ev}); + + // Merge segments in parallel until all elements are sorted + sycl::event merges_ev = sort_detail::merge_sorted_block_contig_impl( + exec_q, iter_nelems, sort_nelems, temp_buf, tuple_comp, + sorted_block_size, {base_sort_ev}); + + sycl::event write_out_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(merges_ev); + + auto temp_acc = + sort_detail::GetReadOnlyAccess{}(temp_buf, cgh); + + using KernelName = index_write_out_krn; + + cgh.parallel_for(temp_buf.get_range(), [=](sycl::id<1> id) { + res_tp[id] = std::get<1>(temp_acc[id]); + }); + }); + + return write_out_ev; +} + +} // end of namespace kernels +} // end of namespace tensor +} // end of namespace dpctl diff --git a/dpctl/tensor/libtensor/source/accumulators.cpp b/dpctl/tensor/libtensor/source/accumulators.cpp index 0a2ce69f69..094aa4c284 100644 --- a/dpctl/tensor/libtensor/source/accumulators.cpp +++ b/dpctl/tensor/libtensor/source/accumulators.cpp @@ -150,12 +150,20 @@ size_t py_mask_positions(const dpctl::tensor::usm_ndarray &mask, const bool use_i32 = (cumsum_typeid == int32_typeid); + std::vector host_task_events; + if (mask.is_c_contiguous()) { auto fn = (use_i32) ? mask_positions_contig_i32_dispatch_vector[mask_typeid] : mask_positions_contig_i64_dispatch_vector[mask_typeid]; - return fn(exec_q, mask_size, mask_data, cumsum_data, depends); + size_t total_set = fn(exec_q, mask_size, mask_data, cumsum_data, + host_task_events, depends); + { + py::gil_scoped_release release; + sycl::event::wait(host_task_events); + } + return total_set; } const py::ssize_t *shape = mask.get_shape_raw(); @@ -175,7 +183,6 @@ size_t py_mask_positions(const dpctl::tensor::usm_ndarray &mask, auto strided_fn = (use_i32) ? 
mask_positions_strided_i32_dispatch_vector[mask_typeid] : mask_positions_strided_i64_dispatch_vector[mask_typeid]; - std::vector host_task_events; using dpctl::tensor::offset_utils::device_allocate_and_pack; const auto &ptr_size_event_tuple = device_allocate_and_pack( @@ -189,7 +196,10 @@ size_t py_mask_positions(const dpctl::tensor::usm_ndarray &mask, if (2 * static_cast(nd) != std::get<1>(ptr_size_event_tuple)) { copy_shape_ev.wait(); - sycl::event::wait(host_task_events); + { + py::gil_scoped_release release; + sycl::event::wait(host_task_events); + } sycl::free(shape_strides, exec_q); throw std::runtime_error("Unexpected error"); } @@ -200,10 +210,14 @@ size_t py_mask_positions(const dpctl::tensor::usm_ndarray &mask, dependent_events.insert(dependent_events.end(), depends.begin(), depends.end()); - size_t total_set = strided_fn(exec_q, mask_size, mask_data, nd, - shape_strides, cumsum_data, dependent_events); + size_t total_set = + strided_fn(exec_q, mask_size, mask_data, nd, shape_strides, cumsum_data, + host_task_events, dependent_events); - sycl::event::wait(host_task_events); + { + py::gil_scoped_release release; + sycl::event::wait(host_task_events); + } sycl::free(shape_strides, exec_q); return total_set; @@ -283,6 +297,8 @@ size_t py_cumsum_1d(const dpctl::tensor::usm_ndarray &src, "Cumulative sum array must have int64 data-type."); } + std::vector host_task_events; + if (src.is_c_contiguous()) { auto fn = cumsum_1d_contig_dispatch_vector[src_typeid]; if (fn == nullptr) { @@ -290,7 +306,13 @@ size_t py_cumsum_1d(const dpctl::tensor::usm_ndarray &src, "this cumsum requires integer type, got src_typeid=" + std::to_string(src_typeid)); } - return fn(exec_q, src_size, src_data, cumsum_data, depends); + size_t total = fn(exec_q, src_size, src_data, cumsum_data, + host_task_events, depends); + { + py::gil_scoped_release release; + sycl::event::wait(host_task_events); + } + return total; } const py::ssize_t *shape = src.get_shape_raw(); @@ -313,7 +335,6 @@ size_t py_cumsum_1d(const dpctl::tensor::usm_ndarray &src, "this cumsum requires integer type, got src_typeid=" + std::to_string(src_typeid)); } - std::vector host_task_events; using dpctl::tensor::offset_utils::device_allocate_and_pack; const auto &ptr_size_event_tuple = device_allocate_and_pack( @@ -339,9 +360,12 @@ size_t py_cumsum_1d(const dpctl::tensor::usm_ndarray &src, depends.end()); size_t total = strided_fn(exec_q, src_size, src_data, nd, shape_strides, - cumsum_data, dependent_events); + cumsum_data, host_task_events, dependent_events); - sycl::event::wait(host_task_events); + { + py::gil_scoped_release release; + sycl::event::wait(host_task_events); + } sycl::free(shape_strides, exec_q); return total; diff --git a/dpctl/tensor/libtensor/source/sorting/argsort.cpp b/dpctl/tensor/libtensor/source/sorting/argsort.cpp new file mode 100644 index 0000000000..f23084fe21 --- /dev/null +++ b/dpctl/tensor/libtensor/source/sorting/argsort.cpp @@ -0,0 +1,261 @@ +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. +//===--------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include + +#include "utils/math_utils.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/type_dispatch.hpp" + +#include "argsort.hpp" +#include "kernels/sorting.hpp" +#include "sorting_common.hpp" + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +template +std::pair +py_argsort(const dpctl::tensor::usm_ndarray &src, + const int trailing_dims_to_sort, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends, + const sorting_contig_impl_fnT &stable_sort_contig_fns) +{ + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + if (src_nd != dst_nd) { + throw py::value_error("The input and output arrays must have " + "the same array ranks"); + } + int iteration_nd = src_nd - trailing_dims_to_sort; + if (trailing_dims_to_sort <= 0 || iteration_nd < 0) { + throw py::value_error("Trailing_dim_to_sort must be positive, but no " + "greater than rank of the array being sorted"); + } + + const py::ssize_t *src_shape_ptr = src.get_shape_raw(); + const py::ssize_t *dst_shape_ptr = dst.get_shape_raw(); + + bool same_shapes = true; + size_t iter_nelems(1); + + for (int i = 0; same_shapes && (i < iteration_nd); ++i) { + auto src_shape_i = src_shape_ptr[i]; + same_shapes = same_shapes && (src_shape_i == dst_shape_ptr[i]); + iter_nelems *= static_cast(src_shape_i); + } + + size_t sort_nelems(1); + for (int i = iteration_nd; same_shapes && (i < src_nd); ++i) { + auto src_shape_i = src_shape_ptr[i]; + same_shapes = same_shapes && (src_shape_i == dst_shape_ptr[i]); + sort_nelems *= static_cast(src_shape_i); + } + + if (!same_shapes) { + throw py::value_error( + "Destination shape does not match the input shape"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + if ((iter_nelems == 0) || (sort_nelems == 0)) { + // Nothing to do + return std::make_pair(sycl::event(), sycl::event()); + } + + // check that dst and src do not overlap + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + throw py::value_error("Arrays index overlapping segments of memory"); + } + + // destination must be ample enough to accommodate all elements + { + auto dst_offsets = dst.get_minmax_offsets(); + size_t range = + static_cast(dst_offsets.second - dst_offsets.first); + if (range + 1 < sort_nelems * iter_nelems) { + throw py::value_error( + "Destination array can not accommodate all the " + "elements of source array."); + } + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + const auto &array_types = td_ns::usm_ndarray_types(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + if ((dst_typeid != static_cast(td_ns::typenum_t::INT64)) && + (dst_typeid != static_cast(td_ns::typenum_t::INT32))) + { + throw py::value_error( + "Output index array must have data type int32 or int64"); + } + + // handle special case when both 
reduction and iteration are 1D contiguous + bool is_src_c_contig = src.is_c_contiguous(); + bool is_dst_c_contig = dst.is_c_contiguous(); + + if (is_src_c_contig && is_dst_c_contig) { + using dpctl::tensor::kernels::stable_argsort_axis1_contig_impl; + + static constexpr py::ssize_t zero_offset = py::ssize_t(0); + + auto fn = stable_sort_contig_fns[src_typeid][dst_typeid]; + + if (fn == nullptr) { + throw py::value_error("Not implemented for given index type"); + } + + sycl::event comp_ev = + fn(exec_q, iter_nelems, sort_nelems, src.get_data(), dst.get_data(), + zero_offset, zero_offset, zero_offset, zero_offset, depends); + + sycl::event keep_args_alive_ev = + dpctl::utils::keep_args_alive(exec_q, {src, dst}, {comp_ev}); + + return std::make_pair(keep_args_alive_ev, comp_ev); + } + + return std::make_pair(sycl::event(), sycl::event()); +} + +using dpctl::tensor::kernels::sort_contig_fn_ptr_t; +static sort_contig_fn_ptr_t + ascending_argsort_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; +static sort_contig_fn_ptr_t + descending_argsort_contig_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +template +struct AscendingArgSortContigFactory +{ + fnT get() + { + if constexpr (std::is_same_v || + std::is_same_v) + { + using Comp = typename AscendingSorter::type; + + using dpctl::tensor::kernels::stable_argsort_axis1_contig_impl; + return stable_argsort_axis1_contig_impl; + } + else { + return nullptr; + } + } +}; + +template +struct DescendingArgSortContigFactory +{ + fnT get() + { + if constexpr (std::is_same_v || + std::is_same_v) + { + using Comp = typename DescendingSorter::type; + + using dpctl::tensor::kernels::stable_argsort_axis1_contig_impl; + return stable_argsort_axis1_contig_impl; + } + else { + return nullptr; + } + } +}; + +void init_argsort_dispatch_tables(void) +{ + using dpctl::tensor::kernels::sort_contig_fn_ptr_t; + + td_ns::DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(ascending_argsort_contig_dispatch_table); + + td_ns::DispatchTableBuilder< + sort_contig_fn_ptr_t, DescendingArgSortContigFactory, td_ns::num_types> + dtb2; + dtb2.populate_dispatch_table(descending_argsort_contig_dispatch_table); +} + +void init_argsort_functions(py::module_ m) +{ + dpctl::tensor::py_internal::init_argsort_dispatch_tables(); + + auto py_argsort_ascending = [](const dpctl::tensor::usm_ndarray &src, + const int trailing_dims_to_sort, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) + -> std::pair { + return dpctl::tensor::py_internal::py_argsort( + src, trailing_dims_to_sort, dst, exec_q, depends, + dpctl::tensor::py_internal:: + ascending_argsort_contig_dispatch_table); + }; + m.def("_argsort_ascending", py_argsort_ascending, py::arg("src"), + py::arg("trailing_dims_to_sort"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto py_argsort_descending = [](const dpctl::tensor::usm_ndarray &src, + const int trailing_dims_to_sort, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) + -> std::pair { + return dpctl::tensor::py_internal::py_argsort( + src, trailing_dims_to_sort, dst, exec_q, depends, + dpctl::tensor::py_internal:: + descending_argsort_contig_dispatch_table); + }; + m.def("_argsort_descending", py_argsort_descending, py::arg("src"), + py::arg("trailing_dims_to_sort"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + return; +} + +} // end of namespace py_internal +} // end of namespace tensor +} // end of 
namespace dpctl diff --git a/dpctl/tensor/libtensor/source/sorting/argsort.hpp b/dpctl/tensor/libtensor/source/sorting/argsort.hpp new file mode 100644 index 0000000000..7637f2df4c --- /dev/null +++ b/dpctl/tensor/libtensor/source/sorting/argsort.hpp @@ -0,0 +1,42 @@ +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. +//===--------------------------------------------------------------------===// + +#pragma once + +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_argsort_functions(py::module_); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/sorting/sort.cpp b/dpctl/tensor/libtensor/source/sorting/sort.cpp new file mode 100644 index 0000000000..8a4a710354 --- /dev/null +++ b/dpctl/tensor/libtensor/source/sorting/sort.cpp @@ -0,0 +1,235 @@ +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. 
+//===--------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include + +#include "utils/math_utils.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/sorting.hpp" +#include "sort.hpp" +#include "sorting_common.hpp" + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +template +std::pair +py_sort(const dpctl::tensor::usm_ndarray &src, + const int trailing_dims_to_sort, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends, + const sorting_contig_impl_fnT &stable_sort_contig_fns) +{ + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + if (src_nd != dst_nd) { + throw py::value_error("The input and output arrays must have " + "the same array ranks"); + } + int iteration_nd = src_nd - trailing_dims_to_sort; + if (trailing_dims_to_sort <= 0 || iteration_nd < 0) { + throw py::value_error("Trailing_dim_to_sort must be positive, but no " + "greater than rank of the array being sorted"); + } + + const py::ssize_t *src_shape_ptr = src.get_shape_raw(); + const py::ssize_t *dst_shape_ptr = dst.get_shape_raw(); + + bool same_shapes = true; + size_t iter_nelems(1); + + for (int i = 0; same_shapes && (i < iteration_nd); ++i) { + auto src_shape_i = src_shape_ptr[i]; + same_shapes = same_shapes && (src_shape_i == dst_shape_ptr[i]); + iter_nelems *= static_cast(src_shape_i); + } + + size_t sort_nelems(1); + for (int i = iteration_nd; same_shapes && (i < src_nd); ++i) { + auto src_shape_i = src_shape_ptr[i]; + same_shapes = same_shapes && (src_shape_i == dst_shape_ptr[i]); + sort_nelems *= static_cast(src_shape_i); + } + + if (!same_shapes) { + throw py::value_error( + "Destination shape does not match the input shape"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + if ((iter_nelems == 0) || (sort_nelems == 0)) { + // Nothing to do + return std::make_pair(sycl::event(), sycl::event()); + } + + // check that dst and src do not overlap + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + throw py::value_error("Arrays index overlapping segments of memory"); + } + + // destination must be ample enough to accommodate all elements + { + auto dst_offsets = dst.get_minmax_offsets(); + size_t range = + static_cast(dst_offsets.second - dst_offsets.first); + if (range + 1 < sort_nelems * iter_nelems) { + throw py::value_error( + "Destination array can not accommodate all the " + "elements of source array."); + } + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + const auto &array_types = td_ns::usm_ndarray_types(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + if (src_typeid != dst_typeid) { + throw py::value_error("Both input arrays must have " + "the same value data type"); + } + + // handle special case when both reduction and iteration are 1D contiguous + bool is_src_c_contig = src.is_c_contiguous(); + bool is_dst_c_contig = dst.is_c_contiguous(); + + if (is_src_c_contig && is_dst_c_contig) { + using dpctl::tensor::kernels::stable_sort_axis1_contig_impl; + + static constexpr py::ssize_t zero_offset = py::ssize_t(0); + + auto fn = stable_sort_contig_fns[src_typeid]; + + sycl::event comp_ev = + 
fn(exec_q, iter_nelems, sort_nelems, src.get_data(), dst.get_data(), + zero_offset, zero_offset, zero_offset, zero_offset, depends); + + sycl::event keep_args_alive_ev = + dpctl::utils::keep_args_alive(exec_q, {src, dst}, {comp_ev}); + + return std::make_pair(keep_args_alive_ev, comp_ev); + } + + return std::make_pair(sycl::event(), sycl::event()); +} + +using dpctl::tensor::kernels::sort_contig_fn_ptr_t; +static sort_contig_fn_ptr_t + ascending_sort_contig_dispatch_vector[td_ns::num_types]; +static sort_contig_fn_ptr_t + descending_sort_contig_dispatch_vector[td_ns::num_types]; + +template struct AscendingSortContigFactory +{ + fnT get() + { + using Comp = typename AscendingSorter::type; + + using dpctl::tensor::kernels::stable_sort_axis1_contig_impl; + return stable_sort_axis1_contig_impl; + } +}; + +template struct DescendingSortContigFactory +{ + fnT get() + { + using Comp = typename DescendingSorter::type; + using dpctl::tensor::kernels::stable_sort_axis1_contig_impl; + return stable_sort_axis1_contig_impl; + } +}; + +void init_sort_dispatch_vectors(void) +{ + using dpctl::tensor::kernels::sort_contig_fn_ptr_t; + + td_ns::DispatchVectorBuilder + dtv1; + dtv1.populate_dispatch_vector(ascending_sort_contig_dispatch_vector); + + td_ns::DispatchVectorBuilder + dtv2; + dtv2.populate_dispatch_vector(descending_sort_contig_dispatch_vector); +} + +void init_sort_functions(py::module_ m) +{ + dpctl::tensor::py_internal::init_sort_dispatch_vectors(); + + auto py_sort_ascending = [](const dpctl::tensor::usm_ndarray &src, + const int trailing_dims_to_sort, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) + -> std::pair { + return dpctl::tensor::py_internal::py_sort( + src, trailing_dims_to_sort, dst, exec_q, depends, + dpctl::tensor::py_internal::ascending_sort_contig_dispatch_vector); + }; + m.def("_sort_ascending", py_sort_ascending, py::arg("src"), + py::arg("trailing_dims_to_sort"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto py_sort_descending = [](const dpctl::tensor::usm_ndarray &src, + const int trailing_dims_to_sort, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) + -> std::pair { + return dpctl::tensor::py_internal::py_sort( + src, trailing_dims_to_sort, dst, exec_q, depends, + dpctl::tensor::py_internal::descending_sort_contig_dispatch_vector); + }; + m.def("_sort_descending", py_sort_descending, py::arg("src"), + py::arg("trailing_dims_to_sort"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + return; +} + +} // end of namespace py_internal +} // end of namespace tensor +} // end of namespace dpctl diff --git a/dpctl/tensor/libtensor/source/sorting/sort.hpp b/dpctl/tensor/libtensor/source/sorting/sort.hpp new file mode 100644 index 0000000000..2ad5bdb623 --- /dev/null +++ b/dpctl/tensor/libtensor/source/sorting/sort.hpp @@ -0,0 +1,42 @@ +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. +//===--------------------------------------------------------------------===// + +#pragma once + +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_sort_functions(py::module_); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/sorting/sorting_common.hpp b/dpctl/tensor/libtensor/source/sorting/sorting_common.hpp new file mode 100644 index 0000000000..62b30ccb8f --- /dev/null +++ b/dpctl/tensor/libtensor/source/sorting/sorting_common.hpp @@ -0,0 +1,134 @@ +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. +//===--------------------------------------------------------------------===// + +#pragma once + +#include "sycl/sycl.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace +{ +template struct ExtendedRealFPLess +{ + /* [R, nan] */ + bool operator()(const fpT v1, const fpT v2) const + { + return (!sycl::isnan(v1) && (sycl::isnan(v2) || (v1 < v2))); + } +}; + +template struct ExtendedRealFPGreater +{ + bool operator()(const fpT v1, const fpT v2) const + { + return (!sycl::isnan(v2) && (sycl::isnan(v1) || (v2 < v1))); + } +}; + +template struct ExtendedComplexFPLess +{ + /* [(R, R), (R, nan), (nan, R), (nan, nan)] */ + + bool operator()(const cT &v1, const cT &v2) const + { + using realT = typename cT::value_type; + + const realT real1 = std::real(v1); + const realT real2 = std::real(v2); + + const bool r1_nan = sycl::isnan(real1); + const bool r2_nan = sycl::isnan(real2); + + const realT imag1 = std::imag(v1); + const realT imag2 = std::imag(v2); + + const bool i1_nan = sycl::isnan(imag1); + const bool i2_nan = sycl::isnan(imag2); + + const int idx1 = ((r1_nan) ? 2 : 0) + ((i1_nan) ? 1 : 0); + const int idx2 = ((r2_nan) ? 2 : 0) + ((i2_nan) ? 
1 : 0); + + const bool res = + !(r1_nan && i1_nan) && + ((idx1 < idx2) || + ((idx1 == idx2) && + ((r1_nan && !i1_nan && (imag1 < imag2)) || + (!r1_nan && i1_nan && (real1 < real2)) || + (!r1_nan && !i1_nan && + ((real1 < real2) || (!(real2 < real1) && (imag1 < imag2))))))); + + return res; + } +}; + +template struct ExtendedComplexFPGreater +{ + bool operator()(const cT &v1, const cT &v2) const + { + auto less_ = ExtendedComplexFPLess{}; + return less_(v2, v1); + } +}; + +template +inline constexpr bool is_fp_v = (std::is_same_v || + std::is_same_v || + std::is_same_v); + +} // end of anonymous namespace + +template struct AscendingSorter +{ + using type = std::conditional_t, + ExtendedRealFPLess, + std::less>; +}; + +template struct AscendingSorter> +{ + using type = ExtendedComplexFPLess>; +}; + +template struct DescendingSorter +{ + using type = std::conditional_t, + ExtendedRealFPGreater, + std::greater>; +}; + +template struct DescendingSorter> +{ + using type = ExtendedComplexFPGreater>; +}; + +} // end of namespace py_internal +} // end of namespace tensor +} // end of namespace dpctl diff --git a/dpctl/tensor/libtensor/source/tensor_sorting.cpp b/dpctl/tensor/libtensor/source/tensor_sorting.cpp new file mode 100644 index 0000000000..5d534d85cd --- /dev/null +++ b/dpctl/tensor/libtensor/source/tensor_sorting.cpp @@ -0,0 +1,37 @@ +//===-- tensor_sorting.cpp - -----*-C++-*-/===// +// Implementation of _tensor_sorting_impl module +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl extension +//===----------------------------------------------------------------------===// + +#include + +#include "sorting/argsort.hpp" +#include "sorting/sort.hpp" + +namespace py = pybind11; + +PYBIND11_MODULE(_tensor_sorting_impl, m) +{ + dpctl::tensor::py_internal::init_sort_functions(m); + dpctl::tensor::py_internal::init_argsort_functions(m); +} diff --git a/dpctl/tests/test_usm_ndarray_sorting.py b/dpctl/tests/test_usm_ndarray_sorting.py new file mode 100644 index 0000000000..e76ac667e1 --- /dev/null +++ b/dpctl/tests/test_usm_ndarray_sorting.py @@ -0,0 +1,282 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
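The complex-valued cases in the sorting tests below rely on the NaN-aware ordering defined by ExtendedRealFPLess and ExtendedComplexFPLess in sorting_common.hpp above: NaNs sort after all finite values, and complex values are keyed first by which of their components are NaN. A rough Python transcription of the complex comparator, for reference only (the function name complex_less and the use of the math module are illustrative, not part of the extension):

import math

def complex_less(v1: complex, v2: complex) -> bool:
    # NaN pattern: 0 = (R, R), 1 = (R, nan), 2 = (nan, R), 3 = (nan, nan);
    # a value with "more NaN" never compares less than one with "less NaN".
    r1_nan, i1_nan = math.isnan(v1.real), math.isnan(v1.imag)
    r2_nan, i2_nan = math.isnan(v2.real), math.isnan(v2.imag)
    if r1_nan and i1_nan:
        return False
    idx1 = 2 * r1_nan + i1_nan
    idx2 = 2 * r2_nan + i2_nan
    if idx1 != idx2:
        return idx1 < idx2
    if r1_nan:                      # both real parts are NaN: compare imaginary parts
        return v1.imag < v2.imag
    if i1_nan:                      # both imaginary parts are NaN: compare real parts
        return v1.real < v2.real
    return (v1.real, v1.imag) < (v2.real, v2.imag)  # lexicographic otherwise

This matches the [(R, R), (R, nan), (nan, R), (nan, nan)] ordering noted in the header comment and the NumPy behaviour that test_sort_complex_fp_nan compares against.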
+ +import itertools + +import numpy as np +import pytest + +import dpctl.tensor as dpt +from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported + + +@pytest.mark.parametrize( + "dtype", + [ + "i1", + "u1", + "i2", + "u2", + "i4", + "u4", + "i8", + "u8", + "f2", + "f4", + "f8", + "c8", + "c16", + ], +) +def test_sort_1d(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + inp = dpt.roll( + dpt.concat( + (dpt.ones(10000, dtype=dtype), dpt.zeros(10000, dtype=dtype)) + ), + 734, + ) + + s = dpt.sort(inp, descending=False) + assert dpt.all(s[:-1] <= s[1:]) + + s1 = dpt.sort(inp, descending=True) + assert dpt.all(s1[:-1] >= s1[1:]) + + +@pytest.mark.parametrize( + "dtype", + [ + "i1", + "u1", + "i2", + "u2", + "i4", + "u4", + "i8", + "u8", + "f2", + "f4", + "f8", + "c8", + "c16", + ], +) +def test_sort_2d(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + fl = dpt.roll( + dpt.concat( + (dpt.ones(10000, dtype=dtype), dpt.zeros(10000, dtype=dtype)) + ), + 734, + ) + inp = dpt.reshape(fl, (20, -1)) + + s = dpt.sort(inp, axis=1, descending=False) + assert dpt.all(s[:, :-1] <= s[:, 1:]) + + s1 = dpt.sort(inp, axis=1, descending=True) + assert dpt.all(s1[:, :-1] >= s1[:, 1:]) + + +def test_sort_strides(): + + fl = dpt.roll( + dpt.concat((dpt.ones(10000, dtype="i4"), dpt.zeros(10000, dtype="i4"))), + 734, + ) + inp = dpt.reshape(fl, (-1, 20)) + + s = dpt.sort(inp, axis=0, descending=False) + assert dpt.all(s[:-1, :] <= s[1:, :]) + + s1 = dpt.sort(inp, axis=0, descending=True) + assert dpt.all(s1[:-1, :] >= s1[1:, :]) + + +@pytest.mark.parametrize( + "dtype", + [ + "i1", + "u1", + "i2", + "u2", + "i4", + "u4", + "i8", + "u8", + "f2", + "f4", + "f8", + "c8", + "c16", + ], +) +def test_argsort_1d(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + inp = dpt.roll( + dpt.concat( + (dpt.ones(10000, dtype=dtype), dpt.zeros(10000, dtype=dtype)) + ), + 734, + ) + + s_idx = dpt.argsort(inp, descending=False) + assert dpt.all(inp[s_idx[:-1]] <= inp[s_idx[1:]]) + + s1_idx = dpt.argsort(inp, descending=True) + assert dpt.all(inp[s1_idx[:-1]] >= inp[s1_idx[1:]]) + + +def test_sort_validation(): + with pytest.raises(TypeError): + dpt.sort(dict()) + + +def test_argsort_validation(): + with pytest.raises(TypeError): + dpt.argsort(dict()) + + +def test_sort_axis0(): + get_queue_or_skip() + + n, m = 200, 30 + xf = dpt.arange(n * m, 0, step=-1, dtype="i4") + x = dpt.reshape(xf, (n, m)) + s = dpt.sort(x, axis=0) + + assert dpt.all(s[:-1, :] <= s[1:, :]) + + +def test_argsort_axis0(): + get_queue_or_skip() + + n, m = 200, 30 + xf = dpt.arange(n * m, 0, step=-1, dtype="i4") + x = dpt.reshape(xf, (n, m)) + idx = dpt.argsort(x, axis=0) + + conseq_idx = dpt.arange(m, dtype=idx.dtype) + s = x[idx, conseq_idx[dpt.newaxis, :]] + + assert dpt.all(s[:-1, :] <= s[1:, :]) + + +def test_sort_strided(): + get_queue_or_skip() + + x_orig = dpt.arange(100, dtype="i4") + x_flipped = dpt.flip(x_orig, axis=0) + s = dpt.sort(x_flipped) + + assert dpt.all(s == x_orig) + + +def test_argsort_strided(): + get_queue_or_skip() + + x_orig = dpt.arange(100, dtype="i4") + x_flipped = dpt.flip(x_orig, axis=0) + idx = dpt.argsort(x_flipped) + + assert dpt.all(x_flipped[idx] == x_orig) + + +def test_sort_0d_array(): + get_queue_or_skip() + + x = dpt.asarray(1, dtype="i4") + assert dpt.sort(x) == 1 + + +def test_argsort_0d_array(): + get_queue_or_skip() + + x = dpt.asarray(1, dtype="i4") + assert dpt.argsort(x) == 0 + + +@pytest.mark.parametrize( + "dtype", 
+ [ + "f2", + "f4", + "f8", + ], +) +def test_sort_real_fp_nan(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + x = dpt.asarray( + [-0.0, 0.1, dpt.nan, 0.0, -0.1, dpt.nan, 0.2, -0.3], dtype=dtype + ) + s = dpt.sort(x) + + expected = dpt.asarray( + [-0.3, -0.1, -0.0, 0.0, 0.1, 0.2, dpt.nan, dpt.nan], dtype=dtype + ) + + assert dpt.allclose(s, expected, equal_nan=True) + + s = dpt.sort(x, descending=True) + + expected = dpt.asarray( + [dpt.nan, dpt.nan, 0.2, 0.1, -0.0, 0.0, -0.1, -0.3], dtype=dtype + ) + + assert dpt.allclose(s, expected, equal_nan=True) + + +@pytest.mark.parametrize( + "dtype", + [ + "c8", + "c16", + ], +) +def test_sort_complex_fp_nan(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + rvs = [-0.0, 0.1, 0.0, 0.2, -0.3, dpt.nan] + ivs = [-0.0, 0.1, 0.0, 0.2, -0.3, dpt.nan] + + cv = [] + for rv in rvs: + for iv in ivs: + cv.append(complex(rv, iv)) + + inp = dpt.asarray(cv, dtype=dtype) + s = dpt.sort(inp) + + expected = np.sort(dpt.asnumpy(inp)) + + assert np.allclose(dpt.asnumpy(s), expected, equal_nan=True) + + for i, j in itertools.permutations(range(inp.shape[0]), 2): + r1 = dpt.asnumpy(dpt.sort(inp[dpt.asarray([i, j])])) + r2 = np.sort(dpt.asnumpy(inp[dpt.asarray([i, j])])) + assert np.array_equal( + r1.view(np.int64), r2.view(np.int64) + ), f"Failed for {i} and {j}" diff --git a/dpctl/tests/test_usm_ndarray_unique.py b/dpctl/tests/test_usm_ndarray_unique.py new file mode 100644 index 0000000000..0d4247def5 --- /dev/null +++ b/dpctl/tests/test_usm_ndarray_unique.py @@ -0,0 +1,304 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
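The tests in this file exercise dpctl.tensor.unique_values, unique_counts, unique_inverse and unique_all; they rely on values being returned in ascending order and on the companion index, inverse and count arrays being mutually consistent. For orientation only, the same four quantities can be computed with NumPy's np.unique (a reference sketch; NumPy is used here purely to illustrate the semantics and is not what the tests call):

import numpy as np

x = np.array([1, 0, 1, 1, 0, 2], dtype=np.int32)
values, indices, inverse, counts = np.unique(
    x, return_index=True, return_inverse=True, return_counts=True
)
assert (values == np.array([0, 1, 2], dtype=np.int32)).all()
assert (x[indices] == values).all()            # first occurrence of each value
assert (values[inverse] == x).all()            # inverse indices rebuild the input
assert (counts == np.array([2, 3, 1])).all()   # multiplicity of each value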
+ +import pytest + +import dpctl +import dpctl.tensor as dpt +from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported + + +@pytest.mark.parametrize( + "dtype", + [ + "i1", + "u1", + "i2", + "u2", + "i4", + "u4", + "i8", + "u8", + "f2", + "f4", + "f8", + "c8", + "c16", + ], +) +def test_unique_values(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n, roll = 10000, 734 + inp = dpt.roll( + dpt.concat((dpt.ones(n, dtype=dtype), dpt.zeros(n, dtype=dtype))), + roll, + ) + + uv = dpt.unique_values(inp) + assert dpt.all(uv == dpt.arange(2, dtype=dtype)) + + +def test_unique_values_strided(): + get_queue_or_skip() + + n, m = 1000, 20 + inp = dpt.ones((n, m), dtype="i4", order="F") + inp[:, ::2] = 0 + + uv = dpt.unique_values(inp) + assert dpt.all(uv == dpt.arange(2, dtype="i4")) + + inp = dpt.reshape(inp, -1) + inp = dpt.flip(dpt.reshape(inp, -1)) + + uv = dpt.unique_values(inp) + assert dpt.all(uv == dpt.arange(2, dtype="i4")) + + +@pytest.mark.parametrize( + "dtype", + [ + "i1", + "u1", + "i2", + "u2", + "i4", + "u4", + "i8", + "u8", + "f2", + "f4", + "f8", + "c8", + "c16", + ], +) +def test_unique_counts(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n, roll = 10000, 734 + inp = dpt.roll( + dpt.concat((dpt.ones(n, dtype=dtype), dpt.zeros(n, dtype=dtype))), + roll, + ) + + uv, uv_counts = dpt.unique_counts(inp) + assert dpt.all(uv == dpt.arange(2, dtype=dtype)) + assert dpt.all(uv_counts == dpt.full(2, n, dtype=uv_counts.dtype)) + + +def test_unique_counts_strided(): + get_queue_or_skip() + + n, m = 1000, 20 + inp = dpt.ones((n, m), dtype="i4", order="F") + inp[:, ::2] = 0 + + uv, uv_counts = dpt.unique_counts(inp) + assert dpt.all(uv == dpt.arange(2, dtype="i4")) + assert dpt.all(uv_counts == dpt.full(2, n / 2 * m, dtype=uv_counts.dtype)) + + inp = dpt.flip(dpt.reshape(inp, -1)) + + uv, uv_counts = dpt.unique_counts(inp) + assert dpt.all(uv == dpt.arange(2, dtype="i4")) + assert dpt.all(uv_counts == dpt.full(2, n / 2 * m, dtype=uv_counts.dtype)) + + +@pytest.mark.parametrize( + "dtype", + [ + "i1", + "u1", + "i2", + "u2", + "i4", + "u4", + "i8", + "u8", + "f2", + "f4", + "f8", + "c8", + "c16", + ], +) +def test_unique_inverse(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n, roll = 10000, 734 + inp = dpt.roll( + dpt.concat((dpt.ones(n, dtype=dtype), dpt.zeros(n, dtype=dtype))), + roll, + ) + + uv, inv = dpt.unique_inverse(inp) + assert dpt.all(uv == dpt.arange(2, dtype=dtype)) + assert dpt.all(inp == uv[inv]) + assert inp.shape == inv.shape + + +def test_unique_inverse_strided(): + get_queue_or_skip() + + n, m = 1000, 20 + inp = dpt.ones((n, m), dtype="i4", order="F") + inp[:, ::2] = 0 + + uv, inv = dpt.unique_inverse(inp) + assert dpt.all(uv == dpt.arange(2, dtype="i4")) + assert dpt.all(inp == uv[inv]) + assert inp.shape == inv.shape + + inp = dpt.flip(dpt.reshape(inp, -1)) + + uv, inv = dpt.unique_inverse(inp) + assert dpt.all(uv == dpt.arange(2, dtype="i4")) + assert dpt.all(inp == uv[inv]) + assert inp.shape == inv.shape + + +@pytest.mark.parametrize( + "dtype", + [ + "i1", + "u1", + "i2", + "u2", + "i4", + "u4", + "i8", + "u8", + "f2", + "f4", + "f8", + "c8", + "c16", + ], +) +def test_unique_all(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n, roll = 10000, 734 + inp = dpt.roll( + dpt.concat((dpt.ones(n, dtype=dtype), dpt.zeros(n, dtype=dtype))), + roll, + ) + + uv, ind, inv, uv_counts = dpt.unique_all(inp) + assert dpt.all(uv == dpt.arange(2, 
dtype=dtype)) + assert dpt.all(uv == inp[ind]) + assert dpt.all(inp == uv[inv]) + assert inp.shape == inv.shape + assert dpt.all(uv_counts == dpt.full(2, n, dtype=uv_counts.dtype)) + + +def test_unique_all_strided(): + get_queue_or_skip() + + n, m = 1000, 20 + inp = dpt.ones((n, m), dtype="i4", order="F") + inp[:, ::2] = 0 + + uv, ind, inv, uv_counts = dpt.unique_all(inp) + assert dpt.all(uv == dpt.arange(2, dtype="i4")) + assert dpt.all(uv == dpt.reshape(inp, -1)[ind]) + assert dpt.all(inp == uv[inv]) + assert inp.shape == inv.shape + assert dpt.all(uv_counts == dpt.full(2, n / 2 * m, dtype=uv_counts.dtype)) + + inp = dpt.flip(dpt.reshape(inp, -1)) + + uv, ind, inv, uv_counts = dpt.unique_all(inp) + assert dpt.all(uv == dpt.arange(2, dtype="i4")) + assert dpt.all(uv == inp[ind]) + assert dpt.all(inp == uv[inv]) + assert inp.shape == inv.shape + assert dpt.all(uv_counts == dpt.full(2, n / 2 * m, dtype=uv_counts.dtype)) + + +def test_set_functions_empty_input(): + get_queue_or_skip() + x = dpt.ones((10, 0, 1), dtype="i4") + + res = dpt.unique_values(x) + assert isinstance(res, dpctl.tensor.usm_ndarray) + assert res.size == 0 + assert res.dtype == x.dtype + + res = dpt.unique_inverse(x) + assert type(res).__name__ == "UniqueInverseResult" + uv, inv = res + assert isinstance(uv, dpctl.tensor.usm_ndarray) + assert uv.size == 0 + assert isinstance(inv, dpctl.tensor.usm_ndarray) + assert inv.size == 0 + + res = dpt.unique_counts(x) + assert type(res).__name__ == "UniqueCountsResult" + uv, uv_counts = res + assert isinstance(uv, dpctl.tensor.usm_ndarray) + assert uv.size == 0 + assert isinstance(uv_counts, dpctl.tensor.usm_ndarray) + assert uv_counts.size == 0 + + res = dpt.unique_all(x) + assert type(res).__name__ == "UniqueAllResult" + uv, ind, inv, uv_counts = res + assert isinstance(uv, dpctl.tensor.usm_ndarray) + assert uv.size == 0 + assert isinstance(ind, dpctl.tensor.usm_ndarray) + assert ind.size == 0 + assert isinstance(inv, dpctl.tensor.usm_ndarray) + assert inv.size == 0 + assert isinstance(uv_counts, dpctl.tensor.usm_ndarray) + assert uv_counts.size == 0 + + +def test_set_function_outputs(): + get_queue_or_skip() + # check standard and early exit paths + x1 = dpt.arange(10, dtype="i4") + x2 = dpt.ones((10, 10), dtype="i4") + + assert isinstance(dpt.unique_values(x1), dpctl.tensor.usm_ndarray) + assert isinstance(dpt.unique_values(x2), dpctl.tensor.usm_ndarray) + + assert type(dpt.unique_inverse(x1)).__name__ == "UniqueInverseResult" + assert type(dpt.unique_inverse(x2)).__name__ == "UniqueInverseResult" + + assert type(dpt.unique_counts(x1)).__name__ == "UniqueCountsResult" + assert type(dpt.unique_counts(x2)).__name__ == "UniqueCountsResult" + + assert type(dpt.unique_all(x1)).__name__ == "UniqueAllResult" + assert type(dpt.unique_all(x2)).__name__ == "UniqueAllResult" + + +def test_set_functions_compute_follows_data(): + # tests that all intermediate calls and allocations + # are compatible with an input with an arbitrary queue + q = dpctl.SyclQueue() + x = dpt.arange(10, dtype="i4", sycl_queue=q) + + assert isinstance(dpt.unique_values(x), dpctl.tensor.usm_ndarray) + assert dpt.unique_counts(x) + assert dpt.unique_inverse(x) + assert dpt.unique_all(x)
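Taken together, the changes add dpt.sort, dpt.argsort and the unique_* set functions to the dpctl.tensor namespace. A minimal usage sketch, assuming a default SYCL device is available (the array values are chosen only for illustration):

import dpctl.tensor as dpt

# Sort rows of a 2-D array and recover the sorting permutation.
x = dpt.reshape(dpt.arange(12, 0, step=-1, dtype="i4"), (3, 4))
s = dpt.sort(x, axis=1)                   # ascending along the last axis
d = dpt.sort(x, axis=1, descending=True)  # descending order
p = dpt.argsort(x, axis=1)                # indices that sort each row

# Set functions flatten inputs with more than one dimension.
vals, idx, inv, cnts = dpt.unique_all(x)
assert dpt.all(dpt.reshape(x, -1)[idx] == vals)  # idx indexes the flattened input
assert dpt.all(vals[inv] == x)                   # inv has the shape of the input
assert dpt.all(cnts == 1)                        # all values are distinct here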