diff --git a/dpctl/tensor/CMakeLists.txt b/dpctl/tensor/CMakeLists.txt index de8be6fae0..d2947aa772 100644 --- a/dpctl/tensor/CMakeLists.txt +++ b/dpctl/tensor/CMakeLists.txt @@ -118,6 +118,10 @@ set(_reduction_sources set(_boolean_reduction_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_reductions.cpp ) +set(_sorting_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/sort.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/argsort.cpp +) set(_tensor_impl_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_ctors.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/simplify_iteration_space.cpp @@ -148,6 +152,10 @@ set(_tensor_reductions_impl_sources ${_boolean_reduction_sources} ${_reduction_sources} ) +set(_tensor_sorting_impl_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_sorting.cpp + ${_sorting_sources} +) set(_py_trgts) @@ -166,6 +174,11 @@ pybind11_add_module(${python_module_name} MODULE ${_tensor_reductions_impl_sourc add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_reductions_impl_sources}) list(APPEND _py_trgts ${python_module_name}) +set(python_module_name _tensor_sorting_impl) +pybind11_add_module(${python_module_name} MODULE ${_tensor_sorting_impl_sources}) +add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_sorting_impl_sources}) +list(APPEND _py_trgts ${python_module_name}) + set(_clang_prefix "") if (WIN32) set(_clang_prefix "/clang:") @@ -179,6 +192,7 @@ set(_no_fast_math_sources list(APPEND _no_fast_math_sources ${_elementwise_sources} ${_reduction_sources} + ${_sorting_sources} ) foreach(_src_fn ${_no_fast_math_sources}) diff --git a/dpctl/tensor/__init__.py b/dpctl/tensor/__init__.py index cdb701e1cb..81fc152e7a 100644 --- a/dpctl/tensor/__init__.py +++ b/dpctl/tensor/__init__.py @@ -175,6 +175,13 @@ reduce_hypot, sum, ) +from ._set_functions import ( + unique_all, + unique_counts, + unique_inverse, + unique_values, +) +from ._sorting import argsort, sort from ._testing import allclose from ._type_utils import can_cast, finfo, iinfo, result_type @@ -343,4 +350,10 @@ "__array_namespace_info__", "reciprocal", "angle", + "sort", + "argsort", + "unique_all", + "unique_counts", + "unique_inverse", + "unique_values", ] diff --git a/dpctl/tensor/_set_functions.py b/dpctl/tensor/_set_functions.py new file mode 100644 index 0000000000..f18d64d715 --- /dev/null +++ b/dpctl/tensor/_set_functions.py @@ -0,0 +1,606 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
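+#
+# This module provides the array API set functions (unique_values,
+# unique_counts, unique_inverse, unique_all), implemented on top of the
+# sorting, mask-accumulation and extraction kernels exposed by the
+# _tensor_sorting_impl, _tensor_impl and _tensor_elementwise_impl
+# extensions.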
+ +from typing import NamedTuple + +import dpctl +import dpctl.tensor as dpt + +from ._tensor_elementwise_impl import _not_equal, _subtract +from ._tensor_impl import ( + _copy_usm_ndarray_into_usm_ndarray, + _extract, + _full_usm_ndarray, + _linspace_step, + _take, + default_device_index_type, + mask_positions, +) +from ._tensor_sorting_impl import _argsort_ascending, _sort_ascending + +__all__ = [ + "unique_values", + "unique_counts", + "unique_inverse", + "unique_all", + "UniqueAllResult", + "UniqueCountsResult", + "UniqueInverseResult", +] + + +class UniqueAllResult(NamedTuple): + values: dpt.usm_ndarray + indices: dpt.usm_ndarray + inverse_indices: dpt.usm_ndarray + counts: dpt.usm_ndarray + + +class UniqueCountsResult(NamedTuple): + values: dpt.usm_ndarray + counts: dpt.usm_ndarray + + +class UniqueInverseResult(NamedTuple): + values: dpt.usm_ndarray + inverse_indices: dpt.usm_ndarray + + +def unique_values(x: dpt.usm_ndarray) -> dpt.usm_ndarray: + """unique_values(x) + + Returns the unique elements of an input array x. + + Args: + x (usm_ndarray): + input array. The input with more than one dimension is flattened. + Returns: + usm_ndarray + an array containing the set of unique elements in `x`. The + returned array has the same data type as `x`. + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") + array_api_dev = x.device + exec_q = array_api_dev.sycl_queue + if x.ndim == 1: + fx = x + else: + fx = dpt.reshape(x, (x.size,), order="C") + if fx.size == 0: + return fx + s = dpt.empty_like(fx, order="C") + host_tasks = [] + if fx.flags.c_contiguous: + ht_ev, sort_ev = _sort_ascending( + src=fx, trailing_dims_to_sort=1, dst=s, sycl_queue=exec_q + ) + host_tasks.append(ht_ev) + else: + tmp = dpt.empty_like(fx, order="C") + ht_ev, copy_ev = _copy_usm_ndarray_into_usm_ndarray( + src=fx, dst=tmp, sycl_queue=exec_q + ) + host_tasks.append(ht_ev) + ht_ev, sort_ev = _sort_ascending( + src=tmp, + trailing_dims_to_sort=1, + dst=s, + sycl_queue=exec_q, + depends=[copy_ev], + ) + host_tasks.append(ht_ev) + unique_mask = dpt.empty(fx.shape, dtype="?", sycl_queue=exec_q) + ht_ev, uneq_ev = _not_equal( + src1=s[:-1], + src2=s[1:], + dst=unique_mask[1:], + sycl_queue=exec_q, + depends=[sort_ev], + ) + host_tasks.append(ht_ev) + ht_ev, one_ev = _full_usm_ndarray( + fill_value=True, dst=unique_mask[0], sycl_queue=exec_q + ) + host_tasks.append(ht_ev) + cumsum = dpt.empty(s.shape, dtype=dpt.int64, sycl_queue=exec_q) + # synchronizing call + n_uniques = mask_positions( + unique_mask, cumsum, sycl_queue=exec_q, depends=[one_ev, uneq_ev] + ) + if n_uniques == fx.size: + dpctl.SyclEvent.wait_for(host_tasks) + return s + unique_vals = dpt.empty( + n_uniques, dtype=x.dtype, usm_type=x.usm_type, sycl_queue=exec_q + ) + ht_ev, _ = _extract( + src=s, + cumsum=cumsum, + axis_start=0, + axis_end=1, + dst=unique_vals, + sycl_queue=exec_q, + ) + host_tasks.append(ht_ev) + dpctl.SyclEvent.wait_for(host_tasks) + return unique_vals + + +def unique_counts(x: dpt.usm_ndarray) -> UniqueCountsResult: + """unique_counts(x) + + Returns the unique elements of an input array `x` and the corresponding + counts for each unique element in `x`. + + Args: + x (usm_ndarray): + input array. The input with more than one dimension is flattened. + Returns: + tuple[usm_ndarray, usm_ndarray] + a namedtuple `(values, counts)` whose + + * first element is the field name `values` and is an array + containing the unique elements of `x`. 
This array has the + same data type as `x`. + * second element has the field name `counts` and is an array + containing the number of times each unique element occurs in `x`. + This array has the same shape as `values` and has the default + array index data type. + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") + array_api_dev = x.device + exec_q = array_api_dev.sycl_queue + x_usm_type = x.usm_type + if x.ndim == 1: + fx = x + else: + fx = dpt.reshape(x, (x.size,), order="C") + ind_dt = default_device_index_type(exec_q) + if fx.size == 0: + return UniqueCountsResult(fx, dpt.empty_like(fx, dtype=ind_dt)) + s = dpt.empty_like(fx, order="C") + host_tasks = [] + if fx.flags.c_contiguous: + ht_ev, sort_ev = _sort_ascending( + src=fx, trailing_dims_to_sort=1, dst=s, sycl_queue=exec_q + ) + host_tasks.append(ht_ev) + else: + tmp = dpt.empty_like(fx, order="C") + ht_ev, copy_ev = _copy_usm_ndarray_into_usm_ndarray( + src=fx, dst=tmp, sycl_queue=exec_q + ) + host_tasks.append(ht_ev) + ht_ev, sort_ev = _sort_ascending( + src=tmp, + dst=s, + trailing_dims_to_sort=1, + sycl_queue=exec_q, + depends=[copy_ev], + ) + host_tasks.append(ht_ev) + unique_mask = dpt.empty(s.shape, dtype="?", sycl_queue=exec_q) + ht_ev, uneq_ev = _not_equal( + src1=s[:-1], + src2=s[1:], + dst=unique_mask[1:], + sycl_queue=exec_q, + depends=[sort_ev], + ) + host_tasks.append(ht_ev) + ht_ev, one_ev = _full_usm_ndarray( + fill_value=True, dst=unique_mask[0], sycl_queue=exec_q + ) + host_tasks.append(ht_ev) + cumsum = dpt.empty(unique_mask.shape, dtype=dpt.int64, sycl_queue=exec_q) + # synchronizing call + n_uniques = mask_positions( + unique_mask, cumsum, sycl_queue=exec_q, depends=[one_ev, uneq_ev] + ) + if n_uniques == fx.size: + dpctl.SyclEvent.wait_for(host_tasks) + return UniqueCountsResult( + s, + dpt.ones( + n_uniques, dtype=ind_dt, usm_type=x_usm_type, sycl_queue=exec_q + ), + ) + unique_vals = dpt.empty( + n_uniques, dtype=x.dtype, usm_type=x_usm_type, sycl_queue=exec_q + ) + # populate unique values + ht_ev, _ = _extract( + src=s, + cumsum=cumsum, + axis_start=0, + axis_end=1, + dst=unique_vals, + sycl_queue=exec_q, + ) + host_tasks.append(ht_ev) + unique_counts = dpt.empty( + n_uniques + 1, dtype=ind_dt, usm_type=x_usm_type, sycl_queue=exec_q + ) + idx = dpt.empty(x.size, dtype=ind_dt, sycl_queue=exec_q) + ht_ev, id_ev = _linspace_step(start=0, dt=1, dst=idx, sycl_queue=exec_q) + host_tasks.append(ht_ev) + ht_ev, extr_ev = _extract( + src=idx, + cumsum=cumsum, + axis_start=0, + axis_end=1, + dst=unique_counts[:-1], + sycl_queue=exec_q, + depends=[id_ev], + ) + host_tasks.append(ht_ev) + ht_ev, set_ev = _full_usm_ndarray( + x.size, dst=unique_counts[-1], sycl_queue=exec_q + ) + host_tasks.append(ht_ev) + _counts = dpt.empty_like(unique_counts[1:]) + ht_ev, _ = _subtract( + src1=unique_counts[1:], + src2=unique_counts[:-1], + dst=_counts, + sycl_queue=exec_q, + depends=[set_ev, extr_ev], + ) + host_tasks.append(ht_ev) + dpctl.SyclEvent.wait_for(host_tasks) + return UniqueCountsResult(unique_vals, _counts) + + +def unique_inverse(x): + """unique_inverse + + Returns the unique elements of an input array x and the indices from the + set of unique elements that reconstruct `x`. + + Args: + x (usm_ndarray): + input array. The input with more than one dimension is flattened. 
+ Returns: + tuple[usm_ndarray, usm_ndarray] + a namedtuple `(values, inverse_indices)` whose + + * first element has the field name `values` and is an array + containing the unique elements of `x`. The array has the same + data type as `x`. + * second element has the field name `inverse_indices` and is an + array containing the indices of values that reconstruct `x`. + The array has the same shape as `x` and has the default array + index data type. + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") + array_api_dev = x.device + exec_q = array_api_dev.sycl_queue + x_usm_type = x.usm_type + ind_dt = default_device_index_type(exec_q) + if x.ndim == 1: + fx = x + else: + fx = dpt.reshape(x, (x.size,), order="C") + sorting_ids = dpt.empty_like(fx, dtype=ind_dt, order="C") + unsorting_ids = dpt.empty_like(sorting_ids, dtype=ind_dt, order="C") + if fx.size == 0: + return UniqueInverseResult(fx, dpt.reshape(unsorting_ids, x.shape)) + host_tasks = [] + if fx.flags.c_contiguous: + ht_ev, sort_ev = _argsort_ascending( + src=fx, trailing_dims_to_sort=1, dst=sorting_ids, sycl_queue=exec_q + ) + host_tasks.append(ht_ev) + else: + tmp = dpt.empty_like(fx, order="C") + ht_ev, copy_ev = _copy_usm_ndarray_into_usm_ndarray( + src=fx, dst=tmp, sycl_queue=exec_q + ) + host_tasks.append(ht_ev) + ht_ev, sort_ev = _argsort_ascending( + src=tmp, + trailing_dims_to_sort=1, + dst=sorting_ids, + sycl_queue=exec_q, + depends=[copy_ev], + ) + host_tasks.append(ht_ev) + ht_ev, _ = _argsort_ascending( + src=sorting_ids, + trailing_dims_to_sort=1, + dst=unsorting_ids, + sycl_queue=exec_q, + depends=[sort_ev], + ) + host_tasks.append(ht_ev) + s = dpt.empty_like(fx) + # s = fx[sorting_ids] + ht_ev, take_ev = _take( + src=fx, + ind=(sorting_ids,), + dst=s, + axis_start=0, + mode=0, + sycl_queue=exec_q, + depends=[sort_ev], + ) + host_tasks.append(ht_ev) + unique_mask = dpt.empty(fx.shape, dtype="?", sycl_queue=exec_q) + ht_ev, uneq_ev = _not_equal( + src1=s[:-1], + src2=s[1:], + dst=unique_mask[1:], + sycl_queue=exec_q, + depends=[take_ev], + ) + host_tasks.append(ht_ev) + ht_ev, one_ev = _full_usm_ndarray( + fill_value=True, dst=unique_mask[0], sycl_queue=exec_q + ) + host_tasks.append(ht_ev) + cumsum = dpt.empty(unique_mask.shape, dtype=dpt.int64, sycl_queue=exec_q) + # synchronizing call + n_uniques = mask_positions( + unique_mask, cumsum, sycl_queue=exec_q, depends=[uneq_ev, one_ev] + ) + if n_uniques == fx.size: + dpctl.SyclEvent.wait_for(host_tasks) + return UniqueInverseResult(s, dpt.reshape(unsorting_ids, x.shape)) + unique_vals = dpt.empty( + n_uniques, dtype=x.dtype, usm_type=x_usm_type, sycl_queue=exec_q + ) + ht_ev, _ = _extract( + src=s, + cumsum=cumsum, + axis_start=0, + axis_end=1, + dst=unique_vals, + sycl_queue=exec_q, + ) + host_tasks.append(ht_ev) + cum_unique_counts = dpt.empty( + n_uniques + 1, dtype=ind_dt, usm_type=x_usm_type, sycl_queue=exec_q + ) + idx = dpt.empty(x.size, dtype=ind_dt, sycl_queue=exec_q) + ht_ev, id_ev = _linspace_step(start=0, dt=1, dst=idx, sycl_queue=exec_q) + host_tasks.append(ht_ev) + ht_ev, extr_ev = _extract( + src=idx, + cumsum=cumsum, + axis_start=0, + axis_end=1, + dst=cum_unique_counts[:-1], + sycl_queue=exec_q, + depends=[id_ev], + ) + host_tasks.append(ht_ev) + ht_ev, set_ev = _full_usm_ndarray( + x.size, dst=cum_unique_counts[-1], sycl_queue=exec_q + ) + host_tasks.append(ht_ev) + _counts = dpt.empty_like(cum_unique_counts[1:]) + ht_ev, _ = _subtract( + src1=cum_unique_counts[1:], + 
src2=cum_unique_counts[:-1], + dst=_counts, + sycl_queue=exec_q, + depends=[set_ev, extr_ev], + ) + host_tasks.append(ht_ev) + # TODO: when searchsorted is available, + # inv = searchsorted(unique_vals, fx) + dpctl.SyclEvent.wait_for(host_tasks) + counts = dpt.asnumpy(_counts).tolist() + inv = dpt.empty_like(fx, dtype=ind_dt) + pos = 0 + host_tasks = [] + for i in range(len(counts)): + pos_next = pos + counts[i] + _dst = inv[pos:pos_next] + ht_ev, _ = _full_usm_ndarray(fill_value=i, dst=_dst, sycl_queue=exec_q) + pos = pos_next + host_tasks.append(ht_ev) + dpctl.SyclEvent.wait_for(host_tasks) + return UniqueInverseResult( + unique_vals, dpt.reshape(inv[unsorting_ids], x.shape) + ) + + +def unique_all(x: dpt.usm_ndarray) -> UniqueAllResult: + """unique_all(x) + + Returns the unique elements of an input array `x`, the first occurring + indices for each unique element in `x`, the indices from the set of unique + elements that reconstruct `x`, and the corresponding counts for each + unique element in `x`. + + Args: + x (usm_ndarray): + input array. The input with more than one dimension is flattened. + Returns: + tuple[usm_ndarray, usm_ndarray, usm_ndarray, usm_ndarray] + a namedtuple `(values, indices, inverse_indices, counts)` whose + + * first element has the field name `values` and is an array + containing the unique elements of `x`. The array has the same + data type as `x`. + * second element has the field name `indices` and is an array + the indices (of first occurrences) of `x` that result in + `values`. The array has the same shape as `values` and has the + default array index data type. + * third element has the field name `inverse_indices` and is an + array containing the indices of values that reconstruct `x`. + The array has the same shape as `x` and has the default array + index data type. + * fourth element has the field name `counts` and is an array + containing the number of times each unique element occurs in `x`. + This array has the same shape as `values` and has the default + array index data type. 
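+
+    A minimal illustrative sketch (array values are hypothetical):
+
+        import dpctl.tensor as dpt
+
+        x = dpt.asarray([1, 2, 1, 3, 2])
+        r = dpt.unique_all(x)
+        # r.values           -> unique elements of x
+        # r.indices          -> index of first occurrence of each unique value
+        # r.inverse_indices  -> indices reconstructing x from r.values
+        # r.counts           -> number of occurrences of each unique value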
+ """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") + array_api_dev = x.device + exec_q = array_api_dev.sycl_queue + x_usm_type = x.usm_type + ind_dt = default_device_index_type(exec_q) + if x.ndim == 1: + fx = x + else: + fx = dpt.reshape(x, (x.size,), order="C") + sorting_ids = dpt.empty_like(fx, dtype=ind_dt, order="C") + unsorting_ids = dpt.empty_like(sorting_ids, dtype=ind_dt, order="C") + if fx.size == 0: + # original array contains no data + # so it can be safely returned as values + return UniqueAllResult( + fx, + sorting_ids, + dpt.reshape(unsorting_ids, x.shape), + dpt.empty_like(fx, dtype=ind_dt), + ) + host_tasks = [] + if fx.flags.c_contiguous: + ht_ev, sort_ev = _argsort_ascending( + src=fx, trailing_dims_to_sort=1, dst=sorting_ids, sycl_queue=exec_q + ) + host_tasks.append(ht_ev) + else: + tmp = dpt.empty_like(fx, order="C") + ht_ev, copy_ev = _copy_usm_ndarray_into_usm_ndarray( + src=fx, dst=tmp, sycl_queue=exec_q + ) + host_tasks.append(ht_ev) + ht_ev, sort_ev = _argsort_ascending( + src=tmp, + trailing_dims_to_sort=1, + dst=sorting_ids, + sycl_queue=exec_q, + depends=[copy_ev], + ) + host_tasks.append(ht_ev) + ht_ev, _ = _argsort_ascending( + src=sorting_ids, + trailing_dims_to_sort=1, + dst=unsorting_ids, + sycl_queue=exec_q, + depends=[sort_ev], + ) + host_tasks.append(ht_ev) + s = dpt.empty_like(fx) + # s = fx[sorting_ids] + ht_ev, take_ev = _take( + src=fx, + ind=(sorting_ids,), + dst=s, + axis_start=0, + mode=0, + sycl_queue=exec_q, + depends=[sort_ev], + ) + host_tasks.append(ht_ev) + unique_mask = dpt.empty(fx.shape, dtype="?", sycl_queue=exec_q) + ht_ev, uneq_ev = _not_equal( + src1=s[:-1], + src2=s[1:], + dst=unique_mask[1:], + sycl_queue=exec_q, + depends=[take_ev], + ) + host_tasks.append(ht_ev) + ht_ev, one_ev = _full_usm_ndarray( + fill_value=True, dst=unique_mask[0], sycl_queue=exec_q + ) + host_tasks.append(ht_ev) + cumsum = dpt.empty(unique_mask.shape, dtype=dpt.int64, sycl_queue=exec_q) + # synchronizing call + n_uniques = mask_positions( + unique_mask, cumsum, sycl_queue=exec_q, depends=[uneq_ev, one_ev] + ) + if n_uniques == fx.size: + dpctl.SyclEvent.wait_for(host_tasks) + _counts = dpt.ones( + n_uniques, dtype=ind_dt, usm_type=x_usm_type, sycl_queue=exec_q + ) + return UniqueAllResult( + s, + sorting_ids, + dpt.reshape(unsorting_ids, x.shape), + _counts, + ) + unique_vals = dpt.empty( + n_uniques, dtype=x.dtype, usm_type=x_usm_type, sycl_queue=exec_q + ) + ht_ev, _ = _extract( + src=s, + cumsum=cumsum, + axis_start=0, + axis_end=1, + dst=unique_vals, + sycl_queue=exec_q, + ) + host_tasks.append(ht_ev) + cum_unique_counts = dpt.empty( + n_uniques + 1, dtype=ind_dt, usm_type=x_usm_type, sycl_queue=exec_q + ) + idx = dpt.empty(x.size, dtype=ind_dt, sycl_queue=exec_q) + ht_ev, id_ev = _linspace_step(start=0, dt=1, dst=idx, sycl_queue=exec_q) + host_tasks.append(ht_ev) + ht_ev, extr_ev = _extract( + src=idx, + cumsum=cumsum, + axis_start=0, + axis_end=1, + dst=cum_unique_counts[:-1], + sycl_queue=exec_q, + depends=[id_ev], + ) + host_tasks.append(ht_ev) + ht_ev, set_ev = _full_usm_ndarray( + x.size, dst=cum_unique_counts[-1], sycl_queue=exec_q + ) + host_tasks.append(ht_ev) + _counts = dpt.empty_like(cum_unique_counts[1:]) + ht_ev, _ = _subtract( + src1=cum_unique_counts[1:], + src2=cum_unique_counts[:-1], + dst=_counts, + sycl_queue=exec_q, + depends=[set_ev, extr_ev], + ) + host_tasks.append(ht_ev) + # TODO: when searchsorted is available, + # inv = searchsorted(unique_vals, fx) + 
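+    # Until searchsorted is available, bring the per-value counts to the
+    # host and fill the sorted inverse map segment by segment: positions
+    # [pos, pos + counts[i]) receive index i. Indexing with unsorting_ids
+    # below restores the original element order.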
dpctl.SyclEvent.wait_for(host_tasks) + counts = dpt.asnumpy(_counts).tolist() + inv = dpt.empty_like(fx, dtype=ind_dt) + pos = 0 + host_tasks = [] + for i in range(len(counts)): + pos_next = pos + counts[i] + _dst = inv[pos:pos_next] + ht_ev, _ = _full_usm_ndarray(fill_value=i, dst=_dst, sycl_queue=exec_q) + pos = pos_next + host_tasks.append(ht_ev) + dpctl.SyclEvent.wait_for(host_tasks) + return UniqueAllResult( + unique_vals, + sorting_ids[cum_unique_counts[:-1]], + dpt.reshape(inv[unsorting_ids], x.shape), + _counts, + ) diff --git a/dpctl/tensor/_sorting.py b/dpctl/tensor/_sorting.py new file mode 100644 index 0000000000..661c3bc1c4 --- /dev/null +++ b/dpctl/tensor/_sorting.py @@ -0,0 +1,186 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from numpy.core.numeric import normalize_axis_index + +import dpctl +import dpctl.tensor as dpt +import dpctl.tensor._tensor_impl as ti + +from ._tensor_sorting_impl import ( + _argsort_ascending, + _argsort_descending, + _sort_ascending, + _sort_descending, +) + +__all__ = ["sort", "argsort"] + + +def sort(x, /, *, axis=-1, descending=False, stable=True): + """sort(x, axis=-1, descending=False, stable=True) + + Returns a sorted copy of an input array `x`. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int]): + axis along which to sort. If set to `-1`, the function + must sort along the last axis. Default: `-1`. + descending (Optional[bool]): + sort order. If `True`, the array must be sorted in descending + order (by value). If `False`, the array must be sorted in + ascending order (by value). Default: `False`. + stable (Optional[bool]): + sort stability. If `True`, the returned array must maintain the + relative order of `x` values which compare as equal. If `False`, + the returned array may or may not maintain the relative order of + `x` values which compare as equal. Default: `True`. + + Returns: + usm_ndarray: + a sorted array. The returned array has the same data type and + the same shape as the input array `x`. 
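+
+    A minimal illustrative sketch (array values are hypothetical):
+
+        import dpctl.tensor as dpt
+
+        x = dpt.asarray([[3, 1], [2, 0]])
+        y = dpt.sort(x, axis=0)           # sorts each column independently
+        z = dpt.sort(x, descending=True)  # sorts each row in descending order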
+ """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError( + f"Expected type dpctl.tensor.usm_ndarray, got {type(x)}" + ) + nd = x.ndim + if nd == 0: + axis = normalize_axis_index(axis, ndim=1, msg_prefix="axis") + return dpt.copy(x, order="C") + else: + axis = normalize_axis_index(axis, ndim=nd, msg_prefix="axis") + a1 = axis + 1 + if a1 == nd: + perm = list(range(nd)) + arr = x + else: + perm = [i for i in range(nd) if i != axis] + [ + axis, + ] + arr = dpt.permute_dims(x, perm) + exec_q = x.sycl_queue + host_tasks_list = [] + impl_fn = _sort_descending if descending else _sort_ascending + if arr.flags.c_contiguous: + res = dpt.empty_like(arr, order="C") + ht_ev, _ = impl_fn( + src=arr, trailing_dims_to_sort=1, dst=res, sycl_queue=exec_q + ) + host_tasks_list.append(ht_ev) + else: + tmp = dpt.empty_like(arr, order="C") + ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=arr, dst=tmp, sycl_queue=exec_q + ) + host_tasks_list.append(ht_ev) + res = dpt.empty_like(arr, order="C") + ht_ev, _ = impl_fn( + src=tmp, + trailing_dims_to_sort=1, + dst=res, + sycl_queue=exec_q, + depends=[copy_ev], + ) + host_tasks_list.append(ht_ev) + if a1 != nd: + inv_perm = sorted(range(nd), key=lambda d: perm[d]) + res = dpt.permute_dims(res, inv_perm) + dpctl.SyclEvent.wait_for(host_tasks_list) + return res + + +def argsort(x, axis=-1, descending=False, stable=True): + """argsort(x, axis=-1, descending=False, stable=True) + + Returns the indices that sort an array `x` along a specified axis. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int]): + axis along which to sort. If set to `-1`, the function + must sort along the last axis. Default: `-1`. + descending (Optional[bool]): + sort order. If `True`, the array must be sorted in descending + order (by value). If `False`, the array must be sorted in + ascending order (by value). Default: `False`. + stable (Optional[bool]): + sort stability. If `True`, the returned array must maintain the + relative order of `x` values which compare as equal. If `False`, + the returned array may or may not maintain the relative order of + `x` values which compare as equal. Default: `True`. + + Returns: + usm_ndarray: + an array of indices. The returned array has the same shape as + the input array `x`. The return array has default array index + data type. 
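+
+    A minimal illustrative sketch (array values are hypothetical):
+
+        import dpctl.tensor as dpt
+
+        x = dpt.asarray([30, 10, 20])
+        ind = dpt.argsort(x)  # indices such that dpt.take(x, ind) is sorted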
+ """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError( + f"Expected type dpctl.tensor.usm_ndarray, got {type(x)}" + ) + nd = x.ndim + if nd == 0: + axis = normalize_axis_index(axis, ndim=1, msg_prefix="axis") + return dpt.zeros_like( + x, dtype=ti.default_device_index_type(x.sycl_queue), order="C" + ) + else: + axis = normalize_axis_index(axis, ndim=nd, msg_prefix="axis") + a1 = axis + 1 + if a1 == nd: + perm = list(range(nd)) + arr = x + else: + perm = [i for i in range(nd) if i != axis] + [ + axis, + ] + arr = dpt.permute_dims(x, perm) + exec_q = x.sycl_queue + host_tasks_list = [] + impl_fn = _argsort_descending if descending else _argsort_ascending + index_dt = ti.default_device_index_type(exec_q) + if arr.flags.c_contiguous: + res = dpt.empty_like(arr, dtype=index_dt, order="C") + ht_ev, _ = impl_fn( + src=arr, trailing_dims_to_sort=1, dst=res, sycl_queue=exec_q + ) + host_tasks_list.append(ht_ev) + else: + tmp = dpt.empty_like(arr, order="C") + ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=arr, dst=tmp, sycl_queue=exec_q + ) + host_tasks_list.append(ht_ev) + res = dpt.empty_like(arr, dtype=index_dt, order="C") + ht_ev, _ = impl_fn( + src=tmp, + trailing_dims_to_sort=1, + dst=res, + sycl_queue=exec_q, + depends=[copy_ev], + ) + host_tasks_list.append(ht_ev) + if a1 != nd: + inv_perm = sorted(range(nd), key=lambda d: perm[d]) + res = dpt.permute_dims(res, inv_perm) + dpctl.SyclEvent.wait_for(host_tasks_list) + return res diff --git a/dpctl/tensor/libtensor/include/kernels/accumulators.hpp b/dpctl/tensor/libtensor/include/kernels/accumulators.hpp index bd6ad20b6d..491fb12126 100644 --- a/dpctl/tensor/libtensor/include/kernels/accumulators.hpp +++ b/dpctl/tensor/libtensor/include/kernels/accumulators.hpp @@ -51,23 +51,6 @@ template T ceiling_quotient(T n, T m) { return (n + m - 1) / m; } -template T1 ceiling_quotient(T1 n, T2 m) -{ - return ceiling_quotient(n, static_cast(m)); -} - -template -class inclusive_scan_rec_local_scan_krn; - -template -class inclusive_scan_rec_chunk_update_krn; template struct NonZeroIndicator { @@ -93,127 +76,261 @@ template struct NoOpTransformer } }; -/* - * for integer type maskT, - * output[j] = sum( input[s0 + i * s1], 0 <= i <= j) - * for 0 <= j < n_elems - */ +// Iterative cumulative summation + +using nwiT = std::uint16_t; + template -sycl::event inclusive_scan_rec(sycl::queue &exec_q, - size_t n_elems, - size_t wg_size, - const inputT *input, - outputT *output, - size_t s0, - size_t s1, - IndexerT indexer, - TransformerT transformer, - std::vector const &depends = {}) +class inclusive_scan_iter_local_scan_krn; + +template +class inclusive_scan_iter_chunk_update_krn; + +template +sycl::event +inclusive_scan_base_step(sycl::queue &exec_q, + const size_t wg_size, + const size_t n_elems, + const inputT *input, + outputT *output, + const size_t s0, + const size_t s1, + IndexerT indexer, + TransformerT transformer, + size_t &n_groups, + const std::vector &depends = {}) { - size_t n_groups = ceiling_quotient(n_elems, n_wi * wg_size); + n_groups = ceiling_quotient(n_elems, n_wi * wg_size); - const sycl::event &inc_scan_phase1_ev = - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); + sycl::event inc_scan_phase1_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); - using slmT = sycl::local_accessor; + using slmT = sycl::local_accessor; - auto lws = sycl::range<1>(wg_size); - auto gws = sycl::range<1>(n_groups * wg_size); + auto lws = sycl::range<1>(wg_size); + auto gws = sycl::range<1>(n_groups * 
wg_size); - auto ndRange = sycl::nd_range<1>(gws, lws); + auto ndRange = sycl::nd_range<1>(gws, lws); - slmT slm_iscan_tmp(lws, cgh); + slmT slm_iscan_tmp(lws, cgh); - using KernelName = inclusive_scan_rec_local_scan_krn< - inputT, outputT, n_wi, IndexerT, decltype(transformer)>; + using KernelName = + inclusive_scan_iter_local_scan_krn; - cgh.parallel_for(ndRange, [=, slm_iscan_tmp = std::move( - slm_iscan_tmp)]( - sycl::nd_item<1> it) { - auto chunk_gid = it.get_global_id(0); - auto lid = it.get_local_id(0); + cgh.parallel_for(ndRange, [=, slm_iscan_tmp = + std::move(slm_iscan_tmp)]( + sycl::nd_item<1> it) { + auto chunk_gid = it.get_global_id(0); + auto lid = it.get_local_id(0); - std::array local_isum; + std::array local_isum; - size_t i = chunk_gid * n_wi; - for (size_t m_wi = 0; m_wi < n_wi; ++m_wi) { - constexpr outputT out_zero(0); + size_t i = chunk_gid * n_wi; + +#pragma unroll + for (nwiT m_wi = 0; m_wi < n_wi; ++m_wi) { + constexpr outputT out_zero(0); - local_isum[m_wi] = - (i + m_wi < n_elems) - ? transformer(input[indexer(s0 + s1 * (i + m_wi))]) - : out_zero; - } + local_isum[m_wi] = + (i + m_wi < n_elems) + ? transformer(input[indexer(s0 + s1 * (i + m_wi))]) + : out_zero; + } #pragma unroll - for (size_t m_wi = 1; m_wi < n_wi; ++m_wi) { - local_isum[m_wi] += local_isum[m_wi - 1]; - } - // local_isum is now result of - // inclusive scan of locally stored inputs + for (nwiT m_wi = 1; m_wi < n_wi; ++m_wi) { + local_isum[m_wi] += local_isum[m_wi - 1]; + } + // local_isum is now result of + // inclusive scan of locally stored inputs - size_t wg_iscan_val = sycl::inclusive_scan_over_group( - it.get_group(), local_isum.back(), sycl::plus(), - size_t(0)); + outputT wg_iscan_val = sycl::inclusive_scan_over_group( + it.get_group(), local_isum.back(), sycl::plus(), + outputT(0)); - slm_iscan_tmp[(lid + 1) % wg_size] = wg_iscan_val; - it.barrier(sycl::access::fence_space::local_space); - size_t addand = (lid == 0) ? 0 : slm_iscan_tmp[lid]; + slm_iscan_tmp[(lid + 1) % wg_size] = wg_iscan_val; + it.barrier(sycl::access::fence_space::local_space); + outputT addand = (lid == 0) ? 
outputT(0) : slm_iscan_tmp[lid]; #pragma unroll - for (size_t m_wi = 0; m_wi < n_wi; ++m_wi) { - local_isum[m_wi] += addand; - } - - for (size_t m_wi = 0; m_wi < n_wi && i + m_wi < n_elems; ++m_wi) - { - output[i + m_wi] = local_isum[m_wi]; - } - }); + for (nwiT m_wi = 0; m_wi < n_wi; ++m_wi) { + local_isum[m_wi] += addand; + } + + for (nwiT m_wi = 0; (m_wi < n_wi) && (i + m_wi < n_elems); ++m_wi) { + output[i + m_wi] = local_isum[m_wi]; + } }); + }); - sycl::event out_event = inc_scan_phase1_ev; - if (n_groups > 1) { - outputT *temp = sycl::malloc_device(n_groups - 1, exec_q); + return inc_scan_phase1_ev; +} + +namespace +{ +template class stack_t +{ + T *src_; + size_t size_; + T *local_scans_; + +public: + stack_t() : src_{}, size_{}, local_scans_{} {} + stack_t(T *src, size_t sz, T *local_scans) + : src_(src), size_(sz), local_scans_(local_scans) + { + } + ~stack_t(){}; + + T *get_src_ptr() const + { + return src_; + } + + const T *get_src_const_ptr() const + { + return src_; + } + size_t get_size() const + { + return size_; + } + + T *get_local_scans_ptr() const + { + return local_scans_; + } +}; +} // end of anonymous namespace + +/* + * for integer type maskT, + * output[j] = sum( input[s0 + i * s1], 0 <= i <= j) + * for 0 <= j < n_elems + */ +template +sycl::event inclusive_scan_iter(sycl::queue &exec_q, + const size_t wg_size, + const size_t n_elems, + const inputT *input, + outputT *output, + const size_t s0, + const size_t s1, + IndexerT indexer, + TransformerT transformer, + std::vector &host_tasks, + const std::vector &depends = {}) +{ + size_t n_groups; + sycl::event inc_scan_phase1_ev = + inclusive_scan_base_step( + exec_q, wg_size, n_elems, input, output, s0, s1, indexer, + transformer, n_groups, depends); + + sycl::event dependent_event = inc_scan_phase1_ev; + if (n_groups > 1) { auto chunk_size = wg_size * n_wi; + // how much of temporary allocation do we need + size_t n_groups_ = n_groups; + size_t temp_size = 0; + while (n_groups_ > 1) { + const auto this_size = (n_groups_ - 1); + temp_size += this_size; + n_groups_ = ceiling_quotient(this_size, chunk_size); + } + + // allocate + outputT *temp = sycl::malloc_device(temp_size, exec_q); + + if (!temp) { + throw std::bad_alloc(); + } + + std::vector> stack{}; + + // inclusive scans over blocks + n_groups_ = n_groups; + outputT *src = output; + outputT *local_scans = temp; + NoOpIndexer _no_op_indexer{}; - NoOpTransformer _no_op_transformer{}; - auto e2 = inclusive_scan_rec( - exec_q, n_groups - 1, wg_size, output, temp, chunk_size - 1, - chunk_size, _no_op_indexer, _no_op_transformer, - {inc_scan_phase1_ev}); - - // output[ chunk_size * (i + 1) + j] += temp[i] - auto e3 = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(e2); - cgh.parallel_for>( - {n_elems}, [=](auto wiid) - { - auto gid = wiid[0]; - auto i = (gid / chunk_size); - output[gid] += (i > 0) ? 
temp[i - 1] : 0; + using NoOpTransformerT = NoOpTransformer; + NoOpTransformerT _no_op_transformer{}; + size_t size_to_update = n_elems; + while (n_groups_ > 1) { + + size_t src_size = n_groups_ - 1; + dependent_event = + inclusive_scan_base_step( + exec_q, wg_size, src_size, src, local_scans, chunk_size - 1, + chunk_size, _no_op_indexer, _no_op_transformer, + n_groups_, // n_groups_ is modified in place + {dependent_event}); + stack.push_back({src, size_to_update, local_scans}); + src = local_scans; + local_scans += src_size; + size_to_update = src_size; + } + + for (size_t reverse_stack_id = 0; reverse_stack_id < stack.size(); + ++reverse_stack_id) + { + auto stack_id = stack.size() - 1 - reverse_stack_id; + + auto stack_elem = stack[stack_id]; + outputT *src = stack_elem.get_src_ptr(); + size_t src_size = stack_elem.get_size(); + outputT *local_scans = stack_elem.get_local_scans_ptr(); + + // output[ chunk_size * (i + 1) + j] += temp[i] + dependent_event = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(dependent_event); + + using UpdateKernelName = + class inclusive_scan_iter_chunk_update_krn< + inputT, outputT, n_wi, IndexerT, TransformerT, + NoOpIndexer, NoOpTransformerT>; + + cgh.parallel_for( + {src_size}, [chunk_size, src, local_scans](auto wiid) { + auto gid = wiid[0]; + auto i = (gid / chunk_size); + src[gid] += (i > 0) ? local_scans[i - 1] : outputT(0); + }); }); - }); + } sycl::event e4 = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(e3); + cgh.depends_on(dependent_event); const auto &ctx = exec_q.get_context(); cgh.host_task([ctx, temp]() { sycl::free(temp, ctx); }); }); - - out_event = std::move(e4); + host_tasks.push_back(e4); } - return out_event; + return dependent_event; } typedef size_t (*accumulate_contig_impl_fn_ptr_t)( @@ -221,29 +338,44 @@ typedef size_t (*accumulate_contig_impl_fn_ptr_t)( size_t, const char *, char *, - std::vector const &); + std::vector &, + const std::vector &); template size_t accumulate_contig_impl(sycl::queue &q, size_t n_elems, const char *mask, char *cumsum, - std::vector const &depends = {}) + std::vector &host_tasks, + const std::vector &depends = {}) { - constexpr int n_wi = 8; const maskT *mask_data_ptr = reinterpret_cast(mask); cumsumT *cumsum_data_ptr = reinterpret_cast(cumsum); - size_t wg_size = 128; NoOpIndexer flat_indexer{}; transformerT non_zero_indicator{}; - const sycl::event &comp_ev = - inclusive_scan_rec( - q, n_elems, wg_size, mask_data_ptr, cumsum_data_ptr, 0, 1, - flat_indexer, non_zero_indicator, depends); - + sycl::event comp_ev; + + const sycl::device &dev = q.get_device(); + if (dev.has(sycl::aspect::cpu)) { + constexpr nwiT n_wi_for_cpu = 8; + size_t wg_size = 256; + comp_ev = inclusive_scan_iter( + q, wg_size, n_elems, mask_data_ptr, cumsum_data_ptr, 0, 1, + flat_indexer, non_zero_indicator, host_tasks, depends); + } + else { + constexpr nwiT n_wi_for_gpu = 4; + size_t wg_size = 256; + comp_ev = inclusive_scan_iter( + q, wg_size, n_elems, mask_data_ptr, cumsum_data_ptr, 0, 1, + flat_indexer, non_zero_indicator, host_tasks, depends); + } cumsumT *last_elem = cumsum_data_ptr + (n_elems - 1); cumsumT *last_elem_host_usm = sycl::malloc_host(1, q); @@ -251,8 +383,10 @@ size_t accumulate_contig_impl(sycl::queue &q, if (last_elem_host_usm == nullptr) { throw std::bad_alloc(); } - sycl::event copy_e = - q.copy(last_elem, last_elem_host_usm, 1, {comp_ev}); + sycl::event copy_e = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(comp_ev); + cgh.copy(last_elem, last_elem_host_usm, 1); + }); 
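+    // Wait for the device-to-host copy and read the last element of the
+    // cumulative sum, i.e. the total accumulated value returned to the
+    // caller.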
copy_e.wait(); size_t return_val = static_cast(*last_elem_host_usm); sycl::free(last_elem_host_usm, q); @@ -305,7 +439,8 @@ typedef size_t (*accumulate_strided_impl_fn_ptr_t)( int, const py::ssize_t *, char *, - std::vector const &); + std::vector &, + const std::vector &); template size_t accumulate_strided_impl(sycl::queue &q, @@ -314,21 +449,35 @@ size_t accumulate_strided_impl(sycl::queue &q, int nd, const py::ssize_t *shape_strides, char *cumsum, - std::vector const &depends = {}) + std::vector &host_tasks, + const std::vector &depends = {}) { - constexpr int n_wi = 8; const maskT *mask_data_ptr = reinterpret_cast(mask); cumsumT *cumsum_data_ptr = reinterpret_cast(cumsum); - size_t wg_size = 128; StridedIndexer strided_indexer{nd, 0, shape_strides}; transformerT non_zero_indicator{}; - const sycl::event &comp_ev = - inclusive_scan_rec( - q, n_elems, wg_size, mask_data_ptr, cumsum_data_ptr, 0, 1, - strided_indexer, non_zero_indicator, depends); + const sycl::device &dev = q.get_device(); + sycl::event comp_ev; + if (dev.has(sycl::aspect::cpu)) { + constexpr nwiT n_wi_for_cpu = 8; + size_t wg_size = 256; + comp_ev = inclusive_scan_iter( + q, wg_size, n_elems, mask_data_ptr, cumsum_data_ptr, 0, 1, + strided_indexer, non_zero_indicator, host_tasks, depends); + } + else { + constexpr nwiT n_wi_for_gpu = 4; + size_t wg_size = 256; + comp_ev = inclusive_scan_iter( + q, wg_size, n_elems, mask_data_ptr, cumsum_data_ptr, 0, 1, + strided_indexer, non_zero_indicator, host_tasks, depends); + } cumsumT *last_elem = cumsum_data_ptr + (n_elems - 1); @@ -337,8 +486,10 @@ size_t accumulate_strided_impl(sycl::queue &q, if (last_elem_host_usm == nullptr) { throw std::bad_alloc(); } - sycl::event copy_e = - q.copy(last_elem, last_elem_host_usm, 1, {comp_ev}); + sycl::event copy_e = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(comp_ev); + cgh.copy(last_elem, last_elem_host_usm, 1); + }); copy_e.wait(); size_t return_val = static_cast(*last_elem_host_usm); sycl::free(last_elem_host_usm, q); diff --git a/dpctl/tensor/libtensor/include/kernels/boolean_reductions.hpp b/dpctl/tensor/libtensor/include/kernels/boolean_reductions.hpp index 61fb0f6ba0..877680c8bf 100644 --- a/dpctl/tensor/libtensor/include/kernels/boolean_reductions.hpp +++ b/dpctl/tensor/libtensor/include/kernels/boolean_reductions.hpp @@ -315,8 +315,10 @@ boolean_reduction_axis1_contig_impl(sycl::queue &exec_q, }); } else { - sycl::event init_ev = exec_q.fill(res_tp, resTy(identity_val), - iter_nelems, depends); + sycl::event init_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.fill(res_tp, resTy(identity_val), iter_nelems); + }); red_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(init_ev); @@ -484,8 +486,10 @@ boolean_reduction_axis0_contig_impl(sycl::queue &exec_q, size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); { - sycl::event init_ev = exec_q.fill(res_tp, resTy(identity_val), - iter_nelems, depends); + sycl::event init_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.fill(res_tp, resTy(identity_val), iter_nelems); + }); sycl::event red_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(init_ev); diff --git a/dpctl/tensor/libtensor/include/kernels/sorting.hpp b/dpctl/tensor/libtensor/include/kernels/sorting.hpp new file mode 100644 index 0000000000..e7a024259e --- /dev/null +++ b/dpctl/tensor/libtensor/include/kernels/sorting.hpp @@ -0,0 +1,913 @@ +//=== sorting.hpp - Implementation of sorting kernels ---*-C++-*--/===// +// +// Data 
Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for tensor sort/argsort operations. +//===----------------------------------------------------------------------===// + +#pragma once + +#include "pybind11/pybind11.h" + +#include +#include +#include +#include +#include +#include + +namespace dpctl +{ +namespace tensor +{ +namespace kernels +{ + +namespace sort_detail +{ + +template T quotient_ceil(T n, T m) +{ + return (n + m - 1) / m; +} + +template +std::size_t lower_bound_impl(const Acc acc, + std::size_t first, + std::size_t last, + const Value &value, + Compare comp) +{ + std::size_t n = last - first; + std::size_t cur = n; + std::size_t it; + while (n > 0) { + it = first; + cur = n / 2; + it += cur; + if (comp(acc[it], value)) { + n -= cur + 1, first = ++it; + } + else + n = cur; + } + return first; +} + +template +std::size_t upper_bound_impl(const Acc acc, + const std::size_t first, + const std::size_t last, + const Value &value, + Compare comp) +{ + const auto &op_comp = [comp](auto x, auto y) { return !comp(y, x); }; + return lower_bound_impl(acc, first, last, value, op_comp); +} + +/*! 
@brief Merge two contiguous sorted segments */ +template +void merge_impl(const std::size_t offset, + const InAcc in_acc, + OutAcc out_acc, + const std::size_t start_1, + const std::size_t end_1, + const std::size_t end_2, + const std::size_t start_out, + Compare comp, + const std::size_t chunk) +{ + const std::size_t start_2 = end_1; + // Borders of the sequences to merge within this call + const std::size_t local_start_1 = sycl::min(offset + start_1, end_1); + const std::size_t local_end_1 = sycl::min(local_start_1 + chunk, end_1); + const std::size_t local_start_2 = sycl::min(offset + start_2, end_2); + const std::size_t local_end_2 = sycl::min(local_start_2 + chunk, end_2); + + const std::size_t local_size_1 = local_end_1 - local_start_1; + const std::size_t local_size_2 = local_end_2 - local_start_2; + + const auto r_item_1 = in_acc[end_1 - 1]; + const auto l_item_2 = in_acc[start_2]; + + // Copy if the sequences are sorted with respect to each other or merge + // otherwise + if (!comp(l_item_2, r_item_1)) { + const std::size_t out_shift_1 = start_out + local_start_1 - start_1; + const std::size_t out_shift_2 = + start_out + end_1 - start_1 + local_start_2 - start_2; + + for (std::size_t i = 0; i < local_size_1; ++i) { + out_acc[out_shift_1 + i] = in_acc[local_start_1 + i]; + } + for (std::size_t i = 0; i < local_size_2; ++i) { + out_acc[out_shift_2 + i] = in_acc[local_start_2 + i]; + } + } + else if (comp(r_item_1, l_item_2)) { + const std::size_t out_shift_1 = + start_out + end_2 - start_2 + local_start_1 - start_1; + const std::size_t out_shift_2 = start_out + local_start_2 - start_2; + for (std::size_t i = 0; i < local_size_1; ++i) { + out_acc[out_shift_1 + i] = in_acc[local_start_1 + i]; + } + for (std::size_t i = 0; i < local_size_2; ++i) { + out_acc[out_shift_2 + i] = in_acc[local_start_2 + i]; + } + } + // Perform merging + else { + + // Process 1st sequence + if (local_start_1 < local_end_1) { + // Reduce the range for searching within the 2nd sequence and handle + // bound items find left border in 2nd sequence + const auto local_l_item_1 = in_acc[local_start_1]; + std::size_t l_search_bound_2 = + lower_bound_impl(in_acc, start_2, end_2, local_l_item_1, comp); + const std::size_t l_shift_1 = local_start_1 - start_1; + const std::size_t l_shift_2 = l_search_bound_2 - start_2; + + out_acc[start_out + l_shift_1 + l_shift_2] = local_l_item_1; + + std::size_t r_search_bound_2{}; + // find right border in 2nd sequence + if (local_size_1 > 1) { + const auto local_r_item_1 = in_acc[local_end_1 - 1]; + r_search_bound_2 = lower_bound_impl( + in_acc, l_search_bound_2, end_2, local_r_item_1, comp); + const auto r_shift_1 = local_end_1 - 1 - start_1; + const auto r_shift_2 = r_search_bound_2 - start_2; + + out_acc[start_out + r_shift_1 + r_shift_2] = local_r_item_1; + } + + // Handle intermediate items + if (r_search_bound_2 == l_search_bound_2) { + const std::size_t shift_2 = l_search_bound_2 - start_2; + for (std::size_t idx = local_start_1 + 1; idx < local_end_1 - 1; + ++idx) { + const auto intermediate_item_1 = in_acc[idx]; + const std::size_t shift_1 = idx - start_1; + out_acc[start_out + shift_1 + shift_2] = + intermediate_item_1; + } + } + else { + for (std::size_t idx = local_start_1 + 1; idx < local_end_1 - 1; + ++idx) { + const auto intermediate_item_1 = in_acc[idx]; + // we shouldn't seek in whole 2nd sequence. 
Just for the + // part where the 1st sequence should be + l_search_bound_2 = lower_bound_impl( + in_acc, l_search_bound_2, r_search_bound_2, + intermediate_item_1, comp); + const std::size_t shift_1 = idx - start_1; + const std::size_t shift_2 = l_search_bound_2 - start_2; + + out_acc[start_out + shift_1 + shift_2] = + intermediate_item_1; + } + } + } + // Process 2nd sequence + if (local_start_2 < local_end_2) { + // Reduce the range for searching within the 1st sequence and handle + // bound items find left border in 1st sequence + const auto local_l_item_2 = in_acc[local_start_2]; + std::size_t l_search_bound_1 = + upper_bound_impl(in_acc, start_1, end_1, local_l_item_2, comp); + const std::size_t l_shift_1 = l_search_bound_1 - start_1; + const std::size_t l_shift_2 = local_start_2 - start_2; + + out_acc[start_out + l_shift_1 + l_shift_2] = local_l_item_2; + + std::size_t r_search_bound_1{}; + // find right border in 1st sequence + if (local_size_2 > 1) { + const auto local_r_item_2 = in_acc[local_end_2 - 1]; + r_search_bound_1 = upper_bound_impl( + in_acc, l_search_bound_1, end_1, local_r_item_2, comp); + const std::size_t r_shift_1 = r_search_bound_1 - start_1; + const std::size_t r_shift_2 = local_end_2 - 1 - start_2; + + out_acc[start_out + r_shift_1 + r_shift_2] = local_r_item_2; + } + + // Handle intermediate items + if (l_search_bound_1 == r_search_bound_1) { + const std::size_t shift_1 = l_search_bound_1 - start_1; + for (auto idx = local_start_2 + 1; idx < local_end_2 - 1; ++idx) + { + const auto intermediate_item_2 = in_acc[idx]; + const std::size_t shift_2 = idx - start_2; + out_acc[start_out + shift_1 + shift_2] = + intermediate_item_2; + } + } + else { + for (auto idx = local_start_2 + 1; idx < local_end_2 - 1; ++idx) + { + const auto intermediate_item_2 = in_acc[idx]; + // we shouldn't seek in whole 1st sequence. 
Just for the + // part where the 2nd sequence should be + l_search_bound_1 = upper_bound_impl( + in_acc, l_search_bound_1, r_search_bound_1, + intermediate_item_2, comp); + const std::size_t shift_1 = l_search_bound_1 - start_1; + const std::size_t shift_2 = idx - start_2; + + out_acc[start_out + shift_1 + shift_2] = + intermediate_item_2; + } + } + } + } +} + +namespace +{ +template +void insertion_sort_impl(Iter first, + const size_t begin, + const size_t end, + Compare comp) +{ + for (size_t i = begin + 1; i < end; ++i) { + const auto val_i = first[i]; + size_t j = i - 1; + while ((j + 1 > begin) && (comp(val_i, first[j]))) { + first[j + 1] = first[j]; + --j; + } + if (j + 1 < i) { + first[j + 1] = val_i; + } + } +} + +template +void bubble_sort_impl(Iter first, + const size_t begin, + const size_t end, + Compare comp) +{ + if (begin < end) { + for (size_t i = begin; i < end; ++i) { + // Handle intermediate items + for (size_t idx = i + 1; idx < end; ++idx) { + if (comp(first[idx], first[i])) { + std::swap(first[i], first[idx]); + } + } + } + } +} + +template +void leaf_sort_impl(Iter first, + const size_t begin, + const size_t end, + Compare comp) +{ + return insertion_sort_impl(first, begin, end, comp); +} +} // namespace + +template struct GetValueType +{ + using value_type = typename std::iterator_traits::value_type; +}; + +template +struct GetValueType> +{ + using value_type = ElementType; +}; + +template +struct GetValueType< + sycl::accessor> +{ + using value_type = ElementType; +}; + +template +struct GetValueType> +{ + using value_type = ElementType; +}; + +template struct GetReadOnlyAccess +{ + Iter operator()(Iter it, sycl::handler &) + { + return it; + } +}; + +template +struct GetReadOnlyAccess> +{ + auto operator()(sycl::buffer buf, + sycl::handler &cgh) + { + sycl::accessor acc(buf, cgh, sycl::read_only); + return acc; + } +}; + +template struct GetWriteDiscardAccess +{ + Iter operator()(Iter it, sycl::handler &) + { + return it; + } +}; + +template +struct GetWriteDiscardAccess> +{ + auto operator()(sycl::buffer buf, + sycl::handler &cgh) + { + sycl::accessor acc(buf, cgh, sycl::write_only, sycl::no_init); + return acc; + } +}; + +template struct GetReadWriteAccess +{ + Iter operator()(Iter it, sycl::handler &) + { + return it; + } +}; + +template +struct GetReadWriteAccess> +{ + auto operator()(sycl::buffer buf, + sycl::handler &cgh) + { + sycl::accessor acc(buf, cgh, sycl::read_write); + return acc; + } +}; + +template +class sort_base_step_contig_krn; + +template +sycl::event +sort_base_step_contig_impl(sycl::queue &q, + const size_t iter_nelems, + const size_t sort_nelems, + const InpAcc input, + OutAcc output, + const Comp &comp, + const size_t conseq_nelems_sorted, + const std::vector &depends = {}) +{ + + using inpT = typename GetValueType::value_type; + using outT = typename GetValueType::value_type; + using KernelName = sort_base_step_contig_krn; + + const size_t n_segments = + quotient_ceil(sort_nelems, conseq_nelems_sorted); + + sycl::event base_sort = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + const sycl::range<1> gRange{iter_nelems * n_segments}; + + auto input_acc = GetReadOnlyAccess{}(input, cgh); + auto output_acc = GetWriteDiscardAccess{}(output, cgh); + + cgh.parallel_for(gRange, [=](sycl::id<1> id) { + const size_t iter_id = id[0] / n_segments; + const size_t segment_id = id[0] - iter_id * n_segments; + + const size_t iter_offset = iter_id * sort_nelems; + const size_t beg_id = + iter_offset + segment_id * conseq_nelems_sorted; + 
const size_t end_id = + iter_offset + + std::min((segment_id + 1) * conseq_nelems_sorted, + sort_nelems); + for (size_t i = beg_id; i < end_id; ++i) { + output_acc[i] = input_acc[i]; + } + + leaf_sort_impl(output_acc, beg_id, end_id, comp); + }); + }); + + return base_sort; +} + +template +class sort_over_work_group_contig_krn; + +template +sycl::event +sort_over_work_group_contig_impl(sycl::queue &q, + size_t iter_nelems, + size_t sort_nelems, + const InpAcc input, + OutAcc output, + const Comp &comp, + size_t &nelems_wg_sorts, + const std::vector &depends = {}) +{ + using inpT = typename GetValueType::value_type; + using T = typename GetValueType::value_type; + using KernelName = sort_over_work_group_contig_krn; + + const auto &kernel_id = sycl::get_kernel_id(); + + auto const &ctx = q.get_context(); + auto const &dev = q.get_device(); + auto kb = sycl::get_kernel_bundle( + ctx, {dev}, {kernel_id}); + + auto krn = kb.template get_kernel(kernel_id); + + const std::uint32_t max_sg_size = krn.template get_info< + sycl::info::kernel_device_specific::max_sub_group_size>(dev); + const std::uint64_t device_local_memory_size = + dev.get_info(); + + // leave 512 bytes of local memory for RT + const std::uint64_t safety_margin = 512; + + const std::uint64_t nelems_per_slm = + (device_local_memory_size - safety_margin) / (2 * sizeof(T)); + + constexpr std::uint32_t sub_groups_per_work_group = 4; + const std::uint32_t elems_per_wi = dev.has(sycl::aspect::cpu) ? 8 : 2; + + const size_t lws = sub_groups_per_work_group * max_sg_size; + + nelems_wg_sorts = elems_per_wi * lws; + + if (nelems_wg_sorts > nelems_per_slm) { + nelems_wg_sorts = (q.get_device().has(sycl::aspect::cpu) ? 16 : 4); + + return sort_base_step_contig_impl( + q, iter_nelems, sort_nelems, input, output, comp, nelems_wg_sorts, + depends); + } + + // This assumption permits doing away with using a loop + assert(nelems_wg_sorts % lws == 0); + + const size_t n_segments = + quotient_ceil(sort_nelems, nelems_wg_sorts); + + sycl::event base_sort_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + cgh.use_kernel_bundle(kb); + + sycl::range<1> global_range{iter_nelems * n_segments * lws}; + sycl::range<1> local_range{lws}; + + sycl::range<1> slm_range{nelems_wg_sorts}; + sycl::local_accessor work_space(slm_range, cgh); + sycl::local_accessor scratch_space(slm_range, cgh); + + auto input_acc = GetReadOnlyAccess{}(input, cgh); + auto output_acc = GetWriteDiscardAccess{}(output, cgh); + + sycl::nd_range<1> ndRange(global_range, local_range); + + cgh.parallel_for(ndRange, [=](sycl::nd_item<1> it) { + const size_t group_id = it.get_group_linear_id(); + const size_t iter_id = group_id / n_segments; + const size_t segment_id = group_id - iter_id * n_segments; + const size_t lid = it.get_local_linear_id(); + + const size_t segment_start_idx = segment_id * nelems_wg_sorts; + const size_t segment_end_idx = std::min( + segment_start_idx + nelems_wg_sorts, sort_nelems); + const size_t wg_chunk_size = segment_end_idx - segment_start_idx; + + // load input into SLM + for (size_t array_id = segment_start_idx + lid; + array_id < segment_end_idx; array_id += lws) + { + T v = (array_id < sort_nelems) + ? 
input_acc[iter_id * sort_nelems + array_id] + : T{}; + work_space[array_id - segment_start_idx] = v; + } + sycl::group_barrier(it.get_group()); + + const size_t chunk = quotient_ceil(nelems_wg_sorts, lws); + + const size_t chunk_start_idx = lid * chunk; + const size_t chunk_end_idx = + sycl::min(chunk_start_idx + chunk, wg_chunk_size); + + leaf_sort_impl(work_space, chunk_start_idx, chunk_end_idx, comp); + + sycl::group_barrier(it.get_group()); + + bool data_in_temp = false; + size_t sorted_size = 1; + while (true) { + const size_t nelems_sorted_so_far = sorted_size * chunk; + if (nelems_sorted_so_far < wg_chunk_size) { + const size_t q = (lid / sorted_size); + const size_t start_1 = + sycl::min(2 * nelems_sorted_so_far * q, wg_chunk_size); + const size_t end_1 = sycl::min( + start_1 + nelems_sorted_so_far, wg_chunk_size); + const size_t end_2 = + sycl::min(end_1 + nelems_sorted_so_far, wg_chunk_size); + const size_t offset = chunk * (lid - q * sorted_size); + + if (data_in_temp) { + merge_impl(offset, scratch_space, work_space, start_1, + end_1, end_2, start_1, comp, chunk); + } + else { + merge_impl(offset, work_space, scratch_space, start_1, + end_1, end_2, start_1, comp, chunk); + } + sycl::group_barrier(it.get_group()); + + data_in_temp = !data_in_temp; + sorted_size *= 2; + } + else + break; + } + + const auto &out_src = (data_in_temp) ? scratch_space : work_space; + for (size_t array_id = segment_start_idx + lid; + array_id < segment_end_idx; array_id += lws) + { + if (array_id < sort_nelems) { + output_acc[iter_id * sort_nelems + array_id] = + out_src[array_id - segment_start_idx]; + } + } + }); + }); + + return base_sort_ev; +} + +class vacuous_krn; + +inline sycl::event tie_events(sycl::queue &q, + const std::vector depends) +{ + if (depends.empty()) + return sycl::event(); + if (depends.size() == 1) + return depends[0]; + + sycl::event e = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + using KernelName = vacuous_krn; + cgh.single_task([]() {}); + }); + + return e; +} + +template class merge_adjacent_blocks_to_temp_krn; + +template class merge_adjacent_blocks_from_temp_krn; + +template +sycl::event +merge_sorted_block_contig_impl(sycl::queue &q, + size_t iter_nelems, + size_t sort_nelems, + Acc output, + const Comp comp, + size_t sorted_block_size, + const std::vector &depends = {}) +{ + + if (sorted_block_size >= sort_nelems) + return tie_events(q, depends); + + // experimentally determined value + // size of segments worked upon by each work-item during merging + const sycl::device &dev = q.get_device(); + const size_t segment_size = (dev.has(sycl::aspect::cpu)) ? 32 : 4; + + const size_t chunk_size = + (sorted_block_size < segment_size) ? 
sorted_block_size : segment_size; + + assert(sorted_block_size % chunk_size == 0); + + using T = typename GetValueType::value_type; + + sycl::buffer temp_buf(sycl::range<1>{iter_nelems * sort_nelems}); + // T *allocated_mem = sycl::malloc_device(iter_nelems * sort_nelems, q); + + bool needs_copy = true; + bool used_depends = false; + + sycl::event dep_ev; + size_t chunks_merged = sorted_block_size / chunk_size; + + assert(!(chunks_merged & (chunks_merged - 1))); + + using ToTempKernelName = class merge_adjacent_blocks_to_temp_krn; + using FromTempKernelName = + class merge_adjacent_blocks_from_temp_krn; + + while (chunks_merged * chunk_size < sort_nelems) { + sycl::event local_dep = dep_ev; + + sycl::event merge_ev = q.submit([&](sycl::handler &cgh) { + if (used_depends) { + cgh.depends_on(local_dep); + } + else { + cgh.depends_on(depends); + used_depends = true; + } + + const size_t n_chunks = + quotient_ceil(sort_nelems, chunk_size); + + if (needs_copy) { + sycl::accessor temp_acc{temp_buf, cgh, sycl::write_only, + sycl::no_init}; + auto output_acc = GetReadOnlyAccess{}(output, cgh); + cgh.parallel_for( + {iter_nelems * n_chunks}, [=](sycl::id<1> wid) { + auto flat_idx = wid[0]; + auto iter_idx = flat_idx / n_chunks; + auto idx = flat_idx - n_chunks * iter_idx; + + const std::size_t idx_mult = + (idx / chunks_merged) * chunks_merged; + const std::size_t idx_rem = (idx - idx_mult); + const std::size_t start_1 = + sycl::min(2 * idx_mult * chunk_size, sort_nelems); + const std::size_t end_1 = sycl::min( + start_1 + chunks_merged * chunk_size, sort_nelems); + const std::size_t end_2 = sycl::min( + end_1 + chunks_merged * chunk_size, sort_nelems); + const std::size_t offset = chunk_size * idx_rem; + + const std::size_t iter_offset = iter_idx * sort_nelems; + + merge_impl(offset, output_acc, temp_acc, + iter_offset + start_1, iter_offset + end_1, + iter_offset + end_2, iter_offset + start_1, + comp, chunk_size); + }); + } + else { + sycl::accessor temp_acc{temp_buf, cgh, sycl::read_only}; + auto output_acc = GetWriteDiscardAccess{}(output, cgh); + cgh.parallel_for( + {iter_nelems * n_chunks}, [=](sycl::id<1> wid) { + auto flat_idx = wid[0]; + auto iter_idx = flat_idx / n_chunks; + auto idx = flat_idx - n_chunks * iter_idx; + + const std::size_t idx_mult = + (idx / chunks_merged) * chunks_merged; + const std::size_t idx_rem = (idx - idx_mult); + const std::size_t start_1 = + sycl::min(2 * idx_mult * chunk_size, sort_nelems); + const std::size_t end_1 = sycl::min( + start_1 + chunks_merged * chunk_size, sort_nelems); + const std::size_t end_2 = sycl::min( + end_1 + chunks_merged * chunk_size, sort_nelems); + const std::size_t offset = chunk_size * idx_rem; + + const std::size_t iter_offset = iter_idx * sort_nelems; + + merge_impl(offset, temp_acc, output_acc, + iter_offset + start_1, iter_offset + end_1, + iter_offset + end_2, iter_offset + start_1, + comp, chunk_size); + }); + } + }); + + chunks_merged *= 2; + dep_ev = merge_ev; + + if (chunks_merged * chunk_size < sort_nelems) { + needs_copy = !needs_copy; + } + } + + if (needs_copy) { + sycl::event copy_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(dep_ev); + + sycl::accessor temp_acc{temp_buf, cgh, sycl::read_only}; + auto output_acc = GetWriteDiscardAccess{}(output, cgh); + + cgh.copy(temp_acc, output_acc); + }); + dep_ev = copy_ev; + } + + return dep_ev; +} + +} // end of namespace sort_detail + +typedef sycl::event (*sort_contig_fn_ptr_t)(sycl::queue &, + size_t, + size_t, + const char *, + char *, + py::ssize_t, + py::ssize_t, 
+ py::ssize_t, + py::ssize_t, + const std::vector &); + +template > +sycl::event stable_sort_axis1_contig_impl( + sycl::queue &exec_q, + size_t iter_nelems, // number of sub-arrays to sort (num. of rows in a + // matrix when sorting over rows) + size_t sort_nelems, // size of each array to sort (length of rows, i.e. + // number of columns) + const char *arg_cp, + char *res_cp, + py::ssize_t iter_arg_offset, + py::ssize_t iter_res_offset, + py::ssize_t sort_arg_offset, + py::ssize_t sort_res_offset, + const std::vector &depends) +{ + const argTy *arg_tp = reinterpret_cast(arg_cp) + + iter_arg_offset + sort_arg_offset; + argTy *res_tp = + reinterpret_cast(res_cp) + iter_res_offset + sort_res_offset; + + auto comp = Comp{}; + + constexpr size_t sequential_sorting_threshold = 64; + + if (sort_nelems < sequential_sorting_threshold) { + // equal work-item sorts entire row + sycl::event sequential_sorting_ev = + sort_detail::sort_base_step_contig_impl( + exec_q, iter_nelems, sort_nelems, arg_tp, res_tp, comp, + sort_nelems, depends); + + return sequential_sorting_ev; + } + else { + size_t sorted_block_size{}; + + // Sort segments of the array + sycl::event base_sort_ev = + sort_detail::sort_over_work_group_contig_impl( + exec_q, iter_nelems, sort_nelems, arg_tp, res_tp, comp, + sorted_block_size, // modified in place with size of sorted + // block size + depends); + + // Merge segments in parallel until all elements are sorted + sycl::event merges_ev = + sort_detail::merge_sorted_block_contig_impl( + exec_q, iter_nelems, sort_nelems, res_tp, comp, + sorted_block_size, {base_sort_ev}); + + return merges_ev; + } +} + +template +class populate_indexed_data_krn; + +template class index_write_out_krn; + +template struct TupleComp +{ + bool operator()(const pairT &p1, const pairT &p2) const + { + const ValueComp value_comp{}; + return value_comp(std::get<0>(p1), std::get<0>(p2)); + } +}; + +template > +sycl::event stable_argsort_axis1_contig_impl( + sycl::queue &exec_q, + size_t iter_nelems, // number of sub-arrays to sort (num. of rows in a + // matrix when sorting over rows) + size_t sort_nelems, // size of each array to sort (length of rows, i.e. + // number of columns) + const char *arg_cp, + char *res_cp, + py::ssize_t iter_arg_offset, + py::ssize_t iter_res_offset, + py::ssize_t sort_arg_offset, + py::ssize_t sort_res_offset, + const std::vector &depends) +{ + const argTy *arg_tp = reinterpret_cast(arg_cp) + + iter_arg_offset + sort_arg_offset; + IndexTy *res_tp = + reinterpret_cast(res_cp) + iter_res_offset + sort_res_offset; + + using ValueIndexT = std::pair; + const TupleComp tuple_comp{}; + + static constexpr size_t determine_automatically = 0; + size_t sorted_block_size = + (sort_nelems >= 512) ? 
512 : determine_automatically; + + sycl::buffer indexed_data( + sycl::range<1>(iter_nelems * sort_nelems)); + sycl::buffer temp_buf( + sycl::range<1>(iter_nelems * sort_nelems)); + + sycl::event populate_indexed_data_ev = + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + sycl::accessor acc(indexed_data, cgh, sycl::write_only, + sycl::no_init); + + auto const &range = indexed_data.get_range(); + + using KernelName = + populate_indexed_data_krn; + + cgh.parallel_for(range, [=](sycl::id<1> id) { + size_t i = id[0]; + size_t sort_id = i % sort_nelems; + acc[i] = + std::make_pair(arg_tp[i], static_cast(sort_id)); + }); + }); + + // Sort segments of the array + sycl::event base_sort_ev = sort_detail::sort_over_work_group_contig_impl( + exec_q, iter_nelems, sort_nelems, indexed_data, temp_buf, tuple_comp, + sorted_block_size, // modified in place with size of sorted block size + {populate_indexed_data_ev}); + + // Merge segments in parallel until all elements are sorted + sycl::event merges_ev = sort_detail::merge_sorted_block_contig_impl( + exec_q, iter_nelems, sort_nelems, temp_buf, tuple_comp, + sorted_block_size, {base_sort_ev}); + + sycl::event write_out_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(merges_ev); + + auto temp_acc = + sort_detail::GetReadOnlyAccess{}(temp_buf, cgh); + + using KernelName = index_write_out_krn; + + cgh.parallel_for(temp_buf.get_range(), [=](sycl::id<1> id) { + res_tp[id] = std::get<1>(temp_acc[id]); + }); + }); + + return write_out_ev; +} + +} // end of namespace kernels +} // end of namespace tensor +} // end of namespace dpctl diff --git a/dpctl/tensor/libtensor/source/accumulators.cpp b/dpctl/tensor/libtensor/source/accumulators.cpp index 0a2ce69f69..094aa4c284 100644 --- a/dpctl/tensor/libtensor/source/accumulators.cpp +++ b/dpctl/tensor/libtensor/source/accumulators.cpp @@ -150,12 +150,20 @@ size_t py_mask_positions(const dpctl::tensor::usm_ndarray &mask, const bool use_i32 = (cumsum_typeid == int32_typeid); + std::vector host_task_events; + if (mask.is_c_contiguous()) { auto fn = (use_i32) ? mask_positions_contig_i32_dispatch_vector[mask_typeid] : mask_positions_contig_i64_dispatch_vector[mask_typeid]; - return fn(exec_q, mask_size, mask_data, cumsum_data, depends); + size_t total_set = fn(exec_q, mask_size, mask_data, cumsum_data, + host_task_events, depends); + { + py::gil_scoped_release release; + sycl::event::wait(host_task_events); + } + return total_set; } const py::ssize_t *shape = mask.get_shape_raw(); @@ -175,7 +183,6 @@ size_t py_mask_positions(const dpctl::tensor::usm_ndarray &mask, auto strided_fn = (use_i32) ? 
mask_positions_strided_i32_dispatch_vector[mask_typeid] : mask_positions_strided_i64_dispatch_vector[mask_typeid]; - std::vector host_task_events; using dpctl::tensor::offset_utils::device_allocate_and_pack; const auto &ptr_size_event_tuple = device_allocate_and_pack( @@ -189,7 +196,10 @@ size_t py_mask_positions(const dpctl::tensor::usm_ndarray &mask, if (2 * static_cast(nd) != std::get<1>(ptr_size_event_tuple)) { copy_shape_ev.wait(); - sycl::event::wait(host_task_events); + { + py::gil_scoped_release release; + sycl::event::wait(host_task_events); + } sycl::free(shape_strides, exec_q); throw std::runtime_error("Unexpected error"); } @@ -200,10 +210,14 @@ size_t py_mask_positions(const dpctl::tensor::usm_ndarray &mask, dependent_events.insert(dependent_events.end(), depends.begin(), depends.end()); - size_t total_set = strided_fn(exec_q, mask_size, mask_data, nd, - shape_strides, cumsum_data, dependent_events); + size_t total_set = + strided_fn(exec_q, mask_size, mask_data, nd, shape_strides, cumsum_data, + host_task_events, dependent_events); - sycl::event::wait(host_task_events); + { + py::gil_scoped_release release; + sycl::event::wait(host_task_events); + } sycl::free(shape_strides, exec_q); return total_set; @@ -283,6 +297,8 @@ size_t py_cumsum_1d(const dpctl::tensor::usm_ndarray &src, "Cumulative sum array must have int64 data-type."); } + std::vector host_task_events; + if (src.is_c_contiguous()) { auto fn = cumsum_1d_contig_dispatch_vector[src_typeid]; if (fn == nullptr) { @@ -290,7 +306,13 @@ size_t py_cumsum_1d(const dpctl::tensor::usm_ndarray &src, "this cumsum requires integer type, got src_typeid=" + std::to_string(src_typeid)); } - return fn(exec_q, src_size, src_data, cumsum_data, depends); + size_t total = fn(exec_q, src_size, src_data, cumsum_data, + host_task_events, depends); + { + py::gil_scoped_release release; + sycl::event::wait(host_task_events); + } + return total; } const py::ssize_t *shape = src.get_shape_raw(); @@ -313,7 +335,6 @@ size_t py_cumsum_1d(const dpctl::tensor::usm_ndarray &src, "this cumsum requires integer type, got src_typeid=" + std::to_string(src_typeid)); } - std::vector host_task_events; using dpctl::tensor::offset_utils::device_allocate_and_pack; const auto &ptr_size_event_tuple = device_allocate_and_pack( @@ -339,9 +360,12 @@ size_t py_cumsum_1d(const dpctl::tensor::usm_ndarray &src, depends.end()); size_t total = strided_fn(exec_q, src_size, src_data, nd, shape_strides, - cumsum_data, dependent_events); + cumsum_data, host_task_events, dependent_events); - sycl::event::wait(host_task_events); + { + py::gil_scoped_release release; + sycl::event::wait(host_task_events); + } sycl::free(shape_strides, exec_q); return total; diff --git a/dpctl/tensor/libtensor/source/sorting/argsort.cpp b/dpctl/tensor/libtensor/source/sorting/argsort.cpp new file mode 100644 index 0000000000..f23084fe21 --- /dev/null +++ b/dpctl/tensor/libtensor/source/sorting/argsort.cpp @@ -0,0 +1,261 @@ +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. +//===--------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include + +#include "utils/math_utils.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/type_dispatch.hpp" + +#include "argsort.hpp" +#include "kernels/sorting.hpp" +#include "sorting_common.hpp" + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +template +std::pair +py_argsort(const dpctl::tensor::usm_ndarray &src, + const int trailing_dims_to_sort, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends, + const sorting_contig_impl_fnT &stable_sort_contig_fns) +{ + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + if (src_nd != dst_nd) { + throw py::value_error("The input and output arrays must have " + "the same array ranks"); + } + int iteration_nd = src_nd - trailing_dims_to_sort; + if (trailing_dims_to_sort <= 0 || iteration_nd < 0) { + throw py::value_error("Trailing_dim_to_sort must be positive, but no " + "greater than rank of the array being sorted"); + } + + const py::ssize_t *src_shape_ptr = src.get_shape_raw(); + const py::ssize_t *dst_shape_ptr = dst.get_shape_raw(); + + bool same_shapes = true; + size_t iter_nelems(1); + + for (int i = 0; same_shapes && (i < iteration_nd); ++i) { + auto src_shape_i = src_shape_ptr[i]; + same_shapes = same_shapes && (src_shape_i == dst_shape_ptr[i]); + iter_nelems *= static_cast(src_shape_i); + } + + size_t sort_nelems(1); + for (int i = iteration_nd; same_shapes && (i < src_nd); ++i) { + auto src_shape_i = src_shape_ptr[i]; + same_shapes = same_shapes && (src_shape_i == dst_shape_ptr[i]); + sort_nelems *= static_cast(src_shape_i); + } + + if (!same_shapes) { + throw py::value_error( + "Destination shape does not match the input shape"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + if ((iter_nelems == 0) || (sort_nelems == 0)) { + // Nothing to do + return std::make_pair(sycl::event(), sycl::event()); + } + + // check that dst and src do not overlap + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + throw py::value_error("Arrays index overlapping segments of memory"); + } + + // destination must be ample enough to accommodate all elements + { + auto dst_offsets = dst.get_minmax_offsets(); + size_t range = + static_cast(dst_offsets.second - dst_offsets.first); + if (range + 1 < sort_nelems * iter_nelems) { + throw py::value_error( + "Destination array can not accommodate all the " + "elements of source array."); + } + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + const auto &array_types = td_ns::usm_ndarray_types(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + if ((dst_typeid != static_cast(td_ns::typenum_t::INT64)) && + (dst_typeid != static_cast(td_ns::typenum_t::INT32))) + { + throw py::value_error( + "Output index array must have data type int32 or int64"); + } + + // handle special case when both 
reduction and iteration are 1D contiguous + bool is_src_c_contig = src.is_c_contiguous(); + bool is_dst_c_contig = dst.is_c_contiguous(); + + if (is_src_c_contig && is_dst_c_contig) { + using dpctl::tensor::kernels::stable_argsort_axis1_contig_impl; + + static constexpr py::ssize_t zero_offset = py::ssize_t(0); + + auto fn = stable_sort_contig_fns[src_typeid][dst_typeid]; + + if (fn == nullptr) { + throw py::value_error("Not implemented for given index type"); + } + + sycl::event comp_ev = + fn(exec_q, iter_nelems, sort_nelems, src.get_data(), dst.get_data(), + zero_offset, zero_offset, zero_offset, zero_offset, depends); + + sycl::event keep_args_alive_ev = + dpctl::utils::keep_args_alive(exec_q, {src, dst}, {comp_ev}); + + return std::make_pair(keep_args_alive_ev, comp_ev); + } + + return std::make_pair(sycl::event(), sycl::event()); +} + +using dpctl::tensor::kernels::sort_contig_fn_ptr_t; +static sort_contig_fn_ptr_t + ascending_argsort_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; +static sort_contig_fn_ptr_t + descending_argsort_contig_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +template +struct AscendingArgSortContigFactory +{ + fnT get() + { + if constexpr (std::is_same_v || + std::is_same_v) + { + using Comp = typename AscendingSorter::type; + + using dpctl::tensor::kernels::stable_argsort_axis1_contig_impl; + return stable_argsort_axis1_contig_impl; + } + else { + return nullptr; + } + } +}; + +template +struct DescendingArgSortContigFactory +{ + fnT get() + { + if constexpr (std::is_same_v || + std::is_same_v) + { + using Comp = typename DescendingSorter::type; + + using dpctl::tensor::kernels::stable_argsort_axis1_contig_impl; + return stable_argsort_axis1_contig_impl; + } + else { + return nullptr; + } + } +}; + +void init_argsort_dispatch_tables(void) +{ + using dpctl::tensor::kernels::sort_contig_fn_ptr_t; + + td_ns::DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(ascending_argsort_contig_dispatch_table); + + td_ns::DispatchTableBuilder< + sort_contig_fn_ptr_t, DescendingArgSortContigFactory, td_ns::num_types> + dtb2; + dtb2.populate_dispatch_table(descending_argsort_contig_dispatch_table); +} + +void init_argsort_functions(py::module_ m) +{ + dpctl::tensor::py_internal::init_argsort_dispatch_tables(); + + auto py_argsort_ascending = [](const dpctl::tensor::usm_ndarray &src, + const int trailing_dims_to_sort, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) + -> std::pair { + return dpctl::tensor::py_internal::py_argsort( + src, trailing_dims_to_sort, dst, exec_q, depends, + dpctl::tensor::py_internal:: + ascending_argsort_contig_dispatch_table); + }; + m.def("_argsort_ascending", py_argsort_ascending, py::arg("src"), + py::arg("trailing_dims_to_sort"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto py_argsort_descending = [](const dpctl::tensor::usm_ndarray &src, + const int trailing_dims_to_sort, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) + -> std::pair { + return dpctl::tensor::py_internal::py_argsort( + src, trailing_dims_to_sort, dst, exec_q, depends, + dpctl::tensor::py_internal:: + descending_argsort_contig_dispatch_table); + }; + m.def("_argsort_descending", py_argsort_descending, py::arg("src"), + py::arg("trailing_dims_to_sort"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + return; +} + +} // end of namespace py_internal +} // end of namespace tensor +} // end of 
namespace dpctl diff --git a/dpctl/tensor/libtensor/source/sorting/argsort.hpp b/dpctl/tensor/libtensor/source/sorting/argsort.hpp new file mode 100644 index 0000000000..7637f2df4c --- /dev/null +++ b/dpctl/tensor/libtensor/source/sorting/argsort.hpp @@ -0,0 +1,42 @@ +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. +//===--------------------------------------------------------------------===// + +#pragma once + +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_argsort_functions(py::module_); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/sorting/sort.cpp b/dpctl/tensor/libtensor/source/sorting/sort.cpp new file mode 100644 index 0000000000..8a4a710354 --- /dev/null +++ b/dpctl/tensor/libtensor/source/sorting/sort.cpp @@ -0,0 +1,235 @@ +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. 
+//===--------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include + +#include "utils/math_utils.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/sorting.hpp" +#include "sort.hpp" +#include "sorting_common.hpp" + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +template +std::pair +py_sort(const dpctl::tensor::usm_ndarray &src, + const int trailing_dims_to_sort, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends, + const sorting_contig_impl_fnT &stable_sort_contig_fns) +{ + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + if (src_nd != dst_nd) { + throw py::value_error("The input and output arrays must have " + "the same array ranks"); + } + int iteration_nd = src_nd - trailing_dims_to_sort; + if (trailing_dims_to_sort <= 0 || iteration_nd < 0) { + throw py::value_error("Trailing_dim_to_sort must be positive, but no " + "greater than rank of the array being sorted"); + } + + const py::ssize_t *src_shape_ptr = src.get_shape_raw(); + const py::ssize_t *dst_shape_ptr = dst.get_shape_raw(); + + bool same_shapes = true; + size_t iter_nelems(1); + + for (int i = 0; same_shapes && (i < iteration_nd); ++i) { + auto src_shape_i = src_shape_ptr[i]; + same_shapes = same_shapes && (src_shape_i == dst_shape_ptr[i]); + iter_nelems *= static_cast(src_shape_i); + } + + size_t sort_nelems(1); + for (int i = iteration_nd; same_shapes && (i < src_nd); ++i) { + auto src_shape_i = src_shape_ptr[i]; + same_shapes = same_shapes && (src_shape_i == dst_shape_ptr[i]); + sort_nelems *= static_cast(src_shape_i); + } + + if (!same_shapes) { + throw py::value_error( + "Destination shape does not match the input shape"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + if ((iter_nelems == 0) || (sort_nelems == 0)) { + // Nothing to do + return std::make_pair(sycl::event(), sycl::event()); + } + + // check that dst and src do not overlap + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + throw py::value_error("Arrays index overlapping segments of memory"); + } + + // destination must be ample enough to accommodate all elements + { + auto dst_offsets = dst.get_minmax_offsets(); + size_t range = + static_cast(dst_offsets.second - dst_offsets.first); + if (range + 1 < sort_nelems * iter_nelems) { + throw py::value_error( + "Destination array can not accommodate all the " + "elements of source array."); + } + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + const auto &array_types = td_ns::usm_ndarray_types(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + if (src_typeid != dst_typeid) { + throw py::value_error("Both input arrays must have " + "the same value data type"); + } + + // handle special case when both reduction and iteration are 1D contiguous + bool is_src_c_contig = src.is_c_contiguous(); + bool is_dst_c_contig = dst.is_c_contiguous(); + + if (is_src_c_contig && is_dst_c_contig) { + using dpctl::tensor::kernels::stable_sort_axis1_contig_impl; + + static constexpr py::ssize_t zero_offset = py::ssize_t(0); + + auto fn = stable_sort_contig_fns[src_typeid]; + + sycl::event comp_ev = + 
fn(exec_q, iter_nelems, sort_nelems, src.get_data(), dst.get_data(), + zero_offset, zero_offset, zero_offset, zero_offset, depends); + + sycl::event keep_args_alive_ev = + dpctl::utils::keep_args_alive(exec_q, {src, dst}, {comp_ev}); + + return std::make_pair(keep_args_alive_ev, comp_ev); + } + + return std::make_pair(sycl::event(), sycl::event()); +} + +using dpctl::tensor::kernels::sort_contig_fn_ptr_t; +static sort_contig_fn_ptr_t + ascending_sort_contig_dispatch_vector[td_ns::num_types]; +static sort_contig_fn_ptr_t + descending_sort_contig_dispatch_vector[td_ns::num_types]; + +template struct AscendingSortContigFactory +{ + fnT get() + { + using Comp = typename AscendingSorter::type; + + using dpctl::tensor::kernels::stable_sort_axis1_contig_impl; + return stable_sort_axis1_contig_impl; + } +}; + +template struct DescendingSortContigFactory +{ + fnT get() + { + using Comp = typename DescendingSorter::type; + using dpctl::tensor::kernels::stable_sort_axis1_contig_impl; + return stable_sort_axis1_contig_impl; + } +}; + +void init_sort_dispatch_vectors(void) +{ + using dpctl::tensor::kernels::sort_contig_fn_ptr_t; + + td_ns::DispatchVectorBuilder + dtv1; + dtv1.populate_dispatch_vector(ascending_sort_contig_dispatch_vector); + + td_ns::DispatchVectorBuilder + dtv2; + dtv2.populate_dispatch_vector(descending_sort_contig_dispatch_vector); +} + +void init_sort_functions(py::module_ m) +{ + dpctl::tensor::py_internal::init_sort_dispatch_vectors(); + + auto py_sort_ascending = [](const dpctl::tensor::usm_ndarray &src, + const int trailing_dims_to_sort, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) + -> std::pair { + return dpctl::tensor::py_internal::py_sort( + src, trailing_dims_to_sort, dst, exec_q, depends, + dpctl::tensor::py_internal::ascending_sort_contig_dispatch_vector); + }; + m.def("_sort_ascending", py_sort_ascending, py::arg("src"), + py::arg("trailing_dims_to_sort"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto py_sort_descending = [](const dpctl::tensor::usm_ndarray &src, + const int trailing_dims_to_sort, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) + -> std::pair { + return dpctl::tensor::py_internal::py_sort( + src, trailing_dims_to_sort, dst, exec_q, depends, + dpctl::tensor::py_internal::descending_sort_contig_dispatch_vector); + }; + m.def("_sort_descending", py_sort_descending, py::arg("src"), + py::arg("trailing_dims_to_sort"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + return; +} + +} // end of namespace py_internal +} // end of namespace tensor +} // end of namespace dpctl diff --git a/dpctl/tensor/libtensor/source/sorting/sort.hpp b/dpctl/tensor/libtensor/source/sorting/sort.hpp new file mode 100644 index 0000000000..2ad5bdb623 --- /dev/null +++ b/dpctl/tensor/libtensor/source/sorting/sort.hpp @@ -0,0 +1,42 @@ +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. +//===--------------------------------------------------------------------===// + +#pragma once + +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_sort_functions(py::module_); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/sorting/sorting_common.hpp b/dpctl/tensor/libtensor/source/sorting/sorting_common.hpp new file mode 100644 index 0000000000..62b30ccb8f --- /dev/null +++ b/dpctl/tensor/libtensor/source/sorting/sorting_common.hpp @@ -0,0 +1,134 @@ +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. +//===--------------------------------------------------------------------===// + +#pragma once + +#include "sycl/sycl.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace +{ +template struct ExtendedRealFPLess +{ + /* [R, nan] */ + bool operator()(const fpT v1, const fpT v2) const + { + return (!sycl::isnan(v1) && (sycl::isnan(v2) || (v1 < v2))); + } +}; + +template struct ExtendedRealFPGreater +{ + bool operator()(const fpT v1, const fpT v2) const + { + return (!sycl::isnan(v2) && (sycl::isnan(v1) || (v2 < v1))); + } +}; + +template struct ExtendedComplexFPLess +{ + /* [(R, R), (R, nan), (nan, R), (nan, nan)] */ + + bool operator()(const cT &v1, const cT &v2) const + { + using realT = typename cT::value_type; + + const realT real1 = std::real(v1); + const realT real2 = std::real(v2); + + const bool r1_nan = sycl::isnan(real1); + const bool r2_nan = sycl::isnan(real2); + + const realT imag1 = std::imag(v1); + const realT imag2 = std::imag(v2); + + const bool i1_nan = sycl::isnan(imag1); + const bool i2_nan = sycl::isnan(imag2); + + const int idx1 = ((r1_nan) ? 2 : 0) + ((i1_nan) ? 1 : 0); + const int idx2 = ((r2_nan) ? 2 : 0) + ((i2_nan) ? 
1 : 0); + + const bool res = + !(r1_nan && i1_nan) && + ((idx1 < idx2) || + ((idx1 == idx2) && + ((r1_nan && !i1_nan && (imag1 < imag2)) || + (!r1_nan && i1_nan && (real1 < real2)) || + (!r1_nan && !i1_nan && + ((real1 < real2) || (!(real2 < real1) && (imag1 < imag2))))))); + + return res; + } +}; + +template struct ExtendedComplexFPGreater +{ + bool operator()(const cT &v1, const cT &v2) const + { + auto less_ = ExtendedComplexFPLess{}; + return less_(v2, v1); + } +}; + +template +inline constexpr bool is_fp_v = (std::is_same_v || + std::is_same_v || + std::is_same_v); + +} // end of anonymous namespace + +template struct AscendingSorter +{ + using type = std::conditional_t, + ExtendedRealFPLess, + std::less>; +}; + +template struct AscendingSorter> +{ + using type = ExtendedComplexFPLess>; +}; + +template struct DescendingSorter +{ + using type = std::conditional_t, + ExtendedRealFPGreater, + std::greater>; +}; + +template struct DescendingSorter> +{ + using type = ExtendedComplexFPGreater>; +}; + +} // end of namespace py_internal +} // end of namespace tensor +} // end of namespace dpctl diff --git a/dpctl/tensor/libtensor/source/tensor_sorting.cpp b/dpctl/tensor/libtensor/source/tensor_sorting.cpp new file mode 100644 index 0000000000..5d534d85cd --- /dev/null +++ b/dpctl/tensor/libtensor/source/tensor_sorting.cpp @@ -0,0 +1,37 @@ +//===-- tensor_sorting.cpp - -----*-C++-*-/===// +// Implementation of _tensor_sorting_impl module +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl extension +//===----------------------------------------------------------------------===// + +#include + +#include "sorting/argsort.hpp" +#include "sorting/sort.hpp" + +namespace py = pybind11; + +PYBIND11_MODULE(_tensor_sorting_impl, m) +{ + dpctl::tensor::py_internal::init_sort_functions(m); + dpctl::tensor::py_internal::init_argsort_functions(m); +} diff --git a/dpctl/tests/test_usm_ndarray_sorting.py b/dpctl/tests/test_usm_ndarray_sorting.py new file mode 100644 index 0000000000..e76ac667e1 --- /dev/null +++ b/dpctl/tests/test_usm_ndarray_sorting.py @@ -0,0 +1,282 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
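The complex-valued cases in the sorting tests below rely on the NaN-aware ordering defined by ExtendedRealFPLess and ExtendedComplexFPLess in sorting_common.hpp above: NaNs sort after all finite values, and complex values are keyed first by which of their components are NaN. A rough Python transcription of the complex comparator, for reference only (the function name complex_less and the use of the math module are illustrative, not part of the extension):

import math

def complex_less(v1: complex, v2: complex) -> bool:
    # NaN pattern: 0 = (R, R), 1 = (R, nan), 2 = (nan, R), 3 = (nan, nan);
    # a value with "more NaN" never compares less than one with "less NaN".
    r1_nan, i1_nan = math.isnan(v1.real), math.isnan(v1.imag)
    r2_nan, i2_nan = math.isnan(v2.real), math.isnan(v2.imag)
    if r1_nan and i1_nan:
        return False
    idx1 = 2 * r1_nan + i1_nan
    idx2 = 2 * r2_nan + i2_nan
    if idx1 != idx2:
        return idx1 < idx2
    if r1_nan:                      # both real parts are NaN: compare imaginary parts
        return v1.imag < v2.imag
    if i1_nan:                      # both imaginary parts are NaN: compare real parts
        return v1.real < v2.real
    return (v1.real, v1.imag) < (v2.real, v2.imag)  # lexicographic otherwise

This matches the [(R, R), (R, nan), (nan, R), (nan, nan)] ordering noted in the header comment and the NumPy behaviour that test_sort_complex_fp_nan compares against.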
+ +import itertools + +import numpy as np +import pytest + +import dpctl.tensor as dpt +from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported + + +@pytest.mark.parametrize( + "dtype", + [ + "i1", + "u1", + "i2", + "u2", + "i4", + "u4", + "i8", + "u8", + "f2", + "f4", + "f8", + "c8", + "c16", + ], +) +def test_sort_1d(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + inp = dpt.roll( + dpt.concat( + (dpt.ones(10000, dtype=dtype), dpt.zeros(10000, dtype=dtype)) + ), + 734, + ) + + s = dpt.sort(inp, descending=False) + assert dpt.all(s[:-1] <= s[1:]) + + s1 = dpt.sort(inp, descending=True) + assert dpt.all(s1[:-1] >= s1[1:]) + + +@pytest.mark.parametrize( + "dtype", + [ + "i1", + "u1", + "i2", + "u2", + "i4", + "u4", + "i8", + "u8", + "f2", + "f4", + "f8", + "c8", + "c16", + ], +) +def test_sort_2d(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + fl = dpt.roll( + dpt.concat( + (dpt.ones(10000, dtype=dtype), dpt.zeros(10000, dtype=dtype)) + ), + 734, + ) + inp = dpt.reshape(fl, (20, -1)) + + s = dpt.sort(inp, axis=1, descending=False) + assert dpt.all(s[:, :-1] <= s[:, 1:]) + + s1 = dpt.sort(inp, axis=1, descending=True) + assert dpt.all(s1[:, :-1] >= s1[:, 1:]) + + +def test_sort_strides(): + + fl = dpt.roll( + dpt.concat((dpt.ones(10000, dtype="i4"), dpt.zeros(10000, dtype="i4"))), + 734, + ) + inp = dpt.reshape(fl, (-1, 20)) + + s = dpt.sort(inp, axis=0, descending=False) + assert dpt.all(s[:-1, :] <= s[1:, :]) + + s1 = dpt.sort(inp, axis=0, descending=True) + assert dpt.all(s1[:-1, :] >= s1[1:, :]) + + +@pytest.mark.parametrize( + "dtype", + [ + "i1", + "u1", + "i2", + "u2", + "i4", + "u4", + "i8", + "u8", + "f2", + "f4", + "f8", + "c8", + "c16", + ], +) +def test_argsort_1d(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + inp = dpt.roll( + dpt.concat( + (dpt.ones(10000, dtype=dtype), dpt.zeros(10000, dtype=dtype)) + ), + 734, + ) + + s_idx = dpt.argsort(inp, descending=False) + assert dpt.all(inp[s_idx[:-1]] <= inp[s_idx[1:]]) + + s1_idx = dpt.argsort(inp, descending=True) + assert dpt.all(inp[s1_idx[:-1]] >= inp[s1_idx[1:]]) + + +def test_sort_validation(): + with pytest.raises(TypeError): + dpt.sort(dict()) + + +def test_argsort_validation(): + with pytest.raises(TypeError): + dpt.argsort(dict()) + + +def test_sort_axis0(): + get_queue_or_skip() + + n, m = 200, 30 + xf = dpt.arange(n * m, 0, step=-1, dtype="i4") + x = dpt.reshape(xf, (n, m)) + s = dpt.sort(x, axis=0) + + assert dpt.all(s[:-1, :] <= s[1:, :]) + + +def test_argsort_axis0(): + get_queue_or_skip() + + n, m = 200, 30 + xf = dpt.arange(n * m, 0, step=-1, dtype="i4") + x = dpt.reshape(xf, (n, m)) + idx = dpt.argsort(x, axis=0) + + conseq_idx = dpt.arange(m, dtype=idx.dtype) + s = x[idx, conseq_idx[dpt.newaxis, :]] + + assert dpt.all(s[:-1, :] <= s[1:, :]) + + +def test_sort_strided(): + get_queue_or_skip() + + x_orig = dpt.arange(100, dtype="i4") + x_flipped = dpt.flip(x_orig, axis=0) + s = dpt.sort(x_flipped) + + assert dpt.all(s == x_orig) + + +def test_argsort_strided(): + get_queue_or_skip() + + x_orig = dpt.arange(100, dtype="i4") + x_flipped = dpt.flip(x_orig, axis=0) + idx = dpt.argsort(x_flipped) + + assert dpt.all(x_flipped[idx] == x_orig) + + +def test_sort_0d_array(): + get_queue_or_skip() + + x = dpt.asarray(1, dtype="i4") + assert dpt.sort(x) == 1 + + +def test_argsort_0d_array(): + get_queue_or_skip() + + x = dpt.asarray(1, dtype="i4") + assert dpt.argsort(x) == 0 + + +@pytest.mark.parametrize( + "dtype", 
+ [ + "f2", + "f4", + "f8", + ], +) +def test_sort_real_fp_nan(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + x = dpt.asarray( + [-0.0, 0.1, dpt.nan, 0.0, -0.1, dpt.nan, 0.2, -0.3], dtype=dtype + ) + s = dpt.sort(x) + + expected = dpt.asarray( + [-0.3, -0.1, -0.0, 0.0, 0.1, 0.2, dpt.nan, dpt.nan], dtype=dtype + ) + + assert dpt.allclose(s, expected, equal_nan=True) + + s = dpt.sort(x, descending=True) + + expected = dpt.asarray( + [dpt.nan, dpt.nan, 0.2, 0.1, -0.0, 0.0, -0.1, -0.3], dtype=dtype + ) + + assert dpt.allclose(s, expected, equal_nan=True) + + +@pytest.mark.parametrize( + "dtype", + [ + "c8", + "c16", + ], +) +def test_sort_complex_fp_nan(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + rvs = [-0.0, 0.1, 0.0, 0.2, -0.3, dpt.nan] + ivs = [-0.0, 0.1, 0.0, 0.2, -0.3, dpt.nan] + + cv = [] + for rv in rvs: + for iv in ivs: + cv.append(complex(rv, iv)) + + inp = dpt.asarray(cv, dtype=dtype) + s = dpt.sort(inp) + + expected = np.sort(dpt.asnumpy(inp)) + + assert np.allclose(dpt.asnumpy(s), expected, equal_nan=True) + + for i, j in itertools.permutations(range(inp.shape[0]), 2): + r1 = dpt.asnumpy(dpt.sort(inp[dpt.asarray([i, j])])) + r2 = np.sort(dpt.asnumpy(inp[dpt.asarray([i, j])])) + assert np.array_equal( + r1.view(np.int64), r2.view(np.int64) + ), f"Failed for {i} and {j}" diff --git a/dpctl/tests/test_usm_ndarray_unique.py b/dpctl/tests/test_usm_ndarray_unique.py new file mode 100644 index 0000000000..0d4247def5 --- /dev/null +++ b/dpctl/tests/test_usm_ndarray_unique.py @@ -0,0 +1,304 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
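The tests in this file exercise dpctl.tensor.unique_values, unique_counts, unique_inverse and unique_all; they rely on values being returned in ascending order and on the companion index, inverse and count arrays being mutually consistent. For orientation only, the same four quantities can be computed with NumPy's np.unique (a reference sketch; NumPy is used here purely to illustrate the semantics and is not what the tests call):

import numpy as np

x = np.array([1, 0, 1, 1, 0, 2], dtype=np.int32)
values, indices, inverse, counts = np.unique(
    x, return_index=True, return_inverse=True, return_counts=True
)
assert (values == np.array([0, 1, 2], dtype=np.int32)).all()
assert (x[indices] == values).all()            # first occurrence of each value
assert (values[inverse] == x).all()            # inverse indices rebuild the input
assert (counts == np.array([2, 3, 1])).all()   # multiplicity of each value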
+ +import pytest + +import dpctl +import dpctl.tensor as dpt +from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported + + +@pytest.mark.parametrize( + "dtype", + [ + "i1", + "u1", + "i2", + "u2", + "i4", + "u4", + "i8", + "u8", + "f2", + "f4", + "f8", + "c8", + "c16", + ], +) +def test_unique_values(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n, roll = 10000, 734 + inp = dpt.roll( + dpt.concat((dpt.ones(n, dtype=dtype), dpt.zeros(n, dtype=dtype))), + roll, + ) + + uv = dpt.unique_values(inp) + assert dpt.all(uv == dpt.arange(2, dtype=dtype)) + + +def test_unique_values_strided(): + get_queue_or_skip() + + n, m = 1000, 20 + inp = dpt.ones((n, m), dtype="i4", order="F") + inp[:, ::2] = 0 + + uv = dpt.unique_values(inp) + assert dpt.all(uv == dpt.arange(2, dtype="i4")) + + inp = dpt.reshape(inp, -1) + inp = dpt.flip(dpt.reshape(inp, -1)) + + uv = dpt.unique_values(inp) + assert dpt.all(uv == dpt.arange(2, dtype="i4")) + + +@pytest.mark.parametrize( + "dtype", + [ + "i1", + "u1", + "i2", + "u2", + "i4", + "u4", + "i8", + "u8", + "f2", + "f4", + "f8", + "c8", + "c16", + ], +) +def test_unique_counts(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n, roll = 10000, 734 + inp = dpt.roll( + dpt.concat((dpt.ones(n, dtype=dtype), dpt.zeros(n, dtype=dtype))), + roll, + ) + + uv, uv_counts = dpt.unique_counts(inp) + assert dpt.all(uv == dpt.arange(2, dtype=dtype)) + assert dpt.all(uv_counts == dpt.full(2, n, dtype=uv_counts.dtype)) + + +def test_unique_counts_strided(): + get_queue_or_skip() + + n, m = 1000, 20 + inp = dpt.ones((n, m), dtype="i4", order="F") + inp[:, ::2] = 0 + + uv, uv_counts = dpt.unique_counts(inp) + assert dpt.all(uv == dpt.arange(2, dtype="i4")) + assert dpt.all(uv_counts == dpt.full(2, n / 2 * m, dtype=uv_counts.dtype)) + + inp = dpt.flip(dpt.reshape(inp, -1)) + + uv, uv_counts = dpt.unique_counts(inp) + assert dpt.all(uv == dpt.arange(2, dtype="i4")) + assert dpt.all(uv_counts == dpt.full(2, n / 2 * m, dtype=uv_counts.dtype)) + + +@pytest.mark.parametrize( + "dtype", + [ + "i1", + "u1", + "i2", + "u2", + "i4", + "u4", + "i8", + "u8", + "f2", + "f4", + "f8", + "c8", + "c16", + ], +) +def test_unique_inverse(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n, roll = 10000, 734 + inp = dpt.roll( + dpt.concat((dpt.ones(n, dtype=dtype), dpt.zeros(n, dtype=dtype))), + roll, + ) + + uv, inv = dpt.unique_inverse(inp) + assert dpt.all(uv == dpt.arange(2, dtype=dtype)) + assert dpt.all(inp == uv[inv]) + assert inp.shape == inv.shape + + +def test_unique_inverse_strided(): + get_queue_or_skip() + + n, m = 1000, 20 + inp = dpt.ones((n, m), dtype="i4", order="F") + inp[:, ::2] = 0 + + uv, inv = dpt.unique_inverse(inp) + assert dpt.all(uv == dpt.arange(2, dtype="i4")) + assert dpt.all(inp == uv[inv]) + assert inp.shape == inv.shape + + inp = dpt.flip(dpt.reshape(inp, -1)) + + uv, inv = dpt.unique_inverse(inp) + assert dpt.all(uv == dpt.arange(2, dtype="i4")) + assert dpt.all(inp == uv[inv]) + assert inp.shape == inv.shape + + +@pytest.mark.parametrize( + "dtype", + [ + "i1", + "u1", + "i2", + "u2", + "i4", + "u4", + "i8", + "u8", + "f2", + "f4", + "f8", + "c8", + "c16", + ], +) +def test_unique_all(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n, roll = 10000, 734 + inp = dpt.roll( + dpt.concat((dpt.ones(n, dtype=dtype), dpt.zeros(n, dtype=dtype))), + roll, + ) + + uv, ind, inv, uv_counts = dpt.unique_all(inp) + assert dpt.all(uv == dpt.arange(2, 
dtype=dtype)) + assert dpt.all(uv == inp[ind]) + assert dpt.all(inp == uv[inv]) + assert inp.shape == inv.shape + assert dpt.all(uv_counts == dpt.full(2, n, dtype=uv_counts.dtype)) + + +def test_unique_all_strided(): + get_queue_or_skip() + + n, m = 1000, 20 + inp = dpt.ones((n, m), dtype="i4", order="F") + inp[:, ::2] = 0 + + uv, ind, inv, uv_counts = dpt.unique_all(inp) + assert dpt.all(uv == dpt.arange(2, dtype="i4")) + assert dpt.all(uv == dpt.reshape(inp, -1)[ind]) + assert dpt.all(inp == uv[inv]) + assert inp.shape == inv.shape + assert dpt.all(uv_counts == dpt.full(2, n / 2 * m, dtype=uv_counts.dtype)) + + inp = dpt.flip(dpt.reshape(inp, -1)) + + uv, ind, inv, uv_counts = dpt.unique_all(inp) + assert dpt.all(uv == dpt.arange(2, dtype="i4")) + assert dpt.all(uv == inp[ind]) + assert dpt.all(inp == uv[inv]) + assert inp.shape == inv.shape + assert dpt.all(uv_counts == dpt.full(2, n / 2 * m, dtype=uv_counts.dtype)) + + +def test_set_functions_empty_input(): + get_queue_or_skip() + x = dpt.ones((10, 0, 1), dtype="i4") + + res = dpt.unique_values(x) + assert isinstance(res, dpctl.tensor.usm_ndarray) + assert res.size == 0 + assert res.dtype == x.dtype + + res = dpt.unique_inverse(x) + assert type(res).__name__ == "UniqueInverseResult" + uv, inv = res + assert isinstance(uv, dpctl.tensor.usm_ndarray) + assert uv.size == 0 + assert isinstance(inv, dpctl.tensor.usm_ndarray) + assert inv.size == 0 + + res = dpt.unique_counts(x) + assert type(res).__name__ == "UniqueCountsResult" + uv, uv_counts = res + assert isinstance(uv, dpctl.tensor.usm_ndarray) + assert uv.size == 0 + assert isinstance(uv_counts, dpctl.tensor.usm_ndarray) + assert uv_counts.size == 0 + + res = dpt.unique_all(x) + assert type(res).__name__ == "UniqueAllResult" + uv, ind, inv, uv_counts = res + assert isinstance(uv, dpctl.tensor.usm_ndarray) + assert uv.size == 0 + assert isinstance(ind, dpctl.tensor.usm_ndarray) + assert ind.size == 0 + assert isinstance(inv, dpctl.tensor.usm_ndarray) + assert inv.size == 0 + assert isinstance(uv_counts, dpctl.tensor.usm_ndarray) + assert uv_counts.size == 0 + + +def test_set_function_outputs(): + get_queue_or_skip() + # check standard and early exit paths + x1 = dpt.arange(10, dtype="i4") + x2 = dpt.ones((10, 10), dtype="i4") + + assert isinstance(dpt.unique_values(x1), dpctl.tensor.usm_ndarray) + assert isinstance(dpt.unique_values(x2), dpctl.tensor.usm_ndarray) + + assert type(dpt.unique_inverse(x1)).__name__ == "UniqueInverseResult" + assert type(dpt.unique_inverse(x2)).__name__ == "UniqueInverseResult" + + assert type(dpt.unique_counts(x1)).__name__ == "UniqueCountsResult" + assert type(dpt.unique_counts(x2)).__name__ == "UniqueCountsResult" + + assert type(dpt.unique_all(x1)).__name__ == "UniqueAllResult" + assert type(dpt.unique_all(x2)).__name__ == "UniqueAllResult" + + +def test_set_functions_compute_follows_data(): + # tests that all intermediate calls and allocations + # are compatible with an input with an arbitrary queue + q = dpctl.SyclQueue() + x = dpt.arange(10, dtype="i4", sycl_queue=q) + + assert isinstance(dpt.unique_values(x), dpctl.tensor.usm_ndarray) + assert dpt.unique_counts(x) + assert dpt.unique_inverse(x) + assert dpt.unique_all(x)
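Taken together, the changes add dpt.sort, dpt.argsort and the unique_* set functions to the dpctl.tensor namespace. A minimal usage sketch, assuming a default SYCL device is available (the array values are chosen only for illustration):

import dpctl.tensor as dpt

# Sort rows of a 2-D array and recover the sorting permutation.
x = dpt.reshape(dpt.arange(12, 0, step=-1, dtype="i4"), (3, 4))
s = dpt.sort(x, axis=1)                   # ascending along the last axis
d = dpt.sort(x, axis=1, descending=True)  # descending order
p = dpt.argsort(x, axis=1)                # indices that sort each row

# Set functions flatten inputs with more than one dimension.
vals, idx, inv, cnts = dpt.unique_all(x)
assert dpt.all(dpt.reshape(x, -1)[idx] == vals)  # idx indexes the flattened input
assert dpt.all(vals[inv] == x)                   # inv has the shape of the input
assert dpt.all(cnts == 1)                        # all values are distinct here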