Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Replace SYCL backend reduce_by_segment implementation with reduce-then-scan call #1915

Open
wants to merge 31 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 28 commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
6c11271
Initial commit of reduce_by_segment with the reduce-then-scan path
mmichel11 Sep 27, 2024
b7328e8
Revert change to ranges and use zip_view over segments / values instead
mmichel11 Oct 14, 2024
f017ffd
Implement correct return for reduce_by_segment
mmichel11 Oct 15, 2024
5cfa661
Add support for flag predicates
mmichel11 Oct 15, 2024
4f8059a
Revert "Add support for flag predicates"
mmichel11 Oct 15, 2024
d56bc5a
Re-implement support for flag predicates in a more performant manner
mmichel11 Oct 15, 2024
543e82f
Add fallback and remove old SYCL implementation
mmichel11 Oct 15, 2024
7c92238
Switch from using lambdas to functors
mmichel11 Oct 15, 2024
6c4267b
Add device copyable specializations for red-by-seg functors and updat…
mmichel11 Oct 15, 2024
3eee242
Fix typo in error message in device_copyable.pass.cpp
mmichel11 Oct 15, 2024
b334fb0
Introduce separate input generation for scan phase and update tests
mmichel11 Oct 18, 2024
d564b33
Improve code readability
mmichel11 Oct 18, 2024
358ec3b
Add optional first key field to scan input and remove input range in …
mmichel11 Oct 18, 2024
db0bc25
Update __write_op in reduce-then-scan
mmichel11 Oct 18, 2024
7abbff8
Remove now unneeded ONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION macro
mmichel11 Oct 18, 2024
92cf29a
Alternate testing between usm shared and device to prevent excessive …
mmichel11 Oct 21, 2024
d292e07
Performance tuning within scan input functor
mmichel11 Oct 21, 2024
e41298d
Handle n=0, n=1 first in reduce_by_segment
mmichel11 Oct 21, 2024
fb9a306
Code cleanup
mmichel11 Oct 22, 2024
d32ea4d
Improve comments and mark relevant variables as const
mmichel11 Oct 22, 2024
aa4baaf
Add condition to ensure value type is trivially copyable to call redu…
mmichel11 Oct 22, 2024
8cc59df
clang-format
mmichel11 Oct 22, 2024
80ceacd
Introduce iterator based __pattern_reduce_by_segment
mmichel11 Nov 5, 2024
74143b2
Revert "Remove now unneeded ONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTIO…
mmichel11 Nov 5, 2024
fdf6a39
Fix test bug where device allocation is always used for testing
mmichel11 Nov 5, 2024
5987df9
Separate each reduce_by_segment fallback path into their own functions
mmichel11 Nov 13, 2024
d80377e
clang-format
mmichel11 Nov 13, 2024
3c8154e
Address comments in reduce-then-scan based implementation
mmichel11 Nov 21, 2024
db63d45
Improve explanations of reduce-by-segment approach
mmichel11 Nov 22, 2024
3deed76
Use binary_op[_non]_device_copyable where appropriate
mmichel11 Nov 22, 2024
c641bd3
Address comments in fallback implementation
mmichel11 Nov 22, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
436 changes: 8 additions & 428 deletions include/oneapi/dpl/internal/reduce_by_segment_impl.h

Large diffs are not rendered by default.

40 changes: 40 additions & 0 deletions include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h
Original file line number Diff line number Diff line change
Expand Up @@ -2003,6 +2003,46 @@ __pattern_shift_right(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec
return __last - __res;
}

template <typename _Name>
struct __copy_keys_values_wrapper;

template <typename _BackendTag, typename _ExecutionPolicy, typename _Iterator1, typename _Iterator2,
typename _Iterator3, typename _Iterator4, typename _BinaryPredicate, typename _BinaryOperator>
typename std::iterator_traits<_Iterator3>::difference_type
__pattern_reduce_by_segment(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec, _Iterator1 __keys_first,
_Iterator1 __keys_last, _Iterator2 __values_first, _Iterator3 __out_keys_first,
_Iterator4 __out_values_first, _BinaryPredicate __binary_pred, _BinaryOperator __binary_op)
{
std::size_t __n = std::distance(__keys_first, __keys_last);

if (__n == 0)
return 0;

if (__n == 1)
{
__brick_copy<__hetero_tag<_BackendTag>, _ExecutionPolicy> __copy_op{};

oneapi::dpl::__internal::__pattern_walk2_n(
__tag, oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__copy_keys_values_wrapper>(__exec),
oneapi::dpl::make_zip_iterator(__keys_first, __values_first), 1,
oneapi::dpl::make_zip_iterator(__out_keys_first, __out_values_first), __copy_op);

return 1;
}

auto __keep_keys = __ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _Iterator1>();
auto __keys = __keep_keys(__keys_first, __keys_last);
auto __keep_values = __ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _Iterator2>();
auto __values = __keep_values(__values_first, __values_first + __n);
auto __keep_key_outputs = __ranges::__get_sycl_range<__par_backend_hetero::access_mode::read_write, _Iterator3>();
auto __out_keys = __keep_key_outputs(__out_keys_first, __out_keys_first + __n);
auto __keep_value_outputs = __ranges::__get_sycl_range<__par_backend_hetero::access_mode::read_write, _Iterator4>();
auto __out_values = __keep_value_outputs(__out_values_first, __out_values_first + __n);
return oneapi::dpl::__par_backend_hetero::__parallel_reduce_by_segment(
_BackendTag{}, std::forward<_ExecutionPolicy>(__exec), __keys.all_view(), __values.all_view(),
__out_keys.all_view(), __out_values.all_view(), __binary_pred, __binary_op);
}

} // namespace __internal
} // namespace dpl
} // namespace oneapi
Expand Down
132 changes: 9 additions & 123 deletions include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h
Original file line number Diff line number Diff line change
Expand Up @@ -889,22 +889,7 @@ __pattern_minmax_element(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _
//------------------------------------------------------------------------

template <typename _Name>
struct __copy_keys_wrapper;

template <typename _Name>
struct __copy_values_wrapper;

template <typename _Name>
struct __reduce1_wrapper;

template <typename _Name>
struct __reduce2_wrapper;

template <typename _Name>
struct __assign_key1_wrapper;

template <typename _Name>
struct __assign_key2_wrapper;
struct __copy_keys_values_range_wrapper;

template <typename _BackendTag, typename _ExecutionPolicy, typename _Range1, typename _Range2, typename _Range3,
typename _Range4, typename _BinaryPredicate, typename _BinaryOperator>
Expand Down Expand Up @@ -932,117 +917,18 @@ __pattern_reduce_by_segment(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&&
__brick_copy<__hetero_tag<_BackendTag>, _ExecutionPolicy> __copy_range{};

oneapi::dpl::__internal::__ranges::__pattern_walk_n(
__tag, oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__copy_keys_wrapper>(__exec), __copy_range,
std::forward<_Range1>(__keys), std::forward<_Range3>(__out_keys));

oneapi::dpl::__internal::__ranges::__pattern_walk_n(
__tag,
oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__copy_values_wrapper>(
std::forward<_ExecutionPolicy>(__exec)),
__copy_range, std::forward<_Range2>(__values), std::forward<_Range4>(__out_values));
__tag, oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__copy_keys_values_range_wrapper>(__exec),
__copy_range,
oneapi::dpl::__ranges::zip_view(std::forward<_Range1>(__keys), std::forward<_Range2>(__values)),
oneapi::dpl::__ranges::zip_view(std::forward<_Range3>(__out_keys), std::forward<_Range4>(__out_values)));

return 1;
}

using __diff_type = oneapi::dpl::__internal::__difference_t<_Range1>;
using __key_type = oneapi::dpl::__internal::__value_t<_Range1>;
using __val_type = oneapi::dpl::__internal::__value_t<_Range2>;

// Round 1: reduce with extra indices added to avoid long segments
// TODO: At threshold points check if the key is equal to the key at the previous threshold point, indicating a long sequence.
// Skip a round of copy_if and reduces if there are none.
auto __idx = oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, __diff_type>(__exec, __n).get_buffer();
auto __tmp_out_keys =
oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, __key_type>(__exec, __n).get_buffer();
auto __tmp_out_values =
oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, __val_type>(__exec, __n).get_buffer();

// Replicating first element of keys view to be able to compare (i-1)-th and (i)-th key with aligned sequences,
// dropping the last key for the i-1 sequence.
auto __k1 =
oneapi::dpl::__ranges::take_view_simple(oneapi::dpl::__ranges::replicate_start_view_simple(__keys, 1), __n);

// view1 elements are a tuple of the element index and pairs of adjacent keys
// view2 elements are a tuple of the elements where key-index pairs will be written by copy_if
auto __view1 = experimental::ranges::zip_view(experimental::ranges::views::iota(0, __n), __k1, __keys);
auto __view2 = experimental::ranges::zip_view(experimental::ranges::views::all_write(__tmp_out_keys),
experimental::ranges::views::all_write(__idx));

// use work group size adjusted to shared local memory as the maximum segment size.
std::size_t __wgroup_size =
oneapi::dpl::__internal::__slm_adjusted_work_group_size(__exec, sizeof(__key_type) + sizeof(__val_type));

// element is copied if it is the 0th element (marks beginning of first segment), is in an index
// evenly divisible by wg size (ensures segments are not long), or has a key not equal to the
// adjacent element (marks end of real segments)
// TODO: replace wgroup size with segment size based on platform specifics.
auto __intermediate_result_end = __ranges::__pattern_copy_if(
__tag, oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__assign_key1_wrapper>(__exec), __view1, __view2,
[__binary_pred, __wgroup_size](const auto& __a) {
// The size of key range for the (i-1) view is one less, so for the 0th index we do not check the keys
// for (i-1), but we still need to get its key value as it is the start of a segment
const auto index = std::get<0>(__a);
if (index == 0)
return true;
return index % __wgroup_size == 0 // segment size
|| !__binary_pred(std::get<1>(__a), std::get<2>(__a)); // key comparison
},
unseq_backend::__brick_assign_key_position{});

//reduce by segment
oneapi::dpl::__par_backend_hetero::__parallel_for(
_BackendTag{}, oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__reduce1_wrapper>(__exec),
unseq_backend::__brick_reduce_idx<_BinaryOperator, decltype(__n)>(__binary_op, __n), __intermediate_result_end,
oneapi::dpl::__ranges::take_view_simple(experimental::ranges::views::all_read(__idx),
__intermediate_result_end),
std::forward<_Range2>(__values), experimental::ranges::views::all_write(__tmp_out_values))
.wait();

// Round 2: final reduction to get result for each segment of equal adjacent keys
// create views over adjacent keys
oneapi::dpl::__ranges::all_view<__key_type, __par_backend_hetero::access_mode::read_write> __new_keys(
__tmp_out_keys);

// Replicating first element of key views to be able to compare (i-1)-th and (i)-th key,
// dropping the last key for the i-1 sequence. Only taking the appropriate number of keys to start with here.
auto __clipped_new_keys = oneapi::dpl::__ranges::take_view_simple(__new_keys, __intermediate_result_end);

auto __k3 = oneapi::dpl::__ranges::take_view_simple(
oneapi::dpl::__ranges::replicate_start_view_simple(__clipped_new_keys, 1), __intermediate_result_end);

// view3 elements are a tuple of the element index and pairs of adjacent keys
// view4 elements are a tuple of the elements where key-index pairs will be written by copy_if
auto __view3 = experimental::ranges::zip_view(experimental::ranges::views::iota(0, __intermediate_result_end), __k3,
__clipped_new_keys);
auto __view4 = experimental::ranges::zip_view(experimental::ranges::views::all_write(__out_keys),
experimental::ranges::views::all_write(__idx));

// element is copied if it is the 0th element (marks beginning of first segment), or has a key not equal to
// the adjacent element (end of a segment). Artificial segments based on wg size are not created.
auto __result_end = __ranges::__pattern_copy_if(
__tag, oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__assign_key2_wrapper>(__exec), __view3, __view4,
[__binary_pred](const auto& __a) {
// The size of key range for the (i-1) view is one less, so for the 0th index we do not check the keys
// for (i-1), but we still need to get its key value as it is the start of a segment
if (std::get<0>(__a) == 0)
return true;
return !__binary_pred(std::get<1>(__a), std::get<2>(__a)); // keys comparison
},
unseq_backend::__brick_assign_key_position{});

//reduce by segment
oneapi::dpl::__par_backend_hetero::__parallel_for(
_BackendTag{},
oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__reduce2_wrapper>(
std::forward<_ExecutionPolicy>(__exec)),
unseq_backend::__brick_reduce_idx<_BinaryOperator, decltype(__intermediate_result_end)>(
__binary_op, __intermediate_result_end),
__result_end,
oneapi::dpl::__ranges::take_view_simple(experimental::ranges::views::all_read(__idx), __result_end),
experimental::ranges::views::all_read(__tmp_out_values), std::forward<_Range4>(__out_values))
.__deferrable_wait();

return __result_end;
return oneapi::dpl::__par_backend_hetero::__parallel_reduce_by_segment(
_BackendTag{}, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__keys),
std::forward<_Range2>(__values), std::forward<_Range3>(__out_keys), std::forward<_Range4>(__out_values),
__binary_pred, __binary_op);
}

} // namespace __ranges
Expand Down
Loading
Loading