-
Notifications
You must be signed in to change notification settings - Fork 114
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Re-implement SYCL backend parallel_for
to improve bandwidth utilization
#1976
base: main
Are you sure you want to change the base?
Changes from 66 commits
9764a57
c836b1d
55f33a4
adadd56
71d7bcc
ebb3d56
2c4ecd0
dc6bd0c
d5126b2
6433a50
d376124
a7c7606
b8aa15c
b45a7c2
4f9a360
7bb1d2b
a2ad920
47fe214
79a18e9
3ab8c75
4a70fe2
5530209
90f19d4
1ac65b9
6a5a562
357032f
ca9e594
e4060f5
283b053
75e4beb
7990bc1
4aaa81f
65e4a68
b4657a6
3086dd3
df17673
fd4e2c3
58fd466
094124f
82135f6
e979118
4bfaada
8ae18db
6cb11c7
505bdf3
114924d
845de21
71678d0
8b0b18b
f7d9753
83c5ca4
fedd5de
08aa260
a5eca96
32612a1
1336735
c5e7d61
0c6ca75
be8aeda
86b9c89
ffd95cc
537a6f0
50a60ea
1081ab8
9513edb
b274c8d
92f3374
eb2cdf8
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -37,13 +37,19 @@ enum class search_algorithm | |
binary_search | ||
}; | ||
|
||
template <typename Comp, typename T, search_algorithm func> | ||
struct custom_brick | ||
#if _ONEDPL_BACKEND_SYCL | ||
template <typename Comp, typename T, typename _Range, search_algorithm func> | ||
struct custom_brick : oneapi::dpl::unseq_backend::walk_scalar_base<_Range> | ||
{ | ||
Comp comp; | ||
T size; | ||
bool use_32bit_indexing; | ||
|
||
custom_brick(Comp comp, T size, bool use_32bit_indexing) | ||
: comp(comp), size(size), use_32bit_indexing(use_32bit_indexing) | ||
{ | ||
} | ||
|
||
template <typename _Size, typename _ItemId, typename _Acc> | ||
void | ||
search_impl(_ItemId idx, _Acc acc) const | ||
|
@@ -68,17 +74,23 @@ struct custom_brick | |
get<2>(acc[idx]) = (value != end_orig) && (get<1>(acc[idx]) == get<0>(acc[value])); | ||
} | ||
} | ||
|
||
template <typename _ItemId, typename _Acc> | ||
template <typename _IsFull, typename _ItemId, typename _Acc> | ||
void | ||
operator()(_ItemId idx, _Acc acc) const | ||
__scalar_path_impl(_IsFull, _ItemId idx, _Acc acc) const | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I believe that we may try to improve this code by replacing run-time const bool use_32bit_indexing = size <= std::numeric_limits<std::uint32_t>::max(); so it's not big deal to add There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As discussed offline, I will reevaluate performance here and provide an update. The advantage of the current approach is that we only compile a single kernel whereas your suggestion may improve kernel performance with the cost of increased JIT overhead. |
||
{ | ||
if (use_32bit_indexing) | ||
search_impl<std::uint32_t>(idx, acc); | ||
else | ||
search_impl<std::uint64_t>(idx, acc); | ||
} | ||
template <typename _IsFull, typename _ItemId, typename _Acc> | ||
void | ||
operator()(_IsFull __is_full, _ItemId idx, _Acc acc) const | ||
{ | ||
__scalar_path_impl(__is_full, idx, acc); | ||
} | ||
}; | ||
#endif | ||
|
||
template <class _Tag, typename Policy, typename InputIterator1, typename InputIterator2, typename OutputIterator, | ||
typename StrictWeakOrdering> | ||
|
@@ -155,7 +167,8 @@ lower_bound_impl(__internal::__hetero_tag<_BackendTag>, Policy&& policy, InputIt | |
const bool use_32bit_indexing = size <= std::numeric_limits<std::uint32_t>::max(); | ||
__bknd::__parallel_for( | ||
_BackendTag{}, ::std::forward<decltype(policy)>(policy), | ||
custom_brick<StrictWeakOrdering, decltype(size), search_algorithm::lower_bound>{comp, size, use_32bit_indexing}, | ||
custom_brick<StrictWeakOrdering, decltype(size), decltype(zip_vw), search_algorithm::lower_bound>{ | ||
danhoeflinger marked this conversation as resolved.
Show resolved
Hide resolved
|
||
comp, size, use_32bit_indexing}, | ||
value_size, zip_vw) | ||
.__deferrable_wait(); | ||
return result + value_size; | ||
|
@@ -187,7 +200,8 @@ upper_bound_impl(__internal::__hetero_tag<_BackendTag>, Policy&& policy, InputIt | |
const bool use_32bit_indexing = size <= std::numeric_limits<std::uint32_t>::max(); | ||
__bknd::__parallel_for( | ||
_BackendTag{}, std::forward<decltype(policy)>(policy), | ||
custom_brick<StrictWeakOrdering, decltype(size), search_algorithm::upper_bound>{comp, size, use_32bit_indexing}, | ||
custom_brick<StrictWeakOrdering, decltype(size), decltype(zip_vw), search_algorithm::upper_bound>{ | ||
comp, size, use_32bit_indexing}, | ||
value_size, zip_vw) | ||
.__deferrable_wait(); | ||
return result + value_size; | ||
|
@@ -217,10 +231,11 @@ binary_search_impl(__internal::__hetero_tag<_BackendTag>, Policy&& policy, Input | |
auto result_buf = keep_result(result, result + value_size); | ||
auto zip_vw = make_zip_view(input_buf.all_view(), value_buf.all_view(), result_buf.all_view()); | ||
const bool use_32bit_indexing = size <= std::numeric_limits<std::uint32_t>::max(); | ||
__bknd::__parallel_for(_BackendTag{}, std::forward<decltype(policy)>(policy), | ||
custom_brick<StrictWeakOrdering, decltype(size), search_algorithm::binary_search>{ | ||
comp, size, use_32bit_indexing}, | ||
value_size, zip_vw) | ||
__bknd::__parallel_for( | ||
_BackendTag{}, std::forward<decltype(policy)>(policy), | ||
custom_brick<StrictWeakOrdering, decltype(size), decltype(zip_vw), search_algorithm::binary_search>{ | ||
comp, size, use_32bit_indexing}, | ||
value_size, zip_vw) | ||
.__deferrable_wait(); | ||
return result + value_size; | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -56,8 +56,12 @@ __pattern_walk1(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _ForwardIt | |
oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read_write, _ForwardIterator>(); | ||
auto __buf = __keep(__first, __last); | ||
|
||
auto __view = __buf.all_view(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should we really extract this into local variable? Will we have some profit? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Initially, I did this to remove the duplicate |
||
oneapi::dpl::__par_backend_hetero::__parallel_for( | ||
_BackendTag{}, __exec, unseq_backend::walk_n<_ExecutionPolicy, _Function>{__f}, __n, __buf.all_view()) | ||
_BackendTag{}, __exec, | ||
unseq_backend::walk1_vector_or_scalar<_ExecutionPolicy, _Function, decltype(__view)>{ | ||
__f, static_cast<std::size_t>(__n)}, | ||
__n, __view) | ||
.__deferrable_wait(); | ||
} | ||
|
||
|
@@ -101,9 +105,14 @@ __pattern_walk2(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _ForwardIt | |
auto __keep2 = oneapi::dpl::__ranges::__get_sycl_range<__acc_mode2, _ForwardIterator2>(); | ||
auto __buf2 = __keep2(__first2, __first2 + __n); | ||
|
||
auto __view1 = __buf1.all_view(); | ||
auto __view2 = __buf2.all_view(); | ||
|
||
auto __future = oneapi::dpl::__par_backend_hetero::__parallel_for( | ||
_BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), | ||
unseq_backend::walk_n<_ExecutionPolicy, _Function>{__f}, __n, __buf1.all_view(), __buf2.all_view()); | ||
unseq_backend::walk2_vectors_or_scalars<_ExecutionPolicy, _Function, decltype(__view1), decltype(__view2)>{ | ||
__f, static_cast<std::size_t>(__n)}, | ||
__n, __view1, __view2); | ||
|
||
// Call no wait, wait or deferrable wait depending on _WaitMode | ||
__future.wait(_WaitMode{}); | ||
|
@@ -130,10 +139,28 @@ _ForwardIterator2 | |
__pattern_swap(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec, _ForwardIterator1 __first1, | ||
_ForwardIterator1 __last1, _ForwardIterator2 __first2, _Function __f) | ||
{ | ||
return __pattern_walk2</*_WaitMode*/ __par_backend_hetero::__deferrable_mode, | ||
__par_backend_hetero::access_mode::read_write, | ||
__par_backend_hetero::access_mode::read_write>( | ||
__tag, ::std::forward<_ExecutionPolicy>(__exec), __first1, __last1, __first2, __f); | ||
auto __n = __last1 - __first1; | ||
if (__n == 0) | ||
return __first2; | ||
|
||
auto __keep1 = | ||
oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read_write, _ForwardIterator1>(); | ||
auto __buf1 = __keep1(__first1, __last1); | ||
|
||
auto __keep2 = | ||
oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read_write, _ForwardIterator2>(); | ||
auto __buf2 = __keep2(__first2, __first2 + __n); | ||
|
||
auto __view1 = __buf1.all_view(); | ||
auto __view2 = __buf2.all_view(); | ||
|
||
auto __future = oneapi::dpl::__par_backend_hetero::__parallel_for( | ||
_BackendTag{}, std::forward<_ExecutionPolicy>(__exec), | ||
unseq_backend::__brick_swap<_ExecutionPolicy, _Function, decltype(__view1), decltype(__view2)>{ | ||
__f, static_cast<std::size_t>(__n)}, | ||
__n, __view1, __view2); | ||
__future.wait(__par_backend_hetero::__deferrable_mode{}); | ||
return __first2 + __n; | ||
} | ||
|
||
//------------------------------------------------------------------------ | ||
|
@@ -160,9 +187,15 @@ __pattern_walk3(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _ForwardIt | |
auto __keep3 = oneapi::dpl::__ranges::__get_sycl_range<__acc_mode3, _ForwardIterator3>(); | ||
auto __buf3 = __keep3(__first3, __first3 + __n); | ||
|
||
oneapi::dpl::__par_backend_hetero::__parallel_for(_BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), | ||
unseq_backend::walk_n<_ExecutionPolicy, _Function>{__f}, __n, | ||
__buf1.all_view(), __buf2.all_view(), __buf3.all_view()) | ||
auto __view1 = __buf1.all_view(); | ||
auto __view2 = __buf2.all_view(); | ||
auto __view3 = __buf3.all_view(); | ||
|
||
oneapi::dpl::__par_backend_hetero::__parallel_for( | ||
_BackendTag{}, std::forward<_ExecutionPolicy>(__exec), | ||
unseq_backend::walk3_vectors_or_scalars<_ExecutionPolicy, _Function, decltype(__view1), decltype(__view2), | ||
decltype(__view3)>{__f, static_cast<std::size_t>(__n)}, | ||
__n, __view1, __view2, __view3) | ||
.__deferrable_wait(); | ||
|
||
return __first3 + __n; | ||
|
@@ -1564,9 +1597,10 @@ __pattern_reverse(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Iterato | |
auto __keep = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read_write, _Iterator>(); | ||
auto __buf = __keep(__first, __last); | ||
oneapi::dpl::__par_backend_hetero::__parallel_for( | ||
_BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), | ||
unseq_backend::__reverse_functor<typename ::std::iterator_traits<_Iterator>::difference_type>{__n}, __n / 2, | ||
__buf.all_view()) | ||
_BackendTag{}, std::forward<_ExecutionPolicy>(__exec), | ||
unseq_backend::__reverse_functor<typename std::iterator_traits<_Iterator>::difference_type, | ||
decltype(__buf.all_view())>{__n}, | ||
__n / 2, __buf.all_view()) | ||
.__deferrable_wait(); | ||
} | ||
|
||
|
@@ -1589,10 +1623,13 @@ __pattern_reverse_copy(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Bi | |
auto __keep2 = | ||
oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _ForwardIterator>(); | ||
auto __buf2 = __keep2(__result, __result + __n); | ||
auto __view1 = __buf1.all_view(); | ||
auto __view2 = __buf2.all_view(); | ||
oneapi::dpl::__par_backend_hetero::__parallel_for( | ||
_BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), | ||
unseq_backend::__reverse_copy<typename ::std::iterator_traits<_BidirectionalIterator>::difference_type>{__n}, | ||
__n, __buf1.all_view(), __buf2.all_view()) | ||
_BackendTag{}, std::forward<_ExecutionPolicy>(__exec), | ||
unseq_backend::__reverse_copy<typename std::iterator_traits<_BidirectionalIterator>::difference_type, | ||
decltype(__view1), decltype(__view2)>{__n}, | ||
__n, __view1, __view2) | ||
.__deferrable_wait(); | ||
|
||
return __result + __n; | ||
|
@@ -1626,24 +1663,27 @@ __pattern_rotate(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Iterator | |
auto __buf = __keep(__first, __last); | ||
auto __temp_buf = oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, _Tp>(__exec, __n); | ||
|
||
auto __view = __buf.all_view(); | ||
auto __temp_rng_w = | ||
oneapi::dpl::__ranges::all_view<_Tp, __par_backend_hetero::access_mode::write>(__temp_buf.get_buffer()); | ||
|
||
const auto __shift = __new_first - __first; | ||
oneapi::dpl::__par_backend_hetero::__parallel_for( | ||
_BackendTag{}, oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__rotate_wrapper>(__exec), | ||
unseq_backend::__rotate_copy<typename ::std::iterator_traits<_Iterator>::difference_type>{__n, __shift}, __n, | ||
__buf.all_view(), __temp_rng_w); | ||
unseq_backend::__rotate_copy<typename std::iterator_traits<_Iterator>::difference_type, decltype(__view), | ||
decltype(__temp_rng_w)>{__n, __shift}, | ||
__n, __view, __temp_rng_w); | ||
|
||
//An explicit wait isn't required here because we are working with a temporary sycl::buffer and sycl accessors and | ||
//SYCL runtime makes a dependency graph to prevent the races between two __parallel_for patterns. | ||
|
||
using _Function = __brick_move<__hetero_tag<_BackendTag>, _ExecutionPolicy>; | ||
auto __brick = unseq_backend::walk_n<_ExecutionPolicy, _Function>{_Function{}}; | ||
|
||
auto __temp_rng_rw = | ||
oneapi::dpl::__ranges::all_view<_Tp, __par_backend_hetero::access_mode::read_write>(__temp_buf.get_buffer()); | ||
oneapi::dpl::__par_backend_hetero::__parallel_for(_BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), __brick, | ||
auto __brick = | ||
unseq_backend::walk2_vectors_or_scalars<_ExecutionPolicy, _Function, decltype(__temp_rng_rw), | ||
decltype(__buf.all_view())>{_Function{}, static_cast<std::size_t>(__n)}; | ||
oneapi::dpl::__par_backend_hetero::__parallel_for(_BackendTag{}, std::forward<_ExecutionPolicy>(__exec), __brick, | ||
__n, __temp_rng_rw, __buf.all_view()) | ||
.__deferrable_wait(); | ||
|
||
|
@@ -1673,13 +1713,16 @@ __pattern_rotate_copy(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Bid | |
oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _ForwardIterator>(); | ||
auto __buf2 = __keep2(__result, __result + __n); | ||
|
||
auto __view1 = __buf1.all_view(); | ||
auto __view2 = __buf2.all_view(); | ||
|
||
const auto __shift = __new_first - __first; | ||
|
||
oneapi::dpl::__par_backend_hetero::__parallel_for( | ||
_BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), | ||
unseq_backend::__rotate_copy<typename ::std::iterator_traits<_BidirectionalIterator>::difference_type>{__n, | ||
__shift}, | ||
__n, __buf1.all_view(), __buf2.all_view()) | ||
unseq_backend::__rotate_copy<typename ::std::iterator_traits<_BidirectionalIterator>::difference_type, | ||
decltype(__view1), decltype(__view2)>{__n, __shift}, | ||
__n, __view1, __view2) | ||
.__deferrable_wait(); | ||
|
||
return __result + __n; | ||
|
@@ -1936,19 +1979,22 @@ __pattern_shift_left(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Rang | |
if (__n >= __mid) | ||
{ | ||
using _Function = __brick_move<__hetero_tag<_BackendTag>, _ExecutionPolicy>; | ||
auto __brick = oneapi::dpl::unseq_backend::walk_n<_ExecutionPolicy, _Function>{_Function{}}; | ||
|
||
//TODO: to consider use just "read" access mode for a source range and just "write" - for a destination range. | ||
auto __src = oneapi::dpl::__ranges::drop_view_simple<_Range, _DiffType>(__rng, __n); | ||
auto __dst = oneapi::dpl::__ranges::take_view_simple<_Range, _DiffType>(__rng, __size_res); | ||
|
||
auto __brick = | ||
unseq_backend::walk2_vectors_or_scalars<_ExecutionPolicy, _Function, decltype(__src), decltype(__dst)>{ | ||
_Function{}, static_cast<std::size_t>(__size_res)}; | ||
|
||
oneapi::dpl::__par_backend_hetero::__parallel_for(_BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), | ||
__brick, __size_res, __src, __dst) | ||
.__deferrable_wait(); | ||
} | ||
else //2. n < size/2; 'n' parallel copying | ||
{ | ||
auto __brick = unseq_backend::__brick_shift_left<_ExecutionPolicy, _DiffType>{__size, __n}; | ||
auto __brick = unseq_backend::__brick_shift_left<_ExecutionPolicy, _DiffType, decltype(__rng)>{__size, __n}; | ||
oneapi::dpl::__par_backend_hetero::__parallel_for( | ||
_BackendTag{}, | ||
oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__shift_left_right>( | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Lets fix the naming of this while were touching all its instances
__custom_brick
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It seems that the historical convention within the
internal/
directory is to not use any leading underscores although it has changed a bit over time.I do not have a strong preference if we make this change or leave it as is, but maybe it fits in a broader discussion regarding the remaining implementations in this directory.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm not sure if there is compelling reason other than resistance to making purely cosmetic changes in the changelog to have a different convention here. This is why I suggest adjusting it while we are already touching all (or most) instances of it. Perhaps someone with a longer historical knowledge of this code could chime in here if there is a reason to keep this with different conventions.
Not super important to me, so optional nitpick.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I will wait a bit longer to see if anyone has objections. If not, then I will add this suggestion.