Improve SYCL backend __parallel_for performance for large input sizes #1870

Closed
wants to merge 26 commits

Changes from all commits (26 commits):
e0e03bd - Optimize memory transactions in SYCL backend parallel for (mmichel11, Sep 5, 2024)
9f3384a - clang-format (mmichel11, Sep 5, 2024)
a244bbb - Correct comment and error handling. (mmichel11, Sep 6, 2024)
ad05086 - __num_groups bugfix (mmichel11, Sep 10, 2024)
bb83642 - Introduce stride recommender for different targets and better distrib… (mmichel11, Sep 16, 2024)
3dbdcec - Cleanup (mmichel11, Sep 16, 2024)
f53ae6c - Unroll loop if possible (mmichel11, Sep 18, 2024)
0bf77a7 - Revert "Unroll loop if possible" (mmichel11, Sep 18, 2024)
7768aff - Use a small and large kernel in parallel for (mmichel11, Sep 20, 2024)
4316e07 - Improve __iters_per_work_item heuristic. (mmichel11, Sep 20, 2024)
d2cf632 - Code cleanup (mmichel11, Sep 20, 2024)
4eeaf97 - Clang format (mmichel11, Sep 23, 2024)
b717ac7 - Update comments (mmichel11, Sep 23, 2024)
839f1ad - Bugfix in comment (mmichel11, Sep 23, 2024)
b347606 - More cleanup and better handle non-full case (mmichel11, Sep 23, 2024)
72a0941 - Rename __ndi to __item for consistency with codebase (mmichel11, Sep 24, 2024)
9f82b11 - Update all comments on kernel naming trick (mmichel11, Sep 24, 2024)
b5f0021 - Handle non-full case in a cleaner way (mmichel11, Sep 24, 2024)
5519dac - Switch min tuple type utility to return size of type (mmichel11, Sep 24, 2024)
2d985a5 - Remove unnecessary template parameter (mmichel11, Sep 24, 2024)
6ab46ad - Make non-template function inline for ODR compliance (mmichel11, Sep 24, 2024)
e3e05a7 - If the iters per work item is 1, then only compile the basic pfor kernel (mmichel11, Sep 24, 2024)
8f9c5bb - Address several PR comments (mmichel11, Sep 25, 2024)
55623b2 - Remove free function __stride_recommender (mmichel11, Sep 25, 2024)
39b572f - Accept ranges as forwarding references in __parallel_for_large_submitter (mmichel11, Sep 25, 2024)
33337f8 - Address reviewer comments (mmichel11, Nov 6, 2024)
@@ -27,7 +27,7 @@ namespace oneapi::dpl::experimental::kt::gpu::esimd::__impl
{

//------------------------------------------------------------------------
// Please see the comment for __parallel_for_submitter for optional kernel name explanation
// Please see the comment above __parallel_for_small_submitter for optional kernel name explanation
//------------------------------------------------------------------------

template <bool __is_ascending, ::std::uint8_t __radix_bits, ::std::uint16_t __data_per_work_item,
154 changes: 146 additions & 8 deletions include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
@@ -28,10 +28,12 @@
#include <cmath>
#include <limits>
#include <cstdint>
#include <tuple>

#include "../../iterator_impl.h"
#include "../../execution_impl.h"
#include "../../utils_ranges.h"
#include "../../utils.h"

#include "sycl_defs.h"
#include "parallel_backend_sycl_utils.h"
@@ -214,6 +216,12 @@ class __scan_single_wg_dynamic_kernel;
template <typename... Name>
class __scan_copy_single_wg_kernel;

template <typename... Name>
class __parallel_for_small_kernel;

template <typename... Name>
class __parallel_for_large_kernel;

//------------------------------------------------------------------------
// parallel_for - async pattern
//------------------------------------------------------------------------
@@ -222,10 +230,10 @@ class __scan_copy_single_wg_kernel;
// as the parameter pack that can be empty (for unnamed kernels) or contain exactly one
// type (for explicitly specified name by the user)
template <typename _KernelName>
struct __parallel_for_submitter;
struct __parallel_for_small_submitter;

template <typename... _Name>
struct __parallel_for_submitter<__internal::__optional_kernel_name<_Name...>>
struct __parallel_for_small_submitter<__internal::__optional_kernel_name<_Name...>>
{
template <typename _ExecutionPolicy, typename _Fp, typename _Index, typename... _Ranges>
auto
@@ -246,6 +254,120 @@ struct __parallel_for_submitter<__internal::__optional_kernel_name<_Name...>>
}
};

template <typename _KernelName, typename... _RangeTypes>
struct __parallel_for_large_submitter;

template <typename... _Name, typename... _RangeTypes>
struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name...>, _RangeTypes...>
{
static constexpr std::size_t __min_type_size = oneapi::dpl::__internal::__min_nested_type_size<
std::tuple<oneapi::dpl::__internal::__value_t<_RangeTypes>...>>::value;
// __iters_per_work_item is set to 1, 2, 4, 8, or 16 depending on the smallest type in the
// flattened ranges. This allows us to launch enough work per item to saturate the device's memory
// bandwidth. This heuristic errs on the side of launching more work per item than what is needed to
// achieve full bandwidth utilization. 16 bytes per range per work item has been found as a good
// value across the different for-based algorithms.
static constexpr std::uint8_t __bytes_per_work_item = 16;
static constexpr std::uint8_t __iters_per_work_item =
oneapi::dpl::__internal::__dpl_ceiling_div(__bytes_per_work_item, __min_type_size);
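As an editorial aside (not part of the diff), the heuristic above can be checked with a couple of constexpr computations; the float/short zip below is a hypothetical input, and on typical targets sizeof(short) is 2 bytes:

    // Illustration only: how __iters_per_work_item falls out of the heuristic
    // for a hypothetical zip of a float range and a short range.
    constexpr int __example_bytes_per_work_item = 16;
    constexpr int __example_min_type_size = sizeof(short); // smallest flattened type
    constexpr int __example_iters_per_work_item =
        (__example_bytes_per_work_item + __example_min_type_size - 1) / __example_min_type_size; // ceiling division
    static_assert(__example_iters_per_work_item == 8,
                  "each work item covers 8 elements of the zipped float/short ranges");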
// Limit the work-group size to 512 which has empirically yielded the best results across different architectures.
static constexpr std::uint16_t __max_work_group_size = 512;

// SPIR-V compilation targets show best performance with a stride of the sub-group size.
// Other compilation targets perform best with a work-group size stride. This utility can only be called from the
// device.
static inline std::tuple<std::size_t, std::size_t, bool>
__stride_recommender(const sycl::nd_item<1>& __item, std::size_t __count, std::size_t __iters_per_work_item,
std::size_t __work_group_size)
{
if constexpr (oneapi::dpl::__internal::__is_spirv_target_v)
Review comment: Could you include utils.h where __is_spirv_target_v is defined?
Author reply: Done.

{
const __dpl_sycl::__sub_group __sub_group = __item.get_sub_group();
const std::uint32_t __sub_group_size = __sub_group.get_local_linear_range();
Review comment: All these functions return std::size_t. Could you please explain why you are using std::uint32_t instead?
Author reply: The signature of the sub-group member function is uint32_t get_local_linear_range() const, and the related functions also return uint32_t. Were you thinking about the group class, maybe?

const std::uint32_t __sub_group_id = __sub_group.get_group_linear_id();
const std::uint32_t __sub_group_local_id = __sub_group.get_local_linear_id();
const std::size_t __work_group_id = __item.get_group().get_group_linear_id();

const std::size_t __sub_group_start_idx =
__iters_per_work_item * (__work_group_id * __work_group_size + __sub_group_size * __sub_group_id);
const bool __is_full_sub_group =
__sub_group_start_idx + __iters_per_work_item * __sub_group_size <= __count;
const std::size_t __work_item_idx = __sub_group_start_idx + __sub_group_local_id;
return std::make_tuple(__work_item_idx, __sub_group_size, __is_full_sub_group);
Review comment (dmitriy-sobolev, Oct 2, 2024): Could you include <tuple>?
Author reply: Done.

Review comment: From my point of view, the usage of std::make_tuple for primitive types doesn't make sense at all.
Author reply: Just std::tuple is now used in the new PR.

}
else
{
const std::size_t __work_group_start_idx =
__item.get_group().get_group_linear_id() * __work_group_size * __iters_per_work_item;
const std::size_t __work_item_idx = __work_group_start_idx + __item.get_local_linear_id();
const bool __is_full_work_group =
__work_group_start_idx + __iters_per_work_item * __work_group_size <= __count;
return std::make_tuple(__work_item_idx, __work_group_size, __is_full_work_group);
Review comment: The same.
Author reply: Just std::tuple is now used in the new PR.

}
}

// Once there is enough work to launch a group on each compute unit with our chosen __iters_per_work_item,
// then we should start using this code path.
template <typename _ExecutionPolicy>
static std::size_t
__estimate_best_start_size(const _ExecutionPolicy& __exec)
{
const std::size_t __work_group_size =
oneapi::dpl::__internal::__max_work_group_size(__exec, __max_work_group_size);
const std::uint32_t __max_cu = oneapi::dpl::__internal::__max_compute_units(__exec);
return __work_group_size * __iters_per_work_item * __max_cu;
}
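For a rough sense of the resulting threshold, here is an illustrative check assuming a hypothetical device with 64 compute units, the 512-item work-group cap above, and 4-byte elements (so __iters_per_work_item == 4):

    // Illustration only: the large-kernel path starts at 512 * 4 * 64 = 131072 elements
    // on this hypothetical device.
    constexpr int __example_threshold = 512 * 4 * 64;
    static_assert(__example_threshold == 131072,
                  "inputs of at least this size take the large-kernel path");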

template <typename _ExecutionPolicy, typename _Fp, typename _Index, typename... _Ranges>
auto
operator()(_ExecutionPolicy&& __exec, _Fp __brick, _Index __count, _Ranges&&... __rngs) const
{
assert(oneapi::dpl::__ranges::__get_first_range_size(__rngs...) > 0);
_PRINT_INFO_IN_DEBUG_MODE(__exec);
auto __event = __exec.queue().submit([&__rngs..., &__brick, &__exec, __count](sycl::handler& __cgh) {
Review comment: I don't think we have any other case where we capture the policy inside the submit call. Might it be better to evaluate

            const std::size_t __work_group_size =
                oneapi::dpl::__internal::__max_work_group_size(__exec, __max_work_group_size);

outside of submit and capture __work_group_size by value?
Author reply: Sure, applied to the new PR (this one will be closed soon).

//get an access to data under SYCL buffer:
oneapi::dpl::__ranges::__require_access(__cgh, __rngs...);
const std::size_t __work_group_size =
oneapi::dpl::__internal::__max_work_group_size(__exec, __max_work_group_size);
const std::size_t __num_groups =
oneapi::dpl::__internal::__dpl_ceiling_div(__count, (__work_group_size * __iters_per_work_item));
const std::size_t __num_items = __num_groups * __work_group_size;
__cgh.parallel_for<_Name...>(
sycl::nd_range(sycl::range<1>(__num_items), sycl::range<1>(__work_group_size)),
Review comment on lines +334 to +336, suggested change:

-            const std::size_t __num_items = __num_groups * __work_group_size;
-            __cgh.parallel_for<_Name...>(
-                sycl::nd_range(sycl::range<1>(__num_items), sycl::range<1>(__work_group_size)),
+            __cgh.parallel_for<_Name...>(
+                sycl::nd_range(sycl::range<1>(__num_groups * __work_group_size), sycl::range<1>(__work_group_size)),

It's more readable from my point of view.
Author reply: Agreed, it is in the new PR.

[=](sycl::nd_item</*dim=*/1> __item) {
// TODO: Investigate adding a vectorized path similar to reduce.
// Initial investigation showed benefits for in-place for-based algorithms (e.g. std::for_each) but
// performance regressions for out-of-place (e.g. std::copy) where the compiler was unable to
// vectorize our code. Vectorization may also improve performance of for-algorithms over small data
// types.
auto [__idx, __stride, __is_full] =
__stride_recommender(__item, __count, __iters_per_work_item, __work_group_size);
if (__is_full)
{
_ONEDPL_PRAGMA_UNROLL
for (std::uint8_t __i = 0; __i < __iters_per_work_item; ++__i)
{
__brick(__idx, __rngs...);
__idx += __stride;
}
}
// If we are not full, then take this branch only if there is work to process.
else if (__idx < __count)
{
const std::uint8_t __adjusted_iters_per_work_item =
oneapi::dpl::__internal::__dpl_ceiling_div(__count - __idx, __stride);
for (std::uint8_t __i = 0; __i < __adjusted_iters_per_work_item; ++__i)
Review comment: Should we use _ONEDPL_PRAGMA_UNROLL for this for-loop too?
Author reply: Not in this case (there is a similar case in the strided loop utility in the new PR). Because the loop end variable is computed at run time, the loop cannot be unrolled. This path is called for the last sub-group / work-group only, so the performance impact is negligible.

{
__brick(__idx, __rngs...);
__idx += __stride;
}
}
});
});
return __future(__event);
}
};
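The indexing above can be sanity-checked off-device. Below is a self-contained host-side model (an editorial illustration, not part of the patch) of the non-SPIR-V branch of __stride_recommender, i.e. a work-group-sized stride, combined with the kernel loop; it verifies that every element of a non-multiple-sized input is processed exactly once, including the partial last work group.

// Host-side model (illustration only) of the work-group-stride indexing.
#include <cassert>
#include <cstddef>
#include <vector>

int main()
{
    const std::size_t count = 1000; // deliberately not a multiple of the per-group chunk
    const std::size_t work_group_size = 8;
    const std::size_t iters_per_work_item = 4;
    const std::size_t chunk = work_group_size * iters_per_work_item;
    const std::size_t num_groups = (count + chunk - 1) / chunk; // same formula as __num_groups

    std::vector<int> hits(count, 0);
    for (std::size_t group = 0; group < num_groups; ++group)
    {
        const std::size_t group_start = group * chunk;
        const bool is_full = group_start + chunk <= count;
        for (std::size_t local = 0; local < work_group_size; ++local)
        {
            std::size_t idx = group_start + local; // corresponds to __work_item_idx
            if (is_full)
            {
                for (std::size_t i = 0; i < iters_per_work_item; ++i, idx += work_group_size)
                    ++hits[idx];
            }
            else if (idx < count)
            {
                // remaining elements divided by the stride, rounded up
                const std::size_t adjusted = (count - idx + work_group_size - 1) / work_group_size;
                for (std::size_t i = 0; i < adjusted; ++i, idx += work_group_size)
                    ++hits[idx];
            }
        }
    }
    for (int h : hits)
        assert(h == 1); // each element visited exactly once
    return 0;
}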

//General version of parallel_for, one additional parameter - __count of iterations of loop __cgh.parallel_for,
//for some algorithms happens that size of processing range is n, but amount of iterations is n/2.
template <typename _ExecutionPolicy, typename _Fp, typename _Index, typename... _Ranges>
@@ -254,17 +376,33 @@ __parallel_for(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&&
_Ranges&&... __rngs)
{
using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>;
using _ForKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<_CustomName>;

return __parallel_for_submitter<_ForKernel>()(::std::forward<_ExecutionPolicy>(__exec), __brick, __count,
::std::forward<_Ranges>(__rngs)...);
using _ForKernelSmall =
oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<__parallel_for_small_kernel<_CustomName>>;
using _ForKernelLarge =
oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<__parallel_for_large_kernel<_CustomName>>;

using __small_submitter = __parallel_for_small_submitter<_ForKernelSmall>;
using __large_submitter = __parallel_for_large_submitter<_ForKernelLarge, _Ranges...>;
// Compile two kernels: one for small-to-medium inputs and a second for large. This avoids runtime checks within a
// single kernel that worsen performance for small cases. If the number of iterations of the large submitter is 1,
// then only compile the basic kernel as the two versions are effectively the same.
if constexpr (__large_submitter::__iters_per_work_item > 1)
Review comment: Can we combine this if constexpr with the next if? Is there a real benefit to keeping the two conditions separate?
Author reply: Combining the two would make it a runtime conditional. Even if __iters_per_work_item is known at compile time and the compiler can optimize it out, there may still be a chance for the kernel to be compiled unnecessarily. I think it is best to keep the if constexpr, so we can be sure to avoid compiling the large submitter whenever possible.

{
if (__count >= __large_submitter::__estimate_best_start_size(__exec))
{
return __large_submitter{}(std::forward<_ExecutionPolicy>(__exec), __brick, __count,
std::forward<_Ranges>(__rngs)...);
}
}
return __small_submitter{}(std::forward<_ExecutionPolicy>(__exec), __brick, __count,
std::forward<_Ranges>(__rngs)...);
}
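A standalone sketch of that dispatch shape, with illustrative names only (not oneDPL code): an outer if constexpr keeps the large kernel out of the binary when the per-item iteration count is 1, and an inner runtime check picks the submitter by input size.

#include <cstddef>
#include <cstdio>

template <std::size_t _ItersPerWorkItem>
void
dispatch(std::size_t count, std::size_t threshold)
{
    if constexpr (_ItersPerWorkItem > 1)
    {
        if (count >= threshold)
        {
            std::puts("large-input kernel");
            return;
        }
    }
    std::puts("small-input kernel");
}

int main()
{
    dispatch<8>(1 << 20, 131072); // prints "large-input kernel"
    dispatch<8>(1024, 131072);    // prints "small-input kernel"
    dispatch<1>(1 << 20, 131072); // prints "small-input kernel"; the large path is never instantiated
    return 0;
}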

//------------------------------------------------------------------------
// parallel_transform_scan - async pattern
//------------------------------------------------------------------------

// Please see the comment for __parallel_for_submitter for optional kernel name explanation
// Please see the comment above __parallel_for_small_submitter for optional kernel name explanation
template <typename _CustomName, typename _PropagateScanName>
struct __parallel_scan_submitter;

@@ -1972,7 +2110,7 @@ struct __partial_merge_kernel
}
};

// Please see the comment for __parallel_for_submitter for optional kernel name explanation
// Please see the comment above __parallel_for_small_submitter for optional kernel name explanation
template <typename _GlobalSortName, typename _CopyBackName>
struct __parallel_partial_sort_submitter;

@@ -48,7 +48,7 @@ namespace __par_backend_hetero
//General version of parallel_for, one additional parameter - __count of iterations of loop __cgh.parallel_for,
//for some algorithms happens that size of processing range is n, but amount of iterations is n/2.

// Please see the comment for __parallel_for_submitter for optional kernel name explanation
// Please see the comment above __parallel_for_small_submitter for optional kernel name explanation
template <typename _Name>
struct __parallel_for_fpga_submitter;

@@ -129,7 +129,7 @@ __serial_merge(const _Rng1& __rng1, const _Rng2& __rng2, _Rng3& __rng3, _Index _
}
}

// Please see the comment for __parallel_for_submitter for optional kernel name explanation
// Please see the comment above __parallel_for_small_submitter for optional kernel name explanation
template <typename _IdType, typename _Name>
struct __parallel_merge_submitter;

@@ -111,7 +111,7 @@ __device_reduce_kernel(const _NDItemId __item_id, const _Size __n, const _Size _

//------------------------------------------------------------------------
// parallel_transform_reduce - async patterns
// Please see the comment for __parallel_for_submitter for optional kernel name explanation
// Please see the comment above __parallel_for_small_submitter for optional kernel name explanation
//------------------------------------------------------------------------

// Parallel_transform_reduce for a small arrays using a single work group.
16 changes: 16 additions & 0 deletions include/oneapi/dpl/pstl/utils.h
@@ -25,6 +25,7 @@
#include <iterator>
#include <functional>
#include <type_traits>
#include <algorithm>

#if _ONEDPL_BACKEND_SYCL
# include "hetero/dpcpp/sycl_defs.h"
@@ -784,6 +785,21 @@ union __lazy_ctor_storage
}
};

// Returns the smallest type within a set of potentially nested template types.
// E.g. If we consider the type: T = tuple<float, tuple<short, long>, int, double>,
// then __min_nested_type_size<T>::value returns sizeof(short).
template <typename _T>
struct __min_nested_type_size
{
constexpr static std::size_t value = sizeof(_T);
};

template <template <typename...> typename _WrapperType, typename... _Ts>
struct __min_nested_type_size<_WrapperType<_Ts...>>
{
constexpr static std::size_t value = std::min({__min_nested_type_size<_Ts>::value...});
};
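A couple of static_asserts illustrate the intended behavior (an editorial sketch, not part of the patch; it assumes <tuple> is available in the translation unit):

// Illustration only: the utility recurses into nested template types and
// yields the sizeof of the smallest leaf type.
static_assert(__min_nested_type_size<std::tuple<float, std::tuple<short, long>, int, double>>::value ==
                  sizeof(short),
              "smallest nested type is short");
static_assert(__min_nested_type_size<double>::value == sizeof(double),
              "non-template types report their own size");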

} // namespace __internal
} // namespace dpl
} // namespace oneapi