diff --git a/include/oneapi/dpl/experimental/kt/internal/esimd_radix_sort_submitters.h b/include/oneapi/dpl/experimental/kt/internal/esimd_radix_sort_submitters.h index 4d7b81e6a2e..4fc274f2445 100644 --- a/include/oneapi/dpl/experimental/kt/internal/esimd_radix_sort_submitters.h +++ b/include/oneapi/dpl/experimental/kt/internal/esimd_radix_sort_submitters.h @@ -27,7 +27,7 @@ namespace oneapi::dpl::experimental::kt::gpu::esimd::__impl { //------------------------------------------------------------------------ -// Please see the comment for __parallel_for_submitter for optional kernel name explanation +// Please see the comment above __parallel_for_small_submitter for optional kernel name explanation //------------------------------------------------------------------------ template #include #include +#include #include "../../iterator_impl.h" #include "../../execution_impl.h" #include "../../utils_ranges.h" +#include "../../utils.h" #include "sycl_defs.h" #include "parallel_backend_sycl_utils.h" @@ -214,6 +216,12 @@ class __scan_single_wg_dynamic_kernel; template class __scan_copy_single_wg_kernel; +template +class __parallel_for_small_kernel; + +template +class __parallel_for_large_kernel; + //------------------------------------------------------------------------ // parallel_for - async pattern //------------------------------------------------------------------------ @@ -222,10 +230,10 @@ class __scan_copy_single_wg_kernel; // as the parameter pack that can be empty (for unnamed kernels) or contain exactly one // type (for explicitly specified name by the user) template -struct __parallel_for_submitter; +struct __parallel_for_small_submitter; template -struct __parallel_for_submitter<__internal::__optional_kernel_name<_Name...>> +struct __parallel_for_small_submitter<__internal::__optional_kernel_name<_Name...>> { template auto @@ -246,6 +254,120 @@ struct __parallel_for_submitter<__internal::__optional_kernel_name<_Name...>> } }; +template +struct __parallel_for_large_submitter; + +template +struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name...>, _RangeTypes...> +{ + static constexpr std::size_t __min_type_size = oneapi::dpl::__internal::__min_nested_type_size< + std::tuple...>>::value; + // __iters_per_work_item is set to 1, 2, 4, 8, or 16 depending on the smallest type in the + // flattened ranges. This allows us to launch enough work per item to saturate the device's memory + // bandwidth. This heuristic errs on the side of launching more work per item than what is needed to + // achieve full bandwidth utilization. 16 bytes per range per work item has been found as a good + // value across the different for-based algorithms. + static constexpr std::uint8_t __bytes_per_work_item = 16; + static constexpr std::uint8_t __iters_per_work_item = + oneapi::dpl::__internal::__dpl_ceiling_div(__bytes_per_work_item, __min_type_size); + // Limit the work-group size to 512 which has empirically yielded the best results across different architectures. + static constexpr std::uint16_t __max_work_group_size = 512; + + // SPIR-V compilation targets show best performance with a stride of the sub-group size. + // Other compilation targets perform best with a work-group size stride. This utility can only be called from the + // device. + static inline std::tuple + __stride_recommender(const sycl::nd_item<1>& __item, std::size_t __count, std::size_t __iters_per_work_item, + std::size_t __work_group_size) + { + if constexpr (oneapi::dpl::__internal::__is_spirv_target_v) + { + const __dpl_sycl::__sub_group __sub_group = __item.get_sub_group(); + const std::uint32_t __sub_group_size = __sub_group.get_local_linear_range(); + const std::uint32_t __sub_group_id = __sub_group.get_group_linear_id(); + const std::uint32_t __sub_group_local_id = __sub_group.get_local_linear_id(); + const std::size_t __work_group_id = __item.get_group().get_group_linear_id(); + + const std::size_t __sub_group_start_idx = + __iters_per_work_item * (__work_group_id * __work_group_size + __sub_group_size * __sub_group_id); + const bool __is_full_sub_group = + __sub_group_start_idx + __iters_per_work_item * __sub_group_size <= __count; + const std::size_t __work_item_idx = __sub_group_start_idx + __sub_group_local_id; + return std::make_tuple(__work_item_idx, __sub_group_size, __is_full_sub_group); + } + else + { + const std::size_t __work_group_start_idx = + __item.get_group().get_group_linear_id() * __work_group_size * __iters_per_work_item; + const std::size_t __work_item_idx = __work_group_start_idx + __item.get_local_linear_id(); + const bool __is_full_work_group = + __work_group_start_idx + __iters_per_work_item * __work_group_size <= __count; + return std::make_tuple(__work_item_idx, __work_group_size, __is_full_work_group); + } + } + + // Once there is enough work to launch a group on each compute unit with our chosen __iters_per_item, + // then we should start using this code path. + template + static std::size_t + __estimate_best_start_size(const _ExecutionPolicy& __exec) + { + const std::size_t __work_group_size = + oneapi::dpl::__internal::__max_work_group_size(__exec, __max_work_group_size); + const std::uint32_t __max_cu = oneapi::dpl::__internal::__max_compute_units(__exec); + return __work_group_size * __iters_per_work_item * __max_cu; + } + + template + auto + operator()(_ExecutionPolicy&& __exec, _Fp __brick, _Index __count, _Ranges&&... __rngs) const + { + assert(oneapi::dpl::__ranges::__get_first_range_size(__rngs...) > 0); + _PRINT_INFO_IN_DEBUG_MODE(__exec); + auto __event = __exec.queue().submit([&__rngs..., &__brick, &__exec, __count](sycl::handler& __cgh) { + //get an access to data under SYCL buffer: + oneapi::dpl::__ranges::__require_access(__cgh, __rngs...); + const std::size_t __work_group_size = + oneapi::dpl::__internal::__max_work_group_size(__exec, __max_work_group_size); + const std::size_t __num_groups = + oneapi::dpl::__internal::__dpl_ceiling_div(__count, (__work_group_size * __iters_per_work_item)); + const std::size_t __num_items = __num_groups * __work_group_size; + __cgh.parallel_for<_Name...>( + sycl::nd_range(sycl::range<1>(__num_items), sycl::range<1>(__work_group_size)), + [=](sycl::nd_item __item) { + // TODO: Investigate adding a vectorized path similar to reduce. + // Initial investigation showed benefits for in-place for-based algorithms (e.g. std::for_each) but + // performance regressions for out-of-place (e.g. std::copy) where the compiler was unable to + // vectorize our code. Vectorization may also improve performance of for-algorithms over small data + // types. + auto [__idx, __stride, __is_full] = + __stride_recommender(__item, __count, __iters_per_work_item, __work_group_size); + if (__is_full) + { + _ONEDPL_PRAGMA_UNROLL + for (std::uint8_t __i = 0; __i < __iters_per_work_item; ++__i) + { + __brick(__idx, __rngs...); + __idx += __stride; + } + } + // If we are not full, then take this branch only if there is work to process. + else if (__idx < __count) + { + const std::uint8_t __adjusted_iters_per_work_item = + oneapi::dpl::__internal::__dpl_ceiling_div(__count - __idx, __stride); + for (std::uint8_t __i = 0; __i < __adjusted_iters_per_work_item; ++__i) + { + __brick(__idx, __rngs...); + __idx += __stride; + } + } + }); + }); + return __future(__event); + } +}; + //General version of parallel_for, one additional parameter - __count of iterations of loop __cgh.parallel_for, //for some algorithms happens that size of processing range is n, but amount of iterations is n/2. template @@ -254,17 +376,33 @@ __parallel_for(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& _Ranges&&... __rngs) { using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; - using _ForKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<_CustomName>; - - return __parallel_for_submitter<_ForKernel>()(::std::forward<_ExecutionPolicy>(__exec), __brick, __count, - ::std::forward<_Ranges>(__rngs)...); + using _ForKernelSmall = + oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<__parallel_for_small_kernel<_CustomName>>; + using _ForKernelLarge = + oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<__parallel_for_large_kernel<_CustomName>>; + + using __small_submitter = __parallel_for_small_submitter<_ForKernelSmall>; + using __large_submitter = __parallel_for_large_submitter<_ForKernelLarge, _Ranges...>; + // Compile two kernels: one for small-to-medium inputs and a second for large. This avoids runtime checks within a + // single kernel that worsen performance for small cases. If the number of iterations of the large submitter is 1, + // then only compile the basic kernel as the two versions are effectively the same. + if constexpr (__large_submitter::__iters_per_work_item > 1) + { + if (__count >= __large_submitter::__estimate_best_start_size(__exec)) + { + return __large_submitter{}(std::forward<_ExecutionPolicy>(__exec), __brick, __count, + std::forward<_Ranges>(__rngs)...); + } + } + return __small_submitter{}(std::forward<_ExecutionPolicy>(__exec), __brick, __count, + std::forward<_Ranges>(__rngs)...); } //------------------------------------------------------------------------ // parallel_transform_scan - async pattern //------------------------------------------------------------------------ -// Please see the comment for __parallel_for_submitter for optional kernel name explanation +// Please see the comment above __parallel_for_small_submitter for optional kernel name explanation template struct __parallel_scan_submitter; @@ -1972,7 +2110,7 @@ struct __partial_merge_kernel } }; -// Please see the comment for __parallel_for_submitter for optional kernel name explanation +// Please see the comment above __parallel_for_small_submitter for optional kernel name explanation template struct __parallel_partial_sort_submitter; diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_fpga.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_fpga.h index 7baee78b1b1..3be82fdc623 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_fpga.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_fpga.h @@ -48,7 +48,7 @@ namespace __par_backend_hetero //General version of parallel_for, one additional parameter - __count of iterations of loop __cgh.parallel_for, //for some algorithms happens that size of processing range is n, but amount of iterations is n/2. -// Please see the comment for __parallel_for_submitter for optional kernel name explanation +// Please see the comment above __parallel_for_small_submitter for optional kernel name explanation template struct __parallel_for_fpga_submitter; diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 753e32816a0..280bb5181bd 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -129,7 +129,7 @@ __serial_merge(const _Rng1& __rng1, const _Rng2& __rng2, _Rng3& __rng3, _Index _ } } -// Please see the comment for __parallel_for_submitter for optional kernel name explanation +// Please see the comment above __parallel_for_small_submitter for optional kernel name explanation template struct __parallel_merge_submitter; diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h index edad63d2a79..23e38268bf9 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h @@ -111,7 +111,7 @@ __device_reduce_kernel(const _NDItemId __item_id, const _Size __n, const _Size _ //------------------------------------------------------------------------ // parallel_transform_reduce - async patterns -// Please see the comment for __parallel_for_submitter for optional kernel name explanation +// Please see the comment above __parallel_for_small_submitter for optional kernel name explanation //------------------------------------------------------------------------ // Parallel_transform_reduce for a small arrays using a single work group. diff --git a/include/oneapi/dpl/pstl/utils.h b/include/oneapi/dpl/pstl/utils.h index 8a8dfdae1bc..1848d33eaea 100644 --- a/include/oneapi/dpl/pstl/utils.h +++ b/include/oneapi/dpl/pstl/utils.h @@ -25,6 +25,7 @@ #include #include #include +#include #if _ONEDPL_BACKEND_SYCL # include "hetero/dpcpp/sycl_defs.h" @@ -784,6 +785,21 @@ union __lazy_ctor_storage } }; +// Returns the smallest type within a set of potentially nested template types. +// E.g. If we consider the type: T = tuple, int, double>, +// then __min_nested_type_size::value returns sizeof(short). +template +struct __min_nested_type_size +{ + constexpr static std::size_t value = sizeof(_T); +}; + +template