Improve SYCL backend __parallel_for performance for large input sizes #1870

Closed
wants to merge 26 commits

Changes from all commits (26 commits):
e0e03bd - Optimize memory transactions in SYCL backend parallel for (mmichel11, Sep 5, 2024)
9f3384a - clang-format (mmichel11, Sep 5, 2024)
a244bbb - Correct comment and error handling. (mmichel11, Sep 6, 2024)
ad05086 - __num_groups bugfix (mmichel11, Sep 10, 2024)
bb83642 - Introduce stride recommender for different targets and better distrib… (mmichel11, Sep 16, 2024)
3dbdcec - Cleanup (mmichel11, Sep 16, 2024)
f53ae6c - Unroll loop if possible (mmichel11, Sep 18, 2024)
0bf77a7 - Revert "Unroll loop if possible" (mmichel11, Sep 18, 2024)
7768aff - Use a small and large kernel in parallel for (mmichel11, Sep 20, 2024)
4316e07 - Improve __iters_per_work_item heuristic. (mmichel11, Sep 20, 2024)
d2cf632 - Code cleanup (mmichel11, Sep 20, 2024)
4eeaf97 - Clang format (mmichel11, Sep 23, 2024)
b717ac7 - Update comments (mmichel11, Sep 23, 2024)
839f1ad - Bugfix in comment (mmichel11, Sep 23, 2024)
b347606 - More cleanup and better handle non-full case (mmichel11, Sep 23, 2024)
72a0941 - Rename __ndi to __item for consistency with codebase (mmichel11, Sep 24, 2024)
9f82b11 - Update all comments on kernel naming trick (mmichel11, Sep 24, 2024)
b5f0021 - Handle non-full case in a cleaner way (mmichel11, Sep 24, 2024)
5519dac - Switch min tuple type utility to return size of type (mmichel11, Sep 24, 2024)
2d985a5 - Remove unnecessary template parameter (mmichel11, Sep 24, 2024)
6ab46ad - Make non-template function inline for ODR compliance (mmichel11, Sep 24, 2024)
e3e05a7 - If the iters per work item is 1, then only compile the basic pfor kernel (mmichel11, Sep 24, 2024)
8f9c5bb - Address several PR comments (mmichel11, Sep 25, 2024)
55623b2 - Remove free function __stride_recommender (mmichel11, Sep 25, 2024)
39b572f - Accept ranges as forwarding references in __parallel_for_large_submitter (mmichel11, Sep 25, 2024)
33337f8 - Address reviewer comments (mmichel11, Nov 6, 2024)
@@ -27,7 +27,7 @@ namespace oneapi::dpl::experimental::kt::gpu::esimd::__impl
{

//------------------------------------------------------------------------
// Please see the comment for __parallel_for_submitter for optional kernel name explanation
// Please see the comment above __parallel_for_small_submitter for optional kernel name explanation
//------------------------------------------------------------------------

template <bool __is_ascending, ::std::uint8_t __radix_bits, ::std::uint16_t __data_per_work_item,
154 changes: 146 additions & 8 deletions include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
@@ -28,10 +28,12 @@
#include <cmath>
#include <limits>
#include <cstdint>
#include <tuple>

#include "../../iterator_impl.h"
#include "../../execution_impl.h"
#include "../../utils_ranges.h"
#include "../../utils.h"

#include "sycl_defs.h"
#include "parallel_backend_sycl_utils.h"
@@ -214,6 +216,12 @@ class __scan_single_wg_dynamic_kernel;
template <typename... Name>
class __scan_copy_single_wg_kernel;

template <typename... Name>
class __parallel_for_small_kernel;

template <typename... Name>
class __parallel_for_large_kernel;

//------------------------------------------------------------------------
// parallel_for - async pattern
//------------------------------------------------------------------------
@@ -222,10 +230,10 @@ class __scan_copy_single_wg_kernel;
// as the parameter pack that can be empty (for unnamed kernels) or contain exactly one
// type (for explicitly specified name by the user)
template <typename _KernelName>
struct __parallel_for_submitter;
struct __parallel_for_small_submitter;

template <typename... _Name>
struct __parallel_for_submitter<__internal::__optional_kernel_name<_Name...>>
struct __parallel_for_small_submitter<__internal::__optional_kernel_name<_Name...>>
{
template <typename _ExecutionPolicy, typename _Fp, typename _Index, typename... _Ranges>
auto
@@ -246,6 +254,120 @@ struct __parallel_for_submitter<__internal::__optional_kernel_name<_Name...>>
}
};

template <typename _KernelName, typename... _RangeTypes>
struct __parallel_for_large_submitter;

template <typename... _Name, typename... _RangeTypes>
struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name...>, _RangeTypes...>
{
static constexpr std::size_t __min_type_size = oneapi::dpl::__internal::__min_nested_type_size<
std::tuple<oneapi::dpl::__internal::__value_t<_RangeTypes>...>>::value;
// __iters_per_work_item is set to 1, 2, 4, 8, or 16 depending on the smallest type in the
// flattened ranges. This allows us to launch enough work per item to saturate the device's memory
// bandwidth. This heuristic errs on the side of launching more work per item than what is needed to
// achieve full bandwidth utilization. 16 bytes per range per work item has been found as a good
// value across the different for-based algorithms.
static constexpr std::uint8_t __bytes_per_work_item = 16;
static constexpr std::uint8_t __iters_per_work_item =
oneapi::dpl::__internal::__dpl_ceiling_div(__bytes_per_work_item, __min_type_size);
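As an editorial aside (not part of the diff), the heuristic above can be checked with a couple of constexpr computations; the float/short zip below is a hypothetical input, and on typical targets sizeof(short) is 2 bytes:

    // Illustration only: how __iters_per_work_item falls out of the heuristic
    // for a hypothetical zip of a float range and a short range.
    constexpr int __example_bytes_per_work_item = 16;
    constexpr int __example_min_type_size = sizeof(short); // smallest flattened type
    constexpr int __example_iters_per_work_item =
        (__example_bytes_per_work_item + __example_min_type_size - 1) / __example_min_type_size; // ceiling division
    static_assert(__example_iters_per_work_item == 8,
                  "each work item covers 8 elements of the zipped float/short ranges");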
// Limit the work-group size to 512 which has empirically yielded the best results across different architectures.
static constexpr std::uint16_t __max_work_group_size = 512;

// SPIR-V compilation targets show best performance with a stride of the sub-group size.
// Other compilation targets perform best with a work-group size stride. This utility can only be called from the
// device.
static inline std::tuple<std::size_t, std::size_t, bool>
__stride_recommender(const sycl::nd_item<1>& __item, std::size_t __count, std::size_t __iters_per_work_item,
std::size_t __work_group_size)
{
if constexpr (oneapi::dpl::__internal::__is_spirv_target_v)
Review comment: Could you include utils.h where __is_spirv_target_v is defined?
Author reply: Done.

{
const __dpl_sycl::__sub_group __sub_group = __item.get_sub_group();
const std::uint32_t __sub_group_size = __sub_group.get_local_linear_range();
Review comment: All these functions return std::size_t. Could you please explain why you are using std::uint32_t instead?
Author reply: The signature of the sub-group member function is uint32_t get_local_linear_range() const, and the related functions also return uint32_t. Were you thinking about the group class, maybe?

const std::uint32_t __sub_group_id = __sub_group.get_group_linear_id();
const std::uint32_t __sub_group_local_id = __sub_group.get_local_linear_id();
const std::size_t __work_group_id = __item.get_group().get_group_linear_id();

const std::size_t __sub_group_start_idx =
__iters_per_work_item * (__work_group_id * __work_group_size + __sub_group_size * __sub_group_id);
const bool __is_full_sub_group =
__sub_group_start_idx + __iters_per_work_item * __sub_group_size <= __count;
const std::size_t __work_item_idx = __sub_group_start_idx + __sub_group_local_id;
return std::make_tuple(__work_item_idx, __sub_group_size, __is_full_sub_group);
Review comment (dmitriy-sobolev, Oct 2, 2024): Could you include <tuple>?
Author reply: Done.

Review comment: From my point of view, the usage of std::make_tuple for primitive types doesn't make sense at all.
Author reply: Just std::tuple is now used in the new PR.

}
else
{
const std::size_t __work_group_start_idx =
__item.get_group().get_group_linear_id() * __work_group_size * __iters_per_work_item;
const std::size_t __work_item_idx = __work_group_start_idx + __item.get_local_linear_id();
const bool __is_full_work_group =
__work_group_start_idx + __iters_per_work_item * __work_group_size <= __count;
return std::make_tuple(__work_item_idx, __work_group_size, __is_full_work_group);
Review comment: The same.
Author reply: Just std::tuple is now used in the new PR.

}
}

// Once there is enough work to launch a group on each compute unit with our chosen __iters_per_work_item,
// then we should start using this code path.
template <typename _ExecutionPolicy>
static std::size_t
__estimate_best_start_size(const _ExecutionPolicy& __exec)
{
const std::size_t __work_group_size =
oneapi::dpl::__internal::__max_work_group_size(__exec, __max_work_group_size);
const std::uint32_t __max_cu = oneapi::dpl::__internal::__max_compute_units(__exec);
return __work_group_size * __iters_per_work_item * __max_cu;
}
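For a rough sense of the resulting threshold, here is an illustrative check assuming a hypothetical device with 64 compute units, the 512-item work-group cap above, and 4-byte elements (so __iters_per_work_item == 4):

    // Illustration only: the large-kernel path starts at 512 * 4 * 64 = 131072 elements
    // on this hypothetical device.
    constexpr int __example_threshold = 512 * 4 * 64;
    static_assert(__example_threshold == 131072,
                  "inputs of at least this size take the large-kernel path");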

template <typename _ExecutionPolicy, typename _Fp, typename _Index, typename... _Ranges>
auto
operator()(_ExecutionPolicy&& __exec, _Fp __brick, _Index __count, _Ranges&&... __rngs) const
{
assert(oneapi::dpl::__ranges::__get_first_range_size(__rngs...) > 0);
_PRINT_INFO_IN_DEBUG_MODE(__exec);
auto __event = __exec.queue().submit([&__rngs..., &__brick, &__exec, __count](sycl::handler& __cgh) {
Review comment: I don't think we have any other case where we capture the policy inside the submit call. Might it be better to evaluate

            const std::size_t __work_group_size =
                oneapi::dpl::__internal::__max_work_group_size(__exec, __max_work_group_size);

outside of submit and capture __work_group_size by value?
Author reply: Sure, applied to the new PR (this one will be closed soon).

//get an access to data under SYCL buffer:
oneapi::dpl::__ranges::__require_access(__cgh, __rngs...);
const std::size_t __work_group_size =
oneapi::dpl::__internal::__max_work_group_size(__exec, __max_work_group_size);
const std::size_t __num_groups =
oneapi::dpl::__internal::__dpl_ceiling_div(__count, (__work_group_size * __iters_per_work_item));
const std::size_t __num_items = __num_groups * __work_group_size;
__cgh.parallel_for<_Name...>(
sycl::nd_range(sycl::range<1>(__num_items), sycl::range<1>(__work_group_size)),
Review comment on lines +334 to +336, suggested change:

-            const std::size_t __num_items = __num_groups * __work_group_size;
-            __cgh.parallel_for<_Name...>(
-                sycl::nd_range(sycl::range<1>(__num_items), sycl::range<1>(__work_group_size)),
+            __cgh.parallel_for<_Name...>(
+                sycl::nd_range(sycl::range<1>(__num_groups * __work_group_size), sycl::range<1>(__work_group_size)),

It's more readable from my point of view.
Author reply: Agreed, it is in the new PR.

[=](sycl::nd_item</*dim=*/1> __item) {
// TODO: Investigate adding a vectorized path similar to reduce.
// Initial investigation showed benefits for in-place for-based algorithms (e.g. std::for_each) but
// performance regressions for out-of-place (e.g. std::copy) where the compiler was unable to
// vectorize our code. Vectorization may also improve performance of for-algorithms over small data
// types.
auto [__idx, __stride, __is_full] =
__stride_recommender(__item, __count, __iters_per_work_item, __work_group_size);
if (__is_full)
{
_ONEDPL_PRAGMA_UNROLL
for (std::uint8_t __i = 0; __i < __iters_per_work_item; ++__i)
{
__brick(__idx, __rngs...);
__idx += __stride;
}
}
// If we are not full, then take this branch only if there is work to process.
else if (__idx < __count)
{
const std::uint8_t __adjusted_iters_per_work_item =
oneapi::dpl::__internal::__dpl_ceiling_div(__count - __idx, __stride);
for (std::uint8_t __i = 0; __i < __adjusted_iters_per_work_item; ++__i)
Review comment: Should we use _ONEDPL_PRAGMA_UNROLL for this for-loop too?
Author reply: Not in this case (there is a similar case in the strided loop utility in the new PR). Because the loop end variable is computed at run time, the loop cannot be unrolled. This path is called for the last sub-group / work-group only, so the performance impact is negligible.

{
__brick(__idx, __rngs...);
__idx += __stride;
}
}
});
});
return __future(__event);
}
};
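The indexing above can be sanity-checked off-device. Below is a self-contained host-side model (an editorial illustration, not part of the patch) of the non-SPIR-V branch of __stride_recommender, i.e. a work-group-sized stride, combined with the kernel loop; it verifies that every element of a non-multiple-sized input is processed exactly once, including the partial last work group.

// Host-side model (illustration only) of the work-group-stride indexing.
#include <cassert>
#include <cstddef>
#include <vector>

int main()
{
    const std::size_t count = 1000; // deliberately not a multiple of the per-group chunk
    const std::size_t work_group_size = 8;
    const std::size_t iters_per_work_item = 4;
    const std::size_t chunk = work_group_size * iters_per_work_item;
    const std::size_t num_groups = (count + chunk - 1) / chunk; // same formula as __num_groups

    std::vector<int> hits(count, 0);
    for (std::size_t group = 0; group < num_groups; ++group)
    {
        const std::size_t group_start = group * chunk;
        const bool is_full = group_start + chunk <= count;
        for (std::size_t local = 0; local < work_group_size; ++local)
        {
            std::size_t idx = group_start + local; // corresponds to __work_item_idx
            if (is_full)
            {
                for (std::size_t i = 0; i < iters_per_work_item; ++i, idx += work_group_size)
                    ++hits[idx];
            }
            else if (idx < count)
            {
                // remaining elements divided by the stride, rounded up
                const std::size_t adjusted = (count - idx + work_group_size - 1) / work_group_size;
                for (std::size_t i = 0; i < adjusted; ++i, idx += work_group_size)
                    ++hits[idx];
            }
        }
    }
    for (int h : hits)
        assert(h == 1); // each element visited exactly once
    return 0;
}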

//General version of parallel_for, one additional parameter - __count of iterations of loop __cgh.parallel_for,
//for some algorithms happens that size of processing range is n, but amount of iterations is n/2.
template <typename _ExecutionPolicy, typename _Fp, typename _Index, typename... _Ranges>
@@ -254,17 +376,33 @@ __parallel_for(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&&
_Ranges&&... __rngs)
{
using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>;
using _ForKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<_CustomName>;

return __parallel_for_submitter<_ForKernel>()(::std::forward<_ExecutionPolicy>(__exec), __brick, __count,
::std::forward<_Ranges>(__rngs)...);
using _ForKernelSmall =
oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<__parallel_for_small_kernel<_CustomName>>;
using _ForKernelLarge =
oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<__parallel_for_large_kernel<_CustomName>>;

using __small_submitter = __parallel_for_small_submitter<_ForKernelSmall>;
using __large_submitter = __parallel_for_large_submitter<_ForKernelLarge, _Ranges...>;
// Compile two kernels: one for small-to-medium inputs and a second for large. This avoids runtime checks within a
// single kernel that worsen performance for small cases. If the number of iterations of the large submitter is 1,
// then only compile the basic kernel as the two versions are effectively the same.
if constexpr (__large_submitter::__iters_per_work_item > 1)
Review comment: Can we combine this if constexpr with the next if? Is there a real benefit to keeping the two conditions separate?
Author reply: Combining the two would make it a runtime conditional. Even if __iters_per_work_item is known at compile time and the compiler can optimize it out, there may still be a chance for the kernel to be compiled unnecessarily. I think it is best to keep the if constexpr, so we can be sure to avoid compiling the large submitter whenever possible.

{
if (__count >= __large_submitter::__estimate_best_start_size(__exec))
{
return __large_submitter{}(std::forward<_ExecutionPolicy>(__exec), __brick, __count,
std::forward<_Ranges>(__rngs)...);
}
}
return __small_submitter{}(std::forward<_ExecutionPolicy>(__exec), __brick, __count,
std::forward<_Ranges>(__rngs)...);
}
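A standalone sketch of that dispatch shape, with illustrative names only (not oneDPL code): an outer if constexpr keeps the large kernel out of the binary when the per-item iteration count is 1, and an inner runtime check picks the submitter by input size.

#include <cstddef>
#include <cstdio>

template <std::size_t _ItersPerWorkItem>
void
dispatch(std::size_t count, std::size_t threshold)
{
    if constexpr (_ItersPerWorkItem > 1)
    {
        if (count >= threshold)
        {
            std::puts("large-input kernel");
            return;
        }
    }
    std::puts("small-input kernel");
}

int main()
{
    dispatch<8>(1 << 20, 131072); // prints "large-input kernel"
    dispatch<8>(1024, 131072);    // prints "small-input kernel"
    dispatch<1>(1 << 20, 131072); // prints "small-input kernel"; the large path is never instantiated
    return 0;
}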

//------------------------------------------------------------------------
// parallel_transform_scan - async pattern
//------------------------------------------------------------------------

// Please see the comment for __parallel_for_submitter for optional kernel name explanation
// Please see the comment above __parallel_for_small_submitter for optional kernel name explanation
template <typename _CustomName, typename _PropagateScanName>
struct __parallel_scan_submitter;

@@ -1972,7 +2110,7 @@ struct __partial_merge_kernel
}
};

// Please see the comment for __parallel_for_submitter for optional kernel name explanation
// Please see the comment above __parallel_for_small_submitter for optional kernel name explanation
template <typename _GlobalSortName, typename _CopyBackName>
struct __parallel_partial_sort_submitter;

@@ -48,7 +48,7 @@ namespace __par_backend_hetero
//General version of parallel_for, one additional parameter - __count of iterations of loop __cgh.parallel_for,
//for some algorithms happens that size of processing range is n, but amount of iterations is n/2.

// Please see the comment for __parallel_for_submitter for optional kernel name explanation
// Please see the comment above __parallel_for_small_submitter for optional kernel name explanation
template <typename _Name>
struct __parallel_for_fpga_submitter;

@@ -129,7 +129,7 @@ __serial_merge(const _Rng1& __rng1, const _Rng2& __rng2, _Rng3& __rng3, _Index _
}
}

// Please see the comment for __parallel_for_submitter for optional kernel name explanation
// Please see the comment above __parallel_for_small_submitter for optional kernel name explanation
template <typename _IdType, typename _Name>
struct __parallel_merge_submitter;

@@ -111,7 +111,7 @@ __device_reduce_kernel(const _NDItemId __item_id, const _Size __n, const _Size _

//------------------------------------------------------------------------
// parallel_transform_reduce - async patterns
// Please see the comment for __parallel_for_submitter for optional kernel name explanation
// Please see the comment above __parallel_for_small_submitter for optional kernel name explanation
//------------------------------------------------------------------------

// Parallel_transform_reduce for a small arrays using a single work group.
16 changes: 16 additions & 0 deletions include/oneapi/dpl/pstl/utils.h
@@ -25,6 +25,7 @@
#include <iterator>
#include <functional>
#include <type_traits>
#include <algorithm>

#if _ONEDPL_BACKEND_SYCL
# include "hetero/dpcpp/sycl_defs.h"
@@ -784,6 +785,21 @@ union __lazy_ctor_storage
}
};

// Returns the smallest type within a set of potentially nested template types.
// E.g. If we consider the type: T = tuple<float, tuple<short, long>, int, double>,
// then __min_nested_type_size<T>::value returns sizeof(short).
template <typename _T>
struct __min_nested_type_size
{
constexpr static std::size_t value = sizeof(_T);
};

template <template <typename...> typename _WrapperType, typename... _Ts>
struct __min_nested_type_size<_WrapperType<_Ts...>>
{
constexpr static std::size_t value = std::min({__min_nested_type_size<_Ts>::value...});
};
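A couple of static_asserts illustrate the intended behavior (an editorial sketch, not part of the patch; it assumes <tuple> is available in the translation unit):

// Illustration only: the utility recurses into nested template types and
// yields the sizeof of the smallest leaf type.
static_assert(__min_nested_type_size<std::tuple<float, std::tuple<short, long>, int, double>>::value ==
                  sizeof(short),
              "smallest nested type is short");
static_assert(__min_nested_type_size<double>::value == sizeof(double),
              "non-template types report their own size");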

} // namespace __internal
} // namespace dpl
} // namespace oneapi