From 170b6b2d8052b9957c5e2924ade35fc0a37a7b42 Mon Sep 17 00:00:00 2001
From: Julian Miller
Date: Wed, 10 Jul 2024 16:44:10 +0200
Subject: [PATCH] Add comment about future GPU re-evaluation

---
 include/oneapi/dpl/internal/reduce_by_segment_impl.h         | 1 +
 include/oneapi/dpl/internal/scan_by_segment_impl.h           | 1 +
 include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 2 ++
 .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h | 2 ++
 4 files changed, 6 insertions(+)

diff --git a/include/oneapi/dpl/internal/reduce_by_segment_impl.h b/include/oneapi/dpl/internal/reduce_by_segment_impl.h
index 1de86633ebe..759b61e2cdd 100644
--- a/include/oneapi/dpl/internal/reduce_by_segment_impl.h
+++ b/include/oneapi/dpl/internal/reduce_by_segment_impl.h
@@ -237,6 +237,7 @@ __sycl_reduce_by_segment(__internal::__hetero_tag<_BackendTag>, _ExecutionPolicy
         16; // Each work item serially processes 16 items. Best observed performance on gpu
 
     // Limit the work-group size to prevent large sizes on CPUs. Empirically found value.
+    // This value exceeds the current practical limit for GPUs, but may need to be re-evaluated in the future.
     std::size_t __wgroup_size = oneapi::dpl::__internal::__max_work_group_size(__exec, (std::size_t)2048);
 
     // adjust __wgroup_size according to local memory limit. Double the requirement on __val_type due to sycl group algorithm's use
diff --git a/include/oneapi/dpl/internal/scan_by_segment_impl.h b/include/oneapi/dpl/internal/scan_by_segment_impl.h
index 43097a30633..b895561baeb 100644
--- a/include/oneapi/dpl/internal/scan_by_segment_impl.h
+++ b/include/oneapi/dpl/internal/scan_by_segment_impl.h
@@ -128,6 +128,7 @@ struct __sycl_scan_by_segment_impl
             4; // Assigning 4 elements per work item resulted in best performance on gpu.
 
         // Limit the work-group size to prevent large sizes on CPUs. Empirically found value.
+        // This value exceeds the current practical limit for GPUs, but may need to be re-evaluated in the future.
         std::size_t __wgroup_size = oneapi::dpl::__internal::__max_work_group_size(__exec, (std::size_t)2048);
 
         // We require 2 * sizeof(__val_type) * __wgroup_size of SLM for the work group segmented scan. We add
diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
index 9543b0a5958..da5c0e72d72 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
@@ -318,6 +318,7 @@ struct __parallel_scan_submitter<_CustomName, __internal::__optional_kernel_name
         // TODO: find a way to generalize getting of reliable work-group sizes
         ::std::size_t __wgroup_size = oneapi::dpl::__internal::__slm_adjusted_work_group_size(__exec, sizeof(_Type));
         // Limit the work-group size to prevent large sizes on CPUs. Empirically found value.
+        // This value matches the current practical limit for GPUs, but may need to be re-evaluated in the future.
         __wgroup_size = std::min(__wgroup_size, (std::size_t)1024);
 
 #if _ONEDPL_COMPILE_KERNEL
@@ -1171,6 +1172,7 @@ __parallel_find_or(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPoli
     // TODO: find a way to generalize getting of reliable work-group size
     // Limit the work-group size to prevent large sizes on CPUs. Empirically found value.
+    // This value exceeds the current practical limit for GPUs, but may need to be re-evaluated in the future.
     std::size_t __wgroup_size = oneapi::dpl::__internal::__max_work_group_size(__exec, (std::size_t)4096);
 
 #if _ONEDPL_COMPILE_KERNEL
     auto __kernel = __internal::__kernel_compiler<_FindOrKernel>::__compile(__exec);
diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h
index 35c16467fa6..c8e0f6bf23d 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h
@@ -663,6 +663,7 @@ struct __parallel_radix_sort_iteration
         ::std::size_t __max_sg_size = oneapi::dpl::__internal::__max_sub_group_size(__exec);
         ::std::size_t __reorder_sg_size = __max_sg_size;
         // Limit the work-group size to prevent large sizes on CPUs. Empirically found value.
+        // This value exceeds the current practical limit for GPUs, but may need to be re-evaluated in the future.
         std::size_t __scan_wg_size = oneapi::dpl::__internal::__max_work_group_size(__exec, (std::size_t)4096);
 #if _ONEDPL_RADIX_WORKLOAD_TUNING
         ::std::size_t __count_wg_size = (__in_rng.size() > (1 << 21) /*2M*/ ? 128 : __max_sg_size);
@@ -777,6 +778,7 @@ __parallel_radix_sort(oneapi::dpl::__internal::__device_backend_tag, _ExecutionP
     sycl::event __event{};
 
     // Limit the work-group size to prevent large sizes on CPUs. Empirically found value.
+    // This value exceeds the current practical limit for GPUs, but may need to be re-evaluated in the future.
     const std::size_t __max_wg_size = oneapi::dpl::__internal::__max_work_group_size(__exec, (std::size_t)4096);
 
     //TODO: 1.to reduce number of the kernels; 2.to define work group size in runtime, depending on number of elements
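
Every hunk above annotates the same pattern: query the device's maximum work-group size and clamp it to an empirically chosen cap, so that CPU devices (which can report very large maxima) do not get oversized work-groups, while typical GPU maxima pass through unchanged. A minimal standalone SYCL sketch of that pattern follows; pick_wg_size is a hypothetical stand-in for oneDPL's internal __max_work_group_size helper (whose full definition the patch does not show), and the queue setup is illustrative only.

    // Minimal sketch (not oneDPL code): clamp the device's maximum
    // work-group size to an empirically chosen cap, as the hunks above do.
    #include <sycl/sycl.hpp>
    #include <algorithm>
    #include <cstddef>
    #include <iostream>

    // Hypothetical stand-in for oneapi::dpl::__internal::__max_work_group_size;
    // for simplicity it takes a queue rather than an execution policy.
    std::size_t pick_wg_size(const sycl::queue& q, std::size_t empirical_cap)
    {
        // SYCL 2020 device query for the largest supported work-group size.
        const std::size_t dev_max =
            q.get_device().get_info<sycl::info::device::max_work_group_size>();
        // CPU devices may report maxima well above what runs efficiently
        // (the patch caps them at 1024-4096, depending on the algorithm);
        // common GPU maxima are at or below the cap and pass through.
        return std::min(dev_max, empirical_cap);
    }

    int main()
    {
        sycl::queue q; // default-selected device
        std::cout << "chosen work-group size: " << pick_wg_size(q, 2048) << "\n";
    }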