Add comment about future GPU re-evaluation
julianmi committed Jul 10, 2024
1 parent 2276dfb commit 170b6b2
Showing 4 changed files with 6 additions and 0 deletions.
1 change: 1 addition & 0 deletions include/oneapi/dpl/internal/reduce_by_segment_impl.h
@@ -237,6 +237,7 @@ __sycl_reduce_by_segment(__internal::__hetero_tag<_BackendTag>, _ExecutionPolicy
16; // Each work item serially processes 16 items. Best observed performance on gpu

// Limit the work-group size to prevent large sizes on CPUs. Empirically found value.
// This value exceeds the current practical limit for GPUs, but may need to be re-evaluated in the future.
std::size_t __wgroup_size = oneapi::dpl::__internal::__max_work_group_size(__exec, (std::size_t)2048);

// adjust __wgroup_size according to local memory limit. Double the requirement on __val_type due to sycl group algorithm's use
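
For context, the capping done by the helper used in this and several later hunks amounts to querying the device's work-group limit and clamping it to an empirically chosen constant. Below is a minimal plain-SYCL sketch of that idea; it is an assumption about what oneapi::dpl::__internal::__max_work_group_size roughly does (the sketch takes a sycl::queue rather than an execution policy, and the function name max_work_group_size_capped is hypothetical):

#include <sycl/sycl.hpp>
#include <algorithm>
#include <cstddef>

// Clamp the device's reported maximum work-group size to an empirically chosen cap
// (2048 or 4096 in the hunks in this commit); CPU devices can report very large limits.
std::size_t max_work_group_size_capped(const sycl::queue& q, std::size_t cap)
{
    const std::size_t device_limit =
        q.get_device().get_info<sycl::info::device::max_work_group_size>();
    return std::min(device_limit, cap);
}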
1 change: 1 addition & 0 deletions include/oneapi/dpl/internal/scan_by_segment_impl.h
@@ -128,6 +128,7 @@ struct __sycl_scan_by_segment_impl
4; // Assigning 4 elements per work item resulted in best performance on gpu.

// Limit the work-group size to prevent large sizes on CPUs. Empirically found value.
// This value exceeds the current practical limit for GPUs, but may need to be re-evaluated in the future.
std::size_t __wgroup_size = oneapi::dpl::__internal::__max_work_group_size(__exec, (std::size_t)2048);

// We require 2 * sizeof(__val_type) * __wgroup_size of SLM for the work group segmented scan. We add
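
As a concrete illustration of that requirement (example numbers, not taken from the commit): with an 8-byte __val_type and the 2048 work-group cap above, the segmented scan would need 2 * 8 * 2048 = 32768 bytes (32 KiB) of local memory, so devices with less SLM than that would have to run with a smaller work group.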
2 changes: 2 additions & 0 deletions include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
@@ -318,6 +318,7 @@ struct __parallel_scan_submitter<_CustomName, __internal::__optional_kernel_name
// TODO: find a way to generalize getting of reliable work-group sizes
::std::size_t __wgroup_size = oneapi::dpl::__internal::__slm_adjusted_work_group_size(__exec, sizeof(_Type));
// Limit the work-group size to prevent large sizes on CPUs. Empirically found value.
// This value matches the current practical limit for GPUs, but may need to be re-evaluated in the future.
__wgroup_size = std::min(__wgroup_size, (std::size_t)1024);

#if _ONEDPL_COMPILE_KERNEL
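
The __slm_adjusted_work_group_size call in the hunk above sizes the work group from local memory rather than from the raw device limit. A rough plain-SYCL sketch of that idea follows; the policy shown (divide available SLM by the per-item footprint and also respect the device's work-group limit) is an assumption for illustration, not the oneDPL implementation, and the function name is hypothetical:

#include <sycl/sycl.hpp>
#include <algorithm>
#include <cstddef>

// Bound the work-group size by available shared local memory (SLM),
// assuming each work item needs bytes_per_item bytes of SLM.
std::size_t slm_adjusted_work_group_size(const sycl::queue& q, std::size_t bytes_per_item)
{
    const sycl::device dev = q.get_device();
    const auto slm_bytes =
        static_cast<std::size_t>(dev.get_info<sycl::info::device::local_mem_size>());
    const std::size_t wg_limit =
        dev.get_info<sycl::info::device::max_work_group_size>();
    return std::min(wg_limit, slm_bytes / bytes_per_item);
}

In the hunk above, the std::min with 1024 then applies the empirically found cap, which, per the added comment, also matches the current practical limit for GPUs.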
@@ -1171,6 +1172,7 @@ __parallel_find_or(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPoli

// TODO: find a way to generalize getting of reliable work-group size
// Limit the work-group size to prevent large sizes on CPUs. Empirically found value.
// This value exceeds the current practical limit for GPUs, but may need to be re-evaluated in the future.
std::size_t __wgroup_size = oneapi::dpl::__internal::__max_work_group_size(__exec, (std::size_t)4096);
#if _ONEDPL_COMPILE_KERNEL
auto __kernel = __internal::__kernel_compiler<_FindOrKernel>::__compile(__exec);
@@ -663,6 +663,7 @@ struct __parallel_radix_sort_iteration
::std::size_t __max_sg_size = oneapi::dpl::__internal::__max_sub_group_size(__exec);
::std::size_t __reorder_sg_size = __max_sg_size;
// Limit the work-group size to prevent large sizes on CPUs. Empirically found value.
// This value exceeds the current practical limit for GPUs, but may need to be re-evaluated in the future.
std::size_t __scan_wg_size = oneapi::dpl::__internal::__max_work_group_size(__exec, (std::size_t)4096);
#if _ONEDPL_RADIX_WORKLOAD_TUNING
::std::size_t __count_wg_size = (__in_rng.size() > (1 << 21) /*2M*/ ? 128 : __max_sg_size);
@@ -777,6 +778,7 @@ __parallel_radix_sort(oneapi::dpl::__internal::__device_backend_tag, _ExecutionP
sycl::event __event{};

// Limit the work-group size to prevent large sizes on CPUs. Empirically found value.
// This value exceeds the current practical limit for GPUs, but may need to be re-evaluated in the future.
const std::size_t __max_wg_size = oneapi::dpl::__internal::__max_work_group_size(__exec, (std::size_t)4096);

//TODO: 1.to reduce number of the kernels; 2.to define work group size in runtime, depending on number of elements
