From 170b6b2d8052b9957c5e2924ade35fc0a37a7b42 Mon Sep 17 00:00:00 2001
From: Julian Miller
Date: Wed, 10 Jul 2024 16:44:10 +0200
Subject: [PATCH] Add comment about future GPU re-evaluation

---
 include/oneapi/dpl/internal/reduce_by_segment_impl.h         | 1 +
 include/oneapi/dpl/internal/scan_by_segment_impl.h           | 1 +
 include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 2 ++
 .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h | 2 ++
 4 files changed, 6 insertions(+)

diff --git a/include/oneapi/dpl/internal/reduce_by_segment_impl.h b/include/oneapi/dpl/internal/reduce_by_segment_impl.h
index 1de86633ebe..759b61e2cdd 100644
--- a/include/oneapi/dpl/internal/reduce_by_segment_impl.h
+++ b/include/oneapi/dpl/internal/reduce_by_segment_impl.h
@@ -237,6 +237,7 @@ __sycl_reduce_by_segment(__internal::__hetero_tag<_BackendTag>, _ExecutionPolicy
         16; // Each work item serially processes 16 items. Best observed performance on gpu
 
     // Limit the work-group size to prevent large sizes on CPUs. Empirically found value.
+    // This value exceeds the current practical limit for GPUs, but may need to be re-evaluated in the future.
     std::size_t __wgroup_size = oneapi::dpl::__internal::__max_work_group_size(__exec, (std::size_t)2048);
 
     // adjust __wgroup_size according to local memory limit. Double the requirement on __val_type due to sycl group algorithm's use
diff --git a/include/oneapi/dpl/internal/scan_by_segment_impl.h b/include/oneapi/dpl/internal/scan_by_segment_impl.h
index 43097a30633..b895561baeb 100644
--- a/include/oneapi/dpl/internal/scan_by_segment_impl.h
+++ b/include/oneapi/dpl/internal/scan_by_segment_impl.h
@@ -128,6 +128,7 @@ struct __sycl_scan_by_segment_impl
             4; // Assigning 4 elements per work item resulted in best performance on gpu.
 
         // Limit the work-group size to prevent large sizes on CPUs. Empirically found value.
+        // This value exceeds the current practical limit for GPUs, but may need to be re-evaluated in the future.
         std::size_t __wgroup_size = oneapi::dpl::__internal::__max_work_group_size(__exec, (std::size_t)2048);
 
         // We require 2 * sizeof(__val_type) * __wgroup_size of SLM for the work group segmented scan. We add
diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
index 9543b0a5958..da5c0e72d72 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
@@ -318,6 +318,7 @@ struct __parallel_scan_submitter<_CustomName, __internal::__optional_kernel_name
         // TODO: find a way to generalize getting of reliable work-group sizes
         ::std::size_t __wgroup_size = oneapi::dpl::__internal::__slm_adjusted_work_group_size(__exec, sizeof(_Type));
         // Limit the work-group size to prevent large sizes on CPUs. Empirically found value.
+        // This value matches the current practical limit for GPUs, but may need to be re-evaluated in the future.
         __wgroup_size = std::min(__wgroup_size, (std::size_t)1024);
 
 #if _ONEDPL_COMPILE_KERNEL
@@ -1171,6 +1172,7 @@ __parallel_find_or(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPoli
     // TODO: find a way to generalize getting of reliable work-group size
     // Limit the work-group size to prevent large sizes on CPUs. Empirically found value.
+    // This value exceeds the current practical limit for GPUs, but may need to be re-evaluated in the future.
     std::size_t __wgroup_size = oneapi::dpl::__internal::__max_work_group_size(__exec, (std::size_t)4096);
 
 #if _ONEDPL_COMPILE_KERNEL
     auto __kernel = __internal::__kernel_compiler<_FindOrKernel>::__compile(__exec);
diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h
index 35c16467fa6..c8e0f6bf23d 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h
@@ -663,6 +663,7 @@ struct __parallel_radix_sort_iteration
         ::std::size_t __max_sg_size = oneapi::dpl::__internal::__max_sub_group_size(__exec);
         ::std::size_t __reorder_sg_size = __max_sg_size;
         // Limit the work-group size to prevent large sizes on CPUs. Empirically found value.
+        // This value exceeds the current practical limit for GPUs, but may need to be re-evaluated in the future.
         std::size_t __scan_wg_size = oneapi::dpl::__internal::__max_work_group_size(__exec, (std::size_t)4096);
 #if _ONEDPL_RADIX_WORKLOAD_TUNING
         ::std::size_t __count_wg_size = (__in_rng.size() > (1 << 21) /*2M*/ ? 128 : __max_sg_size);
@@ -777,6 +778,7 @@ __parallel_radix_sort(oneapi::dpl::__internal::__device_backend_tag, _ExecutionP
     sycl::event __event{};
 
     // Limit the work-group size to prevent large sizes on CPUs. Empirically found value.
+    // This value exceeds the current practical limit for GPUs, but may need to be re-evaluated in the future.
     const std::size_t __max_wg_size = oneapi::dpl::__internal::__max_work_group_size(__exec, (std::size_t)4096);
 
     //TODO: 1.to reduce number of the kernels; 2.to define work group size in runtime, depending on number of elements
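
Every hunk above annotates the same pattern: query the device's maximum work-group size and clamp it to an empirically chosen cap, so that CPU devices (which can report very large maxima) do not get oversized work-groups, while typical GPU maxima pass through unchanged. A minimal standalone SYCL sketch of that pattern follows; pick_wg_size is a hypothetical stand-in for oneDPL's internal __max_work_group_size helper (whose full definition the patch does not show), and the queue setup is illustrative only.

    // Minimal sketch (not oneDPL code): clamp the device's maximum
    // work-group size to an empirically chosen cap, as the hunks above do.
    #include <sycl/sycl.hpp>
    #include <algorithm>
    #include <cstddef>
    #include <iostream>

    // Hypothetical stand-in for oneapi::dpl::__internal::__max_work_group_size;
    // for simplicity it takes a queue rather than an execution policy.
    std::size_t pick_wg_size(const sycl::queue& q, std::size_t empirical_cap)
    {
        // SYCL 2020 device query for the largest supported work-group size.
        const std::size_t dev_max =
            q.get_device().get_info<sycl::info::device::max_work_group_size>();
        // CPU devices may report maxima well above what runs efficiently
        // (the patch caps them at 1024-4096, depending on the algorithm);
        // common GPU maxima are at or below the cap and pass through.
        return std::min(dev_max, empirical_cap);
    }

    int main()
    {
        sycl::queue q; // default-selected device
        std::cout << "chosen work-group size: " << pick_wg_size(q, 2048) << "\n";
    }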