From 9c70d8877eff1c8cfc6f7d0e6873d6e23564f5a8 Mon Sep 17 00:00:00 2001 From: Dmitriy Sobolev Date: Mon, 6 Jan 2025 22:00:55 +0000 Subject: [PATCH] Make kernel compilation logic more portable --- .../dpl/internal/scan_by_segment_impl.h | 4 ++-- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 4 ++-- .../dpcpp/parallel_backend_sycl_radix_sort.h | 6 +++--- .../dpcpp/parallel_backend_sycl_reduce.h | 2 +- .../parallel_backend_sycl_reduce_by_segment.h | 8 +++---- .../dpcpp/parallel_backend_sycl_utils.h | 4 +++- .../oneapi/dpl/pstl/hetero/dpcpp/sycl_defs.h | 21 ++++++++++++++++++- include/oneapi/dpl/pstl/onedpl_config.h | 7 ------- 8 files changed, 35 insertions(+), 21 deletions(-) diff --git a/include/oneapi/dpl/internal/scan_by_segment_impl.h b/include/oneapi/dpl/internal/scan_by_segment_impl.h index a8199e227dd..379bfc9a56c 100644 --- a/include/oneapi/dpl/internal/scan_by_segment_impl.h +++ b/include/oneapi/dpl/internal/scan_by_segment_impl.h @@ -168,7 +168,7 @@ struct __sycl_scan_by_segment_impl __cgh.use_kernel_bundle(__seg_scan_wg_kernel.get_kernel_bundle()); #endif __cgh.parallel_for<_SegScanWgKernel>( -#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT +#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT && _ONEDPL_LIBSYCL_PROGRAM_PRESENT __seg_scan_wg_kernel, #endif sycl::nd_range<1>{__n_groups * __wgroup_size, __wgroup_size}, [=](sycl::nd_item<1> __item) { @@ -272,7 +272,7 @@ struct __sycl_scan_by_segment_impl __cgh.use_kernel_bundle(__seg_scan_prefix_kernel.get_kernel_bundle()); #endif __cgh.parallel_for<_SegScanPrefixKernel>( -#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT +#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT && _ONEDPL_LIBSYCL_PROGRAM_PRESENT __seg_scan_prefix_kernel, #endif sycl::nd_range<1>{__n_groups * __wgroup_size, __wgroup_size}, [=](sycl::nd_item<1> __item) { diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 3c7eb216074..33ffe938026 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -330,7 +330,7 @@ struct __parallel_scan_submitter<_CustomName, __internal::__optional_kernel_name __cgh.use_kernel_bundle(__kernel_1.get_kernel_bundle()); #endif __cgh.parallel_for<_LocalScanKernel>( -#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT +#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT && _ONEDPL_LIBSYCL_PROGRAM_PRESENT __kernel_1, #endif sycl::nd_range<1>(__n_groups * __wgroup_size, __wgroup_size), [=](sycl::nd_item<1> __item) { @@ -351,7 +351,7 @@ struct __parallel_scan_submitter<_CustomName, __internal::__optional_kernel_name __cgh.use_kernel_bundle(__kernel_2.get_kernel_bundle()); #endif __cgh.parallel_for<_GroupScanKernel>( -#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT +#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT && _ONEDPL_LIBSYCL_PROGRAM_PRESENT __kernel_2, #endif // TODO: try to balance work between several workgroups instead of one diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h index b00023ab02d..47a29c0f967 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h @@ -202,7 +202,7 @@ __radix_sort_count_submit(_ExecutionPolicy&& __exec, ::std::size_t __segments, : __hdl.use_kernel_bundle(__kernel.get_kernel_bundle()); #endif __hdl.parallel_for<_KernelName>( -#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT +#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT && _ONEDPL_LIBSYCL_PROGRAM_PRESENT __kernel, #endif sycl::nd_range<1>(__segments * __wg_size, __wg_size), [=](sycl::nd_item<1> __self_item) { @@ -303,7 +303,7 @@ __radix_sort_scan_submit(_ExecutionPolicy&& __exec, ::std::size_t __scan_wg_size __hdl.use_kernel_bundle(__kernel.get_kernel_bundle()); #endif __hdl.parallel_for<_KernelName>( -#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT +#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT && _ONEDPL_LIBSYCL_PROGRAM_PRESENT __kernel, #endif sycl::nd_range<1>(__radix_states * __scan_wg_size, __scan_wg_size), [=](sycl::nd_item<1> __self_item) { @@ -548,7 +548,7 @@ __radix_sort_reorder_submit(_ExecutionPolicy&& __exec, ::std::size_t __segments, __hdl.use_kernel_bundle(__kernel.get_kernel_bundle()); #endif __hdl.parallel_for<_KernelName>( -#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT +#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT && _ONEDPL_LIBSYCL_PROGRAM_PRESENT __kernel, #endif //Each SYCL work group processes one data segment. diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h index a29060a9cca..a6581b36d11 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h @@ -374,7 +374,7 @@ struct __parallel_transform_reduce_impl __cgh.use_kernel_bundle(__kernel.get_kernel_bundle()); #endif __cgh.parallel_for<_ReduceKernel>( -#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT +#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT && _ONEDPL_LIBSYCL_PROGRAM_PRESENT __kernel, #endif sycl::nd_range<1>(sycl::range<1>(__n_groups * __work_group_size), diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_by_segment.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_by_segment.h index 13d36c20419..331ee2481da 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_by_segment.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_by_segment.h @@ -169,7 +169,7 @@ __parallel_reduce_by_segment_fallback(oneapi::dpl::__internal::__device_backend_ #endif __cgh.parallel_for<_SegReduceCountKernel>( sycl::nd_range<1>{__n_groups * __wgroup_size, __wgroup_size}, [=]( -#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT +#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT && _ONEDPL_LIBSYCL_PROGRAM_PRESENT __seg_reduce_count_kernel, #endif sycl::nd_item<1> __item) { @@ -206,7 +206,7 @@ __parallel_reduce_by_segment_fallback(oneapi::dpl::__internal::__device_backend_ __cgh.use_kernel_bundle(__seg_reduce_offset_kernel.get_kernel_bundle()); #endif __cgh.parallel_for<_SegReduceOffsetKernel>( -#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT +#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT && _ONEDPL_LIBSYCL_PROGRAM_PRESENT __seg_reduce_offset_kernel, #endif sycl::nd_range<1>{__wgroup_size, __wgroup_size}, [=](sycl::nd_item<1> __item) { @@ -229,7 +229,7 @@ __parallel_reduce_by_segment_fallback(oneapi::dpl::__internal::__device_backend_ __cgh.use_kernel_bundle(__seg_reduce_wg_kernel.get_kernel_bundle()); #endif __cgh.parallel_for<_SegReduceWgKernel>( -#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT +#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT && _ONEDPL_LIBSYCL_PROGRAM_PRESENT __seg_reduce_wg_kernel, #endif sycl::nd_range<1>{__n_groups * __wgroup_size, __wgroup_size}, [=](sycl::nd_item<1> __item) { @@ -352,7 +352,7 @@ __parallel_reduce_by_segment_fallback(oneapi::dpl::__internal::__device_backend_ __cgh.use_kernel_bundle(__seg_reduce_prefix_kernel.get_kernel_bundle()); #endif __cgh.parallel_for<_SegReducePrefixKernel>( -#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT +#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT && _ONEDPL_LIBSYCL_PROGRAM_PRESENT __seg_reduce_prefix_kernel, #endif sycl::nd_range<1>{__n_groups * __wgroup_size, __wgroup_size}, [=](sycl::nd_item<1> __item) { diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h index 3d9d923d80d..348484f1477 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h @@ -255,6 +255,7 @@ using __kernel_name_generator = _BaseName<_CustomName>; #endif +#if _ONEDPL_COMPILE_KERNEL template class __kernel_compiler { @@ -287,7 +288,7 @@ class __kernel_compiler { return __kernel_array_type{__kernel_bundle.get_kernel(__kernel_ids[_Ip])...}; } -#else +#elif _ONEDPL_LIBSYCL_PROGRAM_PRESENT template static auto __compile(_Exec&& __exec) @@ -300,6 +301,7 @@ class __kernel_compiler } #endif }; +#endif // _ONEDPL_COMPILE_KERNEL #if _ONEDPL_DEBUG_SYCL template diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_defs.h b/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_defs.h index 72540c492b3..abce0902be1 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_defs.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_defs.h @@ -42,6 +42,10 @@ #endif #define _ONEDPL_LIBSYCL_VERSION_LESS_THAN(_VERSION) (_ONEDPL_LIBSYCL_VERSION && _ONEDPL_LIBSYCL_VERSION < _VERSION) +#if defined(ACPP_VERSION_MAJOR) && defined(ACPP_VERSION_MINOR) && defined(ACPP_VERSION_PATCH) +# define _ONEDPL_ACPP_VERSION (ACPP_VERSION_MAJOR * 10000 + ACPP_VERSION_MINOR * 100 + ACPP_VERSION_PATCH) +#endif + #if _ONEDPL_FPGA_DEVICE # if _ONEDPL_LIBSYCL_VERSION >= 50400 # include @@ -59,7 +63,9 @@ #define _ONEDPL_SYCL2020_COLLECTIVES_PRESENT (!_ONEDPL_LIBSYCL_VERSION_LESS_THAN(50300)) #define _ONEDPL_SYCL2020_BUFFER_SIZE_PRESENT (!_ONEDPL_LIBSYCL_VERSION_LESS_THAN(50300)) #define _ONEDPL_SYCL2020_ACCESSOR_SIZE_PRESENT (!_ONEDPL_LIBSYCL_VERSION_LESS_THAN(50300)) -#define _ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT (!_ONEDPL_LIBSYCL_VERSION_LESS_THAN(50300)) +// Kernel bundle support is not expected in ACPP, see https://github.com/AdaptiveCpp/AdaptiveCpp/issues/1296. +#define _ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT \ + (!_ONEDPL_LIBSYCL_VERSION_LESS_THAN(50300) && !_ONEDPL_ACPP_VERSION) #define _ONEDPL_SYCL2020_KNOWN_IDENTITY_PRESENT (!_ONEDPL_LIBSYCL_VERSION_LESS_THAN(50300)) #define _ONEDPL_SYCL2020_FUNCTIONAL_OBJECTS_PRESENT (!_ONEDPL_LIBSYCL_VERSION_LESS_THAN(50300)) #define _ONEDPL_SYCL2020_REQD_SUB_GROUP_SIZE_PRESENT (!_ONEDPL_LIBSYCL_VERSION_LESS_THAN(50300)) @@ -83,10 +89,23 @@ // Feature macros for DPC++ SYCL runtime library alternatives to non-supported SYCL 2020 features #define _ONEDPL_LIBSYCL_COLLECTIVES_PRESENT (_ONEDPL_LIBSYCL_VERSION_LESS_THAN(50300)) +#define _ONEDPL_LIBSYCL_PROGRAM_PRESENT (_ONEDPL_LIBSYCL_VERSION_LESS_THAN(50300)) #define _ONEDPL_LIBSYCL_KNOWN_IDENTITY_PRESENT (_ONEDPL_LIBSYCL_VERSION == 50200) #define _ONEDPL_LIBSYCL_SUB_GROUP_MASK_PRESENT \ (SYCL_EXT_ONEAPI_SUB_GROUP_MASK >= 1 && _ONEDPL_LIBSYCL_VERSION >= 50700) +// Compilation of a kernel is requiried to obtain valid work_group_size +// when target devices are CPU or FPGA emulator. Since CPU and GPU devices +// cannot be distinguished during compilation, the macro is enabled by default. +#define _ONEDPL_CAN_COMPILE_KERNEL (_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT || _ONEDPL_LIBSYCL_PROGRAM_PRESENT) +#if !defined(_ONEDPL_COMPILE_KERNEL) +# define _ONEDPL_COMPILE_KERNEL _ONEDPL_CAN_COMPILE_KERNEL +#else +# if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_CAN_COMPILE_KERNEL +# error "No SYCL kernel compilation method available (neither SYCL 2020 kernel bundle nor other alternatives)." +# endif +#endif + #define _ONEDPL_SYCL_DEVICE_COPYABLE_SPECIALIZATION_BROKEN (_ONEDPL_LIBSYCL_VERSION_LESS_THAN(70100)) // Macro to check if we are compiling for SPIR-V devices. This macro must only be used within diff --git a/include/oneapi/dpl/pstl/onedpl_config.h b/include/oneapi/dpl/pstl/onedpl_config.h index 05b91087078..71ee884f39e 100644 --- a/include/oneapi/dpl/pstl/onedpl_config.h +++ b/include/oneapi/dpl/pstl/onedpl_config.h @@ -334,13 +334,6 @@ # define _ONEDPL_USE_GROUP_ALGOS 1 # endif -// Compilation of a kernel is requiried to obtain valid work_group_size -// when target devices are CPU or FPGA emulator. Since CPU and GPU devices -// cannot be distinguished during compilation, the macro is enabled by default. -# if !defined(_ONEDPL_COMPILE_KERNEL) -# define _ONEDPL_COMPILE_KERNEL 1 -# endif - # define _ONEDPL_BUILT_IN_STABLE_NAME_PRESENT __has_builtin(__builtin_sycl_unique_stable_name) #endif // _ONEDPL_BACKEND_SYCL