diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index c7b7262531..127c232891 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -2212,6 +2212,9 @@ struct __parallel_partial_sort_submitter<__internal::__optional_kernel_name<_Glo } }; +template +class __sort_global_kernel; + template auto __parallel_partial_sort_impl(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _Range&& __rng, diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index 7029963222..aefdb0368e 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -178,7 +178,7 @@ struct __leaf_sorter // 3. Sort on work-group level bool __data_in_temp = - __group_sorter.sort(__item, __storage_acc, __comp, static_cast(0), __adjusted_process_size, + __group_sorter.sort(__item, __storage_acc, __comp, std::uint32_t{0}, __adjusted_process_size, /*sorted per sub-group*/ __data_per_workitem, __data_per_workitem, __workgroup_size); // barrier is not needed here because of the barrier inside the sort method @@ -215,7 +215,7 @@ struct __merge_sort_leaf_submitter<__internal::__optional_kernel_name<_LeafSortN sycl::event operator()(sycl::queue& __q, _Range& __rng, _Compare __comp, _LeafSorter& __leaf_sorter) const { - return __q.submit([&](sycl::handler& __cgh) { + return __q.submit([&__rng, __comp, &__leaf_sorter](sycl::handler& __cgh) { oneapi::dpl::__ranges::__require_access(__cgh, __rng); auto __storage_acc = __leaf_sorter.create_storage_accessor(__cgh); const std::uint32_t __wg_count = @@ -228,71 +228,423 @@ struct __merge_sort_leaf_submitter<__internal::__optional_kernel_name<_LeafSortN } }; -template +template struct __merge_sort_global_submitter; -template -struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name<_GlobalSortName...>> +template +struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name<_DiagonalsKernelName...>, + __internal::__optional_kernel_name<_GlobalSortName1...>, + __internal::__optional_kernel_name<_GlobalSortName2...>> { - template - std::pair - operator()(sycl::queue& __q, _Range& __rng, _Compare __comp, _LeafSizeT __leaf_size, _TempBuf& __temp_buf, + private: + using _merge_split_point_t = _split_point_t<_IndexT>; + + struct nd_range_params + { + std::size_t base_diag_count = 0; + std::size_t steps_between_two_base_diags = 0; + _IndexT chunk = 0; + _IndexT steps = 0; + }; + + struct WorkDataArea + { + // How WorkDataArea is implemented : + // + // i_elem_local + // | + // offset | i_elem + // | | | + // V V V + // +------+-------+------+-----+ + // | | | / | + // | | | / | + // | | | / | + // | | | / | + // | | | / | + // | | | / | + // offset -> +------+---n1--+ <----+---- whole data area : size == __n + // | | /| | + // | | <-/-+------------+---- working data area : sizeof(rng1) <= __n_sorted, sizeof(rng2) <= __n_sorted + // | | / | | + // | n2 / | | + // | | / | | + // | | / | | + // | |/ | | + // i_elem_local -> +------+-------+ | + // | / | + // | / | + // | / | + // | / | + // | / | + // i_elem -> +/ | + // | | + // | | + // | | + // | | + // | | + // +---------------------------+ + + _IndexT i_elem = 0; // Global diagonal index + _IndexT i_elem_local = 0; // Local diagonal index + // Offset to the first element in the subrange (i.e. the first element of the first subrange for merge) + _IndexT offset = 0; + _IndexT n1 = 0; // Size of the first subrange + _IndexT n2 = 0; // Size of the second subrange + + WorkDataArea(const std::size_t __n, const std::size_t __n_sorted, const std::size_t __linear_id, + const std::size_t __chunk) + { + // Calculate global diagonal index + i_elem = __linear_id * __chunk; + + // Calculate local diagonal index + i_elem_local = i_elem % (__n_sorted * 2); + + // Calculate offset to the first element in the subrange (i.e. the first element of the first subrange for merge) + offset = std::min<_IndexT>(i_elem - i_elem_local, __n); + + // Calculate size of the first and the second subranges + n1 = std::min<_IndexT>(offset + __n_sorted, __n) - offset; + n2 = std::min<_IndexT>(offset + __n_sorted + n1, __n) - (offset + n1); + } + + inline bool + is_i_elem_local_inside_merge_matrix() const + { + return i_elem_local < n1 + n2; + } + }; + + template + struct DropViews + { + using __drop_view_simple_t = oneapi::dpl::__ranges::drop_view_simple; + + __drop_view_simple_t rng1; + __drop_view_simple_t rng2; + + DropViews(Rng& __rng, const WorkDataArea& __data_area) + : rng1(__rng, __data_area.offset), rng2(__rng, __data_area.offset + __data_area.n1) + { + } + }; + + std::size_t + tune_amount_of_base_diagonals(std::size_t __n_sorted) const + { + // TODO required to evaluate this value based on available SLM size for each work-group. + const std::size_t __base_diag_count = 32 * 1'024; + + // Multiply work per item by a power of 2 to reach the desired number of iterations. + // __dpl_bit_ceil rounds the ratio up to the next power of 2. + const std::size_t __k = oneapi::dpl::__internal::__dpl_bit_ceil( + oneapi::dpl::__internal::__dpl_ceiling_div(256 * 1024 * 1024, __n_sorted)); + + return oneapi::dpl::__internal::__dpl_ceiling_div(__base_diag_count, __k); + } + + // Calculate nd-range params + template + nd_range_params + eval_nd_range_params(_ExecutionPolicy&& __exec, const std::size_t __rng_size, const _IndexT __portions) const + { + const bool __is_cpu = __exec.queue().get_device().is_cpu(); + const _IndexT __chunk = __is_cpu ? 32 : 4; + const _IndexT __steps = oneapi::dpl::__internal::__dpl_ceiling_div(__rng_size, __chunk); + + _IndexT __base_diag_count = tune_amount_of_base_diagonals(__rng_size); + _IndexT __steps_between_two_base_diags = oneapi::dpl::__internal::__dpl_ceiling_div(__steps, __base_diag_count); + + return {__base_diag_count * __portions, __steps_between_two_base_diags, __chunk, __steps * __portions}; + } + + template + std::size_t + get_max_base_diags_count(_ExecutionPolicy&& __exec, const std::int64_t __n_iter, const _IndexT __n, + _IndexT __n_sorted) const + { + std::size_t __max_base_diags_count = 0; + + if (__n_iter > 0) + { + __n_sorted = __n_sorted << (__n_iter - 1); + + const auto __portions = oneapi::dpl::__internal::__dpl_ceiling_div(__n, 2 * __n_sorted); + __max_base_diags_count = + eval_nd_range_params(__exec, std::size_t(2 * __n_sorted), __portions).base_diag_count; + } + + return __max_base_diags_count; + } + + template + inline static _merge_split_point_t + __find_start_point_w(const WorkDataArea& __data_area, const DropViews& __views, _Compare __comp) + { + return __find_start_point(__views.rng1, decltype(__data_area.n1){0}, __data_area.n1, __views.rng2, + decltype(__data_area.n2){0}, __data_area.n2, __data_area.i_elem_local, __comp); + } + + template + inline static void + __serial_merge_w(const nd_range_params& __nd_range_params, const WorkDataArea& __data_area, + const DropViews& __views, _Rng& __rng, const _merge_split_point_t& __sp, _Compare __comp) + { + __serial_merge(__views.rng1, __views.rng2, __rng /* rng3 */, __sp.first /* start1 */, __sp.second /* start2 */, + __data_area.i_elem /* start3 */, __nd_range_params.chunk, __data_area.n1, __data_area.n2, + __comp); + } + + // Calculation of split points on each base diagonal + template + sycl::event + eval_split_points_for_groups(const sycl::event& __event_chain, const _IndexT __n_sorted, const bool __data_in_temp, + _ExecutionPolicy&& __exec, _Range&& __rng, _TempBuf& __temp_buf, _Compare __comp, + const nd_range_params& __nd_range_params, + _Storage& __base_diagonals_sp_global_storage) const + { + const _IndexT __n = __rng.size(); + + return __exec.queue().submit([&__event_chain, __n_sorted, __data_in_temp, &__rng, &__temp_buf, __comp, + __nd_range_params, &__base_diagonals_sp_global_storage, + __n](sycl::handler& __cgh) { + __cgh.depends_on(__event_chain); + + oneapi::dpl::__ranges::__require_access(__cgh, __rng); + auto __base_diagonals_sp_global_acc = + __base_diagonals_sp_global_storage.template __get_scratch_acc( + __cgh, __dpl_sycl::__no_init{}); + + sycl::accessor __dst(__temp_buf, __cgh, sycl::read_write, sycl::no_init); + + __cgh.parallel_for<_DiagonalsKernelName...>( + // +1 doesn't required here, because we need to calculate split points for each base diagonal + // and for the right base diagonal in the last work-group but we can keep it one position to the left + // because we know that for 0-diagonal the split point is { 0, 0 }. + sycl::range(__nd_range_params.base_diag_count /*+ 1*/), + [=](sycl::item __item_id) { + const std::size_t __linear_id = __item_id.get_linear_id(); + + auto __base_diagonals_sp_global_ptr = + _Storage::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); + + // We should add `1` to __linear_id here to avoid calculation of split-point for 0-diagonal + // Please see additional explanations in the __lookup_sp function below. + const WorkDataArea __data_area(__n, __n_sorted, __linear_id + 1, + __nd_range_params.chunk * + __nd_range_params.steps_between_two_base_diags); + + __base_diagonals_sp_global_ptr[__linear_id] = + __data_area.is_i_elem_local_inside_merge_matrix() + ? (__data_in_temp + ? __find_start_point_w(__data_area, DropViews(__dst, __data_area), __comp) + : __find_start_point_w(__data_area, DropViews(__rng, __data_area), __comp)) + : _merge_split_point_t{__data_area.n1, __data_area.n2}; + }); + }); + } + + template + inline static _merge_split_point_t + __lookup_sp(const std::size_t __linear_id_in_steps_range, const nd_range_params& __nd_range_params, + const WorkDataArea& __data_area, const DropViews& __views, _Compare __comp, + _BaseDiagonalsSPStorage __base_diagonals_sp_global_ptr) + { + // | subrange 0 | subrange 1 | subrange 2 | subrange 3 | subrange 4 + // | contains (2 * __n_sorted values) | contains (2 * __n_sorted values) | contains (2 * __n_sorted values) | contains (2 * __n_sorted values) | contains the rest of data... < Data parts + // |----/----/----/----/----/----/----/----/----|----/----/----/----/----/----/----/----/----|----/----/----/----/----/----/----/----/----|----/----/----/----/----/----/----/----/----|----/--- < Steps + // ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ + // | | | | | | | | | | | | | | + // bd00 bd01 bd02 bd10 bd11 bd12 bd20 bd21 | bd22 bd30 bd31 bd32 bd40 < Base diagonals + // ^ ^ ^ ^ ^ ^ ^ | ^ ^ ^ ^ ^ + // --- 0 1 2 3 4 5 6 | 7 8 9 10 11 < Indexes in the base diagonal's SP storage + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 20 21 | 23 24 25 26 27 28 29 30 31 32 33 34 35 36 < Linear IDs: __linear_id_in_steps_range + // ^ | | | + // | __sp_left | __sp_right + // | | + // | __linear_id_in_steps_range + // We don't save the first diagonal into base diagonal's SP storage !!! + + std::size_t __diagonal_idx = __linear_id_in_steps_range / __nd_range_params.steps_between_two_base_diags; + + const _merge_split_point_t __sp_left = + __diagonal_idx > 0 ? __base_diagonals_sp_global_ptr[__diagonal_idx - 1] : _merge_split_point_t{0, 0}; + const _merge_split_point_t __sp_right = __base_diagonals_sp_global_ptr[__diagonal_idx]; + + const bool __is_base_diagonal = + __linear_id_in_steps_range % __nd_range_params.steps_between_two_base_diags == 0; + + if (__sp_right.first + __sp_right.second > 0) + { + if (!__is_base_diagonal) + return __find_start_point(__views.rng1, __sp_left.first, __sp_right.first, __views.rng2, + __sp_left.second, __sp_right.second, __data_area.i_elem_local, __comp); + return __sp_left; + } + + return __find_start_point_w(__data_area, __views, __comp); + } + + // Process parallel merge + template + sycl::event + run_parallel_merge(const sycl::event& __event_chain, const _IndexT __n_sorted, const bool __data_in_temp, + _ExecutionPolicy&& __exec, _Range&& __rng, _TempBuf& __temp_buf, _Compare __comp, + const nd_range_params& __nd_range_params) const + { + const _IndexT __n = __rng.size(); + + return __exec.queue().submit([&__event_chain, __n_sorted, __data_in_temp, &__rng, &__temp_buf, __comp, + __nd_range_params, __n](sycl::handler& __cgh) { + __cgh.depends_on(__event_chain); + + oneapi::dpl::__ranges::__require_access(__cgh, __rng); + sycl::accessor __dst(__temp_buf, __cgh, sycl::read_write, sycl::no_init); + + __cgh.parallel_for<_GlobalSortName1...>( + sycl::range(__nd_range_params.steps), [=](sycl::item __item_id) { + const std::size_t __linear_id = __item_id.get_linear_id(); + + const WorkDataArea __data_area(__n, __n_sorted, __linear_id, __nd_range_params.chunk); + + if (__data_area.is_i_elem_local_inside_merge_matrix()) + { + if (__data_in_temp) + { + __serial_merge_w(__nd_range_params, __data_area, DropViews(__dst, __data_area), __rng, + __find_start_point_w(__data_area, DropViews(__dst, __data_area), __comp), + __comp); + } + else + { + __serial_merge_w(__nd_range_params, __data_area, DropViews(__rng, __data_area), __dst, + __find_start_point_w(__data_area, DropViews(__rng, __data_area), __comp), + __comp); + } + } + }); + }); + } + + // Process parallel merge with usage of split-points on base diagonals + template + sycl::event + run_parallel_merge_from_diagonals(const sycl::event& __event_chain, const _IndexT __n_sorted, + const bool __data_in_temp, _ExecutionPolicy&& __exec, _Range&& __rng, + _TempBuf& __temp_buf, _Compare __comp, const nd_range_params& __nd_range_params, + _Storage& __base_diagonals_sp_global_storage) const + { + const _IndexT __n = __rng.size(); + + return __exec.queue().submit([&__event_chain, __n_sorted, __data_in_temp, &__rng, &__temp_buf, __comp, + __nd_range_params, &__base_diagonals_sp_global_storage, + __n](sycl::handler& __cgh) { + __cgh.depends_on(__event_chain); + + oneapi::dpl::__ranges::__require_access(__cgh, __rng); + sycl::accessor __dst(__temp_buf, __cgh, sycl::read_write, sycl::no_init); + + auto __base_diagonals_sp_global_acc = + __base_diagonals_sp_global_storage.template __get_scratch_acc(__cgh); + + __cgh.parallel_for<_GlobalSortName2...>( + sycl::range(__nd_range_params.steps), [=](sycl::item __item_id) { + const std::size_t __linear_id = __item_id.get_linear_id(); + + auto __base_diagonals_sp_global_ptr = + _Storage::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); + + const WorkDataArea __data_area(__n, __n_sorted, __linear_id, __nd_range_params.chunk); + + if (__data_area.is_i_elem_local_inside_merge_matrix()) + { + if (__data_in_temp) + { + __serial_merge_w(__nd_range_params, __data_area, DropViews(__dst, __data_area), __rng, + __lookup_sp(__linear_id, __nd_range_params, __data_area, + DropViews(__dst, __data_area), __comp, + __base_diagonals_sp_global_ptr), + __comp); + } + else + { + __serial_merge_w(__nd_range_params, __data_area, DropViews(__rng, __data_area), __dst, + __lookup_sp(__linear_id, __nd_range_params, __data_area, + DropViews(__rng, __data_area), __comp, + __base_diagonals_sp_global_ptr), + __comp); + } + } + }); + }); + } + + public: + template + std::tuple> + operator()(_ExecutionPolicy&& __exec, _Range& __rng, _Compare __comp, _LeafSizeT __leaf_size, _TempBuf& __temp_buf, sycl::event __event_chain) const { const _IndexT __n = __rng.size(); _IndexT __n_sorted = __leaf_size; - const bool __is_cpu = __q.get_device().is_cpu(); - const _IndexT __chunk = __is_cpu ? 32 : 4; - const std::size_t __steps = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __chunk); + bool __data_in_temp = false; + using __value_type = oneapi::dpl::__internal::__value_t<_Range>; + + // Calculate nd-range params + const nd_range_params __nd_range_params = eval_nd_range_params(__exec, __n, 1); + + using __base_diagonals_sp_storage_t = __result_and_scratch_storage<_ExecutionPolicy, _merge_split_point_t>; + const std::size_t __n_power2 = oneapi::dpl::__internal::__dpl_bit_ceil(__n); // ctz precisely calculates log2 of an integral value which is a power of 2, while // std::log2 may be prone to rounding errors on some architectures const std::int64_t __n_iter = sycl::ctz(__n_power2) - sycl::ctz(__leaf_size); - for (std::int64_t __i = 0; __i < __n_iter; ++__i) - { - __event_chain = __q.submit([&, __event_chain, __n_sorted, __data_in_temp](sycl::handler& __cgh) { - __cgh.depends_on(__event_chain); - oneapi::dpl::__ranges::__require_access(__cgh, __rng); - sycl::accessor __dst(__temp_buf, __cgh, sycl::read_write, sycl::no_init); + // Create storage to save split-points on each base diagonal + 1 (for the right base diagonal in the last work-group) + const std::size_t __max_base_diags_count = get_max_base_diags_count(__exec, __n_iter, __n, __n_sorted); + auto __p_base_diagonals_sp_global_storage = + new __base_diagonals_sp_storage_t(__exec, 0, __max_base_diags_count); - __cgh.parallel_for<_GlobalSortName...>( - sycl::range(__steps), [=](sycl::item __item_id) { - const _IndexT __i_elem = __item_id.get_linear_id() * __chunk; - const _IndexT __i_elem_local = __i_elem % (__n_sorted * 2); + // Save the raw pointer into a shared_ptr to return it in __future and extend the lifetime of the storage. + std::shared_ptr<__result_and_scratch_storage_base> __p_result_and_scratch_storage_base( + static_cast<__result_and_scratch_storage_base*>(__p_base_diagonals_sp_global_storage)); - const _IndexT __offset = std::min<_IndexT>(__i_elem - __i_elem_local, __n); - const _IndexT __n1 = std::min<_IndexT>(__offset + __n_sorted, __n) - __offset; - const _IndexT __n2 = std::min<_IndexT>(__offset + __n1 + __n_sorted, __n) - (__offset + __n1); + for (std::int64_t __i = 0; __i < __n_iter; ++__i) + { + if (2 * __n_sorted < __get_starting_size_limit_for_large_submitter<__value_type>()) + { + // Process parallel merge + __event_chain = run_parallel_merge(__event_chain, __n_sorted, __data_in_temp, __exec, __rng, __temp_buf, + __comp, __nd_range_params); + } + else + { + const auto __portions = oneapi::dpl::__internal::__dpl_ceiling_div(__n, 2 * __n_sorted); + const nd_range_params __nd_range_params_this = + eval_nd_range_params(__exec, std::size_t(2 * __n_sorted), __portions); - if (__data_in_temp) - { - const oneapi::dpl::__ranges::drop_view_simple __rng1(__dst, __offset); - const oneapi::dpl::__ranges::drop_view_simple __rng2(__dst, __offset + __n1); + assert(__nd_range_params_this.base_diag_count <= __max_base_diags_count); - const auto start = __find_start_point(__rng1, _IndexT{0}, __n1, __rng2, _IndexT{0}, __n2, - __i_elem_local, __comp); - __serial_merge(__rng1, __rng2, __rng /*__rng3*/, start.first, start.second, __i_elem, - __chunk, __n1, __n2, __comp); - } - else - { - const oneapi::dpl::__ranges::drop_view_simple __rng1(__rng, __offset); - const oneapi::dpl::__ranges::drop_view_simple __rng2(__rng, __offset + __n1); + // Calculation of split-points on each base diagonal + __event_chain = + eval_split_points_for_groups(__event_chain, __n_sorted, __data_in_temp, __exec, __rng, __temp_buf, + __comp, __nd_range_params_this, *__p_base_diagonals_sp_global_storage); + + // Process parallel merge with usage of split-points on base diagonals + __event_chain = run_parallel_merge_from_diagonals(__event_chain, __n_sorted, __data_in_temp, __exec, + __rng, __temp_buf, __comp, __nd_range_params_this, + *__p_base_diagonals_sp_global_storage); + } - const auto start = __find_start_point(__rng1, _IndexT{0}, __n1, __rng2, _IndexT{0}, __n2, - __i_elem_local, __comp); - __serial_merge(__rng1, __rng2, __dst /*__rng3*/, start.first, start.second, __i_elem, - __chunk, __n1, __n2, __comp); - } - }); - }); __n_sorted *= 2; __data_in_temp = !__data_in_temp; } - return {__event_chain, __data_in_temp}; + + return {std::move(__event_chain), __data_in_temp, std::move(__p_result_and_scratch_storage_base)}; } }; @@ -306,7 +658,7 @@ struct __merge_sort_copy_back_submitter<__internal::__optional_kernel_name<_Copy sycl::event operator()(sycl::queue& __q, _Range& __rng, _TempBuf& __temp_buf, sycl::event __event_chain) const { - __event_chain = __q.submit([&, __event_chain](sycl::handler& __cgh) { + return __q.submit([&__rng, &__temp_buf, &__event_chain](sycl::handler& __cgh) { __cgh.depends_on(__event_chain); oneapi::dpl::__ranges::__require_access(__cgh, __rng); auto __temp_acc = __temp_buf.template get_access(__cgh); @@ -317,7 +669,6 @@ struct __merge_sort_copy_back_submitter<__internal::__optional_kernel_name<_Copy __rng[__idx] = __temp_acc[__idx]; }); }); - return __event_chain; } }; @@ -325,7 +676,13 @@ template class __sort_leaf_kernel; template -class __sort_global_kernel; +class __diagonals_kernel_name_for_merge_sort; + +template +class __sort_global_kernel1; + +template +class __sort_global_kernel2; template class __sort_copy_back_kernel; @@ -339,8 +696,12 @@ __merge_sort(_ExecutionPolicy&& __exec, _Range&& __rng, _Compare __comp, _LeafSo using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; using _LeafSortKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<__sort_leaf_kernel<_CustomName>>; - using _GlobalSortKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __sort_global_kernel<_CustomName, _IndexT>>; + using _DiagonalsKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __diagonals_kernel_name_for_merge_sort<_CustomName, _IndexT>>; + using _GlobalSortKernel1 = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __sort_global_kernel1<_CustomName, _IndexT>>; + using _GlobalSortKernel2 = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __sort_global_kernel2<_CustomName, _IndexT>>; using _CopyBackKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<__sort_copy_back_kernel<_CustomName>>; @@ -356,15 +717,16 @@ __merge_sort(_ExecutionPolicy&& __exec, _Range&& __rng, _Compare __comp, _LeafSo // 2. Merge sorting oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, _Tp> __temp(__exec, __rng.size()); auto __temp_buf = __temp.get_buffer(); - auto [__event_sort, __data_in_temp] = __merge_sort_global_submitter<_IndexT, _GlobalSortKernel>()( - __q, __rng, __comp, __leaf_sorter.__process_size, __temp_buf, __event_leaf_sort); + auto [__event_sort, __data_in_temp, __temp_sp_storages] = + __merge_sort_global_submitter<_IndexT, _DiagonalsKernelName, _GlobalSortKernel1, _GlobalSortKernel2>()( + __exec, __rng, __comp, __leaf_sorter.__process_size, __temp_buf, __event_leaf_sort); // 3. If the data remained in the temporary buffer then copy it back if (__data_in_temp) { __event_sort = __merge_sort_copy_back_submitter<_CopyBackKernel>()(__q, __rng, __temp_buf, __event_sort); } - return __future(__event_sort); + return __future(__event_sort, std::move(__temp_sp_storages)); } template diff --git a/test/parallel_api/algorithm/alg.sorting/sort.pass.cpp b/test/parallel_api/algorithm/alg.sorting/sort.pass.cpp index e39dee0f8b..500e8f4303 100644 --- a/test/parallel_api/algorithm/alg.sorting/sort.pass.cpp +++ b/test/parallel_api/algorithm/alg.sorting/sort.pass.cpp @@ -360,11 +360,11 @@ test_default_name_gen(Convert convert, size_t n) #endif //TEST_DPCPP_BACKEND_PRESENT -template <::std::size_t CallNumber, typename T, typename Compare, typename Convert> +template <::std::size_t CallNumber, typename T, typename Compare, typename Convert, typename FStep> void -test_sort(Compare compare, Convert convert) +test_sort(Compare compare, Convert convert, size_t start_size, size_t max_size, FStep fstep) { - for (size_t n = 0; n < 100000; n = n <= 16 ? n + 1 : size_t(3.1415 * n)) + for (size_t n = start_size; n <= max_size; n = fstep(n)) { LastIndex = n + 2; // The rand()%(2*n+1) encourages generation of some duplicates. @@ -408,48 +408,42 @@ struct NonConstCmp } }; -int -main() +template +void +test_sort(size_t start_size, size_t max_size, FStep fstep) { - ::std::srand(42); - std::int32_t start = 0; - std::int32_t end = 2; -#ifndef _PSTL_TEST_SORT - start = 1; -#endif // #ifndef _PSTL_TEST_SORT -#ifndef _PSTL_TEST_STABLE_SORT - end = 1; -#endif // _PSTL_TEST_STABLE_SORT - for (std::int32_t kind = start; kind < end; ++kind) - { - Stable = kind != 0; - #if !TEST_DPCPP_BACKEND_PRESENT // ParanoidKey has atomic increment in ctors. It's not allowed in kernel test_sort<0, ParanoidKey>(KeyCompare(TestUtils::OddTag()), - [](size_t k, size_t val) { return ParanoidKey(k, val, TestUtils::OddTag()); }); + [](size_t k, size_t val) { return ParanoidKey(k, val, TestUtils::OddTag()); }, + start_size, max_size, fstep); #endif // !TEST_DPCPP_BACKEND_PRESENT #if !ONEDPL_FPGA_DEVICE - test_sort<10, TestUtils::float32_t>([](TestUtils::float32_t x, TestUtils::float32_t y) { return x < y; }, - [](size_t k, size_t val) - { return TestUtils::float32_t(val) * (k % 2 ? 1 : -1); }); + test_sort([](TestUtils::float32_t x, TestUtils::float32_t y) { return x < y; }, + [](size_t k, size_t val) + { return TestUtils::float32_t(val) * (k % 2 ? 1 : -1); }, + start_size, max_size, fstep); - test_sort<20, unsigned char>([](unsigned char x, unsigned char y) - { return x > y; }, // Reversed so accidental use of < will be detected. - [](size_t k, size_t val) { return (unsigned char)val; }); + test_sort([](unsigned char x, unsigned char y) + { return x > y; }, // Reversed so accidental use of < will be detected. + [](size_t k, size_t val) { return (unsigned char)val; }, + start_size, max_size, fstep); - test_sort<30, unsigned char>(NonConstCmp{}, [](size_t k, size_t val) { return (unsigned char)val; }); + test_sort(NonConstCmp{}, [](size_t k, size_t val) { return (unsigned char)val; }, + start_size, max_size, fstep); #endif // !ONEDPL_FPGA_DEVICE - test_sort<40, std::int32_t>([](std::int32_t x, std::int32_t y) - { return x > y; }, // Reversed so accidental use of < will be detected. - [](size_t k, size_t val) { return std::int32_t(val) * (k % 2 ? 1 : -1); }); + test_sort([](std::int32_t x, std::int32_t y) + { return x > y; }, // Reversed so accidental use of < will be detected. + [](size_t k, size_t val) { return std::int32_t(val) * (k % 2 ? 1 : -1); }, + start_size, max_size, fstep); - test_sort<50, std::int16_t>( + test_sort( std::greater(), [](size_t k, size_t val) { - return std::int16_t(val) * (k % 2 ? 1 : -1); }); + return std::int16_t(val) * (k % 2 ? 1 : -1); }, + start_size, max_size, fstep); #if TEST_DPCPP_BACKEND_PRESENT auto convert = [](size_t k, size_t val) { @@ -466,7 +460,44 @@ main() } return sycl::bit_cast(raw); }; - test_sort<60, sycl::half>(std::greater(), convert); + test_sort(std::greater(), convert, + start_size, max_size, fstep); +#endif +} + +int +main() +{ + ::std::srand(42); + std::int32_t start = 0; + std::int32_t end = 2; +#ifndef _PSTL_TEST_SORT + start = 1; +#endif // #ifndef _PSTL_TEST_SORT +#ifndef _PSTL_TEST_STABLE_SORT + end = 1; +#endif // _PSTL_TEST_STABLE_SORT + + const size_t start_size_small = 0; + const size_t max_size_small = 100'000; + auto fstep_small = [](std::size_t size){ return size <= 16 ? size + 1 : size_t(3.1415 * size);}; + + for (std::int32_t kind = start; kind < end; ++kind) + { + Stable = kind != 0; + + test_sort<100>(start_size_small, max_size_small, fstep_small); + + // Large data sizes (on GPU only) +#if TEST_DPCPP_BACKEND_PRESENT + if (!TestUtils::get_test_queue().get_device().is_cpu()) + { + const size_t start_size_large = 4'000'000; + const size_t max_size_large = 8'000'000; + auto fstep_large = [](std::size_t size){ return size + 2'000'000; }; + + test_sort<200>(start_size_large, max_size_large, fstep_large); + } #endif }