From 7bcd21e25023b813d7f92cba2847919c5852160c Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Fri, 8 Nov 2024 12:08:19 -0500 Subject: [PATCH 01/50] histogram CPU initial implementation Signed-off-by: Dan Hoeflinger --- include/oneapi/dpl/pstl/algorithm_impl.h | 50 ++++++++ include/oneapi/dpl/pstl/histogram_impl.h | 18 +-- .../oneapi/dpl/pstl/omp/parallel_histogram.h | 115 ++++++++++++++++++ .../oneapi/dpl/pstl/parallel_backend_omp.h | 5 + .../oneapi/dpl/pstl/parallel_backend_serial.h | 11 ++ .../oneapi/dpl/pstl/parallel_backend_tbb.h | 39 ++++++ .../numeric/numeric.ops/histogram.pass.cpp | 37 ++++-- 7 files changed, 246 insertions(+), 29 deletions(-) create mode 100644 include/oneapi/dpl/pstl/omp/parallel_histogram.h diff --git a/include/oneapi/dpl/pstl/algorithm_impl.h b/include/oneapi/dpl/pstl/algorithm_impl.h index ae9094f721a..e0dce1006f8 100644 --- a/include/oneapi/dpl/pstl/algorithm_impl.h +++ b/include/oneapi/dpl/pstl/algorithm_impl.h @@ -4289,6 +4289,56 @@ __pattern_shift_right(_Tag __tag, _ExecutionPolicy&& __exec, _BidirectionalItera return __res.base(); } +template +void +__brick_histogram(_ForwardIterator __first, _ForwardIterator __last, _IdxHashFunc __func, + _RandomAccessIterator __histogram_first, _IsVector) noexcept +{ + using _Size = typename ::std::iterator_traits<_ForwardIterator>::difference_type; + for (; __first != __last; ++__first) + { + _Size __bin = __func.get_bin(*__first); + if (__bin >= 0) + { + ++__histogram_first[__bin]; + } + } +} + +template +void +__pattern_histogram(_Tag, _ExecutionPolicy&& __exec, _ForwardIterator __first, _ForwardIterator __last, + _Size __num_bins, _IdxHashFunc __func, _RandomAccessIterator __histogram_first) +{ + using _HistogramValueT = typename std::iterator_traits<_RandomAccessIterator>::value_type; + static_assert(__is_serial_tag_v<_Tag> || __is_parallel_forward_tag_v<_Tag>); + __pattern_fill(_Tag{}, std::forward<_ExecutionPolicy>(__exec), __histogram_first, __histogram_first + __num_bins, _HistogramValueT{0}); + __brick_histogram(__first, __last, __func, __histogram_first, typename _Tag::__is_vector{}); +} + +template +void +__pattern_histogram(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _RandomAccessIterator1 __first, _RandomAccessIterator1 __last, + _Size __num_bins, _IdxHashFunc __func, _RandomAccessIterator2 __histogram_first) +{ + using __backend_tag = typename __parallel_tag<_IsVector>::__backend_tag; + using _HistogramValueT = typename std::iterator_traits<_RandomAccessIterator2>::value_type; + + if (__last - __first > 0) + { + __par_backend::__parallel_histogram(__backend_tag{}, ::std::forward<_ExecutionPolicy>(__exec), __first, __last, + __num_bins, __histogram_first, [&](auto __subrange_first, auto __subrange_last, auto __histogram_first) { + __brick_histogram(__subrange_first, __subrange_last, __func, __histogram_first, _IsVector{}); + }); + } + else + { + __pattern_fill(__parallel_tag<_IsVector>{}, std::forward<_ExecutionPolicy>(__exec),__histogram_first, __histogram_first + __num_bins, _HistogramValueT{0}); + } +} + } // namespace __internal } // namespace dpl } // namespace oneapi diff --git a/include/oneapi/dpl/pstl/histogram_impl.h b/include/oneapi/dpl/pstl/histogram_impl.h index 1273414807f..9f4ed9f56e1 100644 --- a/include/oneapi/dpl/pstl/histogram_impl.h +++ b/include/oneapi/dpl/pstl/histogram_impl.h @@ -19,6 +19,7 @@ #include "histogram_extension_defs.h" #include "histogram_binhash_utils.h" #include "iterator_impl.h" +#include "algorithm_impl.h" #if _ONEDPL_HETERO_BACKEND # include "hetero/histogram_impl_hetero.h" @@ -29,23 +30,6 @@ namespace oneapi namespace dpl { -namespace __internal -{ - -template -void -__pattern_histogram(_Tag, _ExecutionPolicy&& exec, _RandomAccessIterator1 __first, _RandomAccessIterator1 __last, - _Size __num_bins, _IdxHashFunc __func, _RandomAccessIterator2 __histogram_first) -{ - static_assert(__is_serial_tag_v<_Tag> || __is_parallel_forward_tag_v<_Tag>); - - static_assert(sizeof(_Size) == 0 /*false*/, - "Histogram API is currently unsupported for policies other than device execution policies"); -} - -} // namespace __internal - template oneapi::dpl::__internal::__enable_if_execution_policy<_ExecutionPolicy, _RandomAccessIterator2> diff --git a/include/oneapi/dpl/pstl/omp/parallel_histogram.h b/include/oneapi/dpl/pstl/omp/parallel_histogram.h new file mode 100644 index 00000000000..c5c8ef67d20 --- /dev/null +++ b/include/oneapi/dpl/pstl/omp/parallel_histogram.h @@ -0,0 +1,115 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Copyright (C) Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// This file incorporates work covered by the following copyright and permission +// notice: +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// +//===----------------------------------------------------------------------===// + + +#ifndef _ONEDPL_INTERNAL_OMP_PARALLEL_HISTOGRAM_H +#define _ONEDPL_INTERNAL_OMP_PARALLEL_HISTOGRAM_H + +#include "util.h" + +namespace oneapi +{ +namespace dpl +{ +namespace __omp_backend +{ + +template +void +__histogram_body(_RandomAccessIterator1 __first, _RandomAccessIterator1 __last, _Size __num_bins, + _RandomAccessIterator2 __histogram_first, _Fp __f) +{ + using _HistogramValueT = typename ::std::iterator_traits<_RandomAccessIterator2>::value_type; + + const std::size_t __num_threads = omp_get_num_threads(); + const std::size_t __size = __last - __first; + + // Initial partition of the iteration space into chunks. If the range is too small, + // this will result in a nonsense policy, so we check on the size as well below. + auto __policy1 = oneapi::dpl::__omp_backend::__chunk_partitioner(__first, __last); + auto __policy2 = oneapi::dpl::__omp_backend::__chunk_partitioner(__histogram_first, __histogram_first + __num_bins); + + std::vector> __local_histograms(__num_threads); + + //TODO: use histogram output for zeroth thread? + for (std::size_t __i = 0; __i < __num_threads; ++__i) + { + __local_histograms[__i].resize(__num_bins, _HistogramValueT{0}); + } + + // main loop + _PSTL_PRAGMA(omp taskloop shared(__local_histograms)) + for (std::size_t __chunk = 0; __chunk < __policy1.__n_chunks; ++__chunk) + { + oneapi::dpl::__omp_backend::__process_chunk( + __policy1, __first, __chunk, [&](auto __chunk_first, auto __chunk_last) { + auto __thread_num = omp_get_thread_num(); + __f(__chunk_first, __chunk_last, __local_histograms[__thread_num].begin()); + }); + } + + _PSTL_PRAGMA(omp taskloop shared(__local_histograms)) + for (std::size_t __chunk = 0; __chunk < __policy2.__n_chunks; ++__chunk) + { + oneapi::dpl::__omp_backend::__process_chunk( + __policy2, __histogram_first, __chunk, [&](auto __chunk_first, auto __chunk_last) { + for (auto __iter = __chunk_first; __iter != __chunk_last; ++__iter) + { + *__iter = __local_histograms[0][__iter - __histogram_first]; + } + for (auto __iter = __chunk_first; __iter != __chunk_last; ++__iter) + { + for (std::size_t __i = 1; __i < __num_threads; ++__i) + { + *__iter += __local_histograms[__i][__iter - __histogram_first]; + } + } + }); + } +} + +template +void +__parallel_histogram(oneapi::dpl::__internal::__omp_backend_tag, _ExecutionPolicy&&, _RandomAccessIterator1 __first, + _RandomAccessIterator1 __last, _Size __num_bins, _RandomAccessIterator2 __histogram_first, _Fp __f) +{ + if (omp_in_parallel()) + { + // We don't create a nested parallel region in an existing parallel + // region: just create tasks + oneapi::dpl::__omp_backend::__histogram_body(__first, __last, __num_bins, __histogram_first, __f); + } + else + { + // Create a parallel region, and a single thread will create tasks + // for the region. + _PSTL_PRAGMA(omp parallel) + _PSTL_PRAGMA(omp single nowait) + { + oneapi::dpl::__omp_backend::__histogram_body(__first, __last, __num_bins, __histogram_first, __f); + } + } + +} + +} // namespace __omp_backend +} // namespace dpl +} // namespace oneapi +#endif // _ONEDPL_INTERNAL_OMP_PARALLEL_HISTOGRAM_H + + + + diff --git a/include/oneapi/dpl/pstl/parallel_backend_omp.h b/include/oneapi/dpl/pstl/parallel_backend_omp.h index f608581b245..0fd87008c49 100644 --- a/include/oneapi/dpl/pstl/parallel_backend_omp.h +++ b/include/oneapi/dpl/pstl/parallel_backend_omp.h @@ -62,4 +62,9 @@ //------------------------------------------------------------------------ #include "./omp/parallel_merge.h" +//------------------------------------------------------------------------ +// parallel_histogram +//------------------------------------------------------------------------ +#include "./omp/parallel_histogram.h" + #endif //_ONEDPL_PARALLEL_BACKEND_OMP_H diff --git a/include/oneapi/dpl/pstl/parallel_backend_serial.h b/include/oneapi/dpl/pstl/parallel_backend_serial.h index 6acd4b617f9..ea4f474972c 100644 --- a/include/oneapi/dpl/pstl/parallel_backend_serial.h +++ b/include/oneapi/dpl/pstl/parallel_backend_serial.h @@ -129,6 +129,17 @@ __parallel_for_each(oneapi::dpl::__internal::__serial_backend_tag, _ExecutionPol __f(*__iter); } +template +void +__parallel_histogram(oneapi::dpl::__internal::__serial_backend_tag, _ExecutionPolicy&& exec, _RandomAccessIterator1 __first, _RandomAccessIterator1 __last, + _Size __num_bins, _RandomAccessIterator2 __histogram_first, _Fp __f) +{ + using _HistogramValueT = typename ::std::iterator_traits<_RandomAccessIterator2>::value_type; + ::std::fill(__histogram_first, __histogram_first + __num_bins, _HistogramValueT{0}); + __f(__first, __last, __histogram_first); +} + + } // namespace __serial_backend } // namespace dpl } // namespace oneapi diff --git a/include/oneapi/dpl/pstl/parallel_backend_tbb.h b/include/oneapi/dpl/pstl/parallel_backend_tbb.h index 32efecd95a2..5062dd676de 100644 --- a/include/oneapi/dpl/pstl/parallel_backend_tbb.h +++ b/include/oneapi/dpl/pstl/parallel_backend_tbb.h @@ -34,6 +34,7 @@ #include #include #include +#include #if TBB_INTERFACE_VERSION > 12000 # include #endif @@ -1306,6 +1307,44 @@ __parallel_for_each(oneapi::dpl::__internal::__tbb_backend_tag, _ExecutionPolicy tbb::this_task_arena::isolate([&]() { tbb::parallel_for_each(__begin, __end, __f); }); } +//------------------------------------------------------------------------ +// parallel_histogram +//------------------------------------------------------------------------ +template +void +__parallel_histogram(oneapi::dpl::__internal::__tbb_backend_tag, _ExecutionPolicy&&, _RandomAccessIterator1 __first, + _RandomAccessIterator1 __last, _Size __num_bins, _RandomAccessIterator2 __histogram_first, _Fp __f) +{ + using _HistogramValueT = typename ::std::iterator_traits<_RandomAccessIterator2>::value_type; + + tbb::this_task_arena::isolate([&]() { + tbb::enumerable_thread_specific> __thread_local_histogram(__num_bins, _HistogramValueT{0}); + size_t __n = __last - __first; + tbb::parallel_for(tbb::blocked_range<_Size>(0, __n), + [&](const tbb::blocked_range<_Size>& __range) { + std::vector<_HistogramValueT>& __local_histogram = __thread_local_histogram.local(); + __f(__first + __range.begin(), __first + __range.end(), __local_histogram.begin()); + }); + tbb::parallel_for(tbb::blocked_range<_Size>(0, __num_bins), + [&](const tbb::blocked_range<_Size>& __range) { + auto __local_histogram = __thread_local_histogram.begin(); + for (auto __i = __range.begin(); __i != __range.end(); ++__i) + { + __histogram_first[__i] = (*__local_histogram)[__i]; + } + ++__local_histogram; + for (; __local_histogram != __thread_local_histogram.end(); ++__local_histogram) + { + for (auto __i = __range.begin(); __i != __range.end(); ++__i) + { + __histogram_first[__i] += (*__local_histogram)[__i]; + } + } + }); + }); +} + } // namespace __tbb_backend } // namespace dpl } // namespace oneapi diff --git a/test/parallel_api/numeric/numeric.ops/histogram.pass.cpp b/test/parallel_api/numeric/numeric.ops/histogram.pass.cpp index b16885cf8aa..6fbfc53bfa7 100644 --- a/test/parallel_api/numeric/numeric.ops/histogram.pass.cpp +++ b/test/parallel_api/numeric/numeric.ops/histogram.pass.cpp @@ -24,12 +24,11 @@ using namespace TestUtils; -#if TEST_DPCPP_BACKEND_PRESENT struct test_histogram_even_bins { template - void + std::enable_if_t> operator()(Policy&& exec, Iterator1 in_first, Iterator1 in_last, Iterator2 expected_bin_first, Iterator2 expected_bin_last, Iterator3 bin_first, Iterator3 bin_last, Size n, T bin_min, T bin_max, Size trash) @@ -41,13 +40,21 @@ struct test_histogram_even_bins EXPECT_EQ_N(expected_bin_first, bin_first, bin_size, "wrong result from histogram"); ::std::fill_n(bin_first, bin_size, trash); } + + template + std::enable_if_t> + operator()(Policy&& exec, Iterator1 in_first, Iterator1 in_last, Iterator2 expected_bin_first, + Iterator2 expected_bin_last, Iterator3 bin_first, Iterator3 bin_last, Size n, T bin_min, T bin_max, + Size trash) + { + } }; struct test_histogram_range_bins { template - void + std::enable_if_t> operator()(Policy&& exec, Iterator1 in_first, Iterator1 in_last, Iterator2 boundary_first, Iterator2 boundary_last, Iterator3 expected_bin_first, Iterator3 /* expected_bin_last */, Iterator4 bin_first, Iterator4 bin_last, Size trash) @@ -59,6 +66,15 @@ struct test_histogram_range_bins EXPECT_EQ_N(expected_bin_first, bin_first, bin_size, "wrong result from histogram"); ::std::fill_n(bin_first, bin_size, trash); } + + template + std::enable_if_t> + operator()(Policy&& exec, Iterator1 in_first, Iterator1 in_last, Iterator2 boundary_first, Iterator2 boundary_last, + Iterator3 expected_bin_first, Iterator3 /* expected_bin_last */, Iterator4 bin_first, Iterator4 bin_last, + Size trash) + { + } }; template <::std::size_t CallNumber, typename Size, typename T> @@ -74,11 +90,11 @@ test_range_and_even_histogram(Size n, T min_boundary, T max_boundary, T overflow Sequence expected(num_bins, [](size_t k) { return 0; }); Sequence out(num_bins, [&](size_t k) { return trash; }); - invoke_on_all_hetero_policies()(test_histogram_even_bins(), in.begin(), in.end(), expected.begin(), + invoke_on_all_policies()(test_histogram_even_bins(), in.begin(), in.end(), expected.begin(), expected.end(), out.begin(), out.end(), Size(in.size()), min_boundary, max_boundary, trash); # if !ONEDPL_FPGA_DEVICE - invoke_on_all_hetero_policies()(test_histogram_even_bins(), in.cbegin(), in.cend(), + invoke_on_all_policies()(test_histogram_even_bins(), in.cbegin(), in.cend(), expected.begin(), expected.end(), out.begin(), out.end(), Size(in.size()), min_boundary, max_boundary, trash); # endif // !ONEDPL_FPGA_DEVICE @@ -86,14 +102,15 @@ test_range_and_even_histogram(Size n, T min_boundary, T max_boundary, T overflow T offset = (max_boundary - min_boundary) / T(num_bins); Sequence boundaries(num_bins + 1, [&](size_t k) { return k * offset + (std::rand() % jitter) + min_boundary; }); - invoke_on_all_hetero_policies()(test_histogram_range_bins(), in.begin(), in.end(), + invoke_on_all_policies()(test_histogram_range_bins(), in.begin(), in.end(), boundaries.begin(), boundaries.end(), expected.begin(), expected.end(), out.begin(), out.end(), trash); # if !ONEDPL_FPGA_DEVICE - invoke_on_all_hetero_policies()(test_histogram_range_bins(), in.cbegin(), in.cend(), + invoke_on_all_policies()(test_histogram_range_bins(), in.cbegin(), in.cend(), boundaries.cbegin(), boundaries.cend(), expected.begin(), expected.end(), out.begin(), out.end(), trash); # endif // !ONEDPL_FPGA_DEVICE + } template <::std::size_t CallNumber, typename T, typename Size> @@ -108,19 +125,15 @@ test_histogram(T min_boundary, T max_boundary, T overflow, Size jitter, Size tra } } } -#endif // TEST_DPCPP_BACKEND_PRESENT int main() { -#if TEST_DPCPP_BACKEND_PRESENT test_histogram<0, float, uint32_t>(10000.0f, 110000.0f, 300.0f, uint32_t(50), uint32_t(99999)); #if !ONEDPL_FPGA_DEVICE test_histogram<1, std::int32_t, uint64_t>(-50000, 50000, 10000, uint64_t(5), uint64_t(99999)); #endif //!ONEDPL_FPGA_DEVICE -#endif // TEST_DPCPP_BACKEND_PRESENT - - return done(TEST_DPCPP_BACKEND_PRESENT); + return done(); } From 955f0e21c2de390ba61216abe9f98139d79d6e8e Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Fri, 8 Nov 2024 15:36:35 -0500 Subject: [PATCH 02/50] atomics histogram implementation Signed-off-by: Dan Hoeflinger --- include/oneapi/dpl/pstl/algorithm_impl.h | 48 +++++++++++++++++++++--- 1 file changed, 43 insertions(+), 5 deletions(-) diff --git a/include/oneapi/dpl/pstl/algorithm_impl.h b/include/oneapi/dpl/pstl/algorithm_impl.h index e0dce1006f8..a62369491d5 100644 --- a/include/oneapi/dpl/pstl/algorithm_impl.h +++ b/include/oneapi/dpl/pstl/algorithm_impl.h @@ -20,6 +20,7 @@ #include #include #include +#include #include "algorithm_fwd.h" @@ -4305,6 +4306,26 @@ __brick_histogram(_ForwardIterator __first, _ForwardIterator __last, _IdxHashFun } } +template +void +__brick_histogram_atomics(_ForwardIterator __first, _ForwardIterator __last, _IdxHashFunc __func, + _RandomAccessIterator __histogram_first, _IsVector) noexcept +{ + using _Size = typename ::std::iterator_traits<_ForwardIterator>::difference_type; + using _HistogramValueT = typename ::std::iterator_traits<_RandomAccessIterator>::value_type; + + for (; __first != __last; ++__first) + { + _Size __bin = __func.get_bin(*__first); + if (__bin >= 0) + { + std::atomic<_HistogramValueT>* __atomic_histogram_bin = reinterpret_cast*>(std::addressof(*(__histogram_first + __bin))); + __atomic_histogram_bin->fetch_add(_HistogramValueT{1}, std::memory_order_relaxed); + } + } +} + + template void @@ -4325,13 +4346,30 @@ __pattern_histogram(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _Rando { using __backend_tag = typename __parallel_tag<_IsVector>::__backend_tag; using _HistogramValueT = typename std::iterator_traits<_RandomAccessIterator2>::value_type; + using _DiffType = typename ::std::iterator_traits<_RandomAccessIterator2>::difference_type; + + const _DiffType __histogram_threshold = 1048576; - if (__last - __first > 0) + _DiffType __n = __last - __first; + if (__n > 0) { - __par_backend::__parallel_histogram(__backend_tag{}, ::std::forward<_ExecutionPolicy>(__exec), __first, __last, - __num_bins, __histogram_first, [&](auto __subrange_first, auto __subrange_last, auto __histogram_first) { - __brick_histogram(__subrange_first, __subrange_last, __func, __histogram_first, _IsVector{}); - }); + if (__n > __histogram_threshold) + { + //Atomic histogram brick to protect against race conditions + __pattern_fill(__parallel_tag<_IsVector>{}, std::forward<_ExecutionPolicy>(__exec),__histogram_first, __histogram_first + __num_bins, _HistogramValueT{0}); + __par_backend::__parallel_for(__backend_tag{}, std::forward<_ExecutionPolicy>(__exec), _DiffType{0}, __n, + [&](_DiffType __i, _DiffType __j) { + __brick_histogram_atomics(__first + __i, __first + __j, __func, __histogram_first, _IsVector{}); + }); + } + else + { + //Embarassingly parallel with temporary histogram outputs + __par_backend::__parallel_histogram(__backend_tag{}, ::std::forward<_ExecutionPolicy>(__exec), __first, __last, + __num_bins, __histogram_first, [&](auto __subrange_first, auto __subrange_last, auto __histogram_first) { + __brick_histogram(__subrange_first, __subrange_last, __func, __histogram_first, _IsVector{}); + }); + } } else { From ebcada674bdad76fbb8d6511758b5a5935e65bef Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Fri, 8 Nov 2024 15:38:50 -0500 Subject: [PATCH 03/50] clang format Signed-off-by: Dan Hoeflinger --- include/oneapi/dpl/pstl/algorithm_impl.h | 42 +++++++++++-------- .../oneapi/dpl/pstl/omp/parallel_histogram.h | 6 --- .../oneapi/dpl/pstl/parallel_backend_tbb.h | 37 ++++++++-------- 3 files changed, 42 insertions(+), 43 deletions(-) diff --git a/include/oneapi/dpl/pstl/algorithm_impl.h b/include/oneapi/dpl/pstl/algorithm_impl.h index a62369491d5..c290f3f9c13 100644 --- a/include/oneapi/dpl/pstl/algorithm_impl.h +++ b/include/oneapi/dpl/pstl/algorithm_impl.h @@ -4293,7 +4293,7 @@ __pattern_shift_right(_Tag __tag, _ExecutionPolicy&& __exec, _BidirectionalItera template void __brick_histogram(_ForwardIterator __first, _ForwardIterator __last, _IdxHashFunc __func, - _RandomAccessIterator __histogram_first, _IsVector) noexcept + _RandomAccessIterator __histogram_first, _IsVector) noexcept { using _Size = typename ::std::iterator_traits<_ForwardIterator>::difference_type; for (; __first != __last; ++__first) @@ -4309,7 +4309,7 @@ __brick_histogram(_ForwardIterator __first, _ForwardIterator __last, _IdxHashFun template void __brick_histogram_atomics(_ForwardIterator __first, _ForwardIterator __last, _IdxHashFunc __func, - _RandomAccessIterator __histogram_first, _IsVector) noexcept + _RandomAccessIterator __histogram_first, _IsVector) noexcept { using _Size = typename ::std::iterator_traits<_ForwardIterator>::difference_type; using _HistogramValueT = typename ::std::iterator_traits<_RandomAccessIterator>::value_type; @@ -4319,13 +4319,13 @@ __brick_histogram_atomics(_ForwardIterator __first, _ForwardIterator __last, _Id _Size __bin = __func.get_bin(*__first); if (__bin >= 0) { - std::atomic<_HistogramValueT>* __atomic_histogram_bin = reinterpret_cast*>(std::addressof(*(__histogram_first + __bin))); + std::atomic<_HistogramValueT>* __atomic_histogram_bin = + reinterpret_cast*>(std::addressof(*(__histogram_first + __bin))); __atomic_histogram_bin->fetch_add(_HistogramValueT{1}, std::memory_order_relaxed); } } } - template void @@ -4334,15 +4334,17 @@ __pattern_histogram(_Tag, _ExecutionPolicy&& __exec, _ForwardIterator __first, _ { using _HistogramValueT = typename std::iterator_traits<_RandomAccessIterator>::value_type; static_assert(__is_serial_tag_v<_Tag> || __is_parallel_forward_tag_v<_Tag>); - __pattern_fill(_Tag{}, std::forward<_ExecutionPolicy>(__exec), __histogram_first, __histogram_first + __num_bins, _HistogramValueT{0}); + __pattern_fill(_Tag{}, std::forward<_ExecutionPolicy>(__exec), __histogram_first, __histogram_first + __num_bins, + _HistogramValueT{0}); __brick_histogram(__first, __last, __func, __histogram_first, typename _Tag::__is_vector{}); } -template +template void -__pattern_histogram(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _RandomAccessIterator1 __first, _RandomAccessIterator1 __last, - _Size __num_bins, _IdxHashFunc __func, _RandomAccessIterator2 __histogram_first) +__pattern_histogram(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _RandomAccessIterator1 __first, + _RandomAccessIterator1 __last, _Size __num_bins, _IdxHashFunc __func, + _RandomAccessIterator2 __histogram_first) { using __backend_tag = typename __parallel_tag<_IsVector>::__backend_tag; using _HistogramValueT = typename std::iterator_traits<_RandomAccessIterator2>::value_type; @@ -4356,24 +4358,28 @@ __pattern_histogram(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _Rando if (__n > __histogram_threshold) { //Atomic histogram brick to protect against race conditions - __pattern_fill(__parallel_tag<_IsVector>{}, std::forward<_ExecutionPolicy>(__exec),__histogram_first, __histogram_first + __num_bins, _HistogramValueT{0}); + __pattern_fill(__parallel_tag<_IsVector>{}, std::forward<_ExecutionPolicy>(__exec), __histogram_first, + __histogram_first + __num_bins, _HistogramValueT{0}); __par_backend::__parallel_for(__backend_tag{}, std::forward<_ExecutionPolicy>(__exec), _DiffType{0}, __n, - [&](_DiffType __i, _DiffType __j) { - __brick_histogram_atomics(__first + __i, __first + __j, __func, __histogram_first, _IsVector{}); - }); + [&](_DiffType __i, _DiffType __j) { + __brick_histogram_atomics(__first + __i, __first + __j, __func, + __histogram_first, _IsVector{}); + }); } else { //Embarassingly parallel with temporary histogram outputs - __par_backend::__parallel_histogram(__backend_tag{}, ::std::forward<_ExecutionPolicy>(__exec), __first, __last, - __num_bins, __histogram_first, [&](auto __subrange_first, auto __subrange_last, auto __histogram_first) { - __brick_histogram(__subrange_first, __subrange_last, __func, __histogram_first, _IsVector{}); - }); + __par_backend::__parallel_histogram( + __backend_tag{}, ::std::forward<_ExecutionPolicy>(__exec), __first, __last, __num_bins, + __histogram_first, [&](auto __subrange_first, auto __subrange_last, auto __histogram_first) { + __brick_histogram(__subrange_first, __subrange_last, __func, __histogram_first, _IsVector{}); + }); } } else { - __pattern_fill(__parallel_tag<_IsVector>{}, std::forward<_ExecutionPolicy>(__exec),__histogram_first, __histogram_first + __num_bins, _HistogramValueT{0}); + __pattern_fill(__parallel_tag<_IsVector>{}, std::forward<_ExecutionPolicy>(__exec), __histogram_first, + __histogram_first + __num_bins, _HistogramValueT{0}); } } diff --git a/include/oneapi/dpl/pstl/omp/parallel_histogram.h b/include/oneapi/dpl/pstl/omp/parallel_histogram.h index c5c8ef67d20..5a3ca7a0a73 100644 --- a/include/oneapi/dpl/pstl/omp/parallel_histogram.h +++ b/include/oneapi/dpl/pstl/omp/parallel_histogram.h @@ -13,7 +13,6 @@ // //===----------------------------------------------------------------------===// - #ifndef _ONEDPL_INTERNAL_OMP_PARALLEL_HISTOGRAM_H #define _ONEDPL_INTERNAL_OMP_PARALLEL_HISTOGRAM_H @@ -102,14 +101,9 @@ __parallel_histogram(oneapi::dpl::__internal::__omp_backend_tag, _ExecutionPolic oneapi::dpl::__omp_backend::__histogram_body(__first, __last, __num_bins, __histogram_first, __f); } } - } } // namespace __omp_backend } // namespace dpl } // namespace oneapi #endif // _ONEDPL_INTERNAL_OMP_PARALLEL_HISTOGRAM_H - - - - diff --git a/include/oneapi/dpl/pstl/parallel_backend_tbb.h b/include/oneapi/dpl/pstl/parallel_backend_tbb.h index 5062dd676de..e705af1fb41 100644 --- a/include/oneapi/dpl/pstl/parallel_backend_tbb.h +++ b/include/oneapi/dpl/pstl/parallel_backend_tbb.h @@ -1319,29 +1319,28 @@ __parallel_histogram(oneapi::dpl::__internal::__tbb_backend_tag, _ExecutionPolic using _HistogramValueT = typename ::std::iterator_traits<_RandomAccessIterator2>::value_type; tbb::this_task_arena::isolate([&]() { - tbb::enumerable_thread_specific> __thread_local_histogram(__num_bins, _HistogramValueT{0}); + tbb::enumerable_thread_specific> __thread_local_histogram(__num_bins, + _HistogramValueT{0}); size_t __n = __last - __first; - tbb::parallel_for(tbb::blocked_range<_Size>(0, __n), - [&](const tbb::blocked_range<_Size>& __range) { - std::vector<_HistogramValueT>& __local_histogram = __thread_local_histogram.local(); - __f(__first + __range.begin(), __first + __range.end(), __local_histogram.begin()); - }); - tbb::parallel_for(tbb::blocked_range<_Size>(0, __num_bins), - [&](const tbb::blocked_range<_Size>& __range) { - auto __local_histogram = __thread_local_histogram.begin(); + tbb::parallel_for(tbb::blocked_range<_Size>(0, __n), [&](const tbb::blocked_range<_Size>& __range) { + std::vector<_HistogramValueT>& __local_histogram = __thread_local_histogram.local(); + __f(__first + __range.begin(), __first + __range.end(), __local_histogram.begin()); + }); + tbb::parallel_for(tbb::blocked_range<_Size>(0, __num_bins), [&](const tbb::blocked_range<_Size>& __range) { + auto __local_histogram = __thread_local_histogram.begin(); + for (auto __i = __range.begin(); __i != __range.end(); ++__i) + { + __histogram_first[__i] = (*__local_histogram)[__i]; + } + ++__local_histogram; + for (; __local_histogram != __thread_local_histogram.end(); ++__local_histogram) + { for (auto __i = __range.begin(); __i != __range.end(); ++__i) { - __histogram_first[__i] = (*__local_histogram)[__i]; + __histogram_first[__i] += (*__local_histogram)[__i]; } - ++__local_histogram; - for (; __local_histogram != __thread_local_histogram.end(); ++__local_histogram) - { - for (auto __i = __range.begin(); __i != __range.end(); ++__i) - { - __histogram_first[__i] += (*__local_histogram)[__i]; - } - } - }); + } + }); }); } From 93dd493e13cd38dbbfdef945d9fd36234aab9e10 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Fri, 8 Nov 2024 15:39:56 -0500 Subject: [PATCH 04/50] comment about atomics Signed-off-by: Dan Hoeflinger --- include/oneapi/dpl/pstl/algorithm_impl.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/oneapi/dpl/pstl/algorithm_impl.h b/include/oneapi/dpl/pstl/algorithm_impl.h index c290f3f9c13..721ebdc9a3e 100644 --- a/include/oneapi/dpl/pstl/algorithm_impl.h +++ b/include/oneapi/dpl/pstl/algorithm_impl.h @@ -4319,6 +4319,7 @@ __brick_histogram_atomics(_ForwardIterator __first, _ForwardIterator __last, _Id _Size __bin = __func.get_bin(*__first); if (__bin >= 0) { + //TODO: this isnt guaranteed to work by the compiler / standard library, need to specialize std::atomic<_HistogramValueT>* __atomic_histogram_bin = reinterpret_cast*>(std::addressof(*(__histogram_first + __bin))); __atomic_histogram_bin->fetch_add(_HistogramValueT{1}, std::memory_order_relaxed); From 4d950442ebeb431721d09bdfd4a9b36d38ad7620 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Tue, 12 Nov 2024 09:07:54 -0500 Subject: [PATCH 05/50] first draft of atomic increment (unchecked) Signed-off-by: Dan Hoeflinger --- include/oneapi/dpl/internal/version_impl.h | 1 + include/oneapi/dpl/pstl/algorithm_impl.h | 5 +---- include/oneapi/dpl/pstl/utils.h | 9 +++++++++ 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/include/oneapi/dpl/internal/version_impl.h b/include/oneapi/dpl/internal/version_impl.h index 38a15e5eecf..857cb327a65 100644 --- a/include/oneapi/dpl/internal/version_impl.h +++ b/include/oneapi/dpl/internal/version_impl.h @@ -20,6 +20,7 @@ # define _ONEDPL_STD_FEATURE_MACROS_PRESENT 1 // Clang 15 and older do not support range adaptors, see https://bugs.llvm.org/show_bug.cgi?id=44833 # define _ONEDPL_CPP20_RANGES_PRESENT ((__cpp_lib_ranges >= 201911L) && !(__clang__ && __clang_major__ < 16)) +# define _ONEDPL_CPP20_ATOMIC_REF_PRESENT (__cpp_lib_atomic_ref >= 201806L) #else # define _ONEDPL_STD_FEATURE_MACROS_PRESENT 0 # define _ONEDPL_CPP20_RANGES_PRESENT 0 diff --git a/include/oneapi/dpl/pstl/algorithm_impl.h b/include/oneapi/dpl/pstl/algorithm_impl.h index 721ebdc9a3e..2841d19cf2e 100644 --- a/include/oneapi/dpl/pstl/algorithm_impl.h +++ b/include/oneapi/dpl/pstl/algorithm_impl.h @@ -4319,10 +4319,7 @@ __brick_histogram_atomics(_ForwardIterator __first, _ForwardIterator __last, _Id _Size __bin = __func.get_bin(*__first); if (__bin >= 0) { - //TODO: this isnt guaranteed to work by the compiler / standard library, need to specialize - std::atomic<_HistogramValueT>* __atomic_histogram_bin = - reinterpret_cast*>(std::addressof(*(__histogram_first + __bin))); - __atomic_histogram_bin->fetch_add(_HistogramValueT{1}, std::memory_order_relaxed); + _ONEDPL_ATOMIC_INCREMENT(__histogram_first[__bin]); } } } diff --git a/include/oneapi/dpl/pstl/utils.h b/include/oneapi/dpl/pstl/utils.h index 9c32178a78c..b5934f701ac 100644 --- a/include/oneapi/dpl/pstl/utils.h +++ b/include/oneapi/dpl/pstl/utils.h @@ -42,6 +42,15 @@ # include // memcpy #endif +#if _ONEDPL_CPP20_ATOMIC_REF_PRESENT +# define _ONEDPL_ATOMIC_INCREMENT(element_ref) \ + std::atomic_ref>{element_ref}.fetch_add(1, std::memory_order_relaxed); +#elif defined(_MSC_VER) +# define _ONEDPL_ATOMIC_INCREMENT(element_ref) InterlockedAdd(&element_ref, 1); +#else +# define _ONEDPL_ATOMIC_INCREMENT(element_ref) __atomic_fetch_add_n(&element_ref, 1, __ATOMIC_RELAXED); +#endif + namespace oneapi { namespace dpl From a7fe8a1d30f98b9cef424e3fe0f878ebdd5f8880 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Tue, 12 Nov 2024 14:11:07 -0500 Subject: [PATCH 06/50] atomics include and fix builtin Signed-off-by: Dan Hoeflinger --- include/oneapi/dpl/pstl/utils.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/utils.h b/include/oneapi/dpl/pstl/utils.h index b5934f701ac..27bd725d339 100644 --- a/include/oneapi/dpl/pstl/utils.h +++ b/include/oneapi/dpl/pstl/utils.h @@ -25,6 +25,7 @@ #include #include #include +#include #if _ONEDPL_BACKEND_SYCL # include "hetero/dpcpp/sycl_defs.h" @@ -48,7 +49,7 @@ #elif defined(_MSC_VER) # define _ONEDPL_ATOMIC_INCREMENT(element_ref) InterlockedAdd(&element_ref, 1); #else -# define _ONEDPL_ATOMIC_INCREMENT(element_ref) __atomic_fetch_add_n(&element_ref, 1, __ATOMIC_RELAXED); +# define _ONEDPL_ATOMIC_INCREMENT(element_ref) __atomic_fetch_add(&element_ref, 1, __ATOMIC_RELAXED); #endif namespace oneapi From 6be0b7b89be18cb5499de18485d33a500bd24f9e Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Tue, 12 Nov 2024 14:12:41 -0500 Subject: [PATCH 07/50] large case Signed-off-by: Dan Hoeflinger --- .../numeric/numeric.ops/histogram.pass.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/test/parallel_api/numeric/numeric.ops/histogram.pass.cpp b/test/parallel_api/numeric/numeric.ops/histogram.pass.cpp index 6fbfc53bfa7..e532b56a091 100644 --- a/test/parallel_api/numeric/numeric.ops/histogram.pass.cpp +++ b/test/parallel_api/numeric/numeric.ops/histogram.pass.cpp @@ -34,8 +34,14 @@ struct test_histogram_even_bins Size trash) { const Size bin_size = bin_last - bin_first; + std::cout<<"n: "<(n, min_boundary, max_boundary, overflow, jitter, bin_size, trash); } } + //single large case to hit the atomic case + Size n = 1100000; + Size bin_size = 1100000; + test_range_and_even_histogram(n, min_boundary, max_boundary, overflow, jitter, bin_size, trash); } int From 9e3f16c88c7e227452113967fc83822cae45ca33 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Fri, 15 Nov 2024 16:51:13 -0500 Subject: [PATCH 08/50] fix threshold check Signed-off-by: Dan Hoeflinger --- include/oneapi/dpl/pstl/algorithm_impl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/algorithm_impl.h b/include/oneapi/dpl/pstl/algorithm_impl.h index 2841d19cf2e..bc2c957861e 100644 --- a/include/oneapi/dpl/pstl/algorithm_impl.h +++ b/include/oneapi/dpl/pstl/algorithm_impl.h @@ -4353,7 +4353,7 @@ __pattern_histogram(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _Rando _DiffType __n = __last - __first; if (__n > 0) { - if (__n > __histogram_threshold) + if (__num_bins > __histogram_threshold) { //Atomic histogram brick to protect against race conditions __pattern_fill(__parallel_tag<_IsVector>{}, std::forward<_ExecutionPolicy>(__exec), __histogram_first, From 0592c152432fa5ab5d8497bca71b993519df49f2 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Tue, 19 Nov 2024 12:55:20 -0500 Subject: [PATCH 09/50] minor improvements Signed-off-by: Dan Hoeflinger --- CMakeLists.txt | 2 +- include/oneapi/dpl/pstl/algorithm_impl.h | 4 ++-- include/oneapi/dpl/pstl/omp/parallel_histogram.h | 6 +----- 3 files changed, 4 insertions(+), 8 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index cbe9a214a1f..cc58ced3502 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -173,7 +173,7 @@ if (ONEDPL_BACKEND MATCHES "^(tbb|dpcpp|dpcpp_only)$") set(SET_BACKEND_${ONEDPL_BACKEND_NAME} TRUE) if (ONEDPL_BACKEND MATCHES "^(tbb|dpcpp)$") - find_package(TBB 2021 REQUIRED tbb OPTIONAL_COMPONENTS tbbmalloc) + find_package(TBB 2021 REQUIRED tbb tbbmalloc) message(STATUS "oneDPL uses oneTBB ${TBB_VERSION}") target_link_libraries(oneDPL INTERFACE TBB::tbb) endif() diff --git a/include/oneapi/dpl/pstl/algorithm_impl.h b/include/oneapi/dpl/pstl/algorithm_impl.h index bc2c957861e..3b389a0c9aa 100644 --- a/include/oneapi/dpl/pstl/algorithm_impl.h +++ b/include/oneapi/dpl/pstl/algorithm_impl.h @@ -4353,11 +4353,11 @@ __pattern_histogram(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _Rando _DiffType __n = __last - __first; if (__n > 0) { - if (__num_bins > __histogram_threshold) + if (__num_bins >= __histogram_threshold) { - //Atomic histogram brick to protect against race conditions __pattern_fill(__parallel_tag<_IsVector>{}, std::forward<_ExecutionPolicy>(__exec), __histogram_first, __histogram_first + __num_bins, _HistogramValueT{0}); + //Atomic histogram brick to protect against race conditions __par_backend::__parallel_for(__backend_tag{}, std::forward<_ExecutionPolicy>(__exec), _DiffType{0}, __n, [&](_DiffType __i, _DiffType __j) { __brick_histogram_atomics(__first + __i, __first + __j, __func, diff --git a/include/oneapi/dpl/pstl/omp/parallel_histogram.h b/include/oneapi/dpl/pstl/omp/parallel_histogram.h index 5a3ca7a0a73..faa7d7b3946 100644 --- a/include/oneapi/dpl/pstl/omp/parallel_histogram.h +++ b/include/oneapi/dpl/pstl/omp/parallel_histogram.h @@ -40,13 +40,9 @@ __histogram_body(_RandomAccessIterator1 __first, _RandomAccessIterator1 __last, auto __policy1 = oneapi::dpl::__omp_backend::__chunk_partitioner(__first, __last); auto __policy2 = oneapi::dpl::__omp_backend::__chunk_partitioner(__histogram_first, __histogram_first + __num_bins); - std::vector> __local_histograms(__num_threads); + std::vector> __local_histograms(__num_threads, std::vector<_HistogramValueT>(__num_bins, _HistogramValueT{0})); //TODO: use histogram output for zeroth thread? - for (std::size_t __i = 0; __i < __num_threads; ++__i) - { - __local_histograms[__i].resize(__num_bins, _HistogramValueT{0}); - } // main loop _PSTL_PRAGMA(omp taskloop shared(__local_histograms)) From 6ac87a0c32a8d6db02b5e9a44a451b2f59da40d0 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Tue, 19 Nov 2024 20:15:34 -0500 Subject: [PATCH 10/50] MSVC fixes --- CMakeLists.txt | 2 +- include/oneapi/dpl/pstl/utils.h | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index cc58ced3502..5c18269e80d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -336,7 +336,7 @@ elseif(ONEDPL_BACKEND MATCHES "^(omp)$") if (OpenMP_CXX_FLAGS MATCHES ".*-fiopenmp.*") set(_openmp_flag -fopenmp) elseif (OpenMP_CXX_FLAGS MATCHES ".*[-/]Qiopenmp.*") - set(_openmp_flag /Qopenmp) + set(_openmp_flag -Qopenmp) endif() if (_openmp_flag) message(STATUS "Using ${_openmp_flag} for openMP") diff --git a/include/oneapi/dpl/pstl/utils.h b/include/oneapi/dpl/pstl/utils.h index 27bd725d339..23b9d6c729b 100644 --- a/include/oneapi/dpl/pstl/utils.h +++ b/include/oneapi/dpl/pstl/utils.h @@ -45,11 +45,12 @@ #if _ONEDPL_CPP20_ATOMIC_REF_PRESENT # define _ONEDPL_ATOMIC_INCREMENT(element_ref) \ - std::atomic_ref>{element_ref}.fetch_add(1, std::memory_order_relaxed); + std::atomic_ref>{element_ref}.fetch_add(1, std::memory_order_relaxed) #elif defined(_MSC_VER) -# define _ONEDPL_ATOMIC_INCREMENT(element_ref) InterlockedAdd(&element_ref, 1); +//TODO: find a better way to do this with intrinsics on MSVC without including windows.h? +# define _ONEDPL_ATOMIC_INCREMENT(element_ref) reinterpret_cast>*>((&element_ref))->fetch_add(1, std::memory_order_relaxed) #else -# define _ONEDPL_ATOMIC_INCREMENT(element_ref) __atomic_fetch_add(&element_ref, 1, __ATOMIC_RELAXED); +# define _ONEDPL_ATOMIC_INCREMENT(element_ref) __atomic_fetch_add(&element_ref, 1, __ATOMIC_RELAXED) #endif namespace oneapi From af5defb8e6e44574031a4fc106fdbcfd32ba6bb1 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Wed, 20 Nov 2024 15:50:57 -0500 Subject: [PATCH 11/50] parallelize initialization of openMP temp histograms Signed-off-by: Dan Hoeflinger --- include/oneapi/dpl/pstl/omp/parallel_histogram.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/omp/parallel_histogram.h b/include/oneapi/dpl/pstl/omp/parallel_histogram.h index faa7d7b3946..036e66d9c4a 100644 --- a/include/oneapi/dpl/pstl/omp/parallel_histogram.h +++ b/include/oneapi/dpl/pstl/omp/parallel_histogram.h @@ -40,9 +40,14 @@ __histogram_body(_RandomAccessIterator1 __first, _RandomAccessIterator1 __last, auto __policy1 = oneapi::dpl::__omp_backend::__chunk_partitioner(__first, __last); auto __policy2 = oneapi::dpl::__omp_backend::__chunk_partitioner(__histogram_first, __histogram_first + __num_bins); - std::vector> __local_histograms(__num_threads, std::vector<_HistogramValueT>(__num_bins, _HistogramValueT{0})); + std::vector> __local_histograms(__num_threads); //TODO: use histogram output for zeroth thread? + _PSTL_PRAGMA(omp taskloop shared(__local_histograms)) + for (std::size_t __tid = 0; __tid < __num_threads; ++__tid) + { + __local_histograms[__tid].resize(__num_bins, _HistogramValueT{0}); + } // main loop _PSTL_PRAGMA(omp taskloop shared(__local_histograms)) From 65c30af985308dbff7aed0c4628d56f3a7a5d6f6 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Fri, 13 Dec 2024 09:11:08 -0500 Subject: [PATCH 12/50] removing unnecessary type casting Signed-off-by: Dan Hoeflinger --- include/oneapi/dpl/pstl/algorithm_impl.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/include/oneapi/dpl/pstl/algorithm_impl.h b/include/oneapi/dpl/pstl/algorithm_impl.h index 3b389a0c9aa..ff8ccba86f3 100644 --- a/include/oneapi/dpl/pstl/algorithm_impl.h +++ b/include/oneapi/dpl/pstl/algorithm_impl.h @@ -4295,10 +4295,9 @@ void __brick_histogram(_ForwardIterator __first, _ForwardIterator __last, _IdxHashFunc __func, _RandomAccessIterator __histogram_first, _IsVector) noexcept { - using _Size = typename ::std::iterator_traits<_ForwardIterator>::difference_type; for (; __first != __last; ++__first) { - _Size __bin = __func.get_bin(*__first); + std::int32_t __bin = __func.get_bin(*__first); if (__bin >= 0) { ++__histogram_first[__bin]; @@ -4311,12 +4310,9 @@ void __brick_histogram_atomics(_ForwardIterator __first, _ForwardIterator __last, _IdxHashFunc __func, _RandomAccessIterator __histogram_first, _IsVector) noexcept { - using _Size = typename ::std::iterator_traits<_ForwardIterator>::difference_type; - using _HistogramValueT = typename ::std::iterator_traits<_RandomAccessIterator>::value_type; - for (; __first != __last; ++__first) { - _Size __bin = __func.get_bin(*__first); + std::int32_t __bin = __func.get_bin(*__first); if (__bin >= 0) { _ONEDPL_ATOMIC_INCREMENT(__histogram_first[__bin]); From eec36d5b3fbb94b5e17a1c28c0aac4fad01b5923 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Fri, 13 Dec 2024 16:22:36 -0500 Subject: [PATCH 13/50] improving accumulation of local histograms (simd) Signed-off-by: Dan Hoeflinger --- include/oneapi/dpl/pstl/algorithm_impl.h | 14 +++++++-- .../oneapi/dpl/pstl/omp/parallel_histogram.h | 31 ++++++++++--------- .../oneapi/dpl/pstl/parallel_backend_serial.h | 9 +++--- .../oneapi/dpl/pstl/parallel_backend_tbb.h | 17 +++++----- 4 files changed, 40 insertions(+), 31 deletions(-) diff --git a/include/oneapi/dpl/pstl/algorithm_impl.h b/include/oneapi/dpl/pstl/algorithm_impl.h index ff8ccba86f3..daf9315eee5 100644 --- a/include/oneapi/dpl/pstl/algorithm_impl.h +++ b/include/oneapi/dpl/pstl/algorithm_impl.h @@ -4355,7 +4355,7 @@ __pattern_histogram(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _Rando __histogram_first + __num_bins, _HistogramValueT{0}); //Atomic histogram brick to protect against race conditions __par_backend::__parallel_for(__backend_tag{}, std::forward<_ExecutionPolicy>(__exec), _DiffType{0}, __n, - [&](_DiffType __i, _DiffType __j) { + [__first, __func, __histogram_first](_DiffType __i, _DiffType __j) { __brick_histogram_atomics(__first + __i, __first + __j, __func, __histogram_first, _IsVector{}); }); @@ -4365,8 +4365,18 @@ __pattern_histogram(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _Rando //Embarassingly parallel with temporary histogram outputs __par_backend::__parallel_histogram( __backend_tag{}, ::std::forward<_ExecutionPolicy>(__exec), __first, __last, __num_bins, - __histogram_first, [&](auto __subrange_first, auto __subrange_last, auto __histogram_first) { + __histogram_first, + [__func](auto __subrange_first, auto __subrange_last, auto __histogram_first) { __brick_histogram(__subrange_first, __subrange_last, __func, __histogram_first, _IsVector{}); + }, + [](auto __local_histogram_first, std::uint32_t __n, auto __histogram_accum_first) { + oneapi::dpl::__unseq_backend::__simd_walk_2(__local_histogram_first, __n, __histogram_accum_first, + oneapi::dpl::__internal::__pstl_assign()); + }, + [](auto __local_histogram_first, std::uint32_t __n, auto __histogram_accum_first) { + oneapi::dpl::__unseq_backend::__simd_walk_2( + __local_histogram_first, __n, __histogram_accum_first, + [](_HistogramValueT __x, _HistogramValueT& __y) { __y += __x; }); }); } } diff --git a/include/oneapi/dpl/pstl/omp/parallel_histogram.h b/include/oneapi/dpl/pstl/omp/parallel_histogram.h index 036e66d9c4a..8977624f54e 100644 --- a/include/oneapi/dpl/pstl/omp/parallel_histogram.h +++ b/include/oneapi/dpl/pstl/omp/parallel_histogram.h @@ -25,10 +25,11 @@ namespace dpl namespace __omp_backend { -template +template void __histogram_body(_RandomAccessIterator1 __first, _RandomAccessIterator1 __last, _Size __num_bins, - _RandomAccessIterator2 __histogram_first, _Fp __f) + _RandomAccessIterator2 __histogram_first, _FpHist __f, _FpInitialize __init, _FpAccum __accum) { using _HistogramValueT = typename ::std::iterator_traits<_RandomAccessIterator2>::value_type; @@ -65,32 +66,31 @@ __histogram_body(_RandomAccessIterator1 __first, _RandomAccessIterator1 __last, { oneapi::dpl::__omp_backend::__process_chunk( __policy2, __histogram_first, __chunk, [&](auto __chunk_first, auto __chunk_last) { - for (auto __iter = __chunk_first; __iter != __chunk_last; ++__iter) - { - *__iter = __local_histograms[0][__iter - __histogram_first]; - } - for (auto __iter = __chunk_first; __iter != __chunk_last; ++__iter) + __init(__local_histograms[0].begin() + (__chunk_first - __histogram_first), + (__chunk_last - __chunk_first), __chunk_first); + + for (std::size_t __i = 1; __i < __num_threads; ++__i) { - for (std::size_t __i = 1; __i < __num_threads; ++__i) - { - *__iter += __local_histograms[__i][__iter - __histogram_first]; - } + __accum(__local_histograms[__i].begin() + (__chunk_first - __histogram_first), + (__chunk_last - __chunk_first), __chunk_first); } }); } } template + typename _FpHist, typename _FpInitialize, typename _FpAccum> void __parallel_histogram(oneapi::dpl::__internal::__omp_backend_tag, _ExecutionPolicy&&, _RandomAccessIterator1 __first, - _RandomAccessIterator1 __last, _Size __num_bins, _RandomAccessIterator2 __histogram_first, _Fp __f) + _RandomAccessIterator1 __last, _Size __num_bins, _RandomAccessIterator2 __histogram_first, + _FpHist __f, _FpInitialize __init, _FpAccum __accum) { if (omp_in_parallel()) { // We don't create a nested parallel region in an existing parallel // region: just create tasks - oneapi::dpl::__omp_backend::__histogram_body(__first, __last, __num_bins, __histogram_first, __f); + oneapi::dpl::__omp_backend::__histogram_body(__first, __last, __num_bins, __histogram_first, __f, __init, + __accum); } else { @@ -99,7 +99,8 @@ __parallel_histogram(oneapi::dpl::__internal::__omp_backend_tag, _ExecutionPolic _PSTL_PRAGMA(omp parallel) _PSTL_PRAGMA(omp single nowait) { - oneapi::dpl::__omp_backend::__histogram_body(__first, __last, __num_bins, __histogram_first, __f); + oneapi::dpl::__omp_backend::__histogram_body(__first, __last, __num_bins, __histogram_first, __f, __init, + __accum); } } } diff --git a/include/oneapi/dpl/pstl/parallel_backend_serial.h b/include/oneapi/dpl/pstl/parallel_backend_serial.h index ea4f474972c..e67bb4b00ca 100644 --- a/include/oneapi/dpl/pstl/parallel_backend_serial.h +++ b/include/oneapi/dpl/pstl/parallel_backend_serial.h @@ -129,17 +129,18 @@ __parallel_for_each(oneapi::dpl::__internal::__serial_backend_tag, _ExecutionPol __f(*__iter); } -template +template void -__parallel_histogram(oneapi::dpl::__internal::__serial_backend_tag, _ExecutionPolicy&& exec, _RandomAccessIterator1 __first, _RandomAccessIterator1 __last, - _Size __num_bins, _RandomAccessIterator2 __histogram_first, _Fp __f) +__parallel_histogram(oneapi::dpl::__internal::__serial_backend_tag, _ExecutionPolicy&& exec, + _RandomAccessIterator1 __first, _RandomAccessIterator1 __last, _Size __num_bins, + _RandomAccessIterator2 __histogram_first, _FpHist __f, _FpInitialize, _FpAccum) { using _HistogramValueT = typename ::std::iterator_traits<_RandomAccessIterator2>::value_type; ::std::fill(__histogram_first, __histogram_first + __num_bins, _HistogramValueT{0}); __f(__first, __last, __histogram_first); } - } // namespace __serial_backend } // namespace dpl } // namespace oneapi diff --git a/include/oneapi/dpl/pstl/parallel_backend_tbb.h b/include/oneapi/dpl/pstl/parallel_backend_tbb.h index e705af1fb41..0c307460ca4 100644 --- a/include/oneapi/dpl/pstl/parallel_backend_tbb.h +++ b/include/oneapi/dpl/pstl/parallel_backend_tbb.h @@ -1311,10 +1311,11 @@ __parallel_for_each(oneapi::dpl::__internal::__tbb_backend_tag, _ExecutionPolicy // parallel_histogram //------------------------------------------------------------------------ template + typename _FpHist, typename _FpInitialize, typename _FpAccum> void __parallel_histogram(oneapi::dpl::__internal::__tbb_backend_tag, _ExecutionPolicy&&, _RandomAccessIterator1 __first, - _RandomAccessIterator1 __last, _Size __num_bins, _RandomAccessIterator2 __histogram_first, _Fp __f) + _RandomAccessIterator1 __last, _Size __num_bins, _RandomAccessIterator2 __histogram_first, + _FpHist __f, _FpInitialize __init, _FpAccum __accum) { using _HistogramValueT = typename ::std::iterator_traits<_RandomAccessIterator2>::value_type; @@ -1326,19 +1327,15 @@ __parallel_histogram(oneapi::dpl::__internal::__tbb_backend_tag, _ExecutionPolic std::vector<_HistogramValueT>& __local_histogram = __thread_local_histogram.local(); __f(__first + __range.begin(), __first + __range.end(), __local_histogram.begin()); }); + tbb::parallel_for(tbb::blocked_range<_Size>(0, __num_bins), [&](const tbb::blocked_range<_Size>& __range) { auto __local_histogram = __thread_local_histogram.begin(); - for (auto __i = __range.begin(); __i != __range.end(); ++__i) - { - __histogram_first[__i] = (*__local_histogram)[__i]; - } + __init(__local_histogram->begin() + __range.begin(), __range.size(), __histogram_first + __range.begin()); ++__local_histogram; for (; __local_histogram != __thread_local_histogram.end(); ++__local_histogram) { - for (auto __i = __range.begin(); __i != __range.end(); ++__i) - { - __histogram_first[__i] += (*__local_histogram)[__i]; - } + __accum(__local_histogram->begin() + __range.begin(), __range.size(), + __histogram_first + __range.begin()); } }); }); From 1faece59b76327d442e293c3175f2178546fc94d Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Fri, 13 Dec 2024 21:26:07 -0500 Subject: [PATCH 14/50] Properly using IsVector Signed-off-by: Dan Hoeflinger --- include/oneapi/dpl/pstl/algorithm_impl.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/include/oneapi/dpl/pstl/algorithm_impl.h b/include/oneapi/dpl/pstl/algorithm_impl.h index daf9315eee5..f5dd5824a49 100644 --- a/include/oneapi/dpl/pstl/algorithm_impl.h +++ b/include/oneapi/dpl/pstl/algorithm_impl.h @@ -4370,13 +4370,12 @@ __pattern_histogram(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _Rando __brick_histogram(__subrange_first, __subrange_last, __func, __histogram_first, _IsVector{}); }, [](auto __local_histogram_first, std::uint32_t __n, auto __histogram_accum_first) { - oneapi::dpl::__unseq_backend::__simd_walk_2(__local_histogram_first, __n, __histogram_accum_first, - oneapi::dpl::__internal::__pstl_assign()); + __internal::__brick_walk2_n(__local_histogram_first, __n, __histogram_accum_first, + oneapi::dpl::__internal::__pstl_assign(), IsVector{}); }, [](auto __local_histogram_first, std::uint32_t __n, auto __histogram_accum_first) { - oneapi::dpl::__unseq_backend::__simd_walk_2( - __local_histogram_first, __n, __histogram_accum_first, - [](_HistogramValueT __x, _HistogramValueT& __y) { __y += __x; }); + __internal::__brick_walk2_n(__local_histogram_first, __n, __histogram_accum_first, + [](_HistogramValueT __x, _HistogramValueT& __y) { __y += __x; }, IsVector{}); }); } } From a39accd629e5b83ec677bd1a4e5872c5279a4bff Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Mon, 16 Dec 2024 11:06:42 -0500 Subject: [PATCH 15/50] typo fix Signed-off-by: Dan Hoeflinger --- include/oneapi/dpl/pstl/algorithm_impl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/algorithm_impl.h b/include/oneapi/dpl/pstl/algorithm_impl.h index f5dd5824a49..39703e64379 100644 --- a/include/oneapi/dpl/pstl/algorithm_impl.h +++ b/include/oneapi/dpl/pstl/algorithm_impl.h @@ -4371,11 +4371,11 @@ __pattern_histogram(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _Rando }, [](auto __local_histogram_first, std::uint32_t __n, auto __histogram_accum_first) { __internal::__brick_walk2_n(__local_histogram_first, __n, __histogram_accum_first, - oneapi::dpl::__internal::__pstl_assign(), IsVector{}); + oneapi::dpl::__internal::__pstl_assign(), _IsVector{}); }, [](auto __local_histogram_first, std::uint32_t __n, auto __histogram_accum_first) { __internal::__brick_walk2_n(__local_histogram_first, __n, __histogram_accum_first, - [](_HistogramValueT __x, _HistogramValueT& __y) { __y += __x; }, IsVector{}); + [](_HistogramValueT __x, _HistogramValueT& __y) { __y += __x; }, _IsVector{}); }); } } From 2cd184d9ae73282e96a6804a9ffb084e37e0d441 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Mon, 16 Dec 2024 11:30:25 -0500 Subject: [PATCH 16/50] special handling thread zero to use global mem Signed-off-by: Dan Hoeflinger --- include/oneapi/dpl/pstl/algorithm_impl.h | 9 ++------- .../oneapi/dpl/pstl/omp/parallel_histogram.h | 19 ++++++++++++------- .../oneapi/dpl/pstl/parallel_backend_tbb.h | 17 +++++++++++------ 3 files changed, 25 insertions(+), 20 deletions(-) diff --git a/include/oneapi/dpl/pstl/algorithm_impl.h b/include/oneapi/dpl/pstl/algorithm_impl.h index 39703e64379..e1fb9e80425 100644 --- a/include/oneapi/dpl/pstl/algorithm_impl.h +++ b/include/oneapi/dpl/pstl/algorithm_impl.h @@ -4347,12 +4347,12 @@ __pattern_histogram(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _Rando const _DiffType __histogram_threshold = 1048576; _DiffType __n = __last - __first; + __pattern_fill(__parallel_tag<_IsVector>{}, std::forward<_ExecutionPolicy>(__exec), __histogram_first, + __histogram_first + __num_bins, _HistogramValueT{0}); if (__n > 0) { if (__num_bins >= __histogram_threshold) { - __pattern_fill(__parallel_tag<_IsVector>{}, std::forward<_ExecutionPolicy>(__exec), __histogram_first, - __histogram_first + __num_bins, _HistogramValueT{0}); //Atomic histogram brick to protect against race conditions __par_backend::__parallel_for(__backend_tag{}, std::forward<_ExecutionPolicy>(__exec), _DiffType{0}, __n, [__first, __func, __histogram_first](_DiffType __i, _DiffType __j) { @@ -4379,11 +4379,6 @@ __pattern_histogram(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _Rando }); } } - else - { - __pattern_fill(__parallel_tag<_IsVector>{}, std::forward<_ExecutionPolicy>(__exec), __histogram_first, - __histogram_first + __num_bins, _HistogramValueT{0}); - } } } // namespace __internal diff --git a/include/oneapi/dpl/pstl/omp/parallel_histogram.h b/include/oneapi/dpl/pstl/omp/parallel_histogram.h index 8977624f54e..0395b16b908 100644 --- a/include/oneapi/dpl/pstl/omp/parallel_histogram.h +++ b/include/oneapi/dpl/pstl/omp/parallel_histogram.h @@ -41,11 +41,11 @@ __histogram_body(_RandomAccessIterator1 __first, _RandomAccessIterator1 __last, auto __policy1 = oneapi::dpl::__omp_backend::__chunk_partitioner(__first, __last); auto __policy2 = oneapi::dpl::__omp_backend::__chunk_partitioner(__histogram_first, __histogram_first + __num_bins); - std::vector> __local_histograms(__num_threads); + std::vector> __local_histograms(__num_threads - 1); //TODO: use histogram output for zeroth thread? _PSTL_PRAGMA(omp taskloop shared(__local_histograms)) - for (std::size_t __tid = 0; __tid < __num_threads; ++__tid) + for (std::size_t __tid = 0; __tid < __num_threads - 1; ++__tid) { __local_histograms[__tid].resize(__num_bins, _HistogramValueT{0}); } @@ -57,7 +57,15 @@ __histogram_body(_RandomAccessIterator1 __first, _RandomAccessIterator1 __last, oneapi::dpl::__omp_backend::__process_chunk( __policy1, __first, __chunk, [&](auto __chunk_first, auto __chunk_last) { auto __thread_num = omp_get_thread_num(); - __f(__chunk_first, __chunk_last, __local_histograms[__thread_num].begin()); + if (__thread_num == 0) + { + //use the first thread to initialize the global histogram + __f(__chunk_first, __chunk_last, __histogram_first); + } + else + { + __f(__chunk_first, __chunk_last, __local_histograms[__thread_num - 1].begin()); + } }); } @@ -66,10 +74,7 @@ __histogram_body(_RandomAccessIterator1 __first, _RandomAccessIterator1 __last, { oneapi::dpl::__omp_backend::__process_chunk( __policy2, __histogram_first, __chunk, [&](auto __chunk_first, auto __chunk_last) { - __init(__local_histograms[0].begin() + (__chunk_first - __histogram_first), - (__chunk_last - __chunk_first), __chunk_first); - - for (std::size_t __i = 1; __i < __num_threads; ++__i) + for (std::size_t __i = 0; __i < __num_threads - 1; ++__i) { __accum(__local_histograms[__i].begin() + (__chunk_first - __histogram_first), (__chunk_last - __chunk_first), __chunk_first); diff --git a/include/oneapi/dpl/pstl/parallel_backend_tbb.h b/include/oneapi/dpl/pstl/parallel_backend_tbb.h index 0c307460ca4..e662cfa6230 100644 --- a/include/oneapi/dpl/pstl/parallel_backend_tbb.h +++ b/include/oneapi/dpl/pstl/parallel_backend_tbb.h @@ -1324,15 +1324,20 @@ __parallel_histogram(oneapi::dpl::__internal::__tbb_backend_tag, _ExecutionPolic _HistogramValueT{0}); size_t __n = __last - __first; tbb::parallel_for(tbb::blocked_range<_Size>(0, __n), [&](const tbb::blocked_range<_Size>& __range) { - std::vector<_HistogramValueT>& __local_histogram = __thread_local_histogram.local(); - __f(__first + __range.begin(), __first + __range.end(), __local_histogram.begin()); + if (tbb::this_task_arena::current_thread_index() == 0) + { + //use the first thread to initialize the global histogram + __f(__first + __range.begin(), __first + __range.end(), __histogram_first); + } + else + { + __f(__first + __range.begin(), __first + __range.end(), __thread_local_histogram.local().begin()); + } + }); tbb::parallel_for(tbb::blocked_range<_Size>(0, __num_bins), [&](const tbb::blocked_range<_Size>& __range) { - auto __local_histogram = __thread_local_histogram.begin(); - __init(__local_histogram->begin() + __range.begin(), __range.size(), __histogram_first + __range.begin()); - ++__local_histogram; - for (; __local_histogram != __thread_local_histogram.end(); ++__local_histogram) + for (auto __local_histogram = __thread_local_histogram.begin(); __local_histogram != __thread_local_histogram.end(); ++__local_histogram) { __accum(__local_histogram->begin() + __range.begin(), __range.size(), __histogram_first + __range.begin()); From b3dd2a0629aacab12ef7638d895a4d83fb8aea68 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Mon, 16 Dec 2024 11:32:14 -0500 Subject: [PATCH 17/50] cleanup Signed-off-by: Dan Hoeflinger --- include/oneapi/dpl/pstl/algorithm_impl.h | 4 ---- include/oneapi/dpl/pstl/omp/parallel_histogram.h | 14 ++++++-------- include/oneapi/dpl/pstl/parallel_backend_serial.h | 4 ++-- include/oneapi/dpl/pstl/parallel_backend_tbb.h | 4 ++-- 4 files changed, 10 insertions(+), 16 deletions(-) diff --git a/include/oneapi/dpl/pstl/algorithm_impl.h b/include/oneapi/dpl/pstl/algorithm_impl.h index e1fb9e80425..0926ce239a8 100644 --- a/include/oneapi/dpl/pstl/algorithm_impl.h +++ b/include/oneapi/dpl/pstl/algorithm_impl.h @@ -4369,10 +4369,6 @@ __pattern_histogram(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _Rando [__func](auto __subrange_first, auto __subrange_last, auto __histogram_first) { __brick_histogram(__subrange_first, __subrange_last, __func, __histogram_first, _IsVector{}); }, - [](auto __local_histogram_first, std::uint32_t __n, auto __histogram_accum_first) { - __internal::__brick_walk2_n(__local_histogram_first, __n, __histogram_accum_first, - oneapi::dpl::__internal::__pstl_assign(), _IsVector{}); - }, [](auto __local_histogram_first, std::uint32_t __n, auto __histogram_accum_first) { __internal::__brick_walk2_n(__local_histogram_first, __n, __histogram_accum_first, [](_HistogramValueT __x, _HistogramValueT& __y) { __y += __x; }, _IsVector{}); diff --git a/include/oneapi/dpl/pstl/omp/parallel_histogram.h b/include/oneapi/dpl/pstl/omp/parallel_histogram.h index 0395b16b908..5b07a758f55 100644 --- a/include/oneapi/dpl/pstl/omp/parallel_histogram.h +++ b/include/oneapi/dpl/pstl/omp/parallel_histogram.h @@ -26,10 +26,10 @@ namespace __omp_backend { template + typename _FpAccum> void __histogram_body(_RandomAccessIterator1 __first, _RandomAccessIterator1 __last, _Size __num_bins, - _RandomAccessIterator2 __histogram_first, _FpHist __f, _FpInitialize __init, _FpAccum __accum) + _RandomAccessIterator2 __histogram_first, _FpHist __f, _FpAccum __accum) { using _HistogramValueT = typename ::std::iterator_traits<_RandomAccessIterator2>::value_type; @@ -84,18 +84,17 @@ __histogram_body(_RandomAccessIterator1 __first, _RandomAccessIterator1 __last, } template + typename _FpHist, typename _FpAccum> void __parallel_histogram(oneapi::dpl::__internal::__omp_backend_tag, _ExecutionPolicy&&, _RandomAccessIterator1 __first, _RandomAccessIterator1 __last, _Size __num_bins, _RandomAccessIterator2 __histogram_first, - _FpHist __f, _FpInitialize __init, _FpAccum __accum) + _FpHist __f, _FpAccum __accum) { if (omp_in_parallel()) { // We don't create a nested parallel region in an existing parallel // region: just create tasks - oneapi::dpl::__omp_backend::__histogram_body(__first, __last, __num_bins, __histogram_first, __f, __init, - __accum); + oneapi::dpl::__omp_backend::__histogram_body(__first, __last, __num_bins, __histogram_first, __f, __accum); } else { @@ -104,8 +103,7 @@ __parallel_histogram(oneapi::dpl::__internal::__omp_backend_tag, _ExecutionPolic _PSTL_PRAGMA(omp parallel) _PSTL_PRAGMA(omp single nowait) { - oneapi::dpl::__omp_backend::__histogram_body(__first, __last, __num_bins, __histogram_first, __f, __init, - __accum); + oneapi::dpl::__omp_backend::__histogram_body(__first, __last, __num_bins, __histogram_first, __f, __accum); } } } diff --git a/include/oneapi/dpl/pstl/parallel_backend_serial.h b/include/oneapi/dpl/pstl/parallel_backend_serial.h index e67bb4b00ca..cbe28c8aa2d 100644 --- a/include/oneapi/dpl/pstl/parallel_backend_serial.h +++ b/include/oneapi/dpl/pstl/parallel_backend_serial.h @@ -130,11 +130,11 @@ __parallel_for_each(oneapi::dpl::__internal::__serial_backend_tag, _ExecutionPol } template + typename _FpHist, typename _FpAccum> void __parallel_histogram(oneapi::dpl::__internal::__serial_backend_tag, _ExecutionPolicy&& exec, _RandomAccessIterator1 __first, _RandomAccessIterator1 __last, _Size __num_bins, - _RandomAccessIterator2 __histogram_first, _FpHist __f, _FpInitialize, _FpAccum) + _RandomAccessIterator2 __histogram_first, _FpHist __f, _FpAccum /*Unused*/) { using _HistogramValueT = typename ::std::iterator_traits<_RandomAccessIterator2>::value_type; ::std::fill(__histogram_first, __histogram_first + __num_bins, _HistogramValueT{0}); diff --git a/include/oneapi/dpl/pstl/parallel_backend_tbb.h b/include/oneapi/dpl/pstl/parallel_backend_tbb.h index e662cfa6230..8f7b2d6e1a4 100644 --- a/include/oneapi/dpl/pstl/parallel_backend_tbb.h +++ b/include/oneapi/dpl/pstl/parallel_backend_tbb.h @@ -1311,11 +1311,11 @@ __parallel_for_each(oneapi::dpl::__internal::__tbb_backend_tag, _ExecutionPolicy // parallel_histogram //------------------------------------------------------------------------ template + typename _FpHist, typename _FpAccum> void __parallel_histogram(oneapi::dpl::__internal::__tbb_backend_tag, _ExecutionPolicy&&, _RandomAccessIterator1 __first, _RandomAccessIterator1 __last, _Size __num_bins, _RandomAccessIterator2 __histogram_first, - _FpHist __f, _FpInitialize __init, _FpAccum __accum) + _FpHist __f, _FpAccum __accum) { using _HistogramValueT = typename ::std::iterator_traits<_RandomAccessIterator2>::value_type; From bff45d96a40dc63d83ca95423ba8e783482501c8 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Mon, 16 Dec 2024 11:52:24 -0500 Subject: [PATCH 18/50] atomic version removal Signed-off-by: Dan Hoeflinger --- include/oneapi/dpl/pstl/algorithm_impl.h | 49 ++++++------------------ include/oneapi/dpl/pstl/utils.h | 11 ------ 2 files changed, 11 insertions(+), 49 deletions(-) diff --git a/include/oneapi/dpl/pstl/algorithm_impl.h b/include/oneapi/dpl/pstl/algorithm_impl.h index 0926ce239a8..13108555844 100644 --- a/include/oneapi/dpl/pstl/algorithm_impl.h +++ b/include/oneapi/dpl/pstl/algorithm_impl.h @@ -4305,21 +4305,6 @@ __brick_histogram(_ForwardIterator __first, _ForwardIterator __last, _IdxHashFun } } -template -void -__brick_histogram_atomics(_ForwardIterator __first, _ForwardIterator __last, _IdxHashFunc __func, - _RandomAccessIterator __histogram_first, _IsVector) noexcept -{ - for (; __first != __last; ++__first) - { - std::int32_t __bin = __func.get_bin(*__first); - if (__bin >= 0) - { - _ONEDPL_ATOMIC_INCREMENT(__histogram_first[__bin]); - } - } -} - template void @@ -4351,29 +4336,17 @@ __pattern_histogram(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _Rando __histogram_first + __num_bins, _HistogramValueT{0}); if (__n > 0) { - if (__num_bins >= __histogram_threshold) - { - //Atomic histogram brick to protect against race conditions - __par_backend::__parallel_for(__backend_tag{}, std::forward<_ExecutionPolicy>(__exec), _DiffType{0}, __n, - [__first, __func, __histogram_first](_DiffType __i, _DiffType __j) { - __brick_histogram_atomics(__first + __i, __first + __j, __func, - __histogram_first, _IsVector{}); - }); - } - else - { - //Embarassingly parallel with temporary histogram outputs - __par_backend::__parallel_histogram( - __backend_tag{}, ::std::forward<_ExecutionPolicy>(__exec), __first, __last, __num_bins, - __histogram_first, - [__func](auto __subrange_first, auto __subrange_last, auto __histogram_first) { - __brick_histogram(__subrange_first, __subrange_last, __func, __histogram_first, _IsVector{}); - }, - [](auto __local_histogram_first, std::uint32_t __n, auto __histogram_accum_first) { - __internal::__brick_walk2_n(__local_histogram_first, __n, __histogram_accum_first, - [](_HistogramValueT __x, _HistogramValueT& __y) { __y += __x; }, _IsVector{}); - }); - } + //Embarassingly parallel with temporary histogram outputs + __par_backend::__parallel_histogram( + __backend_tag{}, ::std::forward<_ExecutionPolicy>(__exec), __first, __last, __num_bins, + __histogram_first, + [__func](auto __subrange_first, auto __subrange_last, auto __histogram_first) { + __brick_histogram(__subrange_first, __subrange_last, __func, __histogram_first, _IsVector{}); + }, + [](auto __local_histogram_first, std::uint32_t __n, auto __histogram_accum_first) { + __internal::__brick_walk2_n(__local_histogram_first, __n, __histogram_accum_first, + [](_HistogramValueT __x, _HistogramValueT& __y) { __y += __x; }, _IsVector{}); + }); } } diff --git a/include/oneapi/dpl/pstl/utils.h b/include/oneapi/dpl/pstl/utils.h index 23b9d6c729b..9c32178a78c 100644 --- a/include/oneapi/dpl/pstl/utils.h +++ b/include/oneapi/dpl/pstl/utils.h @@ -25,7 +25,6 @@ #include #include #include -#include #if _ONEDPL_BACKEND_SYCL # include "hetero/dpcpp/sycl_defs.h" @@ -43,16 +42,6 @@ # include // memcpy #endif -#if _ONEDPL_CPP20_ATOMIC_REF_PRESENT -# define _ONEDPL_ATOMIC_INCREMENT(element_ref) \ - std::atomic_ref>{element_ref}.fetch_add(1, std::memory_order_relaxed) -#elif defined(_MSC_VER) -//TODO: find a better way to do this with intrinsics on MSVC without including windows.h? -# define _ONEDPL_ATOMIC_INCREMENT(element_ref) reinterpret_cast>*>((&element_ref))->fetch_add(1, std::memory_order_relaxed) -#else -# define _ONEDPL_ATOMIC_INCREMENT(element_ref) __atomic_fetch_add(&element_ref, 1, __ATOMIC_RELAXED) -#endif - namespace oneapi { namespace dpl From 2c8cc044cf259d909aa4548ef92f664eeb0a11e6 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Mon, 16 Dec 2024 12:48:35 -0500 Subject: [PATCH 19/50] Revert "cleanup" This reverts commit 8d6160955b02c0ae7c33fba9301fd2561f311669. --- include/oneapi/dpl/pstl/algorithm_impl.h | 4 ++++ include/oneapi/dpl/pstl/omp/parallel_histogram.h | 14 ++++++++------ include/oneapi/dpl/pstl/parallel_backend_serial.h | 4 ++-- include/oneapi/dpl/pstl/parallel_backend_tbb.h | 4 ++-- 4 files changed, 16 insertions(+), 10 deletions(-) diff --git a/include/oneapi/dpl/pstl/algorithm_impl.h b/include/oneapi/dpl/pstl/algorithm_impl.h index 13108555844..cb118f54de3 100644 --- a/include/oneapi/dpl/pstl/algorithm_impl.h +++ b/include/oneapi/dpl/pstl/algorithm_impl.h @@ -4343,6 +4343,10 @@ __pattern_histogram(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _Rando [__func](auto __subrange_first, auto __subrange_last, auto __histogram_first) { __brick_histogram(__subrange_first, __subrange_last, __func, __histogram_first, _IsVector{}); }, + [](auto __local_histogram_first, std::uint32_t __n, auto __histogram_accum_first) { + __internal::__brick_walk2_n(__local_histogram_first, __n, __histogram_accum_first, + oneapi::dpl::__internal::__pstl_assign(), _IsVector{}); + }, [](auto __local_histogram_first, std::uint32_t __n, auto __histogram_accum_first) { __internal::__brick_walk2_n(__local_histogram_first, __n, __histogram_accum_first, [](_HistogramValueT __x, _HistogramValueT& __y) { __y += __x; }, _IsVector{}); diff --git a/include/oneapi/dpl/pstl/omp/parallel_histogram.h b/include/oneapi/dpl/pstl/omp/parallel_histogram.h index 5b07a758f55..0395b16b908 100644 --- a/include/oneapi/dpl/pstl/omp/parallel_histogram.h +++ b/include/oneapi/dpl/pstl/omp/parallel_histogram.h @@ -26,10 +26,10 @@ namespace __omp_backend { template + typename _FpInitialize, typename _FpAccum> void __histogram_body(_RandomAccessIterator1 __first, _RandomAccessIterator1 __last, _Size __num_bins, - _RandomAccessIterator2 __histogram_first, _FpHist __f, _FpAccum __accum) + _RandomAccessIterator2 __histogram_first, _FpHist __f, _FpInitialize __init, _FpAccum __accum) { using _HistogramValueT = typename ::std::iterator_traits<_RandomAccessIterator2>::value_type; @@ -84,17 +84,18 @@ __histogram_body(_RandomAccessIterator1 __first, _RandomAccessIterator1 __last, } template + typename _FpHist, typename _FpInitialize, typename _FpAccum> void __parallel_histogram(oneapi::dpl::__internal::__omp_backend_tag, _ExecutionPolicy&&, _RandomAccessIterator1 __first, _RandomAccessIterator1 __last, _Size __num_bins, _RandomAccessIterator2 __histogram_first, - _FpHist __f, _FpAccum __accum) + _FpHist __f, _FpInitialize __init, _FpAccum __accum) { if (omp_in_parallel()) { // We don't create a nested parallel region in an existing parallel // region: just create tasks - oneapi::dpl::__omp_backend::__histogram_body(__first, __last, __num_bins, __histogram_first, __f, __accum); + oneapi::dpl::__omp_backend::__histogram_body(__first, __last, __num_bins, __histogram_first, __f, __init, + __accum); } else { @@ -103,7 +104,8 @@ __parallel_histogram(oneapi::dpl::__internal::__omp_backend_tag, _ExecutionPolic _PSTL_PRAGMA(omp parallel) _PSTL_PRAGMA(omp single nowait) { - oneapi::dpl::__omp_backend::__histogram_body(__first, __last, __num_bins, __histogram_first, __f, __accum); + oneapi::dpl::__omp_backend::__histogram_body(__first, __last, __num_bins, __histogram_first, __f, __init, + __accum); } } } diff --git a/include/oneapi/dpl/pstl/parallel_backend_serial.h b/include/oneapi/dpl/pstl/parallel_backend_serial.h index cbe28c8aa2d..e67bb4b00ca 100644 --- a/include/oneapi/dpl/pstl/parallel_backend_serial.h +++ b/include/oneapi/dpl/pstl/parallel_backend_serial.h @@ -130,11 +130,11 @@ __parallel_for_each(oneapi::dpl::__internal::__serial_backend_tag, _ExecutionPol } template + typename _FpHist, typename _FpInitialize, typename _FpAccum> void __parallel_histogram(oneapi::dpl::__internal::__serial_backend_tag, _ExecutionPolicy&& exec, _RandomAccessIterator1 __first, _RandomAccessIterator1 __last, _Size __num_bins, - _RandomAccessIterator2 __histogram_first, _FpHist __f, _FpAccum /*Unused*/) + _RandomAccessIterator2 __histogram_first, _FpHist __f, _FpInitialize, _FpAccum) { using _HistogramValueT = typename ::std::iterator_traits<_RandomAccessIterator2>::value_type; ::std::fill(__histogram_first, __histogram_first + __num_bins, _HistogramValueT{0}); diff --git a/include/oneapi/dpl/pstl/parallel_backend_tbb.h b/include/oneapi/dpl/pstl/parallel_backend_tbb.h index 8f7b2d6e1a4..e662cfa6230 100644 --- a/include/oneapi/dpl/pstl/parallel_backend_tbb.h +++ b/include/oneapi/dpl/pstl/parallel_backend_tbb.h @@ -1311,11 +1311,11 @@ __parallel_for_each(oneapi::dpl::__internal::__tbb_backend_tag, _ExecutionPolicy // parallel_histogram //------------------------------------------------------------------------ template + typename _FpHist, typename _FpInitialize, typename _FpAccum> void __parallel_histogram(oneapi::dpl::__internal::__tbb_backend_tag, _ExecutionPolicy&&, _RandomAccessIterator1 __first, _RandomAccessIterator1 __last, _Size __num_bins, _RandomAccessIterator2 __histogram_first, - _FpHist __f, _FpAccum __accum) + _FpHist __f, _FpInitialize __init, _FpAccum __accum) { using _HistogramValueT = typename ::std::iterator_traits<_RandomAccessIterator2>::value_type; From 790121b918c5372e93d086a9e0dd90d8a037d9da Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Mon, 16 Dec 2024 12:49:12 -0500 Subject: [PATCH 20/50] Revert "special handling thread zero to use global mem" This reverts commit 6f0a9cbec4f4f294798d8fb24dd61832b6607f30. --- include/oneapi/dpl/pstl/algorithm_impl.h | 7 +++++-- .../oneapi/dpl/pstl/omp/parallel_histogram.h | 19 +++++++------------ .../oneapi/dpl/pstl/parallel_backend_tbb.h | 17 ++++++----------- 3 files changed, 18 insertions(+), 25 deletions(-) diff --git a/include/oneapi/dpl/pstl/algorithm_impl.h b/include/oneapi/dpl/pstl/algorithm_impl.h index cb118f54de3..b8e5636db69 100644 --- a/include/oneapi/dpl/pstl/algorithm_impl.h +++ b/include/oneapi/dpl/pstl/algorithm_impl.h @@ -4332,8 +4332,6 @@ __pattern_histogram(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _Rando const _DiffType __histogram_threshold = 1048576; _DiffType __n = __last - __first; - __pattern_fill(__parallel_tag<_IsVector>{}, std::forward<_ExecutionPolicy>(__exec), __histogram_first, - __histogram_first + __num_bins, _HistogramValueT{0}); if (__n > 0) { //Embarassingly parallel with temporary histogram outputs @@ -4352,6 +4350,11 @@ __pattern_histogram(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _Rando [](_HistogramValueT __x, _HistogramValueT& __y) { __y += __x; }, _IsVector{}); }); } + else + { + __pattern_fill(__parallel_tag<_IsVector>{}, std::forward<_ExecutionPolicy>(__exec), __histogram_first, + __histogram_first + __num_bins, _HistogramValueT{0}); + } } } // namespace __internal diff --git a/include/oneapi/dpl/pstl/omp/parallel_histogram.h b/include/oneapi/dpl/pstl/omp/parallel_histogram.h index 0395b16b908..8977624f54e 100644 --- a/include/oneapi/dpl/pstl/omp/parallel_histogram.h +++ b/include/oneapi/dpl/pstl/omp/parallel_histogram.h @@ -41,11 +41,11 @@ __histogram_body(_RandomAccessIterator1 __first, _RandomAccessIterator1 __last, auto __policy1 = oneapi::dpl::__omp_backend::__chunk_partitioner(__first, __last); auto __policy2 = oneapi::dpl::__omp_backend::__chunk_partitioner(__histogram_first, __histogram_first + __num_bins); - std::vector> __local_histograms(__num_threads - 1); + std::vector> __local_histograms(__num_threads); //TODO: use histogram output for zeroth thread? _PSTL_PRAGMA(omp taskloop shared(__local_histograms)) - for (std::size_t __tid = 0; __tid < __num_threads - 1; ++__tid) + for (std::size_t __tid = 0; __tid < __num_threads; ++__tid) { __local_histograms[__tid].resize(__num_bins, _HistogramValueT{0}); } @@ -57,15 +57,7 @@ __histogram_body(_RandomAccessIterator1 __first, _RandomAccessIterator1 __last, oneapi::dpl::__omp_backend::__process_chunk( __policy1, __first, __chunk, [&](auto __chunk_first, auto __chunk_last) { auto __thread_num = omp_get_thread_num(); - if (__thread_num == 0) - { - //use the first thread to initialize the global histogram - __f(__chunk_first, __chunk_last, __histogram_first); - } - else - { - __f(__chunk_first, __chunk_last, __local_histograms[__thread_num - 1].begin()); - } + __f(__chunk_first, __chunk_last, __local_histograms[__thread_num].begin()); }); } @@ -74,7 +66,10 @@ __histogram_body(_RandomAccessIterator1 __first, _RandomAccessIterator1 __last, { oneapi::dpl::__omp_backend::__process_chunk( __policy2, __histogram_first, __chunk, [&](auto __chunk_first, auto __chunk_last) { - for (std::size_t __i = 0; __i < __num_threads - 1; ++__i) + __init(__local_histograms[0].begin() + (__chunk_first - __histogram_first), + (__chunk_last - __chunk_first), __chunk_first); + + for (std::size_t __i = 1; __i < __num_threads; ++__i) { __accum(__local_histograms[__i].begin() + (__chunk_first - __histogram_first), (__chunk_last - __chunk_first), __chunk_first); diff --git a/include/oneapi/dpl/pstl/parallel_backend_tbb.h b/include/oneapi/dpl/pstl/parallel_backend_tbb.h index e662cfa6230..0c307460ca4 100644 --- a/include/oneapi/dpl/pstl/parallel_backend_tbb.h +++ b/include/oneapi/dpl/pstl/parallel_backend_tbb.h @@ -1324,20 +1324,15 @@ __parallel_histogram(oneapi::dpl::__internal::__tbb_backend_tag, _ExecutionPolic _HistogramValueT{0}); size_t __n = __last - __first; tbb::parallel_for(tbb::blocked_range<_Size>(0, __n), [&](const tbb::blocked_range<_Size>& __range) { - if (tbb::this_task_arena::current_thread_index() == 0) - { - //use the first thread to initialize the global histogram - __f(__first + __range.begin(), __first + __range.end(), __histogram_first); - } - else - { - __f(__first + __range.begin(), __first + __range.end(), __thread_local_histogram.local().begin()); - } - + std::vector<_HistogramValueT>& __local_histogram = __thread_local_histogram.local(); + __f(__first + __range.begin(), __first + __range.end(), __local_histogram.begin()); }); tbb::parallel_for(tbb::blocked_range<_Size>(0, __num_bins), [&](const tbb::blocked_range<_Size>& __range) { - for (auto __local_histogram = __thread_local_histogram.begin(); __local_histogram != __thread_local_histogram.end(); ++__local_histogram) + auto __local_histogram = __thread_local_histogram.begin(); + __init(__local_histogram->begin() + __range.begin(), __range.size(), __histogram_first + __range.begin()); + ++__local_histogram; + for (; __local_histogram != __thread_local_histogram.end(); ++__local_histogram) { __accum(__local_histogram->begin() + __range.begin(), __range.size(), __histogram_first + __range.begin()); From 0392f23a83b7fcba8d1f128ef2bbf88e0aa88bc1 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Mon, 16 Dec 2024 14:23:09 -0500 Subject: [PATCH 21/50] comments and cleanup Signed-off-by: Dan Hoeflinger --- include/oneapi/dpl/pstl/algorithm_impl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/algorithm_impl.h b/include/oneapi/dpl/pstl/algorithm_impl.h index b8e5636db69..8ee429417d9 100644 --- a/include/oneapi/dpl/pstl/algorithm_impl.h +++ b/include/oneapi/dpl/pstl/algorithm_impl.h @@ -4329,8 +4329,6 @@ __pattern_histogram(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _Rando using _HistogramValueT = typename std::iterator_traits<_RandomAccessIterator2>::value_type; using _DiffType = typename ::std::iterator_traits<_RandomAccessIterator2>::difference_type; - const _DiffType __histogram_threshold = 1048576; - _DiffType __n = __last - __first; if (__n > 0) { @@ -4341,10 +4339,12 @@ __pattern_histogram(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _Rando [__func](auto __subrange_first, auto __subrange_last, auto __histogram_first) { __brick_histogram(__subrange_first, __subrange_last, __func, __histogram_first, _IsVector{}); }, + //initialize output global histogram with first local histogram via assign [](auto __local_histogram_first, std::uint32_t __n, auto __histogram_accum_first) { __internal::__brick_walk2_n(__local_histogram_first, __n, __histogram_accum_first, oneapi::dpl::__internal::__pstl_assign(), _IsVector{}); }, + //accumulate into output global histogram with other local histogram via += operator [](auto __local_histogram_first, std::uint32_t __n, auto __histogram_accum_first) { __internal::__brick_walk2_n(__local_histogram_first, __n, __histogram_accum_first, [](_HistogramValueT __x, _HistogramValueT& __y) { __y += __x; }, _IsVector{}); From 45563996a350e54e12417d90057b1d981d9ae3f0 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Mon, 16 Dec 2024 15:28:20 -0500 Subject: [PATCH 22/50] handling coarser grained parallelism Signed-off-by: Dan Hoeflinger --- include/oneapi/dpl/pstl/omp/parallel_histogram.h | 10 +++++++--- include/oneapi/dpl/pstl/parallel_backend_tbb.h | 2 +- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/include/oneapi/dpl/pstl/omp/parallel_histogram.h b/include/oneapi/dpl/pstl/omp/parallel_histogram.h index 8977624f54e..589d236c2c7 100644 --- a/include/oneapi/dpl/pstl/omp/parallel_histogram.h +++ b/include/oneapi/dpl/pstl/omp/parallel_histogram.h @@ -41,11 +41,15 @@ __histogram_body(_RandomAccessIterator1 __first, _RandomAccessIterator1 __last, auto __policy1 = oneapi::dpl::__omp_backend::__chunk_partitioner(__first, __last); auto __policy2 = oneapi::dpl::__omp_backend::__chunk_partitioner(__histogram_first, __histogram_first + __num_bins); - std::vector> __local_histograms(__num_threads); + const std::size_t __threads_to_use = std::min(__num_threads, + oneapi::dpl::__internal::__dpl_ceiling_div(__size, + __policy1.__chunk_size)); + + std::vector> __local_histograms(__threads_to_use); //TODO: use histogram output for zeroth thread? _PSTL_PRAGMA(omp taskloop shared(__local_histograms)) - for (std::size_t __tid = 0; __tid < __num_threads; ++__tid) + for (std::size_t __tid = 0; __tid < __threads_to_use; ++__tid) { __local_histograms[__tid].resize(__num_bins, _HistogramValueT{0}); } @@ -69,7 +73,7 @@ __histogram_body(_RandomAccessIterator1 __first, _RandomAccessIterator1 __last, __init(__local_histograms[0].begin() + (__chunk_first - __histogram_first), (__chunk_last - __chunk_first), __chunk_first); - for (std::size_t __i = 1; __i < __num_threads; ++__i) + for (std::size_t __i = 1; __i < __threads_to_use; ++__i) { __accum(__local_histograms[__i].begin() + (__chunk_first - __histogram_first), (__chunk_last - __chunk_first), __chunk_first); diff --git a/include/oneapi/dpl/pstl/parallel_backend_tbb.h b/include/oneapi/dpl/pstl/parallel_backend_tbb.h index 0c307460ca4..9a1903b946e 100644 --- a/include/oneapi/dpl/pstl/parallel_backend_tbb.h +++ b/include/oneapi/dpl/pstl/parallel_backend_tbb.h @@ -1323,7 +1323,7 @@ __parallel_histogram(oneapi::dpl::__internal::__tbb_backend_tag, _ExecutionPolic tbb::enumerable_thread_specific> __thread_local_histogram(__num_bins, _HistogramValueT{0}); size_t __n = __last - __first; - tbb::parallel_for(tbb::blocked_range<_Size>(0, __n), [&](const tbb::blocked_range<_Size>& __range) { + tbb::parallel_for(tbb::blocked_range<_Size>(0, __n, 1024), [&](const tbb::blocked_range<_Size>& __range) { std::vector<_HistogramValueT>& __local_histogram = __thread_local_histogram.local(); __f(__first + __range.begin(), __first + __range.end(), __local_histogram.begin()); }); From 2fdea3407ebfbcaaa3da674acdadd6301c3d1224 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Mon, 16 Dec 2024 15:45:45 -0500 Subject: [PATCH 23/50] undo-ing broken thread restriction in openMP Signed-off-by: Dan Hoeflinger --- include/oneapi/dpl/pstl/omp/parallel_histogram.h | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/include/oneapi/dpl/pstl/omp/parallel_histogram.h b/include/oneapi/dpl/pstl/omp/parallel_histogram.h index 589d236c2c7..de1dc8187c8 100644 --- a/include/oneapi/dpl/pstl/omp/parallel_histogram.h +++ b/include/oneapi/dpl/pstl/omp/parallel_histogram.h @@ -41,15 +41,9 @@ __histogram_body(_RandomAccessIterator1 __first, _RandomAccessIterator1 __last, auto __policy1 = oneapi::dpl::__omp_backend::__chunk_partitioner(__first, __last); auto __policy2 = oneapi::dpl::__omp_backend::__chunk_partitioner(__histogram_first, __histogram_first + __num_bins); - const std::size_t __threads_to_use = std::min(__num_threads, - oneapi::dpl::__internal::__dpl_ceiling_div(__size, - __policy1.__chunk_size)); - - std::vector> __local_histograms(__threads_to_use); - - //TODO: use histogram output for zeroth thread? + std::vector> __local_histograms(__num_threads); _PSTL_PRAGMA(omp taskloop shared(__local_histograms)) - for (std::size_t __tid = 0; __tid < __threads_to_use; ++__tid) + for (std::size_t __tid = 0; __tid < __num_threads; ++__tid) { __local_histograms[__tid].resize(__num_bins, _HistogramValueT{0}); } @@ -73,7 +67,7 @@ __histogram_body(_RandomAccessIterator1 __first, _RandomAccessIterator1 __last, __init(__local_histograms[0].begin() + (__chunk_first - __histogram_first), (__chunk_last - __chunk_first), __chunk_first); - for (std::size_t __i = 1; __i < __threads_to_use; ++__i) + for (std::size_t __i = 1; __i < __num_threads; ++__i) { __accum(__local_histograms[__i].begin() + (__chunk_first - __histogram_first), (__chunk_last - __chunk_first), __chunk_first); From a0c016a045a6024c2e2678ff02b1742c88c662b8 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Wed, 18 Dec 2024 17:00:01 -0500 Subject: [PATCH 24/50] lift pattern up to algorithm_impl level Signed-off-by: Dan Hoeflinger --- include/oneapi/dpl/pstl/algorithm_impl.h | 50 +++++++++++++------ include/oneapi/dpl/pstl/omp/util.h | 38 ++++++++++++++ .../oneapi/dpl/pstl/parallel_backend_serial.h | 24 +++++++++ .../oneapi/dpl/pstl/parallel_backend_tbb.h | 24 +++++++++ 4 files changed, 121 insertions(+), 15 deletions(-) diff --git a/include/oneapi/dpl/pstl/algorithm_impl.h b/include/oneapi/dpl/pstl/algorithm_impl.h index 8ee429417d9..1fcc5d7f9f1 100644 --- a/include/oneapi/dpl/pstl/algorithm_impl.h +++ b/include/oneapi/dpl/pstl/algorithm_impl.h @@ -44,6 +44,14 @@ namespace dpl namespace __internal { +template +auto +__create_thread_enumerable_storage(std::size_t __num_elements, + _ValueType __init_value) +{ + return __par_backend::__thread_enumerable_storage{__num_elements, __init_value}; +} + //------------------------------------------------------------------------ // any_of //------------------------------------------------------------------------ @@ -4332,22 +4340,34 @@ __pattern_histogram(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _Rando _DiffType __n = __last - __first; if (__n > 0) { - //Embarassingly parallel with temporary histogram outputs - __par_backend::__parallel_histogram( - __backend_tag{}, ::std::forward<_ExecutionPolicy>(__exec), __first, __last, __num_bins, - __histogram_first, - [__func](auto __subrange_first, auto __subrange_last, auto __histogram_first) { - __brick_histogram(__subrange_first, __subrange_last, __func, __histogram_first, _IsVector{}); - }, - //initialize output global histogram with first local histogram via assign - [](auto __local_histogram_first, std::uint32_t __n, auto __histogram_accum_first) { - __internal::__brick_walk2_n(__local_histogram_first, __n, __histogram_accum_first, - oneapi::dpl::__internal::__pstl_assign(), _IsVector{}); - }, - //accumulate into output global histogram with other local histogram via += operator - [](auto __local_histogram_first, std::uint32_t __n, auto __histogram_accum_first) { - __internal::__brick_walk2_n(__local_histogram_first, __n, __histogram_accum_first, + auto __thread_enumerable_storage = oneapi::dpl::__internal::__create_thread_enumerable_storage( + __num_bins, _HistogramValueT{0}); + + //main histogram loop + __par_backend::__parallel_for(__backend_tag{}, ::std::forward<_ExecutionPolicy>(__exec), __first, __last, + [__func, &__thread_enumerable_storage](_RandomAccessIterator1 __first_local, + _RandomAccessIterator1 __last_local) { + __internal::__brick_histogram(__first_local, __last_local, __func, + __thread_enumerable_storage.get(), _IsVector{}); + }); + // now accumulate temporary storage into output global histogram + __par_backend::__parallel_for( + __backend_tag{}, ::std::forward<_ExecutionPolicy>(__exec), __histogram_first, + __histogram_first + __num_bins, + [__histogram_first, &__thread_enumerable_storage](auto __global_histogram_first, auto __global_histogram_last) { + _DiffType __local_n = __global_histogram_last - __global_histogram_first; + std::size_t __num_temporary_copies = __thread_enumerable_storage.size(); + _DiffType __range_begin_id = __global_histogram_first - __histogram_first; + //initialize output global histogram with first local histogram via assign + __internal::__brick_walk2_n(__thread_enumerable_storage.get_with_id(0) + __range_begin_id, __local_n, __global_histogram_first, + oneapi::dpl::__internal::__pstl_assign(), _IsVector{}); + for (std::size_t __i = 1; __i < __num_temporary_copies; ++__i) + { + //accumulate into output global histogram with other local histogram via += operator + __internal::__brick_walk2_n( + __thread_enumerable_storage.get_with_id(__i) + __range_begin_id, __local_n, __global_histogram_first, [](_HistogramValueT __x, _HistogramValueT& __y) { __y += __x; }, _IsVector{}); + } }); } else diff --git a/include/oneapi/dpl/pstl/omp/util.h b/include/oneapi/dpl/pstl/omp/util.h index b5fb399e2b5..8249007dbda 100644 --- a/include/oneapi/dpl/pstl/omp/util.h +++ b/include/oneapi/dpl/pstl/omp/util.h @@ -25,6 +25,7 @@ #include #include #include +#include #include "../parallel_backend_utils.h" #include "../unseq_backend_simd.h" @@ -153,6 +154,43 @@ __process_chunk(const __chunk_metrics& __metrics, _Iterator __base, _Index __chu __f(__first, __last); } +template +struct __thread_enumerable_storage +{ + __thread_enumerable_storage(std::size_t __num_bins, _ValueType __init_value) + { + _PSTL_PRAGMA(omp parallel) + _PSTL_PRAGMA(omp single nowait) + { + __num_threads = omp_get_num_threads(); + __thread_specific_storage.resize(__num_threads); + _PSTL_PRAGMA(omp taskloop shared(__thread_specific_storage, __num_bins, __init_value)) + for (std::size_t __tid = 0; __tid < __num_threads; ++__tid) + { + __thread_specific_storage[__tid].resize(__num_bins, __init_value); + } + } + } + + std::size_t size() const + { + return __num_threads; + } + + auto get_with_id(std::size_t __i) + { + return __thread_specific_storage[__i].begin(); + } + + auto get() + { + return get_with_id(omp_get_thread_num()); + } + + std::vector> __thread_specific_storage; + std::size_t __num_threads; +}; + } // namespace __omp_backend } // namespace dpl } // namespace oneapi diff --git a/include/oneapi/dpl/pstl/parallel_backend_serial.h b/include/oneapi/dpl/pstl/parallel_backend_serial.h index e67bb4b00ca..2df972295ad 100644 --- a/include/oneapi/dpl/pstl/parallel_backend_serial.h +++ b/include/oneapi/dpl/pstl/parallel_backend_serial.h @@ -42,6 +42,30 @@ __cancel_execution(oneapi::dpl::__internal::__serial_backend_tag) { } +template +struct __thread_enumerable_storage +{ + __thread_enumerable_storage(std::size_t __num_bins, _ValueType __init_value) : __storage(__num_bins, __init_value) + {} + + std::size_t size() const + { + return std::size_t{1}; + } + + auto get() + { + return __storage.begin(); + } + + auto get_with_id(std::size_t __i) + { + return get(); + } + + std::vector<_ValueType> __storage; +}; + template void __parallel_for(oneapi::dpl::__internal::__serial_backend_tag, _ExecutionPolicy&&, _Index __first, _Index __last, diff --git a/include/oneapi/dpl/pstl/parallel_backend_tbb.h b/include/oneapi/dpl/pstl/parallel_backend_tbb.h index 9a1903b946e..b254e359f09 100644 --- a/include/oneapi/dpl/pstl/parallel_backend_tbb.h +++ b/include/oneapi/dpl/pstl/parallel_backend_tbb.h @@ -1307,6 +1307,30 @@ __parallel_for_each(oneapi::dpl::__internal::__tbb_backend_tag, _ExecutionPolicy tbb::this_task_arena::isolate([&]() { tbb::parallel_for_each(__begin, __end, __f); }); } +template +struct __thread_enumerable_storage +{ + __thread_enumerable_storage(std::size_t __num_bins, _ValueType __init_value) : __thread_specific_storage(__num_bins, __init_value) + {} + + std::size_t size() const + { + return __thread_specific_storage.size(); + } + + auto get() + { + return __thread_specific_storage.local().begin(); + } + + auto get_with_id(std::size_t __i) + { + return __thread_specific_storage.begin()[__i].begin(); + } + + tbb::enumerable_thread_specific> __thread_specific_storage; +}; + //------------------------------------------------------------------------ // parallel_histogram //------------------------------------------------------------------------ From e87ba0278dbe53d9b0f18749f60c30bcbf599c96 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Wed, 18 Dec 2024 17:03:15 -0500 Subject: [PATCH 25/50] cleanup unnecessary code Signed-off-by: Dan Hoeflinger --- include/oneapi/dpl/internal/version_impl.h | 1 - include/oneapi/dpl/pstl/algorithm_impl.h | 1 - .../oneapi/dpl/pstl/omp/parallel_histogram.h | 109 ------------------ .../oneapi/dpl/pstl/parallel_backend_omp.h | 5 - .../oneapi/dpl/pstl/parallel_backend_serial.h | 12 -- .../oneapi/dpl/pstl/parallel_backend_tbb.h | 33 ------ 6 files changed, 161 deletions(-) delete mode 100644 include/oneapi/dpl/pstl/omp/parallel_histogram.h diff --git a/include/oneapi/dpl/internal/version_impl.h b/include/oneapi/dpl/internal/version_impl.h index 857cb327a65..38a15e5eecf 100644 --- a/include/oneapi/dpl/internal/version_impl.h +++ b/include/oneapi/dpl/internal/version_impl.h @@ -20,7 +20,6 @@ # define _ONEDPL_STD_FEATURE_MACROS_PRESENT 1 // Clang 15 and older do not support range adaptors, see https://bugs.llvm.org/show_bug.cgi?id=44833 # define _ONEDPL_CPP20_RANGES_PRESENT ((__cpp_lib_ranges >= 201911L) && !(__clang__ && __clang_major__ < 16)) -# define _ONEDPL_CPP20_ATOMIC_REF_PRESENT (__cpp_lib_atomic_ref >= 201806L) #else # define _ONEDPL_STD_FEATURE_MACROS_PRESENT 0 # define _ONEDPL_CPP20_RANGES_PRESENT 0 diff --git a/include/oneapi/dpl/pstl/algorithm_impl.h b/include/oneapi/dpl/pstl/algorithm_impl.h index 1fcc5d7f9f1..0e6a9a09390 100644 --- a/include/oneapi/dpl/pstl/algorithm_impl.h +++ b/include/oneapi/dpl/pstl/algorithm_impl.h @@ -20,7 +20,6 @@ #include #include #include -#include #include "algorithm_fwd.h" diff --git a/include/oneapi/dpl/pstl/omp/parallel_histogram.h b/include/oneapi/dpl/pstl/omp/parallel_histogram.h deleted file mode 100644 index de1dc8187c8..00000000000 --- a/include/oneapi/dpl/pstl/omp/parallel_histogram.h +++ /dev/null @@ -1,109 +0,0 @@ -// -*- C++ -*- -//===----------------------------------------------------------------------===// -// -// Copyright (C) Intel Corporation -// -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -// This file incorporates work covered by the following copyright and permission -// notice: -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// -//===----------------------------------------------------------------------===// - -#ifndef _ONEDPL_INTERNAL_OMP_PARALLEL_HISTOGRAM_H -#define _ONEDPL_INTERNAL_OMP_PARALLEL_HISTOGRAM_H - -#include "util.h" - -namespace oneapi -{ -namespace dpl -{ -namespace __omp_backend -{ - -template -void -__histogram_body(_RandomAccessIterator1 __first, _RandomAccessIterator1 __last, _Size __num_bins, - _RandomAccessIterator2 __histogram_first, _FpHist __f, _FpInitialize __init, _FpAccum __accum) -{ - using _HistogramValueT = typename ::std::iterator_traits<_RandomAccessIterator2>::value_type; - - const std::size_t __num_threads = omp_get_num_threads(); - const std::size_t __size = __last - __first; - - // Initial partition of the iteration space into chunks. If the range is too small, - // this will result in a nonsense policy, so we check on the size as well below. - auto __policy1 = oneapi::dpl::__omp_backend::__chunk_partitioner(__first, __last); - auto __policy2 = oneapi::dpl::__omp_backend::__chunk_partitioner(__histogram_first, __histogram_first + __num_bins); - - std::vector> __local_histograms(__num_threads); - _PSTL_PRAGMA(omp taskloop shared(__local_histograms)) - for (std::size_t __tid = 0; __tid < __num_threads; ++__tid) - { - __local_histograms[__tid].resize(__num_bins, _HistogramValueT{0}); - } - - // main loop - _PSTL_PRAGMA(omp taskloop shared(__local_histograms)) - for (std::size_t __chunk = 0; __chunk < __policy1.__n_chunks; ++__chunk) - { - oneapi::dpl::__omp_backend::__process_chunk( - __policy1, __first, __chunk, [&](auto __chunk_first, auto __chunk_last) { - auto __thread_num = omp_get_thread_num(); - __f(__chunk_first, __chunk_last, __local_histograms[__thread_num].begin()); - }); - } - - _PSTL_PRAGMA(omp taskloop shared(__local_histograms)) - for (std::size_t __chunk = 0; __chunk < __policy2.__n_chunks; ++__chunk) - { - oneapi::dpl::__omp_backend::__process_chunk( - __policy2, __histogram_first, __chunk, [&](auto __chunk_first, auto __chunk_last) { - __init(__local_histograms[0].begin() + (__chunk_first - __histogram_first), - (__chunk_last - __chunk_first), __chunk_first); - - for (std::size_t __i = 1; __i < __num_threads; ++__i) - { - __accum(__local_histograms[__i].begin() + (__chunk_first - __histogram_first), - (__chunk_last - __chunk_first), __chunk_first); - } - }); - } -} - -template -void -__parallel_histogram(oneapi::dpl::__internal::__omp_backend_tag, _ExecutionPolicy&&, _RandomAccessIterator1 __first, - _RandomAccessIterator1 __last, _Size __num_bins, _RandomAccessIterator2 __histogram_first, - _FpHist __f, _FpInitialize __init, _FpAccum __accum) -{ - if (omp_in_parallel()) - { - // We don't create a nested parallel region in an existing parallel - // region: just create tasks - oneapi::dpl::__omp_backend::__histogram_body(__first, __last, __num_bins, __histogram_first, __f, __init, - __accum); - } - else - { - // Create a parallel region, and a single thread will create tasks - // for the region. - _PSTL_PRAGMA(omp parallel) - _PSTL_PRAGMA(omp single nowait) - { - oneapi::dpl::__omp_backend::__histogram_body(__first, __last, __num_bins, __histogram_first, __f, __init, - __accum); - } - } -} - -} // namespace __omp_backend -} // namespace dpl -} // namespace oneapi -#endif // _ONEDPL_INTERNAL_OMP_PARALLEL_HISTOGRAM_H diff --git a/include/oneapi/dpl/pstl/parallel_backend_omp.h b/include/oneapi/dpl/pstl/parallel_backend_omp.h index 0fd87008c49..f608581b245 100644 --- a/include/oneapi/dpl/pstl/parallel_backend_omp.h +++ b/include/oneapi/dpl/pstl/parallel_backend_omp.h @@ -62,9 +62,4 @@ //------------------------------------------------------------------------ #include "./omp/parallel_merge.h" -//------------------------------------------------------------------------ -// parallel_histogram -//------------------------------------------------------------------------ -#include "./omp/parallel_histogram.h" - #endif //_ONEDPL_PARALLEL_BACKEND_OMP_H diff --git a/include/oneapi/dpl/pstl/parallel_backend_serial.h b/include/oneapi/dpl/pstl/parallel_backend_serial.h index 2df972295ad..298b7447678 100644 --- a/include/oneapi/dpl/pstl/parallel_backend_serial.h +++ b/include/oneapi/dpl/pstl/parallel_backend_serial.h @@ -153,18 +153,6 @@ __parallel_for_each(oneapi::dpl::__internal::__serial_backend_tag, _ExecutionPol __f(*__iter); } -template -void -__parallel_histogram(oneapi::dpl::__internal::__serial_backend_tag, _ExecutionPolicy&& exec, - _RandomAccessIterator1 __first, _RandomAccessIterator1 __last, _Size __num_bins, - _RandomAccessIterator2 __histogram_first, _FpHist __f, _FpInitialize, _FpAccum) -{ - using _HistogramValueT = typename ::std::iterator_traits<_RandomAccessIterator2>::value_type; - ::std::fill(__histogram_first, __histogram_first + __num_bins, _HistogramValueT{0}); - __f(__first, __last, __histogram_first); -} - } // namespace __serial_backend } // namespace dpl } // namespace oneapi diff --git a/include/oneapi/dpl/pstl/parallel_backend_tbb.h b/include/oneapi/dpl/pstl/parallel_backend_tbb.h index b254e359f09..611d455dfe4 100644 --- a/include/oneapi/dpl/pstl/parallel_backend_tbb.h +++ b/include/oneapi/dpl/pstl/parallel_backend_tbb.h @@ -1331,39 +1331,6 @@ struct __thread_enumerable_storage tbb::enumerable_thread_specific> __thread_specific_storage; }; -//------------------------------------------------------------------------ -// parallel_histogram -//------------------------------------------------------------------------ -template -void -__parallel_histogram(oneapi::dpl::__internal::__tbb_backend_tag, _ExecutionPolicy&&, _RandomAccessIterator1 __first, - _RandomAccessIterator1 __last, _Size __num_bins, _RandomAccessIterator2 __histogram_first, - _FpHist __f, _FpInitialize __init, _FpAccum __accum) -{ - using _HistogramValueT = typename ::std::iterator_traits<_RandomAccessIterator2>::value_type; - - tbb::this_task_arena::isolate([&]() { - tbb::enumerable_thread_specific> __thread_local_histogram(__num_bins, - _HistogramValueT{0}); - size_t __n = __last - __first; - tbb::parallel_for(tbb::blocked_range<_Size>(0, __n, 1024), [&](const tbb::blocked_range<_Size>& __range) { - std::vector<_HistogramValueT>& __local_histogram = __thread_local_histogram.local(); - __f(__first + __range.begin(), __first + __range.end(), __local_histogram.begin()); - }); - - tbb::parallel_for(tbb::blocked_range<_Size>(0, __num_bins), [&](const tbb::blocked_range<_Size>& __range) { - auto __local_histogram = __thread_local_histogram.begin(); - __init(__local_histogram->begin() + __range.begin(), __range.size(), __histogram_first + __range.begin()); - ++__local_histogram; - for (; __local_histogram != __thread_local_histogram.end(); ++__local_histogram) - { - __accum(__local_histogram->begin() + __range.begin(), __range.size(), - __histogram_first + __range.begin()); - } - }); - }); -} } // namespace __tbb_backend } // namespace dpl From 9f74810eda565c9db59bb7375868a83b8055fdb5 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Wed, 18 Dec 2024 17:08:55 -0500 Subject: [PATCH 26/50] further cleanup / formatting Signed-off-by: Dan Hoeflinger --- include/oneapi/dpl/pstl/algorithm_impl.h | 20 ++++++++++--------- include/oneapi/dpl/pstl/omp/util.h | 10 ++++++---- .../oneapi/dpl/pstl/parallel_backend_serial.h | 12 +++++++---- .../oneapi/dpl/pstl/parallel_backend_tbb.h | 16 +++++++++------ 4 files changed, 35 insertions(+), 23 deletions(-) diff --git a/include/oneapi/dpl/pstl/algorithm_impl.h b/include/oneapi/dpl/pstl/algorithm_impl.h index 0e6a9a09390..b923920f1f7 100644 --- a/include/oneapi/dpl/pstl/algorithm_impl.h +++ b/include/oneapi/dpl/pstl/algorithm_impl.h @@ -45,8 +45,7 @@ namespace __internal template auto -__create_thread_enumerable_storage(std::size_t __num_elements, - _ValueType __init_value) +__make_thread_enumerable_storage(std::size_t __num_elements, _ValueType __init_value) { return __par_backend::__thread_enumerable_storage{__num_elements, __init_value}; } @@ -4339,8 +4338,8 @@ __pattern_histogram(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _Rando _DiffType __n = __last - __first; if (__n > 0) { - auto __thread_enumerable_storage = oneapi::dpl::__internal::__create_thread_enumerable_storage( - __num_bins, _HistogramValueT{0}); + auto __thread_enumerable_storage = + oneapi::dpl::__internal::__make_thread_enumerable_storage(__num_bins, _HistogramValueT{0}); //main histogram loop __par_backend::__parallel_for(__backend_tag{}, ::std::forward<_ExecutionPolicy>(__exec), __first, __last, @@ -4353,19 +4352,22 @@ __pattern_histogram(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _Rando __par_backend::__parallel_for( __backend_tag{}, ::std::forward<_ExecutionPolicy>(__exec), __histogram_first, __histogram_first + __num_bins, - [__histogram_first, &__thread_enumerable_storage](auto __global_histogram_first, auto __global_histogram_last) { + [__histogram_first, &__thread_enumerable_storage](auto __global_histogram_first, + auto __global_histogram_last) { _DiffType __local_n = __global_histogram_last - __global_histogram_first; std::size_t __num_temporary_copies = __thread_enumerable_storage.size(); _DiffType __range_begin_id = __global_histogram_first - __histogram_first; //initialize output global histogram with first local histogram via assign - __internal::__brick_walk2_n(__thread_enumerable_storage.get_with_id(0) + __range_begin_id, __local_n, __global_histogram_first, - oneapi::dpl::__internal::__pstl_assign(), _IsVector{}); + __internal::__brick_walk2_n(__thread_enumerable_storage.get_with_id(0) + __range_begin_id, __local_n, + __global_histogram_first, oneapi::dpl::__internal::__pstl_assign(), + _IsVector{}); for (std::size_t __i = 1; __i < __num_temporary_copies; ++__i) { //accumulate into output global histogram with other local histogram via += operator __internal::__brick_walk2_n( - __thread_enumerable_storage.get_with_id(__i) + __range_begin_id, __local_n, __global_histogram_first, - [](_HistogramValueT __x, _HistogramValueT& __y) { __y += __x; }, _IsVector{}); + __thread_enumerable_storage.get_with_id(__i) + __range_begin_id, __local_n, + __global_histogram_first, [](_HistogramValueT __x, _HistogramValueT& __y) { __y += __x; }, + _IsVector{}); } }); } diff --git a/include/oneapi/dpl/pstl/omp/util.h b/include/oneapi/dpl/pstl/omp/util.h index 8249007dbda..7ca0dc7b143 100644 --- a/include/oneapi/dpl/pstl/omp/util.h +++ b/include/oneapi/dpl/pstl/omp/util.h @@ -25,7 +25,6 @@ #include #include #include -#include #include "../parallel_backend_utils.h" #include "../unseq_backend_simd.h" @@ -172,17 +171,20 @@ struct __thread_enumerable_storage } } - std::size_t size() const + std::size_t + size() const { return __num_threads; } - auto get_with_id(std::size_t __i) + auto + get_with_id(std::size_t __i) { return __thread_specific_storage[__i].begin(); } - auto get() + auto + get() { return get_with_id(omp_get_thread_num()); } diff --git a/include/oneapi/dpl/pstl/parallel_backend_serial.h b/include/oneapi/dpl/pstl/parallel_backend_serial.h index 298b7447678..ec546bf3b91 100644 --- a/include/oneapi/dpl/pstl/parallel_backend_serial.h +++ b/include/oneapi/dpl/pstl/parallel_backend_serial.h @@ -46,19 +46,23 @@ template struct __thread_enumerable_storage { __thread_enumerable_storage(std::size_t __num_bins, _ValueType __init_value) : __storage(__num_bins, __init_value) - {} + { + } - std::size_t size() const + std::size_t + size() const { return std::size_t{1}; } - auto get() + auto + get() { return __storage.begin(); } - auto get_with_id(std::size_t __i) + auto + get_with_id(std::size_t __i) { return get(); } diff --git a/include/oneapi/dpl/pstl/parallel_backend_tbb.h b/include/oneapi/dpl/pstl/parallel_backend_tbb.h index 611d455dfe4..538a9de1701 100644 --- a/include/oneapi/dpl/pstl/parallel_backend_tbb.h +++ b/include/oneapi/dpl/pstl/parallel_backend_tbb.h @@ -1310,20 +1310,25 @@ __parallel_for_each(oneapi::dpl::__internal::__tbb_backend_tag, _ExecutionPolicy template struct __thread_enumerable_storage { - __thread_enumerable_storage(std::size_t __num_bins, _ValueType __init_value) : __thread_specific_storage(__num_bins, __init_value) - {} + __thread_enumerable_storage(std::size_t __num_bins, _ValueType __init_value) + : __thread_specific_storage(__num_bins, __init_value) + { + } - std::size_t size() const + std::size_t + size() const { return __thread_specific_storage.size(); } - auto get() + auto + get() { return __thread_specific_storage.local().begin(); } - auto get_with_id(std::size_t __i) + auto + get_with_id(std::size_t __i) { return __thread_specific_storage.begin()[__i].begin(); } @@ -1331,7 +1336,6 @@ struct __thread_enumerable_storage tbb::enumerable_thread_specific> __thread_specific_storage; }; - } // namespace __tbb_backend } // namespace dpl } // namespace oneapi From e0ed14c8ea02d679e663229bf29dbc0a6dfcb7ae Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Fri, 20 Dec 2024 12:01:29 -0500 Subject: [PATCH 27/50] add grain size todo Signed-off-by: Dan Hoeflinger --- include/oneapi/dpl/pstl/algorithm_impl.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/oneapi/dpl/pstl/algorithm_impl.h b/include/oneapi/dpl/pstl/algorithm_impl.h index b923920f1f7..be24619614d 100644 --- a/include/oneapi/dpl/pstl/algorithm_impl.h +++ b/include/oneapi/dpl/pstl/algorithm_impl.h @@ -4342,6 +4342,7 @@ __pattern_histogram(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _Rando oneapi::dpl::__internal::__make_thread_enumerable_storage(__num_bins, _HistogramValueT{0}); //main histogram loop + //TODO: add defaulted grain-size option for __parallel_for and use larger one here to account for overhead __par_backend::__parallel_for(__backend_tag{}, ::std::forward<_ExecutionPolicy>(__exec), __first, __last, [__func, &__thread_enumerable_storage](_RandomAccessIterator1 __first_local, _RandomAccessIterator1 __last_local) { From db2a474988d469dadef578141e257e5b9ec6a2c2 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Fri, 20 Dec 2024 16:44:22 -0500 Subject: [PATCH 28/50] more general thread local storage Signed-off-by: Dan Hoeflinger --- include/oneapi/dpl/pstl/algorithm_impl.h | 22 ++++++------------- include/oneapi/dpl/pstl/omp/util.h | 17 +++++++------- .../oneapi/dpl/pstl/parallel_backend_serial.h | 13 ++++++----- .../oneapi/dpl/pstl/parallel_backend_tbb.h | 17 +++++++------- 4 files changed, 32 insertions(+), 37 deletions(-) diff --git a/include/oneapi/dpl/pstl/algorithm_impl.h b/include/oneapi/dpl/pstl/algorithm_impl.h index be24619614d..9c692e38d75 100644 --- a/include/oneapi/dpl/pstl/algorithm_impl.h +++ b/include/oneapi/dpl/pstl/algorithm_impl.h @@ -43,13 +43,6 @@ namespace dpl namespace __internal { -template -auto -__make_thread_enumerable_storage(std::size_t __num_elements, _ValueType __init_value) -{ - return __par_backend::__thread_enumerable_storage{__num_elements, __init_value}; -} - //------------------------------------------------------------------------ // any_of //------------------------------------------------------------------------ @@ -4338,35 +4331,34 @@ __pattern_histogram(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _Rando _DiffType __n = __last - __first; if (__n > 0) { - auto __thread_enumerable_storage = - oneapi::dpl::__internal::__make_thread_enumerable_storage(__num_bins, _HistogramValueT{0}); + __par_backend::__thread_enumerable_storage> __tls{__num_bins, _HistogramValueT{0}}; //main histogram loop //TODO: add defaulted grain-size option for __parallel_for and use larger one here to account for overhead __par_backend::__parallel_for(__backend_tag{}, ::std::forward<_ExecutionPolicy>(__exec), __first, __last, - [__func, &__thread_enumerable_storage](_RandomAccessIterator1 __first_local, + [__func, &__tls](_RandomAccessIterator1 __first_local, _RandomAccessIterator1 __last_local) { __internal::__brick_histogram(__first_local, __last_local, __func, - __thread_enumerable_storage.get(), _IsVector{}); + __tls.get().begin(), _IsVector{}); }); // now accumulate temporary storage into output global histogram __par_backend::__parallel_for( __backend_tag{}, ::std::forward<_ExecutionPolicy>(__exec), __histogram_first, __histogram_first + __num_bins, - [__histogram_first, &__thread_enumerable_storage](auto __global_histogram_first, + [__histogram_first, &__tls](auto __global_histogram_first, auto __global_histogram_last) { _DiffType __local_n = __global_histogram_last - __global_histogram_first; - std::size_t __num_temporary_copies = __thread_enumerable_storage.size(); + std::size_t __num_temporary_copies = __tls.size(); _DiffType __range_begin_id = __global_histogram_first - __histogram_first; //initialize output global histogram with first local histogram via assign - __internal::__brick_walk2_n(__thread_enumerable_storage.get_with_id(0) + __range_begin_id, __local_n, + __internal::__brick_walk2_n(__tls.get_with_id(0).begin() + __range_begin_id, __local_n, __global_histogram_first, oneapi::dpl::__internal::__pstl_assign(), _IsVector{}); for (std::size_t __i = 1; __i < __num_temporary_copies; ++__i) { //accumulate into output global histogram with other local histogram via += operator __internal::__brick_walk2_n( - __thread_enumerable_storage.get_with_id(__i) + __range_begin_id, __local_n, + __tls.get_with_id(__i).begin() + __range_begin_id, __local_n, __global_histogram_first, [](_HistogramValueT __x, _HistogramValueT& __y) { __y += __x; }, _IsVector{}); } diff --git a/include/oneapi/dpl/pstl/omp/util.h b/include/oneapi/dpl/pstl/omp/util.h index 7ca0dc7b143..cf2de8f8f43 100644 --- a/include/oneapi/dpl/pstl/omp/util.h +++ b/include/oneapi/dpl/pstl/omp/util.h @@ -153,20 +153,21 @@ __process_chunk(const __chunk_metrics& __metrics, _Iterator __base, _Index __chu __f(__first, __last); } -template +template struct __thread_enumerable_storage { - __thread_enumerable_storage(std::size_t __num_bins, _ValueType __init_value) + template + __thread_enumerable_storage(Args&&... args) { _PSTL_PRAGMA(omp parallel) _PSTL_PRAGMA(omp single nowait) { __num_threads = omp_get_num_threads(); __thread_specific_storage.resize(__num_threads); - _PSTL_PRAGMA(omp taskloop shared(__thread_specific_storage, __num_bins, __init_value)) + _PSTL_PRAGMA(omp taskloop shared(__thread_specific_storage)) for (std::size_t __tid = 0; __tid < __num_threads; ++__tid) { - __thread_specific_storage[__tid].resize(__num_bins, __init_value); + __thread_specific_storage[__tid] = std::make_unique<_StorageType>(std::forward(args)...); } } } @@ -177,19 +178,19 @@ struct __thread_enumerable_storage return __num_threads; } - auto + _StorageType& get_with_id(std::size_t __i) { - return __thread_specific_storage[__i].begin(); + return *__thread_specific_storage[__i]; } - auto + _StorageType& get() { return get_with_id(omp_get_thread_num()); } - std::vector> __thread_specific_storage; + std::vector> __thread_specific_storage; std::size_t __num_threads; }; diff --git a/include/oneapi/dpl/pstl/parallel_backend_serial.h b/include/oneapi/dpl/pstl/parallel_backend_serial.h index ec546bf3b91..2c92d878f87 100644 --- a/include/oneapi/dpl/pstl/parallel_backend_serial.h +++ b/include/oneapi/dpl/pstl/parallel_backend_serial.h @@ -42,10 +42,11 @@ __cancel_execution(oneapi::dpl::__internal::__serial_backend_tag) { } -template +template struct __thread_enumerable_storage { - __thread_enumerable_storage(std::size_t __num_bins, _ValueType __init_value) : __storage(__num_bins, __init_value) + template + __thread_enumerable_storage(Args&&... args) : __storage(std::forward(args)...) { } @@ -55,19 +56,19 @@ struct __thread_enumerable_storage return std::size_t{1}; } - auto + _StorageType& get() { - return __storage.begin(); + return __storage; } - auto + _StorageType& get_with_id(std::size_t __i) { return get(); } - std::vector<_ValueType> __storage; + _StorageType __storage; }; template diff --git a/include/oneapi/dpl/pstl/parallel_backend_tbb.h b/include/oneapi/dpl/pstl/parallel_backend_tbb.h index 538a9de1701..cbb6e3b24df 100644 --- a/include/oneapi/dpl/pstl/parallel_backend_tbb.h +++ b/include/oneapi/dpl/pstl/parallel_backend_tbb.h @@ -1307,11 +1307,12 @@ __parallel_for_each(oneapi::dpl::__internal::__tbb_backend_tag, _ExecutionPolicy tbb::this_task_arena::isolate([&]() { tbb::parallel_for_each(__begin, __end, __f); }); } -template +template struct __thread_enumerable_storage { - __thread_enumerable_storage(std::size_t __num_bins, _ValueType __init_value) - : __thread_specific_storage(__num_bins, __init_value) + template + __thread_enumerable_storage(Args&&... args) + : __thread_specific_storage(std::forward(args)...) { } @@ -1321,19 +1322,19 @@ struct __thread_enumerable_storage return __thread_specific_storage.size(); } - auto + _StorageType& get() { - return __thread_specific_storage.local().begin(); + return __thread_specific_storage.local(); } - auto + _StorageType& get_with_id(std::size_t __i) { - return __thread_specific_storage.begin()[__i].begin(); + return __thread_specific_storage.begin()[__i]; } - tbb::enumerable_thread_specific> __thread_specific_storage; + tbb::enumerable_thread_specific<_StorageType> __thread_specific_storage; }; } // namespace __tbb_backend From 240447eb0e6ed852f6d5b9db412f879a1907c4ee Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Mon, 30 Dec 2024 10:15:21 -0500 Subject: [PATCH 29/50] implement omp on demand tls Signed-off-by: Dan Hoeflinger --- include/oneapi/dpl/pstl/algorithm_impl.h | 4 +- include/oneapi/dpl/pstl/omp/util.h | 64 ++++++++++++++----- .../oneapi/dpl/pstl/parallel_backend_serial.h | 6 +- .../oneapi/dpl/pstl/parallel_backend_tbb.h | 4 +- 4 files changed, 56 insertions(+), 22 deletions(-) diff --git a/include/oneapi/dpl/pstl/algorithm_impl.h b/include/oneapi/dpl/pstl/algorithm_impl.h index 9c692e38d75..785d7d8d4bd 100644 --- a/include/oneapi/dpl/pstl/algorithm_impl.h +++ b/include/oneapi/dpl/pstl/algorithm_impl.h @@ -4348,13 +4348,13 @@ __pattern_histogram(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _Rando [__histogram_first, &__tls](auto __global_histogram_first, auto __global_histogram_last) { _DiffType __local_n = __global_histogram_last - __global_histogram_first; - std::size_t __num_temporary_copies = __tls.size(); + std::uint32_t __num_temporary_copies = __tls.size(); _DiffType __range_begin_id = __global_histogram_first - __histogram_first; //initialize output global histogram with first local histogram via assign __internal::__brick_walk2_n(__tls.get_with_id(0).begin() + __range_begin_id, __local_n, __global_histogram_first, oneapi::dpl::__internal::__pstl_assign(), _IsVector{}); - for (std::size_t __i = 1; __i < __num_temporary_copies; ++__i) + for (std::uint32_t __i = 1; __i < __num_temporary_copies; ++__i) { //accumulate into output global histogram with other local histogram via += operator __internal::__brick_walk2_n( diff --git a/include/oneapi/dpl/pstl/omp/util.h b/include/oneapi/dpl/pstl/omp/util.h index cf2de8f8f43..31a9673c7b3 100644 --- a/include/oneapi/dpl/pstl/omp/util.h +++ b/include/oneapi/dpl/pstl/omp/util.h @@ -25,6 +25,7 @@ #include #include #include +#include #include "../parallel_backend_utils.h" #include "../unseq_backend_simd.h" @@ -153,45 +154,78 @@ __process_chunk(const __chunk_metrics& __metrics, _Iterator __base, _Index __chu __f(__first, __last); } +template +class __construct_by_args_base { +public: + virtual ~__construct_by_args_base() { } + virtual std::unique_ptr<_StorageType> construct() = 0; +}; + +template +class __construct_by_args: public __construct_by_args_base<_StorageType>{ + public: + std::unique_ptr<_StorageType> construct() { + return std::move(std::apply([](auto... __arg_pack) { return std::make_unique<_StorageType>(__arg_pack...);}, pack)); + } + __construct_by_args( _P&& ... args ) : pack(std::forward<_P>(args)...) {} + private: + std::tuple<_P...> pack; +}; + template struct __thread_enumerable_storage { template - __thread_enumerable_storage(Args&&... args) + __thread_enumerable_storage(Args&&... args) : __num_elements(0) { + __construct_helper = std::make_unique<__construct_by_args<_StorageType, Args...>>(std::forward(args)...); _PSTL_PRAGMA(omp parallel) - _PSTL_PRAGMA(omp single nowait) + _PSTL_PRAGMA(omp single) { - __num_threads = omp_get_num_threads(); - __thread_specific_storage.resize(__num_threads); - _PSTL_PRAGMA(omp taskloop shared(__thread_specific_storage)) - for (std::size_t __tid = 0; __tid < __num_threads; ++__tid) - { - __thread_specific_storage[__tid] = std::make_unique<_StorageType>(std::forward(args)...); - } + __thread_specific_storage.resize(omp_get_num_threads()); } } - std::size_t + std::uint32_t size() const { - return __num_threads; + return __num_elements.load(); } _StorageType& - get_with_id(std::size_t __i) + get_with_id(std::uint32_t __i) { - return *__thread_specific_storage[__i]; + if (__i < size()) + { + std::uint32_t __count = 0; + std::uint32_t __j = 0; + for (; __j < __thread_specific_storage.size() && __count <= __i; ++__j) + { + if (__thread_specific_storage[__j]) + { + __count++; + } + } + // Need to back up one once we have found a valid element + return *__thread_specific_storage[__j-1]; + } } _StorageType& get() { - return get_with_id(omp_get_thread_num()); + std::uint32_t __i = omp_get_thread_num(); + if (!__thread_specific_storage[__i]) + { + __thread_specific_storage[__i] = __construct_helper->construct(); + __num_elements.fetch_add(1); + } + return *__thread_specific_storage[__i]; } std::vector> __thread_specific_storage; - std::size_t __num_threads; + std::atomic __num_elements; + std::unique_ptr<__construct_by_args_base<_StorageType>> __construct_helper; }; } // namespace __omp_backend diff --git a/include/oneapi/dpl/pstl/parallel_backend_serial.h b/include/oneapi/dpl/pstl/parallel_backend_serial.h index 2c92d878f87..67db9e5114f 100644 --- a/include/oneapi/dpl/pstl/parallel_backend_serial.h +++ b/include/oneapi/dpl/pstl/parallel_backend_serial.h @@ -50,10 +50,10 @@ struct __thread_enumerable_storage { } - std::size_t + std::uint32_t size() const { - return std::size_t{1}; + return std::uint32_t{1}; } _StorageType& @@ -63,7 +63,7 @@ struct __thread_enumerable_storage } _StorageType& - get_with_id(std::size_t __i) + get_with_id(std::uint32_t __i) { return get(); } diff --git a/include/oneapi/dpl/pstl/parallel_backend_tbb.h b/include/oneapi/dpl/pstl/parallel_backend_tbb.h index cbb6e3b24df..5f2c40a25f4 100644 --- a/include/oneapi/dpl/pstl/parallel_backend_tbb.h +++ b/include/oneapi/dpl/pstl/parallel_backend_tbb.h @@ -1316,7 +1316,7 @@ struct __thread_enumerable_storage { } - std::size_t + std::uint32_t size() const { return __thread_specific_storage.size(); @@ -1329,7 +1329,7 @@ struct __thread_enumerable_storage } _StorageType& - get_with_id(std::size_t __i) + get_with_id(std::uint32_t __i) { return __thread_specific_storage.begin()[__i]; } From 741f8e30e132d96c950e3e08804102afd43d57a6 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Mon, 30 Dec 2024 10:16:14 -0500 Subject: [PATCH 30/50] formatting Signed-off-by: Dan Hoeflinger --- include/oneapi/dpl/pstl/omp/util.h | 40 ++++++++++++++++-------------- 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/include/oneapi/dpl/pstl/omp/util.h b/include/oneapi/dpl/pstl/omp/util.h index 31a9673c7b3..d6c13bfda7c 100644 --- a/include/oneapi/dpl/pstl/omp/util.h +++ b/include/oneapi/dpl/pstl/omp/util.h @@ -154,22 +154,29 @@ __process_chunk(const __chunk_metrics& __metrics, _Iterator __base, _Index __chu __f(__first, __last); } -template -class __construct_by_args_base { -public: - virtual ~__construct_by_args_base() { } - virtual std::unique_ptr<_StorageType> construct() = 0; +template +class __construct_by_args_base +{ + public: + virtual ~__construct_by_args_base() {} + virtual std::unique_ptr<_StorageType> + construct() = 0; }; -template -class __construct_by_args: public __construct_by_args_base<_StorageType>{ - public: - std::unique_ptr<_StorageType> construct() { - return std::move(std::apply([](auto... __arg_pack) { return std::make_unique<_StorageType>(__arg_pack...);}, pack)); +template +class __construct_by_args : public __construct_by_args_base<_StorageType> +{ + public: + std::unique_ptr<_StorageType> + construct() + { + return std::move( + std::apply([](auto... __arg_pack) { return std::make_unique<_StorageType>(__arg_pack...); }, pack)); } - __construct_by_args( _P&& ... args ) : pack(std::forward<_P>(args)...) {} - private: - std::tuple<_P...> pack; + __construct_by_args(_P&&... args) : pack(std::forward<_P>(args)...) {} + + private: + std::tuple<_P...> pack; }; template @@ -180,10 +187,7 @@ struct __thread_enumerable_storage { __construct_helper = std::make_unique<__construct_by_args<_StorageType, Args...>>(std::forward(args)...); _PSTL_PRAGMA(omp parallel) - _PSTL_PRAGMA(omp single) - { - __thread_specific_storage.resize(omp_get_num_threads()); - } + _PSTL_PRAGMA(omp single) { __thread_specific_storage.resize(omp_get_num_threads()); } } std::uint32_t @@ -207,7 +211,7 @@ struct __thread_enumerable_storage } } // Need to back up one once we have found a valid element - return *__thread_specific_storage[__j-1]; + return *__thread_specific_storage[__j - 1]; } } From fd310b6629e323a7c6071e7c6c87c3a52cc3758b Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Mon, 30 Dec 2024 11:07:04 -0500 Subject: [PATCH 31/50] formatting Signed-off-by: Dan Hoeflinger --- include/oneapi/dpl/pstl/algorithm_impl.h | 22 +++++++++---------- .../oneapi/dpl/pstl/parallel_backend_tbb.h | 3 +-- 2 files changed, 11 insertions(+), 14 deletions(-) diff --git a/include/oneapi/dpl/pstl/algorithm_impl.h b/include/oneapi/dpl/pstl/algorithm_impl.h index 785d7d8d4bd..a204e11e8ad 100644 --- a/include/oneapi/dpl/pstl/algorithm_impl.h +++ b/include/oneapi/dpl/pstl/algorithm_impl.h @@ -4331,22 +4331,21 @@ __pattern_histogram(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _Rando _DiffType __n = __last - __first; if (__n > 0) { - __par_backend::__thread_enumerable_storage> __tls{__num_bins, _HistogramValueT{0}}; + __par_backend::__thread_enumerable_storage> __tls{__num_bins, + _HistogramValueT{0}}; //main histogram loop //TODO: add defaulted grain-size option for __parallel_for and use larger one here to account for overhead - __par_backend::__parallel_for(__backend_tag{}, ::std::forward<_ExecutionPolicy>(__exec), __first, __last, - [__func, &__tls](_RandomAccessIterator1 __first_local, - _RandomAccessIterator1 __last_local) { - __internal::__brick_histogram(__first_local, __last_local, __func, - __tls.get().begin(), _IsVector{}); - }); + __par_backend::__parallel_for( + __backend_tag{}, ::std::forward<_ExecutionPolicy>(__exec), __first, __last, + [__func, &__tls](_RandomAccessIterator1 __first_local, _RandomAccessIterator1 __last_local) { + __internal::__brick_histogram(__first_local, __last_local, __func, __tls.get().begin(), _IsVector{}); + }); // now accumulate temporary storage into output global histogram __par_backend::__parallel_for( __backend_tag{}, ::std::forward<_ExecutionPolicy>(__exec), __histogram_first, __histogram_first + __num_bins, - [__histogram_first, &__tls](auto __global_histogram_first, - auto __global_histogram_last) { + [__histogram_first, &__tls](auto __global_histogram_first, auto __global_histogram_last) { _DiffType __local_n = __global_histogram_last - __global_histogram_first; std::uint32_t __num_temporary_copies = __tls.size(); _DiffType __range_begin_id = __global_histogram_first - __histogram_first; @@ -4358,9 +4357,8 @@ __pattern_histogram(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _Rando { //accumulate into output global histogram with other local histogram via += operator __internal::__brick_walk2_n( - __tls.get_with_id(__i).begin() + __range_begin_id, __local_n, - __global_histogram_first, [](_HistogramValueT __x, _HistogramValueT& __y) { __y += __x; }, - _IsVector{}); + __tls.get_with_id(__i).begin() + __range_begin_id, __local_n, __global_histogram_first, + [](_HistogramValueT __x, _HistogramValueT& __y) { __y += __x; }, _IsVector{}); } }); } diff --git a/include/oneapi/dpl/pstl/parallel_backend_tbb.h b/include/oneapi/dpl/pstl/parallel_backend_tbb.h index 5f2c40a25f4..8dc470fd78b 100644 --- a/include/oneapi/dpl/pstl/parallel_backend_tbb.h +++ b/include/oneapi/dpl/pstl/parallel_backend_tbb.h @@ -1311,8 +1311,7 @@ template struct __thread_enumerable_storage { template - __thread_enumerable_storage(Args&&... args) - : __thread_specific_storage(std::forward(args)...) + __thread_enumerable_storage(Args&&... args) : __thread_specific_storage(std::forward(args)...) { } From 6820340c037c65093501d66d2296e227ee9acda2 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Mon, 30 Dec 2024 11:19:43 -0500 Subject: [PATCH 32/50] comments and clarity Signed-off-by: Dan Hoeflinger --- include/oneapi/dpl/pstl/algorithm_impl.h | 3 +- include/oneapi/dpl/pstl/omp/util.h | 29 ++++++++++++------- .../oneapi/dpl/pstl/parallel_backend_serial.h | 2 +- .../oneapi/dpl/pstl/parallel_backend_tbb.h | 2 +- 4 files changed, 22 insertions(+), 14 deletions(-) diff --git a/include/oneapi/dpl/pstl/algorithm_impl.h b/include/oneapi/dpl/pstl/algorithm_impl.h index a204e11e8ad..ce36636ceca 100644 --- a/include/oneapi/dpl/pstl/algorithm_impl.h +++ b/include/oneapi/dpl/pstl/algorithm_impl.h @@ -4339,7 +4339,8 @@ __pattern_histogram(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _Rando __par_backend::__parallel_for( __backend_tag{}, ::std::forward<_ExecutionPolicy>(__exec), __first, __last, [__func, &__tls](_RandomAccessIterator1 __first_local, _RandomAccessIterator1 __last_local) { - __internal::__brick_histogram(__first_local, __last_local, __func, __tls.get().begin(), _IsVector{}); + __internal::__brick_histogram(__first_local, __last_local, __func, + __tls.get_for_current_thread().begin(), _IsVector{}); }); // now accumulate temporary storage into output global histogram __par_backend::__parallel_for( diff --git a/include/oneapi/dpl/pstl/omp/util.h b/include/oneapi/dpl/pstl/omp/util.h index d6c13bfda7c..f5d9e018def 100644 --- a/include/oneapi/dpl/pstl/omp/util.h +++ b/include/oneapi/dpl/pstl/omp/util.h @@ -154,6 +154,9 @@ __process_chunk(const __chunk_metrics& __metrics, _Iterator __base, _Index __chu __f(__first, __last); } + +// abstract class to allow inclusion in __thread_enumerable_storage as member without requiring explicit template +// instantiation of param types template class __construct_by_args_base { @@ -163,6 +166,7 @@ class __construct_by_args_base construct() = 0; }; +// Helper class to allow construction of _StorageType from a stored argument pack template class __construct_by_args : public __construct_by_args_base<_StorageType> { @@ -193,34 +197,37 @@ struct __thread_enumerable_storage std::uint32_t size() const { + // only count storage which has been instantiated return __num_elements.load(); } _StorageType& get_with_id(std::uint32_t __i) { - if (__i < size()) + assert(__i < size()); + + std::uint32_t __count = 0; + std::uint32_t __j = 0; + + for (; __j < __thread_specific_storage.size() && __count <= __i; ++__j) { - std::uint32_t __count = 0; - std::uint32_t __j = 0; - for (; __j < __thread_specific_storage.size() && __count <= __i; ++__j) + // Only include storage from threads which have instantiated a storage object + if (__thread_specific_storage[__j]) { - if (__thread_specific_storage[__j]) - { - __count++; - } + __count++; } - // Need to back up one once we have found a valid element - return *__thread_specific_storage[__j - 1]; } + // Need to back up one once we have found a valid storage object + return *__thread_specific_storage[__j - 1]; } _StorageType& - get() + get_for_current_thread() { std::uint32_t __i = omp_get_thread_num(); if (!__thread_specific_storage[__i]) { + // create temporary storage on first usage to avoid extra parallel region and unnecessary instantiation __thread_specific_storage[__i] = __construct_helper->construct(); __num_elements.fetch_add(1); } diff --git a/include/oneapi/dpl/pstl/parallel_backend_serial.h b/include/oneapi/dpl/pstl/parallel_backend_serial.h index 67db9e5114f..4795736fde8 100644 --- a/include/oneapi/dpl/pstl/parallel_backend_serial.h +++ b/include/oneapi/dpl/pstl/parallel_backend_serial.h @@ -57,7 +57,7 @@ struct __thread_enumerable_storage } _StorageType& - get() + get_for_current_thread() { return __storage; } diff --git a/include/oneapi/dpl/pstl/parallel_backend_tbb.h b/include/oneapi/dpl/pstl/parallel_backend_tbb.h index 8dc470fd78b..de36b97d1ef 100644 --- a/include/oneapi/dpl/pstl/parallel_backend_tbb.h +++ b/include/oneapi/dpl/pstl/parallel_backend_tbb.h @@ -1322,7 +1322,7 @@ struct __thread_enumerable_storage } _StorageType& - get() + get_for_current_thread() { return __thread_specific_storage.local(); } From 4798d2ca8faa5abff1cbd3952e72a58083e30286 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Mon, 30 Dec 2024 11:21:32 -0500 Subject: [PATCH 33/50] bugfix for serial impl Signed-off-by: Dan Hoeflinger --- include/oneapi/dpl/pstl/parallel_backend_serial.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/parallel_backend_serial.h b/include/oneapi/dpl/pstl/parallel_backend_serial.h index 4795736fde8..007a190b236 100644 --- a/include/oneapi/dpl/pstl/parallel_backend_serial.h +++ b/include/oneapi/dpl/pstl/parallel_backend_serial.h @@ -65,7 +65,7 @@ struct __thread_enumerable_storage _StorageType& get_with_id(std::uint32_t __i) { - return get(); + return get_for_current_thread(); } _StorageType __storage; From ce8b55f9e51de898b537c4f6a3a2cdb372d45d9e Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Mon, 30 Dec 2024 11:23:26 -0500 Subject: [PATCH 34/50] removing debugging output Signed-off-by: Dan Hoeflinger --- test/parallel_api/numeric/numeric.ops/histogram.pass.cpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/test/parallel_api/numeric/numeric.ops/histogram.pass.cpp b/test/parallel_api/numeric/numeric.ops/histogram.pass.cpp index e532b56a091..3ea20b724e3 100644 --- a/test/parallel_api/numeric/numeric.ops/histogram.pass.cpp +++ b/test/parallel_api/numeric/numeric.ops/histogram.pass.cpp @@ -34,14 +34,8 @@ struct test_histogram_even_bins Size trash) { const Size bin_size = bin_last - bin_first; - std::cout<<"n: "< Date: Mon, 30 Dec 2024 15:59:37 -0500 Subject: [PATCH 35/50] formatting Signed-off-by: Dan Hoeflinger --- include/oneapi/dpl/pstl/omp/util.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/omp/util.h b/include/oneapi/dpl/pstl/omp/util.h index f5d9e018def..e945cf52109 100644 --- a/include/oneapi/dpl/pstl/omp/util.h +++ b/include/oneapi/dpl/pstl/omp/util.h @@ -154,7 +154,6 @@ __process_chunk(const __chunk_metrics& __metrics, _Iterator __base, _Index __chu __f(__first, __last); } - // abstract class to allow inclusion in __thread_enumerable_storage as member without requiring explicit template // instantiation of param types template From 66300186d9f2576999d5270024cf9a42bfd49ce6 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Mon, 30 Dec 2024 16:05:05 -0500 Subject: [PATCH 36/50] comment adjustment Signed-off-by: Dan Hoeflinger --- test/parallel_api/numeric/numeric.ops/histogram.pass.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/parallel_api/numeric/numeric.ops/histogram.pass.cpp b/test/parallel_api/numeric/numeric.ops/histogram.pass.cpp index 3ea20b724e3..1ce168859e4 100644 --- a/test/parallel_api/numeric/numeric.ops/histogram.pass.cpp +++ b/test/parallel_api/numeric/numeric.ops/histogram.pass.cpp @@ -124,7 +124,8 @@ test_histogram(T min_boundary, T max_boundary, T overflow, Size jitter, Size tra test_range_and_even_histogram(n, min_boundary, max_boundary, overflow, jitter, bin_size, trash); } } - //single large case to hit the atomic case + + //single large case for coverage Size n = 1100000; Size bin_size = 1100000; test_range_and_even_histogram(n, min_boundary, max_boundary, overflow, jitter, bin_size, trash); From 83a9f81d1d48ffbca9a74d1630553846db05a056 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Mon, 30 Dec 2024 17:02:05 -0500 Subject: [PATCH 37/50] minor naming / formatting Signed-off-by: Dan Hoeflinger --- include/oneapi/dpl/pstl/omp/util.h | 13 ++++++------- include/oneapi/dpl/pstl/parallel_backend_serial.h | 2 +- include/oneapi/dpl/pstl/parallel_backend_tbb.h | 2 +- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/include/oneapi/dpl/pstl/omp/util.h b/include/oneapi/dpl/pstl/omp/util.h index e945cf52109..28295a9533a 100644 --- a/include/oneapi/dpl/pstl/omp/util.h +++ b/include/oneapi/dpl/pstl/omp/util.h @@ -161,8 +161,7 @@ class __construct_by_args_base { public: virtual ~__construct_by_args_base() {} - virtual std::unique_ptr<_StorageType> - construct() = 0; + virtual std::unique_ptr<_StorageType> construct() = 0; }; // Helper class to allow construction of _StorageType from a stored argument pack @@ -174,21 +173,21 @@ class __construct_by_args : public __construct_by_args_base<_StorageType> construct() { return std::move( - std::apply([](auto... __arg_pack) { return std::make_unique<_StorageType>(__arg_pack...); }, pack)); + std::apply([](auto... __arg_pack) { return std::make_unique<_StorageType>(__arg_pack...); }, __pack)); } - __construct_by_args(_P&&... args) : pack(std::forward<_P>(args)...) {} + __construct_by_args(_P&&... __args) : __pack(std::forward<_P>(__args)...) {} private: - std::tuple<_P...> pack; + std::tuple<_P...> __pack; }; template struct __thread_enumerable_storage { template - __thread_enumerable_storage(Args&&... args) : __num_elements(0) + __thread_enumerable_storage(Args&&... __args) : __num_elements(0) { - __construct_helper = std::make_unique<__construct_by_args<_StorageType, Args...>>(std::forward(args)...); + __construct_helper = std::make_unique<__construct_by_args<_StorageType, Args...>>(std::forward(__args)...); _PSTL_PRAGMA(omp parallel) _PSTL_PRAGMA(omp single) { __thread_specific_storage.resize(omp_get_num_threads()); } } diff --git a/include/oneapi/dpl/pstl/parallel_backend_serial.h b/include/oneapi/dpl/pstl/parallel_backend_serial.h index 007a190b236..01a67ec57d0 100644 --- a/include/oneapi/dpl/pstl/parallel_backend_serial.h +++ b/include/oneapi/dpl/pstl/parallel_backend_serial.h @@ -46,7 +46,7 @@ template struct __thread_enumerable_storage { template - __thread_enumerable_storage(Args&&... args) : __storage(std::forward(args)...) + __thread_enumerable_storage(Args&&... __args) : __storage(std::forward(__args)...) { } diff --git a/include/oneapi/dpl/pstl/parallel_backend_tbb.h b/include/oneapi/dpl/pstl/parallel_backend_tbb.h index de36b97d1ef..804a79c514e 100644 --- a/include/oneapi/dpl/pstl/parallel_backend_tbb.h +++ b/include/oneapi/dpl/pstl/parallel_backend_tbb.h @@ -1311,7 +1311,7 @@ template struct __thread_enumerable_storage { template - __thread_enumerable_storage(Args&&... args) : __thread_specific_storage(std::forward(args)...) + __thread_enumerable_storage(Args&&... __args) : __thread_specific_storage(std::forward(__args)...) { } From a3f13048a723ea8514fd3de0f6b8f81f42c687cb Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Mon, 30 Dec 2024 17:10:28 -0500 Subject: [PATCH 38/50] formatting Signed-off-by: Dan Hoeflinger --- include/oneapi/dpl/pstl/omp/util.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/omp/util.h b/include/oneapi/dpl/pstl/omp/util.h index 28295a9533a..af04983e832 100644 --- a/include/oneapi/dpl/pstl/omp/util.h +++ b/include/oneapi/dpl/pstl/omp/util.h @@ -187,7 +187,8 @@ struct __thread_enumerable_storage template __thread_enumerable_storage(Args&&... __args) : __num_elements(0) { - __construct_helper = std::make_unique<__construct_by_args<_StorageType, Args...>>(std::forward(__args)...); + __construct_helper = + std::make_unique<__construct_by_args<_StorageType, Args...>>(std::forward(__args)...); _PSTL_PRAGMA(omp parallel) _PSTL_PRAGMA(omp single) { __thread_specific_storage.resize(omp_get_num_threads()); } } From 14cca582902f1d90531012c46448115ef08eca8c Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Mon, 6 Jan 2025 09:20:29 -0500 Subject: [PATCH 39/50] Address review feedback Signed-off-by: Dan Hoeflinger --- include/oneapi/dpl/pstl/algorithm_impl.h | 6 +++--- include/oneapi/dpl/pstl/omp/util.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/oneapi/dpl/pstl/algorithm_impl.h b/include/oneapi/dpl/pstl/algorithm_impl.h index ce36636ceca..02b68f0571a 100644 --- a/include/oneapi/dpl/pstl/algorithm_impl.h +++ b/include/oneapi/dpl/pstl/algorithm_impl.h @@ -4326,7 +4326,7 @@ __pattern_histogram(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _Rando { using __backend_tag = typename __parallel_tag<_IsVector>::__backend_tag; using _HistogramValueT = typename std::iterator_traits<_RandomAccessIterator2>::value_type; - using _DiffType = typename ::std::iterator_traits<_RandomAccessIterator2>::difference_type; + using _DiffType = typename std::iterator_traits<_RandomAccessIterator2>::difference_type; _DiffType __n = __last - __first; if (__n > 0) @@ -4337,14 +4337,14 @@ __pattern_histogram(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _Rando //main histogram loop //TODO: add defaulted grain-size option for __parallel_for and use larger one here to account for overhead __par_backend::__parallel_for( - __backend_tag{}, ::std::forward<_ExecutionPolicy>(__exec), __first, __last, + __backend_tag{}, __exec, __first, __last, [__func, &__tls](_RandomAccessIterator1 __first_local, _RandomAccessIterator1 __last_local) { __internal::__brick_histogram(__first_local, __last_local, __func, __tls.get_for_current_thread().begin(), _IsVector{}); }); // now accumulate temporary storage into output global histogram __par_backend::__parallel_for( - __backend_tag{}, ::std::forward<_ExecutionPolicy>(__exec), __histogram_first, + __backend_tag{}, std::forward<_ExecutionPolicy>(__exec), __histogram_first, __histogram_first + __num_bins, [__histogram_first, &__tls](auto __global_histogram_first, auto __global_histogram_last) { _DiffType __local_n = __global_histogram_last - __global_histogram_first; diff --git a/include/oneapi/dpl/pstl/omp/util.h b/include/oneapi/dpl/pstl/omp/util.h index af04983e832..05e4f8f9ed6 100644 --- a/include/oneapi/dpl/pstl/omp/util.h +++ b/include/oneapi/dpl/pstl/omp/util.h @@ -170,7 +170,7 @@ class __construct_by_args : public __construct_by_args_base<_StorageType> { public: std::unique_ptr<_StorageType> - construct() + construct() override { return std::move( std::apply([](auto... __arg_pack) { return std::make_unique<_StorageType>(__arg_pack...); }, __pack)); From 766f42ceceae45637d252ca09de6f1eac810292b Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Mon, 6 Jan 2025 09:22:11 -0500 Subject: [PATCH 40/50] formatting Signed-off-by: Dan Hoeflinger --- include/oneapi/dpl/pstl/algorithm_impl.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/algorithm_impl.h b/include/oneapi/dpl/pstl/algorithm_impl.h index 02b68f0571a..77ba19c3e8e 100644 --- a/include/oneapi/dpl/pstl/algorithm_impl.h +++ b/include/oneapi/dpl/pstl/algorithm_impl.h @@ -4344,8 +4344,7 @@ __pattern_histogram(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _Rando }); // now accumulate temporary storage into output global histogram __par_backend::__parallel_for( - __backend_tag{}, std::forward<_ExecutionPolicy>(__exec), __histogram_first, - __histogram_first + __num_bins, + __backend_tag{}, std::forward<_ExecutionPolicy>(__exec), __histogram_first, __histogram_first + __num_bins, [__histogram_first, &__tls](auto __global_histogram_first, auto __global_histogram_last) { _DiffType __local_n = __global_histogram_last - __global_histogram_first; std::uint32_t __num_temporary_copies = __tls.size(); From 6425d3731d3183dbe04f9eda463ff5d8803f029d Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Mon, 13 Jan 2025 09:47:21 -0500 Subject: [PATCH 41/50] address review feedback Signed-off-by: Dan Hoeflinger --- include/oneapi/dpl/pstl/omp/util.h | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/include/oneapi/dpl/pstl/omp/util.h b/include/oneapi/dpl/pstl/omp/util.h index 05e4f8f9ed6..a202cd364e0 100644 --- a/include/oneapi/dpl/pstl/omp/util.h +++ b/include/oneapi/dpl/pstl/omp/util.h @@ -26,6 +26,7 @@ #include #include #include +#include #include "../parallel_backend_utils.h" #include "../unseq_backend_simd.h" @@ -160,7 +161,7 @@ template class __construct_by_args_base { public: - virtual ~__construct_by_args_base() {} + virtual ~__construct_by_args_base() = default; virtual std::unique_ptr<_StorageType> construct() = 0; }; @@ -172,13 +173,12 @@ class __construct_by_args : public __construct_by_args_base<_StorageType> std::unique_ptr<_StorageType> construct() override { - return std::move( - std::apply([](auto... __arg_pack) { return std::make_unique<_StorageType>(__arg_pack...); }, __pack)); + return std::apply([](_P... __arg_pack) { return std::make_unique<_StorageType>(__arg_pack...); }, __pack); } __construct_by_args(_P&&... __args) : __pack(std::forward<_P>(__args)...) {} private: - std::tuple<_P...> __pack; + const std::tuple<_P...> __pack; }; template @@ -187,7 +187,7 @@ struct __thread_enumerable_storage template __thread_enumerable_storage(Args&&... __args) : __num_elements(0) { - __construct_helper = + __storage_factory = std::make_unique<__construct_by_args<_StorageType, Args...>>(std::forward(__args)...); _PSTL_PRAGMA(omp parallel) _PSTL_PRAGMA(omp single) { __thread_specific_storage.resize(omp_get_num_threads()); } @@ -205,10 +205,14 @@ struct __thread_enumerable_storage { assert(__i < size()); - std::uint32_t __count = 0; std::uint32_t __j = 0; - for (; __j < __thread_specific_storage.size() && __count <= __i; ++__j) + if (size() == __thread_specific_storage.size()) + { + return *__thread_specific_storage[__i]; + } + + for (std::uint32_t __count = 0; __j < __thread_specific_storage.size() && __count <= __i; ++__j) { // Only include storage from threads which have instantiated a storage object if (__thread_specific_storage[__j]) @@ -227,7 +231,7 @@ struct __thread_enumerable_storage if (!__thread_specific_storage[__i]) { // create temporary storage on first usage to avoid extra parallel region and unnecessary instantiation - __thread_specific_storage[__i] = __construct_helper->construct(); + __thread_specific_storage[__i] = __storage_factory->construct(); __num_elements.fetch_add(1); } return *__thread_specific_storage[__i]; @@ -235,7 +239,7 @@ struct __thread_enumerable_storage std::vector> __thread_specific_storage; std::atomic __num_elements; - std::unique_ptr<__construct_by_args_base<_StorageType>> __construct_helper; + std::unique_ptr<__construct_by_args_base<_StorageType>> __storage_factory; }; } // namespace __omp_backend From a590cacbeecacdbccd7ac0cf0009e58533574bfa Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Mon, 13 Jan 2025 09:56:55 -0500 Subject: [PATCH 42/50] address feedback Signed-off-by: Dan Hoeflinger --- include/oneapi/dpl/pstl/algorithm_impl.h | 10 +++++----- include/oneapi/dpl/pstl/parallel_backend_serial.h | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/include/oneapi/dpl/pstl/algorithm_impl.h b/include/oneapi/dpl/pstl/algorithm_impl.h index 77ba19c3e8e..e6637a4ff84 100644 --- a/include/oneapi/dpl/pstl/algorithm_impl.h +++ b/include/oneapi/dpl/pstl/algorithm_impl.h @@ -4289,7 +4289,7 @@ __pattern_shift_right(_Tag __tag, _ExecutionPolicy&& __exec, _BidirectionalItera return __res.base(); } -template +template void __brick_histogram(_ForwardIterator __first, _ForwardIterator __last, _IdxHashFunc __func, _RandomAccessIterator __histogram_first, _IsVector) noexcept @@ -4304,8 +4304,8 @@ __brick_histogram(_ForwardIterator __first, _ForwardIterator __last, _IdxHashFun } } -template +template void __pattern_histogram(_Tag, _ExecutionPolicy&& __exec, _ForwardIterator __first, _ForwardIterator __last, _Size __num_bins, _IdxHashFunc __func, _RandomAccessIterator __histogram_first) @@ -4317,8 +4317,8 @@ __pattern_histogram(_Tag, _ExecutionPolicy&& __exec, _ForwardIterator __first, _ __brick_histogram(__first, __last, __func, __histogram_first, typename _Tag::__is_vector{}); } -template +template void __pattern_histogram(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _RandomAccessIterator1 __first, _RandomAccessIterator1 __last, _Size __num_bins, _IdxHashFunc __func, diff --git a/include/oneapi/dpl/pstl/parallel_backend_serial.h b/include/oneapi/dpl/pstl/parallel_backend_serial.h index 01a67ec57d0..22820c818ba 100644 --- a/include/oneapi/dpl/pstl/parallel_backend_serial.h +++ b/include/oneapi/dpl/pstl/parallel_backend_serial.h @@ -63,7 +63,7 @@ struct __thread_enumerable_storage } _StorageType& - get_with_id(std::uint32_t __i) + get_with_id(std::uint32_t /*__i*/) { return get_for_current_thread(); } From dd35fc9f4442d4757946b6cee9784e4e57bb1094 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Mon, 13 Jan 2025 09:58:14 -0500 Subject: [PATCH 43/50] formatting Signed-off-by: Dan Hoeflinger --- include/oneapi/dpl/pstl/algorithm_impl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/algorithm_impl.h b/include/oneapi/dpl/pstl/algorithm_impl.h index e6637a4ff84..dcfbe0e0ad3 100644 --- a/include/oneapi/dpl/pstl/algorithm_impl.h +++ b/include/oneapi/dpl/pstl/algorithm_impl.h @@ -4317,8 +4317,8 @@ __pattern_histogram(_Tag, _ExecutionPolicy&& __exec, _ForwardIterator __first, _ __brick_histogram(__first, __last, __func, __histogram_first, typename _Tag::__is_vector{}); } -template +template void __pattern_histogram(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _RandomAccessIterator1 __first, _RandomAccessIterator1 __last, _Size __num_bins, _IdxHashFunc __func, From 455cf9c1a0445866f9dfb8e1dadf62ccef3a6bec Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Mon, 13 Jan 2025 10:03:21 -0500 Subject: [PATCH 44/50] Add comment about using `size()` Signed-off-by: Dan Hoeflinger --- include/oneapi/dpl/pstl/omp/util.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/oneapi/dpl/pstl/omp/util.h b/include/oneapi/dpl/pstl/omp/util.h index a202cd364e0..8dc9237ab1c 100644 --- a/include/oneapi/dpl/pstl/omp/util.h +++ b/include/oneapi/dpl/pstl/omp/util.h @@ -193,6 +193,9 @@ struct __thread_enumerable_storage _PSTL_PRAGMA(omp single) { __thread_specific_storage.resize(omp_get_num_threads()); } } + // Note: Size should not be used concurrantly with parallel loops which may instantiate storage objects, as it may + // not return an accurate count of instantiated storage objects in lockstep with the number allocated and stored. + // This is because the count is not atomic with the allocation and storage of the storage objects. std::uint32_t size() const { From 8b094fe1a4f7a3b8e5c1abfeffdc04dd06648cef Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Mon, 13 Jan 2025 11:20:38 -0500 Subject: [PATCH 45/50] fixing include errors Signed-off-by: Dan Hoeflinger --- include/oneapi/dpl/pstl/omp/util.h | 2 +- include/oneapi/dpl/pstl/parallel_backend_serial.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/omp/util.h b/include/oneapi/dpl/pstl/omp/util.h index 8dc9237ab1c..2d67c8f0f8c 100644 --- a/include/oneapi/dpl/pstl/omp/util.h +++ b/include/oneapi/dpl/pstl/omp/util.h @@ -20,12 +20,12 @@ #include #include #include +#include #include #include #include #include #include -#include #include #include "../parallel_backend_utils.h" diff --git a/include/oneapi/dpl/pstl/parallel_backend_serial.h b/include/oneapi/dpl/pstl/parallel_backend_serial.h index 22820c818ba..4c196a8aa4f 100644 --- a/include/oneapi/dpl/pstl/parallel_backend_serial.h +++ b/include/oneapi/dpl/pstl/parallel_backend_serial.h @@ -20,11 +20,11 @@ #include #include +#include #include #include #include #include - #include "parallel_backend_utils.h" namespace oneapi From 7e1463e24726e6185ee8d5418fa371c15adca084 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Mon, 13 Jan 2025 11:21:57 -0500 Subject: [PATCH 46/50] formatting Signed-off-by: Dan Hoeflinger --- include/oneapi/dpl/pstl/omp/util.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/omp/util.h b/include/oneapi/dpl/pstl/omp/util.h index 2d67c8f0f8c..c1871c46fc5 100644 --- a/include/oneapi/dpl/pstl/omp/util.h +++ b/include/oneapi/dpl/pstl/omp/util.h @@ -187,8 +187,7 @@ struct __thread_enumerable_storage template __thread_enumerable_storage(Args&&... __args) : __num_elements(0) { - __storage_factory = - std::make_unique<__construct_by_args<_StorageType, Args...>>(std::forward(__args)...); + __storage_factory = std::make_unique<__construct_by_args<_StorageType, Args...>>(std::forward(__args)...); _PSTL_PRAGMA(omp parallel) _PSTL_PRAGMA(omp single) { __thread_specific_storage.resize(omp_get_num_threads()); } } From d9d4f084fbecc65a089d007f3fe8e7ddbd2c5bd0 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Wed, 15 Jan 2025 08:17:58 -0500 Subject: [PATCH 47/50] adding const Signed-off-by: Dan Hoeflinger --- include/oneapi/dpl/pstl/algorithm_impl.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/oneapi/dpl/pstl/algorithm_impl.h b/include/oneapi/dpl/pstl/algorithm_impl.h index dcfbe0e0ad3..cf4f0f74a2c 100644 --- a/include/oneapi/dpl/pstl/algorithm_impl.h +++ b/include/oneapi/dpl/pstl/algorithm_impl.h @@ -4346,9 +4346,9 @@ __pattern_histogram(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _Rando __par_backend::__parallel_for( __backend_tag{}, std::forward<_ExecutionPolicy>(__exec), __histogram_first, __histogram_first + __num_bins, [__histogram_first, &__tls](auto __global_histogram_first, auto __global_histogram_last) { - _DiffType __local_n = __global_histogram_last - __global_histogram_first; - std::uint32_t __num_temporary_copies = __tls.size(); - _DiffType __range_begin_id = __global_histogram_first - __histogram_first; + const _DiffType __local_n = __global_histogram_last - __global_histogram_first; + const std::uint32_t __num_temporary_copies = __tls.size(); + const _DiffType __range_begin_id = __global_histogram_first - __histogram_first; //initialize output global histogram with first local histogram via assign __internal::__brick_walk2_n(__tls.get_with_id(0).begin() + __range_begin_id, __local_n, __global_histogram_first, oneapi::dpl::__internal::__pstl_assign(), From bebba5eb8c2de30509f1a48dae2c330cd3e661cd Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Wed, 15 Jan 2025 08:25:37 -0500 Subject: [PATCH 48/50] address feedback Signed-off-by: Dan Hoeflinger --- include/oneapi/dpl/pstl/algorithm_impl.h | 17 +++++++++-------- include/oneapi/dpl/pstl/omp/util.h | 2 +- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/include/oneapi/dpl/pstl/algorithm_impl.h b/include/oneapi/dpl/pstl/algorithm_impl.h index cf4f0f74a2c..35664926a65 100644 --- a/include/oneapi/dpl/pstl/algorithm_impl.h +++ b/include/oneapi/dpl/pstl/algorithm_impl.h @@ -4329,7 +4329,13 @@ __pattern_histogram(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _Rando using _DiffType = typename std::iterator_traits<_RandomAccessIterator2>::difference_type; _DiffType __n = __last - __first; - if (__n > 0) + if (__n <= 0) + { + // when n <= 0, we must fill the output histogram with zeros + __pattern_fill(__parallel_tag<_IsVector>{}, std::forward<_ExecutionPolicy>(__exec), __histogram_first, + __histogram_first + __num_bins, _HistogramValueT{0}); + } + else { __par_backend::__thread_enumerable_storage> __tls{__num_bins, _HistogramValueT{0}}; @@ -4343,11 +4349,11 @@ __pattern_histogram(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _Rando __tls.get_for_current_thread().begin(), _IsVector{}); }); // now accumulate temporary storage into output global histogram + const std::uint32_t __num_temporary_copies = __tls.size(); __par_backend::__parallel_for( __backend_tag{}, std::forward<_ExecutionPolicy>(__exec), __histogram_first, __histogram_first + __num_bins, - [__histogram_first, &__tls](auto __global_histogram_first, auto __global_histogram_last) { + [__num_temporary_copies, __histogram_first, &__tls](auto __global_histogram_first, auto __global_histogram_last) { const _DiffType __local_n = __global_histogram_last - __global_histogram_first; - const std::uint32_t __num_temporary_copies = __tls.size(); const _DiffType __range_begin_id = __global_histogram_first - __histogram_first; //initialize output global histogram with first local histogram via assign __internal::__brick_walk2_n(__tls.get_with_id(0).begin() + __range_begin_id, __local_n, @@ -4362,11 +4368,6 @@ __pattern_histogram(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _Rando } }); } - else - { - __pattern_fill(__parallel_tag<_IsVector>{}, std::forward<_ExecutionPolicy>(__exec), __histogram_first, - __histogram_first + __num_bins, _HistogramValueT{0}); - } } } // namespace __internal diff --git a/include/oneapi/dpl/pstl/omp/util.h b/include/oneapi/dpl/pstl/omp/util.h index c1871c46fc5..9310af1441e 100644 --- a/include/oneapi/dpl/pstl/omp/util.h +++ b/include/oneapi/dpl/pstl/omp/util.h @@ -192,7 +192,7 @@ struct __thread_enumerable_storage _PSTL_PRAGMA(omp single) { __thread_specific_storage.resize(omp_get_num_threads()); } } - // Note: Size should not be used concurrantly with parallel loops which may instantiate storage objects, as it may + // Note: Size should not be used concurrently with parallel loops which may instantiate storage objects, as it may // not return an accurate count of instantiated storage objects in lockstep with the number allocated and stored. // This is because the count is not atomic with the allocation and storage of the storage objects. std::uint32_t From d49460143ea317fc7df2c82e2e6f0cbd86226bf4 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Wed, 15 Jan 2025 08:37:45 -0500 Subject: [PATCH 49/50] address feedback Signed-off-by: Dan Hoeflinger --- include/oneapi/dpl/pstl/algorithm_impl.h | 4 ++-- include/oneapi/dpl/pstl/omp/util.h | 12 ++++++------ include/oneapi/dpl/pstl/parallel_backend_serial.h | 6 +++--- include/oneapi/dpl/pstl/parallel_backend_tbb.h | 4 ++-- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/include/oneapi/dpl/pstl/algorithm_impl.h b/include/oneapi/dpl/pstl/algorithm_impl.h index 35664926a65..6a217e994d0 100644 --- a/include/oneapi/dpl/pstl/algorithm_impl.h +++ b/include/oneapi/dpl/pstl/algorithm_impl.h @@ -4349,7 +4349,7 @@ __pattern_histogram(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _Rando __tls.get_for_current_thread().begin(), _IsVector{}); }); // now accumulate temporary storage into output global histogram - const std::uint32_t __num_temporary_copies = __tls.size(); + const std::size_t __num_temporary_copies = __tls.size(); __par_backend::__parallel_for( __backend_tag{}, std::forward<_ExecutionPolicy>(__exec), __histogram_first, __histogram_first + __num_bins, [__num_temporary_copies, __histogram_first, &__tls](auto __global_histogram_first, auto __global_histogram_last) { @@ -4359,7 +4359,7 @@ __pattern_histogram(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _Rando __internal::__brick_walk2_n(__tls.get_with_id(0).begin() + __range_begin_id, __local_n, __global_histogram_first, oneapi::dpl::__internal::__pstl_assign(), _IsVector{}); - for (std::uint32_t __i = 1; __i < __num_temporary_copies; ++__i) + for (std::size_t __i = 1; __i < __num_temporary_copies; ++__i) { //accumulate into output global histogram with other local histogram via += operator __internal::__brick_walk2_n( diff --git a/include/oneapi/dpl/pstl/omp/util.h b/include/oneapi/dpl/pstl/omp/util.h index 9310af1441e..04ce21889df 100644 --- a/include/oneapi/dpl/pstl/omp/util.h +++ b/include/oneapi/dpl/pstl/omp/util.h @@ -195,7 +195,7 @@ struct __thread_enumerable_storage // Note: Size should not be used concurrently with parallel loops which may instantiate storage objects, as it may // not return an accurate count of instantiated storage objects in lockstep with the number allocated and stored. // This is because the count is not atomic with the allocation and storage of the storage objects. - std::uint32_t + std::size_t size() const { // only count storage which has been instantiated @@ -203,18 +203,18 @@ struct __thread_enumerable_storage } _StorageType& - get_with_id(std::uint32_t __i) + get_with_id(std::size_t __i) { assert(__i < size()); - std::uint32_t __j = 0; + std::size_t __j = 0; if (size() == __thread_specific_storage.size()) { return *__thread_specific_storage[__i]; } - for (std::uint32_t __count = 0; __j < __thread_specific_storage.size() && __count <= __i; ++__j) + for (std::size_t __count = 0; __j < __thread_specific_storage.size() && __count <= __i; ++__j) { // Only include storage from threads which have instantiated a storage object if (__thread_specific_storage[__j]) @@ -229,7 +229,7 @@ struct __thread_enumerable_storage _StorageType& get_for_current_thread() { - std::uint32_t __i = omp_get_thread_num(); + std::size_t __i = omp_get_thread_num(); if (!__thread_specific_storage[__i]) { // create temporary storage on first usage to avoid extra parallel region and unnecessary instantiation @@ -240,7 +240,7 @@ struct __thread_enumerable_storage } std::vector> __thread_specific_storage; - std::atomic __num_elements; + std::atomic_size_t __num_elements; std::unique_ptr<__construct_by_args_base<_StorageType>> __storage_factory; }; diff --git a/include/oneapi/dpl/pstl/parallel_backend_serial.h b/include/oneapi/dpl/pstl/parallel_backend_serial.h index 4c196a8aa4f..7ba1b6982c1 100644 --- a/include/oneapi/dpl/pstl/parallel_backend_serial.h +++ b/include/oneapi/dpl/pstl/parallel_backend_serial.h @@ -50,10 +50,10 @@ struct __thread_enumerable_storage { } - std::uint32_t + std::size_t size() const { - return std::uint32_t{1}; + return std::size_t{1}; } _StorageType& @@ -63,7 +63,7 @@ struct __thread_enumerable_storage } _StorageType& - get_with_id(std::uint32_t /*__i*/) + get_with_id(std::size_t /*__i*/) { return get_for_current_thread(); } diff --git a/include/oneapi/dpl/pstl/parallel_backend_tbb.h b/include/oneapi/dpl/pstl/parallel_backend_tbb.h index 804a79c514e..4161a6a10eb 100644 --- a/include/oneapi/dpl/pstl/parallel_backend_tbb.h +++ b/include/oneapi/dpl/pstl/parallel_backend_tbb.h @@ -1315,7 +1315,7 @@ struct __thread_enumerable_storage { } - std::uint32_t + std::size_t size() const { return __thread_specific_storage.size(); @@ -1328,7 +1328,7 @@ struct __thread_enumerable_storage } _StorageType& - get_with_id(std::uint32_t __i) + get_with_id(std::size_t __i) { return __thread_specific_storage.begin()[__i]; } From 0a2847f89610933fd808309c4085e996b320aec2 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Wed, 15 Jan 2025 08:47:42 -0500 Subject: [PATCH 50/50] rename to __enumerable_thread_local_storage Signed-off-by: Dan Hoeflinger --- include/oneapi/dpl/pstl/algorithm_impl.h | 9 +++++---- include/oneapi/dpl/pstl/omp/util.h | 6 +++--- include/oneapi/dpl/pstl/parallel_backend_serial.h | 4 ++-- include/oneapi/dpl/pstl/parallel_backend_tbb.h | 4 ++-- 4 files changed, 12 insertions(+), 11 deletions(-) diff --git a/include/oneapi/dpl/pstl/algorithm_impl.h b/include/oneapi/dpl/pstl/algorithm_impl.h index 6a217e994d0..c6a03acf294 100644 --- a/include/oneapi/dpl/pstl/algorithm_impl.h +++ b/include/oneapi/dpl/pstl/algorithm_impl.h @@ -4333,12 +4333,12 @@ __pattern_histogram(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _Rando { // when n <= 0, we must fill the output histogram with zeros __pattern_fill(__parallel_tag<_IsVector>{}, std::forward<_ExecutionPolicy>(__exec), __histogram_first, - __histogram_first + __num_bins, _HistogramValueT{0}); + __histogram_first + __num_bins, _HistogramValueT{0}); } else { - __par_backend::__thread_enumerable_storage> __tls{__num_bins, - _HistogramValueT{0}}; + __par_backend::__enumerable_thread_local_storage> __tls{__num_bins, + _HistogramValueT{0}}; //main histogram loop //TODO: add defaulted grain-size option for __parallel_for and use larger one here to account for overhead @@ -4352,7 +4352,8 @@ __pattern_histogram(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _Rando const std::size_t __num_temporary_copies = __tls.size(); __par_backend::__parallel_for( __backend_tag{}, std::forward<_ExecutionPolicy>(__exec), __histogram_first, __histogram_first + __num_bins, - [__num_temporary_copies, __histogram_first, &__tls](auto __global_histogram_first, auto __global_histogram_last) { + [__num_temporary_copies, __histogram_first, &__tls](auto __global_histogram_first, + auto __global_histogram_last) { const _DiffType __local_n = __global_histogram_last - __global_histogram_first; const _DiffType __range_begin_id = __global_histogram_first - __histogram_first; //initialize output global histogram with first local histogram via assign diff --git a/include/oneapi/dpl/pstl/omp/util.h b/include/oneapi/dpl/pstl/omp/util.h index 04ce21889df..acb5a336fe4 100644 --- a/include/oneapi/dpl/pstl/omp/util.h +++ b/include/oneapi/dpl/pstl/omp/util.h @@ -155,7 +155,7 @@ __process_chunk(const __chunk_metrics& __metrics, _Iterator __base, _Index __chu __f(__first, __last); } -// abstract class to allow inclusion in __thread_enumerable_storage as member without requiring explicit template +// abstract class to allow inclusion in __enumerable_thread_local_storage as member without requiring explicit template // instantiation of param types template class __construct_by_args_base @@ -182,10 +182,10 @@ class __construct_by_args : public __construct_by_args_base<_StorageType> }; template -struct __thread_enumerable_storage +struct __enumerable_thread_local_storage { template - __thread_enumerable_storage(Args&&... __args) : __num_elements(0) + __enumerable_thread_local_storage(Args&&... __args) : __num_elements(0) { __storage_factory = std::make_unique<__construct_by_args<_StorageType, Args...>>(std::forward(__args)...); _PSTL_PRAGMA(omp parallel) diff --git a/include/oneapi/dpl/pstl/parallel_backend_serial.h b/include/oneapi/dpl/pstl/parallel_backend_serial.h index 7ba1b6982c1..6ee4787c040 100644 --- a/include/oneapi/dpl/pstl/parallel_backend_serial.h +++ b/include/oneapi/dpl/pstl/parallel_backend_serial.h @@ -43,10 +43,10 @@ __cancel_execution(oneapi::dpl::__internal::__serial_backend_tag) } template -struct __thread_enumerable_storage +struct __enumerable_thread_local_storage { template - __thread_enumerable_storage(Args&&... __args) : __storage(std::forward(__args)...) + __enumerable_thread_local_storage(Args&&... __args) : __storage(std::forward(__args)...) { } diff --git a/include/oneapi/dpl/pstl/parallel_backend_tbb.h b/include/oneapi/dpl/pstl/parallel_backend_tbb.h index 4161a6a10eb..9c6c4739710 100644 --- a/include/oneapi/dpl/pstl/parallel_backend_tbb.h +++ b/include/oneapi/dpl/pstl/parallel_backend_tbb.h @@ -1308,10 +1308,10 @@ __parallel_for_each(oneapi::dpl::__internal::__tbb_backend_tag, _ExecutionPolicy } template -struct __thread_enumerable_storage +struct __enumerable_thread_local_storage { template - __thread_enumerable_storage(Args&&... __args) : __thread_specific_storage(std::forward(__args)...) + __enumerable_thread_local_storage(Args&&... __args) : __thread_specific_storage(std::forward(__args)...) { }