
Host Implementation of Histogram APIs #1974

Open. Wants to merge 50 commits into base: main.

Changes from 46 commits. Commits (50):
7bcd21e
histogram CPU initial implementation
danhoeflinger Nov 8, 2024
955f0e2
atomics histogram implementation
danhoeflinger Nov 8, 2024
ebcada6
clang format
danhoeflinger Nov 8, 2024
93dd493
comment about atomics
danhoeflinger Nov 8, 2024
4d95044
first draft of atomic increment (unchecked)
danhoeflinger Nov 12, 2024
a7fe8a1
atomics include and fix builtin
danhoeflinger Nov 12, 2024
6be0b7b
large case
danhoeflinger Nov 12, 2024
9e3f16c
fix threshold check
danhoeflinger Nov 15, 2024
0592c15
minor improvements
danhoeflinger Nov 19, 2024
6ac87a0
MSVC fixes
danhoeflinger Nov 20, 2024
af5defb
parallelize initialization of openMP temp histograms
danhoeflinger Nov 20, 2024
65c30af
removing unnecessary type casting
danhoeflinger Dec 13, 2024
eec36d5
improving accumulation of local histograms (simd)
danhoeflinger Dec 13, 2024
1faece5
Properly using IsVector
danhoeflinger Dec 14, 2024
a39accd
typo fix
danhoeflinger Dec 16, 2024
2cd184d
special handling thread zero to use global mem
danhoeflinger Dec 16, 2024
b3dd2a0
cleanup
danhoeflinger Dec 16, 2024
bff45d9
atomic version removal
danhoeflinger Dec 16, 2024
2c8cc04
Revert "cleanup"
danhoeflinger Dec 16, 2024
790121b
Revert "special handling thread zero to use global mem"
danhoeflinger Dec 16, 2024
0392f23
comments and cleanup
danhoeflinger Dec 16, 2024
4556399
handling coarser grained parallelism
danhoeflinger Dec 16, 2024
2fdea34
undo-ing broken thread restriction in openMP
danhoeflinger Dec 16, 2024
a0c016a
lift pattern up to algorithm_impl level
danhoeflinger Dec 18, 2024
e87ba02
cleanup unnecessary code
danhoeflinger Dec 18, 2024
9f74810
further cleanup / formatting
danhoeflinger Dec 18, 2024
e0ed14c
add grain size todo
danhoeflinger Dec 20, 2024
db2a474
more general thread local storage
danhoeflinger Dec 20, 2024
240447e
implement omp on demand tls
danhoeflinger Dec 30, 2024
741f8e3
formatting
danhoeflinger Dec 30, 2024
fd310b6
formatting
danhoeflinger Dec 30, 2024
6820340
comments and clarity
danhoeflinger Dec 30, 2024
4798d2c
bugfix for serial impl
danhoeflinger Dec 30, 2024
ce8b55f
removing debugging output
danhoeflinger Dec 30, 2024
5d1f6e8
formatting
danhoeflinger Dec 30, 2024
6630018
comment adjustment
danhoeflinger Dec 30, 2024
83a9f81
minor naming / formatting
danhoeflinger Dec 30, 2024
a3f1304
formatting
danhoeflinger Dec 30, 2024
14cca58
Address review feedback
danhoeflinger Jan 6, 2025
766f42c
formatting
danhoeflinger Jan 6, 2025
6425d37
address review feedback
danhoeflinger Jan 13, 2025
a590cac
address feedback
danhoeflinger Jan 13, 2025
dd35fc9
formatting
danhoeflinger Jan 13, 2025
455cf9c
Add comment about using `size()`
danhoeflinger Jan 13, 2025
8b094fe
fixing include errors
danhoeflinger Jan 13, 2025
7e1463e
formatting
danhoeflinger Jan 13, 2025
d9d4f08
adding const
danhoeflinger Jan 15, 2025
bebba5e
address feedback
danhoeflinger Jan 15, 2025
d494601
address feedback
danhoeflinger Jan 15, 2025
0a2847f
rename to __enumerable_thread_local_storage
danhoeflinger Jan 15, 2025
4 changes: 2 additions & 2 deletions CMakeLists.txt
@@ -173,7 +173,7 @@ if (ONEDPL_BACKEND MATCHES "^(tbb|dpcpp|dpcpp_only)$")
set(SET_BACKEND_${ONEDPL_BACKEND_NAME} TRUE)

if (ONEDPL_BACKEND MATCHES "^(tbb|dpcpp)$")
- find_package(TBB 2021 REQUIRED tbb OPTIONAL_COMPONENTS tbbmalloc)
+ find_package(TBB 2021 REQUIRED tbb tbbmalloc)
message(STATUS "oneDPL uses oneTBB ${TBB_VERSION}")
target_link_libraries(oneDPL INTERFACE TBB::tbb)
endif()
@@ -336,7 +336,7 @@ elseif(ONEDPL_BACKEND MATCHES "^(omp)$")
if (OpenMP_CXX_FLAGS MATCHES ".*-fiopenmp.*")
set(_openmp_flag -fopenmp)
elseif (OpenMP_CXX_FLAGS MATCHES ".*[-/]Qiopenmp.*")
- set(_openmp_flag /Qopenmp)
+ set(_openmp_flag -Qopenmp)
endif()
if (_openmp_flag)
message(STATUS "Using ${_openmp_flag} for openMP")
80 changes: 80 additions & 0 deletions include/oneapi/dpl/pstl/algorithm_impl.h
@@ -4289,6 +4289,86 @@ __pattern_shift_right(_Tag __tag, _ExecutionPolicy&& __exec, _BidirectionalItera
return __res.base();
}

template <class _ForwardIterator, class _IdxHashFunc, class _RandomAccessIterator, class _IsVector>
void
__brick_histogram(_ForwardIterator __first, _ForwardIterator __last, _IdxHashFunc __func,
_RandomAccessIterator __histogram_first, _IsVector) noexcept
{
for (; __first != __last; ++__first)
{
std::int32_t __bin = __func.get_bin(*__first);
if (__bin >= 0)
Reviewer comment (Contributor):
1. Can get_bin return a negative value?
2. If yes, what is the correct behavior for __brick_histogram?

Author response:
Yes, -1 is returned when the input element does not fall within any histogram bin. The correct behavior is to do nothing and skip this input element. Per the specification: "Input values that do not map to a defined bin are skipped silently."

I recently looked into expanding the bin helper interface to include a separate function to check bounds and another to get the bin assuming it is in bounds. I thought this might reduce the number of branches by one, but I saw no performance benefit from this change on CPU or GPU. It is still something we could pursue in the future.
{
++__histogram_first[__bin];
}
}
}

template <class _Tag, class _ExecutionPolicy, class _ForwardIterator, class _Size, class _IdxHashFunc,
class _RandomAccessIterator>
void
__pattern_histogram(_Tag, _ExecutionPolicy&& __exec, _ForwardIterator __first, _ForwardIterator __last,
_Size __num_bins, _IdxHashFunc __func, _RandomAccessIterator __histogram_first)
{
using _HistogramValueT = typename std::iterator_traits<_RandomAccessIterator>::value_type;
static_assert(__is_serial_tag_v<_Tag> || __is_parallel_forward_tag_v<_Tag>);
__pattern_fill(_Tag{}, std::forward<_ExecutionPolicy>(__exec), __histogram_first, __histogram_first + __num_bins,
_HistogramValueT{0});
__brick_histogram(__first, __last, __func, __histogram_first, typename _Tag::__is_vector{});
}

template <class _IsVector, class _ExecutionPolicy, class _RandomAccessIterator1, class _Size, class _IdxHashFunc,
class _RandomAccessIterator2>
void
__pattern_histogram(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _RandomAccessIterator1 __first,
_RandomAccessIterator1 __last, _Size __num_bins, _IdxHashFunc __func,
_RandomAccessIterator2 __histogram_first)
{
using __backend_tag = typename __parallel_tag<_IsVector>::__backend_tag;
using _HistogramValueT = typename std::iterator_traits<_RandomAccessIterator2>::value_type;
using _DiffType = typename std::iterator_traits<_RandomAccessIterator2>::difference_type;

_DiffType __n = __last - __first;
if (__n > 0)
Reviewer comment (Contributor):
I don't insist, but it is common good practice to process the shorter branch of an if..else first; it has better readability. Also, with an immediate return from the shorter branch, the bigger branch needs no extra indentation:

if (n == 0)
{
    // shorter branch
    return;
}
// else: bigger branch, no indents
.....

Author response:
I rearranged the if and else, but did not return early or remove the braces for the else.
{
__par_backend::__thread_enumerable_storage<std::vector<_HistogramValueT>> __tls{__num_bins,
_HistogramValueT{0}};

//main histogram loop
//TODO: add defaulted grain-size option for __parallel_for and use larger one here to account for overhead
__par_backend::__parallel_for(
__backend_tag{}, __exec, __first, __last,
[__func, &__tls](_RandomAccessIterator1 __first_local, _RandomAccessIterator1 __last_local) {
__internal::__brick_histogram(__first_local, __last_local, __func,
__tls.get_for_current_thread().begin(), _IsVector{});
});
// now accumulate temporary storage into output global histogram
__par_backend::__parallel_for(
__backend_tag{}, std::forward<_ExecutionPolicy>(__exec), __histogram_first, __histogram_first + __num_bins,
[__histogram_first, &__tls](auto __global_histogram_first, auto __global_histogram_last) {
_DiffType __local_n = __global_histogram_last - __global_histogram_first;
std::uint32_t __num_temporary_copies = __tls.size();
_DiffType __range_begin_id = __global_histogram_first - __histogram_first;
//initialize output global histogram with first local histogram via assign
__internal::__brick_walk2_n(__tls.get_with_id(0).begin() + __range_begin_id, __local_n,
__global_histogram_first, oneapi::dpl::__internal::__pstl_assign(),
_IsVector{});
for (std::uint32_t __i = 1; __i < __num_temporary_copies; ++__i)
{
//accumulate into output global histogram with other local histogram via += operator
__internal::__brick_walk2_n(
__tls.get_with_id(__i).begin() + __range_begin_id, __local_n, __global_histogram_first,
[](_HistogramValueT __x, _HistogramValueT& __y) { __y += __x; }, _IsVector{});
}
});
}
else
{
__pattern_fill(__parallel_tag<_IsVector>{}, std::forward<_ExecutionPolicy>(__exec), __histogram_first,
__histogram_first + __num_bins, _HistogramValueT{0});
}
}

} // namespace __internal
} // namespace dpl
} // namespace oneapi
18 changes: 1 addition & 17 deletions include/oneapi/dpl/pstl/histogram_impl.h
@@ -19,6 +19,7 @@
#include "histogram_extension_defs.h"
#include "histogram_binhash_utils.h"
#include "iterator_impl.h"
#include "algorithm_impl.h"

#if _ONEDPL_HETERO_BACKEND
# include "hetero/histogram_impl_hetero.h"
@@ -29,23 +30,6 @@ namespace oneapi
namespace dpl
{

namespace __internal
{

template <class _Tag, typename _ExecutionPolicy, typename _RandomAccessIterator1, typename _Size, typename _IdxHashFunc,
typename _RandomAccessIterator2>
void
__pattern_histogram(_Tag, _ExecutionPolicy&& exec, _RandomAccessIterator1 __first, _RandomAccessIterator1 __last,
_Size __num_bins, _IdxHashFunc __func, _RandomAccessIterator2 __histogram_first)
{
static_assert(__is_serial_tag_v<_Tag> || __is_parallel_forward_tag_v<_Tag>);

static_assert(sizeof(_Size) == 0 /*false*/,
"Histogram API is currently unsupported for policies other than device execution policies");
}

} // namespace __internal

template <typename _ExecutionPolicy, typename _RandomAccessIterator1, typename _Size, typename _ValueType,
typename _RandomAccessIterator2>
oneapi::dpl::__internal::__enable_if_execution_policy<_ExecutionPolicy, _RandomAccessIterator2>
91 changes: 91 additions & 0 deletions include/oneapi/dpl/pstl/omp/util.h
@@ -20,11 +20,13 @@
#include <atomic>
#include <iterator>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <memory>
#include <vector>
#include <type_traits>
#include <omp.h>
#include <tuple>

#include "../parallel_backend_utils.h"
#include "../unseq_backend_simd.h"
@@ -153,6 +155,95 @@ __process_chunk(const __chunk_metrics& __metrics, _Iterator __base, _Index __chu
__f(__first, __last);
}

// abstract class to allow inclusion in __thread_enumerable_storage as member without requiring explicit template
// instantiation of param types
template <typename _StorageType>
class __construct_by_args_base
{
public:
virtual ~__construct_by_args_base() = default;
virtual std::unique_ptr<_StorageType> construct() = 0;
};

// Helper class to allow construction of _StorageType from a stored argument pack
template <typename _StorageType, typename... _P>
class __construct_by_args : public __construct_by_args_base<_StorageType>
{
public:
std::unique_ptr<_StorageType>
construct() override
{
return std::apply([](_P... __arg_pack) { return std::make_unique<_StorageType>(__arg_pack...); }, __pack);
}
__construct_by_args(_P&&... __args) : __pack(std::forward<_P>(__args)...) {}

private:
const std::tuple<_P...> __pack;
};

template <typename _StorageType>
struct __thread_enumerable_storage
{
template <typename... Args>
__thread_enumerable_storage(Args&&... __args) : __num_elements(0)
{
__storage_factory = std::make_unique<__construct_by_args<_StorageType, Args...>>(std::forward<Args>(__args)...);
_PSTL_PRAGMA(omp parallel)
_PSTL_PRAGMA(omp single) { __thread_specific_storage.resize(omp_get_num_threads()); }
}

// Note: Size should not be used concurrently with parallel loops which may instantiate storage objects, as it may
// not return an accurate count of instantiated storage objects in lockstep with the number allocated and stored.
// This is because the count is not atomic with the allocation and storage of the storage objects.
std::uint32_t
size() const
{
// only count storage which has been instantiated
return __num_elements.load();
}

_StorageType&
get_with_id(std::uint32_t __i)
{
assert(__i < size());

std::uint32_t __j = 0;

if (size() == __thread_specific_storage.size())
{
return *__thread_specific_storage[__i];
}

for (std::uint32_t __count = 0; __j < __thread_specific_storage.size() && __count <= __i; ++__j)
{
// Only include storage from threads which have instantiated a storage object
if (__thread_specific_storage[__j])
{
__count++;
}
}
// Need to back up one once we have found a valid storage object
return *__thread_specific_storage[__j - 1];
}

_StorageType&
get_for_current_thread()
{
std::uint32_t __i = omp_get_thread_num();
if (!__thread_specific_storage[__i])
{
// create temporary storage on first usage to avoid extra parallel region and unnecessary instantiation
__thread_specific_storage[__i] = __storage_factory->construct();
__num_elements.fetch_add(1);
}
return *__thread_specific_storage[__i];
}

std::vector<std::unique_ptr<_StorageType>> __thread_specific_storage;
std::atomic<std::uint32_t> __num_elements;
std::unique_ptr<__construct_by_args_base<_StorageType>> __storage_factory;
};

} // namespace __omp_backend
} // namespace dpl
} // namespace oneapi
31 changes: 30 additions & 1 deletion include/oneapi/dpl/pstl/parallel_backend_serial.h
@@ -20,11 +20,11 @@

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <memory>
#include <numeric>
#include <utility>
#include <type_traits>

#include "parallel_backend_utils.h"

namespace oneapi
@@ -42,6 +42,35 @@ __cancel_execution(oneapi::dpl::__internal::__serial_backend_tag)
{
}

template <typename _StorageType>
struct __thread_enumerable_storage
{
template <typename... Args>
__thread_enumerable_storage(Args&&... __args) : __storage(std::forward<Args>(__args)...)
{
}

std::uint32_t
size() const
{
return std::uint32_t{1};
}

_StorageType&
get_for_current_thread()
{
return __storage;
}

_StorageType&
get_with_id(std::uint32_t /*__i*/)
{
return get_for_current_thread();
}

_StorageType __storage;
};

template <class _ExecutionPolicy, class _Index, class _Fp>
void
__parallel_for(oneapi::dpl::__internal::__serial_backend_tag, _ExecutionPolicy&&, _Index __first, _Index __last,
30 changes: 30 additions & 0 deletions include/oneapi/dpl/pstl/parallel_backend_tbb.h
@@ -34,6 +34,7 @@
#include <tbb/parallel_invoke.h>
#include <tbb/task_arena.h>
#include <tbb/tbb_allocator.h>
#include <tbb/enumerable_thread_specific.h>
#if TBB_INTERFACE_VERSION > 12000
# include <tbb/task.h>
#endif
@@ -1306,6 +1307,35 @@ __parallel_for_each(oneapi::dpl::__internal::__tbb_backend_tag, _ExecutionPolicy
tbb::this_task_arena::isolate([&]() { tbb::parallel_for_each(__begin, __end, __f); });
}

template <typename _StorageType>
struct __thread_enumerable_storage
{
template <typename... Args>
__thread_enumerable_storage(Args&&... __args) : __thread_specific_storage(std::forward<Args>(__args)...)
{
}

std::uint32_t
size() const
{
return __thread_specific_storage.size();
}

_StorageType&
get_for_current_thread()
{
return __thread_specific_storage.local();
}

_StorageType&
get_with_id(std::uint32_t __i)
{
return __thread_specific_storage.begin()[__i];
}

tbb::enumerable_thread_specific<_StorageType> __thread_specific_storage;
};

} // namespace __tbb_backend
} // namespace dpl
} // namespace oneapi