Skip to content
This repository has been archived by the owner on Jan 13, 2025. It is now read-only.

Commit

Permalink
Moving memory alloc type into helper function and addressing other PR
Browse files Browse the repository at this point in the history
comments

Addressing PR comments on the memory allocation type checker function and
updating its comment.
Fixing documentation.

Signed-off-by: nscipione <nicolo.scipione@codeplay.com>
  • Loading branch information
s-Nick committed Apr 10, 2024
1 parent 550def0 commit 3568b48
Show file tree
Hide file tree
Showing 3 changed files with 45 additions and 57 deletions.
10 changes: 10 additions & 0 deletions include/portblas_helper.h
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,16 @@ inline cl::sycl::event fill(cl::sycl::queue q, element_t *buff, element_t value,
}
#endif

/**
 * @brief Check whether a container refers to USM shared (managed) memory.
 *
 * @tparam sb_handle_t Handle type exposing get_queue() for context access.
 * @tparam containerT  Either a raw USM pointer or a buffer-style container.
 * @param sb_handle Handle whose queue's context is used for the pointer query.
 * @param _rs       Container/pointer to classify.
 * @return true only when @p _rs is a pointer allocated with malloc_shared;
 *         buffer-based containers always yield false.
 */
template <typename sb_handle_t, typename containerT>
inline bool is_malloc_shared(sb_handle_t &sb_handle, const containerT _rs) {
  if constexpr (!std::is_pointer_v<containerT>) {
    // Buffer-based containers are never USM allocations.
    return false;
  } else {
    // Query the allocation kind of the pointer within this handle's context.
    const auto alloc_kind =
        sycl::get_pointer_type(_rs, sb_handle.get_queue().get_context());
    return alloc_kind == sycl::usm::alloc::shared;
  }
}

} // end namespace helper
} // end namespace blas
#endif // PORTBLAS_HELPER_H
40 changes: 16 additions & 24 deletions src/interface/blas1/backend/amd_gpu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#ifndef PORTBLAS_ASUM_AMD_GPU_BACKEND_HPP
#define PORTBLAS_ASUM_AMD_GPU_BACKEND_HPP
#include "interface/blas1_interface.h"
#include "portblas_helper.h"

namespace blas {
namespace asum {
Expand All @@ -38,18 +39,11 @@ typename sb_handle_t::event_t _asum(
* This compile time check is absolutely necessary for AMD GPUs.
* AMD's atomic operations require a specific combination of hardware that
* cannot be checked nor enforced. Since the reduction operator kernel
* implementation uses atomic operations, without that particular hardware combination
* the reduction may silently fail. This check enforces a different atomic
* address space causing a big performance degradation, but also making the kernel
* behave correctly with managed memory (aka malloc_shared allocation).
* implementation uses atomic operations, without that particular hardware
* combination the reduction may silently fail.
**/
#ifdef SB_ENABLE_USM
bool usm_managed_mem{false};
if constexpr (std::is_pointer_v<decltype(_rs)>) {
usm_managed_mem =
sycl::usm::alloc::shared ==
sycl::get_pointer_type(_rs, sb_handle.get_queue().get_context());
}
const bool usm_managed_mem = blas::helper::is_malloc_shared(sb_handle, _rs);
#else
constexpr bool usm_managed_mem{false};
#endif
Expand Down Expand Up @@ -135,15 +129,14 @@ typename sb_handle_t::event_t _nrm2(
sb_handle_t& sb_handle, index_t _N, container_0_t _vx, increment_t _incx,
container_1_t _rs, const typename sb_handle_t::event_t& _dependencies) {
/**
* Read comment in _asum above.
* This compile time check is absolutely necessary for AMD GPUs.
* AMD's atomic operations require a specific combination of hardware that
* cannot be checked nor enforced. Since the reduction operator kernel
* implementation uses atomic operations, without that particular hardware
* combination the reduction may silently fail.
**/
#ifdef SB_ENABLE_USM
bool usm_managed_mem{false};
if constexpr (std::is_pointer_v<decltype(_rs)>) {
usm_managed_mem =
sycl::usm::alloc::shared ==
sycl::get_pointer_type(_rs, sb_handle.get_queue().get_context());
}
const bool usm_managed_mem = blas::helper::is_malloc_shared(sb_handle, _rs);
#else
constexpr bool usm_managed_mem{false};
#endif
Expand Down Expand Up @@ -185,15 +178,14 @@ typename sb_handle_t::event_t _dot(
container_1_t _vy, increment_t _incy, container_2_t _rs,
const typename sb_handle_t::event_t& _dependencies) {
/**
* Read comment in _asum above.
* This compile time check is absolutely necessary for AMD GPUs.
* AMD's atomic operations require a specific combination of hardware that
* cannot be checked nor enforced. Since the reduction operator kernel
* implementation uses atomic operations, without that particular hardware
* combination the reduction may silently fail.
**/
#ifdef SB_ENABLE_USM
bool usm_managed_mem{false};
if constexpr (std::is_pointer_v<decltype(_rs)>) {
usm_managed_mem =
sycl::usm::alloc::shared ==
sycl::get_pointer_type(_rs, sb_handle.get_queue().get_context());
}
const bool usm_managed_mem = blas::helper::is_malloc_shared(sb_handle, _rs);
#else
constexpr bool usm_managed_mem{false};
#endif
Expand Down
52 changes: 19 additions & 33 deletions src/interface/blas1_interface.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -226,19 +226,14 @@ typename sb_handle_t::event_t _asum(
* implementation use a kernel implementation which doesn't
* require local memory.
* @tparam usmManagedMem Specifies if usm memory allocation is automatically
* managed or not. The memory automatically managed
* managed or not. Automatically managed memory
* requires that atomic address space is set to generic.
* This is a strict requirement only for AMD gpus, since
* otherwise it will rely on pcie atomics
* which we cannot enforce, guarantee or check due to its
* hardware nature. Other targets do not have the same
* This is a strict requirement only for AMD GPUs, since
* AMD's implementation of atomics may depend on specific
* hardware configurations (PCIe atomics) that cannot be
* checked at runtime. Other targets do not have the same
* strong dependency and managed memory is handled
* correctly in any case by default. It is automatically
* initialized to false to reduce verbosity of
* initialization for many targets since only one of them,
* with specific allocation type, requires a different
* value. Having a default value allows the compiler to
* handle automatically other templates.
* correctly by default.
*/
template <int localSize, int localMemSize, bool usmManagedMem,
typename sb_handle_t, typename container_0_t, typename container_1_t,
Expand Down Expand Up @@ -562,19 +557,14 @@ typename sb_handle_t::event_t _nrm2(
* implementation use a kernel implementation which doesn't
* require local memory.
* @tparam usmManagedMem Specifies if usm memory allocation is automatically
* managed or not. The memory automatically managed
* managed or not. Automatically managed memory
* requires that atomic address space is set to generic.
* This is a strict requirement only for AMD gpus, since
* otherwise it will rely on pcie atomics
* which we cannot enforce, guarantee or check due to its
* hardware nature. Other targets do not have the same
* This is a strict requirement only for AMD GPUs, since
* AMD's implementation of atomics may depend on specific
* hardware configurations (PCIe atomics) that cannot be
* checked at runtime. Other targets do not have the same
* strong dependency and managed memory is handled
* correctly in any case by default. It is automatically
* initialized to false to reduce verbosity of
* initialization for many targets since only one of them,
* with specific allocation type, requires a different
* value. Having a default value allows the compiler to
* handle automatically other templates.
* correctly by default.
*/
template <int localSize, int localMemSize, bool usmManagedMem,
typename sb_handle_t, typename container_0_t, typename container_1_t,
Expand Down Expand Up @@ -625,19 +615,15 @@ typename sb_handle_t::event_t _nrm2_impl(
* implementation use a kernel implementation which doesn't
* require local memory.
* @tparam usmManagedMem Specifies if usm memory allocation is automatically
* managed or not. The memory automatically managed
* managed or not. Automatically managed memory
* requires that atomic address space is set to generic.
* This is a strict requirement only for AMD gpus, since
* otherwise it will rely on pcie atomics
* which we cannot enforce, guarantee or check due to its
* hardware nature. Other targets do not have the same
* This is a strict requirement only for AMD GPUs, since
* AMD's implementation of atomics may depend on specific
* hardware configurations (PCIe atomics) that cannot be
* checked at runtime. Other targets do not have the same
* strong dependency and managed memory is handled
* correctly in any case by default. It is automatically
* initialized to false to reduce verbosity of
* initialization for many targets since only one of them,
* with specific allocation type, requires a different
* value. Having a default value allows the compiler to
* handle automatically other templates.
* correctly by default.
*/
template <int localSize, int localMemSize, bool usmManagedMem,
typename sb_handle_t, typename container_0_t, typename container_1_t,
Expand Down

0 comments on commit 3568b48

Please sign in to comment.