diff --git a/include/portblas_helper.h b/include/portblas_helper.h
index 4ef250829..a07ba745a 100644
--- a/include/portblas_helper.h
+++ b/include/portblas_helper.h
@@ -220,6 +220,16 @@ inline cl::sycl::event fill(cl::sycl::queue q, element_t *buff, element_t value,
 }
 #endif
 
+template <typename sb_handle_t, typename containerT>
+inline bool is_malloc_shared(sb_handle_t &sb_handle, const containerT _rs) {
+  if constexpr (std::is_pointer_v<containerT>) {
+    return sycl::usm::alloc::shared ==
+           sycl::get_pointer_type(_rs, sb_handle.get_queue().get_context());
+  } else {
+    return false;
+  }
+}
+
 }  // end namespace helper
 }  // end namespace blas
 #endif  // PORTBLAS_HELPER_H
diff --git a/src/interface/blas1/backend/amd_gpu.hpp b/src/interface/blas1/backend/amd_gpu.hpp
index 3a1409b8d..f61bc6e8b 100644
--- a/src/interface/blas1/backend/amd_gpu.hpp
+++ b/src/interface/blas1/backend/amd_gpu.hpp
@@ -25,6 +25,7 @@
 #ifndef PORTBLAS_ASUM_AMD_GPU_BACKEND_HPP
 #define PORTBLAS_ASUM_AMD_GPU_BACKEND_HPP
 #include "interface/blas1_interface.h"
+#include "portblas_helper.h"
 
 namespace blas {
 namespace asum {
@@ -38,18 +39,11 @@ typename sb_handle_t::event_t _asum(
   /**
    * This compile time check is absolutely necessary for AMD GPUs.
    * AMD's atomic operations require a specific combination of hardware that
    * cannot be checked nor enforced. Since the reduction operator kernel
-   * implementation uses atomic operations, without that particular hardware combination
-   * the reduction may silently fail. This check enforces a different atomic
-   * address space causing a big performance degradation, but also making the kernel
-   * behave correctly with managed memory (aka malloc_shared allocation).
+   * implementation uses atomic operations, without that particular hardware
+   * combination the reduction may silently fail.
   **/
 #ifdef SB_ENABLE_USM
-  bool usm_managed_mem{false};
-  if constexpr (std::is_pointer_v<container_1_t>) {
-    usm_managed_mem =
-        sycl::usm::alloc::shared ==
-        sycl::get_pointer_type(_rs, sb_handle.get_queue().get_context());
-  }
+  const bool usm_managed_mem = blas::helper::is_malloc_shared(sb_handle, _rs);
 #else
   constexpr bool usm_managed_mem{false};
 #endif
@@ -135,15 +129,14 @@ typename sb_handle_t::event_t _nrm2(
     sb_handle_t& sb_handle, index_t _N, container_0_t _vx, increment_t _incx,
     container_1_t _rs, const typename sb_handle_t::event_t& _dependencies) {
   /**
-   * Read comment in _asum above.
+   * This compile time check is absolutely necessary for AMD GPUs.
+   * AMD's atomic operations require a specific combination of hardware that
+   * cannot be checked nor enforced. Since the reduction operator kernel
+   * implementation uses atomic operations, without that particular hardware
+   * combination the reduction may silently fail.
   **/
 #ifdef SB_ENABLE_USM
-  bool usm_managed_mem{false};
-  if constexpr (std::is_pointer_v<container_1_t>) {
-    usm_managed_mem =
-        sycl::usm::alloc::shared ==
-        sycl::get_pointer_type(_rs, sb_handle.get_queue().get_context());
-  }
+  const bool usm_managed_mem = blas::helper::is_malloc_shared(sb_handle, _rs);
 #else
   constexpr bool usm_managed_mem{false};
 #endif
@@ -185,15 +178,14 @@ typename sb_handle_t::event_t _dot(
     container_1_t _vy, increment_t _incy, container_2_t _rs,
     const typename sb_handle_t::event_t& _dependencies) {
   /**
-   * Read comment in _asum above.
+   * This compile time check is absolutely necessary for AMD GPUs.
+   * AMD's atomic operations require a specific combination of hardware that
+   * cannot be checked nor enforced. Since the reduction operator kernel
+   * implementation uses atomic operations, without that particular hardware
+   * combination the reduction may silently fail.
   **/
 #ifdef SB_ENABLE_USM
-  bool usm_managed_mem{false};
-  if constexpr (std::is_pointer_v<container_2_t>) {
-    usm_managed_mem =
-        sycl::usm::alloc::shared ==
-        sycl::get_pointer_type(_rs, sb_handle.get_queue().get_context());
-  }
+  const bool usm_managed_mem = blas::helper::is_malloc_shared(sb_handle, _rs);
 #else
   constexpr bool usm_managed_mem{false};
 #endif
diff --git a/src/interface/blas1_interface.hpp b/src/interface/blas1_interface.hpp
index 7c8180aa5..e92027823 100644
--- a/src/interface/blas1_interface.hpp
+++ b/src/interface/blas1_interface.hpp
@@ -226,19 +226,14 @@ typename sb_handle_t::event_t _asum(
  * implementation use a kernel implementation which doesn't
  * require local memory.
  * @tparam usmManagedMem Specifies if usm memory allocation is automatically
- * managed or not. The memory automatically managed
+ * managed or not. Automatically managed memory
  * requires that atomic address space is set to generic.
- * This is a strict requirement only for AMD gpus, since
- * otherwise it will rely on pcie atomics
- * which we cannot enforce, guarantee or check due to its
- * hardware nature. Other targets do not have the same
+ * This is a strict requirement only for AMD GPUs, since
+ * AMD's implementation of atomics may depend on specific
+ * hardware configurations (PCIe atomics) that cannot be
+ * checked at runtime. Other targets do not have the same
  * strong dependency and managed memory is handled
- * correctly in any case by default. It is automatically
- * initialized to false to reduce verbosity of
- * initialization for many targets since only one of them,
- * with specific allocation type, requires a different
- * value. Having a default value allows the compiler to
- * handle automatically other templates.
+ * correctly by default.
  */
 template
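
For reference, a minimal usage sketch of the new blas::helper::is_malloc_shared helper (not part of the patch). It assumes a SYCL 2020 compiler and the portBLAS SB_Handle/queue setup used in the project samples; the "portblas.h" include name and the handle construction are illustrative and may differ from the actual tree.

// Usage sketch: exercising blas::helper::is_malloc_shared with the three
// allocation kinds it can see. Only the helper call itself comes from the
// patch above; everything else here is illustrative setup.
#include <sycl/sycl.hpp>

#include "portblas.h"  // assumed to pull in portblas_helper.h

int main() {
  sycl::queue q;
  blas::SB_Handle sb_handle(q);

  // malloc_shared pointer: get_pointer_type reports alloc::shared -> true.
  float* shared_rs = sycl::malloc_shared<float>(1, q);
  const bool from_shared = blas::helper::is_malloc_shared(sb_handle, shared_rs);

  // malloc_device pointer: still a pointer, but not alloc::shared -> false.
  float* device_rs = sycl::malloc_device<float>(1, q);
  const bool from_device = blas::helper::is_malloc_shared(sb_handle, device_rs);

  // Non-pointer container (e.g. a buffer): the constexpr branch returns false
  // without ever calling get_pointer_type.
  sycl::buffer<float, 1> buffer_rs{sycl::range<1>(1)};
  const bool from_buffer = blas::helper::is_malloc_shared(sb_handle, buffer_rs);

  sycl::free(shared_rs, q);
  sycl::free(device_rs, q);
  return (from_shared && !from_device && !from_buffer) ? 0 : 1;
}

This mirrors how the AMD GPU backends above use the helper: the result simply selects the usm_managed_mem template flag, so only malloc_shared outputs pay for the generic atomic address space.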