Skip to content
This repository has been archived by the owner on Jan 13, 2025. It is now read-only.

Commit

Permalink
Moving memory alloc type into helper function and addressing other PR
Browse files Browse the repository at this point in the history
comments

Addressing PR comments on the memory allocation type checker function and
updating its comment.
Fixing documentation.

Signed-off-by: nscipione <nicolo.scipione@codeplay.com>
  • Loading branch information
s-Nick committed Apr 10, 2024
1 parent 550def0 commit 3568b48
Show file tree
Hide file tree
Showing 3 changed files with 45 additions and 57 deletions.
10 changes: 10 additions & 0 deletions include/portblas_helper.h
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,16 @@ inline cl::sycl::event fill(cl::sycl::queue q, element_t *buff, element_t value,
}
#endif

/**
 * @brief Check whether a container refers to USM shared (managed) memory.
 *
 * @tparam sb_handle_t Handle type exposing get_queue() for context access.
 * @tparam containerT  Either a raw USM pointer or a buffer-style container.
 * @param sb_handle Handle whose queue's context is used for the pointer query.
 * @param _rs       Container/pointer to classify.
 * @return true only when @p _rs is a pointer allocated with malloc_shared;
 *         buffer-based containers always yield false.
 */
template <typename sb_handle_t, typename containerT>
inline bool is_malloc_shared(sb_handle_t &sb_handle, const containerT _rs) {
  if constexpr (!std::is_pointer_v<containerT>) {
    // Buffer-based containers are never USM allocations.
    return false;
  } else {
    // Query the allocation kind of the pointer within this handle's context.
    const auto alloc_kind =
        sycl::get_pointer_type(_rs, sb_handle.get_queue().get_context());
    return alloc_kind == sycl::usm::alloc::shared;
  }
}

} // end namespace helper
} // end namespace blas
#endif // PORTBLAS_HELPER_H
40 changes: 16 additions & 24 deletions src/interface/blas1/backend/amd_gpu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#ifndef PORTBLAS_ASUM_AMD_GPU_BACKEND_HPP
#define PORTBLAS_ASUM_AMD_GPU_BACKEND_HPP
#include "interface/blas1_interface.h"
#include "portblas_helper.h"

namespace blas {
namespace asum {
Expand All @@ -38,18 +39,11 @@ typename sb_handle_t::event_t _asum(
* This compile time check is absolutely necessary for AMD GPUs.
* AMD's atomic operations require a specific combination of hardware that
* cannot be checked nor enforced. Since the reduction operator kernel
* implementation uses atomic operations, without that particular hardware combination
* the reduction may silently fail. This check enforces a different atomic
* address space causing a big performance degradation, but also making the kernel
* behave correctly with managed memory (aka malloc_shared allocation).
* implementation uses atomic operations, without that particular hardware
* combination the reduction may silently fail.
**/
#ifdef SB_ENABLE_USM
bool usm_managed_mem{false};
if constexpr (std::is_pointer_v<decltype(_rs)>) {
usm_managed_mem =
sycl::usm::alloc::shared ==
sycl::get_pointer_type(_rs, sb_handle.get_queue().get_context());
}
const bool usm_managed_mem = blas::helper::is_malloc_shared(sb_handle, _rs);
#else
constexpr bool usm_managed_mem{false};
#endif
Expand Down Expand Up @@ -135,15 +129,14 @@ typename sb_handle_t::event_t _nrm2(
sb_handle_t& sb_handle, index_t _N, container_0_t _vx, increment_t _incx,
container_1_t _rs, const typename sb_handle_t::event_t& _dependencies) {
/**
* Read comment in _asum above.
* This compile time check is absolutely necessary for AMD GPUs.
* AMD's atomic operations require a specific combination of hardware that
* cannot be checked nor enforced. Since the reduction operator kernel
* implementation uses atomic operations, without that particular hardware
* combination the reduction may silently fail.
**/
#ifdef SB_ENABLE_USM
bool usm_managed_mem{false};
if constexpr (std::is_pointer_v<decltype(_rs)>) {
usm_managed_mem =
sycl::usm::alloc::shared ==
sycl::get_pointer_type(_rs, sb_handle.get_queue().get_context());
}
const bool usm_managed_mem = blas::helper::is_malloc_shared(sb_handle, _rs);
#else
constexpr bool usm_managed_mem{false};
#endif
Expand Down Expand Up @@ -185,15 +178,14 @@ typename sb_handle_t::event_t _dot(
container_1_t _vy, increment_t _incy, container_2_t _rs,
const typename sb_handle_t::event_t& _dependencies) {
/**
* Read comment in _asum above.
* This compile time check is absolutely necessary for AMD GPUs.
* AMD's atomic operations require a specific combination of hardware that
* cannot be checked nor enforced. Since the reduction operator kernel
* implementation uses atomic operations, without that particular hardware
* combination the reduction may silently fail.
**/
#ifdef SB_ENABLE_USM
bool usm_managed_mem{false};
if constexpr (std::is_pointer_v<decltype(_rs)>) {
usm_managed_mem =
sycl::usm::alloc::shared ==
sycl::get_pointer_type(_rs, sb_handle.get_queue().get_context());
}
const bool usm_managed_mem = blas::helper::is_malloc_shared(sb_handle, _rs);
#else
constexpr bool usm_managed_mem{false};
#endif
Expand Down
52 changes: 19 additions & 33 deletions src/interface/blas1_interface.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -226,19 +226,14 @@ typename sb_handle_t::event_t _asum(
* implementation use a kernel implementation which doesn't
* require local memory.
* @tparam usmManagedMem Specifies if usm memory allocation is automatically
* managed or not. The memory automatically managed
* managed or not. Automatically managed memory
* requires that atomic address space is set to generic.
* This is a strict requirement only for AMD gpus, since
* otherwise it will rely on pcie atomics
* which we cannot enforce, guarantee or check due to its
* hardware nature. Other targets do not have the same
* This is a strict requirement only for AMD GPUs, since
* AMD's implementation of atomics may depend on specific
* hardware configurations (PCIe atomics) that cannot be
* checked at runtime. Other targets do not have the same
* strong dependency and managed memory is handled
* correctly in any case by default. It is automatically
* initialized to false to reduce verbosity of
* initialization for many targets since only one of them,
* with specific allocation type, requires a different
* value. Having a default value allows the compiler to
* handle automatically other templates.
* correctly by default.
*/
template <int localSize, int localMemSize, bool usmManagedMem,
typename sb_handle_t, typename container_0_t, typename container_1_t,
Expand Down Expand Up @@ -562,19 +557,14 @@ typename sb_handle_t::event_t _nrm2(
* implementation use a kernel implementation which doesn't
* require local memory.
* @tparam usmManagedMem Specifies if usm memory allocation is automatically
* managed or not. The memory automatically managed
* managed or not. Automatically managed memory
* requires that atomic address space is set to generic.
* This is a strict requirement only for AMD gpus, since
* otherwise it will rely on pcie atomics
* which we cannot enforce, guarantee or check due to its
* hardware nature. Other targets do not have the same
* This is a strict requirement only for AMD GPUs, since
* AMD's implementation of atomics may depend on specific
* hardware configurations (PCIe atomics) that cannot be
* checked at runtime. Other targets do not have the same
* strong dependency and managed memory is handled
* correctly in any case by default. It is automatically
* initialized to false to reduce verbosity of
* initialization for many targets since only one of them,
* with specific allocation type, requires a different
* value. Having a default value allows the compiler to
* handle automatically other templates.
* correctly by default.
*/
template <int localSize, int localMemSize, bool usmManagedMem,
typename sb_handle_t, typename container_0_t, typename container_1_t,
Expand Down Expand Up @@ -625,19 +615,15 @@ typename sb_handle_t::event_t _nrm2_impl(
* implementation use a kernel implementation which doesn't
* require local memory.
* @tparam usmManagedMem Specifies if usm memory allocation is automatically
* managed or not. The memory automatically managed
* managed or not. Automatically managed memory
* requires that atomic address space is set to generic.
* This is a strict requirement only for AMD gpus, since
* otherwise it will rely on pcie atomics
* which we cannot enforce, guarantee or check due to its
* hardware nature. Other targets do not have the same
* This is a strict requirement only for AMD GPUs, since
* AMD's implementation of atomics may depend on specific
* hardware configurations (PCIe atomics) that cannot be
* checked at runtime. Other targets do not have the same
* strong dependency and managed memory is handled
* correctly in any case by default. It is automatically
* initialized to false to reduce verbosity of
* initialization for many targets since only one of them,
* with specific allocation type, requires a different
* value. Having a default value allows the compiler to
* handle automatically other templates.
* correctly by default.
*/
template <int localSize, int localMemSize, bool usmManagedMem,
typename sb_handle_t, typename container_0_t, typename container_1_t,
Expand Down

0 comments on commit 3568b48

Please sign in to comment.