fix gpu warp intrinsics (#2247)
`reduce_by_key` depends on warp-level intrinsics to transfer values
between the threads (lanes) participating in a reduction. The
pertinent intrinsic is `__shfl_down_sync`, which is accessed through
Arbor's wrapper function `shfl_down`. However, the wrapper took its
value argument as `int`, so the contribution from each thread to the
reduction was erroneously truncated to an integer value. This PR fixes
the signatures of the wrapper functions and modifies the unit test to
check that floating-point reductions are not truncated.
While cleaning up the CUDA code path, the workaround that emulated a
64-bit (double) shuffle with two 32-bit shuffle instructions was also
removed; it was probably a leftover from CUDA versions prior to 9.0.
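
To see the failure mode concretely, here is a minimal sketch; the names `shfl_down_old`, `shfl_down_fixed` and `demo` are illustrative only, not Arbor code, and CUDA >= 9.0 is assumed:

// Old shape: the value parameter is declared int, so a double argument
// is implicitly converted and its fractional part is lost before the
// intrinsic is ever called.
__device__ __inline__ double shfl_down_old(unsigned mask, int idx, unsigned shift) {
    return __shfl_down_sync(mask, idx, shift);
}

// Fixed shape: the value type is deduced, so doubles pass through intact.
template<typename T>
__device__ __inline__ T shfl_down_fixed(unsigned mask, T var, unsigned shift) {
    return __shfl_down_sync(mask, var, shift);
}

__global__ void demo() {
    const unsigned mask = 0xffffffffu;
    double contribution = 0.5;
    double a = shfl_down_old(mask, contribution, 1);   // each lane receives 0.0 (0.5 -> int -> 0)
    double b = shfl_down_fixed(mask, contribution, 1); // each lane receives 0.5 from lane + 1
    (void)a; (void)b;
}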
boeschf authored Jan 11, 2024
1 parent d4579b1 commit 21f5029
Showing 3 changed files with 23 additions and 32 deletions.
32 changes: 6 additions & 26 deletions arbor/include/arbor/gpu/cuda_api.hpp
@@ -139,17 +139,6 @@ inline float gpu_atomic_sub(float* address, float val) {
 
 /// Warp-Level Primitives
 
-__device__ __inline__ double shfl(unsigned mask, double x, int lane)
-{
-    auto tmp = static_cast<uint64_t>(x);
-    auto lo = static_cast<unsigned>(tmp);
-    auto hi = static_cast<unsigned>(tmp >> 32);
-    hi = __shfl_sync(mask, static_cast<int>(hi), lane, warpSize);
-    lo = __shfl_sync(mask, static_cast<int>(lo), lane, warpSize);
-    return static_cast<double>(static_cast<uint64_t>(hi) << 32 |
-                               static_cast<uint64_t>(lo));
-}
-
 __device__ __inline__ unsigned ballot(unsigned mask, unsigned is_root) {
     return __ballot_sync(mask, is_root);
 }
@@ -158,24 +147,15 @@ __device__ __inline__ unsigned any(unsigned mask, unsigned width) {
     return __any_sync(mask, width);
 }
 
-#ifdef __NVCC__
-__device__ __inline__ double shfl_up(unsigned mask, int idx, unsigned lane_id, unsigned shift) {
-    return __shfl_up_sync(mask, idx, shift);
+template<typename T>
+__device__ __inline__ T shfl_up(unsigned mask, T var, unsigned lane_id, unsigned shift) {
+    return __shfl_up_sync(mask, var, shift);
 }
 
-__device__ __inline__ double shfl_down(unsigned mask, int idx, unsigned lane_id, unsigned shift) {
-    return __shfl_down_sync(mask, idx, shift);
-}
-
-#else
-__device__ __inline__ double shfl_up(unsigned mask, int idx, unsigned lane_id, unsigned shift) {
-    return shfl(mask, idx, lane_id - shift);
-}
-
-__device__ __inline__ double shfl_down(unsigned mask, int idx, unsigned lane_id, unsigned shift) {
-    return shfl(mask, idx, lane_id + shift);
+template<typename T>
+__device__ __inline__ T shfl_down(unsigned mask, T var, unsigned lane_id, unsigned shift) {
+    return __shfl_down_sync(mask, var, shift);
 }
-#endif
 #endif
 
 } // namespace gpu
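With the templated wrapper in place, the standard shuffle-down reduction works for `double` unchanged. A sketch of such a warp-wide sum, assuming the wrapper above is in scope, all 32 lanes are active, and `warpSize == 32` (`warp_sum` is illustrative, not Arbor's `reduce_by_key` implementation):

__device__ double warp_sum(double value) {
    const unsigned mask = 0xffffffffu;
    const unsigned lane = threadIdx.x % 32;
    // Halve the stride each step; lane i accumulates the value held by lane i + shift.
    for (unsigned shift = 16; shift > 0; shift /= 2) {
        value += shfl_down(mask, value, lane, shift);
    }
    return value; // lane 0 now holds the sum over the whole warp
}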
19 changes: 15 additions & 4 deletions arbor/include/arbor/gpu/hip_api.hpp
@@ -1,5 +1,6 @@
 #include <utility>
 #include <string>
+#include <type_traits>
 
 #include <hip/hip_runtime.h>
 #include <hip/hip_runtime_api.h>
@@ -118,6 +119,14 @@ inline float gpu_atomic_sub(float* address, float val) {
 
 /// Warp-level Primitives
 
+template<typename T>
+__device__ __inline__
+std::enable_if_t< !std::is_same_v<std::decay_t<T>, double>, std::decay_t<T>>
+shfl(T x, int lane)
+{
+    return __shfl(x, lane);
+}
+
 __device__ __inline__ double shfl(double x, int lane)
 {
     auto tmp = static_cast<uint64_t>(x);
@@ -137,12 +146,14 @@ __device__ __inline__ unsigned any(unsigned mask, unsigned width) {
     return __any(width);
 }
 
-__device__ __inline__ double shfl_up(unsigned mask, int idx, unsigned lane_id, unsigned shift) {
-    return shfl(idx, lane_id - shift);
+template<typename T>
+__device__ __inline__ T shfl_up(unsigned mask, T var, unsigned lane_id, unsigned shift) {
+    return shfl(var, (int)lane_id - shift);
 }
 
-__device__ __inline__ double shfl_down(unsigned mask, int idx, unsigned lane_id, unsigned shift) {
-    return shfl(idx, lane_id + shift);
+template<typename T>
+__device__ __inline__ T shfl_down(unsigned mask, T var, unsigned lane_id, unsigned shift) {
+    return shfl(var, (int)lane_id + shift);
 }
 
 } // namespace gpu
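On the HIP path the two-halves workaround is kept, and the new `std::enable_if_t` guard keeps the overload set unambiguous: the function template serves every type except `double`, while the plain `double` overload splits the 64-bit pattern across two 32-bit `__shfl` calls and reassembles it. A small illustrative check of which overload is selected (`demo_hip` is not part of the diff):

__device__ void demo_hip(int lane) {
    float  f = 1.5f;
    double d = 2.5;
    float  fr = shfl(f, lane); // function template: enable_if admits non-double types
    double dr = shfl(d, lane); // template is disabled for double, so the plain overload wins
    (void)fr; (void)dr;
}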
4 changes: 2 additions & 2 deletions test/unit/test_reduce_by_key.cu
@@ -89,8 +89,8 @@ TEST(reduce_by_key, scatter)
     // onto an array of length 12.
     std::size_t n = 12;
     std::vector<int> index = {0,0,0,1,2,2,2,2,3,3,7,7,7,7,7,11};
-    std::vector<double> in(index.size(), 1);
-    std::vector<double> expected = {3., 1., 4., 2., 0., 0., 0., 5., 0., 0., 0., 1.};
+    std::vector<double> in(index.size(), 0.5);
+    std::vector<double> expected = {1.5, 0.5, 2., 1., 0., 0., 0., 2.5, 0., 0., 0., 0.5};
 
     EXPECT_EQ(n, expected.size());
 
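The new fill value is what makes the truncation observable: with `in` filled with 1, every per-key sum is integral, so truncating intermediate shuffle results to `int` still produced the right answer; with 0.5 it cannot. Each expected entry is 0.5 times the number of occurrences of its key in `index` (key 0 occurs three times, hence 1.5, and so on). A host-side mirror of the expectation (plain C++, illustrative, not the CUDA test itself):

#include <cassert>
#include <vector>

int main() {
    const std::vector<int> index = {0,0,0,1,2,2,2,2,3,3,7,7,7,7,7,11};
    std::vector<double> out(12, 0.0);
    for (int i : index) out[i] += 0.5; // the scatter-add reduce_by_key performs on the GPU
    const std::vector<double> expected = {1.5, 0.5, 2., 1., 0., 0., 0., 2.5, 0., 0., 0., 0.5};
    assert(out == expected); // sums of 0.5 are exact in binary floating point
    return 0;
}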

