From 21f5029015f65e9e2fe4ea9b229b9924a469d305 Mon Sep 17 00:00:00 2001 From: boeschf <48126478+boeschf@users.noreply.github.com> Date: Thu, 11 Jan 2024 14:40:56 +0100 Subject: [PATCH] fix gpu warp intrinsics (#2247) `reduce_by_key` depends on warp level intrinsics to transfer values between different threads (lanes) participating in the reduction. The pertinent intrinsic is `__shfl_down_sync` which is accessed through Arbor's wrapper function `shfl_down`. However, the contribution from each thread to the reduction was erroneously truncated to an integer value. This PR fixes the signature of the respective wrapper functions and modifies the unit test in order to check that floating point reductions are not truncated. While cleaning up the cuda code path, the workaround using two 32-bit shuffle instructions for 64 bit data types (doubles) was removed - this was probably a leftover from cuda versions prior to 9.0. --- arbor/include/arbor/gpu/cuda_api.hpp | 32 ++++++---------------------- arbor/include/arbor/gpu/hip_api.hpp | 19 +++++++++++++---- test/unit/test_reduce_by_key.cu | 4 ++-- 3 files changed, 23 insertions(+), 32 deletions(-) diff --git a/arbor/include/arbor/gpu/cuda_api.hpp b/arbor/include/arbor/gpu/cuda_api.hpp index 48bc96f262..532b9ec8f9 100644 --- a/arbor/include/arbor/gpu/cuda_api.hpp +++ b/arbor/include/arbor/gpu/cuda_api.hpp @@ -139,17 +139,6 @@ inline float gpu_atomic_sub(float* address, float val) { /// Warp-Level Primitives -__device__ __inline__ double shfl(unsigned mask, double x, int lane) -{ - auto tmp = static_cast(x); - auto lo = static_cast(tmp); - auto hi = static_cast(tmp >> 32); - hi = __shfl_sync(mask, static_cast(hi), lane, warpSize); - lo = __shfl_sync(mask, static_cast(lo), lane, warpSize); - return static_cast(static_cast(hi) << 32 | - static_cast(lo)); -} - __device__ __inline__ unsigned ballot(unsigned mask, unsigned is_root) { return __ballot_sync(mask, is_root); } @@ -158,24 +147,15 @@ __device__ __inline__ unsigned any(unsigned mask, unsigned width) { return __any_sync(mask, width); } -#ifdef __NVCC__ -__device__ __inline__ double shfl_up(unsigned mask, int idx, unsigned lane_id, unsigned shift) { - return __shfl_up_sync(mask, idx, shift); -} - -__device__ __inline__ double shfl_down(unsigned mask, int idx, unsigned lane_id, unsigned shift) { - return __shfl_down_sync(mask, idx, shift); +template +__device__ __inline__ T shfl_up(unsigned mask, T var, unsigned lane_id, unsigned shift) { + return __shfl_up_sync(mask, var, shift); } -#else -__device__ __inline__ double shfl_up(unsigned mask, int idx, unsigned lane_id, unsigned shift) { - return shfl(mask, idx, lane_id - shift); +template +__device__ __inline__ T shfl_down(unsigned mask, T var, unsigned lane_id, unsigned shift) { + return __shfl_down_sync(mask, var, shift); } - -__device__ __inline__ double shfl_down(unsigned mask, int idx, unsigned lane_id, unsigned shift) { - return shfl(mask, idx, lane_id + shift); -} -#endif #endif } // namespace gpu diff --git a/arbor/include/arbor/gpu/hip_api.hpp b/arbor/include/arbor/gpu/hip_api.hpp index fbdba3151f..019f235a8e 100644 --- a/arbor/include/arbor/gpu/hip_api.hpp +++ b/arbor/include/arbor/gpu/hip_api.hpp @@ -1,5 +1,6 @@ #include #include +#include #include #include @@ -118,6 +119,14 @@ inline float gpu_atomic_sub(float* address, float val) { /// Warp-level Primitives +template +__device__ __inline__ +std::enable_if_t< !std::is_same_v, double>, std::decay_t> +shfl(T x, int lane) +{ + return __shfl(x, lane); +} + __device__ __inline__ double shfl(double x, int lane) { auto tmp = static_cast(x); @@ -137,12 +146,14 @@ __device__ __inline__ unsigned any(unsigned mask, unsigned width) { return __any(width); } -__device__ __inline__ double shfl_up(unsigned mask, int idx, unsigned lane_id, unsigned shift) { - return shfl(idx, lane_id - shift); +template +__device__ __inline__ T shfl_up(unsigned mask, T var, unsigned lane_id, unsigned shift) { + return shfl(var, (int)lane_id - shift); } -__device__ __inline__ double shfl_down(unsigned mask, int idx, unsigned lane_id, unsigned shift) { - return shfl(idx, lane_id + shift); +template +__device__ __inline__ T shfl_down(unsigned mask, T var, unsigned lane_id, unsigned shift) { + return shfl(var, (int)lane_id + shift); } } // namespace gpu diff --git a/test/unit/test_reduce_by_key.cu b/test/unit/test_reduce_by_key.cu index 44dcab912a..5b5763c95e 100644 --- a/test/unit/test_reduce_by_key.cu +++ b/test/unit/test_reduce_by_key.cu @@ -89,8 +89,8 @@ TEST(reduce_by_key, scatter) // onto an array of length 12. std::size_t n = 12; std::vector index = {0,0,0,1,2,2,2,2,3,3,7,7,7,7,7,11}; - std::vector in(index.size(), 1); - std::vector expected = {3., 1., 4., 2., 0., 0., 0., 5., 0., 0., 0., 1.}; + std::vector in(index.size(), 0.5); + std::vector expected = {1.5, 0.5, 2., 1., 0., 0., 0., 2.5, 0., 0., 0., 0.5}; EXPECT_EQ(n, expected.size());