Skip to content
This repository has been archived by the owner on Jan 13, 2025. It is now read-only.

Commit

Permalink
Enabled half precision for GEMM (#495)
Browse files Browse the repository at this point in the history
* Enabled axpy and scal with half precision as well
* Enabled the relevant tests and benchmarks with half precision
  • Loading branch information
OuadiElfarouki authored Feb 27, 2024
1 parent cb69d68 commit 3adb52c
Show file tree
Hide file tree
Showing 43 changed files with 775 additions and 394 deletions.
11 changes: 10 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,14 @@ if(IMGDNN_DIR)
endif()

option(BLAS_ENABLE_EXTENSIONS "Whether to enable portBLAS extensions" ON)
option(BLAS_ENABLE_COMPLEX "Whether to enable complex data type for supported operators" ON)
option(BLAS_ENABLE_COMPLEX "Whether to enable complex data type for GEMM" OFF)
option(BLAS_ENABLE_HALF "Whether to enable sycl::half data type for supported operators" OFF)

if(((NOT INSTALL_HEADER_ONLY) AND (TUNING_TARGET STREQUAL "DEFAULT_CPU"))
OR (INSTALL_HEADER_ONLY AND (NOT TUNING_TARGET)))
set(BLAS_ENABLE_HALF OFF)
message(STATUS "FP16 operations are not supported for CPU targets. BLAS_ENABLE_HALF is disabled")
endif()

# CmakeFunctionHelper has to be included after any options that it depends on are declared.
# These include:
Expand All @@ -117,6 +124,8 @@ option(BLAS_ENABLE_COMPLEX "Whether to enable complex data type for supported op
# * BLAS_INDEX_TYPES
# * NAIVE_GEMM
# * BLAS_ENABLE_COMPLEX
# * BLAS_ENABLE_HALF

include(CmakeFunctionHelper)

if (INSTALL_HEADER_ONLY)
Expand Down
5 changes: 3 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -462,9 +462,10 @@ Some of the supported options are:
| `BLAS_MEMPOOL_BENCHMARK` | `ON`/`OFF` | Determines whether to enable the scratchpad memory pool for benchmark execution. `OFF` by default |
| `BLAS_ENABLE_CONST_INPUT` | `ON`/`OFF` | Determines whether to enable kernel instantiation with const input buffer (`ON` by default) |
| `BLAS_ENABLE_EXTENSIONS` | `ON`/`OFF` | Determines whether to enable portBLAS extensions (`ON` by default) |
| `BLAS_DATA_TYPES` | `half;float;double` | Determines the floating-point types to instantiate BLAS operations for. Default is `float` |
| `BLAS_DATA_TYPES` | `float;double` | Determines the floating-point types to instantiate BLAS operations for. Default is `float` |
| `BLAS_INDEX_TYPES` | `int32_t;int64_t` | Determines the type(s) to use for `index_t` and `increment_t`. Default is `int` |
| `BLAS_ENABLE_COMPLEX` | `ON`/`OFF` | Determines whether to enable Complex data type support *(GEMM Operators only)* (`ON` by default) |
| `BLAS_ENABLE_COMPLEX` | `ON`/`OFF` | Determines whether to enable Complex data type support *(GEMM Operators only)* (`OFF` by default) |
| `BLAS_ENABLE_HALF` | `ON`/`OFF` | Determines whether to enable Half data type support *(Support is limited to some Level 1 operators and Gemm)* (`OFF` by default) |

## ComputeCpp Compilation *(Deprecated)*

Expand Down
19 changes: 14 additions & 5 deletions benchmark/cublas/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -75,18 +75,27 @@ set(sources
)

# Operators supporting COMPLEX types benchmarking
set(CPLX_OPS "gemm" "gemm_batched" "gemm_batched_strided")
set(CPLX_OPS "gemm"
"gemm_batched"
"gemm_batched_strided")

# Operators supporting HALF type benchmarking
set(HALF_DATA_OPS "gemm"
"gemm_batched"
"gemm_batched_strided"
)

# Add individual benchmarks for each method
foreach(cublas_bench ${sources})
get_filename_component(bench_cublas_exec ${cublas_bench} NAME_WE)
add_executable(bench_cublas_${bench_cublas_exec} ${cublas_bench} main.cpp)
target_link_libraries(bench_cublas_${bench_cublas_exec} PRIVATE benchmark CUDA::toolkit CUDA::cublas CUDA::cudart portblas Clara::Clara bench_info)
target_compile_definitions(bench_cublas_${bench_cublas_exec} PRIVATE -DBLAS_INDEX_T=${BLAS_BENCHMARK_INDEX_TYPE})
if(${BLAS_ENABLE_COMPLEX})
if("${bench_cublas_exec}" IN_LIST CPLX_OPS)
target_compile_definitions(bench_cublas_${bench_cublas_exec} PRIVATE BLAS_ENABLE_COMPLEX=1)
endif()
if((${BLAS_ENABLE_COMPLEX}) AND ("${bench_cublas_exec}" IN_LIST CPLX_OPS))
target_compile_definitions(bench_cublas_${bench_cublas_exec} PRIVATE BLAS_ENABLE_COMPLEX=1)
endif()
if((${BLAS_ENABLE_HALF}) AND ("${bench_cublas_exec}" IN_LIST HALF_DATA_OPS))
target_compile_definitions(bench_cublas_${bench_cublas_exec} PRIVATE BLAS_ENABLE_HALF=1)
endif()
add_sycl_to_target(
TARGET bench_cublas_${bench_cublas_exec}
Expand Down
36 changes: 25 additions & 11 deletions benchmark/cublas/blas3/gemm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ static inline void cublas_routine(args_t&&... args) {
CUBLAS_CHECK(cublasSgemm(std::forward<args_t>(args)...));
} else if constexpr (std::is_same_v<scalar_t, double>) {
CUBLAS_CHECK(cublasDgemm(std::forward<args_t>(args)...));
} else if constexpr (std::is_same_v<scalar_t, cl::sycl::half>) {
CUBLAS_CHECK(cublasHgemm(std::forward<args_t>(args)...));
}
return;
}
Expand All @@ -54,6 +56,10 @@ template <typename scalar_t>
void run(benchmark::State& state, cublasHandle_t* cuda_handle_ptr, int t1,
int t2, index_t m, index_t k, index_t n, scalar_t alpha, scalar_t beta,
bool* success) {
// scalar_t if scalar_t!=sycl::half, cuda::__half otherwise
using cuda_scalar_t =
typename blas_benchmark::utils::CudaType<scalar_t>::type;

// initialize the state label
blas_benchmark::utils::set_benchmark_label<scalar_t>(state);

Expand All @@ -80,24 +86,31 @@ void run(benchmark::State& state, cublasHandle_t* cuda_handle_ptr, int t1,
std::vector<scalar_t> c =
blas_benchmark::utils::const_data<scalar_t>(m * n, 0);

blas_benchmark::utils::CUDAVector<scalar_t> a_gpu(m * k, a.data());
blas_benchmark::utils::CUDAVector<scalar_t> b_gpu(k * n, b.data());
blas_benchmark::utils::CUDAVector<scalar_t> c_gpu(n * m, c.data());
blas_benchmark::utils::CUDAVector<cuda_scalar_t> a_gpu(
m * k, reinterpret_cast<cuda_scalar_t*>(a.data()));
blas_benchmark::utils::CUDAVector<cuda_scalar_t> b_gpu(
k * n, reinterpret_cast<cuda_scalar_t*>(b.data()));
blas_benchmark::utils::CUDAVector<cuda_scalar_t> c_gpu(
n * m, reinterpret_cast<cuda_scalar_t*>(c.data()));

cublasOperation_t c_t_a = (*t_a == 'n') ? CUBLAS_OP_N : CUBLAS_OP_T;
cublasOperation_t c_t_b = (*t_b == 'n') ? CUBLAS_OP_N : CUBLAS_OP_T;

cuda_scalar_t alpha_cuda = *reinterpret_cast<cuda_scalar_t*>(&alpha);
cuda_scalar_t beta_cuda = *reinterpret_cast<cuda_scalar_t*>(&beta);

#ifdef BLAS_VERIFY_BENCHMARK
// Run a first time with a verification of the results
std::vector<scalar_t> c_ref = c;
reference_blas::gemm(t_a, t_b, m, n, k, alpha, a.data(), lda, b.data(), ldb,
beta, c_ref.data(), ldc);
std::vector<scalar_t> c_temp = c;
{
blas_benchmark::utils::CUDAVector<scalar_t, true> c_temp_gpu(m * n,
c_temp.data());
cublas_routine<scalar_t>(cuda_handle, c_t_a, c_t_b, m, n, k, &alpha, a_gpu,
lda, b_gpu, ldb, &beta, c_temp_gpu, ldc);
blas_benchmark::utils::CUDAVector<cuda_scalar_t, true> c_temp_gpu(
m * n, reinterpret_cast<cuda_scalar_t*>(c_temp.data()));
cublas_routine<scalar_t>(cuda_handle, c_t_a, c_t_b, m, n, k, &alpha_cuda,
a_gpu, lda, b_gpu, ldb, &beta_cuda, c_temp_gpu,
ldc);
}

std::ostringstream err_stream;
Expand All @@ -107,9 +120,10 @@ void run(benchmark::State& state, cublasHandle_t* cuda_handle_ptr, int t1,
*success = false;
};
#endif

auto blas_warmup = [&]() -> void {
cublas_routine<scalar_t>(cuda_handle, c_t_a, c_t_b, m, n, k, &alpha, a_gpu,
lda, b_gpu, ldb, &beta, c_gpu, ldc);
cublas_routine<scalar_t>(cuda_handle, c_t_a, c_t_b, m, n, k, &alpha_cuda,
a_gpu, lda, b_gpu, ldb, &beta_cuda, c_gpu, ldc);
return;
};

Expand All @@ -120,8 +134,8 @@ void run(benchmark::State& state, cublasHandle_t* cuda_handle_ptr, int t1,

auto blas_method_def = [&]() -> std::vector<cudaEvent_t> {
CUDA_CHECK(cudaEventRecord(start));
cublas_routine<scalar_t>(cuda_handle, c_t_a, c_t_b, m, n, k, &alpha, a_gpu,
lda, b_gpu, ldb, &beta, c_gpu, ldc);
cublas_routine<scalar_t>(cuda_handle, c_t_a, c_t_b, m, n, k, &alpha_cuda,
a_gpu, lda, b_gpu, ldb, &beta_cuda, c_gpu, ldc);
CUDA_CHECK(cudaEventRecord(stop));
CUDA_CHECK(cudaEventSynchronize(stop));
return std::vector{start, stop};
Expand Down
40 changes: 23 additions & 17 deletions benchmark/cublas/blas3/gemm_batched.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ static inline void cublas_routine(args_t&&... args) {
CUBLAS_CHECK(cublasSgemmBatched(std::forward<args_t>(args)...));
} else if constexpr (std::is_same_v<scalar_t, double>) {
CUBLAS_CHECK(cublasDgemmBatched(std::forward<args_t>(args)...));
} else if constexpr (std::is_same_v<scalar_t, cl::sycl::half>) {
CUBLAS_CHECK(cublasHgemmBatched(std::forward<args_t>(args)...));
}
return;
}
Expand All @@ -54,6 +56,10 @@ template <typename scalar_t>
void run(benchmark::State& state, cublasHandle_t* cuda_handle_ptr, index_t t1,
index_t t2, index_t m, index_t k, index_t n, scalar_t alpha,
scalar_t beta, index_t batch_count, int batch_type_i, bool* success) {
// scalar_t if scalar_t!=sycl::half, cuda::__half otherwise
using cuda_scalar_t =
typename blas_benchmark::utils::CudaType<scalar_t>::type;

// initialize the state label
blas_benchmark::utils::set_benchmark_label<scalar_t>(state);

Expand Down Expand Up @@ -84,17 +90,19 @@ void run(benchmark::State& state, cublasHandle_t* cuda_handle_ptr, index_t t1,
std::vector<scalar_t> c =
blas_benchmark::utils::const_data<scalar_t>(m * n * batch_count, 0);

blas_benchmark::utils::CUDAVectorBatched<scalar_t> d_A_array(m * k,
batch_count, a);
blas_benchmark::utils::CUDAVectorBatched<scalar_t> d_B_array(k * n,
batch_count, b);
blas_benchmark::utils::CUDAVectorBatched<scalar_t> d_C_array(m * n,
batch_count);
blas_benchmark::utils::CUDAVectorBatched<cuda_scalar_t> d_A_array(
m * k, batch_count, reinterpret_cast<cuda_scalar_t*>(a.data()));
blas_benchmark::utils::CUDAVectorBatched<cuda_scalar_t> d_B_array(
k * n, batch_count, reinterpret_cast<cuda_scalar_t*>(b.data()));
blas_benchmark::utils::CUDAVectorBatched<cuda_scalar_t> d_C_array(
m * n, batch_count);

cublasOperation_t c_t_a = (*t_a == 'n') ? CUBLAS_OP_N : CUBLAS_OP_T;

cublasOperation_t c_t_b = (*t_b == 'n') ? CUBLAS_OP_N : CUBLAS_OP_T;

cuda_scalar_t alpha_cuda = *reinterpret_cast<cuda_scalar_t*>(&alpha);
cuda_scalar_t beta_cuda = *reinterpret_cast<cuda_scalar_t*>(&beta);

#ifdef BLAS_VERIFY_BENCHMARK
// Run a first time with a verification of the results
{
Expand All @@ -110,13 +118,12 @@ void run(benchmark::State& state, cublasHandle_t* cuda_handle_ptr, index_t t1,
}

std::vector<scalar_t> c_temp(m * n * batch_count);

{
blas_benchmark::utils::CUDAVectorBatched<scalar_t, true> c_temp_gpu(
n * m, batch_count, c_temp);
cublas_routine<scalar_t>(cuda_handle, c_t_a, c_t_b, m, n, k, &alpha,
blas_benchmark::utils::CUDAVectorBatched<cuda_scalar_t, true> c_temp_gpu(
n * m, batch_count, reinterpret_cast<cuda_scalar_t*>(c_temp.data()));
cublas_routine<scalar_t>(cuda_handle, c_t_a, c_t_b, m, n, k, &alpha_cuda,
d_A_array.get_batch_array(), lda,
d_B_array.get_batch_array(), ldb, &beta,
d_B_array.get_batch_array(), ldb, &beta_cuda,
c_temp_gpu.get_batch_array(), ldc, batch_count);
}

Expand All @@ -128,14 +135,13 @@ void run(benchmark::State& state, cublasHandle_t* cuda_handle_ptr, index_t t1,
*success = false;
};
}

} // close scope for verify benchmark
#endif

auto blas_warmup = [&]() -> void {
cublas_routine<scalar_t>(cuda_handle, c_t_a, c_t_b, m, n, k, &alpha,
cublas_routine<scalar_t>(cuda_handle, c_t_a, c_t_b, m, n, k, &alpha_cuda,
d_A_array.get_batch_array(), lda,
d_B_array.get_batch_array(), ldb, &beta,
d_B_array.get_batch_array(), ldb, &beta_cuda,
d_C_array.get_batch_array(), ldc, batch_count);
return;
};
Expand All @@ -146,9 +152,9 @@ void run(benchmark::State& state, cublasHandle_t* cuda_handle_ptr, index_t t1,

auto blas_method_def = [&]() -> std::vector<cudaEvent_t> {
CUDA_CHECK(cudaEventRecord(start));
cublas_routine<scalar_t>(cuda_handle, c_t_a, c_t_b, m, n, k, &alpha,
cublas_routine<scalar_t>(cuda_handle, c_t_a, c_t_b, m, n, k, &alpha_cuda,
d_A_array.get_batch_array(), lda,
d_B_array.get_batch_array(), ldb, &beta,
d_B_array.get_batch_array(), ldb, &beta_cuda,
d_C_array.get_batch_array(), ldc, batch_count);
CUDA_CHECK(cudaEventRecord(stop));
CUDA_CHECK(cudaEventSynchronize(stop));
Expand Down
41 changes: 26 additions & 15 deletions benchmark/cublas/blas3/gemm_batched_strided.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ static inline void cublas_routine(args_t&&... args) {
CUBLAS_CHECK(cublasSgemmStridedBatched(std::forward<args_t>(args)...));
} else if constexpr (std::is_same_v<scalar_t, double>) {
CUBLAS_CHECK(cublasDgemmStridedBatched(std::forward<args_t>(args)...));
} else if constexpr (std::is_same_v<scalar_t, cl::sycl::half>) {
CUBLAS_CHECK(cublasHgemmStridedBatched(std::forward<args_t>(args)...));
}
return;
}
Expand All @@ -55,6 +57,10 @@ void run(benchmark::State& state, cublasHandle_t* cuda_handle_ptr, int t1,
int t2, index_t m, index_t k, index_t n, scalar_t alpha, scalar_t beta,
index_t batch_size, index_t stride_a_mul, index_t stride_b_mul,
index_t stride_c_mul, bool* success) {
// scalar_t if scalar_t!=sycl::half, cuda::__half otherwise
using cuda_scalar_t =
typename blas_benchmark::utils::CudaType<scalar_t>::type;

// initialize the state label
blas_benchmark::utils::set_benchmark_label<scalar_t>(state);

Expand Down Expand Up @@ -103,14 +109,19 @@ void run(benchmark::State& state, cublasHandle_t* cuda_handle_ptr, int t1,
std::vector<scalar_t> c =
blas_benchmark::utils::const_data<scalar_t>(size_c_batch, 0);

blas_benchmark::utils::CUDAVector<scalar_t> a_gpu(size_a_batch, a.data());
blas_benchmark::utils::CUDAVector<scalar_t> b_gpu(size_b_batch, b.data());
blas_benchmark::utils::CUDAVector<scalar_t> c_gpu(size_c_batch, c.data());
blas_benchmark::utils::CUDAVector<cuda_scalar_t> a_gpu(
size_a_batch, reinterpret_cast<cuda_scalar_t*>(a.data()));
blas_benchmark::utils::CUDAVector<cuda_scalar_t> b_gpu(
size_b_batch, reinterpret_cast<cuda_scalar_t*>(b.data()));
blas_benchmark::utils::CUDAVector<cuda_scalar_t> c_gpu(
size_c_batch, reinterpret_cast<cuda_scalar_t*>(c.data()));

cublasOperation_t c_t_a = trA ? CUBLAS_OP_N : CUBLAS_OP_T;

cublasOperation_t c_t_b = trB ? CUBLAS_OP_N : CUBLAS_OP_T;

cuda_scalar_t alpha_cuda = *reinterpret_cast<cuda_scalar_t*>(&alpha);
cuda_scalar_t beta_cuda = *reinterpret_cast<cuda_scalar_t*>(&beta);

#ifdef BLAS_VERIFY_BENCHMARK
// Run a first time with a verification of the results
std::vector<scalar_t> c_ref = c;
Expand All @@ -123,11 +134,11 @@ void run(benchmark::State& state, cublasHandle_t* cuda_handle_ptr, int t1,

std::vector<scalar_t> c_temp = c;
{
blas_benchmark::utils::CUDAVector<scalar_t, true> c_temp_gpu(size_c_batch,
c_temp.data());
cublas_routine<scalar_t>(cuda_handle, c_t_a, c_t_b, m, n, k, &alpha, a_gpu,
lda, stride_a, b_gpu, ldb, stride_b, &beta,
c_temp_gpu, ldc, stride_c, batch_size);
blas_benchmark::utils::CUDAVector<cuda_scalar_t, true> c_temp_gpu(
size_c_batch, reinterpret_cast<cuda_scalar_t*>(c_temp.data()));
cublas_routine<scalar_t>(cuda_handle, c_t_a, c_t_b, m, n, k, &alpha_cuda,
a_gpu, lda, stride_a, b_gpu, ldb, stride_b,
&beta_cuda, c_temp_gpu, ldc, stride_c, batch_size);
}

std::ostringstream err_stream;
Expand All @@ -140,9 +151,9 @@ void run(benchmark::State& state, cublasHandle_t* cuda_handle_ptr, int t1,
#endif

auto blas_warmup = [&]() -> void {
cublas_routine<scalar_t>(cuda_handle, c_t_a, c_t_b, m, n, k, &alpha, a_gpu,
lda, stride_a, b_gpu, ldb, stride_b, &beta, c_gpu,
ldc, stride_c, batch_size);
cublas_routine<scalar_t>(cuda_handle, c_t_a, c_t_b, m, n, k, &alpha_cuda,
a_gpu, lda, stride_a, b_gpu, ldb, stride_b,
&beta_cuda, c_gpu, ldc, stride_c, batch_size);
return;
};

Expand All @@ -152,9 +163,9 @@ void run(benchmark::State& state, cublasHandle_t* cuda_handle_ptr, int t1,

auto blas_method_def = [&]() -> std::vector<cudaEvent_t> {
CUDA_CHECK(cudaEventRecord(start));
cublas_routine<scalar_t>(cuda_handle, c_t_a, c_t_b, m, n, k, &alpha, a_gpu,
lda, stride_a, b_gpu, ldb, stride_b, &beta, c_gpu,
ldc, stride_c, batch_size);
cublas_routine<scalar_t>(cuda_handle, c_t_a, c_t_b, m, n, k, &alpha_cuda,
a_gpu, lda, stride_a, b_gpu, ldb, stride_b,
&beta_cuda, c_gpu, ldc, stride_c, batch_size);
CUDA_CHECK(cudaEventRecord(stop));
CUDA_CHECK(cudaEventSynchronize(stop));
return std::vector{start, stop};
Expand Down
17 changes: 17 additions & 0 deletions benchmark/cublas/utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,9 @@
#include <cuComplex.h>
#include <cublas_v2.h>
#include <cuda.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>

// Forward declare methods that we use in `benchmark.cpp`, but define in
// `main.cpp`

Expand Down Expand Up @@ -274,6 +276,21 @@ static inline std::tuple<double, double> timef_cuda(function_t func,
return std::make_tuple(overall_time, static_cast<double>(elapsed_time) * 1E6);
}

/**
* Reference type of the underlying benchmark data aimed to match the
* cuda/cuBLAS scalar types.
*/
template <typename T, typename Enable = void>
struct CudaType {
using type = T;
};

// When T is sycl::half, use cuda's __cuda as type.
template <typename T>
struct CudaType<T, std::enable_if_t<std::is_same_v<T, cl::sycl::half>>> {
using type = __half;
};

} // namespace utils
} // namespace blas_benchmark

Expand Down
Loading

0 comments on commit 3adb52c

Please sign in to comment.