Skip to content
This repository has been archived by the owner on Jan 13, 2025. It is now read-only.

Added benchmarks for GEMM complex types #465

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
ac26cfe
Added complex type support to gemm kernels
OuadiElfarouki Sep 11, 2023
a4d6b8f
Added unit tests for complex type gemm operators
OuadiElfarouki Sep 11, 2023
58b8c35
Minor fixes
OuadiElfarouki Sep 11, 2023
bfc56ba
Typo fix
OuadiElfarouki Sep 11, 2023
3f80316
amd gpu config
OuadiElfarouki Sep 13, 2023
2eeb03f
De-coupling complex & scalar enable_if statements
OuadiElfarouki Sep 20, 2023
06705b3
Added static asserts on vector size when using cplx data
OuadiElfarouki Sep 20, 2023
f7179c5
Fixes to amd gpu configs
OuadiElfarouki Sep 21, 2023
9ba82fe
Addressed PR comments
OuadiElfarouki Sep 25, 2023
007727c
minor fixes
OuadiElfarouki Sep 25, 2023
7f76dfd
fixed bug in cmake & added readme description to complex
OuadiElfarouki Sep 27, 2023
148a2ea
Reduced complex gemm tests cases sizes
OuadiElfarouki Oct 2, 2023
49e0e01
removed unused legacy complex data utils
OuadiElfarouki Oct 4, 2023
d86cfc8
Tuned gemm complex for cpu
OuadiElfarouki Oct 4, 2023
2dc363d
Separated complex gemm load store & addressed PR comments
OuadiElfarouki Oct 13, 2023
6a0e010
Removed symm kernels generation from complex data types
OuadiElfarouki Oct 13, 2023
3e3d4dc
Added & enabled portblas, cublas & rocblas benchmarks for complex GEMM
OuadiElfarouki Oct 23, 2023
ad82e17
Removed redundant gemm batch type check in benchmark
OuadiElfarouki Oct 24, 2023
18b7f57
Merge branch 'master' into complex_gemm_bench_pr
muhammad-tanvir-1211 Oct 24, 2023
9484959
Merge branch 'master' into complex_gemm_bench_pr
muhammad-tanvir-1211 Oct 24, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions benchmark/cublas/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -74,12 +74,20 @@ set(sources
extension/omatadd.cpp
)

# Operators supporting COMPLEX types benchmarking
set(CPLX_OPS "gemm" "gemm_batched" "gemm_batched_strided")

# Add individual benchmarks for each method
foreach(cublas_bench ${sources})
get_filename_component(bench_cublas_exec ${cublas_bench} NAME_WE)
add_executable(bench_cublas_${bench_cublas_exec} ${cublas_bench} main.cpp)
target_link_libraries(bench_cublas_${bench_cublas_exec} PRIVATE benchmark CUDA::toolkit CUDA::cublas CUDA::cudart portblas Clara::Clara bench_info)
target_compile_definitions(bench_cublas_${bench_cublas_exec} PRIVATE -DBLAS_INDEX_T=${BLAS_BENCHMARK_INDEX_TYPE})
if(${BLAS_ENABLE_COMPLEX})
if("${bench_cublas_exec}" IN_LIST CPLX_OPS)
target_compile_definitions(bench_cublas_${bench_cublas_exec} PRIVATE BLAS_ENABLE_COMPLEX=1)
endif()
endif()
add_sycl_to_target(
TARGET bench_cublas_${bench_cublas_exec}
SOURCES ${cublas_bench}
Expand Down
168 changes: 168 additions & 0 deletions benchmark/cublas/blas3/gemm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,18 @@ static inline void cublas_routine(args_t&&... args) {
return;
}

#ifdef BLAS_ENABLE_COMPLEX
template <typename scalar_t, typename... args_t>
static inline void cublas_cplx_routine(args_t&&... args) {
if constexpr (std::is_same_v<scalar_t, float>) {
CUBLAS_CHECK(cublasCgemm(std::forward<args_t>(args)...));
} else if constexpr (std::is_same_v<scalar_t, double>) {
CUBLAS_CHECK(cublasZgemm(std::forward<args_t>(args)...));
}
return;
}
#endif

template <typename scalar_t>
void run(benchmark::State& state, cublasHandle_t* cuda_handle_ptr, int t1,
int t2, index_t m, index_t k, index_t n, scalar_t alpha, scalar_t beta,
Expand Down Expand Up @@ -168,6 +180,162 @@ void register_benchmark(blas_benchmark::Args& args,
}
}

#ifdef BLAS_ENABLE_COMPLEX
template <typename scalar_t>
using cudaComplex = typename std::conditional<sizeof(scalar_t) == 8,
cuDoubleComplex, cuComplex>::type;

template <typename scalar_t>
void run(benchmark::State& state, cublasHandle_t* cuda_handle_ptr, int t1,
int t2, index_t m, index_t k, index_t n, std::complex<scalar_t> alpha,
std::complex<scalar_t> beta, bool* success) {
// initialize the state label
blas_benchmark::utils::set_benchmark_label<std::complex<scalar_t>>(state);

// Standard test setup.
std::string t1s = blas_benchmark::utils::from_transpose_enum(
static_cast<blas_benchmark::utils::Transposition>(t1));
std::string t2s = blas_benchmark::utils::from_transpose_enum(
static_cast<blas_benchmark::utils::Transposition>(t2));
const char* t_a = t1s.c_str();
const char* t_b = t2s.c_str();

index_t lda = t_a[0] == 'n' ? m : k;
index_t ldb = t_b[0] == 'n' ? k : n;
index_t ldc = m;

blas_benchmark::utils::init_level_3_cplx_counters<
blas_benchmark::utils::Level3Op::gemm, scalar_t>(state, beta, m, n, k,
static_cast<index_t>(1));

cublasHandle_t& cuda_handle = *cuda_handle_ptr;

// Matrices
std::vector<std::complex<scalar_t>> a =
blas_benchmark::utils::random_cplx_data<scalar_t>(m * k);
std::vector<std::complex<scalar_t>> b =
blas_benchmark::utils::random_cplx_data<scalar_t>(k * n);
std::vector<std::complex<scalar_t>> c =
blas_benchmark::utils::const_cplx_data<scalar_t>(m * n, 0);

blas_benchmark::utils::CUDAVector<cudaComplex<scalar_t>> a_gpu(
m * k, reinterpret_cast<cudaComplex<scalar_t>*>(a.data()));
blas_benchmark::utils::CUDAVector<cudaComplex<scalar_t>> b_gpu(
k * n, reinterpret_cast<cudaComplex<scalar_t>*>(b.data()));
blas_benchmark::utils::CUDAVector<cudaComplex<scalar_t>> c_gpu(
n * m, reinterpret_cast<cudaComplex<scalar_t>*>(c.data()));

cublasOperation_t c_t_a = (*t_a == 'n') ? CUBLAS_OP_N : CUBLAS_OP_T;
cublasOperation_t c_t_b = (*t_b == 'n') ? CUBLAS_OP_N : CUBLAS_OP_T;

cudaComplex<scalar_t> cuBeta{beta.real(), beta.imag()};
cudaComplex<scalar_t> cuAlpha{alpha.real(), alpha.imag()};

#ifdef BLAS_VERIFY_BENCHMARK
// Run a first time with a verification of the results
std::vector<std::complex<scalar_t>> c_ref = c;

reference_blas::cgemm<scalar_t>(t_a, t_b, m, n, k,
reinterpret_cast<const void*>(&alpha),
reinterpret_cast<const void*>(a.data()), lda,
reinterpret_cast<const void*>(b.data()), ldb,
reinterpret_cast<const void*>(&beta),
reinterpret_cast<void*>(c_ref.data()), ldc);
std::vector<std::complex<scalar_t>> c_temp = c;
{
blas_benchmark::utils::CUDAVector<cudaComplex<scalar_t>, true> c_temp_gpu(
m * n, reinterpret_cast<cudaComplex<scalar_t>*>(c_temp.data()));
cublas_cplx_routine<scalar_t>(cuda_handle, c_t_a, c_t_b, m, n, k, &cuAlpha,
a_gpu, lda, b_gpu, ldb, &cuBeta, c_temp_gpu,
ldc);
}

std::ostringstream err_stream;
if (!utils::compare_vectors(c_temp, c_ref, err_stream, "")) {
const std::string& err_str = err_stream.str();
state.SkipWithError(err_str.c_str());
*success = false;
};
#endif
auto blas_warmup = [&]() -> void {
cublas_cplx_routine<scalar_t>(cuda_handle, c_t_a, c_t_b, m, n, k, &cuAlpha,
a_gpu, lda, b_gpu, ldb, &cuBeta, c_gpu, ldc);
return;
};

cudaEvent_t start;
cudaEvent_t stop;
CUDA_CHECK(cudaEventCreate(&start));
CUDA_CHECK(cudaEventCreate(&stop));

auto blas_method_def = [&]() -> std::vector<cudaEvent_t> {
CUDA_CHECK(cudaEventRecord(start));
cublas_cplx_routine<scalar_t>(cuda_handle, c_t_a, c_t_b, m, n, k, &cuAlpha,
a_gpu, lda, b_gpu, ldb, &cuBeta, c_gpu, ldc);
CUDA_CHECK(cudaEventRecord(stop));
CUDA_CHECK(cudaEventSynchronize(stop));
return std::vector{start, stop};
};

// Warmup
blas_benchmark::utils::warmup(blas_warmup);
CUDA_CHECK(cudaStreamSynchronize(NULL));

blas_benchmark::utils::init_counters(state);

// Measure
for (auto _ : state) {
// Run
std::tuple<double, double> times =
blas_benchmark::utils::timef_cuda(blas_method_def);

// Report
blas_benchmark::utils::update_counters(state, times);
}

state.SetItemsProcessed(state.iterations() * state.counters["n_fl_ops"]);
state.SetBytesProcessed(state.iterations() *
state.counters["bytes_processed"]);

blas_benchmark::utils::calc_avg_counters(state);

CUDA_CHECK(cudaEventDestroy(start));
CUDA_CHECK(cudaEventDestroy(stop));
};

template <typename scalar_t>
void register_cplx_benchmark(blas_benchmark::Args& args,
cublasHandle_t* cuda_handle_ptr, bool* success) {
auto gemm_params =
blas_benchmark::utils::get_blas3_cplx_params<scalar_t>(args);
for (auto p : gemm_params) {
std::string t1s, t2s;
index_t m, n, k;
scalar_t alpha_r, alpha_i, beta_r, beta_i;

std::tie(t1s, t2s, m, k, n, alpha_r, alpha_i, beta_r, beta_i) = p;
int t1 = static_cast<int>(blas_benchmark::utils::to_transpose_enum(t1s));
int t2 = static_cast<int>(blas_benchmark::utils::to_transpose_enum(t2s));
std::complex<scalar_t> alpha{alpha_r, alpha_i};
std::complex<scalar_t> beta{beta_r, beta_i};

auto BM_lambda = [&](benchmark::State& st, cublasHandle_t* cuda_handle_ptr,
int t1, int t2, index_t m, index_t k, index_t n,
std::complex<scalar_t> alpha,
std::complex<scalar_t> beta, bool* success) {
run<scalar_t>(st, cuda_handle_ptr, t1, t2, m, k, n, alpha, beta, success);
};
benchmark::RegisterBenchmark(
blas_benchmark::utils::get_name<benchmark_op, std::complex<scalar_t>>(
t1s, t2s, m, k, n, blas_benchmark::utils::MEM_TYPE_USM)
.c_str(),
BM_lambda, cuda_handle_ptr, t1, t2, m, k, n, alpha, beta, success)
->UseRealTime();
}
}

#endif

namespace blas_benchmark {
void create_benchmark(blas_benchmark::Args& args,
cublasHandle_t* cuda_handle_ptr, bool* success) {
Expand Down
Loading