Skip to content
This repository has been archived by the owner on Jan 13, 2025. It is now read-only.

Commit

Permalink
simplified std complex type naming
Browse files Browse the repository at this point in the history
  • Loading branch information
OuadiElfarouki committed Sep 27, 2023
1 parent fecc180 commit d45904a
Show file tree
Hide file tree
Showing 11 changed files with 126 additions and 145 deletions.
29 changes: 13 additions & 16 deletions benchmark/cublas/blas3/gemm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -187,12 +187,10 @@ using cudaComplex = typename std::conditional<sizeof(scalar_t) == 8,

template <typename scalar_t>
void run(benchmark::State& state, cublasHandle_t* cuda_handle_ptr, int t1,
int t2, index_t m, index_t k, index_t n,
blas::complex_std<scalar_t> alpha, blas::complex_std<scalar_t> beta,
bool* success) {
int t2, index_t m, index_t k, index_t n, std::complex<scalar_t> alpha,
std::complex<scalar_t> beta, bool* success) {
// initialize the state label
blas_benchmark::utils::set_benchmark_label<blas::complex_std<scalar_t>>(
state);
blas_benchmark::utils::set_benchmark_label<std::complex<scalar_t>>(state);

// Standard test setup.
std::string t1s = blas_benchmark::utils::from_transpose_enum(
Expand All @@ -213,11 +211,11 @@ void run(benchmark::State& state, cublasHandle_t* cuda_handle_ptr, int t1,
cublasHandle_t& cuda_handle = *cuda_handle_ptr;

// Matrices
std::vector<blas::complex_std<scalar_t>> a =
std::vector<std::complex<scalar_t>> a =
blas_benchmark::utils::random_cplx_data<scalar_t>(m * k);
std::vector<blas::complex_std<scalar_t>> b =
std::vector<std::complex<scalar_t>> b =
blas_benchmark::utils::random_cplx_data<scalar_t>(k * n);
std::vector<blas::complex_std<scalar_t>> c =
std::vector<std::complex<scalar_t>> c =
blas_benchmark::utils::const_cplx_data<scalar_t>(m * n, 0);

blas_benchmark::utils::CUDAVector<cudaComplex<scalar_t>> a_gpu(
Expand All @@ -235,15 +233,15 @@ void run(benchmark::State& state, cublasHandle_t* cuda_handle_ptr, int t1,

#ifdef BLAS_VERIFY_BENCHMARK
// Run a first time with a verification of the results
std::vector<blas::complex_std<scalar_t>> c_ref = c;
std::vector<std::complex<scalar_t>> c_ref = c;

reference_blas::cgemm<scalar_t>(t_a, t_b, m, n, k,
reinterpret_cast<const void*>(&alpha),
reinterpret_cast<const void*>(a.data()), lda,
reinterpret_cast<const void*>(b.data()), ldb,
reinterpret_cast<const void*>(&beta),
reinterpret_cast<void*>(c_ref.data()), ldc);
std::vector<blas::complex_std<scalar_t>> c_temp = c;
std::vector<std::complex<scalar_t>> c_temp = c;
{
blas_benchmark::utils::CUDAVector<cudaComplex<scalar_t>, true> c_temp_gpu(
m * n, reinterpret_cast<cudaComplex<scalar_t>*>(c_temp.data()));
Expand Down Expand Up @@ -318,18 +316,17 @@ void register_cplx_benchmark(blas_benchmark::Args& args,
std::tie(t1s, t2s, m, k, n, alpha_r, alpha_i, beta_r, beta_i) = p;
int t1 = static_cast<int>(blas_benchmark::utils::to_transpose_enum(t1s));
int t2 = static_cast<int>(blas_benchmark::utils::to_transpose_enum(t2s));
blas::complex_std<scalar_t> alpha{alpha_r, alpha_i};
blas::complex_std<scalar_t> beta{beta_r, beta_i};
std::complex<scalar_t> alpha{alpha_r, alpha_i};
std::complex<scalar_t> beta{beta_r, beta_i};

auto BM_lambda = [&](benchmark::State& st, cublasHandle_t* cuda_handle_ptr,
int t1, int t2, index_t m, index_t k, index_t n,
blas::complex_std<scalar_t> alpha,
blas::complex_std<scalar_t> beta, bool* success) {
std::complex<scalar_t> alpha,
std::complex<scalar_t> beta, bool* success) {
run<scalar_t>(st, cuda_handle_ptr, t1, t2, m, k, n, alpha, beta, success);
};
benchmark::RegisterBenchmark(
blas_benchmark::utils::get_name<benchmark_op,
blas::complex_std<scalar_t>>(
blas_benchmark::utils::get_name<benchmark_op, std::complex<scalar_t>>(
t1s, t2s, m, k, n, blas_benchmark::utils::MEM_TYPE_USM)
.c_str(),
BM_lambda, cuda_handle_ptr, t1, t2, m, k, n, alpha, beta, success)
Expand Down
26 changes: 12 additions & 14 deletions benchmark/cublas/blas3/gemm_batched.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -228,11 +228,10 @@ using cudaComplex = typename std::conditional<sizeof(scalar_t) == 8,
template <typename scalar_t>
void run(benchmark::State& state, cublasHandle_t* cuda_handle_ptr, index_t t1,
index_t t2, index_t m, index_t k, index_t n,
blas::complex_std<scalar_t> alpha, blas::complex_std<scalar_t> beta,
std::complex<scalar_t> alpha, std::complex<scalar_t> beta,
index_t batch_count, int batch_type_i, bool* success) {
// initialize the state label
blas_benchmark::utils::set_benchmark_label<blas::complex_std<scalar_t>>(
state);
blas_benchmark::utils::set_benchmark_label<std::complex<scalar_t>>(state);

// Standard setup
std::string t1s = blas_benchmark::utils::from_transpose_enum(
Expand All @@ -258,11 +257,11 @@ void run(benchmark::State& state, cublasHandle_t* cuda_handle_ptr, index_t t1,
const index_t size_c = m * n;

// Matrices
std::vector<blas::complex_std<scalar_t>> a =
std::vector<std::complex<scalar_t>> a =
blas_benchmark::utils::random_cplx_data<scalar_t>(size_a * batch_count);
std::vector<blas::complex_std<scalar_t>> b =
std::vector<std::complex<scalar_t>> b =
blas_benchmark::utils::random_cplx_data<scalar_t>(size_b * batch_count);
std::vector<blas::complex_std<scalar_t>> c =
std::vector<std::complex<scalar_t>> c =
blas_benchmark::utils::const_cplx_data<scalar_t>(size_c * batch_count, 0);

blas_benchmark::utils::CUDAVectorBatched<cudaComplex<scalar_t>> d_A_array(
Expand All @@ -281,7 +280,7 @@ void run(benchmark::State& state, cublasHandle_t* cuda_handle_ptr, index_t t1,
#ifdef BLAS_VERIFY_BENCHMARK
// Run a first time with a verification of the results
{
std::vector<blas::complex_std<scalar_t>> c_ref = c;
std::vector<std::complex<scalar_t>> c_ref = c;
auto _base = [=](index_t dim0, index_t dim1, index_t idx) {
return dim0 * dim1 * idx;
};
Expand All @@ -294,7 +293,7 @@ void run(benchmark::State& state, cublasHandle_t* cuda_handle_ptr, index_t t1,
reinterpret_cast<void*>(c_ref.data() + _base(m, n, batch_idx)), ldc);
}

std::vector<blas::complex_std<scalar_t>> c_temp(size_c * batch_count);
std::vector<std::complex<scalar_t>> c_temp(size_c * batch_count);

{
blas_benchmark::utils::CUDAVectorBatched<cudaComplex<scalar_t>, true>
Expand Down Expand Up @@ -381,8 +380,8 @@ void register_cplx_benchmark(blas_benchmark::Args& args,

std::tie(t1s, t2s, m, k, n, alpha_r, alpha_i, beta_r, beta_i, batch_count,
batch_type) = p;
blas::complex_std<scalar_t> alpha{alpha_r, alpha_i};
blas::complex_std<scalar_t> beta{beta_r, beta_i};
std::complex<scalar_t> alpha{alpha_r, alpha_i};
std::complex<scalar_t> beta{beta_r, beta_i};

if (batch_type == 1) {
std::cerr << "interleaved memory for gemm_batched operator is not "
Expand All @@ -395,15 +394,14 @@ void register_cplx_benchmark(blas_benchmark::Args& args,

auto BM_lambda = [&](benchmark::State& st, cublasHandle_t* cuda_handle_ptr,
int t1, int t2, index_t m, index_t k, index_t n,
blas::complex_std<scalar_t> alpha,
blas::complex_std<scalar_t> beta, index_t batch_count,
std::complex<scalar_t> alpha,
std::complex<scalar_t> beta, index_t batch_count,
int batch_type, bool* success) {
run<scalar_t>(st, cuda_handle_ptr, t1, t2, m, k, n, alpha, beta,
batch_count, batch_type, success);
};
benchmark::RegisterBenchmark(
blas_benchmark::utils::get_name<benchmark_op,
blas::complex_std<scalar_t>>(
blas_benchmark::utils::get_name<benchmark_op, std::complex<scalar_t>>(
t1s, t2s, m, k, n, batch_count, batch_type,
blas_benchmark::utils::MEM_TYPE_USM)
.c_str(),
Expand Down
31 changes: 14 additions & 17 deletions benchmark/cublas/blas3/gemm_batched_strided.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -227,13 +227,11 @@ using cudaComplex = typename std::conditional<sizeof(scalar_t) == 8,

template <typename scalar_t>
void run(benchmark::State& state, cublasHandle_t* cuda_handle_ptr, int t1,
int t2, index_t m, index_t k, index_t n,
blas::complex_std<scalar_t> alpha, blas::complex_std<scalar_t> beta,
index_t batch_size, index_t stride_a_mul, index_t stride_b_mul,
index_t stride_c_mul, bool* success) {
int t2, index_t m, index_t k, index_t n, std::complex<scalar_t> alpha,
std::complex<scalar_t> beta, index_t batch_size, index_t stride_a_mul,
index_t stride_b_mul, index_t stride_c_mul, bool* success) {
// initialize the state label
blas_benchmark::utils::set_benchmark_label<blas::complex_std<scalar_t>>(
state);
blas_benchmark::utils::set_benchmark_label<std::complex<scalar_t>>(state);

// Standard test setup.
std::string t1s = blas_benchmark::utils::from_transpose_enum(
Expand Down Expand Up @@ -273,11 +271,11 @@ void run(benchmark::State& state, cublasHandle_t* cuda_handle_ptr, int t1,

// Matrices (Total size is equal to matrix size x batch_size since we're using
// default striding values)
std::vector<blas::complex_std<scalar_t>> a =
std::vector<std::complex<scalar_t>> a =
blas_benchmark::utils::random_cplx_data<scalar_t>(size_a_batch);
std::vector<blas::complex_std<scalar_t>> b =
std::vector<std::complex<scalar_t>> b =
blas_benchmark::utils::random_cplx_data<scalar_t>(size_b_batch);
std::vector<blas::complex_std<scalar_t>> c =
std::vector<std::complex<scalar_t>> c =
blas_benchmark::utils::const_cplx_data<scalar_t>(size_c_batch, 0);

blas_benchmark::utils::CUDAVector<cudaComplex<scalar_t>> a_gpu(
Expand All @@ -295,7 +293,7 @@ void run(benchmark::State& state, cublasHandle_t* cuda_handle_ptr, int t1,

#ifdef BLAS_VERIFY_BENCHMARK
// Run a first time with a verification of the results
std::vector<blas::complex_std<scalar_t>> c_ref = c;
std::vector<std::complex<scalar_t>> c_ref = c;
for (int batch_idx = 0; batch_idx < batch_size; batch_idx++) {
reference_blas::cgemm<scalar_t>(
t_a, t_b, m, n, k, reinterpret_cast<const void*>(&alpha),
Expand All @@ -305,7 +303,7 @@ void run(benchmark::State& state, cublasHandle_t* cuda_handle_ptr, int t1,
reinterpret_cast<void*>(c_ref.data() + batch_idx * stride_c), ldc);
}

std::vector<blas::complex_std<scalar_t>> c_temp = c;
std::vector<std::complex<scalar_t>> c_temp = c;
{
blas_benchmark::utils::CUDAVector<cudaComplex<scalar_t>, true> c_temp_gpu(
size_c_batch, reinterpret_cast<cudaComplex<scalar_t>*>(c_temp.data()));
Expand Down Expand Up @@ -385,21 +383,20 @@ void register_cplx_benchmark(blas_benchmark::Args& args,
stride_a_mul, stride_b_mul, stride_c_mul) = p;
int t1 = static_cast<int>(blas_benchmark::utils::to_transpose_enum(t1s));
int t2 = static_cast<int>(blas_benchmark::utils::to_transpose_enum(t2s));
blas::complex_std<scalar_t> alpha{alpha_r, alpha_i};
blas::complex_std<scalar_t> beta{beta_r, beta_i};
std::complex<scalar_t> alpha{alpha_r, alpha_i};
std::complex<scalar_t> beta{beta_r, beta_i};

auto BM_lambda = [&](benchmark::State& st, cublasHandle_t* cuda_handle_ptr,
int t1, int t2, index_t m, index_t k, index_t n,
blas::complex_std<scalar_t> alpha,
blas::complex_std<scalar_t> beta, index_t batch_size,
std::complex<scalar_t> alpha,
std::complex<scalar_t> beta, index_t batch_size,
index_t strd_a_mul, index_t strd_b_mul,
index_t strd_c_mul, bool* success) {
run<scalar_t>(st, cuda_handle_ptr, t1, t2, m, k, n, alpha, beta,
batch_size, strd_a_mul, strd_b_mul, strd_c_mul, success);
};
benchmark::RegisterBenchmark(
blas_benchmark::utils::get_name<benchmark_op,
blas::complex_std<scalar_t>>(
blas_benchmark::utils::get_name<benchmark_op, std::complex<scalar_t>>(
t1s, t2s, m, k, n, batch_size, stride_a_mul, stride_b_mul,
stride_c_mul, blas_benchmark::utils::MEM_TYPE_USM)
.c_str(),
Expand Down
28 changes: 13 additions & 15 deletions benchmark/portblas/blas3/gemm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -180,11 +180,10 @@ void register_benchmark(blas_benchmark::Args& args,
#ifdef BLAS_ENABLE_COMPLEX
template <typename scalar_t, blas::helper::AllocType mem_alloc>
void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int t1,
int t2, index_t m, index_t k, index_t n,
blas::complex_std<scalar_t> alpha, blas::complex_std<scalar_t> beta,
bool* success) {
int t2, index_t m, index_t k, index_t n, std::complex<scalar_t> alpha,
std::complex<scalar_t> beta, bool* success) {
// initialize the state label
blas_benchmark::utils::set_benchmark_label<blas::complex_std<scalar_t>>(
blas_benchmark::utils::set_benchmark_label<std::complex<scalar_t>>(
state, sb_handle_ptr->get_queue());

// Standard test setup.
Expand All @@ -207,11 +206,11 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int t1,
auto q = sb_handle.get_queue();

// Matrices
std::vector<blas::complex_std<scalar_t>> a =
std::vector<std::complex<scalar_t>> a =
blas_benchmark::utils::random_cplx_data<scalar_t>(m * k);
std::vector<blas::complex_std<scalar_t>> b =
std::vector<std::complex<scalar_t>> b =
blas_benchmark::utils::random_cplx_data<scalar_t>(k * n);
std::vector<blas::complex_std<scalar_t>> c =
std::vector<std::complex<scalar_t>> c =
blas_benchmark::utils::const_cplx_data<scalar_t>(m * n, 0);

auto a_gpu =
Expand Down Expand Up @@ -239,15 +238,15 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int t1,

#ifdef BLAS_VERIFY_BENCHMARK
// Run a first time with a verification of the results
std::vector<blas::complex_std<scalar_t>> c_ref = c;
std::vector<std::complex<scalar_t>> c_ref = c;
reference_blas::cgemm<scalar_t>(t_a, t_b, m, n, k,
reinterpret_cast<const void*>(&alpha),
reinterpret_cast<const void*>(a.data()), lda,
reinterpret_cast<const void*>(b.data()), ldb,
reinterpret_cast<const void*>(&beta),
reinterpret_cast<void*>(c_ref.data()), ldc);

std::vector<blas::complex_std<scalar_t>> c_temp = c;
std::vector<std::complex<scalar_t>> c_temp = c;

{
auto c_temp_gpu =
Expand Down Expand Up @@ -328,19 +327,18 @@ void register_cplx_benchmark(blas::SB_Handle* sb_handle_ptr, bool* success,
std::tie(t1s, t2s, m, k, n, alpha_r, alpha_i, beta_r, beta_i) = p;
int t1 = static_cast<int>(blas_benchmark::utils::to_transpose_enum(t1s));
int t2 = static_cast<int>(blas_benchmark::utils::to_transpose_enum(t2s));
blas::complex_std<scalar_t> alpha{alpha_r, alpha_i};
blas::complex_std<scalar_t> beta{beta_r, beta_i};
std::complex<scalar_t> alpha{alpha_r, alpha_i};
std::complex<scalar_t> beta{beta_r, beta_i};

auto BM_lambda = [&](benchmark::State& st, blas::SB_Handle* sb_handle_ptr,
int t1, int t2, index_t m, index_t k, index_t n,
blas::complex_std<scalar_t> alpha,
blas::complex_std<scalar_t> beta, bool* success) {
std::complex<scalar_t> alpha,
std::complex<scalar_t> beta, bool* success) {
run<scalar_t, mem_alloc>(st, sb_handle_ptr, t1, t2, m, k, n, alpha, beta,
success);
};
benchmark::RegisterBenchmark(
blas_benchmark::utils::get_name<benchmark_op,
blas::complex_std<scalar_t>>(
blas_benchmark::utils::get_name<benchmark_op, std::complex<scalar_t>>(
t1s, t2s, m, k, n, mem_type)
.c_str(),
BM_lambda, sb_handle_ptr, t1, t2, m, k, n, alpha, beta, success)
Expand Down
29 changes: 14 additions & 15 deletions benchmark/portblas/blas3/gemm_batched.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -251,11 +251,11 @@ void register_benchmark(blas_benchmark::Args& args,
#ifdef BLAS_ENABLE_COMPLEX
template <typename scalar_t, blas::helper::AllocType mem_alloc>
void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int t1,
int t2, index_t m, index_t k, index_t n,
blas::complex_std<scalar_t> alpha, blas::complex_std<scalar_t> beta,
index_t batch_size, int batch_type_i, bool* success) {
int t2, index_t m, index_t k, index_t n, std::complex<scalar_t> alpha,
std::complex<scalar_t> beta, index_t batch_size, int batch_type_i,
bool* success) {
// initialize the state label
blas_benchmark::utils::set_benchmark_label<blas::complex_std<scalar_t>>(
blas_benchmark::utils::set_benchmark_label<std::complex<scalar_t>>(
state, sb_handle_ptr->get_queue());

// Standard test setup.
Expand All @@ -279,17 +279,17 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int t1,
auto q = sb_handle.get_queue();

// Matrices
std::vector<blas::complex_std<scalar_t>> a =
std::vector<std::complex<scalar_t>> a =
blas_benchmark::utils::random_cplx_data<scalar_t>(m * k * batch_size);
std::vector<blas::complex_std<scalar_t>> b =
std::vector<std::complex<scalar_t>> b =
blas_benchmark::utils::random_cplx_data<scalar_t>(k * n * batch_size);
std::vector<blas::complex_std<scalar_t>> c =
std::vector<std::complex<scalar_t>> c =
blas_benchmark::utils::const_cplx_data<scalar_t>(m * n * batch_size,
scalar_t(0));

#ifdef BLAS_VERIFY_BENCHMARK
// Run a first time with a verification of the results
std::vector<blas::complex_std<scalar_t>> c_ref = c;
std::vector<std::complex<scalar_t>> c_ref = c;
auto _base = [=](index_t dim0, index_t dim1, index_t idx) {
return dim0 * dim1 * idx;
};
Expand Down Expand Up @@ -333,7 +333,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int t1,
blas::complex_sycl<scalar_t> beta_sycl(beta);

#ifdef BLAS_VERIFY_BENCHMARK
std::vector<blas::complex_std<scalar_t>> c_temp = c;
std::vector<std::complex<scalar_t>> c_temp = c;
{
auto c_temp_gpu =
blas::helper::allocate<mem_alloc, blas::complex_sycl<scalar_t>>(
Expand Down Expand Up @@ -423,20 +423,19 @@ void register_cplx_benchmark(
}
int t1 = static_cast<int>(blas_benchmark::utils::to_transpose_enum(t1s));
int t2 = static_cast<int>(blas_benchmark::utils::to_transpose_enum(t2s));
blas::complex_std<scalar_t> alpha{alpha_r, alpha_i};
blas::complex_std<scalar_t> beta{beta_r, beta_i};
std::complex<scalar_t> alpha{alpha_r, alpha_i};
std::complex<scalar_t> beta{beta_r, beta_i};

auto BM_lambda = [&](benchmark::State& st, blas::SB_Handle* sb_handle_ptr,
int t1, int t2, index_t m, index_t k, index_t n,
blas::complex_std<scalar_t> alpha,
blas::complex_std<scalar_t> beta, index_t batch_size,
std::complex<scalar_t> alpha,
std::complex<scalar_t> beta, index_t batch_size,
int batch_type, bool* success) {
run<scalar_t, mem_alloc>(st, sb_handle_ptr, t1, t2, m, k, n, alpha, beta,
batch_size, batch_type, success);
};
benchmark::RegisterBenchmark(
blas_benchmark::utils::get_name<benchmark_op,
blas::complex_std<scalar_t>>(
blas_benchmark::utils::get_name<benchmark_op, std::complex<scalar_t>>(
t1s, t2s, m, k, n, batch_size, batch_type, mem_type)
.c_str(),
BM_lambda, sb_handle_ptr, t1, t2, m, k, n, alpha, beta, batch_size,
Expand Down
Loading

0 comments on commit d45904a

Please sign in to comment.