From d45904a6b847ce128a9f56bc02c8b2c74713cdd3 Mon Sep 17 00:00:00 2001 From: Ouadie EL FAROUKI Date: Mon, 25 Sep 2023 12:41:02 +0100 Subject: [PATCH] simplified std complex type naming --- benchmark/cublas/blas3/gemm.cpp | 29 ++++++++--------- benchmark/cublas/blas3/gemm_batched.cpp | 26 +++++++--------- .../cublas/blas3/gemm_batched_strided.cpp | 31 +++++++++---------- benchmark/portblas/blas3/gemm.cpp | 28 ++++++++--------- benchmark/portblas/blas3/gemm_batched.cpp | 29 +++++++++-------- .../portblas/blas3/gemm_batched_strided.cpp | 30 +++++++++--------- benchmark/rocblas/blas3/gemm.cpp | 26 +++++++--------- benchmark/rocblas/blas3/gemm_batched.cpp | 26 +++++++--------- .../rocblas/blas3/gemm_batched_strided.cpp | 26 +++++++--------- .../include/common/blas3_state_counters.hpp | 4 +-- common/include/common/common_utils.hpp | 16 +++++----- 11 files changed, 126 insertions(+), 145 deletions(-) diff --git a/benchmark/cublas/blas3/gemm.cpp b/benchmark/cublas/blas3/gemm.cpp index 28f6e7124..c74c9e98e 100644 --- a/benchmark/cublas/blas3/gemm.cpp +++ b/benchmark/cublas/blas3/gemm.cpp @@ -187,12 +187,10 @@ using cudaComplex = typename std::conditional void run(benchmark::State& state, cublasHandle_t* cuda_handle_ptr, int t1, - int t2, index_t m, index_t k, index_t n, - blas::complex_std alpha, blas::complex_std beta, - bool* success) { + int t2, index_t m, index_t k, index_t n, std::complex alpha, + std::complex beta, bool* success) { // initialize the state label - blas_benchmark::utils::set_benchmark_label>( - state); + blas_benchmark::utils::set_benchmark_label>(state); // Standard test setup. 
std::string t1s = blas_benchmark::utils::from_transpose_enum( @@ -213,11 +211,11 @@ void run(benchmark::State& state, cublasHandle_t* cuda_handle_ptr, int t1, cublasHandle_t& cuda_handle = *cuda_handle_ptr; // Matrices - std::vector> a = + std::vector> a = blas_benchmark::utils::random_cplx_data(m * k); - std::vector> b = + std::vector> b = blas_benchmark::utils::random_cplx_data(k * n); - std::vector> c = + std::vector> c = blas_benchmark::utils::const_cplx_data(m * n, 0); blas_benchmark::utils::CUDAVector> a_gpu( @@ -235,7 +233,7 @@ void run(benchmark::State& state, cublasHandle_t* cuda_handle_ptr, int t1, #ifdef BLAS_VERIFY_BENCHMARK // Run a first time with a verification of the results - std::vector> c_ref = c; + std::vector> c_ref = c; reference_blas::cgemm(t_a, t_b, m, n, k, reinterpret_cast(&alpha), @@ -243,7 +241,7 @@ void run(benchmark::State& state, cublasHandle_t* cuda_handle_ptr, int t1, reinterpret_cast(b.data()), ldb, reinterpret_cast(&beta), reinterpret_cast(c_ref.data()), ldc); - std::vector> c_temp = c; + std::vector> c_temp = c; { blas_benchmark::utils::CUDAVector, true> c_temp_gpu( m * n, reinterpret_cast*>(c_temp.data())); @@ -318,18 +316,17 @@ void register_cplx_benchmark(blas_benchmark::Args& args, std::tie(t1s, t2s, m, k, n, alpha_r, alpha_i, beta_r, beta_i) = p; int t1 = static_cast(blas_benchmark::utils::to_transpose_enum(t1s)); int t2 = static_cast(blas_benchmark::utils::to_transpose_enum(t2s)); - blas::complex_std alpha{alpha_r, alpha_i}; - blas::complex_std beta{beta_r, beta_i}; + std::complex alpha{alpha_r, alpha_i}; + std::complex beta{beta_r, beta_i}; auto BM_lambda = [&](benchmark::State& st, cublasHandle_t* cuda_handle_ptr, int t1, int t2, index_t m, index_t k, index_t n, - blas::complex_std alpha, - blas::complex_std beta, bool* success) { + std::complex alpha, + std::complex beta, bool* success) { run(st, cuda_handle_ptr, t1, t2, m, k, n, alpha, beta, success); }; benchmark::RegisterBenchmark( - blas_benchmark::utils::get_name>( 
+ blas_benchmark::utils::get_name>( t1s, t2s, m, k, n, blas_benchmark::utils::MEM_TYPE_USM) .c_str(), BM_lambda, cuda_handle_ptr, t1, t2, m, k, n, alpha, beta, success) diff --git a/benchmark/cublas/blas3/gemm_batched.cpp b/benchmark/cublas/blas3/gemm_batched.cpp index 36a0b12fb..c0c50631f 100644 --- a/benchmark/cublas/blas3/gemm_batched.cpp +++ b/benchmark/cublas/blas3/gemm_batched.cpp @@ -228,11 +228,10 @@ using cudaComplex = typename std::conditional void run(benchmark::State& state, cublasHandle_t* cuda_handle_ptr, index_t t1, index_t t2, index_t m, index_t k, index_t n, - blas::complex_std alpha, blas::complex_std beta, + std::complex alpha, std::complex beta, index_t batch_count, int batch_type_i, bool* success) { // initialize the state label - blas_benchmark::utils::set_benchmark_label>( - state); + blas_benchmark::utils::set_benchmark_label>(state); // Standard setup std::string t1s = blas_benchmark::utils::from_transpose_enum( @@ -258,11 +257,11 @@ void run(benchmark::State& state, cublasHandle_t* cuda_handle_ptr, index_t t1, const index_t size_c = m * n; // Matrices - std::vector> a = + std::vector> a = blas_benchmark::utils::random_cplx_data(size_a * batch_count); - std::vector> b = + std::vector> b = blas_benchmark::utils::random_cplx_data(size_b * batch_count); - std::vector> c = + std::vector> c = blas_benchmark::utils::const_cplx_data(size_c * batch_count, 0); blas_benchmark::utils::CUDAVectorBatched> d_A_array( @@ -281,7 +280,7 @@ void run(benchmark::State& state, cublasHandle_t* cuda_handle_ptr, index_t t1, #ifdef BLAS_VERIFY_BENCHMARK // Run a first time with a verification of the results { - std::vector> c_ref = c; + std::vector> c_ref = c; auto _base = [=](index_t dim0, index_t dim1, index_t idx) { return dim0 * dim1 * idx; }; @@ -294,7 +293,7 @@ void run(benchmark::State& state, cublasHandle_t* cuda_handle_ptr, index_t t1, reinterpret_cast(c_ref.data() + _base(m, n, batch_idx)), ldc); } - std::vector> c_temp(size_c * batch_count); + 
std::vector> c_temp(size_c * batch_count); { blas_benchmark::utils::CUDAVectorBatched, true> @@ -381,8 +380,8 @@ void register_cplx_benchmark(blas_benchmark::Args& args, std::tie(t1s, t2s, m, k, n, alpha_r, alpha_i, beta_r, beta_i, batch_count, batch_type) = p; - blas::complex_std alpha{alpha_r, alpha_i}; - blas::complex_std beta{beta_r, beta_i}; + std::complex alpha{alpha_r, alpha_i}; + std::complex beta{beta_r, beta_i}; if (batch_type == 1) { std::cerr << "interleaved memory for gemm_batched operator is not " @@ -395,15 +394,14 @@ void register_cplx_benchmark(blas_benchmark::Args& args, auto BM_lambda = [&](benchmark::State& st, cublasHandle_t* cuda_handle_ptr, int t1, int t2, index_t m, index_t k, index_t n, - blas::complex_std alpha, - blas::complex_std beta, index_t batch_count, + std::complex alpha, + std::complex beta, index_t batch_count, int batch_type, bool* success) { run(st, cuda_handle_ptr, t1, t2, m, k, n, alpha, beta, batch_count, batch_type, success); }; benchmark::RegisterBenchmark( - blas_benchmark::utils::get_name>( + blas_benchmark::utils::get_name>( t1s, t2s, m, k, n, batch_count, batch_type, blas_benchmark::utils::MEM_TYPE_USM) .c_str(), diff --git a/benchmark/cublas/blas3/gemm_batched_strided.cpp b/benchmark/cublas/blas3/gemm_batched_strided.cpp index 5e1121f26..beb81fb4c 100644 --- a/benchmark/cublas/blas3/gemm_batched_strided.cpp +++ b/benchmark/cublas/blas3/gemm_batched_strided.cpp @@ -227,13 +227,11 @@ using cudaComplex = typename std::conditional void run(benchmark::State& state, cublasHandle_t* cuda_handle_ptr, int t1, - int t2, index_t m, index_t k, index_t n, - blas::complex_std alpha, blas::complex_std beta, - index_t batch_size, index_t stride_a_mul, index_t stride_b_mul, - index_t stride_c_mul, bool* success) { + int t2, index_t m, index_t k, index_t n, std::complex alpha, + std::complex beta, index_t batch_size, index_t stride_a_mul, + index_t stride_b_mul, index_t stride_c_mul, bool* success) { // initialize the state label - 
blas_benchmark::utils::set_benchmark_label>( - state); + blas_benchmark::utils::set_benchmark_label>(state); // Standard test setup. std::string t1s = blas_benchmark::utils::from_transpose_enum( @@ -273,11 +271,11 @@ void run(benchmark::State& state, cublasHandle_t* cuda_handle_ptr, int t1, // Matrices (Total size is equal to matrix size x batch_size since we're using // default striding values) - std::vector> a = + std::vector> a = blas_benchmark::utils::random_cplx_data(size_a_batch); - std::vector> b = + std::vector> b = blas_benchmark::utils::random_cplx_data(size_b_batch); - std::vector> c = + std::vector> c = blas_benchmark::utils::const_cplx_data(size_c_batch, 0); blas_benchmark::utils::CUDAVector> a_gpu( @@ -295,7 +293,7 @@ void run(benchmark::State& state, cublasHandle_t* cuda_handle_ptr, int t1, #ifdef BLAS_VERIFY_BENCHMARK // Run a first time with a verification of the results - std::vector> c_ref = c; + std::vector> c_ref = c; for (int batch_idx = 0; batch_idx < batch_size; batch_idx++) { reference_blas::cgemm( t_a, t_b, m, n, k, reinterpret_cast(&alpha), @@ -305,7 +303,7 @@ void run(benchmark::State& state, cublasHandle_t* cuda_handle_ptr, int t1, reinterpret_cast(c_ref.data() + batch_idx * stride_c), ldc); } - std::vector> c_temp = c; + std::vector> c_temp = c; { blas_benchmark::utils::CUDAVector, true> c_temp_gpu( size_c_batch, reinterpret_cast*>(c_temp.data())); @@ -385,21 +383,20 @@ void register_cplx_benchmark(blas_benchmark::Args& args, stride_a_mul, stride_b_mul, stride_c_mul) = p; int t1 = static_cast(blas_benchmark::utils::to_transpose_enum(t1s)); int t2 = static_cast(blas_benchmark::utils::to_transpose_enum(t2s)); - blas::complex_std alpha{alpha_r, alpha_i}; - blas::complex_std beta{beta_r, beta_i}; + std::complex alpha{alpha_r, alpha_i}; + std::complex beta{beta_r, beta_i}; auto BM_lambda = [&](benchmark::State& st, cublasHandle_t* cuda_handle_ptr, int t1, int t2, index_t m, index_t k, index_t n, - blas::complex_std alpha, - 
blas::complex_std beta, index_t batch_size, + std::complex alpha, + std::complex beta, index_t batch_size, index_t strd_a_mul, index_t strd_b_mul, index_t strd_c_mul, bool* success) { run(st, cuda_handle_ptr, t1, t2, m, k, n, alpha, beta, batch_size, strd_a_mul, strd_b_mul, strd_c_mul, success); }; benchmark::RegisterBenchmark( - blas_benchmark::utils::get_name>( + blas_benchmark::utils::get_name>( t1s, t2s, m, k, n, batch_size, stride_a_mul, stride_b_mul, stride_c_mul, blas_benchmark::utils::MEM_TYPE_USM) .c_str(), diff --git a/benchmark/portblas/blas3/gemm.cpp b/benchmark/portblas/blas3/gemm.cpp index 813e1710b..27bb90650 100644 --- a/benchmark/portblas/blas3/gemm.cpp +++ b/benchmark/portblas/blas3/gemm.cpp @@ -180,11 +180,10 @@ void register_benchmark(blas_benchmark::Args& args, #ifdef BLAS_ENABLE_COMPLEX template void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int t1, - int t2, index_t m, index_t k, index_t n, - blas::complex_std alpha, blas::complex_std beta, - bool* success) { + int t2, index_t m, index_t k, index_t n, std::complex alpha, + std::complex beta, bool* success) { // initialize the state label - blas_benchmark::utils::set_benchmark_label>( + blas_benchmark::utils::set_benchmark_label>( state, sb_handle_ptr->get_queue()); // Standard test setup. 
@@ -207,11 +206,11 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int t1, auto q = sb_handle.get_queue(); // Matrices - std::vector> a = + std::vector> a = blas_benchmark::utils::random_cplx_data(m * k); - std::vector> b = + std::vector> b = blas_benchmark::utils::random_cplx_data(k * n); - std::vector> c = + std::vector> c = blas_benchmark::utils::const_cplx_data(m * n, 0); auto a_gpu = @@ -239,7 +238,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int t1, #ifdef BLAS_VERIFY_BENCHMARK // Run a first time with a verification of the results - std::vector> c_ref = c; + std::vector> c_ref = c; reference_blas::cgemm(t_a, t_b, m, n, k, reinterpret_cast(&alpha), reinterpret_cast(a.data()), lda, @@ -247,7 +246,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int t1, reinterpret_cast(&beta), reinterpret_cast(c_ref.data()), ldc); - std::vector> c_temp = c; + std::vector> c_temp = c; { auto c_temp_gpu = @@ -328,19 +327,18 @@ void register_cplx_benchmark(blas::SB_Handle* sb_handle_ptr, bool* success, std::tie(t1s, t2s, m, k, n, alpha_r, alpha_i, beta_r, beta_i) = p; int t1 = static_cast(blas_benchmark::utils::to_transpose_enum(t1s)); int t2 = static_cast(blas_benchmark::utils::to_transpose_enum(t2s)); - blas::complex_std alpha{alpha_r, alpha_i}; - blas::complex_std beta{beta_r, beta_i}; + std::complex alpha{alpha_r, alpha_i}; + std::complex beta{beta_r, beta_i}; auto BM_lambda = [&](benchmark::State& st, blas::SB_Handle* sb_handle_ptr, int t1, int t2, index_t m, index_t k, index_t n, - blas::complex_std alpha, - blas::complex_std beta, bool* success) { + std::complex alpha, + std::complex beta, bool* success) { run(st, sb_handle_ptr, t1, t2, m, k, n, alpha, beta, success); }; benchmark::RegisterBenchmark( - blas_benchmark::utils::get_name>( + blas_benchmark::utils::get_name>( t1s, t2s, m, k, n, mem_type) .c_str(), BM_lambda, sb_handle_ptr, t1, t2, m, k, n, alpha, beta, success) diff --git 
a/benchmark/portblas/blas3/gemm_batched.cpp b/benchmark/portblas/blas3/gemm_batched.cpp index 2e748cec3..3d98c3697 100644 --- a/benchmark/portblas/blas3/gemm_batched.cpp +++ b/benchmark/portblas/blas3/gemm_batched.cpp @@ -251,11 +251,11 @@ void register_benchmark(blas_benchmark::Args& args, #ifdef BLAS_ENABLE_COMPLEX template void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int t1, - int t2, index_t m, index_t k, index_t n, - blas::complex_std alpha, blas::complex_std beta, - index_t batch_size, int batch_type_i, bool* success) { + int t2, index_t m, index_t k, index_t n, std::complex alpha, + std::complex beta, index_t batch_size, int batch_type_i, + bool* success) { // initialize the state label - blas_benchmark::utils::set_benchmark_label>( + blas_benchmark::utils::set_benchmark_label>( state, sb_handle_ptr->get_queue()); // Standard test setup. @@ -279,17 +279,17 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int t1, auto q = sb_handle.get_queue(); // Matrices - std::vector> a = + std::vector> a = blas_benchmark::utils::random_cplx_data(m * k * batch_size); - std::vector> b = + std::vector> b = blas_benchmark::utils::random_cplx_data(k * n * batch_size); - std::vector> c = + std::vector> c = blas_benchmark::utils::const_cplx_data(m * n * batch_size, scalar_t(0)); #ifdef BLAS_VERIFY_BENCHMARK // Run a first time with a verification of the results - std::vector> c_ref = c; + std::vector> c_ref = c; auto _base = [=](index_t dim0, index_t dim1, index_t idx) { return dim0 * dim1 * idx; }; @@ -333,7 +333,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int t1, blas::complex_sycl beta_sycl(beta); #ifdef BLAS_VERIFY_BENCHMARK - std::vector> c_temp = c; + std::vector> c_temp = c; { auto c_temp_gpu = blas::helper::allocate>( @@ -423,20 +423,19 @@ void register_cplx_benchmark( } int t1 = static_cast(blas_benchmark::utils::to_transpose_enum(t1s)); int t2 = 
static_cast(blas_benchmark::utils::to_transpose_enum(t2s)); - blas::complex_std alpha{alpha_r, alpha_i}; - blas::complex_std beta{beta_r, beta_i}; + std::complex alpha{alpha_r, alpha_i}; + std::complex beta{beta_r, beta_i}; auto BM_lambda = [&](benchmark::State& st, blas::SB_Handle* sb_handle_ptr, int t1, int t2, index_t m, index_t k, index_t n, - blas::complex_std alpha, - blas::complex_std beta, index_t batch_size, + std::complex alpha, + std::complex beta, index_t batch_size, int batch_type, bool* success) { run(st, sb_handle_ptr, t1, t2, m, k, n, alpha, beta, batch_size, batch_type, success); }; benchmark::RegisterBenchmark( - blas_benchmark::utils::get_name>( + blas_benchmark::utils::get_name>( t1s, t2s, m, k, n, batch_size, batch_type, mem_type) .c_str(), BM_lambda, sb_handle_ptr, t1, t2, m, k, n, alpha, beta, batch_size, diff --git a/benchmark/portblas/blas3/gemm_batched_strided.cpp b/benchmark/portblas/blas3/gemm_batched_strided.cpp index cb94076dd..a24a2a188 100644 --- a/benchmark/portblas/blas3/gemm_batched_strided.cpp +++ b/benchmark/portblas/blas3/gemm_batched_strided.cpp @@ -221,12 +221,11 @@ void register_benchmark(blas_benchmark::Args& args, #ifdef BLAS_ENABLE_COMPLEX template void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int t1, - int t2, index_t m, index_t k, index_t n, - blas::complex_std alpha, blas::complex_std beta, - index_t batch_size, index_t stride_a_mul, index_t stride_b_mul, - index_t stride_c_mul, bool* success) { + int t2, index_t m, index_t k, index_t n, std::complex alpha, + std::complex beta, index_t batch_size, index_t stride_a_mul, + index_t stride_b_mul, index_t stride_c_mul, bool* success) { // initialize the state label - blas_benchmark::utils::set_benchmark_label>( + blas_benchmark::utils::set_benchmark_label>( state, sb_handle_ptr->get_queue()); // Standard test setup. 
@@ -267,17 +266,17 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int t1, const int size_c_batch = c_size + (batch_size - 1) * stride_c; // Matrices - std::vector> a = + std::vector> a = blas_benchmark::utils::random_cplx_data(size_a_batch); - std::vector> b = + std::vector> b = blas_benchmark::utils::random_cplx_data(size_b_batch); - std::vector> c = + std::vector> c = blas_benchmark::utils::const_cplx_data(size_c_batch, scalar_t(0)); #ifdef BLAS_VERIFY_BENCHMARK // Run a first time with a verification of the results - std::vector> c_ref = c; + std::vector> c_ref = c; for (int batch_idx = 0; batch_idx < batch_size; batch_idx++) { reference_blas::cgemm( t_a, t_b, m, n, k, reinterpret_cast(&alpha), @@ -313,7 +312,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int t1, blas::complex_sycl beta_sycl(beta); #ifdef BLAS_VERIFY_BENCHMARK - std::vector> c_temp = c; + std::vector> c_temp = c; { auto c_temp_gpu = blas::helper::allocate>( @@ -399,13 +398,13 @@ void register_cplx_benchmark( stride_a_mul, stride_b_mul, stride_c_mul) = p; int t1 = static_cast(blas_benchmark::utils::to_transpose_enum(t1s)); int t2 = static_cast(blas_benchmark::utils::to_transpose_enum(t2s)); - blas::complex_std alpha{alpha_r, alpha_i}; - blas::complex_std beta{beta_r, beta_i}; + std::complex alpha{alpha_r, alpha_i}; + std::complex beta{beta_r, beta_i}; auto BM_lambda = [&](benchmark::State& st, blas::SB_Handle* sb_handle_ptr, int t1, int t2, index_t m, index_t k, index_t n, - blas::complex_std alpha, - blas::complex_std beta, index_t batch_size, + std::complex alpha, + std::complex beta, index_t batch_size, index_t stride_a_mul, index_t stride_b_mul, index_t stride_c_mul, bool* success) { run(st, sb_handle_ptr, t1, t2, m, k, n, alpha, beta, @@ -413,8 +412,7 @@ void register_cplx_benchmark( stride_c_mul, success); }; benchmark::RegisterBenchmark( - blas_benchmark::utils::get_name>( + blas_benchmark::utils::get_name>( t1s, t2s, m, k, n, batch_size, 
stride_a_mul, stride_b_mul, stride_c_mul, mem_type) .c_str(), diff --git a/benchmark/rocblas/blas3/gemm.cpp b/benchmark/rocblas/blas3/gemm.cpp index e00f8404c..ca07ba2ba 100644 --- a/benchmark/rocblas/blas3/gemm.cpp +++ b/benchmark/rocblas/blas3/gemm.cpp @@ -204,11 +204,10 @@ using rocComplex = template void run(benchmark::State& state, rocblas_handle& rb_handle, int t_a_i, int t_b_i, index_t m, index_t k, index_t n, - blas::complex_std alpha, blas::complex_std beta, + std::complex alpha, std::complex beta, bool* success) { // initialize the state label - blas_benchmark::utils::set_benchmark_label>( - state); + blas_benchmark::utils::set_benchmark_label>(state); // Standard test setup. std::string t_a = blas_benchmark::utils::from_transpose_enum( @@ -241,11 +240,11 @@ void run(benchmark::State& state, rocblas_handle& rb_handle, int t_a_i, const int c_size = m * n; // Matrices - std::vector> a = + std::vector> a = blas_benchmark::utils::random_cplx_data(a_size); - std::vector> b = + std::vector> b = blas_benchmark::utils::random_cplx_data(b_size); - std::vector> c = + std::vector> c = blas_benchmark::utils::const_cplx_data(c_size, 0); { @@ -259,7 +258,7 @@ void run(benchmark::State& state, rocblas_handle& rb_handle, int t_a_i, #ifdef BLAS_VERIFY_BENCHMARK // Reference gemm - std::vector> c_ref = c; + std::vector> c_ref = c; reference_blas::cgemm( t_a_str, t_b_str, m, n, k, reinterpret_cast(&alpha), reinterpret_cast(a.data()), lda, @@ -268,7 +267,7 @@ void run(benchmark::State& state, rocblas_handle& rb_handle, int t_a_i, reinterpret_cast(c_ref.data()), ldc); // Rocblas verification gemm - std::vector> c_temp = c; + std::vector> c_temp = c; { blas_benchmark::utils::HIPVector, true> c_temp_gpu( c_size, reinterpret_cast*>(c_temp.data())); @@ -347,18 +346,17 @@ void register_cplx_benchmark(blas_benchmark::Args& args, std::tie(t_a, t_b, m, k, n, alpha_r, alpha_i, beta_r, beta_i) = p; int t_a_i = static_cast(blas_benchmark::utils::to_transpose_enum(t_a)); int t_b_i = 
static_cast(blas_benchmark::utils::to_transpose_enum(t_b)); - blas::complex_std alpha{alpha_r, alpha_i}; - blas::complex_std beta{beta_r, beta_i}; + std::complex alpha{alpha_r, alpha_i}; + std::complex beta{beta_r, beta_i}; auto BM_lambda = [&](benchmark::State& st, rocblas_handle rb_handle, int t1i, int t2i, index_t m, index_t k, index_t n, - blas::complex_std alpha, - blas::complex_std beta, bool* success) { + std::complex alpha, + std::complex beta, bool* success) { run(st, rb_handle, t1i, t2i, m, k, n, alpha, beta, success); }; benchmark::RegisterBenchmark( - blas_benchmark::utils::get_name>( + blas_benchmark::utils::get_name>( t_a, t_b, m, k, n, blas_benchmark::utils::MEM_TYPE_USM) .c_str(), BM_lambda, rb_handle, t_a_i, t_b_i, m, k, n, alpha, beta, success) diff --git a/benchmark/rocblas/blas3/gemm_batched.cpp b/benchmark/rocblas/blas3/gemm_batched.cpp index 426205387..40147d5ff 100644 --- a/benchmark/rocblas/blas3/gemm_batched.cpp +++ b/benchmark/rocblas/blas3/gemm_batched.cpp @@ -229,11 +229,10 @@ using rocComplex = template void run(benchmark::State& state, rocblas_handle& rb_handle, index_t t_a_i, index_t t_b_i, index_t m, index_t k, index_t n, - blas::complex_std alpha, blas::complex_std beta, + std::complex alpha, std::complex beta, index_t batch_size, int batch_type_i, bool* success) { // initialize the state label - blas_benchmark::utils::set_benchmark_label>( - state); + blas_benchmark::utils::set_benchmark_label>(state); // Standard setup std::string t_a = blas_benchmark::utils::from_transpose_enum( @@ -271,11 +270,11 @@ void run(benchmark::State& state, rocblas_handle& rb_handle, index_t t_a_i, const int c_size = m * n; // Matrices - std::vector> a = + std::vector> a = blas_benchmark::utils::random_cplx_data(a_size * batch_size); - std::vector> b = + std::vector> b = blas_benchmark::utils::random_cplx_data(b_size * batch_size); - std::vector> c = + std::vector> c = blas_benchmark::utils::const_cplx_data(c_size * batch_size, 0); { @@ -289,7 +288,7 @@ 
void run(benchmark::State& state, rocblas_handle& rb_handle, index_t t_a_i, #ifdef BLAS_VERIFY_BENCHMARK // Reference batched gemm - std::vector> c_ref = c; + std::vector> c_ref = c; for (int batch = 0; batch < batch_size; batch++) { reference_blas::cgemm( t_a_str, t_b_str, m, n, k, reinterpret_cast(&alpha), @@ -300,7 +299,7 @@ void run(benchmark::State& state, rocblas_handle& rb_handle, index_t t_a_i, } // Rocblas verification gemm_batched - std::vector> c_temp = c; + std::vector> c_temp = c; { blas_benchmark::utils::HIPVectorBatched, true> c_temp_gpu(c_size, batch_size, @@ -379,8 +378,8 @@ void register_cplx_benchmark(blas_benchmark::Args& args, int batch_type; std::tie(t_a, t_b, m, k, n, alpha_r, alpha_i, beta_r, beta_i, batch_size, batch_type) = p; - blas::complex_std alpha{alpha_r, alpha_i}; - blas::complex_std beta{beta_r, beta_i}; + std::complex alpha{alpha_r, alpha_i}; + std::complex beta{beta_r, beta_i}; if (batch_type == 1) { std::cerr << "interleaved memory for gemm_batched operator is not " "supported by rocBLAS\n"; @@ -392,15 +391,14 @@ void register_cplx_benchmark(blas_benchmark::Args& args, auto BM_lambda = [&](benchmark::State& st, rocblas_handle rb_handle, int t_a_i, int t_b_i, index_t m, index_t k, index_t n, - blas::complex_std alpha, - blas::complex_std beta, index_t batch_size, + std::complex alpha, + std::complex beta, index_t batch_size, int batch_type, bool* success) { run(st, rb_handle, t_a_i, t_b_i, m, k, n, alpha, beta, batch_size, batch_type, success); }; benchmark::RegisterBenchmark( - blas_benchmark::utils::get_name>( + blas_benchmark::utils::get_name>( t_a, t_b, m, k, n, batch_size, batch_type, blas_benchmark::utils::MEM_TYPE_USM) .c_str(), diff --git a/benchmark/rocblas/blas3/gemm_batched_strided.cpp b/benchmark/rocblas/blas3/gemm_batched_strided.cpp index 41652d9c4..3ecbff82c 100644 --- a/benchmark/rocblas/blas3/gemm_batched_strided.cpp +++ b/benchmark/rocblas/blas3/gemm_batched_strided.cpp @@ -242,12 +242,11 @@ using rocComplex = 
template void run(benchmark::State& state, rocblas_handle& rb_handle, int t_a_i, int t_b_i, index_t m, index_t k, index_t n, - blas::complex_std alpha, blas::complex_std beta, + std::complex alpha, std::complex beta, index_t batch_size, index_t stride_a_mul, index_t stride_b_mul, index_t stride_c_mul, bool* success) { // initialize the state label - blas_benchmark::utils::set_benchmark_label>( - state); + blas_benchmark::utils::set_benchmark_label>(state); // Standard test setup. std::string t_a = blas_benchmark::utils::from_transpose_enum( @@ -294,11 +293,11 @@ void run(benchmark::State& state, rocblas_handle& rb_handle, int t_a_i, const int size_c_batch = c_size + (batch_size - 1) * stride_c; // Matrices - std::vector> a = + std::vector> a = blas_benchmark::utils::random_cplx_data(size_a_batch); - std::vector> b = + std::vector> b = blas_benchmark::utils::random_cplx_data(size_b_batch); - std::vector> c = + std::vector> c = blas_benchmark::utils::const_cplx_data(size_c_batch, 0); { @@ -315,7 +314,7 @@ void run(benchmark::State& state, rocblas_handle& rb_handle, int t_a_i, #ifdef BLAS_VERIFY_BENCHMARK // Reference gemm batched strided (strided loop of gemm) - std::vector> c_ref = c; + std::vector> c_ref = c; for (int batch = 0; batch < batch_size; batch++) { reference_blas::cgemm( t_a_str, t_b_str, m, n, k, reinterpret_cast(&alpha), @@ -326,7 +325,7 @@ void run(benchmark::State& state, rocblas_handle& rb_handle, int t_a_i, } // Rocblas verification gemm_batched_strided - std::vector> c_temp = c; + std::vector> c_temp = c; { blas_benchmark::utils::HIPVectorBatchedStrided, true> c_temp_gpu(c_size, batch_size, stride_c, @@ -412,21 +411,20 @@ void register_cplx_benchmark(blas_benchmark::Args& args, stride_a_mul, stride_b_mul, stride_c_mul) = p; int t_a_i = static_cast(blas_benchmark::utils::to_transpose_enum(t_a)); int t_b_i = static_cast(blas_benchmark::utils::to_transpose_enum(t_b)); - blas::complex_std alpha{alpha_r, alpha_i}; - blas::complex_std beta{beta_r, 
beta_i}; + std::complex alpha{alpha_r, alpha_i}; + std::complex beta{beta_r, beta_i}; auto BM_lambda = [&](benchmark::State& st, rocblas_handle rb_handle, int t1i, int t2i, index_t m, index_t k, index_t n, - blas::complex_std alpha, - blas::complex_std beta, index_t batch_size, + std::complex alpha, + std::complex beta, index_t batch_size, index_t strd_a_mul, index_t strd_b_mul, index_t strd_c_mul, bool* success) { run(st, rb_handle, t1i, t2i, m, k, n, alpha, beta, batch_size, strd_a_mul, strd_b_mul, strd_c_mul, success); }; benchmark::RegisterBenchmark( - blas_benchmark::utils::get_name>( + blas_benchmark::utils::get_name>( t_a, t_b, m, k, n, batch_size, stride_a_mul, stride_b_mul, stride_c_mul, blas_benchmark::utils::MEM_TYPE_USM) .c_str(), diff --git a/common/include/common/blas3_state_counters.hpp b/common/include/common/blas3_state_counters.hpp index 18c9ce607..68e332773 100644 --- a/common/include/common/blas3_state_counters.hpp +++ b/common/include/common/blas3_state_counters.hpp @@ -83,8 +83,8 @@ inline typename std::enable_if::type init_level_3_cplx_counters( benchmark::State& state, - blas::complex_std beta = blas::complex_std(0, 0), - index_t m = 0, index_t n = 0, index_t k = 0, index_t batch_size = 1, + std::complex beta = std::complex(0, 0), index_t m = 0, + index_t n = 0, index_t k = 0, index_t batch_size = 1, index_t stride_a_mul = 1, index_t stride_b_mul = 1, index_t stride_c_mul = 1) { // Google-benchmark counters are double. 
diff --git a/common/include/common/common_utils.hpp b/common/include/common/common_utils.hpp index cc00f3e1a..d55b6ad15 100644 --- a/common/include/common/common_utils.hpp +++ b/common/include/common/common_utils.hpp @@ -513,8 +513,8 @@ static inline std::vector> get_blas3_cplx_params( std::vector> blas3_default; constexpr index_t dmin = 32, dmax = 8192; std::vector dtranspose = {"n", "t"}; - blas::complex_std alpha{1, 1}; - blas::complex_std beta{1, 1}; + std::complex alpha{1, 1}; + std::complex beta{1, 1}; for (std::string& t1 : dtranspose) { for (std::string& t2 : dtranspose) { for (index_t m = dmin; m <= dmax; m *= 8) { @@ -564,8 +564,8 @@ get_gemm_batched_strided_cplx_params(Args& args) { gemm_batched_strided_default; constexpr index_t dmin = 128, dmax = 8192; std::vector dtranspose = {"n", "t"}; - blas::complex_std alpha{1, 1}; - blas::complex_std beta{1, 1}; + std::complex alpha{1, 1}; + std::complex beta{1, 1}; index_t batch_size = 8; for (std::string& t1 : dtranspose) { for (std::string& t2 : dtranspose) { @@ -613,8 +613,8 @@ get_gemm_cplx_batched_params(Args& args) { std::vector> gemm_batched_default; constexpr index_t dmin = 128, dmax = 8192; std::vector dtranspose = {"n", "t"}; - blas::complex_std alpha{1, 1}; - blas::complex_std beta{1, 1}; + std::complex alpha{1, 1}; + std::complex beta{1, 1}; index_t batch_size = 8; int batch_type = 0; for (std::string& t1 : dtranspose) { @@ -1440,11 +1440,11 @@ inline std::string get_type_name() { #ifdef BLAS_ENABLE_COMPLEX template <> -inline std::string get_type_name>() { +inline std::string get_type_name>() { return "complex"; } template <> -inline std::string get_type_name>() { +inline std::string get_type_name>() { return "complex"; } #endif