From ed81360df447db432b178ec6276b2927156acecb Mon Sep 17 00:00:00 2001 From: Ouadie EL FAROUKI Date: Tue, 12 Dec 2023 18:39:01 +0000 Subject: [PATCH 1/5] Updated transpose & transpose add cpu configs --- src/interface/extension/backend/default_cpu.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/interface/extension/backend/default_cpu.hpp b/src/interface/extension/backend/default_cpu.hpp index d8a2f6c24..ba714e78c 100644 --- a/src/interface/extension/backend/default_cpu.hpp +++ b/src/interface/extension/backend/default_cpu.hpp @@ -37,12 +37,12 @@ typename sb_handle_t::event_t _transpose_outplace( container_0_t in_, index_t _ld_in, index_t _inc_in, index_t _stride_in, container_1_t out_, index_t _ld_out, index_t _inc_out, index_t _stride_out, index_t _batch_size, const typename sb_handle_t::event_t& _dependencies) { - if (_M * _N < (1 << 20)) { + if (_M * _N < (1 << 16)) { return blas::internal::_transpose_outplace_impl<16, 64, 64, false>( sb_handle, _M, _N, _alpha, in_, _ld_in, _inc_in, _stride_in, out_, _ld_out, _inc_out, _stride_out, _batch_size, _dependencies); } else { - return blas::internal::_transpose_outplace_impl<32, 128, 64, false>( + return blas::internal::_transpose_outplace_impl<32, 32, 64, false>( sb_handle, _M, _N, _alpha, in_, _ld_in, _inc_in, _stride_in, out_, _ld_out, _inc_out, _stride_out, _batch_size, _dependencies); } @@ -58,13 +58,13 @@ typename sb_handle_t::event_t _transpose_add( index_t _b_rows, index_t _b_cols, index_t _stride_b, container_2_t c_, index_t _ld_c, index_t _stride_c, index_t _batch_size, const typename sb_handle_t::event_t& _dependencies) { - if (_M * _N < (1 << 20)) { + if (_M * _N < (1 << 16)) { return blas::internal::_transpose_add_impl( sb_handle, _M, _N, _alpha, a_, _ld_a, _a_rows, _a_cols, _stride_a, _beta, b_, _ld_b, _b_rows, _b_cols, _stride_b, c_, _ld_c, _stride_c, _batch_size, _dependencies); } else { - return blas::internal::_transpose_add_impl( + return blas::internal::_transpose_add_impl( sb_handle, _M, _N, _alpha, a_, _ld_a, _a_rows, _a_cols, _stride_a, _beta, b_, _ld_b, _b_rows, _b_cols, _stride_b, c_, _ld_c, _stride_c, _batch_size, _dependencies); From d7b8d69cef2157803b4711234bbb2ef5620f1134 Mon Sep 17 00:00:00 2001 From: Ouadie EL FAROUKI Date: Tue, 12 Dec 2023 18:39:36 +0000 Subject: [PATCH 2/5] Reduced omatcopy and omatadd benchmarks cases --- common/include/common/common_utils.hpp | 76 +++++++++++--------------- 1 file changed, 32 insertions(+), 44 deletions(-) diff --git a/common/include/common/common_utils.hpp b/common/include/common/common_utils.hpp index fc4562aa8..6cb967cc3 100644 --- a/common/include/common/common_utils.hpp +++ b/common/include/common/common_utils.hpp @@ -1242,15 +1242,13 @@ static inline std::vector> get_matcopy_params( std::vector> matcopy_default; constexpr index_t dmin = 64, dmax = 8192; constexpr scalar_t alpha{2}; + constexpr index_t lda_mul = 1; + constexpr index_t ldb_mul = 1; for (char trans : {'n', 't'}) { for (index_t m = dmin; m <= dmax; m *= 2) { for (index_t n = dmin; n <= dmax; n *= 2) { - for (index_t lda_mul = 1; lda_mul < 2; ++lda_mul) { - for (index_t ldb_mul = 1; ldb_mul < 2; ++ldb_mul) { - matcopy_default.push_back( - std::make_tuple(trans, m, n, alpha, lda_mul, ldb_mul)); - } - } + matcopy_default.push_back( + std::make_tuple(trans, m, n, alpha, lda_mul, ldb_mul)); } } } @@ -1287,17 +1285,15 @@ static inline std::vector> get_omatcopy2_params( std::vector> omatcopy2_default; constexpr index_t dmin = 1024, dmax = 8192; constexpr scalar_t alpha{2}; + constexpr index_t lda_mul = 1; + constexpr index_t ldb_mul = 1; for (char trans : {'n', 't'}) { for (index_t m = dmin; m <= dmax; m *= 2) { for (index_t n = dmin; n <= dmax; n *= 2) { - for (index_t lda_mul = 1; lda_mul < 2; ++lda_mul) { - for (index_t inc_a = 1; inc_a < 3; ++inc_a) { - for (index_t ldb_mul = 1; ldb_mul < 2; ++ldb_mul) { - for (index_t inc_b = 1; inc_b < 3; ++inc_b) { - omatcopy2_default.push_back(std::make_tuple( - trans, m, n, alpha, lda_mul, ldb_mul, inc_a, inc_b)); - } - } + for (index_t inc_a = 1; inc_a < 3; ++inc_a) { + for (index_t inc_b = 1; inc_b < 3; ++inc_b) { + omatcopy2_default.push_back(std::make_tuple( + trans, m, n, alpha, lda_mul, ldb_mul, inc_a, inc_b)); } } } @@ -1336,21 +1332,20 @@ get_matcopy_batch_params(Args& args) { if (args.csv_param.empty()) { warning_no_csv(); std::vector> matcopy_batch_default; - constexpr index_t dmin = 256, dmax = 8192; + constexpr index_t dmin = 256, dmax = 4096; constexpr scalar_t alpha{2}; constexpr index_t batch_size{3}; constexpr index_t stride_a_mul{1}; constexpr index_t stride_b_mul{1}; + constexpr index_t lda_mul = 1; + constexpr index_t ldb_mul = 1; + constexpr index_t ldc_mul = 1; for (char trans : {'n', 't'}) { for (index_t m = dmin; m <= dmax; m *= 2) { for (index_t n = dmin; n <= dmax; n *= 2) { - for (index_t lda_mul = 1; lda_mul < 2; ++lda_mul) { - for (index_t ldb_mul = 1; ldb_mul < 2; ++ldb_mul) { - matcopy_batch_default.push_back( - std::make_tuple(trans, m, n, alpha, lda_mul, ldb_mul, - stride_a_mul, stride_b_mul, batch_size)); - } - } + matcopy_batch_default.push_back( + std::make_tuple(trans, m, n, alpha, lda_mul, ldb_mul, + stride_a_mul, stride_b_mul, batch_size)); } } } @@ -1386,22 +1381,19 @@ static inline std::vector> get_omatadd_params( if (args.csv_param.empty()) { warning_no_csv(); std::vector> omatadd_default; - constexpr index_t dmin = 64, dmax = 8192; + constexpr index_t dmin = 64, dmax = 4096; constexpr scalar_t alpha{2}; constexpr scalar_t beta{2}; + constexpr index_t lda_mul = 1; + constexpr index_t ldb_mul = 1; + constexpr index_t ldc_mul = 1; for (char trans_a : {'n', 't'}) { for (char trans_b : {'n', 't'}) { for (index_t m = dmin; m <= dmax; m *= 2) { for (index_t n = dmin; n <= dmax; n *= 2) { - for (index_t lda_mul = 1; lda_mul < 2; ++lda_mul) { - for (index_t ldb_mul = 1; ldb_mul < 2; ++ldb_mul) { - for (index_t ldc_mul = 1; ldc_mul < 2; ++ldc_mul) { - omatadd_default.push_back( - std::make_tuple(trans_a, trans_b, m, n, alpha, beta, - lda_mul, ldb_mul, ldc_mul)); - } - } - } + omatadd_default.push_back(std::make_tuple(trans_a, trans_b, m, n, + alpha, beta, lda_mul, + ldb_mul, ldc_mul)); } } } @@ -1439,27 +1431,23 @@ get_omatadd_batch_params(Args& args) { if (args.csv_param.empty()) { warning_no_csv(); std::vector> omatadd_batch_default; - constexpr index_t dmin = 256, dmax = 8192; + constexpr index_t dmin = 1024, dmax = 4096; constexpr scalar_t alpha{2}; constexpr scalar_t beta{2}; constexpr index_t batch_size{3}; constexpr index_t stride_a_mul{1}; constexpr index_t stride_b_mul{1}; constexpr index_t stride_c_mul{1}; + constexpr index_t lda_mul = 1; + constexpr index_t ldb_mul = 1; + constexpr index_t ldc_mul = 1; for (char trans_a : {'n', 't'}) { - for (char trans_b : {'n', 't'}) { + for (char trans_b : {'n'}) { for (index_t m = dmin; m <= dmax; m *= 2) { for (index_t n = dmin; n <= dmax; n *= 2) { - for (index_t lda_mul = 1; lda_mul < 2; ++lda_mul) { - for (index_t ldb_mul = 1; ldb_mul < 2; ++ldb_mul) { - for (index_t ldc_mul = 1; ldc_mul < 2; ++ldc_mul) { - omatadd_batch_default.push_back( - std::make_tuple(trans_a, trans_b, m, n, alpha, beta, - lda_mul, ldb_mul, ldc_mul, stride_a_mul, - stride_b_mul, stride_c_mul, batch_size)); - } - } - } + omatadd_batch_default.push_back(std::make_tuple( + trans_a, trans_b, m, n, alpha, beta, lda_mul, ldb_mul, ldc_mul, + stride_a_mul, stride_b_mul, stride_c_mul, batch_size)); } } } From c6f2d0fb9ffb03ca134a25e2a96322d020af7c31 Mon Sep 17 00:00:00 2001 From: Ouadie EL FAROUKI Date: Thu, 21 Dec 2023 22:02:05 +0000 Subject: [PATCH 3/5] removed extra attributes from transpose kernels --- include/operations/extension/transpose.h | 16 ---------------- src/operations/extension/transpose.hpp | 22 ++++++++++++---------- 2 files changed, 12 insertions(+), 26 deletions(-) diff --git a/include/operations/extension/transpose.h b/include/operations/extension/transpose.h index 71921b539..b684f0b01 100644 --- a/include/operations/extension/transpose.h +++ b/include/operations/extension/transpose.h @@ -80,11 +80,6 @@ class Transpose { static constexpr const index_t inner_tile_size_ = wg_size / Tile_size; static constexpr const index_t inner_tile_count_ = Tile_size / inner_tile_size_; - // Minimum number of Tile-mutliple rows & columns to cover the matrices - index_t M_pad_; - index_t N_pad_; - // Total size of Tile-mutliple covering matrix - index_t size_pad_; // Batch size when using batched transpose index_t batch_size_; // Number of contiguous elements to be used in local memory to avoid bank @@ -115,9 +110,6 @@ class Transpose { stride_a_(stride_a), stride_at_(stride_at), inc_at_(inc_at), - M_pad_(tile_count_m_ * Tile_size), - N_pad_(tile_count_n_ * Tile_size), - size_pad_(M_pad_ * N_pad_), batch_size_(batch_size) {} index_t get_size() const; @@ -209,11 +201,6 @@ class TransposeAdd { static constexpr const index_t inner_tile_size_ = wg_size / Tile_size; static constexpr const index_t inner_tile_count_ = Tile_size / inner_tile_size_; - // Minimum number of Tile-mutliple rows & columns to cover the output matrix - index_t M_pad_; - index_t N_pad_; - // Total size of Tile-mutliple covering matrix - index_t size_pad_; // Batch size when using batched transpose index_t batch_size_; // Number of contiguous elements to be used in local memory to avoid bank @@ -246,9 +233,6 @@ class TransposeAdd { tile_count_m_((M_ - 1) / Tile_size + 1), tile_count_n_((N_ - 1) / Tile_size + 1), tile_count_total_(tile_count_m_ * tile_count_n_), - M_pad_(tile_count_m_ * Tile_size), - N_pad_(tile_count_n_ * Tile_size), - size_pad_(M_pad_ * N_pad_), batch_size_(batch_size) {} index_t get_size() const; diff --git a/src/operations/extension/transpose.hpp b/src/operations/extension/transpose.hpp index 98ecf8e03..87485660e 100644 --- a/src/operations/extension/transpose.hpp +++ b/src/operations/extension/transpose.hpp @@ -55,7 +55,7 @@ Transpose::get_size() const { // Smallest TileSize square-multiple containing input/output matrices times // batch_size - return (size_pad_ * batch_size_); + return (tile_count_total_ * Tile_size * Tile_size * batch_size_); } template ::get_size() const { // Smallest TileSize square-multiple containing input/output matrices - return (size_pad_ * batch_size_); + return (tile_count_total_ * Tile_size * Tile_size * batch_size_); } template [0,N_], B & - *C otherwise -> [0,M_]) - * @param j [output] the global col-index (A & B when both_trans -> [0,M_], B & - *C otherwise -> [0,N_]) + * @param i [output] the global row-index (A & B when both_trans -> [0,N_], B + *& C otherwise -> [0,M_]) + * @param j [output] the global col-index (A & B when both_trans -> [0,M_], B + *& C otherwise -> [0,N_]) */ template Date: Mon, 1 Jan 2024 13:29:53 +0000 Subject: [PATCH 4/5] Reduced matcopy-batched test cases sizes --- test/unittest/extension/omatadd_batched_test.cpp | 8 ++++---- test/unittest/extension/omatcopy_batched_test.cpp | 12 ++++++------ 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/test/unittest/extension/omatadd_batched_test.cpp b/test/unittest/extension/omatadd_batched_test.cpp index d0ce57751..f65a97c92 100644 --- a/test/unittest/extension/omatadd_batched_test.cpp +++ b/test/unittest/extension/omatadd_batched_test.cpp @@ -160,15 +160,15 @@ const auto combi = ::testing::Values('n', 't'), // trans_b ::testing::Values(64, 129, 255), // m ::testing::Values(64, 129, 255), // n - ::testing::Values(2.5), // alpha - ::testing::Values(3.5), // beta + ::testing::Values(2.5), // alpha + ::testing::Values(3.5), // beta ::testing::Values(1, 2), // lda_mul ::testing::Values(1, 2), // ldb_mul - ::testing::Values(1, 2, 3), // ldc_mul + ::testing::Values(3), // ldc_mul ::testing::Values(1, 3), // stride_a_m ::testing::Values(1, 3), // stride_b_m ::testing::Values(1, 3), // stride_c_m - ::testing::Values(2, 3)); // batch_size + ::testing::Values(3)); // batch_size #endif template diff --git a/test/unittest/extension/omatcopy_batched_test.cpp b/test/unittest/extension/omatcopy_batched_test.cpp index e15b8405e..4f6569653 100644 --- a/test/unittest/extension/omatcopy_batched_test.cpp +++ b/test/unittest/extension/omatcopy_batched_test.cpp @@ -121,7 +121,7 @@ const auto combi = ::testing::Values('n', 't'), // trans ::testing::Values(1024, 4050, 16380), // m ::testing::Values(1024, 4050, 16380), // n - ::testing::Values(0, 1.05, -20.01), // alpha + ::testing::Values(1.05, -20.01), // alpha ::testing::Values(3, 5), // ld_in_m ::testing::Values(3, 5), // ld_out_m ::testing::Values(5, 10), // stride_in_m @@ -134,12 +134,12 @@ const auto combi = ::testing::Values('n', 't'), // trans ::testing::Values(64, 129, 255), // m ::testing::Values(64, 129, 255), // n - ::testing::Values(0, 2.5), // alpha - ::testing::Values(1, 2, 3), // ld_in_m - ::testing::Values(1, 2, 3), // ld_out_m + ::testing::Values(2.5), // alpha + ::testing::Values(1, 3), // ld_in_m + ::testing::Values(1, 3), // ld_out_m ::testing::Values(1, 3), // stride_in_m - ::testing::Values(1, 3), // stride_out_m - ::testing::Values(1, 2, 5)); // batch_size + ::testing::Values(1, 3), // stride_out_m + ::testing::Values(1, 5)); // batch_size #endif template From eba77f16a1877c5bb439242b5c63d0c7a70544ad Mon Sep 17 00:00:00 2001 From: Ouadie EL FAROUKI Date: Mon, 1 Jan 2024 13:32:38 +0000 Subject: [PATCH 5/5] Formatting to extension tests --- test/unittest/extension/omatadd_test.cpp | 36 ++++++++++++---------- test/unittest/extension/omatcopy2_test.cpp | 23 ++++++++------ test/unittest/extension/omatcopy_test.cpp | 12 ++++---- test/unittest/extension/reduction_test.cpp | 36 +++++++++++++--------- test/unittest/extension/transpose_test.cpp | 9 +++--- 5 files changed, 65 insertions(+), 51 deletions(-) diff --git a/test/unittest/extension/omatadd_test.cpp b/test/unittest/extension/omatadd_test.cpp index 5c4ac6958..56ef0fca3 100644 --- a/test/unittest/extension/omatadd_test.cpp +++ b/test/unittest/extension/omatadd_test.cpp @@ -27,8 +27,8 @@ #include "extension_reference.hpp" template -using combination_t = std::tuple; +using combination_t = std::tuple; template void run_test(const combination_t combi) { @@ -37,8 +37,8 @@ void run_test(const combination_t combi) { index_t m, n, ld_a_mul, ld_b_mul, ld_c_mul; scalar_t alpha, beta; - std::tie(alloc, trans_a, trans_b, m, n, alpha, beta, ld_a_mul, ld_b_mul, ld_c_mul) = - combi; + std::tie(alloc, trans_a, trans_b, m, n, alpha, beta, ld_a_mul, ld_b_mul, + ld_c_mul) = combi; auto q = make_queue(); blas::SB_Handle sb_handle(q); @@ -70,12 +70,16 @@ void run_test(const combination_t combi) { auto m_b_gpu = helper::allocate(size_m_b, q); auto m_c_gpu = helper::allocate(size_m_c, q); - auto copy_m_a = helper::copy_to_device(q, A.data(), m_a_gpu, size_m_a); - auto copy_m_b = helper::copy_to_device(q, B.data(), m_b_gpu, size_m_b); - auto copy_m_c = helper::copy_to_device(q, C.data(), m_c_gpu, size_m_c); + auto copy_m_a = + helper::copy_to_device(q, A.data(), m_a_gpu, size_m_a); + auto copy_m_b = + helper::copy_to_device(q, B.data(), m_b_gpu, size_m_b); + auto copy_m_c = + helper::copy_to_device(q, C.data(), m_c_gpu, size_m_c); - auto omatadd_event = blas::_omatadd(sb_handle, trans_a, trans_b, m, n, alpha, m_a_gpu, lda, beta, - m_b_gpu, ldb, m_c_gpu, ldc, {copy_m_a, copy_m_b, copy_m_c}); + auto omatadd_event = blas::_omatadd(sb_handle, trans_a, trans_b, m, n, alpha, + m_a_gpu, lda, beta, m_b_gpu, ldb, m_c_gpu, + ldc, {copy_m_a, copy_m_b, copy_m_c}); sb_handle.wait(omatadd_event); auto event = blas::helper::copy_to_host( @@ -98,8 +102,8 @@ void run_test(const combination_t combi) { index_t m, n, ld_a_mul, ld_b_mul, ld_c_mul; scalar_t alpha, beta; - std::tie(alloc, trans_a, trans_b, m, n, alpha, beta, ld_a_mul, ld_b_mul, ld_c_mul) = - combi; + std::tie(alloc, trans_a, trans_b, m, n, alpha, beta, ld_a_mul, ld_b_mul, + ld_c_mul) = combi; if (alloc == "usm") { #ifdef SB_ENABLE_USM @@ -127,9 +131,9 @@ const auto combi = #else template const auto combi = - ::testing::Combine(::testing::Values("usm", "buf"), // allocation type - ::testing::Values('n', 't'), // trans_a - ::testing::Values('n', 't'), // trans_b + ::testing::Combine(::testing::Values("usm", "buf"), // allocation type + ::testing::Values('n', 't'), // trans_a + ::testing::Values('n', 't'), // trans_b ::testing::Values(64, 129, 255), // m ::testing::Values(64, 129, 255), // n ::testing::Values(0, 1, 2), // alpha @@ -146,8 +150,8 @@ static std::string generate_name( char trans_a, trans_b; index_t m, n, lda_mul, ldb_mul, ldc_mul; T alpha, beta; - BLAS_GENERATE_NAME(info.param, alloc, trans_a, trans_b, m, n, alpha, beta, lda_mul, - ldb_mul, ldc_mul); + BLAS_GENERATE_NAME(info.param, alloc, trans_a, trans_b, m, n, alpha, beta, + lda_mul, ldb_mul, ldc_mul); } BLAS_REGISTER_TEST_ALL(OmatAdd, combination_t, combi, generate_name); diff --git a/test/unittest/extension/omatcopy2_test.cpp b/test/unittest/extension/omatcopy2_test.cpp index bae7ba0e4..b29122b74 100644 --- a/test/unittest/extension/omatcopy2_test.cpp +++ b/test/unittest/extension/omatcopy2_test.cpp @@ -27,8 +27,8 @@ #include "extension_reference.hpp" template -using combination_t = std::tuple; +using combination_t = std::tuple; template void run_test(const combination_t combi) { @@ -37,7 +37,8 @@ void run_test(const combination_t combi) { index_t m, n, inc_in, ld_in_m, inc_out, ld_out_m; scalar_t alpha; - std::tie(alloc, trans, m, n, alpha, inc_in, ld_in_m, inc_out, ld_out_m) = combi; + std::tie(alloc, trans, m, n, alpha, inc_in, ld_in_m, inc_out, ld_out_m) = + combi; // Leading dimensions are computed as multiples of the minimum value specified // in the oneMKL documentation at : @@ -77,8 +78,9 @@ void run_test(const combination_t combi) { auto copy_out = helper::copy_to_device(q, B.data(), matrix_out, m_b_size); - auto omatcopy2_event = blas::_omatcopy2(sb_handle, trans, m, n, alpha, matrix_in, ld_in, inc_in, - matrix_out, ld_out, inc_out, {copy_in, copy_out}); + auto omatcopy2_event = + blas::_omatcopy2(sb_handle, trans, m, n, alpha, matrix_in, ld_in, inc_in, + matrix_out, ld_out, inc_out, {copy_in, copy_out}); sb_handle.wait(omatcopy2_event); @@ -101,7 +103,8 @@ void run_test(const combination_t combi) { index_t m, n, inc_in, ld_in_m, inc_out, ld_out_m; scalar_t alpha; - std::tie(alloc, trans, m, n, alpha, inc_in, ld_in_m, inc_out, ld_out_m) = combi; + std::tie(alloc, trans, m, n, alpha, inc_in, ld_in_m, inc_out, ld_out_m) = + combi; if (alloc == "usm") { #ifdef SB_ENABLE_USM @@ -129,8 +132,8 @@ const auto combi = #else template const auto combi = - ::testing::Combine(::testing::Values("usm", "buf"), // allocation type - ::testing::Values('n', 't'), // trans + ::testing::Combine(::testing::Values("usm", "buf"), // allocation type + ::testing::Values('n', 't'), // trans ::testing::Values(64, 129, 255), // m ::testing::Values(64, 129, 255), // n ::testing::Values(0, 2), // alpha @@ -147,8 +150,8 @@ static std::string generate_name( char trans; index_t m, n, inc_in, ld_in_m, inc_out, ld_out_m; T alpha; - BLAS_GENERATE_NAME(info.param, alloc, trans, m, n, alpha, inc_in, ld_in_m, inc_out, - ld_out_m); + BLAS_GENERATE_NAME(info.param, alloc, trans, m, n, alpha, inc_in, ld_in_m, + inc_out, ld_out_m); } BLAS_REGISTER_TEST_ALL(OmatCopy2, combination_t, combi, generate_name); diff --git a/test/unittest/extension/omatcopy_test.cpp b/test/unittest/extension/omatcopy_test.cpp index a7a1e49a8..c7737f1a3 100644 --- a/test/unittest/extension/omatcopy_test.cpp +++ b/test/unittest/extension/omatcopy_test.cpp @@ -69,8 +69,9 @@ void run_test(const combination_t combi) { auto copy_out = helper::copy_to_device(q, B.data(), matrix_out, size_b); - auto omatcopy_event = blas::_omatcopy(sb_handle, trans, m, n, alpha, matrix_in, ld_in, matrix_out, - ld_out, {copy_in, copy_out}); + auto omatcopy_event = + blas::_omatcopy(sb_handle, trans, m, n, alpha, matrix_in, ld_in, + matrix_out, ld_out, {copy_in, copy_out}); sb_handle.wait(omatcopy_event); @@ -106,7 +107,6 @@ void run_test(const combination_t combi) { } } - #ifdef STRESS_TESTING template const auto combi = @@ -120,11 +120,11 @@ const auto combi = #else template const auto combi = - ::testing::Combine(::testing::Values("usm", "buf"), // allocation type - ::testing::Values('n', 't'), // trans + ::testing::Combine(::testing::Values("usm", "buf"), // allocation type + ::testing::Values('n', 't'), // trans ::testing::Values(64, 129, 255), // m ::testing::Values(64, 129, 255), // n - ::testing::Values(0, 1, 2), // alpha + ::testing::Values(0, 1, 2), // alpha ::testing::Values(1, 2, 3), // ld_in_m ::testing::Values(1, 2, 3)); // ld_out_m #endif diff --git a/test/unittest/extension/reduction_test.cpp b/test/unittest/extension/reduction_test.cpp index 8d98a9425..7dc84d5c1 100644 --- a/test/unittest/extension/reduction_test.cpp +++ b/test/unittest/extension/reduction_test.cpp @@ -71,7 +71,8 @@ static std::string generate_name( operator_t op; reduction_dim_t reductionDim; T unused; - BLAS_GENERATE_NAME(info.param, alloc, rows, cols, ldMul, op, reductionDim, unused); + BLAS_GENERATE_NAME(info.param, alloc, rows, cols, ldMul, op, reductionDim, + unused); } template @@ -173,9 +174,9 @@ void run_test(const combination_t combi) { if (op == operator_t::Mean) { const auto nelems = reduction_dim == reduction_dim_t::outer ? cols : rows; std::transform(out_v_cpu.begin(), out_v_cpu.end(), out_v_cpu.begin(), - [=](scalar_t val) -> scalar_t { - return val / static_cast(nelems); - }); + [=](scalar_t val) -> scalar_t { + return val / static_cast(nelems); + }); } auto m_in_gpu = @@ -183,37 +184,43 @@ void run_test(const combination_t combi) { auto v_out_gpu = blas::helper::allocate(out_size, q); // out_v_gpu - auto copy_m = blas::helper::copy_to_device(q, in_m.data(), - m_in_gpu, ld * cols); + auto copy_m = blas::helper::copy_to_device(q, in_m.data(), m_in_gpu, + ld * cols); auto copy_v = blas::helper::copy_to_device(q, out_v_gpu.data(), - v_out_gpu, out_size); + v_out_gpu, out_size); blas::SB_Handle::event_t ev; try { switch (op) { case operator_t::Add: ev = extension::_reduction( - sb_handle, m_in_gpu, ld, v_out_gpu, rows, cols, reduction_dim, {copy_m, copy_v}); + sb_handle, m_in_gpu, ld, v_out_gpu, rows, cols, reduction_dim, + {copy_m, copy_v}); break; case operator_t::Product: ev = extension::_reduction( - sb_handle, m_in_gpu, ld, v_out_gpu, rows, cols, reduction_dim, {copy_m, copy_v}); + sb_handle, m_in_gpu, ld, v_out_gpu, rows, cols, reduction_dim, + {copy_m, copy_v}); break; case operator_t::Max: ev = extension::_reduction( - sb_handle, m_in_gpu, ld, v_out_gpu, rows, cols, reduction_dim, {copy_m, copy_v}); + sb_handle, m_in_gpu, ld, v_out_gpu, rows, cols, reduction_dim, + {copy_m, copy_v}); break; case operator_t::Min: ev = extension::_reduction( - sb_handle, m_in_gpu, ld, v_out_gpu, rows, cols, reduction_dim, {copy_m, copy_v}); + sb_handle, m_in_gpu, ld, v_out_gpu, rows, cols, reduction_dim, + {copy_m, copy_v}); break; case operator_t::AbsoluteAdd: ev = extension::_reduction( - sb_handle, m_in_gpu, ld, v_out_gpu, rows, cols, reduction_dim, {copy_m, copy_v}); + sb_handle, m_in_gpu, ld, v_out_gpu, rows, cols, reduction_dim, + {copy_m, copy_v}); break; case operator_t::Mean: ev = extension::_reduction( - sb_handle, m_in_gpu, ld, v_out_gpu, rows, cols, reduction_dim, {copy_m, copy_v}); + sb_handle, m_in_gpu, ld, v_out_gpu, rows, cols, reduction_dim, + {copy_m, copy_v}); break; } } catch (cl::sycl::exception& e) { @@ -253,5 +260,4 @@ void run_test(const combination_t combi) { } } -BLAS_REGISTER_TEST_ALL(ReductionPartial, combination_t, combi, - generate_name); +BLAS_REGISTER_TEST_ALL(ReductionPartial, combination_t, combi, generate_name); diff --git a/test/unittest/extension/transpose_test.cpp b/test/unittest/extension/transpose_test.cpp index d3c6a0163..1167c1ddb 100644 --- a/test/unittest/extension/transpose_test.cpp +++ b/test/unittest/extension/transpose_test.cpp @@ -70,8 +70,9 @@ void run_test(const combination_t& combi) { auto copy_out = helper::copy_to_device(q, B.data(), matrix_out, size_b); - auto trans_event = blas::extension::_transpose(sb_handle, m, n, matrix_in, ld_in, - matrix_out, ld_out, {copy_in, copy_out}); + auto trans_event = blas::extension::_transpose( + sb_handle, m, n, matrix_in, ld_in, matrix_out, ld_out, + {copy_in, copy_out}); sb_handle.wait(trans_event); @@ -114,8 +115,8 @@ void run_test(const combination_t combi) { template const auto combi = - ::testing::Combine(::testing::Values("usm", "buf"), // allocation type - ::testing::Values('i', 'o'), // Inplace | Outplace + ::testing::Combine(::testing::Values("usm", "buf"), // allocation type + ::testing::Values('i', 'o'), // Inplace | Outplace ::testing::Values(64, 129, 255), // m ::testing::Values(64, 129, 255), // n ::testing::Values(1, 2, 3), // ld_in_m