diff --git a/include/operations/extension/transpose.h b/include/operations/extension/transpose.h index 71921b539..b684f0b01 100644 --- a/include/operations/extension/transpose.h +++ b/include/operations/extension/transpose.h @@ -80,11 +80,6 @@ class Transpose { static constexpr const index_t inner_tile_size_ = wg_size / Tile_size; static constexpr const index_t inner_tile_count_ = Tile_size / inner_tile_size_; - // Minimum number of Tile-mutliple rows & columns to cover the matrices - index_t M_pad_; - index_t N_pad_; - // Total size of Tile-mutliple covering matrix - index_t size_pad_; // Batch size when using batched transpose index_t batch_size_; // Number of contiguous elements to be used in local memory to avoid bank @@ -115,9 +110,6 @@ class Transpose { stride_a_(stride_a), stride_at_(stride_at), inc_at_(inc_at), - M_pad_(tile_count_m_ * Tile_size), - N_pad_(tile_count_n_ * Tile_size), - size_pad_(M_pad_ * N_pad_), batch_size_(batch_size) {} index_t get_size() const; @@ -209,11 +201,6 @@ class TransposeAdd { static constexpr const index_t inner_tile_size_ = wg_size / Tile_size; static constexpr const index_t inner_tile_count_ = Tile_size / inner_tile_size_; - // Minimum number of Tile-mutliple rows & columns to cover the output matrix - index_t M_pad_; - index_t N_pad_; - // Total size of Tile-mutliple covering matrix - index_t size_pad_; // Batch size when using batched transpose index_t batch_size_; // Number of contiguous elements to be used in local memory to avoid bank @@ -246,9 +233,6 @@ class TransposeAdd { tile_count_m_((M_ - 1) / Tile_size + 1), tile_count_n_((N_ - 1) / Tile_size + 1), tile_count_total_(tile_count_m_ * tile_count_n_), - M_pad_(tile_count_m_ * Tile_size), - N_pad_(tile_count_n_ * Tile_size), - size_pad_(M_pad_ * N_pad_), batch_size_(batch_size) {} index_t get_size() const; diff --git a/src/operations/extension/transpose.hpp b/src/operations/extension/transpose.hpp index 98ecf8e03..87485660e 100644 --- a/src/operations/extension/transpose.hpp +++ b/src/operations/extension/transpose.hpp @@ -55,7 +55,7 @@ Transpose::get_size() const { // Smallest TileSize square-multiple containing input/output matrices times // batch_size - return (size_pad_ * batch_size_); + return (tile_count_total_ * Tile_size * Tile_size * batch_size_); } template ::get_size() const { // Smallest TileSize square-multiple containing input/output matrices - return (size_pad_ * batch_size_); + return (tile_count_total_ * Tile_size * Tile_size * batch_size_); } template [0,N_], B & - *C otherwise -> [0,M_]) - * @param j [output] the global col-index (A & B when both_trans -> [0,M_], B & - *C otherwise -> [0,N_]) + * @param i [output] the global row-index (A & B when both_trans -> [0,N_], B + *& C otherwise -> [0,M_]) + * @param j [output] the global col-index (A & B when both_trans -> [0,M_], B + *& C otherwise -> [0,N_]) */ template ('n', 't'), // trans_b ::testing::Values(64, 129, 255), // m ::testing::Values(64, 129, 255), // n - ::testing::Values(2.5), // alpha - ::testing::Values(3.5), // beta + ::testing::Values(2.5), // alpha + ::testing::Values(3.5), // beta ::testing::Values(1, 2), // lda_mul ::testing::Values(1, 2), // ldb_mul - ::testing::Values(1, 2, 3), // ldc_mul + ::testing::Values(3), // ldc_mul ::testing::Values(1, 3), // stride_a_m ::testing::Values(1, 3), // stride_b_m ::testing::Values(1, 3), // stride_c_m - ::testing::Values(2, 3)); // batch_size + ::testing::Values(3)); // batch_size #endif template diff --git a/test/unittest/extension/omatadd_test.cpp b/test/unittest/extension/omatadd_test.cpp index 5c4ac6958..56ef0fca3 100644 --- a/test/unittest/extension/omatadd_test.cpp +++ b/test/unittest/extension/omatadd_test.cpp @@ -27,8 +27,8 @@ #include "extension_reference.hpp" template -using combination_t = std::tuple; +using combination_t = std::tuple; template void run_test(const combination_t combi) { @@ -37,8 +37,8 @@ void run_test(const combination_t combi) { index_t m, n, ld_a_mul, ld_b_mul, ld_c_mul; scalar_t alpha, beta; - std::tie(alloc, trans_a, trans_b, m, n, alpha, beta, ld_a_mul, ld_b_mul, ld_c_mul) = - combi; + std::tie(alloc, trans_a, trans_b, m, n, alpha, beta, ld_a_mul, ld_b_mul, + ld_c_mul) = combi; auto q = make_queue(); blas::SB_Handle sb_handle(q); @@ -70,12 +70,16 @@ void run_test(const combination_t combi) { auto m_b_gpu = helper::allocate(size_m_b, q); auto m_c_gpu = helper::allocate(size_m_c, q); - auto copy_m_a = helper::copy_to_device(q, A.data(), m_a_gpu, size_m_a); - auto copy_m_b = helper::copy_to_device(q, B.data(), m_b_gpu, size_m_b); - auto copy_m_c = helper::copy_to_device(q, C.data(), m_c_gpu, size_m_c); + auto copy_m_a = + helper::copy_to_device(q, A.data(), m_a_gpu, size_m_a); + auto copy_m_b = + helper::copy_to_device(q, B.data(), m_b_gpu, size_m_b); + auto copy_m_c = + helper::copy_to_device(q, C.data(), m_c_gpu, size_m_c); - auto omatadd_event = blas::_omatadd(sb_handle, trans_a, trans_b, m, n, alpha, m_a_gpu, lda, beta, - m_b_gpu, ldb, m_c_gpu, ldc, {copy_m_a, copy_m_b, copy_m_c}); + auto omatadd_event = blas::_omatadd(sb_handle, trans_a, trans_b, m, n, alpha, + m_a_gpu, lda, beta, m_b_gpu, ldb, m_c_gpu, + ldc, {copy_m_a, copy_m_b, copy_m_c}); sb_handle.wait(omatadd_event); auto event = blas::helper::copy_to_host( @@ -98,8 +102,8 @@ void run_test(const combination_t combi) { index_t m, n, ld_a_mul, ld_b_mul, ld_c_mul; scalar_t alpha, beta; - std::tie(alloc, trans_a, trans_b, m, n, alpha, beta, ld_a_mul, ld_b_mul, ld_c_mul) = - combi; + std::tie(alloc, trans_a, trans_b, m, n, alpha, beta, ld_a_mul, ld_b_mul, + ld_c_mul) = combi; if (alloc == "usm") { #ifdef SB_ENABLE_USM @@ -127,9 +131,9 @@ const auto combi = #else template const auto combi = - ::testing::Combine(::testing::Values("usm", "buf"), // allocation type - ::testing::Values('n', 't'), // trans_a - ::testing::Values('n', 't'), // trans_b + ::testing::Combine(::testing::Values("usm", "buf"), // allocation type + ::testing::Values('n', 't'), // trans_a + ::testing::Values('n', 't'), // trans_b ::testing::Values(64, 129, 255), // m ::testing::Values(64, 129, 255), // n ::testing::Values(0, 1, 2), // alpha @@ -146,8 +150,8 @@ static std::string generate_name( char trans_a, trans_b; index_t m, n, lda_mul, ldb_mul, ldc_mul; T alpha, beta; - BLAS_GENERATE_NAME(info.param, alloc, trans_a, trans_b, m, n, alpha, beta, lda_mul, - ldb_mul, ldc_mul); + BLAS_GENERATE_NAME(info.param, alloc, trans_a, trans_b, m, n, alpha, beta, + lda_mul, ldb_mul, ldc_mul); } BLAS_REGISTER_TEST_ALL(OmatAdd, combination_t, combi, generate_name); diff --git a/test/unittest/extension/omatcopy2_test.cpp b/test/unittest/extension/omatcopy2_test.cpp index bae7ba0e4..b29122b74 100644 --- a/test/unittest/extension/omatcopy2_test.cpp +++ b/test/unittest/extension/omatcopy2_test.cpp @@ -27,8 +27,8 @@ #include "extension_reference.hpp" template -using combination_t = std::tuple; +using combination_t = std::tuple; template void run_test(const combination_t combi) { @@ -37,7 +37,8 @@ void run_test(const combination_t combi) { index_t m, n, inc_in, ld_in_m, inc_out, ld_out_m; scalar_t alpha; - std::tie(alloc, trans, m, n, alpha, inc_in, ld_in_m, inc_out, ld_out_m) = combi; + std::tie(alloc, trans, m, n, alpha, inc_in, ld_in_m, inc_out, ld_out_m) = + combi; // Leading dimensions are computed as multiples of the minimum value specified // in the oneMKL documentation at : @@ -77,8 +78,9 @@ void run_test(const combination_t combi) { auto copy_out = helper::copy_to_device(q, B.data(), matrix_out, m_b_size); - auto omatcopy2_event = blas::_omatcopy2(sb_handle, trans, m, n, alpha, matrix_in, ld_in, inc_in, - matrix_out, ld_out, inc_out, {copy_in, copy_out}); + auto omatcopy2_event = + blas::_omatcopy2(sb_handle, trans, m, n, alpha, matrix_in, ld_in, inc_in, + matrix_out, ld_out, inc_out, {copy_in, copy_out}); sb_handle.wait(omatcopy2_event); @@ -101,7 +103,8 @@ void run_test(const combination_t combi) { index_t m, n, inc_in, ld_in_m, inc_out, ld_out_m; scalar_t alpha; - std::tie(alloc, trans, m, n, alpha, inc_in, ld_in_m, inc_out, ld_out_m) = combi; + std::tie(alloc, trans, m, n, alpha, inc_in, ld_in_m, inc_out, ld_out_m) = + combi; if (alloc == "usm") { #ifdef SB_ENABLE_USM @@ -129,8 +132,8 @@ const auto combi = #else template const auto combi = - ::testing::Combine(::testing::Values("usm", "buf"), // allocation type - ::testing::Values('n', 't'), // trans + ::testing::Combine(::testing::Values("usm", "buf"), // allocation type + ::testing::Values('n', 't'), // trans ::testing::Values(64, 129, 255), // m ::testing::Values(64, 129, 255), // n ::testing::Values(0, 2), // alpha @@ -147,8 +150,8 @@ static std::string generate_name( char trans; index_t m, n, inc_in, ld_in_m, inc_out, ld_out_m; T alpha; - BLAS_GENERATE_NAME(info.param, alloc, trans, m, n, alpha, inc_in, ld_in_m, inc_out, - ld_out_m); + BLAS_GENERATE_NAME(info.param, alloc, trans, m, n, alpha, inc_in, ld_in_m, + inc_out, ld_out_m); } BLAS_REGISTER_TEST_ALL(OmatCopy2, combination_t, combi, generate_name); diff --git a/test/unittest/extension/omatcopy_batched_test.cpp b/test/unittest/extension/omatcopy_batched_test.cpp index e15b8405e..4f6569653 100644 --- a/test/unittest/extension/omatcopy_batched_test.cpp +++ b/test/unittest/extension/omatcopy_batched_test.cpp @@ -121,7 +121,7 @@ const auto combi = ::testing::Values('n', 't'), // trans ::testing::Values(1024, 4050, 16380), // m ::testing::Values(1024, 4050, 16380), // n - ::testing::Values(0, 1.05, -20.01), // alpha + ::testing::Values(1.05, -20.01), // alpha ::testing::Values(3, 5), // ld_in_m ::testing::Values(3, 5), // ld_out_m ::testing::Values(5, 10), // stride_in_m @@ -134,12 +134,12 @@ const auto combi = ::testing::Values('n', 't'), // trans ::testing::Values(64, 129, 255), // m ::testing::Values(64, 129, 255), // n - ::testing::Values(0, 2.5), // alpha - ::testing::Values(1, 2, 3), // ld_in_m - ::testing::Values(1, 2, 3), // ld_out_m + ::testing::Values(2.5), // alpha + ::testing::Values(1, 3), // ld_in_m + ::testing::Values(1, 3), // ld_out_m ::testing::Values(1, 3), // stride_in_m - ::testing::Values(1, 3), // stride_out_m - ::testing::Values(1, 2, 5)); // batch_size + ::testing::Values(1, 3), // stride_out_m + ::testing::Values(1, 5)); // batch_size #endif template diff --git a/test/unittest/extension/omatcopy_test.cpp b/test/unittest/extension/omatcopy_test.cpp index a7a1e49a8..c7737f1a3 100644 --- a/test/unittest/extension/omatcopy_test.cpp +++ b/test/unittest/extension/omatcopy_test.cpp @@ -69,8 +69,9 @@ void run_test(const combination_t combi) { auto copy_out = helper::copy_to_device(q, B.data(), matrix_out, size_b); - auto omatcopy_event = blas::_omatcopy(sb_handle, trans, m, n, alpha, matrix_in, ld_in, matrix_out, - ld_out, {copy_in, copy_out}); + auto omatcopy_event = + blas::_omatcopy(sb_handle, trans, m, n, alpha, matrix_in, ld_in, + matrix_out, ld_out, {copy_in, copy_out}); sb_handle.wait(omatcopy_event); @@ -106,7 +107,6 @@ void run_test(const combination_t combi) { } } - #ifdef STRESS_TESTING template const auto combi = @@ -120,11 +120,11 @@ const auto combi = #else template const auto combi = - ::testing::Combine(::testing::Values("usm", "buf"), // allocation type - ::testing::Values('n', 't'), // trans + ::testing::Combine(::testing::Values("usm", "buf"), // allocation type + ::testing::Values('n', 't'), // trans ::testing::Values(64, 129, 255), // m ::testing::Values(64, 129, 255), // n - ::testing::Values(0, 1, 2), // alpha + ::testing::Values(0, 1, 2), // alpha ::testing::Values(1, 2, 3), // ld_in_m ::testing::Values(1, 2, 3)); // ld_out_m #endif diff --git a/test/unittest/extension/reduction_test.cpp b/test/unittest/extension/reduction_test.cpp index 8d98a9425..7dc84d5c1 100644 --- a/test/unittest/extension/reduction_test.cpp +++ b/test/unittest/extension/reduction_test.cpp @@ -71,7 +71,8 @@ static std::string generate_name( operator_t op; reduction_dim_t reductionDim; T unused; - BLAS_GENERATE_NAME(info.param, alloc, rows, cols, ldMul, op, reductionDim, unused); + BLAS_GENERATE_NAME(info.param, alloc, rows, cols, ldMul, op, reductionDim, + unused); } template @@ -173,9 +174,9 @@ void run_test(const combination_t combi) { if (op == operator_t::Mean) { const auto nelems = reduction_dim == reduction_dim_t::outer ? cols : rows; std::transform(out_v_cpu.begin(), out_v_cpu.end(), out_v_cpu.begin(), - [=](scalar_t val) -> scalar_t { - return val / static_cast(nelems); - }); + [=](scalar_t val) -> scalar_t { + return val / static_cast(nelems); + }); } auto m_in_gpu = @@ -183,37 +184,43 @@ void run_test(const combination_t combi) { auto v_out_gpu = blas::helper::allocate(out_size, q); // out_v_gpu - auto copy_m = blas::helper::copy_to_device(q, in_m.data(), - m_in_gpu, ld * cols); + auto copy_m = blas::helper::copy_to_device(q, in_m.data(), m_in_gpu, + ld * cols); auto copy_v = blas::helper::copy_to_device(q, out_v_gpu.data(), - v_out_gpu, out_size); + v_out_gpu, out_size); blas::SB_Handle::event_t ev; try { switch (op) { case operator_t::Add: ev = extension::_reduction( - sb_handle, m_in_gpu, ld, v_out_gpu, rows, cols, reduction_dim, {copy_m, copy_v}); + sb_handle, m_in_gpu, ld, v_out_gpu, rows, cols, reduction_dim, + {copy_m, copy_v}); break; case operator_t::Product: ev = extension::_reduction( - sb_handle, m_in_gpu, ld, v_out_gpu, rows, cols, reduction_dim, {copy_m, copy_v}); + sb_handle, m_in_gpu, ld, v_out_gpu, rows, cols, reduction_dim, + {copy_m, copy_v}); break; case operator_t::Max: ev = extension::_reduction( - sb_handle, m_in_gpu, ld, v_out_gpu, rows, cols, reduction_dim, {copy_m, copy_v}); + sb_handle, m_in_gpu, ld, v_out_gpu, rows, cols, reduction_dim, + {copy_m, copy_v}); break; case operator_t::Min: ev = extension::_reduction( - sb_handle, m_in_gpu, ld, v_out_gpu, rows, cols, reduction_dim, {copy_m, copy_v}); + sb_handle, m_in_gpu, ld, v_out_gpu, rows, cols, reduction_dim, + {copy_m, copy_v}); break; case operator_t::AbsoluteAdd: ev = extension::_reduction( - sb_handle, m_in_gpu, ld, v_out_gpu, rows, cols, reduction_dim, {copy_m, copy_v}); + sb_handle, m_in_gpu, ld, v_out_gpu, rows, cols, reduction_dim, + {copy_m, copy_v}); break; case operator_t::Mean: ev = extension::_reduction( - sb_handle, m_in_gpu, ld, v_out_gpu, rows, cols, reduction_dim, {copy_m, copy_v}); + sb_handle, m_in_gpu, ld, v_out_gpu, rows, cols, reduction_dim, + {copy_m, copy_v}); break; } } catch (cl::sycl::exception& e) { @@ -253,5 +260,4 @@ void run_test(const combination_t combi) { } } -BLAS_REGISTER_TEST_ALL(ReductionPartial, combination_t, combi, - generate_name); +BLAS_REGISTER_TEST_ALL(ReductionPartial, combination_t, combi, generate_name); diff --git a/test/unittest/extension/transpose_test.cpp b/test/unittest/extension/transpose_test.cpp index d3c6a0163..1167c1ddb 100644 --- a/test/unittest/extension/transpose_test.cpp +++ b/test/unittest/extension/transpose_test.cpp @@ -70,8 +70,9 @@ void run_test(const combination_t& combi) { auto copy_out = helper::copy_to_device(q, B.data(), matrix_out, size_b); - auto trans_event = blas::extension::_transpose(sb_handle, m, n, matrix_in, ld_in, - matrix_out, ld_out, {copy_in, copy_out}); + auto trans_event = blas::extension::_transpose( + sb_handle, m, n, matrix_in, ld_in, matrix_out, ld_out, + {copy_in, copy_out}); sb_handle.wait(trans_event); @@ -114,8 +115,8 @@ void run_test(const combination_t combi) { template const auto combi = - ::testing::Combine(::testing::Values("usm", "buf"), // allocation type - ::testing::Values('i', 'o'), // Inplace | Outplace + ::testing::Combine(::testing::Values("usm", "buf"), // allocation type + ::testing::Values('i', 'o'), // Inplace | Outplace ::testing::Values(64, 129, 255), // m ::testing::Values(64, 129, 255), // n ::testing::Values(1, 2, 3), // ld_in_m