From 19eae597bce4a316168bfac543d34148348f4392 Mon Sep 17 00:00:00 2001 From: Ouadie EL FAROUKI Date: Wed, 27 Dec 2023 12:04:37 +0000 Subject: [PATCH] removed uncessary wg_size template param from Transpose kernels --- include/operations/extension/transpose.h | 47 +++---- src/interface/extension_interface.hpp | 4 +- src/operations/extension/transpose.hpp | 159 ++++++++++++----------- 3 files changed, 99 insertions(+), 111 deletions(-) diff --git a/include/operations/extension/transpose.h b/include/operations/extension/transpose.h index b684f0b01..ca79761e3 100644 --- a/include/operations/extension/transpose.h +++ b/include/operations/extension/transpose.h @@ -42,7 +42,6 @@ namespace blas { * @tparam in_place Whether the transpose is in or out of place * @tparam Tile_size Tiling size used explicitly in the local memory kernel, and * used to compute work-group size in the non-local memory case. - * @tparam wg_size work group size * @tparam cl_size cache line size * @tparam local_memory Whether to use local memory * @tparam in_t The input matrix type @@ -50,8 +49,8 @@ namespace blas { * @tparam element_t The scaling factor type * */ -template +template class Transpose { public: using index_t = typename in_t::index_t; @@ -76,10 +75,6 @@ class Transpose { index_t tile_count_n_; // Total number of tiles used to cover the matrix index_t tile_count_total_; - // Number of Inner WG Tiles - static constexpr const index_t inner_tile_size_ = wg_size / Tile_size; - static constexpr const index_t inner_tile_count_ = - Tile_size / inner_tile_size_; // Batch size when using batched transpose index_t batch_size_; // Number of contiguous elements to be used in local memory to avoid bank @@ -131,17 +126,15 @@ class Transpose { /*! @brief Generator/factory for Transpose trees. */ -template -Transpose +template +Transpose make_transpose(in_t &A, index_t inc_a, index_t &stride_a, out_t &At, index_t inc_a_t, index_t &stride_at, element_t &alpha, index_t &batch_size) { - return Transpose(A, inc_a, stride_a, At, inc_a_t, stride_at, - alpha, batch_size); + return Transpose(A, inc_a, stride_a, At, inc_a_t, stride_at, alpha, + batch_size); } /*! @@ -160,7 +153,6 @@ make_transpose(in_t &A, index_t inc_a, index_t &stride_a, out_t &At, * by the template parameter both_trans. * @tparam Tile_size Tiling size used explicitly in the local memory kernel, and * used to compute work-group size in the non-local memory case. - * @tparam wg_size work group size * @tparam cl_size cache line size * @tparam local_memory Whether to use local memory * @tparam in1_t The input matrix A type @@ -169,9 +161,8 @@ make_transpose(in_t &A, index_t inc_a, index_t &stride_a, out_t &At, * @tparam element_t The scaling factor type * */ -template +template class TransposeAdd { public: using index_t = typename in1_t::index_t; @@ -197,10 +188,6 @@ class TransposeAdd { index_t tile_count_n_; // Total number of tiles used to cover the matrix index_t tile_count_total_; - // Inner WG Tiles - static constexpr const index_t inner_tile_size_ = wg_size / Tile_size; - static constexpr const index_t inner_tile_count_ = - Tile_size / inner_tile_size_; // Batch size when using batched transpose index_t batch_size_; // Number of contiguous elements to be used in local memory to avoid bank @@ -254,16 +241,16 @@ class TransposeAdd { /*! * @brief Generator/factory for Transpose-Add trees. */ -template -TransposeAdd +template +TransposeAdd make_transpose_add(in1_t &A, index_t stride_a, in2_t &B, index_t stride_b, out_t &C, index_t stride_c, element_t &alpha, element_t &beta, index_t batch_size) { - return TransposeAdd( + return TransposeAdd( A, stride_a, B, stride_b, C, stride_c, alpha, beta, batch_size); } diff --git a/src/interface/extension_interface.hpp b/src/interface/extension_interface.hpp index 9613a4aeb..bea5a973f 100644 --- a/src/interface/extension_interface.hpp +++ b/src/interface/extension_interface.hpp @@ -77,7 +77,7 @@ typename sb_handle_t::event_t _transpose_outplace_impl( // Transpose expression Tree auto trans_scale_tree = - make_transpose( + make_transpose( in_view, _inc_in, _stride_in, out_view, _inc_out, _stride_out, _alpha, _batch_size); @@ -234,7 +234,7 @@ typename sb_handle_t::event_t _transpose_add_impl( // Transpose Add expression Tree auto trans_scale_tree = - make_transpose_add( + make_transpose_add( A_view, _stride_a, B_view, _stride_b, C_view, _stride_c, _alpha, _beta, _batch_size); diff --git a/src/operations/extension/transpose.hpp b/src/operations/extension/transpose.hpp index 87485660e..f0ece4141 100644 --- a/src/operations/extension/transpose.hpp +++ b/src/operations/extension/transpose.hpp @@ -30,39 +30,37 @@ namespace blas { // Transpose -template +template PORTBLAS_INLINE bool -Transpose::valid_thread(cl::sycl::nd_item<1> item) const { index_t idx = item.get_global_linear_id(); return (idx < get_size()); } -template -PORTBLAS_INLINE void -Transpose::bind(cl::sycl::handler &cgh) { +template +PORTBLAS_INLINE void Transpose::bind(cl::sycl::handler &cgh) { A_.bind(cgh); At_.bind(cgh); } -template +template PORTBLAS_INLINE typename in_t::index_t -Transpose::get_size() const { // Smallest TileSize square-multiple containing input/output matrices times // batch_size return (tile_count_total_ * Tile_size * Tile_size * batch_size_); } -template -PORTBLAS_INLINE void -Transpose::adjust_access_displacement() { +template +PORTBLAS_INLINE void Transpose::adjust_access_displacement() { A_.adjust_access_displacement(); At_.adjust_access_displacement(); } @@ -77,10 +75,10 @@ Transpose +template PORTBLAS_INLINE void -Transpose::get_indices(cl::sycl::nd_item<1> id, index_t &in_idx, index_t &out_idx, index_t &i, index_t &j) { index_t idg = id.get_group(0); @@ -110,13 +108,16 @@ Transpose +template PORTBLAS_INLINE void -Transpose::eval(cl::sycl::nd_item<1> id) { index_t idx = id.get_global_linear_id(); + const index_t inner_tile_size_ = id.get_local_range(0) / Tile_size; + const index_t inner_tile_count_ = Tile_size / inner_tile_size_; + index_t in_index, out_index, i_id, j_id; get_indices(id, in_index, out_index, i_id, j_id); @@ -148,10 +149,10 @@ Transpose +template PORTBLAS_INLINE void -Transpose::get_indices(cl::sycl::nd_item<1> id, index_t &in_idx, index_t &in_local_idx, index_t &out_idx, index_t &out_local_idx, @@ -188,12 +189,15 @@ Transpose +template template PORTBLAS_INLINE void -Transpose::eval(local_memory_t local_mem, cl::sycl::nd_item<1> id) { + const index_t inner_tile_size_ = id.get_local_range(0) / Tile_size; + const index_t inner_tile_count_ = Tile_size / inner_tile_size_; + value_t *local = local_mem.localAcc.get_pointer(); auto A = A_.get_pointer(); auto At = At_.get_pointer(); @@ -226,43 +230,39 @@ Transpose -PORTBLAS_INLINE bool TransposeAdd< - both_trans, Tile_size, wg_size, cl_size, local_memory, in1_t, in2_t, out_t, - element_t>::valid_thread(cl::sycl::nd_item<1> item) const { +template +PORTBLAS_INLINE bool +TransposeAdd::valid_thread(cl::sycl::nd_item<1> item) const { auto idx = item.get_global_linear_id(); return idx < get_size(); } -template +template PORTBLAS_INLINE void -TransposeAdd::bind(cl::sycl::handler &cgh) { +TransposeAdd::bind(cl::sycl::handler &cgh) { A_.bind(cgh); B_.bind(cgh); C_.bind(cgh); } -template +template PORTBLAS_INLINE typename in1_t::index_t -TransposeAdd::get_size() const { +TransposeAdd::get_size() const { // Smallest TileSize square-multiple containing input/output matrices return (tile_count_total_ * Tile_size * Tile_size * batch_size_); } -template +template PORTBLAS_INLINE void -TransposeAdd::adjust_access_displacement() { +TransposeAdd::adjust_access_displacement() { A_.adjust_access_displacement(); B_.adjust_access_displacement(); C_.adjust_access_displacement(); @@ -281,16 +281,13 @@ TransposeAdd [0,M_], B *& C otherwise -> [0,N_]) */ -template +template PORTBLAS_INLINE void -TransposeAdd::get_indices(cl::sycl::nd_item<1> id, - index_t &in_a_idx, - index_t &in_b_idx, - index_t &out_idx, index_t &i, - index_t &j) { +TransposeAdd::get_indices(cl::sycl::nd_item<1> id, index_t &in_a_idx, + index_t &in_b_idx, index_t &out_idx, + index_t &i, index_t &j) { const index_t row_tiles = both_trans ? tile_count_n_ : tile_count_m_; index_t idg = id.get_group(0); @@ -330,12 +327,14 @@ TransposeAdd +template PORTBLAS_INLINE void -TransposeAdd::eval(cl::sycl::nd_item<1> id) { +TransposeAdd::eval(cl::sycl::nd_item<1> id) { + const index_t inner_tile_size_ = id.get_local_range(0) / Tile_size; + const index_t inner_tile_count_ = Tile_size / inner_tile_size_; + auto A = A_.get_pointer(); auto B = B_.get_pointer(); auto C = C_.get_pointer(); @@ -382,16 +381,16 @@ TransposeAdd -PORTBLAS_INLINE void TransposeAdd< - both_trans, Tile_size, wg_size, cl_size, local_memory, in1_t, in2_t, out_t, - element_t>::get_indices(cl::sycl::nd_item<1> id, index_t &in_a_idx, - index_t &in_b_idx, index_t &in_local_idx, - index_t &out_idx, index_t &out_local_idx, - index_t &i_block_start, index_t &j_block_start, - index_t &il, index_t &jl) { +template +PORTBLAS_INLINE void +TransposeAdd::get_indices(cl::sycl::nd_item<1> id, index_t &in_a_idx, + index_t &in_b_idx, index_t &in_local_idx, + index_t &out_idx, index_t &out_local_idx, + index_t &i_block_start, + index_t &j_block_start, index_t &il, + index_t &jl) { const index_t row_tiles = both_trans ? tile_count_n_ : tile_count_m_; index_t idg = id.get_group(0); @@ -435,14 +434,16 @@ PORTBLAS_INLINE void TransposeAdd< out_local_idx = il * Tile_size + jl + il / get_num_tiles_per_line(); } -template +template template PORTBLAS_INLINE void -TransposeAdd::eval(local_memory_t local_mem, - cl::sycl::nd_item<1> id) { +TransposeAdd::eval(local_memory_t local_mem, + cl::sycl::nd_item<1> id) { + const index_t inner_tile_size_ = id.get_local_range(0) / Tile_size; + const index_t inner_tile_count_ = Tile_size / inner_tile_size_; + value_t *local = local_mem.localAcc.get_pointer(); auto A = A_.get_pointer();