From ed81360df447db432b178ec6276b2927156acecb Mon Sep 17 00:00:00 2001
From: Ouadie EL FAROUKI <ouadie.elfarouki@codeplay.com>
Date: Tue, 12 Dec 2023 18:39:01 +0000
Subject: [PATCH 1/5] Updated transpose & transpose add cpu configs

---
 src/interface/extension/backend/default_cpu.hpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/interface/extension/backend/default_cpu.hpp b/src/interface/extension/backend/default_cpu.hpp
index d8a2f6c24..ba714e78c 100644
--- a/src/interface/extension/backend/default_cpu.hpp
+++ b/src/interface/extension/backend/default_cpu.hpp
@@ -37,12 +37,12 @@ typename sb_handle_t::event_t _transpose_outplace(
     container_0_t in_, index_t _ld_in, index_t _inc_in, index_t _stride_in,
     container_1_t out_, index_t _ld_out, index_t _inc_out, index_t _stride_out,
     index_t _batch_size, const typename sb_handle_t::event_t& _dependencies) {
-  if (_M * _N < (1 << 20)) {
+  if (_M * _N < (1 << 16)) {
     return blas::internal::_transpose_outplace_impl<16, 64, 64, false>(
         sb_handle, _M, _N, _alpha, in_, _ld_in, _inc_in, _stride_in, out_,
         _ld_out, _inc_out, _stride_out, _batch_size, _dependencies);
   } else {
-    return blas::internal::_transpose_outplace_impl<32, 128, 64, false>(
+    return blas::internal::_transpose_outplace_impl<32, 32, 64, false>(
         sb_handle, _M, _N, _alpha, in_, _ld_in, _inc_in, _stride_in, out_,
         _ld_out, _inc_out, _stride_out, _batch_size, _dependencies);
   }
@@ -58,13 +58,13 @@ typename sb_handle_t::event_t _transpose_add(
     index_t _b_rows, index_t _b_cols, index_t _stride_b, container_2_t c_,
     index_t _ld_c, index_t _stride_c, index_t _batch_size,
     const typename sb_handle_t::event_t& _dependencies) {
-  if (_M * _N < (1 << 20)) {
+  if (_M * _N < (1 << 16)) {
     return blas::internal::_transpose_add_impl<both_trans, 16, 64, 64, false>(
         sb_handle, _M, _N, _alpha, a_, _ld_a, _a_rows, _a_cols, _stride_a,
         _beta, b_, _ld_b, _b_rows, _b_cols, _stride_b, c_, _ld_c, _stride_c,
         _batch_size, _dependencies);
   } else {
-    return blas::internal::_transpose_add_impl<both_trans, 32, 128, 64, false>(
+    return blas::internal::_transpose_add_impl<both_trans, 32, 32, 64, false>(
         sb_handle, _M, _N, _alpha, a_, _ld_a, _a_rows, _a_cols, _stride_a,
         _beta, b_, _ld_b, _b_rows, _b_cols, _stride_b, c_, _ld_c, _stride_c,
         _batch_size, _dependencies);

From d7b8d69cef2157803b4711234bbb2ef5620f1134 Mon Sep 17 00:00:00 2001
From: Ouadie EL FAROUKI <ouadie.elfarouki@codeplay.com>
Date: Tue, 12 Dec 2023 18:39:36 +0000
Subject: [PATCH 2/5] Reduced omatcopy and omatadd benchmark cases

---
 common/include/common/common_utils.hpp | 76 +++++++++++---------------
 1 file changed, 32 insertions(+), 44 deletions(-)

diff --git a/common/include/common/common_utils.hpp b/common/include/common/common_utils.hpp
index fc4562aa8..6cb967cc3 100644
--- a/common/include/common/common_utils.hpp
+++ b/common/include/common/common_utils.hpp
@@ -1242,15 +1242,13 @@ static inline std::vector<matcopy_param_t<scalar_t>> get_matcopy_params(
     std::vector<matcopy_param_t<scalar_t>> matcopy_default;
     constexpr index_t dmin = 64, dmax = 8192;
     constexpr scalar_t alpha{2};
+    constexpr index_t lda_mul = 1;
+    constexpr index_t ldb_mul = 1;
     for (char trans : {'n', 't'}) {
       for (index_t m = dmin; m <= dmax; m *= 2) {
         for (index_t n = dmin; n <= dmax; n *= 2) {
-          for (index_t lda_mul = 1; lda_mul < 2; ++lda_mul) {
-            for (index_t ldb_mul = 1; ldb_mul < 2; ++ldb_mul) {
-              matcopy_default.push_back(
-                  std::make_tuple(trans, m, n, alpha, lda_mul, ldb_mul));
-            }
-          }
+          matcopy_default.push_back(
+              std::make_tuple(trans, m, n, alpha, lda_mul, ldb_mul));
         }
       }
     }
@@ -1287,17 +1285,15 @@ static inline std::vector<omatcopy2_param_t<scalar_t>> get_omatcopy2_params(
     std::vector<omatcopy2_param_t<scalar_t>> omatcopy2_default;
     constexpr index_t dmin = 1024, dmax = 8192;
     constexpr scalar_t alpha{2};
+    constexpr index_t lda_mul = 1;
+    constexpr index_t ldb_mul = 1;
     for (char trans : {'n', 't'}) {
       for (index_t m = dmin; m <= dmax; m *= 2) {
         for (index_t n = dmin; n <= dmax; n *= 2) {
-          for (index_t lda_mul = 1; lda_mul < 2; ++lda_mul) {
-            for (index_t inc_a = 1; inc_a < 3; ++inc_a) {
-              for (index_t ldb_mul = 1; ldb_mul < 2; ++ldb_mul) {
-                for (index_t inc_b = 1; inc_b < 3; ++inc_b) {
-                  omatcopy2_default.push_back(std::make_tuple(
-                      trans, m, n, alpha, lda_mul, ldb_mul, inc_a, inc_b));
-                }
-              }
+          for (index_t inc_a = 1; inc_a < 3; ++inc_a) {
+            for (index_t inc_b = 1; inc_b < 3; ++inc_b) {
+              omatcopy2_default.push_back(std::make_tuple(
+                  trans, m, n, alpha, lda_mul, ldb_mul, inc_a, inc_b));
             }
           }
         }
@@ -1336,21 +1332,20 @@ get_matcopy_batch_params(Args& args) {
   if (args.csv_param.empty()) {
     warning_no_csv();
     std::vector<matcopy_batch_param_t<scalar_t>> matcopy_batch_default;
-    constexpr index_t dmin = 256, dmax = 8192;
+    constexpr index_t dmin = 256, dmax = 4096;
     constexpr scalar_t alpha{2};
     constexpr index_t batch_size{3};
     constexpr index_t stride_a_mul{1};
     constexpr index_t stride_b_mul{1};
+    constexpr index_t lda_mul = 1;
+    constexpr index_t ldb_mul = 1;
+    constexpr index_t ldc_mul = 1;
     for (char trans : {'n', 't'}) {
       for (index_t m = dmin; m <= dmax; m *= 2) {
         for (index_t n = dmin; n <= dmax; n *= 2) {
-          for (index_t lda_mul = 1; lda_mul < 2; ++lda_mul) {
-            for (index_t ldb_mul = 1; ldb_mul < 2; ++ldb_mul) {
-              matcopy_batch_default.push_back(
-                  std::make_tuple(trans, m, n, alpha, lda_mul, ldb_mul,
-                                  stride_a_mul, stride_b_mul, batch_size));
-            }
-          }
+          matcopy_batch_default.push_back(
+              std::make_tuple(trans, m, n, alpha, lda_mul, ldb_mul,
+                              stride_a_mul, stride_b_mul, batch_size));
         }
       }
     }
@@ -1386,22 +1381,19 @@ static inline std::vector<omatadd_param_t<scalar_t>> get_omatadd_params(
   if (args.csv_param.empty()) {
     warning_no_csv();
     std::vector<omatadd_param_t<scalar_t>> omatadd_default;
-    constexpr index_t dmin = 64, dmax = 8192;
+    constexpr index_t dmin = 64, dmax = 4096;
     constexpr scalar_t alpha{2};
     constexpr scalar_t beta{2};
+    constexpr index_t lda_mul = 1;
+    constexpr index_t ldb_mul = 1;
+    constexpr index_t ldc_mul = 1;
     for (char trans_a : {'n', 't'}) {
       for (char trans_b : {'n', 't'}) {
         for (index_t m = dmin; m <= dmax; m *= 2) {
           for (index_t n = dmin; n <= dmax; n *= 2) {
-            for (index_t lda_mul = 1; lda_mul < 2; ++lda_mul) {
-              for (index_t ldb_mul = 1; ldb_mul < 2; ++ldb_mul) {
-                for (index_t ldc_mul = 1; ldc_mul < 2; ++ldc_mul) {
-                  omatadd_default.push_back(
-                      std::make_tuple(trans_a, trans_b, m, n, alpha, beta,
-                                      lda_mul, ldb_mul, ldc_mul));
-                }
-              }
-            }
+            omatadd_default.push_back(std::make_tuple(trans_a, trans_b, m, n,
+                                                      alpha, beta, lda_mul,
+                                                      ldb_mul, ldc_mul));
           }
         }
       }
@@ -1439,27 +1431,23 @@ get_omatadd_batch_params(Args& args) {
   if (args.csv_param.empty()) {
     warning_no_csv();
     std::vector<omatadd_batch_param_t<scalar_t>> omatadd_batch_default;
-    constexpr index_t dmin = 256, dmax = 8192;
+    constexpr index_t dmin = 1024, dmax = 4096;
     constexpr scalar_t alpha{2};
     constexpr scalar_t beta{2};
     constexpr index_t batch_size{3};
     constexpr index_t stride_a_mul{1};
     constexpr index_t stride_b_mul{1};
     constexpr index_t stride_c_mul{1};
+    constexpr index_t lda_mul = 1;
+    constexpr index_t ldb_mul = 1;
+    constexpr index_t ldc_mul = 1;
     for (char trans_a : {'n', 't'}) {
-      for (char trans_b : {'n', 't'}) {
+      for (char trans_b : {'n'}) {
         for (index_t m = dmin; m <= dmax; m *= 2) {
           for (index_t n = dmin; n <= dmax; n *= 2) {
-            for (index_t lda_mul = 1; lda_mul < 2; ++lda_mul) {
-              for (index_t ldb_mul = 1; ldb_mul < 2; ++ldb_mul) {
-                for (index_t ldc_mul = 1; ldc_mul < 2; ++ldc_mul) {
-                  omatadd_batch_default.push_back(
-                      std::make_tuple(trans_a, trans_b, m, n, alpha, beta,
-                                      lda_mul, ldb_mul, ldc_mul, stride_a_mul,
-                                      stride_b_mul, stride_c_mul, batch_size));
-                }
-              }
-            }
+            omatadd_batch_default.push_back(std::make_tuple(
+                trans_a, trans_b, m, n, alpha, beta, lda_mul, ldb_mul, ldc_mul,
+                stride_a_mul, stride_b_mul, stride_c_mul, batch_size));
           }
         }
       }

From c6f2d0fb9ffb03ca134a25e2a96322d020af7c31 Mon Sep 17 00:00:00 2001
From: Ouadie EL FAROUKI <ouadie.elfarouki@codeplay.com>
Date: Thu, 21 Dec 2023 22:02:05 +0000
Subject: [PATCH 3/5] Removed redundant padding members from transpose kernels

---
 include/operations/extension/transpose.h | 16 ----------------
 src/operations/extension/transpose.hpp   | 22 ++++++++++++----------
 2 files changed, 12 insertions(+), 26 deletions(-)

diff --git a/include/operations/extension/transpose.h b/include/operations/extension/transpose.h
index 71921b539..b684f0b01 100644
--- a/include/operations/extension/transpose.h
+++ b/include/operations/extension/transpose.h
@@ -80,11 +80,6 @@ class Transpose {
   static constexpr const index_t inner_tile_size_ = wg_size / Tile_size;
   static constexpr const index_t inner_tile_count_ =
       Tile_size / inner_tile_size_;
-  // Minimum number of Tile-mutliple rows & columns to cover the matrices
-  index_t M_pad_;
-  index_t N_pad_;
-  // Total size of Tile-mutliple covering matrix
-  index_t size_pad_;
   // Batch size when using batched transpose
   index_t batch_size_;
   // Number of contiguous elements to be used in local memory to avoid bank
@@ -115,9 +110,6 @@ class Transpose {
         stride_a_(stride_a),
         stride_at_(stride_at),
         inc_at_(inc_at),
-        M_pad_(tile_count_m_ * Tile_size),
-        N_pad_(tile_count_n_ * Tile_size),
-        size_pad_(M_pad_ * N_pad_),
         batch_size_(batch_size) {}
 
   index_t get_size() const;
@@ -209,11 +201,6 @@ class TransposeAdd {
   static constexpr const index_t inner_tile_size_ = wg_size / Tile_size;
   static constexpr const index_t inner_tile_count_ =
       Tile_size / inner_tile_size_;
-  // Minimum number of Tile-mutliple rows & columns to cover the output matrix
-  index_t M_pad_;
-  index_t N_pad_;
-  // Total size of Tile-mutliple covering matrix
-  index_t size_pad_;
   // Batch size when using batched transpose
   index_t batch_size_;
   // Number of contiguous elements to be used in local memory to avoid bank
@@ -246,9 +233,6 @@ class TransposeAdd {
         tile_count_m_((M_ - 1) / Tile_size + 1),
         tile_count_n_((N_ - 1) / Tile_size + 1),
         tile_count_total_(tile_count_m_ * tile_count_n_),
-        M_pad_(tile_count_m_ * Tile_size),
-        N_pad_(tile_count_n_ * Tile_size),
-        size_pad_(M_pad_ * N_pad_),
         batch_size_(batch_size) {}
 
   index_t get_size() const;
diff --git a/src/operations/extension/transpose.hpp b/src/operations/extension/transpose.hpp
index 98ecf8e03..87485660e 100644
--- a/src/operations/extension/transpose.hpp
+++ b/src/operations/extension/transpose.hpp
@@ -55,7 +55,7 @@ Transpose<in_place, Tile_size, wg_size, cl_size, local_memory, in_t, out_t,
           element_t>::get_size() const {
   // Smallest TileSize square-multiple containing input/output matrices times
   // batch_size
-  return (size_pad_ * batch_size_);
+  return (tile_count_total_ * Tile_size * Tile_size * batch_size_);
 }
 
 template <bool in_place, int Tile_size, int wg_size, int cl_size,
@@ -254,7 +254,7 @@ PORTBLAS_INLINE typename in1_t::index_t
 TransposeAdd<both_trans, Tile_size, wg_size, cl_size, local_memory, in1_t,
              in2_t, out_t, element_t>::get_size() const {
   // Smallest TileSize square-multiple containing input/output matrices
-  return (size_pad_ * batch_size_);
+  return (tile_count_total_ * Tile_size * Tile_size * batch_size_);
 }
 
 template <bool both_trans, int Tile_size, int wg_size, int cl_size,
@@ -276,10 +276,10 @@ TransposeAdd<both_trans, Tile_size, wg_size, cl_size, local_memory, in1_t,
  * @param in_a_idx [output] the input A global-memory index
  * @param in_b_idx [output] the input B global-memory index
  * @param out_idx [output] the output C global-memory index
- * @param i [output] the global row-index (A & B when both_trans -> [0,N_], B &
- *C otherwise -> [0,M_])
- * @param j [output] the global col-index (A & B when both_trans -> [0,M_], B &
- *C otherwise -> [0,N_])
+ * @param i [output] the global row-index (A & B when both_trans -> [0,N_],
+ * B & C otherwise -> [0,M_])
+ * @param j [output] the global col-index (A & B when both_trans -> [0,M_],
+ * B & C otherwise -> [0,N_])
  */
 template <bool both_trans, int Tile_size, int wg_size, int cl_size,
           bool local_memory, typename in1_t, typename in2_t, typename out_t,
@@ -461,7 +461,8 @@ TransposeAdd<both_trans, Tile_size, wg_size, cl_size, local_memory, in1_t,
       // Compute & Copy sum/scaled input to local memory (before transpose)
       for (index_t l = 0; l < inner_tile_count_; l++) {
         if (j_block_start + jl + l * inner_tile_size_ < M_) {
-          // Compute & Copy sum/scaled input to local memory (before transpose)
+          // Compute & Copy sum/scaled input to local memory (before
+          // transpose)
           local[in_local_id +
                 l * (get_non_bank_conflict_line_size() + 1) *
                     (inner_tile_size_ / get_num_tiles_per_line())] =
@@ -490,7 +491,8 @@ TransposeAdd<both_trans, Tile_size, wg_size, cl_size, local_memory, in1_t,
     if (j_block_start + il < N_) {
       for (index_t l = 0; l < inner_tile_count_; l++) {
         if (i_block_start + jl + l * inner_tile_size_ < M_) {
-          // Compute & Copy sum/scaled input to local memory (before transpose)
+          // Compute & Copy sum/scaled input to local memory (before
+          // transpose)
           local[in_local_id +
                 l * (get_non_bank_conflict_line_size() + 1) *
                     (inner_tile_size_ / get_num_tiles_per_line())] =
@@ -501,8 +503,8 @@ TransposeAdd<both_trans, Tile_size, wg_size, cl_size, local_memory, in1_t,
 
     id.barrier(cl::sycl::access::fence_space::local_space);
 
-    // Transposed copy of previous output from local memory and scaled addition
-    // with 2nd non transposed matrix B
+    // Transposed copy of previous output from local memory and scaled
+    // addition with 2nd non transposed matrix B
     if (i_block_start + il < M_) {
       for (index_t l = 0; l < inner_tile_count_; l++) {
         if (j_block_start + jl + l * inner_tile_size_ < N_) {

From 3add384fb7e1917b86af4db9372fc27ed81311ca Mon Sep 17 00:00:00 2001
From: Ouadie EL FAROUKI <ouadie.elfarouki@codeplay.com>
Date: Mon, 1 Jan 2024 13:29:53 +0000
Subject: [PATCH 4/5] Reduced omatadd-batched and omatcopy-batched test cases

---
 test/unittest/extension/omatadd_batched_test.cpp  |  8 ++++----
 test/unittest/extension/omatcopy_batched_test.cpp | 12 ++++++------
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/test/unittest/extension/omatadd_batched_test.cpp b/test/unittest/extension/omatadd_batched_test.cpp
index d0ce57751..f65a97c92 100644
--- a/test/unittest/extension/omatadd_batched_test.cpp
+++ b/test/unittest/extension/omatadd_batched_test.cpp
@@ -160,15 +160,15 @@ const auto combi =
                        ::testing::Values<char>('n', 't'),  // trans_b
                        ::testing::Values<index_t>(64, 129, 255),  // m
                        ::testing::Values<index_t>(64, 129, 255),  // n
-                       ::testing::Values<scalar_t>(2.5),            // alpha
-                       ::testing::Values<scalar_t>(3.5),            // beta
+                       ::testing::Values<scalar_t>(2.5),          // alpha
+                       ::testing::Values<scalar_t>(3.5),          // beta
                        ::testing::Values<index_t>(1, 2),          // lda_mul
                        ::testing::Values<index_t>(1, 2),          // ldb_mul
-                       ::testing::Values<index_t>(1, 2, 3),       // ldc_mul
+                       ::testing::Values<index_t>(3),             // ldc_mul
                        ::testing::Values<index_t>(1, 3),          // stride_a_m
                        ::testing::Values<index_t>(1, 3),          // stride_b_m
                        ::testing::Values<index_t>(1, 3),          // stride_c_m
-                       ::testing::Values<index_t>(2, 3));         // batch_size
+                       ::testing::Values<index_t>(3));            // batch_size
 #endif
 
 template <class T>
diff --git a/test/unittest/extension/omatcopy_batched_test.cpp b/test/unittest/extension/omatcopy_batched_test.cpp
index e15b8405e..4f6569653 100644
--- a/test/unittest/extension/omatcopy_batched_test.cpp
+++ b/test/unittest/extension/omatcopy_batched_test.cpp
@@ -121,7 +121,7 @@ const auto combi =
                        ::testing::Values<char>('n', 't'),              // trans
                        ::testing::Values<index_t>(1024, 4050, 16380),  // m
                        ::testing::Values<index_t>(1024, 4050, 16380),  // n
-                       ::testing::Values<scalar_t>(0, 1.05, -20.01),   // alpha
+                       ::testing::Values<scalar_t>(1.05, -20.01),      // alpha
                        ::testing::Values<index_t>(3, 5),     // ld_in_m
                        ::testing::Values<index_t>(3, 5),     // ld_out_m
                        ::testing::Values<index_t>(5, 10),    // stride_in_m
@@ -134,12 +134,12 @@ const auto combi =
                        ::testing::Values<char>('n', 't'),         // trans
                        ::testing::Values<index_t>(64, 129, 255),  // m
                        ::testing::Values<index_t>(64, 129, 255),  // n
-                       ::testing::Values<scalar_t>(0, 2.5),         // alpha
-                       ::testing::Values<index_t>(1, 2, 3),       // ld_in_m
-                       ::testing::Values<index_t>(1, 2, 3),       // ld_out_m
+                       ::testing::Values<scalar_t>(2.5),          // alpha
+                       ::testing::Values<index_t>(1, 3),          // ld_in_m
+                       ::testing::Values<index_t>(1, 3),          // ld_out_m
                        ::testing::Values<index_t>(1, 3),          // stride_in_m
-                       ::testing::Values<index_t>(1, 3),      // stride_out_m
-                       ::testing::Values<index_t>(1, 2, 5));  // batch_size
+                       ::testing::Values<index_t>(1, 3),   // stride_out_m
+                       ::testing::Values<index_t>(1, 5));  // batch_size
 #endif
 
 template <class T>

From eba77f16a1877c5bb439242b5c63d0c7a70544ad Mon Sep 17 00:00:00 2001
From: Ouadie EL FAROUKI <ouadie.elfarouki@codeplay.com>
Date: Mon, 1 Jan 2024 13:32:38 +0000
Subject: [PATCH 5/5] Applied formatting to extension tests

---
 test/unittest/extension/omatadd_test.cpp   | 36 ++++++++++++----------
 test/unittest/extension/omatcopy2_test.cpp | 23 ++++++++------
 test/unittest/extension/omatcopy_test.cpp  | 12 ++++----
 test/unittest/extension/reduction_test.cpp | 36 +++++++++++++---------
 test/unittest/extension/transpose_test.cpp |  9 +++---
 5 files changed, 65 insertions(+), 51 deletions(-)

diff --git a/test/unittest/extension/omatadd_test.cpp b/test/unittest/extension/omatadd_test.cpp
index 5c4ac6958..56ef0fca3 100644
--- a/test/unittest/extension/omatadd_test.cpp
+++ b/test/unittest/extension/omatadd_test.cpp
@@ -27,8 +27,8 @@
 #include "extension_reference.hpp"
 
 template <typename scalar_t>
-using combination_t = std::tuple<std::string, char, char, index_t, index_t, scalar_t,
-                                 scalar_t, index_t, index_t, index_t>;
+using combination_t = std::tuple<std::string, char, char, index_t, index_t,
+                                 scalar_t, scalar_t, index_t, index_t, index_t>;
 
 template <typename scalar_t, helper::AllocType mem_alloc>
 void run_test(const combination_t<scalar_t> combi) {
@@ -37,8 +37,8 @@ void run_test(const combination_t<scalar_t> combi) {
   index_t m, n, ld_a_mul, ld_b_mul, ld_c_mul;
   scalar_t alpha, beta;
 
-  std::tie(alloc, trans_a, trans_b, m, n, alpha, beta, ld_a_mul, ld_b_mul, ld_c_mul) =
-      combi;
+  std::tie(alloc, trans_a, trans_b, m, n, alpha, beta, ld_a_mul, ld_b_mul,
+           ld_c_mul) = combi;
 
   auto q = make_queue();
   blas::SB_Handle sb_handle(q);
@@ -70,12 +70,16 @@ void run_test(const combination_t<scalar_t> combi) {
   auto m_b_gpu = helper::allocate<mem_alloc, scalar_t>(size_m_b, q);
   auto m_c_gpu = helper::allocate<mem_alloc, scalar_t>(size_m_c, q);
 
-  auto copy_m_a = helper::copy_to_device<scalar_t>(q, A.data(), m_a_gpu, size_m_a);
-  auto copy_m_b = helper::copy_to_device<scalar_t>(q, B.data(), m_b_gpu, size_m_b);
-  auto copy_m_c = helper::copy_to_device<scalar_t>(q, C.data(), m_c_gpu, size_m_c);
+  auto copy_m_a =
+      helper::copy_to_device<scalar_t>(q, A.data(), m_a_gpu, size_m_a);
+  auto copy_m_b =
+      helper::copy_to_device<scalar_t>(q, B.data(), m_b_gpu, size_m_b);
+  auto copy_m_c =
+      helper::copy_to_device<scalar_t>(q, C.data(), m_c_gpu, size_m_c);
 
-  auto omatadd_event = blas::_omatadd(sb_handle, trans_a, trans_b, m, n, alpha, m_a_gpu, lda, beta,
-                 m_b_gpu, ldb, m_c_gpu, ldc, {copy_m_a, copy_m_b, copy_m_c});
+  auto omatadd_event = blas::_omatadd(sb_handle, trans_a, trans_b, m, n, alpha,
+                                      m_a_gpu, lda, beta, m_b_gpu, ldb, m_c_gpu,
+                                      ldc, {copy_m_a, copy_m_b, copy_m_c});
   sb_handle.wait(omatadd_event);
 
   auto event = blas::helper::copy_to_host<scalar_t>(
@@ -98,8 +102,8 @@ void run_test(const combination_t<scalar_t> combi) {
   index_t m, n, ld_a_mul, ld_b_mul, ld_c_mul;
   scalar_t alpha, beta;
 
-  std::tie(alloc, trans_a, trans_b, m, n, alpha, beta, ld_a_mul, ld_b_mul, ld_c_mul) =
-      combi;
+  std::tie(alloc, trans_a, trans_b, m, n, alpha, beta, ld_a_mul, ld_b_mul,
+           ld_c_mul) = combi;
 
   if (alloc == "usm") {
 #ifdef SB_ENABLE_USM
@@ -127,9 +131,9 @@ const auto combi =
 #else
 template <typename scalar_t>
 const auto combi =
-    ::testing::Combine(::testing::Values("usm", "buf"),        // allocation type
-                       ::testing::Values<char>('n', 't'),         // trans_a
-                       ::testing::Values<char>('n', 't'),         // trans_b
+    ::testing::Combine(::testing::Values("usm", "buf"),    // allocation type
+                       ::testing::Values<char>('n', 't'),  // trans_a
+                       ::testing::Values<char>('n', 't'),  // trans_b
                        ::testing::Values<index_t>(64, 129, 255),  // m
                        ::testing::Values<index_t>(64, 129, 255),  // n
                        ::testing::Values<scalar_t>(0, 1, 2),      // alpha
@@ -146,8 +150,8 @@ static std::string generate_name(
   char trans_a, trans_b;
   index_t m, n, lda_mul, ldb_mul, ldc_mul;
   T alpha, beta;
-  BLAS_GENERATE_NAME(info.param, alloc, trans_a, trans_b, m, n, alpha, beta, lda_mul,
-                     ldb_mul, ldc_mul);
+  BLAS_GENERATE_NAME(info.param, alloc, trans_a, trans_b, m, n, alpha, beta,
+                     lda_mul, ldb_mul, ldc_mul);
 }
 
 BLAS_REGISTER_TEST_ALL(OmatAdd, combination_t, combi, generate_name);
diff --git a/test/unittest/extension/omatcopy2_test.cpp b/test/unittest/extension/omatcopy2_test.cpp
index bae7ba0e4..b29122b74 100644
--- a/test/unittest/extension/omatcopy2_test.cpp
+++ b/test/unittest/extension/omatcopy2_test.cpp
@@ -27,8 +27,8 @@
 #include "extension_reference.hpp"
 
 template <typename scalar_t>
-using combination_t = std::tuple<std::string, char, index_t, index_t, scalar_t, index_t,
-                                 index_t, index_t, index_t>;
+using combination_t = std::tuple<std::string, char, index_t, index_t, scalar_t,
+                                 index_t, index_t, index_t, index_t>;
 
 template <typename scalar_t, helper::AllocType mem_alloc>
 void run_test(const combination_t<scalar_t> combi) {
@@ -37,7 +37,8 @@ void run_test(const combination_t<scalar_t> combi) {
   index_t m, n, inc_in, ld_in_m, inc_out, ld_out_m;
   scalar_t alpha;
 
-  std::tie(alloc, trans, m, n, alpha, inc_in, ld_in_m, inc_out, ld_out_m) = combi;
+  std::tie(alloc, trans, m, n, alpha, inc_in, ld_in_m, inc_out, ld_out_m) =
+      combi;
 
   // Leading dimensions are computed as multiples of the minimum value specified
   // in the oneMKL documentation at :
@@ -77,8 +78,9 @@ void run_test(const combination_t<scalar_t> combi) {
   auto copy_out =
       helper::copy_to_device<scalar_t>(q, B.data(), matrix_out, m_b_size);
 
-  auto omatcopy2_event = blas::_omatcopy2(sb_handle, trans, m, n, alpha, matrix_in, ld_in, inc_in,
-                   matrix_out, ld_out, inc_out, {copy_in, copy_out});
+  auto omatcopy2_event =
+      blas::_omatcopy2(sb_handle, trans, m, n, alpha, matrix_in, ld_in, inc_in,
+                       matrix_out, ld_out, inc_out, {copy_in, copy_out});
 
   sb_handle.wait(omatcopy2_event);
 
@@ -101,7 +103,8 @@ void run_test(const combination_t<scalar_t> combi) {
   index_t m, n, inc_in, ld_in_m, inc_out, ld_out_m;
   scalar_t alpha;
 
-  std::tie(alloc, trans, m, n, alpha, inc_in, ld_in_m, inc_out, ld_out_m) = combi;
+  std::tie(alloc, trans, m, n, alpha, inc_in, ld_in_m, inc_out, ld_out_m) =
+      combi;
 
   if (alloc == "usm") {
 #ifdef SB_ENABLE_USM
@@ -129,8 +132,8 @@ const auto combi =
 #else
 template <typename scalar_t>
 const auto combi =
-    ::testing::Combine(::testing::Values("usm", "buf"),        // allocation type
-                       ::testing::Values<char>('n', 't'),         // trans
+    ::testing::Combine(::testing::Values("usm", "buf"),    // allocation type
+                       ::testing::Values<char>('n', 't'),  // trans
                        ::testing::Values<index_t>(64, 129, 255),  // m
                        ::testing::Values<index_t>(64, 129, 255),  // n
                        ::testing::Values<scalar_t>(0, 2),         // alpha
@@ -147,8 +150,8 @@ static std::string generate_name(
   char trans;
   index_t m, n, inc_in, ld_in_m, inc_out, ld_out_m;
   T alpha;
-  BLAS_GENERATE_NAME(info.param, alloc, trans, m, n, alpha, inc_in, ld_in_m, inc_out,
-                     ld_out_m);
+  BLAS_GENERATE_NAME(info.param, alloc, trans, m, n, alpha, inc_in, ld_in_m,
+                     inc_out, ld_out_m);
 }
 
 BLAS_REGISTER_TEST_ALL(OmatCopy2, combination_t, combi, generate_name);
diff --git a/test/unittest/extension/omatcopy_test.cpp b/test/unittest/extension/omatcopy_test.cpp
index a7a1e49a8..c7737f1a3 100644
--- a/test/unittest/extension/omatcopy_test.cpp
+++ b/test/unittest/extension/omatcopy_test.cpp
@@ -69,8 +69,9 @@ void run_test(const combination_t<scalar_t> combi) {
   auto copy_out =
       helper::copy_to_device<scalar_t>(q, B.data(), matrix_out, size_b);
 
-  auto omatcopy_event = blas::_omatcopy(sb_handle, trans, m, n, alpha, matrix_in, ld_in, matrix_out,
-                  ld_out, {copy_in, copy_out});
+  auto omatcopy_event =
+      blas::_omatcopy(sb_handle, trans, m, n, alpha, matrix_in, ld_in,
+                      matrix_out, ld_out, {copy_in, copy_out});
 
   sb_handle.wait(omatcopy_event);
 
@@ -106,7 +107,6 @@ void run_test(const combination_t<scalar_t> combi) {
   }
 }
 
-
 #ifdef STRESS_TESTING
 template <typename scalar_t>
 const auto combi =
@@ -120,11 +120,11 @@ const auto combi =
 #else
 template <typename scalar_t>
 const auto combi =
-    ::testing::Combine(::testing::Values("usm", "buf"),        // allocation type
-                       ::testing::Values<char>('n', 't'),         // trans
+    ::testing::Combine(::testing::Values("usm", "buf"),    // allocation type
+                       ::testing::Values<char>('n', 't'),  // trans
                        ::testing::Values<index_t>(64, 129, 255),  // m
                        ::testing::Values<index_t>(64, 129, 255),  // n
-                       ::testing::Values<scalar_t>(0, 1, 2),   // alpha
+                       ::testing::Values<scalar_t>(0, 1, 2),      // alpha
                        ::testing::Values<index_t>(1, 2, 3),       // ld_in_m
                        ::testing::Values<index_t>(1, 2, 3));      // ld_out_m
 #endif
diff --git a/test/unittest/extension/reduction_test.cpp b/test/unittest/extension/reduction_test.cpp
index 8d98a9425..7dc84d5c1 100644
--- a/test/unittest/extension/reduction_test.cpp
+++ b/test/unittest/extension/reduction_test.cpp
@@ -71,7 +71,8 @@ static std::string generate_name(
   operator_t op;
   reduction_dim_t reductionDim;
   T unused;
-  BLAS_GENERATE_NAME(info.param, alloc, rows, cols, ldMul, op, reductionDim, unused);
+  BLAS_GENERATE_NAME(info.param, alloc, rows, cols, ldMul, op, reductionDim,
+                     unused);
 }
 
 template <typename scalar_t, helper::AllocType mem_alloc>
@@ -173,9 +174,9 @@ void run_test(const combination_t<scalar_t> combi) {
   if (op == operator_t::Mean) {
     const auto nelems = reduction_dim == reduction_dim_t::outer ? cols : rows;
     std::transform(out_v_cpu.begin(), out_v_cpu.end(), out_v_cpu.begin(),
-                    [=](scalar_t val) -> scalar_t {
-                      return val / static_cast<scalar_t>(nelems);
-                    });
+                   [=](scalar_t val) -> scalar_t {
+                     return val / static_cast<scalar_t>(nelems);
+                   });
   }
 
   auto m_in_gpu =
@@ -183,37 +184,43 @@ void run_test(const combination_t<scalar_t> combi) {
   auto v_out_gpu =
       blas::helper::allocate<mem_alloc, scalar_t>(out_size, q);  // out_v_gpu
 
-  auto copy_m = blas::helper::copy_to_device<scalar_t>(q, in_m.data(),
-                                                        m_in_gpu, ld * cols);
+  auto copy_m = blas::helper::copy_to_device<scalar_t>(q, in_m.data(), m_in_gpu,
+                                                       ld * cols);
   auto copy_v = blas::helper::copy_to_device<scalar_t>(q, out_v_gpu.data(),
-                                                        v_out_gpu, out_size);
+                                                       v_out_gpu, out_size);
 
   blas::SB_Handle::event_t ev;
   try {
     switch (op) {
       case operator_t::Add:
         ev = extension::_reduction<AddOperator, scalar_t>(
-            sb_handle, m_in_gpu, ld, v_out_gpu, rows, cols, reduction_dim, {copy_m, copy_v});
+            sb_handle, m_in_gpu, ld, v_out_gpu, rows, cols, reduction_dim,
+            {copy_m, copy_v});
         break;
       case operator_t::Product:
         ev = extension::_reduction<ProductOperator, scalar_t>(
-            sb_handle, m_in_gpu, ld, v_out_gpu, rows, cols, reduction_dim, {copy_m, copy_v});
+            sb_handle, m_in_gpu, ld, v_out_gpu, rows, cols, reduction_dim,
+            {copy_m, copy_v});
         break;
       case operator_t::Max:
         ev = extension::_reduction<MaxOperator, scalar_t>(
-            sb_handle, m_in_gpu, ld, v_out_gpu, rows, cols, reduction_dim, {copy_m, copy_v});
+            sb_handle, m_in_gpu, ld, v_out_gpu, rows, cols, reduction_dim,
+            {copy_m, copy_v});
         break;
       case operator_t::Min:
         ev = extension::_reduction<MinOperator, scalar_t>(
-            sb_handle, m_in_gpu, ld, v_out_gpu, rows, cols, reduction_dim, {copy_m, copy_v});
+            sb_handle, m_in_gpu, ld, v_out_gpu, rows, cols, reduction_dim,
+            {copy_m, copy_v});
         break;
       case operator_t::AbsoluteAdd:
         ev = extension::_reduction<AbsoluteAddOperator, scalar_t>(
-            sb_handle, m_in_gpu, ld, v_out_gpu, rows, cols, reduction_dim, {copy_m, copy_v});
+            sb_handle, m_in_gpu, ld, v_out_gpu, rows, cols, reduction_dim,
+            {copy_m, copy_v});
         break;
       case operator_t::Mean:
         ev = extension::_reduction<MeanOperator, scalar_t>(
-            sb_handle, m_in_gpu, ld, v_out_gpu, rows, cols, reduction_dim, {copy_m, copy_v});
+            sb_handle, m_in_gpu, ld, v_out_gpu, rows, cols, reduction_dim,
+            {copy_m, copy_v});
         break;
     }
   } catch (cl::sycl::exception& e) {
@@ -253,5 +260,4 @@ void run_test(const combination_t<scalar_t> combi) {
   }
 }
 
-BLAS_REGISTER_TEST_ALL(ReductionPartial, combination_t, combi,
-                               generate_name);
+BLAS_REGISTER_TEST_ALL(ReductionPartial, combination_t, combi, generate_name);
diff --git a/test/unittest/extension/transpose_test.cpp b/test/unittest/extension/transpose_test.cpp
index d3c6a0163..1167c1ddb 100644
--- a/test/unittest/extension/transpose_test.cpp
+++ b/test/unittest/extension/transpose_test.cpp
@@ -70,8 +70,9 @@ void run_test(const combination_t<scalar_t>& combi) {
     auto copy_out =
         helper::copy_to_device<scalar_t>(q, B.data(), matrix_out, size_b);
 
-    auto trans_event = blas::extension::_transpose<scalar_t>(sb_handle, m, n, matrix_in, ld_in,
-                                          matrix_out, ld_out, {copy_in, copy_out});
+    auto trans_event = blas::extension::_transpose<scalar_t>(
+        sb_handle, m, n, matrix_in, ld_in, matrix_out, ld_out,
+        {copy_in, copy_out});
 
     sb_handle.wait(trans_event);
 
@@ -114,8 +115,8 @@ void run_test(const combination_t<scalar_t> combi) {
 
 template <typename scalar_t>
 const auto combi =
-    ::testing::Combine(::testing::Values("usm", "buf"),      // allocation type
-                       ::testing::Values('i', 'o'),  // Inplace | Outplace
+    ::testing::Combine(::testing::Values("usm", "buf"),  // allocation type
+                       ::testing::Values('i', 'o'),      // Inplace | Outplace
                        ::testing::Values<index_t>(64, 129, 255),  // m
                        ::testing::Values<index_t>(64, 129, 255),  // n
                        ::testing::Values<index_t>(1, 2, 3),       // ld_in_m