From 5666d31c98ccf8b9901530f3c2760e129b3b5c10 Mon Sep 17 00:00:00 2001 From: pgorlani Date: Thu, 12 Oct 2023 10:48:28 +0100 Subject: [PATCH 1/6] Store temporary memory allocations --- include/sb_handle/portblas_handle.h | 77 +++++++++++++++- src/interface/blas2_interface.hpp | 81 ++++++++-------- src/interface/trsm_interface.hpp | 18 ++-- src/sb_handle/portblas_handle.hpp | 138 +++++++++++++++++++++++++--- 4 files changed, 251 insertions(+), 63 deletions(-) diff --git a/include/sb_handle/portblas_handle.h b/include/sb_handle/portblas_handle.h index 08b9a1b61..f873c2950 100644 --- a/include/sb_handle/portblas_handle.h +++ b/include/sb_handle/portblas_handle.h @@ -22,7 +22,6 @@ * @filename portblas_handle.h * **************************************************************************/ - #ifndef PORTBLAS_HANDLE_H #define PORTBLAS_HANDLE_H #include "blas_meta.h" @@ -31,6 +30,9 @@ #include "operations/blas3_trees.h" #include "operations/extension/reduction.h" #include "portblas_helper.h" +#include +#include + namespace blas { /** SB_Handle. @@ -50,7 +52,61 @@ class SB_Handle { : q_(q), workGroupSize_(helper::get_work_group_size(q)), localMemorySupport_(helper::has_local_memory(q)), - computeUnits_(helper::get_num_compute_units(q)) {} + computeUnits_(helper::get_num_compute_units(q)), + tot_size_temp_mem_(0) {} + + ~SB_Handle() { +#ifdef VERBOSE + std::cout << "Buffers destroyed on SB_Handle destruction: " + << temp_buffer_map_.size() << std::endl; +#endif + +#ifdef SB_ENABLE_USM + // synchronize with the host on destruction + q_.wait(); + +#ifdef VERBOSE + std::cout << "USM allocations freed on SB_Handle destruction: " + << temp_usm_map_.size() << std::endl; +#endif + + for (const temp_usm_map_t::value_type& p : temp_usm_map_) + cl::sycl::free(p.second, q_); +#endif + } + +#ifdef SB_ENABLE_USM + template + typename std::enable_if< + alloc == helper::AllocType::usm, + typename helper::AllocHelper::type>::type + acquire_temp_mem(size_t size); +#endif + + template + typename std::enable_if< + alloc == helper::AllocType::buffer, + typename helper::AllocHelper::type>::type + acquire_temp_mem(size_t size); + +#ifdef SB_ENABLE_USM + template + typename std::enable_if< + std::is_same::type, + helper::AllocType::usm>::type>::value, + cl::sycl::event>::type + release_temp_mem(std::vector dependencies, + const container_t& mem); +#endif + + template + typename std::enable_if< + std::is_same::type, + helper::AllocType::buffer>::type>::value, + cl::sycl::event>::type + release_temp_mem(std::vector, const container_t& mem); template event_t execute(expression_tree_t tree, const event_t& dependencies = {}); @@ -147,12 +203,27 @@ class SB_Handle { } private: + using temp_usm_map_t = std::multimap; + using temp_usm_size_map_t = std::map; + using temp_buffer_map_t = std::multimap>; + static_assert(sizeof(temp_buffer_map_t::mapped_type::value_type) == 1); + queue_t q_; const size_t workGroupSize_; const bool localMemorySupport_; const size_t computeUnits_; + + size_t tot_size_temp_mem_; + static constexpr size_t max_size_temp_mem_ = 1e9; + + std::mutex map_mutex_; +#ifdef SB_ENABLE_USM + temp_usm_map_t temp_usm_map_; + temp_usm_size_map_t temp_usm_size_map_; +#endif + temp_buffer_map_t temp_buffer_map_; }; } // namespace blas - +#undef VERBOSE #endif // PORTBLAS_HANDLE_H diff --git a/src/interface/blas2_interface.hpp b/src/interface/blas2_interface.hpp index 96340af0b..de7d0807d 100644 --- a/src/interface/blas2_interface.hpp +++ b/src/interface/blas2_interface.hpp @@ -93,10 +93,10 @@ typename sb_handle_t::event_t _gemv_impl( const auto ld = is_transposed ? _N : _M; constexpr index_t one = 1; - auto dot_products_buffer = blas::helper::allocate < is_usm + auto dot_products_buffer = sb_handle.template acquire_temp_mem < is_usm ? helper::AllocType::usm : helper::AllocType::buffer, - element_t > (ld, sb_handle.get_queue()); + element_t > (ld); auto dot_products_matrix = make_matrix_view(dot_products_buffer, ld, one, ld); @@ -136,8 +136,8 @@ typename sb_handle_t::event_t _gemv_impl( gemvEvent, sb_handle.execute(assignOp, local_range, gemvEvent)); } - blas::helper::enqueue_deallocate(ret, dot_products_buffer, - sb_handle.get_queue()); + sb_handle.template release_temp_mem(ret, dot_products_buffer); + } else // Local memory kernel { // Calculate number of work groups per each dimension based on the local @@ -159,10 +159,10 @@ typename sb_handle_t::event_t _gemv_impl( const auto dot_products_buffer_size = ld * WGs_per_C; // Create the dot products buffer and matrix view - auto dot_products_buffer = blas::helper::allocate < is_usm + auto dot_products_buffer = sb_handle.template acquire_temp_mem < is_usm ? helper::AllocType::usm : helper::AllocType::buffer, - element_t > (dot_products_buffer_size, sb_handle.get_queue()); + element_t > (dot_products_buffer_size); auto dot_products_matrix = make_matrix_view(dot_products_buffer, ld, WGs_per_C, ld); @@ -205,8 +205,7 @@ typename sb_handle_t::event_t _gemv_impl( gemvEvent, sb_handle.execute(assignOp, local_range, gemvEvent)); } - blas::helper::enqueue_deallocate(ret, dot_products_buffer, - sb_handle.get_queue()); + sb_handle.template release_temp_mem(ret, dot_products_buffer); } return ret; } @@ -263,9 +262,10 @@ typename sb_handle_t::event_t _trmv_impl( using element_t = typename ValueType::type; constexpr bool is_usm = std::is_pointer::value; - auto valT1 = blas::helper::allocate < is_usm ? helper::AllocType::usm - : helper::AllocType::buffer, - element_t > (N * scratchSize, sb_handle.get_queue()); + auto valT1 = sb_handle.template acquire_temp_mem < is_usm + ? helper::AllocType::usm + : helper::AllocType::buffer, + element_t > (N * scratchSize); auto mat1 = make_matrix_view(valT1, N, scratchSize, scratchSize); if (data_layout_t::is_col_major()) { @@ -333,7 +333,7 @@ typename sb_handle_t::event_t _trmv_impl( auto assignOp = make_op(vx, addMOp); ret = concatenate_vectors(ret, sb_handle.execute(assignOp, localSize, ret)); - blas::helper::enqueue_deallocate(ret, valT1, sb_handle.get_queue()); + sb_handle.template release_temp_mem(ret, valT1); return ret; } @@ -373,10 +373,10 @@ typename sb_handle_t::event_t _trsv_impl( auto queue = sb_handle.get_queue(); constexpr bool is_usm = std::is_pointer::value; - auto sync_buffer = blas::helper::allocate < is_usm + auto sync_buffer = sb_handle.template acquire_temp_mem < is_usm ? blas::helper::AllocType::usm : blas::helper::AllocType::buffer, - int32_t > (sync_vec.size(), queue); + int32_t > (sync_vec.size()); auto copy_sync = blas::helper::copy_to_device( queue, sync_vec.data(), sync_buffer, sync_vec.size()); sb_handle.wait(copy_sync); @@ -395,7 +395,7 @@ typename sb_handle_t::event_t _trsv_impl( static_cast(subgroup_size * (subgroup_size + 2 + sub_num)), _dependencies); - blas::helper::enqueue_deallocate(ret, sync_buffer, queue); + sb_handle.template release_temp_mem(ret, sync_buffer); return ret; #endif @@ -465,17 +465,19 @@ typename sb_handle_t::event_t _symv_impl( ((scratchPadSize == 0) ? std::min(N, localSize) : 1) * nWGPerCol_R; constexpr bool is_usm = std::is_pointer::value; - auto valTR = blas::helper::allocate < is_usm ? helper::AllocType::usm - : helper::AllocType::buffer, - element_t > (N * scratchSize_R, sb_handle.get_queue()); + auto valTR = sb_handle.template acquire_temp_mem < is_usm + ? helper::AllocType::usm + : helper::AllocType::buffer, + element_t > (N * scratchSize_R); auto matR = make_matrix_view(valTR, N, scratchSize_R, scratchSize_R); const index_t scratchSize_C = nWGPerCol_C; - auto valTC = blas::helper::allocate < is_usm ? helper::AllocType::usm - : helper::AllocType::buffer, - element_t > (N * scratchSize_C, sb_handle.get_queue()); + auto valTC = sb_handle.template acquire_temp_mem < is_usm + ? helper::AllocType::usm + : helper::AllocType::buffer, + element_t > (N * scratchSize_C); auto matC = make_matrix_view(valTC, N, scratchSize_C, scratchSize_C); @@ -512,8 +514,8 @@ typename sb_handle_t::event_t _symv_impl( auto assignOp = make_op(vy, addOp); ret = concatenate_vectors(ret, sb_handle.execute(assignOp, localSize, ret)); - blas::helper::enqueue_deallocate(ret, valTR, sb_handle.get_queue()); - blas::helper::enqueue_deallocate(ret, valTC, sb_handle.get_queue()); + sb_handle.template release_temp_mem(ret, valTR); + sb_handle.template release_temp_mem(ret, valTC); return ret; } @@ -646,9 +648,10 @@ typename sb_handle_t::event_t _tbmv_impl( constexpr bool is_usm = std::is_pointer::value; using element_t = typename ValueType::type; auto x_vector_size = _N; - auto res_buffer = blas::helper::allocate < is_usm ? helper::AllocType::usm - : helper::AllocType::buffer, - element_t > (x_vector_size, sb_handle.get_queue()); + auto res_buffer = sb_handle.template acquire_temp_mem < is_usm + ? helper::AllocType::usm + : helper::AllocType::buffer, + element_t > (x_vector_size); typename MatrixViewType::type mA = make_matrix_view(_mA, _K + 1, _N, _lda); @@ -666,7 +669,7 @@ typename sb_handle_t::event_t _tbmv_impl( auto assignEvent = sb_handle.execute(assignOp, local_range, tbmvEvent); auto ret = concatenate_vectors(tbmvEvent, assignEvent); - blas::helper::enqueue_deallocate(ret, res_buffer, sb_handle.get_queue()); + sb_handle.template release_temp_mem(ret, res_buffer); return ret; } @@ -692,9 +695,10 @@ typename sb_handle_t::event_t _tpmv_impl( using element_t = typename ValueType::type; constexpr bool is_usm = std::is_pointer::value; - auto res_buffer = blas::helper::allocate < is_usm ? helper::AllocType::usm - : helper::AllocType::buffer, - element_t > (vector_size, sb_handle.get_queue()); + auto res_buffer = sb_handle.template acquire_temp_mem < is_usm + ? helper::AllocType::usm + : helper::AllocType::buffer, + element_t > (vector_size); typename MatrixViewType::type mA = make_matrix_view(_mA, one, matrix_size, matrix_size); @@ -719,7 +723,7 @@ typename sb_handle_t::event_t _tpmv_impl( auto ret = concatenate_vectors(tpmvEvent, sb_handle.execute(assignOp, tpmvEvent)); - blas::helper::enqueue_deallocate(ret, res_buffer, sb_handle.get_queue()); + sb_handle.template release_temp_mem(ret, res_buffer); return ret; } @@ -761,10 +765,10 @@ typename sb_handle_t::event_t _tbsv_impl( constexpr bool is_usm = std::is_pointer::value; auto queue = sb_handle.get_queue(); - auto sync_buffer = blas::helper::allocate < is_usm + auto sync_buffer = sb_handle.template acquire_temp_mem < is_usm ? blas::helper::AllocType::usm : blas::helper::AllocType::buffer, - int32_t > (sync_vec.size(), queue); + int32_t > (sync_vec.size()); auto copy_sync = blas::helper::copy_to_device( queue, sync_vec.data(), sync_buffer, sync_vec.size()); sb_handle.wait(copy_sync); @@ -782,7 +786,7 @@ typename sb_handle_t::event_t _tbsv_impl( static_cast(subgroup_size * (subgroup_size + 2 + sub_num)), _dependencies); - blas::helper::enqueue_deallocate(ret, sync_buffer, queue); + sb_handle.template release_temp_mem(ret, sync_buffer); return ret; #endif @@ -825,11 +829,10 @@ typename sb_handle_t::event_t _tpsv_impl( constexpr bool is_usm = std::is_pointer::value; auto queue = sb_handle.get_queue(); - auto sync_buffer = blas::helper::allocate < is_usm + auto sync_buffer = sb_handle.template acquire_temp_mem < is_usm ? blas::helper::AllocType::usm : blas::helper::AllocType::buffer, - int32_t > (sync_vec.size(), queue); - + int32_t > (sync_vec.size()); auto copy_sync = blas::helper::copy_to_device( queue, sync_vec.data(), sync_buffer, sync_vec.size()); sb_handle.wait(copy_sync); @@ -847,7 +850,9 @@ typename sb_handle_t::event_t _tpsv_impl( roundUp(sub_num * _N, sub_num * subgroup_size), static_cast(subgroup_size * (subgroup_size + 2 + sub_num)), _dependencies); - blas::helper::enqueue_deallocate(ret, sync_buffer, queue); + + sb_handle.template release_temp_mem(ret, sync_buffer); + return ret; #endif } diff --git a/src/interface/trsm_interface.hpp b/src/interface/trsm_interface.hpp index 75996cb1d..628341022 100644 --- a/src/interface/trsm_interface.hpp +++ b/src/interface/trsm_interface.hpp @@ -147,9 +147,10 @@ typename sb_handle_t::event_t _trsm( // filled with zeroes const index_t invASize = roundUp(K, blockSize) * blockSize; constexpr bool is_usm = std::is_pointer::value; - auto invA = helper::allocate < is_usm ? helper::AllocType::usm - : helper::AllocType::buffer, - element_t > (invASize, sb_handle.get_queue()); + auto invA = sb_handle.template acquire_temp_mem < is_usm + ? helper::AllocType::usm + : helper::AllocType::buffer, + element_t > (invASize); typename sb_handle_t::event_t event = {blas::helper::fill( sb_handle.get_queue(), invA, element_t{0}, invASize, _dependencies)}; trsmEvents = concatenate_vectors(trsmEvents, event); @@ -197,9 +198,10 @@ typename sb_handle_t::event_t _trsm( // output X will hold the TRSM result and will be copied to B at the end const index_t BSize = ldb * (N - 1) + M; const index_t ldx = ldb; - auto X = helper::allocate < is_usm ? helper::AllocType::usm - : helper::AllocType::buffer, - element_t > (BSize, sb_handle.get_queue()); + auto X = sb_handle.template acquire_temp_mem < is_usm + ? helper::AllocType::usm + : helper::AllocType::buffer, + element_t > (BSize); trsmEvents = concatenate_vectors( trsmEvents, internal::_copy( @@ -385,9 +387,9 @@ typename sb_handle_t::event_t _trsm( internal::_copy( sb_handle, BSize, X, 1, B, 1, trsmEvents)); - helper::enqueue_deallocate(trsmEvents, invA, sb_handle.get_queue()); + sb_handle.template release_temp_mem(trsmEvents, invA); - helper::enqueue_deallocate(trsmEvents, X, sb_handle.get_queue()); + sb_handle.template release_temp_mem(trsmEvents, X); return trsmEvents; } diff --git a/src/sb_handle/portblas_handle.hpp b/src/sb_handle/portblas_handle.hpp index 2ad56f56f..903d74d7f 100644 --- a/src/sb_handle/portblas_handle.hpp +++ b/src/sb_handle/portblas_handle.hpp @@ -38,6 +38,116 @@ #include "views/view.h" namespace blas { +#ifdef SB_ENABLE_USM +template +typename std::enable_if< + alloc == helper::AllocType::usm, + typename helper::AllocHelper::type>::type +SB_Handle::acquire_temp_mem(size_t size) { + const size_t byteSize = size * sizeof(value_t); + map_mutex_.lock(); + auto found = temp_usm_map_.lower_bound(byteSize); + if (found != temp_usm_map_.end()) { + temp_usm_map_.extract(found); + tot_size_temp_mem_ -= found->first; + map_mutex_.unlock(); + return reinterpret_cast(found->second); + } else { + map_mutex_.unlock(); +#ifdef VERBOSE + std::cout << "Create a temporary USM allocation of " << byteSize + << " bytes." << std::endl; +#endif + value_t* tmp = cl::sycl::malloc_device(size, q_); + map_mutex_.lock(); + temp_usm_size_map_.emplace( + reinterpret_cast(tmp), byteSize); + map_mutex_.unlock(); + return tmp; + } +} +#endif + +template +typename std::enable_if< + alloc == helper::AllocType::buffer, + typename helper::AllocHelper::type>::type +SB_Handle::acquire_temp_mem(size_t size) { + const size_t byteSize = size * sizeof(value_t); + map_mutex_.lock(); + auto found = temp_buffer_map_.lower_bound(byteSize); + if (found != temp_buffer_map_.end()) { + cl::sycl::buffer buff = + found->second; + temp_buffer_map_.extract(found); + tot_size_temp_mem_ -= found->first; + map_mutex_.unlock(); + return blas::BufferIterator{buff.reinterpret( + cl::sycl::range<1>(found->first / sizeof(value_t)))}; + } else { + map_mutex_.unlock(); +#ifdef VERBOSE + std::cout << "Create a temporary buffer of " << byteSize << " bytes." + << std::endl; +#endif + return make_sycl_iterator_buffer(size); + } +} + +#ifdef SB_ENABLE_USM +template +typename std::enable_if< + std::is_same::type, + helper::AllocType::usm>::type>::value, + cl::sycl::event>::type +SB_Handle::release_temp_mem(std::vector dependencies, + const container_t& mem) { + return q_.submit([&](cl::sycl::handler& cgh) { + cgh.depends_on(dependencies); + map_mutex_.lock(); + auto found = temp_usm_size_map_.find( + reinterpret_cast(mem)); + const size_t byteSize = found->second; + if (tot_size_temp_mem_ + byteSize > max_size_temp_mem_) { + temp_usm_size_map_.erase(found); + map_mutex_.unlock(); + cl::sycl::free(mem, q_); + } else { + tot_size_temp_mem_ += byteSize; + temp_usm_map_.emplace(byteSize, + reinterpret_cast(mem)); + map_mutex_.unlock(); + } + }); +} +#endif +#undef VERBOSE +template +typename std::enable_if< + std::is_same::type, + helper::AllocType::buffer>::type>::value, + cl::sycl::event>::type +SB_Handle::release_temp_mem(std::vector dependencies, + const container_t& mem) { + return q_.submit([&](cl::sycl::handler& cgh) { + cgh.depends_on(dependencies); + const size_t byteSize = mem.get_buffer().byte_size(); + if (tot_size_temp_mem_ + byteSize <= max_size_temp_mem_) { + map_mutex_.lock(); + tot_size_temp_mem_ += byteSize; + temp_buffer_map_.emplace( + byteSize, + mem.get_buffer() + .template reinterpret( + cl::sycl::range<1>( + byteSize / + sizeof(temp_buffer_map_t::mapped_type::value_type)))); + map_mutex_.unlock(); + } + }); +} /*! * @brief Executes the tree without defining required shared memory. @@ -114,12 +224,12 @@ inline typename SB_Handle::event_t SB_Handle::execute( // Two accessors to local memory auto sharedSize = ((nWG < localSize) ? localSize : nWG); constexpr bool is_usm = std::is_pointer::value; - auto shMem1 = blas::helper::allocate < is_usm ? helper::AllocType::usm - : helper::AllocType::buffer, - typename lhs_t::value_t > (sharedSize, q_); - auto shMem2 = blas::helper::allocate < is_usm ? helper::AllocType::usm - : helper::AllocType::buffer, - typename lhs_t::value_t > (sharedSize, q_); + auto shMem1 = acquire_temp_mem < is_usm ? helper::AllocType::usm + : helper::AllocType::buffer, + typename lhs_t::value_t > (sharedSize); + auto shMem2 = acquire_temp_mem < is_usm ? helper::AllocType::usm + : helper::AllocType::buffer, + typename lhs_t::value_t > (sharedSize); auto opShMem1 = make_vector_view(shMem1, typename lhs_t::increment_t(1), sharedSize); @@ -150,9 +260,9 @@ inline typename SB_Handle::event_t SB_Handle::execute( even = !even; } while (_N > 1); - blas::helper::enqueue_deallocate(event, shMem1, q_); + release_temp_mem(event, shMem1); - blas::helper::enqueue_deallocate(event, shMem2, q_); + release_temp_mem(event, shMem2); return event; } @@ -273,9 +383,9 @@ inline typename SB_Handle::event_t SB_Handle::execute( /* First step: partial gemm */ /* Create the cube buffer that will hold the output of the partial gemm */ - auto cube_buffer = helper::allocate < is_usm ? helper::AllocType::usm + auto cube_buffer = acquire_temp_mem < is_usm ? helper::AllocType::usm : helper::AllocType::buffer, - element_t > (rows * cols * depth, q_); + element_t > (rows * cols * depth); /* Create a first matrix view used for the partial gemm */ auto cube_gemm = @@ -309,9 +419,9 @@ inline typename SB_Handle::event_t SB_Handle::execute( /* Otherwise we reduce to a temporary buffer */ else { /* Create a temporary buffer to hold alpha * A * B */ - auto temp_buffer = helper::allocate < is_usm ? helper::AllocType::usm + auto temp_buffer = acquire_temp_mem < is_usm ? helper::AllocType::usm : helper::AllocType::buffer, - element_t > (rows * cols, q_); + element_t > (rows * cols); auto temp = make_matrix_view(temp_buffer, rows, cols, rows); /* Execute the reduction */ @@ -333,10 +443,10 @@ inline typename SB_Handle::event_t SB_Handle::execute( events = concatenate_vectors(events, execute(assignOp, events)); } - helper::enqueue_deallocate(events, temp_buffer, q_); + release_temp_mem(events, temp_buffer); } - helper::enqueue_deallocate(events, cube_buffer, q_); + release_temp_mem(events, cube_buffer); return events; } From e188e710c7a81aff70bc8fd57108df98b7bf3e66 Mon Sep 17 00:00:00 2001 From: pgorlani Date: Wed, 8 Nov 2023 16:23:46 +0000 Subject: [PATCH 2/6] Pass sb_handle as an argument to the auto tuners --- tools/auto_tuner/gen/generate_combinations.py | 2 +- tools/auto_tuner/include/gemm_tuner.hpp | 15 ++++++++------- tools/auto_tuner/include/tune.hpp | 2 +- tools/auto_tuner/include/tune_impl.hpp | 3 +-- tools/auto_tuner/include/utils.hpp | 10 ++-------- tools/auto_tuner/src/tune_all.cpp | 14 ++++++++++---- tools/auto_tuner/src/tune_nn.cpp | 7 +++++-- tools/auto_tuner/src/tune_nt.cpp | 7 +++++-- tools/auto_tuner/src/tune_tn.cpp | 7 +++++-- tools/auto_tuner/src/tune_tt.cpp | 7 +++++-- 10 files changed, 43 insertions(+), 31 deletions(-) diff --git a/tools/auto_tuner/gen/generate_combinations.py b/tools/auto_tuner/gen/generate_combinations.py index 4c10f250b..083d84bde 100644 --- a/tools/auto_tuner/gen/generate_combinations.py +++ b/tools/auto_tuner/gen/generate_combinations.py @@ -393,7 +393,7 @@ def write_source_files(config_list, config_source, output_dir): #define INSTANTIATE_TUNE(DTYPE, TRA, TRB, MEM, ALGO, BATCH, VEC, ...) \ template TestResultEntry \ tune<__VA_ARGS__, GemmConfig, DTYPE>( \ - int r, GemmArgs a); + portblas_handle_t &sb_handle, int r, GemmArgs a); #define BENCH_PARAMS(MEM, ALGO, BATCH, VEC, ...) \ INSTANTIATE_TUNE(float, false, false, MEM, ALGO, BATCH, VEC, __VA_ARGS__) \ diff --git a/tools/auto_tuner/include/gemm_tuner.hpp b/tools/auto_tuner/include/gemm_tuner.hpp index b3e67b702..e3ad3e391 100644 --- a/tools/auto_tuner/include/gemm_tuner.hpp +++ b/tools/auto_tuner/include/gemm_tuner.hpp @@ -67,11 +67,10 @@ inline std::vector interleaved_to_strided( } template -static TestResultEntry tune_portblas(int r, char transA, char transB, - GemmArgs a, +static TestResultEntry tune_portblas(portblas_handle_t &sb_handle, int r, + char transA, char transB, GemmArgs a, ::blas::gemm_batch_type_t batch_type) { TestResultEntry result("portBLAS gemm"); - auto sb_handle = get_portblas_handle(); { auto event = blas::helper::copy_to_device( sb_handle.get_queue(), a.init_c.data(), a.c, a.init_c.size()); @@ -98,7 +97,8 @@ static TestResultEntry tune_portblas(int r, char transA, char transB, } template -void run_tune_gemm(int seed, int m, int k, int n, int batch_size, int rep, +void run_tune_gemm(portblas_handle_t &sb_handle, int seed, int m, int k, int n, + int batch_size, int rep, ::blas::gemm_batch_type_t batch_type) { std::cout << std::scientific; @@ -157,7 +157,8 @@ void run_tune_gemm(int seed, int m, int k, int n, int batch_size, int rep, device_c, result_c, ldc, batch_size, expected_c}; { - auto result = tune_portblas(rep, *ta_str, *tb_str, args, batch_type); + auto result = + tune_portblas(sb_handle, rep, *ta_str, *tb_str, args, batch_type); results.push_back(result); } @@ -165,7 +166,7 @@ void run_tune_gemm(int seed, int m, int k, int n, int batch_size, int rep, do { \ auto result = \ tune<__VA_ARGS__, GemmConfig, \ - DataType>(rep, args); \ + DataType>(sb_handle, rep, args); \ results.push_back(result); \ } while (0); @@ -173,7 +174,7 @@ void run_tune_gemm(int seed, int m, int k, int n, int batch_size, int rep, #undef BENCH_PARAMS std::cout << "SIZE : " << results.size() << std::endl; - get_portblas_handle().wait(); + sb_handle.wait(); std::sort(results.begin(), results.end()); results.print_all(); } diff --git a/tools/auto_tuner/include/tune.hpp b/tools/auto_tuner/include/tune.hpp index ea6e75ea8..723e03284 100644 --- a/tools/auto_tuner/include/tune.hpp +++ b/tools/auto_tuner/include/tune.hpp @@ -30,6 +30,6 @@ template -TestResultEntry tune(int r, GemmArgs a); +TestResultEntry tune(portblas_handle_t &sb_handle, int r, GemmArgs a); #endif // PORTBLAS_TOOLS_AUTO_TUNER_TUNE_HPP_ diff --git a/tools/auto_tuner/include/tune_impl.hpp b/tools/auto_tuner/include/tune_impl.hpp index bd2f15685..f456ff9df 100644 --- a/tools/auto_tuner/include/tune_impl.hpp +++ b/tools/auto_tuner/include/tune_impl.hpp @@ -33,7 +33,7 @@ template -TestResultEntry tune(int r, GemmArgs a) { +TestResultEntry tune(portblas_handle_t &sb_handle, int r, GemmArgs a) { using Gemm = ::blas::Gemm< MatrixContainer, MatrixContainer, DoubleBuffer, Nbca, Nbcb, Cls, Tile, Config::TransA, Config::TransB, Config::SymmA, Config::SymmB, T, @@ -41,7 +41,6 @@ TestResultEntry tune(int r, GemmArgs a) { static_cast(Config::ShapeMode), static_cast(Config::VecType), VecSize, static_cast(Config::BatchType)>; TestResultEntry result(Gemm::get_type_string()); - auto sb_handle = get_portblas_handle(); { { auto event = blas::helper::copy_to_device( diff --git a/tools/auto_tuner/include/utils.hpp b/tools/auto_tuner/include/utils.hpp index a7d0e25c9..4bcbebc04 100644 --- a/tools/auto_tuner/include/utils.hpp +++ b/tools/auto_tuner/include/utils.hpp @@ -34,7 +34,7 @@ #include #include -inline portblas_handle_t make_portblas_handle() { +inline cl::sycl::queue make_sycl_queue() { cl::sycl::queue q( [=](cl::sycl::exception_list ex_list) { try { @@ -50,13 +50,7 @@ inline portblas_handle_t make_portblas_handle() { << q.get_device().get_info() << std::endl; - portblas_handle_t sb_handle(q); - return sb_handle; -} - -inline portblas_handle_t &get_portblas_handle() { - static portblas_handle_t sb_handle = make_portblas_handle(); - return sb_handle; + return q; } template diff --git a/tools/auto_tuner/src/tune_all.cpp b/tools/auto_tuner/src/tune_all.cpp index 499f8235b..8c8bc4d3a 100644 --- a/tools/auto_tuner/src/tune_all.cpp +++ b/tools/auto_tuner/src/tune_all.cpp @@ -53,15 +53,21 @@ int main(int argc, char *argv[]) { return -1; } } + + portblas_handle_t sb_handle(make_sycl_queue()); + std::cout << "======= testing nn ======" << std::endl; - run_tune_gemm(seed, m, k, n, batch_size, rep, + run_tune_gemm(sb_handle, seed, m, k, n, batch_size, rep, batch_type); std::cout << "======= testing nt ======" << std::endl; - run_tune_gemm(seed, m, k, n, batch_size, rep, batch_type); + run_tune_gemm(sb_handle, seed, m, k, n, batch_size, rep, + batch_type); std::cout << "======= testing tn ======" << std::endl; - run_tune_gemm(seed, m, k, n, batch_size, rep, batch_type); + run_tune_gemm(sb_handle, seed, m, k, n, batch_size, rep, + batch_type); std::cout << "======= testing tt ======" << std::endl; - run_tune_gemm(seed, m, k, n, batch_size, rep, batch_type); + run_tune_gemm(sb_handle, seed, m, k, n, batch_size, rep, + batch_type); return 0; } diff --git a/tools/auto_tuner/src/tune_nn.cpp b/tools/auto_tuner/src/tune_nn.cpp index 36265a9d3..c5c341052 100644 --- a/tools/auto_tuner/src/tune_nn.cpp +++ b/tools/auto_tuner/src/tune_nn.cpp @@ -56,8 +56,11 @@ int main(int argc, char *argv[]) { return -1; } } - run_tune_gemm(seed, m, k, n, batch_size, rep, - batch_type); + + portblas_handle_t sb_handle(make_sycl_queue()); + + run_tune_gemm(sb_handle, seed, m, k, n, batch_size, + rep, batch_type); return 0; } diff --git a/tools/auto_tuner/src/tune_nt.cpp b/tools/auto_tuner/src/tune_nt.cpp index 7fc3a0b14..d8d2819f4 100644 --- a/tools/auto_tuner/src/tune_nt.cpp +++ b/tools/auto_tuner/src/tune_nt.cpp @@ -56,8 +56,11 @@ int main(int argc, char *argv[]) { return -1; } } - run_tune_gemm(seed, m, k, n, batch_size, rep, - batch_type); + + portblas_handle_t sb_handle(make_sycl_queue()); + + run_tune_gemm(sb_handle, seed, m, k, n, batch_size, + rep, batch_type); return 0; } diff --git a/tools/auto_tuner/src/tune_tn.cpp b/tools/auto_tuner/src/tune_tn.cpp index d19845339..2683423f2 100644 --- a/tools/auto_tuner/src/tune_tn.cpp +++ b/tools/auto_tuner/src/tune_tn.cpp @@ -56,8 +56,11 @@ int main(int argc, char *argv[]) { return -1; } } - run_tune_gemm(seed, m, k, n, batch_size, rep, - batch_type); + + portblas_handle_t sb_handle(make_sycl_queue()); + + run_tune_gemm(sb_handle, seed, m, k, n, batch_size, + rep, batch_type); return 0; } diff --git a/tools/auto_tuner/src/tune_tt.cpp b/tools/auto_tuner/src/tune_tt.cpp index 245878c42..202d2f6e7 100644 --- a/tools/auto_tuner/src/tune_tt.cpp +++ b/tools/auto_tuner/src/tune_tt.cpp @@ -56,8 +56,11 @@ int main(int argc, char *argv[]) { return -1; } } - run_tune_gemm(seed, m, k, n, batch_size, rep, - batch_type); + + portblas_handle_t sb_handle(make_sycl_queue()); + + run_tune_gemm(sb_handle, seed, m, k, n, batch_size, + rep, batch_type); return 0; } From d7ef560122855f1d7a7217b1bbce3317ac209826 Mon Sep 17 00:00:00 2001 From: pgorlani Date: Wed, 8 Nov 2023 18:15:14 +0000 Subject: [PATCH 3/6] Delete copy constructor and assign operator --- include/sb_handle/portblas_handle.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/sb_handle/portblas_handle.h b/include/sb_handle/portblas_handle.h index f873c2950..2d9dd634c 100644 --- a/include/sb_handle/portblas_handle.h +++ b/include/sb_handle/portblas_handle.h @@ -54,6 +54,8 @@ class SB_Handle { localMemorySupport_(helper::has_local_memory(q)), computeUnits_(helper::get_num_compute_units(q)), tot_size_temp_mem_(0) {} + SB_Handle(SB_Handle&) = delete; + SB_Handle operator=(SB_Handle) = delete; ~SB_Handle() { #ifdef VERBOSE @@ -62,7 +64,6 @@ class SB_Handle { #endif #ifdef SB_ENABLE_USM - // synchronize with the host on destruction q_.wait(); #ifdef VERBOSE From 3ba8ad73a666b0160fd9a4d8c655300ab0d07e61 Mon Sep 17 00:00:00 2001 From: pgorlani Date: Fri, 10 Nov 2023 16:13:02 +0000 Subject: [PATCH 4/6] Add separate memory pool class --- CMakeLists.txt | 2 + README.md | 1 + benchmark/portblas/CMakeLists.txt | 4 + benchmark/portblas/main.cpp | 6 ++ include/sb_handle/portblas_handle.h | 83 ++++++----------- include/sb_handle/temp_memory_pool.h | 112 ++++++++++++++++++++++ src/interface/blas2_interface.hpp | 41 ++++++--- src/interface/trsm_interface.hpp | 11 ++- src/sb_handle/portblas_handle.hpp | 133 ++++++++------------------- src/sb_handle/temp_memory_pool.hpp | 118 ++++++++++++++++++++++++ tools/auto_tuner/CMakeLists.txt | 3 + tools/auto_tuner/README.md | 11 +++ tools/auto_tuner/src/tune_all.cpp | 5 + tools/auto_tuner/src/tune_nn.cpp | 5 + tools/auto_tuner/src/tune_nt.cpp | 5 + tools/auto_tuner/src/tune_tn.cpp | 5 + tools/auto_tuner/src/tune_tt.cpp | 5 + 17 files changed, 380 insertions(+), 170 deletions(-) create mode 100644 include/sb_handle/temp_memory_pool.h create mode 100644 src/sb_handle/temp_memory_pool.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 09785078f..21340430c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -216,6 +216,7 @@ endif() option(BLAS_ENABLE_CONST_INPUT "Whether to enable kernel instantiation with const input buffer" ON) option(BLAS_ENABLE_BENCHMARK "Whether to enable benchmarking" ON) option(BLAS_VERIFY_BENCHMARK "Whether to verify the results of benchmarks" ON) +option(BLAS_MEMPOOL_BENCHMARK "Whether to use the memory pool in benchmarks" OFF) option(BUILD_CLBLAST_BENCHMARKS "Whether to build clBLAST benchmarks" OFF) option(BUILD_CLBLAS_BENCHMARKS "Whether to build clBLAS benchmarks" OFF) option(BUILD_CUBLAS_BENCHMARKS "Whether to build cuBLAS benchmarks" OFF) @@ -240,6 +241,7 @@ if (BLAS_BUILD_SAMPLES) endif() option(BLAS_ENABLE_AUTO_TUNERS "Whether to enable building GEMM auto tuners" OFF) +option(BLAS_ENABLE_AUTO_TUNER_MEMPOOL "Whether to enable memory pool for GEMM auto tuners" OFF) if(${BLAS_ENABLE_AUTO_TUNERS}) # Note that the auto tuners are very slow to compile, so we avoid adding # them to the ALL target. diff --git a/README.md b/README.md index 97476e994..bba1a6bc0 100644 --- a/README.md +++ b/README.md @@ -459,6 +459,7 @@ Some of the supported options are: | `BUILD_SHARED_LIBS` | `ON`/`OFF` | Build as shared library (`ON` by default) | | `ENABLE_EXPRESSION_TESTS` | `ON`/`OFF` | Build additional tests that use the header-only framework (e.g to test expression trees); `OFF` by default | | `BLAS_VERIFY_BENCHMARK` | `ON`/`OFF` | Verify the results of the benchmarks instead of only measuring the performance. See the documentation of the benchmarks for more details. `ON` by default | +| `BLAS_MEMPOOL_BENCHMARK` | `ON`/`OFF` | Determines whether to enable the scratchpad memory pool for benchmark execution. `OFF` by default | | `BLAS_ENABLE_CONST_INPUT` | `ON`/`OFF` | Determines whether to enable kernel instantiation with const input buffer (`ON` by default) | | `BLAS_ENABLE_EXTENSIONS` | `ON`/`OFF` | Determines whether to enable portBLAS extensions (`ON` by default) | | `BLAS_DATA_TYPES` | `half;float;double` | Determines the floating-point types to instantiate BLAS operations for. Default is `float` | diff --git a/benchmark/portblas/CMakeLists.txt b/benchmark/portblas/CMakeLists.txt index 87fc58eaf..8cb498ad6 100644 --- a/benchmark/portblas/CMakeLists.txt +++ b/benchmark/portblas/CMakeLists.txt @@ -96,6 +96,10 @@ foreach(portblas_bench ${sources}) ) target_include_directories(bench_${bench_exec} PRIVATE ${PORTBLAS_INCLUDE} ${CBLAS_INCLUDE} ${PORTBLAS_COMMON_INCLUDE_DIR}) + if(BLAS_MEMPOOL_BENCHMARK) + target_compile_definitions(bench_${bench_exec} PRIVATE BLAS_MEMPOOL_BENCHMARK) + endif() + if(BLAS_VERIFY_BENCHMARK) target_compile_definitions(bench_${bench_exec} PRIVATE BLAS_VERIFY_BENCHMARK) target_link_libraries(bench_${bench_exec} PRIVATE blas::blas) diff --git a/benchmark/portblas/main.cpp b/benchmark/portblas/main.cpp index a5a2e6813..46ed313f1 100644 --- a/benchmark/portblas/main.cpp +++ b/benchmark/portblas/main.cpp @@ -66,8 +66,14 @@ int main(int argc, char** argv) { utils::print_queue_information(q); +#ifdef BLAS_MEMPOOL_BENCHMARK + blas::Temp_Mem_Pool mp(q); + // Create a portBLAS sb_handle from the memory pool + blas::SB_Handle sb_handle(&mp); +#else // Create a portBLAS sb_handle from the queue blas::SB_Handle sb_handle(q); +#endif // This will be set to false by a failing benchmark bool success = true; diff --git a/include/sb_handle/portblas_handle.h b/include/sb_handle/portblas_handle.h index 2d9dd634c..d69a340cd 100644 --- a/include/sb_handle/portblas_handle.h +++ b/include/sb_handle/portblas_handle.h @@ -22,6 +22,7 @@ * @filename portblas_handle.h * **************************************************************************/ + #ifndef PORTBLAS_HANDLE_H #define PORTBLAS_HANDLE_H #include "blas_meta.h" @@ -30,8 +31,7 @@ #include "operations/blas3_trees.h" #include "operations/extension/reduction.h" #include "portblas_helper.h" -#include -#include +#include "temp_memory_pool.h" namespace blas { @@ -49,66 +49,49 @@ class SB_Handle { public: using event_t = std::vector; inline SB_Handle(queue_t q) - : q_(q), + : tempMemPool_(NULL), + q_(q), workGroupSize_(helper::get_work_group_size(q)), localMemorySupport_(helper::has_local_memory(q)), - computeUnits_(helper::get_num_compute_units(q)), - tot_size_temp_mem_(0) {} - SB_Handle(SB_Handle&) = delete; - SB_Handle operator=(SB_Handle) = delete; - - ~SB_Handle() { -#ifdef VERBOSE - std::cout << "Buffers destroyed on SB_Handle destruction: " - << temp_buffer_map_.size() << std::endl; -#endif + computeUnits_(helper::get_num_compute_units(q)) {} -#ifdef SB_ENABLE_USM - q_.wait(); - -#ifdef VERBOSE - std::cout << "USM allocations freed on SB_Handle destruction: " - << temp_usm_map_.size() << std::endl; -#endif + inline SB_Handle(Temp_Mem_Pool* tmp) + : tempMemPool_(tmp), + q_(tmp->get_queue()), + workGroupSize_(helper::get_work_group_size(q_)), + localMemorySupport_(helper::has_local_memory(q_)), + computeUnits_(helper::get_num_compute_units(q_)) {} - for (const temp_usm_map_t::value_type& p : temp_usm_map_) - cl::sycl::free(p.second, q_); -#endif - } - -#ifdef SB_ENABLE_USM template typename std::enable_if< - alloc == helper::AllocType::usm, + alloc == helper::AllocType::buffer, typename helper::AllocHelper::type>::type acquire_temp_mem(size_t size); -#endif + template + typename std::enable_if< + std::is_same::type, + helper::AllocType::buffer>::type>::value, + typename SB_Handle::event_t>::type + release_temp_mem(const typename SB_Handle::event_t&, const container_t&); + +#ifdef SB_ENABLE_USM template typename std::enable_if< - alloc == helper::AllocType::buffer, + alloc == helper::AllocType::usm, typename helper::AllocHelper::type>::type acquire_temp_mem(size_t size); -#ifdef SB_ENABLE_USM template typename std::enable_if< std::is_same::type, helper::AllocType::usm>::type>::value, - cl::sycl::event>::type - release_temp_mem(std::vector dependencies, - const container_t& mem); + typename SB_Handle::event_t>::type + release_temp_mem(const typename SB_Handle::event_t&, const container_t&); #endif - template - typename std::enable_if< - std::is_same::type, - helper::AllocType::buffer>::type>::value, - cl::sycl::event>::type - release_temp_mem(std::vector, const container_t& mem); - template event_t execute(expression_tree_t tree, const event_t& dependencies = {}); @@ -204,27 +187,13 @@ class SB_Handle { } private: - using temp_usm_map_t = std::multimap; - using temp_usm_size_map_t = std::map; - using temp_buffer_map_t = std::multimap>; - static_assert(sizeof(temp_buffer_map_t::mapped_type::value_type) == 1); - queue_t q_; const size_t workGroupSize_; const bool localMemorySupport_; const size_t computeUnits_; - - size_t tot_size_temp_mem_; - static constexpr size_t max_size_temp_mem_ = 1e9; - - std::mutex map_mutex_; -#ifdef SB_ENABLE_USM - temp_usm_map_t temp_usm_map_; - temp_usm_size_map_t temp_usm_size_map_; -#endif - temp_buffer_map_t temp_buffer_map_; + Temp_Mem_Pool* tempMemPool_; }; } // namespace blas -#undef VERBOSE + #endif // PORTBLAS_HANDLE_H diff --git a/include/sb_handle/temp_memory_pool.h b/include/sb_handle/temp_memory_pool.h new file mode 100644 index 000000000..1cb2a5d59 --- /dev/null +++ b/include/sb_handle/temp_memory_pool.h @@ -0,0 +1,112 @@ +/*************************************************************************** + * + * @license + * Copyright (C) Codeplay Software Limited + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * For your convenience, a copy of the License has been included in this + * repository. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * portBLAS: BLAS implementation using SYCL + * + * @filename temp_memory_pool.h + * + **************************************************************************/ +#ifndef TEMP_MEMORY_POOL_H +#define TEMP_MEMORY_POOL_H + +#include +#include + +namespace blas { +class Temp_Mem_Pool { + using queue_t = cl::sycl::queue; + using event_t = std::vector; + using temp_usm_map_t = std::multimap; + using temp_usm_size_map_t = std::map; + using temp_buffer_map_t = std::multimap>; + + public: + Temp_Mem_Pool(queue_t q) + : q_(q), + temp_buffer_map_tot_byte_size_(0), + temp_usm_map_tot_byte_size_(0) {} + Temp_Mem_Pool(const Temp_Mem_Pool& h) = delete; + Temp_Mem_Pool operator=(Temp_Mem_Pool) = delete; + + ~Temp_Mem_Pool() { + // Wait for the completion of all the host tasks + q_.wait(); + +#ifdef VERBOSE + std::cout << "# buffers destroyed on memory pool destruction: " + << temp_buffer_map_.size() << " (" + << temp_buffer_map_tot_byte_size_ << " bytes)" << std::endl; +#endif + +#ifdef SB_ENABLE_USM +#ifdef VERBOSE + std::cout << "# USM allocations freed on memory pool destruction: " + << temp_usm_map_.size() << " (" << temp_usm_map_tot_byte_size_ + << " bytes)" << std::endl; +#endif + for (const temp_usm_map_t::value_type& p : temp_usm_map_) + cl::sycl::free(p.second, q_); +#endif + } + + inline queue_t get_queue() const { return q_; } + + template + typename helper::AllocHelper::type + acquire_buff_mem(size_t size); + + template + typename Temp_Mem_Pool::event_t release_buff_mem( + const typename Temp_Mem_Pool::event_t&, const container_t&); + +#ifdef SB_ENABLE_USM + template + typename helper::AllocHelper::type + acquire_usm_mem(size_t size); + + template + typename Temp_Mem_Pool::event_t release_usm_mem( + const typename Temp_Mem_Pool::event_t&, const container_t&); +#endif + + private: + static_assert(sizeof(temp_buffer_map_t::mapped_type::value_type) == 1); + + static constexpr size_t max_size_temp_mem_ = 1e9; + queue_t q_; + + std::mutex temp_buffer_map_mutex_; + size_t temp_buffer_map_tot_byte_size_; + temp_buffer_map_t temp_buffer_map_; + + template + void release_usm_mem_(const container_t& mem); + +#ifdef SB_ENABLE_USM + std::mutex temp_usm_map_mutex_; + size_t temp_usm_map_tot_byte_size_; + temp_usm_map_t temp_usm_map_; + temp_usm_size_map_t temp_usm_size_map_; + + template + void release_buff_mem_(const container_t& mem); +#endif +}; +} // namespace blas +#endif diff --git a/src/interface/blas2_interface.hpp b/src/interface/blas2_interface.hpp index de7d0807d..ac07cf499 100644 --- a/src/interface/blas2_interface.hpp +++ b/src/interface/blas2_interface.hpp @@ -87,6 +87,7 @@ typename sb_handle_t::event_t _gemv_impl( constexpr bool is_usm = std::is_pointer::value; typename sb_handle_t::event_t ret; + typename sb_handle_t::event_t lastEvent; // Non-local memory kernel if (memory_type != gemv_memory_t::local) { // Leading dimension for dot products matrix @@ -126,17 +127,19 @@ typename sb_handle_t::event_t _gemv_impl( // exectutes the above expression tree to yield the final GEMV result ret = concatenate_vectors( - gemvEvent, sb_handle.execute(assignOp, local_range, gemvEvent)); + gemvEvent, + lastEvent = sb_handle.execute(assignOp, local_range, gemvEvent)); } else { auto alphaMulDotsOp = make_op(_alpha, dot_products_matrix); auto assignOp = make_op(vy, alphaMulDotsOp); ret = concatenate_vectors( - gemvEvent, sb_handle.execute(assignOp, local_range, gemvEvent)); + gemvEvent, + lastEvent = sb_handle.execute(assignOp, local_range, gemvEvent)); } - sb_handle.template release_temp_mem(ret, dot_products_buffer); + sb_handle.template release_temp_mem(lastEvent, dot_products_buffer); } else // Local memory kernel { @@ -196,16 +199,18 @@ typename sb_handle_t::event_t _gemv_impl( // exectutes the above expression tree to yield the final GEMV result ret = concatenate_vectors( - gemvEvent, sb_handle.execute(assignOp, local_range, gemvEvent)); + gemvEvent, + lastEvent = sb_handle.execute(assignOp, local_range, gemvEvent)); } else { auto alphaMulDotsOp = make_op(_alpha, sumColsOp); auto assignOp = make_op(vy, alphaMulDotsOp); ret = concatenate_vectors( - gemvEvent, sb_handle.execute(assignOp, local_range, gemvEvent)); + gemvEvent, + lastEvent = sb_handle.execute(assignOp, local_range, gemvEvent)); } - sb_handle.template release_temp_mem(ret, dot_products_buffer); + sb_handle.template release_temp_mem(lastEvent, dot_products_buffer); } return ret; } @@ -330,10 +335,12 @@ typename sb_handle_t::event_t _trmv_impl( } } auto addMOp = make_sum_matrix_columns(mat1); + typename sb_handle_t::event_t lastEvent; auto assignOp = make_op(vx, addMOp); - ret = concatenate_vectors(ret, sb_handle.execute(assignOp, localSize, ret)); + ret = concatenate_vectors( + ret, lastEvent = sb_handle.execute(assignOp, localSize, ret)); - sb_handle.template release_temp_mem(ret, valT1); + sb_handle.template release_temp_mem(lastEvent, valT1); return ret; } @@ -512,10 +519,13 @@ typename sb_handle_t::event_t _symv_impl( auto scalOp2 = make_op(_alpha, addMOp); auto addOp = make_op(scalOp1, scalOp2); auto assignOp = make_op(vy, addOp); - ret = concatenate_vectors(ret, sb_handle.execute(assignOp, localSize, ret)); - sb_handle.template release_temp_mem(ret, valTR); - sb_handle.template release_temp_mem(ret, valTC); + typename sb_handle_t::event_t lastEvent; + ret = concatenate_vectors( + ret, lastEvent = sb_handle.execute(assignOp, localSize, ret)); + + sb_handle.template release_temp_mem(lastEvent, valTR); + sb_handle.template release_temp_mem(lastEvent, valTC); return ret; } @@ -669,7 +679,7 @@ typename sb_handle_t::event_t _tbmv_impl( auto assignEvent = sb_handle.execute(assignOp, local_range, tbmvEvent); auto ret = concatenate_vectors(tbmvEvent, assignEvent); - sb_handle.template release_temp_mem(ret, res_buffer); + sb_handle.template release_temp_mem(assignEvent, res_buffer); return ret; } @@ -720,10 +730,11 @@ typename sb_handle_t::event_t _tpmv_impl( _dependencies); auto assignOp = make_op(vx, vres); - auto ret = - concatenate_vectors(tpmvEvent, sb_handle.execute(assignOp, tpmvEvent)); + typename sb_handle_t::event_t lastEvent; + auto ret = concatenate_vectors( + tpmvEvent, lastEvent = sb_handle.execute(assignOp, tpmvEvent)); - sb_handle.template release_temp_mem(ret, res_buffer); + sb_handle.template release_temp_mem(lastEvent, res_buffer); return ret; } diff --git a/src/interface/trsm_interface.hpp b/src/interface/trsm_interface.hpp index 628341022..1f5b4cb55 100644 --- a/src/interface/trsm_interface.hpp +++ b/src/interface/trsm_interface.hpp @@ -382,14 +382,15 @@ typename sb_handle_t::event_t _trsm( } // Copy bufferX to bufferB as the TRSM result + typename sb_handle_t::event_t lastEvent; trsmEvents = concatenate_vectors( - trsmEvents, - internal::_copy( - sb_handle, BSize, X, 1, B, 1, trsmEvents)); + trsmEvents, lastEvent = internal::_copy( + sb_handle, BSize, X, 1, B, 1, trsmEvents)); - sb_handle.template release_temp_mem(trsmEvents, invA); + sb_handle.template release_temp_mem(lastEvent, invA); - sb_handle.template release_temp_mem(trsmEvents, X); + sb_handle.template release_temp_mem(lastEvent, X); return trsmEvents; } diff --git a/src/sb_handle/portblas_handle.hpp b/src/sb_handle/portblas_handle.hpp index 903d74d7f..4ad19e974 100644 --- a/src/sb_handle/portblas_handle.hpp +++ b/src/sb_handle/portblas_handle.hpp @@ -32,122 +32,69 @@ #include "operations/blas1_trees.hpp" #include "operations/blas2_trees.hpp" #include "operations/blas_operators.hpp" +#include "portblas_helper.h" #include "sb_handle/kernel_constructor.h" #include "sb_handle/portblas_handle.h" -#include "portblas_helper.h" +#include "sb_handle/temp_memory_pool.hpp" #include "views/view.h" - namespace blas { -#ifdef SB_ENABLE_USM -template -typename std::enable_if< - alloc == helper::AllocType::usm, - typename helper::AllocHelper::type>::type -SB_Handle::acquire_temp_mem(size_t size) { - const size_t byteSize = size * sizeof(value_t); - map_mutex_.lock(); - auto found = temp_usm_map_.lower_bound(byteSize); - if (found != temp_usm_map_.end()) { - temp_usm_map_.extract(found); - tot_size_temp_mem_ -= found->first; - map_mutex_.unlock(); - return reinterpret_cast(found->second); - } else { - map_mutex_.unlock(); -#ifdef VERBOSE - std::cout << "Create a temporary USM allocation of " << byteSize - << " bytes." << std::endl; -#endif - value_t* tmp = cl::sycl::malloc_device(size, q_); - map_mutex_.lock(); - temp_usm_size_map_.emplace( - reinterpret_cast(tmp), byteSize); - map_mutex_.unlock(); - return tmp; - } -} -#endif template typename std::enable_if< alloc == helper::AllocType::buffer, typename helper::AllocHelper::type>::type SB_Handle::acquire_temp_mem(size_t size) { - const size_t byteSize = size * sizeof(value_t); - map_mutex_.lock(); - auto found = temp_buffer_map_.lower_bound(byteSize); - if (found != temp_buffer_map_.end()) { - cl::sycl::buffer buff = - found->second; - temp_buffer_map_.extract(found); - tot_size_temp_mem_ -= found->first; - map_mutex_.unlock(); - return blas::BufferIterator{buff.reinterpret( - cl::sycl::range<1>(found->first / sizeof(value_t)))}; - } else { - map_mutex_.unlock(); -#ifdef VERBOSE - std::cout << "Create a temporary buffer of " << byteSize << " bytes." - << std::endl; -#endif + if (tempMemPool_ != NULL) + return tempMemPool_->acquire_buff_mem(size); + else return make_sycl_iterator_buffer(size); - } } -#ifdef SB_ENABLE_USM template typename std::enable_if< std::is_same::type, - helper::AllocType::usm>::type>::value, - cl::sycl::event>::type -SB_Handle::release_temp_mem(std::vector dependencies, + helper::AllocType::buffer>::type>::value, + typename SB_Handle::event_t>::type +SB_Handle::release_temp_mem(const typename SB_Handle::event_t& dependencies, const container_t& mem) { - return q_.submit([&](cl::sycl::handler& cgh) { - cgh.depends_on(dependencies); - map_mutex_.lock(); - auto found = temp_usm_size_map_.find( - reinterpret_cast(mem)); - const size_t byteSize = found->second; - if (tot_size_temp_mem_ + byteSize > max_size_temp_mem_) { - temp_usm_size_map_.erase(found); - map_mutex_.unlock(); - cl::sycl::free(mem, q_); - } else { - tot_size_temp_mem_ += byteSize; - temp_usm_map_.emplace(byteSize, - reinterpret_cast(mem)); - map_mutex_.unlock(); - } - }); + if (tempMemPool_ != NULL) + return tempMemPool_->release_buff_mem(dependencies, mem); + else + return {}; } -#endif -#undef VERBOSE + +#ifdef SB_ENABLE_USM +template +typename std::enable_if< + alloc == helper::AllocType::usm, + typename helper::AllocHelper::type>::type +SB_Handle::acquire_temp_mem(size_t size) { + if (tempMemPool_ != NULL) + return tempMemPool_->acquire_usm_mem(size); + else + return cl::sycl::malloc_device(size, q_); +} + template typename std::enable_if< std::is_same::type, - helper::AllocType::buffer>::type>::value, - cl::sycl::event>::type -SB_Handle::release_temp_mem(std::vector dependencies, + helper::AllocType::usm>::type>::value, + typename SB_Handle::event_t>::type +SB_Handle::release_temp_mem(const typename SB_Handle::event_t& dependencies, const container_t& mem) { - return q_.submit([&](cl::sycl::handler& cgh) { - cgh.depends_on(dependencies); - const size_t byteSize = mem.get_buffer().byte_size(); - if (tot_size_temp_mem_ + byteSize <= max_size_temp_mem_) { - map_mutex_.lock(); - tot_size_temp_mem_ += byteSize; - temp_buffer_map_.emplace( - byteSize, - mem.get_buffer() - .template reinterpret( - cl::sycl::range<1>( - byteSize / - sizeof(temp_buffer_map_t::mapped_type::value_type)))); - map_mutex_.unlock(); - } - }); + if (tempMemPool_ != NULL) + return tempMemPool_->release_usm_mem(dependencies, mem); + else { + cl::sycl::context context = q_.get_context(); + return {q_.submit([&](cl::sycl::handler& cgh) { + cgh.depends_on(dependencies); + cgh.host_task([=]() { cl::sycl::free(mem, context); }); + })}; + } } +#endif /*! * @brief Executes the tree without defining required shared memory. @@ -260,9 +207,9 @@ inline typename SB_Handle::event_t SB_Handle::execute( even = !even; } while (_N > 1); - release_temp_mem(event, shMem1); + release_temp_mem({*event.rbegin()}, shMem1); - release_temp_mem(event, shMem2); + release_temp_mem({*event.rbegin()}, shMem2); return event; } diff --git a/src/sb_handle/temp_memory_pool.hpp b/src/sb_handle/temp_memory_pool.hpp new file mode 100644 index 000000000..4e3a68c43 --- /dev/null +++ b/src/sb_handle/temp_memory_pool.hpp @@ -0,0 +1,118 @@ +#ifndef TEMP_MEMORY_POOL_HPP +#define TEMP_MEMORY_POOL_HPP + +#include "portblas_helper.h" + +namespace blas { +template +typename helper::AllocHelper::type +Temp_Mem_Pool::acquire_buff_mem(size_t size) { + const size_t pad = sizeof(double) / sizeof(value_t); + // Adjust the requested size in order to reinterpret for double's + size += (pad - size % pad); + const size_t byteSize = size * sizeof(value_t); + temp_buffer_map_mutex_.lock(); // lock + auto found = temp_buffer_map_.lower_bound(byteSize); + if (found != temp_buffer_map_.end()) { + cl::sycl::buffer buff = + found->second; + temp_buffer_map_tot_byte_size_ -= found->first; + temp_buffer_map_.erase(found); + temp_buffer_map_mutex_.unlock(); // unlock + return blas::BufferIterator{buff.reinterpret( + cl::sycl::range<1>(buff.byte_size() / sizeof(value_t)))}; + } else { + temp_buffer_map_mutex_.unlock(); // unlock +#ifdef VERBOSE + std::cout << "Create a temporary buffer of " << byteSize << " bytes." + << std::endl; +#endif + return make_sycl_iterator_buffer(size); + } +} + +template +void Temp_Mem_Pool::release_buff_mem_(const container_t& mem) { + const size_t byteSize = mem.get_buffer().byte_size(); + auto rebuff = + mem.get_buffer() + .template reinterpret( + cl::sycl::range<1>( + byteSize / + sizeof(temp_buffer_map_t::mapped_type::value_type))); + temp_buffer_map_mutex_.lock(); // lock + if (temp_buffer_map_tot_byte_size_ + byteSize <= max_size_temp_mem_) { + temp_buffer_map_tot_byte_size_ += byteSize; + temp_buffer_map_.emplace(byteSize, rebuff); + } + temp_buffer_map_mutex_.unlock(); // unlock +} + +template +typename Temp_Mem_Pool::event_t Temp_Mem_Pool::release_buff_mem( + const typename Temp_Mem_Pool::event_t& dependencies, + const container_t& mem) { + return {q_.submit([&](cl::sycl::handler& cgh) { + cgh.depends_on(dependencies); + cgh.host_task([&, mem]() { release_buff_mem_(mem); }); + })}; +} + +#ifdef SB_ENABLE_USM +template +typename helper::AllocHelper::type +Temp_Mem_Pool::acquire_usm_mem(size_t size) { + const size_t byteSize = size * sizeof(value_t); + temp_usm_map_mutex_.lock(); // lock + auto found = temp_usm_map_.lower_bound(byteSize); + if (found != temp_usm_map_.end()) { + temp_usm_map_tot_byte_size_ -= found->first; + value_t* tmp = reinterpret_cast(found->second); + temp_usm_map_.erase(found); + temp_usm_map_mutex_.unlock(); // unlock + return tmp; + } else { + temp_usm_map_mutex_.unlock(); // unlock +#ifdef VERBOSE + std::cout << "Create a temporary USM allocation of " << byteSize + << " bytes." << std::endl; +#endif + value_t* tmp = cl::sycl::malloc_device(size, q_); + temp_usm_map_mutex_.lock(); // lock + temp_usm_size_map_.emplace( + reinterpret_cast(tmp), byteSize); + temp_usm_map_mutex_.unlock(); // unlock + return tmp; + } +} + +template +void Temp_Mem_Pool::release_usm_mem_(const container_t& mem) { + temp_usm_map_mutex_.lock(); // lock + auto found = temp_usm_size_map_.find( + reinterpret_cast(mem)); + const size_t byteSize = found->second; + if (temp_usm_map_tot_byte_size_ + byteSize > max_size_temp_mem_) { + temp_usm_size_map_.erase(found); + temp_usm_map_mutex_.unlock(); // unlock + cl::sycl::free(mem, q_); + } else { + temp_usm_map_tot_byte_size_ += byteSize; + temp_usm_map_.emplace(byteSize, + reinterpret_cast(mem)); + temp_usm_map_mutex_.unlock(); // unlock + } +} + +template +typename Temp_Mem_Pool::event_t Temp_Mem_Pool::release_usm_mem( + const typename Temp_Mem_Pool::event_t& dependencies, + const container_t& mem) { + return {q_.submit([&](cl::sycl::handler& cgh) { + cgh.depends_on(dependencies); + cgh.host_task([&, mem]() { release_usm_mem_(mem); }); + })}; +} +} +#endif +#endif diff --git a/tools/auto_tuner/CMakeLists.txt b/tools/auto_tuner/CMakeLists.txt index 234ad9ae2..bd39ad9f3 100644 --- a/tools/auto_tuner/CMakeLists.txt +++ b/tools/auto_tuner/CMakeLists.txt @@ -117,6 +117,9 @@ foreach(blas_tuner ${SYCL_AUTO_TUNNER_SRCS}) include/ ${CMAKE_CURRENT_BINARY_DIR} ) + if(BLAS_ENABLE_AUTO_TUNER_MEMPOOL) + target_compile_definitions(${tuner_exec} PRIVATE BLAS_ENABLE_AUTO_TUNER_MEMPOOL) + endif() add_dependencies(${tuner_exec} tuner_generate_def_file) if(is_dpcpp) target_link_libraries(${tuner_exec} PRIVATE DPCPP::DPCPP) diff --git a/tools/auto_tuner/README.md b/tools/auto_tuner/README.md index 35633ec88..08155c52a 100644 --- a/tools/auto_tuner/README.md +++ b/tools/auto_tuner/README.md @@ -21,6 +21,17 @@ $ ninja See the Setup section in this repository's main readme for more details. +Make options +------------ + +CMake options are given using `-D` immediately followed by the option name, the +symbol `=` and a value (`ON` and `OFF` can be used for boolean options and are +equivalent to 1 and 0). Example: `-DBLAS_ENABLE_TESTING=OFF` + +| name | value | description | +|---|---|---| +| `BLAS_MEMPOOL_BENCHMARK` | `ON`/`OFF` | Enable the scratchpad memory pool, useful just in case of tall skinny matrices. `OFF` by default | + Usage ----- diff --git a/tools/auto_tuner/src/tune_all.cpp b/tools/auto_tuner/src/tune_all.cpp index 8c8bc4d3a..1e32ac595 100644 --- a/tools/auto_tuner/src/tune_all.cpp +++ b/tools/auto_tuner/src/tune_all.cpp @@ -54,7 +54,12 @@ int main(int argc, char *argv[]) { } } +#ifdef BLAS_ENABLE_AUTO_TUNER_MEMPOOL + Temp_Mem_Pool mem_pool(make_sycl_queue()); + portblas_handle_t sb_handle(&mem_pool); +#else portblas_handle_t sb_handle(make_sycl_queue()); +#endif std::cout << "======= testing nn ======" << std::endl; run_tune_gemm(sb_handle, seed, m, k, n, batch_size, rep, diff --git a/tools/auto_tuner/src/tune_nn.cpp b/tools/auto_tuner/src/tune_nn.cpp index c5c341052..5b40bf296 100644 --- a/tools/auto_tuner/src/tune_nn.cpp +++ b/tools/auto_tuner/src/tune_nn.cpp @@ -57,7 +57,12 @@ int main(int argc, char *argv[]) { } } +#ifdef BLAS_ENABLE_AUTO_TUNER_MEMPOOL + Temp_Mem_Pool mem_pool(make_sycl_queue()); + portblas_handle_t sb_handle(&mem_pool); +#else portblas_handle_t sb_handle(make_sycl_queue()); +#endif run_tune_gemm(sb_handle, seed, m, k, n, batch_size, rep, batch_type); diff --git a/tools/auto_tuner/src/tune_nt.cpp b/tools/auto_tuner/src/tune_nt.cpp index d8d2819f4..c98eb3caf 100644 --- a/tools/auto_tuner/src/tune_nt.cpp +++ b/tools/auto_tuner/src/tune_nt.cpp @@ -57,7 +57,12 @@ int main(int argc, char *argv[]) { } } +#ifdef BLAS_ENABLE_AUTO_TUNER_MEMPOOL + Temp_Mem_Pool mem_pool(make_sycl_queue()); + portblas_handle_t sb_handle(&mem_pool); +#else portblas_handle_t sb_handle(make_sycl_queue()); +#endif run_tune_gemm(sb_handle, seed, m, k, n, batch_size, rep, batch_type); diff --git a/tools/auto_tuner/src/tune_tn.cpp b/tools/auto_tuner/src/tune_tn.cpp index 2683423f2..811ea7b04 100644 --- a/tools/auto_tuner/src/tune_tn.cpp +++ b/tools/auto_tuner/src/tune_tn.cpp @@ -57,7 +57,12 @@ int main(int argc, char *argv[]) { } } +#ifdef BLAS_ENABLE_AUTO_TUNER_MEMPOOL + Temp_Mem_Pool mem_pool(make_sycl_queue()); + portblas_handle_t sb_handle(&mem_pool); +#else portblas_handle_t sb_handle(make_sycl_queue()); +#endif run_tune_gemm(sb_handle, seed, m, k, n, batch_size, rep, batch_type); diff --git a/tools/auto_tuner/src/tune_tt.cpp b/tools/auto_tuner/src/tune_tt.cpp index 202d2f6e7..5fe2b7290 100644 --- a/tools/auto_tuner/src/tune_tt.cpp +++ b/tools/auto_tuner/src/tune_tt.cpp @@ -57,7 +57,12 @@ int main(int argc, char *argv[]) { } } +#ifdef BLAS_ENABLE_AUTO_TUNER_MEMPOOL + Temp_Mem_Pool mem_pool(make_sycl_queue()); + portblas_handle_t sb_handle(&mem_pool); +#else portblas_handle_t sb_handle(make_sycl_queue()); +#endif run_tune_gemm(sb_handle, seed, m, k, n, batch_size, rep, batch_type); From 75a4aa0f26ef6deca139654737cd1327a349ce85 Mon Sep 17 00:00:00 2001 From: pgorlani Date: Wed, 17 Jan 2024 17:09:32 +0000 Subject: [PATCH 5/6] add for iamax and iamin --- src/interface/blas1_interface.hpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/interface/blas1_interface.hpp b/src/interface/blas1_interface.hpp index 16c0c5741..059b9cc27 100644 --- a/src/interface/blas1_interface.hpp +++ b/src/interface/blas1_interface.hpp @@ -321,9 +321,10 @@ typename sb_handle_t::event_t _iamax_iamin_impl( localMemSize == 0 ? _nWG * (static_cast(localSize) / min_sg_size) : _nWG; - auto gpu_res = blas::helper::allocate < is_usm ? helper::AllocType::usm - : helper::AllocType::buffer, - tuple_t > (memory_size, q); + auto gpu_res = sb_handle.template acquire_temp_mem < is_usm + ? helper::AllocType::usm + : helper::AllocType::buffer, + tuple_t > (memory_size); auto gpu_res_vec = make_vector_view(gpu_res, static_cast(1), memory_size); auto step0 = make_index_max_min(gpu_res_vec, tupOp); @@ -355,7 +356,7 @@ typename sb_handle_t::event_t _iamax_iamin_impl( static_cast(localSize), static_cast(localMemSize), ret)); } - blas::helper::enqueue_deallocate(ret, gpu_res, q); + sb_handle.template release_temp_mem({*ret.rbegin()}, gpu_res); } return ret; } From 9b4ca547900fe6f0f87676ebed8e349a6e818d08 Mon Sep 17 00:00:00 2001 From: pgorlani Date: Thu, 25 Jan 2024 14:35:08 +0000 Subject: [PATCH 6/6] NULL -> nullptr --- include/sb_handle/portblas_handle.h | 2 +- src/sb_handle/portblas_handle.hpp | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/include/sb_handle/portblas_handle.h b/include/sb_handle/portblas_handle.h index d69a340cd..f7104a0cc 100644 --- a/include/sb_handle/portblas_handle.h +++ b/include/sb_handle/portblas_handle.h @@ -49,7 +49,7 @@ class SB_Handle { public: using event_t = std::vector; inline SB_Handle(queue_t q) - : tempMemPool_(NULL), + : tempMemPool_(nullptr), q_(q), workGroupSize_(helper::get_work_group_size(q)), localMemorySupport_(helper::has_local_memory(q)), diff --git a/src/sb_handle/portblas_handle.hpp b/src/sb_handle/portblas_handle.hpp index 4ad19e974..c03b7b277 100644 --- a/src/sb_handle/portblas_handle.hpp +++ b/src/sb_handle/portblas_handle.hpp @@ -44,7 +44,7 @@ typename std::enable_if< alloc == helper::AllocType::buffer, typename helper::AllocHelper::type>::type SB_Handle::acquire_temp_mem(size_t size) { - if (tempMemPool_ != NULL) + if (tempMemPool_ != nullptr) return tempMemPool_->acquire_buff_mem(size); else return make_sycl_iterator_buffer(size); @@ -58,7 +58,7 @@ typename std::enable_if< typename SB_Handle::event_t>::type SB_Handle::release_temp_mem(const typename SB_Handle::event_t& dependencies, const container_t& mem) { - if (tempMemPool_ != NULL) + if (tempMemPool_ != nullptr) return tempMemPool_->release_buff_mem(dependencies, mem); else return {}; @@ -70,7 +70,7 @@ typename std::enable_if< alloc == helper::AllocType::usm, typename helper::AllocHelper::type>::type SB_Handle::acquire_temp_mem(size_t size) { - if (tempMemPool_ != NULL) + if (tempMemPool_ != nullptr) return tempMemPool_->acquire_usm_mem(size); else return cl::sycl::malloc_device(size, q_); @@ -84,7 +84,7 @@ typename std::enable_if< typename SB_Handle::event_t>::type SB_Handle::release_temp_mem(const typename SB_Handle::event_t& dependencies, const container_t& mem) { - if (tempMemPool_ != NULL) + if (tempMemPool_ != nullptr) return tempMemPool_->release_usm_mem(dependencies, mem); else { cl::sycl::context context = q_.get_context();