Skip to content
This repository has been archived by the owner on Jan 13, 2025. It is now read-only.

Reuse temporary memory allocations #478

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,7 @@ endif()
option(BLAS_ENABLE_CONST_INPUT "Whether to enable kernel instantiation with const input buffer" ON)
option(BLAS_ENABLE_BENCHMARK "Whether to enable benchmarking" ON)
option(BLAS_VERIFY_BENCHMARK "Whether to verify the results of benchmarks" ON)
option(BLAS_MEMPOOL_BENCHMARK "Whether to use the memory pool in benchmarks" OFF)
option(BUILD_CLBLAST_BENCHMARKS "Whether to build clBLAST benchmarks" OFF)
option(BUILD_CLBLAS_BENCHMARKS "Whether to build clBLAS benchmarks" OFF)
option(BUILD_CUBLAS_BENCHMARKS "Whether to build cuBLAS benchmarks" OFF)
Expand All @@ -240,6 +241,7 @@ if (BLAS_BUILD_SAMPLES)
endif()

option(BLAS_ENABLE_AUTO_TUNERS "Whether to enable building GEMM auto tuners" OFF)
option(BLAS_ENABLE_AUTO_TUNER_MEMPOOL "Whether to enable memory pool for GEMM auto tuners" OFF)
if(${BLAS_ENABLE_AUTO_TUNERS})
# Note that the auto tuners are very slow to compile, so we avoid adding
# them to the ALL target.
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -459,6 +459,7 @@ Some of the supported options are:
| `BUILD_SHARED_LIBS` | `ON`/`OFF` | Build as shared library (`ON` by default) |
| `ENABLE_EXPRESSION_TESTS` | `ON`/`OFF` | Build additional tests that use the header-only framework (e.g to test expression trees); `OFF` by default |
| `BLAS_VERIFY_BENCHMARK` | `ON`/`OFF` | Verify the results of the benchmarks instead of only measuring the performance. See the documentation of the benchmarks for more details. `ON` by default |
| `BLAS_MEMPOOL_BENCHMARK` | `ON`/`OFF` | Determines whether to enable the scratchpad memory pool for benchmark execution. `OFF` by default |
| `BLAS_ENABLE_CONST_INPUT` | `ON`/`OFF` | Determines whether to enable kernel instantiation with const input buffer (`ON` by default) |
| `BLAS_ENABLE_EXTENSIONS` | `ON`/`OFF` | Determines whether to enable portBLAS extensions (`ON` by default) |
| `BLAS_DATA_TYPES` | `half;float;double` | Determines the floating-point types to instantiate BLAS operations for. Default is `float` |
Expand Down
4 changes: 4 additions & 0 deletions benchmark/portblas/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,10 @@ foreach(portblas_bench ${sources})
)
target_include_directories(bench_${bench_exec} PRIVATE ${PORTBLAS_INCLUDE} ${CBLAS_INCLUDE} ${PORTBLAS_COMMON_INCLUDE_DIR})

if(BLAS_MEMPOOL_BENCHMARK)
target_compile_definitions(bench_${bench_exec} PRIVATE BLAS_MEMPOOL_BENCHMARK)
endif()

if(BLAS_VERIFY_BENCHMARK)
target_compile_definitions(bench_${bench_exec} PRIVATE BLAS_VERIFY_BENCHMARK)
target_link_libraries(bench_${bench_exec} PRIVATE blas::blas)
Expand Down
6 changes: 6 additions & 0 deletions benchmark/portblas/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,14 @@ int main(int argc, char** argv) {

utils::print_queue_information(q);

#ifdef BLAS_MEMPOOL_BENCHMARK
blas::Temp_Mem_Pool mp(q);
// Create a portBLAS sb_handle from the memory pool
blas::SB_Handle sb_handle(&mp);
#else
// Create a portBLAS sb_handle from the queue
blas::SB_Handle sb_handle(q);
#endif

// This will be set to false by a failing benchmark
bool success = true;
Expand Down
43 changes: 42 additions & 1 deletion include/sb_handle/portblas_handle.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@
#include "operations/blas3_trees.h"
#include "operations/extension/reduction.h"
#include "portblas_helper.h"
#include "temp_memory_pool.h"

namespace blas {

/** SB_Handle.
Expand All @@ -47,11 +49,49 @@ class SB_Handle {
public:
using event_t = std::vector<cl::sycl::event>;
inline SB_Handle(queue_t q)
: q_(q),
: tempMemPool_(nullptr),
q_(q),
workGroupSize_(helper::get_work_group_size(q)),
localMemorySupport_(helper::has_local_memory(q)),
computeUnits_(helper::get_num_compute_units(q)) {}

inline SB_Handle(Temp_Mem_Pool* tmp)
: tempMemPool_(tmp),
q_(tmp->get_queue()),
workGroupSize_(helper::get_work_group_size(q_)),
localMemorySupport_(helper::has_local_memory(q_)),
computeUnits_(helper::get_num_compute_units(q_)) {}

template <helper::AllocType alloc, typename value_t>
typename std::enable_if<
alloc == helper::AllocType::buffer,
typename helper::AllocHelper<value_t, alloc>::type>::type
acquire_temp_mem(size_t size);

template <typename container_t>
typename std::enable_if<
std::is_same<container_t, typename helper::AllocHelper<
typename ValueType<container_t>::type,
helper::AllocType::buffer>::type>::value,
typename SB_Handle::event_t>::type
release_temp_mem(const typename SB_Handle::event_t&, const container_t&);

#ifdef SB_ENABLE_USM
template <helper::AllocType alloc, typename value_t>
typename std::enable_if<
alloc == helper::AllocType::usm,
typename helper::AllocHelper<value_t, alloc>::type>::type
acquire_temp_mem(size_t size);

template <typename container_t>
typename std::enable_if<
std::is_same<container_t, typename helper::AllocHelper<
typename ValueType<container_t>::type,
helper::AllocType::usm>::type>::value,
typename SB_Handle::event_t>::type
release_temp_mem(const typename SB_Handle::event_t&, const container_t&);
#endif

template <typename expression_tree_t>
event_t execute(expression_tree_t tree, const event_t& dependencies = {});

Expand Down Expand Up @@ -151,6 +191,7 @@ class SB_Handle {
const size_t workGroupSize_;
const bool localMemorySupport_;
const size_t computeUnits_;
Temp_Mem_Pool* tempMemPool_;
};

} // namespace blas
Expand Down
112 changes: 112 additions & 0 deletions include/sb_handle/temp_memory_pool.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
/***************************************************************************
*
* @license
* Copyright (C) Codeplay Software Limited
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* For your convenience, a copy of the License has been included in this
* repository.
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* portBLAS: BLAS implementation using SYCL
*
* @filename temp_memory_pool.h
*
**************************************************************************/
#ifndef TEMP_MEMORY_POOL_H
#define TEMP_MEMORY_POOL_H

#include <map>
#include <mutex>

namespace blas {
class Temp_Mem_Pool {
using queue_t = cl::sycl::queue;
using event_t = std::vector<cl::sycl::event>;
using temp_usm_map_t = std::multimap<size_t, void*>;
using temp_usm_size_map_t = std::map<void*, size_t>;
using temp_buffer_map_t = std::multimap<size_t, cl::sycl::buffer<int8_t, 1>>;

public:
Temp_Mem_Pool(queue_t q)
: q_(q),
temp_buffer_map_tot_byte_size_(0),
temp_usm_map_tot_byte_size_(0) {}
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the temp_usm_map_tot_byte_size_ is undefined if SB_ENABLE_USM is not defined.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed in aa1fdf1.

Temp_Mem_Pool(const Temp_Mem_Pool& h) = delete;
Temp_Mem_Pool operator=(Temp_Mem_Pool) = delete;

~Temp_Mem_Pool() {
// Wait for the completion of all the host tasks
q_.wait();

#ifdef VERBOSE
std::cout << "# buffers destroyed on memory pool destruction: "
<< temp_buffer_map_.size() << " ("
<< temp_buffer_map_tot_byte_size_ << " bytes)" << std::endl;
#endif

#ifdef SB_ENABLE_USM
#ifdef VERBOSE
std::cout << "# USM allocations freed on memory pool destruction: "
<< temp_usm_map_.size() << " (" << temp_usm_map_tot_byte_size_
<< " bytes)" << std::endl;
#endif
for (const temp_usm_map_t::value_type& p : temp_usm_map_)
cl::sycl::free(p.second, q_);
#endif
}

inline queue_t get_queue() const { return q_; }

template <typename value_t>
typename helper::AllocHelper<value_t, helper::AllocType::buffer>::type
acquire_buff_mem(size_t size);

template <typename container_t>
typename Temp_Mem_Pool::event_t release_buff_mem(
const typename Temp_Mem_Pool::event_t&, const container_t&);

#ifdef SB_ENABLE_USM
template <typename value_t>
typename helper::AllocHelper<value_t, helper::AllocType::usm>::type
acquire_usm_mem(size_t size);

template <typename container_t>
typename Temp_Mem_Pool::event_t release_usm_mem(
const typename Temp_Mem_Pool::event_t&, const container_t&);
#endif

private:
static_assert(sizeof(temp_buffer_map_t::mapped_type::value_type) == 1);

static constexpr size_t max_size_temp_mem_ = 1e9;
s-Nick marked this conversation as resolved.
Show resolved Hide resolved
queue_t q_;

std::mutex temp_buffer_map_mutex_;
size_t temp_buffer_map_tot_byte_size_;
temp_buffer_map_t temp_buffer_map_;

template <typename container_t>
void release_usm_mem_(const container_t& mem);

#ifdef SB_ENABLE_USM
std::mutex temp_usm_map_mutex_;
size_t temp_usm_map_tot_byte_size_;
temp_usm_map_t temp_usm_map_;
temp_usm_size_map_t temp_usm_size_map_;

template <typename container_t>
void release_buff_mem_(const container_t& mem);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry for the late comment. I think this one should be swapped with the release_usm_mem_ above.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed in aa1fdf1.

#endif
};
} // namespace blas
#endif
9 changes: 5 additions & 4 deletions src/interface/blas1_interface.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -321,9 +321,10 @@ typename sb_handle_t::event_t _iamax_iamin_impl(
localMemSize == 0
? _nWG * (static_cast<index_t>(localSize) / min_sg_size)
: _nWG;
auto gpu_res = blas::helper::allocate < is_usm ? helper::AllocType::usm
: helper::AllocType::buffer,
tuple_t > (memory_size, q);
auto gpu_res = sb_handle.template acquire_temp_mem < is_usm
? helper::AllocType::usm
: helper::AllocType::buffer,
tuple_t > (memory_size);
auto gpu_res_vec =
make_vector_view(gpu_res, static_cast<increment_t>(1), memory_size);
auto step0 = make_index_max_min<is_max, true>(gpu_res_vec, tupOp);
Expand Down Expand Up @@ -355,7 +356,7 @@ typename sb_handle_t::event_t _iamax_iamin_impl(
static_cast<index_t>(localSize),
static_cast<index_t>(localMemSize), ret));
}
blas::helper::enqueue_deallocate(ret, gpu_res, q);
sb_handle.template release_temp_mem({*ret.rbegin()}, gpu_res);
}
return ret;
}
Expand Down
Loading
Loading