Skip to content
This repository has been archived by the owner on Jan 13, 2025. It is now read-only.

Commit

Permalink
Auto-tuner fixes and new configurations (#468)
Browse files Browse the repository at this point in the history
Added new configurations for tuning the NVIDIA_GPU and AMD_GPU targets
  • Loading branch information
pgorlani authored Oct 13, 2023
1 parent 0ebeae5 commit fe44d70
Show file tree
Hide file tree
Showing 5 changed files with 158 additions and 36 deletions.
4 changes: 3 additions & 1 deletion .clang-format
Original file line number Diff line number Diff line change
Expand Up @@ -59,4 +59,6 @@ ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ]
SpaceBeforeParens: ControlStatements
DisableFormat: false
...

---
Language: Json
DisableFormat: true
4 changes: 4 additions & 0 deletions tools/auto_tuner/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,10 @@ endif()
# Select the generator configuration file for the requested tuning target.
# The expansion is quoted so that an empty or unset TUNING_TARGET compares as
# an empty string instead of producing a malformed if() expression, and so the
# expanded value is not treated as the name of another variable.
if("${TUNING_TARGET}" STREQUAL "NVIDIA_GPU")
  set(GEN_CONFIG ${CMAKE_CURRENT_SOURCE_DIR}/gen/nvidia_gpu.json)
endif()
if("${TUNING_TARGET}" STREQUAL "AMD_GPU")
  set(GEN_CONFIG ${CMAKE_CURRENT_SOURCE_DIR}/gen/amd_gpu.json)
endif()




Expand Down
73 changes: 73 additions & 0 deletions tools/auto_tuner/gen/amd_gpu.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
{"local":[
{
"cache_line_size":[128,64,32],
"work_item_sizes":[[1,1],[2,1],[1,2],[2,2],[4,1],[1,4],[4,2],[2,4],[4,4],[1,8],[8,1],[2,8],[8,2],[4,8],[8,4],[8,8]],
"work_group_sizes":[[4,4],[8,4],[4,8],[8,8],[16,2],[16,4],[16,8],[2,16],[4,16],[8,16],[16,16],[32,2],[2,32],[32,4],[4,32],[32,8],[8,32]],
"vectorization_size":[1],
"block_level_tiles":[[1, 1]],
"double_buffer":[false],
"no_bank_conflict_a":[true,false],
"no_bank_conflict_b":[true]
},
{
"cache_line_size":[128, 64],
"work_item_sizes":[[1, 16],[2, 16],[4, 16]],
"work_group_sizes":[[32,8]],
"vectorization_size":[1],
"block_level_tiles":[[1, 1]],
"double_buffer":[false],
"no_bank_conflict_a":[true],
"no_bank_conflict_b":[true]
},
{
"cache_line_size":[128, 64],
"work_item_sizes":[[16, 1],[16, 2],[16, 4]],
"work_group_sizes":[[8,32]],
"vectorization_size":[1],
"block_level_tiles":[[1, 1]],
"double_buffer":[false],
"no_bank_conflict_a":[true],
"no_bank_conflict_b":[true]
},
{
"cache_line_size":[128, 64],
"work_item_sizes":[[1,1],[2,1],[1,2],[2,2],[4,1],[1,4],[4,2],[2,4]],
"work_group_sizes":[[64,1],[1,64],[64,2],[64,4],[2,64],[4,64]],
"vectorization_size":[1],
"block_level_tiles":[[1, 1]],
"double_buffer":[false],
"no_bank_conflict_a":[true],
"no_bank_conflict_b":[true]
},
{
"cache_line_size":[128, 64],
"work_item_sizes":[[1,8],[2,8],[4,8],[1,16],[2,16]],
"work_group_sizes":[[64,4],[64,2]],
"vectorization_size":[1],
"block_level_tiles":[[1, 1]],
"double_buffer":[false],
"no_bank_conflict_a":[true],
"no_bank_conflict_b":[true]
},
{
"cache_line_size":[128, 64],
"work_item_sizes":[[8,1],[8,2],[8,4],[16,2]],
"work_group_sizes":[[4,64],[2,64]],
"vectorization_size":[1],
"block_level_tiles":[[1, 1]],
"double_buffer":[false],
"no_bank_conflict_a":[true],
"no_bank_conflict_b":[true]
},
{
"cache_line_size":[64],
"work_item_sizes":[[8,8]],
"work_group_sizes":[[8,8]],
"vectorization_size":[1],
"block_level_tiles":[[2, 2]],
"double_buffer":[false],
"no_bank_conflict_a":[false],
"no_bank_conflict_b":[true]
}
]
}
87 changes: 63 additions & 24 deletions tools/auto_tuner/gen/nvidia_gpu.json
Original file line number Diff line number Diff line change
@@ -1,34 +1,73 @@
{"local":[
{
"cache_line_size":[64],
"work_item_sizes":[[12, 8], [8, 12]],
"work_group_sizes":[[12,8],[8,12]],
"vectorization_size":[1, 2, 4],
{
"cache_line_size":[128,64,32],
"work_item_sizes":[[1,1],[2,1],[1,2],[2,2],[4,1],[1,4],[4,2],[2,4],[4,4],[1,8],[8,1],[2,8],[8,2],[4,8],[8,4],[8,8]],
"work_group_sizes":[[4,4],[8,4],[4,8],[8,8],[16,2],[16,4],[16,8],[2,16],[4,16],[8,16],[16,16],[32,2],[2,32],[32,4],[4,32],[32,8],[8,32]],
"vectorization_size":[1],
"block_level_tiles":[[1, 1]],
"double_buffer":[true, false],
"no_bank_conflict_a":[true, false],
"no_bank_conflict_b":[true, false]
"double_buffer":[false],
"no_bank_conflict_a":[true,false],
"no_bank_conflict_b":[true]
},
{
"cache_line_size":[64],
"work_item_sizes":[[2, 2]],
"work_group_sizes":[[8,8]],
"vectorization_size":[1, 2, 4],
{
"cache_line_size":[128, 64],
"work_item_sizes":[[1, 16],[2, 16],[4, 16]],
"work_group_sizes":[[32,8]],
"vectorization_size":[1],
"block_level_tiles":[[1, 1]],
"double_buffer":[true, false],
"no_bank_conflict_a":[true, false],
"no_bank_conflict_b":[true, false]
"double_buffer":[false],
"no_bank_conflict_a":[true],
"no_bank_conflict_b":[true]
},
{
"cache_line_size":[64,128],
"work_item_sizes":[[8, 8]],
"work_group_sizes":[[8,8]],
"vectorization_size":[1, 2, 4],
"cache_line_size":[128, 64],
"work_item_sizes":[[16, 1],[16, 2],[16, 4]],
"work_group_sizes":[[8,32]],
"vectorization_size":[1],
"block_level_tiles":[[1, 1]],
"double_buffer":[false],
"no_bank_conflict_a":[true],
"no_bank_conflict_b":[true]
},
{
"cache_line_size":[128, 64],
"work_item_sizes":[[1,1],[2,1],[1,2],[2,2],[4,1],[1,4],[4,2],[2,4]],
"work_group_sizes":[[64,1],[1,64],[64,2],[64,4],[2,64],[4,64]],
"vectorization_size":[1],
"block_level_tiles":[[1, 1]],
"double_buffer":[true, false],
"no_bank_conflict_a":[true, false],
"no_bank_conflict_b":[true, false]
"double_buffer":[false],
"no_bank_conflict_a":[true],
"no_bank_conflict_b":[true]
},
{
"cache_line_size":[128, 64],
"work_item_sizes":[[1,8],[2,8],[4,8],[1,16],[2,16]],
"work_group_sizes":[[64,4],[64,2]],
"vectorization_size":[1],
"block_level_tiles":[[1, 1]],
"double_buffer":[false],
"no_bank_conflict_a":[true],
"no_bank_conflict_b":[true]
},
{
"cache_line_size":[128, 64],
"work_item_sizes":[[8,1],[8,2],[8,4],[16,2]],
"work_group_sizes":[[4,64],[2,64]],
"vectorization_size":[1],
"block_level_tiles":[[1, 1]],
"double_buffer":[false],
"no_bank_conflict_a":[true],
"no_bank_conflict_b":[true]
},
{
"cache_line_size":[64],
"work_item_sizes":[[8,8]],
"work_group_sizes":[[8,8]],
"vectorization_size":[1],
"block_level_tiles":[[2, 2]],
"double_buffer":[false],
"no_bank_conflict_a":[false],
"no_bank_conflict_b":[true]
}

]
}
26 changes: 15 additions & 11 deletions tools/auto_tuner/include/utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,15 +35,17 @@
#include <random>

inline portblas_handle_t make_portblas_handle() {
cl::sycl::queue q([=](cl::sycl::exception_list ex_list) {
try {
for (auto &e_ptr : ex_list) {
std::rethrow_exception(e_ptr);
}
} catch (cl::sycl::exception &e) {
throw std::runtime_error(e.what());
}
});
cl::sycl::queue q(
[=](cl::sycl::exception_list ex_list) {
try {
for (auto &e_ptr : ex_list) {
std::rethrow_exception(e_ptr);
}
} catch (cl::sycl::exception &e) {
throw std::runtime_error(e.what());
}
},
{cl::sycl::property::queue::in_order()});
std::cout << "\nDevice: "
<< q.get_device().get_info<cl::sycl::info::device::name>()
<< std::endl;
Expand Down Expand Up @@ -87,9 +89,11 @@ static void run_tune(int rep, double flop_cnt, TestResultEntry &result,
using Seconds = std::chrono::duration<double>;
using MilliSeconds = std::chrono::duration<double, std::milli>;
Seconds runtime_secs;
// warmup
try {
op();
// warmup
for (int i = 0; i < 10; ++i) {
op();
}
auto start = std::chrono::steady_clock::now();
for (int i = 0; i < rep; ++i) {
op();
Expand Down

0 comments on commit fe44d70

Please sign in to comment.