diff --git a/.clang-format b/.clang-format index 5f66bf6b3..b0ac480bb 100644 --- a/.clang-format +++ b/.clang-format @@ -59,4 +59,6 @@ ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ] SpaceBeforeParens: ControlStatements DisableFormat: false ... - +--- +Language: Json +DisableFormat: true diff --git a/tools/auto_tuner/CMakeLists.txt b/tools/auto_tuner/CMakeLists.txt index 759533cd4..234ad9ae2 100644 --- a/tools/auto_tuner/CMakeLists.txt +++ b/tools/auto_tuner/CMakeLists.txt @@ -37,6 +37,10 @@ endif() if(${TUNING_TARGET} STREQUAL "NVIDIA_GPU") set(GEN_CONFIG ${CMAKE_CURRENT_SOURCE_DIR}/gen/nvidia_gpu.json) endif() +if(${TUNING_TARGET} STREQUAL "AMD_GPU") + set(GEN_CONFIG ${CMAKE_CURRENT_SOURCE_DIR}/gen/amd_gpu.json) +endif() + diff --git a/tools/auto_tuner/gen/amd_gpu.json b/tools/auto_tuner/gen/amd_gpu.json new file mode 100644 index 000000000..5d3bab060 --- /dev/null +++ b/tools/auto_tuner/gen/amd_gpu.json @@ -0,0 +1,73 @@ +{"local":[ + { + "cache_line_size":[128,64,32], + "work_item_sizes":[[1,1],[2,1],[1,2],[2,2],[4,1],[1,4],[4,2],[2,4],[4,4],[1,8],[8,1],[2,8],[8,2],[4,8],[8,4],[8,8]], + "work_group_sizes":[[4,4],[8,4],[4,8],[8,8],[16,2],[16,4],[16,8],[2,16],[4,16],[8,16],[16,16],[32,2],[2,32],[32,4],[4,32],[32,8],[8,32]], + "vectorization_size":[1], + "block_level_tiles":[[1, 1]], + "double_buffer":[false], + "no_bank_conflict_a":[true,false], + "no_bank_conflict_b":[true] + }, + { + "cache_line_size":[128, 64], + "work_item_sizes":[[1, 16],[2, 16],[4, 16]], + "work_group_sizes":[[32,8]], + "vectorization_size":[1], + "block_level_tiles":[[1, 1]], + "double_buffer":[false], + "no_bank_conflict_a":[true], + "no_bank_conflict_b":[true] + }, + { + "cache_line_size":[128, 64], + "work_item_sizes":[[16, 1],[16, 2],[16, 4]], + "work_group_sizes":[[8,32]], + "vectorization_size":[1], + "block_level_tiles":[[1, 1]], + "double_buffer":[false], + "no_bank_conflict_a":[true], + "no_bank_conflict_b":[true] + }, + { + "cache_line_size":[128, 64], + "work_item_sizes":[[1,1],[2,1],[1,2],[2,2],[4,1],[1,4],[4,2],[2,4]], + "work_group_sizes":[[64,1],[1,64],[64,2],[64,4],[2,64],[4,64]], + "vectorization_size":[1], + "block_level_tiles":[[1, 1]], + "double_buffer":[false], + "no_bank_conflict_a":[true], + "no_bank_conflict_b":[true] + }, + { + "cache_line_size":[128, 64], + "work_item_sizes":[[1,8],[2,8],[4,8],[1,16],[2,16]], + "work_group_sizes":[[64,4],[64,2]], + "vectorization_size":[1], + "block_level_tiles":[[1, 1]], + "double_buffer":[false], + "no_bank_conflict_a":[true], + "no_bank_conflict_b":[true] + }, + { + "cache_line_size":[128, 64], + "work_item_sizes":[[8,1],[8,2],[8,4],[16,2]], + "work_group_sizes":[[4,64],[2,64]], + "vectorization_size":[1], + "block_level_tiles":[[1, 1]], + "double_buffer":[false], + "no_bank_conflict_a":[true], + "no_bank_conflict_b":[true] + }, + { + "cache_line_size":[64], + "work_item_sizes":[[8,8]], + "work_group_sizes":[[8,8]], + "vectorization_size":[1], + "block_level_tiles":[[2, 2]], + "double_buffer":[false], + "no_bank_conflict_a":[false], + "no_bank_conflict_b":[true] + } +] +} diff --git a/tools/auto_tuner/gen/nvidia_gpu.json b/tools/auto_tuner/gen/nvidia_gpu.json index f3045fc55..5d3bab060 100644 --- a/tools/auto_tuner/gen/nvidia_gpu.json +++ b/tools/auto_tuner/gen/nvidia_gpu.json @@ -1,34 +1,73 @@ {"local":[ - { - "cache_line_size":[64], - "work_item_sizes":[[12, 8], [8, 12]], - "work_group_sizes":[[12,8],[8,12]], - "vectorization_size":[1, 2, 4], + { + "cache_line_size":[128,64,32], + "work_item_sizes":[[1,1],[2,1],[1,2],[2,2],[4,1],[1,4],[4,2],[2,4],[4,4],[1,8],[8,1],[2,8],[8,2],[4,8],[8,4],[8,8]], + "work_group_sizes":[[4,4],[8,4],[4,8],[8,8],[16,2],[16,4],[16,8],[2,16],[4,16],[8,16],[16,16],[32,2],[2,32],[32,4],[4,32],[32,8],[8,32]], + "vectorization_size":[1], "block_level_tiles":[[1, 1]], - "double_buffer":[true, false], - "no_bank_conflict_a":[true, false], - "no_bank_conflict_b":[true, false] + "double_buffer":[false], + "no_bank_conflict_a":[true,false], + "no_bank_conflict_b":[true] }, - { - "cache_line_size":[64], - "work_item_sizes":[[2, 2]], - "work_group_sizes":[[8,8]], - "vectorization_size":[1, 2, 4], + { + "cache_line_size":[128, 64], + "work_item_sizes":[[1, 16],[2, 16],[4, 16]], + "work_group_sizes":[[32,8]], + "vectorization_size":[1], "block_level_tiles":[[1, 1]], - "double_buffer":[true, false], - "no_bank_conflict_a":[true, false], - "no_bank_conflict_b":[true, false] + "double_buffer":[false], + "no_bank_conflict_a":[true], + "no_bank_conflict_b":[true] }, { - "cache_line_size":[64,128], - "work_item_sizes":[[8, 8]], - "work_group_sizes":[[8,8]], - "vectorization_size":[1, 2, 4], + "cache_line_size":[128, 64], + "work_item_sizes":[[16, 1],[16, 2],[16, 4]], + "work_group_sizes":[[8,32]], + "vectorization_size":[1], + "block_level_tiles":[[1, 1]], + "double_buffer":[false], + "no_bank_conflict_a":[true], + "no_bank_conflict_b":[true] + }, + { + "cache_line_size":[128, 64], + "work_item_sizes":[[1,1],[2,1],[1,2],[2,2],[4,1],[1,4],[4,2],[2,4]], + "work_group_sizes":[[64,1],[1,64],[64,2],[64,4],[2,64],[4,64]], + "vectorization_size":[1], "block_level_tiles":[[1, 1]], - "double_buffer":[true, false], - "no_bank_conflict_a":[true, false], - "no_bank_conflict_b":[true, false] + "double_buffer":[false], + "no_bank_conflict_a":[true], + "no_bank_conflict_b":[true] + }, + { + "cache_line_size":[128, 64], + "work_item_sizes":[[1,8],[2,8],[4,8],[1,16],[2,16]], + "work_group_sizes":[[64,4],[64,2]], + "vectorization_size":[1], + "block_level_tiles":[[1, 1]], + "double_buffer":[false], + "no_bank_conflict_a":[true], + "no_bank_conflict_b":[true] + }, + { + "cache_line_size":[128, 64], + "work_item_sizes":[[8,1],[8,2],[8,4],[16,2]], + "work_group_sizes":[[4,64],[2,64]], + "vectorization_size":[1], + "block_level_tiles":[[1, 1]], + "double_buffer":[false], + "no_bank_conflict_a":[true], + "no_bank_conflict_b":[true] + }, + { + "cache_line_size":[64], + "work_item_sizes":[[8,8]], + "work_group_sizes":[[8,8]], + "vectorization_size":[1], + "block_level_tiles":[[2, 2]], + "double_buffer":[false], + "no_bank_conflict_a":[false], + "no_bank_conflict_b":[true] } - ] } diff --git a/tools/auto_tuner/include/utils.hpp b/tools/auto_tuner/include/utils.hpp index 9d1322880..a7d0e25c9 100644 --- a/tools/auto_tuner/include/utils.hpp +++ b/tools/auto_tuner/include/utils.hpp @@ -35,15 +35,17 @@ #include inline portblas_handle_t make_portblas_handle() { - cl::sycl::queue q([=](cl::sycl::exception_list ex_list) { - try { - for (auto &e_ptr : ex_list) { - std::rethrow_exception(e_ptr); - } - } catch (cl::sycl::exception &e) { - throw std::runtime_error(e.what()); - } - }); + cl::sycl::queue q( + [=](cl::sycl::exception_list ex_list) { + try { + for (auto &e_ptr : ex_list) { + std::rethrow_exception(e_ptr); + } + } catch (cl::sycl::exception &e) { + throw std::runtime_error(e.what()); + } + }, + {cl::sycl::property::queue::in_order()}); std::cout << "\nDevice: " << q.get_device().get_info() << std::endl; @@ -87,9 +89,11 @@ static void run_tune(int rep, double flop_cnt, TestResultEntry &result, using Seconds = std::chrono::duration; using MilliSeconds = std::chrono::duration; Seconds runtime_secs; - // warmup try { - op(); + // warmup + for (int i = 0; i < 10; ++i) { + op(); + } auto start = std::chrono::steady_clock::now(); for (int i = 0; i < rep; ++i) { op();