diff --git a/CMakeLists.txt b/CMakeLists.txt index e0d418c57..8447a0a45 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -115,6 +115,19 @@ if(((NOT INSTALL_HEADER_ONLY) AND (TUNING_TARGET STREQUAL "DEFAULT_CPU")) message(STATUS "FP16 operations are not supported for CPU targets. BLAS_ENABLE_HALF is disabled") endif() +if (SYCL_COMPILER MATCHES "adaptivecpp") + if(BLAS_ENABLE_COMPLEX) + message(STATUS "SYCL Complex data is not supported on AdaptiveCpp/hipSYCL. Complex + data type is disabled") + set(BLAS_ENABLE_COMPLEX OFF) + endif() + if(BLAS_MEMPOOL_BENCHMARK) + message(STATUS "Memory pool feature is not supported on AdaptiveCpp/hipSYCL. Corresponding + benchmarks are disabled") + set(BLAS_MEMPOOL_BENCHMARK OFF) + endif() +endif() + # CmakeFunctionHelper has to be included after any options that it depends on are declared. # These include: # * TARGET @@ -145,8 +158,8 @@ else() target_link_libraries(portblas PUBLIC ComputeCpp::ComputeCpp) elseif(is_dpcpp) target_link_libraries(portblas PUBLIC DPCPP::DPCPP) - elseif(is_hipsycl) - target_link_libraries(portblas PUBLIC hipSYCL::hipSYCL-rt) + elseif(is_adaptivecpp) + target_link_libraries(portblas PUBLIC AdaptiveCpp::acpp-rt) endif() endif() if(is_computecpp) @@ -154,8 +167,8 @@ else() elseif(is_dpcpp) set(sycl_impl DPCPP::DPCPP) add_sycl_to_target(TARGET portblas SOURCES) - elseif(is_hipsycl) - set(sycl_impl hipSYCL::hipSYCL-rt) + elseif(is_adaptivecpp) + set(sycl_impl AdaptiveCpp::acpp-rt) add_sycl_to_target(TARGET portblas SOURCES) endif() if(IMGDNN_DIR) diff --git a/README.md b/README.md index fb210470d..7b1ee2b70 100644 --- a/README.md +++ b/README.md @@ -31,7 +31,7 @@ the project. 
- [Requirements](#requirements) - [Setup](#setup) - [Compile with DPC++](#compile-with-dpc) - - [Compile with hipSYCL](#compile-with-hipsycl) + - [Compile with AdaptiveCpp *(Formerly hipSYCL)*](#compile-with-adaptivecpp-formerly-hipsycl) - [Instaling portBLAS](#instaling-portBLAS) - [Doxygen](#doxygen) - [CMake options](#cmake-options) @@ -390,9 +390,9 @@ added to the `CMAKE_PREFIX_PATH` when building portBLAS (see **IMPORTANT NOTE:** The `TARGET` CMake variable is no longer supported. It has been replaced by `TUNING_TARGET`, which accepts the same options. -`TUNING_TARGET` affects only the tuning configuration, applicable for some operators such -as GEMM, and has no effect on the target triplet for DPC++ or the hipSYCL target. Please -refer to the sections below for setting them. +`TUNING_TARGET` affects only the tuning configuration and has no effect on the target +triplet for DPC++ or the AdaptiveCpp/hipSYCL target. Please refer to the sections +below for setting them. 1. Clone the portBLAS repository, making sure to pass the `--recursive` option, in order to clone submodule(s). @@ -417,13 +417,41 @@ advisable for NVIDIA and **mandatory for AMD** to provide the specific device architecture through `-DDPCPP_SYCL_ARCH=`, e.g., `` can be `sm_80` for NVIDIA or `gfx908` for AMD. -### Compile with hipSYCL +### Compile with AdaptiveCpp *(Formerly hipSYCL)* +The following instructions concern the **generic** *(clang-based)* flow supported +by AdaptiveCpp. + ```bash cd build -cmake -GNinja ../ -DhipSYCL_DIR=/path/to/hipSYCL/install/lib/cmake/hipSYCL -DSYCL_COMPILER=hipsycl +export CC=[path/to/system/clang] +export CXX=[path/to/AdaptiveCpp/install/bin/acpp] +export ACPP_TARGETS=[compilation_flow:target] # (e.g. 
cuda:sm_75) +cmake -GNinja ../ -DAdaptiveCpp_DIR=/path/to/AdaptiveCpp/install/lib/cmake/AdaptiveCpp \ + -DSYCL_COMPILER=adaptivecpp -DACPP_TARGETS=$ACPP_TARGETS ninja ``` -To build for other than the default devices (`omp`), set the `HIPSYCL_TARGETS` environment variable or specify `-DHIPSYCL_TARGETS` as [documented](https://github.com/illuhad/hipSYCL/blob/develop/doc/using-hipsycl.md). +To build for other than the default backend *(host cpu through `omp`)*, set the `ACPP_TARGETS` environment +variable or specify `-DACPP_TARGETS` as +[documented](https://github.com/AdaptiveCpp/AdaptiveCpp/blob/develop/doc/using-hipsycl.md). +The available backends are the ones built with AdaptiveCpp in the first place. + +Similarly to DPCPP's `sycl-ls`, AdaptiveCpp's `acpp-info` helps display the available +backends information. In case of building AdaptiveCpp against llvm *(generic-flow)*, +the `llvm-to-xxx.so` library files should be visible by the runtime to target the +appropriate device, which can be ensured by setting the ENV variable: + +```bash +export LD_LIBRARY_PATH=[path/to/AdaptiveCpp/install/lib/hipSYCL:$LD_LIBRARY_PATH] +export LD_LIBRARY_PATH=[path/to/AdaptiveCpp/install/lib/hipSYCL/llvm-to-backend:$LD_LIBRARY_PATH] +``` + +*Notes:* +- Some operator kernels are implemented using extensions / SYCL 2020 features not yet implemented +in AdaptiveCpp and are not supported when portBLAS is built with it. These operators include +`asum`, `nrm2`, `dot`, `sdsdot`, `rot`, `trsv`, `tbsv` and `tpsv`. +- The default `omp` host CPU backend *(as well as its optimized variant `omp.accelerated`)* has +not been fully integrated into the library and currently causes some tests to fail *(interleaved batched +gemm in particular)*. It's thus advised to use the llvm/OpenCL generic flow when targeting CPUs. 
### Installing portBLAS To install the portBLAS library (see `CMAKE_INSTALL_PREFIX` below) @@ -452,7 +480,7 @@ Some of the supported options are: |---|---|---| | `BLAS_ENABLE_TESTING` | `ON`/`OFF` | Set it to `OFF` to avoid building the tests (`ON` is the default value) | | `BLAS_ENABLE_BENCHMARK` | `ON`/`OFF` | Set it to `OFF` to avoid building the benchmarks (`ON` is the default value) | -| `SYCL_COMPILER` | name | Used to determine which SYCL implementation to use. By default, the first implementation found is used. Supported values are: `dpcpp`, `hipsycl` and `computecpp`*(deprecated)*. | +| `SYCL_COMPILER` | name | Used to determine which SYCL implementation to use. By default, the first implementation found is used. Supported values are: `dpcpp`, `adaptivecpp` and `computecpp`*(deprecated)*. | | `TUNING_TARGET` | name | By default, this flag is set to `DEFAULT_CPU` to restrict any device specific compiler optimizations. Use this flag to tune the code for a target (**highly recommended** for performance). 
The supported targets are: `INTEL_GPU`, `NVIDIA_GPU`, `AMD_GPU` | | `CMAKE_PREFIX_PATH` | path | List of paths to check when searching for dependencies | | `CMAKE_INSTALL_PREFIX` | path | Specify the install location, used when invoking `ninja install` | diff --git a/benchmark/portblas/CMakeLists.txt b/benchmark/portblas/CMakeLists.txt index f309286b3..0da461aba 100644 --- a/benchmark/portblas/CMakeLists.txt +++ b/benchmark/portblas/CMakeLists.txt @@ -85,6 +85,21 @@ if(${BLAS_ENABLE_EXTENSIONS}) list(APPEND sources extension/reduction.cpp) endif() +# Skip these benchmarks for AdaptiveCpp for SPIRV/OpenCL targets +# that use SYCL 2020 features like group reduction or hang +# during execution (https://github.com/AdaptiveCpp/AdaptiveCpp/issues/1309) +set(ADAPTIVE_CPP_SKIP + blas1/asum.cpp + blas1/dot.cpp + blas1/sdsdot.cpp + blas1/nrm2.cpp + blas2/trsv.cpp + blas2/tbsv.cpp + blas2/tpsv.cpp + # Hang during execution (without failing) + blas3/trsm.cpp +) + # Operators supporting COMPLEX types benchmarking set(CPLX_OPS "gemm" "gemm_batched" @@ -101,6 +116,9 @@ set(HALF_DATA_OPS "axpy" # Add individual benchmarks for each method foreach(portblas_bench ${sources}) get_filename_component(bench_exec ${portblas_bench} NAME_WE) + if(is_adaptivecpp AND ${portblas_bench} IN_LIST ADAPTIVE_CPP_SKIP) + continue() + endif() add_executable(bench_${bench_exec} ${portblas_bench} main.cpp) target_link_libraries(bench_${bench_exec} PRIVATE benchmark Clara::Clara portblas bench_info) target_compile_definitions(bench_${bench_exec} PRIVATE -DBLAS_INDEX_T=${BLAS_BENCHMARK_INDEX_TYPE}) diff --git a/cmake/Modules/SYCL.cmake b/cmake/Modules/SYCL.cmake index 42cd90f85..31f28245e 100644 --- a/cmake/Modules/SYCL.cmake +++ b/cmake/Modules/SYCL.cmake @@ -25,26 +25,25 @@ include(CheckCXXCompilerFlag) include(ConfigurePORTBLAS) -# find_package(hipSYCL) requires HIPSYCL_TARGETS to be set, so set it to a default value before find_package(hipSYCL) -if(SYCL_COMPILER MATCHES "hipsycl" AND NOT 
HIPSYCL_TARGETS AND NOT ENV{HIPSYCL_TARGETS}) - message(STATUS "Using `omp` as HIPSYCL_TARGETS") - set(HIPSYCL_TARGETS "omp") +# find_package(AdaptiveCpp) requires ACPP_TARGETS to be set, so set it to a default value before find_package(AdaptiveCpp) +if(SYCL_COMPILER MATCHES "adaptivecpp" AND NOT ACPP_TARGETS AND NOT ENV{ACPP_TARGETS}) + message(STATUS "Using `omp` as ACPP_TARGETS") + set(ACPP_TARGETS "omp") +else() + message(STATUS "Using ${ACPP_TARGETS} as ACPP_TARGETS") endif() +check_cxx_compiler_flag("--acpp-targets" has_acpp) check_cxx_compiler_flag("-fsycl" has_fsycl) if(NOT SYCL_COMPILER) - if(has_fsycl) + if(has_acpp) + find_package(AdaptiveCpp QUIET) + set(is_adaptivecpp ${AdaptiveCpp_FOUND}) + set(SYCL_COMPILER "adaptivecpp") + else() set(is_dpcpp ON) set(SYCL_COMPILER "dpcpp") - else() - find_package(hipSYCL QUIET) - set(is_hipsycl ${hipSYCL_FOUND}) - set(SYCL_COMPILER "hipsycl") - if(NOT is_hipsycl) - set(is_computecpp ON) - set(SYCL_COMPILER "computecpp") - endif() endif() else() if(SYCL_COMPILER MATCHES "dpcpp") @@ -52,9 +51,13 @@ else() if(NOT has_fsycl) message(WARNING "Selected DPC++ as backend, but -fsycl not supported") endif() - elseif(SYCL_COMPILER MATCHES "hipsycl") - find_package(hipSYCL REQUIRED CONFIG) - set(is_hipsycl ON) + elseif(SYCL_COMPILER MATCHES "adaptivecpp") + find_package(AdaptiveCpp CONFIG REQUIRED) + set(is_adaptivecpp ${AdaptiveCpp_FOUND}) + if(NOT has_acpp) + message(WARNING "Selected AdaptiveCpp as backend, but the compiler is not + fully supported") + endif() elseif(SYCL_COMPILER MATCHES "computecpp") set(is_computecpp ON) else() @@ -88,8 +91,14 @@ elseif(is_dpcpp) endif() find_package(DPCPP REQUIRED) get_target_property(SYCL_INCLUDE_DIRS DPCPP::DPCPP INTERFACE_INCLUDE_DIRECTORIES) -elseif(is_hipsycl) +elseif(is_adaptivecpp) set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3") - get_target_property(SYCL_INCLUDE_DIRS hipSYCL::hipSYCL-rt INTERFACE_INCLUDE_DIRECTORIES) + 
get_target_property(SYCL_INCLUDE_DIRS AdaptiveCpp::acpp-rt INTERFACE_INCLUDE_DIRECTORIES) + set(HIP_BENCH_UNSUPPORTED_TARGETS "INTEL_GPU" "DEFAULT_CPU") + if((${BLAS_ENABLE_BENCHMARK}) AND (${TUNING_TARGET} IN_LIST HIP_BENCH_UNSUPPORTED_TARGETS)) + message(STATUS "Benchmarks are not supported when targetting OpenCL/LevelZero backend + devices. portBLAS Benchmarks are disabled.") + set(BLAS_ENABLE_BENCHMARK OFF) + endif() endif() diff --git a/include/container/sycl_iterator.h b/include/container/sycl_iterator.h index e6ea4f953..9e12939ed 100644 --- a/include/container/sycl_iterator.h +++ b/include/container/sycl_iterator.h @@ -194,17 +194,32 @@ template inline typename BufferIterator::template accessor_t BufferIterator::get_range_accessor(cl::sycl::handler& cgh, size_t size) { - return typename BufferIterator::template accessor_t( - buffer_, cgh, cl::sycl::range<1>(size), - cl::sycl::id<1>(BufferIterator::get_offset())); + if constexpr (acc_md_t == cl::sycl::access::mode::read) { + return typename BufferIterator::template accessor_t( + buffer_, cgh, cl::sycl::range<1>(size), + cl::sycl::id<1>(BufferIterator::get_offset())); + } else { + // Skip data initialization if not accessing in read mode only + return typename BufferIterator::template accessor_t( + buffer_, cgh, cl::sycl::range<1>(size), + cl::sycl::id<1>(BufferIterator::get_offset()), + cl::sycl::property::no_init{}); + } } template template inline typename BufferIterator::template accessor_t BufferIterator::get_range_accessor(cl::sycl::handler& cgh) { - return BufferIterator::get_range_accessor( - cgh, BufferIterator::get_size()); + if constexpr (acc_md_t == cl::sycl::access::mode::read) { + return BufferIterator::get_range_accessor( + cgh, BufferIterator::get_size()); + } else { + // Skip data initialization if not accessing in read mode only + return BufferIterator::get_range_accessor( + cgh, BufferIterator::get_size(), + cl::sycl::property::no_init{}); + } } template @@ -212,9 +227,18 @@ template inline 
typename BufferIterator::template placeholder_accessor_t< acc_md_t> BufferIterator::get_range_accessor(size_t size) { - return typename BufferIterator::template placeholder_accessor_t< - acc_md_t>(buffer_, cl::sycl::range<1>(size), - cl::sycl::id<1>(BufferIterator::get_offset())); + if constexpr (acc_md_t == cl::sycl::access::mode::read) { + return typename BufferIterator::template placeholder_accessor_t< + acc_md_t>(buffer_, cl::sycl::range<1>(size), + cl::sycl::id<1>(BufferIterator::get_offset())); + + } else { + // Skip data initialization if not accessing in read mode only + return typename BufferIterator::template placeholder_accessor_t< + acc_md_t>(buffer_, cl::sycl::range<1>(size), + cl::sycl::id<1>(BufferIterator::get_offset()), + cl::sycl::property::no_init{}); + } } template diff --git a/include/interface/blas1_interface.h b/include/interface/blas1_interface.h index c2fd8de1a..7684a0a65 100644 --- a/include/interface/blas1_interface.h +++ b/include/interface/blas1_interface.h @@ -196,7 +196,7 @@ typename sb_handle_t::event_t _swap( const typename sb_handle_t::event_t &_dependencies); /** - * \brief SCALAR operation on a vector + * \brief SCALAR operation on a vector * @param sb_handle_t sb_handle * @param _vx BufferIterator or USM pointer * @param _incx Increment for the vector X @@ -208,6 +208,37 @@ typename sb_handle_t::event_t _scal( sb_handle_t &sb_handle, index_t _N, element_t _alpha, container_0_t _vx, increment_t _incx, const typename sb_handle_t::event_t &_dependencies); +/** + * \brief SCALAR operation on a matrix. 
(this is a generalization of + * vector-based _scal operator meant for internal use within the library, namely + * for GEMM and inplace-Matcopy operators) + * @param sb_handle_t sb_handle + * @param _A Input/Output BufferIterator or USM pointer + * @param _incA Increment for the matrix A + * @param _lda Leading dimension for the matrix A + * @param _M number of rows + * @param _N number of columns + * @param alpha scaling scalar + * @param _dependencies Vector of events + */ +template +typename sb_handle_t::event_t _scal_matrix( + sb_handle_t &sb_handle, index_t _M, index_t _N, element_t _alpha, + container_0_t _A, index_t _lda, increment_t _incA, + const typename sb_handle_t::event_t &_dependencies); + +/*! + * \brief Prototype for the internal implementation of the _scal_matrix + * operator. + */ +template +typename sb_handle_t::event_t _scal_matrix_impl( + sb_handle_t &sb_handle, index_t _M, index_t _N, element_t _alpha, + container_0_t _A, index_t _lda, increment_t _incA, + const typename sb_handle_t::event_t &_dependencies); + /** * \brief NRM2 Returns the euclidian norm of a vector * @param sb_handle SB_Handle diff --git a/include/operations/blas_constants.h b/include/operations/blas_constants.h index 229cd6721..82a8e0beb 100644 --- a/include/operations/blas_constants.h +++ b/include/operations/blas_constants.h @@ -263,16 +263,18 @@ struct constant_pair { } // namespace blas +#ifndef __ADAPTIVECPP__ template -struct sycl::is_device_copyable> +struct cl::sycl::is_device_copyable> : std::true_type {}; template -struct sycl::is_device_copyable> +struct cl::sycl::is_device_copyable> : std::true_type {}; template struct std::is_trivially_copyable> : std::true_type {}; +#endif #endif // BLAS_CONSTANTS_H diff --git a/include/sb_handle/portblas_handle.h b/include/sb_handle/portblas_handle.h index f7104a0cc..836b37c61 100644 --- a/include/sb_handle/portblas_handle.h +++ b/include/sb_handle/portblas_handle.h @@ -49,18 +49,24 @@ class SB_Handle { public: using event_t = 
std::vector; inline SB_Handle(queue_t q) - : tempMemPool_(nullptr), + : +#ifndef __ADAPTIVECPP__ + tempMemPool_(nullptr), +#endif q_(q), workGroupSize_(helper::get_work_group_size(q)), localMemorySupport_(helper::has_local_memory(q)), - computeUnits_(helper::get_num_compute_units(q)) {} + computeUnits_(helper::get_num_compute_units(q)) { + } +#ifndef __ADAPTIVECPP__ inline SB_Handle(Temp_Mem_Pool* tmp) : tempMemPool_(tmp), q_(tmp->get_queue()), workGroupSize_(helper::get_work_group_size(q_)), localMemorySupport_(helper::has_local_memory(q_)), computeUnits_(helper::get_num_compute_units(q_)) {} +#endif template typename std::enable_if< @@ -191,7 +197,9 @@ class SB_Handle { const size_t workGroupSize_; const bool localMemorySupport_; const size_t computeUnits_; +#ifndef __ADAPTIVECPP__ Temp_Mem_Pool* tempMemPool_; +#endif }; } // namespace blas diff --git a/include/sb_handle/temp_memory_pool.h b/include/sb_handle/temp_memory_pool.h index 1cb2a5d59..836fb98c3 100644 --- a/include/sb_handle/temp_memory_pool.h +++ b/include/sb_handle/temp_memory_pool.h @@ -25,6 +25,7 @@ #ifndef TEMP_MEMORY_POOL_H #define TEMP_MEMORY_POOL_H +#ifndef __ADAPTIVECPP__ #include #include @@ -39,8 +40,11 @@ class Temp_Mem_Pool { public: Temp_Mem_Pool(queue_t q) : q_(q), - temp_buffer_map_tot_byte_size_(0), - temp_usm_map_tot_byte_size_(0) {} +#ifdef SB_ENABLE_USM + temp_usm_map_tot_byte_size_(0), +#endif + temp_buffer_map_tot_byte_size_(0) { + } Temp_Mem_Pool(const Temp_Mem_Pool& h) = delete; Temp_Mem_Pool operator=(Temp_Mem_Pool) = delete; @@ -96,7 +100,7 @@ class Temp_Mem_Pool { temp_buffer_map_t temp_buffer_map_; template - void release_usm_mem_(const container_t& mem); + void release_buff_mem_(const container_t& mem); #ifdef SB_ENABLE_USM std::mutex temp_usm_map_mutex_; @@ -105,8 +109,11 @@ class Temp_Mem_Pool { temp_usm_size_map_t temp_usm_size_map_; template - void release_buff_mem_(const container_t& mem); -#endif + void release_usm_mem_(const container_t& mem); +#endif // SB_ENABLE_USM 
}; } // namespace blas + +#endif // __ADAPTIVECPP__ + #endif diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt index 61ddd8727..eac74afb7 100644 --- a/samples/CMakeLists.txt +++ b/samples/CMakeLists.txt @@ -18,7 +18,7 @@ foreach(src_file ${SAMPLES_LIST}) get_filename_component(sample_exec ${src_file} NAME_WE) set(sample_exec "sample_${sample_exec}") add_executable(${sample_exec} ${src_file}) - if(is_hipsycl OR is_dpcpp) + if(is_adaptivecpp OR is_dpcpp) set_target_properties(${sample_exec} PROPERTIES CXX_STANDARD 17) else() set_target_properties(${sample_exec} PROPERTIES CXX_STANDARD 14) diff --git a/src/interface/blas1/backend/default_cpu.hpp b/src/interface/blas1/backend/default_cpu.hpp index fb52c7a42..ac2fe764d 100644 --- a/src/interface/blas1/backend/default_cpu.hpp +++ b/src/interface/blas1/backend/default_cpu.hpp @@ -49,6 +49,7 @@ template ( @@ -59,6 +60,20 @@ typename sb_handle_t::event_t _iamax( return blas::internal::_iamax_iamin_impl( sb_handle, _N, _vx, _incx, _rs, nWG, _dependencies); } +#else + // Temporary work-around to avoid non-local memory implementation of + // iamin/iamax with AdaptiveCpp. + constexpr int localSize = 128; + if (_N < 8192) { + return blas::internal::_iamax_iamin_impl( + sb_handle, _N, _vx, _incx, _rs, static_cast(1), _dependencies); + } else { + const index_t nWG = std::min((_N + localSize - 1) / (localSize * 4), + static_cast(512)); + return blas::internal::_iamax_iamin_impl( + sb_handle, _N, _vx, _incx, _rs, nWG, _dependencies); + } +#endif } } // namespace backend } // namespace iamax @@ -70,6 +85,7 @@ template ( @@ -80,6 +96,21 @@ typename sb_handle_t::event_t _iamin( return blas::internal::_iamax_iamin_impl( sb_handle, _N, _vx, _incx, _rs, nWG, _dependencies); } +#else + // Temporary work-around to avoid non-local memory implementation of + // iamin/iamax with AdaptiveCpp. 
+ constexpr int localSize = 128; + if (_N < 8192) { + return blas::internal::_iamax_iamin_impl( + sb_handle, _N, _vx, _incx, _rs, static_cast(1), _dependencies); + } else { + const index_t nWG = std::min((_N + localSize - 1) / (localSize * 4), + static_cast(512)); + return blas::internal::_iamax_iamin_impl(sb_handle, _N, _vx, _incx, + _rs, nWG, _dependencies); + } +#endif } } // namespace backend } // namespace iamin diff --git a/src/interface/blas1/scal.cpp.in b/src/interface/blas1/scal.cpp.in index aa8bd8ac1..4b82068fb 100644 --- a/src/interface/blas1/scal.cpp.in +++ b/src/interface/blas1/scal.cpp.in @@ -51,5 +51,30 @@ template typename SB_Handle::event_t _scal( const typename SB_Handle::event_t& dependencies); #endif +/** + * \brief SCALAR operation on a Matrix + * @param SB_Handle sb_handle + * @param _alpha scaling scalar + * @param _A Input/Output BufferIterator or USM pointer + * @param _M number of rows + * @param _N number of columns + * @param _lda Leading dimension for the matrix A + * @param _incA Increment for the matrix A + * @param _dependencies Vector of events + */ +template typename SB_Handle::event_t _scal_matrix( + SB_Handle& sb_handle, ${INDEX_TYPE} _M, ${INDEX_TYPE} _N, + ${DATA_TYPE} _alpha, BufferIterator<${DATA_TYPE}> _A, + ${INDEX_TYPE} _lda, ${INCREMENT_TYPE} _incA, + const typename SB_Handle::event_t& dependencies); + +#ifdef SB_ENABLE_USM +template typename SB_Handle::event_t _scal_matrix( + SB_Handle& sb_handle, ${INDEX_TYPE} _M, ${INDEX_TYPE} _N, + ${DATA_TYPE} _alpha, ${DATA_TYPE} * _A, + ${INDEX_TYPE} _lda, ${INCREMENT_TYPE} _incA, + const typename SB_Handle::event_t& dependencies); +#endif + } // namespace internal } // namespace blas diff --git a/src/interface/blas1_interface.hpp b/src/interface/blas1_interface.hpp index 059b9cc27..df914b71b 100644 --- a/src/interface/blas1_interface.hpp +++ b/src/interface/blas1_interface.hpp @@ -151,6 +151,7 @@ typename sb_handle_t::event_t _sdsdot( sb_handle_t &sb_handle, index_t _N, float sb, 
container_0_t _vx, increment_t _incx, container_1_t _vy, increment_t _incy, container_2_t _rs, const typename sb_handle_t::event_t &_dependencies) { +#ifndef __ADAPTIVECPP__ if (!_N) { using element_t = typename ValueType::type; sb_handle.wait(_dependencies); @@ -168,6 +169,11 @@ typename sb_handle_t::event_t _sdsdot( auto ret = sb_handle.execute(assignOp2, dotOp); return blas::concatenate_vectors(dotOp, ret); } +#else + throw std::runtime_error( + "Sdsdot is not supported with AdaptiveCpp as it uses SYCL 2020 " + "reduction."); +#endif } /** @@ -184,7 +190,7 @@ typename sb_handle_t::event_t _asum( sb_handle_t &sb_handle, index_t _N, container_0_t _vx, increment_t _incx, container_1_t _rs, const typename sb_handle_t::event_t &_dependencies) { // keep compatibility with older sycl versions -#if SYCL_LANGUAGE_VERSION < 202000 +#if SYCL_LANGUAGE_VERSION < 202000 || defined(__ADAPTIVECPP__) typename VectorViewType::type vx = make_vector_view(_vx, _incx, _N); auto rs = make_vector_view(_rs, static_cast(1), @@ -202,7 +208,7 @@ typename sb_handle_t::event_t _asum( #endif } -#if SYCL_LANGUAGE_VERSION >= 202000 +#if SYCL_LANGUAGE_VERSION >= 202000 && !defined(__ADAPTIVECPP__) /*! _asum_impl. * @brief Internal implementation of the Absolute sum operator. 
* @@ -296,7 +302,7 @@ typename sb_handle_t::event_t _iamax_iamin_impl( // get the minimum supported sub_group size const index_t min_sg_size = static_cast( q.get_device() - .template get_info()[0]); + .template get_info()[0]); ret = sb_handle.execute(op, min_sg_size, min_sg_size, _dependencies); } else { ret = sb_handle.execute( @@ -311,7 +317,7 @@ typename sb_handle_t::event_t _iamax_iamin_impl( // get the minimum supported sub_group size const index_t min_sg_size = static_cast( q.get_device() - .template get_info()[0]); + .template get_info()[0]); // if using no local memory, every sub_group writes one intermediate output, // in case if sub_group size is not known at allocation time, than allocate // extra memory using min supported sub_group size. @@ -455,6 +461,58 @@ typename sb_handle_t::event_t _scal( } } +/** + * \brief SCALAR operation on a matrix + * @param sb_handle_t sb_handle + * @param _M number of matrix rows + * @param _N number of matrix columns + * @param alpha scaling scalar + * @param _A Input/Output BufferIterator or USM pointer matrix + * @param _incA Increment for the matrix A + * @param _lda Leading dimension for the matrix A + * @param _dependencies Vector of events + */ +template +typename sb_handle_t::event_t _scal_matrix( + sb_handle_t &sb_handle, index_t _M, index_t _N, element_t _alpha, + container_0_t _A, index_t _lda, increment_t _incA, + const typename sb_handle_t::event_t &_dependencies) { + if (_incA == index_t(1)) { + return _scal_matrix_impl(sb_handle, _M, _N, _alpha, _A, _lda, _incA, + _dependencies); + } else { + return _scal_matrix_impl(sb_handle, _M, _N, _alpha, _A, _lda, _incA, + _dependencies); + } +} + +/** + * \brief Internal implementation of Matrix scaling + * @tparam has_inc Whether matrix has an increment != 1 + * + * Remaining parameters match internal::_scal_matrix parameters (see above) + */ +template +typename sb_handle_t::event_t _scal_matrix_impl( + sb_handle_t &sb_handle, index_t _M, index_t _N, element_t 
_alpha, + container_0_t _A, index_t _lda, increment_t _incA, + const typename sb_handle_t::event_t &_dependencies) { + typename MatrixViewType::type + m_view = make_matrix_view( + _A, _M, _N, _lda, _incA); + if (_alpha == element_t{1}) { + return _dependencies; + } else { + auto scal_op = make_op(_alpha, m_view); + auto copy_op = make_op(m_view, scal_op); + typename sb_handle_t::event_t ret = + sb_handle.execute(copy_op, _dependencies); + return ret; + } +} + /** * \brief NRM2 Returns the euclidian norm of a vector * @param sb_handle_t sb_handle @@ -548,24 +606,23 @@ typename sb_handle_t::event_t _dot_impl( typename sb_handle_t::event_t ret_event; // Skip if N==0, _rs is not overwritten if (!_N) return {_dependencies}; - auto vx = make_vector_view(_vx, _incx, _N); auto vy = make_vector_view(_vy, _incy, _N); auto rs = make_vector_view(_rs, static_cast(1), static_cast(1)); auto prdOp = make_op(vx, vy); - auto assignOp = make_wg_atomic_reduction(rs, prdOp); + auto wgReductionOp = make_wg_atomic_reduction(rs, prdOp); if constexpr (localMemSize) { ret_event = - sb_handle.execute(assignOp, static_cast(localSize), + sb_handle.execute(wgReductionOp, static_cast(localSize), static_cast(_number_wg * localSize), static_cast(localMemSize), _dependencies); } else { - ret_event = sb_handle.execute(assignOp, static_cast(localSize), - static_cast(_number_wg * localSize), - _dependencies); + ret_event = sb_handle.execute( + wgReductionOp, static_cast(localSize), + static_cast(_number_wg * localSize), _dependencies); } return ret_event; } @@ -867,6 +924,7 @@ typename ValueType::type _dot( sb_handle_t &sb_handle, index_t _N, container_0_t _vx, increment_t _incx, container_1_t _vy, increment_t _incy, const typename sb_handle_t::event_t &_dependencies) { +#ifndef __ADAPTIVECPP__ constexpr bool is_usm = std::is_pointer::value; using element_t = typename ValueType::type; element_t res{0}; @@ -889,6 +947,10 @@ typename ValueType::type _dot( : helper::AllocType::buffer>(gpu_res, 
sb_handle.get_queue()); return res; +#else + throw std::runtime_error( + "Dot is not supported with AdaptiveCpp as it uses SYCL 2020 reduction."); +#endif } /** @@ -1012,6 +1074,7 @@ template ::type _asum( sb_handle_t &sb_handle, index_t _N, container_t _vx, increment_t _incx, const typename sb_handle_t::event_t &_dependencies) { +#ifndef __ADAPTIVECPP__ constexpr bool is_usm = std::is_pointer::value; using element_t = typename ValueType::type; auto res = std::vector(1, element_t(0)); @@ -1032,6 +1095,10 @@ typename ValueType::type _asum( : helper::AllocType::buffer>( gpu_res, sb_handle.get_queue()); return res[0]; +#else + throw std::runtime_error( + "Asum is not supported with AdaptiveCpp as it uses SYCL 2020 reduction."); +#endif } /** @@ -1047,6 +1114,7 @@ template ::type _nrm2( sb_handle_t &sb_handle, index_t _N, container_t _vx, increment_t _incx, const typename sb_handle_t::event_t &_dependencies) { +#ifndef __ADAPTIVECPP__ constexpr bool is_usm = std::is_pointer::value; using element_t = typename ValueType::type; auto res = std::vector(1, element_t(0)); @@ -1066,6 +1134,10 @@ typename ValueType::type _nrm2( : helper::AllocType::buffer>( gpu_res, sb_handle.get_queue()); return res[0]; +#else + throw std::runtime_error( + "Nrm2 is not supported with AdaptiveCpp as it uses SYCL 2020 reduction."); +#endif } } // namespace internal diff --git a/src/interface/blas2_interface.hpp b/src/interface/blas2_interface.hpp index ac07cf499..71dbee066 100644 --- a/src/interface/blas2_interface.hpp +++ b/src/interface/blas2_interface.hpp @@ -356,7 +356,7 @@ typename sb_handle_t::event_t _trsv_impl( sb_handle_t& sb_handle, index_t _N, container_t0 _mA, index_t _lda, container_t1 _vx, increment_t _incx, const typename sb_handle_t::event_t& _dependencies) { -#if (SYCL_LANGUAGE_VERSION < 202000) || (defined __HIPSYCL__) +#if (SYCL_LANGUAGE_VERSION < 202000) || (defined __ADAPTIVECPP__) throw std::runtime_error("trsv requires SYCL 2020"); #else static_assert(subgroup_size % 
subgroups == 0, @@ -747,7 +747,7 @@ typename sb_handle_t::event_t _tbsv_impl( sb_handle_t& sb_handle, index_t _N, index_t _K, container_t0 _mA, index_t _lda, container_t1 _vx, increment_t _incx, const typename sb_handle_t::event_t& _dependencies) { -#if (SYCL_LANGUAGE_VERSION < 202000) || (defined __HIPSYCL__) +#if (SYCL_LANGUAGE_VERSION < 202000) || (defined __ADAPTIVECPP__) throw std::runtime_error("tbsv requires SYCL 2020"); #else static_assert(subgroup_size % subgroups == 0, @@ -810,7 +810,7 @@ template ::eval( using element_t = typename ResolveReturnType::type::value_t::value_t; +#ifndef __ADAPTIVECPP__ // reduction within the sub_group for (index_t i = sg_local_range >> 1; i > 0; i >>= 1) { if (sg_local_id < i) { @@ -103,7 +104,12 @@ PORTBLAS_INLINE void IndexMaxMin::eval( val = op::eval(val, shfl); } } - +#else + // AdaptiveCpp uses a different interface "shift_group_left" which is + // recognized by the compiler but throws JIT errors at runtime. Currently this + // part is skipped as non-local memory kernel is never called with + // AdaptiveCpp. 
+#endif const index_t lhs_idx = ndItem.get_group_linear_id() * (local_range / sg_local_range) + sg.get_group_linear_id(); @@ -144,7 +150,7 @@ PORTBLAS_INLINE void IndexMaxMin::eval( } scratch[local_id] = val; - ndItem.barrier(sycl::access::fence_space::local_space); + ndItem.barrier(cl::sycl::access::fence_space::local_space); value_t local_val = op::template init(); // reduction within the work group @@ -154,7 +160,7 @@ PORTBLAS_INLINE void IndexMaxMin::eval( local_val = scratch[local_id + i]; scratch[local_id] = op::eval(val, local_val); } - ndItem.barrier(sycl::access::fence_space::local_space); + ndItem.barrier(cl::sycl::access::fence_space::local_space); } // write IndexValueTuple to Global Memory iff reduction step0 diff --git a/src/operations/blas1/WGAtomicReduction.hpp b/src/operations/blas1/WGAtomicReduction.hpp index 0e499dddc..22d923e8d 100644 --- a/src/operations/blas1/WGAtomicReduction.hpp +++ b/src/operations/blas1/WGAtomicReduction.hpp @@ -55,10 +55,11 @@ PORTBLAS_INLINE bool WGAtomicReduction::valid_thread( template PORTBLAS_INLINE typename WGAtomicReduction::value_t WGAtomicReduction::eval(cl::sycl::nd_item<1> ndItem) { - auto atomic_res = sycl::atomic_ref( - lhs_.get_data()[0]); + auto atomic_res = + cl::sycl::atomic_ref( + lhs_.get_data()[0]); const auto size = get_size(); int lid = ndItem.get_global_linear_id(); value_t val = operator_t::template init(); @@ -70,10 +71,11 @@ WGAtomicReduction::eval(cl::sycl::nd_item<1> ndItem) { val = operator_t::eval(val, rhs_.eval(id)); } - val = sycl::reduce_over_group(ndItem.get_sub_group(), val, sycl::plus<>()); + val = cl::sycl::reduce_over_group(ndItem.get_sub_group(), val, + cl::sycl::plus()); - if ((ndItem.get_local_id() & - (ndItem.get_sub_group().get_local_range() - 1)) == 0) { + if ((ndItem.get_local_id()[0] & + (ndItem.get_sub_group().get_local_range()[0] - 1)) == 0) { atomic_res += val; } return {}; @@ -83,10 +85,11 @@ template PORTBLAS_INLINE typename WGAtomicReduction::value_t 
WGAtomicReduction::eval(sharedT scratch, cl::sycl::nd_item<1> ndItem) { - auto atomic_res = sycl::atomic_ref( - lhs_.get_data()[0]); + auto atomic_res = + cl::sycl::atomic_ref( + lhs_.get_data()[0]); const auto size = get_size(); const int lid = static_cast(ndItem.get_global_linear_id()); const auto loop_stride = @@ -98,21 +101,24 @@ WGAtomicReduction::eval(sharedT scratch, val = operator_t::eval(val, rhs_.eval(id)); } - val = sycl::reduce_over_group(ndItem.get_sub_group(), val, sycl::plus<>()); + val = cl::sycl::reduce_over_group(ndItem.get_sub_group(), val, + cl::sycl::plus()); - if (ndItem.get_sub_group().get_local_id() == 0) { + if (ndItem.get_sub_group().get_local_id()[0] == 0) { scratch[ndItem.get_sub_group().get_group_linear_id()] = val; } ndItem.barrier(); - val = (ndItem.get_local_id() < (ndItem.get_local_range(0) / - ndItem.get_sub_group().get_local_range()[0])) - ? scratch[ndItem.get_sub_group().get_local_id()] - : 0; - if (ndItem.get_sub_group().get_group_id() == 0) { - val = sycl::reduce_over_group(ndItem.get_sub_group(), val, sycl::plus<>()); + val = + (ndItem.get_local_id()[0] < (ndItem.get_local_range(0) / + ndItem.get_sub_group().get_local_range()[0])) + ? 
scratch[ndItem.get_sub_group().get_local_id()] + : 0; + if (ndItem.get_sub_group().get_group_id()[0] == 0) { + val = cl::sycl::reduce_over_group(ndItem.get_sub_group(), val, + cl::sycl::plus()); } - if (ndItem.get_local_id() == 0) { + if (ndItem.get_local_id()[0] == 0) { atomic_res += val; } diff --git a/src/operations/blas2/spr.hpp b/src/operations/blas2/spr.hpp index ee317a92f..9a69866d3 100644 --- a/src/operations/blas2/spr.hpp +++ b/src/operations/blas2/spr.hpp @@ -101,15 +101,20 @@ typename rhs_1_t::value_t Spr::eval( index_t row = 0, col = 0; +#ifndef __ADAPTIVECPP__ if (!id) { +#endif Spr::compute_row_col( global_idx, N_, row, col); +#ifndef __ADAPTIVECPP__ } row = cl::sycl::group_broadcast(ndItem.get_group(), row); col = cl::sycl::group_broadcast(ndItem.get_group(), col); +#endif if (global_idx < lhs_size) { +#ifndef __ADAPTIVECPP__ if constexpr (isUpper) { if (id) { row += id; @@ -127,6 +132,7 @@ typename rhs_1_t::value_t Spr::eval( } } } +#endif value_t lhs_val = lhs_.eval(global_idx); value_t rhs_1_val = rhs_1_.eval(row); @@ -135,7 +141,8 @@ typename rhs_1_t::value_t Spr::eval( value_t rhs_1_val_second = rhs_1_.eval(col); value_t rhs_2_val_second = rhs_2_.eval(row); lhs_.eval(global_idx) = rhs_1_val * rhs_2_val * alpha_ + - rhs_1_val_second * rhs_2_val_second * alpha_ + lhs_val; + rhs_1_val_second * rhs_2_val_second * alpha_ + + lhs_val; } else lhs_.eval(global_idx) = rhs_1_val * rhs_2_val * alpha_ + lhs_val; } @@ -161,9 +168,8 @@ Spr::adjust_access_displacement() { template -PORTBLAS_INLINE - typename Spr::index_t - Spr::get_size() const { +PORTBLAS_INLINE typename Spr::index_t +Spr::get_size() const { return rhs_1_.get_size(); } template ::eval(local_memory_t local_mem, cl::sycl::nd_item<1> ndItem) { value_t ret = 0; -#if (SYCL_LANGUAGE_VERSION >= 202000) && !(defined __HIPSYCL__) +#if (SYCL_LANGUAGE_VERSION >= 202000) && !(defined __ADAPTIVECPP__) constexpr bool is_forward = (is_upper && is_transposed) || (!is_upper && !is_transposed); @@ -137,9 
+137,9 @@ Txsv( + auto a = cl::sycl::atomic_ref( sync_.eval(0)); // Get the wg_id of actual workgroup @@ -185,7 +185,7 @@ Txsv ready_block)))) ready_block = - sycl::group_broadcast(ndItem.get_sub_group(), not_wi0 ? 0 : *p); + cl::sycl::group_broadcast(ndItem.get_sub_group(), not_wi0 ? 0 : *p); loc_x[l_idx] = (curr_offset < _N) ? lhs_.eval(curr_offset) : value_t(0); } @@ -272,9 +272,9 @@ Txsv::value) { + *reg_res = reg_a[j * (item_batchs / VectorSize) + b] * + reg_b[i * (item_batchs / VectorSize) + b] + + *reg_res; + } else { + *reg_res = cl::sycl::mad(reg_a[j * (item_batchs / VectorSize) + b], + reg_b[i * (item_batchs / VectorSize) + b], + *reg_res); + } +#else *reg_res = cl::sycl::mad(reg_a[j * (item_batchs / VectorSize) + b], reg_b[i * (item_batchs / VectorSize) + b], *reg_res); +#endif ++reg_res; } } diff --git a/src/operations/extension/axpy_batch.hpp b/src/operations/extension/axpy_batch.hpp index d5a70a7d6..344d4ec2a 100644 --- a/src/operations/extension/axpy_batch.hpp +++ b/src/operations/extension/axpy_batch.hpp @@ -68,7 +68,7 @@ Axpy_batch::eval( const index_t size_compute_rateo = (n > nbl * localSize) ? 
n / (nbl * localSize) : batch_size_; - const index_t jump_value{sycl::min(batch_size_, size_compute_rateo)}; + const index_t jump_value{cl::sycl::min(batch_size_, size_compute_rateo)}; if (group_id >= jump_value || l_id > n) return {}; diff --git a/src/sb_handle/portblas_handle.hpp b/src/sb_handle/portblas_handle.hpp index c03b7b277..f158de242 100644 --- a/src/sb_handle/portblas_handle.hpp +++ b/src/sb_handle/portblas_handle.hpp @@ -44,9 +44,11 @@ typename std::enable_if< alloc == helper::AllocType::buffer, typename helper::AllocHelper::type>::type SB_Handle::acquire_temp_mem(size_t size) { +#ifndef __ADAPTIVECPP__ if (tempMemPool_ != nullptr) return tempMemPool_->acquire_buff_mem(size); else +#endif return make_sycl_iterator_buffer(size); } @@ -58,9 +60,11 @@ typename std::enable_if< typename SB_Handle::event_t>::type SB_Handle::release_temp_mem(const typename SB_Handle::event_t& dependencies, const container_t& mem) { +#ifndef __ADAPTIVECPP__ if (tempMemPool_ != nullptr) return tempMemPool_->release_buff_mem(dependencies, mem); else +#endif return {}; } diff --git a/src/sb_handle/temp_memory_pool.hpp b/src/sb_handle/temp_memory_pool.hpp index 4e3a68c43..1d57b0c6f 100644 --- a/src/sb_handle/temp_memory_pool.hpp +++ b/src/sb_handle/temp_memory_pool.hpp @@ -1,6 +1,6 @@ #ifndef TEMP_MEMORY_POOL_HPP #define TEMP_MEMORY_POOL_HPP - +#ifndef __ADAPTIVECPP__ #include "portblas_helper.h" namespace blas { @@ -114,5 +114,6 @@ typename Temp_Mem_Pool::event_t Temp_Mem_Pool::release_usm_mem( })}; } } -#endif +#endif // SB_ENABLE_USM +#endif // __ADAPTIVECPP__ #endif diff --git a/test/unittest/CMakeLists.txt b/test/unittest/CMakeLists.txt index 7b398a2a7..f6044d830 100644 --- a/test/unittest/CMakeLists.txt +++ b/test/unittest/CMakeLists.txt @@ -34,6 +34,7 @@ set(SYCL_UNITTEST_SRCS ${PORTBLAS_UNITTEST}/blas1/blas1_axpy_test.cpp ${PORTBLAS_UNITTEST}/blas1/blas1_copy_test.cpp ${PORTBLAS_UNITTEST}/blas1/blas1_scal_test.cpp + ${PORTBLAS_UNITTEST}/blas1/blas1_swap_test.cpp 
${PORTBLAS_UNITTEST}/blas1/blas1_rot_test.cpp ${PORTBLAS_UNITTEST}/blas1/blas1_rotm_test.cpp ${PORTBLAS_UNITTEST}/blas1/blas1_rotmg_test.cpp @@ -55,6 +56,8 @@ set(SYCL_UNITTEST_SRCS ${PORTBLAS_UNITTEST}/blas2/blas2_syr2_test.cpp ${PORTBLAS_UNITTEST}/blas2/blas2_symv_test.cpp ${PORTBLAS_UNITTEST}/blas2/blas2_tpmv_test.cpp + ${PORTBLAS_UNITTEST}/blas2/blas2_trmv_test.cpp + ${PORTBLAS_UNITTEST}/blas2/blas2_tbmv_test.cpp # Blas 3 tests ${PORTBLAS_UNITTEST}/blas3/blas3_gemm_test.cpp ${PORTBLAS_UNITTEST}/blas3/blas3_gemm_batched_test.cpp @@ -67,6 +70,20 @@ set(SYCL_UNITTEST_SRCS ${PORTBLAS_UNITTEST}/extension/omatcopy_batched_test.cpp ${PORTBLAS_UNITTEST}/extension/omatadd_batched_test.cpp ${PORTBLAS_UNITTEST}/extension/axpy_batch_test.cpp + ${PORTBLAS_UNITTEST}/buffers/sycl_buffer_test.cpp +) + +# Skip these tests for AdaptiveCpp for SPIRV/OpenCL targets +# that use SYCL 2020 features like group reduction or hang +# during execution (https://github.com/AdaptiveCpp/AdaptiveCpp/issues/1309) +set(ADAPTIVE_CPP_SKIP + ${PORTBLAS_UNITTEST}/blas1/blas1_asum_test.cpp + ${PORTBLAS_UNITTEST}/blas1/blas1_sdsdot_test.cpp + ${PORTBLAS_UNITTEST}/blas1/blas1_nrm2_test.cpp + ${PORTBLAS_UNITTEST}/blas1/blas1_dot_test.cpp + ${PORTBLAS_UNITTEST}/blas1/blas1_rot_test.cpp + # Hang during execution (without failing) + ${PORTBLAS_UNITTEST}/blas3/blas3_trsm_test.cpp ) if(${BLAS_ENABLE_EXTENSIONS}) @@ -84,19 +101,6 @@ if(is_dpcpp) ) endif() -# Temporary disabling the following tests fro Intel DPC++ as currently Intel compiler crashes while running the following tests -if(is_computecpp) - set(SYCL_UNITTEST_SRCS ${SYCL_UNITTEST_SRCS} - # Blas 1 tests - ${PORTBLAS_UNITTEST}/blas1/blas1_swap_test.cpp - # Blas 2 tests - ${PORTBLAS_UNITTEST}/blas2/blas2_trmv_test.cpp - ${PORTBLAS_UNITTEST}/blas2/blas2_tbmv_test.cpp - # Blas buffer tests - ${PORTBLAS_UNITTEST}/buffers/sycl_buffer_test.cpp - ) - -endif() # Contains tests that fail if compiled with -ffast-math set(SYCL_UNITTEST_NOFASTMATH @@ -115,6 
+119,9 @@ set(HALF_DATA_OPS "blas1_axpy_test" ) foreach(blas_test ${SYCL_UNITTEST_SRCS}) + if(is_adaptivecpp AND ${blas_test} IN_LIST ADAPTIVE_CPP_SKIP) + continue() + endif() get_filename_component(test_exec ${blas_test} NAME_WE) add_executable(${test_exec} main.cpp ${blas_test}) if(is_computecpp) diff --git a/test/unittest/blas1/blas1_iamax_test.cpp b/test/unittest/blas1/blas1_iamax_test.cpp index acd80af02..ccdbcdd38 100644 --- a/test/unittest/blas1/blas1_iamax_test.cpp +++ b/test/unittest/blas1/blas1_iamax_test.cpp @@ -4,7 +4,6 @@ template void run_test(const combination_t combi) { - std::string alloc; api_type api; index_t size; diff --git a/test/unittest/blas1/blas1_iamin_test.cpp b/test/unittest/blas1/blas1_iamin_test.cpp index 55997c530..9f0193697 100644 --- a/test/unittest/blas1/blas1_iamin_test.cpp +++ b/test/unittest/blas1/blas1_iamin_test.cpp @@ -29,7 +29,6 @@ template void run_test(const combination_t combi) { - std::string alloc; api_type api; index_t size; diff --git a/test/unittest/blas3/blas3_gemm_batched_test.cpp b/test/unittest/blas3/blas3_gemm_batched_test.cpp index 74257b5cf..cc26455f5 100644 --- a/test/unittest/blas3/blas3_gemm_batched_test.cpp +++ b/test/unittest/blas3/blas3_gemm_batched_test.cpp @@ -127,22 +127,22 @@ GENERATE_GEMM_STRIDED_BATCHED_TEST(BatchStridedGemm, DefaultGemmAndGemmBatched); template const auto AllStridedBatched = - ::testing::Combine(::testing::Values("usm", "buf"), // allocation type - ::testing::Values(0, 33), // offset - ::testing::Values(5), // batch - ::testing::Values(128), // m - ::testing::Values(128), // n - ::testing::Values(128), // k - ::testing::Values('n', 't'), // transa - ::testing::Values('n', 't'), // transb - ::testing::Values(3.0), // alpha - ::testing::Values(7.0), // beta - ::testing::Values(2), // lda_mul - ::testing::Values(3), // ldb_mul - ::testing::Values(4), // ldc_mul - ::testing::Values(0, 1, 2), // stride_a_mul - ::testing::Values(0, 1, 2), // stride_b_mul - ::testing::Values(1, 2, 3) // 
stride_c_mul + ::testing::Combine(::testing::Values("usm", "buf"), // allocation type + ::testing::Values(0, 33), // offset + ::testing::Values(5), // batch + ::testing::Values(63), // m + ::testing::Values(63), // n + ::testing::Values(128), // k + ::testing::Values('n', 't'), // transa + ::testing::Values('n', 't'), // transb + ::testing::Values(3.0, 0.0), // alpha + ::testing::Values(7.0, 1.0, 0.0), // beta + ::testing::Values(2), // lda_mul + ::testing::Values(3), // ldb_mul + ::testing::Values(4), // ldc_mul + ::testing::Values(0, 1, 2), // stride_a_mul + ::testing::Values(0, 1, 2), // stride_b_mul + ::testing::Values(1, 3) // stride_c_mul ); GENERATE_GEMM_STRIDED_BATCHED_TEST(BatchStridedGemm, AllStridedBatched); diff --git a/test/unittest/blas3/blas3_gemm_common.hpp b/test/unittest/blas3/blas3_gemm_common.hpp index 3f06d58ec..0b0073a5e 100644 --- a/test/unittest/blas3/blas3_gemm_common.hpp +++ b/test/unittest/blas3/blas3_gemm_common.hpp @@ -356,10 +356,7 @@ inline void verify_gemm( blas::helper::copy_to_host(q, m_c_gpu, c_m_gpu.data(), buffer_size_c); sb_handle.wait(event); - const bool isAlmostEqual = - (stride_c_mul == 1) - ? utils::compare_vectors(c_m_gpu, c_m_cpu) - : utils::compare_vectors_strided(c_m_gpu, c_m_cpu, stride_c, size_c); + const bool isAlmostEqual = utils::compare_vectors(c_m_gpu, c_m_cpu); ASSERT_TRUE(isAlmostEqual); helper::deallocate(m_a_gpu, q); @@ -681,10 +678,7 @@ inline void verify_gemm( buffer_size_c); sb_handle.wait(event); - const bool isAlmostEqual = - (stride_c_mul == 1) - ? utils::compare_vectors(c_m_gpu, c_m_cpu) - : utils::compare_vectors_strided(c_m_gpu, c_m_cpu, stride_c, size_c); + const bool isAlmostEqual = utils::compare_vectors(c_m_gpu, c_m_cpu); ASSERT_TRUE(isAlmostEqual); helper::deallocate(m_a_gpu, q);