Skip to content
This repository has been archived by the owner on Jan 13, 2025. It is now read-only.

Commit

Permalink
Merge branch 'master' into cmake_update
Browse files Browse the repository at this point in the history
  • Loading branch information
muhammad-tanvir-1211 authored Mar 4, 2024
2 parents 30afc49 + fec888f commit 64fdc09
Show file tree
Hide file tree
Showing 28 changed files with 466 additions and 153 deletions.
21 changes: 17 additions & 4 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,19 @@ if(((NOT INSTALL_HEADER_ONLY) AND (TUNING_TARGET STREQUAL "DEFAULT_CPU"))
message(STATUS "FP16 operations are not supported for CPU targets. BLAS_ENABLE_HALF is disabled")
endif()

if (SYCL_COMPILER MATCHES "adaptivecpp")
if(BLAS_ENABLE_COMPLEX)
message(STATUS "SYCL Complex data is not supported on AdaptiveCpp/hipSYCL. Complex
data type is disabled")
set(BLAS_ENABLE_COMPLEX OFF)
endif()
if(BLAS_MEMPOOL_BENCHMARK)
message(STATUS "Memory pool feature is not supported on AdaptiveCpp/hipSYCL. Corresponding
benchmarks are disabled")
set(BLAS_MEMPOOL_BENCHMARK OFF)
endif()
endif()

# CmakeFunctionHelper has to be included after any options that it depends on are declared.
# These include:
# * TARGET
Expand Down Expand Up @@ -149,17 +162,17 @@ else()
target_link_libraries(portblas PUBLIC ComputeCpp::ComputeCpp)
elseif(is_dpcpp)
target_link_libraries(portblas PUBLIC DPCPP::DPCPP)
elseif(is_hipsycl)
target_link_libraries(portblas PUBLIC hipSYCL::hipSYCL-rt)
elseif(is_adaptivecpp)
target_link_libraries(portblas PUBLIC AdaptiveCpp::acpp-rt)
endif()
endif()
if(is_computecpp)
set(sycl_impl ComputeCpp::ComputeCpp)
elseif(is_dpcpp)
set(sycl_impl DPCPP::DPCPP)
add_sycl_to_target(TARGET portblas SOURCES)
elseif(is_hipsycl)
set(sycl_impl hipSYCL::hipSYCL-rt)
elseif(is_adaptivecpp)
set(sycl_impl AdaptiveCpp::acpp-rt)
add_sycl_to_target(TARGET portblas SOURCES)
endif()
if(IMGDNN_DIR)
Expand Down
44 changes: 36 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ the project.
- [Requirements](#requirements)
- [Setup](#setup)
- [Compile with DPC++](#compile-with-dpc)
- [Compile with hipSYCL](#compile-with-hipsycl)
- [Compile with AdaptiveCpp *(Formerly hipSYCL)*](#compile-with-adaptivecpp)
- [Instaling portBLAS](#instaling-portBLAS)
- [Doxygen](#doxygen)
- [CMake options](#cmake-options)
Expand Down Expand Up @@ -390,9 +390,9 @@ added to the `CMAKE_PREFIX_PATH` when building portBLAS (see

**IMPORTANT NOTE:** The `TARGET` CMake variable is no longer supported. It has
been replaced by `TUNING_TARGET`, which accepts the same options.
`TUNING_TARGET` affects only the tuning configuration, applicable for some operators such
as GEMM, and has no effect on the target triplet for DPC++ or the hipSYCL target. Please
refer to the sections below for setting them.
`TUNING_TARGET` affects only the tuning configuration and has no effect on the target
triplet for DPC++ or the AdaptiveCpp/hipSYCL target. Please refer to the sections
below for setting them.

1. Clone the portBLAS repository, making sure to pass the `--recursive` option, in order
to clone submodule(s).
Expand All @@ -417,13 +417,41 @@ advisable for NVIDIA and **mandatory for AMD** to provide the specific device
architecture through `-DDPCPP_SYCL_ARCH=<arch>`, e.g., `<arch>` can be `sm_80`
for NVIDIA or `gfx908` for AMD.

### Compile with hipSYCL
### Compile with AdaptiveCpp *(Formerly hipSYCL)*
The following instructions concern the **generic** *(clang-based)* flow supported
by AdaptiveCpp.

```bash
cd build
cmake -GNinja ../ -DhipSYCL_DIR=/path/to/hipSYCL/install/lib/cmake/hipSYCL -DSYCL_COMPILER=hipsycl
export CC=[path/to/system/clang]
export CXX=[path/to/AdaptiveCpp/install/bin/acpp]
export ACPP_TARGETS=[compilation_flow:target] # (e.g. cuda:sm_75)
cmake -GNinja ../ -DAdaptiveCpp_DIR=/path/to/AdaptiveCpp/install/lib/cmake/AdaptiveCpp \
-DSYCL_COMPILER=adaptivecpp -DACPP_TARGETS=$ACPP_TARGETS
ninja
```
To build for other than the default devices (`omp`), set the `HIPSYCL_TARGETS` environment variable or specify `-DHIPSYCL_TARGETS` as [documented](https://github.com/illuhad/hipSYCL/blob/develop/doc/using-hipsycl.md).
To build for other than the default backend *(host cpu through `omp`*)*, set the `ACPP_TARGETS` environment
variable or specify `-DACPP_TARGETS` as
[documented](https://github.com/AdaptiveCpp/AdaptiveCpp/blob/develop/doc/using-hipsycl.md).
The available backends are the ones built with AdaptiveCpp in the first place.

Similarly to DPCPP's `sycl-ls`, AdaptiveCpp's `acpp-info` helps display the available
backends informations. In case of building AdaptiveCpp against llvm *(generic-flow)*,
the `llvm-to-xxx.so` library files should be visible by the runtime to target the
appropriate device, which can be ensured by setting the ENV variable :

```bash
export LD_LIBRARY_PATH=[path/to/AdaptiveCpp/install/lib/hipSYCL:$LD_LIBRARY_PATH]
export LD_LIBRARY_PATH=[path/to/AdaptiveCpp/install/lib/hipSYCL/llvm-to-backend:$LD_LIBRARY_PATH]
```

*Notes :*
- Some operator kernels are implemented using extensions / SYCL 2020 features not yet implemented
in AdaptiveCpp and are not supported when portBLAS is built with it. These operators include
`asum`, `nrm2`, `dot`, `sdsdot`, `rot`, `trsv`, `tbsv` and `tpsv`.
- The default `omp` host CPU backend *(as well as its optimized variant `omp.accelerated`)* hasn't been
not been fully integrated into the library and currently causes some tests to fail *(interleaved batched
gemm in particular)*. It's thus advised to use the llvm/OpenCL generic flow when targetting CPUs.

### Installing portBLAS
To install the portBLAS library (see `CMAKE_INSTALL_PREFIX` below)
Expand Down Expand Up @@ -452,7 +480,7 @@ Some of the supported options are:
|---|---|---|
| `BLAS_ENABLE_TESTING` | `ON`/`OFF` | Set it to `OFF` to avoid building the tests (`ON` is the default value) |
| `BLAS_ENABLE_BENCHMARK` | `ON`/`OFF` | Set it to `OFF` to avoid building the benchmarks (`ON` is the default value) |
| `SYCL_COMPILER` | name | Used to determine which SYCL implementation to use. By default, the first implementation found is used. Supported values are: `dpcpp`, `hipsycl` and `computecpp`*(deprecated)*. |
| `SYCL_COMPILER` | name | Used to determine which SYCL implementation to use. By default, the first implementation found is used. Supported values are: `dpcpp`, `adaptivecpp` and `computecpp`*(deprecated)*. |
| `TUNING_TARGET` | name | By default, this flag is set to `DEFAULT_CPU` to restrict any device specific compiler optimizations. Use this flag to tune the code for a target (**highly recommended** for performance). The supported targets are: `INTEL_GPU`, `NVIDIA_GPU`, `AMD_GPU` |
| `CMAKE_PREFIX_PATH` | path | List of paths to check when searching for dependencies |
| `CMAKE_INSTALL_PREFIX` | path | Specify the install location, used when invoking `ninja install` |
Expand Down
18 changes: 18 additions & 0 deletions benchmark/portblas/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,21 @@ if(${BLAS_ENABLE_EXTENSIONS})
list(APPEND sources extension/reduction.cpp)
endif()

# Skip these benchmarks for AdaptiveCpp for SPIRV/OpenCL targets
# that use SYCL 2020 features like group reduction or hang
# during execution (https://github.com/AdaptiveCpp/AdaptiveCpp/issues/1309)
set(ADAPTIVE_CPP_SKIP
blas1/asum.cpp
blas1/dot.cpp
blas1/sdsdot.cpp
blas1/nrm2.cpp
blas2/trsv.cpp
blas2/tbsv.cpp
blas2/tpsv.cpp
# Hang during execution (without failing)
blas3/trsm.cpp
)

# Operators supporting COMPLEX types benchmarking
set(CPLX_OPS "gemm"
"gemm_batched"
Expand All @@ -101,6 +116,9 @@ set(HALF_DATA_OPS "axpy"
# Add individual benchmarks for each method
foreach(portblas_bench ${sources})
get_filename_component(bench_exec ${portblas_bench} NAME_WE)
if(is_adaptivecpp AND ${portblas_bench} IN_LIST ADAPTIVE_CPP_SKIP)
continue()
endif()
add_executable(bench_${bench_exec} ${portblas_bench} main.cpp)
target_link_libraries(bench_${bench_exec} PRIVATE benchmark Clara::Clara portblas bench_info)
target_compile_definitions(bench_${bench_exec} PRIVATE -DBLAS_INDEX_T=${BLAS_BENCHMARK_INDEX_TYPE})
Expand Down
45 changes: 27 additions & 18 deletions cmake/Modules/SYCL.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -25,36 +25,39 @@
include(CheckCXXCompilerFlag)
include(ConfigurePORTBLAS)

# find_package(hipSYCL) requires HIPSYCL_TARGETS to be set, so set it to a default value before find_package(hipSYCL)
if(SYCL_COMPILER MATCHES "hipsycl" AND NOT HIPSYCL_TARGETS AND NOT ENV{HIPSYCL_TARGETS})
message(STATUS "Using `omp` as HIPSYCL_TARGETS")
set(HIPSYCL_TARGETS "omp")
# find_package(AdaptiveCpp) requires ACPP_TARGETS to be set, so set it to a default value before find_package(AdaptiveCpp)
if(SYCL_COMPILER MATCHES "adaptivecpp" AND NOT ACPP_TARGETS AND NOT ENV{ACPP_TARGETS})
message(STATUS "Using `omp` as ACPP_TARGETS")
set(ACPP_TARGETS "omp")
else()
message(STATUS "Using ${ACPP_TARGETS} as ACPP_TARGETS")
endif()

check_cxx_compiler_flag("--acpp-targets" has_acpp)
check_cxx_compiler_flag("-fsycl" has_fsycl)

if(NOT SYCL_COMPILER)
if(has_fsycl)
if(has_acpp)
find_package(AdaptiveCpp QUIET)
set(is_adaptivecpp ${AdaptiveCpp_FOUND})
set(SYCL_COMPILER "adaptivecpp")
else()
set(is_dpcpp ON)
set(SYCL_COMPILER "dpcpp")
else()
find_package(hipSYCL QUIET)
set(is_hipsycl ${hipSYCL_FOUND})
set(SYCL_COMPILER "hipsycl")
if(NOT is_hipsycl)
set(is_computecpp ON)
set(SYCL_COMPILER "computecpp")
endif()
endif()
else()
if(SYCL_COMPILER MATCHES "dpcpp")
set(is_dpcpp ON)
if(NOT has_fsycl)
message(WARNING "Selected DPC++ as backend, but -fsycl not supported")
endif()
elseif(SYCL_COMPILER MATCHES "hipsycl")
find_package(hipSYCL REQUIRED CONFIG)
set(is_hipsycl ON)
elseif(SYCL_COMPILER MATCHES "adaptivecpp")
find_package(AdaptiveCpp CONFIG REQUIRED)
set(is_adaptivecpp ${AdaptiveCpp_FOUND})
if(NOT has_acpp)
message(WARNING "Selected AdaptiveCpp as backend, but the compiler is not
fully supported")
endif()
elseif(SYCL_COMPILER MATCHES "computecpp")
set(is_computecpp ON)
else()
Expand Down Expand Up @@ -88,8 +91,14 @@ elseif(is_dpcpp)
endif()
find_package(DPCPP REQUIRED)
get_target_property(SYCL_INCLUDE_DIRS DPCPP::DPCPP INTERFACE_INCLUDE_DIRECTORIES)
elseif(is_hipsycl)
elseif(is_adaptivecpp)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
get_target_property(SYCL_INCLUDE_DIRS hipSYCL::hipSYCL-rt INTERFACE_INCLUDE_DIRECTORIES)
get_target_property(SYCL_INCLUDE_DIRS AdaptiveCpp::acpp-rt INTERFACE_INCLUDE_DIRECTORIES)
set(HIP_BENCH_UNSUPPORTED_TARGETS "INTEL_GPU" "DEFAULT_CPU")
if((${BLAS_ENABLE_BENCHMARK}) AND (${TUNING_TARGET} IN_LIST HIP_BENCH_UNSUPPORTED_TARGETS))
message(STATUS "Benchmarks are not supported when targetting OpenCL/LevelZero backend
devices. portBLAS Benchmarks are disabled.")
set(BLAS_ENABLE_BENCHMARK OFF)
endif()
endif()
40 changes: 32 additions & 8 deletions include/container/sycl_iterator.h
Original file line number Diff line number Diff line change
Expand Up @@ -194,27 +194,51 @@ template <cl::sycl::access::mode acc_md_t>
inline typename BufferIterator<element_t>::template accessor_t<acc_md_t>
BufferIterator<element_t>::get_range_accessor(cl::sycl::handler& cgh,
size_t size) {
return typename BufferIterator<element_t>::template accessor_t<acc_md_t>(
buffer_, cgh, cl::sycl::range<1>(size),
cl::sycl::id<1>(BufferIterator<element_t>::get_offset()));
if constexpr (acc_md_t == cl::sycl::access::mode::read) {
return typename BufferIterator<element_t>::template accessor_t<acc_md_t>(
buffer_, cgh, cl::sycl::range<1>(size),
cl::sycl::id<1>(BufferIterator<element_t>::get_offset()));
} else {
// Skip data initialization if not accessing in read mode only
return typename BufferIterator<element_t>::template accessor_t<acc_md_t>(
buffer_, cgh, cl::sycl::range<1>(size),
cl::sycl::id<1>(BufferIterator<element_t>::get_offset()),
cl::sycl::property::no_init{});
}
}

template <typename element_t>
template <cl::sycl::access::mode acc_md_t>
inline typename BufferIterator<element_t>::template accessor_t<acc_md_t>
BufferIterator<element_t>::get_range_accessor(cl::sycl::handler& cgh) {
return BufferIterator<element_t>::get_range_accessor<acc_md_t>(
cgh, BufferIterator<element_t>::get_size());
if constexpr (acc_md_t == cl::sycl::access::mode::read) {
return BufferIterator<element_t>::get_range_accessor<acc_md_t>(
cgh, BufferIterator<element_t>::get_size());
} else {
// Skip data initialization if not accessing in read mode only
return BufferIterator<element_t>::get_range_accessor<acc_md_t>(
cgh, BufferIterator<element_t>::get_size(),
cl::sycl::property::no_init{});
}
}

template <typename element_t>
template <cl::sycl::access::mode acc_md_t>
inline typename BufferIterator<element_t>::template placeholder_accessor_t<
acc_md_t>
BufferIterator<element_t>::get_range_accessor(size_t size) {
return typename BufferIterator<element_t>::template placeholder_accessor_t<
acc_md_t>(buffer_, cl::sycl::range<1>(size),
cl::sycl::id<1>(BufferIterator<element_t>::get_offset()));
if constexpr (acc_md_t == cl::sycl::access::mode::read) {
return typename BufferIterator<element_t>::template placeholder_accessor_t<
acc_md_t>(buffer_, cl::sycl::range<1>(size),
cl::sycl::id<1>(BufferIterator<element_t>::get_offset()));

} else {
// Skip data initialization if not accessing in read mode only
return typename BufferIterator<element_t>::template placeholder_accessor_t<
acc_md_t>(buffer_, cl::sycl::range<1>(size),
cl::sycl::id<1>(BufferIterator<element_t>::get_offset()),
cl::sycl::property::no_init{});
}
}

template <typename element_t>
Expand Down
33 changes: 32 additions & 1 deletion include/interface/blas1_interface.h
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,7 @@ typename sb_handle_t::event_t _swap(
const typename sb_handle_t::event_t &_dependencies);

/**
* \brief SCALAR operation on a vector
* \brief SCALAR operation on a vector
* @param sb_handle_t sb_handle
* @param _vx BufferIterator or USM pointer
* @param _incx Increment for the vector X
Expand All @@ -208,6 +208,37 @@ typename sb_handle_t::event_t _scal(
sb_handle_t &sb_handle, index_t _N, element_t _alpha, container_0_t _vx,
increment_t _incx, const typename sb_handle_t::event_t &_dependencies);

/**
* \brief SCALAR operation on a matrix. (this is a generalization of
* vector-based _scal operator meant for internal use within the library, namely
* for GEMM and inplace-Matcopy operators)
* @param sb_handle_t sb_handle
* @param _A Input/Output BufferIterator or USM pointer
* @param _incA Increment for the matrix A
* @param _lda Leading dimension for the matrix A
* @param _M number of rows
* @param _N number of columns
* @param alpha scaling scalar
* @param _dependencies Vector of events
*/
template <typename sb_handle_t, typename element_t, typename container_0_t,
typename index_t, typename increment_t>
typename sb_handle_t::event_t _scal_matrix(
sb_handle_t &sb_handle, index_t _M, index_t _N, element_t _alpha,
container_0_t _A, index_t _lda, increment_t _incA,
const typename sb_handle_t::event_t &_dependencies);

/*!
* \brief Prototype for the internal implementation of the _scal_matrix
* operator.
*/
template <bool has_inc, typename sb_handle_t, typename element_t,
typename container_0_t, typename index_t, typename increment_t>
typename sb_handle_t::event_t _scal_matrix_impl(
sb_handle_t &sb_handle, index_t _M, index_t _N, element_t _alpha,
container_0_t _A, index_t _lda, increment_t _incA,
const typename sb_handle_t::event_t &_dependencies);

/**
* \brief NRM2 Returns the euclidian norm of a vector
* @param sb_handle SB_Handle
Expand Down
6 changes: 4 additions & 2 deletions include/operations/blas_constants.h
Original file line number Diff line number Diff line change
Expand Up @@ -263,16 +263,18 @@ struct constant_pair {

} // namespace blas

#ifndef __ADAPTIVECPP__
template <typename ind_t, typename val_t>
struct sycl::is_device_copyable<blas::IndexValueTuple<ind_t, val_t>>
struct cl::sycl::is_device_copyable<blas::IndexValueTuple<ind_t, val_t>>
: std::true_type {};

template <typename ind_t, typename val_t>
struct sycl::is_device_copyable<const blas::IndexValueTuple<ind_t, val_t>>
struct cl::sycl::is_device_copyable<const blas::IndexValueTuple<ind_t, val_t>>
: std::true_type {};

template <typename ind_t, typename val_t>
struct std::is_trivially_copyable<blas::IndexValueTuple<ind_t, val_t>>
: std::true_type {};
#endif

#endif // BLAS_CONSTANTS_H
12 changes: 10 additions & 2 deletions include/sb_handle/portblas_handle.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,18 +49,24 @@ class SB_Handle {
public:
using event_t = std::vector<cl::sycl::event>;
inline SB_Handle(queue_t q)
: tempMemPool_(nullptr),
:
#ifndef __ADAPTIVECPP__
tempMemPool_(nullptr),
#endif
q_(q),
workGroupSize_(helper::get_work_group_size(q)),
localMemorySupport_(helper::has_local_memory(q)),
computeUnits_(helper::get_num_compute_units(q)) {}
computeUnits_(helper::get_num_compute_units(q)) {
}

#ifndef __ADAPTIVECPP__
inline SB_Handle(Temp_Mem_Pool* tmp)
: tempMemPool_(tmp),
q_(tmp->get_queue()),
workGroupSize_(helper::get_work_group_size(q_)),
localMemorySupport_(helper::has_local_memory(q_)),
computeUnits_(helper::get_num_compute_units(q_)) {}
#endif

template <helper::AllocType alloc, typename value_t>
typename std::enable_if<
Expand Down Expand Up @@ -191,7 +197,9 @@ class SB_Handle {
const size_t workGroupSize_;
const bool localMemorySupport_;
const size_t computeUnits_;
#ifndef __ADAPTIVECPP__
Temp_Mem_Pool* tempMemPool_;
#endif
};

} // namespace blas
Expand Down
Loading

0 comments on commit 64fdc09

Please sign in to comment.