diff --git a/CMakeLists.txt b/CMakeLists.txt index e0d418c57..8447a0a45 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -115,6 +115,19 @@ if(((NOT INSTALL_HEADER_ONLY) AND (TUNING_TARGET STREQUAL "DEFAULT_CPU")) message(STATUS "FP16 operations are not supported for CPU targets. BLAS_ENABLE_HALF is disabled") endif() +if (SYCL_COMPILER MATCHES "adaptivecpp") + if(BLAS_ENABLE_COMPLEX) + message(STATUS "SYCL Complex data is not supported on AdaptiveCpp/hipSYCL. Complex + data type is disabled") + set(BLAS_ENABLE_COMPLEX OFF) + endif() + if(BLAS_MEMPOOL_BENCHMARK) + message(STATUS "Memory pool feature is not supported on AdaptiveCpp/hipSYCL. Corresponding + benchmarks are disabled") + set(BLAS_MEMPOOL_BENCHMARK OFF) + endif() +endif() + # CmakeFunctionHelper has to be included after any options that it depends on are declared. # These include: # * TARGET @@ -145,8 +158,8 @@ else() target_link_libraries(portblas PUBLIC ComputeCpp::ComputeCpp) elseif(is_dpcpp) target_link_libraries(portblas PUBLIC DPCPP::DPCPP) - elseif(is_hipsycl) - target_link_libraries(portblas PUBLIC hipSYCL::hipSYCL-rt) + elseif(is_adaptivecpp) + target_link_libraries(portblas PUBLIC AdaptiveCpp::acpp-rt) endif() endif() if(is_computecpp) @@ -154,8 +167,8 @@ else() elseif(is_dpcpp) set(sycl_impl DPCPP::DPCPP) add_sycl_to_target(TARGET portblas SOURCES) - elseif(is_hipsycl) - set(sycl_impl hipSYCL::hipSYCL-rt) + elseif(is_adaptivecpp) + set(sycl_impl AdaptiveCpp::acpp-rt) add_sycl_to_target(TARGET portblas SOURCES) endif() if(IMGDNN_DIR) diff --git a/README.md b/README.md index fb210470d..7b1ee2b70 100644 --- a/README.md +++ b/README.md @@ -31,7 +31,7 @@ the project. 
- [Requirements](#requirements) - [Setup](#setup) - [Compile with DPC++](#compile-with-dpc) - - [Compile with hipSYCL](#compile-with-hipsycl) + - [Compile with AdaptiveCpp *(Formerly hipSYCL)*](#compile-with-adaptivecpp-formerly-hipsycl) - [Instaling portBLAS](#instaling-portBLAS) - [Doxygen](#doxygen) - [CMake options](#cmake-options) @@ -390,9 +390,9 @@ added to the `CMAKE_PREFIX_PATH` when building portBLAS (see **IMPORTANT NOTE:** The `TARGET` CMake variable is no longer supported. It has been replaced by `TUNING_TARGET`, which accepts the same options. -`TUNING_TARGET` affects only the tuning configuration, applicable for some operators such -as GEMM, and has no effect on the target triplet for DPC++ or the hipSYCL target. Please -refer to the sections below for setting them. +`TUNING_TARGET` affects only the tuning configuration and has no effect on the target +triplet for DPC++ or the AdaptiveCpp/hipSYCL target. Please refer to the sections +below for setting them. 1. Clone the portBLAS repository, making sure to pass the `--recursive` option, in order to clone submodule(s). @@ -417,13 +417,41 @@ advisable for NVIDIA and **mandatory for AMD** to provide the specific device architecture through `-DDPCPP_SYCL_ARCH=`, e.g., `` can be `sm_80` for NVIDIA or `gfx908` for AMD. -### Compile with hipSYCL +### Compile with AdaptiveCpp *(Formerly hipSYCL)* +The following instructions concern the **generic** *(clang-based)* flow supported +by AdaptiveCpp. + ```bash cd build -cmake -GNinja ../ -DhipSYCL_DIR=/path/to/hipSYCL/install/lib/cmake/hipSYCL -DSYCL_COMPILER=hipsycl +export CC=[path/to/system/clang] +export CXX=[path/to/AdaptiveCpp/install/bin/acpp] +export ACPP_TARGETS=[compilation_flow:target] # (e.g. 
cuda:sm_75) +cmake -GNinja ../ -DAdaptiveCpp_DIR=/path/to/AdaptiveCpp/install/lib/cmake/AdaptiveCpp \ + -DSYCL_COMPILER=adaptivecpp -DACPP_TARGETS=$ACPP_TARGETS ninja ``` -To build for other than the default devices (`omp`), set the `HIPSYCL_TARGETS` environment variable or specify `-DHIPSYCL_TARGETS` as [documented](https://github.com/illuhad/hipSYCL/blob/develop/doc/using-hipsycl.md). +To build for other than the default backend *(host cpu through `omp`)*, set the `ACPP_TARGETS` environment +variable or specify `-DACPP_TARGETS` as +[documented](https://github.com/AdaptiveCpp/AdaptiveCpp/blob/develop/doc/using-hipsycl.md). +The available backends are the ones built with AdaptiveCpp in the first place. + +Similarly to DPCPP's `sycl-ls`, AdaptiveCpp's `acpp-info` helps display the available +backends information. In case of building AdaptiveCpp against llvm *(generic-flow)*, +the `llvm-to-xxx.so` library files should be visible by the runtime to target the +appropriate device, which can be ensured by setting the ENV variable: + +```bash +export LD_LIBRARY_PATH=[path/to/AdaptiveCpp/install/lib/hipSYCL:$LD_LIBRARY_PATH] +export LD_LIBRARY_PATH=[path/to/AdaptiveCpp/install/lib/hipSYCL/llvm-to-backend:$LD_LIBRARY_PATH] +``` + +*Notes:* +- Some operator kernels are implemented using extensions / SYCL 2020 features not yet implemented +in AdaptiveCpp and are not supported when portBLAS is built with it. These operators include +`asum`, `nrm2`, `dot`, `sdsdot`, `rot`, `trsv`, `tbsv` and `tpsv`. +- The default `omp` host CPU backend *(as well as its optimized variant `omp.accelerated`)* has +not been fully integrated into the library and currently causes some tests to fail *(interleaved batched +gemm in particular)*. It's thus advised to use the llvm/OpenCL generic flow when targeting CPUs. 
### Installing portBLAS To install the portBLAS library (see `CMAKE_INSTALL_PREFIX` below) @@ -452,7 +480,7 @@ Some of the supported options are: |---|---|---| | `BLAS_ENABLE_TESTING` | `ON`/`OFF` | Set it to `OFF` to avoid building the tests (`ON` is the default value) | | `BLAS_ENABLE_BENCHMARK` | `ON`/`OFF` | Set it to `OFF` to avoid building the benchmarks (`ON` is the default value) | -| `SYCL_COMPILER` | name | Used to determine which SYCL implementation to use. By default, the first implementation found is used. Supported values are: `dpcpp`, `hipsycl` and `computecpp`*(deprecated)*. | +| `SYCL_COMPILER` | name | Used to determine which SYCL implementation to use. By default, the first implementation found is used. Supported values are: `dpcpp`, `adaptivecpp` and `computecpp`*(deprecated)*. | | `TUNING_TARGET` | name | By default, this flag is set to `DEFAULT_CPU` to restrict any device specific compiler optimizations. Use this flag to tune the code for a target (**highly recommended** for performance). 
The supported targets are: `INTEL_GPU`, `NVIDIA_GPU`, `AMD_GPU` | | `CMAKE_PREFIX_PATH` | path | List of paths to check when searching for dependencies | | `CMAKE_INSTALL_PREFIX` | path | Specify the install location, used when invoking `ninja install` | diff --git a/benchmark/portblas/CMakeLists.txt b/benchmark/portblas/CMakeLists.txt index f309286b3..0da461aba 100644 --- a/benchmark/portblas/CMakeLists.txt +++ b/benchmark/portblas/CMakeLists.txt @@ -85,6 +85,21 @@ if(${BLAS_ENABLE_EXTENSIONS}) list(APPEND sources extension/reduction.cpp) endif() +# Skip these benchmarks for AdaptiveCpp for SPIRV/OpenCL targets +# that use SYCL 2020 features like group reduction or hang +# during execution (https://github.com/AdaptiveCpp/AdaptiveCpp/issues/1309) +set(ADAPTIVE_CPP_SKIP + blas1/asum.cpp + blas1/dot.cpp + blas1/sdsdot.cpp + blas1/nrm2.cpp + blas2/trsv.cpp + blas2/tbsv.cpp + blas2/tpsv.cpp + # Hang during execution (without failing) + blas3/trsm.cpp +) + # Operators supporting COMPLEX types benchmarking set(CPLX_OPS "gemm" "gemm_batched" @@ -101,6 +116,9 @@ set(HALF_DATA_OPS "axpy" # Add individual benchmarks for each method foreach(portblas_bench ${sources}) get_filename_component(bench_exec ${portblas_bench} NAME_WE) + if(is_adaptivecpp AND ${portblas_bench} IN_LIST ADAPTIVE_CPP_SKIP) + continue() + endif() add_executable(bench_${bench_exec} ${portblas_bench} main.cpp) target_link_libraries(bench_${bench_exec} PRIVATE benchmark Clara::Clara portblas bench_info) target_compile_definitions(bench_${bench_exec} PRIVATE -DBLAS_INDEX_T=${BLAS_BENCHMARK_INDEX_TYPE}) diff --git a/cmake/Modules/SYCL.cmake b/cmake/Modules/SYCL.cmake index 42cd90f85..31f28245e 100644 --- a/cmake/Modules/SYCL.cmake +++ b/cmake/Modules/SYCL.cmake @@ -25,26 +25,25 @@ include(CheckCXXCompilerFlag) include(ConfigurePORTBLAS) -# find_package(hipSYCL) requires HIPSYCL_TARGETS to be set, so set it to a default value before find_package(hipSYCL) -if(SYCL_COMPILER MATCHES "hipsycl" AND NOT 
HIPSYCL_TARGETS AND NOT ENV{HIPSYCL_TARGETS}) - message(STATUS "Using `omp` as HIPSYCL_TARGETS") - set(HIPSYCL_TARGETS "omp") +# find_package(AdaptiveCpp) requires ACPP_TARGETS to be set, so set it to a default value before find_package(AdaptiveCpp) +if(SYCL_COMPILER MATCHES "adaptivecpp" AND NOT ACPP_TARGETS AND NOT ENV{ACPP_TARGETS}) + message(STATUS "Using `omp` as ACPP_TARGETS") + set(ACPP_TARGETS "omp") +else() + message(STATUS "Using ${ACPP_TARGETS} as ACPP_TARGETS") endif() +check_cxx_compiler_flag("--acpp-targets" has_acpp) check_cxx_compiler_flag("-fsycl" has_fsycl) if(NOT SYCL_COMPILER) - if(has_fsycl) + if(has_acpp) + find_package(AdaptiveCpp QUIET) + set(is_adaptivecpp ${AdaptiveCpp_FOUND}) + set(SYCL_COMPILER "adaptivecpp") + else() set(is_dpcpp ON) set(SYCL_COMPILER "dpcpp") - else() - find_package(hipSYCL QUIET) - set(is_hipsycl ${hipSYCL_FOUND}) - set(SYCL_COMPILER "hipsycl") - if(NOT is_hipsycl) - set(is_computecpp ON) - set(SYCL_COMPILER "computecpp") - endif() endif() else() if(SYCL_COMPILER MATCHES "dpcpp") @@ -52,9 +51,13 @@ else() if(NOT has_fsycl) message(WARNING "Selected DPC++ as backend, but -fsycl not supported") endif() - elseif(SYCL_COMPILER MATCHES "hipsycl") - find_package(hipSYCL REQUIRED CONFIG) - set(is_hipsycl ON) + elseif(SYCL_COMPILER MATCHES "adaptivecpp") + find_package(AdaptiveCpp CONFIG REQUIRED) + set(is_adaptivecpp ${AdaptiveCpp_FOUND}) + if(NOT has_acpp) + message(WARNING "Selected AdaptiveCpp as backend, but the compiler is not + fully supported") + endif() elseif(SYCL_COMPILER MATCHES "computecpp") set(is_computecpp ON) else() @@ -88,8 +91,14 @@ elseif(is_dpcpp) endif() find_package(DPCPP REQUIRED) get_target_property(SYCL_INCLUDE_DIRS DPCPP::DPCPP INTERFACE_INCLUDE_DIRECTORIES) -elseif(is_hipsycl) +elseif(is_adaptivecpp) set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3") - get_target_property(SYCL_INCLUDE_DIRS hipSYCL::hipSYCL-rt INTERFACE_INCLUDE_DIRECTORIES) + 
get_target_property(SYCL_INCLUDE_DIRS AdaptiveCpp::acpp-rt INTERFACE_INCLUDE_DIRECTORIES) + set(HIP_BENCH_UNSUPPORTED_TARGETS "INTEL_GPU" "DEFAULT_CPU") + if((${BLAS_ENABLE_BENCHMARK}) AND (${TUNING_TARGET} IN_LIST HIP_BENCH_UNSUPPORTED_TARGETS)) + message(STATUS "Benchmarks are not supported when targetting OpenCL/LevelZero backend + devices. portBLAS Benchmarks are disabled.") + set(BLAS_ENABLE_BENCHMARK OFF) + endif() endif() diff --git a/include/container/sycl_iterator.h b/include/container/sycl_iterator.h index e6ea4f953..9e12939ed 100644 --- a/include/container/sycl_iterator.h +++ b/include/container/sycl_iterator.h @@ -194,17 +194,32 @@ template inline typename BufferIterator::template accessor_t BufferIterator::get_range_accessor(cl::sycl::handler& cgh, size_t size) { - return typename BufferIterator::template accessor_t( - buffer_, cgh, cl::sycl::range<1>(size), - cl::sycl::id<1>(BufferIterator::get_offset())); + if constexpr (acc_md_t == cl::sycl::access::mode::read) { + return typename BufferIterator::template accessor_t( + buffer_, cgh, cl::sycl::range<1>(size), + cl::sycl::id<1>(BufferIterator::get_offset())); + } else { + // Skip data initialization if not accessing in read mode only + return typename BufferIterator::template accessor_t( + buffer_, cgh, cl::sycl::range<1>(size), + cl::sycl::id<1>(BufferIterator::get_offset()), + cl::sycl::property::no_init{}); + } } template template inline typename BufferIterator::template accessor_t BufferIterator::get_range_accessor(cl::sycl::handler& cgh) { - return BufferIterator::get_range_accessor( - cgh, BufferIterator::get_size()); + if constexpr (acc_md_t == cl::sycl::access::mode::read) { + return BufferIterator::get_range_accessor( + cgh, BufferIterator::get_size()); + } else { + // Skip data initialization if not accessing in read mode only + return BufferIterator::get_range_accessor( + cgh, BufferIterator::get_size(), + cl::sycl::property::no_init{}); + } } template @@ -212,9 +227,18 @@ template inline 
typename BufferIterator::template placeholder_accessor_t< acc_md_t> BufferIterator::get_range_accessor(size_t size) { - return typename BufferIterator::template placeholder_accessor_t< - acc_md_t>(buffer_, cl::sycl::range<1>(size), - cl::sycl::id<1>(BufferIterator::get_offset())); + if constexpr (acc_md_t == cl::sycl::access::mode::read) { + return typename BufferIterator::template placeholder_accessor_t< + acc_md_t>(buffer_, cl::sycl::range<1>(size), + cl::sycl::id<1>(BufferIterator::get_offset())); + + } else { + // Skip data initialization if not accessing in read mode only + return typename BufferIterator::template placeholder_accessor_t< + acc_md_t>(buffer_, cl::sycl::range<1>(size), + cl::sycl::id<1>(BufferIterator::get_offset()), + cl::sycl::property::no_init{}); + } } template diff --git a/include/interface/blas1_interface.h b/include/interface/blas1_interface.h index c2fd8de1a..7684a0a65 100644 --- a/include/interface/blas1_interface.h +++ b/include/interface/blas1_interface.h @@ -196,7 +196,7 @@ typename sb_handle_t::event_t _swap( const typename sb_handle_t::event_t &_dependencies); /** - * \brief SCALAR operation on a vector + * \brief SCALAR operation on a vector * @param sb_handle_t sb_handle * @param _vx BufferIterator or USM pointer * @param _incx Increment for the vector X @@ -208,6 +208,37 @@ typename sb_handle_t::event_t _scal( sb_handle_t &sb_handle, index_t _N, element_t _alpha, container_0_t _vx, increment_t _incx, const typename sb_handle_t::event_t &_dependencies); +/** + * \brief SCALAR operation on a matrix. 
(this is a generalization of + * vector-based _scal operator meant for internal use within the library, namely + * for GEMM and inplace-Matcopy operators) + * @param sb_handle_t sb_handle + * @param _A Input/Output BufferIterator or USM pointer + * @param _incA Increment for the matrix A + * @param _lda Leading dimension for the matrix A + * @param _M number of rows + * @param _N number of columns + * @param alpha scaling scalar + * @param _dependencies Vector of events + */ +template +typename sb_handle_t::event_t _scal_matrix( + sb_handle_t &sb_handle, index_t _M, index_t _N, element_t _alpha, + container_0_t _A, index_t _lda, increment_t _incA, + const typename sb_handle_t::event_t &_dependencies); + +/*! + * \brief Prototype for the internal implementation of the _scal_matrix + * operator. + */ +template +typename sb_handle_t::event_t _scal_matrix_impl( + sb_handle_t &sb_handle, index_t _M, index_t _N, element_t _alpha, + container_0_t _A, index_t _lda, increment_t _incA, + const typename sb_handle_t::event_t &_dependencies); + /** * \brief NRM2 Returns the euclidian norm of a vector * @param sb_handle SB_Handle diff --git a/include/operations/blas_constants.h b/include/operations/blas_constants.h index 229cd6721..82a8e0beb 100644 --- a/include/operations/blas_constants.h +++ b/include/operations/blas_constants.h @@ -263,16 +263,18 @@ struct constant_pair { } // namespace blas +#ifndef __ADAPTIVECPP__ template -struct sycl::is_device_copyable> +struct cl::sycl::is_device_copyable> : std::true_type {}; template -struct sycl::is_device_copyable> +struct cl::sycl::is_device_copyable> : std::true_type {}; template struct std::is_trivially_copyable> : std::true_type {}; +#endif #endif // BLAS_CONSTANTS_H diff --git a/include/sb_handle/portblas_handle.h b/include/sb_handle/portblas_handle.h index f7104a0cc..836b37c61 100644 --- a/include/sb_handle/portblas_handle.h +++ b/include/sb_handle/portblas_handle.h @@ -49,18 +49,24 @@ class SB_Handle { public: using event_t = 
std::vector; inline SB_Handle(queue_t q) - : tempMemPool_(nullptr), + : +#ifndef __ADAPTIVECPP__ + tempMemPool_(nullptr), +#endif q_(q), workGroupSize_(helper::get_work_group_size(q)), localMemorySupport_(helper::has_local_memory(q)), - computeUnits_(helper::get_num_compute_units(q)) {} + computeUnits_(helper::get_num_compute_units(q)) { + } +#ifndef __ADAPTIVECPP__ inline SB_Handle(Temp_Mem_Pool* tmp) : tempMemPool_(tmp), q_(tmp->get_queue()), workGroupSize_(helper::get_work_group_size(q_)), localMemorySupport_(helper::has_local_memory(q_)), computeUnits_(helper::get_num_compute_units(q_)) {} +#endif template typename std::enable_if< @@ -191,7 +197,9 @@ class SB_Handle { const size_t workGroupSize_; const bool localMemorySupport_; const size_t computeUnits_; +#ifndef __ADAPTIVECPP__ Temp_Mem_Pool* tempMemPool_; +#endif }; } // namespace blas diff --git a/include/sb_handle/temp_memory_pool.h b/include/sb_handle/temp_memory_pool.h index 1cb2a5d59..836fb98c3 100644 --- a/include/sb_handle/temp_memory_pool.h +++ b/include/sb_handle/temp_memory_pool.h @@ -25,6 +25,7 @@ #ifndef TEMP_MEMORY_POOL_H #define TEMP_MEMORY_POOL_H +#ifndef __ADAPTIVECPP__ #include #include @@ -39,8 +40,11 @@ class Temp_Mem_Pool { public: Temp_Mem_Pool(queue_t q) : q_(q), - temp_buffer_map_tot_byte_size_(0), - temp_usm_map_tot_byte_size_(0) {} +#ifdef SB_ENABLE_USM + temp_usm_map_tot_byte_size_(0), +#endif + temp_buffer_map_tot_byte_size_(0) { + } Temp_Mem_Pool(const Temp_Mem_Pool& h) = delete; Temp_Mem_Pool operator=(Temp_Mem_Pool) = delete; @@ -96,7 +100,7 @@ class Temp_Mem_Pool { temp_buffer_map_t temp_buffer_map_; template - void release_usm_mem_(const container_t& mem); + void release_buff_mem_(const container_t& mem); #ifdef SB_ENABLE_USM std::mutex temp_usm_map_mutex_; @@ -105,8 +109,11 @@ class Temp_Mem_Pool { temp_usm_size_map_t temp_usm_size_map_; template - void release_buff_mem_(const container_t& mem); -#endif + void release_usm_mem_(const container_t& mem); +#endif // SB_ENABLE_USM 
}; } // namespace blas + +#endif // __ADAPTIVECPP__ + #endif diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt index 61ddd8727..eac74afb7 100644 --- a/samples/CMakeLists.txt +++ b/samples/CMakeLists.txt @@ -18,7 +18,7 @@ foreach(src_file ${SAMPLES_LIST}) get_filename_component(sample_exec ${src_file} NAME_WE) set(sample_exec "sample_${sample_exec}") add_executable(${sample_exec} ${src_file}) - if(is_hipsycl OR is_dpcpp) + if(is_adaptivecpp OR is_dpcpp) set_target_properties(${sample_exec} PROPERTIES CXX_STANDARD 17) else() set_target_properties(${sample_exec} PROPERTIES CXX_STANDARD 14) diff --git a/src/interface/blas1/backend/default_cpu.hpp b/src/interface/blas1/backend/default_cpu.hpp index fb52c7a42..ac2fe764d 100644 --- a/src/interface/blas1/backend/default_cpu.hpp +++ b/src/interface/blas1/backend/default_cpu.hpp @@ -49,6 +49,7 @@ template ( @@ -59,6 +60,20 @@ typename sb_handle_t::event_t _iamax( return blas::internal::_iamax_iamin_impl( sb_handle, _N, _vx, _incx, _rs, nWG, _dependencies); } +#else + // Temporary work-around to avoid non-local memory implementation of + // iamin/iamax with AdaptiveCpp. + constexpr int localSize = 128; + if (_N < 8192) { + return blas::internal::_iamax_iamin_impl( + sb_handle, _N, _vx, _incx, _rs, static_cast(1), _dependencies); + } else { + const index_t nWG = std::min((_N + localSize - 1) / (localSize * 4), + static_cast(512)); + return blas::internal::_iamax_iamin_impl( + sb_handle, _N, _vx, _incx, _rs, nWG, _dependencies); + } +#endif } } // namespace backend } // namespace iamax @@ -70,6 +85,7 @@ template ( @@ -80,6 +96,21 @@ typename sb_handle_t::event_t _iamin( return blas::internal::_iamax_iamin_impl( sb_handle, _N, _vx, _incx, _rs, nWG, _dependencies); } +#else + // Temporary work-around to avoid non-local memory implementation of + // iamin/iamax with AdaptiveCpp. 
+ constexpr int localSize = 128; + if (_N < 8192) { + return blas::internal::_iamax_iamin_impl( + sb_handle, _N, _vx, _incx, _rs, static_cast(1), _dependencies); + } else { + const index_t nWG = std::min((_N + localSize - 1) / (localSize * 4), + static_cast(512)); + return blas::internal::_iamax_iamin_impl(sb_handle, _N, _vx, _incx, + _rs, nWG, _dependencies); + } +#endif } } // namespace backend } // namespace iamin diff --git a/src/interface/blas1/scal.cpp.in b/src/interface/blas1/scal.cpp.in index aa8bd8ac1..4b82068fb 100644 --- a/src/interface/blas1/scal.cpp.in +++ b/src/interface/blas1/scal.cpp.in @@ -51,5 +51,30 @@ template typename SB_Handle::event_t _scal( const typename SB_Handle::event_t& dependencies); #endif +/** + * \brief SCALAR operation on a Matrix + * @param SB_Handle sb_handle + * @param _alpha scaling scalar + * @param _A Input/Output BufferIterator or USM pointer + * @param _M number of rows + * @param _N number of columns + * @param _lda Leading dimension for the matrix A + * @param _incA Increment for the matrix A + * @param _dependencies Vector of events + */ +template typename SB_Handle::event_t _scal_matrix( + SB_Handle& sb_handle, ${INDEX_TYPE} _M, ${INDEX_TYPE} _N, + ${DATA_TYPE} _alpha, BufferIterator<${DATA_TYPE}> _A, + ${INDEX_TYPE} _lda, ${INCREMENT_TYPE} _incA, + const typename SB_Handle::event_t& dependencies); + +#ifdef SB_ENABLE_USM +template typename SB_Handle::event_t _scal_matrix( + SB_Handle& sb_handle, ${INDEX_TYPE} _M, ${INDEX_TYPE} _N, + ${DATA_TYPE} _alpha, ${DATA_TYPE} * _A, + ${INDEX_TYPE} _lda, ${INCREMENT_TYPE} _incA, + const typename SB_Handle::event_t& dependencies); +#endif + } // namespace internal } // namespace blas diff --git a/src/interface/blas1_interface.hpp b/src/interface/blas1_interface.hpp index 059b9cc27..df914b71b 100644 --- a/src/interface/blas1_interface.hpp +++ b/src/interface/blas1_interface.hpp @@ -151,6 +151,7 @@ typename sb_handle_t::event_t _sdsdot( sb_handle_t &sb_handle, index_t _N, float sb, 
container_0_t _vx, increment_t _incx, container_1_t _vy, increment_t _incy, container_2_t _rs, const typename sb_handle_t::event_t &_dependencies) { +#ifndef __ADAPTIVECPP__ if (!_N) { using element_t = typename ValueType::type; sb_handle.wait(_dependencies); @@ -168,6 +169,11 @@ typename sb_handle_t::event_t _sdsdot( auto ret = sb_handle.execute(assignOp2, dotOp); return blas::concatenate_vectors(dotOp, ret); } +#else + throw std::runtime_error( + "Sdsdot is not supported with AdaptiveCpp as it uses SYCL 2020 " + "reduction."); +#endif } /** @@ -184,7 +190,7 @@ typename sb_handle_t::event_t _asum( sb_handle_t &sb_handle, index_t _N, container_0_t _vx, increment_t _incx, container_1_t _rs, const typename sb_handle_t::event_t &_dependencies) { // keep compatibility with older sycl versions -#if SYCL_LANGUAGE_VERSION < 202000 +#if SYCL_LANGUAGE_VERSION < 202000 || defined(__ADAPTIVECPP__) typename VectorViewType::type vx = make_vector_view(_vx, _incx, _N); auto rs = make_vector_view(_rs, static_cast(1), @@ -202,7 +208,7 @@ typename sb_handle_t::event_t _asum( #endif } -#if SYCL_LANGUAGE_VERSION >= 202000 +#if SYCL_LANGUAGE_VERSION >= 202000 && !defined(__ADAPTIVECPP__) /*! _asum_impl. * @brief Internal implementation of the Absolute sum operator. 
* @@ -296,7 +302,7 @@ typename sb_handle_t::event_t _iamax_iamin_impl( // get the minimum supported sub_group size const index_t min_sg_size = static_cast( q.get_device() - .template get_info()[0]); + .template get_info()[0]); ret = sb_handle.execute(op, min_sg_size, min_sg_size, _dependencies); } else { ret = sb_handle.execute( @@ -311,7 +317,7 @@ typename sb_handle_t::event_t _iamax_iamin_impl( // get the minimum supported sub_group size const index_t min_sg_size = static_cast( q.get_device() - .template get_info()[0]); + .template get_info()[0]); // if using no local memory, every sub_group writes one intermediate output, // in case if sub_group size is not known at allocation time, than allocate // extra memory using min supported sub_group size. @@ -455,6 +461,58 @@ typename sb_handle_t::event_t _scal( } } +/** + * \brief SCALAR operation on a matrix + * @param sb_handle_t sb_handle + * @param _M number of matrix rows + * @param _N number of matrix columns + * @param alpha scaling scalar + * @param _A Input/Output BufferIterator or USM pointer matrix + * @param _incA Increment for the matrix A + * @param _lda Leading dimension for the matrix A + * @param _dependencies Vector of events + */ +template +typename sb_handle_t::event_t _scal_matrix( + sb_handle_t &sb_handle, index_t _M, index_t _N, element_t _alpha, + container_0_t _A, index_t _lda, increment_t _incA, + const typename sb_handle_t::event_t &_dependencies) { + if (_incA == index_t(1)) { + return _scal_matrix_impl(sb_handle, _M, _N, _alpha, _A, _lda, _incA, + _dependencies); + } else { + return _scal_matrix_impl(sb_handle, _M, _N, _alpha, _A, _lda, _incA, + _dependencies); + } +} + +/** + * \brief Internal implementation of Matrix scaling + * @tparam has_inc Whether matrix has an increment != 1 + * + * Remaining parameters match internal::_scal_matrix parameters (see above) + */ +template +typename sb_handle_t::event_t _scal_matrix_impl( + sb_handle_t &sb_handle, index_t _M, index_t _N, element_t 
_alpha, + container_0_t _A, index_t _lda, increment_t _incA, + const typename sb_handle_t::event_t &_dependencies) { + typename MatrixViewType::type + m_view = make_matrix_view( + _A, _M, _N, _lda, _incA); + if (_alpha == element_t{1}) { + return _dependencies; + } else { + auto scal_op = make_op(_alpha, m_view); + auto copy_op = make_op(m_view, scal_op); + typename sb_handle_t::event_t ret = + sb_handle.execute(copy_op, _dependencies); + return ret; + } +} + /** * \brief NRM2 Returns the euclidian norm of a vector * @param sb_handle_t sb_handle @@ -548,24 +606,23 @@ typename sb_handle_t::event_t _dot_impl( typename sb_handle_t::event_t ret_event; // Skip if N==0, _rs is not overwritten if (!_N) return {_dependencies}; - auto vx = make_vector_view(_vx, _incx, _N); auto vy = make_vector_view(_vy, _incy, _N); auto rs = make_vector_view(_rs, static_cast(1), static_cast(1)); auto prdOp = make_op(vx, vy); - auto assignOp = make_wg_atomic_reduction(rs, prdOp); + auto wgReductionOp = make_wg_atomic_reduction(rs, prdOp); if constexpr (localMemSize) { ret_event = - sb_handle.execute(assignOp, static_cast(localSize), + sb_handle.execute(wgReductionOp, static_cast(localSize), static_cast(_number_wg * localSize), static_cast(localMemSize), _dependencies); } else { - ret_event = sb_handle.execute(assignOp, static_cast(localSize), - static_cast(_number_wg * localSize), - _dependencies); + ret_event = sb_handle.execute( + wgReductionOp, static_cast(localSize), + static_cast(_number_wg * localSize), _dependencies); } return ret_event; } @@ -867,6 +924,7 @@ typename ValueType::type _dot( sb_handle_t &sb_handle, index_t _N, container_0_t _vx, increment_t _incx, container_1_t _vy, increment_t _incy, const typename sb_handle_t::event_t &_dependencies) { +#ifndef __ADAPTIVECPP__ constexpr bool is_usm = std::is_pointer::value; using element_t = typename ValueType::type; element_t res{0}; @@ -889,6 +947,10 @@ typename ValueType::type _dot( : helper::AllocType::buffer>(gpu_res, 
sb_handle.get_queue()); return res; +#else + throw std::runtime_error( + "Dot is not supported with AdaptiveCpp as it uses SYCL 2020 reduction."); +#endif } /** @@ -1012,6 +1074,7 @@ template ::type _asum( sb_handle_t &sb_handle, index_t _N, container_t _vx, increment_t _incx, const typename sb_handle_t::event_t &_dependencies) { +#ifndef __ADAPTIVECPP__ constexpr bool is_usm = std::is_pointer::value; using element_t = typename ValueType::type; auto res = std::vector(1, element_t(0)); @@ -1032,6 +1095,10 @@ typename ValueType::type _asum( : helper::AllocType::buffer>( gpu_res, sb_handle.get_queue()); return res[0]; +#else + throw std::runtime_error( + "Asum is not supported with AdaptiveCpp as it uses SYCL 2020 reduction."); +#endif } /** @@ -1047,6 +1114,7 @@ template ::type _nrm2( sb_handle_t &sb_handle, index_t _N, container_t _vx, increment_t _incx, const typename sb_handle_t::event_t &_dependencies) { +#ifndef __ADAPTIVECPP__ constexpr bool is_usm = std::is_pointer::value; using element_t = typename ValueType::type; auto res = std::vector(1, element_t(0)); @@ -1066,6 +1134,10 @@ typename ValueType::type _nrm2( : helper::AllocType::buffer>( gpu_res, sb_handle.get_queue()); return res[0]; +#else + throw std::runtime_error( + "Nrm2 is not supported with AdaptiveCpp as it uses SYCL 2020 reduction."); +#endif } } // namespace internal diff --git a/src/interface/blas2_interface.hpp b/src/interface/blas2_interface.hpp index ac07cf499..71dbee066 100644 --- a/src/interface/blas2_interface.hpp +++ b/src/interface/blas2_interface.hpp @@ -356,7 +356,7 @@ typename sb_handle_t::event_t _trsv_impl( sb_handle_t& sb_handle, index_t _N, container_t0 _mA, index_t _lda, container_t1 _vx, increment_t _incx, const typename sb_handle_t::event_t& _dependencies) { -#if (SYCL_LANGUAGE_VERSION < 202000) || (defined __HIPSYCL__) +#if (SYCL_LANGUAGE_VERSION < 202000) || (defined __ADAPTIVECPP__) throw std::runtime_error("trsv requires SYCL 2020"); #else static_assert(subgroup_size % 
subgroups == 0, @@ -747,7 +747,7 @@ typename sb_handle_t::event_t _tbsv_impl( sb_handle_t& sb_handle, index_t _N, index_t _K, container_t0 _mA, index_t _lda, container_t1 _vx, increment_t _incx, const typename sb_handle_t::event_t& _dependencies) { -#if (SYCL_LANGUAGE_VERSION < 202000) || (defined __HIPSYCL__) +#if (SYCL_LANGUAGE_VERSION < 202000) || (defined __ADAPTIVECPP__) throw std::runtime_error("tbsv requires SYCL 2020"); #else static_assert(subgroup_size % subgroups == 0, @@ -810,7 +810,7 @@ template ::eval( using element_t = typename ResolveReturnType::type::value_t::value_t; +#ifndef __ADAPTIVECPP__ // reduction within the sub_group for (index_t i = sg_local_range >> 1; i > 0; i >>= 1) { if (sg_local_id < i) { @@ -103,7 +104,12 @@ PORTBLAS_INLINE void IndexMaxMin::eval( val = op::eval(val, shfl); } } - +#else + // AdaptiveCpp uses a different interface "shift_group_left" which is + // recognized by the compiler but throws JIT errors at runtime. Currently this + // part is skipped as non-local memory kernel is never called with + // AdaptiveCpp. 
+#endif const index_t lhs_idx = ndItem.get_group_linear_id() * (local_range / sg_local_range) + sg.get_group_linear_id(); @@ -144,7 +150,7 @@ PORTBLAS_INLINE void IndexMaxMin::eval( } scratch[local_id] = val; - ndItem.barrier(sycl::access::fence_space::local_space); + ndItem.barrier(cl::sycl::access::fence_space::local_space); value_t local_val = op::template init(); // reduction within the work group @@ -154,7 +160,7 @@ PORTBLAS_INLINE void IndexMaxMin::eval( local_val = scratch[local_id + i]; scratch[local_id] = op::eval(val, local_val); } - ndItem.barrier(sycl::access::fence_space::local_space); + ndItem.barrier(cl::sycl::access::fence_space::local_space); } // write IndexValueTuple to Global Memory iff reduction step0 diff --git a/src/operations/blas1/WGAtomicReduction.hpp b/src/operations/blas1/WGAtomicReduction.hpp index 0e499dddc..22d923e8d 100644 --- a/src/operations/blas1/WGAtomicReduction.hpp +++ b/src/operations/blas1/WGAtomicReduction.hpp @@ -55,10 +55,11 @@ PORTBLAS_INLINE bool WGAtomicReduction::valid_thread( template PORTBLAS_INLINE typename WGAtomicReduction::value_t WGAtomicReduction::eval(cl::sycl::nd_item<1> ndItem) { - auto atomic_res = sycl::atomic_ref( - lhs_.get_data()[0]); + auto atomic_res = + cl::sycl::atomic_ref( + lhs_.get_data()[0]); const auto size = get_size(); int lid = ndItem.get_global_linear_id(); value_t val = operator_t::template init(); @@ -70,10 +71,11 @@ WGAtomicReduction::eval(cl::sycl::nd_item<1> ndItem) { val = operator_t::eval(val, rhs_.eval(id)); } - val = sycl::reduce_over_group(ndItem.get_sub_group(), val, sycl::plus<>()); + val = cl::sycl::reduce_over_group(ndItem.get_sub_group(), val, + cl::sycl::plus()); - if ((ndItem.get_local_id() & - (ndItem.get_sub_group().get_local_range() - 1)) == 0) { + if ((ndItem.get_local_id()[0] & + (ndItem.get_sub_group().get_local_range()[0] - 1)) == 0) { atomic_res += val; } return {}; @@ -83,10 +85,11 @@ template PORTBLAS_INLINE typename WGAtomicReduction::value_t 
WGAtomicReduction::eval(sharedT scratch, cl::sycl::nd_item<1> ndItem) { - auto atomic_res = sycl::atomic_ref( - lhs_.get_data()[0]); + auto atomic_res = + cl::sycl::atomic_ref( + lhs_.get_data()[0]); const auto size = get_size(); const int lid = static_cast(ndItem.get_global_linear_id()); const auto loop_stride = @@ -98,21 +101,24 @@ WGAtomicReduction::eval(sharedT scratch, val = operator_t::eval(val, rhs_.eval(id)); } - val = sycl::reduce_over_group(ndItem.get_sub_group(), val, sycl::plus<>()); + val = cl::sycl::reduce_over_group(ndItem.get_sub_group(), val, + cl::sycl::plus()); - if (ndItem.get_sub_group().get_local_id() == 0) { + if (ndItem.get_sub_group().get_local_id()[0] == 0) { scratch[ndItem.get_sub_group().get_group_linear_id()] = val; } ndItem.barrier(); - val = (ndItem.get_local_id() < (ndItem.get_local_range(0) / - ndItem.get_sub_group().get_local_range()[0])) - ? scratch[ndItem.get_sub_group().get_local_id()] - : 0; - if (ndItem.get_sub_group().get_group_id() == 0) { - val = sycl::reduce_over_group(ndItem.get_sub_group(), val, sycl::plus<>()); + val = + (ndItem.get_local_id()[0] < (ndItem.get_local_range(0) / + ndItem.get_sub_group().get_local_range()[0])) + ? 
scratch[ndItem.get_sub_group().get_local_id()] + : 0; + if (ndItem.get_sub_group().get_group_id()[0] == 0) { + val = cl::sycl::reduce_over_group(ndItem.get_sub_group(), val, + cl::sycl::plus()); } - if (ndItem.get_local_id() == 0) { + if (ndItem.get_local_id()[0] == 0) { atomic_res += val; } diff --git a/src/operations/blas2/spr.hpp b/src/operations/blas2/spr.hpp index ee317a92f..9a69866d3 100644 --- a/src/operations/blas2/spr.hpp +++ b/src/operations/blas2/spr.hpp @@ -101,15 +101,20 @@ typename rhs_1_t::value_t Spr::eval( index_t row = 0, col = 0; +#ifndef __ADAPTIVECPP__ if (!id) { +#endif Spr::compute_row_col( global_idx, N_, row, col); +#ifndef __ADAPTIVECPP__ } row = cl::sycl::group_broadcast(ndItem.get_group(), row); col = cl::sycl::group_broadcast(ndItem.get_group(), col); +#endif if (global_idx < lhs_size) { +#ifndef __ADAPTIVECPP__ if constexpr (isUpper) { if (id) { row += id; @@ -127,6 +132,7 @@ typename rhs_1_t::value_t Spr::eval( } } } +#endif value_t lhs_val = lhs_.eval(global_idx); value_t rhs_1_val = rhs_1_.eval(row); @@ -135,7 +141,8 @@ typename rhs_1_t::value_t Spr::eval( value_t rhs_1_val_second = rhs_1_.eval(col); value_t rhs_2_val_second = rhs_2_.eval(row); lhs_.eval(global_idx) = rhs_1_val * rhs_2_val * alpha_ + - rhs_1_val_second * rhs_2_val_second * alpha_ + lhs_val; + rhs_1_val_second * rhs_2_val_second * alpha_ + + lhs_val; } else lhs_.eval(global_idx) = rhs_1_val * rhs_2_val * alpha_ + lhs_val; } @@ -161,9 +168,8 @@ Spr::adjust_access_displacement() { template -PORTBLAS_INLINE - typename Spr::index_t - Spr::get_size() const { +PORTBLAS_INLINE typename Spr::index_t +Spr::get_size() const { return rhs_1_.get_size(); } template ::eval(local_memory_t local_mem, cl::sycl::nd_item<1> ndItem) { value_t ret = 0; -#if (SYCL_LANGUAGE_VERSION >= 202000) && !(defined __HIPSYCL__) +#if (SYCL_LANGUAGE_VERSION >= 202000) && !(defined __ADAPTIVECPP__) constexpr bool is_forward = (is_upper && is_transposed) || (!is_upper && !is_transposed); @@ -137,9 
+137,9 @@ Txsv( + auto a = cl::sycl::atomic_ref( sync_.eval(0)); // Get the wg_id of actual workgroup @@ -185,7 +185,7 @@ Txsv ready_block)))) ready_block = - sycl::group_broadcast(ndItem.get_sub_group(), not_wi0 ? 0 : *p); + cl::sycl::group_broadcast(ndItem.get_sub_group(), not_wi0 ? 0 : *p); loc_x[l_idx] = (curr_offset < _N) ? lhs_.eval(curr_offset) : value_t(0); } @@ -272,9 +272,9 @@ Txsv::value) { + *reg_res = reg_a[j * (item_batchs / VectorSize) + b] * + reg_b[i * (item_batchs / VectorSize) + b] + + *reg_res; + } else { + *reg_res = cl::sycl::mad(reg_a[j * (item_batchs / VectorSize) + b], + reg_b[i * (item_batchs / VectorSize) + b], + *reg_res); + } +#else *reg_res = cl::sycl::mad(reg_a[j * (item_batchs / VectorSize) + b], reg_b[i * (item_batchs / VectorSize) + b], *reg_res); +#endif ++reg_res; } } diff --git a/src/operations/extension/axpy_batch.hpp b/src/operations/extension/axpy_batch.hpp index d5a70a7d6..344d4ec2a 100644 --- a/src/operations/extension/axpy_batch.hpp +++ b/src/operations/extension/axpy_batch.hpp @@ -68,7 +68,7 @@ Axpy_batch::eval( const index_t size_compute_rateo = (n > nbl * localSize) ? 
n / (nbl * localSize) : batch_size_; - const index_t jump_value{sycl::min(batch_size_, size_compute_rateo)}; + const index_t jump_value{cl::sycl::min(batch_size_, size_compute_rateo)}; if (group_id >= jump_value || l_id > n) return {}; diff --git a/src/sb_handle/portblas_handle.hpp b/src/sb_handle/portblas_handle.hpp index c03b7b277..f158de242 100644 --- a/src/sb_handle/portblas_handle.hpp +++ b/src/sb_handle/portblas_handle.hpp @@ -44,9 +44,11 @@ typename std::enable_if< alloc == helper::AllocType::buffer, typename helper::AllocHelper::type>::type SB_Handle::acquire_temp_mem(size_t size) { +#ifndef __ADAPTIVECPP__ if (tempMemPool_ != nullptr) return tempMemPool_->acquire_buff_mem(size); else +#endif return make_sycl_iterator_buffer(size); } @@ -58,9 +60,11 @@ typename std::enable_if< typename SB_Handle::event_t>::type SB_Handle::release_temp_mem(const typename SB_Handle::event_t& dependencies, const container_t& mem) { +#ifndef __ADAPTIVECPP__ if (tempMemPool_ != nullptr) return tempMemPool_->release_buff_mem(dependencies, mem); else +#endif return {}; } diff --git a/src/sb_handle/temp_memory_pool.hpp b/src/sb_handle/temp_memory_pool.hpp index 4e3a68c43..1d57b0c6f 100644 --- a/src/sb_handle/temp_memory_pool.hpp +++ b/src/sb_handle/temp_memory_pool.hpp @@ -1,6 +1,6 @@ #ifndef TEMP_MEMORY_POOL_HPP #define TEMP_MEMORY_POOL_HPP - +#ifndef __ADAPTIVECPP__ #include "portblas_helper.h" namespace blas { @@ -114,5 +114,6 @@ typename Temp_Mem_Pool::event_t Temp_Mem_Pool::release_usm_mem( })}; } } -#endif +#endif // SB_ENABLE_USM +#endif // __ADAPTIVECPP__ #endif diff --git a/test/unittest/CMakeLists.txt b/test/unittest/CMakeLists.txt index 7b398a2a7..f6044d830 100644 --- a/test/unittest/CMakeLists.txt +++ b/test/unittest/CMakeLists.txt @@ -34,6 +34,7 @@ set(SYCL_UNITTEST_SRCS ${PORTBLAS_UNITTEST}/blas1/blas1_axpy_test.cpp ${PORTBLAS_UNITTEST}/blas1/blas1_copy_test.cpp ${PORTBLAS_UNITTEST}/blas1/blas1_scal_test.cpp + ${PORTBLAS_UNITTEST}/blas1/blas1_swap_test.cpp 
${PORTBLAS_UNITTEST}/blas1/blas1_rot_test.cpp ${PORTBLAS_UNITTEST}/blas1/blas1_rotm_test.cpp ${PORTBLAS_UNITTEST}/blas1/blas1_rotmg_test.cpp @@ -55,6 +56,8 @@ set(SYCL_UNITTEST_SRCS ${PORTBLAS_UNITTEST}/blas2/blas2_syr2_test.cpp ${PORTBLAS_UNITTEST}/blas2/blas2_symv_test.cpp ${PORTBLAS_UNITTEST}/blas2/blas2_tpmv_test.cpp + ${PORTBLAS_UNITTEST}/blas2/blas2_trmv_test.cpp + ${PORTBLAS_UNITTEST}/blas2/blas2_tbmv_test.cpp # Blas 3 tests ${PORTBLAS_UNITTEST}/blas3/blas3_gemm_test.cpp ${PORTBLAS_UNITTEST}/blas3/blas3_gemm_batched_test.cpp @@ -67,6 +70,20 @@ set(SYCL_UNITTEST_SRCS ${PORTBLAS_UNITTEST}/extension/omatcopy_batched_test.cpp ${PORTBLAS_UNITTEST}/extension/omatadd_batched_test.cpp ${PORTBLAS_UNITTEST}/extension/axpy_batch_test.cpp + ${PORTBLAS_UNITTEST}/buffers/sycl_buffer_test.cpp +) + +# Skip these tests for AdaptiveCpp for SPIRV/OpenCL targets +# that use SYCL 2020 features like group reduction or hang +# during execution (https://github.com/AdaptiveCpp/AdaptiveCpp/issues/1309) +set(ADAPTIVE_CPP_SKIP + ${PORTBLAS_UNITTEST}/blas1/blas1_asum_test.cpp + ${PORTBLAS_UNITTEST}/blas1/blas1_sdsdot_test.cpp + ${PORTBLAS_UNITTEST}/blas1/blas1_nrm2_test.cpp + ${PORTBLAS_UNITTEST}/blas1/blas1_dot_test.cpp + ${PORTBLAS_UNITTEST}/blas1/blas1_rot_test.cpp + # Hang during execution (without failing) + ${PORTBLAS_UNITTEST}/blas3/blas3_trsm_test.cpp ) if(${BLAS_ENABLE_EXTENSIONS}) @@ -84,19 +101,6 @@ if(is_dpcpp) ) endif() -# Temporary disabling the following tests fro Intel DPC++ as currently Intel compiler crashes while running the following tests -if(is_computecpp) - set(SYCL_UNITTEST_SRCS ${SYCL_UNITTEST_SRCS} - # Blas 1 tests - ${PORTBLAS_UNITTEST}/blas1/blas1_swap_test.cpp - # Blas 2 tests - ${PORTBLAS_UNITTEST}/blas2/blas2_trmv_test.cpp - ${PORTBLAS_UNITTEST}/blas2/blas2_tbmv_test.cpp - # Blas buffer tests - ${PORTBLAS_UNITTEST}/buffers/sycl_buffer_test.cpp - ) - -endif() # Contains tests that fail if compiled with -ffast-math set(SYCL_UNITTEST_NOFASTMATH @@ -115,6 
+119,9 @@ set(HALF_DATA_OPS "blas1_axpy_test" ) foreach(blas_test ${SYCL_UNITTEST_SRCS}) + if(is_adaptivecpp AND ${blas_test} IN_LIST ADAPTIVE_CPP_SKIP) + continue() + endif() get_filename_component(test_exec ${blas_test} NAME_WE) add_executable(${test_exec} main.cpp ${blas_test}) if(is_computecpp) diff --git a/test/unittest/blas1/blas1_iamax_test.cpp b/test/unittest/blas1/blas1_iamax_test.cpp index acd80af02..ccdbcdd38 100644 --- a/test/unittest/blas1/blas1_iamax_test.cpp +++ b/test/unittest/blas1/blas1_iamax_test.cpp @@ -4,7 +4,6 @@ template void run_test(const combination_t combi) { - std::string alloc; api_type api; index_t size; diff --git a/test/unittest/blas1/blas1_iamin_test.cpp b/test/unittest/blas1/blas1_iamin_test.cpp index 55997c530..9f0193697 100644 --- a/test/unittest/blas1/blas1_iamin_test.cpp +++ b/test/unittest/blas1/blas1_iamin_test.cpp @@ -29,7 +29,6 @@ template void run_test(const combination_t combi) { - std::string alloc; api_type api; index_t size; diff --git a/test/unittest/blas3/blas3_gemm_batched_test.cpp b/test/unittest/blas3/blas3_gemm_batched_test.cpp index 74257b5cf..cc26455f5 100644 --- a/test/unittest/blas3/blas3_gemm_batched_test.cpp +++ b/test/unittest/blas3/blas3_gemm_batched_test.cpp @@ -127,22 +127,22 @@ GENERATE_GEMM_STRIDED_BATCHED_TEST(BatchStridedGemm, DefaultGemmAndGemmBatched); template const auto AllStridedBatched = - ::testing::Combine(::testing::Values("usm", "buf"), // allocation type - ::testing::Values(0, 33), // offset - ::testing::Values(5), // batch - ::testing::Values(128), // m - ::testing::Values(128), // n - ::testing::Values(128), // k - ::testing::Values('n', 't'), // transa - ::testing::Values('n', 't'), // transb - ::testing::Values(3.0), // alpha - ::testing::Values(7.0), // beta - ::testing::Values(2), // lda_mul - ::testing::Values(3), // ldb_mul - ::testing::Values(4), // ldc_mul - ::testing::Values(0, 1, 2), // stride_a_mul - ::testing::Values(0, 1, 2), // stride_b_mul - ::testing::Values(1, 2, 3) // 
stride_c_mul + ::testing::Combine(::testing::Values("usm", "buf"), // allocation type + ::testing::Values(0, 33), // offset + ::testing::Values(5), // batch + ::testing::Values(63), // m + ::testing::Values(63), // n + ::testing::Values(128), // k + ::testing::Values('n', 't'), // transa + ::testing::Values('n', 't'), // transb + ::testing::Values(3.0, 0.0), // alpha + ::testing::Values(7.0, 1.0, 0.0), // beta + ::testing::Values(2), // lda_mul + ::testing::Values(3), // ldb_mul + ::testing::Values(4), // ldc_mul + ::testing::Values(0, 1, 2), // stride_a_mul + ::testing::Values(0, 1, 2), // stride_b_mul + ::testing::Values(1, 3) // stride_c_mul ); GENERATE_GEMM_STRIDED_BATCHED_TEST(BatchStridedGemm, AllStridedBatched); diff --git a/test/unittest/blas3/blas3_gemm_common.hpp b/test/unittest/blas3/blas3_gemm_common.hpp index 3f06d58ec..0b0073a5e 100644 --- a/test/unittest/blas3/blas3_gemm_common.hpp +++ b/test/unittest/blas3/blas3_gemm_common.hpp @@ -356,10 +356,7 @@ inline void verify_gemm( blas::helper::copy_to_host(q, m_c_gpu, c_m_gpu.data(), buffer_size_c); sb_handle.wait(event); - const bool isAlmostEqual = - (stride_c_mul == 1) - ? utils::compare_vectors(c_m_gpu, c_m_cpu) - : utils::compare_vectors_strided(c_m_gpu, c_m_cpu, stride_c, size_c); + const bool isAlmostEqual = utils::compare_vectors(c_m_gpu, c_m_cpu); ASSERT_TRUE(isAlmostEqual); helper::deallocate(m_a_gpu, q); @@ -681,10 +678,7 @@ inline void verify_gemm( buffer_size_c); sb_handle.wait(event); - const bool isAlmostEqual = - (stride_c_mul == 1) - ? utils::compare_vectors(c_m_gpu, c_m_cpu) - : utils::compare_vectors_strided(c_m_gpu, c_m_cpu, stride_c, size_c); + const bool isAlmostEqual = utils::compare_vectors(c_m_gpu, c_m_cpu); ASSERT_TRUE(isAlmostEqual); helper::deallocate(m_a_gpu, q);