From be6421ea6752d8d8c017bf3070750e343ba0653d Mon Sep 17 00:00:00 2001
From: Luca Ferragina <ferragina.luca@gmail.com>
Date: Mon, 28 Nov 2022 12:22:22 +0100
Subject: [PATCH 01/10] Rewrite the SYCL backend for the SYCL 2020 standard and
 USM allocations (part 1)

Initial work to support the SYCL 2020 standard, using USM allocations instead of
SYCL buffers and accessors:
  - bring the SYCL interface in line with the other backends, and remove the last
    uses of the alpaka::experimental namespace;
  - reimplement the alpaka memory buffers, memset and memcpy tasks for the USM
    SYCL backend;
  - make the SYCL native handles more consistent with the other backends;
  - use the oneAPI printf extension, and implement a workaround for the OpenCL
    limitation on variadic functions and the conflict with AMD HIP/ROCm device code;
  - add more debug print messages;
  - various fixes for kernel names, memory_scope Grid and atomics;
  - update copyright information.

Initial work on the SYCL random number generators (not fully working yet).
---
 example/vectorAdd/src/vectorAdd.cpp           |  10 +-
 include/alpaka/acc/AccCpuSyclIntel.hpp        |   2 +-
 include/alpaka/acc/AccGenericSycl.hpp         |   6 +-
 include/alpaka/alpaka.hpp                     |   4 +-
 include/alpaka/atomic/AtomicGenericSycl.hpp   |   2 +-
 include/alpaka/core/Sycl.hpp                  |   5 +-
 include/alpaka/dev/DevGenericSycl.hpp         |   4 +-
 .../alpaka/kernel/TaskKernelCpuSyclIntel.hpp  |   2 +-
 .../alpaka/kernel/TaskKernelGenericSycl.hpp   |  11 +-
 include/alpaka/math/MathGenericSycl.hpp       |   2 +-
 include/alpaka/mem/buf/BufCpuSyclIntel.hpp    |   4 +-
 include/alpaka/mem/buf/BufGenericSycl.hpp     | 182 ++++++++------
 include/alpaka/mem/buf/BufGpuSyclIntel.hpp    |   4 +-
 include/alpaka/mem/buf/sycl/Copy.hpp          | 226 +++++++++++------
 include/alpaka/mem/buf/sycl/Set.hpp           | 174 ++++++++++---
 .../alpaka/mem/fence/MemFenceGenericSycl.hpp  |   7 +
 include/alpaka/meta/SyclPrintf.hpp            |  35 +++
 include/alpaka/pltf/PltfCpuSyclIntel.hpp      |   2 +-
 include/alpaka/pltf/PltfFpgaSyclIntel.hpp     |   2 +-
 include/alpaka/pltf/PltfGenericSycl.hpp       |   2 +-
 include/alpaka/pltf/PltfGpuSyclIntel.hpp      |   2 +-
 .../queue/QueueCpuSyclIntelBlocking.hpp       |   2 +-
 .../queue/QueueCpuSyclIntelNonBlocking.hpp    |   2 +-
 .../queue/sycl/QueueGenericSyclBase.hpp       |   3 +-
 include/alpaka/rand/RandGenericSycl.hpp       | 238 ++++++++++++++++++
 include/alpaka/test/Check.hpp                 |  13 +-
 include/alpaka/warp/WarpGenericSycl.hpp       |   2 +-
 include/alpaka/workdiv/WorkDivGenericSycl.hpp |   2 +-
 28 files changed, 735 insertions(+), 215 deletions(-)
 create mode 100644 include/alpaka/meta/SyclPrintf.hpp
 create mode 100644 include/alpaka/rand/RandGenericSycl.hpp
diff --git a/example/vectorAdd/src/vectorAdd.cpp b/example/vectorAdd/src/vectorAdd.cpp
index e057e359e43d..676d0041018b 100644
--- a/example/vectorAdd/src/vectorAdd.cpp
+++ b/example/vectorAdd/src/vectorAdd.cpp
@@ -1,4 +1,4 @@
-/* Copyright 2023 Benjamin Worpitz, Matthias Werner, Bernhard Manfred Gruber, Jan Stephan
+/* Copyright 2023 Benjamin Worpitz, Matthias Werner, Bernhard Manfred Gruber, Jan Stephan, Luca Ferragina
  * SPDX-License-Identifier: ISC
  */
 
@@ -75,7 +75,11 @@ auto main() -> int
     // - AccCpuTbbBlocks
     // - AccCpuSerial
     // using Acc = alpaka::AccCpuSerial<Dim, Idx>;
-    using Acc = alpaka::ExampleDefaultAcc<Dim, Idx>;
+    // using Acc = alpaka::ExampleDefaultAcc<Dim, Idx>;
+
+    using Acc = alpaka::AccGpuSyclIntel<Dim, Idx>;
+    using Pltf = alpaka::Pltf<Acc>;
+    using DevAcc = alpaka::Dev<Acc>;
     std::cout << "Using alpaka accelerator: " << alpaka::getAccName<Acc>() << std::endl;
 
     // Defines the synchronization behavior of a queue
@@ -136,7 +140,7 @@ auto main() -> int
     }
 
     // Allocate 3 buffers on the accelerator
-    using BufAcc = alpaka::Buf<Acc, Data, Dim, Idx>;
+    using BufAcc = alpaka::Buf<DevAcc, Data, Dim, Idx>;
     BufAcc bufAccA(alpaka::allocBuf<Data, Idx>(devAcc, extent));
     BufAcc bufAccB(alpaka::allocBuf<Data, Idx>(devAcc, extent));
     BufAcc bufAccC(alpaka::allocBuf<Data, Idx>(devAcc, extent));
diff --git a/include/alpaka/acc/AccCpuSyclIntel.hpp b/include/alpaka/acc/AccCpuSyclIntel.hpp
index f1f53fad45b6..751d16c6b8b0 100644
--- a/include/alpaka/acc/AccCpuSyclIntel.hpp
+++ b/include/alpaka/acc/AccCpuSyclIntel.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2022 Jan Stephan
+/* Copyright 2022 Jan Stephan, Luca Ferragina
  * SPDX-License-Identifier: MPL-2.0
  */
 
diff --git a/include/alpaka/acc/AccGenericSycl.hpp b/include/alpaka/acc/AccGenericSycl.hpp
index 211d89a7740f..8a7d0972ddd7 100644
--- a/include/alpaka/acc/AccGenericSycl.hpp
+++ b/include/alpaka/acc/AccGenericSycl.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2023 Jan Stephan, Antonio Di Pilato, Andrea Bocci
+/* Copyright 2023 Jan Stephan, Antonio Di Pilato, Andrea Bocci, Luca Ferragina
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -15,6 +15,8 @@
 #include "alpaka/intrinsic/IntrinsicGenericSycl.hpp"
 #include "alpaka/math/MathGenericSycl.hpp"
 #include "alpaka/mem/fence/MemFenceGenericSycl.hpp"
+#include "alpaka/meta/SyclPrintf.hpp"
+#include "alpaka/rand/RandGenericSycl.hpp"
 #include "alpaka/warp/WarpGenericSycl.hpp"
 #include "alpaka/workdiv/WorkDivGenericSycl.hpp"
 
@@ -56,6 +58,7 @@ namespace alpaka
         , public BlockSyncGenericSycl<TDim>
         , public IntrinsicGenericSycl
         , public MemFenceGenericSycl
+        , public rand::RandGenericSycl
         , public warp::WarpGenericSycl<TDim>
     {
     public:
@@ -83,6 +86,7 @@ namespace alpaka
             , BlockSyncGenericSycl<TDim>{work_item}
             , IntrinsicGenericSycl{}
             , MemFenceGenericSycl{global_fence_dummy, local_fence_dummy}
+            , rand::RandGenericSycl{}
             , warp::WarpGenericSycl<TDim>{work_item}
             , cout{output_stream}
         {
diff --git a/include/alpaka/alpaka.hpp b/include/alpaka/alpaka.hpp
index 43829f951fde..c96d396ac4c4 100644
--- a/include/alpaka/alpaka.hpp
+++ b/include/alpaka/alpaka.hpp
@@ -1,5 +1,5 @@
 /* Copyright 2023 Axel Hübl, Benjamin Worpitz, Erik Zenker, Matthias Werner, René Widera, Bernhard Manfred Gruber,
- *                Jan Stephan, Antonio Di Pilato
+ *                Jan Stephan, Antonio Di Pilato, Luca Ferragina
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -172,6 +172,7 @@
 #include "alpaka/meta/NdLoop.hpp"
 #include "alpaka/meta/NonZero.hpp"
 #include "alpaka/meta/Set.hpp"
+#include "alpaka/meta/SyclPrintf.hpp"
 #include "alpaka/meta/Transform.hpp"
 #include "alpaka/meta/TypeListOps.hpp"
 // offset
@@ -186,6 +187,7 @@
 #include "alpaka/pltf/Traits.hpp"
 // rand
 #include "alpaka/rand/RandDefault.hpp"
+#include "alpaka/rand/RandGenericSycl.hpp"
 #include "alpaka/rand/RandPhilox.hpp"
 #include "alpaka/rand/RandStdLib.hpp"
 #include "alpaka/rand/RandUniformCudaHipRand.hpp"
diff --git a/include/alpaka/atomic/AtomicGenericSycl.hpp b/include/alpaka/atomic/AtomicGenericSycl.hpp
index 742adcdfa3e7..8864508d578b 100644
--- a/include/alpaka/atomic/AtomicGenericSycl.hpp
+++ b/include/alpaka/atomic/AtomicGenericSycl.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2023 Jan Stephan, Andrea Bocci
+/* Copyright 2023 Jan Stephan, Andrea Bocci, Luca Ferragina
  * SPDX-License-Identifier: MPL-2.0
  */
 
diff --git a/include/alpaka/core/Sycl.hpp b/include/alpaka/core/Sycl.hpp
index cab73fe17f00..3f067f5f8b39 100644
--- a/include/alpaka/core/Sycl.hpp
+++ b/include/alpaka/core/Sycl.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2022 Jan Stephan
+/* Copyright 2022 Jan Stephan, Luca Ferragina
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -146,10 +146,7 @@ namespace alpaka::trait
     {
         using type = std::conditional_t<std::is_scalar_v<T>, T, typename T::element_type>;
     };
-} // namespace alpaka::trait
 
-namespace alpaka::trait
-{
     //! The SYCL vectors' extent get trait specialization.
     template<typename TExtent>
     struct GetExtent<DimInt<Dim<TExtent>::value>, TExtent, std::enable_if_t<IsSyclBuiltInType<TExtent>::value>>
diff --git a/include/alpaka/dev/DevGenericSycl.hpp b/include/alpaka/dev/DevGenericSycl.hpp
index 5892df450df3..aa5027d23513 100644
--- a/include/alpaka/dev/DevGenericSycl.hpp
+++ b/include/alpaka/dev/DevGenericSycl.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2022 Jan Stephan, Antonio Di Pilato
+/* Copyright 2022 Jan Stephan, Antonio Di Pilato, Luca Ferragina
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -204,7 +204,7 @@ namespace alpaka::trait
     template<typename TElem, typename TDim, typename TIdx, typename TPltf>
     struct BufType<DevGenericSycl<TPltf>, TElem, TDim, TIdx>
     {
-        using type = BufGenericSycl<TElem, TDim, TIdx, DevGenericSycl<TPltf>>;
+        using type = BufGenericSycl<TElem, TDim, TIdx, TPltf>;
     };
 
     //! The SYCL device platform type trait specialization.
diff --git a/include/alpaka/kernel/TaskKernelCpuSyclIntel.hpp b/include/alpaka/kernel/TaskKernelCpuSyclIntel.hpp
index 18b67d51a8e1..fbfc88b9c4b3 100644
--- a/include/alpaka/kernel/TaskKernelCpuSyclIntel.hpp
+++ b/include/alpaka/kernel/TaskKernelCpuSyclIntel.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2022 Jan Stephan
+/* Copyright 2022 Jan Stephan, Luca Ferragina
  * SPDX-License-Identifier: MPL-2.0
  */
 
diff --git a/include/alpaka/kernel/TaskKernelGenericSycl.hpp b/include/alpaka/kernel/TaskKernelGenericSycl.hpp
index 8dfb20fc6d92..d4f1937415cb 100644
--- a/include/alpaka/kernel/TaskKernelGenericSycl.hpp
+++ b/include/alpaka/kernel/TaskKernelGenericSycl.hpp
@@ -1,10 +1,11 @@
-/* Copyright 2023 Jan Stephan, Andrea Bocci
+/* Copyright 2023 Jan Stephan, Andrea Bocci, Luca Ferragina
  * SPDX-License-Identifier: MPL-2.0
  */
 
 #pragma once
 
 #include "alpaka/acc/Traits.hpp"
+#include "alpaka/block/shared/dyn/BlockSharedDynMemberAllocKiB.hpp"
 #include "alpaka/core/BoostPredef.hpp"
 #include "alpaka/core/STLTuple/STLTuple.hpp"
 #include "alpaka/core/Sycl.hpp"
@@ -48,9 +49,8 @@ namespace alpaka::detail
     template<typename TElem, typename TIdx, std::size_t TDim, typename TAccessModes>
     inline auto require(
         sycl::handler& cgh,
-        experimental::
-            Accessor<detail::SyclAccessor<TElem, DimInt<TDim>::value, TAccessModes>, TElem, TIdx, TDim, TAccessModes>
-                acc,
+        experimental::Accessor<SyclAccessor<TElem, DimInt<TDim>::value, TAccessModes>, TElem, TIdx, TDim, TAccessModes>
+            acc,
         special)
     {
         cgh.require(acc.m_accessor);
@@ -136,7 +136,8 @@ namespace alpaka
 
             auto output_stream = sycl::stream{buf_size, buf_per_work_item, cgh};
 #    endif
-            cgh.parallel_for<detail::kernel<TAcc, TKernelFnObj, TArgs...>>(
+            // cgh.parallel_for<detail::kernel<TAcc, TKernelFnObj, TArgs...>>( //FIXME_
+            cgh.parallel_for(
                 sycl::nd_range<TDim::value>{global_size, local_size},
                 [=](sycl::nd_item<TDim::value> work_item)
                 {
diff --git a/include/alpaka/math/MathGenericSycl.hpp b/include/alpaka/math/MathGenericSycl.hpp
index cf7233826741..d48b32b1704f 100644
--- a/include/alpaka/math/MathGenericSycl.hpp
+++ b/include/alpaka/math/MathGenericSycl.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2022 Jan Stephan
+/* Copyright 2022 Jan Stephan, Luca Ferragina
  * SPDX-License-Identifier: MPL-2.0
  */
 
diff --git a/include/alpaka/mem/buf/BufCpuSyclIntel.hpp b/include/alpaka/mem/buf/BufCpuSyclIntel.hpp
index 967dbc8141a4..0d73ffe281c6 100644
--- a/include/alpaka/mem/buf/BufCpuSyclIntel.hpp
+++ b/include/alpaka/mem/buf/BufCpuSyclIntel.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2022 Jan Stephan
+/* Copyright 2022 Jan Stephan, Luca Ferragina
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -12,7 +12,7 @@
 namespace alpaka
 {
     template<typename TElem, typename TDim, typename TIdx>
-    using BufCpuSyclIntel = BufGenericSycl<TElem, TDim, TIdx, DevCpuSyclIntel>;
+    using BufCpuSyclIntel = BufGenericSycl<TElem, TDim, TIdx, PltfCpuSyclIntel>;
 }
 
 #endif
diff --git a/include/alpaka/mem/buf/BufGenericSycl.hpp b/include/alpaka/mem/buf/BufGenericSycl.hpp
index 825fad4e37f1..b60d70c8b033 100644
--- a/include/alpaka/mem/buf/BufGenericSycl.hpp
+++ b/include/alpaka/mem/buf/BufGenericSycl.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2022 Jan Stephan
+/* Copyright 2022 Jan Stephan, Luca Ferragina, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -12,6 +12,7 @@
 #include "alpaka/mem/buf/BufCpu.hpp"
 #include "alpaka/mem/buf/Traits.hpp"
 #include "alpaka/mem/view/Accessor.hpp"
+#include "alpaka/mem/view/ViewAccessOps.hpp"
 #include "alpaka/vec/Vec.hpp"
 
 #include <memory>
@@ -24,22 +25,22 @@
 namespace alpaka
 {
     //! The SYCL memory buffer.
-    template<typename TElem, typename TDim, typename TIdx, typename TDev>
-    class BufGenericSycl
+    template<typename TElem, typename TDim, typename TIdx, typename TPltf>
+    class BufGenericSycl : public internal::ViewAccessOps<BufGenericSycl<TElem, TDim, TIdx, TPltf>>
     {
+    public:
         static_assert(
             !std::is_const_v<TElem>,
             "The elem type of the buffer can not be const because the C++ Standard forbids containers of const "
             "elements!");
         static_assert(!std::is_const_v<TIdx>, "The idx type of the buffer can not be const!");
 
-    public:
         //! Constructor
-        template<typename TExtent>
-        BufGenericSycl(TDev const& dev, sycl::buffer<TElem, TDim::value> buffer, TExtent const& extent)
+        template<typename TExtent, typename Deleter>
+        BufGenericSycl(DevGenericSycl<TPltf> const& dev, TElem* const pMem, Deleter deleter, TExtent const& extent)
             : m_dev{dev}
             , m_extentElements{getExtentVecEnd<TDim>(extent)}
-            , m_buffer{buffer}
+            , m_spMem(pMem, std::move(deleter))
         {
             ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
 
@@ -53,92 +54,102 @@ namespace alpaka
                 "The idx type of TExtent and the TIdx template parameter have to be identical!");
         }
 
-        TDev m_dev;
+        DevGenericSycl<TPltf> m_dev;
         Vec<TDim, TIdx> m_extentElements;
-        sycl::buffer<TElem, TDim::value> m_buffer;
+        std::shared_ptr<TElem> m_spMem;
     };
 } // namespace alpaka
 
 namespace alpaka::trait
 {
     //! The BufGenericSycl device type trait specialization.
-    template<typename TElem, typename TDim, typename TIdx, typename TDev>
-    struct DevType<BufGenericSycl<TElem, TDim, TIdx, TDev>>
+    template<typename TElem, typename TDim, typename TIdx, typename TPltf>
+    struct DevType<BufGenericSycl<TElem, TDim, TIdx, TPltf>>
     {
-        using type = TDev;
+        using type = DevGenericSycl<TPltf>;
     };
 
     //! The BufGenericSycl device get trait specialization.
-    template<typename TElem, typename TDim, typename TIdx, typename TDev>
-    struct GetDev<BufGenericSycl<TElem, TDim, TIdx, TDev>>
+    template<typename TElem, typename TDim, typename TIdx, typename TPltf>
+    struct GetDev<BufGenericSycl<TElem, TDim, TIdx, TPltf>>
     {
-        static auto getDev(BufGenericSycl<TElem, TDim, TIdx, TDev> const& buf)
+        ALPAKA_FN_HOST static auto getDev(BufGenericSycl<TElem, TDim, TIdx, TPltf> const& buf)
         {
             return buf.m_dev;
         }
     };
 
     //! The BufGenericSycl dimension getter trait specialization.
-    template<typename TElem, typename TDim, typename TIdx, typename TDev>
-    struct DimType<BufGenericSycl<TElem, TDim, TIdx, TDev>>
+    template<typename TElem, typename TDim, typename TIdx, typename TPltf>
+    struct DimType<BufGenericSycl<TElem, TDim, TIdx, TPltf>>
     {
         using type = TDim;
     };
 
     //! The BufGenericSycl memory element type get trait specialization.
-    template<typename TElem, typename TDim, typename TIdx, typename TDev>
-    struct ElemType<BufGenericSycl<TElem, TDim, TIdx, TDev>>
+    template<typename TElem, typename TDim, typename TIdx, typename TPltf>
+    struct ElemType<BufGenericSycl<TElem, TDim, TIdx, TPltf>>
     {
         using type = TElem;
     };
 
     //! The BufGenericSycl extent get trait specialization.
-    template<typename TIdxIntegralConst, typename TElem, typename TDim, typename TIdx, typename TDev>
-    struct GetExtent<TIdxIntegralConst, BufGenericSycl<TElem, TDim, TIdx, TDev>>
+    template<typename TIdxIntegralConst, typename TElem, typename TDim, typename TIdx, typename TPltf>
+    struct GetExtent<TIdxIntegralConst, BufGenericSycl<TElem, TDim, TIdx, TPltf>>
     {
         static_assert(TDim::value > TIdxIntegralConst::value, "Requested dimension out of bounds");
 
-        static auto getExtent(BufGenericSycl<TElem, TDim, TIdx, TDev> const& buf) -> TIdx
+        ALPAKA_FN_HOST static auto getExtent(BufGenericSycl<TElem, TDim, TIdx, TPltf> const& buf) -> TIdx
         {
             return buf.m_extentElements[TIdxIntegralConst::value];
         }
     };
 
     //! The BufGenericSycl native pointer get trait specialization.
-    template<typename TElem, typename TDim, typename TIdx, typename TDev>
-    struct GetPtrNative<BufGenericSycl<TElem, TDim, TIdx, TDev>>
+    template<typename TElem, typename TDim, typename TIdx, typename TPltf>
+    struct GetPtrNative<BufGenericSycl<TElem, TDim, TIdx, TPltf>>
     {
-        static_assert(
-            !sizeof(TElem),
-            "Accessing device-side pointers on the host is not supported by the SYCL back-end");
-
-        static auto getPtrNative(BufGenericSycl<TElem, TDim, TIdx, TDev> const&) -> TElem const*
+        ALPAKA_FN_HOST static auto getPtrNative(BufGenericSycl<TElem, TDim, TIdx, TPltf> const& buf) -> TElem const*
         {
-            return nullptr;
+            return buf.m_spMem.get();
         }
 
-        static auto getPtrNative(BufGenericSycl<TElem, TDim, TIdx, TDev>&) -> TElem*
+        ALPAKA_FN_HOST static auto getPtrNative(BufGenericSycl<TElem, TDim, TIdx, TPltf>& buf) -> TElem*
         {
-            return nullptr;
+            return buf.m_spMem.get();
         }
     };
 
     //! The BufGenericSycl pointer on device get trait specialization.
-    template<typename TElem, typename TDim, typename TIdx, typename TDev>
-    struct GetPtrDev<BufGenericSycl<TElem, TDim, TIdx, TDev>, TDev>
+    template<typename TElem, typename TDim, typename TIdx, typename TPltf>
+    struct GetPtrDev<BufGenericSycl<TElem, TDim, TIdx, TPltf>, DevGenericSycl<TPltf>>
     {
-        static_assert(
-            !sizeof(TElem),
-            "Accessing device-side pointers on the host is not supported by the SYCL back-end");
-
-        static auto getPtrDev(BufGenericSycl<TElem, TDim, TIdx, TDev> const&, TDev const&) -> TElem const*
+        ALPAKA_FN_HOST static auto getPtrDev(
+            BufGenericSycl<TElem, TDim, TIdx, TPltf> const& buf,
+            DevGenericSycl<TPltf> const& dev) -> TElem const*
         {
-            return nullptr;
+            if(dev == getDev(buf))
+            {
+                return buf.m_spMem.get();
+            }
+            else
+            {
+                throw std::runtime_error("The buffer is not accessible from the given device!");
+            }
         }
 
-        static auto getPtrDev(BufGenericSycl<TElem, TDim, TIdx, TDev>&, TDev const&) -> TElem*
+        ALPAKA_FN_HOST static auto getPtrDev(
+            BufGenericSycl<TElem, TDim, TIdx, TPltf>& buf,
+            DevGenericSycl<TPltf> const& dev) -> TElem*
         {
-            return nullptr;
+            if(dev == getDev(buf))
+            {
+                return buf.m_spMem.get();
+            }
+            else
+            {
+                throw std::runtime_error("The buffer is not accessible from the given device!");
+            }
         }
     };
 
@@ -147,68 +158,89 @@ namespace alpaka::trait
     struct BufAlloc<TElem, TDim, TIdx, DevGenericSycl<TPltf>>
     {
         template<typename TExtent>
-        static auto allocBuf(DevGenericSycl<TPltf> const& dev, TExtent const& ext)
-            -> BufGenericSycl<TElem, TDim, TIdx, DevGenericSycl<TPltf>>
+        ALPAKA_FN_HOST static auto allocBuf(DevGenericSycl<TPltf> const& dev, TExtent const& extent)
+            -> BufGenericSycl<TElem, TDim, TIdx, TPltf>
         {
             ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
 
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
             if constexpr(TDim::value == 0 || TDim::value == 1)
             {
-                auto const width = getWidth(ext);
+                auto const width = getWidth(extent);
 
-#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
                 auto const widthBytes = width * static_cast<TIdx>(sizeof(TElem));
                 std::cout << __func__ << " ew: " << width << " ewb: " << widthBytes << '\n';
-#    endif
-
-                auto const range = sycl::range<1>{width};
-                return {dev, sycl::buffer<TElem, 1>{range}, ext};
             }
             else if constexpr(TDim::value == 2)
             {
-                auto const width = getWidth(ext);
-                auto const height = getHeight(ext);
+                auto const width = getWidth(extent);
+                auto const height = getHeight(extent);
 
-#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
                 auto const widthBytes = width * static_cast<TIdx>(sizeof(TElem));
                 std::cout << __func__ << " ew: " << width << " eh: " << height << " ewb: " << widthBytes
                           << " pitch: " << widthBytes << '\n';
-#    endif
-
-                auto const range = sycl::range<2>{width, height};
-                return {dev, sycl::buffer<TElem, 2>{range}, ext};
             }
             else if constexpr(TDim::value == 3)
             {
-                auto const width = getWidth(ext);
-                auto const height = getHeight(ext);
-                auto const depth = getDepth(ext);
+                auto const width = getWidth(extent);
+                auto const height = getHeight(extent);
+                auto const depth = getDepth(extent);
 
-#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
                 auto const widthBytes = width * static_cast<TIdx>(sizeof(TElem));
                 std::cout << __func__ << " ew: " << width << " eh: " << height << " ed: " << depth
                           << " ewb: " << widthBytes << " pitch: " << widthBytes << '\n';
+            }
 #    endif
 
-                auto const range = sycl::range<3>{width, height, depth};
-                return {dev, sycl::buffer<TElem, 3>{range}, ext};
-            }
+            auto* memPtr = sycl::malloc_device<TElem>(
+                static_cast<std::size_t>(getExtentProduct(extent)),
+                dev.getNativeHandle().first,
+                dev.getNativeHandle().second);
+            auto deleter = [&dev](TElem* ptr) { sycl::free(ptr, dev.getNativeHandle().second); };
+
+            return BufGenericSycl<TElem, TDim, TIdx, TPltf>(dev, memPtr, std::move(deleter), extent);
         }
     };
 
+    //! The BufGenericSycl stream-ordered memory allocation capability trait specialization.
+    template<typename TDim, typename TPltf>
+    struct HasAsyncBufSupport<TDim, DevGenericSycl<TPltf>> : public std::false_type
+    {
+    };
+
     //! The BufGenericSycl offset get trait specialization.
-    template<typename TIdxIntegralConst, typename TElem, typename TDim, typename TIdx, typename TDev>
-    struct GetOffset<TIdxIntegralConst, BufGenericSycl<TElem, TDim, TIdx, TDev>>
+    template<typename TIdxIntegralConst, typename TElem, typename TDim, typename TIdx, typename TPltf>
+    struct GetOffset<TIdxIntegralConst, BufGenericSycl<TElem, TDim, TIdx, TPltf>>
     {
-        static auto getOffset(BufGenericSycl<TElem, TDim, TIdx, TDev> const&) -> TIdx
+        static auto getOffset(BufGenericSycl<TElem, TDim, TIdx, TPltf> const&) -> TIdx
         {
             return 0u;
         }
     };
 
+    //! The pinned/mapped memory allocation trait specialization for the SYCL devices.
+    template<typename TPltf, typename TElem, typename TDim, typename TIdx>
+    struct BufAllocMapped
+    {
+        template<typename TExtent>
+        ALPAKA_FN_HOST static auto allocMappedBuf(DevCpu const& host, TExtent const& extent)
+            -> BufCpu<TElem, TDim, TIdx>
+        {
+            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+            // Allocate SYCL page-locked memory on the host, mapped into the TPltf address space and
+            // accessible to all devices in the TPltf.
+            auto ctx = TPltf::syclContext();
+            TElem* memPtr = sycl::malloc_host<TElem>(static_cast<std::size_t>(getExtentProduct(extent)), ctx);
+            auto deleter = [ctx](TElem* ptr) { sycl::free(ptr, ctx); };
+
+            return BufCpu<TElem, TDim, TIdx>(host, memPtr, std::move(deleter), extent);
+        }
+    };
+
     //! The BufGenericSycl idx type trait specialization.
-    template<typename TElem, typename TDim, typename TIdx, typename TDev>
-    struct IdxType<BufGenericSycl<TElem, TDim, TIdx, TDev>>
+    template<typename TElem, typename TDim, typename TIdx, typename TPltf>
+    struct IdxType<BufGenericSycl<TElem, TDim, TIdx, TPltf>>
     {
         using type = TIdx;
     };
@@ -217,16 +249,14 @@ namespace alpaka::trait
     template<typename TElem, typename TDim, typename TIdx, typename TPltf>
     struct GetPtrDev<BufCpu<TElem, TDim, TIdx>, DevGenericSycl<TPltf>>
     {
-        static_assert(!sizeof(TElem), "Accessing host pointers on the device is not supported by the SYCL back-end");
-
-        static auto getPtrDev(BufCpu<TElem, TDim, TIdx> const&, DevGenericSycl<TPltf> const&) -> TElem const*
+        ALPAKA_FN_HOST static auto getPtrDev(BufCpu<TElem, TDim, TIdx> const& buf, DevGenericSycl<TPltf> const&)
+            -> TElem const*
         {
-            return nullptr;
+            return getPtrNative(buf);
         }
-
-        static auto getPtrDev(BufCpu<TElem, TDim, TIdx>&, DevGenericSycl<TPltf> const&) -> TElem*
+        ALPAKA_FN_HOST static auto getPtrDev(BufCpu<TElem, TDim, TIdx>& buf, DevGenericSycl<TPltf> const&) -> TElem*
         {
-            return nullptr;
+            return getPtrNative(buf);
         }
     };
 } // namespace alpaka::trait
diff --git a/include/alpaka/mem/buf/BufGpuSyclIntel.hpp b/include/alpaka/mem/buf/BufGpuSyclIntel.hpp
index 0b27ab2ae1f9..cd12f975c8d4 100644
--- a/include/alpaka/mem/buf/BufGpuSyclIntel.hpp
+++ b/include/alpaka/mem/buf/BufGpuSyclIntel.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2022 Jan Stephan
+/* Copyright 2022 Jan Stephan, Luca Ferragina
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -12,7 +12,7 @@
 namespace alpaka
 {
     template<typename TElem, typename TDim, typename TIdx>
-    using BufGpuSyclIntel = BufGenericSycl<TElem, TDim, TIdx, DevGpuSyclIntel>;
+    using BufGpuSyclIntel = BufGenericSycl<TElem, TDim, TIdx, PltfGpuSyclIntel>;
 }
 
 #endif
diff --git a/include/alpaka/mem/buf/sycl/Copy.hpp b/include/alpaka/mem/buf/sycl/Copy.hpp
index e6b1ae03836e..cfede497adb1 100644
--- a/include/alpaka/mem/buf/sycl/Copy.hpp
+++ b/include/alpaka/mem/buf/sycl/Copy.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2022 Jan Stephan
+/* Copyright 2022 Jan Stephan, Luca Ferragina, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -14,6 +14,9 @@
 #include "alpaka/extent/Traits.hpp"
 #include "alpaka/mem/buf/sycl/Common.hpp"
 #include "alpaka/mem/view/Traits.hpp"
+#include "alpaka/meta/NdLoop.hpp"
+#include "alpaka/queue/QueueGenericSyclBlocking.hpp"
+#include "alpaka/queue/QueueGenericSyclNonBlocking.hpp"
 
 #include <memory>
 #include <type_traits>
@@ -24,41 +27,145 @@
 
 namespace alpaka::detail
 {
-    template<typename TElem, std::size_t TDim>
-    using SrcAccessor = sycl::
-        accessor<TElem, TDim, sycl::access_mode::read, sycl::target::global_buffer, sycl::access::placeholder::true_t>;
-
-    template<typename TElem, std::size_t TDim>
-    using DstAccessor = sycl::accessor<
-        TElem,
-        TDim,
-        sycl::access_mode::write,
-        sycl::target::global_buffer,
-        sycl::access::placeholder::true_t>;
-
-    enum class Direction
+    //!  The Sycl device memory copy task base.
+    template<typename TDim, typename TViewDst, typename TViewSrc, typename TExtent>
+    struct TaskCopySyclBase
     {
-        h2d,
-        d2h,
-        d2d
+        static_assert(
+            std::is_same_v<std::remove_const_t<alpaka::Elem<TViewSrc>>, std::remove_const_t<alpaka::Elem<TViewDst>>>,
+            "The source and the destination view are required to have the same element type!");
+        using ExtentSize = Idx<TExtent>;
+        using DstSize = Idx<TViewDst>;
+        using SrcSize = Idx<TViewSrc>;
+        using Elem = alpaka::Elem<TViewSrc>;
+
+        template<typename TViewFwd>
+        TaskCopySyclBase(TViewFwd&& viewDst, TViewSrc const& viewSrc, TExtent const& extent)
+            : m_extent(getExtentVec(extent))
+            , m_extentWidthBytes(m_extent[TDim::value - 1u] * static_cast<ExtentSize>(sizeof(Elem)))
+#    if(!defined(NDEBUG)) || (ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL)
+            , m_dstExtent(getExtentVec(viewDst))
+            , m_srcExtent(getExtentVec(viewSrc))
+#    endif
+            , m_dstPitchBytes(getPitchBytesVec(viewDst))
+            , m_srcPitchBytes(getPitchBytesVec(viewSrc))
+            , m_dstMemNative(reinterpret_cast<std::uint8_t*>(getPtrNative(viewDst)))
+            , m_srcMemNative(reinterpret_cast<std::uint8_t const*>(getPtrNative(viewSrc)))
+        {
+            if constexpr(TDim::value > 0)
+            {
+                ALPAKA_ASSERT((castVec<DstSize>(m_extent) <= m_dstExtent).foldrAll(std::logical_or<bool>()));
+                ALPAKA_ASSERT((castVec<SrcSize>(m_extent) <= m_srcExtent).foldrAll(std::logical_or<bool>()));
+            }
+        }
+
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+        ALPAKA_FN_HOST auto printDebug() const -> void
+        {
+            std::cout << __func__ << " e: " << m_extent << " ewb: " << this->m_extentWidthBytes
+                      << " de: " << m_dstExtent << " dptr: " << reinterpret_cast<void*>(m_dstMemNative)
+                      << " se: " << m_srcExtent << " sptr: " << reinterpret_cast<void const*>(m_srcMemNative)
+                      << std::endl;
+        }
+#    endif
+
+        Vec<TDim, ExtentSize> const m_extent;
+        ExtentSize const m_extentWidthBytes;
+#    if(!defined(NDEBUG)) || (ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL)
+        Vec<TDim, DstSize> const m_dstExtent;
+        Vec<TDim, SrcSize> const m_srcExtent;
+#    endif
+
+        Vec<TDim, DstSize> const m_dstPitchBytes;
+        Vec<TDim, SrcSize> const m_srcPitchBytes;
+        std::uint8_t* const m_dstMemNative;
+        std::uint8_t const* const m_srcMemNative;
+        static constexpr auto is_sycl_task = true;
     };
 
-    template<typename TSrc, typename TDst, Direction TDirection>
-    struct TaskCopySycl
+    //! The Sycl device ND memory copy task.
+    template<typename TDim, typename TViewDst, typename TViewSrc, typename TExtent>
+    struct TaskCopySycl : public TaskCopySyclBase<TDim, TViewDst, TViewSrc, TExtent>
     {
-        auto operator()(sycl::handler& cgh) const -> void
+        using DimMin1 = DimInt<TDim::value - 1u>;
+        using typename TaskCopySyclBase<TDim, TViewDst, TViewSrc, TExtent>::ExtentSize;
+        using typename TaskCopySyclBase<TDim, TViewDst, TViewSrc, TExtent>::DstSize;
+        using typename TaskCopySyclBase<TDim, TViewDst, TViewSrc, TExtent>::SrcSize;
+
+        using TaskCopySyclBase<TDim, TViewDst, TViewSrc, TExtent>::TaskCopySyclBase;
+
+        ALPAKA_FN_HOST auto operator()(sycl::handler& cgh) const -> void
+        {
+            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+            this->printDebug();
+#    endif
+            if(static_cast<std::size_t>(this->m_extent.prod()) != 0u)
+            {
+                cgh.memcpy(
+                    reinterpret_cast<void*>(this->m_dstMemNative),
+                    reinterpret_cast<void const*>(this->m_srcMemNative),
+                    static_cast<std::size_t>(this->m_extentWidthBytes * this->m_extent.prod()));
+            }
+        }
+    };
+
+    //! The SYCL device 1D memory copy task.
+    template<typename TViewDst, typename TViewSrc, typename TExtent>
+    struct TaskCopySycl<DimInt<1u>, TViewDst, TViewSrc, TExtent>
+        : TaskCopySyclBase<DimInt<1u>, TViewDst, TViewSrc, TExtent>
+    {
+        using TaskCopySyclBase<DimInt<1u>, TViewDst, TViewSrc, TExtent>::TaskCopySyclBase;
+
+        ALPAKA_FN_HOST auto operator()(sycl::handler& cgh) const -> void
         {
-            if constexpr(TDirection == Direction::d2h || TDirection == Direction::d2d)
-                cgh.require(m_src);
+            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+            this->printDebug();
+#    endif
+            if(static_cast<std::size_t>(this->m_extent.prod()) != 0u)
+            {
+                cgh.memcpy(
+                    reinterpret_cast<void*>(this->m_dstMemNative),
+                    reinterpret_cast<void const*>(this->m_srcMemNative),
+                    static_cast<std::size_t>(this->m_extentWidthBytes));
+            }
+        }
+    };
 
-            if constexpr(TDirection == Direction::h2d || TDirection == Direction::d2d)
-                cgh.require(m_dst);
+    //! The scalar SYCL memory copy trait.
+    template<typename TViewDst, typename TViewSrc, typename TExtent>
+    struct TaskCopySycl<DimInt<0u>, TViewDst, TViewSrc, TExtent>
+    {
+        static_assert(
+            std::is_same_v<std::remove_const_t<alpaka::Elem<TViewSrc>>, std::remove_const_t<alpaka::Elem<TViewDst>>>,
+            "The source and the destination view are required to have the same element type!");
+
+        using Elem = alpaka::Elem<TViewSrc>;
+
+        template<typename TViewDstFwd>
+        ALPAKA_FN_HOST TaskCopySycl(
+            TViewDstFwd&& viewDst,
+            TViewSrc const& viewSrc,
+            [[maybe_unused]] TExtent const& extent)
+            : m_dstMemNative(reinterpret_cast<void*>(getPtrNative(viewDst)))
+            , m_srcMemNative(reinterpret_cast<void const*>(getPtrNative(viewSrc)))
+        {
+            // all zero-sized extents are equivalent
+            ALPAKA_ASSERT(getExtentVec(extent).prod() == 1u);
+            ALPAKA_ASSERT(getExtentVec(viewDst).prod() == 1u);
+            ALPAKA_ASSERT(getExtentVec(viewSrc).prod() == 1u);
+        }
 
-            cgh.copy(m_src, m_dst);
+        auto operator()(sycl::handler& cgh) const -> void
+        {
+            cgh.memcpy(m_dstMemNative, m_srcMemNative, sizeof(Elem));
         }
 
-        TSrc m_src;
-        TDst m_dst;
+        void* m_dstMemNative;
+        void const* m_srcMemNative;
         static constexpr auto is_sycl_task = true;
     };
 } // namespace alpaka::detail
@@ -67,76 +174,53 @@ namespace alpaka::detail
 namespace alpaka::trait
 {
     //! The SYCL host-to-device memory copy trait specialization.
-    template<typename TDim, typename TPltf>
+    template<typename TPltf, typename TDim>
     struct CreateTaskMemcpy<TDim, DevGenericSycl<TPltf>, DevCpu>
     {
         template<typename TExtent, typename TViewSrc, typename TViewDstFwd>
-        static auto createTaskMemcpy(TViewDstFwd&& viewDst, TViewSrc const& viewSrc, TExtent const& ext)
+        ALPAKA_FN_HOST static auto createTaskMemcpy(
+            TViewDstFwd&& viewDst,
+            TViewSrc const& viewSrc,
+            TExtent const& extent)
+            -> alpaka::detail::TaskCopySycl<TDim, std::remove_reference_t<TViewDstFwd>, TViewSrc, TExtent>
         {
             ALPAKA_DEBUG_FULL_LOG_SCOPE;
 
-            constexpr auto copy_dim = static_cast<int>(Dim<TExtent>::value);
-            using ElemType = Elem<std::remove_const_t<TViewSrc>>;
-            using SrcType = ElemType const*;
-            using DstType = alpaka::detail::DstAccessor<ElemType, copy_dim>;
-
-            auto const range = detail::make_sycl_range(ext);
-            auto const offset = detail::make_sycl_offset(viewDst);
-
-            return detail::TaskCopySycl<SrcType, DstType, detail::Direction::h2d>{
-                getPtrNative(viewSrc),
-                DstType{viewDst.m_buffer, range, offset}};
+            return {std::forward<TViewDstFwd>(viewDst), viewSrc, extent};
         }
     };
 
     //! The SYCL device-to-host memory copy trait specialization.
-    template<typename TDim, typename TPltf>
+    template<typename TPltf, typename TDim>
     struct CreateTaskMemcpy<TDim, DevCpu, DevGenericSycl<TPltf>>
     {
         template<typename TExtent, typename TViewSrc, typename TViewDstFwd>
-        static auto createTaskMemcpy(TViewDstFwd&& viewDst, TViewSrc const& viewSrc, TExtent const& ext)
+        ALPAKA_FN_HOST static auto createTaskMemcpy(
+            TViewDstFwd&& viewDst,
+            TViewSrc const& viewSrc,
+            TExtent const& extent)
+            -> alpaka::detail::TaskCopySycl<TDim, std::remove_reference_t<TViewDstFwd>, TViewSrc, TExtent>
         {
             ALPAKA_DEBUG_FULL_LOG_SCOPE;
 
-            constexpr auto copy_dim = static_cast<int>(Dim<TExtent>::value);
-            using ElemType = Elem<std::remove_const_t<TViewSrc>>;
-            using SrcType = alpaka::detail::SrcAccessor<ElemType, copy_dim>;
-            using DstType = ElemType*;
-
-            auto const range = detail::make_sycl_range(ext);
-            auto const offset = detail::make_sycl_offset(viewSrc);
-
-            auto view_src = const_cast<TViewSrc&>(viewSrc);
-
-            return detail::TaskCopySycl<SrcType, DstType, detail::Direction::d2h>{
-                SrcType{view_src.m_buffer, range, offset},
-                getPtrNative(viewDst)};
+            return {std::forward<TViewDstFwd>(viewDst), viewSrc, extent};
         }
     };
 
     //! The SYCL device-to-device memory copy trait specialization.
-    template<typename TDim, typename TPltfDst, typename TPltfSrc>
+    template<typename TPltfDst, typename TPltfSrc, typename TDim>
     struct CreateTaskMemcpy<TDim, DevGenericSycl<TPltfDst>, DevGenericSycl<TPltfSrc>>
     {
         template<typename TExtent, typename TViewSrc, typename TViewDstFwd>
-        static auto createTaskMemcpy(TViewDstFwd&& viewDst, TViewSrc const& viewSrc, TExtent const& ext)
+        ALPAKA_FN_HOST static auto createTaskMemcpy(
+            TViewDstFwd&& viewDst,
+            TViewSrc const& viewSrc,
+            TExtent const& extent)
+            -> alpaka::detail::TaskCopySycl<TDim, std::remove_reference_t<TViewDstFwd>, TViewSrc, TExtent>
         {
             ALPAKA_DEBUG_FULL_LOG_SCOPE;
 
-            constexpr auto copy_dim = static_cast<int>(Dim<TExtent>::value);
-            using ElemType = Elem<std::remove_const_t<TViewSrc>>;
-            using SrcType = alpaka::detail::SrcAccessor<ElemType, copy_dim>;
-            using DstType = alpaka::detail::DstAccessor<ElemType, copy_dim>;
-
-            auto const range = detail::make_sycl_range(ext);
-            auto const offset_src = detail::make_sycl_offset(viewSrc);
-            auto const offset_dst = detail::make_sycl_offset(viewDst);
-
-            auto view_src = const_cast<TViewSrc&>(viewSrc);
-
-            return detail::TaskCopySycl<SrcType, DstType, detail::Direction::d2d>{
-                SrcType{view_src.m_buffer, range, offset_src},
-                DstType{viewDst.m_buffer, range, offset_dst}};
+            return {std::forward<TViewDstFwd>(viewDst), viewSrc, extent};
         }
     };
 } // namespace alpaka::trait
diff --git a/include/alpaka/mem/buf/sycl/Set.hpp b/include/alpaka/mem/buf/sycl/Set.hpp
index f584e0461e87..25d602228424 100644
--- a/include/alpaka/mem/buf/sycl/Set.hpp
+++ b/include/alpaka/mem/buf/sycl/Set.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2022 Jan Stephan
+/* Copyright 2022 Jan Stephan, Luca Ferragina, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -12,6 +12,9 @@
 #include "alpaka/extent/Traits.hpp"
 #include "alpaka/mem/buf/sycl/Common.hpp"
 #include "alpaka/mem/view/Traits.hpp"
+#include "alpaka/meta/NdLoop.hpp"
+#include "alpaka/queue/QueueGenericSyclBlocking.hpp"
+#include "alpaka/queue/QueueGenericSyclNonBlocking.hpp"
 #include "alpaka/queue/Traits.hpp"
 
 #include <cstddef>
@@ -25,29 +28,144 @@ namespace alpaka
 
     namespace detail
     {
-        template<std::size_t TDim>
-        using Accessor = sycl::accessor<
-            std::byte,
-            TDim,
-            sycl::access_mode::write,
-            sycl::target::global_buffer,
-            sycl::access::placeholder::true_t>;
-
-        //! The SYCL memory set trait.
-        template<typename TAccessor>
-        struct TaskSetSycl
+        //!  The SYCL ND memory set task base.
+        template<typename TDim, typename TView, typename TExtent>
+        struct TaskSetSyclBase
         {
+            using ExtentSize = Idx<TExtent>;
+            using DstSize = Idx<TView>;
+            using Elem = alpaka::Elem<TView>;
+
+            template<typename TViewFwd>
+            TaskSetSyclBase(TViewFwd&& view, std::uint8_t const& byte, TExtent const& extent)
+                : m_byte(byte)
+                , m_extent(getExtentVec(extent))
+                , m_extentWidthBytes(m_extent[TDim::value - 1u] * static_cast<ExtentSize>(sizeof(Elem)))
+#    if(!defined(NDEBUG)) || (ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL)
+                , m_dstExtent(getExtentVec(view))
+#    endif
+
+                , m_dstPitchBytes(getPitchBytesVec(view))
+                , m_dstMemNative(reinterpret_cast<std::uint8_t*>(getPtrNative(view)))
+
+            {
+                ALPAKA_ASSERT((castVec<DstSize>(m_extent) <= m_dstExtent).foldrAll(std::logical_or<bool>()));
+                ALPAKA_ASSERT(m_extentWidthBytes <= m_dstPitchBytes[TDim::value - 1u]);
+            }
+
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+            ALPAKA_FN_HOST auto printDebug() const -> void
+            {
+                std::cout << __func__ << " e: " << this->m_extent << " ewb: " << this->m_extentWidthBytes
+                          << " de: " << this->m_dstExtent << " dptr: " << reinterpret_cast<void*>(this->m_dstMemNative)
+                          << " dpitchb: " << this->m_dstPitchBytes << std::endl;
+            }
+#    endif
+
+            std::uint8_t const m_byte;
+            Vec<TDim, ExtentSize> const m_extent;
+            ExtentSize const m_extentWidthBytes;
+#    if(!defined(NDEBUG)) || (ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL)
+            Vec<TDim, DstSize> const m_dstExtent;
+#    endif
+            Vec<TDim, DstSize> const m_dstPitchBytes;
+            std::uint8_t* const m_dstMemNative;
+            static constexpr auto is_sycl_task = true;
+        };
+
+        //! The SYCL device ND memory set task.
+        template<typename TDim, typename TView, typename TExtent>
+        struct TaskSetSycl : public TaskSetSyclBase<TDim, TView, TExtent>
+        {
+            using DimMin1 = DimInt<TDim::value - 1u>;
+            using typename TaskSetSyclBase<TDim, TView, TExtent>::ExtentSize;
+            using typename TaskSetSyclBase<TDim, TView, TExtent>::DstSize;
+
+            using TaskSetSyclBase<TDim, TView, TExtent>::TaskSetSyclBase;
+
             auto operator()(sycl::handler& cgh) const -> void
             {
-                cgh.require(m_accessor);
-                cgh.fill(m_accessor, m_value);
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+                this->printDebug();
+#    endif
+                if(static_cast<std::size_t>(this->m_extent.prod()) != 0u)
+                {
+                    cgh.memset(
+                        reinterpret_cast<void*>(this->m_dstMemNative),
+                        this->m_byte,
+                        static_cast<std::size_t>(this->m_extentWidthBytes * this->m_extent.prod()));
+                }
             }
+        };
 
-            TAccessor m_accessor;
-            std::byte m_value;
-            // Distinguish from non-alpaka types (= host tasks)
+        //! The 1D SYCL memory set task.
+        template<typename TView, typename TExtent>
+        struct TaskSetSycl<DimInt<1u>, TView, TExtent> : public TaskSetSyclBase<DimInt<1u>, TView, TExtent>
+        {
+            using TaskSetSyclBase<DimInt<1u>, TView, TExtent>::TaskSetSyclBase;
+
+            auto operator()(sycl::handler& cgh) const -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+                this->printDebug();
+#    endif
+                if(static_cast<std::size_t>(this->m_extent.prod()) != 0u)
+                {
+                    cgh.memset(
+                        reinterpret_cast<void*>(this->m_dstMemNative),
+                        this->m_byte,
+                        static_cast<std::size_t>(this->m_extentWidthBytes));
+                }
+            }
+        };
+
+        //! The SYCL device scalar memory set task.
+        template<typename TView, typename TExtent>
+        struct TaskSetSycl<DimInt<0u>, TView, TExtent>
+        {
+            using ExtentSize = Idx<TExtent>;
+            using Scalar = Vec<DimInt<0u>, ExtentSize>;
+            using DstSize = Idx<TView>;
+            using Elem = alpaka::Elem<TView>;
+
+            template<typename TViewFwd>
+            TaskSetSycl(TViewFwd&& view, std::uint8_t const& byte, [[maybe_unused]] TExtent const& extent)
+                : m_byte(byte)
+                , m_dstMemNative(reinterpret_cast<std::uint8_t*>(getPtrNative(view)))
+            {
+                // all zero-sized extents are equivalent
+                ALPAKA_ASSERT(getExtentVec(extent).prod() == 1u);
+                ALPAKA_ASSERT(getExtentVec(view).prod() == 1u);
+            }
+
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+            ALPAKA_FN_HOST auto printDebug() const -> void
+            {
+                std::cout << __func__ << " e: " << Scalar() << " ewb: " << sizeof(Elem) << " de: " << Scalar()
+                          << " dptr: " << reinterpret_cast<void*>(m_dstMemNative) << " dpitchb: " << Scalar()
+                          << std::endl;
+            }
+#    endif
+
+            auto operator()(sycl::handler& cgh) const -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+                printDebug();
+#    endif
+                cgh.memset(reinterpret_cast<void*>(m_dstMemNative), m_byte, sizeof(Elem));
+            }
+
+            std::uint8_t const m_byte;
+            std::uint8_t* const m_dstMemNative;
             static constexpr auto is_sycl_task = true;
         };
+
     } // namespace detail
 
 
@@ -57,25 +175,15 @@ namespace alpaka
         template<typename TDim, typename TPltf>
         struct CreateTaskMemset<TDim, DevGenericSycl<TPltf>>
         {
-            template<typename TExtent, typename TViewFwd>
-            static auto createTaskMemset(TViewFwd&& view, std::uint8_t const& byte, TExtent const& ext)
+            template<typename TExtent, typename TView>
+            ALPAKA_FN_HOST static auto createTaskMemset(TView& view, std::uint8_t const& byte, TExtent const& extent)
+                -> detail::TaskSetSycl<TDim, TView, TExtent>
             {
-                ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-                constexpr auto set_dim = static_cast<int>(Dim<TExtent>::value);
-                using TView = std::remove_reference_t<TViewFwd>;
-                using ElemType = Elem<TView>;
-                using DstType = alpaka::detail::Accessor<set_dim>;
-
-                // Reinterpret as byte buffer
-                auto buf = view.m_buffer.template reinterpret<std::byte>();
-                auto const byte_val = static_cast<std::byte>(byte);
-
-                auto const range = detail::make_sycl_range(ext, sizeof(ElemType));
-                return detail::TaskSetSycl<DstType>{DstType{buf, range}, byte_val};
+                return detail::TaskSetSycl<TDim, TView, TExtent>(view, byte, extent);
             }
         };
+
     } // namespace trait
-} // namespace alpaka
 
+} // namespace alpaka
 #endif
diff --git a/include/alpaka/mem/fence/MemFenceGenericSycl.hpp b/include/alpaka/mem/fence/MemFenceGenericSycl.hpp
index c3d96113e13f..c0a308448f8e 100644
--- a/include/alpaka/mem/fence/MemFenceGenericSycl.hpp
+++ b/include/alpaka/mem/fence/MemFenceGenericSycl.hpp
@@ -32,6 +32,13 @@ namespace alpaka
             static constexpr auto scope = sycl::memory_scope::device;
             static constexpr auto space = sycl::access::address_space::global_space;
         };
+
+        template<>
+        struct SyclFenceProps<alpaka::memory_scope::Grid>
+        {
+            static constexpr auto scope = sycl::memory_scope::device;
+            static constexpr auto space = sycl::access::address_space::global_space;
+        };
     } // namespace detail
 
     //! The SYCL memory fence.
diff --git a/include/alpaka/meta/SyclPrintf.hpp b/include/alpaka/meta/SyclPrintf.hpp
new file mode 100644
index 000000000000..ef432e15a87f
--- /dev/null
+++ b/include/alpaka/meta/SyclPrintf.hpp
@@ -0,0 +1,35 @@
+/* Copyright 2022 Andrea Bocci
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#include <CL/sycl.hpp>
+
+#ifdef ALPAKA_ACC_SYCL_ENABLED
+
+// Kill printf in AMD GPU code because of missing compiler support
+#    ifdef __AMDGCN__
+#        include <cstdio> // the define breaks <cstdio> if it is included afterwards
+#        define printf(...)
+#    else
+
+#        ifdef __SYCL_DEVICE_ONLY__
+#            define CONSTANT __attribute__((opencl_constant))
+#        else
+#            define CONSTANT
+#        endif
+
+#        define printf(FORMAT, ...)                                                                                   \
+            do                                                                                                        \
+            {                                                                                                         \
+                static const CONSTANT char format[] = FORMAT;                                                         \
+                sycl::ext::oneapi::experimental::printf(format, ##__VA_ARGS__);                                       \
+            } while(false)
+#    endif
+#endif
diff --git a/include/alpaka/pltf/PltfCpuSyclIntel.hpp b/include/alpaka/pltf/PltfCpuSyclIntel.hpp
index b8f3815ddaed..1c28a1288610 100644
--- a/include/alpaka/pltf/PltfCpuSyclIntel.hpp
+++ b/include/alpaka/pltf/PltfCpuSyclIntel.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2023 Jan Stephan, Andrea Bocci
+/* Copyright 2023 Jan Stephan, Luca Ferragina, Andrea Bocci
  * SPDX-License-Identifier: MPL-2.0
  */
 
diff --git a/include/alpaka/pltf/PltfFpgaSyclIntel.hpp b/include/alpaka/pltf/PltfFpgaSyclIntel.hpp
index ef0b0cfa35c6..1d908c832f1b 100644
--- a/include/alpaka/pltf/PltfFpgaSyclIntel.hpp
+++ b/include/alpaka/pltf/PltfFpgaSyclIntel.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2023 Jan Stephan, Andrea Bocci
+/* Copyright 2023 Jan Stephan, Andrea Bocci, Luca Ferragina
  * SPDX-License-Identifier: MPL-2.0
  */
 
diff --git a/include/alpaka/pltf/PltfGenericSycl.hpp b/include/alpaka/pltf/PltfGenericSycl.hpp
index e1e0d38ea174..bd7f351c17a8 100644
--- a/include/alpaka/pltf/PltfGenericSycl.hpp
+++ b/include/alpaka/pltf/PltfGenericSycl.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2023 Jan Stephan
+/* Copyright 2023 Jan Stephan, Luca Ferragina
  * SPDX-License-Identifier: MPL-2.0
  */
 
diff --git a/include/alpaka/pltf/PltfGpuSyclIntel.hpp b/include/alpaka/pltf/PltfGpuSyclIntel.hpp
index 4c54e6c6ec1c..be1ae4643810 100644
--- a/include/alpaka/pltf/PltfGpuSyclIntel.hpp
+++ b/include/alpaka/pltf/PltfGpuSyclIntel.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2023 Jan Stephan, Andrea Bocci
+/* Copyright 2023 Jan Stephan, Andrea Bocci, Luca Ferragina
  * SPDX-License-Identifier: MPL-2.0
  */
 
diff --git a/include/alpaka/queue/QueueCpuSyclIntelBlocking.hpp b/include/alpaka/queue/QueueCpuSyclIntelBlocking.hpp
index 844375913d95..039a05b3229f 100644
--- a/include/alpaka/queue/QueueCpuSyclIntelBlocking.hpp
+++ b/include/alpaka/queue/QueueCpuSyclIntelBlocking.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2022 Jan Stephan
+/* Copyright 2022 Jan Stephan, Luca Ferragina
  * SPDX-License-Identifier: MPL-2.0
  */
 
diff --git a/include/alpaka/queue/QueueCpuSyclIntelNonBlocking.hpp b/include/alpaka/queue/QueueCpuSyclIntelNonBlocking.hpp
index 77f20c1acbba..02391231f5f5 100644
--- a/include/alpaka/queue/QueueCpuSyclIntelNonBlocking.hpp
+++ b/include/alpaka/queue/QueueCpuSyclIntelNonBlocking.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2022 Jan Stephan
+/* Copyright 2022 Jan Stephan, Luca Ferragina
  * SPDX-License-Identifier: MPL-2.0
  */
 
diff --git a/include/alpaka/queue/sycl/QueueGenericSyclBase.hpp b/include/alpaka/queue/sycl/QueueGenericSyclBase.hpp
index 297b29732f24..2db22b6d3abb 100644
--- a/include/alpaka/queue/sycl/QueueGenericSyclBase.hpp
+++ b/include/alpaka/queue/sycl/QueueGenericSyclBase.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2022 Jan Stephan, Antonio Di Pilato
+/* Copyright 2022 Jan Stephan, Antonio Di Pilato, Luca Ferragina
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -8,6 +8,7 @@
 #include "alpaka/dev/Traits.hpp"
 #include "alpaka/event/Traits.hpp"
 #include "alpaka/queue/Traits.hpp"
+#include "alpaka/traits/Traits.hpp"
 #include "alpaka/wait/Traits.hpp"
 
 #include <algorithm>
diff --git a/include/alpaka/rand/RandGenericSycl.hpp b/include/alpaka/rand/RandGenericSycl.hpp
new file mode 100644
index 000000000000..bd274e9a526a
--- /dev/null
+++ b/include/alpaka/rand/RandGenericSycl.hpp
@@ -0,0 +1,238 @@
+/* Copyright 2022 Luca Ferragina
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#if defined(ALPAKA_ACC_SYCL_ENABLED)
+
+#    include <alpaka/core/BoostPredef.hpp>
+#    include <alpaka/core/Concepts.hpp>
+#    include <alpaka/dev/DevGenericSycl.hpp>
+#    include <alpaka/rand/Traits.hpp>
+
+// Backend specific imports.
+#    if defined(ALPAKA_ACC_SYCL_ENABLED)
+#        include <CL/sycl.hpp>
+#        include <oneapi/mkl/rng.hpp>
+
+#        include <oneapi/dpl/random>
+#    endif
+
+#    include <type_traits>
+
+namespace alpaka::rand
+{
+    //! The SYCL rand implementation.
+    class RandGenericSycl : public concepts::Implements<ConceptRand, RandGenericSycl>
+    {
+    };
+
+#    if !defined(ALPAKA_HOST_ONLY)
+
+    namespace distribution::sycl_rand
+    {
+        //! The SYCL random number floating point normal distribution.
+        template<typename T>
+        class NormalReal;
+
+        //! The SYCL random number floating point uniform distribution.
+        template<typename T>
+        class UniformReal;
+
+        //! The SYCL random number integer uniform distribution.
+        template<typename T>
+        class UniformUint;
+    } // namespace distribution::sycl_rand
+
+    namespace engine::sycl_rand
+    {
+        //! The SYCL linear congruential random number generator engine.
+        class Minstd
+        {
+        public:
+            // After calling this constructor the instance is not valid initialized and
+            // need to be overwritten with a valid object
+            Minstd() = default;
+
+            Minstd(std::uint32_t const& seed, std::uint32_t const& subsequence = 0, std::uint32_t const& offset = 0)
+            {
+                oneapi::dpl::minstd_rand engine(seed, offset);
+                rng_engine = engine;
+            }
+
+        private:
+            template<typename T>
+            friend class distribution::sycl_rand::NormalReal;
+            template<typename T>
+            friend class distribution::sycl_rand::UniformReal;
+            template<typename T>
+            friend class distribution::sycl_rand::UniformUint;
+
+            oneapi::dpl::minstd_rand rng_engine;
+
+        public:
+            using result_type = decltype(rng_engine);
+
+            // ALPAKA_FN_HOST_ACC constexpr static result_type min()
+            // {
+            //     return std::numeric_limits<result_type>::min();
+            // }
+            // ALPAKA_FN_HOST_ACC constexpr static result_type max()
+            // {
+            //     return std::numeric_limits<result_type>::max();
+            // }
+            result_type operator()()
+            {
+                return rng_engine;
+            }
+        };
+    } // namespace engine::sycl_rand
+
+    namespace distribution::sycl_rand
+    {
+        //! The SYCL random number float normal distribution.
+        template<>
+        class NormalReal<float>
+        {
+        public:
+            template<typename TEngine>
+            auto operator()(TEngine& engine) -> float
+            {
+                // Create float uniform_real_distribution distribution
+                oneapi::dpl::normal_distribution<float> distr;
+
+                // Generate float random number
+                return distr(engine);
+            }
+        };
+
+        //! The SYCL random number double normal distribution.
+        template<>
+        class NormalReal<double>
+        {
+        public:
+            template<typename TEngine>
+            auto operator()(TEngine& engine) -> double
+            {
+                // Create float uniform_real_distribution distribution
+                oneapi::dpl::normal_distribution<double> distr;
+
+                // Generate float random number
+                return distr(engine);
+            }
+        };
+
+        //! The SYCL random number float uniform distribution.
+        template<>
+        class UniformReal<float>
+        {
+        public:
+            template<typename TEngine>
+            auto operator()(TEngine& engine) -> float
+            {
+                // Create float uniform_real_distribution distribution
+                oneapi::dpl::uniform_real_distribution<float> distr;
+
+                // Generate float random number
+                return distr(engine);
+                // NOTE: (1.0f - curand_uniform) does not work, because curand_uniform seems to return
+                // denormalized floats around 0.f. [0.f, 1.0f)
+                // return fUniformRand * static_cast<float>(fUniformRand != 1.0f);
+            }
+        };
+
+        //! The SYCL random number float uniform distribution.
+        template<>
+        class UniformReal<double>
+        {
+        public:
+            template<typename TEngine>
+            auto operator()(TEngine& engine) -> double
+            {
+                // Create float uniform_real_distribution distribution
+                oneapi::dpl::uniform_real_distribution<double> distr;
+
+                // Generate float random number
+                return distr(engine);
+                // NOTE: (1.0f - curand_uniform_double) does not work, because curand_uniform_double seems to
+                // return denormalized floats around 0.f. [0.f, 1.0f)
+                // return fUniformRand * static_cast<double>(fUniformRand != 1.0);
+            }
+        };
+
+        //! The SYCL random number unsigned integer uniform distribution.
+        template<>
+        class UniformUint<unsigned int>
+        {
+        public:
+            template<typename TEngine>
+            auto operator()(TEngine& engine) -> unsigned int
+            {
+                // Create float uniform_real_distribution distribution
+                oneapi::dpl::uniform_int_distribution<unsigned int> distr;
+
+                // Generate float random number
+                return distr(engine);
+            }
+        };
+    } // namespace distribution::sycl_rand
+
+    namespace distribution::trait
+    {
+        //! The SYCL random number float normal distribution get trait specialization.
+        template<typename T>
+        struct CreateNormalReal<RandGenericSycl, T, std::enable_if_t<std::is_floating_point_v<T>>>
+        {
+            static auto createNormalReal(RandGenericSycl const& /*rand*/) -> sycl_rand::NormalReal<T>
+            {
+                return {};
+            }
+        };
+
+        //! The SYCL random number float uniform distribution get trait specialization.
+        template<typename T>
+        struct CreateUniformReal<RandGenericSycl, T, std::enable_if_t<std::is_floating_point_v<T>>>
+        {
+            static auto createUniformReal(RandGenericSycl const& /*rand*/) -> sycl_rand::UniformReal<T>
+            {
+                return {};
+            }
+        };
+
+        //! The SYCL random number integer uniform distribution get trait specialization.
+        template<typename T>
+        struct CreateUniformUint<RandGenericSycl, T, std::enable_if_t<std::is_integral_v<T>>>
+        {
+            static auto createUniformUint(RandGenericSycl const& /*rand*/) -> sycl_rand::UniformUint<T>
+            {
+                return {};
+            }
+        };
+    } // namespace distribution::trait
+
+    namespace engine::trait
+    {
+        //! The SYCL random number default generator get trait specialization.
+        template<>
+        struct CreateDefault<RandGenericSycl>
+        {
+            static auto createDefault(
+                RandGenericSycl const& /*rand*/,
+                std::uint32_t const& seed = 0,
+                std::uint32_t const& subsequence = 0,
+                std::uint32_t const& offset = 0) -> sycl_rand::Minstd
+            {
+                return {seed, subsequence, offset};
+            }
+        };
+    } // namespace engine::trait
+#    endif
+} // namespace alpaka::rand
+
+#endif
diff --git a/include/alpaka/test/Check.hpp b/include/alpaka/test/Check.hpp
index 253cb477fe9b..28f064701ccb 100644
--- a/include/alpaka/test/Check.hpp
+++ b/include/alpaka/test/Check.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2022 Benjamin Worpitz, Jan Stephan
+/* Copyright 2022 Benjamin Worpitz, Jan Stephan, Luca Ferragina
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -14,7 +14,16 @@
         {                                                                                                             \
             if(!(expression))                                                                                         \
             {                                                                                                         \
-                acc.cout << "ALPAKA_CHECK failed because '!(" << #expression << ")'\n";                               \
+                /*if constexpr(alpaka::accMatchesTags<   TODO                                                         \
+                                 decltype(acc),                                                                       \
+                                 alpaka::TagCpuSyclIntel,                                                             \
+                                 alpaka::TagFpgaSyclIntel,                                                            \
+                                 alpaka::TagFpgaSyclXilinx,                                                           \
+                                 alpaka::TagGpuSyclIntel,                                                             \
+                                 alpaka::TagGenericSycl>)                                                             \
+                    acc.cout << "ALPAKA_CHECK failed because '!(" << #expression << ")'\n";                           \
+                else                                                                                                  \
+                    printf("ALPAKA_CHECK failed because '!(%s)'\n", #expression);*/                                   \
                 success = false;                                                                                      \
             }                                                                                                         \
         } while(0)
diff --git a/include/alpaka/warp/WarpGenericSycl.hpp b/include/alpaka/warp/WarpGenericSycl.hpp
index 33ac1884deb2..4df62480e001 100644
--- a/include/alpaka/warp/WarpGenericSycl.hpp
+++ b/include/alpaka/warp/WarpGenericSycl.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2022 Jan Stephan
+/* Copyright 2022 Jan Stephan, Luca Ferragina
  * SPDX-License-Identifier: MPL-2.0
  */
 
diff --git a/include/alpaka/workdiv/WorkDivGenericSycl.hpp b/include/alpaka/workdiv/WorkDivGenericSycl.hpp
index c6d1c114a3c9..cb4f003fa211 100644
--- a/include/alpaka/workdiv/WorkDivGenericSycl.hpp
+++ b/include/alpaka/workdiv/WorkDivGenericSycl.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2022 Jan Stephan
+/* Copyright 2022 Jan Stephan, Luca Ferragina
  * SPDX-License-Identifier: MPL-2.0
  */
 

From 5b1b2b7b7750c4ff4d3b41f3df22f74fa98ffa94 Mon Sep 17 00:00:00 2001
From: AuroraPerego <aurora.perego@cern.ch>
Date: Sun, 2 Apr 2023 19:06:13 +0200
Subject: [PATCH 02/10] Rewrite the SYCL backend for the SYCL 2020 standard and
 USM allocations (part 2)

More changes to the SYCL backend:
  - move printf to alpaka/core and use it in ALPAKA_CHECK;
  - remove IsView -> false in mem/buf/sycl/Accessor;
  - remove wrong attribute in mem/buf/sycl/Copy;
  - remove the SYCL experimental BuildAccessor<BufGenericSycl>, use the default
    implementation from alpaka/mem/view.

Fix the examples to work with the SYCL backend:
  - fix the accelerator in the vectorAdd example;
  - move AccCpuSerial at the end in the ExampleDefaultAcc, as it was preventing
    the SYCL accelerators from being selected.

Complete the work on the SYCL random number generators.
---
 example/vectorAdd/src/vectorAdd.cpp          |   7 +-
 include/alpaka/acc/AccGenericSycl.hpp        |   8 +-
 include/alpaka/alpaka.hpp                    |   3 +-
 include/alpaka/core/Sycl.hpp                 |  22 +++-
 include/alpaka/example/ExampleDefaultAcc.hpp |   6 +-
 include/alpaka/mem/buf/sycl/Accessor.hpp     |  26 +----
 include/alpaka/mem/buf/sycl/Copy.hpp         |   7 +-
 include/alpaka/meta/SyclPrintf.hpp           |  35 -------
 include/alpaka/rand/RandGenericSycl.hpp      | 100 +++++++++----------
 include/alpaka/test/Check.hpp                |  43 ++------
 10 files changed, 93 insertions(+), 164 deletions(-)
 delete mode 100644 include/alpaka/meta/SyclPrintf.hpp

diff --git a/example/vectorAdd/src/vectorAdd.cpp b/example/vectorAdd/src/vectorAdd.cpp
index 676d0041018b..a67362200189 100644
--- a/example/vectorAdd/src/vectorAdd.cpp
+++ b/example/vectorAdd/src/vectorAdd.cpp
@@ -1,4 +1,5 @@
-/* Copyright 2023 Benjamin Worpitz, Matthias Werner, Bernhard Manfred Gruber, Jan Stephan, Luca Ferragina
+/* Copyright 2023 Benjamin Worpitz, Matthias Werner, Bernhard Manfred Gruber, Jan Stephan, Luca Ferragina,
+ *                Aurora Perego
  * SPDX-License-Identifier: ISC
  */
 
@@ -75,9 +76,7 @@ auto main() -> int
     // - AccCpuTbbBlocks
     // - AccCpuSerial
     // using Acc = alpaka::AccCpuSerial<Dim, Idx>;
-    // using Acc = alpaka::ExampleDefaultAcc<Dim, Idx>;
-
-    using Acc = alpaka::AccGpuSyclIntel<Dim, Idx>;
+    using Acc = alpaka::ExampleDefaultAcc<Dim, Idx>;
     using Pltf = alpaka::Pltf<Acc>;
     using DevAcc = alpaka::Dev<Acc>;
     std::cout << "Using alpaka accelerator: " << alpaka::getAccName<Acc>() << std::endl;
diff --git a/include/alpaka/acc/AccGenericSycl.hpp b/include/alpaka/acc/AccGenericSycl.hpp
index 8a7d0972ddd7..fe3a28bc1828 100644
--- a/include/alpaka/acc/AccGenericSycl.hpp
+++ b/include/alpaka/acc/AccGenericSycl.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2023 Jan Stephan, Antonio Di Pilato, Andrea Bocci, Luca Ferragina
+/* Copyright 2023 Jan Stephan, Antonio Di Pilato, Andrea Bocci, Luca Ferragina, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -15,7 +15,6 @@
 #include "alpaka/intrinsic/IntrinsicGenericSycl.hpp"
 #include "alpaka/math/MathGenericSycl.hpp"
 #include "alpaka/mem/fence/MemFenceGenericSycl.hpp"
-#include "alpaka/meta/SyclPrintf.hpp"
 #include "alpaka/rand/RandGenericSycl.hpp"
 #include "alpaka/warp/WarpGenericSycl.hpp"
 #include "alpaka/workdiv/WorkDivGenericSycl.hpp"
@@ -58,7 +57,7 @@ namespace alpaka
         , public BlockSyncGenericSycl<TDim>
         , public IntrinsicGenericSycl
         , public MemFenceGenericSycl
-        , public rand::RandGenericSycl
+        , public rand::RandGenericSycl<TDim>
         , public warp::WarpGenericSycl<TDim>
     {
     public:
@@ -86,7 +85,7 @@ namespace alpaka
             , BlockSyncGenericSycl<TDim>{work_item}
             , IntrinsicGenericSycl{}
             , MemFenceGenericSycl{global_fence_dummy, local_fence_dummy}
-            , rand::RandGenericSycl{}
+            , rand::RandGenericSycl<TDim>{work_item}
             , warp::WarpGenericSycl<TDim>{work_item}
             , cout{output_stream}
         {
@@ -111,6 +110,7 @@ namespace alpaka
             , BlockSyncGenericSycl<TDim>{work_item}
             , IntrinsicGenericSycl{}
             , MemFenceGenericSycl{global_fence_dummy, local_fence_dummy}
+            , rand::RandGenericSycl<TDim>{work_item}
             , warp::WarpGenericSycl<TDim>{work_item}
         {
         }
diff --git a/include/alpaka/alpaka.hpp b/include/alpaka/alpaka.hpp
index c96d396ac4c4..3262ea360edc 100644
--- a/include/alpaka/alpaka.hpp
+++ b/include/alpaka/alpaka.hpp
@@ -1,5 +1,5 @@
 /* Copyright 2023 Axel Hübl, Benjamin Worpitz, Erik Zenker, Matthias Werner, René Widera, Bernhard Manfred Gruber,
- *                Jan Stephan, Antonio Di Pilato, Luca Ferragina
+ *                Jan Stephan, Antonio Di Pilato, Luca Ferragina, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -172,7 +172,6 @@
 #include "alpaka/meta/NdLoop.hpp"
 #include "alpaka/meta/NonZero.hpp"
 #include "alpaka/meta/Set.hpp"
-#include "alpaka/meta/SyclPrintf.hpp"
 #include "alpaka/meta/Transform.hpp"
 #include "alpaka/meta/TypeListOps.hpp"
 // offset
diff --git a/include/alpaka/core/Sycl.hpp b/include/alpaka/core/Sycl.hpp
index 3f067f5f8b39..983140770217 100644
--- a/include/alpaka/core/Sycl.hpp
+++ b/include/alpaka/core/Sycl.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2022 Jan Stephan, Luca Ferragina
+/* Copyright 2022 Jan Stephan, Luca Ferragina, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -13,6 +13,7 @@
 
 #include <array>
 #include <cstddef>
+#include <cstdio> // the #define printf(...) breaks <cstdio> if it is included afterwards
 #include <iostream>
 #include <stdexcept>
 #include <string>
@@ -23,6 +24,25 @@
 
 #    include <CL/sycl.hpp>
 
+// if SYCL is enabled with the AMD backend the printf will be killed because of missing compiler support
+#    ifdef __AMDGCN__
+#        define printf(...)
+#    else
+
+#        ifdef __SYCL_DEVICE_ONLY__
+using AlpakaFormat = char const* [[clang::opencl_constant]];
+#        else
+using AlpakaFormat = char const*;
+#        endif
+#        define printf(FORMAT, ...)                                                                                   \
+            do                                                                                                        \
+            {                                                                                                         \
+                static const auto format = AlpakaFormat{FORMAT};                                                      \
+                sycl::ext::oneapi::experimental::printf(format, ##__VA_ARGS__);                                       \
+            } while(false)
+
+#    endif
+
 // SYCL vector types trait specializations.
 namespace alpaka
 {
diff --git a/include/alpaka/example/ExampleDefaultAcc.hpp b/include/alpaka/example/ExampleDefaultAcc.hpp
index f90b3653c9f4..ef0eb80fb380 100644
--- a/include/alpaka/example/ExampleDefaultAcc.hpp
+++ b/include/alpaka/example/ExampleDefaultAcc.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2023 Jeffrey Kelling, Bernhard Manfred Gruber, Jan Stephan
+/* Copyright 2023 Jeffrey Kelling, Bernhard Manfred Gruber, Jan Stephan, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -24,8 +24,6 @@ namespace alpaka
     using ExampleDefaultAcc = alpaka::AccCpuOmp2Threads<TDim, TIdx>;
 #elif defined(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED)
     using ExampleDefaultAcc = alpaka::AccCpuThreads<TDim, TIdx>;
-#elif defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED)
-    using ExampleDefaultAcc = alpaka::AccCpuSerial<TDim, TIdx>;
 #elif defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_BACKEND_ONEAPI)
 #    if defined(ALPAKA_SYCL_ONEAPI_CPU)
     using ExampleDefaultAcc = alpaka::AccCpuSyclIntel<TDim, TIdx>;
@@ -34,6 +32,8 @@ namespace alpaka
 #    elif defined(ALPAKA_SYCL_ONEAPI_GPU)
     using ExampleDefaultAcc = alpaka::AccGpuSyclIntel<TDim, TIdx>;
 #    endif
+#elif defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED)
+    using ExampleDefaultAcc = alpaka::AccCpuSerial<TDim, TIdx>;
 #else
     class ExampleDefaultAcc;
 #    warning "No supported backend selected."
diff --git a/include/alpaka/mem/buf/sycl/Accessor.hpp b/include/alpaka/mem/buf/sycl/Accessor.hpp
index 5b7aeba11dac..6d2dec54cb7c 100644
--- a/include/alpaka/mem/buf/sycl/Accessor.hpp
+++ b/include/alpaka/mem/buf/sycl/Accessor.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2023 Jan Stephan, Andrea Bocci
+/* Copyright 2023 Jan Stephan, Andrea Bocci, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -112,30 +112,6 @@ namespace alpaka
         SyclAccessor m_accessor;
         VecType extents;
     };
-
-    namespace experimental::trait
-    {
-        namespace internal
-        {
-            template<typename TElem, typename TDim, typename TIdx, typename TDev>
-            struct IsView<BufGenericSycl<TElem, TDim, TIdx, TDev>> : std::false_type
-            {
-            };
-        } // namespace internal
-
-        template<typename TElem, typename TDim, typename TIdx, typename TDev>
-        struct BuildAccessor<BufGenericSycl<TElem, TDim, TIdx, TDev>>
-        {
-            template<typename... TAccessModes>
-            static auto buildAccessor(BufGenericSycl<TElem, TDim, TIdx, TDev>& buffer)
-            {
-                using SyclAccessor = detail::SyclAccessor<TElem, TDim::value, TAccessModes...>;
-                return Accessor<SyclAccessor, TElem, TIdx, TDim::value, TAccessModes...>{
-                    SyclAccessor{buffer.m_buffer},
-                    buffer.m_extentElements};
-            }
-        };
-    } // namespace experimental::trait
 } // namespace alpaka
 
 #endif
diff --git a/include/alpaka/mem/buf/sycl/Copy.hpp b/include/alpaka/mem/buf/sycl/Copy.hpp
index cfede497adb1..6ecb8d5f4836 100644
--- a/include/alpaka/mem/buf/sycl/Copy.hpp
+++ b/include/alpaka/mem/buf/sycl/Copy.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2022 Jan Stephan, Luca Ferragina, Aurora Perego
+/* Copyright 2023 Jan Stephan, Luca Ferragina, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -146,10 +146,7 @@ namespace alpaka::detail
         using Elem = alpaka::Elem<TViewSrc>;
 
         template<typename TViewDstFwd>
-        ALPAKA_FN_HOST TaskCopySycl(
-            TViewDstFwd&& viewDst,
-            TViewSrc const& viewSrc,
-            [[maybe_unused]] TExtent const& extent)
+        ALPAKA_FN_HOST TaskCopySycl(TViewDstFwd&& viewDst, TViewSrc const& viewSrc, TExtent const& extent)
             : m_dstMemNative(reinterpret_cast<void*>(getPtrNative(viewDst)))
             , m_srcMemNative(reinterpret_cast<void const*>(getPtrNative(viewSrc)))
         {
diff --git a/include/alpaka/meta/SyclPrintf.hpp b/include/alpaka/meta/SyclPrintf.hpp
deleted file mode 100644
index ef432e15a87f..000000000000
--- a/include/alpaka/meta/SyclPrintf.hpp
+++ /dev/null
@@ -1,35 +0,0 @@
-/* Copyright 2022 Andrea Bocci
- *
- * This file is part of alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <CL/sycl.hpp>
-
-#ifdef ALPAKA_ACC_SYCL_ENABLED
-
-// Kill printf in AMD GPU code because of missing compiler support
-#    ifdef __AMDGCN__
-#        include <cstdio> // the define breaks <cstdio> if it is included afterwards
-#        define printf(...)
-#    else
-
-#        ifdef __SYCL_DEVICE_ONLY__
-#            define CONSTANT __attribute__((opencl_constant))
-#        else
-#            define CONSTANT
-#        endif
-
-#        define printf(FORMAT, ...)                                                                                   \
-            do                                                                                                        \
-            {                                                                                                         \
-                static const CONSTANT char format[] = FORMAT;                                                         \
-                sycl::ext::oneapi::experimental::printf(format, ##__VA_ARGS__);                                       \
-            } while(false)
-#    endif
-#endif
diff --git a/include/alpaka/rand/RandGenericSycl.hpp b/include/alpaka/rand/RandGenericSycl.hpp
index bd274e9a526a..0901090853a8 100644
--- a/include/alpaka/rand/RandGenericSycl.hpp
+++ b/include/alpaka/rand/RandGenericSycl.hpp
@@ -1,10 +1,5 @@
-/* Copyright 2022 Luca Ferragina
- *
- * This file is part of alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+/* Copyright 2023 Luca Ferragina, Aurora Perego
+ * SPDX-License-Identifier: MPL-2.0
  */
 
 #pragma once
@@ -17,24 +12,27 @@
 #    include <alpaka/rand/Traits.hpp>
 
 // Backend specific imports.
-#    if defined(ALPAKA_ACC_SYCL_ENABLED)
-#        include <CL/sycl.hpp>
-#        include <oneapi/mkl/rng.hpp>
-
-#        include <oneapi/dpl/random>
-#    endif
+#    include <CL/sycl.hpp>
+#    include <oneapi/mkl/rng.hpp>
 
+#    include <oneapi/dpl/random>
 #    include <type_traits>
 
 namespace alpaka::rand
 {
     //! The SYCL rand implementation.
-    class RandGenericSycl : public concepts::Implements<ConceptRand, RandGenericSycl>
+    template<typename TDim>
+    class RandGenericSycl : public concepts::Implements<ConceptRand, RandGenericSycl<TDim>>
     {
+    public:
+        RandGenericSycl(sycl::nd_item<TDim::value> my_item) : m_item{my_item}
+        {
+        }
+
+        sycl::nd_item<TDim::value> m_item;
     };
 
 #    if !defined(ALPAKA_HOST_ONLY)
-
     namespace distribution::sycl_rand
     {
         //! The SYCL random number floating point normal distribution.
@@ -53,6 +51,7 @@ namespace alpaka::rand
     namespace engine::sycl_rand
     {
         //! The SYCL linear congruential random number generator engine.
+        template<typename TDim>
         class Minstd
         {
         public:
@@ -60,9 +59,9 @@ namespace alpaka::rand
             // need to be overwritten with a valid object
             Minstd() = default;
 
-            Minstd(std::uint32_t const& seed, std::uint32_t const& subsequence = 0, std::uint32_t const& offset = 0)
+            Minstd(RandGenericSycl<TDim> rand, std::uint32_t const& seed)
             {
-                oneapi::dpl::minstd_rand engine(seed, offset);
+                oneapi::dpl::minstd_rand engine(seed, rand.m_item.get_global_linear_id());
                 rng_engine = engine;
             }
 
@@ -77,19 +76,20 @@ namespace alpaka::rand
             oneapi::dpl::minstd_rand rng_engine;
 
         public:
-            using result_type = decltype(rng_engine);
-
-            // ALPAKA_FN_HOST_ACC constexpr static result_type min()
-            // {
-            //     return std::numeric_limits<result_type>::min();
-            // }
-            // ALPAKA_FN_HOST_ACC constexpr static result_type max()
-            // {
-            //     return std::numeric_limits<result_type>::max();
-            // }
+            using result_type = float;
+
+            ALPAKA_FN_HOST_ACC static result_type min()
+            {
+                return std::numeric_limits<result_type>::min();
+            }
+            ALPAKA_FN_HOST_ACC static result_type max()
+            {
+                return std::numeric_limits<result_type>::max();
+            }
             result_type operator()()
             {
-                return rng_engine;
+                oneapi::dpl::uniform_real_distribution<float> distr;
+                return distr(rng_engine);
             }
         };
     } // namespace engine::sycl_rand
@@ -108,7 +108,7 @@ namespace alpaka::rand
                 oneapi::dpl::normal_distribution<float> distr;
 
                 // Generate float random number
-                return distr(engine);
+                return distr(engine.rng_engine);
             }
         };
 
@@ -124,7 +124,7 @@ namespace alpaka::rand
                 oneapi::dpl::normal_distribution<double> distr;
 
                 // Generate float random number
-                return distr(engine);
+                return distr(engine.rng_engine);
             }
         };
 
@@ -140,10 +140,7 @@ namespace alpaka::rand
                 oneapi::dpl::uniform_real_distribution<float> distr;
 
                 // Generate float random number
-                return distr(engine);
-                // NOTE: (1.0f - curand_uniform) does not work, because curand_uniform seems to return
-                // denormalized floats around 0.f. [0.f, 1.0f)
-                // return fUniformRand * static_cast<float>(fUniformRand != 1.0f);
+                return distr(engine.rng_engine);
             }
         };
 
@@ -159,10 +156,7 @@ namespace alpaka::rand
                 oneapi::dpl::uniform_real_distribution<double> distr;
 
                 // Generate float random number
-                return distr(engine);
-                // NOTE: (1.0f - curand_uniform_double) does not work, because curand_uniform_double seems to
-                // return denormalized floats around 0.f. [0.f, 1.0f)
-                // return fUniformRand * static_cast<double>(fUniformRand != 1.0);
+                return distr(engine.rng_engine);
             }
         };
 
@@ -178,7 +172,7 @@ namespace alpaka::rand
                 oneapi::dpl::uniform_int_distribution<unsigned int> distr;
 
                 // Generate float random number
-                return distr(engine);
+                return distr(engine.rng_engine);
             }
         };
     } // namespace distribution::sycl_rand
@@ -186,30 +180,30 @@ namespace alpaka::rand
     namespace distribution::trait
     {
         //! The SYCL random number float normal distribution get trait specialization.
-        template<typename T>
-        struct CreateNormalReal<RandGenericSycl, T, std::enable_if_t<std::is_floating_point_v<T>>>
+        template<typename TDim, typename T>
+        struct CreateNormalReal<RandGenericSycl<TDim>, T, std::enable_if_t<std::is_floating_point_v<T>>>
         {
-            static auto createNormalReal(RandGenericSycl const& /*rand*/) -> sycl_rand::NormalReal<T>
+            static auto createNormalReal(RandGenericSycl<TDim> const& /*rand*/) -> sycl_rand::NormalReal<T>
             {
                 return {};
             }
         };
 
         //! The SYCL random number float uniform distribution get trait specialization.
-        template<typename T>
-        struct CreateUniformReal<RandGenericSycl, T, std::enable_if_t<std::is_floating_point_v<T>>>
+        template<typename TDim, typename T>
+        struct CreateUniformReal<RandGenericSycl<TDim>, T, std::enable_if_t<std::is_floating_point_v<T>>>
         {
-            static auto createUniformReal(RandGenericSycl const& /*rand*/) -> sycl_rand::UniformReal<T>
+            static auto createUniformReal(RandGenericSycl<TDim> const& /*rand*/) -> sycl_rand::UniformReal<T>
             {
                 return {};
             }
         };
 
         //! The SYCL random number integer uniform distribution get trait specialization.
-        template<typename T>
-        struct CreateUniformUint<RandGenericSycl, T, std::enable_if_t<std::is_integral_v<T>>>
+        template<typename TDim, typename T>
+        struct CreateUniformUint<RandGenericSycl<TDim>, T, std::enable_if_t<std::is_integral_v<T>>>
         {
-            static auto createUniformUint(RandGenericSycl const& /*rand*/) -> sycl_rand::UniformUint<T>
+            static auto createUniformUint(RandGenericSycl<TDim> const& /*rand*/) -> sycl_rand::UniformUint<T>
             {
                 return {};
             }
@@ -219,16 +213,16 @@ namespace alpaka::rand
     namespace engine::trait
     {
         //! The SYCL random number default generator get trait specialization.
-        template<>
-        struct CreateDefault<RandGenericSycl>
+        template<typename TDim>
+        struct CreateDefault<RandGenericSycl<TDim>>
         {
             static auto createDefault(
-                RandGenericSycl const& /*rand*/,
+                RandGenericSycl<TDim> const& rand,
                 std::uint32_t const& seed = 0,
                 std::uint32_t const& subsequence = 0,
-                std::uint32_t const& offset = 0) -> sycl_rand::Minstd
+                std::uint32_t const& offset = 0) -> sycl_rand::Minstd<TDim>
             {
-                return {seed, subsequence, offset};
+                return {rand, seed};
             }
         };
     } // namespace engine::trait
diff --git a/include/alpaka/test/Check.hpp b/include/alpaka/test/Check.hpp
index 28f064701ccb..39545e764923 100644
--- a/include/alpaka/test/Check.hpp
+++ b/include/alpaka/test/Check.hpp
@@ -1,40 +1,19 @@
-/* Copyright 2022 Benjamin Worpitz, Jan Stephan, Luca Ferragina
+/* Copyright 2023 Benjamin Worpitz, Jan Stephan, Luca Ferragina, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
 #pragma once
 
+#include "alpaka/core/Sycl.hpp"
+
 #include <cstdio>
 
-// TODO: SYCL doesn't have a way to detect if we're looking at device or host code. This needs a workaround so that
-// SYCL and other back-ends are compatible.
-#ifdef ALPAKA_ACC_SYCL_ENABLED
-#    define ALPAKA_CHECK(success, expression)                                                                         \
-        do                                                                                                            \
-        {                                                                                                             \
-            if(!(expression))                                                                                         \
-            {                                                                                                         \
-                /*if constexpr(alpaka::accMatchesTags<   TODO                                                         \
-                                 decltype(acc),                                                                       \
-                                 alpaka::TagCpuSyclIntel,                                                             \
-                                 alpaka::TagFpgaSyclIntel,                                                            \
-                                 alpaka::TagFpgaSyclXilinx,                                                           \
-                                 alpaka::TagGpuSyclIntel,                                                             \
-                                 alpaka::TagGenericSycl>)                                                             \
-                    acc.cout << "ALPAKA_CHECK failed because '!(" << #expression << ")'\n";                           \
-                else                                                                                                  \
-                    printf("ALPAKA_CHECK failed because '!(%s)'\n", #expression);*/                                   \
-                success = false;                                                                                      \
-            }                                                                                                         \
-        } while(0)
-#else
-#    define ALPAKA_CHECK(success, expression)                                                                         \
-        do                                                                                                            \
+#define ALPAKA_CHECK(success, expression)                                                                             \
+    do                                                                                                                \
+    {                                                                                                                 \
+        if(!(expression))                                                                                             \
         {                                                                                                             \
-            if(!(expression))                                                                                         \
-            {                                                                                                         \
-                printf("ALPAKA_CHECK failed because '!(%s)'\n", #expression);                                         \
-                success = false;                                                                                      \
-            }                                                                                                         \
-        } while(0)
-#endif
+            printf("ALPAKA_CHECK failed because '!(%s)'\n", #expression);                                             \
+            success = false;                                                                                          \
+        }                                                                                                             \
+    } while(0)

From 97ebacdc87493a9c308597f82c0510f5168d87e9 Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Tue, 13 Jun 2023 20:47:05 +0200
Subject: [PATCH 03/10] Rewrite the SYCL backend for the SYCL 2020 standard and
 USM allocations (part 3)

Update the documentation.

Implement various fixes to the SYCL math functions:
  - add missing "if constexpr" to rsqrt();
  - do not call math function with mixed arguments; this fixes errors due to
    the implicit conversion between floating point types of different sizes
    in sycl::atan2() and sycl::pow();
  - add explicit type casts to silence warnings;
  - cast the result of isfinite/isinf/isnan to bool.

Implement various fixes to the SYCL atomic functions:
  - fix the cas/compare_exchange loops;
  - clarify which atomic types are supported.

Implement various fixes to the SYCL warp-level functions:
  - fix compilation warnings;
  - extract bits from sub_group_mask.

Mark the use of global device variables and constants as undupported: the SYCL
backend does not support global device variables and constants, yet.

Add explicit checks on the dimensionality of the SYCL accelerator and work division.

Silence warnings about the use of GNU extensions, and those coming from the
Intel oneMKL and oneDPL headers.

Update more tests for the SYCL backend:
  - add a special case for 0-dimensional tests;
  - disable the use of STL rand;
  - disable the test of global device variables and constants.
---
 README_SYCL.md                                | 10 +++-
 include/alpaka/acc/AccGenericSycl.hpp         |  2 +
 include/alpaka/atomic/AtomicGenericSycl.hpp   | 57 ++++++++++---------
 include/alpaka/core/Common.hpp                |  6 +-
 include/alpaka/core/Sycl.hpp                  | 14 ++++-
 include/alpaka/math/MathGenericSycl.hpp       | 40 ++++++++-----
 include/alpaka/mem/view/ViewPlainPtr.hpp      | 15 ++++-
 include/alpaka/rand/RandGenericSycl.hpp       | 22 ++++++-
 include/alpaka/warp/WarpGenericSycl.hpp       | 33 +++++++----
 include/alpaka/workdiv/WorkDivGenericSycl.hpp | 14 +++--
 test/unit/mem/view/src/ViewStaticAccMem.cpp   | 24 +++++++-
 test/unit/rand/src/RandTest.cpp               |  4 +-
 12 files changed, 169 insertions(+), 72 deletions(-)

diff --git a/README_SYCL.md b/README_SYCL.md
index eead8a8a1acc..82f620dca193 100644
--- a/README_SYCL.md
+++ b/README_SYCL.md
@@ -93,8 +93,12 @@ These can be used interchangeably (some restrictions apply - see below) with the
 ### Restrictions
 
 * The FPGA back-ends (both vendors) cannot be used together with the Intel CPU / GPU back-ends. This is because of the different compilation trajectory required for FPGAs and is unlikely to be fixed anytime soon. See [here](https://www.intel.com/content/www/us/en/develop/documentation/oneapi-programming-guide/top/programming-interface/fpga-flow/why-is-fpga-compilation-different.html) for an explanation.
-* The SYCL back-end currently does not support passing pointers as kernel parameters. Use alpaka's experimental accessors instead.
-* The SYCL back-end does not have device-side random number generation.
 * Similar to the CUDA and HIP back-ends the SYCL back-end only supports up to three kernel dimensions.
 * Some Intel GPUs do not support the `double` type for device code. alpaka will not check this.
-* The FPGA back-end does not support atomics. alpaka will not check this.
\ No newline at end of file
+  You can enable software emulation for `double` precision types with
+  ```bash
+  export IGC_EnableDPEmulation=1
+  export OverrideDefaultFP64Settings=1
+  ```
+  See [Intel's FAQ](https://github.com/intel/compute-runtime/blob/master/opencl/doc/FAQ.md#feature-double-precision-emulation-fp64) for more information.
+* The FPGA back-end does not support atomics. alpaka will not check this.
diff --git a/include/alpaka/acc/AccGenericSycl.hpp b/include/alpaka/acc/AccGenericSycl.hpp
index fe3a28bc1828..76a735459940 100644
--- a/include/alpaka/acc/AccGenericSycl.hpp
+++ b/include/alpaka/acc/AccGenericSycl.hpp
@@ -60,6 +60,8 @@ namespace alpaka
         , public rand::RandGenericSycl<TDim>
         , public warp::WarpGenericSycl<TDim>
     {
+        static_assert(TDim::value > 0, "The SYCL accelerator must have a dimension greater than zero.");
+
     public:
         AccGenericSycl(AccGenericSycl const&) = delete;
         AccGenericSycl(AccGenericSycl&&) = delete;
diff --git a/include/alpaka/atomic/AtomicGenericSycl.hpp b/include/alpaka/atomic/AtomicGenericSycl.hpp
index 8864508d578b..a869248e53b4 100644
--- a/include/alpaka/atomic/AtomicGenericSycl.hpp
+++ b/include/alpaka/atomic/AtomicGenericSycl.hpp
@@ -98,17 +98,12 @@ namespace alpaka
         inline auto casWithCondition(T* const addr, TEval&& eval)
         {
             auto ref = TRef{*addr};
-
             auto old_val = ref.load();
-            auto assumed = T{};
 
-            do
+            // prefer compare_exchange_weak when in a loop, assuming that eval is not expensive
+            while(!ref.compare_exchange_weak(old_val, eval(old_val)))
             {
-                assumed = old_val;
-                auto const new_val = eval(old_val);
-                old_val = ref.compare_exchange_strong(assumed, new_val);
-            } while(assumed != old_val);
-
+            }
 
             return old_val;
         }
@@ -182,7 +177,9 @@ namespace alpaka::trait
     template<typename T, typename THierarchy>
     struct AtomicOp<AtomicExch, AtomicGenericSycl, T, THierarchy>
     {
-        static_assert(std::is_integral_v<T> || std::is_floating_point_v<T>, "SYCL atomics do not support this type");
+        static_assert(
+            (std::is_integral_v<T> || std::is_floating_point_v<T>) &&(sizeof(T) == 4 || sizeof(T) == 8),
+            "SYCL atomics do not support this type");
 
         static auto atomicOp(AtomicGenericSycl const&, T* const addr, T const& value) -> T
         {
@@ -195,11 +192,14 @@ namespace alpaka::trait
     template<typename T, typename THierarchy>
     struct AtomicOp<AtomicInc, AtomicGenericSycl, T, THierarchy>
     {
-        static_assert(std::is_unsigned_v<T>, "atomicInc only supported for unsigned types");
+        static_assert(
+            std::is_unsigned_v<T> && (sizeof(T) == 4 || sizeof(T) == 8),
+            "SYCL atomics support only 32- and 64-bits unsigned integral types");
 
         static auto atomicOp(AtomicGenericSycl const&, T* const addr, T const& value) -> T
         {
-            auto inc = [&value](auto old_val) { return (old_val >= value) ? static_cast<T>(0) : (old_val + 1u); };
+            auto inc = [&value](auto old_val)
+            { return (old_val >= value) ? static_cast<T>(0) : (old_val + static_cast<T>(1)); };
             if(auto ptr = alpaka::detail::get_global_ptr(addr); ptr != nullptr)
                 return alpaka::detail::casWithCondition<alpaka::detail::global_ref<T, THierarchy>>(addr, inc);
             else
@@ -212,12 +212,14 @@ namespace alpaka::trait
     template<typename T, typename THierarchy>
     struct AtomicOp<AtomicDec, AtomicGenericSycl, T, THierarchy>
     {
-        static_assert(std::is_unsigned_v<T>, "atomicDec only supported for unsigned types");
+        static_assert(
+            std::is_unsigned_v<T> && (sizeof(T) == 4 || sizeof(T) == 8),
+            "SYCL atomics support only 32- and 64-bits unsigned integral types");
 
         static auto atomicOp(AtomicGenericSycl const&, T* const addr, T const& value) -> T
         {
-            auto dec
-                = [&value](auto& old_val) { return ((old_val == 0) || (old_val > value)) ? value : (old_val - 1u); };
+            auto dec = [&value](auto& old_val)
+            { return ((old_val == 0) || (old_val > value)) ? value : (old_val - static_cast<T>(1)); };
             if(auto ptr = alpaka::detail::get_global_ptr(addr); ptr != nullptr)
                 return alpaka::detail::casWithCondition<alpaka::detail::global_ref<T, THierarchy>>(addr, dec);
             else
@@ -275,22 +277,21 @@ namespace alpaka::trait
     {
         static_assert(std::is_integral_v<T> || std::is_floating_point_v<T>, "SYCL atomics do not support this type");
 
-        static auto atomicOp(AtomicGenericSycl const&, T* const addr, T const& compare, T const& value) -> T
+        static auto atomicOp(AtomicGenericSycl const&, T* const addr, T const& expected, T const& desired) -> T
         {
-            auto cas = [&compare, &value](auto& ref)
+            auto cas = [&expected, &desired](auto& ref)
             {
-                // SYCL stores the value in *addr to the "compare" parameter if the values are not equal. Since
-                // alpaka's interface does not expect this we need to copy "compare" to this function and forget it
-                // afterwards.
-                auto tmp = compare;
-
-                // We always want to return the old value at the end.
-                const auto old = ref.load();
-
-                // This returns a bool telling us if the exchange happened or not. Useless in this case.
-                ref.compare_exchange_strong(tmp, value);
-
-                return old;
+                auto expected_ = expected;
+                // Atomically compares the value of `ref` with the value of `expected`.
+                // If the values are equal, replaces the value of `ref` with `desired`.
+                // Otherwise updates `expected` with the value of `ref`.
+                // Returns a bool telling us if the exchange happened or not, but the Alpaka API does not make use of
+                // it.
+                ref.compare_exchange_strong(expected_, desired);
+
+                // If the update succeded, return the previous value of `ref`.
+                // Otherwise, return the current value of `ref`.
+                return expected_;
             };
 
             if(auto ptr = alpaka::detail::get_global_ptr(addr); ptr != nullptr)
diff --git a/include/alpaka/core/Common.hpp b/include/alpaka/core/Common.hpp
index 50b74df690be..4b6b64de2e1c 100644
--- a/include/alpaka/core/Common.hpp
+++ b/include/alpaka/core/Common.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner
+/* Copyright 2023 Axel Huebl, Benjamin Worpitz, Matthias Werner, Andrea Bocci
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -117,6 +117,8 @@
 #if((BOOST_LANG_CUDA && BOOST_COMP_CLANG_CUDA) || (BOOST_LANG_CUDA && BOOST_COMP_NVCC && BOOST_ARCH_PTX)              \
     || BOOST_LANG_HIP)
 #    define ALPAKA_STATIC_ACC_MEM_GLOBAL __device__
+#elif defined(ALPAKA_ACC_SYCL_ENABLED)
+#    define ALPAKA_STATIC_ACC_MEM_GLOBAL _Pragma("GCC error \"The SYCL backend does not support global device variables.\""))
 #else
 #    define ALPAKA_STATIC_ACC_MEM_GLOBAL
 #endif
@@ -153,6 +155,8 @@
 #if((BOOST_LANG_CUDA && BOOST_COMP_CLANG_CUDA) || (BOOST_LANG_CUDA && BOOST_COMP_NVCC && BOOST_ARCH_PTX)              \
     || BOOST_LANG_HIP)
 #    define ALPAKA_STATIC_ACC_MEM_CONSTANT __constant__
+#elif defined(ALPAKA_ACC_SYCL_ENABLED)
+#    define ALPAKA_STATIC_ACC_MEM_CONSTANT _Pragma("GCC error \"The SYCL backend does not support global device constants.\""))
 #else
 #    define ALPAKA_STATIC_ACC_MEM_CONSTANT
 #endif
diff --git a/include/alpaka/core/Sycl.hpp b/include/alpaka/core/Sycl.hpp
index 983140770217..409f9951a48b 100644
--- a/include/alpaka/core/Sycl.hpp
+++ b/include/alpaka/core/Sycl.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2022 Jan Stephan, Luca Ferragina, Aurora Perego
+/* Copyright 2023 Jan Stephan, Luca Ferragina, Aurora Perego, Andrea Bocci
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -34,13 +34,23 @@ using AlpakaFormat = char const* [[clang::opencl_constant]];
 #        else
 using AlpakaFormat = char const*;
 #        endif
+
+#        if BOOST_COMP_CLANG
+#            pragma clang diagnostic push
+#            pragma clang diagnostic ignored "-Wgnu-zero-variadic-macro-arguments"
+#        endif
+
 #        define printf(FORMAT, ...)                                                                                   \
             do                                                                                                        \
             {                                                                                                         \
-                static const auto format = AlpakaFormat{FORMAT};                                                      \
+                static auto const format = AlpakaFormat{FORMAT};                                                      \
                 sycl::ext::oneapi::experimental::printf(format, ##__VA_ARGS__);                                       \
             } while(false)
 
+#        if BOOST_COMP_CLANG
+#            pragma clang diagnostic pop
+#        endif
+
 #    endif
 
 // SYCL vector types trait specializations.
diff --git a/include/alpaka/math/MathGenericSycl.hpp b/include/alpaka/math/MathGenericSycl.hpp
index d48b32b1704f..3449a0dbcc15 100644
--- a/include/alpaka/math/MathGenericSycl.hpp
+++ b/include/alpaka/math/MathGenericSycl.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2022 Jan Stephan, Luca Ferragina
+/* Copyright 2023 Jan Stephan, Luca Ferragina, Andrea Bocci
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -279,7 +279,7 @@ namespace alpaka::math::trait
             if constexpr(std::is_integral_v<TArgument>)
                 return sycl::atan2(0.0, static_cast<double>(argument));
             else if constexpr(std::is_floating_point_v<TArgument>)
-                return sycl::atan2(TArgument{0.0}, argument);
+                return sycl::atan2(static_cast<TArgument>(0.0), argument);
             else
                 static_assert(!sizeof(TArgument), "Unsupported data type");
         }
@@ -333,9 +333,11 @@ namespace alpaka::math::trait
         Tx,
         std::enable_if_t<std::is_floating_point_v<Ty> && std::is_floating_point_v<Tx>>>
     {
+        using TCommon = std::common_type_t<Ty, Tx>;
+
         auto operator()(math::Atan2GenericSycl const&, Ty const& y, Tx const& x)
         {
-            return sycl::atan2(y, x);
+            return sycl::atan2(static_cast<TCommon>(y), static_cast<TCommon>(x));
         }
     };
 
@@ -432,9 +434,11 @@ namespace alpaka::math::trait
         Ty,
         std::enable_if_t<std::is_floating_point_v<Tx> && std::is_floating_point_v<Ty>>>
     {
+        using TCommon = std::common_type_t<Tx, Ty>;
+
         auto operator()(math::FmodGenericSycl const&, Tx const& x, Ty const& y)
         {
-            return sycl::fmod(x, y);
+            return sycl::fmod(static_cast<TCommon>(x), static_cast<TCommon>(y));
         }
     };
 
@@ -444,7 +448,7 @@ namespace alpaka::math::trait
     {
         auto operator()(math::IsfiniteGenericSycl const&, TArg const& arg)
         {
-            return sycl::isfinite(arg);
+            return static_cast<bool>(sycl::isfinite(arg));
         }
     };
 
@@ -454,7 +458,7 @@ namespace alpaka::math::trait
     {
         auto operator()(math::IsinfGenericSycl const&, TArg const& arg)
         {
-            return sycl::isinf(arg);
+            return static_cast<bool>(sycl::isinf(arg));
         }
     };
 
@@ -464,7 +468,7 @@ namespace alpaka::math::trait
     {
         auto operator()(math::IsnanGenericSycl const&, TArg const& arg)
         {
-            return sycl::isnan(arg);
+            return static_cast<bool>(sycl::isnan(arg));
         }
     };
 
@@ -482,18 +486,20 @@ namespace alpaka::math::trait
     template<typename Tx, typename Ty>
     struct Max<math::MaxGenericSycl, Tx, Ty, std::enable_if_t<std::is_arithmetic_v<Tx> && std::is_arithmetic_v<Ty>>>
     {
+        using TCommon = std::common_type_t<Tx, Ty>;
+
         auto operator()(math::MaxGenericSycl const&, Tx const& x, Ty const& y)
         {
             if constexpr(std::is_integral_v<Tx> && std::is_integral_v<Ty>)
-                return sycl::max(x, y);
+                return sycl::max(static_cast<TCommon>(x), static_cast<TCommon>(y));
             else if constexpr(std::is_floating_point_v<Tx> && std::is_floating_point_v<Ty>)
-                return sycl::fmax(x, y);
+                return sycl::fmax(static_cast<TCommon>(x), static_cast<TCommon>(y));
             else if constexpr(
                 (std::is_floating_point_v<Tx> && std::is_integral_v<Ty>)
                 || (std::is_integral_v<Tx> && std::is_floating_point_v<Ty>) )
                 return sycl::fmax(static_cast<double>(x), static_cast<double>(y)); // mirror CUDA back-end
             else
-                static_assert(!sizeof(Tx), "Unsupported data type");
+                static_assert(!sizeof(Tx), "Unsupported data types");
         }
     };
 
@@ -512,7 +518,7 @@ namespace alpaka::math::trait
                 || (std::is_integral_v<Tx> && std::is_floating_point_v<Ty>) )
                 return sycl::fmin(static_cast<double>(x), static_cast<double>(y)); // mirror CUDA back-end
             else
-                static_assert(!sizeof(Tx), "Unsupported data type");
+                static_assert(!sizeof(Tx), "Unsupported data types");
         }
     };
 
@@ -524,9 +530,11 @@ namespace alpaka::math::trait
         TExp,
         std::enable_if_t<std::is_floating_point_v<TBase> && std::is_floating_point_v<TExp>>>
     {
+        using TCommon = std::common_type_t<TBase, TExp>;
+
         auto operator()(math::PowGenericSycl const&, TBase const& base, TExp const& exp)
         {
-            return sycl::pow(base, exp);
+            return sycl::pow(static_cast<TCommon>(base), static_cast<TCommon>(exp));
         }
     };
 
@@ -538,9 +546,11 @@ namespace alpaka::math::trait
         Ty,
         std::enable_if_t<std::is_floating_point_v<Tx> && std::is_floating_point_v<Ty>>>
     {
+        using TCommon = std::common_type_t<Tx, Ty>;
+
         auto operator()(math::RemainderGenericSycl const&, Tx const& x, Ty const& y)
         {
-            return sycl::remainder(x, y);
+            return sycl::remainder(static_cast<TCommon>(x), static_cast<TCommon>(y));
         }
     };
 
@@ -580,9 +590,9 @@ namespace alpaka::math::trait
     {
         auto operator()(math::RsqrtGenericSycl const&, TArg const& arg)
         {
-            if(std::is_floating_point_v<TArg>)
+            if constexpr(std::is_floating_point_v<TArg>)
                 return sycl::rsqrt(arg);
-            else if(std::is_integral_v<TArg>)
+            else if constexpr(std::is_integral_v<TArg>)
                 return sycl::rsqrt(static_cast<double>(arg)); // mirror CUDA back-end and use double for ints
             else
                 static_assert(!sizeof(TArg), "Unsupported data type");
diff --git a/include/alpaka/mem/view/ViewPlainPtr.hpp b/include/alpaka/mem/view/ViewPlainPtr.hpp
index a092a6b26f60..6db8cdc7b7d2 100644
--- a/include/alpaka/mem/view/ViewPlainPtr.hpp
+++ b/include/alpaka/mem/view/ViewPlainPtr.hpp
@@ -1,13 +1,15 @@
-/* Copyright 2023 Benjamin Worpitz, Matthias Werner, René Widera, Bernhard Manfred Gruber, Jan Stephan
+/* Copyright 2023 Benjamin Worpitz, Matthias Werner, René Widera, Bernhard Manfred Gruber, Jan Stephan, Andrea Bocci
  * SPDX-License-Identifier: MPL-2.0
  */
 
 #pragma once
 
 #include "alpaka/dev/DevCpu.hpp"
+#include "alpaka/dev/DevGenericSycl.hpp"
 #include "alpaka/dev/DevUniformCudaHipRt.hpp"
 #include "alpaka/mem/view/Traits.hpp"
 #include "alpaka/mem/view/ViewAccessOps.hpp"
+#include "alpaka/meta/DependentFalseType.hpp"
 #include "alpaka/vec/Vec.hpp"
 
 #include <type_traits>
@@ -178,6 +180,17 @@ namespace alpaka
         };
 #endif
 
+#if defined(ALPAKA_ACC_SYCL_ENABLED)
+        //! The SYCL device CreateStaticDevMemView trait specialization.
+        template<typename TPltf>
+        struct CreateStaticDevMemView<DevGenericSycl<TPltf>>
+        {
+            static_assert(
+                meta::DependentFalseType<TPltf>::value,
+                "The SYCL backend does not support global device variables.");
+        };
+#endif
+
         //! The CPU device CreateViewPlainPtr trait specialization.
         template<>
         struct CreateViewPlainPtr<DevCpu>
diff --git a/include/alpaka/rand/RandGenericSycl.hpp b/include/alpaka/rand/RandGenericSycl.hpp
index 0901090853a8..35c788b449a7 100644
--- a/include/alpaka/rand/RandGenericSycl.hpp
+++ b/include/alpaka/rand/RandGenericSycl.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2023 Luca Ferragina, Aurora Perego
+/* Copyright 2023 Luca Ferragina, Aurora Perego, Andrea Bocci
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -13,9 +13,25 @@
 
 // Backend specific imports.
 #    include <CL/sycl.hpp>
+#    if BOOST_COMP_CLANG
+#        pragma clang diagnostic push
+#        pragma clang diagnostic ignored "-Wcast-align"
+#        pragma clang diagnostic ignored "-Wcast-qual"
+#        pragma clang diagnostic ignored "-Wextra-semi"
+#        pragma clang diagnostic ignored "-Wfloat-equal"
+#        pragma clang diagnostic ignored "-Wold-style-cast"
+#        pragma clang diagnostic ignored "-Wreserved-identifier"
+#        pragma clang diagnostic ignored "-Wreserved-macro-identifier"
+#        pragma clang diagnostic ignored "-Wsign-compare"
+#        pragma clang diagnostic ignored "-Wundef"
+#    endif
 #    include <oneapi/mkl/rng.hpp>
 
 #    include <oneapi/dpl/random>
+#    if BOOST_COMP_CLANG
+#        pragma clang diagnostic pop
+#    endif
+
 #    include <type_traits>
 
 namespace alpaka::rand
@@ -219,8 +235,8 @@ namespace alpaka::rand
             static auto createDefault(
                 RandGenericSycl<TDim> const& rand,
                 std::uint32_t const& seed = 0,
-                std::uint32_t const& subsequence = 0,
-                std::uint32_t const& offset = 0) -> sycl_rand::Minstd<TDim>
+                std::uint32_t const& /* subsequence */ = 0,
+                std::uint32_t const& /* offset */ = 0) -> sycl_rand::Minstd<TDim>
             {
                 return {rand, seed};
             }
diff --git a/include/alpaka/warp/WarpGenericSycl.hpp b/include/alpaka/warp/WarpGenericSycl.hpp
index 4df62480e001..017c7a252fb4 100644
--- a/include/alpaka/warp/WarpGenericSycl.hpp
+++ b/include/alpaka/warp/WarpGenericSycl.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2022 Jan Stephan, Luca Ferragina
+/* Copyright 2023 Jan Stephan, Luca Ferragina, Andrea Bocci
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -43,12 +43,17 @@ namespace alpaka::warp::trait
     template<typename TDim>
     struct Activemask<warp::WarpGenericSycl<TDim>>
     {
+        // FIXME This should be std::uint64_t on AMD GCN architectures.
         static auto activemask(warp::WarpGenericSycl<TDim> const& warp) -> std::uint32_t
         {
             // SYCL has no way of querying this. Since sub-group functions have to be executed in convergent code
             // regions anyway we return the full mask.
             auto const sub_group = warp.m_item.get_sub_group();
-            return sycl::ext::oneapi::group_ballot(sub_group, true);
+            auto const mask = sycl::ext::oneapi::group_ballot(sub_group, true);
+            // FIXME This should be std::uint64_t on AMD GCN architectures.
+            std::uint32_t bits = 0;
+            mask.extract_bits(bits);
+            return bits;
         }
     };
 
@@ -75,10 +80,15 @@ namespace alpaka::warp::trait
     template<typename TDim>
     struct Ballot<warp::WarpGenericSycl<TDim>>
     {
-        static auto ballot(warp::WarpGenericSycl<TDim> const& warp, std::int32_t predicate)
+        // FIXME This should be std::uint64_t on AMD GCN architectures.
+        static auto ballot(warp::WarpGenericSycl<TDim> const& warp, std::int32_t predicate) -> std::uint32_t
         {
             auto const sub_group = warp.m_item.get_sub_group();
-            return sycl::ext::oneapi::group_ballot(sub_group, static_cast<bool>(predicate));
+            auto const mask = sycl::ext::oneapi::group_ballot(sub_group, static_cast<bool>(predicate));
+            // FIXME This should be std::uint64_t on AMD GCN architectures.
+            std::uint32_t bits = 0;
+            mask.extract_bits(bits);
+            return bits;
         }
     };
 
@@ -88,6 +98,10 @@ namespace alpaka::warp::trait
         template<typename T>
         static auto shfl(warp::WarpGenericSycl<TDim> const& warp, T value, std::int32_t srcLane, std::int32_t width)
         {
+            ALPAKA_ASSERT_OFFLOAD(width > 0);
+            ALPAKA_ASSERT_OFFLOAD(srcLane < width);
+            ALPAKA_ASSERT_OFFLOAD(srcLane >= 0);
+
             /* If width < srcLane the sub-group needs to be split into assumed subdivisions. The first item of each
                subdivision has the assumed index 0. The srcLane index is relative to the subdivisions.
 
@@ -95,14 +109,9 @@ namespace alpaka::warp::trait
                The first starts at sub-group index 0 and the second at sub-group index 16. For srcLane = 4 the
                first subdivision will access the value at sub-group index 4 and the second at sub-group index 20. */
             auto const actual_group = warp.m_item.get_sub_group();
-            auto const actual_item_id = actual_group.get_local_linear_id();
-
-            auto const assumed_group_id = actual_item_id / width;
-            auto const assumed_item_id = actual_item_id % width;
-
-            auto const assumed_src_id = static_cast<std::size_t>(srcLane % width);
-            auto const actual_src_id = assumed_src_id + assumed_group_id * width;
-
+            auto const actual_item_id = static_cast<std::int32_t>(actual_group.get_local_linear_id());
+            auto const actual_group_id = actual_item_id / width;
+            auto const actual_src_id = static_cast<std::size_t>(srcLane + actual_group_id * width);
             auto const src = sycl::id<1>{actual_src_id};
 
             return sycl::select_from_group(actual_group, value, src);
diff --git a/include/alpaka/workdiv/WorkDivGenericSycl.hpp b/include/alpaka/workdiv/WorkDivGenericSycl.hpp
index cb4f003fa211..c7a2979815c4 100644
--- a/include/alpaka/workdiv/WorkDivGenericSycl.hpp
+++ b/include/alpaka/workdiv/WorkDivGenericSycl.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2022 Jan Stephan, Luca Ferragina
+/* Copyright 2023 Jan Stephan, Luca Ferragina, Andrea Bocci
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -18,6 +18,8 @@ namespace alpaka
     template<typename TDim, typename TIdx>
     class WorkDivGenericSycl : public concepts::Implements<ConceptWorkDiv, WorkDivGenericSycl<TDim, TIdx>>
     {
+        static_assert(TDim::value > 0, "The SYCL work division must have a dimension greater than zero.");
+
     public:
         using WorkDivBase = WorkDivGenericSycl;
 
@@ -55,7 +57,9 @@ namespace alpaka::trait
         //! \return The number of blocks in each dimension of the grid.
         static auto getWorkDiv(WorkDivGenericSycl<TDim, TIdx> const& workDiv) -> Vec<TDim, TIdx>
         {
-            if constexpr(TDim::value == 1)
+            if constexpr(TDim::value == 0)
+                return Vec<TDim, TIdx>{};
+            else if constexpr(TDim::value == 1)
                 return Vec<TDim, TIdx>{static_cast<TIdx>(workDiv.my_item.get_group_range(0))};
             else if constexpr(TDim::value == 2)
             {
@@ -80,7 +84,9 @@ namespace alpaka::trait
         //! \return The number of threads in each dimension of a block.
         static auto getWorkDiv(WorkDivGenericSycl<TDim, TIdx> const& workDiv) -> Vec<TDim, TIdx>
         {
-            if constexpr(TDim::value == 1)
+            if constexpr(TDim::value == 0)
+                return Vec<TDim, TIdx>{};
+            else if constexpr(TDim::value == 1)
                 return Vec<TDim, TIdx>{static_cast<TIdx>(workDiv.my_item.get_local_range(0))};
             else if constexpr(TDim::value == 2)
             {
@@ -102,7 +108,7 @@ namespace alpaka::trait
     template<typename TDim, typename TIdx>
     struct GetWorkDiv<WorkDivGenericSycl<TDim, TIdx>, origin::Thread, unit::Elems>
     {
-        //! \return The number of blocks in each dimension of the grid.
+        //! \return The number of elements in each dimension of the thread.
         static auto getWorkDiv(WorkDivGenericSycl<TDim, TIdx> const& workDiv) -> Vec<TDim, TIdx>
         {
             return workDiv.m_threadElemExtent;
diff --git a/test/unit/mem/view/src/ViewStaticAccMem.cpp b/test/unit/mem/view/src/ViewStaticAccMem.cpp
index f26a88fdd8a0..ff8f0e48b245 100644
--- a/test/unit/mem/view/src/ViewStaticAccMem.cpp
+++ b/test/unit/mem/view/src/ViewStaticAccMem.cpp
@@ -1,4 +1,4 @@
-/* Copyright 2022 Axel Huebl, Benjamin Worpitz, Matthias Werner, Bernhard Manfred Gruber, Jan Stephan
+/* Copyright 2023 Axel Huebl, Benjamin Worpitz, Matthias Werner, Bernhard Manfred Gruber, Jan Stephan, Andrea Bocci
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -16,6 +16,8 @@ using Elem = std::uint32_t;
 using Dim = alpaka::DimInt<2u>;
 using Idx = std::uint32_t;
 
+#if !defined(ALPAKA_ACC_SYCL_ENABLED)
+
 // These forward declarations are only necessary when you want to access those variables
 // from a different compilation unit and should be moved to a common header.
 // Here they are used to silence clang`s -Wmissing-variable-declarations warning
@@ -40,10 +42,13 @@ struct StaticDeviceMemoryTestKernel
     }
 };
 
+#endif // !defined(ALPAKA_ACC_SYCL_ENABLED)
+
 using TestAccs = alpaka::test::EnabledAccs<Dim, Idx>;
 
 TEMPLATE_LIST_TEST_CASE("staticDeviceMemoryGlobal", "[viewStaticAccMem]", TestAccs)
 {
+#if !defined(ALPAKA_ACC_SYCL_ENABLED)
     using Acc = TestType;
     using DevAcc = alpaka::Dev<Acc>;
 
@@ -75,8 +80,16 @@ TEMPLATE_LIST_TEST_CASE("staticDeviceMemoryGlobal", "[viewStaticAccMem]", TestAc
 
         REQUIRE(fixture(kernel, alpaka::getPtrNative(viewConstantMemUninitialized)));
     }
+
+#else // !defined(ALPAKA_ACC_SYCL_ENABLED)
+
+    WARN("The SYCL backend does not support global device variables.");
+
+#endif // !defined(ALPAKA_ACC_SYCL_ENABLED)
 }
 
+#if !defined(ALPAKA_ACC_SYCL_ENABLED)
+
 // These forward declarations are only necessary when you want to access those variables
 // from a different compilation unit and should be moved to a common header.
 // Here they are used to silence clang`s -Wmissing-variable-declarations warning
@@ -84,8 +97,11 @@ TEMPLATE_LIST_TEST_CASE("staticDeviceMemoryGlobal", "[viewStaticAccMem]", TestAc
 extern ALPAKA_STATIC_ACC_MEM_GLOBAL Elem g_globalMemory2DUninitialized[3][2];
 ALPAKA_STATIC_ACC_MEM_GLOBAL Elem g_globalMemory2DUninitialized[3][2];
 
+#endif // !defined(ALPAKA_ACC_SYCL_ENABLED)
+
 TEMPLATE_LIST_TEST_CASE("staticDeviceMemoryConstant", "[viewStaticAccMem]", TestAccs)
 {
+#if !defined(ALPAKA_ACC_SYCL_ENABLED)
     using Acc = TestType;
     using DevAcc = alpaka::Dev<Acc>;
 
@@ -117,4 +133,10 @@ TEMPLATE_LIST_TEST_CASE("staticDeviceMemoryConstant", "[viewStaticAccMem]", Test
 
         REQUIRE(fixture(kernel, alpaka::getPtrNative(viewGlobalMemUninitialized)));
     }
+
+#else // !defined(ALPAKA_ACC_SYCL_ENABLED)
+
+    WARN("The SYCL backend does not support global device constants.");
+
+#endif // !defined(ALPAKA_ACC_SYCL_ENABLED)
 }
diff --git a/test/unit/rand/src/RandTest.cpp b/test/unit/rand/src/RandTest.cpp
index 7bea6e930ea6..05311d438965 100644
--- a/test/unit/rand/src/RandTest.cpp
+++ b/test/unit/rand/src/RandTest.cpp
@@ -1,5 +1,5 @@
 /* Copyright 2023 Axel Hübl, Benjamin Worpitz, Matthias Werner, René Widera, Jan Stephan, Bernhard Manfred Gruber,
- *                Sergei Bastrakov
+ *                Sergei Bastrakov, Andrea Bocci
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -60,7 +60,7 @@ class RandTestKernel
         auto genDefault = alpaka::rand::engine::createDefault(acc, 12345u, 6789u);
         genNumbers(acc, success, genDefault);
 
-#if !defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+#if !defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !defined(ALPAKA_ACC_SYCL_ENABLED)
         // TODO: These ifdefs are wrong: They will reduce the test to the
         // smallest common denominator from all enabled backends
         // std::random_device

From 085365ab8fe8cb4e07d1e1f46c708a642419966f Mon Sep 17 00:00:00 2001
From: Jan Stephan <j.stephan@hzdr.de>
Date: Tue, 20 Jun 2023 13:40:00 +0200
Subject: [PATCH 04/10] Rewrite the SYCL backend for the SYCL 2020 standard and
 USM allocations (part 4)

Update the documentation related to FPGAs.

Various fixes and updates to the SYCL backend and tests, the copyright
information and code formatting.
---
 README_SYCL.md                                |   2 +-
 .../dyn/BlockSharedMemDynGenericSycl.hpp      |   4 +-
 .../shared/st/BlockSharedMemStGenericSycl.hpp |   1 +
 include/alpaka/core/BoostPredef.hpp           |  10 +-
 include/alpaka/core/Common.hpp                |   2 +-
 include/alpaka/math/MathGenericSycl.hpp       |   2 +-
 include/alpaka/mem/buf/sycl/Copy.hpp          |   7 +-
 include/alpaka/mem/view/ViewPlainPtr.hpp      |   3 +-
 include/alpaka/pltf/PltfCpuSyclIntel.hpp      |  11 +-
 include/alpaka/pltf/PltfGenericSycl.hpp       | 137 +++++++++---------
 include/alpaka/pltf/PltfGpuSyclIntel.hpp      |  13 +-
 include/alpaka/rand/RandGenericSycl.hpp       |   2 +-
 include/alpaka/test/dim/TestDims.hpp          |   2 +-
 .../test/event/EventHostManualTrigger.hpp     |   3 +-
 include/alpaka/test/queue/Queue.hpp           |   2 +-
 test/unit/atomic/src/AtomicTest.cpp           |   3 +-
 thirdParty/CMakeLists.txt                     |   2 +-
 17 files changed, 104 insertions(+), 102 deletions(-)

diff --git a/README_SYCL.md b/README_SYCL.md
index 82f620dca193..9846f1b8768b 100644
--- a/README_SYCL.md
+++ b/README_SYCL.md
@@ -92,7 +92,7 @@ These can be used interchangeably (some restrictions apply - see below) with the
 
 ### Restrictions
 
-* The FPGA back-ends (both vendors) cannot be used together with the Intel CPU / GPU back-ends. This is because of the different compilation trajectory required for FPGAs and is unlikely to be fixed anytime soon. See [here](https://www.intel.com/content/www/us/en/develop/documentation/oneapi-programming-guide/top/programming-interface/fpga-flow/why-is-fpga-compilation-different.html) for an explanation.
+* The Intel FPGA back-end cannot be used together with the Intel CPU / GPU back-ends. This is because of the different compilation trajectory required for FPGAs and is unlikely to be fixed anytime soon. See [here](https://www.intel.com/content/www/us/en/develop/documentation/oneapi-programming-guide/top/programming-interface/fpga-flow/why-is-fpga-compilation-different.html) for an explanation.
 * Similar to the CUDA and HIP back-ends the SYCL back-end only supports up to three kernel dimensions.
 * Some Intel GPUs do not support the `double` type for device code. alpaka will not check this.
   You can enable software emulation for `double` precision types with
diff --git a/include/alpaka/block/shared/dyn/BlockSharedMemDynGenericSycl.hpp b/include/alpaka/block/shared/dyn/BlockSharedMemDynGenericSycl.hpp
index d13b78249867..1e4671ebbf72 100644
--- a/include/alpaka/block/shared/dyn/BlockSharedMemDynGenericSycl.hpp
+++ b/include/alpaka/block/shared/dyn/BlockSharedMemDynGenericSycl.hpp
@@ -35,9 +35,7 @@ namespace alpaka::trait
     {
         static auto getMem(BlockSharedMemDynGenericSycl const& shared) -> T*
         {
-            auto void_ptr = sycl::multi_ptr<void, sycl::access::address_space::local_space>{shared.m_accessor};
-            auto t_ptr = static_cast<sycl::multi_ptr<T, sycl::access::address_space::local_space>>(void_ptr);
-            return t_ptr.get();
+            return reinterpret_cast<T*>(shared.m_accessor.get_pointer().get());
         }
     };
 } // namespace alpaka::trait
diff --git a/include/alpaka/block/shared/st/BlockSharedMemStGenericSycl.hpp b/include/alpaka/block/shared/st/BlockSharedMemStGenericSycl.hpp
index baae7b0c8825..abda79f7b049 100644
--- a/include/alpaka/block/shared/st/BlockSharedMemStGenericSycl.hpp
+++ b/include/alpaka/block/shared/st/BlockSharedMemStGenericSycl.hpp
@@ -7,6 +7,7 @@
 #include "alpaka/block/shared/st/Traits.hpp"
 #include "alpaka/block/shared/st/detail/BlockSharedMemStMemberImpl.hpp"
 
+#include <cstddef>
 #include <cstdint>
 
 #ifdef ALPAKA_ACC_SYCL_ENABLED
diff --git a/include/alpaka/core/BoostPredef.hpp b/include/alpaka/core/BoostPredef.hpp
index 39621e964b0f..9a54f33200f8 100644
--- a/include/alpaka/core/BoostPredef.hpp
+++ b/include/alpaka/core/BoostPredef.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2022 Benjamin Worpitz, Matthias Werner, Jan Stephan
+/* Copyright 2023 Benjamin Worpitz, Matthias Werner, Jan Stephan
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -68,3 +68,11 @@
 #    undef BOOST_COMP_PGI
 #    define BOOST_COMP_PGI BOOST_COMP_PGI_EMULATED
 #endif
+
+// Intel LLVM compiler detection
+#if !defined(BOOST_COMP_ICPX)
+#    if defined(SYCL_LANGUAGE_VERSION) && defined(__INTEL_LLVM_COMPILER)
+// The version string for icpx 2023.1.0 is 20230100. In Boost.Predef this becomes (53,1,0).
+#        define BOOST_COMP_ICPX BOOST_PREDEF_MAKE_YYYYMMDD(__INTEL_LLVM_COMPILER)
+#    endif
+#endif
diff --git a/include/alpaka/core/Common.hpp b/include/alpaka/core/Common.hpp
index 4b6b64de2e1c..bc7428af08db 100644
--- a/include/alpaka/core/Common.hpp
+++ b/include/alpaka/core/Common.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2023 Axel Huebl, Benjamin Worpitz, Matthias Werner, Andrea Bocci
+/* Copyright 2023 Axel Hübl, Benjamin Worpitz, Matthias Werner, Jan Stephan, René Widera, Andrea Bocci
  * SPDX-License-Identifier: MPL-2.0
  */
 
diff --git a/include/alpaka/math/MathGenericSycl.hpp b/include/alpaka/math/MathGenericSycl.hpp
index 3449a0dbcc15..ba99befbf66a 100644
--- a/include/alpaka/math/MathGenericSycl.hpp
+++ b/include/alpaka/math/MathGenericSycl.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2023 Jan Stephan, Luca Ferragina, Andrea Bocci
+/* Copyright 2023 Jan Stephan, Sergei Bastrakov, René Widera, Luca Ferragina, Andrea Bocci
  * SPDX-License-Identifier: MPL-2.0
  */
 
diff --git a/include/alpaka/mem/buf/sycl/Copy.hpp b/include/alpaka/mem/buf/sycl/Copy.hpp
index 6ecb8d5f4836..7373125c63c6 100644
--- a/include/alpaka/mem/buf/sycl/Copy.hpp
+++ b/include/alpaka/mem/buf/sycl/Copy.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2023 Jan Stephan, Luca Ferragina, Aurora Perego
+/* Copyright 2023 Jan Stephan, Bernhard Manfred Gruber, Luca Ferragina, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -146,7 +146,10 @@ namespace alpaka::detail
         using Elem = alpaka::Elem<TViewSrc>;
 
         template<typename TViewDstFwd>
-        ALPAKA_FN_HOST TaskCopySycl(TViewDstFwd&& viewDst, TViewSrc const& viewSrc, TExtent const& extent)
+        ALPAKA_FN_HOST TaskCopySycl(
+            TViewDstFwd&& viewDst,
+            TViewSrc const& viewSrc,
+            [[maybe_unused]] TExtent const& extent)
             : m_dstMemNative(reinterpret_cast<void*>(getPtrNative(viewDst)))
             , m_srcMemNative(reinterpret_cast<void const*>(getPtrNative(viewSrc)))
         {
diff --git a/include/alpaka/mem/view/ViewPlainPtr.hpp b/include/alpaka/mem/view/ViewPlainPtr.hpp
index 6db8cdc7b7d2..7416385ea068 100644
--- a/include/alpaka/mem/view/ViewPlainPtr.hpp
+++ b/include/alpaka/mem/view/ViewPlainPtr.hpp
@@ -1,4 +1,5 @@
-/* Copyright 2023 Benjamin Worpitz, Matthias Werner, René Widera, Bernhard Manfred Gruber, Jan Stephan, Andrea Bocci
+/* Copyright 2023 Benjamin Worpitz, Matthias Werner, René Widera, Sergei Bastrakov, Bernhard Manfred Gruber,
+ *                Jan Stephan, Andrea Bocci
  * SPDX-License-Identifier: MPL-2.0
  */
 
diff --git a/include/alpaka/pltf/PltfCpuSyclIntel.hpp b/include/alpaka/pltf/PltfCpuSyclIntel.hpp
index 1c28a1288610..0c6489879c82 100644
--- a/include/alpaka/pltf/PltfCpuSyclIntel.hpp
+++ b/include/alpaka/pltf/PltfCpuSyclIntel.hpp
@@ -18,13 +18,7 @@ namespace alpaka
 {
     namespace detail
     {
-        // Prevent clang from annoying us with warnings about emitting too many vtables. These are discarded by the
-        // linker anyway.
-#    if BOOST_COMP_CLANG
-#        pragma clang diagnostic push
-#        pragma clang diagnostic ignored "-Wweak-vtables"
-#    endif
-        struct IntelCpuSelector final
+        struct IntelCpuSelector
         {
             auto operator()(sycl::device const& dev) const -> int
             {
@@ -34,9 +28,6 @@ namespace alpaka
                 return is_intel_cpu ? 1 : -1;
             }
         };
-#    if BOOST_COMP_CLANG
-#        pragma clang diagnostic pop
-#    endif
     } // namespace detail
 
     //! The SYCL device manager.
diff --git a/include/alpaka/pltf/PltfGenericSycl.hpp b/include/alpaka/pltf/PltfGenericSycl.hpp
index bd7f351c17a8..914b3c7d80cb 100644
--- a/include/alpaka/pltf/PltfGenericSycl.hpp
+++ b/include/alpaka/pltf/PltfGenericSycl.hpp
@@ -127,7 +127,7 @@ namespace alpaka::trait
 #    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
             printDeviceProperties(sycl_dev);
 #    elif ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-            std::cout << __func__ << sycl_dev.get_info<info::device::name>() << '\n';
+            std::cout << __func__ << sycl_dev.template get_info<sycl::info::device::name>() << '\n';
 #    endif
             using SyclPltf = alpaka::PltfGenericSycl<TSelector>;
             return typename DevType<SyclPltf>::type{sycl_dev, platform.syclContext()};
@@ -189,82 +189,63 @@ namespace alpaka::trait
 
             std::cout << "SYCL version: " << device.get_info<sycl::info::device::version>() << '\n';
 
+#        if !defined(BOOST_COMP_ICPX)
+            // Not defined by Level Zero back-end
             std::cout << "Backend version: " << device.get_info<sycl::info::device::backend_version>() << '\n';
+#        endif
 
             std::cout << "Aspects: " << '\n';
-            auto const aspects = device.get_info<sycl::info::device::aspects>();
-            for(auto const& asp : aspects)
-            {
-                switch(asp)
-                {
-                // Ignore the hardware types - we already have queried this info above
-                case sycl::aspect::cpu:
-                case sycl::aspect::gpu:
-                case sycl::aspect::accelerator:
-                case sycl::aspect::custom:
-                    break;
+            std::cout.flush();
 
-                case sycl::aspect::emulated:
-                    std::cout << "\t* emulated\n";
-                    break;
+#        if defined(BOOST_COMP_ICPX)
+#            if BOOST_COMP_ICPX >= BOOST_VERSION_NUMBER(53, 2, 0)
+            // These aspects are missing from oneAPI versions < 2023.2.0
+            if(device.has(sycl::aspect::emulated))
+                std::cout << "\t* emulated\n";
 
-                case sycl::aspect::host_debugabble:
-                    std::cout << "\t* debugabble using standard debuggers\n";
-                    break;
+            if(device.has(sycl::aspect::host_debuggable))
+                std::cout << "\t* debuggable using standard debuggers\n";
+#            endif
+#        endif
 
-                case sycl::aspect::fp16:
-                    std::cout << "\t* supports sycl::half precision\n";
-                    break;
+            if(device.has(sycl::aspect::fp16))
+                std::cout << "\t* supports sycl::half precision\n";
 
-                case sycl::aspect::fp64:
-                    std::cout << "\t* supports double precision\n";
-                    break;
+            if(device.has(sycl::aspect::fp64))
+                std::cout << "\t* supports double precision\n";
 
-                case sycl::aspect::atomic64:
-                    std::cout << "\t* supports 64-bit atomics\n";
-                    break;
+            if(device.has(sycl::aspect::atomic64))
+                std::cout << "\t* supports 64-bit atomics\n";
 
-                case sycl::aspect::image:
-                    std::cout << "\t* supports images\n";
-                    break;
+            if(device.has(sycl::aspect::image))
+                std::cout << "\t* supports images\n";
 
-                case sycl::aspect::online_compiler:
-                    std::cout << "\t* supports online compilation of device code\n";
-                    break;
+            if(device.has(sycl::aspect::online_compiler))
+                std::cout << "\t* supports online compilation of device code\n";
 
-                case sycl::aspect::online_linker:
-                    std::cout << "\t* supports online linking of device code\n";
-                    break;
+            if(device.has(sycl::aspect::online_linker))
+                std::cout << "\t* supports online linking of device code\n";
 
-                case sycl::aspect::queue_profiling:
-                    std::cout << "\t* supports queue profiling\n";
-                    break;
+            if(device.has(sycl::aspect::queue_profiling))
+                std::cout << "\t* supports queue profiling\n";
 
-                case sycl::aspect::usm_device_allocations:
-                    std::cout << "\t* supports explicit USM allocations\n";
-                    break;
+            if(device.has(sycl::aspect::usm_device_allocations))
+                std::cout << "\t* supports explicit USM allocations\n";
 
-                case sycl::aspect::usm_host_allocations:
-                    std::cout << "\t* can access USM memory allocated by sycl::usm::alloc::host\n";
-                    break;
+            if(device.has(sycl::aspect::usm_host_allocations))
+                std::cout << "\t* can access USM memory allocated by sycl::usm::alloc::host\n";
 
-                case sycl::aspect::usm_atomic_host_allocations:
-                    std::cout << "\t* can access USM memory allocated by sycl::usm::alloc::host atomically\n";
-                    break;
+            if(device.has(sycl::aspect::usm_atomic_host_allocations))
+                std::cout << "\t* can access USM memory allocated by sycl::usm::alloc::host atomically\n";
 
-                case sycl::aspect::usm_shared_allocations:
-                    std::cout << "\t* can access USM memory allocated by sycl::usm::alloc::shared\n";
-                    break;
+            if(device.has(sycl::aspect::usm_shared_allocations))
+                std::cout << "\t* can access USM memory allocated by sycl::usm::alloc::shared\n";
 
-                case sycl::aspect::usm_atomic_shared_allocations:
-                    std::cout << "\t* can access USM memory allocated by sycl::usm::alloc::shared atomically\n";
-                    break;
+            if(device.has(sycl::aspect::usm_atomic_shared_allocations))
+                std::cout << "\t* can access USM memory allocated by sycl::usm::alloc::shared atomically\n";
 
-                case sycl::aspect::usm_system_allocations:
-                    std::cout << "\t* can access memory allocated by the system allocator\n";
-                    break;
-                }
-            }
+            if(device.has(sycl::aspect::usm_system_allocations))
+                std::cout << "\t* can access memory allocated by the system allocator\n";
 
             std::cout << "Available compute units: " << device.get_info<sycl::info::device::max_compute_units>()
                       << '\n';
@@ -323,7 +304,7 @@ namespace alpaka::trait
             std::cout << "Native ISA vector width (float): "
                       << device.get_info<sycl::info::device::native_vector_width_float>() << '\n';
 
-            if(device.has_aspect(sycl::aspect::fp64))
+            if(device.has(sycl::aspect::fp64))
             {
                 std::cout << "Preferred native vector width (double): "
                           << device.get_info<sycl::info::device::preferred_vector_width_double>() << '\n';
@@ -332,7 +313,7 @@ namespace alpaka::trait
                           << device.get_info<sycl::info::device::native_vector_width_double>() << '\n';
             }
 
-            if(device.has_aspect(sycl::aspect::fp16))
+            if(device.has(sycl::aspect::fp16))
             {
                 std::cout << "Preferred native vector width (half): "
                           << device.get_info<sycl::info::device::preferred_vector_width_half>() << '\n';
@@ -349,7 +330,7 @@ namespace alpaka::trait
             std::cout << "Maximum size of memory object allocation: "
                       << device.get_info<sycl::info::device::max_mem_alloc_size>() << " bytes\n";
 
-            if(device.has_aspect(sycl::aspect::image))
+            if(device.has(sycl::aspect::image))
             {
                 std::cout << "Maximum number of simultaneous image object reads per kernel: "
                           << device.get_info<sycl::info::device::max_read_image_args>() << '\n';
@@ -417,7 +398,7 @@ namespace alpaka::trait
                 find_and_print(sycl::info::fp_config::soft_float);
             };
 
-            if(device.has_aspect(sycl::aspect::fp16))
+            if(device.has(sycl::aspect::fp16))
             {
                 auto const fp16_conf = device.get_info<sycl::info::device::half_fp_config>();
                 print_fp_config("Half", fp16_conf);
@@ -426,7 +407,7 @@ namespace alpaka::trait
             auto const fp32_conf = device.get_info<sycl::info::device::single_fp_config>();
             print_fp_config("Single", fp32_conf);
 
-            if(device.has_aspect(sycl::aspect::fp64))
+            if(device.has(sycl::aspect::fp64))
             {
                 auto const fp64_conf = device.get_info<sycl::info::device::double_fp_config>();
                 print_fp_config("Double", fp64_conf);
@@ -458,7 +439,7 @@ namespace alpaka::trait
                           << device.get_info<sycl::info::device::global_mem_cache_line_size>() << " bytes\n";
 
                 std::cout << "Global memory cache size: "
-                          << device.get_info<sycl::info::device::global_mem_cache_size>() / KiB << " KiB\n"
+                          << device.get_info<sycl::info::device::global_mem_cache_size>() / KiB << " KiB\n";
             }
 
             std::cout << "Global memory size: " << device.get_info<sycl::info::device::global_mem_size>() / MiB
@@ -516,6 +497,11 @@ namespace alpaka::trait
                     case sycl::memory_order::seq_cst:
                         std::cout << "seq_cst";
                         break;
+#        if defined(BOOST_COMP_ICPX)
+                    // Stop icpx from complaining about its own internals.
+                    case sycl::memory_order::__consume_unsupported:
+                        break;
+#        endif
                     }
                     std::cout << ", ";
                 }
@@ -526,9 +512,14 @@ namespace alpaka::trait
             auto const mem_orders = device.get_info<sycl::info::device::atomic_memory_order_capabilities>();
             print_memory_orders(mem_orders);
 
+#        if defined(BOOST_COMP_ICPX)
+#            if BOOST_COMP_ICPX >= BOOST_VERSION_NUMBER(53, 2, 0)
+            // Not implemented in oneAPI < 2023.2.0
             std::cout << "Supported memory orderings for sycl::atomic_fence: ";
             auto const fence_orders = device.get_info<sycl::info::device::atomic_fence_order_capabilities>();
             print_memory_orders(fence_orders);
+#            endif
+#        endif
 
             auto print_memory_scopes = [](std::vector<sycl::memory_scope> const& mem_scopes)
             {
@@ -565,9 +556,14 @@ namespace alpaka::trait
             auto const mem_scopes = device.get_info<sycl::info::device::atomic_memory_scope_capabilities>();
             print_memory_scopes(mem_scopes);
 
+#        if defined(BOOST_COMP_ICPX)
+#            if BOOST_COMP_ICPX >= BOOST_VERSION_NUMBER(53, 2, 0)
+            // Not implemented in oneAPI < 2023.2.0
             std::cout << "Supported memory scopes for sycl::atomic_fence: ";
             auto const fence_scopes = device.get_info<sycl::info::device::atomic_fence_scope_capabilities>();
             print_memory_scopes(fence_scopes);
+#            endif
+#        endif
 
             std::cout << "Device timer resolution: "
                       << device.get_info<sycl::info::device::profiling_timer_resolution>() << " ns\n";
@@ -607,6 +603,11 @@ namespace alpaka::trait
                         std::cout << "by affinity domain";
                         has_affinity_domains = true;
                         break;
+#        if defined(BOOST_COMP_ICPX)
+                    case sycl::info::partition_property::ext_intel_partition_by_cslice:
+                        std::cout << "by compute slice (Intel extension; deprecated)";
+                        break;
+#        endif
                     }
                     std::cout << ", ";
                 }
@@ -671,6 +672,12 @@ namespace alpaka::trait
                 case sycl::info::partition_property::partition_by_affinity_domain:
                     std::cout << "partitioned by affinity domain";
                     break;
+
+#        if defined(BOOST_COMP_ICPX)
+                case sycl::info::partition_property::ext_intel_partition_by_cslice:
+                    std::cout << "partitioned by compute slice (Intel extension; deprecated)";
+                    break;
+#        endif
                 }
                 std::cout << '\n';
 
diff --git a/include/alpaka/pltf/PltfGpuSyclIntel.hpp b/include/alpaka/pltf/PltfGpuSyclIntel.hpp
index be1ae4643810..2e5d7583aaab 100644
--- a/include/alpaka/pltf/PltfGpuSyclIntel.hpp
+++ b/include/alpaka/pltf/PltfGpuSyclIntel.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2023 Jan Stephan, Andrea Bocci, Luca Ferragina
+/* Copyright 2023 Jan Stephan, Luca Ferragina, Andrea Bocci
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -18,13 +18,7 @@ namespace alpaka
 {
     namespace detail
     {
-        // Prevent clang from annoying us with warnings about emitting too many vtables. These are discarded by the
-        // linker anyway.
-#    if BOOST_COMP_CLANG
-#        pragma clang diagnostic push
-#        pragma clang diagnostic ignored "-Wweak-vtables"
-#    endif
-        struct IntelGpuSelector final
+        struct IntelGpuSelector
         {
             auto operator()(sycl::device const& dev) const -> int
             {
@@ -34,9 +28,6 @@ namespace alpaka
                 return is_intel_gpu ? 1 : -1;
             }
         };
-#    if BOOST_COMP_CLANG
-#        pragma clang diagnostic pop
-#    endif
     } // namespace detail
 
     //! The SYCL device manager.
diff --git a/include/alpaka/rand/RandGenericSycl.hpp b/include/alpaka/rand/RandGenericSycl.hpp
index 35c788b449a7..7f4d3a3e1c65 100644
--- a/include/alpaka/rand/RandGenericSycl.hpp
+++ b/include/alpaka/rand/RandGenericSycl.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2023 Luca Ferragina, Aurora Perego, Andrea Bocci
+/* Copyright 2023 Luca Ferragina, Aurora Perego, Jan Stephan, Andrea Bocci
  * SPDX-License-Identifier: MPL-2.0
  */
 
diff --git a/include/alpaka/test/dim/TestDims.hpp b/include/alpaka/test/dim/TestDims.hpp
index 6350697fe7b3..395c97e5dcf3 100644
--- a/include/alpaka/test/dim/TestDims.hpp
+++ b/include/alpaka/test/dim/TestDims.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2022 Benjamin Worpitz, Andrea Bocci, Jan Stephan, Bernhard Manfred Gruber
+/* Copyright 2023 Benjamin Worpitz, Andrea Bocci, Jan Stephan, Bernhard Manfred Gruber
  * SPDX-License-Identifier: MPL-2.0
  */
 
diff --git a/include/alpaka/test/event/EventHostManualTrigger.hpp b/include/alpaka/test/event/EventHostManualTrigger.hpp
index d465555c0dc9..7016f45e6c45 100644
--- a/include/alpaka/test/event/EventHostManualTrigger.hpp
+++ b/include/alpaka/test/event/EventHostManualTrigger.hpp
@@ -1,4 +1,5 @@
-/* Copyright 2023 Benjamin Worpitz, Matthias Werner, Jan Stephan, Andrea Bocci, Bernhard Manfred Gruber
+/* Copyright 2023 Benjamin Worpitz, Matthias Werner, Jan Stephan, Jeffrey Kelling, Andrea Bocci,
+ *                Bernhard Manfred Gruber
  * SPDX-License-Identifier: MPL-2.0
  */
 
diff --git a/include/alpaka/test/queue/Queue.hpp b/include/alpaka/test/queue/Queue.hpp
index 07d8495da8df..2fc75d6dacd2 100644
--- a/include/alpaka/test/queue/Queue.hpp
+++ b/include/alpaka/test/queue/Queue.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2023 Benjamin Worpitz, Matthias Werner, Bernhard Manfred Gruber, Jan Stephan
+/* Copyright 2023 Benjamin Worpitz, Matthias Werner, René Widera, Bernhard Manfred Gruber, Jan Stephan
  * SPDX-License-Identifier: MPL-2.0
  */
 
diff --git a/test/unit/atomic/src/AtomicTest.cpp b/test/unit/atomic/src/AtomicTest.cpp
index b04fd281af6c..467dab651741 100644
--- a/test/unit/atomic/src/AtomicTest.cpp
+++ b/test/unit/atomic/src/AtomicTest.cpp
@@ -1,4 +1,5 @@
-/* Copyright 2023 Axel Hübl, Benjamin Worpitz, Matthias Werner, Jan Stephan, Bernhard Manfred Gruber, Antonio Di Pilato
+/* Copyright 2023 Axel Hübl, Benjamin Worpitz, Matthias Werner, Sergei Bastrakov, René Widera, Jan Stephan,
+ *                Bernhard Manfred Gruber, Antonio Di Pilato, Andrea Bocci
  * SPDX-License-Identifier: MPL-2.0
  */
 
diff --git a/thirdParty/CMakeLists.txt b/thirdParty/CMakeLists.txt
index b061c9b0678a..ae1347167e8e 100644
--- a/thirdParty/CMakeLists.txt
+++ b/thirdParty/CMakeLists.txt
@@ -1,5 +1,5 @@
 #
-# Copyright 2023 Jan Stephan
+# Copyright 2023 Jan Stephan, Bernhard Manfred Gruber, Andrea Bocci
 # SPDX-License-Identifier: MPL-2.0
 #
 

From 0fadd2abca03ce6809f8745a721c498fcc354d98 Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Tue, 20 Jun 2023 08:20:31 +0200
Subject: [PATCH 05/10] Generalise the SYCL CpuSelector to non-Intel CPUs

---
 README_SYCL.md                                |  6 ++--
 .../{AccCpuSyclIntel.hpp => AccCpuSycl.hpp}   | 36 +++++++++----------
 include/alpaka/acc/Tag.hpp                    |  4 +--
 include/alpaka/alpaka.hpp                     | 18 +++++-----
 .../{DevCpuSyclIntel.hpp => DevCpuSycl.hpp}   |  6 ++--
 ...EventCpuSyclIntel.hpp => EventCpuSycl.hpp} |  6 ++--
 include/alpaka/example/ExampleDefaultAcc.hpp  |  4 +--
 ...CpuSyclIntel.hpp => TaskKernelCpuSycl.hpp} |  7 ++--
 .../{BufCpuSyclIntel.hpp => BufCpuSycl.hpp}   |  6 ++--
 .../{PltfCpuSyclIntel.hpp => PltfCpuSycl.hpp} | 13 +++----
 ...lBlocking.hpp => QueueCpuSyclBlocking.hpp} |  6 ++--
 ...ocking.hpp => QueueCpuSyclNonBlocking.hpp} |  6 ++--
 .../{CpuSyclIntel.hpp => CpuSycl.hpp}         |  2 +-
 include/alpaka/test/acc/TestAccs.hpp          |  6 ++--
 include/alpaka/test/queue/Queue.hpp           | 16 ++++-----
 test/unit/acc/src/AccTagTest.cpp              |  6 ++--
 16 files changed, 72 insertions(+), 76 deletions(-)
 rename include/alpaka/acc/{AccCpuSyclIntel.hpp => AccCpuSycl.hpp} (66%)
 rename include/alpaka/dev/{DevCpuSyclIntel.hpp => DevCpuSycl.hpp} (65%)
 rename include/alpaka/event/{EventCpuSyclIntel.hpp => EventCpuSycl.hpp} (65%)
 rename include/alpaka/kernel/{TaskKernelCpuSyclIntel.hpp => TaskKernelCpuSycl.hpp} (66%)
 rename include/alpaka/mem/buf/{BufCpuSyclIntel.hpp => BufCpuSycl.hpp} (64%)
 rename include/alpaka/pltf/{PltfCpuSyclIntel.hpp => PltfCpuSycl.hpp} (63%)
 rename include/alpaka/queue/{QueueCpuSyclIntelBlocking.hpp => QueueCpuSyclBlocking.hpp} (59%)
 rename include/alpaka/queue/{QueueCpuSyclIntelNonBlocking.hpp => QueueCpuSyclNonBlocking.hpp} (59%)
 rename include/alpaka/standalone/{CpuSyclIntel.hpp => CpuSycl.hpp} (87%)

diff --git a/README_SYCL.md b/README_SYCL.md
index 9846f1b8768b..ccac62293474 100644
--- a/README_SYCL.md
+++ b/README_SYCL.md
@@ -53,9 +53,9 @@ To enable device-side printing add the following compiler flags:
 * `-DALPAKA_SYCL_IOSTREAM_ENABLED`: to enable device-side printing.
 * `-DALPAKA_SYCL_IOSTREAM_KIB=<value>`: `<value>` (without the brackets) defines the kibibytes per block to be reserved for device-side printing. `<value>` cannot exceed the amount of shared memory per block.
 
-### Building for Intel CPUs
+### Building for x86 64-bit CPUs
 
-1. `#include <alpaka/standalone/CpuSyclIntel.hpp>` in your C++ code.
+1. `#include <alpaka/standalone/CpuSycl.hpp>` in your C++ code.
 2. Add the following flags:
   * `-fsycl-targets=spir64_x86_64` (compiler and linker): to enable CPU compilation. Note: If you are using multiple SYCL hardware targets (like CPU and GPU) separate them by comma here.
   * `-Xsycl-target-backend=spir64_x86_64 "-march=<ISA>"` (linker): to choose the Intel ISA to compile for. Check the output of `opencl-aot --help` and look for the possible values of the `--march` flag.
@@ -84,7 +84,7 @@ To enable device-side printing add the following compiler flags:
 
 In contrast to the other back-ends the SYCL back-end comes with multiple different accelerators which should be chosen according to your requirements:
 
-* `alpaka::experimental::AccCpuSyclIntel` for targeting Intel CPUs. In contrast to the other CPU back-ends this will be using Intel's OpenCL implementation for CPUs under the hood.
+* `alpaka::experimental::AccCpuSycl` for targeting Intel and AMD CPUs. In contrast to the other CPU back-ends this will use Intel's OpenCL implementation for CPUs under the hood.
 * `alpaka::experimental::AccFpgaSyclIntel` for targeting Intel FPGAs.
 * `alpaka::experimental::AccGpuSyclIntel` for targeting Intel GPUs. 
 
diff --git a/include/alpaka/acc/AccCpuSyclIntel.hpp b/include/alpaka/acc/AccCpuSycl.hpp
similarity index 66%
rename from include/alpaka/acc/AccCpuSyclIntel.hpp
rename to include/alpaka/acc/AccCpuSycl.hpp
index 751d16c6b8b0..5cedbea64d8f 100644
--- a/include/alpaka/acc/AccCpuSyclIntel.hpp
+++ b/include/alpaka/acc/AccCpuSycl.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2022 Jan Stephan, Luca Ferragina
+/* Copyright 2023 Jan Stephan, Luca Ferragina, Andrea Bocci
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -9,11 +9,11 @@
 #include "alpaka/core/Concepts.hpp"
 #include "alpaka/core/DemangleTypeNames.hpp"
 #include "alpaka/core/Sycl.hpp"
-#include "alpaka/dev/DevCpuSyclIntel.hpp"
+#include "alpaka/dev/DevCpuSycl.hpp"
 #include "alpaka/dev/Traits.hpp"
-#include "alpaka/kernel/TaskKernelCpuSyclIntel.hpp"
+#include "alpaka/kernel/TaskKernelCpuSycl.hpp"
 #include "alpaka/kernel/Traits.hpp"
-#include "alpaka/pltf/PltfCpuSyclIntel.hpp"
+#include "alpaka/pltf/PltfCpuSycl.hpp"
 #include "alpaka/pltf/Traits.hpp"
 #include "alpaka/vec/Vec.hpp"
 
@@ -31,9 +31,9 @@ namespace alpaka
     //!
     //! This accelerator allows parallel kernel execution on a oneAPI-capable Intel CPU target device.
     template<typename TDim, typename TIdx>
-    class AccCpuSyclIntel final
+    class AccCpuSycl final
         : public AccGenericSycl<TDim, TIdx>
-        , public concepts::Implements<ConceptAcc, AccCpuSyclIntel<TDim, TIdx>>
+        , public concepts::Implements<ConceptAcc, AccCpuSycl<TDim, TIdx>>
     {
     public:
         using AccGenericSycl<TDim, TIdx>::AccGenericSycl;
@@ -44,28 +44,28 @@ namespace alpaka::trait
 {
     //! The Intel CPU SYCL accelerator name trait specialization.
     template<typename TDim, typename TIdx>
-    struct GetAccName<AccCpuSyclIntel<TDim, TIdx>>
+    struct GetAccName<AccCpuSycl<TDim, TIdx>>
     {
         static auto getAccName() -> std::string
         {
-            return "AccCpuSyclIntel<" + std::to_string(TDim::value) + "," + core::demangled<TIdx> + ">";
+            return "AccCpuSycl<" + std::to_string(TDim::value) + "," + core::demangled<TIdx> + ">";
         }
     };
 
     //! The Intel CPU SYCL accelerator device type trait specialization.
     template<typename TDim, typename TIdx>
-    struct DevType<AccCpuSyclIntel<TDim, TIdx>>
+    struct DevType<AccCpuSycl<TDim, TIdx>>
     {
-        using type = DevCpuSyclIntel;
+        using type = DevCpuSycl;
     };
 
     //! The Intel CPU SYCL accelerator execution task type trait specialization.
     template<typename TDim, typename TIdx, typename TWorkDiv, typename TKernelFnObj, typename... TArgs>
-    struct CreateTaskKernel<AccCpuSyclIntel<TDim, TIdx>, TWorkDiv, TKernelFnObj, TArgs...>
+    struct CreateTaskKernel<AccCpuSycl<TDim, TIdx>, TWorkDiv, TKernelFnObj, TArgs...>
     {
         static auto createTaskKernel(TWorkDiv const& workDiv, TKernelFnObj const& kernelFnObj, TArgs&&... args)
         {
-            return TaskKernelCpuSyclIntel<TDim, TIdx, TKernelFnObj, TArgs...>{
+            return TaskKernelCpuSycl<TDim, TIdx, TKernelFnObj, TArgs...>{
                 workDiv,
                 kernelFnObj,
                 std::forward<TArgs>(args)...};
@@ -74,21 +74,21 @@ namespace alpaka::trait
 
     //! The Intel CPU SYCL execution task platform type trait specialization.
     template<typename TDim, typename TIdx>
-    struct PltfType<AccCpuSyclIntel<TDim, TIdx>>
+    struct PltfType<AccCpuSycl<TDim, TIdx>>
     {
-        using type = PltfCpuSyclIntel;
+        using type = PltfCpuSycl;
     };
 
     template<typename TDim, typename TIdx>
-    struct AccToTag<alpaka::AccCpuSyclIntel<TDim, TIdx>>
+    struct AccToTag<alpaka::AccCpuSycl<TDim, TIdx>>
     {
-        using type = alpaka::TagCpuSyclIntel;
+        using type = alpaka::TagCpuSycl;
     };
 
     template<typename TDim, typename TIdx>
-    struct TagToAcc<alpaka::TagCpuSyclIntel, TDim, TIdx>
+    struct TagToAcc<alpaka::TagCpuSycl, TDim, TIdx>
     {
-        using type = alpaka::AccCpuSyclIntel<TDim, TIdx>;
+        using type = alpaka::AccCpuSycl<TDim, TIdx>;
     };
 } // namespace alpaka::trait
 
diff --git a/include/alpaka/acc/Tag.hpp b/include/alpaka/acc/Tag.hpp
index f950bb03b56e..03a6449a75a9 100644
--- a/include/alpaka/acc/Tag.hpp
+++ b/include/alpaka/acc/Tag.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2023 Simeon Ehrig, Jan Stephan
+/* Copyright 2023 Simeon Ehrig, Jan Stephan, Andrea Bocci
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -23,7 +23,7 @@ namespace alpaka
     CREATE_ACC_TAG(TagCpuOmp2Blocks);
     CREATE_ACC_TAG(TagCpuOmp2Threads);
     CREATE_ACC_TAG(TagCpuSerial);
-    CREATE_ACC_TAG(TagCpuSyclIntel);
+    CREATE_ACC_TAG(TagCpuSycl);
     CREATE_ACC_TAG(TagCpuTbbBlocks);
     CREATE_ACC_TAG(TagCpuThreads);
     CREATE_ACC_TAG(TagFpgaSyclIntel);
diff --git a/include/alpaka/alpaka.hpp b/include/alpaka/alpaka.hpp
index 3262ea360edc..327e53fab9f0 100644
--- a/include/alpaka/alpaka.hpp
+++ b/include/alpaka/alpaka.hpp
@@ -1,5 +1,5 @@
 /* Copyright 2023 Axel Hübl, Benjamin Worpitz, Erik Zenker, Matthias Werner, René Widera, Bernhard Manfred Gruber,
- *                Jan Stephan, Antonio Di Pilato, Luca Ferragina, Aurora Perego
+ *                Jan Stephan, Antonio Di Pilato, Luca Ferragina, Aurora Perego, Andrea Bocci
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -13,7 +13,7 @@
 #include "alpaka/acc/AccCpuOmp2Blocks.hpp"
 #include "alpaka/acc/AccCpuOmp2Threads.hpp"
 #include "alpaka/acc/AccCpuSerial.hpp"
-#include "alpaka/acc/AccCpuSyclIntel.hpp"
+#include "alpaka/acc/AccCpuSycl.hpp"
 #include "alpaka/acc/AccCpuTbbBlocks.hpp"
 #include "alpaka/acc/AccCpuThreads.hpp"
 #include "alpaka/acc/AccDevProps.hpp"
@@ -75,7 +75,7 @@
 #include "alpaka/core/Vectorize.hpp"
 // dev
 #include "alpaka/dev/DevCpu.hpp"
-#include "alpaka/dev/DevCpuSyclIntel.hpp"
+#include "alpaka/dev/DevCpuSycl.hpp"
 #include "alpaka/dev/DevCudaRt.hpp"
 #include "alpaka/dev/DevFpgaSyclIntel.hpp"
 #include "alpaka/dev/DevGenericSycl.hpp"
@@ -89,7 +89,7 @@
 #include "alpaka/dim/Traits.hpp"
 // event
 #include "alpaka/event/EventCpu.hpp"
-#include "alpaka/event/EventCpuSyclIntel.hpp"
+#include "alpaka/event/EventCpuSycl.hpp"
 #include "alpaka/event/EventCudaRt.hpp"
 #include "alpaka/event/EventFpgaSyclIntel.hpp"
 #include "alpaka/event/EventGenericSycl.hpp"
@@ -114,7 +114,7 @@
 #include "alpaka/kernel/TaskKernelCpuOmp2Blocks.hpp"
 #include "alpaka/kernel/TaskKernelCpuOmp2Threads.hpp"
 #include "alpaka/kernel/TaskKernelCpuSerial.hpp"
-#include "alpaka/kernel/TaskKernelCpuSyclIntel.hpp"
+#include "alpaka/kernel/TaskKernelCpuSycl.hpp"
 #include "alpaka/kernel/TaskKernelCpuTbbBlocks.hpp"
 #include "alpaka/kernel/TaskKernelCpuThreads.hpp"
 #include "alpaka/kernel/TaskKernelFpgaSyclIntel.hpp"
@@ -133,7 +133,7 @@
 #include "alpaka/mem/alloc/AllocCpuNew.hpp"
 #include "alpaka/mem/alloc/Traits.hpp"
 #include "alpaka/mem/buf/BufCpu.hpp"
-#include "alpaka/mem/buf/BufCpuSyclIntel.hpp"
+#include "alpaka/mem/buf/BufCpuSycl.hpp"
 #include "alpaka/mem/buf/BufCudaRt.hpp"
 #include "alpaka/mem/buf/BufFpgaSyclIntel.hpp"
 #include "alpaka/mem/buf/BufGenericSycl.hpp"
@@ -178,7 +178,7 @@
 #include "alpaka/offset/Traits.hpp"
 // platform
 #include "alpaka/pltf/PltfCpu.hpp"
-#include "alpaka/pltf/PltfCpuSyclIntel.hpp"
+#include "alpaka/pltf/PltfCpuSycl.hpp"
 #include "alpaka/pltf/PltfCudaRt.hpp"
 #include "alpaka/pltf/PltfFpgaSyclIntel.hpp"
 #include "alpaka/pltf/PltfGpuSyclIntel.hpp"
@@ -197,8 +197,8 @@
 #include "alpaka/queue/Properties.hpp"
 #include "alpaka/queue/QueueCpuBlocking.hpp"
 #include "alpaka/queue/QueueCpuNonBlocking.hpp"
-#include "alpaka/queue/QueueCpuSyclIntelBlocking.hpp"
-#include "alpaka/queue/QueueCpuSyclIntelNonBlocking.hpp"
+#include "alpaka/queue/QueueCpuSyclBlocking.hpp"
+#include "alpaka/queue/QueueCpuSyclNonBlocking.hpp"
 #include "alpaka/queue/QueueCudaRtBlocking.hpp"
 #include "alpaka/queue/QueueCudaRtNonBlocking.hpp"
 #include "alpaka/queue/QueueFpgaSyclIntelBlocking.hpp"
diff --git a/include/alpaka/dev/DevCpuSyclIntel.hpp b/include/alpaka/dev/DevCpuSycl.hpp
similarity index 65%
rename from include/alpaka/dev/DevCpuSyclIntel.hpp
rename to include/alpaka/dev/DevCpuSycl.hpp
index 8add6869e3da..1a9909d31cd0 100644
--- a/include/alpaka/dev/DevCpuSyclIntel.hpp
+++ b/include/alpaka/dev/DevCpuSycl.hpp
@@ -1,17 +1,17 @@
-/* Copyright 2022 Jan Stephan
+/* Copyright 2023 Jan Stephan, Andrea Bocci
  * SPDX-License-Identifier: MPL-2.0
  */
 
 #pragma once
 
 #include "alpaka/dev/DevGenericSycl.hpp"
-#include "alpaka/pltf/PltfCpuSyclIntel.hpp"
+#include "alpaka/pltf/PltfCpuSycl.hpp"
 
 #if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_BACKEND_ONEAPI) && defined(ALPAKA_SYCL_ONEAPI_CPU)
 
 namespace alpaka
 {
-    using DevCpuSyclIntel = DevGenericSycl<PltfCpuSyclIntel>;
+    using DevCpuSycl = DevGenericSycl<PltfCpuSycl>;
 } // namespace alpaka
 
 #endif
diff --git a/include/alpaka/event/EventCpuSyclIntel.hpp b/include/alpaka/event/EventCpuSycl.hpp
similarity index 65%
rename from include/alpaka/event/EventCpuSyclIntel.hpp
rename to include/alpaka/event/EventCpuSycl.hpp
index 21a752e28991..37e521484e93 100644
--- a/include/alpaka/event/EventCpuSyclIntel.hpp
+++ b/include/alpaka/event/EventCpuSycl.hpp
@@ -1,17 +1,17 @@
-/* Copyright 2022 Jan Stephan
+/* Copyright 2023 Jan Stephan, Andrea Bocci
  * SPDX-License-Identifier: MPL-2.0
  */
 
 #pragma once
 
-#include "alpaka/dev/DevCpuSyclIntel.hpp"
+#include "alpaka/dev/DevCpuSycl.hpp"
 #include "alpaka/event/EventGenericSycl.hpp"
 
 #if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_BACKEND_ONEAPI) && defined(ALPAKA_SYCL_ONEAPI_CPU)
 
 namespace alpaka
 {
-    using EventCpuSyclIntel = EventGenericSycl<DevCpuSyclIntel>;
+    using EventCpuSycl = EventGenericSycl<DevCpuSycl>;
 } // namespace alpaka
 
 #endif
diff --git a/include/alpaka/example/ExampleDefaultAcc.hpp b/include/alpaka/example/ExampleDefaultAcc.hpp
index ef0eb80fb380..6ceb874dd708 100644
--- a/include/alpaka/example/ExampleDefaultAcc.hpp
+++ b/include/alpaka/example/ExampleDefaultAcc.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2023 Jeffrey Kelling, Bernhard Manfred Gruber, Jan Stephan, Aurora Perego
+/* Copyright 2023 Jeffrey Kelling, Bernhard Manfred Gruber, Jan Stephan, Aurora Perego, Andrea Bocci
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -26,7 +26,7 @@ namespace alpaka
     using ExampleDefaultAcc = alpaka::AccCpuThreads<TDim, TIdx>;
 #elif defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_BACKEND_ONEAPI)
 #    if defined(ALPAKA_SYCL_ONEAPI_CPU)
-    using ExampleDefaultAcc = alpaka::AccCpuSyclIntel<TDim, TIdx>;
+    using ExampleDefaultAcc = alpaka::AccCpuSycl<TDim, TIdx>;
 #    elif defined(ALPAKA_SYCL_ONEAPI_FPGA)
     using ExampleDefaultAcc = alpaka::AccFpgaSyclIntel<TDim, TIdx>;
 #    elif defined(ALPAKA_SYCL_ONEAPI_GPU)
diff --git a/include/alpaka/kernel/TaskKernelCpuSyclIntel.hpp b/include/alpaka/kernel/TaskKernelCpuSycl.hpp
similarity index 66%
rename from include/alpaka/kernel/TaskKernelCpuSyclIntel.hpp
rename to include/alpaka/kernel/TaskKernelCpuSycl.hpp
index fbfc88b9c4b3..abb8c9a81d03 100644
--- a/include/alpaka/kernel/TaskKernelCpuSyclIntel.hpp
+++ b/include/alpaka/kernel/TaskKernelCpuSycl.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2022 Jan Stephan, Luca Ferragina
+/* Copyright 2023 Jan Stephan, Luca Ferragina, Andrea Bocci
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -11,11 +11,10 @@
 namespace alpaka
 {
     template<typename TDim, typename TIdx>
-    class AccCpuSyclIntel;
+    class AccCpuSycl;
 
     template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
-    using TaskKernelCpuSyclIntel
-        = TaskKernelGenericSycl<AccCpuSyclIntel<TDim, TIdx>, TDim, TIdx, TKernelFnObj, TArgs...>;
+    using TaskKernelCpuSycl = TaskKernelGenericSycl<AccCpuSycl<TDim, TIdx>, TDim, TIdx, TKernelFnObj, TArgs...>;
 } // namespace alpaka
 
 #endif
diff --git a/include/alpaka/mem/buf/BufCpuSyclIntel.hpp b/include/alpaka/mem/buf/BufCpuSycl.hpp
similarity index 64%
rename from include/alpaka/mem/buf/BufCpuSyclIntel.hpp
rename to include/alpaka/mem/buf/BufCpuSycl.hpp
index 0d73ffe281c6..1dbdc9f3a2ad 100644
--- a/include/alpaka/mem/buf/BufCpuSyclIntel.hpp
+++ b/include/alpaka/mem/buf/BufCpuSycl.hpp
@@ -1,10 +1,10 @@
-/* Copyright 2022 Jan Stephan, Luca Ferragina
+/* Copyright 2023 Jan Stephan, Luca Ferragina, Andrea Bocci
  * SPDX-License-Identifier: MPL-2.0
  */
 
 #pragma once
 
-#include "alpaka/dev/DevCpuSyclIntel.hpp"
+#include "alpaka/dev/DevCpuSycl.hpp"
 #include "alpaka/mem/buf/BufGenericSycl.hpp"
 
 #if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_BACKEND_ONEAPI) && defined(ALPAKA_SYCL_ONEAPI_CPU)
@@ -12,7 +12,7 @@
 namespace alpaka
 {
     template<typename TElem, typename TDim, typename TIdx>
-    using BufCpuSyclIntel = BufGenericSycl<TElem, TDim, TIdx, PltfCpuSyclIntel>;
+    using BufCpuSycl = BufGenericSycl<TElem, TDim, TIdx, PltfCpuSycl>;
 }
 
 #endif
diff --git a/include/alpaka/pltf/PltfCpuSyclIntel.hpp b/include/alpaka/pltf/PltfCpuSycl.hpp
similarity index 63%
rename from include/alpaka/pltf/PltfCpuSyclIntel.hpp
rename to include/alpaka/pltf/PltfCpuSycl.hpp
index 0c6489879c82..40f6855eab1a 100644
--- a/include/alpaka/pltf/PltfCpuSyclIntel.hpp
+++ b/include/alpaka/pltf/PltfCpuSycl.hpp
@@ -18,29 +18,26 @@ namespace alpaka
 {
     namespace detail
     {
-        struct IntelCpuSelector
+        struct SyclCpuSelector
         {
             auto operator()(sycl::device const& dev) const -> int
             {
-                auto const& vendor = dev.get_info<sycl::info::device::vendor>();
-                auto const is_intel_cpu = (vendor.find("Intel(R) Corporation") != std::string::npos) && dev.is_cpu();
-
-                return is_intel_cpu ? 1 : -1;
+                return dev.is_cpu() ? 1 : -1;
             }
         };
     } // namespace detail
 
     //! The SYCL device manager.
-    using PltfCpuSyclIntel = PltfGenericSycl<detail::IntelCpuSelector>;
+    using PltfCpuSycl = PltfGenericSycl<detail::SyclCpuSelector>;
 } // namespace alpaka
 
 namespace alpaka::trait
 {
     //! The SYCL device manager device type trait specialization.
     template<>
-    struct DevType<PltfCpuSyclIntel>
+    struct DevType<PltfCpuSycl>
     {
-        using type = DevGenericSycl<PltfCpuSyclIntel>; // = DevCpuSyclIntel
+        using type = DevGenericSycl<PltfCpuSycl>; // = DevCpuSycl
     };
 } // namespace alpaka::trait
 
diff --git a/include/alpaka/queue/QueueCpuSyclIntelBlocking.hpp b/include/alpaka/queue/QueueCpuSyclBlocking.hpp
similarity index 59%
rename from include/alpaka/queue/QueueCpuSyclIntelBlocking.hpp
rename to include/alpaka/queue/QueueCpuSyclBlocking.hpp
index 039a05b3229f..3d561733ecc5 100644
--- a/include/alpaka/queue/QueueCpuSyclIntelBlocking.hpp
+++ b/include/alpaka/queue/QueueCpuSyclBlocking.hpp
@@ -1,17 +1,17 @@
-/* Copyright 2022 Jan Stephan, Luca Ferragina
+/* Copyright 2023 Jan Stephan, Luca Ferragina, Andrea Bocci
  * SPDX-License-Identifier: MPL-2.0
  */
 
 #pragma once
 
-#include "alpaka/dev/DevCpuSyclIntel.hpp"
+#include "alpaka/dev/DevCpuSycl.hpp"
 #include "alpaka/queue/QueueGenericSyclBlocking.hpp"
 
 #if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_BACKEND_ONEAPI) && defined(ALPAKA_SYCL_ONEAPI_CPU)
 
 namespace alpaka
 {
-    using QueueCpuSyclIntelBlocking = QueueGenericSyclBlocking<DevCpuSyclIntel>;
+    using QueueCpuSyclBlocking = QueueGenericSyclBlocking<DevCpuSycl>;
 }
 
 #endif
diff --git a/include/alpaka/queue/QueueCpuSyclIntelNonBlocking.hpp b/include/alpaka/queue/QueueCpuSyclNonBlocking.hpp
similarity index 59%
rename from include/alpaka/queue/QueueCpuSyclIntelNonBlocking.hpp
rename to include/alpaka/queue/QueueCpuSyclNonBlocking.hpp
index 02391231f5f5..c75f5be45229 100644
--- a/include/alpaka/queue/QueueCpuSyclIntelNonBlocking.hpp
+++ b/include/alpaka/queue/QueueCpuSyclNonBlocking.hpp
@@ -1,17 +1,17 @@
-/* Copyright 2022 Jan Stephan, Luca Ferragina
+/* Copyright 2023 Jan Stephan, Luca Ferragina, Andrea Bocci
  * SPDX-License-Identifier: MPL-2.0
  */
 
 #pragma once
 
-#include "alpaka/dev/DevCpuSyclIntel.hpp"
+#include "alpaka/dev/DevCpuSycl.hpp"
 #include "alpaka/queue/QueueGenericSyclNonBlocking.hpp"
 
 #if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_BACKEND_ONEAPI) && defined(ALPAKA_SYCL_ONEAPI_CPU)
 
 namespace alpaka
 {
-    using QueueCpuSyclIntelNonBlocking = QueueGenericSyclNonBlocking<DevCpuSyclIntel>;
+    using QueueCpuSyclNonBlocking = QueueGenericSyclNonBlocking<DevCpuSycl>;
 }
 
 #endif
diff --git a/include/alpaka/standalone/CpuSyclIntel.hpp b/include/alpaka/standalone/CpuSycl.hpp
similarity index 87%
rename from include/alpaka/standalone/CpuSyclIntel.hpp
rename to include/alpaka/standalone/CpuSycl.hpp
index e42b64a2097e..fbdb5c2c481e 100644
--- a/include/alpaka/standalone/CpuSyclIntel.hpp
+++ b/include/alpaka/standalone/CpuSycl.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2022 Jan Stephan
+/* Copyright 2023 Jan Stephan, Andrea Bocci
  * SPDX-License-Identifier: MPL-2.0
  */
 
diff --git a/include/alpaka/test/acc/TestAccs.hpp b/include/alpaka/test/acc/TestAccs.hpp
index 742b40fd8313..62c1e6f7d2c2 100644
--- a/include/alpaka/test/acc/TestAccs.hpp
+++ b/include/alpaka/test/acc/TestAccs.hpp
@@ -79,10 +79,10 @@ namespace alpaka::test
 #endif
 #if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_BACKEND_ONEAPI) && defined(ALPAKA_SYCL_TARGET_CPU)
         template<typename TDim, typename TIdx>
-        using AccCpuSyclIntelIfAvailableElseInt = alpaka::AccCpuSyclIntel<TDim, TIdx>;
+        using AccCpuSyclIfAvailableElseInt = alpaka::AccCpuSycl<TDim, TIdx>;
 #else
         template<typename TDim, typename TIdx>
-        using AccCpuSyclIntelIfAvailableElseInt = int;
+        using AccCpuSyclIfAvailableElseInt = int;
 #endif
 #if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_BACKEND_ONEAPI) && defined(ALPAKA_SYCL_TARGET_FPGA)
         template<typename TDim, typename TIdx>
@@ -109,7 +109,7 @@ namespace alpaka::test
             AccCpuOmp2ThreadsIfAvailableElseInt<TDim, TIdx>,
             AccGpuCudaRtIfAvailableElseInt<TDim, TIdx>,
             AccGpuHipRtIfAvailableElseInt<TDim, TIdx>,
-            AccCpuSyclIntelIfAvailableElseInt<TDim, TIdx>,
+            AccCpuSyclIfAvailableElseInt<TDim, TIdx>,
             AccFpgaSyclIntelIfAvailableElseInt<TDim, TIdx>,
             AccGpuSyclIntelIfAvailableElseInt<TDim, TIdx>>;
     } // namespace detail
diff --git a/include/alpaka/test/queue/Queue.hpp b/include/alpaka/test/queue/Queue.hpp
index 2fc75d6dacd2..7cb3492cfc45 100644
--- a/include/alpaka/test/queue/Queue.hpp
+++ b/include/alpaka/test/queue/Queue.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2023 Benjamin Worpitz, Matthias Werner, René Widera, Bernhard Manfred Gruber, Jan Stephan
+/* Copyright 2023 Benjamin Worpitz, Matthias Werner, René Widera, Bernhard Manfred Gruber, Jan Stephan, Andrea Bocci
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -86,23 +86,23 @@ namespace alpaka::test
 #        ifdef ALPAKA_SYCL_ONEAPI_CPU
         //! The default queue type trait specialization for the Intel CPU device.
         template<>
-        struct DefaultQueueType<alpaka::DevCpuSyclIntel>
+        struct DefaultQueueType<alpaka::DevCpuSycl>
         {
 #            if(ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL)
-            using type = alpaka::QueueCpuSyclIntelBlocking;
+            using type = alpaka::QueueCpuSyclBlocking;
 #            else
-            using type = alpaka::QueueCpuSyclIntelNonBlocking;
+            using type = alpaka::QueueCpuSyclNonBlocking;
 #            endif
         };
 
         template<>
-        struct IsBlockingQueue<alpaka::QueueCpuSyclIntelBlocking>
+        struct IsBlockingQueue<alpaka::QueueCpuSyclBlocking>
         {
             static constexpr auto value = true;
         };
 
         template<>
-        struct IsBlockingQueue<alpaka::QueueCpuSyclIntelNonBlocking>
+        struct IsBlockingQueue<alpaka::QueueCpuSyclNonBlocking>
         {
             static constexpr auto value = false;
         };
@@ -180,8 +180,8 @@ namespace alpaka::test
 #    ifdef ALPAKA_SYCL_BACKEND_ONEAPI
 #        ifdef ALPAKA_SYCL_ONEAPI_CPU
         ,
-        std::tuple<alpaka::DevCpuSyclIntel, alpaka::QueueCpuSyclIntelBlocking>,
-        std::tuple<alpaka::DevCpuSyclIntel, alpaka::QueueCpuSyclIntelNonBlocking>
+        std::tuple<alpaka::DevCpuSycl, alpaka::QueueCpuSyclBlocking>,
+        std::tuple<alpaka::DevCpuSycl, alpaka::QueueCpuSyclNonBlocking>
 #        endif
 #        ifdef ALPAKA_SYCL_ONEAPI_FPGA
         ,
diff --git a/test/unit/acc/src/AccTagTest.cpp b/test/unit/acc/src/AccTagTest.cpp
index f2d5f58ee45e..741f2e17385b 100644
--- a/test/unit/acc/src/AccTagTest.cpp
+++ b/test/unit/acc/src/AccTagTest.cpp
@@ -1,4 +1,4 @@
-/* Copyright 2023 Simeon Ehrig, Jan Stephan
+/* Copyright 2023 Simeon Ehrig, Jan Stephan, Andrea Bocci
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -23,7 +23,7 @@ using TagList = std::tuple<
     alpaka::TagCpuOmp2Threads,
     alpaka::TagGpuCudaRt,
     alpaka::TagGpuHipRt,
-    alpaka::TagCpuSyclIntel,
+    alpaka::TagCpuSycl,
     alpaka::TagFpgaSyclIntel,
     alpaka::TagGpuSyclIntel>;
 
@@ -35,7 +35,7 @@ using AccToTagMap = std::tuple<
     std::pair<alpaka::test::detail::AccCpuOmp2ThreadsIfAvailableElseInt<Dim, Idx>, alpaka::TagCpuOmp2Threads>,
     std::pair<alpaka::test::detail::AccGpuCudaRtIfAvailableElseInt<Dim, Idx>, alpaka::TagGpuCudaRt>,
     std::pair<alpaka::test::detail::AccGpuHipRtIfAvailableElseInt<Dim, Idx>, alpaka::TagGpuHipRt>,
-    std::pair<alpaka::test::detail::AccCpuSyclIntelIfAvailableElseInt<Dim, Idx>, alpaka::TagCpuSyclIntel>,
+    std::pair<alpaka::test::detail::AccCpuSyclIfAvailableElseInt<Dim, Idx>, alpaka::TagCpuSycl>,
     std::pair<alpaka::test::detail::AccFpgaSyclIntelIfAvailableElseInt<Dim, Idx>, alpaka::TagFpgaSyclIntel>,
     std::pair<alpaka::test::detail::AccGpuSyclIntelIfAvailableElseInt<Dim, Idx>, alpaka::TagGpuSyclIntel>>;
 

From 93c607decc051ee68585b892e35344440e51057d Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Tue, 20 Jun 2023 19:38:16 +0200
Subject: [PATCH 06/10] Rewrite the SYCL memcpy and memset operations

Rewrite the N-dimensional Copy and Set memory operations to support pitched
memory buffers, based on the Cpu implementation. This may require more than one
memset or memcpy call per operation, which is not supported by command group
handlers. Rewrite the Copy and Set memory operations to use queues instead.
---
 include/alpaka/mem/buf/BufGenericSycl.hpp     |  4 +-
 include/alpaka/mem/buf/sycl/Copy.hpp          | 65 ++++++++++++++-----
 include/alpaka/mem/buf/sycl/Set.hpp           | 49 ++++++++++----
 .../queue/sycl/QueueGenericSyclBase.hpp       | 34 +++++-----
 4 files changed, 108 insertions(+), 44 deletions(-)

diff --git a/include/alpaka/mem/buf/BufGenericSycl.hpp b/include/alpaka/mem/buf/BufGenericSycl.hpp
index b60d70c8b033..e725c299a598 100644
--- a/include/alpaka/mem/buf/BufGenericSycl.hpp
+++ b/include/alpaka/mem/buf/BufGenericSycl.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2022 Jan Stephan, Luca Ferragina, Aurora Perego
+/* Copyright 2023 Jan Stephan, Luca Ferragina, Aurora Perego, Andrea Bocci
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -192,7 +192,7 @@ namespace alpaka::trait
             }
 #    endif
 
-            auto* memPtr = sycl::malloc_device<TElem>(
+            TElem* memPtr = sycl::malloc_device<TElem>(
                 static_cast<std::size_t>(getExtentProduct(extent)),
                 dev.getNativeHandle().first,
                 dev.getNativeHandle().second);
diff --git a/include/alpaka/mem/buf/sycl/Copy.hpp b/include/alpaka/mem/buf/sycl/Copy.hpp
index 7373125c63c6..34527e8478ed 100644
--- a/include/alpaka/mem/buf/sycl/Copy.hpp
+++ b/include/alpaka/mem/buf/sycl/Copy.hpp
@@ -1,8 +1,7 @@
-/* Copyright 2023 Jan Stephan, Bernhard Manfred Gruber, Luca Ferragina, Aurora Perego
+/* Copyright 2023 Jan Stephan, Bernhard Manfred Gruber, Luca Ferragina, Aurora Perego, Andrea Bocci
  * SPDX-License-Identifier: MPL-2.0
  */
 
-
 #pragma once
 
 #include "alpaka/core/Debug.hpp"
@@ -27,7 +26,7 @@
 
 namespace alpaka::detail
 {
-    //!  The Sycl device memory copy task base.
+    //!  The SYCL device memory copy task base.
     template<typename TDim, typename TViewDst, typename TViewSrc, typename TExtent>
     struct TaskCopySyclBase
     {
@@ -42,8 +41,8 @@ namespace alpaka::detail
         template<typename TViewFwd>
         TaskCopySyclBase(TViewFwd&& viewDst, TViewSrc const& viewSrc, TExtent const& extent)
             : m_extent(getExtentVec(extent))
-            , m_extentWidthBytes(m_extent[TDim::value - 1u] * static_cast<ExtentSize>(sizeof(Elem)))
 #    if(!defined(NDEBUG)) || (ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL)
+            , m_extentWidthBytes(m_extent[TDim::value - 1u] * static_cast<ExtentSize>(sizeof(Elem)))
             , m_dstExtent(getExtentVec(viewDst))
             , m_srcExtent(getExtentVec(viewSrc))
 #    endif
@@ -70,8 +69,8 @@ namespace alpaka::detail
 #    endif
 
         Vec<TDim, ExtentSize> const m_extent;
-        ExtentSize const m_extentWidthBytes;
 #    if(!defined(NDEBUG)) || (ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL)
+        ExtentSize const m_extentWidthBytes;
         Vec<TDim, DstSize> const m_dstExtent;
         Vec<TDim, SrcSize> const m_srcExtent;
 #    endif
@@ -83,7 +82,7 @@ namespace alpaka::detail
         static constexpr auto is_sycl_task = true;
     };
 
-    //! The Sycl device ND memory copy task.
+    //! The SYCL device ND memory copy task.
     template<typename TDim, typename TViewDst, typename TViewSrc, typename TExtent>
     struct TaskCopySycl : public TaskCopySyclBase<TDim, TViewDst, TViewSrc, TExtent>
     {
@@ -94,20 +93,47 @@ namespace alpaka::detail
 
         using TaskCopySyclBase<TDim, TViewDst, TViewSrc, TExtent>::TaskCopySyclBase;
 
-        ALPAKA_FN_HOST auto operator()(sycl::handler& cgh) const -> void
+        ALPAKA_FN_HOST auto operator()(sycl::queue& queue, std::vector<sycl::event> const& requirements) const
+            -> sycl::event
         {
             ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
 
 #    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
             this->printDebug();
 #    endif
+            // [z, y, x] -> [z, y] because all elements with the innermost x dimension are handled within one
+            // iteration.
+            Vec<DimMin1, ExtentSize> const extentWithoutInnermost(subVecBegin<DimMin1>(this->m_extent));
+            // [z, y, x] -> [y, x] because the z pitch (the full size of the buffer) is not required.
+            Vec<DimMin1, DstSize> const dstPitchBytesWithoutOutmost(subVecEnd<DimMin1>(this->m_dstPitchBytes));
+            Vec<DimMin1, SrcSize> const srcPitchBytesWithoutOutmost(subVecEnd<DimMin1>(this->m_srcPitchBytes));
+
+            // Record an event for each memcpy call
+            std::vector<sycl::event> events;
+            events.reserve(static_cast<std::size_t>(extentWithoutInnermost.prod()));
+
             if(static_cast<std::size_t>(this->m_extent.prod()) != 0u)
             {
-                cgh.memcpy(
-                    reinterpret_cast<void*>(this->m_dstMemNative),
-                    reinterpret_cast<void const*>(this->m_srcMemNative),
-                    static_cast<std::size_t>(this->m_extentWidthBytes * this->m_extent.prod()));
+                meta::ndLoopIncIdx(
+                    extentWithoutInnermost,
+                    [&](Vec<DimMin1, ExtentSize> const& idx)
+                    {
+                        events.push_back(queue.memcpy(
+                            reinterpret_cast<void*>(
+                                this->m_dstMemNative
+                                + (castVec<DstSize>(idx) * dstPitchBytesWithoutOutmost)
+                                      .foldrAll(std::plus<DstSize>())),
+                            reinterpret_cast<void const*>(
+                                this->m_srcMemNative
+                                + (castVec<SrcSize>(idx) * srcPitchBytesWithoutOutmost)
+                                      .foldrAll(std::plus<SrcSize>())),
+                            static_cast<std::size_t>(this->m_extentWidthBytes),
+                            requirements));
+                    });
             }
+
+            // Return an event that depends on all the events assciated to the memcpy calls
+            return queue.ext_oneapi_submit_barrier(events);
         }
     };
 
@@ -117,8 +143,10 @@ namespace alpaka::detail
         : TaskCopySyclBase<DimInt<1u>, TViewDst, TViewSrc, TExtent>
     {
         using TaskCopySyclBase<DimInt<1u>, TViewDst, TViewSrc, TExtent>::TaskCopySyclBase;
+        using Elem = alpaka::Elem<TViewSrc>;
 
-        ALPAKA_FN_HOST auto operator()(sycl::handler& cgh) const -> void
+        ALPAKA_FN_HOST auto operator()(sycl::queue& queue, std::vector<sycl::event> const& requirements) const
+            -> sycl::event
         {
             ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
 
@@ -127,10 +155,15 @@ namespace alpaka::detail
 #    endif
             if(static_cast<std::size_t>(this->m_extent.prod()) != 0u)
             {
-                cgh.memcpy(
+                return queue.memcpy(
                     reinterpret_cast<void*>(this->m_dstMemNative),
                     reinterpret_cast<void const*>(this->m_srcMemNative),
-                    static_cast<std::size_t>(this->m_extentWidthBytes));
+                    sizeof(Elem) * static_cast<std::size_t>(this->m_extent.prod()),
+                    requirements);
+            }
+            else
+            {
+                return queue.ext_oneapi_submit_barrier();
             }
         }
     };
@@ -159,9 +192,9 @@ namespace alpaka::detail
             ALPAKA_ASSERT(getExtentVec(viewSrc).prod() == 1u);
         }
 
-        auto operator()(sycl::handler& cgh) const -> void
+        auto operator()(sycl::queue& queue, std::vector<sycl::event> const& requirements) const -> sycl::event
         {
-            cgh.memcpy(m_dstMemNative, m_srcMemNative, sizeof(Elem));
+            return queue.memcpy(m_dstMemNative, m_srcMemNative, sizeof(Elem), requirements);
         }
 
         void* m_dstMemNative;
diff --git a/include/alpaka/mem/buf/sycl/Set.hpp b/include/alpaka/mem/buf/sycl/Set.hpp
index 25d602228424..da52a9206c46 100644
--- a/include/alpaka/mem/buf/sycl/Set.hpp
+++ b/include/alpaka/mem/buf/sycl/Set.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2022 Jan Stephan, Luca Ferragina, Aurora Perego
+/* Copyright 2023 Jan Stephan, Luca Ferragina, Aurora Perego, Andrea Bocci
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -83,20 +83,42 @@ namespace alpaka
 
             using TaskSetSyclBase<TDim, TView, TExtent>::TaskSetSyclBase;
 
-            auto operator()(sycl::handler& cgh) const -> void
+            auto operator()(sycl::queue& queue, std::vector<sycl::event> const& requirements) const -> sycl::event
             {
                 ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
 
 #    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
                 this->printDebug();
 #    endif
+                // [z, y, x] -> [z, y] because all elements with the innermost x dimension are handled within one
+                // iteration.
+                Vec<DimMin1, ExtentSize> const extentWithoutInnermost(subVecBegin<DimMin1>(this->m_extent));
+                // [z, y, x] -> [y, x] because the z pitch (the full idx of the buffer) is not required.
+                Vec<DimMin1, DstSize> const dstPitchBytesWithoutOutmost(subVecEnd<DimMin1>(this->m_dstPitchBytes));
+
+                // Record an event for each memcpy call
+                std::vector<sycl::event> events;
+                events.reserve(static_cast<std::size_t>(extentWithoutInnermost.prod()));
+
                 if(static_cast<std::size_t>(this->m_extent.prod()) != 0u)
                 {
-                    cgh.memset(
-                        reinterpret_cast<void*>(this->m_dstMemNative),
-                        this->m_byte,
-                        static_cast<std::size_t>(this->m_extentWidthBytes * this->m_extent.prod()));
+                    meta::ndLoopIncIdx(
+                        extentWithoutInnermost,
+                        [&](Vec<DimMin1, ExtentSize> const& idx)
+                        {
+                            events.push_back(queue.memset(
+                                reinterpret_cast<void*>(
+                                    this->m_dstMemNative
+                                    + (castVec<DstSize>(idx) * dstPitchBytesWithoutOutmost)
+                                          .foldrAll(std::plus<DstSize>())),
+                                this->m_byte,
+                                static_cast<std::size_t>(this->m_extentWidthBytes),
+                                requirements));
+                        });
                 }
+
+                // Return an event that depends on all the events assciated to the memcpy calls
+                return queue.ext_oneapi_submit_barrier(events);
             }
         };
 
@@ -106,7 +128,7 @@ namespace alpaka
         {
             using TaskSetSyclBase<DimInt<1u>, TView, TExtent>::TaskSetSyclBase;
 
-            auto operator()(sycl::handler& cgh) const -> void
+            auto operator()(sycl::queue& queue, std::vector<sycl::event> const& requirements) const -> sycl::event
             {
                 ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
 
@@ -115,10 +137,15 @@ namespace alpaka
 #    endif
                 if(static_cast<std::size_t>(this->m_extent.prod()) != 0u)
                 {
-                    cgh.memset(
+                    return queue.memset(
                         reinterpret_cast<void*>(this->m_dstMemNative),
                         this->m_byte,
-                        static_cast<std::size_t>(this->m_extentWidthBytes));
+                        static_cast<std::size_t>(this->m_extentWidthBytes),
+                        requirements);
+                }
+                else
+                {
+                    return queue.ext_oneapi_submit_barrier();
                 }
             }
         };
@@ -151,14 +178,14 @@ namespace alpaka
             }
 #    endif
 
-            auto operator()(sycl::handler& cgh) const -> void
+            auto operator()(sycl::queue& queue, std::vector<sycl::event> const& requirements) const -> sycl::event
             {
                 ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
 
 #    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
                 printDebug();
 #    endif
-                cgh.memset(reinterpret_cast<void*>(m_dstMemNative), m_byte, sizeof(Elem));
+                return queue.memset(reinterpret_cast<void*>(m_dstMemNative), m_byte, sizeof(Elem), requirements);
             }
 
             std::uint8_t const m_byte;
diff --git a/include/alpaka/queue/sycl/QueueGenericSyclBase.hpp b/include/alpaka/queue/sycl/QueueGenericSyclBase.hpp
index 2db22b6d3abb..c529ecfaa2e4 100644
--- a/include/alpaka/queue/sycl/QueueGenericSyclBase.hpp
+++ b/include/alpaka/queue/sycl/QueueGenericSyclBase.hpp
@@ -1,10 +1,9 @@
-/* Copyright 2022 Jan Stephan, Antonio Di Pilato, Luca Ferragina
+/* Copyright 2023 Jan Stephan, Antonio Di Pilato, Luca Ferragina, Andrea Bocci
  * SPDX-License-Identifier: MPL-2.0
  */
 
 #pragma once
 
-
 #include "alpaka/dev/Traits.hpp"
 #include "alpaka/event/Traits.hpp"
 #include "alpaka/queue/Traits.hpp"
@@ -127,19 +126,24 @@ namespace alpaka::detail
                 clean_dependencies();
 
                 // Execute task
-                m_last_event = m_queue.submit(
-                    [this, &task](sycl::handler& cgh)
-                    {
-                        if(!m_dependencies.empty())
-                            cgh.depends_on(m_dependencies);
-
-                        if constexpr(is_sycl_kernel<TTask>) // Kernel
-                            task(cgh, m_fence_dummy); // Will call cgh.parallel_for internally
-                        else if constexpr(is_sycl_task<TTask>) // Copy / Fill
-                            task(cgh); // Will call cgh.{copy, fill} internally
-                        else // Host
-                            cgh.host_task(task);
-                    });
+                if constexpr(is_sycl_task<TTask> && !is_sycl_kernel<TTask>) // Copy / Fill
+                {
+                    m_last_event = task(m_queue, m_dependencies); // Will call queue.{copy, fill} internally
+                }
+                else
+                {
+                    m_last_event = m_queue.submit(
+                        [this, &task](sycl::handler& cgh)
+                        {
+                            if(!m_dependencies.empty())
+                                cgh.depends_on(m_dependencies);
+
+                            if constexpr(is_sycl_kernel<TTask>) // Kernel
+                                task(cgh, m_fence_dummy); // Will call cgh.parallel_for internally
+                            else // Host
+                                cgh.host_task(task);
+                        });
+                }
 
                 m_dependencies.clear();
             }

From 01afbee880ebd485e0b49ca33dc5b3a625de6bfa Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Mon, 26 Jun 2023 08:15:33 +0200
Subject: [PATCH 07/10] First draft adding the warp size as a kernel trait

---
 .../alpaka/kernel/TaskKernelGenericSycl.hpp   | 82 +++++++++++++------
 include/alpaka/kernel/Traits.hpp              | 16 +++-
 include/alpaka/warp/WarpGenericSycl.hpp       |  2 +-
 test/unit/warp/src/Activemask.cpp             | 38 +++++++--
 test/unit/warp/src/All.cpp                    | 26 +++++-
 test/unit/warp/src/Any.cpp                    | 26 +++++-
 test/unit/warp/src/Ballot.cpp                 | 26 +++++-
 test/unit/warp/src/Shfl.cpp                   | 26 +++++-
 8 files changed, 199 insertions(+), 43 deletions(-)

diff --git a/include/alpaka/kernel/TaskKernelGenericSycl.hpp b/include/alpaka/kernel/TaskKernelGenericSycl.hpp
index d4f1937415cb..a173b42a0437 100644
--- a/include/alpaka/kernel/TaskKernelGenericSycl.hpp
+++ b/include/alpaka/kernel/TaskKernelGenericSycl.hpp
@@ -136,33 +136,67 @@ namespace alpaka
 
             auto output_stream = sycl::stream{buf_size, buf_per_work_item, cgh};
 #    endif
-            // cgh.parallel_for<detail::kernel<TAcc, TKernelFnObj, TArgs...>>( //FIXME_
-            cgh.parallel_for(
-                sycl::nd_range<TDim::value>{global_size, local_size},
-                [=](sycl::nd_item<TDim::value> work_item)
-                {
+            if constexpr(trait::WarpSize<TKernelFnObj, TAcc>::warp_size != 0)
+            {
+                // cgh.parallel_for<detail::kernel<TAcc, TKernelFnObj, TArgs...>>( //FIXME_
+                cgh.parallel_for(
+                    sycl::nd_range<TDim::value>{global_size, local_size},
+                    [=](sycl::nd_item<TDim::value> work_item)
+                        [[intel::reqd_sub_group_size(trait::WarpSize<TKernelFnObj, TAcc>::warp_size)]]
+                    {
 #    ifdef ALPAKA_SYCL_IOSTREAM_ENABLED
-                    auto acc = TAcc{
-                        item_elements,
-                        work_item,
-                        dyn_shared_accessor,
-                        st_shared_accessor,
-                        global_fence_dummy,
-                        local_fence_dummy,
-                        output_stream};
+                        auto acc = TAcc{
+                            item_elements,
+                            work_item,
+                            dyn_shared_accessor,
+                            st_shared_accessor,
+                            global_fence_dummy,
+                            local_fence_dummy,
+                            output_stream};
 #    else
-                    auto acc = TAcc{
-                        item_elements,
-                        work_item,
-                        dyn_shared_accessor,
-                        st_shared_accessor,
-                        global_fence_dummy,
-                        local_fence_dummy};
+                        auto acc = TAcc{
+                            item_elements,
+                            work_item,
+                            dyn_shared_accessor,
+                            st_shared_accessor,
+                            global_fence_dummy,
+                            local_fence_dummy};
 #    endif
-                    core::apply(
-                        [k_func, &acc](typename std::decay_t<TArgs> const&... args) { k_func(acc, args...); },
-                        k_args);
-                });
+                        core::apply(
+                            [k_func, &acc](typename std::decay_t<TArgs> const&... args) { k_func(acc, args...); },
+                            k_args);
+                    });
+            }
+            else
+            { // autodetect
+                // cgh.parallel_for<detail::kernel<TAcc, TKernelFnObj, TArgs...>>( //FIXME_
+                cgh.parallel_for(
+                    sycl::nd_range<TDim::value>{global_size, local_size},
+                    [=](sycl::nd_item<TDim::value> work_item)
+                    {
+#    ifdef ALPAKA_SYCL_IOSTREAM_ENABLED
+                        auto acc = TAcc{
+                            item_elements,
+                            work_item,
+                            dyn_shared_accessor,
+                            st_shared_accessor,
+                            global_fence_dummy,
+                            local_fence_dummy,
+                            output_stream};
+#    else
+                        auto acc = TAcc{
+                            item_elements,
+                            work_item,
+                            dyn_shared_accessor,
+                            st_shared_accessor,
+                            global_fence_dummy,
+                            local_fence_dummy};
+#    endif
+                        core::apply(
+                            [k_func, &acc](typename std::decay_t<TArgs> const&... args) { k_func(acc, args...); },
+                            k_args);
+                    });
+            }
         }
 
         static constexpr auto is_sycl_task = true;
diff --git a/include/alpaka/kernel/Traits.hpp b/include/alpaka/kernel/Traits.hpp
index 33032cdb5c57..a0c8d17a89e2 100644
--- a/include/alpaka/kernel/Traits.hpp
+++ b/include/alpaka/kernel/Traits.hpp
@@ -1,4 +1,5 @@
-/* Copyright 2022 Axel Huebl, Benjamin Worpitz, René Widera, Sergei Bastrakov, Jan Stephan, Bernhard Manfred Gruber
+/* Copyright 2023 Axel Huebl, Benjamin Worpitz, René Widera, Sergei Bastrakov, Jan Stephan, Bernhard Manfred Gruber,
+ *                Andrea Bocci
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -68,6 +69,19 @@ namespace alpaka
             }
         };
 
+        //! The trait for getting the warp size required by a kernel.
+        //!
+        //! \tparam TKernelFnObj The kernel function object.
+        //! \tparam TAcc The accelerator.
+        //!
+        //! The default implementation returns 0, which lets the accelerator compiler and runtime choose the warp size.
+        template<typename TKernelFnObj, typename TAcc, typename TSfinae = void>
+        struct WarpSize
+        {
+            static constexpr std::uint32_t warp_size = 0u;
+        };
+
+
         //! The trait for getting the schedule to use when a kernel is run using the CpuOmp2Blocks accelerator.
         //!
         //! Has no effect on other accelerators.
diff --git a/include/alpaka/warp/WarpGenericSycl.hpp b/include/alpaka/warp/WarpGenericSycl.hpp
index 017c7a252fb4..356ee16e2c63 100644
--- a/include/alpaka/warp/WarpGenericSycl.hpp
+++ b/include/alpaka/warp/WarpGenericSycl.hpp
@@ -36,7 +36,7 @@ namespace alpaka::warp::trait
         {
             auto const sub_group = warp.m_item.get_sub_group();
             // SYCL sub-groups are always 1D
-            return static_cast<std::int32_t>(sub_group.get_local_linear_range());
+            return static_cast<std::int32_t>(sub_group.get_max_local_range()[0]);
         }
     };
 
diff --git a/test/unit/warp/src/Activemask.cpp b/test/unit/warp/src/Activemask.cpp
index afc492ffc9a7..76431d44c557 100644
--- a/test/unit/warp/src/Activemask.cpp
+++ b/test/unit/warp/src/Activemask.cpp
@@ -1,4 +1,4 @@
-/* Copyright 2023 Sergei Bastrakov, Bernhard Manfred Gruber, Jan Stephan
+/* Copyright 2023 Sergei Bastrakov, Bernhard Manfred Gruber, Jan Stephan, Andrea Bocci
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -25,6 +25,7 @@ struct ActivemaskSingleThreadWarpTestKernel
     }
 };
 
+template<std::uint32_t TWarpSize>
 struct ActivemaskMultipleThreadWarpTestKernel
 {
     ALPAKA_NO_HOST_ACC_WARNING
@@ -53,6 +54,15 @@ struct ActivemaskMultipleThreadWarpTestKernel
     }
 };
 
+namespace alpaka::trait
+{
+    template<std::uint32_t TWarpSize, typename TAcc>
+    struct WarpSize<ActivemaskMultipleThreadWarpTestKernel<TWarpSize>, TAcc>
+    {
+        static constexpr std::uint32_t warp_size = TWarpSize;
+    };
+} // namespace alpaka::trait
+
 TEMPLATE_LIST_TEST_CASE("activemask", "[warp]", alpaka::test::TestAccs)
 {
     using Acc = TestType;
@@ -68,7 +78,7 @@ TEMPLATE_LIST_TEST_CASE("activemask", "[warp]", alpaka::test::TestAccs)
         if(scalar)
         {
             alpaka::test::KernelExecutionFixture<Acc> fixture(alpaka::Vec<Dim, Idx>::all(4));
-            REQUIRE(fixture(ActivemaskSingleThreadWarpTestKernel{}));
+            CHECK(fixture(ActivemaskSingleThreadWarpTestKernel{}));
         }
         else
         {
@@ -80,9 +90,27 @@ TEMPLATE_LIST_TEST_CASE("activemask", "[warp]", alpaka::test::TestAccs)
             auto const threadElementExtent = alpaka::Vec<Dim, Idx>::ones();
             auto workDiv = typename ExecutionFixture::WorkDiv{gridBlockExtent, blockThreadExtent, threadElementExtent};
             auto fixture = ExecutionFixture{workDiv};
-            ActivemaskMultipleThreadWarpTestKernel kernel;
-            for(auto inactiveThreadIdx = 0u; inactiveThreadIdx < warpExtent; inactiveThreadIdx++)
-                REQUIRE(fixture(kernel, inactiveThreadIdx));
+            if(warpExtent == 8)
+            {
+                for(auto inactiveThreadIdx = 0u; inactiveThreadIdx < warpExtent; inactiveThreadIdx++)
+                {
+                    CHECK(fixture(ActivemaskMultipleThreadWarpTestKernel<8>{}, inactiveThreadIdx));
+                }
+            }
+            else if(warpExtent == 16)
+            {
+                for(auto inactiveThreadIdx = 0u; inactiveThreadIdx < warpExtent; inactiveThreadIdx++)
+                {
+                    CHECK(fixture(ActivemaskMultipleThreadWarpTestKernel<16>{}, inactiveThreadIdx));
+                }
+            }
+            else if(warpExtent == 32)
+            {
+                for(auto inactiveThreadIdx = 0u; inactiveThreadIdx < warpExtent; inactiveThreadIdx++)
+                {
+                    CHECK(fixture(ActivemaskMultipleThreadWarpTestKernel<32>{}, inactiveThreadIdx));
+                }
+            }
         }
     }
 }
diff --git a/test/unit/warp/src/All.cpp b/test/unit/warp/src/All.cpp
index 4ea233e3179b..27468cd8803c 100644
--- a/test/unit/warp/src/All.cpp
+++ b/test/unit/warp/src/All.cpp
@@ -1,4 +1,4 @@
-/* Copyright 2023 Sergei Bastrakov, Bernhard Manfred Gruber, Jan Stephan
+/* Copyright 2023 Sergei Bastrakov, Bernhard Manfred Gruber, Jan Stephan, Andrea Bocci
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -25,6 +25,7 @@ struct AllSingleThreadWarpTestKernel
     }
 };
 
+template<std::uint32_t TWarpSize>
 struct AllMultipleThreadWarpTestKernel
 {
     ALPAKA_NO_HOST_ACC_WARNING
@@ -57,6 +58,15 @@ struct AllMultipleThreadWarpTestKernel
     }
 };
 
+namespace alpaka::trait
+{
+    template<std::uint32_t TWarpSize, typename TAcc>
+    struct WarpSize<AllMultipleThreadWarpTestKernel<TWarpSize>, TAcc>
+    {
+        static constexpr std::uint32_t warp_size = TWarpSize;
+    };
+} // namespace alpaka::trait
+
 TEMPLATE_LIST_TEST_CASE("all", "[warp]", alpaka::test::TestAccs)
 {
     using Acc = TestType;
@@ -84,8 +94,18 @@ TEMPLATE_LIST_TEST_CASE("all", "[warp]", alpaka::test::TestAccs)
             auto const threadElementExtent = alpaka::Vec<Dim, Idx>::ones();
             auto workDiv = typename ExecutionFixture::WorkDiv{gridBlockExtent, blockThreadExtent, threadElementExtent};
             auto fixture = ExecutionFixture{workDiv};
-            AllMultipleThreadWarpTestKernel kernel;
-            REQUIRE(fixture(kernel));
+            if(warpExtent == 8)
+            {
+                REQUIRE(fixture(AllMultipleThreadWarpTestKernel<8>{}));
+            }
+            else if(warpExtent == 16)
+            {
+                REQUIRE(fixture(AllMultipleThreadWarpTestKernel<16>{}));
+            }
+            else if(warpExtent == 32)
+            {
+                REQUIRE(fixture(AllMultipleThreadWarpTestKernel<32>{}));
+            }
         }
     }
 }
diff --git a/test/unit/warp/src/Any.cpp b/test/unit/warp/src/Any.cpp
index 72b247ecfe63..a00f303fc0fa 100644
--- a/test/unit/warp/src/Any.cpp
+++ b/test/unit/warp/src/Any.cpp
@@ -1,4 +1,4 @@
-/* Copyright 2023 Sergei Bastrakov, Bernhard Manfred Gruber, Jan Stephan, Aurora Perego
+/* Copyright 2023 Sergei Bastrakov, Bernhard Manfred Gruber, Jan Stephan, Andrea Bocci, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -25,6 +25,7 @@ struct AnySingleThreadWarpTestKernel
     }
 };
 
+template<std::uint32_t TWarpSize>
 struct AnyMultipleThreadWarpTestKernel
 {
     ALPAKA_NO_HOST_ACC_WARNING
@@ -57,6 +58,15 @@ struct AnyMultipleThreadWarpTestKernel
     }
 };
 
+namespace alpaka::trait
+{
+    template<std::uint32_t TWarpSize, typename TAcc>
+    struct WarpSize<AnyMultipleThreadWarpTestKernel<TWarpSize>, TAcc>
+    {
+        static constexpr std::uint32_t warp_size = TWarpSize;
+    };
+} // namespace alpaka::trait
+
 TEMPLATE_LIST_TEST_CASE("any", "[warp]", alpaka::test::TestAccs)
 {
     using Acc = TestType;
@@ -84,8 +94,18 @@ TEMPLATE_LIST_TEST_CASE("any", "[warp]", alpaka::test::TestAccs)
             auto const threadElementExtent = alpaka::Vec<Dim, Idx>::ones();
             auto workDiv = typename ExecutionFixture::WorkDiv{gridBlockExtent, blockThreadExtent, threadElementExtent};
             auto fixture = ExecutionFixture{workDiv};
-            AnyMultipleThreadWarpTestKernel kernel;
-            REQUIRE(fixture(kernel));
+            if(warpExtent == 8)
+            {
+                REQUIRE(fixture(AnyMultipleThreadWarpTestKernel<8>{}));
+            }
+            else if(warpExtent == 16)
+            {
+                REQUIRE(fixture(AnyMultipleThreadWarpTestKernel<16>{}));
+            }
+            else if(warpExtent == 32)
+            {
+                REQUIRE(fixture(AnyMultipleThreadWarpTestKernel<32>{}));
+            }
         }
     }
 }
diff --git a/test/unit/warp/src/Ballot.cpp b/test/unit/warp/src/Ballot.cpp
index cf5b59d79445..edf8c59cd575 100644
--- a/test/unit/warp/src/Ballot.cpp
+++ b/test/unit/warp/src/Ballot.cpp
@@ -1,4 +1,4 @@
-/* Copyright 2023 Sergei Bastrakov, Bernhard Manfred Gruber, Jan Stephan
+/* Copyright 2023 Sergei Bastrakov, Bernhard Manfred Gruber, Jan Stephan, Andrea Bocci
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -26,6 +26,7 @@ struct BallotSingleThreadWarpTestKernel
     }
 };
 
+template<std::uint32_t TWarpSize>
 struct BallotMultipleThreadWarpTestKernel
 {
     ALPAKA_NO_HOST_ACC_WARNING
@@ -65,6 +66,15 @@ struct BallotMultipleThreadWarpTestKernel
     }
 };
 
+namespace alpaka::trait
+{
+    template<std::uint32_t TWarpSize, typename TAcc>
+    struct WarpSize<BallotMultipleThreadWarpTestKernel<TWarpSize>, TAcc>
+    {
+        static constexpr std::uint32_t warp_size = TWarpSize;
+    };
+} // namespace alpaka::trait
+
 TEMPLATE_LIST_TEST_CASE("ballot", "[warp]", alpaka::test::TestAccs)
 {
     using Acc = TestType;
@@ -92,8 +102,18 @@ TEMPLATE_LIST_TEST_CASE("ballot", "[warp]", alpaka::test::TestAccs)
             auto const threadElementExtent = alpaka::Vec<Dim, Idx>::ones();
             auto workDiv = typename ExecutionFixture::WorkDiv{gridBlockExtent, blockThreadExtent, threadElementExtent};
             auto fixture = ExecutionFixture{workDiv};
-            BallotMultipleThreadWarpTestKernel kernel;
-            REQUIRE(fixture(kernel));
+            if(warpExtent == 8)
+            {
+                REQUIRE(fixture(BallotMultipleThreadWarpTestKernel<8>{}));
+            }
+            else if(warpExtent == 16)
+            {
+                REQUIRE(fixture(BallotMultipleThreadWarpTestKernel<16>{}));
+            }
+            else if(warpExtent == 32)
+            {
+                REQUIRE(fixture(BallotMultipleThreadWarpTestKernel<32>{}));
+            }
         }
     }
 }
diff --git a/test/unit/warp/src/Shfl.cpp b/test/unit/warp/src/Shfl.cpp
index 63f5b81a5f77..d9ec5d59ee5d 100644
--- a/test/unit/warp/src/Shfl.cpp
+++ b/test/unit/warp/src/Shfl.cpp
@@ -1,4 +1,4 @@
-/* Copyright 2023 David M. Rogers, Jan Stephan
+/* Copyright 2023 David M. Rogers, Jan Stephan, Andrea Bocci
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -35,6 +35,7 @@ struct ShflSingleThreadWarpTestKernel
     }
 };
 
+template<std::uint32_t TWarpSize>
 struct ShflMultipleThreadWarpTestKernel
 {
     ALPAKA_NO_HOST_ACC_WARNING
@@ -87,6 +88,15 @@ struct ShflMultipleThreadWarpTestKernel
     }
 };
 
+namespace alpaka::trait
+{
+    template<std::uint32_t TWarpSize, typename TAcc>
+    struct WarpSize<ShflMultipleThreadWarpTestKernel<TWarpSize>, TAcc>
+    {
+        static constexpr std::uint32_t warp_size = TWarpSize;
+    };
+} // namespace alpaka::trait
+
 TEMPLATE_LIST_TEST_CASE("shfl", "[warp]", alpaka::test::TestAccs)
 {
     using Acc = TestType;
@@ -114,8 +124,18 @@ TEMPLATE_LIST_TEST_CASE("shfl", "[warp]", alpaka::test::TestAccs)
             auto const threadElementExtent = alpaka::Vec<Dim, Idx>::ones();
             auto workDiv = typename ExecutionFixture::WorkDiv{gridBlockExtent, blockThreadExtent, threadElementExtent};
             auto fixture = ExecutionFixture{workDiv};
-            ShflMultipleThreadWarpTestKernel kernel;
-            REQUIRE(fixture(kernel));
+            if(warpExtent == 8)
+            {
+                REQUIRE(fixture(ShflMultipleThreadWarpTestKernel<8>{}));
+            }
+            else if(warpExtent == 16)
+            {
+                REQUIRE(fixture(ShflMultipleThreadWarpTestKernel<16>{}));
+            }
+            else if(warpExtent == 32)
+            {
+                REQUIRE(fixture(ShflMultipleThreadWarpTestKernel<32>{}));
+            }
         }
     }
 }

From b26b05a34b222ba1e08bbc178cf10e34200f9aff Mon Sep 17 00:00:00 2001
From: AuroraPerego <aurora.perego@cern.ch>
Date: Wed, 5 Jul 2023 09:13:58 +0200
Subject: [PATCH 08/10] Support compile-time warp size in SYCL kernels

Introduce a new optional trait to describe at compile time the warp size that a
kernel should use.  The default behaviour is to let the back-end compiler pick
the preferred size.

Before launching a kernel with a compile-time sub-group size the user should
query the sizes supported by the device, and choose accordingly.  If the device
does not support the requested size, the SYCL runtime will throw a synchronous
exception.

During just-in-time (JIT) compilation this guarantees that a kernel is compiled
only for the sizes supported by the device.  During ahead-of-time (AOT)
compilation this is not enough, because the device is not known at compile
time.  The SYCL specification mandates that the back-end compilers should not
fail if a kernel uses unsupported features, like unsupported sub-group sizes.
Unfortunately the Intel OpenCL CPU and GPU compilers currently fail with a hard
error.  To work around this limitation, use the preprocessor macros defined
when compiling AOT for the new SYCL targets to enable the compilation only for
the sub-group sizes supported by each device.

Note: while the CPU OpenCL back-end does support a sub-group size of 64, the
SYCL code currently does not.  To avoid issues with the sub-group primitives
always consider the sub-group size of 64 as not supported by the device.

Other changes:
  - remove the use of SYCL streams in favour of the printf() extension;
  - remove the ALPAKA_FN_HOST attribute;
  - fix the GetSize test for the different sub-group sizes;
  - fix the use of sycl::exceptions;
  - use different member names for nd_item in different classes, to avoid
    ambiguous name lookup error when accessing the nd_item in the accelerator
    object.
---
 cmake/alpakaCommon.cmake                      |  12 +-
 include/alpaka/acc/AccGenericSycl.hpp         |  28 ---
 include/alpaka/core/Sycl.hpp                  |  15 --
 include/alpaka/dev/DevGenericSycl.hpp         |   9 +-
 include/alpaka/idx/bt/IdxBtGenericSycl.hpp    |  18 +-
 include/alpaka/idx/gb/IdxGbGenericSycl.hpp    |  18 +-
 include/alpaka/kernel/SyclSubgroupSize.hpp    |  97 ++++++++
 .../alpaka/kernel/TaskKernelGenericSycl.hpp   | 218 ++++++++++++------
 include/alpaka/kernel/Traits.hpp              |   8 +-
 include/alpaka/mem/buf/BufGenericSycl.hpp     |  37 ++-
 include/alpaka/pltf/PltfGenericSycl.hpp       |   3 +-
 include/alpaka/rand/RandGenericSycl.hpp       | 120 +++-------
 include/alpaka/warp/WarpGenericSycl.hpp       |  34 +--
 include/alpaka/workdiv/WorkDivGenericSycl.hpp |  30 +--
 test/unit/warp/src/Activemask.cpp             |  29 ++-
 test/unit/warp/src/All.cpp                    |  23 +-
 test/unit/warp/src/Any.cpp                    |  21 +-
 test/unit/warp/src/Ballot.cpp                 |  23 +-
 test/unit/warp/src/GetSize.cpp                |  30 ++-
 test/unit/warp/src/Shfl.cpp                   |  23 +-
 20 files changed, 471 insertions(+), 325 deletions(-)
 create mode 100644 include/alpaka/kernel/SyclSubgroupSize.hpp

diff --git a/cmake/alpakaCommon.cmake b/cmake/alpakaCommon.cmake
index e4ec50a8cd2b..27776497138d 100644
--- a/cmake/alpakaCommon.cmake
+++ b/cmake/alpakaCommon.cmake
@@ -1,5 +1,5 @@
 #
-# Copyright 2023 Benjamin Worpitz, Erik Zenker, Axel Hübl, Jan Stephan, René Widera, Jeffrey Kelling, Andrea Bocci, Bernhard Manfred Gruber
+# Copyright 2023 Benjamin Worpitz, Erik Zenker, Axel Hübl, Jan Stephan, René Widera, Jeffrey Kelling, Andrea Bocci, Bernhard Manfred Gruber, Aurora Perego
 # SPDX-License-Identifier: MPL-2.0
 #
 
@@ -547,9 +547,6 @@ if(alpaka_ACC_SYCL_ENABLE)
 
         # Enable device-side printing to stdout
         cmake_dependent_option(alpaka_SYCL_ENABLE_IOSTREAM "Enable device-side printing to stdout" OFF "alpaka_ACC_SYCL_ENABLE" OFF)
-        if(BUILD_TESTING)
-            set(alpaka_SYCL_ENABLE_IOSTREAM ON CACHE BOOL "Enable device-side printing to stdout" FORCE)
-        endif()
 
         alpaka_set_compiler_options(HOST_DEVICE target alpaka "-fsycl")
         target_link_options(alpaka INTERFACE "-fsycl")
@@ -559,7 +556,7 @@ if(alpaka_ACC_SYCL_ENABLE)
         # Determine SYCL targets
         set(alpaka_SYCL_ONEAPI_CPU_TARGET "spir64_x86_64")
         set(alpaka_SYCL_ONEAPI_FPGA_TARGET "spir64_fpga")
-        set(alpaka_SYCL_ONEAPI_GPU_TARGET "spir64_gen")
+        set(alpaka_SYCL_ONEAPI_GPU_TARGET ${alpaka_SYCL_ONEAPI_GPU_DEVICES})
 
         if(alpaka_SYCL_ONEAPI_CPU)
             list(APPEND alpaka_SYCL_TARGETS ${alpaka_SYCL_ONEAPI_CPU_TARGET})
@@ -620,14 +617,13 @@ if(alpaka_ACC_SYCL_ENABLE)
         if(alpaka_SYCL_ONEAPI_GPU)
             # Create a drop-down list (in cmake-gui) of valid Intel GPU targets. On the command line the user can specifiy
             # additional targets, such as ranges: "Gen8-Gen12LP" or lists: "icllp;skl".
-            set(alpaka_SYCL_ONEAPI_GPU_DEVICES "bdw" CACHE STRING "Intel GPU devices / generations to compile for")
+            set(alpaka_SYCL_ONEAPI_GPU_DEVICES "intel_gpu_pvc" CACHE STRING "Intel GPU devices / generations to compile for")
             set_property(CACHE alpaka_SYCL_ONEAPI_GPU_DEVICES
-                        PROPERTY STRINGS "bdw;skl;kbl;cfl;bxt;glk;whl;aml;cml;icllp;lkf;ehl;tgllp;rkl;adl-s;adl-p;dg1;acm-g10;ats-m150;dg2-g10;acm-g11;ats-m75;dg2-g11;acm-g12;dg2-g12;pvc-sdv;pvc;gen11;gen12lp;gen8;gen9;xe;xe-hpc;xe-hpg")
+                        PROPERTY STRINGS "intel_gpu_pvc;intel_gpu_acm_g12;intel_gpu_acm_g11;intel_gpu_acm_g10;intel_gpu_dg1;intel_gpu_adl_n;intel_gpu_adl_p;intel_gpu_rpl_s;intel_gpu_adl_s;intel_gpu_rkl;intel_gpu_tgllp;intel_gpu_icllp;intel_gpu_cml;intel_gpu_aml;intel_gpu_whl;intel_gpu_glk;intel_gpu_apl;intel_gpu_cfl;intel_gpu_kbl;intel_gpu_skl;intel_gpu_bdw")
             # If the user has given us a list turn all ';' into ',' to pacify the Intel OpenCL compiler.
             string(REPLACE ";" "," alpaka_SYCL_ONEAPI_GPU_DEVICES "${alpaka_SYCL_ONEAPI_GPU_DEVICES}")
             
             target_compile_definitions(alpaka INTERFACE "ALPAKA_SYCL_ONEAPI_GPU")
-            target_link_options(alpaka INTERFACE "SHELL:-Xsycl-target-backend=${alpaka_SYCL_ONEAPI_GPU_TARGET} \"-device ${alpaka_SYCL_ONEAPI_GPU_DEVICES}\"")
         endif()
 
         #-----------------------------------------------------------------------------------------------------------------
diff --git a/include/alpaka/acc/AccGenericSycl.hpp b/include/alpaka/acc/AccGenericSycl.hpp
index 76a735459940..ea9560e23b38 100644
--- a/include/alpaka/acc/AccGenericSycl.hpp
+++ b/include/alpaka/acc/AccGenericSycl.hpp
@@ -68,33 +68,6 @@ namespace alpaka
         auto operator=(AccGenericSycl const&) -> AccGenericSycl& = delete;
         auto operator=(AccGenericSycl&&) -> AccGenericSycl& = delete;
 
-#    ifdef ALPAKA_SYCL_IOSTREAM_ENABLED
-        AccGenericSycl(
-            Vec<TDim, TIdx> const& threadElemExtent,
-            sycl::nd_item<TDim::value> work_item,
-            sycl::local_accessor<std::byte> dyn_shared_acc,
-            sycl::local_accessor<std::byte> st_shared_acc,
-            sycl::accessor<int, 1, sycl::access_mode::read_write, sycl::target::device> global_fence_dummy,
-            sycl::local_accessor<int> local_fence_dummy,
-            sycl::stream output_stream)
-            : WorkDivGenericSycl<TDim, TIdx>{threadElemExtent, work_item}
-            , gb::IdxGbGenericSycl<TDim, TIdx>{work_item}
-            , bt::IdxBtGenericSycl<TDim, TIdx>{work_item}
-            , AtomicHierarchy<AtomicGenericSycl, AtomicGenericSycl, AtomicGenericSycl>{}
-            , math::MathGenericSycl{}
-            , BlockSharedMemDynGenericSycl{dyn_shared_acc}
-            , BlockSharedMemStGenericSycl{st_shared_acc}
-            , BlockSyncGenericSycl<TDim>{work_item}
-            , IntrinsicGenericSycl{}
-            , MemFenceGenericSycl{global_fence_dummy, local_fence_dummy}
-            , rand::RandGenericSycl<TDim>{work_item}
-            , warp::WarpGenericSycl<TDim>{work_item}
-            , cout{output_stream}
-        {
-        }
-
-        sycl::stream cout;
-#    else
         AccGenericSycl(
             Vec<TDim, TIdx> const& threadElemExtent,
             sycl::nd_item<TDim::value> work_item,
@@ -116,7 +89,6 @@ namespace alpaka
             , warp::WarpGenericSycl<TDim>{work_item}
         {
         }
-#    endif
     };
 } // namespace alpaka
 
diff --git a/include/alpaka/core/Sycl.hpp b/include/alpaka/core/Sycl.hpp
index 409f9951a48b..8ba75ef6ecda 100644
--- a/include/alpaka/core/Sycl.hpp
+++ b/include/alpaka/core/Sycl.hpp
@@ -80,7 +80,6 @@ namespace alpaka
 
               // 2 component vector types
               sycl::char2,
-              sycl::schar2,
               sycl::uchar2,
               sycl::short2,
               sycl::ushort2,
@@ -88,15 +87,12 @@ namespace alpaka
               sycl::uint2,
               sycl::long2,
               sycl::ulong2,
-              sycl::longlong2,
-              sycl::ulonglong2,
               sycl::float2,
               sycl::double2,
               sycl::half2,
 
               // 3 component vector types
               sycl::char3,
-              sycl::schar3,
               sycl::uchar3,
               sycl::short3,
               sycl::ushort3,
@@ -104,15 +100,12 @@ namespace alpaka
               sycl::uint3,
               sycl::long3,
               sycl::ulong3,
-              sycl::longlong3,
-              sycl::ulonglong3,
               sycl::float3,
               sycl::double3,
               sycl::half3,
 
               // 4 component vector types
               sycl::char4,
-              sycl::schar4,
               sycl::uchar4,
               sycl::short4,
               sycl::ushort4,
@@ -120,15 +113,12 @@ namespace alpaka
               sycl::uint4,
               sycl::long4,
               sycl::ulong4,
-              sycl::longlong4,
-              sycl::ulonglong4,
               sycl::float4,
               sycl::double4,
               sycl::half4,
 
               // 8 component vector types
               sycl::char8,
-              sycl::schar8,
               sycl::uchar8,
               sycl::short8,
               sycl::ushort8,
@@ -136,15 +126,12 @@ namespace alpaka
               sycl::uint8,
               sycl::long8,
               sycl::ulong8,
-              sycl::longlong8,
-              sycl::ulonglong8,
               sycl::float8,
               sycl::double8,
               sycl::half8,
 
               // 16 component vector types
               sycl::char16,
-              sycl::schar16,
               sycl::uchar16,
               sycl::short16,
               sycl::ushort16,
@@ -152,8 +139,6 @@ namespace alpaka
               sycl::uint16,
               sycl::long16,
               sycl::ulong16,
-              sycl::longlong16,
-              sycl::ulonglong16,
               sycl::float16,
               sycl::double16,
               sycl::half16>
diff --git a/include/alpaka/dev/DevGenericSycl.hpp b/include/alpaka/dev/DevGenericSycl.hpp
index aa5027d23513..0a6875a77ba6 100644
--- a/include/alpaka/dev/DevGenericSycl.hpp
+++ b/include/alpaka/dev/DevGenericSycl.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2022 Jan Stephan, Antonio Di Pilato, Luca Ferragina
+/* Copyright 2023 Jan Stephan, Antonio Di Pilato, Luca Ferragina, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -176,7 +176,12 @@ namespace alpaka::trait
         static auto getWarpSizes(DevGenericSycl<TPltf> const& dev) -> std::vector<std::size_t>
         {
             auto const device = dev.getNativeHandle().first;
-            return device.template get_info<sycl::info::device::sub_group_sizes>();
+            std::vector<std::size_t> warp_sizes = device.template get_info<sycl::info::device::sub_group_sizes>();
+            // The CPU runtime supports a sub-group size of 64, but the SYCL implementation currently does not
+            auto find64 = std::find(warp_sizes.begin(), warp_sizes.end(), 64);
+            if(find64 != warp_sizes.end())
+                warp_sizes.erase(find64);
+            return warp_sizes;
         }
     };
 
diff --git a/include/alpaka/idx/bt/IdxBtGenericSycl.hpp b/include/alpaka/idx/bt/IdxBtGenericSycl.hpp
index 88628d343c18..9d68deb826a6 100644
--- a/include/alpaka/idx/bt/IdxBtGenericSycl.hpp
+++ b/include/alpaka/idx/bt/IdxBtGenericSycl.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2022 Jan Stephan
+/* Copyright 2023 Jan Stephan, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -23,11 +23,11 @@ namespace alpaka::bt
     public:
         using IdxBtBase = IdxBtGenericSycl;
 
-        explicit IdxBtGenericSycl(sycl::nd_item<TDim::value> work_item) : my_item{work_item}
+        explicit IdxBtGenericSycl(sycl::nd_item<TDim::value> work_item) : m_item_bt{work_item}
         {
         }
 
-        sycl::nd_item<TDim::value> my_item;
+        sycl::nd_item<TDim::value> m_item_bt;
     };
 } // namespace alpaka::bt
 
@@ -49,19 +49,19 @@ namespace alpaka::trait
         static auto getIdx(bt::IdxBtGenericSycl<TDim, TIdx> const& idx, TWorkDiv const&) -> Vec<TDim, TIdx>
         {
             if constexpr(TDim::value == 1)
-                return Vec<TDim, TIdx>{static_cast<TIdx>(idx.my_item.get_local_id(0))};
+                return Vec<TDim, TIdx>{static_cast<TIdx>(idx.m_item_bt.get_local_id(0))};
             else if constexpr(TDim::value == 2)
             {
                 return Vec<TDim, TIdx>{
-                    static_cast<TIdx>(idx.my_item.get_local_id(1)),
-                    static_cast<TIdx>(idx.my_item.get_local_id(0))};
+                    static_cast<TIdx>(idx.m_item_bt.get_local_id(1)),
+                    static_cast<TIdx>(idx.m_item_bt.get_local_id(0))};
             }
             else
             {
                 return Vec<TDim, TIdx>{
-                    static_cast<TIdx>(idx.my_item.get_local_id(2)),
-                    static_cast<TIdx>(idx.my_item.get_local_id(1)),
-                    static_cast<TIdx>(idx.my_item.get_local_id(0))};
+                    static_cast<TIdx>(idx.m_item_bt.get_local_id(2)),
+                    static_cast<TIdx>(idx.m_item_bt.get_local_id(1)),
+                    static_cast<TIdx>(idx.m_item_bt.get_local_id(0))};
             }
         }
     };
diff --git a/include/alpaka/idx/gb/IdxGbGenericSycl.hpp b/include/alpaka/idx/gb/IdxGbGenericSycl.hpp
index 8fa8a5bb0f35..e7500c0f6144 100644
--- a/include/alpaka/idx/gb/IdxGbGenericSycl.hpp
+++ b/include/alpaka/idx/gb/IdxGbGenericSycl.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2022 Jan Stephan
+/* Copyright 2023 Jan Stephan, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -23,11 +23,11 @@ namespace alpaka::gb
     public:
         using IdxGbBase = IdxGbGenericSycl;
 
-        explicit IdxGbGenericSycl(sycl::nd_item<TDim::value> work_item) : my_item{work_item}
+        explicit IdxGbGenericSycl(sycl::nd_item<TDim::value> work_item) : m_item_gb{work_item}
         {
         }
 
-        sycl::nd_item<TDim::value> my_item;
+        sycl::nd_item<TDim::value> m_item_gb;
     };
 } // namespace alpaka::gb
 
@@ -49,19 +49,19 @@ namespace alpaka::trait
         static auto getIdx(gb::IdxGbGenericSycl<TDim, TIdx> const& idx, TWorkDiv const&)
         {
             if constexpr(TDim::value == 1)
-                return Vec<TDim, TIdx>(static_cast<TIdx>(idx.my_item.get_group(0)));
+                return Vec<TDim, TIdx>(static_cast<TIdx>(idx.m_item_gb.get_group(0)));
             else if constexpr(TDim::value == 2)
             {
                 return Vec<TDim, TIdx>(
-                    static_cast<TIdx>(idx.my_item.get_group(1)),
-                    static_cast<TIdx>(idx.my_item.get_group(0)));
+                    static_cast<TIdx>(idx.m_item_gb.get_group(1)),
+                    static_cast<TIdx>(idx.m_item_gb.get_group(0)));
             }
             else
             {
                 return Vec<TDim, TIdx>(
-                    static_cast<TIdx>(idx.my_item.get_group(2)),
-                    static_cast<TIdx>(idx.my_item.get_group(1)),
-                    static_cast<TIdx>(idx.my_item.get_group(0)));
+                    static_cast<TIdx>(idx.m_item_gb.get_group(2)),
+                    static_cast<TIdx>(idx.m_item_gb.get_group(1)),
+                    static_cast<TIdx>(idx.m_item_gb.get_group(0)));
             }
         }
     };
diff --git a/include/alpaka/kernel/SyclSubgroupSize.hpp b/include/alpaka/kernel/SyclSubgroupSize.hpp
new file mode 100644
index 000000000000..b56b652c7b9e
--- /dev/null
+++ b/include/alpaka/kernel/SyclSubgroupSize.hpp
@@ -0,0 +1,97 @@
+/* Copyright 2023 Andrea Bocci, Aurora Perego
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#ifdef ALPAKA_ACC_SYCL_ENABLED
+
+#    ifdef __SYCL_DEVICE_ONLY__
+
+#        if defined(__SYCL_TARGET_INTEL_GPU_BDW__) || /* Broadwell Intel graphics architecture */                     \
+            defined(__SYCL_TARGET_INTEL_GPU_SKL__) || /* Skylake Intel graphics architecture */                       \
+            defined(__SYCL_TARGET_INTEL_GPU_KBL__) || /* Kaby Lake Intel graphics architecture */                     \
+            defined(__SYCL_TARGET_INTEL_GPU_CFL__) || /* Coffee Lake Intel graphics architecture */                   \
+            defined(__SYCL_TARGET_INTEL_GPU_APL__) || /* Apollo Lake Intel graphics architecture */                   \
+            defined(__SYCL_TARGET_INTEL_GPU_GLK__) || /* Gemini Lake Intel graphics architecture */                   \
+            defined(__SYCL_TARGET_INTEL_GPU_WHL__) || /* Whiskey Lake Intel graphics architecture */                  \
+            defined(__SYCL_TARGET_INTEL_GPU_AML__) || /* Amber Lake Intel graphics architecture */                    \
+            defined(__SYCL_TARGET_INTEL_GPU_CML__) || /* Comet Lake Intel graphics architecture */                    \
+            defined(__SYCL_TARGET_INTEL_GPU_ICLLP__) || /* Ice Lake Intel graphics architecture */                    \
+            defined(__SYCL_TARGET_INTEL_GPU_TGLLP__) || /* Tiger Lake Intel graphics architecture */                  \
+            defined(__SYCL_TARGET_INTEL_GPU_RKL__) || /* Rocket Lake Intel graphics architecture */                   \
+            defined(__SYCL_TARGET_INTEL_GPU_ADL_S__) || /* Alder Lake S Intel graphics architecture */                \
+            defined(__SYCL_TARGET_INTEL_GPU_RPL_S__) || /* Raptor Lake Intel graphics architecture */                 \
+            defined(__SYCL_TARGET_INTEL_GPU_ADL_P__) || /* Alder Lake P Intel graphics architecture */                \
+            defined(__SYCL_TARGET_INTEL_GPU_ADL_N__) || /* Alder Lake N Intel graphics architecture */                \
+            defined(__SYCL_TARGET_INTEL_GPU_DG1__) || /* DG1 Intel graphics architecture */                           \
+            defined(__SYCL_TARGET_INTEL_GPU_ACM_G10__) || /* Alchemist G10 Intel graphics architecture */             \
+            defined(__SYCL_TARGET_INTEL_GPU_ACM_G11__) || /* Alchemist G11 Intel graphics architecture */             \
+            defined(__SYCL_TARGET_INTEL_GPU_ACM_G12__) /* Alchemist G12 Intel graphics architecture */
+
+#            define SYCL_SUBGROUP_SIZE (8 | 16 | 32)
+
+#        elif defined(__SYCL_TARGET_INTEL_GPU_PVC__) /* Ponte Vecchio Intel graphics architecture */
+
+#            define SYCL_SUBGROUP_SIZE (16 | 32)
+
+#        elif defined(__SYCL_TARGET_INTEL_X86_64__) /* generate code ahead of time for x86_64 CPUs */
+
+#            define SYCL_SUBGROUP_SIZE (4 | 8 | 16 | 32 | 64)
+
+#        elif defined(__SYCL_TARGET_NVIDIA_GPU_SM_50__) || /* NVIDIA Maxwell architecture (compute capability 5.0) */ \
+            defined(__SYCL_TARGET_NVIDIA_GPU_SM_52__) || /* NVIDIA Maxwell architecture (compute capability 5.2) */   \
+            defined(__SYCL_TARGET_NVIDIA_GPU_SM_53__) || /* NVIDIA Jetson TX1 / Nano (compute capability 5.3) */      \
+            defined(__SYCL_TARGET_NVIDIA_GPU_SM_60__) || /* NVIDIA Pascal architecture (compute capability 6.0) */    \
+            defined(__SYCL_TARGET_NVIDIA_GPU_SM_61__) || /* NVIDIA Pascal architecture (compute capability 6.1) */    \
+            defined(__SYCL_TARGET_NVIDIA_GPU_SM_62__) || /* NVIDIA Jetson TX2 (compute capability 6.2) */             \
+            defined(__SYCL_TARGET_NVIDIA_GPU_SM_70__) || /* NVIDIA Volta architecture (compute capability 7.0) */     \
+            defined(__SYCL_TARGET_NVIDIA_GPU_SM_72__) || /* NVIDIA Jetson AGX (compute capability 7.2) */             \
+            defined(__SYCL_TARGET_NVIDIA_GPU_SM_75__) || /* NVIDIA Turing architecture (compute capability 7.5) */    \
+            defined(__SYCL_TARGET_NVIDIA_GPU_SM_80__) || /* NVIDIA Ampere architecture (compute capability 8.0) */    \
+            defined(__SYCL_TARGET_NVIDIA_GPU_SM_86__) || /* NVIDIA Ampere architecture (compute capability 8.6) */    \
+            defined(__SYCL_TARGET_NVIDIA_GPU_SM_87__) || /* NVIDIA Jetson/Drive AGX Orin (compute capability 8.7) */  \
+            defined(__SYCL_TARGET_NVIDIA_GPU_SM_89__) || /* NVIDIA Ada Lovelace arch. (compute capability 8.9) */     \
+            defined(__SYCL_TARGET_NVIDIA_GPU_SM_90__) /* NVIDIA Hopper architecture (compute capability 9.0) */
+
+#            define SYCL_SUBGROUP_SIZE (32)
+
+#        elif defined(__SYCL_TARGET_AMD_GPU_GFX700__) || /* AMD GCN 2.0 Sea Islands architecture (gfx 7.0) */         \
+            defined(__SYCL_TARGET_AMD_GPU_GFX701__) || /* AMD GCN 2.0 Sea Islands architecture (gfx 7.0) */           \
+            defined(__SYCL_TARGET_AMD_GPU_GFX702__) || /* AMD GCN 2.0 Sea Islands architecture (gfx 7.0) */           \
+            defined(__SYCL_TARGET_AMD_GPU_GFX801__) || /* AMD GCN 3.0 Volcanic Islands architecture (gfx 8.0) */      \
+            defined(__SYCL_TARGET_AMD_GPU_GFX802__) || /* AMD GCN 3.0 Volcanic Islands architecture (gfx 8.0) */      \
+            defined(__SYCL_TARGET_AMD_GPU_GFX803__) || /* AMD GCN 4.0 Arctic Islands architecture (gfx 8.0) */        \
+            defined(__SYCL_TARGET_AMD_GPU_GFX805__) || /* AMD GCN 3.0 Volcanic Islands architecture (gfx 8.0) */      \
+            defined(__SYCL_TARGET_AMD_GPU_GFX810__) || /* AMD GCN 3.0 Volcanic Islands architecture (gfx 8.1) */      \
+            defined(__SYCL_TARGET_AMD_GPU_GFX900__) || /* AMD GCN 5.0 Vega architecture (gfx 9.0) */                  \
+            defined(__SYCL_TARGET_AMD_GPU_GFX902__) || /* AMD GCN 5.0 Vega architecture (gfx 9.0) */                  \
+            defined(__SYCL_TARGET_AMD_GPU_GFX904__) || /* AMD GCN 5.0 Vega architecture (gfx 9.0) */                  \
+            defined(__SYCL_TARGET_AMD_GPU_GFX906__) || /* AMD GCN 5.1 Vega II architecture (gfx 9.0) */               \
+            defined(__SYCL_TARGET_AMD_GPU_GFX908__) || /* AMD CDNA 1.0 Arcturus architecture (gfx 9.0) */             \
+            defined(__SYCL_TARGET_AMD_GPU_GFX90A__) /* AMD CDNA 2.0 Aldebaran architecture (gfx 9.0) */
+
+#            define SYCL_SUBGROUP_SIZE (64)
+
+#        elif defined(__SYCL_TARGET_AMD_GPU_GFX1010__) || /* AMD RDNA 1.0 Navi 10 architecture (gfx 10.1) */          \
+            defined(__SYCL_TARGET_AMD_GPU_GFX1011__) || /* AMD RDNA 1.0 Navi 12 architecture (gfx 10.1) */            \
+            defined(__SYCL_TARGET_AMD_GPU_GFX1012__) || /* AMD RDNA 1.0 Navi 14 architecture (gfx 10.1) */            \
+            defined(__SYCL_TARGET_AMD_GPU_GFX1013__) || /* AMD RDNA 2.0 Oberon architecture (gfx 10.1) */             \
+            defined(__SYCL_TARGET_AMD_GPU_GFX1030__) || /* AMD RDNA 2.0 Navi 21 architecture (gfx 10.3) */            \
+            defined(__SYCL_TARGET_AMD_GPU_GFX1031__) || /* AMD RDNA 2.0 Navi 22 architecture (gfx 10.3) */            \
+            defined(__SYCL_TARGET_AMD_GPU_GFX1032__) || /* AMD RDNA 2.0 Navi 23 architecture (gfx 10.3) */            \
+            defined(__SYCL_TARGET_AMD_GPU_GFX1034__) /* AMD RDNA 2.0 Navi 24 architecture (gfx 10.3) */
+
+#            define SYCL_SUBGROUP_SIZE (32 | 64)
+
+#        else // __SYCL_TARGET_*
+
+#            define SYCL_SUBGROUP_SIZE (0) /* unknown target */
+
+#        endif // __SYCL_TARGET_*
+
+#    else
+
+#        define SYCL_SUBGROUP_SIZE (0) /* host compilation */
+
+#    endif // __SYCL_DEVICE_ONLY__
+
+#endif // ALPAKA_ACC_SYCL_ENABLED
diff --git a/include/alpaka/kernel/TaskKernelGenericSycl.hpp b/include/alpaka/kernel/TaskKernelGenericSycl.hpp
index a173b42a0437..289a6f72c819 100644
--- a/include/alpaka/kernel/TaskKernelGenericSycl.hpp
+++ b/include/alpaka/kernel/TaskKernelGenericSycl.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2023 Jan Stephan, Andrea Bocci, Luca Ferragina
+/* Copyright 2023 Jan Stephan, Andrea Bocci, Luca Ferragina, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -13,6 +13,7 @@
 #include "alpaka/dev/Traits.hpp"
 #include "alpaka/dim/Traits.hpp"
 #include "alpaka/idx/Traits.hpp"
+#include "alpaka/kernel/SyclSubgroupSize.hpp"
 #include "alpaka/kernel/Traits.hpp"
 #include "alpaka/mem/buf/sycl/Accessor.hpp"
 #include "alpaka/pltf/Traits.hpp"
@@ -29,8 +30,72 @@
 
 #ifdef ALPAKA_ACC_SYCL_ENABLED
 
+#    if BOOST_COMP_CLANG
+#        pragma clang diagnostic push
+#        pragma clang diagnostic ignored "-Wunused-lambda-capture"
+#        pragma clang diagnostic ignored "-Wunused-parameter"
+#    endif
+
 #    include <CL/sycl.hpp>
 
+#    define LAUNCH_SYCL_KERNEL_IF_SUBGROUP_SIZE_IS(sub_group_size)                                                    \
+        cgh.parallel_for(                                                                                             \
+            sycl::nd_range<TDim::value>{global_size, local_size},                                                     \
+            [item_elements,                                                                                           \
+             dyn_shared_accessor,                                                                                     \
+             st_shared_accessor,                                                                                      \
+             global_fence_dummy,                                                                                      \
+             local_fence_dummy,                                                                                       \
+             k_func,                                                                                                  \
+             k_args](sycl::nd_item<TDim::value> work_item) [[intel::reqd_sub_group_size(sub_group_size)]]             \
+            {                                                                                                         \
+                auto acc = TAcc{                                                                                      \
+                    item_elements,                                                                                    \
+                    work_item,                                                                                        \
+                    dyn_shared_accessor,                                                                              \
+                    st_shared_accessor,                                                                               \
+                    global_fence_dummy,                                                                               \
+                    local_fence_dummy};                                                                               \
+                core::apply(                                                                                          \
+                    [k_func, &acc](typename std::decay_t<TArgs> const&... args) { k_func(acc, args...); },            \
+                    k_args);                                                                                          \
+            });
+
+#    define LAUNCH_SYCL_KERNEL_WITH_DEFAULT_SUBGROUP_SIZE                                                             \
+        cgh.parallel_for(                                                                                             \
+            sycl::nd_range<TDim::value>{global_size, local_size},                                                     \
+            [item_elements,                                                                                           \
+             dyn_shared_accessor,                                                                                     \
+             st_shared_accessor,                                                                                      \
+             global_fence_dummy,                                                                                      \
+             local_fence_dummy,                                                                                       \
+             k_func,                                                                                                  \
+             k_args](sycl::nd_item<TDim::value> work_item)                                                            \
+            {                                                                                                         \
+                auto acc = TAcc{                                                                                      \
+                    item_elements,                                                                                    \
+                    work_item,                                                                                        \
+                    dyn_shared_accessor,                                                                              \
+                    st_shared_accessor,                                                                               \
+                    global_fence_dummy,                                                                               \
+                    local_fence_dummy};                                                                               \
+                core::apply(                                                                                          \
+                    [k_func, &acc](typename std::decay_t<TArgs> const&... args) { k_func(acc, args...); },            \
+                    k_args);                                                                                          \
+            });
+
+#    define THROW_AND_LAUNCH_EMPTY_SYCL_KERNEL                                                                        \
+        throw sycl::exception(sycl::make_error_code(sycl::errc::kernel_not_supported));                               \
+        cgh.parallel_for(                                                                                             \
+            sycl::nd_range<TDim::value>{global_size, local_size},                                                     \
+            [item_elements,                                                                                           \
+             dyn_shared_accessor,                                                                                     \
+             st_shared_accessor,                                                                                      \
+             global_fence_dummy,                                                                                      \
+             local_fence_dummy,                                                                                       \
+             k_func,                                                                                                  \
+             k_args](sycl::nd_item<TDim::value> work_item) {});
+
 namespace alpaka::detail
 {
     template<typename TAcc, typename TKernelFnObj, typename... TArgs>
@@ -120,82 +185,86 @@ namespace alpaka
             auto k_func = m_kernelFnObj;
             auto k_args = m_args;
 
-#    ifdef ALPAKA_SYCL_IOSTREAM_ENABLED
-            // Set up device-side printing with (user-chosen value) KiB per block for the output buffer.
-            constexpr auto buf_size = std::size_t{ALPAKA_SYCL_IOSTREAM_KIB * 1024};
-            auto buf_per_work_item = std::size_t{};
-            if constexpr(TDim::value == 1)
-                buf_per_work_item = buf_size / static_cast<std::size_t>(group_items[0]);
-            else if constexpr(TDim::value == 2)
-                buf_per_work_item = buf_size / static_cast<std::size_t>(group_items[0] * group_items[1]);
-            else
-                buf_per_work_item
-                    = buf_size / static_cast<std::size_t>(group_items[0] * group_items[1] * group_items[2]);
-
-            assert(buf_per_work_item > 0);
+            constexpr std::size_t sub_group_size = trait::warpSize<TKernelFnObj, TAcc>;
+            bool supported = false;
 
-            auto output_stream = sycl::stream{buf_size, buf_per_work_item, cgh};
-#    endif
-            if constexpr(trait::WarpSize<TKernelFnObj, TAcc>::warp_size != 0)
+            if constexpr(sub_group_size == 0)
             {
-                // cgh.parallel_for<detail::kernel<TAcc, TKernelFnObj, TArgs...>>( //FIXME_
-                cgh.parallel_for(
-                    sycl::nd_range<TDim::value>{global_size, local_size},
-                    [=](sycl::nd_item<TDim::value> work_item)
-                        [[intel::reqd_sub_group_size(trait::WarpSize<TKernelFnObj, TAcc>::warp_size)]]
-                    {
-#    ifdef ALPAKA_SYCL_IOSTREAM_ENABLED
-                        auto acc = TAcc{
-                            item_elements,
-                            work_item,
-                            dyn_shared_accessor,
-                            st_shared_accessor,
-                            global_fence_dummy,
-                            local_fence_dummy,
-                            output_stream};
-#    else
-                        auto acc = TAcc{
-                            item_elements,
-                            work_item,
-                            dyn_shared_accessor,
-                            st_shared_accessor,
-                            global_fence_dummy,
-                            local_fence_dummy};
-#    endif
-                        core::apply(
-                            [k_func, &acc](typename std::decay_t<TArgs> const&... args) { k_func(acc, args...); },
-                            k_args);
-                    });
+                // no explicit subgroup size requirement
+                LAUNCH_SYCL_KERNEL_WITH_DEFAULT_SUBGROUP_SIZE
+                supported = true;
             }
             else
-            { // autodetect
-                // cgh.parallel_for<detail::kernel<TAcc, TKernelFnObj, TArgs...>>( //FIXME_
-                cgh.parallel_for(
-                    sycl::nd_range<TDim::value>{global_size, local_size},
-                    [=](sycl::nd_item<TDim::value> work_item)
-                    {
-#    ifdef ALPAKA_SYCL_IOSTREAM_ENABLED
-                        auto acc = TAcc{
-                            item_elements,
-                            work_item,
-                            dyn_shared_accessor,
-                            st_shared_accessor,
-                            global_fence_dummy,
-                            local_fence_dummy,
-                            output_stream};
+            {
+#    if(SYCL_SUBGROUP_SIZE == 0)
+                // no explicit SYCL target, assume JIT compilation
+                LAUNCH_SYCL_KERNEL_IF_SUBGROUP_SIZE_IS(sub_group_size)
+                supported = true;
 #    else
-                        auto acc = TAcc{
-                            item_elements,
-                            work_item,
-                            dyn_shared_accessor,
-                            st_shared_accessor,
-                            global_fence_dummy,
-                            local_fence_dummy};
+                // check if the kernel should be launched with a subgroup size of 4
+                if constexpr(sub_group_size == 4)
+                {
+#        if(SYCL_SUBGROUP_SIZE & 4)
+                    LAUNCH_SYCL_KERNEL_IF_SUBGROUP_SIZE_IS(4)
+                    supported = true;
+#        else
+                    // empty kernel, required to keep SYCL happy
+                    THROW_AND_LAUNCH_EMPTY_SYCL_KERNEL
+#        endif
+                }
+
+                // check if the kernel should be launched with a subgroup size of 8
+                if constexpr(sub_group_size == 8)
+                {
+#        if(SYCL_SUBGROUP_SIZE & 8)
+                    LAUNCH_SYCL_KERNEL_IF_SUBGROUP_SIZE_IS(8)
+                    supported = true;
+#        else
+                    // empty kernel, required to keep SYCL happy
+                    THROW_AND_LAUNCH_EMPTY_SYCL_KERNEL
+#        endif
+                }
+
+                // check if the kernel should be launched with a subgroup size of 16
+                if constexpr(sub_group_size == 16)
+                {
+#        if(SYCL_SUBGROUP_SIZE & 16)
+                    LAUNCH_SYCL_KERNEL_IF_SUBGROUP_SIZE_IS(16)
+                    supported = true;
+#        else
+                    // empty kernel, required to keep SYCL happy
+                    THROW_AND_LAUNCH_EMPTY_SYCL_KERNEL
+#        endif
+                }
+
+                // check if the kernel should be launched with a subgroup size of 32
+                if constexpr(sub_group_size == 32)
+                {
+#        if(SYCL_SUBGROUP_SIZE & 32)
+                    LAUNCH_SYCL_KERNEL_IF_SUBGROUP_SIZE_IS(32)
+                    supported = true;
+#        else
+                    // empty kernel, required to keep SYCL happy
+                    THROW_AND_LAUNCH_EMPTY_SYCL_KERNEL
+#        endif
+                }
+
+                // check if the kernel should be launched with a subgroup size of 64
+                if constexpr(sub_group_size == 64)
+                {
+#        if(SYCL_SUBGROUP_SIZE & 64)
+                    LAUNCH_SYCL_KERNEL_IF_SUBGROUP_SIZE_IS(64)
+                    supported = true;
+#        else
+                    // empty kernel, required to keep SYCL happy
+                    THROW_AND_LAUNCH_EMPTY_SYCL_KERNEL
+#        endif
+                }
 #    endif
-                        core::apply(
-                            [k_func, &acc](typename std::decay_t<TArgs> const&... args) { k_func(acc, args...); },
-                            k_args);
-                    });
+
+                // this subgroup size is not supported, raise an exception
+                if(not supported)
+                    throw sycl::exception(sycl::make_error_code(sycl::errc::kernel_not_supported));
             }
         }
 
@@ -238,8 +307,13 @@ namespace alpaka
         TKernelFnObj m_kernelFnObj;
         core::Tuple<std::decay_t<TArgs>...> m_args;
     };
+
 } // namespace alpaka
 
+#    if BOOST_COMP_CLANG
+#        pragma clang diagnostic pop
+#    endif
+
 namespace alpaka::trait
 {
     //! The SYCL execution task accelerator type trait specialization.
@@ -278,4 +352,6 @@ namespace alpaka::trait
     };
 } // namespace alpaka::trait
 
+#    undef LAUNCH_SYCL_KERNEL_IF_SUBGROUP_SIZE_IS
+
 #endif
diff --git a/include/alpaka/kernel/Traits.hpp b/include/alpaka/kernel/Traits.hpp
index a0c8d17a89e2..384b82873759 100644
--- a/include/alpaka/kernel/Traits.hpp
+++ b/include/alpaka/kernel/Traits.hpp
@@ -1,5 +1,5 @@
 /* Copyright 2023 Axel Huebl, Benjamin Worpitz, René Widera, Sergei Bastrakov, Jan Stephan, Bernhard Manfred Gruber,
- *                Andrea Bocci
+ *                Andrea Bocci, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -76,11 +76,13 @@ namespace alpaka
         //!
         //! The default implementation returns 0, which lets the accelerator compiler and runtime choose the warp size.
         template<typename TKernelFnObj, typename TAcc, typename TSfinae = void>
-        struct WarpSize
+        struct WarpSize : std::integral_constant<std::uint32_t, 0>
         {
-            static constexpr std::uint32_t warp_size = 0u;
         };
 
+        //! This is a shortcut for the trait defined above
+        template<typename TKernelFnObj, typename TAcc>
+        inline constexpr std::uint32_t warpSize = WarpSize<TKernelFnObj, TAcc>::value;
 
         //! The trait for getting the schedule to use when a kernel is run using the CpuOmp2Blocks accelerator.
         //!
diff --git a/include/alpaka/mem/buf/BufGenericSycl.hpp b/include/alpaka/mem/buf/BufGenericSycl.hpp
index e725c299a598..38241c83773b 100644
--- a/include/alpaka/mem/buf/BufGenericSycl.hpp
+++ b/include/alpaka/mem/buf/BufGenericSycl.hpp
@@ -73,7 +73,7 @@ namespace alpaka::trait
     template<typename TElem, typename TDim, typename TIdx, typename TPltf>
     struct GetDev<BufGenericSycl<TElem, TDim, TIdx, TPltf>>
     {
-        ALPAKA_FN_HOST static auto getDev(BufGenericSycl<TElem, TDim, TIdx, TPltf> const& buf)
+        static auto getDev(BufGenericSycl<TElem, TDim, TIdx, TPltf> const& buf)
         {
             return buf.m_dev;
         }
@@ -99,7 +99,7 @@ namespace alpaka::trait
     {
         static_assert(TDim::value > TIdxIntegralConst::value, "Requested dimension out of bounds");
 
-        ALPAKA_FN_HOST static auto getExtent(BufGenericSycl<TElem, TDim, TIdx, TPltf> const& buf) -> TIdx
+        static auto getExtent(BufGenericSycl<TElem, TDim, TIdx, TPltf> const& buf) -> TIdx
         {
             return buf.m_extentElements[TIdxIntegralConst::value];
         }
@@ -109,12 +109,12 @@ namespace alpaka::trait
     template<typename TElem, typename TDim, typename TIdx, typename TPltf>
     struct GetPtrNative<BufGenericSycl<TElem, TDim, TIdx, TPltf>>
     {
-        ALPAKA_FN_HOST static auto getPtrNative(BufGenericSycl<TElem, TDim, TIdx, TPltf> const& buf) -> TElem const*
+        static auto getPtrNative(BufGenericSycl<TElem, TDim, TIdx, TPltf> const& buf) -> TElem const*
         {
             return buf.m_spMem.get();
         }
 
-        ALPAKA_FN_HOST static auto getPtrNative(BufGenericSycl<TElem, TDim, TIdx, TPltf>& buf) -> TElem*
+        static auto getPtrNative(BufGenericSycl<TElem, TDim, TIdx, TPltf>& buf) -> TElem*
         {
             return buf.m_spMem.get();
         }
@@ -124,9 +124,8 @@ namespace alpaka::trait
     template<typename TElem, typename TDim, typename TIdx, typename TPltf>
     struct GetPtrDev<BufGenericSycl<TElem, TDim, TIdx, TPltf>, DevGenericSycl<TPltf>>
     {
-        ALPAKA_FN_HOST static auto getPtrDev(
-            BufGenericSycl<TElem, TDim, TIdx, TPltf> const& buf,
-            DevGenericSycl<TPltf> const& dev) -> TElem const*
+        static auto getPtrDev(BufGenericSycl<TElem, TDim, TIdx, TPltf> const& buf, DevGenericSycl<TPltf> const& dev)
+            -> TElem const*
         {
             if(dev == getDev(buf))
             {
@@ -138,9 +137,8 @@ namespace alpaka::trait
             }
         }
 
-        ALPAKA_FN_HOST static auto getPtrDev(
-            BufGenericSycl<TElem, TDim, TIdx, TPltf>& buf,
-            DevGenericSycl<TPltf> const& dev) -> TElem*
+        static auto getPtrDev(BufGenericSycl<TElem, TDim, TIdx, TPltf>& buf, DevGenericSycl<TPltf> const& dev)
+            -> TElem*
         {
             if(dev == getDev(buf))
             {
@@ -158,7 +156,7 @@ namespace alpaka::trait
     struct BufAlloc<TElem, TDim, TIdx, DevGenericSycl<TPltf>>
     {
         template<typename TExtent>
-        ALPAKA_FN_HOST static auto allocBuf(DevGenericSycl<TPltf> const& dev, TExtent const& extent)
+        static auto allocBuf(DevGenericSycl<TPltf> const& dev, TExtent const& extent)
             -> BufGenericSycl<TElem, TDim, TIdx, TPltf>
         {
             ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
@@ -192,10 +190,13 @@ namespace alpaka::trait
             }
 #    endif
 
+            auto const& [nativeDev, nativeContext] = dev.getNativeHandle();
             TElem* memPtr = sycl::malloc_device<TElem>(
                 static_cast<std::size_t>(getExtentProduct(extent)),
-                dev.getNativeHandle().first,
-                dev.getNativeHandle().second);
+                nativeDev,
+                nativeContext);
+            // captured structured bindings are a C++20 extension
+            // auto deleter = [nativeContext](TElem* ptr) { sycl::free(ptr, nativeContext); };
             auto deleter = [&dev](TElem* ptr) { sycl::free(ptr, dev.getNativeHandle().second); };
 
             return BufGenericSycl<TElem, TDim, TIdx, TPltf>(dev, memPtr, std::move(deleter), extent);
@@ -204,7 +205,7 @@ namespace alpaka::trait
 
     //! The BufGenericSycl stream-ordered memory allocation capability trait specialization.
     template<typename TDim, typename TPltf>
-    struct HasAsyncBufSupport<TDim, DevGenericSycl<TPltf>> : public std::false_type
+    struct HasAsyncBufSupport<TDim, DevGenericSycl<TPltf>> : std::false_type
     {
     };
 
@@ -223,8 +224,7 @@ namespace alpaka::trait
     struct BufAllocMapped
     {
         template<typename TExtent>
-        ALPAKA_FN_HOST static auto allocMappedBuf(DevCpu const& host, TExtent const& extent)
-            -> BufCpu<TElem, TDim, TIdx>
+        static auto allocMappedBuf(DevCpu const& host, TExtent const& extent) -> BufCpu<TElem, TDim, TIdx>
         {
             ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
 
@@ -249,12 +249,11 @@ namespace alpaka::trait
     template<typename TElem, typename TDim, typename TIdx, typename TPltf>
     struct GetPtrDev<BufCpu<TElem, TDim, TIdx>, DevGenericSycl<TPltf>>
     {
-        ALPAKA_FN_HOST static auto getPtrDev(BufCpu<TElem, TDim, TIdx> const& buf, DevGenericSycl<TPltf> const&)
-            -> TElem const*
+        static auto getPtrDev(BufCpu<TElem, TDim, TIdx> const& buf, DevGenericSycl<TPltf> const&) -> TElem const*
         {
             return getPtrNative(buf);
         }
-        ALPAKA_FN_HOST static auto getPtrDev(BufCpu<TElem, TDim, TIdx>& buf, DevGenericSycl<TPltf> const&) -> TElem*
+        static auto getPtrDev(BufCpu<TElem, TDim, TIdx>& buf, DevGenericSycl<TPltf> const&) -> TElem*
         {
             return getPtrNative(buf);
         }
diff --git a/include/alpaka/pltf/PltfGenericSycl.hpp b/include/alpaka/pltf/PltfGenericSycl.hpp
index 914b3c7d80cb..d2a33b959b70 100644
--- a/include/alpaka/pltf/PltfGenericSycl.hpp
+++ b/include/alpaka/pltf/PltfGenericSycl.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2023 Jan Stephan, Luca Ferragina
+/* Copyright 2023 Jan Stephan, Luca Ferragina, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -195,7 +195,6 @@ namespace alpaka::trait
 #        endif
 
             std::cout << "Aspects: " << '\n';
-            std::cout.flush();
 
 #        if defined(BOOST_COMP_ICPX)
 #            if BOOST_COMP_ICPX >= BOOST_VERSION_NUMBER(53, 2, 0)
diff --git a/include/alpaka/rand/RandGenericSycl.hpp b/include/alpaka/rand/RandGenericSycl.hpp
index 7f4d3a3e1c65..befaa5d7005a 100644
--- a/include/alpaka/rand/RandGenericSycl.hpp
+++ b/include/alpaka/rand/RandGenericSycl.hpp
@@ -4,12 +4,12 @@
 
 #pragma once
 
-#if defined(ALPAKA_ACC_SYCL_ENABLED)
+#include "alpaka/core/BoostPredef.hpp"
+#include "alpaka/core/Concepts.hpp"
+#include "alpaka/dev/DevGenericSycl.hpp"
+#include "alpaka/rand/Traits.hpp"
 
-#    include <alpaka/core/BoostPredef.hpp>
-#    include <alpaka/core/Concepts.hpp>
-#    include <alpaka/dev/DevGenericSycl.hpp>
-#    include <alpaka/rand/Traits.hpp>
+#ifdef ALPAKA_ACC_SYCL_ENABLED
 
 // Backend specific imports.
 #    include <CL/sycl.hpp>
@@ -38,14 +38,13 @@ namespace alpaka::rand
 {
     //! The SYCL rand implementation.
     template<typename TDim>
-    class RandGenericSycl : public concepts::Implements<ConceptRand, RandGenericSycl<TDim>>
+    struct RandGenericSycl : concepts::Implements<ConceptRand, RandGenericSycl<TDim>>
     {
-    public:
-        RandGenericSycl(sycl::nd_item<TDim::value> my_item) : m_item{my_item}
+        explicit RandGenericSycl(sycl::nd_item<TDim::value> my_item) : m_item_rand{my_item}
         {
         }
 
-        sycl::nd_item<TDim::value> m_item;
+        sycl::nd_item<TDim::value> m_item_rand;
     };
 
 #    if !defined(ALPAKA_HOST_ONLY)
@@ -53,15 +52,11 @@ namespace alpaka::rand
     {
         //! The SYCL random number floating point normal distribution.
         template<typename T>
-        class NormalReal;
+        struct NormalReal;
 
-        //! The SYCL random number floating point uniform distribution.
+        //! The SYCL random number uniform distribution.
         template<typename T>
-        class UniformReal;
-
-        //! The SYCL random number integer uniform distribution.
-        template<typename T>
-        class UniformUint;
+        struct Uniform;
     } // namespace distribution::sycl_rand
 
     namespace engine::sycl_rand
@@ -77,17 +72,15 @@ namespace alpaka::rand
 
             Minstd(RandGenericSycl<TDim> rand, std::uint32_t const& seed)
             {
-                oneapi::dpl::minstd_rand engine(seed, rand.m_item.get_global_linear_id());
+                oneapi::dpl::minstd_rand engine(seed, rand.m_item_rand.get_global_linear_id());
                 rng_engine = engine;
             }
 
         private:
             template<typename T>
-            friend class distribution::sycl_rand::NormalReal;
-            template<typename T>
-            friend class distribution::sycl_rand::UniformReal;
+            friend struct distribution::sycl_rand::NormalReal;
             template<typename T>
-            friend class distribution::sycl_rand::UniformUint;
+            friend struct distribution::sycl_rand::Uniform;
 
             oneapi::dpl::minstd_rand rng_engine;
 
@@ -112,83 +105,40 @@ namespace alpaka::rand
 
     namespace distribution::sycl_rand
     {
-        //! The SYCL random number float normal distribution.
-        template<>
-        class NormalReal<float>
-        {
-        public:
-            template<typename TEngine>
-            auto operator()(TEngine& engine) -> float
-            {
-                // Create float uniform_real_distribution distribution
-                oneapi::dpl::normal_distribution<float> distr;
-
-                // Generate float random number
-                return distr(engine.rng_engine);
-            }
-        };
 
         //! The SYCL random number double normal distribution.
-        template<>
-        class NormalReal<double>
+        template<typename F>
+        struct NormalReal
         {
-        public:
-            template<typename TEngine>
-            auto operator()(TEngine& engine) -> double
-            {
-                // Create float uniform_real_distribution distribution
-                oneapi::dpl::normal_distribution<double> distr;
-
-                // Generate float random number
-                return distr(engine.rng_engine);
-            }
-        };
+            static_assert(std::is_floating_point_v<F>);
 
-        //! The SYCL random number float uniform distribution.
-        template<>
-        class UniformReal<float>
-        {
-        public:
             template<typename TEngine>
-            auto operator()(TEngine& engine) -> float
+            auto operator()(TEngine& engine) -> F
             {
-                // Create float uniform_real_distribution distribution
-                oneapi::dpl::uniform_real_distribution<float> distr;
-
-                // Generate float random number
+                oneapi::dpl::normal_distribution<F> distr;
                 return distr(engine.rng_engine);
             }
         };
 
         //! The SYCL random number float uniform distribution.
-        template<>
-        class UniformReal<double>
+        template<typename T>
+        struct Uniform
         {
-        public:
-            template<typename TEngine>
-            auto operator()(TEngine& engine) -> double
-            {
-                // Create float uniform_real_distribution distribution
-                oneapi::dpl::uniform_real_distribution<double> distr;
+            static_assert(std::is_floating_point_v<T> || std::is_unsigned_v<T>);
 
-                // Generate float random number
-                return distr(engine.rng_engine);
-            }
-        };
-
-        //! The SYCL random number unsigned integer uniform distribution.
-        template<>
-        class UniformUint<unsigned int>
-        {
-        public:
             template<typename TEngine>
-            auto operator()(TEngine& engine) -> unsigned int
+            auto operator()(TEngine& engine) -> T
             {
-                // Create float uniform_real_distribution distribution
-                oneapi::dpl::uniform_int_distribution<unsigned int> distr;
-
-                // Generate float random number
-                return distr(engine.rng_engine);
+                if constexpr(std::is_floating_point_v<T>)
+                {
+                    oneapi::dpl::uniform_real_distribution<T> distr;
+                    return distr(engine.rng_engine);
+                }
+                else
+                {
+                    oneapi::dpl::uniform_int_distribution<T> distr;
+                    return distr(engine.rng_engine);
+                }
             }
         };
     } // namespace distribution::sycl_rand
@@ -209,7 +159,7 @@ namespace alpaka::rand
         template<typename TDim, typename T>
         struct CreateUniformReal<RandGenericSycl<TDim>, T, std::enable_if_t<std::is_floating_point_v<T>>>
         {
-            static auto createUniformReal(RandGenericSycl<TDim> const& /*rand*/) -> sycl_rand::UniformReal<T>
+            static auto createUniformReal(RandGenericSycl<TDim> const& /*rand*/) -> sycl_rand::Uniform<T>
             {
                 return {};
             }
@@ -219,7 +169,7 @@ namespace alpaka::rand
         template<typename TDim, typename T>
         struct CreateUniformUint<RandGenericSycl<TDim>, T, std::enable_if_t<std::is_integral_v<T>>>
         {
-            static auto createUniformUint(RandGenericSycl<TDim> const& /*rand*/) -> sycl_rand::UniformUint<T>
+            static auto createUniformUint(RandGenericSycl<TDim> const& /*rand*/) -> sycl_rand::Uniform<T>
             {
                 return {};
             }
diff --git a/include/alpaka/warp/WarpGenericSycl.hpp b/include/alpaka/warp/WarpGenericSycl.hpp
index 356ee16e2c63..eb4a21a5a801 100644
--- a/include/alpaka/warp/WarpGenericSycl.hpp
+++ b/include/alpaka/warp/WarpGenericSycl.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2023 Jan Stephan, Luca Ferragina, Andrea Bocci
+/* Copyright 2023 Jan Stephan, Luca Ferragina, Andrea Bocci, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -19,11 +19,11 @@ namespace alpaka::warp
     class WarpGenericSycl : public concepts::Implements<alpaka::warp::ConceptWarp, WarpGenericSycl<TDim>>
     {
     public:
-        WarpGenericSycl(sycl::nd_item<TDim::value> my_item) : m_item{my_item}
+        WarpGenericSycl(sycl::nd_item<TDim::value> my_item) : m_item_warp{my_item}
         {
         }
 
-        sycl::nd_item<TDim::value> m_item;
+        sycl::nd_item<TDim::value> m_item_warp;
     };
 } // namespace alpaka::warp
 
@@ -34,7 +34,7 @@ namespace alpaka::warp::trait
     {
         static auto getSize(warp::WarpGenericSycl<TDim> const& warp) -> std::int32_t
         {
-            auto const sub_group = warp.m_item.get_sub_group();
+            auto const sub_group = warp.m_item_warp.get_sub_group();
             // SYCL sub-groups are always 1D
             return static_cast<std::int32_t>(sub_group.get_max_local_range()[0]);
         }
@@ -43,14 +43,18 @@ namespace alpaka::warp::trait
     template<typename TDim>
     struct Activemask<warp::WarpGenericSycl<TDim>>
     {
-        // FIXME This should be std::uint64_t on AMD GCN architectures.
+        // FIXME This should be std::uint64_t on AMD GCN architectures and on CPU,
+        // but the former is not targeted in alpaka and CPU case is not supported in SYCL yet.
+        // Restrict to warpSize <= 32 for now.
         static auto activemask(warp::WarpGenericSycl<TDim> const& warp) -> std::uint32_t
         {
             // SYCL has no way of querying this. Since sub-group functions have to be executed in convergent code
             // regions anyway we return the full mask.
-            auto const sub_group = warp.m_item.get_sub_group();
+            auto const sub_group = warp.m_item_warp.get_sub_group();
             auto const mask = sycl::ext::oneapi::group_ballot(sub_group, true);
-            // FIXME This should be std::uint64_t on AMD GCN architectures.
+            // FIXME This should be std::uint64_t on AMD GCN architectures and on CPU,
+            // but the former is not targeted in alpaka and CPU case is not supported in SYCL yet.
+            // Restrict to warpSize <= 32 for now.
             std::uint32_t bits = 0;
             mask.extract_bits(bits);
             return bits;
@@ -62,7 +66,7 @@ namespace alpaka::warp::trait
     {
         static auto all(warp::WarpGenericSycl<TDim> const& warp, std::int32_t predicate) -> std::int32_t
         {
-            auto const sub_group = warp.m_item.get_sub_group();
+            auto const sub_group = warp.m_item_warp.get_sub_group();
             return static_cast<std::int32_t>(sycl::all_of_group(sub_group, static_cast<bool>(predicate)));
         }
     };
@@ -72,7 +76,7 @@ namespace alpaka::warp::trait
     {
         static auto any(warp::WarpGenericSycl<TDim> const& warp, std::int32_t predicate) -> std::int32_t
         {
-            auto const sub_group = warp.m_item.get_sub_group();
+            auto const sub_group = warp.m_item_warp.get_sub_group();
             return static_cast<std::int32_t>(sycl::any_of_group(sub_group, static_cast<bool>(predicate)));
         }
     };
@@ -80,12 +84,16 @@ namespace alpaka::warp::trait
     template<typename TDim>
     struct Ballot<warp::WarpGenericSycl<TDim>>
     {
-        // FIXME This should be std::uint64_t on AMD GCN architectures.
+        // FIXME This should be std::uint64_t on AMD GCN architectures and on CPU,
+        // but the former is not targeted in alpaka and CPU case is not supported in SYCL yet.
+        // Restrict to warpSize <= 32 for now.
         static auto ballot(warp::WarpGenericSycl<TDim> const& warp, std::int32_t predicate) -> std::uint32_t
         {
-            auto const sub_group = warp.m_item.get_sub_group();
+            auto const sub_group = warp.m_item_warp.get_sub_group();
             auto const mask = sycl::ext::oneapi::group_ballot(sub_group, static_cast<bool>(predicate));
-            // FIXME This should be std::uint64_t on AMD GCN architectures.
+            // FIXME This should be std::uint64_t on AMD GCN architectures and on CPU,
+            // but the former is not targeted in alpaka and CPU case is not supported in SYCL yet.
+            // Restrict to warpSize <= 32 for now.
             std::uint32_t bits = 0;
             mask.extract_bits(bits);
             return bits;
@@ -108,7 +116,7 @@ namespace alpaka::warp::trait
                Example: If we assume a sub-group size of 32 and a width of 16 we will receive two subdivisions:
                The first starts at sub-group index 0 and the second at sub-group index 16. For srcLane = 4 the
                first subdivision will access the value at sub-group index 4 and the second at sub-group index 20. */
-            auto const actual_group = warp.m_item.get_sub_group();
+            auto const actual_group = warp.m_item_warp.get_sub_group();
             auto const actual_item_id = static_cast<std::int32_t>(actual_group.get_local_linear_id());
             auto const actual_group_id = actual_item_id / width;
             auto const actual_src_id = static_cast<std::size_t>(srcLane + actual_group_id * width);
diff --git a/include/alpaka/workdiv/WorkDivGenericSycl.hpp b/include/alpaka/workdiv/WorkDivGenericSycl.hpp
index c7a2979815c4..725e1ff78824 100644
--- a/include/alpaka/workdiv/WorkDivGenericSycl.hpp
+++ b/include/alpaka/workdiv/WorkDivGenericSycl.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2023 Jan Stephan, Luca Ferragina, Andrea Bocci
+/* Copyright 2023 Jan Stephan, Luca Ferragina, Andrea Bocci, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -25,12 +25,12 @@ namespace alpaka
 
         WorkDivGenericSycl(Vec<TDim, TIdx> const& threadElemExtent, sycl::nd_item<TDim::value> work_item)
             : m_threadElemExtent{threadElemExtent}
-            , my_item{work_item}
+            , m_item_workdiv{work_item}
         {
         }
 
         Vec<TDim, TIdx> const& m_threadElemExtent;
-        sycl::nd_item<TDim::value> my_item;
+        sycl::nd_item<TDim::value> m_item_workdiv;
     };
 } // namespace alpaka
 
@@ -60,19 +60,19 @@ namespace alpaka::trait
             if constexpr(TDim::value == 0)
                 return Vec<TDim, TIdx>{};
             else if constexpr(TDim::value == 1)
-                return Vec<TDim, TIdx>{static_cast<TIdx>(workDiv.my_item.get_group_range(0))};
+                return Vec<TDim, TIdx>{static_cast<TIdx>(workDiv.m_item_workdiv.get_group_range(0))};
             else if constexpr(TDim::value == 2)
             {
                 return Vec<TDim, TIdx>{
-                    static_cast<TIdx>(workDiv.my_item.get_group_range(1)),
-                    static_cast<TIdx>(workDiv.my_item.get_group_range(0))};
+                    static_cast<TIdx>(workDiv.m_item_workdiv.get_group_range(1)),
+                    static_cast<TIdx>(workDiv.m_item_workdiv.get_group_range(0))};
             }
             else
             {
                 return Vec<TDim, TIdx>{
-                    static_cast<TIdx>(workDiv.my_item.get_group_range(2)),
-                    static_cast<TIdx>(workDiv.my_item.get_group_range(1)),
-                    static_cast<TIdx>(workDiv.my_item.get_group_range(0))};
+                    static_cast<TIdx>(workDiv.m_item_workdiv.get_group_range(2)),
+                    static_cast<TIdx>(workDiv.m_item_workdiv.get_group_range(1)),
+                    static_cast<TIdx>(workDiv.m_item_workdiv.get_group_range(0))};
             }
         }
     };
@@ -87,19 +87,19 @@ namespace alpaka::trait
             if constexpr(TDim::value == 0)
                 return Vec<TDim, TIdx>{};
             else if constexpr(TDim::value == 1)
-                return Vec<TDim, TIdx>{static_cast<TIdx>(workDiv.my_item.get_local_range(0))};
+                return Vec<TDim, TIdx>{static_cast<TIdx>(workDiv.m_item_workdiv.get_local_range(0))};
             else if constexpr(TDim::value == 2)
             {
                 return Vec<TDim, TIdx>{
-                    static_cast<TIdx>(workDiv.my_item.get_local_range(1)),
-                    static_cast<TIdx>(workDiv.my_item.get_local_range(0))};
+                    static_cast<TIdx>(workDiv.m_item_workdiv.get_local_range(1)),
+                    static_cast<TIdx>(workDiv.m_item_workdiv.get_local_range(0))};
             }
             else
             {
                 return Vec<TDim, TIdx>{
-                    static_cast<TIdx>(workDiv.my_item.get_local_range(2)),
-                    static_cast<TIdx>(workDiv.my_item.get_local_range(1)),
-                    static_cast<TIdx>(workDiv.my_item.get_local_range(0))};
+                    static_cast<TIdx>(workDiv.m_item_workdiv.get_local_range(2)),
+                    static_cast<TIdx>(workDiv.m_item_workdiv.get_local_range(1)),
+                    static_cast<TIdx>(workDiv.m_item_workdiv.get_local_range(0))};
             }
         }
     };
diff --git a/test/unit/warp/src/Activemask.cpp b/test/unit/warp/src/Activemask.cpp
index 76431d44c557..8038606a4116 100644
--- a/test/unit/warp/src/Activemask.cpp
+++ b/test/unit/warp/src/Activemask.cpp
@@ -1,4 +1,4 @@
-/* Copyright 2023 Sergei Bastrakov, Bernhard Manfred Gruber, Jan Stephan, Andrea Bocci
+/* Copyright 2023 Sergei Bastrakov, Bernhard Manfred Gruber, Jan Stephan, Andrea Bocci, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -54,14 +54,11 @@ struct ActivemaskMultipleThreadWarpTestKernel
     }
 };
 
-namespace alpaka::trait
+template<std::uint32_t TWarpSize, typename TAcc>
+struct alpaka::trait::WarpSize<ActivemaskMultipleThreadWarpTestKernel<TWarpSize>, TAcc>
+    : std::integral_constant<std::uint32_t, TWarpSize>
 {
-    template<std::uint32_t TWarpSize, typename TAcc>
-    struct WarpSize<ActivemaskMultipleThreadWarpTestKernel<TWarpSize>, TAcc>
-    {
-        static constexpr std::uint32_t warp_size = TWarpSize;
-    };
-} // namespace alpaka::trait
+};
 
 TEMPLATE_LIST_TEST_CASE("activemask", "[warp]", alpaka::test::TestAccs)
 {
@@ -90,7 +87,14 @@ TEMPLATE_LIST_TEST_CASE("activemask", "[warp]", alpaka::test::TestAccs)
             auto const threadElementExtent = alpaka::Vec<Dim, Idx>::ones();
             auto workDiv = typename ExecutionFixture::WorkDiv{gridBlockExtent, blockThreadExtent, threadElementExtent};
             auto fixture = ExecutionFixture{workDiv};
-            if(warpExtent == 8)
+            if(warpExtent == 4)
+            {
+                for(auto inactiveThreadIdx = 0u; inactiveThreadIdx < warpExtent; inactiveThreadIdx++)
+                {
+                    CHECK(fixture(ActivemaskMultipleThreadWarpTestKernel<4>{}, inactiveThreadIdx));
+                }
+            }
+            else if(warpExtent == 8)
             {
                 for(auto inactiveThreadIdx = 0u; inactiveThreadIdx < warpExtent; inactiveThreadIdx++)
                 {
@@ -111,6 +115,13 @@ TEMPLATE_LIST_TEST_CASE("activemask", "[warp]", alpaka::test::TestAccs)
                     CHECK(fixture(ActivemaskMultipleThreadWarpTestKernel<32>{}, inactiveThreadIdx));
                 }
             }
+            else if(warpExtent == 64)
+            {
+                for(auto inactiveThreadIdx = 0u; inactiveThreadIdx < warpExtent; inactiveThreadIdx++)
+                {
+                    CHECK(fixture(ActivemaskMultipleThreadWarpTestKernel<64>{}, inactiveThreadIdx));
+                }
+            }
         }
     }
 }
diff --git a/test/unit/warp/src/All.cpp b/test/unit/warp/src/All.cpp
index 27468cd8803c..3a61553496be 100644
--- a/test/unit/warp/src/All.cpp
+++ b/test/unit/warp/src/All.cpp
@@ -1,4 +1,4 @@
-/* Copyright 2023 Sergei Bastrakov, Bernhard Manfred Gruber, Jan Stephan, Andrea Bocci
+/* Copyright 2023 Sergei Bastrakov, Bernhard Manfred Gruber, Jan Stephan, Andrea Bocci, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -58,14 +58,11 @@ struct AllMultipleThreadWarpTestKernel
     }
 };
 
-namespace alpaka::trait
+template<std::uint32_t TWarpSize, typename TAcc>
+struct alpaka::trait::WarpSize<AllMultipleThreadWarpTestKernel<TWarpSize>, TAcc>
+    : std::integral_constant<std::uint32_t, TWarpSize>
 {
-    template<std::uint32_t TWarpSize, typename TAcc>
-    struct WarpSize<AllMultipleThreadWarpTestKernel<TWarpSize>, TAcc>
-    {
-        static constexpr std::uint32_t warp_size = TWarpSize;
-    };
-} // namespace alpaka::trait
+};
 
 TEMPLATE_LIST_TEST_CASE("all", "[warp]", alpaka::test::TestAccs)
 {
@@ -94,7 +91,11 @@ TEMPLATE_LIST_TEST_CASE("all", "[warp]", alpaka::test::TestAccs)
             auto const threadElementExtent = alpaka::Vec<Dim, Idx>::ones();
             auto workDiv = typename ExecutionFixture::WorkDiv{gridBlockExtent, blockThreadExtent, threadElementExtent};
             auto fixture = ExecutionFixture{workDiv};
-            if(warpExtent == 8)
+            if(warpExtent == 4)
+            {
+                REQUIRE(fixture(AllMultipleThreadWarpTestKernel<4>{}));
+            }
+            else if(warpExtent == 8)
             {
                 REQUIRE(fixture(AllMultipleThreadWarpTestKernel<8>{}));
             }
@@ -106,6 +107,10 @@ TEMPLATE_LIST_TEST_CASE("all", "[warp]", alpaka::test::TestAccs)
             {
                 REQUIRE(fixture(AllMultipleThreadWarpTestKernel<32>{}));
             }
+            else if(warpExtent == 64)
+            {
+                REQUIRE(fixture(AllMultipleThreadWarpTestKernel<64>{}));
+            }
         }
     }
 }
diff --git a/test/unit/warp/src/Any.cpp b/test/unit/warp/src/Any.cpp
index a00f303fc0fa..0f5a059d1c48 100644
--- a/test/unit/warp/src/Any.cpp
+++ b/test/unit/warp/src/Any.cpp
@@ -58,14 +58,11 @@ struct AnyMultipleThreadWarpTestKernel
     }
 };
 
-namespace alpaka::trait
+template<std::uint32_t TWarpSize, typename TAcc>
+struct alpaka::trait::WarpSize<AnyMultipleThreadWarpTestKernel<TWarpSize>, TAcc>
+    : std::integral_constant<std::uint32_t, TWarpSize>
 {
-    template<std::uint32_t TWarpSize, typename TAcc>
-    struct WarpSize<AnyMultipleThreadWarpTestKernel<TWarpSize>, TAcc>
-    {
-        static constexpr std::uint32_t warp_size = TWarpSize;
-    };
-} // namespace alpaka::trait
+};
 
 TEMPLATE_LIST_TEST_CASE("any", "[warp]", alpaka::test::TestAccs)
 {
@@ -94,7 +91,11 @@ TEMPLATE_LIST_TEST_CASE("any", "[warp]", alpaka::test::TestAccs)
             auto const threadElementExtent = alpaka::Vec<Dim, Idx>::ones();
             auto workDiv = typename ExecutionFixture::WorkDiv{gridBlockExtent, blockThreadExtent, threadElementExtent};
             auto fixture = ExecutionFixture{workDiv};
-            if(warpExtent == 8)
+            if(warpExtent == 4)
+            {
+                REQUIRE(fixture(AnyMultipleThreadWarpTestKernel<4>{}));
+            }
+            else if(warpExtent == 8)
             {
                 REQUIRE(fixture(AnyMultipleThreadWarpTestKernel<8>{}));
             }
@@ -106,6 +107,10 @@ TEMPLATE_LIST_TEST_CASE("any", "[warp]", alpaka::test::TestAccs)
             {
                 REQUIRE(fixture(AnyMultipleThreadWarpTestKernel<32>{}));
             }
+            else if(warpExtent == 64)
+            {
+                REQUIRE(fixture(AnyMultipleThreadWarpTestKernel<64>{}));
+            }
         }
     }
 }
diff --git a/test/unit/warp/src/Ballot.cpp b/test/unit/warp/src/Ballot.cpp
index edf8c59cd575..0525928c275c 100644
--- a/test/unit/warp/src/Ballot.cpp
+++ b/test/unit/warp/src/Ballot.cpp
@@ -1,4 +1,4 @@
-/* Copyright 2023 Sergei Bastrakov, Bernhard Manfred Gruber, Jan Stephan, Andrea Bocci
+/* Copyright 2023 Sergei Bastrakov, Bernhard Manfred Gruber, Jan Stephan, Andrea Bocci, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -66,14 +66,11 @@ struct BallotMultipleThreadWarpTestKernel
     }
 };
 
-namespace alpaka::trait
+template<std::uint32_t TWarpSize, typename TAcc>
+struct alpaka::trait::WarpSize<BallotMultipleThreadWarpTestKernel<TWarpSize>, TAcc>
+    : std::integral_constant<std::uint32_t, TWarpSize>
 {
-    template<std::uint32_t TWarpSize, typename TAcc>
-    struct WarpSize<BallotMultipleThreadWarpTestKernel<TWarpSize>, TAcc>
-    {
-        static constexpr std::uint32_t warp_size = TWarpSize;
-    };
-} // namespace alpaka::trait
+};
 
 TEMPLATE_LIST_TEST_CASE("ballot", "[warp]", alpaka::test::TestAccs)
 {
@@ -102,7 +99,11 @@ TEMPLATE_LIST_TEST_CASE("ballot", "[warp]", alpaka::test::TestAccs)
             auto const threadElementExtent = alpaka::Vec<Dim, Idx>::ones();
             auto workDiv = typename ExecutionFixture::WorkDiv{gridBlockExtent, blockThreadExtent, threadElementExtent};
             auto fixture = ExecutionFixture{workDiv};
-            if(warpExtent == 8)
+            if(warpExtent == 4)
+            {
+                REQUIRE(fixture(BallotMultipleThreadWarpTestKernel<4>{}));
+            }
+            else if(warpExtent == 8)
             {
                 REQUIRE(fixture(BallotMultipleThreadWarpTestKernel<8>{}));
             }
@@ -114,6 +115,10 @@ TEMPLATE_LIST_TEST_CASE("ballot", "[warp]", alpaka::test::TestAccs)
             {
                 REQUIRE(fixture(BallotMultipleThreadWarpTestKernel<32>{}));
             }
+            else if(warpExtent == 64)
+            {
+                REQUIRE(fixture(BallotMultipleThreadWarpTestKernel<64>{}));
+            }
         }
     }
 }
diff --git a/test/unit/warp/src/GetSize.cpp b/test/unit/warp/src/GetSize.cpp
index 5dbbaa018bba..d8c2b5ce3a84 100644
--- a/test/unit/warp/src/GetSize.cpp
+++ b/test/unit/warp/src/GetSize.cpp
@@ -1,4 +1,4 @@
-/* Copyright 2022 Sergei Bastrakov, Bernhard Manfred Gruber, Jan Stephan
+/* Copyright 2022 Sergei Bastrakov, Bernhard Manfred Gruber, Jan Stephan, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -12,6 +12,7 @@
 
 #include <cstdint>
 
+template<std::uint32_t TWarpSize>
 struct GetSizeTestKernel
 {
     ALPAKA_NO_HOST_ACC_WARNING
@@ -22,6 +23,11 @@ struct GetSizeTestKernel
     }
 };
 
+template<std::uint32_t TWarpSize, typename TAcc>
+struct alpaka::trait::WarpSize<GetSizeTestKernel<TWarpSize>, TAcc> : std::integral_constant<std::uint32_t, TWarpSize>
+{
+};
+
 TEMPLATE_LIST_TEST_CASE("getSize", "[warp]", alpaka::test::TestAccs)
 {
     using Acc = TestType;
@@ -37,6 +43,26 @@ TEMPLATE_LIST_TEST_CASE("getSize", "[warp]", alpaka::test::TestAccs)
         [](std::size_t ws)
         {
             alpaka::test::KernelExecutionFixture<Acc> fixture(alpaka::Vec<Dim, Idx>::all(8));
-            return fixture(GetSizeTestKernel{}, static_cast<std::int32_t>(ws));
+            if(ws == 4)
+            {
+                return fixture(GetSizeTestKernel<4>{}, static_cast<std::int32_t>(ws));
+            }
+            else if(ws == 8)
+            {
+                return fixture(GetSizeTestKernel<8>{}, static_cast<std::int32_t>(ws));
+            }
+            else if(ws == 16)
+            {
+                return fixture(GetSizeTestKernel<16>{}, static_cast<std::int32_t>(ws));
+            }
+            else if(ws == 32)
+            {
+                return fixture(GetSizeTestKernel<32>{}, static_cast<std::int32_t>(ws));
+            }
+            else if(ws == 64)
+            {
+                return fixture(GetSizeTestKernel<64>{}, static_cast<std::int32_t>(ws));
+            }
+            return fixture(GetSizeTestKernel<0>{}, static_cast<std::int32_t>(ws));
         }));
 }
diff --git a/test/unit/warp/src/Shfl.cpp b/test/unit/warp/src/Shfl.cpp
index d9ec5d59ee5d..4a35ba4480a3 100644
--- a/test/unit/warp/src/Shfl.cpp
+++ b/test/unit/warp/src/Shfl.cpp
@@ -1,4 +1,4 @@
-/* Copyright 2023 David M. Rogers, Jan Stephan, Andrea Bocci
+/* Copyright 2023 David M. Rogers, Jan Stephan, Andrea Bocci, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -88,14 +88,11 @@ struct ShflMultipleThreadWarpTestKernel
     }
 };
 
-namespace alpaka::trait
+template<std::uint32_t TWarpSize, typename TAcc>
+struct alpaka::trait::WarpSize<ShflMultipleThreadWarpTestKernel<TWarpSize>, TAcc>
+    : std::integral_constant<std::uint32_t, TWarpSize>
 {
-    template<std::uint32_t TWarpSize, typename TAcc>
-    struct WarpSize<ShflMultipleThreadWarpTestKernel<TWarpSize>, TAcc>
-    {
-        static constexpr std::uint32_t warp_size = TWarpSize;
-    };
-} // namespace alpaka::trait
+};
 
 TEMPLATE_LIST_TEST_CASE("shfl", "[warp]", alpaka::test::TestAccs)
 {
@@ -124,7 +121,11 @@ TEMPLATE_LIST_TEST_CASE("shfl", "[warp]", alpaka::test::TestAccs)
             auto const threadElementExtent = alpaka::Vec<Dim, Idx>::ones();
             auto workDiv = typename ExecutionFixture::WorkDiv{gridBlockExtent, blockThreadExtent, threadElementExtent};
             auto fixture = ExecutionFixture{workDiv};
-            if(warpExtent == 8)
+            if(warpExtent == 4)
+            {
+                REQUIRE(fixture(ShflMultipleThreadWarpTestKernel<4>{}));
+            }
+            else if(warpExtent == 8)
             {
                 REQUIRE(fixture(ShflMultipleThreadWarpTestKernel<8>{}));
             }
@@ -136,6 +137,10 @@ TEMPLATE_LIST_TEST_CASE("shfl", "[warp]", alpaka::test::TestAccs)
             {
                 REQUIRE(fixture(ShflMultipleThreadWarpTestKernel<32>{}));
             }
+            else if(warpExtent == 64)
+            {
+                REQUIRE(fixture(ShflMultipleThreadWarpTestKernel<64>{}));
+            }
         }
     }
 }

From 218ab6e561b5f79305457729687e6a8e849748c1 Mon Sep 17 00:00:00 2001
From: AuroraPerego <aurora.perego@cern.ch>
Date: Mon, 10 Jul 2023 19:01:41 +0200
Subject: [PATCH 09/10] Various fixes related to the SYCL back-end

  - add the missing specialization of CreateViewPlainPtr for SYCL devices
  - improve the comments on the ALPAKA_FN_INLINE macro
  - remove unnecessary ALPAKA_FN_HOST attributes
  - rename QueueGenericSyclBase::m_impl to m_spQueueImpl, to align with the other back-ends
---
 include/alpaka/acc/AccFpgaSyclIntel.hpp       |  4 +--
 include/alpaka/core/Common.hpp                | 11 ++++----
 include/alpaka/event/EventGenericSycl.hpp     | 10 +++----
 include/alpaka/mem/buf/sycl/Copy.hpp          | 28 +++++--------------
 include/alpaka/mem/buf/sycl/Set.hpp           |  6 ++--
 include/alpaka/mem/view/ViewPlainPtr.hpp      | 22 ++++++++++++++-
 .../queue/sycl/QueueGenericSyclBase.hpp       | 18 ++++++------
 7 files changed, 52 insertions(+), 47 deletions(-)

diff --git a/include/alpaka/acc/AccFpgaSyclIntel.hpp b/include/alpaka/acc/AccFpgaSyclIntel.hpp
index 7ddc7b2b65de..2c71a86c2b90 100644
--- a/include/alpaka/acc/AccFpgaSyclIntel.hpp
+++ b/include/alpaka/acc/AccFpgaSyclIntel.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2022 Jan Stephan
+/* Copyright 2023 Jan Stephan, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -45,7 +45,7 @@ namespace alpaka::trait
     template<typename TDim, typename TIdx>
     struct GetAccName<AccFpgaSyclIntel<TDim, TIdx>>
     {
-        ALPAKA_FN_HOST static auto getAccName() -> std::string
+        static auto getAccName() -> std::string
         {
             return "AccFpgaSyclIntel<" + std::to_string(TDim::value) + "," + core::demangled<TIdx> + ">";
         }
diff --git a/include/alpaka/core/Common.hpp b/include/alpaka/core/Common.hpp
index bc7428af08db..6da0b9fbd620 100644
--- a/include/alpaka/core/Common.hpp
+++ b/include/alpaka/core/Common.hpp
@@ -76,13 +76,12 @@
 //! Macro defining the inline function attribute.
 #if BOOST_LANG_CUDA || BOOST_LANG_HIP
 #    define ALPAKA_FN_INLINE __forceinline__
-#else
-#    if BOOST_COMP_MSVC || defined(BOOST_COMP_MSVC_EMULATED)
+#elif BOOST_COMP_MSVC || defined(BOOST_COMP_MSVC_EMULATED)
 // TODO: With C++20 [[msvc::forceinline]] can be used.
-#        define ALPAKA_FN_INLINE __forceinline
-#    else
-#        define ALPAKA_FN_INLINE [[gnu::always_inline]] inline
-#    endif
+#    define ALPAKA_FN_INLINE __forceinline
+#else
+// For gcc, clang, and clang-based compilers like Intel icpx
+#    define ALPAKA_FN_INLINE [[gnu::always_inline]] inline
 #endif
 
 //! This macro defines a variable lying in global accelerator device memory.
diff --git a/include/alpaka/event/EventGenericSycl.hpp b/include/alpaka/event/EventGenericSycl.hpp
index 3e84baec3810..f761902ddfea 100644
--- a/include/alpaka/event/EventGenericSycl.hpp
+++ b/include/alpaka/event/EventGenericSycl.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2022 Jan Stephan, Antonio Di Pilato
+/* Copyright 2023 Jan Stephan, Antonio Di Pilato, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -87,7 +87,7 @@ namespace alpaka::trait
     {
         static auto enqueue(QueueGenericSyclNonBlocking<TDev>& queue, EventGenericSycl<TDev>& event)
         {
-            event.setEvent(queue.m_impl->get_last_event());
+            event.setEvent(queue.m_spQueueImpl->get_last_event());
         }
     };
 
@@ -97,7 +97,7 @@ namespace alpaka::trait
     {
         static auto enqueue(QueueGenericSyclBlocking<TDev>& queue, EventGenericSycl<TDev>& event)
         {
-            event.setEvent(queue.m_impl->get_last_event());
+            event.setEvent(queue.m_spQueueImpl->get_last_event());
         }
     };
 
@@ -120,7 +120,7 @@ namespace alpaka::trait
     {
         static auto waiterWaitFor(QueueGenericSyclNonBlocking<TDev>& queue, EventGenericSycl<TDev> const& event)
         {
-            queue.m_impl->register_dependency(event.getNativeHandle());
+            queue.m_spQueueImpl->register_dependency(event.getNativeHandle());
         }
     };
 
@@ -130,7 +130,7 @@ namespace alpaka::trait
     {
         static auto waiterWaitFor(QueueGenericSyclBlocking<TDev>& queue, EventGenericSycl<TDev> const& event)
         {
-            queue.m_impl->register_dependency(event.getNativeHandle());
+            queue.m_spQueueImpl->register_dependency(event.getNativeHandle());
         }
     };
 
diff --git a/include/alpaka/mem/buf/sycl/Copy.hpp b/include/alpaka/mem/buf/sycl/Copy.hpp
index 34527e8478ed..e1d8ce640c32 100644
--- a/include/alpaka/mem/buf/sycl/Copy.hpp
+++ b/include/alpaka/mem/buf/sycl/Copy.hpp
@@ -59,7 +59,7 @@ namespace alpaka::detail
         }
 
 #    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-        ALPAKA_FN_HOST auto printDebug() const -> void
+        auto printDebug() const -> void
         {
             std::cout << __func__ << " e: " << m_extent << " ewb: " << this->m_extentWidthBytes
                       << " de: " << m_dstExtent << " dptr: " << reinterpret_cast<void*>(m_dstMemNative)
@@ -93,8 +93,7 @@ namespace alpaka::detail
 
         using TaskCopySyclBase<TDim, TViewDst, TViewSrc, TExtent>::TaskCopySyclBase;
 
-        ALPAKA_FN_HOST auto operator()(sycl::queue& queue, std::vector<sycl::event> const& requirements) const
-            -> sycl::event
+        auto operator()(sycl::queue& queue, std::vector<sycl::event> const& requirements) const -> sycl::event
         {
             ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
 
@@ -145,8 +144,7 @@ namespace alpaka::detail
         using TaskCopySyclBase<DimInt<1u>, TViewDst, TViewSrc, TExtent>::TaskCopySyclBase;
         using Elem = alpaka::Elem<TViewSrc>;
 
-        ALPAKA_FN_HOST auto operator()(sycl::queue& queue, std::vector<sycl::event> const& requirements) const
-            -> sycl::event
+        auto operator()(sycl::queue& queue, std::vector<sycl::event> const& requirements) const -> sycl::event
         {
             ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
 
@@ -179,10 +177,7 @@ namespace alpaka::detail
         using Elem = alpaka::Elem<TViewSrc>;
 
         template<typename TViewDstFwd>
-        ALPAKA_FN_HOST TaskCopySycl(
-            TViewDstFwd&& viewDst,
-            TViewSrc const& viewSrc,
-            [[maybe_unused]] TExtent const& extent)
+        TaskCopySycl(TViewDstFwd&& viewDst, TViewSrc const& viewSrc, [[maybe_unused]] TExtent const& extent)
             : m_dstMemNative(reinterpret_cast<void*>(getPtrNative(viewDst)))
             , m_srcMemNative(reinterpret_cast<void const*>(getPtrNative(viewSrc)))
         {
@@ -211,10 +206,7 @@ namespace alpaka::trait
     struct CreateTaskMemcpy<TDim, DevGenericSycl<TPltf>, DevCpu>
     {
         template<typename TExtent, typename TViewSrc, typename TViewDstFwd>
-        ALPAKA_FN_HOST static auto createTaskMemcpy(
-            TViewDstFwd&& viewDst,
-            TViewSrc const& viewSrc,
-            TExtent const& extent)
+        static auto createTaskMemcpy(TViewDstFwd&& viewDst, TViewSrc const& viewSrc, TExtent const& extent)
             -> alpaka::detail::TaskCopySycl<TDim, std::remove_reference_t<TViewDstFwd>, TViewSrc, TExtent>
         {
             ALPAKA_DEBUG_FULL_LOG_SCOPE;
@@ -228,10 +220,7 @@ namespace alpaka::trait
     struct CreateTaskMemcpy<TDim, DevCpu, DevGenericSycl<TPltf>>
     {
         template<typename TExtent, typename TViewSrc, typename TViewDstFwd>
-        ALPAKA_FN_HOST static auto createTaskMemcpy(
-            TViewDstFwd&& viewDst,
-            TViewSrc const& viewSrc,
-            TExtent const& extent)
+        static auto createTaskMemcpy(TViewDstFwd&& viewDst, TViewSrc const& viewSrc, TExtent const& extent)
             -> alpaka::detail::TaskCopySycl<TDim, std::remove_reference_t<TViewDstFwd>, TViewSrc, TExtent>
         {
             ALPAKA_DEBUG_FULL_LOG_SCOPE;
@@ -245,10 +234,7 @@ namespace alpaka::trait
     struct CreateTaskMemcpy<TDim, DevGenericSycl<TPltfDst>, DevGenericSycl<TPltfSrc>>
     {
         template<typename TExtent, typename TViewSrc, typename TViewDstFwd>
-        ALPAKA_FN_HOST static auto createTaskMemcpy(
-            TViewDstFwd&& viewDst,
-            TViewSrc const& viewSrc,
-            TExtent const& extent)
+        static auto createTaskMemcpy(TViewDstFwd&& viewDst, TViewSrc const& viewSrc, TExtent const& extent)
             -> alpaka::detail::TaskCopySycl<TDim, std::remove_reference_t<TViewDstFwd>, TViewSrc, TExtent>
         {
             ALPAKA_DEBUG_FULL_LOG_SCOPE;
diff --git a/include/alpaka/mem/buf/sycl/Set.hpp b/include/alpaka/mem/buf/sycl/Set.hpp
index da52a9206c46..01899cc18e1b 100644
--- a/include/alpaka/mem/buf/sycl/Set.hpp
+++ b/include/alpaka/mem/buf/sycl/Set.hpp
@@ -54,7 +54,7 @@ namespace alpaka
             }
 
 #    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-            ALPAKA_FN_HOST auto printDebug() const -> void
+            auto printDebug() const -> void
             {
                 std::cout << __func__ << " e: " << this->m_extent << " ewb: " << this->m_extentWidthBytes
                           << " de: " << this->m_dstExtent << " dptr: " << reinterpret_cast<void*>(this->m_dstMemNative)
@@ -170,7 +170,7 @@ namespace alpaka
             }
 
 #    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-            ALPAKA_FN_HOST auto printDebug() const -> void
+            auto printDebug() const -> void
             {
                 std::cout << __func__ << " e: " << Scalar() << " ewb: " << sizeof(Elem) << " de: " << Scalar()
                           << " dptr: " << reinterpret_cast<void*>(m_dstMemNative) << " dpitchb: " << Scalar()
@@ -203,7 +203,7 @@ namespace alpaka
         struct CreateTaskMemset<TDim, DevGenericSycl<TPltf>>
         {
             template<typename TExtent, typename TView>
-            ALPAKA_FN_HOST static auto createTaskMemset(TView& view, std::uint8_t const& byte, TExtent const& extent)
+            static auto createTaskMemset(TView& view, std::uint8_t const& byte, TExtent const& extent)
                 -> detail::TaskSetSycl<TDim, TView, TExtent>
             {
                 return detail::TaskSetSycl<TDim, TView, TExtent>(view, byte, extent);
diff --git a/include/alpaka/mem/view/ViewPlainPtr.hpp b/include/alpaka/mem/view/ViewPlainPtr.hpp
index 7416385ea068..56fadf03feb9 100644
--- a/include/alpaka/mem/view/ViewPlainPtr.hpp
+++ b/include/alpaka/mem/view/ViewPlainPtr.hpp
@@ -1,5 +1,5 @@
 /* Copyright 2023 Benjamin Worpitz, Matthias Werner, René Widera, Sergei Bastrakov, Bernhard Manfred Gruber,
- *                Jan Stephan, Andrea Bocci
+ *                Jan Stephan, Andrea Bocci, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -229,6 +229,26 @@ namespace alpaka
         };
 #endif
 
+#if defined(ALPAKA_ACC_SYCL_ENABLED)
+        //! The SYCL device CreateViewPlainPtr trait specialization.
+        template<typename TPltf>
+        struct CreateViewPlainPtr<DevGenericSycl<TPltf>>
+        {
+            template<typename TElem, typename TExtent, typename TPitch>
+            static auto createViewPlainPtr(
+                DevGenericSycl<TPltf> const& dev,
+                TElem* pMem,
+                TExtent const& extent,
+                TPitch const& pitch)
+            {
+                return alpaka::ViewPlainPtr<DevGenericSycl<TPltf>, TElem, alpaka::Dim<TExtent>, alpaka::Idx<TExtent>>(
+                    pMem,
+                    dev,
+                    extent,
+                    pitch);
+            }
+        };
+#endif
         //! The ViewPlainPtr offset get trait specialization.
         template<typename TIdxIntegralConst, typename TDev, typename TElem, typename TDim, typename TIdx>
         struct GetOffset<TIdxIntegralConst, ViewPlainPtr<TDev, TElem, TDim, TIdx>>
diff --git a/include/alpaka/queue/sycl/QueueGenericSyclBase.hpp b/include/alpaka/queue/sycl/QueueGenericSyclBase.hpp
index c529ecfaa2e4..0f847079de95 100644
--- a/include/alpaka/queue/sycl/QueueGenericSyclBase.hpp
+++ b/include/alpaka/queue/sycl/QueueGenericSyclBase.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2023 Jan Stephan, Antonio Di Pilato, Luca Ferragina, Andrea Bocci
+/* Copyright 2023 Jan Stephan, Antonio Di Pilato, Luca Ferragina, Andrea Bocci, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -172,16 +172,16 @@ namespace alpaka::detail
     public:
         QueueGenericSyclBase(TDev const& dev)
             : m_dev{dev}
-            , m_impl{std::make_shared<detail::QueueGenericSyclImpl>(
+            , m_spQueueImpl{std::make_shared<detail::QueueGenericSyclImpl>(
                   dev.getNativeHandle().second,
                   dev.getNativeHandle().first)}
         {
-            m_dev.m_impl->register_queue(m_impl);
+            m_dev.m_impl->register_queue(m_spQueueImpl);
         }
 
         friend auto operator==(QueueGenericSyclBase const& lhs, QueueGenericSyclBase const& rhs) -> bool
         {
-            return (lhs.m_dev == rhs.m_dev) && (lhs.m_impl == rhs.m_impl);
+            return (lhs.m_dev == rhs.m_dev) && (lhs.m_spQueueImpl == rhs.m_spQueueImpl);
         }
 
         friend auto operator!=(QueueGenericSyclBase const& lhs, QueueGenericSyclBase const& rhs) -> bool
@@ -191,11 +191,11 @@ namespace alpaka::detail
 
         [[nodiscard]] auto getNativeHandle() const noexcept
         {
-            return m_impl->getNativeHandle();
+            return m_spQueueImpl->getNativeHandle();
         }
 
         TDev m_dev;
-        std::shared_ptr<detail::QueueGenericSyclImpl> m_impl;
+        std::shared_ptr<detail::QueueGenericSyclImpl> m_spQueueImpl;
     };
 } // namespace alpaka::detail
 
@@ -239,7 +239,7 @@ namespace alpaka::trait
         static auto enqueue(detail::QueueGenericSyclBase<TDev, TBlocking>& queue, TTask const& task) -> void
         {
             ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-            queue.m_impl->template enqueue<TBlocking>(task);
+            queue.m_spQueueImpl->template enqueue<TBlocking>(task);
         }
     };
 
@@ -250,7 +250,7 @@ namespace alpaka::trait
         static auto empty(detail::QueueGenericSyclBase<TDev, TBlocking> const& queue) -> bool
         {
             ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-            return queue.m_impl->empty();
+            return queue.m_spQueueImpl->empty();
         }
     };
 
@@ -264,7 +264,7 @@ namespace alpaka::trait
         static auto currentThreadWaitFor(detail::QueueGenericSyclBase<TDev, TBlocking> const& queue) -> void
         {
             ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-            queue.m_impl->wait();
+            queue.m_spQueueImpl->wait();
         }
     };
 

From c66b7a54c162e848c3c15069a5b008f9480e01d6 Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Wed, 12 Jul 2023 18:53:57 +0200
Subject: [PATCH 10/10] Update the main SYCL include file name

---
 include/alpaka/acc/AccCpuSycl.hpp                               | 2 +-
 include/alpaka/acc/AccFpgaSyclIntel.hpp                         | 2 +-
 include/alpaka/acc/AccGenericSycl.hpp                           | 2 +-
 include/alpaka/atomic/AtomicGenericSycl.hpp                     | 2 +-
 .../alpaka/block/shared/dyn/BlockSharedMemDynGenericSycl.hpp    | 2 +-
 include/alpaka/block/shared/st/BlockSharedMemStGenericSycl.hpp  | 2 +-
 include/alpaka/block/sync/BlockSyncGenericSycl.hpp              | 2 +-
 include/alpaka/core/Sycl.hpp                                    | 2 +-
 include/alpaka/dev/DevGenericSycl.hpp                           | 2 +-
 include/alpaka/event/EventGenericSycl.hpp                       | 2 +-
 include/alpaka/idx/bt/IdxBtGenericSycl.hpp                      | 2 +-
 include/alpaka/idx/gb/IdxGbGenericSycl.hpp                      | 2 +-
 include/alpaka/intrinsic/IntrinsicGenericSycl.hpp               | 2 +-
 include/alpaka/kernel/TaskKernelGenericSycl.hpp                 | 2 +-
 include/alpaka/math/MathGenericSycl.hpp                         | 2 +-
 include/alpaka/mem/buf/BufGenericSycl.hpp                       | 2 +-
 include/alpaka/mem/buf/sycl/Accessor.hpp                        | 2 +-
 include/alpaka/mem/buf/sycl/Common.hpp                          | 2 +-
 include/alpaka/mem/buf/sycl/Copy.hpp                            | 2 +-
 include/alpaka/mem/fence/MemFenceGenericSycl.hpp                | 2 +-
 include/alpaka/pltf/PltfCpuSycl.hpp                             | 2 +-
 include/alpaka/pltf/PltfFpgaSyclIntel.hpp                       | 2 +-
 include/alpaka/pltf/PltfGenericSycl.hpp                         | 2 +-
 include/alpaka/pltf/PltfGpuSyclIntel.hpp                        | 2 +-
 include/alpaka/queue/sycl/QueueGenericSyclBase.hpp              | 2 +-
 include/alpaka/rand/RandGenericSycl.hpp                         | 2 +-
 include/alpaka/warp/WarpGenericSycl.hpp                         | 2 +-
 include/alpaka/workdiv/WorkDivGenericSycl.hpp                   | 2 +-
 28 files changed, 28 insertions(+), 28 deletions(-)

diff --git a/include/alpaka/acc/AccCpuSycl.hpp b/include/alpaka/acc/AccCpuSycl.hpp
index 5cedbea64d8f..e9e5504bae67 100644
--- a/include/alpaka/acc/AccCpuSycl.hpp
+++ b/include/alpaka/acc/AccCpuSycl.hpp
@@ -23,7 +23,7 @@
 
 #if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_BACKEND_ONEAPI) && defined(ALPAKA_SYCL_ONEAPI_CPU)
 
-#    include <CL/sycl.hpp>
+#    include <sycl/sycl.hpp>
 
 namespace alpaka
 {
diff --git a/include/alpaka/acc/AccFpgaSyclIntel.hpp b/include/alpaka/acc/AccFpgaSyclIntel.hpp
index 2c71a86c2b90..7e84ce6d5e39 100644
--- a/include/alpaka/acc/AccFpgaSyclIntel.hpp
+++ b/include/alpaka/acc/AccFpgaSyclIntel.hpp
@@ -22,7 +22,7 @@
 
 #if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_BACKEND_ONEAPI) && defined(ALPAKA_SYCL_ONEAPI_FPGA)
 
-#    include <CL/sycl.hpp>
+#    include <sycl/sycl.hpp>
 
 namespace alpaka
 {
diff --git a/include/alpaka/acc/AccGenericSycl.hpp b/include/alpaka/acc/AccGenericSycl.hpp
index ea9560e23b38..34400442c31c 100644
--- a/include/alpaka/acc/AccGenericSycl.hpp
+++ b/include/alpaka/acc/AccGenericSycl.hpp
@@ -38,7 +38,7 @@
 
 #ifdef ALPAKA_ACC_SYCL_ENABLED
 
-#    include <CL/sycl.hpp>
+#    include <sycl/sycl.hpp>
 
 namespace alpaka
 {
diff --git a/include/alpaka/atomic/AtomicGenericSycl.hpp b/include/alpaka/atomic/AtomicGenericSycl.hpp
index a869248e53b4..8ebf608dc570 100644
--- a/include/alpaka/atomic/AtomicGenericSycl.hpp
+++ b/include/alpaka/atomic/AtomicGenericSycl.hpp
@@ -14,7 +14,7 @@
 
 #ifdef ALPAKA_ACC_SYCL_ENABLED
 
-#    include <CL/sycl.hpp>
+#    include <sycl/sycl.hpp>
 
 namespace alpaka
 {
diff --git a/include/alpaka/block/shared/dyn/BlockSharedMemDynGenericSycl.hpp b/include/alpaka/block/shared/dyn/BlockSharedMemDynGenericSycl.hpp
index 1e4671ebbf72..4e2af194ddcd 100644
--- a/include/alpaka/block/shared/dyn/BlockSharedMemDynGenericSycl.hpp
+++ b/include/alpaka/block/shared/dyn/BlockSharedMemDynGenericSycl.hpp
@@ -9,7 +9,7 @@
 #include <cstddef>
 
 #ifdef ALPAKA_ACC_SYCL_ENABLED
-#    include <CL/sycl.hpp>
+#    include <sycl/sycl.hpp>
 
 namespace alpaka
 {
diff --git a/include/alpaka/block/shared/st/BlockSharedMemStGenericSycl.hpp b/include/alpaka/block/shared/st/BlockSharedMemStGenericSycl.hpp
index abda79f7b049..f92df9c051f0 100644
--- a/include/alpaka/block/shared/st/BlockSharedMemStGenericSycl.hpp
+++ b/include/alpaka/block/shared/st/BlockSharedMemStGenericSycl.hpp
@@ -12,7 +12,7 @@
 
 #ifdef ALPAKA_ACC_SYCL_ENABLED
 
-#    include <CL/sycl.hpp>
+#    include <sycl/sycl.hpp>
 
 namespace alpaka
 {
diff --git a/include/alpaka/block/sync/BlockSyncGenericSycl.hpp b/include/alpaka/block/sync/BlockSyncGenericSycl.hpp
index 87d478eb9c19..67e97493fee4 100644
--- a/include/alpaka/block/sync/BlockSyncGenericSycl.hpp
+++ b/include/alpaka/block/sync/BlockSyncGenericSycl.hpp
@@ -8,7 +8,7 @@
 
 #ifdef ALPAKA_ACC_SYCL_ENABLED
 
-#    include <CL/sycl.hpp>
+#    include <sycl/sycl.hpp>
 
 namespace alpaka
 {
diff --git a/include/alpaka/core/Sycl.hpp b/include/alpaka/core/Sycl.hpp
index 8ba75ef6ecda..dbc8c1e0f748 100644
--- a/include/alpaka/core/Sycl.hpp
+++ b/include/alpaka/core/Sycl.hpp
@@ -22,7 +22,7 @@
 
 #ifdef ALPAKA_ACC_SYCL_ENABLED
 
-#    include <CL/sycl.hpp>
+#    include <sycl/sycl.hpp>
 
 // if SYCL is enabled with the AMD backend the printf will be killed because of missing compiler support
 #    ifdef __AMDGCN__
diff --git a/include/alpaka/dev/DevGenericSycl.hpp b/include/alpaka/dev/DevGenericSycl.hpp
index 0a6875a77ba6..85c218365f3d 100644
--- a/include/alpaka/dev/DevGenericSycl.hpp
+++ b/include/alpaka/dev/DevGenericSycl.hpp
@@ -27,7 +27,7 @@
 
 #ifdef ALPAKA_ACC_SYCL_ENABLED
 
-#    include <CL/sycl.hpp>
+#    include <sycl/sycl.hpp>
 
 namespace alpaka
 {
diff --git a/include/alpaka/event/EventGenericSycl.hpp b/include/alpaka/event/EventGenericSycl.hpp
index f761902ddfea..68011a0247cd 100644
--- a/include/alpaka/event/EventGenericSycl.hpp
+++ b/include/alpaka/event/EventGenericSycl.hpp
@@ -17,7 +17,7 @@
 
 #ifdef ALPAKA_ACC_SYCL_ENABLED
 
-#    include <CL/sycl.hpp>
+#    include <sycl/sycl.hpp>
 
 namespace alpaka
 {
diff --git a/include/alpaka/idx/bt/IdxBtGenericSycl.hpp b/include/alpaka/idx/bt/IdxBtGenericSycl.hpp
index 9d68deb826a6..54ef78014f1f 100644
--- a/include/alpaka/idx/bt/IdxBtGenericSycl.hpp
+++ b/include/alpaka/idx/bt/IdxBtGenericSycl.hpp
@@ -12,7 +12,7 @@
 
 #ifdef ALPAKA_ACC_SYCL_ENABLED
 
-#    include <CL/sycl.hpp>
+#    include <sycl/sycl.hpp>
 
 namespace alpaka::bt
 {
diff --git a/include/alpaka/idx/gb/IdxGbGenericSycl.hpp b/include/alpaka/idx/gb/IdxGbGenericSycl.hpp
index e7500c0f6144..42547effd6c3 100644
--- a/include/alpaka/idx/gb/IdxGbGenericSycl.hpp
+++ b/include/alpaka/idx/gb/IdxGbGenericSycl.hpp
@@ -12,7 +12,7 @@
 
 #ifdef ALPAKA_ACC_SYCL_ENABLED
 
-#    include <CL/sycl.hpp>
+#    include <sycl/sycl.hpp>
 
 namespace alpaka::gb
 {
diff --git a/include/alpaka/intrinsic/IntrinsicGenericSycl.hpp b/include/alpaka/intrinsic/IntrinsicGenericSycl.hpp
index 700442fb4520..395043a9cd95 100644
--- a/include/alpaka/intrinsic/IntrinsicGenericSycl.hpp
+++ b/include/alpaka/intrinsic/IntrinsicGenericSycl.hpp
@@ -11,7 +11,7 @@
 
 #ifdef ALPAKA_ACC_SYCL_ENABLED
 
-#    include <CL/sycl.hpp>
+#    include <sycl/sycl.hpp>
 
 namespace alpaka
 {
diff --git a/include/alpaka/kernel/TaskKernelGenericSycl.hpp b/include/alpaka/kernel/TaskKernelGenericSycl.hpp
index 289a6f72c819..92eab6fa5221 100644
--- a/include/alpaka/kernel/TaskKernelGenericSycl.hpp
+++ b/include/alpaka/kernel/TaskKernelGenericSycl.hpp
@@ -36,7 +36,7 @@
 #        pragma clang diagnostic ignored "-Wunused-parameter"
 #    endif
 
-#    include <CL/sycl.hpp>
+#    include <sycl/sycl.hpp>
 
 #    define LAUNCH_SYCL_KERNEL_IF_SUBGROUP_SIZE_IS(sub_group_size)                                                    \
         cgh.parallel_for(                                                                                             \
diff --git a/include/alpaka/math/MathGenericSycl.hpp b/include/alpaka/math/MathGenericSycl.hpp
index ba99befbf66a..53898f210680 100644
--- a/include/alpaka/math/MathGenericSycl.hpp
+++ b/include/alpaka/math/MathGenericSycl.hpp
@@ -12,7 +12,7 @@
 
 #ifdef ALPAKA_ACC_SYCL_ENABLED
 
-#    include <CL/sycl.hpp>
+#    include <sycl/sycl.hpp>
 
 //! The mathematical operation specifics.
 namespace alpaka::math
diff --git a/include/alpaka/mem/buf/BufGenericSycl.hpp b/include/alpaka/mem/buf/BufGenericSycl.hpp
index 38241c83773b..c8849cedf7dd 100644
--- a/include/alpaka/mem/buf/BufGenericSycl.hpp
+++ b/include/alpaka/mem/buf/BufGenericSycl.hpp
@@ -20,7 +20,7 @@
 
 #ifdef ALPAKA_ACC_SYCL_ENABLED
 
-#    include <CL/sycl.hpp>
+#    include <sycl/sycl.hpp>
 
 namespace alpaka
 {
diff --git a/include/alpaka/mem/buf/sycl/Accessor.hpp b/include/alpaka/mem/buf/sycl/Accessor.hpp
index 6d2dec54cb7c..a83cd3d7d8e9 100644
--- a/include/alpaka/mem/buf/sycl/Accessor.hpp
+++ b/include/alpaka/mem/buf/sycl/Accessor.hpp
@@ -13,7 +13,7 @@
 
 #ifdef ALPAKA_ACC_SYCL_ENABLED
 
-#    include <CL/sycl.hpp>
+#    include <sycl/sycl.hpp>
 
 namespace alpaka
 {
diff --git a/include/alpaka/mem/buf/sycl/Common.hpp b/include/alpaka/mem/buf/sycl/Common.hpp
index 80e9763ebf75..87058bc6ca92 100644
--- a/include/alpaka/mem/buf/sycl/Common.hpp
+++ b/include/alpaka/mem/buf/sycl/Common.hpp
@@ -12,7 +12,7 @@
 
 #ifdef ALPAKA_ACC_SYCL_ENABLED
 
-#    include <CL/sycl.hpp>
+#    include <sycl/sycl.hpp>
 
 namespace alpaka::detail
 {
diff --git a/include/alpaka/mem/buf/sycl/Copy.hpp b/include/alpaka/mem/buf/sycl/Copy.hpp
index e1d8ce640c32..8e8cf533f747 100644
--- a/include/alpaka/mem/buf/sycl/Copy.hpp
+++ b/include/alpaka/mem/buf/sycl/Copy.hpp
@@ -22,7 +22,7 @@
 
 #ifdef ALPAKA_ACC_SYCL_ENABLED
 
-#    include <CL/sycl.hpp>
+#    include <sycl/sycl.hpp>
 
 namespace alpaka::detail
 {
diff --git a/include/alpaka/mem/fence/MemFenceGenericSycl.hpp b/include/alpaka/mem/fence/MemFenceGenericSycl.hpp
index c0a308448f8e..0b7559f3d61b 100644
--- a/include/alpaka/mem/fence/MemFenceGenericSycl.hpp
+++ b/include/alpaka/mem/fence/MemFenceGenericSycl.hpp
@@ -8,7 +8,7 @@
 
 #ifdef ALPAKA_ACC_SYCL_ENABLED
 
-#    include <CL/sycl.hpp>
+#    include <sycl/sycl.hpp>
 
 namespace alpaka
 {
diff --git a/include/alpaka/pltf/PltfCpuSycl.hpp b/include/alpaka/pltf/PltfCpuSycl.hpp
index 40f6855eab1a..c30793dd98a9 100644
--- a/include/alpaka/pltf/PltfCpuSycl.hpp
+++ b/include/alpaka/pltf/PltfCpuSycl.hpp
@@ -12,7 +12,7 @@
 
 #if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_BACKEND_ONEAPI) && defined(ALPAKA_SYCL_ONEAPI_CPU)
 
-#    include <CL/sycl.hpp>
+#    include <sycl/sycl.hpp>
 
 namespace alpaka
 {
diff --git a/include/alpaka/pltf/PltfFpgaSyclIntel.hpp b/include/alpaka/pltf/PltfFpgaSyclIntel.hpp
index 1d908c832f1b..9b5e1b100373 100644
--- a/include/alpaka/pltf/PltfFpgaSyclIntel.hpp
+++ b/include/alpaka/pltf/PltfFpgaSyclIntel.hpp
@@ -10,7 +10,7 @@
 
 #if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_BACKEND_ONEAPI) && defined(ALPAKA_SYCL_ONEAPI_FPGA)
 
-#    include <CL/sycl.hpp>
+#    include <sycl/sycl.hpp>
 
 #    include <string>
 
diff --git a/include/alpaka/pltf/PltfGenericSycl.hpp b/include/alpaka/pltf/PltfGenericSycl.hpp
index d2a33b959b70..8c67f5f50c31 100644
--- a/include/alpaka/pltf/PltfGenericSycl.hpp
+++ b/include/alpaka/pltf/PltfGenericSycl.hpp
@@ -20,7 +20,7 @@
 
 #ifdef ALPAKA_ACC_SYCL_ENABLED
 
-#    include <CL/sycl.hpp>
+#    include <sycl/sycl.hpp>
 
 namespace alpaka
 {
diff --git a/include/alpaka/pltf/PltfGpuSyclIntel.hpp b/include/alpaka/pltf/PltfGpuSyclIntel.hpp
index 2e5d7583aaab..9c0e6db1eb97 100644
--- a/include/alpaka/pltf/PltfGpuSyclIntel.hpp
+++ b/include/alpaka/pltf/PltfGpuSyclIntel.hpp
@@ -12,7 +12,7 @@
 
 #if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_BACKEND_ONEAPI) && defined(ALPAKA_SYCL_ONEAPI_GPU)
 
-#    include <CL/sycl.hpp>
+#    include <sycl/sycl.hpp>
 
 namespace alpaka
 {
diff --git a/include/alpaka/queue/sycl/QueueGenericSyclBase.hpp b/include/alpaka/queue/sycl/QueueGenericSyclBase.hpp
index 0f847079de95..9b08f3de55ca 100644
--- a/include/alpaka/queue/sycl/QueueGenericSyclBase.hpp
+++ b/include/alpaka/queue/sycl/QueueGenericSyclBase.hpp
@@ -21,7 +21,7 @@
 
 #ifdef ALPAKA_ACC_SYCL_ENABLED
 
-#    include <CL/sycl.hpp>
+#    include <sycl/sycl.hpp>
 
 namespace alpaka::detail
 {
diff --git a/include/alpaka/rand/RandGenericSycl.hpp b/include/alpaka/rand/RandGenericSycl.hpp
index befaa5d7005a..6a85a3a79a5a 100644
--- a/include/alpaka/rand/RandGenericSycl.hpp
+++ b/include/alpaka/rand/RandGenericSycl.hpp
@@ -12,7 +12,7 @@
 #ifdef ALPAKA_ACC_SYCL_ENABLED
 
 // Backend specific imports.
-#    include <CL/sycl.hpp>
+#    include <sycl/sycl.hpp>
 #    if BOOST_COMP_CLANG
 #        pragma clang diagnostic push
 #        pragma clang diagnostic ignored "-Wcast-align"
diff --git a/include/alpaka/warp/WarpGenericSycl.hpp b/include/alpaka/warp/WarpGenericSycl.hpp
index eb4a21a5a801..b2b6aa8328b5 100644
--- a/include/alpaka/warp/WarpGenericSycl.hpp
+++ b/include/alpaka/warp/WarpGenericSycl.hpp
@@ -10,7 +10,7 @@
 
 #ifdef ALPAKA_ACC_SYCL_ENABLED
 
-#    include <CL/sycl.hpp>
+#    include <sycl/sycl.hpp>
 
 namespace alpaka::warp
 {
diff --git a/include/alpaka/workdiv/WorkDivGenericSycl.hpp b/include/alpaka/workdiv/WorkDivGenericSycl.hpp
index 725e1ff78824..26e00750e42d 100644
--- a/include/alpaka/workdiv/WorkDivGenericSycl.hpp
+++ b/include/alpaka/workdiv/WorkDivGenericSycl.hpp
@@ -10,7 +10,7 @@
 
 #ifdef ALPAKA_ACC_SYCL_ENABLED
 
-#    include <CL/sycl.hpp>
+#    include <sycl/sycl.hpp>
 
 namespace alpaka
 {