From be6421ea6752d8d8c017bf3070750e343ba0653d Mon Sep 17 00:00:00 2001 From: Luca Ferragina Date: Mon, 28 Nov 2022 12:22:22 +0100 Subject: [PATCH 01/10] Rewrite the SYCL backend for the SYCL 2020 standard and USM allocations (part 1) Initial work to support the SYCL 2020 standard, using USM allocations instead of SYCL buffers and accessors: - bring the SYCL interface in line with the other backends, and remove the last uses of the alpaka::experimental namespace; - reimplement the alpaka memory buffers, memset and memcpy tasks for the USM SYCL backend; - make the SYCL native handles more consistent with the other backends; - use the oneAPI printf extension, and implement a workaround for the OpenCL limitation on variadic functions and the conflict with AMD HIP/ROCm device code; - add more debug print messages; - various fixes for kernel names, memory_scope Grid and atomics; - update copyright information. Initial work on the SYCL random number generators (not fully working yet). --- example/vectorAdd/src/vectorAdd.cpp | 10 +- include/alpaka/acc/AccCpuSyclIntel.hpp | 2 +- include/alpaka/acc/AccGenericSycl.hpp | 6 +- include/alpaka/alpaka.hpp | 4 +- include/alpaka/atomic/AtomicGenericSycl.hpp | 2 +- include/alpaka/core/Sycl.hpp | 5 +- include/alpaka/dev/DevGenericSycl.hpp | 4 +- .../alpaka/kernel/TaskKernelCpuSyclIntel.hpp | 2 +- .../alpaka/kernel/TaskKernelGenericSycl.hpp | 11 +- include/alpaka/math/MathGenericSycl.hpp | 2 +- include/alpaka/mem/buf/BufCpuSyclIntel.hpp | 4 +- include/alpaka/mem/buf/BufGenericSycl.hpp | 182 ++++++++------ include/alpaka/mem/buf/BufGpuSyclIntel.hpp | 4 +- include/alpaka/mem/buf/sycl/Copy.hpp | 226 +++++++++++------ include/alpaka/mem/buf/sycl/Set.hpp | 174 ++++++++++--- .../alpaka/mem/fence/MemFenceGenericSycl.hpp | 7 + include/alpaka/meta/SyclPrintf.hpp | 35 +++ include/alpaka/pltf/PltfCpuSyclIntel.hpp | 2 +- include/alpaka/pltf/PltfFpgaSyclIntel.hpp | 2 +- include/alpaka/pltf/PltfGenericSycl.hpp | 2 +- include/alpaka/pltf/PltfGpuSyclIntel.hpp | 2 +- .../queue/QueueCpuSyclIntelBlocking.hpp | 2 +- .../queue/QueueCpuSyclIntelNonBlocking.hpp | 2 +- .../queue/sycl/QueueGenericSyclBase.hpp | 3 +- include/alpaka/rand/RandGenericSycl.hpp | 238 ++++++++++++++++++ include/alpaka/test/Check.hpp | 13 +- include/alpaka/warp/WarpGenericSycl.hpp | 2 +- include/alpaka/workdiv/WorkDivGenericSycl.hpp | 2 +- 28 files changed, 735 insertions(+), 215 deletions(-) create mode 100644 include/alpaka/meta/SyclPrintf.hpp create mode 100644 include/alpaka/rand/RandGenericSycl.hpp diff --git a/example/vectorAdd/src/vectorAdd.cpp b/example/vectorAdd/src/vectorAdd.cpp index e057e359e43d..676d0041018b 100644 --- a/example/vectorAdd/src/vectorAdd.cpp +++ b/example/vectorAdd/src/vectorAdd.cpp @@ -1,4 +1,4 @@ -/* Copyright 2023 Benjamin Worpitz, Matthias Werner, Bernhard Manfred Gruber, Jan Stephan +/* Copyright 2023 Benjamin Worpitz, Matthias Werner, Bernhard Manfred Gruber, Jan Stephan, Luca Ferragina * SPDX-License-Identifier: ISC */ @@ -75,7 +75,11 @@ auto main() -> int // - AccCpuTbbBlocks // - AccCpuSerial // using Acc = alpaka::AccCpuSerial; - using Acc = alpaka::ExampleDefaultAcc; + // using Acc = alpaka::ExampleDefaultAcc; + + using Acc = alpaka::AccGpuSyclIntel; + using Pltf = alpaka::Pltf; + using DevAcc = alpaka::Dev; std::cout << "Using alpaka accelerator: " << alpaka::getAccName() << std::endl; // Defines the synchronization behavior of a queue @@ -136,7 +140,7 @@ auto main() -> int } // Allocate 3 buffers on the accelerator - using BufAcc = alpaka::Buf; + using BufAcc = alpaka::Buf; BufAcc bufAccA(alpaka::allocBuf(devAcc, extent)); BufAcc bufAccB(alpaka::allocBuf(devAcc, extent)); BufAcc bufAccC(alpaka::allocBuf(devAcc, extent)); diff --git a/include/alpaka/acc/AccCpuSyclIntel.hpp b/include/alpaka/acc/AccCpuSyclIntel.hpp index f1f53fad45b6..751d16c6b8b0 100644 --- a/include/alpaka/acc/AccCpuSyclIntel.hpp +++ b/include/alpaka/acc/AccCpuSyclIntel.hpp @@ -1,4 +1,4 @@ -/* Copyright 2022 Jan Stephan +/* Copyright 2022 Jan Stephan, Luca Ferragina * SPDX-License-Identifier: MPL-2.0 */ diff --git a/include/alpaka/acc/AccGenericSycl.hpp b/include/alpaka/acc/AccGenericSycl.hpp index 211d89a7740f..8a7d0972ddd7 100644 --- a/include/alpaka/acc/AccGenericSycl.hpp +++ b/include/alpaka/acc/AccGenericSycl.hpp @@ -1,4 +1,4 @@ -/* Copyright 2023 Jan Stephan, Antonio Di Pilato, Andrea Bocci +/* Copyright 2023 Jan Stephan, Antonio Di Pilato, Andrea Bocci, Luca Ferragina * SPDX-License-Identifier: MPL-2.0 */ @@ -15,6 +15,8 @@ #include "alpaka/intrinsic/IntrinsicGenericSycl.hpp" #include "alpaka/math/MathGenericSycl.hpp" #include "alpaka/mem/fence/MemFenceGenericSycl.hpp" +#include "alpaka/meta/SyclPrintf.hpp" +#include "alpaka/rand/RandGenericSycl.hpp" #include "alpaka/warp/WarpGenericSycl.hpp" #include "alpaka/workdiv/WorkDivGenericSycl.hpp" @@ -56,6 +58,7 @@ namespace alpaka , public BlockSyncGenericSycl , public IntrinsicGenericSycl , public MemFenceGenericSycl + , public rand::RandGenericSycl , public warp::WarpGenericSycl { public: @@ -83,6 +86,7 @@ namespace alpaka , BlockSyncGenericSycl{work_item} , IntrinsicGenericSycl{} , MemFenceGenericSycl{global_fence_dummy, local_fence_dummy} + , rand::RandGenericSycl{} , warp::WarpGenericSycl{work_item} , cout{output_stream} { diff --git a/include/alpaka/alpaka.hpp b/include/alpaka/alpaka.hpp index 43829f951fde..c96d396ac4c4 100644 --- a/include/alpaka/alpaka.hpp +++ b/include/alpaka/alpaka.hpp @@ -1,5 +1,5 @@ /* Copyright 2023 Axel Hübl, Benjamin Worpitz, Erik Zenker, Matthias Werner, René Widera, Bernhard Manfred Gruber, - * Jan Stephan, Antonio Di Pilato + * Jan Stephan, Antonio Di Pilato, Luca Ferragina * SPDX-License-Identifier: MPL-2.0 */ @@ -172,6 +172,7 @@ #include "alpaka/meta/NdLoop.hpp" #include "alpaka/meta/NonZero.hpp" #include "alpaka/meta/Set.hpp" +#include "alpaka/meta/SyclPrintf.hpp" #include "alpaka/meta/Transform.hpp" #include "alpaka/meta/TypeListOps.hpp" // offset @@ -186,6 +187,7 @@ #include "alpaka/pltf/Traits.hpp" // rand #include "alpaka/rand/RandDefault.hpp" +#include "alpaka/rand/RandGenericSycl.hpp" #include "alpaka/rand/RandPhilox.hpp" #include "alpaka/rand/RandStdLib.hpp" #include "alpaka/rand/RandUniformCudaHipRand.hpp" diff --git a/include/alpaka/atomic/AtomicGenericSycl.hpp b/include/alpaka/atomic/AtomicGenericSycl.hpp index 742adcdfa3e7..8864508d578b 100644 --- a/include/alpaka/atomic/AtomicGenericSycl.hpp +++ b/include/alpaka/atomic/AtomicGenericSycl.hpp @@ -1,4 +1,4 @@ -/* Copyright 2023 Jan Stephan, Andrea Bocci +/* Copyright 2023 Jan Stephan, Andrea Bocci, Luca Ferragina * SPDX-License-Identifier: MPL-2.0 */ diff --git a/include/alpaka/core/Sycl.hpp b/include/alpaka/core/Sycl.hpp index cab73fe17f00..3f067f5f8b39 100644 --- a/include/alpaka/core/Sycl.hpp +++ b/include/alpaka/core/Sycl.hpp @@ -1,4 +1,4 @@ -/* Copyright 2022 Jan Stephan +/* Copyright 2022 Jan Stephan, Luca Ferragina * SPDX-License-Identifier: MPL-2.0 */ @@ -146,10 +146,7 @@ namespace alpaka::trait { using type = std::conditional_t, T, typename T::element_type>; }; -} // namespace alpaka::trait -namespace alpaka::trait -{ //! The SYCL vectors' extent get trait specialization. template struct GetExtent::value>, TExtent, std::enable_if_t::value>> diff --git a/include/alpaka/dev/DevGenericSycl.hpp b/include/alpaka/dev/DevGenericSycl.hpp index 5892df450df3..aa5027d23513 100644 --- a/include/alpaka/dev/DevGenericSycl.hpp +++ b/include/alpaka/dev/DevGenericSycl.hpp @@ -1,4 +1,4 @@ -/* Copyright 2022 Jan Stephan, Antonio Di Pilato +/* Copyright 2022 Jan Stephan, Antonio Di Pilato, Luca Ferragina * SPDX-License-Identifier: MPL-2.0 */ @@ -204,7 +204,7 @@ namespace alpaka::trait template struct BufType, TElem, TDim, TIdx> { - using type = BufGenericSycl>; + using type = BufGenericSycl; }; //! The SYCL device platform type trait specialization. diff --git a/include/alpaka/kernel/TaskKernelCpuSyclIntel.hpp b/include/alpaka/kernel/TaskKernelCpuSyclIntel.hpp index 18b67d51a8e1..fbfc88b9c4b3 100644 --- a/include/alpaka/kernel/TaskKernelCpuSyclIntel.hpp +++ b/include/alpaka/kernel/TaskKernelCpuSyclIntel.hpp @@ -1,4 +1,4 @@ -/* Copyright 2022 Jan Stephan +/* Copyright 2022 Jan Stephan, Luca Ferragina * SPDX-License-Identifier: MPL-2.0 */ diff --git a/include/alpaka/kernel/TaskKernelGenericSycl.hpp b/include/alpaka/kernel/TaskKernelGenericSycl.hpp index 8dfb20fc6d92..d4f1937415cb 100644 --- a/include/alpaka/kernel/TaskKernelGenericSycl.hpp +++ b/include/alpaka/kernel/TaskKernelGenericSycl.hpp @@ -1,10 +1,11 @@ -/* Copyright 2023 Jan Stephan, Andrea Bocci +/* Copyright 2023 Jan Stephan, Andrea Bocci, Luca Ferragina * SPDX-License-Identifier: MPL-2.0 */ #pragma once #include "alpaka/acc/Traits.hpp" +#include "alpaka/block/shared/dyn/BlockSharedDynMemberAllocKiB.hpp" #include "alpaka/core/BoostPredef.hpp" #include "alpaka/core/STLTuple/STLTuple.hpp" #include "alpaka/core/Sycl.hpp" @@ -48,9 +49,8 @@ namespace alpaka::detail template inline auto require( sycl::handler& cgh, - experimental:: - Accessor::value, TAccessModes>, TElem, TIdx, TDim, TAccessModes> - acc, + experimental::Accessor::value, TAccessModes>, TElem, TIdx, TDim, TAccessModes> + acc, special) { cgh.require(acc.m_accessor); @@ -136,7 +136,8 @@ namespace alpaka auto output_stream = sycl::stream{buf_size, buf_per_work_item, cgh}; # endif - cgh.parallel_for>( + // cgh.parallel_for>( //FIXME_ + cgh.parallel_for( sycl::nd_range{global_size, local_size}, [=](sycl::nd_item work_item) { diff --git a/include/alpaka/math/MathGenericSycl.hpp b/include/alpaka/math/MathGenericSycl.hpp index cf7233826741..d48b32b1704f 100644 --- a/include/alpaka/math/MathGenericSycl.hpp +++ b/include/alpaka/math/MathGenericSycl.hpp @@ -1,4 +1,4 @@ -/* Copyright 2022 Jan Stephan +/* Copyright 2022 Jan Stephan, Luca Ferragina * SPDX-License-Identifier: MPL-2.0 */ diff --git a/include/alpaka/mem/buf/BufCpuSyclIntel.hpp b/include/alpaka/mem/buf/BufCpuSyclIntel.hpp index 967dbc8141a4..0d73ffe281c6 100644 --- a/include/alpaka/mem/buf/BufCpuSyclIntel.hpp +++ b/include/alpaka/mem/buf/BufCpuSyclIntel.hpp @@ -1,4 +1,4 @@ -/* Copyright 2022 Jan Stephan +/* Copyright 2022 Jan Stephan, Luca Ferragina * SPDX-License-Identifier: MPL-2.0 */ @@ -12,7 +12,7 @@ namespace alpaka { template - using BufCpuSyclIntel = BufGenericSycl; + using BufCpuSyclIntel = BufGenericSycl; } #endif diff --git a/include/alpaka/mem/buf/BufGenericSycl.hpp b/include/alpaka/mem/buf/BufGenericSycl.hpp index 825fad4e37f1..b60d70c8b033 100644 --- a/include/alpaka/mem/buf/BufGenericSycl.hpp +++ b/include/alpaka/mem/buf/BufGenericSycl.hpp @@ -1,4 +1,4 @@ -/* Copyright 2022 Jan Stephan +/* Copyright 2022 Jan Stephan, Luca Ferragina, Aurora Perego * SPDX-License-Identifier: MPL-2.0 */ @@ -12,6 +12,7 @@ #include "alpaka/mem/buf/BufCpu.hpp" #include "alpaka/mem/buf/Traits.hpp" #include "alpaka/mem/view/Accessor.hpp" +#include "alpaka/mem/view/ViewAccessOps.hpp" #include "alpaka/vec/Vec.hpp" #include @@ -24,22 +25,22 @@ namespace alpaka { //! The SYCL memory buffer. - template - class BufGenericSycl + template + class BufGenericSycl : public internal::ViewAccessOps> { + public: static_assert( !std::is_const_v, "The elem type of the buffer can not be const because the C++ Standard forbids containers of const " "elements!"); static_assert(!std::is_const_v, "The idx type of the buffer can not be const!"); - public: //! Constructor - template - BufGenericSycl(TDev const& dev, sycl::buffer buffer, TExtent const& extent) + template + BufGenericSycl(DevGenericSycl const& dev, TElem* const pMem, Deleter deleter, TExtent const& extent) : m_dev{dev} , m_extentElements{getExtentVecEnd(extent)} - , m_buffer{buffer} + , m_spMem(pMem, std::move(deleter)) { ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; @@ -53,92 +54,102 @@ namespace alpaka "The idx type of TExtent and the TIdx template parameter have to be identical!"); } - TDev m_dev; + DevGenericSycl m_dev; Vec m_extentElements; - sycl::buffer m_buffer; + std::shared_ptr m_spMem; }; } // namespace alpaka namespace alpaka::trait { //! The BufGenericSycl device type trait specialization. - template - struct DevType> + template + struct DevType> { - using type = TDev; + using type = DevGenericSycl; }; //! The BufGenericSycl device get trait specialization. - template - struct GetDev> + template + struct GetDev> { - static auto getDev(BufGenericSycl const& buf) + ALPAKA_FN_HOST static auto getDev(BufGenericSycl const& buf) { return buf.m_dev; } }; //! The BufGenericSycl dimension getter trait specialization. - template - struct DimType> + template + struct DimType> { using type = TDim; }; //! The BufGenericSycl memory element type get trait specialization. - template - struct ElemType> + template + struct ElemType> { using type = TElem; }; //! The BufGenericSycl extent get trait specialization. - template - struct GetExtent> + template + struct GetExtent> { static_assert(TDim::value > TIdxIntegralConst::value, "Requested dimension out of bounds"); - static auto getExtent(BufGenericSycl const& buf) -> TIdx + ALPAKA_FN_HOST static auto getExtent(BufGenericSycl const& buf) -> TIdx { return buf.m_extentElements[TIdxIntegralConst::value]; } }; //! The BufGenericSycl native pointer get trait specialization. - template - struct GetPtrNative> + template + struct GetPtrNative> { - static_assert( - !sizeof(TElem), - "Accessing device-side pointers on the host is not supported by the SYCL back-end"); - - static auto getPtrNative(BufGenericSycl const&) -> TElem const* + ALPAKA_FN_HOST static auto getPtrNative(BufGenericSycl const& buf) -> TElem const* { - return nullptr; + return buf.m_spMem.get(); } - static auto getPtrNative(BufGenericSycl&) -> TElem* + ALPAKA_FN_HOST static auto getPtrNative(BufGenericSycl& buf) -> TElem* { - return nullptr; + return buf.m_spMem.get(); } }; //! The BufGenericSycl pointer on device get trait specialization. - template - struct GetPtrDev, TDev> + template + struct GetPtrDev, DevGenericSycl> { - static_assert( - !sizeof(TElem), - "Accessing device-side pointers on the host is not supported by the SYCL back-end"); - - static auto getPtrDev(BufGenericSycl const&, TDev const&) -> TElem const* + ALPAKA_FN_HOST static auto getPtrDev( + BufGenericSycl const& buf, + DevGenericSycl const& dev) -> TElem const* { - return nullptr; + if(dev == getDev(buf)) + { + return buf.m_spMem.get(); + } + else + { + throw std::runtime_error("The buffer is not accessible from the given device!"); + } } - static auto getPtrDev(BufGenericSycl&, TDev const&) -> TElem* + ALPAKA_FN_HOST static auto getPtrDev( + BufGenericSycl& buf, + DevGenericSycl const& dev) -> TElem* { - return nullptr; + if(dev == getDev(buf)) + { + return buf.m_spMem.get(); + } + else + { + throw std::runtime_error("The buffer is not accessible from the given device!"); + } } }; @@ -147,68 +158,89 @@ namespace alpaka::trait struct BufAlloc> { template - static auto allocBuf(DevGenericSycl const& dev, TExtent const& ext) - -> BufGenericSycl> + ALPAKA_FN_HOST static auto allocBuf(DevGenericSycl const& dev, TExtent const& extent) + -> BufGenericSycl { ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; +# if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL if constexpr(TDim::value == 0 || TDim::value == 1) { - auto const width = getWidth(ext); + auto const width = getWidth(extent); -# if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL auto const widthBytes = width * static_cast(sizeof(TElem)); std::cout << __func__ << " ew: " << width << " ewb: " << widthBytes << '\n'; -# endif - - auto const range = sycl::range<1>{width}; - return {dev, sycl::buffer{range}, ext}; } else if constexpr(TDim::value == 2) { - auto const width = getWidth(ext); - auto const height = getHeight(ext); + auto const width = getWidth(extent); + auto const height = getHeight(extent); -# if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL auto const widthBytes = width * static_cast(sizeof(TElem)); std::cout << __func__ << " ew: " << width << " eh: " << height << " ewb: " << widthBytes << " pitch: " << widthBytes << '\n'; -# endif - - auto const range = sycl::range<2>{width, height}; - return {dev, sycl::buffer{range}, ext}; } else if constexpr(TDim::value == 3) { - auto const width = getWidth(ext); - auto const height = getHeight(ext); - auto const depth = getDepth(ext); + auto const width = getWidth(extent); + auto const height = getHeight(extent); + auto const depth = getDepth(extent); -# if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL auto const widthBytes = width * static_cast(sizeof(TElem)); std::cout << __func__ << " ew: " << width << " eh: " << height << " ed: " << depth << " ewb: " << widthBytes << " pitch: " << widthBytes << '\n'; + } # endif - auto const range = sycl::range<3>{width, height, depth}; - return {dev, sycl::buffer{range}, ext}; - } + auto* memPtr = sycl::malloc_device( + static_cast(getExtentProduct(extent)), + dev.getNativeHandle().first, + dev.getNativeHandle().second); + auto deleter = [&dev](TElem* ptr) { sycl::free(ptr, dev.getNativeHandle().second); }; + + return BufGenericSycl(dev, memPtr, std::move(deleter), extent); } }; + //! The BufGenericSycl stream-ordered memory allocation capability trait specialization. + template + struct HasAsyncBufSupport> : public std::false_type + { + }; + //! The BufGenericSycl offset get trait specialization. - template - struct GetOffset> + template + struct GetOffset> { - static auto getOffset(BufGenericSycl const&) -> TIdx + static auto getOffset(BufGenericSycl const&) -> TIdx { return 0u; } }; + //! The pinned/mapped memory allocation trait specialization for the SYCL devices. + template + struct BufAllocMapped + { + template + ALPAKA_FN_HOST static auto allocMappedBuf(DevCpu const& host, TExtent const& extent) + -> BufCpu + { + ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; + + // Allocate SYCL page-locked memory on the host, mapped into the TPltf address space and + // accessible to all devices in the TPltf. + auto ctx = TPltf::syclContext(); + TElem* memPtr = sycl::malloc_host(static_cast(getExtentProduct(extent)), ctx); + auto deleter = [ctx](TElem* ptr) { sycl::free(ptr, ctx); }; + + return BufCpu(host, memPtr, std::move(deleter), extent); + } + }; + //! The BufGenericSycl idx type trait specialization. - template - struct IdxType> + template + struct IdxType> { using type = TIdx; }; @@ -217,16 +249,14 @@ namespace alpaka::trait template struct GetPtrDev, DevGenericSycl> { - static_assert(!sizeof(TElem), "Accessing host pointers on the device is not supported by the SYCL back-end"); - - static auto getPtrDev(BufCpu const&, DevGenericSycl const&) -> TElem const* + ALPAKA_FN_HOST static auto getPtrDev(BufCpu const& buf, DevGenericSycl const&) + -> TElem const* { - return nullptr; + return getPtrNative(buf); } - - static auto getPtrDev(BufCpu&, DevGenericSycl const&) -> TElem* + ALPAKA_FN_HOST static auto getPtrDev(BufCpu& buf, DevGenericSycl const&) -> TElem* { - return nullptr; + return getPtrNative(buf); } }; } // namespace alpaka::trait diff --git a/include/alpaka/mem/buf/BufGpuSyclIntel.hpp b/include/alpaka/mem/buf/BufGpuSyclIntel.hpp index 0b27ab2ae1f9..cd12f975c8d4 100644 --- a/include/alpaka/mem/buf/BufGpuSyclIntel.hpp +++ b/include/alpaka/mem/buf/BufGpuSyclIntel.hpp @@ -1,4 +1,4 @@ -/* Copyright 2022 Jan Stephan +/* Copyright 2022 Jan Stephan, Luca Ferragina * SPDX-License-Identifier: MPL-2.0 */ @@ -12,7 +12,7 @@ namespace alpaka { template - using BufGpuSyclIntel = BufGenericSycl; + using BufGpuSyclIntel = BufGenericSycl; } #endif diff --git a/include/alpaka/mem/buf/sycl/Copy.hpp b/include/alpaka/mem/buf/sycl/Copy.hpp index e6b1ae03836e..cfede497adb1 100644 --- a/include/alpaka/mem/buf/sycl/Copy.hpp +++ b/include/alpaka/mem/buf/sycl/Copy.hpp @@ -1,4 +1,4 @@ -/* Copyright 2022 Jan Stephan +/* Copyright 2022 Jan Stephan, Luca Ferragina, Aurora Perego * SPDX-License-Identifier: MPL-2.0 */ @@ -14,6 +14,9 @@ #include "alpaka/extent/Traits.hpp" #include "alpaka/mem/buf/sycl/Common.hpp" #include "alpaka/mem/view/Traits.hpp" +#include "alpaka/meta/NdLoop.hpp" +#include "alpaka/queue/QueueGenericSyclBlocking.hpp" +#include "alpaka/queue/QueueGenericSyclNonBlocking.hpp" #include #include @@ -24,41 +27,145 @@ namespace alpaka::detail { - template - using SrcAccessor = sycl:: - accessor; - - template - using DstAccessor = sycl::accessor< - TElem, - TDim, - sycl::access_mode::write, - sycl::target::global_buffer, - sycl::access::placeholder::true_t>; - - enum class Direction + //! The Sycl device memory copy task base. + template + struct TaskCopySyclBase { - h2d, - d2h, - d2d + static_assert( + std::is_same_v>, std::remove_const_t>>, + "The source and the destination view are required to have the same element type!"); + using ExtentSize = Idx; + using DstSize = Idx; + using SrcSize = Idx; + using Elem = alpaka::Elem; + + template + TaskCopySyclBase(TViewFwd&& viewDst, TViewSrc const& viewSrc, TExtent const& extent) + : m_extent(getExtentVec(extent)) + , m_extentWidthBytes(m_extent[TDim::value - 1u] * static_cast(sizeof(Elem))) +# if(!defined(NDEBUG)) || (ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL) + , m_dstExtent(getExtentVec(viewDst)) + , m_srcExtent(getExtentVec(viewSrc)) +# endif + , m_dstPitchBytes(getPitchBytesVec(viewDst)) + , m_srcPitchBytes(getPitchBytesVec(viewSrc)) + , m_dstMemNative(reinterpret_cast(getPtrNative(viewDst))) + , m_srcMemNative(reinterpret_cast(getPtrNative(viewSrc))) + { + if constexpr(TDim::value > 0) + { + ALPAKA_ASSERT((castVec(m_extent) <= m_dstExtent).foldrAll(std::logical_or())); + ALPAKA_ASSERT((castVec(m_extent) <= m_srcExtent).foldrAll(std::logical_or())); + } + } + +# if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL + ALPAKA_FN_HOST auto printDebug() const -> void + { + std::cout << __func__ << " e: " << m_extent << " ewb: " << this->m_extentWidthBytes + << " de: " << m_dstExtent << " dptr: " << reinterpret_cast(m_dstMemNative) + << " se: " << m_srcExtent << " sptr: " << reinterpret_cast(m_srcMemNative) + << std::endl; + } +# endif + + Vec const m_extent; + ExtentSize const m_extentWidthBytes; +# if(!defined(NDEBUG)) || (ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL) + Vec const m_dstExtent; + Vec const m_srcExtent; +# endif + + Vec const m_dstPitchBytes; + Vec const m_srcPitchBytes; + std::uint8_t* const m_dstMemNative; + std::uint8_t const* const m_srcMemNative; + static constexpr auto is_sycl_task = true; }; - template - struct TaskCopySycl + //! The Sycl device ND memory copy task. + template + struct TaskCopySycl : public TaskCopySyclBase { - auto operator()(sycl::handler& cgh) const -> void + using DimMin1 = DimInt; + using typename TaskCopySyclBase::ExtentSize; + using typename TaskCopySyclBase::DstSize; + using typename TaskCopySyclBase::SrcSize; + + using TaskCopySyclBase::TaskCopySyclBase; + + ALPAKA_FN_HOST auto operator()(sycl::handler& cgh) const -> void + { + ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; + +# if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL + this->printDebug(); +# endif + if(static_cast(this->m_extent.prod()) != 0u) + { + cgh.memcpy( + reinterpret_cast(this->m_dstMemNative), + reinterpret_cast(this->m_srcMemNative), + static_cast(this->m_extentWidthBytes * this->m_extent.prod())); + } + } + }; + + //! The SYCL device 1D memory copy task. + template + struct TaskCopySycl, TViewDst, TViewSrc, TExtent> + : TaskCopySyclBase, TViewDst, TViewSrc, TExtent> + { + using TaskCopySyclBase, TViewDst, TViewSrc, TExtent>::TaskCopySyclBase; + + ALPAKA_FN_HOST auto operator()(sycl::handler& cgh) const -> void { - if constexpr(TDirection == Direction::d2h || TDirection == Direction::d2d) - cgh.require(m_src); + ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; + +# if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL + this->printDebug(); +# endif + if(static_cast(this->m_extent.prod()) != 0u) + { + cgh.memcpy( + reinterpret_cast(this->m_dstMemNative), + reinterpret_cast(this->m_srcMemNative), + static_cast(this->m_extentWidthBytes)); + } + } + }; - if constexpr(TDirection == Direction::h2d || TDirection == Direction::d2d) - cgh.require(m_dst); + //! The scalar SYCL memory copy trait. + template + struct TaskCopySycl, TViewDst, TViewSrc, TExtent> + { + static_assert( + std::is_same_v>, std::remove_const_t>>, + "The source and the destination view are required to have the same element type!"); + + using Elem = alpaka::Elem; + + template + ALPAKA_FN_HOST TaskCopySycl( + TViewDstFwd&& viewDst, + TViewSrc const& viewSrc, + [[maybe_unused]] TExtent const& extent) + : m_dstMemNative(reinterpret_cast(getPtrNative(viewDst))) + , m_srcMemNative(reinterpret_cast(getPtrNative(viewSrc))) + { + // all zero-sized extents are equivalent + ALPAKA_ASSERT(getExtentVec(extent).prod() == 1u); + ALPAKA_ASSERT(getExtentVec(viewDst).prod() == 1u); + ALPAKA_ASSERT(getExtentVec(viewSrc).prod() == 1u); + } - cgh.copy(m_src, m_dst); + auto operator()(sycl::handler& cgh) const -> void + { + cgh.memcpy(m_dstMemNative, m_srcMemNative, sizeof(Elem)); } - TSrc m_src; - TDst m_dst; + void* m_dstMemNative; + void const* m_srcMemNative; static constexpr auto is_sycl_task = true; }; } // namespace alpaka::detail @@ -67,76 +174,53 @@ namespace alpaka::detail namespace alpaka::trait { //! The SYCL host-to-device memory copy trait specialization. - template + template struct CreateTaskMemcpy, DevCpu> { template - static auto createTaskMemcpy(TViewDstFwd&& viewDst, TViewSrc const& viewSrc, TExtent const& ext) + ALPAKA_FN_HOST static auto createTaskMemcpy( + TViewDstFwd&& viewDst, + TViewSrc const& viewSrc, + TExtent const& extent) + -> alpaka::detail::TaskCopySycl, TViewSrc, TExtent> { ALPAKA_DEBUG_FULL_LOG_SCOPE; - constexpr auto copy_dim = static_cast(Dim::value); - using ElemType = Elem>; - using SrcType = ElemType const*; - using DstType = alpaka::detail::DstAccessor; - - auto const range = detail::make_sycl_range(ext); - auto const offset = detail::make_sycl_offset(viewDst); - - return detail::TaskCopySycl{ - getPtrNative(viewSrc), - DstType{viewDst.m_buffer, range, offset}}; + return {std::forward(viewDst), viewSrc, extent}; } }; //! The SYCL device-to-host memory copy trait specialization. - template + template struct CreateTaskMemcpy> { template - static auto createTaskMemcpy(TViewDstFwd&& viewDst, TViewSrc const& viewSrc, TExtent const& ext) + ALPAKA_FN_HOST static auto createTaskMemcpy( + TViewDstFwd&& viewDst, + TViewSrc const& viewSrc, + TExtent const& extent) + -> alpaka::detail::TaskCopySycl, TViewSrc, TExtent> { ALPAKA_DEBUG_FULL_LOG_SCOPE; - constexpr auto copy_dim = static_cast(Dim::value); - using ElemType = Elem>; - using SrcType = alpaka::detail::SrcAccessor; - using DstType = ElemType*; - - auto const range = detail::make_sycl_range(ext); - auto const offset = detail::make_sycl_offset(viewSrc); - - auto view_src = const_cast(viewSrc); - - return detail::TaskCopySycl{ - SrcType{view_src.m_buffer, range, offset}, - getPtrNative(viewDst)}; + return {std::forward(viewDst), viewSrc, extent}; } }; //! The SYCL device-to-device memory copy trait specialization. - template + template struct CreateTaskMemcpy, DevGenericSycl> { template - static auto createTaskMemcpy(TViewDstFwd&& viewDst, TViewSrc const& viewSrc, TExtent const& ext) + ALPAKA_FN_HOST static auto createTaskMemcpy( + TViewDstFwd&& viewDst, + TViewSrc const& viewSrc, + TExtent const& extent) + -> alpaka::detail::TaskCopySycl, TViewSrc, TExtent> { ALPAKA_DEBUG_FULL_LOG_SCOPE; - constexpr auto copy_dim = static_cast(Dim::value); - using ElemType = Elem>; - using SrcType = alpaka::detail::SrcAccessor; - using DstType = alpaka::detail::DstAccessor; - - auto const range = detail::make_sycl_range(ext); - auto const offset_src = detail::make_sycl_offset(viewSrc); - auto const offset_dst = detail::make_sycl_offset(viewDst); - - auto view_src = const_cast(viewSrc); - - return detail::TaskCopySycl{ - SrcType{view_src.m_buffer, range, offset_src}, - DstType{viewDst.m_buffer, range, offset_dst}}; + return {std::forward(viewDst), viewSrc, extent}; } }; } // namespace alpaka::trait diff --git a/include/alpaka/mem/buf/sycl/Set.hpp b/include/alpaka/mem/buf/sycl/Set.hpp index f584e0461e87..25d602228424 100644 --- a/include/alpaka/mem/buf/sycl/Set.hpp +++ b/include/alpaka/mem/buf/sycl/Set.hpp @@ -1,4 +1,4 @@ -/* Copyright 2022 Jan Stephan +/* Copyright 2022 Jan Stephan, Luca Ferragina, Aurora Perego * SPDX-License-Identifier: MPL-2.0 */ @@ -12,6 +12,9 @@ #include "alpaka/extent/Traits.hpp" #include "alpaka/mem/buf/sycl/Common.hpp" #include "alpaka/mem/view/Traits.hpp" +#include "alpaka/meta/NdLoop.hpp" +#include "alpaka/queue/QueueGenericSyclBlocking.hpp" +#include "alpaka/queue/QueueGenericSyclNonBlocking.hpp" #include "alpaka/queue/Traits.hpp" #include @@ -25,29 +28,144 @@ namespace alpaka namespace detail { - template - using Accessor = sycl::accessor< - std::byte, - TDim, - sycl::access_mode::write, - sycl::target::global_buffer, - sycl::access::placeholder::true_t>; - - //! The SYCL memory set trait. - template - struct TaskSetSycl + //! The SYCL ND memory set task base. + template + struct TaskSetSyclBase { + using ExtentSize = Idx; + using DstSize = Idx; + using Elem = alpaka::Elem; + + template + TaskSetSyclBase(TViewFwd&& view, std::uint8_t const& byte, TExtent const& extent) + : m_byte(byte) + , m_extent(getExtentVec(extent)) + , m_extentWidthBytes(m_extent[TDim::value - 1u] * static_cast(sizeof(Elem))) +# if(!defined(NDEBUG)) || (ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL) + , m_dstExtent(getExtentVec(view)) +# endif + + , m_dstPitchBytes(getPitchBytesVec(view)) + , m_dstMemNative(reinterpret_cast(getPtrNative(view))) + + { + ALPAKA_ASSERT((castVec(m_extent) <= m_dstExtent).foldrAll(std::logical_or())); + ALPAKA_ASSERT(m_extentWidthBytes <= m_dstPitchBytes[TDim::value - 1u]); + } + +# if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL + ALPAKA_FN_HOST auto printDebug() const -> void + { + std::cout << __func__ << " e: " << this->m_extent << " ewb: " << this->m_extentWidthBytes + << " de: " << this->m_dstExtent << " dptr: " << reinterpret_cast(this->m_dstMemNative) + << " dpitchb: " << this->m_dstPitchBytes << std::endl; + } +# endif + + std::uint8_t const m_byte; + Vec const m_extent; + ExtentSize const m_extentWidthBytes; +# if(!defined(NDEBUG)) || (ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL) + Vec const m_dstExtent; +# endif + Vec const m_dstPitchBytes; + std::uint8_t* const m_dstMemNative; + static constexpr auto is_sycl_task = true; + }; + + //! The SYCL device ND memory set task. + template + struct TaskSetSycl : public TaskSetSyclBase + { + using DimMin1 = DimInt; + using typename TaskSetSyclBase::ExtentSize; + using typename TaskSetSyclBase::DstSize; + + using TaskSetSyclBase::TaskSetSyclBase; + auto operator()(sycl::handler& cgh) const -> void { - cgh.require(m_accessor); - cgh.fill(m_accessor, m_value); + ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; + +# if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL + this->printDebug(); +# endif + if(static_cast(this->m_extent.prod()) != 0u) + { + cgh.memset( + reinterpret_cast(this->m_dstMemNative), + this->m_byte, + static_cast(this->m_extentWidthBytes * this->m_extent.prod())); + } } + }; - TAccessor m_accessor; - std::byte m_value; - // Distinguish from non-alpaka types (= host tasks) + //! The 1D SYCL memory set task. + template + struct TaskSetSycl, TView, TExtent> : public TaskSetSyclBase, TView, TExtent> + { + using TaskSetSyclBase, TView, TExtent>::TaskSetSyclBase; + + auto operator()(sycl::handler& cgh) const -> void + { + ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; + +# if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL + this->printDebug(); +# endif + if(static_cast(this->m_extent.prod()) != 0u) + { + cgh.memset( + reinterpret_cast(this->m_dstMemNative), + this->m_byte, + static_cast(this->m_extentWidthBytes)); + } + } + }; + + //! The SYCL device scalar memory set task. + template + struct TaskSetSycl, TView, TExtent> + { + using ExtentSize = Idx; + using Scalar = Vec, ExtentSize>; + using DstSize = Idx; + using Elem = alpaka::Elem; + + template + TaskSetSycl(TViewFwd&& view, std::uint8_t const& byte, [[maybe_unused]] TExtent const& extent) + : m_byte(byte) + , m_dstMemNative(reinterpret_cast(getPtrNative(view))) + { + // all zero-sized extents are equivalent + ALPAKA_ASSERT(getExtentVec(extent).prod() == 1u); + ALPAKA_ASSERT(getExtentVec(view).prod() == 1u); + } + +# if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL + ALPAKA_FN_HOST auto printDebug() const -> void + { + std::cout << __func__ << " e: " << Scalar() << " ewb: " << sizeof(Elem) << " de: " << Scalar() + << " dptr: " << reinterpret_cast(m_dstMemNative) << " dpitchb: " << Scalar() + << std::endl; + } +# endif + + auto operator()(sycl::handler& cgh) const -> void + { + ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; + +# if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL + printDebug(); +# endif + cgh.memset(reinterpret_cast(m_dstMemNative), m_byte, sizeof(Elem)); + } + + std::uint8_t const m_byte; + std::uint8_t* const m_dstMemNative; static constexpr auto is_sycl_task = true; }; + } // namespace detail @@ -57,25 +175,15 @@ namespace alpaka template struct CreateTaskMemset> { - template - static auto createTaskMemset(TViewFwd&& view, std::uint8_t const& byte, TExtent const& ext) + template + ALPAKA_FN_HOST static auto createTaskMemset(TView& view, std::uint8_t const& byte, TExtent const& extent) + -> detail::TaskSetSycl { - ALPAKA_DEBUG_FULL_LOG_SCOPE; - - constexpr auto set_dim = static_cast(Dim::value); - using TView = std::remove_reference_t; - using ElemType = Elem; - using DstType = alpaka::detail::Accessor; - - // Reinterpret as byte buffer - auto buf = view.m_buffer.template reinterpret(); - auto const byte_val = static_cast(byte); - - auto const range = detail::make_sycl_range(ext, sizeof(ElemType)); - return detail::TaskSetSycl{DstType{buf, range}, byte_val}; + return detail::TaskSetSycl(view, byte, extent); } }; + } // namespace trait -} // namespace alpaka +} // namespace alpaka #endif diff --git a/include/alpaka/mem/fence/MemFenceGenericSycl.hpp b/include/alpaka/mem/fence/MemFenceGenericSycl.hpp index c3d96113e13f..c0a308448f8e 100644 --- a/include/alpaka/mem/fence/MemFenceGenericSycl.hpp +++ b/include/alpaka/mem/fence/MemFenceGenericSycl.hpp @@ -32,6 +32,13 @@ namespace alpaka static constexpr auto scope = sycl::memory_scope::device; static constexpr auto space = sycl::access::address_space::global_space; }; + + template<> + struct SyclFenceProps + { + static constexpr auto scope = sycl::memory_scope::device; + static constexpr auto space = sycl::access::address_space::global_space; + }; } // namespace detail //! The SYCL memory fence. diff --git a/include/alpaka/meta/SyclPrintf.hpp b/include/alpaka/meta/SyclPrintf.hpp new file mode 100644 index 000000000000..ef432e15a87f --- /dev/null +++ b/include/alpaka/meta/SyclPrintf.hpp @@ -0,0 +1,35 @@ +/* Copyright 2022 Andrea Bocci + * + * This file is part of alpaka. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#pragma once + +#include + +#ifdef ALPAKA_ACC_SYCL_ENABLED + +// Kill printf in AMD GPU code because of missing compiler support +# ifdef __AMDGCN__ +# include // the define breaks if it is included afterwards +# define printf(...) +# else + +# ifdef __SYCL_DEVICE_ONLY__ +# define CONSTANT __attribute__((opencl_constant)) +# else +# define CONSTANT +# endif + +# define printf(FORMAT, ...) \ + do \ + { \ + static const CONSTANT char format[] = FORMAT; \ + sycl::ext::oneapi::experimental::printf(format, ##__VA_ARGS__); \ + } while(false) +# endif +#endif diff --git a/include/alpaka/pltf/PltfCpuSyclIntel.hpp b/include/alpaka/pltf/PltfCpuSyclIntel.hpp index b8f3815ddaed..1c28a1288610 100644 --- a/include/alpaka/pltf/PltfCpuSyclIntel.hpp +++ b/include/alpaka/pltf/PltfCpuSyclIntel.hpp @@ -1,4 +1,4 @@ -/* Copyright 2023 Jan Stephan, Andrea Bocci +/* Copyright 2023 Jan Stephan, Luca Ferragina, Andrea Bocci * SPDX-License-Identifier: MPL-2.0 */ diff --git a/include/alpaka/pltf/PltfFpgaSyclIntel.hpp b/include/alpaka/pltf/PltfFpgaSyclIntel.hpp index ef0b0cfa35c6..1d908c832f1b 100644 --- a/include/alpaka/pltf/PltfFpgaSyclIntel.hpp +++ b/include/alpaka/pltf/PltfFpgaSyclIntel.hpp @@ -1,4 +1,4 @@ -/* Copyright 2023 Jan Stephan, Andrea Bocci +/* Copyright 2023 Jan Stephan, Andrea Bocci, Luca Ferragina * SPDX-License-Identifier: MPL-2.0 */ diff --git a/include/alpaka/pltf/PltfGenericSycl.hpp b/include/alpaka/pltf/PltfGenericSycl.hpp index e1e0d38ea174..bd7f351c17a8 100644 --- a/include/alpaka/pltf/PltfGenericSycl.hpp +++ b/include/alpaka/pltf/PltfGenericSycl.hpp @@ -1,4 +1,4 @@ -/* Copyright 2023 Jan Stephan +/* Copyright 2023 Jan Stephan, Luca Ferragina * SPDX-License-Identifier: MPL-2.0 */ diff --git a/include/alpaka/pltf/PltfGpuSyclIntel.hpp b/include/alpaka/pltf/PltfGpuSyclIntel.hpp index 4c54e6c6ec1c..be1ae4643810 100644 --- a/include/alpaka/pltf/PltfGpuSyclIntel.hpp +++ b/include/alpaka/pltf/PltfGpuSyclIntel.hpp @@ -1,4 +1,4 @@ -/* Copyright 2023 Jan Stephan, Andrea Bocci +/* Copyright 2023 Jan Stephan, Andrea Bocci, Luca Ferragina * SPDX-License-Identifier: MPL-2.0 */ diff --git a/include/alpaka/queue/QueueCpuSyclIntelBlocking.hpp b/include/alpaka/queue/QueueCpuSyclIntelBlocking.hpp index 844375913d95..039a05b3229f 100644 --- a/include/alpaka/queue/QueueCpuSyclIntelBlocking.hpp +++ b/include/alpaka/queue/QueueCpuSyclIntelBlocking.hpp @@ -1,4 +1,4 @@ -/* Copyright 2022 Jan Stephan +/* Copyright 2022 Jan Stephan, Luca Ferragina * SPDX-License-Identifier: MPL-2.0 */ diff --git a/include/alpaka/queue/QueueCpuSyclIntelNonBlocking.hpp b/include/alpaka/queue/QueueCpuSyclIntelNonBlocking.hpp index 77f20c1acbba..02391231f5f5 100644 --- a/include/alpaka/queue/QueueCpuSyclIntelNonBlocking.hpp +++ b/include/alpaka/queue/QueueCpuSyclIntelNonBlocking.hpp @@ -1,4 +1,4 @@ -/* Copyright 2022 Jan Stephan +/* Copyright 2022 Jan Stephan, Luca Ferragina * SPDX-License-Identifier: MPL-2.0 */ diff --git a/include/alpaka/queue/sycl/QueueGenericSyclBase.hpp b/include/alpaka/queue/sycl/QueueGenericSyclBase.hpp index 297b29732f24..2db22b6d3abb 100644 --- a/include/alpaka/queue/sycl/QueueGenericSyclBase.hpp +++ b/include/alpaka/queue/sycl/QueueGenericSyclBase.hpp @@ -1,4 +1,4 @@ -/* Copyright 2022 Jan Stephan, Antonio Di Pilato +/* Copyright 2022 Jan Stephan, Antonio Di Pilato, Luca Ferragina * SPDX-License-Identifier: MPL-2.0 */ @@ -8,6 +8,7 @@ #include "alpaka/dev/Traits.hpp" #include "alpaka/event/Traits.hpp" #include "alpaka/queue/Traits.hpp" +#include "alpaka/traits/Traits.hpp" #include "alpaka/wait/Traits.hpp" #include diff --git a/include/alpaka/rand/RandGenericSycl.hpp b/include/alpaka/rand/RandGenericSycl.hpp new file mode 100644 index 000000000000..bd274e9a526a --- /dev/null +++ b/include/alpaka/rand/RandGenericSycl.hpp @@ -0,0 +1,238 @@ +/* Copyright 2022 Luca Ferragina + * + * This file is part of alpaka. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#pragma once + +#if defined(ALPAKA_ACC_SYCL_ENABLED) + +# include +# include +# include +# include + +// Backend specific imports. +# if defined(ALPAKA_ACC_SYCL_ENABLED) +# include +# include + +# include +# endif + +# include + +namespace alpaka::rand +{ + //! The SYCL rand implementation. + class RandGenericSycl : public concepts::Implements + { + }; + +# if !defined(ALPAKA_HOST_ONLY) + + namespace distribution::sycl_rand + { + //! The SYCL random number floating point normal distribution. + template + class NormalReal; + + //! The SYCL random number floating point uniform distribution. + template + class UniformReal; + + //! The SYCL random number integer uniform distribution. + template + class UniformUint; + } // namespace distribution::sycl_rand + + namespace engine::sycl_rand + { + //! The SYCL linear congruential random number generator engine. + class Minstd + { + public: + // After calling this constructor the instance is not valid initialized and + // need to be overwritten with a valid object + Minstd() = default; + + Minstd(std::uint32_t const& seed, std::uint32_t const& subsequence = 0, std::uint32_t const& offset = 0) + { + oneapi::dpl::minstd_rand engine(seed, offset); + rng_engine = engine; + } + + private: + template + friend class distribution::sycl_rand::NormalReal; + template + friend class distribution::sycl_rand::UniformReal; + template + friend class distribution::sycl_rand::UniformUint; + + oneapi::dpl::minstd_rand rng_engine; + + public: + using result_type = decltype(rng_engine); + + // ALPAKA_FN_HOST_ACC constexpr static result_type min() + // { + // return std::numeric_limits::min(); + // } + // ALPAKA_FN_HOST_ACC constexpr static result_type max() + // { + // return std::numeric_limits::max(); + // } + result_type operator()() + { + return rng_engine; + } + }; + } // namespace engine::sycl_rand + + namespace distribution::sycl_rand + { + //! The SYCL random number float normal distribution. + template<> + class NormalReal + { + public: + template + auto operator()(TEngine& engine) -> float + { + // Create float uniform_real_distribution distribution + oneapi::dpl::normal_distribution distr; + + // Generate float random number + return distr(engine); + } + }; + + //! The SYCL random number double normal distribution. + template<> + class NormalReal + { + public: + template + auto operator()(TEngine& engine) -> double + { + // Create float uniform_real_distribution distribution + oneapi::dpl::normal_distribution distr; + + // Generate float random number + return distr(engine); + } + }; + + //! The SYCL random number float uniform distribution. + template<> + class UniformReal + { + public: + template + auto operator()(TEngine& engine) -> float + { + // Create float uniform_real_distribution distribution + oneapi::dpl::uniform_real_distribution distr; + + // Generate float random number + return distr(engine); + // NOTE: (1.0f - curand_uniform) does not work, because curand_uniform seems to return + // denormalized floats around 0.f. [0.f, 1.0f) + // return fUniformRand * static_cast(fUniformRand != 1.0f); + } + }; + + //! The SYCL random number float uniform distribution. + template<> + class UniformReal + { + public: + template + auto operator()(TEngine& engine) -> double + { + // Create float uniform_real_distribution distribution + oneapi::dpl::uniform_real_distribution distr; + + // Generate float random number + return distr(engine); + // NOTE: (1.0f - curand_uniform_double) does not work, because curand_uniform_double seems to + // return denormalized floats around 0.f. [0.f, 1.0f) + // return fUniformRand * static_cast(fUniformRand != 1.0); + } + }; + + //! The SYCL random number unsigned integer uniform distribution. + template<> + class UniformUint + { + public: + template + auto operator()(TEngine& engine) -> unsigned int + { + // Create float uniform_real_distribution distribution + oneapi::dpl::uniform_int_distribution distr; + + // Generate float random number + return distr(engine); + } + }; + } // namespace distribution::sycl_rand + + namespace distribution::trait + { + //! The SYCL random number float normal distribution get trait specialization. + template + struct CreateNormalReal>> + { + static auto createNormalReal(RandGenericSycl const& /*rand*/) -> sycl_rand::NormalReal + { + return {}; + } + }; + + //! The SYCL random number float uniform distribution get trait specialization. + template + struct CreateUniformReal>> + { + static auto createUniformReal(RandGenericSycl const& /*rand*/) -> sycl_rand::UniformReal + { + return {}; + } + }; + + //! The SYCL random number integer uniform distribution get trait specialization. + template + struct CreateUniformUint>> + { + static auto createUniformUint(RandGenericSycl const& /*rand*/) -> sycl_rand::UniformUint + { + return {}; + } + }; + } // namespace distribution::trait + + namespace engine::trait + { + //! The SYCL random number default generator get trait specialization. + template<> + struct CreateDefault + { + static auto createDefault( + RandGenericSycl const& /*rand*/, + std::uint32_t const& seed = 0, + std::uint32_t const& subsequence = 0, + std::uint32_t const& offset = 0) -> sycl_rand::Minstd + { + return {seed, subsequence, offset}; + } + }; + } // namespace engine::trait +# endif +} // namespace alpaka::rand + +#endif diff --git a/include/alpaka/test/Check.hpp b/include/alpaka/test/Check.hpp index 253cb477fe9b..28f064701ccb 100644 --- a/include/alpaka/test/Check.hpp +++ b/include/alpaka/test/Check.hpp @@ -1,4 +1,4 @@ -/* Copyright 2022 Benjamin Worpitz, Jan Stephan +/* Copyright 2022 Benjamin Worpitz, Jan Stephan, Luca Ferragina * SPDX-License-Identifier: MPL-2.0 */ @@ -14,7 +14,16 @@ { \ if(!(expression)) \ { \ - acc.cout << "ALPAKA_CHECK failed because '!(" << #expression << ")'\n"; \ + /*if constexpr(alpaka::accMatchesTags< TODO \ + decltype(acc), \ + alpaka::TagCpuSyclIntel, \ + alpaka::TagFpgaSyclIntel, \ + alpaka::TagFpgaSyclXilinx, \ + alpaka::TagGpuSyclIntel, \ + alpaka::TagGenericSycl>) \ + acc.cout << "ALPAKA_CHECK failed because '!(" << #expression << ")'\n"; \ + else \ + printf("ALPAKA_CHECK failed because '!(%s)'\n", #expression);*/ \ success = false; \ } \ } while(0) diff --git a/include/alpaka/warp/WarpGenericSycl.hpp b/include/alpaka/warp/WarpGenericSycl.hpp index 33ac1884deb2..4df62480e001 100644 --- a/include/alpaka/warp/WarpGenericSycl.hpp +++ b/include/alpaka/warp/WarpGenericSycl.hpp @@ -1,4 +1,4 @@ -/* Copyright 2022 Jan Stephan +/* Copyright 2022 Jan Stephan, Luca Ferragina * SPDX-License-Identifier: MPL-2.0 */ diff --git a/include/alpaka/workdiv/WorkDivGenericSycl.hpp b/include/alpaka/workdiv/WorkDivGenericSycl.hpp index c6d1c114a3c9..cb4f003fa211 100644 --- a/include/alpaka/workdiv/WorkDivGenericSycl.hpp +++ b/include/alpaka/workdiv/WorkDivGenericSycl.hpp @@ -1,4 +1,4 @@ -/* Copyright 2022 Jan Stephan +/* Copyright 2022 Jan Stephan, Luca Ferragina * SPDX-License-Identifier: MPL-2.0 */ From 5b1b2b7b7750c4ff4d3b41f3df22f74fa98ffa94 Mon Sep 17 00:00:00 2001 From: AuroraPerego Date: Sun, 2 Apr 2023 19:06:13 +0200 Subject: [PATCH 02/10] Rewrite the SYCL backend for the SYCL 2020 standard and USM allocations (part 2) More changes to the SYCL backend: - move printf to alpaka/core and use it in ALPAKA_CHECK; - remove IsView -> false in mem/buf/sycl/Accessor; - remove wrong attribute in mem/buf/sycl/Copy; - remove the SYCL experimental BuildAccessor, use the default implementation from alpaka/mem/view. Fix the examples to work with the SYCL backend: - fix the accelerator in the vectorAdd example; - move AccCpuSerial at the end in the ExampleDefaultAcc, as it was preventing the SYCL accelerators from being selected. Complete the work on the SYCL random number generators. --- example/vectorAdd/src/vectorAdd.cpp | 7 +- include/alpaka/acc/AccGenericSycl.hpp | 8 +- include/alpaka/alpaka.hpp | 3 +- include/alpaka/core/Sycl.hpp | 22 +++- include/alpaka/example/ExampleDefaultAcc.hpp | 6 +- include/alpaka/mem/buf/sycl/Accessor.hpp | 26 +---- include/alpaka/mem/buf/sycl/Copy.hpp | 7 +- include/alpaka/meta/SyclPrintf.hpp | 35 ------- include/alpaka/rand/RandGenericSycl.hpp | 100 +++++++++---------- include/alpaka/test/Check.hpp | 43 ++------ 10 files changed, 93 insertions(+), 164 deletions(-) delete mode 100644 include/alpaka/meta/SyclPrintf.hpp diff --git a/example/vectorAdd/src/vectorAdd.cpp b/example/vectorAdd/src/vectorAdd.cpp index 676d0041018b..a67362200189 100644 --- a/example/vectorAdd/src/vectorAdd.cpp +++ b/example/vectorAdd/src/vectorAdd.cpp @@ -1,4 +1,5 @@ -/* Copyright 2023 Benjamin Worpitz, Matthias Werner, Bernhard Manfred Gruber, Jan Stephan, Luca Ferragina +/* Copyright 2023 Benjamin Worpitz, Matthias Werner, Bernhard Manfred Gruber, Jan Stephan, Luca Ferragina, + * Aurora Perego * SPDX-License-Identifier: ISC */ @@ -75,9 +76,7 @@ auto main() -> int // - AccCpuTbbBlocks // - AccCpuSerial // using Acc = alpaka::AccCpuSerial; - // using Acc = alpaka::ExampleDefaultAcc; - - using Acc = alpaka::AccGpuSyclIntel; + using Acc = alpaka::ExampleDefaultAcc; using Pltf = alpaka::Pltf; using DevAcc = alpaka::Dev; std::cout << "Using alpaka accelerator: " << alpaka::getAccName() << std::endl; diff --git a/include/alpaka/acc/AccGenericSycl.hpp b/include/alpaka/acc/AccGenericSycl.hpp index 8a7d0972ddd7..fe3a28bc1828 100644 --- a/include/alpaka/acc/AccGenericSycl.hpp +++ b/include/alpaka/acc/AccGenericSycl.hpp @@ -1,4 +1,4 @@ -/* Copyright 2023 Jan Stephan, Antonio Di Pilato, Andrea Bocci, Luca Ferragina +/* Copyright 2023 Jan Stephan, Antonio Di Pilato, Andrea Bocci, Luca Ferragina, Aurora Perego * SPDX-License-Identifier: MPL-2.0 */ @@ -15,7 +15,6 @@ #include "alpaka/intrinsic/IntrinsicGenericSycl.hpp" #include "alpaka/math/MathGenericSycl.hpp" #include "alpaka/mem/fence/MemFenceGenericSycl.hpp" -#include "alpaka/meta/SyclPrintf.hpp" #include "alpaka/rand/RandGenericSycl.hpp" #include "alpaka/warp/WarpGenericSycl.hpp" #include "alpaka/workdiv/WorkDivGenericSycl.hpp" @@ -58,7 +57,7 @@ namespace alpaka , public BlockSyncGenericSycl , public IntrinsicGenericSycl , public MemFenceGenericSycl - , public rand::RandGenericSycl + , public rand::RandGenericSycl , public warp::WarpGenericSycl { public: @@ -86,7 +85,7 @@ namespace alpaka , BlockSyncGenericSycl{work_item} , IntrinsicGenericSycl{} , MemFenceGenericSycl{global_fence_dummy, local_fence_dummy} - , rand::RandGenericSycl{} + , rand::RandGenericSycl{work_item} , warp::WarpGenericSycl{work_item} , cout{output_stream} { @@ -111,6 +110,7 @@ namespace alpaka , BlockSyncGenericSycl{work_item} , IntrinsicGenericSycl{} , MemFenceGenericSycl{global_fence_dummy, local_fence_dummy} + , rand::RandGenericSycl{work_item} , warp::WarpGenericSycl{work_item} { } diff --git a/include/alpaka/alpaka.hpp b/include/alpaka/alpaka.hpp index c96d396ac4c4..3262ea360edc 100644 --- a/include/alpaka/alpaka.hpp +++ b/include/alpaka/alpaka.hpp @@ -1,5 +1,5 @@ /* Copyright 2023 Axel Hübl, Benjamin Worpitz, Erik Zenker, Matthias Werner, René Widera, Bernhard Manfred Gruber, - * Jan Stephan, Antonio Di Pilato, Luca Ferragina + * Jan Stephan, Antonio Di Pilato, Luca Ferragina, Aurora Perego * SPDX-License-Identifier: MPL-2.0 */ @@ -172,7 +172,6 @@ #include "alpaka/meta/NdLoop.hpp" #include "alpaka/meta/NonZero.hpp" #include "alpaka/meta/Set.hpp" -#include "alpaka/meta/SyclPrintf.hpp" #include "alpaka/meta/Transform.hpp" #include "alpaka/meta/TypeListOps.hpp" // offset diff --git a/include/alpaka/core/Sycl.hpp b/include/alpaka/core/Sycl.hpp index 3f067f5f8b39..983140770217 100644 --- a/include/alpaka/core/Sycl.hpp +++ b/include/alpaka/core/Sycl.hpp @@ -1,4 +1,4 @@ -/* Copyright 2022 Jan Stephan, Luca Ferragina +/* Copyright 2022 Jan Stephan, Luca Ferragina, Aurora Perego * SPDX-License-Identifier: MPL-2.0 */ @@ -13,6 +13,7 @@ #include #include +#include // the #define printf(...) breaks if it is included afterwards #include #include #include @@ -23,6 +24,25 @@ # include +// if SYCL is enabled with the AMD backend the printf will be killed because of missing compiler support +# ifdef __AMDGCN__ +# define printf(...) +# else + +# ifdef __SYCL_DEVICE_ONLY__ +using AlpakaFormat = char const* [[clang::opencl_constant]]; +# else +using AlpakaFormat = char const*; +# endif +# define printf(FORMAT, ...) \ + do \ + { \ + static const auto format = AlpakaFormat{FORMAT}; \ + sycl::ext::oneapi::experimental::printf(format, ##__VA_ARGS__); \ + } while(false) + +# endif + // SYCL vector types trait specializations. namespace alpaka { diff --git a/include/alpaka/example/ExampleDefaultAcc.hpp b/include/alpaka/example/ExampleDefaultAcc.hpp index f90b3653c9f4..ef0eb80fb380 100644 --- a/include/alpaka/example/ExampleDefaultAcc.hpp +++ b/include/alpaka/example/ExampleDefaultAcc.hpp @@ -1,4 +1,4 @@ -/* Copyright 2023 Jeffrey Kelling, Bernhard Manfred Gruber, Jan Stephan +/* Copyright 2023 Jeffrey Kelling, Bernhard Manfred Gruber, Jan Stephan, Aurora Perego * SPDX-License-Identifier: MPL-2.0 */ @@ -24,8 +24,6 @@ namespace alpaka using ExampleDefaultAcc = alpaka::AccCpuOmp2Threads; #elif defined(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED) using ExampleDefaultAcc = alpaka::AccCpuThreads; -#elif defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED) - using ExampleDefaultAcc = alpaka::AccCpuSerial; #elif defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_BACKEND_ONEAPI) # if defined(ALPAKA_SYCL_ONEAPI_CPU) using ExampleDefaultAcc = alpaka::AccCpuSyclIntel; @@ -34,6 +32,8 @@ namespace alpaka # elif defined(ALPAKA_SYCL_ONEAPI_GPU) using ExampleDefaultAcc = alpaka::AccGpuSyclIntel; # endif +#elif defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED) + using ExampleDefaultAcc = alpaka::AccCpuSerial; #else class ExampleDefaultAcc; # warning "No supported backend selected." diff --git a/include/alpaka/mem/buf/sycl/Accessor.hpp b/include/alpaka/mem/buf/sycl/Accessor.hpp index 5b7aeba11dac..6d2dec54cb7c 100644 --- a/include/alpaka/mem/buf/sycl/Accessor.hpp +++ b/include/alpaka/mem/buf/sycl/Accessor.hpp @@ -1,4 +1,4 @@ -/* Copyright 2023 Jan Stephan, Andrea Bocci +/* Copyright 2023 Jan Stephan, Andrea Bocci, Aurora Perego * SPDX-License-Identifier: MPL-2.0 */ @@ -112,30 +112,6 @@ namespace alpaka SyclAccessor m_accessor; VecType extents; }; - - namespace experimental::trait - { - namespace internal - { - template - struct IsView> : std::false_type - { - }; - } // namespace internal - - template - struct BuildAccessor> - { - template - static auto buildAccessor(BufGenericSycl& buffer) - { - using SyclAccessor = detail::SyclAccessor; - return Accessor{ - SyclAccessor{buffer.m_buffer}, - buffer.m_extentElements}; - } - }; - } // namespace experimental::trait } // namespace alpaka #endif diff --git a/include/alpaka/mem/buf/sycl/Copy.hpp b/include/alpaka/mem/buf/sycl/Copy.hpp index cfede497adb1..6ecb8d5f4836 100644 --- a/include/alpaka/mem/buf/sycl/Copy.hpp +++ b/include/alpaka/mem/buf/sycl/Copy.hpp @@ -1,4 +1,4 @@ -/* Copyright 2022 Jan Stephan, Luca Ferragina, Aurora Perego +/* Copyright 2023 Jan Stephan, Luca Ferragina, Aurora Perego * SPDX-License-Identifier: MPL-2.0 */ @@ -146,10 +146,7 @@ namespace alpaka::detail using Elem = alpaka::Elem; template - ALPAKA_FN_HOST TaskCopySycl( - TViewDstFwd&& viewDst, - TViewSrc const& viewSrc, - [[maybe_unused]] TExtent const& extent) + ALPAKA_FN_HOST TaskCopySycl(TViewDstFwd&& viewDst, TViewSrc const& viewSrc, TExtent const& extent) : m_dstMemNative(reinterpret_cast(getPtrNative(viewDst))) , m_srcMemNative(reinterpret_cast(getPtrNative(viewSrc))) { diff --git a/include/alpaka/meta/SyclPrintf.hpp b/include/alpaka/meta/SyclPrintf.hpp deleted file mode 100644 index ef432e15a87f..000000000000 --- a/include/alpaka/meta/SyclPrintf.hpp +++ /dev/null @@ -1,35 +0,0 @@ -/* Copyright 2022 Andrea Bocci - * - * This file is part of alpaka. - * - * This Source Code Form is subject to the terms of the Mozilla Public - * License, v. 2.0. If a copy of the MPL was not distributed with this - * file, You can obtain one at http://mozilla.org/MPL/2.0/. - */ - -#pragma once - -#include - -#ifdef ALPAKA_ACC_SYCL_ENABLED - -// Kill printf in AMD GPU code because of missing compiler support -# ifdef __AMDGCN__ -# include // the define breaks if it is included afterwards -# define printf(...) -# else - -# ifdef __SYCL_DEVICE_ONLY__ -# define CONSTANT __attribute__((opencl_constant)) -# else -# define CONSTANT -# endif - -# define printf(FORMAT, ...) \ - do \ - { \ - static const CONSTANT char format[] = FORMAT; \ - sycl::ext::oneapi::experimental::printf(format, ##__VA_ARGS__); \ - } while(false) -# endif -#endif diff --git a/include/alpaka/rand/RandGenericSycl.hpp b/include/alpaka/rand/RandGenericSycl.hpp index bd274e9a526a..0901090853a8 100644 --- a/include/alpaka/rand/RandGenericSycl.hpp +++ b/include/alpaka/rand/RandGenericSycl.hpp @@ -1,10 +1,5 @@ -/* Copyright 2022 Luca Ferragina - * - * This file is part of alpaka. - * - * This Source Code Form is subject to the terms of the Mozilla Public - * License, v. 2.0. If a copy of the MPL was not distributed with this - * file, You can obtain one at http://mozilla.org/MPL/2.0/. +/* Copyright 2023 Luca Ferragina, Aurora Perego + * SPDX-License-Identifier: MPL-2.0 */ #pragma once @@ -17,24 +12,27 @@ # include // Backend specific imports. -# if defined(ALPAKA_ACC_SYCL_ENABLED) -# include -# include - -# include -# endif +# include +# include +# include # include namespace alpaka::rand { //! The SYCL rand implementation. - class RandGenericSycl : public concepts::Implements + template + class RandGenericSycl : public concepts::Implements> { + public: + RandGenericSycl(sycl::nd_item my_item) : m_item{my_item} + { + } + + sycl::nd_item m_item; }; # if !defined(ALPAKA_HOST_ONLY) - namespace distribution::sycl_rand { //! The SYCL random number floating point normal distribution. @@ -53,6 +51,7 @@ namespace alpaka::rand namespace engine::sycl_rand { //! The SYCL linear congruential random number generator engine. + template class Minstd { public: @@ -60,9 +59,9 @@ namespace alpaka::rand // need to be overwritten with a valid object Minstd() = default; - Minstd(std::uint32_t const& seed, std::uint32_t const& subsequence = 0, std::uint32_t const& offset = 0) + Minstd(RandGenericSycl rand, std::uint32_t const& seed) { - oneapi::dpl::minstd_rand engine(seed, offset); + oneapi::dpl::minstd_rand engine(seed, rand.m_item.get_global_linear_id()); rng_engine = engine; } @@ -77,19 +76,20 @@ namespace alpaka::rand oneapi::dpl::minstd_rand rng_engine; public: - using result_type = decltype(rng_engine); - - // ALPAKA_FN_HOST_ACC constexpr static result_type min() - // { - // return std::numeric_limits::min(); - // } - // ALPAKA_FN_HOST_ACC constexpr static result_type max() - // { - // return std::numeric_limits::max(); - // } + using result_type = float; + + ALPAKA_FN_HOST_ACC static result_type min() + { + return std::numeric_limits::min(); + } + ALPAKA_FN_HOST_ACC static result_type max() + { + return std::numeric_limits::max(); + } result_type operator()() { - return rng_engine; + oneapi::dpl::uniform_real_distribution distr; + return distr(rng_engine); } }; } // namespace engine::sycl_rand @@ -108,7 +108,7 @@ namespace alpaka::rand oneapi::dpl::normal_distribution distr; // Generate float random number - return distr(engine); + return distr(engine.rng_engine); } }; @@ -124,7 +124,7 @@ namespace alpaka::rand oneapi::dpl::normal_distribution distr; // Generate float random number - return distr(engine); + return distr(engine.rng_engine); } }; @@ -140,10 +140,7 @@ namespace alpaka::rand oneapi::dpl::uniform_real_distribution distr; // Generate float random number - return distr(engine); - // NOTE: (1.0f - curand_uniform) does not work, because curand_uniform seems to return - // denormalized floats around 0.f. [0.f, 1.0f) - // return fUniformRand * static_cast(fUniformRand != 1.0f); + return distr(engine.rng_engine); } }; @@ -159,10 +156,7 @@ namespace alpaka::rand oneapi::dpl::uniform_real_distribution distr; // Generate float random number - return distr(engine); - // NOTE: (1.0f - curand_uniform_double) does not work, because curand_uniform_double seems to - // return denormalized floats around 0.f. [0.f, 1.0f) - // return fUniformRand * static_cast(fUniformRand != 1.0); + return distr(engine.rng_engine); } }; @@ -178,7 +172,7 @@ namespace alpaka::rand oneapi::dpl::uniform_int_distribution distr; // Generate float random number - return distr(engine); + return distr(engine.rng_engine); } }; } // namespace distribution::sycl_rand @@ -186,30 +180,30 @@ namespace alpaka::rand namespace distribution::trait { //! The SYCL random number float normal distribution get trait specialization. - template - struct CreateNormalReal>> + template + struct CreateNormalReal, T, std::enable_if_t>> { - static auto createNormalReal(RandGenericSycl const& /*rand*/) -> sycl_rand::NormalReal + static auto createNormalReal(RandGenericSycl const& /*rand*/) -> sycl_rand::NormalReal { return {}; } }; //! The SYCL random number float uniform distribution get trait specialization. - template - struct CreateUniformReal>> + template + struct CreateUniformReal, T, std::enable_if_t>> { - static auto createUniformReal(RandGenericSycl const& /*rand*/) -> sycl_rand::UniformReal + static auto createUniformReal(RandGenericSycl const& /*rand*/) -> sycl_rand::UniformReal { return {}; } }; //! The SYCL random number integer uniform distribution get trait specialization. - template - struct CreateUniformUint>> + template + struct CreateUniformUint, T, std::enable_if_t>> { - static auto createUniformUint(RandGenericSycl const& /*rand*/) -> sycl_rand::UniformUint + static auto createUniformUint(RandGenericSycl const& /*rand*/) -> sycl_rand::UniformUint { return {}; } @@ -219,16 +213,16 @@ namespace alpaka::rand namespace engine::trait { //! The SYCL random number default generator get trait specialization. - template<> - struct CreateDefault + template + struct CreateDefault> { static auto createDefault( - RandGenericSycl const& /*rand*/, + RandGenericSycl const& rand, std::uint32_t const& seed = 0, std::uint32_t const& subsequence = 0, - std::uint32_t const& offset = 0) -> sycl_rand::Minstd + std::uint32_t const& offset = 0) -> sycl_rand::Minstd { - return {seed, subsequence, offset}; + return {rand, seed}; } }; } // namespace engine::trait diff --git a/include/alpaka/test/Check.hpp b/include/alpaka/test/Check.hpp index 28f064701ccb..39545e764923 100644 --- a/include/alpaka/test/Check.hpp +++ b/include/alpaka/test/Check.hpp @@ -1,40 +1,19 @@ -/* Copyright 2022 Benjamin Worpitz, Jan Stephan, Luca Ferragina +/* Copyright 2023 Benjamin Worpitz, Jan Stephan, Luca Ferragina, Aurora Perego * SPDX-License-Identifier: MPL-2.0 */ #pragma once +#include "alpaka/core/Sycl.hpp" + #include -// TODO: SYCL doesn't have a way to detect if we're looking at device or host code. This needs a workaround so that -// SYCL and other back-ends are compatible. -#ifdef ALPAKA_ACC_SYCL_ENABLED -# define ALPAKA_CHECK(success, expression) \ - do \ - { \ - if(!(expression)) \ - { \ - /*if constexpr(alpaka::accMatchesTags< TODO \ - decltype(acc), \ - alpaka::TagCpuSyclIntel, \ - alpaka::TagFpgaSyclIntel, \ - alpaka::TagFpgaSyclXilinx, \ - alpaka::TagGpuSyclIntel, \ - alpaka::TagGenericSycl>) \ - acc.cout << "ALPAKA_CHECK failed because '!(" << #expression << ")'\n"; \ - else \ - printf("ALPAKA_CHECK failed because '!(%s)'\n", #expression);*/ \ - success = false; \ - } \ - } while(0) -#else -# define ALPAKA_CHECK(success, expression) \ - do \ +#define ALPAKA_CHECK(success, expression) \ + do \ + { \ + if(!(expression)) \ { \ - if(!(expression)) \ - { \ - printf("ALPAKA_CHECK failed because '!(%s)'\n", #expression); \ - success = false; \ - } \ - } while(0) -#endif + printf("ALPAKA_CHECK failed because '!(%s)'\n", #expression); \ + success = false; \ + } \ + } while(0) From 97ebacdc87493a9c308597f82c0510f5168d87e9 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Tue, 13 Jun 2023 20:47:05 +0200 Subject: [PATCH 03/10] Rewrite the SYCL backend for the SYCL 2020 standard and USM allocations (part 3) Update the documentation. Implement various fixes to the SYCL math functions: - add missing "if constexpr" to rsqrt(); - do not call math function with mixed arguments; this fixes errors due to the implicit conversion between floating point types of different sizes in sycl::atan2() and sycl::pow(); - add explicit type casts to silence warnings; - cast the result of isfinite/isinf/isnan to bool. Implement various fixes to the SYCL atomic functions: - fix the cas/compare_exchange loops; - clarify which atomic types are supported. Implement various fixes to the SYCL warp-level functions: - fix compilation warnings; - extract bits from sub_group_mask. Mark the use of global device variables and constants as undupported: the SYCL backend does not support global device variables and constants, yet. Add explicit checks on the dimensionality of the SYCL accelerator and work division. Silence warnings about the use of GNU extensions, and those coming from the Intel oneMKL and oneDPL headers. Update more tests for the SYCL backend: - add a special case for 0-dimensional tests; - disable the use of STL rand; - disable the test of global device variables and constants. --- README_SYCL.md | 10 +++- include/alpaka/acc/AccGenericSycl.hpp | 2 + include/alpaka/atomic/AtomicGenericSycl.hpp | 57 ++++++++++--------- include/alpaka/core/Common.hpp | 6 +- include/alpaka/core/Sycl.hpp | 14 ++++- include/alpaka/math/MathGenericSycl.hpp | 40 ++++++++----- include/alpaka/mem/view/ViewPlainPtr.hpp | 15 ++++- include/alpaka/rand/RandGenericSycl.hpp | 22 ++++++- include/alpaka/warp/WarpGenericSycl.hpp | 33 +++++++---- include/alpaka/workdiv/WorkDivGenericSycl.hpp | 14 +++-- test/unit/mem/view/src/ViewStaticAccMem.cpp | 24 +++++++- test/unit/rand/src/RandTest.cpp | 4 +- 12 files changed, 169 insertions(+), 72 deletions(-) diff --git a/README_SYCL.md b/README_SYCL.md index eead8a8a1acc..82f620dca193 100644 --- a/README_SYCL.md +++ b/README_SYCL.md @@ -93,8 +93,12 @@ These can be used interchangeably (some restrictions apply - see below) with the ### Restrictions * The FPGA back-ends (both vendors) cannot be used together with the Intel CPU / GPU back-ends. This is because of the different compilation trajectory required for FPGAs and is unlikely to be fixed anytime soon. See [here](https://www.intel.com/content/www/us/en/develop/documentation/oneapi-programming-guide/top/programming-interface/fpga-flow/why-is-fpga-compilation-different.html) for an explanation. -* The SYCL back-end currently does not support passing pointers as kernel parameters. Use alpaka's experimental accessors instead. -* The SYCL back-end does not have device-side random number generation. * Similar to the CUDA and HIP back-ends the SYCL back-end only supports up to three kernel dimensions. * Some Intel GPUs do not support the `double` type for device code. alpaka will not check this. -* The FPGA back-end does not support atomics. alpaka will not check this. \ No newline at end of file + You can enable software emulation for `double` precision types with + ```bash + export IGC_EnableDPEmulation=1 + export OverrideDefaultFP64Settings=1 + ``` + See [Intel's FAQ](https://github.com/intel/compute-runtime/blob/master/opencl/doc/FAQ.md#feature-double-precision-emulation-fp64) for more information. +* The FPGA back-end does not support atomics. alpaka will not check this. diff --git a/include/alpaka/acc/AccGenericSycl.hpp b/include/alpaka/acc/AccGenericSycl.hpp index fe3a28bc1828..76a735459940 100644 --- a/include/alpaka/acc/AccGenericSycl.hpp +++ b/include/alpaka/acc/AccGenericSycl.hpp @@ -60,6 +60,8 @@ namespace alpaka , public rand::RandGenericSycl , public warp::WarpGenericSycl { + static_assert(TDim::value > 0, "The SYCL accelerator must have a dimension greater than zero."); + public: AccGenericSycl(AccGenericSycl const&) = delete; AccGenericSycl(AccGenericSycl&&) = delete; diff --git a/include/alpaka/atomic/AtomicGenericSycl.hpp b/include/alpaka/atomic/AtomicGenericSycl.hpp index 8864508d578b..a869248e53b4 100644 --- a/include/alpaka/atomic/AtomicGenericSycl.hpp +++ b/include/alpaka/atomic/AtomicGenericSycl.hpp @@ -98,17 +98,12 @@ namespace alpaka inline auto casWithCondition(T* const addr, TEval&& eval) { auto ref = TRef{*addr}; - auto old_val = ref.load(); - auto assumed = T{}; - do + // prefer compare_exchange_weak when in a loop, assuming that eval is not expensive + while(!ref.compare_exchange_weak(old_val, eval(old_val))) { - assumed = old_val; - auto const new_val = eval(old_val); - old_val = ref.compare_exchange_strong(assumed, new_val); - } while(assumed != old_val); - + } return old_val; } @@ -182,7 +177,9 @@ namespace alpaka::trait template struct AtomicOp { - static_assert(std::is_integral_v || std::is_floating_point_v, "SYCL atomics do not support this type"); + static_assert( + (std::is_integral_v || std::is_floating_point_v) &&(sizeof(T) == 4 || sizeof(T) == 8), + "SYCL atomics do not support this type"); static auto atomicOp(AtomicGenericSycl const&, T* const addr, T const& value) -> T { @@ -195,11 +192,14 @@ namespace alpaka::trait template struct AtomicOp { - static_assert(std::is_unsigned_v, "atomicInc only supported for unsigned types"); + static_assert( + std::is_unsigned_v && (sizeof(T) == 4 || sizeof(T) == 8), + "SYCL atomics support only 32- and 64-bits unsigned integral types"); static auto atomicOp(AtomicGenericSycl const&, T* const addr, T const& value) -> T { - auto inc = [&value](auto old_val) { return (old_val >= value) ? static_cast(0) : (old_val + 1u); }; + auto inc = [&value](auto old_val) + { return (old_val >= value) ? static_cast(0) : (old_val + static_cast(1)); }; if(auto ptr = alpaka::detail::get_global_ptr(addr); ptr != nullptr) return alpaka::detail::casWithCondition>(addr, inc); else @@ -212,12 +212,14 @@ namespace alpaka::trait template struct AtomicOp { - static_assert(std::is_unsigned_v, "atomicDec only supported for unsigned types"); + static_assert( + std::is_unsigned_v && (sizeof(T) == 4 || sizeof(T) == 8), + "SYCL atomics support only 32- and 64-bits unsigned integral types"); static auto atomicOp(AtomicGenericSycl const&, T* const addr, T const& value) -> T { - auto dec - = [&value](auto& old_val) { return ((old_val == 0) || (old_val > value)) ? value : (old_val - 1u); }; + auto dec = [&value](auto& old_val) + { return ((old_val == 0) || (old_val > value)) ? value : (old_val - static_cast(1)); }; if(auto ptr = alpaka::detail::get_global_ptr(addr); ptr != nullptr) return alpaka::detail::casWithCondition>(addr, dec); else @@ -275,22 +277,21 @@ namespace alpaka::trait { static_assert(std::is_integral_v || std::is_floating_point_v, "SYCL atomics do not support this type"); - static auto atomicOp(AtomicGenericSycl const&, T* const addr, T const& compare, T const& value) -> T + static auto atomicOp(AtomicGenericSycl const&, T* const addr, T const& expected, T const& desired) -> T { - auto cas = [&compare, &value](auto& ref) + auto cas = [&expected, &desired](auto& ref) { - // SYCL stores the value in *addr to the "compare" parameter if the values are not equal. Since - // alpaka's interface does not expect this we need to copy "compare" to this function and forget it - // afterwards. - auto tmp = compare; - - // We always want to return the old value at the end. - const auto old = ref.load(); - - // This returns a bool telling us if the exchange happened or not. Useless in this case. - ref.compare_exchange_strong(tmp, value); - - return old; + auto expected_ = expected; + // Atomically compares the value of `ref` with the value of `expected`. + // If the values are equal, replaces the value of `ref` with `desired`. + // Otherwise updates `expected` with the value of `ref`. + // Returns a bool telling us if the exchange happened or not, but the Alpaka API does not make use of + // it. + ref.compare_exchange_strong(expected_, desired); + + // If the update succeded, return the previous value of `ref`. + // Otherwise, return the current value of `ref`. + return expected_; }; if(auto ptr = alpaka::detail::get_global_ptr(addr); ptr != nullptr) diff --git a/include/alpaka/core/Common.hpp b/include/alpaka/core/Common.hpp index 50b74df690be..4b6b64de2e1c 100644 --- a/include/alpaka/core/Common.hpp +++ b/include/alpaka/core/Common.hpp @@ -1,4 +1,4 @@ -/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner +/* Copyright 2023 Axel Huebl, Benjamin Worpitz, Matthias Werner, Andrea Bocci * SPDX-License-Identifier: MPL-2.0 */ @@ -117,6 +117,8 @@ #if((BOOST_LANG_CUDA && BOOST_COMP_CLANG_CUDA) || (BOOST_LANG_CUDA && BOOST_COMP_NVCC && BOOST_ARCH_PTX) \ || BOOST_LANG_HIP) # define ALPAKA_STATIC_ACC_MEM_GLOBAL __device__ +#elif defined(ALPAKA_ACC_SYCL_ENABLED) +# define ALPAKA_STATIC_ACC_MEM_GLOBAL _Pragma("GCC error \"The SYCL backend does not support global device variables.\"")) #else # define ALPAKA_STATIC_ACC_MEM_GLOBAL #endif @@ -153,6 +155,8 @@ #if((BOOST_LANG_CUDA && BOOST_COMP_CLANG_CUDA) || (BOOST_LANG_CUDA && BOOST_COMP_NVCC && BOOST_ARCH_PTX) \ || BOOST_LANG_HIP) # define ALPAKA_STATIC_ACC_MEM_CONSTANT __constant__ +#elif defined(ALPAKA_ACC_SYCL_ENABLED) +# define ALPAKA_STATIC_ACC_MEM_CONSTANT _Pragma("GCC error \"The SYCL backend does not support global device constants.\"")) #else # define ALPAKA_STATIC_ACC_MEM_CONSTANT #endif diff --git a/include/alpaka/core/Sycl.hpp b/include/alpaka/core/Sycl.hpp index 983140770217..409f9951a48b 100644 --- a/include/alpaka/core/Sycl.hpp +++ b/include/alpaka/core/Sycl.hpp @@ -1,4 +1,4 @@ -/* Copyright 2022 Jan Stephan, Luca Ferragina, Aurora Perego +/* Copyright 2023 Jan Stephan, Luca Ferragina, Aurora Perego, Andrea Bocci * SPDX-License-Identifier: MPL-2.0 */ @@ -34,13 +34,23 @@ using AlpakaFormat = char const* [[clang::opencl_constant]]; # else using AlpakaFormat = char const*; # endif + +# if BOOST_COMP_CLANG +# pragma clang diagnostic push +# pragma clang diagnostic ignored "-Wgnu-zero-variadic-macro-arguments" +# endif + # define printf(FORMAT, ...) \ do \ { \ - static const auto format = AlpakaFormat{FORMAT}; \ + static auto const format = AlpakaFormat{FORMAT}; \ sycl::ext::oneapi::experimental::printf(format, ##__VA_ARGS__); \ } while(false) +# if BOOST_COMP_CLANG +# pragma clang diagnostic pop +# endif + # endif // SYCL vector types trait specializations. diff --git a/include/alpaka/math/MathGenericSycl.hpp b/include/alpaka/math/MathGenericSycl.hpp index d48b32b1704f..3449a0dbcc15 100644 --- a/include/alpaka/math/MathGenericSycl.hpp +++ b/include/alpaka/math/MathGenericSycl.hpp @@ -1,4 +1,4 @@ -/* Copyright 2022 Jan Stephan, Luca Ferragina +/* Copyright 2023 Jan Stephan, Luca Ferragina, Andrea Bocci * SPDX-License-Identifier: MPL-2.0 */ @@ -279,7 +279,7 @@ namespace alpaka::math::trait if constexpr(std::is_integral_v) return sycl::atan2(0.0, static_cast(argument)); else if constexpr(std::is_floating_point_v) - return sycl::atan2(TArgument{0.0}, argument); + return sycl::atan2(static_cast(0.0), argument); else static_assert(!sizeof(TArgument), "Unsupported data type"); } @@ -333,9 +333,11 @@ namespace alpaka::math::trait Tx, std::enable_if_t && std::is_floating_point_v>> { + using TCommon = std::common_type_t; + auto operator()(math::Atan2GenericSycl const&, Ty const& y, Tx const& x) { - return sycl::atan2(y, x); + return sycl::atan2(static_cast(y), static_cast(x)); } }; @@ -432,9 +434,11 @@ namespace alpaka::math::trait Ty, std::enable_if_t && std::is_floating_point_v>> { + using TCommon = std::common_type_t; + auto operator()(math::FmodGenericSycl const&, Tx const& x, Ty const& y) { - return sycl::fmod(x, y); + return sycl::fmod(static_cast(x), static_cast(y)); } }; @@ -444,7 +448,7 @@ namespace alpaka::math::trait { auto operator()(math::IsfiniteGenericSycl const&, TArg const& arg) { - return sycl::isfinite(arg); + return static_cast(sycl::isfinite(arg)); } }; @@ -454,7 +458,7 @@ namespace alpaka::math::trait { auto operator()(math::IsinfGenericSycl const&, TArg const& arg) { - return sycl::isinf(arg); + return static_cast(sycl::isinf(arg)); } }; @@ -464,7 +468,7 @@ namespace alpaka::math::trait { auto operator()(math::IsnanGenericSycl const&, TArg const& arg) { - return sycl::isnan(arg); + return static_cast(sycl::isnan(arg)); } }; @@ -482,18 +486,20 @@ namespace alpaka::math::trait template struct Max && std::is_arithmetic_v>> { + using TCommon = std::common_type_t; + auto operator()(math::MaxGenericSycl const&, Tx const& x, Ty const& y) { if constexpr(std::is_integral_v && std::is_integral_v) - return sycl::max(x, y); + return sycl::max(static_cast(x), static_cast(y)); else if constexpr(std::is_floating_point_v && std::is_floating_point_v) - return sycl::fmax(x, y); + return sycl::fmax(static_cast(x), static_cast(y)); else if constexpr( (std::is_floating_point_v && std::is_integral_v) || (std::is_integral_v && std::is_floating_point_v) ) return sycl::fmax(static_cast(x), static_cast(y)); // mirror CUDA back-end else - static_assert(!sizeof(Tx), "Unsupported data type"); + static_assert(!sizeof(Tx), "Unsupported data types"); } }; @@ -512,7 +518,7 @@ namespace alpaka::math::trait || (std::is_integral_v && std::is_floating_point_v) ) return sycl::fmin(static_cast(x), static_cast(y)); // mirror CUDA back-end else - static_assert(!sizeof(Tx), "Unsupported data type"); + static_assert(!sizeof(Tx), "Unsupported data types"); } }; @@ -524,9 +530,11 @@ namespace alpaka::math::trait TExp, std::enable_if_t && std::is_floating_point_v>> { + using TCommon = std::common_type_t; + auto operator()(math::PowGenericSycl const&, TBase const& base, TExp const& exp) { - return sycl::pow(base, exp); + return sycl::pow(static_cast(base), static_cast(exp)); } }; @@ -538,9 +546,11 @@ namespace alpaka::math::trait Ty, std::enable_if_t && std::is_floating_point_v>> { + using TCommon = std::common_type_t; + auto operator()(math::RemainderGenericSycl const&, Tx const& x, Ty const& y) { - return sycl::remainder(x, y); + return sycl::remainder(static_cast(x), static_cast(y)); } }; @@ -580,9 +590,9 @@ namespace alpaka::math::trait { auto operator()(math::RsqrtGenericSycl const&, TArg const& arg) { - if(std::is_floating_point_v) + if constexpr(std::is_floating_point_v) return sycl::rsqrt(arg); - else if(std::is_integral_v) + else if constexpr(std::is_integral_v) return sycl::rsqrt(static_cast(arg)); // mirror CUDA back-end and use double for ints else static_assert(!sizeof(TArg), "Unsupported data type"); diff --git a/include/alpaka/mem/view/ViewPlainPtr.hpp b/include/alpaka/mem/view/ViewPlainPtr.hpp index a092a6b26f60..6db8cdc7b7d2 100644 --- a/include/alpaka/mem/view/ViewPlainPtr.hpp +++ b/include/alpaka/mem/view/ViewPlainPtr.hpp @@ -1,13 +1,15 @@ -/* Copyright 2023 Benjamin Worpitz, Matthias Werner, René Widera, Bernhard Manfred Gruber, Jan Stephan +/* Copyright 2023 Benjamin Worpitz, Matthias Werner, René Widera, Bernhard Manfred Gruber, Jan Stephan, Andrea Bocci * SPDX-License-Identifier: MPL-2.0 */ #pragma once #include "alpaka/dev/DevCpu.hpp" +#include "alpaka/dev/DevGenericSycl.hpp" #include "alpaka/dev/DevUniformCudaHipRt.hpp" #include "alpaka/mem/view/Traits.hpp" #include "alpaka/mem/view/ViewAccessOps.hpp" +#include "alpaka/meta/DependentFalseType.hpp" #include "alpaka/vec/Vec.hpp" #include @@ -178,6 +180,17 @@ namespace alpaka }; #endif +#if defined(ALPAKA_ACC_SYCL_ENABLED) + //! The SYCL device CreateStaticDevMemView trait specialization. + template + struct CreateStaticDevMemView> + { + static_assert( + meta::DependentFalseType::value, + "The SYCL backend does not support global device variables."); + }; +#endif + //! The CPU device CreateViewPlainPtr trait specialization. template<> struct CreateViewPlainPtr diff --git a/include/alpaka/rand/RandGenericSycl.hpp b/include/alpaka/rand/RandGenericSycl.hpp index 0901090853a8..35c788b449a7 100644 --- a/include/alpaka/rand/RandGenericSycl.hpp +++ b/include/alpaka/rand/RandGenericSycl.hpp @@ -1,4 +1,4 @@ -/* Copyright 2023 Luca Ferragina, Aurora Perego +/* Copyright 2023 Luca Ferragina, Aurora Perego, Andrea Bocci * SPDX-License-Identifier: MPL-2.0 */ @@ -13,9 +13,25 @@ // Backend specific imports. # include +# if BOOST_COMP_CLANG +# pragma clang diagnostic push +# pragma clang diagnostic ignored "-Wcast-align" +# pragma clang diagnostic ignored "-Wcast-qual" +# pragma clang diagnostic ignored "-Wextra-semi" +# pragma clang diagnostic ignored "-Wfloat-equal" +# pragma clang diagnostic ignored "-Wold-style-cast" +# pragma clang diagnostic ignored "-Wreserved-identifier" +# pragma clang diagnostic ignored "-Wreserved-macro-identifier" +# pragma clang diagnostic ignored "-Wsign-compare" +# pragma clang diagnostic ignored "-Wundef" +# endif # include # include +# if BOOST_COMP_CLANG +# pragma clang diagnostic pop +# endif + # include namespace alpaka::rand @@ -219,8 +235,8 @@ namespace alpaka::rand static auto createDefault( RandGenericSycl const& rand, std::uint32_t const& seed = 0, - std::uint32_t const& subsequence = 0, - std::uint32_t const& offset = 0) -> sycl_rand::Minstd + std::uint32_t const& /* subsequence */ = 0, + std::uint32_t const& /* offset */ = 0) -> sycl_rand::Minstd { return {rand, seed}; } diff --git a/include/alpaka/warp/WarpGenericSycl.hpp b/include/alpaka/warp/WarpGenericSycl.hpp index 4df62480e001..017c7a252fb4 100644 --- a/include/alpaka/warp/WarpGenericSycl.hpp +++ b/include/alpaka/warp/WarpGenericSycl.hpp @@ -1,4 +1,4 @@ -/* Copyright 2022 Jan Stephan, Luca Ferragina +/* Copyright 2023 Jan Stephan, Luca Ferragina, Andrea Bocci * SPDX-License-Identifier: MPL-2.0 */ @@ -43,12 +43,17 @@ namespace alpaka::warp::trait template struct Activemask> { + // FIXME This should be std::uint64_t on AMD GCN architectures. static auto activemask(warp::WarpGenericSycl const& warp) -> std::uint32_t { // SYCL has no way of querying this. Since sub-group functions have to be executed in convergent code // regions anyway we return the full mask. auto const sub_group = warp.m_item.get_sub_group(); - return sycl::ext::oneapi::group_ballot(sub_group, true); + auto const mask = sycl::ext::oneapi::group_ballot(sub_group, true); + // FIXME This should be std::uint64_t on AMD GCN architectures. + std::uint32_t bits = 0; + mask.extract_bits(bits); + return bits; } }; @@ -75,10 +80,15 @@ namespace alpaka::warp::trait template struct Ballot> { - static auto ballot(warp::WarpGenericSycl const& warp, std::int32_t predicate) + // FIXME This should be std::uint64_t on AMD GCN architectures. + static auto ballot(warp::WarpGenericSycl const& warp, std::int32_t predicate) -> std::uint32_t { auto const sub_group = warp.m_item.get_sub_group(); - return sycl::ext::oneapi::group_ballot(sub_group, static_cast(predicate)); + auto const mask = sycl::ext::oneapi::group_ballot(sub_group, static_cast(predicate)); + // FIXME This should be std::uint64_t on AMD GCN architectures. + std::uint32_t bits = 0; + mask.extract_bits(bits); + return bits; } }; @@ -88,6 +98,10 @@ namespace alpaka::warp::trait template static auto shfl(warp::WarpGenericSycl const& warp, T value, std::int32_t srcLane, std::int32_t width) { + ALPAKA_ASSERT_OFFLOAD(width > 0); + ALPAKA_ASSERT_OFFLOAD(srcLane < width); + ALPAKA_ASSERT_OFFLOAD(srcLane >= 0); + /* If width < srcLane the sub-group needs to be split into assumed subdivisions. The first item of each subdivision has the assumed index 0. The srcLane index is relative to the subdivisions. @@ -95,14 +109,9 @@ namespace alpaka::warp::trait The first starts at sub-group index 0 and the second at sub-group index 16. For srcLane = 4 the first subdivision will access the value at sub-group index 4 and the second at sub-group index 20. */ auto const actual_group = warp.m_item.get_sub_group(); - auto const actual_item_id = actual_group.get_local_linear_id(); - - auto const assumed_group_id = actual_item_id / width; - auto const assumed_item_id = actual_item_id % width; - - auto const assumed_src_id = static_cast(srcLane % width); - auto const actual_src_id = assumed_src_id + assumed_group_id * width; - + auto const actual_item_id = static_cast(actual_group.get_local_linear_id()); + auto const actual_group_id = actual_item_id / width; + auto const actual_src_id = static_cast(srcLane + actual_group_id * width); auto const src = sycl::id<1>{actual_src_id}; return sycl::select_from_group(actual_group, value, src); diff --git a/include/alpaka/workdiv/WorkDivGenericSycl.hpp b/include/alpaka/workdiv/WorkDivGenericSycl.hpp index cb4f003fa211..c7a2979815c4 100644 --- a/include/alpaka/workdiv/WorkDivGenericSycl.hpp +++ b/include/alpaka/workdiv/WorkDivGenericSycl.hpp @@ -1,4 +1,4 @@ -/* Copyright 2022 Jan Stephan, Luca Ferragina +/* Copyright 2023 Jan Stephan, Luca Ferragina, Andrea Bocci * SPDX-License-Identifier: MPL-2.0 */ @@ -18,6 +18,8 @@ namespace alpaka template class WorkDivGenericSycl : public concepts::Implements> { + static_assert(TDim::value > 0, "The SYCL work division must have a dimension greater than zero."); + public: using WorkDivBase = WorkDivGenericSycl; @@ -55,7 +57,9 @@ namespace alpaka::trait //! \return The number of blocks in each dimension of the grid. static auto getWorkDiv(WorkDivGenericSycl const& workDiv) -> Vec { - if constexpr(TDim::value == 1) + if constexpr(TDim::value == 0) + return Vec{}; + else if constexpr(TDim::value == 1) return Vec{static_cast(workDiv.my_item.get_group_range(0))}; else if constexpr(TDim::value == 2) { @@ -80,7 +84,9 @@ namespace alpaka::trait //! \return The number of threads in each dimension of a block. static auto getWorkDiv(WorkDivGenericSycl const& workDiv) -> Vec { - if constexpr(TDim::value == 1) + if constexpr(TDim::value == 0) + return Vec{}; + else if constexpr(TDim::value == 1) return Vec{static_cast(workDiv.my_item.get_local_range(0))}; else if constexpr(TDim::value == 2) { @@ -102,7 +108,7 @@ namespace alpaka::trait template struct GetWorkDiv, origin::Thread, unit::Elems> { - //! \return The number of blocks in each dimension of the grid. + //! \return The number of elements in each dimension of the thread. static auto getWorkDiv(WorkDivGenericSycl const& workDiv) -> Vec { return workDiv.m_threadElemExtent; diff --git a/test/unit/mem/view/src/ViewStaticAccMem.cpp b/test/unit/mem/view/src/ViewStaticAccMem.cpp index f26a88fdd8a0..ff8f0e48b245 100644 --- a/test/unit/mem/view/src/ViewStaticAccMem.cpp +++ b/test/unit/mem/view/src/ViewStaticAccMem.cpp @@ -1,4 +1,4 @@ -/* Copyright 2022 Axel Huebl, Benjamin Worpitz, Matthias Werner, Bernhard Manfred Gruber, Jan Stephan +/* Copyright 2023 Axel Huebl, Benjamin Worpitz, Matthias Werner, Bernhard Manfred Gruber, Jan Stephan, Andrea Bocci * SPDX-License-Identifier: MPL-2.0 */ @@ -16,6 +16,8 @@ using Elem = std::uint32_t; using Dim = alpaka::DimInt<2u>; using Idx = std::uint32_t; +#if !defined(ALPAKA_ACC_SYCL_ENABLED) + // These forward declarations are only necessary when you want to access those variables // from a different compilation unit and should be moved to a common header. // Here they are used to silence clang`s -Wmissing-variable-declarations warning @@ -40,10 +42,13 @@ struct StaticDeviceMemoryTestKernel } }; +#endif // !defined(ALPAKA_ACC_SYCL_ENABLED) + using TestAccs = alpaka::test::EnabledAccs; TEMPLATE_LIST_TEST_CASE("staticDeviceMemoryGlobal", "[viewStaticAccMem]", TestAccs) { +#if !defined(ALPAKA_ACC_SYCL_ENABLED) using Acc = TestType; using DevAcc = alpaka::Dev; @@ -75,8 +80,16 @@ TEMPLATE_LIST_TEST_CASE("staticDeviceMemoryGlobal", "[viewStaticAccMem]", TestAc REQUIRE(fixture(kernel, alpaka::getPtrNative(viewConstantMemUninitialized))); } + +#else // !defined(ALPAKA_ACC_SYCL_ENABLED) + + WARN("The SYCL backend does not support global device variables."); + +#endif // !defined(ALPAKA_ACC_SYCL_ENABLED) } +#if !defined(ALPAKA_ACC_SYCL_ENABLED) + // These forward declarations are only necessary when you want to access those variables // from a different compilation unit and should be moved to a common header. // Here they are used to silence clang`s -Wmissing-variable-declarations warning @@ -84,8 +97,11 @@ TEMPLATE_LIST_TEST_CASE("staticDeviceMemoryGlobal", "[viewStaticAccMem]", TestAc extern ALPAKA_STATIC_ACC_MEM_GLOBAL Elem g_globalMemory2DUninitialized[3][2]; ALPAKA_STATIC_ACC_MEM_GLOBAL Elem g_globalMemory2DUninitialized[3][2]; +#endif // !defined(ALPAKA_ACC_SYCL_ENABLED) + TEMPLATE_LIST_TEST_CASE("staticDeviceMemoryConstant", "[viewStaticAccMem]", TestAccs) { +#if !defined(ALPAKA_ACC_SYCL_ENABLED) using Acc = TestType; using DevAcc = alpaka::Dev; @@ -117,4 +133,10 @@ TEMPLATE_LIST_TEST_CASE("staticDeviceMemoryConstant", "[viewStaticAccMem]", Test REQUIRE(fixture(kernel, alpaka::getPtrNative(viewGlobalMemUninitialized))); } + +#else // !defined(ALPAKA_ACC_SYCL_ENABLED) + + WARN("The SYCL backend does not support global device constants."); + +#endif // !defined(ALPAKA_ACC_SYCL_ENABLED) } diff --git a/test/unit/rand/src/RandTest.cpp b/test/unit/rand/src/RandTest.cpp index 7bea6e930ea6..05311d438965 100644 --- a/test/unit/rand/src/RandTest.cpp +++ b/test/unit/rand/src/RandTest.cpp @@ -1,5 +1,5 @@ /* Copyright 2023 Axel Hübl, Benjamin Worpitz, Matthias Werner, René Widera, Jan Stephan, Bernhard Manfred Gruber, - * Sergei Bastrakov + * Sergei Bastrakov, Andrea Bocci * SPDX-License-Identifier: MPL-2.0 */ @@ -60,7 +60,7 @@ class RandTestKernel auto genDefault = alpaka::rand::engine::createDefault(acc, 12345u, 6789u); genNumbers(acc, success, genDefault); -#if !defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !defined(ALPAKA_ACC_GPU_HIP_ENABLED) +#if !defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !defined(ALPAKA_ACC_SYCL_ENABLED) // TODO: These ifdefs are wrong: They will reduce the test to the // smallest common denominator from all enabled backends // std::random_device From 085365ab8fe8cb4e07d1e1f46c708a642419966f Mon Sep 17 00:00:00 2001 From: Jan Stephan Date: Tue, 20 Jun 2023 13:40:00 +0200 Subject: [PATCH 04/10] Rewrite the SYCL backend for the SYCL 2020 standard and USM allocations (part 4) Update the documentation related to FPGAs. Various fixes and updates to the SYCL backend and tests, the copyright information and code formatting. --- README_SYCL.md | 2 +- .../dyn/BlockSharedMemDynGenericSycl.hpp | 4 +- .../shared/st/BlockSharedMemStGenericSycl.hpp | 1 + include/alpaka/core/BoostPredef.hpp | 10 +- include/alpaka/core/Common.hpp | 2 +- include/alpaka/math/MathGenericSycl.hpp | 2 +- include/alpaka/mem/buf/sycl/Copy.hpp | 7 +- include/alpaka/mem/view/ViewPlainPtr.hpp | 3 +- include/alpaka/pltf/PltfCpuSyclIntel.hpp | 11 +- include/alpaka/pltf/PltfGenericSycl.hpp | 137 +++++++++--------- include/alpaka/pltf/PltfGpuSyclIntel.hpp | 13 +- include/alpaka/rand/RandGenericSycl.hpp | 2 +- include/alpaka/test/dim/TestDims.hpp | 2 +- .../test/event/EventHostManualTrigger.hpp | 3 +- include/alpaka/test/queue/Queue.hpp | 2 +- test/unit/atomic/src/AtomicTest.cpp | 3 +- thirdParty/CMakeLists.txt | 2 +- 17 files changed, 104 insertions(+), 102 deletions(-) diff --git a/README_SYCL.md b/README_SYCL.md index 82f620dca193..9846f1b8768b 100644 --- a/README_SYCL.md +++ b/README_SYCL.md @@ -92,7 +92,7 @@ These can be used interchangeably (some restrictions apply - see below) with the ### Restrictions -* The FPGA back-ends (both vendors) cannot be used together with the Intel CPU / GPU back-ends. This is because of the different compilation trajectory required for FPGAs and is unlikely to be fixed anytime soon. See [here](https://www.intel.com/content/www/us/en/develop/documentation/oneapi-programming-guide/top/programming-interface/fpga-flow/why-is-fpga-compilation-different.html) for an explanation. +* The Intel FPGA back-end cannot be used together with the Intel CPU / GPU back-ends. This is because of the different compilation trajectory required for FPGAs and is unlikely to be fixed anytime soon. See [here](https://www.intel.com/content/www/us/en/develop/documentation/oneapi-programming-guide/top/programming-interface/fpga-flow/why-is-fpga-compilation-different.html) for an explanation. * Similar to the CUDA and HIP back-ends the SYCL back-end only supports up to three kernel dimensions. * Some Intel GPUs do not support the `double` type for device code. alpaka will not check this. You can enable software emulation for `double` precision types with diff --git a/include/alpaka/block/shared/dyn/BlockSharedMemDynGenericSycl.hpp b/include/alpaka/block/shared/dyn/BlockSharedMemDynGenericSycl.hpp index d13b78249867..1e4671ebbf72 100644 --- a/include/alpaka/block/shared/dyn/BlockSharedMemDynGenericSycl.hpp +++ b/include/alpaka/block/shared/dyn/BlockSharedMemDynGenericSycl.hpp @@ -35,9 +35,7 @@ namespace alpaka::trait { static auto getMem(BlockSharedMemDynGenericSycl const& shared) -> T* { - auto void_ptr = sycl::multi_ptr{shared.m_accessor}; - auto t_ptr = static_cast>(void_ptr); - return t_ptr.get(); + return reinterpret_cast(shared.m_accessor.get_pointer().get()); } }; } // namespace alpaka::trait diff --git a/include/alpaka/block/shared/st/BlockSharedMemStGenericSycl.hpp b/include/alpaka/block/shared/st/BlockSharedMemStGenericSycl.hpp index baae7b0c8825..abda79f7b049 100644 --- a/include/alpaka/block/shared/st/BlockSharedMemStGenericSycl.hpp +++ b/include/alpaka/block/shared/st/BlockSharedMemStGenericSycl.hpp @@ -7,6 +7,7 @@ #include "alpaka/block/shared/st/Traits.hpp" #include "alpaka/block/shared/st/detail/BlockSharedMemStMemberImpl.hpp" +#include #include #ifdef ALPAKA_ACC_SYCL_ENABLED diff --git a/include/alpaka/core/BoostPredef.hpp b/include/alpaka/core/BoostPredef.hpp index 39621e964b0f..9a54f33200f8 100644 --- a/include/alpaka/core/BoostPredef.hpp +++ b/include/alpaka/core/BoostPredef.hpp @@ -1,4 +1,4 @@ -/* Copyright 2022 Benjamin Worpitz, Matthias Werner, Jan Stephan +/* Copyright 2023 Benjamin Worpitz, Matthias Werner, Jan Stephan * SPDX-License-Identifier: MPL-2.0 */ @@ -68,3 +68,11 @@ # undef BOOST_COMP_PGI # define BOOST_COMP_PGI BOOST_COMP_PGI_EMULATED #endif + +// Intel LLVM compiler detection +#if !defined(BOOST_COMP_ICPX) +# if defined(SYCL_LANGUAGE_VERSION) && defined(__INTEL_LLVM_COMPILER) +// The version string for icpx 2023.1.0 is 20230100. In Boost.Predef this becomes (53,1,0). +# define BOOST_COMP_ICPX BOOST_PREDEF_MAKE_YYYYMMDD(__INTEL_LLVM_COMPILER) +# endif +#endif diff --git a/include/alpaka/core/Common.hpp b/include/alpaka/core/Common.hpp index 4b6b64de2e1c..bc7428af08db 100644 --- a/include/alpaka/core/Common.hpp +++ b/include/alpaka/core/Common.hpp @@ -1,4 +1,4 @@ -/* Copyright 2023 Axel Huebl, Benjamin Worpitz, Matthias Werner, Andrea Bocci +/* Copyright 2023 Axel Hübl, Benjamin Worpitz, Matthias Werner, Jan Stephan, René Widera, Andrea Bocci * SPDX-License-Identifier: MPL-2.0 */ diff --git a/include/alpaka/math/MathGenericSycl.hpp b/include/alpaka/math/MathGenericSycl.hpp index 3449a0dbcc15..ba99befbf66a 100644 --- a/include/alpaka/math/MathGenericSycl.hpp +++ b/include/alpaka/math/MathGenericSycl.hpp @@ -1,4 +1,4 @@ -/* Copyright 2023 Jan Stephan, Luca Ferragina, Andrea Bocci +/* Copyright 2023 Jan Stephan, Sergei Bastrakov, René Widera, Luca Ferragina, Andrea Bocci * SPDX-License-Identifier: MPL-2.0 */ diff --git a/include/alpaka/mem/buf/sycl/Copy.hpp b/include/alpaka/mem/buf/sycl/Copy.hpp index 6ecb8d5f4836..7373125c63c6 100644 --- a/include/alpaka/mem/buf/sycl/Copy.hpp +++ b/include/alpaka/mem/buf/sycl/Copy.hpp @@ -1,4 +1,4 @@ -/* Copyright 2023 Jan Stephan, Luca Ferragina, Aurora Perego +/* Copyright 2023 Jan Stephan, Bernhard Manfred Gruber, Luca Ferragina, Aurora Perego * SPDX-License-Identifier: MPL-2.0 */ @@ -146,7 +146,10 @@ namespace alpaka::detail using Elem = alpaka::Elem; template - ALPAKA_FN_HOST TaskCopySycl(TViewDstFwd&& viewDst, TViewSrc const& viewSrc, TExtent const& extent) + ALPAKA_FN_HOST TaskCopySycl( + TViewDstFwd&& viewDst, + TViewSrc const& viewSrc, + [[maybe_unused]] TExtent const& extent) : m_dstMemNative(reinterpret_cast(getPtrNative(viewDst))) , m_srcMemNative(reinterpret_cast(getPtrNative(viewSrc))) { diff --git a/include/alpaka/mem/view/ViewPlainPtr.hpp b/include/alpaka/mem/view/ViewPlainPtr.hpp index 6db8cdc7b7d2..7416385ea068 100644 --- a/include/alpaka/mem/view/ViewPlainPtr.hpp +++ b/include/alpaka/mem/view/ViewPlainPtr.hpp @@ -1,4 +1,5 @@ -/* Copyright 2023 Benjamin Worpitz, Matthias Werner, René Widera, Bernhard Manfred Gruber, Jan Stephan, Andrea Bocci +/* Copyright 2023 Benjamin Worpitz, Matthias Werner, René Widera, Sergei Bastrakov, Bernhard Manfred Gruber, + * Jan Stephan, Andrea Bocci * SPDX-License-Identifier: MPL-2.0 */ diff --git a/include/alpaka/pltf/PltfCpuSyclIntel.hpp b/include/alpaka/pltf/PltfCpuSyclIntel.hpp index 1c28a1288610..0c6489879c82 100644 --- a/include/alpaka/pltf/PltfCpuSyclIntel.hpp +++ b/include/alpaka/pltf/PltfCpuSyclIntel.hpp @@ -18,13 +18,7 @@ namespace alpaka { namespace detail { - // Prevent clang from annoying us with warnings about emitting too many vtables. These are discarded by the - // linker anyway. -# if BOOST_COMP_CLANG -# pragma clang diagnostic push -# pragma clang diagnostic ignored "-Wweak-vtables" -# endif - struct IntelCpuSelector final + struct IntelCpuSelector { auto operator()(sycl::device const& dev) const -> int { @@ -34,9 +28,6 @@ namespace alpaka return is_intel_cpu ? 1 : -1; } }; -# if BOOST_COMP_CLANG -# pragma clang diagnostic pop -# endif } // namespace detail //! The SYCL device manager. diff --git a/include/alpaka/pltf/PltfGenericSycl.hpp b/include/alpaka/pltf/PltfGenericSycl.hpp index bd7f351c17a8..914b3c7d80cb 100644 --- a/include/alpaka/pltf/PltfGenericSycl.hpp +++ b/include/alpaka/pltf/PltfGenericSycl.hpp @@ -127,7 +127,7 @@ namespace alpaka::trait # if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL printDeviceProperties(sycl_dev); # elif ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL - std::cout << __func__ << sycl_dev.get_info() << '\n'; + std::cout << __func__ << sycl_dev.template get_info() << '\n'; # endif using SyclPltf = alpaka::PltfGenericSycl; return typename DevType::type{sycl_dev, platform.syclContext()}; @@ -189,82 +189,63 @@ namespace alpaka::trait std::cout << "SYCL version: " << device.get_info() << '\n'; +# if !defined(BOOST_COMP_ICPX) + // Not defined by Level Zero back-end std::cout << "Backend version: " << device.get_info() << '\n'; +# endif std::cout << "Aspects: " << '\n'; - auto const aspects = device.get_info(); - for(auto const& asp : aspects) - { - switch(asp) - { - // Ignore the hardware types - we already have queried this info above - case sycl::aspect::cpu: - case sycl::aspect::gpu: - case sycl::aspect::accelerator: - case sycl::aspect::custom: - break; + std::cout.flush(); - case sycl::aspect::emulated: - std::cout << "\t* emulated\n"; - break; +# if defined(BOOST_COMP_ICPX) +# if BOOST_COMP_ICPX >= BOOST_VERSION_NUMBER(53, 2, 0) + // These aspects are missing from oneAPI versions < 2023.2.0 + if(device.has(sycl::aspect::emulated)) + std::cout << "\t* emulated\n"; - case sycl::aspect::host_debugabble: - std::cout << "\t* debugabble using standard debuggers\n"; - break; + if(device.has(sycl::aspect::host_debuggable)) + std::cout << "\t* debuggable using standard debuggers\n"; +# endif +# endif - case sycl::aspect::fp16: - std::cout << "\t* supports sycl::half precision\n"; - break; + if(device.has(sycl::aspect::fp16)) + std::cout << "\t* supports sycl::half precision\n"; - case sycl::aspect::fp64: - std::cout << "\t* supports double precision\n"; - break; + if(device.has(sycl::aspect::fp64)) + std::cout << "\t* supports double precision\n"; - case sycl::aspect::atomic64: - std::cout << "\t* supports 64-bit atomics\n"; - break; + if(device.has(sycl::aspect::atomic64)) + std::cout << "\t* supports 64-bit atomics\n"; - case sycl::aspect::image: - std::cout << "\t* supports images\n"; - break; + if(device.has(sycl::aspect::image)) + std::cout << "\t* supports images\n"; - case sycl::aspect::online_compiler: - std::cout << "\t* supports online compilation of device code\n"; - break; + if(device.has(sycl::aspect::online_compiler)) + std::cout << "\t* supports online compilation of device code\n"; - case sycl::aspect::online_linker: - std::cout << "\t* supports online linking of device code\n"; - break; + if(device.has(sycl::aspect::online_linker)) + std::cout << "\t* supports online linking of device code\n"; - case sycl::aspect::queue_profiling: - std::cout << "\t* supports queue profiling\n"; - break; + if(device.has(sycl::aspect::queue_profiling)) + std::cout << "\t* supports queue profiling\n"; - case sycl::aspect::usm_device_allocations: - std::cout << "\t* supports explicit USM allocations\n"; - break; + if(device.has(sycl::aspect::usm_device_allocations)) + std::cout << "\t* supports explicit USM allocations\n"; - case sycl::aspect::usm_host_allocations: - std::cout << "\t* can access USM memory allocated by sycl::usm::alloc::host\n"; - break; + if(device.has(sycl::aspect::usm_host_allocations)) + std::cout << "\t* can access USM memory allocated by sycl::usm::alloc::host\n"; - case sycl::aspect::usm_atomic_host_allocations: - std::cout << "\t* can access USM memory allocated by sycl::usm::alloc::host atomically\n"; - break; + if(device.has(sycl::aspect::usm_atomic_host_allocations)) + std::cout << "\t* can access USM memory allocated by sycl::usm::alloc::host atomically\n"; - case sycl::aspect::usm_shared_allocations: - std::cout << "\t* can access USM memory allocated by sycl::usm::alloc::shared\n"; - break; + if(device.has(sycl::aspect::usm_shared_allocations)) + std::cout << "\t* can access USM memory allocated by sycl::usm::alloc::shared\n"; - case sycl::aspect::usm_atomic_shared_allocations: - std::cout << "\t* can access USM memory allocated by sycl::usm::alloc::shared atomically\n"; - break; + if(device.has(sycl::aspect::usm_atomic_shared_allocations)) + std::cout << "\t* can access USM memory allocated by sycl::usm::alloc::shared atomically\n"; - case sycl::aspect::usm_system_allocations: - std::cout << "\t* can access memory allocated by the system allocator\n"; - break; - } - } + if(device.has(sycl::aspect::usm_system_allocations)) + std::cout << "\t* can access memory allocated by the system allocator\n"; std::cout << "Available compute units: " << device.get_info() << '\n'; @@ -323,7 +304,7 @@ namespace alpaka::trait std::cout << "Native ISA vector width (float): " << device.get_info() << '\n'; - if(device.has_aspect(sycl::aspect::fp64)) + if(device.has(sycl::aspect::fp64)) { std::cout << "Preferred native vector width (double): " << device.get_info() << '\n'; @@ -332,7 +313,7 @@ namespace alpaka::trait << device.get_info() << '\n'; } - if(device.has_aspect(sycl::aspect::fp16)) + if(device.has(sycl::aspect::fp16)) { std::cout << "Preferred native vector width (half): " << device.get_info() << '\n'; @@ -349,7 +330,7 @@ namespace alpaka::trait std::cout << "Maximum size of memory object allocation: " << device.get_info() << " bytes\n"; - if(device.has_aspect(sycl::aspect::image)) + if(device.has(sycl::aspect::image)) { std::cout << "Maximum number of simultaneous image object reads per kernel: " << device.get_info() << '\n'; @@ -417,7 +398,7 @@ namespace alpaka::trait find_and_print(sycl::info::fp_config::soft_float); }; - if(device.has_aspect(sycl::aspect::fp16)) + if(device.has(sycl::aspect::fp16)) { auto const fp16_conf = device.get_info(); print_fp_config("Half", fp16_conf); @@ -426,7 +407,7 @@ namespace alpaka::trait auto const fp32_conf = device.get_info(); print_fp_config("Single", fp32_conf); - if(device.has_aspect(sycl::aspect::fp64)) + if(device.has(sycl::aspect::fp64)) { auto const fp64_conf = device.get_info(); print_fp_config("Double", fp64_conf); @@ -458,7 +439,7 @@ namespace alpaka::trait << device.get_info() << " bytes\n"; std::cout << "Global memory cache size: " - << device.get_info() / KiB << " KiB\n" + << device.get_info() / KiB << " KiB\n"; } std::cout << "Global memory size: " << device.get_info() / MiB @@ -516,6 +497,11 @@ namespace alpaka::trait case sycl::memory_order::seq_cst: std::cout << "seq_cst"; break; +# if defined(BOOST_COMP_ICPX) + // Stop icpx from complaining about its own internals. + case sycl::memory_order::__consume_unsupported: + break; +# endif } std::cout << ", "; } @@ -526,9 +512,14 @@ namespace alpaka::trait auto const mem_orders = device.get_info(); print_memory_orders(mem_orders); +# if defined(BOOST_COMP_ICPX) +# if BOOST_COMP_ICPX >= BOOST_VERSION_NUMBER(53, 2, 0) + // Not implemented in oneAPI < 2023.2.0 std::cout << "Supported memory orderings for sycl::atomic_fence: "; auto const fence_orders = device.get_info(); print_memory_orders(fence_orders); +# endif +# endif auto print_memory_scopes = [](std::vector const& mem_scopes) { @@ -565,9 +556,14 @@ namespace alpaka::trait auto const mem_scopes = device.get_info(); print_memory_scopes(mem_scopes); +# if defined(BOOST_COMP_ICPX) +# if BOOST_COMP_ICPX >= BOOST_VERSION_NUMBER(53, 2, 0) + // Not implemented in oneAPI < 2023.2.0 std::cout << "Supported memory scopes for sycl::atomic_fence: "; auto const fence_scopes = device.get_info(); print_memory_scopes(fence_scopes); +# endif +# endif std::cout << "Device timer resolution: " << device.get_info() << " ns\n"; @@ -607,6 +603,11 @@ namespace alpaka::trait std::cout << "by affinity domain"; has_affinity_domains = true; break; +# if defined(BOOST_COMP_ICPX) + case sycl::info::partition_property::ext_intel_partition_by_cslice: + std::cout << "by compute slice (Intel extension; deprecated)"; + break; +# endif } std::cout << ", "; } @@ -671,6 +672,12 @@ namespace alpaka::trait case sycl::info::partition_property::partition_by_affinity_domain: std::cout << "partitioned by affinity domain"; break; + +# if defined(BOOST_COMP_ICPX) + case sycl::info::partition_property::ext_intel_partition_by_cslice: + std::cout << "partitioned by compute slice (Intel extension; deprecated)"; + break; +# endif } std::cout << '\n'; diff --git a/include/alpaka/pltf/PltfGpuSyclIntel.hpp b/include/alpaka/pltf/PltfGpuSyclIntel.hpp index be1ae4643810..2e5d7583aaab 100644 --- a/include/alpaka/pltf/PltfGpuSyclIntel.hpp +++ b/include/alpaka/pltf/PltfGpuSyclIntel.hpp @@ -1,4 +1,4 @@ -/* Copyright 2023 Jan Stephan, Andrea Bocci, Luca Ferragina +/* Copyright 2023 Jan Stephan, Luca Ferragina, Andrea Bocci * SPDX-License-Identifier: MPL-2.0 */ @@ -18,13 +18,7 @@ namespace alpaka { namespace detail { - // Prevent clang from annoying us with warnings about emitting too many vtables. These are discarded by the - // linker anyway. -# if BOOST_COMP_CLANG -# pragma clang diagnostic push -# pragma clang diagnostic ignored "-Wweak-vtables" -# endif - struct IntelGpuSelector final + struct IntelGpuSelector { auto operator()(sycl::device const& dev) const -> int { @@ -34,9 +28,6 @@ namespace alpaka return is_intel_gpu ? 1 : -1; } }; -# if BOOST_COMP_CLANG -# pragma clang diagnostic pop -# endif } // namespace detail //! The SYCL device manager. diff --git a/include/alpaka/rand/RandGenericSycl.hpp b/include/alpaka/rand/RandGenericSycl.hpp index 35c788b449a7..7f4d3a3e1c65 100644 --- a/include/alpaka/rand/RandGenericSycl.hpp +++ b/include/alpaka/rand/RandGenericSycl.hpp @@ -1,4 +1,4 @@ -/* Copyright 2023 Luca Ferragina, Aurora Perego, Andrea Bocci +/* Copyright 2023 Luca Ferragina, Aurora Perego, Jan Stephan, Andrea Bocci * SPDX-License-Identifier: MPL-2.0 */ diff --git a/include/alpaka/test/dim/TestDims.hpp b/include/alpaka/test/dim/TestDims.hpp index 6350697fe7b3..395c97e5dcf3 100644 --- a/include/alpaka/test/dim/TestDims.hpp +++ b/include/alpaka/test/dim/TestDims.hpp @@ -1,4 +1,4 @@ -/* Copyright 2022 Benjamin Worpitz, Andrea Bocci, Jan Stephan, Bernhard Manfred Gruber +/* Copyright 2023 Benjamin Worpitz, Andrea Bocci, Jan Stephan, Bernhard Manfred Gruber * SPDX-License-Identifier: MPL-2.0 */ diff --git a/include/alpaka/test/event/EventHostManualTrigger.hpp b/include/alpaka/test/event/EventHostManualTrigger.hpp index d465555c0dc9..7016f45e6c45 100644 --- a/include/alpaka/test/event/EventHostManualTrigger.hpp +++ b/include/alpaka/test/event/EventHostManualTrigger.hpp @@ -1,4 +1,5 @@ -/* Copyright 2023 Benjamin Worpitz, Matthias Werner, Jan Stephan, Andrea Bocci, Bernhard Manfred Gruber +/* Copyright 2023 Benjamin Worpitz, Matthias Werner, Jan Stephan, Jeffrey Kelling, Andrea Bocci, + * Bernhard Manfred Gruber * SPDX-License-Identifier: MPL-2.0 */ diff --git a/include/alpaka/test/queue/Queue.hpp b/include/alpaka/test/queue/Queue.hpp index 07d8495da8df..2fc75d6dacd2 100644 --- a/include/alpaka/test/queue/Queue.hpp +++ b/include/alpaka/test/queue/Queue.hpp @@ -1,4 +1,4 @@ -/* Copyright 2023 Benjamin Worpitz, Matthias Werner, Bernhard Manfred Gruber, Jan Stephan +/* Copyright 2023 Benjamin Worpitz, Matthias Werner, René Widera, Bernhard Manfred Gruber, Jan Stephan * SPDX-License-Identifier: MPL-2.0 */ diff --git a/test/unit/atomic/src/AtomicTest.cpp b/test/unit/atomic/src/AtomicTest.cpp index b04fd281af6c..467dab651741 100644 --- a/test/unit/atomic/src/AtomicTest.cpp +++ b/test/unit/atomic/src/AtomicTest.cpp @@ -1,4 +1,5 @@ -/* Copyright 2023 Axel Hübl, Benjamin Worpitz, Matthias Werner, Jan Stephan, Bernhard Manfred Gruber, Antonio Di Pilato +/* Copyright 2023 Axel Hübl, Benjamin Worpitz, Matthias Werner, Sergei Bastrakov, René Widera, Jan Stephan, + * Bernhard Manfred Gruber, Antonio Di Pilato, Andrea Bocci * SPDX-License-Identifier: MPL-2.0 */ diff --git a/thirdParty/CMakeLists.txt b/thirdParty/CMakeLists.txt index b061c9b0678a..ae1347167e8e 100644 --- a/thirdParty/CMakeLists.txt +++ b/thirdParty/CMakeLists.txt @@ -1,5 +1,5 @@ # -# Copyright 2023 Jan Stephan +# Copyright 2023 Jan Stephan, Bernhard Manfred Gruber, Andrea Bocci # SPDX-License-Identifier: MPL-2.0 # From 0fadd2abca03ce6809f8745a721c498fcc354d98 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Tue, 20 Jun 2023 08:20:31 +0200 Subject: [PATCH 05/10] Generalise the SYCL CpuSelector to non-Intel CPUs --- README_SYCL.md | 6 ++-- .../{AccCpuSyclIntel.hpp => AccCpuSycl.hpp} | 36 +++++++++---------- include/alpaka/acc/Tag.hpp | 4 +-- include/alpaka/alpaka.hpp | 18 +++++----- .../{DevCpuSyclIntel.hpp => DevCpuSycl.hpp} | 6 ++-- ...EventCpuSyclIntel.hpp => EventCpuSycl.hpp} | 6 ++-- include/alpaka/example/ExampleDefaultAcc.hpp | 4 +-- ...CpuSyclIntel.hpp => TaskKernelCpuSycl.hpp} | 7 ++-- .../{BufCpuSyclIntel.hpp => BufCpuSycl.hpp} | 6 ++-- .../{PltfCpuSyclIntel.hpp => PltfCpuSycl.hpp} | 13 +++---- ...lBlocking.hpp => QueueCpuSyclBlocking.hpp} | 6 ++-- ...ocking.hpp => QueueCpuSyclNonBlocking.hpp} | 6 ++-- .../{CpuSyclIntel.hpp => CpuSycl.hpp} | 2 +- include/alpaka/test/acc/TestAccs.hpp | 6 ++-- include/alpaka/test/queue/Queue.hpp | 16 ++++----- test/unit/acc/src/AccTagTest.cpp | 6 ++-- 16 files changed, 72 insertions(+), 76 deletions(-) rename include/alpaka/acc/{AccCpuSyclIntel.hpp => AccCpuSycl.hpp} (66%) rename include/alpaka/dev/{DevCpuSyclIntel.hpp => DevCpuSycl.hpp} (65%) rename include/alpaka/event/{EventCpuSyclIntel.hpp => EventCpuSycl.hpp} (65%) rename include/alpaka/kernel/{TaskKernelCpuSyclIntel.hpp => TaskKernelCpuSycl.hpp} (66%) rename include/alpaka/mem/buf/{BufCpuSyclIntel.hpp => BufCpuSycl.hpp} (64%) rename include/alpaka/pltf/{PltfCpuSyclIntel.hpp => PltfCpuSycl.hpp} (63%) rename include/alpaka/queue/{QueueCpuSyclIntelBlocking.hpp => QueueCpuSyclBlocking.hpp} (59%) rename include/alpaka/queue/{QueueCpuSyclIntelNonBlocking.hpp => QueueCpuSyclNonBlocking.hpp} (59%) rename include/alpaka/standalone/{CpuSyclIntel.hpp => CpuSycl.hpp} (87%) diff --git a/README_SYCL.md b/README_SYCL.md index 9846f1b8768b..ccac62293474 100644 --- a/README_SYCL.md +++ b/README_SYCL.md @@ -53,9 +53,9 @@ To enable device-side printing add the following compiler flags: * `-DALPAKA_SYCL_IOSTREAM_ENABLED`: to enable device-side printing. * `-DALPAKA_SYCL_IOSTREAM_KIB=`: `` (without the brackets) defines the kibibytes per block to be reserved for device-side printing. `` cannot exceed the amount of shared memory per block. -### Building for Intel CPUs +### Building for x86 64-bit CPUs -1. `#include ` in your C++ code. +1. `#include ` in your C++ code. 2. Add the following flags: * `-fsycl-targets=spir64_x86_64` (compiler and linker): to enable CPU compilation. Note: If you are using multiple SYCL hardware targets (like CPU and GPU) separate them by comma here. * `-Xsycl-target-backend=spir64_x86_64 "-march="` (linker): to choose the Intel ISA to compile for. Check the output of `opencl-aot --help` and look for the possible values of the `--march` flag. @@ -84,7 +84,7 @@ To enable device-side printing add the following compiler flags: In contrast to the other back-ends the SYCL back-end comes with multiple different accelerators which should be chosen according to your requirements: -* `alpaka::experimental::AccCpuSyclIntel` for targeting Intel CPUs. In contrast to the other CPU back-ends this will be using Intel's OpenCL implementation for CPUs under the hood. +* `alpaka::experimental::AccCpuSycl` for targeting Intel and AMD CPUs. In contrast to the other CPU back-ends this will use Intel's OpenCL implementation for CPUs under the hood. * `alpaka::experimental::AccFpgaSyclIntel` for targeting Intel FPGAs. * `alpaka::experimental::AccGpuSyclIntel` for targeting Intel GPUs. diff --git a/include/alpaka/acc/AccCpuSyclIntel.hpp b/include/alpaka/acc/AccCpuSycl.hpp similarity index 66% rename from include/alpaka/acc/AccCpuSyclIntel.hpp rename to include/alpaka/acc/AccCpuSycl.hpp index 751d16c6b8b0..5cedbea64d8f 100644 --- a/include/alpaka/acc/AccCpuSyclIntel.hpp +++ b/include/alpaka/acc/AccCpuSycl.hpp @@ -1,4 +1,4 @@ -/* Copyright 2022 Jan Stephan, Luca Ferragina +/* Copyright 2023 Jan Stephan, Luca Ferragina, Andrea Bocci * SPDX-License-Identifier: MPL-2.0 */ @@ -9,11 +9,11 @@ #include "alpaka/core/Concepts.hpp" #include "alpaka/core/DemangleTypeNames.hpp" #include "alpaka/core/Sycl.hpp" -#include "alpaka/dev/DevCpuSyclIntel.hpp" +#include "alpaka/dev/DevCpuSycl.hpp" #include "alpaka/dev/Traits.hpp" -#include "alpaka/kernel/TaskKernelCpuSyclIntel.hpp" +#include "alpaka/kernel/TaskKernelCpuSycl.hpp" #include "alpaka/kernel/Traits.hpp" -#include "alpaka/pltf/PltfCpuSyclIntel.hpp" +#include "alpaka/pltf/PltfCpuSycl.hpp" #include "alpaka/pltf/Traits.hpp" #include "alpaka/vec/Vec.hpp" @@ -31,9 +31,9 @@ namespace alpaka //! //! This accelerator allows parallel kernel execution on a oneAPI-capable Intel CPU target device. template - class AccCpuSyclIntel final + class AccCpuSycl final : public AccGenericSycl - , public concepts::Implements> + , public concepts::Implements> { public: using AccGenericSycl::AccGenericSycl; @@ -44,28 +44,28 @@ namespace alpaka::trait { //! The Intel CPU SYCL accelerator name trait specialization. template - struct GetAccName> + struct GetAccName> { static auto getAccName() -> std::string { - return "AccCpuSyclIntel<" + std::to_string(TDim::value) + "," + core::demangled + ">"; + return "AccCpuSycl<" + std::to_string(TDim::value) + "," + core::demangled + ">"; } }; //! The Intel CPU SYCL accelerator device type trait specialization. template - struct DevType> + struct DevType> { - using type = DevCpuSyclIntel; + using type = DevCpuSycl; }; //! The Intel CPU SYCL accelerator execution task type trait specialization. template - struct CreateTaskKernel, TWorkDiv, TKernelFnObj, TArgs...> + struct CreateTaskKernel, TWorkDiv, TKernelFnObj, TArgs...> { static auto createTaskKernel(TWorkDiv const& workDiv, TKernelFnObj const& kernelFnObj, TArgs&&... args) { - return TaskKernelCpuSyclIntel{ + return TaskKernelCpuSycl{ workDiv, kernelFnObj, std::forward(args)...}; @@ -74,21 +74,21 @@ namespace alpaka::trait //! The Intel CPU SYCL execution task platform type trait specialization. template - struct PltfType> + struct PltfType> { - using type = PltfCpuSyclIntel; + using type = PltfCpuSycl; }; template - struct AccToTag> + struct AccToTag> { - using type = alpaka::TagCpuSyclIntel; + using type = alpaka::TagCpuSycl; }; template - struct TagToAcc + struct TagToAcc { - using type = alpaka::AccCpuSyclIntel; + using type = alpaka::AccCpuSycl; }; } // namespace alpaka::trait diff --git a/include/alpaka/acc/Tag.hpp b/include/alpaka/acc/Tag.hpp index f950bb03b56e..03a6449a75a9 100644 --- a/include/alpaka/acc/Tag.hpp +++ b/include/alpaka/acc/Tag.hpp @@ -1,4 +1,4 @@ -/* Copyright 2023 Simeon Ehrig, Jan Stephan +/* Copyright 2023 Simeon Ehrig, Jan Stephan, Andrea Bocci * SPDX-License-Identifier: MPL-2.0 */ @@ -23,7 +23,7 @@ namespace alpaka CREATE_ACC_TAG(TagCpuOmp2Blocks); CREATE_ACC_TAG(TagCpuOmp2Threads); CREATE_ACC_TAG(TagCpuSerial); - CREATE_ACC_TAG(TagCpuSyclIntel); + CREATE_ACC_TAG(TagCpuSycl); CREATE_ACC_TAG(TagCpuTbbBlocks); CREATE_ACC_TAG(TagCpuThreads); CREATE_ACC_TAG(TagFpgaSyclIntel); diff --git a/include/alpaka/alpaka.hpp b/include/alpaka/alpaka.hpp index 3262ea360edc..327e53fab9f0 100644 --- a/include/alpaka/alpaka.hpp +++ b/include/alpaka/alpaka.hpp @@ -1,5 +1,5 @@ /* Copyright 2023 Axel Hübl, Benjamin Worpitz, Erik Zenker, Matthias Werner, René Widera, Bernhard Manfred Gruber, - * Jan Stephan, Antonio Di Pilato, Luca Ferragina, Aurora Perego + * Jan Stephan, Antonio Di Pilato, Luca Ferragina, Aurora Perego, Andrea Bocci * SPDX-License-Identifier: MPL-2.0 */ @@ -13,7 +13,7 @@ #include "alpaka/acc/AccCpuOmp2Blocks.hpp" #include "alpaka/acc/AccCpuOmp2Threads.hpp" #include "alpaka/acc/AccCpuSerial.hpp" -#include "alpaka/acc/AccCpuSyclIntel.hpp" +#include "alpaka/acc/AccCpuSycl.hpp" #include "alpaka/acc/AccCpuTbbBlocks.hpp" #include "alpaka/acc/AccCpuThreads.hpp" #include "alpaka/acc/AccDevProps.hpp" @@ -75,7 +75,7 @@ #include "alpaka/core/Vectorize.hpp" // dev #include "alpaka/dev/DevCpu.hpp" -#include "alpaka/dev/DevCpuSyclIntel.hpp" +#include "alpaka/dev/DevCpuSycl.hpp" #include "alpaka/dev/DevCudaRt.hpp" #include "alpaka/dev/DevFpgaSyclIntel.hpp" #include "alpaka/dev/DevGenericSycl.hpp" @@ -89,7 +89,7 @@ #include "alpaka/dim/Traits.hpp" // event #include "alpaka/event/EventCpu.hpp" -#include "alpaka/event/EventCpuSyclIntel.hpp" +#include "alpaka/event/EventCpuSycl.hpp" #include "alpaka/event/EventCudaRt.hpp" #include "alpaka/event/EventFpgaSyclIntel.hpp" #include "alpaka/event/EventGenericSycl.hpp" @@ -114,7 +114,7 @@ #include "alpaka/kernel/TaskKernelCpuOmp2Blocks.hpp" #include "alpaka/kernel/TaskKernelCpuOmp2Threads.hpp" #include "alpaka/kernel/TaskKernelCpuSerial.hpp" -#include "alpaka/kernel/TaskKernelCpuSyclIntel.hpp" +#include "alpaka/kernel/TaskKernelCpuSycl.hpp" #include "alpaka/kernel/TaskKernelCpuTbbBlocks.hpp" #include "alpaka/kernel/TaskKernelCpuThreads.hpp" #include "alpaka/kernel/TaskKernelFpgaSyclIntel.hpp" @@ -133,7 +133,7 @@ #include "alpaka/mem/alloc/AllocCpuNew.hpp" #include "alpaka/mem/alloc/Traits.hpp" #include "alpaka/mem/buf/BufCpu.hpp" -#include "alpaka/mem/buf/BufCpuSyclIntel.hpp" +#include "alpaka/mem/buf/BufCpuSycl.hpp" #include "alpaka/mem/buf/BufCudaRt.hpp" #include "alpaka/mem/buf/BufFpgaSyclIntel.hpp" #include "alpaka/mem/buf/BufGenericSycl.hpp" @@ -178,7 +178,7 @@ #include "alpaka/offset/Traits.hpp" // platform #include "alpaka/pltf/PltfCpu.hpp" -#include "alpaka/pltf/PltfCpuSyclIntel.hpp" +#include "alpaka/pltf/PltfCpuSycl.hpp" #include "alpaka/pltf/PltfCudaRt.hpp" #include "alpaka/pltf/PltfFpgaSyclIntel.hpp" #include "alpaka/pltf/PltfGpuSyclIntel.hpp" @@ -197,8 +197,8 @@ #include "alpaka/queue/Properties.hpp" #include "alpaka/queue/QueueCpuBlocking.hpp" #include "alpaka/queue/QueueCpuNonBlocking.hpp" -#include "alpaka/queue/QueueCpuSyclIntelBlocking.hpp" -#include "alpaka/queue/QueueCpuSyclIntelNonBlocking.hpp" +#include "alpaka/queue/QueueCpuSyclBlocking.hpp" +#include "alpaka/queue/QueueCpuSyclNonBlocking.hpp" #include "alpaka/queue/QueueCudaRtBlocking.hpp" #include "alpaka/queue/QueueCudaRtNonBlocking.hpp" #include "alpaka/queue/QueueFpgaSyclIntelBlocking.hpp" diff --git a/include/alpaka/dev/DevCpuSyclIntel.hpp b/include/alpaka/dev/DevCpuSycl.hpp similarity index 65% rename from include/alpaka/dev/DevCpuSyclIntel.hpp rename to include/alpaka/dev/DevCpuSycl.hpp index 8add6869e3da..1a9909d31cd0 100644 --- a/include/alpaka/dev/DevCpuSyclIntel.hpp +++ b/include/alpaka/dev/DevCpuSycl.hpp @@ -1,17 +1,17 @@ -/* Copyright 2022 Jan Stephan +/* Copyright 2023 Jan Stephan, Andrea Bocci * SPDX-License-Identifier: MPL-2.0 */ #pragma once #include "alpaka/dev/DevGenericSycl.hpp" -#include "alpaka/pltf/PltfCpuSyclIntel.hpp" +#include "alpaka/pltf/PltfCpuSycl.hpp" #if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_BACKEND_ONEAPI) && defined(ALPAKA_SYCL_ONEAPI_CPU) namespace alpaka { - using DevCpuSyclIntel = DevGenericSycl; + using DevCpuSycl = DevGenericSycl; } // namespace alpaka #endif diff --git a/include/alpaka/event/EventCpuSyclIntel.hpp b/include/alpaka/event/EventCpuSycl.hpp similarity index 65% rename from include/alpaka/event/EventCpuSyclIntel.hpp rename to include/alpaka/event/EventCpuSycl.hpp index 21a752e28991..37e521484e93 100644 --- a/include/alpaka/event/EventCpuSyclIntel.hpp +++ b/include/alpaka/event/EventCpuSycl.hpp @@ -1,17 +1,17 @@ -/* Copyright 2022 Jan Stephan +/* Copyright 2023 Jan Stephan, Andrea Bocci * SPDX-License-Identifier: MPL-2.0 */ #pragma once -#include "alpaka/dev/DevCpuSyclIntel.hpp" +#include "alpaka/dev/DevCpuSycl.hpp" #include "alpaka/event/EventGenericSycl.hpp" #if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_BACKEND_ONEAPI) && defined(ALPAKA_SYCL_ONEAPI_CPU) namespace alpaka { - using EventCpuSyclIntel = EventGenericSycl; + using EventCpuSycl = EventGenericSycl; } // namespace alpaka #endif diff --git a/include/alpaka/example/ExampleDefaultAcc.hpp b/include/alpaka/example/ExampleDefaultAcc.hpp index ef0eb80fb380..6ceb874dd708 100644 --- a/include/alpaka/example/ExampleDefaultAcc.hpp +++ b/include/alpaka/example/ExampleDefaultAcc.hpp @@ -1,4 +1,4 @@ -/* Copyright 2023 Jeffrey Kelling, Bernhard Manfred Gruber, Jan Stephan, Aurora Perego +/* Copyright 2023 Jeffrey Kelling, Bernhard Manfred Gruber, Jan Stephan, Aurora Perego, Andrea Bocci * SPDX-License-Identifier: MPL-2.0 */ @@ -26,7 +26,7 @@ namespace alpaka using ExampleDefaultAcc = alpaka::AccCpuThreads; #elif defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_BACKEND_ONEAPI) # if defined(ALPAKA_SYCL_ONEAPI_CPU) - using ExampleDefaultAcc = alpaka::AccCpuSyclIntel; + using ExampleDefaultAcc = alpaka::AccCpuSycl; # elif defined(ALPAKA_SYCL_ONEAPI_FPGA) using ExampleDefaultAcc = alpaka::AccFpgaSyclIntel; # elif defined(ALPAKA_SYCL_ONEAPI_GPU) diff --git a/include/alpaka/kernel/TaskKernelCpuSyclIntel.hpp b/include/alpaka/kernel/TaskKernelCpuSycl.hpp similarity index 66% rename from include/alpaka/kernel/TaskKernelCpuSyclIntel.hpp rename to include/alpaka/kernel/TaskKernelCpuSycl.hpp index fbfc88b9c4b3..abb8c9a81d03 100644 --- a/include/alpaka/kernel/TaskKernelCpuSyclIntel.hpp +++ b/include/alpaka/kernel/TaskKernelCpuSycl.hpp @@ -1,4 +1,4 @@ -/* Copyright 2022 Jan Stephan, Luca Ferragina +/* Copyright 2023 Jan Stephan, Luca Ferragina, Andrea Bocci * SPDX-License-Identifier: MPL-2.0 */ @@ -11,11 +11,10 @@ namespace alpaka { template - class AccCpuSyclIntel; + class AccCpuSycl; template - using TaskKernelCpuSyclIntel - = TaskKernelGenericSycl, TDim, TIdx, TKernelFnObj, TArgs...>; + using TaskKernelCpuSycl = TaskKernelGenericSycl, TDim, TIdx, TKernelFnObj, TArgs...>; } // namespace alpaka #endif diff --git a/include/alpaka/mem/buf/BufCpuSyclIntel.hpp b/include/alpaka/mem/buf/BufCpuSycl.hpp similarity index 64% rename from include/alpaka/mem/buf/BufCpuSyclIntel.hpp rename to include/alpaka/mem/buf/BufCpuSycl.hpp index 0d73ffe281c6..1dbdc9f3a2ad 100644 --- a/include/alpaka/mem/buf/BufCpuSyclIntel.hpp +++ b/include/alpaka/mem/buf/BufCpuSycl.hpp @@ -1,10 +1,10 @@ -/* Copyright 2022 Jan Stephan, Luca Ferragina +/* Copyright 2023 Jan Stephan, Luca Ferragina, Andrea Bocci * SPDX-License-Identifier: MPL-2.0 */ #pragma once -#include "alpaka/dev/DevCpuSyclIntel.hpp" +#include "alpaka/dev/DevCpuSycl.hpp" #include "alpaka/mem/buf/BufGenericSycl.hpp" #if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_BACKEND_ONEAPI) && defined(ALPAKA_SYCL_ONEAPI_CPU) @@ -12,7 +12,7 @@ namespace alpaka { template - using BufCpuSyclIntel = BufGenericSycl; + using BufCpuSycl = BufGenericSycl; } #endif diff --git a/include/alpaka/pltf/PltfCpuSyclIntel.hpp b/include/alpaka/pltf/PltfCpuSycl.hpp similarity index 63% rename from include/alpaka/pltf/PltfCpuSyclIntel.hpp rename to include/alpaka/pltf/PltfCpuSycl.hpp index 0c6489879c82..40f6855eab1a 100644 --- a/include/alpaka/pltf/PltfCpuSyclIntel.hpp +++ b/include/alpaka/pltf/PltfCpuSycl.hpp @@ -18,29 +18,26 @@ namespace alpaka { namespace detail { - struct IntelCpuSelector + struct SyclCpuSelector { auto operator()(sycl::device const& dev) const -> int { - auto const& vendor = dev.get_info(); - auto const is_intel_cpu = (vendor.find("Intel(R) Corporation") != std::string::npos) && dev.is_cpu(); - - return is_intel_cpu ? 1 : -1; + return dev.is_cpu() ? 1 : -1; } }; } // namespace detail //! The SYCL device manager. - using PltfCpuSyclIntel = PltfGenericSycl; + using PltfCpuSycl = PltfGenericSycl; } // namespace alpaka namespace alpaka::trait { //! The SYCL device manager device type trait specialization. template<> - struct DevType + struct DevType { - using type = DevGenericSycl; // = DevCpuSyclIntel + using type = DevGenericSycl; // = DevCpuSycl }; } // namespace alpaka::trait diff --git a/include/alpaka/queue/QueueCpuSyclIntelBlocking.hpp b/include/alpaka/queue/QueueCpuSyclBlocking.hpp similarity index 59% rename from include/alpaka/queue/QueueCpuSyclIntelBlocking.hpp rename to include/alpaka/queue/QueueCpuSyclBlocking.hpp index 039a05b3229f..3d561733ecc5 100644 --- a/include/alpaka/queue/QueueCpuSyclIntelBlocking.hpp +++ b/include/alpaka/queue/QueueCpuSyclBlocking.hpp @@ -1,17 +1,17 @@ -/* Copyright 2022 Jan Stephan, Luca Ferragina +/* Copyright 2023 Jan Stephan, Luca Ferragina, Andrea Bocci * SPDX-License-Identifier: MPL-2.0 */ #pragma once -#include "alpaka/dev/DevCpuSyclIntel.hpp" +#include "alpaka/dev/DevCpuSycl.hpp" #include "alpaka/queue/QueueGenericSyclBlocking.hpp" #if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_BACKEND_ONEAPI) && defined(ALPAKA_SYCL_ONEAPI_CPU) namespace alpaka { - using QueueCpuSyclIntelBlocking = QueueGenericSyclBlocking; + using QueueCpuSyclBlocking = QueueGenericSyclBlocking; } #endif diff --git a/include/alpaka/queue/QueueCpuSyclIntelNonBlocking.hpp b/include/alpaka/queue/QueueCpuSyclNonBlocking.hpp similarity index 59% rename from include/alpaka/queue/QueueCpuSyclIntelNonBlocking.hpp rename to include/alpaka/queue/QueueCpuSyclNonBlocking.hpp index 02391231f5f5..c75f5be45229 100644 --- a/include/alpaka/queue/QueueCpuSyclIntelNonBlocking.hpp +++ b/include/alpaka/queue/QueueCpuSyclNonBlocking.hpp @@ -1,17 +1,17 @@ -/* Copyright 2022 Jan Stephan, Luca Ferragina +/* Copyright 2023 Jan Stephan, Luca Ferragina, Andrea Bocci * SPDX-License-Identifier: MPL-2.0 */ #pragma once -#include "alpaka/dev/DevCpuSyclIntel.hpp" +#include "alpaka/dev/DevCpuSycl.hpp" #include "alpaka/queue/QueueGenericSyclNonBlocking.hpp" #if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_BACKEND_ONEAPI) && defined(ALPAKA_SYCL_ONEAPI_CPU) namespace alpaka { - using QueueCpuSyclIntelNonBlocking = QueueGenericSyclNonBlocking; + using QueueCpuSyclNonBlocking = QueueGenericSyclNonBlocking; } #endif diff --git a/include/alpaka/standalone/CpuSyclIntel.hpp b/include/alpaka/standalone/CpuSycl.hpp similarity index 87% rename from include/alpaka/standalone/CpuSyclIntel.hpp rename to include/alpaka/standalone/CpuSycl.hpp index e42b64a2097e..fbdb5c2c481e 100644 --- a/include/alpaka/standalone/CpuSyclIntel.hpp +++ b/include/alpaka/standalone/CpuSycl.hpp @@ -1,4 +1,4 @@ -/* Copyright 2022 Jan Stephan +/* Copyright 2023 Jan Stephan, Andrea Bocci * SPDX-License-Identifier: MPL-2.0 */ diff --git a/include/alpaka/test/acc/TestAccs.hpp b/include/alpaka/test/acc/TestAccs.hpp index 742b40fd8313..62c1e6f7d2c2 100644 --- a/include/alpaka/test/acc/TestAccs.hpp +++ b/include/alpaka/test/acc/TestAccs.hpp @@ -79,10 +79,10 @@ namespace alpaka::test #endif #if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_BACKEND_ONEAPI) && defined(ALPAKA_SYCL_TARGET_CPU) template - using AccCpuSyclIntelIfAvailableElseInt = alpaka::AccCpuSyclIntel; + using AccCpuSyclIfAvailableElseInt = alpaka::AccCpuSycl; #else template - using AccCpuSyclIntelIfAvailableElseInt = int; + using AccCpuSyclIfAvailableElseInt = int; #endif #if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_BACKEND_ONEAPI) && defined(ALPAKA_SYCL_TARGET_FPGA) template @@ -109,7 +109,7 @@ namespace alpaka::test AccCpuOmp2ThreadsIfAvailableElseInt, AccGpuCudaRtIfAvailableElseInt, AccGpuHipRtIfAvailableElseInt, - AccCpuSyclIntelIfAvailableElseInt, + AccCpuSyclIfAvailableElseInt, AccFpgaSyclIntelIfAvailableElseInt, AccGpuSyclIntelIfAvailableElseInt>; } // namespace detail diff --git a/include/alpaka/test/queue/Queue.hpp b/include/alpaka/test/queue/Queue.hpp index 2fc75d6dacd2..7cb3492cfc45 100644 --- a/include/alpaka/test/queue/Queue.hpp +++ b/include/alpaka/test/queue/Queue.hpp @@ -1,4 +1,4 @@ -/* Copyright 2023 Benjamin Worpitz, Matthias Werner, René Widera, Bernhard Manfred Gruber, Jan Stephan +/* Copyright 2023 Benjamin Worpitz, Matthias Werner, René Widera, Bernhard Manfred Gruber, Jan Stephan, Andrea Bocci * SPDX-License-Identifier: MPL-2.0 */ @@ -86,23 +86,23 @@ namespace alpaka::test # ifdef ALPAKA_SYCL_ONEAPI_CPU //! The default queue type trait specialization for the Intel CPU device. template<> - struct DefaultQueueType + struct DefaultQueueType { # if(ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL) - using type = alpaka::QueueCpuSyclIntelBlocking; + using type = alpaka::QueueCpuSyclBlocking; # else - using type = alpaka::QueueCpuSyclIntelNonBlocking; + using type = alpaka::QueueCpuSyclNonBlocking; # endif }; template<> - struct IsBlockingQueue + struct IsBlockingQueue { static constexpr auto value = true; }; template<> - struct IsBlockingQueue + struct IsBlockingQueue { static constexpr auto value = false; }; @@ -180,8 +180,8 @@ namespace alpaka::test # ifdef ALPAKA_SYCL_BACKEND_ONEAPI # ifdef ALPAKA_SYCL_ONEAPI_CPU , - std::tuple, - std::tuple + std::tuple, + std::tuple # endif # ifdef ALPAKA_SYCL_ONEAPI_FPGA , diff --git a/test/unit/acc/src/AccTagTest.cpp b/test/unit/acc/src/AccTagTest.cpp index f2d5f58ee45e..741f2e17385b 100644 --- a/test/unit/acc/src/AccTagTest.cpp +++ b/test/unit/acc/src/AccTagTest.cpp @@ -1,4 +1,4 @@ -/* Copyright 2023 Simeon Ehrig, Jan Stephan +/* Copyright 2023 Simeon Ehrig, Jan Stephan, Andrea Bocci * SPDX-License-Identifier: MPL-2.0 */ @@ -23,7 +23,7 @@ using TagList = std::tuple< alpaka::TagCpuOmp2Threads, alpaka::TagGpuCudaRt, alpaka::TagGpuHipRt, - alpaka::TagCpuSyclIntel, + alpaka::TagCpuSycl, alpaka::TagFpgaSyclIntel, alpaka::TagGpuSyclIntel>; @@ -35,7 +35,7 @@ using AccToTagMap = std::tuple< std::pair, alpaka::TagCpuOmp2Threads>, std::pair, alpaka::TagGpuCudaRt>, std::pair, alpaka::TagGpuHipRt>, - std::pair, alpaka::TagCpuSyclIntel>, + std::pair, alpaka::TagCpuSycl>, std::pair, alpaka::TagFpgaSyclIntel>, std::pair, alpaka::TagGpuSyclIntel>>; From 93c607decc051ee68585b892e35344440e51057d Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Tue, 20 Jun 2023 19:38:16 +0200 Subject: [PATCH 06/10] Rewrite the SYCL memcpy and memset operations Rewrite the N-dimensional Copy and Set memory operations to support pitched memory buffers, based on the Cpu implementation. This may require more than one memset or memcpy call per operation, which is not supported by command group handlers. Rewrite the Copy and Set memory operations to use queues instead. --- include/alpaka/mem/buf/BufGenericSycl.hpp | 4 +- include/alpaka/mem/buf/sycl/Copy.hpp | 65 ++++++++++++++----- include/alpaka/mem/buf/sycl/Set.hpp | 49 ++++++++++---- .../queue/sycl/QueueGenericSyclBase.hpp | 34 +++++----- 4 files changed, 108 insertions(+), 44 deletions(-) diff --git a/include/alpaka/mem/buf/BufGenericSycl.hpp b/include/alpaka/mem/buf/BufGenericSycl.hpp index b60d70c8b033..e725c299a598 100644 --- a/include/alpaka/mem/buf/BufGenericSycl.hpp +++ b/include/alpaka/mem/buf/BufGenericSycl.hpp @@ -1,4 +1,4 @@ -/* Copyright 2022 Jan Stephan, Luca Ferragina, Aurora Perego +/* Copyright 2023 Jan Stephan, Luca Ferragina, Aurora Perego, Andrea Bocci * SPDX-License-Identifier: MPL-2.0 */ @@ -192,7 +192,7 @@ namespace alpaka::trait } # endif - auto* memPtr = sycl::malloc_device( + TElem* memPtr = sycl::malloc_device( static_cast(getExtentProduct(extent)), dev.getNativeHandle().first, dev.getNativeHandle().second); diff --git a/include/alpaka/mem/buf/sycl/Copy.hpp b/include/alpaka/mem/buf/sycl/Copy.hpp index 7373125c63c6..34527e8478ed 100644 --- a/include/alpaka/mem/buf/sycl/Copy.hpp +++ b/include/alpaka/mem/buf/sycl/Copy.hpp @@ -1,8 +1,7 @@ -/* Copyright 2023 Jan Stephan, Bernhard Manfred Gruber, Luca Ferragina, Aurora Perego +/* Copyright 2023 Jan Stephan, Bernhard Manfred Gruber, Luca Ferragina, Aurora Perego, Andrea Bocci * SPDX-License-Identifier: MPL-2.0 */ - #pragma once #include "alpaka/core/Debug.hpp" @@ -27,7 +26,7 @@ namespace alpaka::detail { - //! The Sycl device memory copy task base. + //! The SYCL device memory copy task base. template struct TaskCopySyclBase { @@ -42,8 +41,8 @@ namespace alpaka::detail template TaskCopySyclBase(TViewFwd&& viewDst, TViewSrc const& viewSrc, TExtent const& extent) : m_extent(getExtentVec(extent)) - , m_extentWidthBytes(m_extent[TDim::value - 1u] * static_cast(sizeof(Elem))) # if(!defined(NDEBUG)) || (ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL) + , m_extentWidthBytes(m_extent[TDim::value - 1u] * static_cast(sizeof(Elem))) , m_dstExtent(getExtentVec(viewDst)) , m_srcExtent(getExtentVec(viewSrc)) # endif @@ -70,8 +69,8 @@ namespace alpaka::detail # endif Vec const m_extent; - ExtentSize const m_extentWidthBytes; # if(!defined(NDEBUG)) || (ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL) + ExtentSize const m_extentWidthBytes; Vec const m_dstExtent; Vec const m_srcExtent; # endif @@ -83,7 +82,7 @@ namespace alpaka::detail static constexpr auto is_sycl_task = true; }; - //! The Sycl device ND memory copy task. + //! The SYCL device ND memory copy task. template struct TaskCopySycl : public TaskCopySyclBase { @@ -94,20 +93,47 @@ namespace alpaka::detail using TaskCopySyclBase::TaskCopySyclBase; - ALPAKA_FN_HOST auto operator()(sycl::handler& cgh) const -> void + ALPAKA_FN_HOST auto operator()(sycl::queue& queue, std::vector const& requirements) const + -> sycl::event { ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; # if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL this->printDebug(); # endif + // [z, y, x] -> [z, y] because all elements with the innermost x dimension are handled within one + // iteration. + Vec const extentWithoutInnermost(subVecBegin(this->m_extent)); + // [z, y, x] -> [y, x] because the z pitch (the full size of the buffer) is not required. + Vec const dstPitchBytesWithoutOutmost(subVecEnd(this->m_dstPitchBytes)); + Vec const srcPitchBytesWithoutOutmost(subVecEnd(this->m_srcPitchBytes)); + + // Record an event for each memcpy call + std::vector events; + events.reserve(static_cast(extentWithoutInnermost.prod())); + if(static_cast(this->m_extent.prod()) != 0u) { - cgh.memcpy( - reinterpret_cast(this->m_dstMemNative), - reinterpret_cast(this->m_srcMemNative), - static_cast(this->m_extentWidthBytes * this->m_extent.prod())); + meta::ndLoopIncIdx( + extentWithoutInnermost, + [&](Vec const& idx) + { + events.push_back(queue.memcpy( + reinterpret_cast( + this->m_dstMemNative + + (castVec(idx) * dstPitchBytesWithoutOutmost) + .foldrAll(std::plus())), + reinterpret_cast( + this->m_srcMemNative + + (castVec(idx) * srcPitchBytesWithoutOutmost) + .foldrAll(std::plus())), + static_cast(this->m_extentWidthBytes), + requirements)); + }); } + + // Return an event that depends on all the events assciated to the memcpy calls + return queue.ext_oneapi_submit_barrier(events); } }; @@ -117,8 +143,10 @@ namespace alpaka::detail : TaskCopySyclBase, TViewDst, TViewSrc, TExtent> { using TaskCopySyclBase, TViewDst, TViewSrc, TExtent>::TaskCopySyclBase; + using Elem = alpaka::Elem; - ALPAKA_FN_HOST auto operator()(sycl::handler& cgh) const -> void + ALPAKA_FN_HOST auto operator()(sycl::queue& queue, std::vector const& requirements) const + -> sycl::event { ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; @@ -127,10 +155,15 @@ namespace alpaka::detail # endif if(static_cast(this->m_extent.prod()) != 0u) { - cgh.memcpy( + return queue.memcpy( reinterpret_cast(this->m_dstMemNative), reinterpret_cast(this->m_srcMemNative), - static_cast(this->m_extentWidthBytes)); + sizeof(Elem) * static_cast(this->m_extent.prod()), + requirements); + } + else + { + return queue.ext_oneapi_submit_barrier(); } } }; @@ -159,9 +192,9 @@ namespace alpaka::detail ALPAKA_ASSERT(getExtentVec(viewSrc).prod() == 1u); } - auto operator()(sycl::handler& cgh) const -> void + auto operator()(sycl::queue& queue, std::vector const& requirements) const -> sycl::event { - cgh.memcpy(m_dstMemNative, m_srcMemNative, sizeof(Elem)); + return queue.memcpy(m_dstMemNative, m_srcMemNative, sizeof(Elem), requirements); } void* m_dstMemNative; diff --git a/include/alpaka/mem/buf/sycl/Set.hpp b/include/alpaka/mem/buf/sycl/Set.hpp index 25d602228424..da52a9206c46 100644 --- a/include/alpaka/mem/buf/sycl/Set.hpp +++ b/include/alpaka/mem/buf/sycl/Set.hpp @@ -1,4 +1,4 @@ -/* Copyright 2022 Jan Stephan, Luca Ferragina, Aurora Perego +/* Copyright 2023 Jan Stephan, Luca Ferragina, Aurora Perego, Andrea Bocci * SPDX-License-Identifier: MPL-2.0 */ @@ -83,20 +83,42 @@ namespace alpaka using TaskSetSyclBase::TaskSetSyclBase; - auto operator()(sycl::handler& cgh) const -> void + auto operator()(sycl::queue& queue, std::vector const& requirements) const -> sycl::event { ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; # if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL this->printDebug(); # endif + // [z, y, x] -> [z, y] because all elements with the innermost x dimension are handled within one + // iteration. + Vec const extentWithoutInnermost(subVecBegin(this->m_extent)); + // [z, y, x] -> [y, x] because the z pitch (the full idx of the buffer) is not required. + Vec const dstPitchBytesWithoutOutmost(subVecEnd(this->m_dstPitchBytes)); + + // Record an event for each memcpy call + std::vector events; + events.reserve(static_cast(extentWithoutInnermost.prod())); + if(static_cast(this->m_extent.prod()) != 0u) { - cgh.memset( - reinterpret_cast(this->m_dstMemNative), - this->m_byte, - static_cast(this->m_extentWidthBytes * this->m_extent.prod())); + meta::ndLoopIncIdx( + extentWithoutInnermost, + [&](Vec const& idx) + { + events.push_back(queue.memset( + reinterpret_cast( + this->m_dstMemNative + + (castVec(idx) * dstPitchBytesWithoutOutmost) + .foldrAll(std::plus())), + this->m_byte, + static_cast(this->m_extentWidthBytes), + requirements)); + }); } + + // Return an event that depends on all the events assciated to the memcpy calls + return queue.ext_oneapi_submit_barrier(events); } }; @@ -106,7 +128,7 @@ namespace alpaka { using TaskSetSyclBase, TView, TExtent>::TaskSetSyclBase; - auto operator()(sycl::handler& cgh) const -> void + auto operator()(sycl::queue& queue, std::vector const& requirements) const -> sycl::event { ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; @@ -115,10 +137,15 @@ namespace alpaka # endif if(static_cast(this->m_extent.prod()) != 0u) { - cgh.memset( + return queue.memset( reinterpret_cast(this->m_dstMemNative), this->m_byte, - static_cast(this->m_extentWidthBytes)); + static_cast(this->m_extentWidthBytes), + requirements); + } + else + { + return queue.ext_oneapi_submit_barrier(); } } }; @@ -151,14 +178,14 @@ namespace alpaka } # endif - auto operator()(sycl::handler& cgh) const -> void + auto operator()(sycl::queue& queue, std::vector const& requirements) const -> sycl::event { ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; # if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL printDebug(); # endif - cgh.memset(reinterpret_cast(m_dstMemNative), m_byte, sizeof(Elem)); + return queue.memset(reinterpret_cast(m_dstMemNative), m_byte, sizeof(Elem), requirements); } std::uint8_t const m_byte; diff --git a/include/alpaka/queue/sycl/QueueGenericSyclBase.hpp b/include/alpaka/queue/sycl/QueueGenericSyclBase.hpp index 2db22b6d3abb..c529ecfaa2e4 100644 --- a/include/alpaka/queue/sycl/QueueGenericSyclBase.hpp +++ b/include/alpaka/queue/sycl/QueueGenericSyclBase.hpp @@ -1,10 +1,9 @@ -/* Copyright 2022 Jan Stephan, Antonio Di Pilato, Luca Ferragina +/* Copyright 2023 Jan Stephan, Antonio Di Pilato, Luca Ferragina, Andrea Bocci * SPDX-License-Identifier: MPL-2.0 */ #pragma once - #include "alpaka/dev/Traits.hpp" #include "alpaka/event/Traits.hpp" #include "alpaka/queue/Traits.hpp" @@ -127,19 +126,24 @@ namespace alpaka::detail clean_dependencies(); // Execute task - m_last_event = m_queue.submit( - [this, &task](sycl::handler& cgh) - { - if(!m_dependencies.empty()) - cgh.depends_on(m_dependencies); - - if constexpr(is_sycl_kernel) // Kernel - task(cgh, m_fence_dummy); // Will call cgh.parallel_for internally - else if constexpr(is_sycl_task) // Copy / Fill - task(cgh); // Will call cgh.{copy, fill} internally - else // Host - cgh.host_task(task); - }); + if constexpr(is_sycl_task && !is_sycl_kernel) // Copy / Fill + { + m_last_event = task(m_queue, m_dependencies); // Will call queue.{copy, fill} internally + } + else + { + m_last_event = m_queue.submit( + [this, &task](sycl::handler& cgh) + { + if(!m_dependencies.empty()) + cgh.depends_on(m_dependencies); + + if constexpr(is_sycl_kernel) // Kernel + task(cgh, m_fence_dummy); // Will call cgh.parallel_for internally + else // Host + cgh.host_task(task); + }); + } m_dependencies.clear(); } From 01afbee880ebd485e0b49ca33dc5b3a625de6bfa Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Mon, 26 Jun 2023 08:15:33 +0200 Subject: [PATCH 07/10] First draft adding the warp size as a kernel trait --- .../alpaka/kernel/TaskKernelGenericSycl.hpp | 82 +++++++++++++------ include/alpaka/kernel/Traits.hpp | 16 +++- include/alpaka/warp/WarpGenericSycl.hpp | 2 +- test/unit/warp/src/Activemask.cpp | 38 +++++++-- test/unit/warp/src/All.cpp | 26 +++++- test/unit/warp/src/Any.cpp | 26 +++++- test/unit/warp/src/Ballot.cpp | 26 +++++- test/unit/warp/src/Shfl.cpp | 26 +++++- 8 files changed, 199 insertions(+), 43 deletions(-) diff --git a/include/alpaka/kernel/TaskKernelGenericSycl.hpp b/include/alpaka/kernel/TaskKernelGenericSycl.hpp index d4f1937415cb..a173b42a0437 100644 --- a/include/alpaka/kernel/TaskKernelGenericSycl.hpp +++ b/include/alpaka/kernel/TaskKernelGenericSycl.hpp @@ -136,33 +136,67 @@ namespace alpaka auto output_stream = sycl::stream{buf_size, buf_per_work_item, cgh}; # endif - // cgh.parallel_for>( //FIXME_ - cgh.parallel_for( - sycl::nd_range{global_size, local_size}, - [=](sycl::nd_item work_item) - { + if constexpr(trait::WarpSize::warp_size != 0) + { + // cgh.parallel_for>( //FIXME_ + cgh.parallel_for( + sycl::nd_range{global_size, local_size}, + [=](sycl::nd_item work_item) + [[intel::reqd_sub_group_size(trait::WarpSize::warp_size)]] + { # ifdef ALPAKA_SYCL_IOSTREAM_ENABLED - auto acc = TAcc{ - item_elements, - work_item, - dyn_shared_accessor, - st_shared_accessor, - global_fence_dummy, - local_fence_dummy, - output_stream}; + auto acc = TAcc{ + item_elements, + work_item, + dyn_shared_accessor, + st_shared_accessor, + global_fence_dummy, + local_fence_dummy, + output_stream}; # else - auto acc = TAcc{ - item_elements, - work_item, - dyn_shared_accessor, - st_shared_accessor, - global_fence_dummy, - local_fence_dummy}; + auto acc = TAcc{ + item_elements, + work_item, + dyn_shared_accessor, + st_shared_accessor, + global_fence_dummy, + local_fence_dummy}; # endif - core::apply( - [k_func, &acc](typename std::decay_t const&... args) { k_func(acc, args...); }, - k_args); - }); + core::apply( + [k_func, &acc](typename std::decay_t const&... args) { k_func(acc, args...); }, + k_args); + }); + } + else + { // autodetect + // cgh.parallel_for>( //FIXME_ + cgh.parallel_for( + sycl::nd_range{global_size, local_size}, + [=](sycl::nd_item work_item) + { +# ifdef ALPAKA_SYCL_IOSTREAM_ENABLED + auto acc = TAcc{ + item_elements, + work_item, + dyn_shared_accessor, + st_shared_accessor, + global_fence_dummy, + local_fence_dummy, + output_stream}; +# else + auto acc = TAcc{ + item_elements, + work_item, + dyn_shared_accessor, + st_shared_accessor, + global_fence_dummy, + local_fence_dummy}; +# endif + core::apply( + [k_func, &acc](typename std::decay_t const&... args) { k_func(acc, args...); }, + k_args); + }); + } } static constexpr auto is_sycl_task = true; diff --git a/include/alpaka/kernel/Traits.hpp b/include/alpaka/kernel/Traits.hpp index 33032cdb5c57..a0c8d17a89e2 100644 --- a/include/alpaka/kernel/Traits.hpp +++ b/include/alpaka/kernel/Traits.hpp @@ -1,4 +1,5 @@ -/* Copyright 2022 Axel Huebl, Benjamin Worpitz, René Widera, Sergei Bastrakov, Jan Stephan, Bernhard Manfred Gruber +/* Copyright 2023 Axel Huebl, Benjamin Worpitz, René Widera, Sergei Bastrakov, Jan Stephan, Bernhard Manfred Gruber, + * Andrea Bocci * SPDX-License-Identifier: MPL-2.0 */ @@ -68,6 +69,19 @@ namespace alpaka } }; + //! The trait for getting the warp size required by a kernel. + //! + //! \tparam TKernelFnObj The kernel function object. + //! \tparam TAcc The accelerator. + //! + //! The default implementation returns 0, which lets the accelerator compiler and runtime choose the warp size. + template + struct WarpSize + { + static constexpr std::uint32_t warp_size = 0u; + }; + + //! The trait for getting the schedule to use when a kernel is run using the CpuOmp2Blocks accelerator. //! //! Has no effect on other accelerators. diff --git a/include/alpaka/warp/WarpGenericSycl.hpp b/include/alpaka/warp/WarpGenericSycl.hpp index 017c7a252fb4..356ee16e2c63 100644 --- a/include/alpaka/warp/WarpGenericSycl.hpp +++ b/include/alpaka/warp/WarpGenericSycl.hpp @@ -36,7 +36,7 @@ namespace alpaka::warp::trait { auto const sub_group = warp.m_item.get_sub_group(); // SYCL sub-groups are always 1D - return static_cast(sub_group.get_local_linear_range()); + return static_cast(sub_group.get_max_local_range()[0]); } }; diff --git a/test/unit/warp/src/Activemask.cpp b/test/unit/warp/src/Activemask.cpp index afc492ffc9a7..76431d44c557 100644 --- a/test/unit/warp/src/Activemask.cpp +++ b/test/unit/warp/src/Activemask.cpp @@ -1,4 +1,4 @@ -/* Copyright 2023 Sergei Bastrakov, Bernhard Manfred Gruber, Jan Stephan +/* Copyright 2023 Sergei Bastrakov, Bernhard Manfred Gruber, Jan Stephan, Andrea Bocci * SPDX-License-Identifier: MPL-2.0 */ @@ -25,6 +25,7 @@ struct ActivemaskSingleThreadWarpTestKernel } }; +template struct ActivemaskMultipleThreadWarpTestKernel { ALPAKA_NO_HOST_ACC_WARNING @@ -53,6 +54,15 @@ struct ActivemaskMultipleThreadWarpTestKernel } }; +namespace alpaka::trait +{ + template + struct WarpSize, TAcc> + { + static constexpr std::uint32_t warp_size = TWarpSize; + }; +} // namespace alpaka::trait + TEMPLATE_LIST_TEST_CASE("activemask", "[warp]", alpaka::test::TestAccs) { using Acc = TestType; @@ -68,7 +78,7 @@ TEMPLATE_LIST_TEST_CASE("activemask", "[warp]", alpaka::test::TestAccs) if(scalar) { alpaka::test::KernelExecutionFixture fixture(alpaka::Vec::all(4)); - REQUIRE(fixture(ActivemaskSingleThreadWarpTestKernel{})); + CHECK(fixture(ActivemaskSingleThreadWarpTestKernel{})); } else { @@ -80,9 +90,27 @@ TEMPLATE_LIST_TEST_CASE("activemask", "[warp]", alpaka::test::TestAccs) auto const threadElementExtent = alpaka::Vec::ones(); auto workDiv = typename ExecutionFixture::WorkDiv{gridBlockExtent, blockThreadExtent, threadElementExtent}; auto fixture = ExecutionFixture{workDiv}; - ActivemaskMultipleThreadWarpTestKernel kernel; - for(auto inactiveThreadIdx = 0u; inactiveThreadIdx < warpExtent; inactiveThreadIdx++) - REQUIRE(fixture(kernel, inactiveThreadIdx)); + if(warpExtent == 8) + { + for(auto inactiveThreadIdx = 0u; inactiveThreadIdx < warpExtent; inactiveThreadIdx++) + { + CHECK(fixture(ActivemaskMultipleThreadWarpTestKernel<8>{}, inactiveThreadIdx)); + } + } + else if(warpExtent == 16) + { + for(auto inactiveThreadIdx = 0u; inactiveThreadIdx < warpExtent; inactiveThreadIdx++) + { + CHECK(fixture(ActivemaskMultipleThreadWarpTestKernel<16>{}, inactiveThreadIdx)); + } + } + else if(warpExtent == 32) + { + for(auto inactiveThreadIdx = 0u; inactiveThreadIdx < warpExtent; inactiveThreadIdx++) + { + CHECK(fixture(ActivemaskMultipleThreadWarpTestKernel<32>{}, inactiveThreadIdx)); + } + } } } } diff --git a/test/unit/warp/src/All.cpp b/test/unit/warp/src/All.cpp index 4ea233e3179b..27468cd8803c 100644 --- a/test/unit/warp/src/All.cpp +++ b/test/unit/warp/src/All.cpp @@ -1,4 +1,4 @@ -/* Copyright 2023 Sergei Bastrakov, Bernhard Manfred Gruber, Jan Stephan +/* Copyright 2023 Sergei Bastrakov, Bernhard Manfred Gruber, Jan Stephan, Andrea Bocci * SPDX-License-Identifier: MPL-2.0 */ @@ -25,6 +25,7 @@ struct AllSingleThreadWarpTestKernel } }; +template struct AllMultipleThreadWarpTestKernel { ALPAKA_NO_HOST_ACC_WARNING @@ -57,6 +58,15 @@ struct AllMultipleThreadWarpTestKernel } }; +namespace alpaka::trait +{ + template + struct WarpSize, TAcc> + { + static constexpr std::uint32_t warp_size = TWarpSize; + }; +} // namespace alpaka::trait + TEMPLATE_LIST_TEST_CASE("all", "[warp]", alpaka::test::TestAccs) { using Acc = TestType; @@ -84,8 +94,18 @@ TEMPLATE_LIST_TEST_CASE("all", "[warp]", alpaka::test::TestAccs) auto const threadElementExtent = alpaka::Vec::ones(); auto workDiv = typename ExecutionFixture::WorkDiv{gridBlockExtent, blockThreadExtent, threadElementExtent}; auto fixture = ExecutionFixture{workDiv}; - AllMultipleThreadWarpTestKernel kernel; - REQUIRE(fixture(kernel)); + if(warpExtent == 8) + { + REQUIRE(fixture(AllMultipleThreadWarpTestKernel<8>{})); + } + else if(warpExtent == 16) + { + REQUIRE(fixture(AllMultipleThreadWarpTestKernel<16>{})); + } + else if(warpExtent == 32) + { + REQUIRE(fixture(AllMultipleThreadWarpTestKernel<32>{})); + } } } } diff --git a/test/unit/warp/src/Any.cpp b/test/unit/warp/src/Any.cpp index 72b247ecfe63..a00f303fc0fa 100644 --- a/test/unit/warp/src/Any.cpp +++ b/test/unit/warp/src/Any.cpp @@ -1,4 +1,4 @@ -/* Copyright 2023 Sergei Bastrakov, Bernhard Manfred Gruber, Jan Stephan, Aurora Perego +/* Copyright 2023 Sergei Bastrakov, Bernhard Manfred Gruber, Jan Stephan, Andrea Bocci, Aurora Perego * SPDX-License-Identifier: MPL-2.0 */ @@ -25,6 +25,7 @@ struct AnySingleThreadWarpTestKernel } }; +template struct AnyMultipleThreadWarpTestKernel { ALPAKA_NO_HOST_ACC_WARNING @@ -57,6 +58,15 @@ struct AnyMultipleThreadWarpTestKernel } }; +namespace alpaka::trait +{ + template + struct WarpSize, TAcc> + { + static constexpr std::uint32_t warp_size = TWarpSize; + }; +} // namespace alpaka::trait + TEMPLATE_LIST_TEST_CASE("any", "[warp]", alpaka::test::TestAccs) { using Acc = TestType; @@ -84,8 +94,18 @@ TEMPLATE_LIST_TEST_CASE("any", "[warp]", alpaka::test::TestAccs) auto const threadElementExtent = alpaka::Vec::ones(); auto workDiv = typename ExecutionFixture::WorkDiv{gridBlockExtent, blockThreadExtent, threadElementExtent}; auto fixture = ExecutionFixture{workDiv}; - AnyMultipleThreadWarpTestKernel kernel; - REQUIRE(fixture(kernel)); + if(warpExtent == 8) + { + REQUIRE(fixture(AnyMultipleThreadWarpTestKernel<8>{})); + } + else if(warpExtent == 16) + { + REQUIRE(fixture(AnyMultipleThreadWarpTestKernel<16>{})); + } + else if(warpExtent == 32) + { + REQUIRE(fixture(AnyMultipleThreadWarpTestKernel<32>{})); + } } } } diff --git a/test/unit/warp/src/Ballot.cpp b/test/unit/warp/src/Ballot.cpp index cf5b59d79445..edf8c59cd575 100644 --- a/test/unit/warp/src/Ballot.cpp +++ b/test/unit/warp/src/Ballot.cpp @@ -1,4 +1,4 @@ -/* Copyright 2023 Sergei Bastrakov, Bernhard Manfred Gruber, Jan Stephan +/* Copyright 2023 Sergei Bastrakov, Bernhard Manfred Gruber, Jan Stephan, Andrea Bocci * SPDX-License-Identifier: MPL-2.0 */ @@ -26,6 +26,7 @@ struct BallotSingleThreadWarpTestKernel } }; +template struct BallotMultipleThreadWarpTestKernel { ALPAKA_NO_HOST_ACC_WARNING @@ -65,6 +66,15 @@ struct BallotMultipleThreadWarpTestKernel } }; +namespace alpaka::trait +{ + template + struct WarpSize, TAcc> + { + static constexpr std::uint32_t warp_size = TWarpSize; + }; +} // namespace alpaka::trait + TEMPLATE_LIST_TEST_CASE("ballot", "[warp]", alpaka::test::TestAccs) { using Acc = TestType; @@ -92,8 +102,18 @@ TEMPLATE_LIST_TEST_CASE("ballot", "[warp]", alpaka::test::TestAccs) auto const threadElementExtent = alpaka::Vec::ones(); auto workDiv = typename ExecutionFixture::WorkDiv{gridBlockExtent, blockThreadExtent, threadElementExtent}; auto fixture = ExecutionFixture{workDiv}; - BallotMultipleThreadWarpTestKernel kernel; - REQUIRE(fixture(kernel)); + if(warpExtent == 8) + { + REQUIRE(fixture(BallotMultipleThreadWarpTestKernel<8>{})); + } + else if(warpExtent == 16) + { + REQUIRE(fixture(BallotMultipleThreadWarpTestKernel<16>{})); + } + else if(warpExtent == 32) + { + REQUIRE(fixture(BallotMultipleThreadWarpTestKernel<32>{})); + } } } } diff --git a/test/unit/warp/src/Shfl.cpp b/test/unit/warp/src/Shfl.cpp index 63f5b81a5f77..d9ec5d59ee5d 100644 --- a/test/unit/warp/src/Shfl.cpp +++ b/test/unit/warp/src/Shfl.cpp @@ -1,4 +1,4 @@ -/* Copyright 2023 David M. Rogers, Jan Stephan +/* Copyright 2023 David M. Rogers, Jan Stephan, Andrea Bocci * SPDX-License-Identifier: MPL-2.0 */ @@ -35,6 +35,7 @@ struct ShflSingleThreadWarpTestKernel } }; +template struct ShflMultipleThreadWarpTestKernel { ALPAKA_NO_HOST_ACC_WARNING @@ -87,6 +88,15 @@ struct ShflMultipleThreadWarpTestKernel } }; +namespace alpaka::trait +{ + template + struct WarpSize, TAcc> + { + static constexpr std::uint32_t warp_size = TWarpSize; + }; +} // namespace alpaka::trait + TEMPLATE_LIST_TEST_CASE("shfl", "[warp]", alpaka::test::TestAccs) { using Acc = TestType; @@ -114,8 +124,18 @@ TEMPLATE_LIST_TEST_CASE("shfl", "[warp]", alpaka::test::TestAccs) auto const threadElementExtent = alpaka::Vec::ones(); auto workDiv = typename ExecutionFixture::WorkDiv{gridBlockExtent, blockThreadExtent, threadElementExtent}; auto fixture = ExecutionFixture{workDiv}; - ShflMultipleThreadWarpTestKernel kernel; - REQUIRE(fixture(kernel)); + if(warpExtent == 8) + { + REQUIRE(fixture(ShflMultipleThreadWarpTestKernel<8>{})); + } + else if(warpExtent == 16) + { + REQUIRE(fixture(ShflMultipleThreadWarpTestKernel<16>{})); + } + else if(warpExtent == 32) + { + REQUIRE(fixture(ShflMultipleThreadWarpTestKernel<32>{})); + } } } } From b26b05a34b222ba1e08bbc178cf10e34200f9aff Mon Sep 17 00:00:00 2001 From: AuroraPerego Date: Wed, 5 Jul 2023 09:13:58 +0200 Subject: [PATCH 08/10] Support compile-time warp size in SYCL kernels Introduce a new optional trait to describe at compile time the warp size that a kernel should use. The default behaviour is to let the back-end compiler pick the preferred size. Before launching a kernel with a compile-time sub-group size the user should query the sizes supported by the device, and choose accordingly. If the device does not support the requested size, the SYCL runtime will throw a synchronous exception. During just-in-time (JIT) compilation this guarantees that a kernel is compiled only for the sizes supported by the device. During ahead-of-time (AOT) compilation this is not enough, because the device is not known at compile time. The SYCL specification mandates that the back-end compilers should not fail if a kernel uses unsupported features, like unsupported sub-group sizes. Unfortunately the Intel OpenCL CPU and GPU compilers currently fail with a hard error. To work around this limitation, use the preprocessor macros defined when compiling AOT for the new SYCL targets to enable the compilation only for the sub-group sizes supported by each device. Note: while the CPU OpenCL back-end does support a sub-group size of 64, the SYCL code currently does not. To avoid issues with the sub-group primitives always consider the sub-group size of 64 as not supported by the device. Other changes: - remove the use of SYCL streams in favour of the printf() extension; - remove the ALPAKA_FN_HOST attribute; - fix the GetSize test for the different sub-group sizes; - fix the use of sycl::exceptions; - use different member names for nd_item in different classes, to avoid ambiguous name lookup error when accessing the nd_item in the accelerator object. --- cmake/alpakaCommon.cmake | 12 +- include/alpaka/acc/AccGenericSycl.hpp | 28 --- include/alpaka/core/Sycl.hpp | 15 -- include/alpaka/dev/DevGenericSycl.hpp | 9 +- include/alpaka/idx/bt/IdxBtGenericSycl.hpp | 18 +- include/alpaka/idx/gb/IdxGbGenericSycl.hpp | 18 +- include/alpaka/kernel/SyclSubgroupSize.hpp | 97 ++++++++ .../alpaka/kernel/TaskKernelGenericSycl.hpp | 218 ++++++++++++------ include/alpaka/kernel/Traits.hpp | 8 +- include/alpaka/mem/buf/BufGenericSycl.hpp | 37 ++- include/alpaka/pltf/PltfGenericSycl.hpp | 3 +- include/alpaka/rand/RandGenericSycl.hpp | 120 +++------- include/alpaka/warp/WarpGenericSycl.hpp | 34 +-- include/alpaka/workdiv/WorkDivGenericSycl.hpp | 30 +-- test/unit/warp/src/Activemask.cpp | 29 ++- test/unit/warp/src/All.cpp | 23 +- test/unit/warp/src/Any.cpp | 21 +- test/unit/warp/src/Ballot.cpp | 23 +- test/unit/warp/src/GetSize.cpp | 30 ++- test/unit/warp/src/Shfl.cpp | 23 +- 20 files changed, 471 insertions(+), 325 deletions(-) create mode 100644 include/alpaka/kernel/SyclSubgroupSize.hpp diff --git a/cmake/alpakaCommon.cmake b/cmake/alpakaCommon.cmake index e4ec50a8cd2b..27776497138d 100644 --- a/cmake/alpakaCommon.cmake +++ b/cmake/alpakaCommon.cmake @@ -1,5 +1,5 @@ # -# Copyright 2023 Benjamin Worpitz, Erik Zenker, Axel Hübl, Jan Stephan, René Widera, Jeffrey Kelling, Andrea Bocci, Bernhard Manfred Gruber +# Copyright 2023 Benjamin Worpitz, Erik Zenker, Axel Hübl, Jan Stephan, René Widera, Jeffrey Kelling, Andrea Bocci, Bernhard Manfred Gruber, Aurora Perego # SPDX-License-Identifier: MPL-2.0 # @@ -547,9 +547,6 @@ if(alpaka_ACC_SYCL_ENABLE) # Enable device-side printing to stdout cmake_dependent_option(alpaka_SYCL_ENABLE_IOSTREAM "Enable device-side printing to stdout" OFF "alpaka_ACC_SYCL_ENABLE" OFF) - if(BUILD_TESTING) - set(alpaka_SYCL_ENABLE_IOSTREAM ON CACHE BOOL "Enable device-side printing to stdout" FORCE) - endif() alpaka_set_compiler_options(HOST_DEVICE target alpaka "-fsycl") target_link_options(alpaka INTERFACE "-fsycl") @@ -559,7 +556,7 @@ if(alpaka_ACC_SYCL_ENABLE) # Determine SYCL targets set(alpaka_SYCL_ONEAPI_CPU_TARGET "spir64_x86_64") set(alpaka_SYCL_ONEAPI_FPGA_TARGET "spir64_fpga") - set(alpaka_SYCL_ONEAPI_GPU_TARGET "spir64_gen") + set(alpaka_SYCL_ONEAPI_GPU_TARGET ${alpaka_SYCL_ONEAPI_GPU_DEVICES}) if(alpaka_SYCL_ONEAPI_CPU) list(APPEND alpaka_SYCL_TARGETS ${alpaka_SYCL_ONEAPI_CPU_TARGET}) @@ -620,14 +617,13 @@ if(alpaka_ACC_SYCL_ENABLE) if(alpaka_SYCL_ONEAPI_GPU) # Create a drop-down list (in cmake-gui) of valid Intel GPU targets. On the command line the user can specifiy # additional targets, such as ranges: "Gen8-Gen12LP" or lists: "icllp;skl". - set(alpaka_SYCL_ONEAPI_GPU_DEVICES "bdw" CACHE STRING "Intel GPU devices / generations to compile for") + set(alpaka_SYCL_ONEAPI_GPU_DEVICES "intel_gpu_pvc" CACHE STRING "Intel GPU devices / generations to compile for") set_property(CACHE alpaka_SYCL_ONEAPI_GPU_DEVICES - PROPERTY STRINGS "bdw;skl;kbl;cfl;bxt;glk;whl;aml;cml;icllp;lkf;ehl;tgllp;rkl;adl-s;adl-p;dg1;acm-g10;ats-m150;dg2-g10;acm-g11;ats-m75;dg2-g11;acm-g12;dg2-g12;pvc-sdv;pvc;gen11;gen12lp;gen8;gen9;xe;xe-hpc;xe-hpg") + PROPERTY STRINGS "intel_gpu_pvc;intel_gpu_acm_g12;intel_gpu_acm_g11;intel_gpu_acm_g10;intel_gpu_dg1;intel_gpu_adl_n;intel_gpu_adl_p;intel_gpu_rpl_s;intel_gpu_adl_s;intel_gpu_rkl;intel_gpu_tgllp;intel_gpu_icllp;intel_gpu_cml;intel_gpu_aml;intel_gpu_whl;intel_gpu_glk;intel_gpu_apl;intel_gpu_cfl;intel_gpu_kbl;intel_gpu_skl;intel_gpu_bdw") # If the user has given us a list turn all ';' into ',' to pacify the Intel OpenCL compiler. string(REPLACE ";" "," alpaka_SYCL_ONEAPI_GPU_DEVICES "${alpaka_SYCL_ONEAPI_GPU_DEVICES}") target_compile_definitions(alpaka INTERFACE "ALPAKA_SYCL_ONEAPI_GPU") - target_link_options(alpaka INTERFACE "SHELL:-Xsycl-target-backend=${alpaka_SYCL_ONEAPI_GPU_TARGET} \"-device ${alpaka_SYCL_ONEAPI_GPU_DEVICES}\"") endif() #----------------------------------------------------------------------------------------------------------------- diff --git a/include/alpaka/acc/AccGenericSycl.hpp b/include/alpaka/acc/AccGenericSycl.hpp index 76a735459940..ea9560e23b38 100644 --- a/include/alpaka/acc/AccGenericSycl.hpp +++ b/include/alpaka/acc/AccGenericSycl.hpp @@ -68,33 +68,6 @@ namespace alpaka auto operator=(AccGenericSycl const&) -> AccGenericSycl& = delete; auto operator=(AccGenericSycl&&) -> AccGenericSycl& = delete; -# ifdef ALPAKA_SYCL_IOSTREAM_ENABLED - AccGenericSycl( - Vec const& threadElemExtent, - sycl::nd_item work_item, - sycl::local_accessor dyn_shared_acc, - sycl::local_accessor st_shared_acc, - sycl::accessor global_fence_dummy, - sycl::local_accessor local_fence_dummy, - sycl::stream output_stream) - : WorkDivGenericSycl{threadElemExtent, work_item} - , gb::IdxGbGenericSycl{work_item} - , bt::IdxBtGenericSycl{work_item} - , AtomicHierarchy{} - , math::MathGenericSycl{} - , BlockSharedMemDynGenericSycl{dyn_shared_acc} - , BlockSharedMemStGenericSycl{st_shared_acc} - , BlockSyncGenericSycl{work_item} - , IntrinsicGenericSycl{} - , MemFenceGenericSycl{global_fence_dummy, local_fence_dummy} - , rand::RandGenericSycl{work_item} - , warp::WarpGenericSycl{work_item} - , cout{output_stream} - { - } - - sycl::stream cout; -# else AccGenericSycl( Vec const& threadElemExtent, sycl::nd_item work_item, @@ -116,7 +89,6 @@ namespace alpaka , warp::WarpGenericSycl{work_item} { } -# endif }; } // namespace alpaka diff --git a/include/alpaka/core/Sycl.hpp b/include/alpaka/core/Sycl.hpp index 409f9951a48b..8ba75ef6ecda 100644 --- a/include/alpaka/core/Sycl.hpp +++ b/include/alpaka/core/Sycl.hpp @@ -80,7 +80,6 @@ namespace alpaka // 2 component vector types sycl::char2, - sycl::schar2, sycl::uchar2, sycl::short2, sycl::ushort2, @@ -88,15 +87,12 @@ namespace alpaka sycl::uint2, sycl::long2, sycl::ulong2, - sycl::longlong2, - sycl::ulonglong2, sycl::float2, sycl::double2, sycl::half2, // 3 component vector types sycl::char3, - sycl::schar3, sycl::uchar3, sycl::short3, sycl::ushort3, @@ -104,15 +100,12 @@ namespace alpaka sycl::uint3, sycl::long3, sycl::ulong3, - sycl::longlong3, - sycl::ulonglong3, sycl::float3, sycl::double3, sycl::half3, // 4 component vector types sycl::char4, - sycl::schar4, sycl::uchar4, sycl::short4, sycl::ushort4, @@ -120,15 +113,12 @@ namespace alpaka sycl::uint4, sycl::long4, sycl::ulong4, - sycl::longlong4, - sycl::ulonglong4, sycl::float4, sycl::double4, sycl::half4, // 8 component vector types sycl::char8, - sycl::schar8, sycl::uchar8, sycl::short8, sycl::ushort8, @@ -136,15 +126,12 @@ namespace alpaka sycl::uint8, sycl::long8, sycl::ulong8, - sycl::longlong8, - sycl::ulonglong8, sycl::float8, sycl::double8, sycl::half8, // 16 component vector types sycl::char16, - sycl::schar16, sycl::uchar16, sycl::short16, sycl::ushort16, @@ -152,8 +139,6 @@ namespace alpaka sycl::uint16, sycl::long16, sycl::ulong16, - sycl::longlong16, - sycl::ulonglong16, sycl::float16, sycl::double16, sycl::half16> diff --git a/include/alpaka/dev/DevGenericSycl.hpp b/include/alpaka/dev/DevGenericSycl.hpp index aa5027d23513..0a6875a77ba6 100644 --- a/include/alpaka/dev/DevGenericSycl.hpp +++ b/include/alpaka/dev/DevGenericSycl.hpp @@ -1,4 +1,4 @@ -/* Copyright 2022 Jan Stephan, Antonio Di Pilato, Luca Ferragina +/* Copyright 2023 Jan Stephan, Antonio Di Pilato, Luca Ferragina, Aurora Perego * SPDX-License-Identifier: MPL-2.0 */ @@ -176,7 +176,12 @@ namespace alpaka::trait static auto getWarpSizes(DevGenericSycl const& dev) -> std::vector { auto const device = dev.getNativeHandle().first; - return device.template get_info(); + std::vector warp_sizes = device.template get_info(); + // The CPU runtime supports a sub-group size of 64, but the SYCL implementation currently does not + auto find64 = std::find(warp_sizes.begin(), warp_sizes.end(), 64); + if(find64 != warp_sizes.end()) + warp_sizes.erase(find64); + return warp_sizes; } }; diff --git a/include/alpaka/idx/bt/IdxBtGenericSycl.hpp b/include/alpaka/idx/bt/IdxBtGenericSycl.hpp index 88628d343c18..9d68deb826a6 100644 --- a/include/alpaka/idx/bt/IdxBtGenericSycl.hpp +++ b/include/alpaka/idx/bt/IdxBtGenericSycl.hpp @@ -1,4 +1,4 @@ -/* Copyright 2022 Jan Stephan +/* Copyright 2023 Jan Stephan, Aurora Perego * SPDX-License-Identifier: MPL-2.0 */ @@ -23,11 +23,11 @@ namespace alpaka::bt public: using IdxBtBase = IdxBtGenericSycl; - explicit IdxBtGenericSycl(sycl::nd_item work_item) : my_item{work_item} + explicit IdxBtGenericSycl(sycl::nd_item work_item) : m_item_bt{work_item} { } - sycl::nd_item my_item; + sycl::nd_item m_item_bt; }; } // namespace alpaka::bt @@ -49,19 +49,19 @@ namespace alpaka::trait static auto getIdx(bt::IdxBtGenericSycl const& idx, TWorkDiv const&) -> Vec { if constexpr(TDim::value == 1) - return Vec{static_cast(idx.my_item.get_local_id(0))}; + return Vec{static_cast(idx.m_item_bt.get_local_id(0))}; else if constexpr(TDim::value == 2) { return Vec{ - static_cast(idx.my_item.get_local_id(1)), - static_cast(idx.my_item.get_local_id(0))}; + static_cast(idx.m_item_bt.get_local_id(1)), + static_cast(idx.m_item_bt.get_local_id(0))}; } else { return Vec{ - static_cast(idx.my_item.get_local_id(2)), - static_cast(idx.my_item.get_local_id(1)), - static_cast(idx.my_item.get_local_id(0))}; + static_cast(idx.m_item_bt.get_local_id(2)), + static_cast(idx.m_item_bt.get_local_id(1)), + static_cast(idx.m_item_bt.get_local_id(0))}; } } }; diff --git a/include/alpaka/idx/gb/IdxGbGenericSycl.hpp b/include/alpaka/idx/gb/IdxGbGenericSycl.hpp index 8fa8a5bb0f35..e7500c0f6144 100644 --- a/include/alpaka/idx/gb/IdxGbGenericSycl.hpp +++ b/include/alpaka/idx/gb/IdxGbGenericSycl.hpp @@ -1,4 +1,4 @@ -/* Copyright 2022 Jan Stephan +/* Copyright 2023 Jan Stephan, Aurora Perego * SPDX-License-Identifier: MPL-2.0 */ @@ -23,11 +23,11 @@ namespace alpaka::gb public: using IdxGbBase = IdxGbGenericSycl; - explicit IdxGbGenericSycl(sycl::nd_item work_item) : my_item{work_item} + explicit IdxGbGenericSycl(sycl::nd_item work_item) : m_item_gb{work_item} { } - sycl::nd_item my_item; + sycl::nd_item m_item_gb; }; } // namespace alpaka::gb @@ -49,19 +49,19 @@ namespace alpaka::trait static auto getIdx(gb::IdxGbGenericSycl const& idx, TWorkDiv const&) { if constexpr(TDim::value == 1) - return Vec(static_cast(idx.my_item.get_group(0))); + return Vec(static_cast(idx.m_item_gb.get_group(0))); else if constexpr(TDim::value == 2) { return Vec( - static_cast(idx.my_item.get_group(1)), - static_cast(idx.my_item.get_group(0))); + static_cast(idx.m_item_gb.get_group(1)), + static_cast(idx.m_item_gb.get_group(0))); } else { return Vec( - static_cast(idx.my_item.get_group(2)), - static_cast(idx.my_item.get_group(1)), - static_cast(idx.my_item.get_group(0))); + static_cast(idx.m_item_gb.get_group(2)), + static_cast(idx.m_item_gb.get_group(1)), + static_cast(idx.m_item_gb.get_group(0))); } } }; diff --git a/include/alpaka/kernel/SyclSubgroupSize.hpp b/include/alpaka/kernel/SyclSubgroupSize.hpp new file mode 100644 index 000000000000..b56b652c7b9e --- /dev/null +++ b/include/alpaka/kernel/SyclSubgroupSize.hpp @@ -0,0 +1,97 @@ +/* Copyright 2023 Andrea Bocci, Aurora Perego + * SPDX-License-Identifier: MPL-2.0 + */ + +#ifdef ALPAKA_ACC_SYCL_ENABLED + +# ifdef __SYCL_DEVICE_ONLY__ + +# if defined(__SYCL_TARGET_INTEL_GPU_BDW__) || /* Broadwell Intel graphics architecture */ \ + defined(__SYCL_TARGET_INTEL_GPU_SKL__) || /* Skylake Intel graphics architecture */ \ + defined(__SYCL_TARGET_INTEL_GPU_KBL__) || /* Kaby Lake Intel graphics architecture */ \ + defined(__SYCL_TARGET_INTEL_GPU_CFL__) || /* Coffee Lake Intel graphics architecture */ \ + defined(__SYCL_TARGET_INTEL_GPU_APL__) || /* Apollo Lake Intel graphics architecture */ \ + defined(__SYCL_TARGET_INTEL_GPU_GLK__) || /* Gemini Lake Intel graphics architecture */ \ + defined(__SYCL_TARGET_INTEL_GPU_WHL__) || /* Whiskey Lake Intel graphics architecture */ \ + defined(__SYCL_TARGET_INTEL_GPU_AML__) || /* Amber Lake Intel graphics architecture */ \ + defined(__SYCL_TARGET_INTEL_GPU_CML__) || /* Comet Lake Intel graphics architecture */ \ + defined(__SYCL_TARGET_INTEL_GPU_ICLLP__) || /* Ice Lake Intel graphics architecture */ \ + defined(__SYCL_TARGET_INTEL_GPU_TGLLP__) || /* Tiger Lake Intel graphics architecture */ \ + defined(__SYCL_TARGET_INTEL_GPU_RKL__) || /* Rocket Lake Intel graphics architecture */ \ + defined(__SYCL_TARGET_INTEL_GPU_ADL_S__) || /* Alder Lake S Intel graphics architecture */ \ + defined(__SYCL_TARGET_INTEL_GPU_RPL_S__) || /* Raptor Lake Intel graphics architecture */ \ + defined(__SYCL_TARGET_INTEL_GPU_ADL_P__) || /* Alder Lake P Intel graphics architecture */ \ + defined(__SYCL_TARGET_INTEL_GPU_ADL_N__) || /* Alder Lake N Intel graphics architecture */ \ + defined(__SYCL_TARGET_INTEL_GPU_DG1__) || /* DG1 Intel graphics architecture */ \ + defined(__SYCL_TARGET_INTEL_GPU_ACM_G10__) || /* Alchemist G10 Intel graphics architecture */ \ + defined(__SYCL_TARGET_INTEL_GPU_ACM_G11__) || /* Alchemist G11 Intel graphics architecture */ \ + defined(__SYCL_TARGET_INTEL_GPU_ACM_G12__) /* Alchemist G12 Intel graphics architecture */ + +# define SYCL_SUBGROUP_SIZE (8 | 16 | 32) + +# elif defined(__SYCL_TARGET_INTEL_GPU_PVC__) /* Ponte Vecchio Intel graphics architecture */ + +# define SYCL_SUBGROUP_SIZE (16 | 32) + +# elif defined(__SYCL_TARGET_INTEL_X86_64__) /* generate code ahead of time for x86_64 CPUs */ + +# define SYCL_SUBGROUP_SIZE (4 | 8 | 16 | 32 | 64) + +# elif defined(__SYCL_TARGET_NVIDIA_GPU_SM_50__) || /* NVIDIA Maxwell architecture (compute capability 5.0) */ \ + defined(__SYCL_TARGET_NVIDIA_GPU_SM_52__) || /* NVIDIA Maxwell architecture (compute capability 5.2) */ \ + defined(__SYCL_TARGET_NVIDIA_GPU_SM_53__) || /* NVIDIA Jetson TX1 / Nano (compute capability 5.3) */ \ + defined(__SYCL_TARGET_NVIDIA_GPU_SM_60__) || /* NVIDIA Pascal architecture (compute capability 6.0) */ \ + defined(__SYCL_TARGET_NVIDIA_GPU_SM_61__) || /* NVIDIA Pascal architecture (compute capability 6.1) */ \ + defined(__SYCL_TARGET_NVIDIA_GPU_SM_62__) || /* NVIDIA Jetson TX2 (compute capability 6.2) */ \ + defined(__SYCL_TARGET_NVIDIA_GPU_SM_70__) || /* NVIDIA Volta architecture (compute capability 7.0) */ \ + defined(__SYCL_TARGET_NVIDIA_GPU_SM_72__) || /* NVIDIA Jetson AGX (compute capability 7.2) */ \ + defined(__SYCL_TARGET_NVIDIA_GPU_SM_75__) || /* NVIDIA Turing architecture (compute capability 7.5) */ \ + defined(__SYCL_TARGET_NVIDIA_GPU_SM_80__) || /* NVIDIA Ampere architecture (compute capability 8.0) */ \ + defined(__SYCL_TARGET_NVIDIA_GPU_SM_86__) || /* NVIDIA Ampere architecture (compute capability 8.6) */ \ + defined(__SYCL_TARGET_NVIDIA_GPU_SM_87__) || /* NVIDIA Jetson/Drive AGX Orin (compute capability 8.7) */ \ + defined(__SYCL_TARGET_NVIDIA_GPU_SM_89__) || /* NVIDIA Ada Lovelace arch. (compute capability 8.9) */ \ + defined(__SYCL_TARGET_NVIDIA_GPU_SM_90__) /* NVIDIA Hopper architecture (compute capability 9.0) */ + +# define SYCL_SUBGROUP_SIZE (32) + +# elif defined(__SYCL_TARGET_AMD_GPU_GFX700__) || /* AMD GCN 2.0 Sea Islands architecture (gfx 7.0) */ \ + defined(__SYCL_TARGET_AMD_GPU_GFX701__) || /* AMD GCN 2.0 Sea Islands architecture (gfx 7.0) */ \ + defined(__SYCL_TARGET_AMD_GPU_GFX702__) || /* AMD GCN 2.0 Sea Islands architecture (gfx 7.0) */ \ + defined(__SYCL_TARGET_AMD_GPU_GFX801__) || /* AMD GCN 3.0 Volcanic Islands architecture (gfx 8.0) */ \ + defined(__SYCL_TARGET_AMD_GPU_GFX802__) || /* AMD GCN 3.0 Volcanic Islands architecture (gfx 8.0) */ \ + defined(__SYCL_TARGET_AMD_GPU_GFX803__) || /* AMD GCN 4.0 Arctic Islands architecture (gfx 8.0) */ \ + defined(__SYCL_TARGET_AMD_GPU_GFX805__) || /* AMD GCN 3.0 Volcanic Islands architecture (gfx 8.0) */ \ + defined(__SYCL_TARGET_AMD_GPU_GFX810__) || /* AMD GCN 3.0 Volcanic Islands architecture (gfx 8.1) */ \ + defined(__SYCL_TARGET_AMD_GPU_GFX900__) || /* AMD GCN 5.0 Vega architecture (gfx 9.0) */ \ + defined(__SYCL_TARGET_AMD_GPU_GFX902__) || /* AMD GCN 5.0 Vega architecture (gfx 9.0) */ \ + defined(__SYCL_TARGET_AMD_GPU_GFX904__) || /* AMD GCN 5.0 Vega architecture (gfx 9.0) */ \ + defined(__SYCL_TARGET_AMD_GPU_GFX906__) || /* AMD GCN 5.1 Vega II architecture (gfx 9.0) */ \ + defined(__SYCL_TARGET_AMD_GPU_GFX908__) || /* AMD CDNA 1.0 Arcturus architecture (gfx 9.0) */ \ + defined(__SYCL_TARGET_AMD_GPU_GFX90A__) /* AMD CDNA 2.0 Aldebaran architecture (gfx 9.0) */ + +# define SYCL_SUBGROUP_SIZE (64) + +# elif defined(__SYCL_TARGET_AMD_GPU_GFX1010__) || /* AMD RDNA 1.0 Navi 10 architecture (gfx 10.1) */ \ + defined(__SYCL_TARGET_AMD_GPU_GFX1011__) || /* AMD RDNA 1.0 Navi 12 architecture (gfx 10.1) */ \ + defined(__SYCL_TARGET_AMD_GPU_GFX1012__) || /* AMD RDNA 1.0 Navi 14 architecture (gfx 10.1) */ \ + defined(__SYCL_TARGET_AMD_GPU_GFX1013__) || /* AMD RDNA 2.0 Oberon architecture (gfx 10.1) */ \ + defined(__SYCL_TARGET_AMD_GPU_GFX1030__) || /* AMD RDNA 2.0 Navi 21 architecture (gfx 10.3) */ \ + defined(__SYCL_TARGET_AMD_GPU_GFX1031__) || /* AMD RDNA 2.0 Navi 22 architecture (gfx 10.3) */ \ + defined(__SYCL_TARGET_AMD_GPU_GFX1032__) || /* AMD RDNA 2.0 Navi 23 architecture (gfx 10.3) */ \ + defined(__SYCL_TARGET_AMD_GPU_GFX1034__) /* AMD RDNA 2.0 Navi 24 architecture (gfx 10.3) */ + +# define SYCL_SUBGROUP_SIZE (32 | 64) + +# else // __SYCL_TARGET_* + +# define SYCL_SUBGROUP_SIZE (0) /* unknown target */ + +# endif // __SYCL_TARGET_* + +# else + +# define SYCL_SUBGROUP_SIZE (0) /* host compilation */ + +# endif // __SYCL_DEVICE_ONLY__ + +#endif // ALPAKA_ACC_SYCL_ENABLED diff --git a/include/alpaka/kernel/TaskKernelGenericSycl.hpp b/include/alpaka/kernel/TaskKernelGenericSycl.hpp index a173b42a0437..289a6f72c819 100644 --- a/include/alpaka/kernel/TaskKernelGenericSycl.hpp +++ b/include/alpaka/kernel/TaskKernelGenericSycl.hpp @@ -1,4 +1,4 @@ -/* Copyright 2023 Jan Stephan, Andrea Bocci, Luca Ferragina +/* Copyright 2023 Jan Stephan, Andrea Bocci, Luca Ferragina, Aurora Perego * SPDX-License-Identifier: MPL-2.0 */ @@ -13,6 +13,7 @@ #include "alpaka/dev/Traits.hpp" #include "alpaka/dim/Traits.hpp" #include "alpaka/idx/Traits.hpp" +#include "alpaka/kernel/SyclSubgroupSize.hpp" #include "alpaka/kernel/Traits.hpp" #include "alpaka/mem/buf/sycl/Accessor.hpp" #include "alpaka/pltf/Traits.hpp" @@ -29,8 +30,72 @@ #ifdef ALPAKA_ACC_SYCL_ENABLED +# if BOOST_COMP_CLANG +# pragma clang diagnostic push +# pragma clang diagnostic ignored "-Wunused-lambda-capture" +# pragma clang diagnostic ignored "-Wunused-parameter" +# endif + # include +# define LAUNCH_SYCL_KERNEL_IF_SUBGROUP_SIZE_IS(sub_group_size) \ + cgh.parallel_for( \ + sycl::nd_range{global_size, local_size}, \ + [item_elements, \ + dyn_shared_accessor, \ + st_shared_accessor, \ + global_fence_dummy, \ + local_fence_dummy, \ + k_func, \ + k_args](sycl::nd_item work_item) [[intel::reqd_sub_group_size(sub_group_size)]] \ + { \ + auto acc = TAcc{ \ + item_elements, \ + work_item, \ + dyn_shared_accessor, \ + st_shared_accessor, \ + global_fence_dummy, \ + local_fence_dummy}; \ + core::apply( \ + [k_func, &acc](typename std::decay_t const&... args) { k_func(acc, args...); }, \ + k_args); \ + }); + +# define LAUNCH_SYCL_KERNEL_WITH_DEFAULT_SUBGROUP_SIZE \ + cgh.parallel_for( \ + sycl::nd_range{global_size, local_size}, \ + [item_elements, \ + dyn_shared_accessor, \ + st_shared_accessor, \ + global_fence_dummy, \ + local_fence_dummy, \ + k_func, \ + k_args](sycl::nd_item work_item) \ + { \ + auto acc = TAcc{ \ + item_elements, \ + work_item, \ + dyn_shared_accessor, \ + st_shared_accessor, \ + global_fence_dummy, \ + local_fence_dummy}; \ + core::apply( \ + [k_func, &acc](typename std::decay_t const&... args) { k_func(acc, args...); }, \ + k_args); \ + }); + +# define THROW_AND_LAUNCH_EMPTY_SYCL_KERNEL \ + throw sycl::exception(sycl::make_error_code(sycl::errc::kernel_not_supported)); \ + cgh.parallel_for( \ + sycl::nd_range{global_size, local_size}, \ + [item_elements, \ + dyn_shared_accessor, \ + st_shared_accessor, \ + global_fence_dummy, \ + local_fence_dummy, \ + k_func, \ + k_args](sycl::nd_item work_item) {}); + namespace alpaka::detail { template @@ -120,82 +185,86 @@ namespace alpaka auto k_func = m_kernelFnObj; auto k_args = m_args; -# ifdef ALPAKA_SYCL_IOSTREAM_ENABLED - // Set up device-side printing with (user-chosen value) KiB per block for the output buffer. - constexpr auto buf_size = std::size_t{ALPAKA_SYCL_IOSTREAM_KIB * 1024}; - auto buf_per_work_item = std::size_t{}; - if constexpr(TDim::value == 1) - buf_per_work_item = buf_size / static_cast(group_items[0]); - else if constexpr(TDim::value == 2) - buf_per_work_item = buf_size / static_cast(group_items[0] * group_items[1]); - else - buf_per_work_item - = buf_size / static_cast(group_items[0] * group_items[1] * group_items[2]); - - assert(buf_per_work_item > 0); + constexpr std::size_t sub_group_size = trait::warpSize; + bool supported = false; - auto output_stream = sycl::stream{buf_size, buf_per_work_item, cgh}; -# endif - if constexpr(trait::WarpSize::warp_size != 0) + if constexpr(sub_group_size == 0) { - // cgh.parallel_for>( //FIXME_ - cgh.parallel_for( - sycl::nd_range{global_size, local_size}, - [=](sycl::nd_item work_item) - [[intel::reqd_sub_group_size(trait::WarpSize::warp_size)]] - { -# ifdef ALPAKA_SYCL_IOSTREAM_ENABLED - auto acc = TAcc{ - item_elements, - work_item, - dyn_shared_accessor, - st_shared_accessor, - global_fence_dummy, - local_fence_dummy, - output_stream}; -# else - auto acc = TAcc{ - item_elements, - work_item, - dyn_shared_accessor, - st_shared_accessor, - global_fence_dummy, - local_fence_dummy}; -# endif - core::apply( - [k_func, &acc](typename std::decay_t const&... args) { k_func(acc, args...); }, - k_args); - }); + // no explicit subgroup size requirement + LAUNCH_SYCL_KERNEL_WITH_DEFAULT_SUBGROUP_SIZE + supported = true; } else - { // autodetect - // cgh.parallel_for>( //FIXME_ - cgh.parallel_for( - sycl::nd_range{global_size, local_size}, - [=](sycl::nd_item work_item) - { -# ifdef ALPAKA_SYCL_IOSTREAM_ENABLED - auto acc = TAcc{ - item_elements, - work_item, - dyn_shared_accessor, - st_shared_accessor, - global_fence_dummy, - local_fence_dummy, - output_stream}; + { +# if(SYCL_SUBGROUP_SIZE == 0) + // no explicit SYCL target, assume JIT compilation + LAUNCH_SYCL_KERNEL_IF_SUBGROUP_SIZE_IS(sub_group_size) + supported = true; # else - auto acc = TAcc{ - item_elements, - work_item, - dyn_shared_accessor, - st_shared_accessor, - global_fence_dummy, - local_fence_dummy}; + // check if the kernel should be launched with a subgroup size of 4 + if constexpr(sub_group_size == 4) + { +# if(SYCL_SUBGROUP_SIZE & 4) + LAUNCH_SYCL_KERNEL_IF_SUBGROUP_SIZE_IS(4) + supported = true; +# else + // empty kernel, required to keep SYCL happy + THROW_AND_LAUNCH_EMPTY_SYCL_KERNEL +# endif + } + + // check if the kernel should be launched with a subgroup size of 8 + if constexpr(sub_group_size == 8) + { +# if(SYCL_SUBGROUP_SIZE & 8) + LAUNCH_SYCL_KERNEL_IF_SUBGROUP_SIZE_IS(8) + supported = true; +# else + // empty kernel, required to keep SYCL happy + THROW_AND_LAUNCH_EMPTY_SYCL_KERNEL +# endif + } + + // check if the kernel should be launched with a subgroup size of 16 + if constexpr(sub_group_size == 16) + { +# if(SYCL_SUBGROUP_SIZE & 16) + LAUNCH_SYCL_KERNEL_IF_SUBGROUP_SIZE_IS(16) + supported = true; +# else + // empty kernel, required to keep SYCL happy + THROW_AND_LAUNCH_EMPTY_SYCL_KERNEL +# endif + } + + // check if the kernel should be launched with a subgroup size of 32 + if constexpr(sub_group_size == 32) + { +# if(SYCL_SUBGROUP_SIZE & 32) + LAUNCH_SYCL_KERNEL_IF_SUBGROUP_SIZE_IS(32) + supported = true; +# else + // empty kernel, required to keep SYCL happy + THROW_AND_LAUNCH_EMPTY_SYCL_KERNEL +# endif + } + + // check if the kernel should be launched with a subgroup size of 64 + if constexpr(sub_group_size == 64) + { +# if(SYCL_SUBGROUP_SIZE & 64) + LAUNCH_SYCL_KERNEL_IF_SUBGROUP_SIZE_IS(64) + supported = true; +# else + // empty kernel, required to keep SYCL happy + THROW_AND_LAUNCH_EMPTY_SYCL_KERNEL +# endif + } # endif - core::apply( - [k_func, &acc](typename std::decay_t const&... args) { k_func(acc, args...); }, - k_args); - }); + + // this subgroup size is not supported, raise an exception + if(not supported) + throw sycl::exception(sycl::make_error_code(sycl::errc::kernel_not_supported)); } } @@ -238,8 +307,13 @@ namespace alpaka TKernelFnObj m_kernelFnObj; core::Tuple...> m_args; }; + } // namespace alpaka +# if BOOST_COMP_CLANG +# pragma clang diagnostic pop +# endif + namespace alpaka::trait { //! The SYCL execution task accelerator type trait specialization. @@ -278,4 +352,6 @@ namespace alpaka::trait }; } // namespace alpaka::trait +# undef LAUNCH_SYCL_KERNEL_IF_SUBGROUP_SIZE_IS + #endif diff --git a/include/alpaka/kernel/Traits.hpp b/include/alpaka/kernel/Traits.hpp index a0c8d17a89e2..384b82873759 100644 --- a/include/alpaka/kernel/Traits.hpp +++ b/include/alpaka/kernel/Traits.hpp @@ -1,5 +1,5 @@ /* Copyright 2023 Axel Huebl, Benjamin Worpitz, René Widera, Sergei Bastrakov, Jan Stephan, Bernhard Manfred Gruber, - * Andrea Bocci + * Andrea Bocci, Aurora Perego * SPDX-License-Identifier: MPL-2.0 */ @@ -76,11 +76,13 @@ namespace alpaka //! //! The default implementation returns 0, which lets the accelerator compiler and runtime choose the warp size. template - struct WarpSize + struct WarpSize : std::integral_constant { - static constexpr std::uint32_t warp_size = 0u; }; + //! This is a shortcut for the trait defined above + template + inline constexpr std::uint32_t warpSize = WarpSize::value; //! The trait for getting the schedule to use when a kernel is run using the CpuOmp2Blocks accelerator. //! diff --git a/include/alpaka/mem/buf/BufGenericSycl.hpp b/include/alpaka/mem/buf/BufGenericSycl.hpp index e725c299a598..38241c83773b 100644 --- a/include/alpaka/mem/buf/BufGenericSycl.hpp +++ b/include/alpaka/mem/buf/BufGenericSycl.hpp @@ -73,7 +73,7 @@ namespace alpaka::trait template struct GetDev> { - ALPAKA_FN_HOST static auto getDev(BufGenericSycl const& buf) + static auto getDev(BufGenericSycl const& buf) { return buf.m_dev; } @@ -99,7 +99,7 @@ namespace alpaka::trait { static_assert(TDim::value > TIdxIntegralConst::value, "Requested dimension out of bounds"); - ALPAKA_FN_HOST static auto getExtent(BufGenericSycl const& buf) -> TIdx + static auto getExtent(BufGenericSycl const& buf) -> TIdx { return buf.m_extentElements[TIdxIntegralConst::value]; } @@ -109,12 +109,12 @@ namespace alpaka::trait template struct GetPtrNative> { - ALPAKA_FN_HOST static auto getPtrNative(BufGenericSycl const& buf) -> TElem const* + static auto getPtrNative(BufGenericSycl const& buf) -> TElem const* { return buf.m_spMem.get(); } - ALPAKA_FN_HOST static auto getPtrNative(BufGenericSycl& buf) -> TElem* + static auto getPtrNative(BufGenericSycl& buf) -> TElem* { return buf.m_spMem.get(); } @@ -124,9 +124,8 @@ namespace alpaka::trait template struct GetPtrDev, DevGenericSycl> { - ALPAKA_FN_HOST static auto getPtrDev( - BufGenericSycl const& buf, - DevGenericSycl const& dev) -> TElem const* + static auto getPtrDev(BufGenericSycl const& buf, DevGenericSycl const& dev) + -> TElem const* { if(dev == getDev(buf)) { @@ -138,9 +137,8 @@ namespace alpaka::trait } } - ALPAKA_FN_HOST static auto getPtrDev( - BufGenericSycl& buf, - DevGenericSycl const& dev) -> TElem* + static auto getPtrDev(BufGenericSycl& buf, DevGenericSycl const& dev) + -> TElem* { if(dev == getDev(buf)) { @@ -158,7 +156,7 @@ namespace alpaka::trait struct BufAlloc> { template - ALPAKA_FN_HOST static auto allocBuf(DevGenericSycl const& dev, TExtent const& extent) + static auto allocBuf(DevGenericSycl const& dev, TExtent const& extent) -> BufGenericSycl { ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; @@ -192,10 +190,13 @@ namespace alpaka::trait } # endif + auto const& [nativeDev, nativeContext] = dev.getNativeHandle(); TElem* memPtr = sycl::malloc_device( static_cast(getExtentProduct(extent)), - dev.getNativeHandle().first, - dev.getNativeHandle().second); + nativeDev, + nativeContext); + // captured structured bindings are a C++20 extension + // auto deleter = [nativeContext](TElem* ptr) { sycl::free(ptr, nativeContext); }; auto deleter = [&dev](TElem* ptr) { sycl::free(ptr, dev.getNativeHandle().second); }; return BufGenericSycl(dev, memPtr, std::move(deleter), extent); @@ -204,7 +205,7 @@ namespace alpaka::trait //! The BufGenericSycl stream-ordered memory allocation capability trait specialization. template - struct HasAsyncBufSupport> : public std::false_type + struct HasAsyncBufSupport> : std::false_type { }; @@ -223,8 +224,7 @@ namespace alpaka::trait struct BufAllocMapped { template - ALPAKA_FN_HOST static auto allocMappedBuf(DevCpu const& host, TExtent const& extent) - -> BufCpu + static auto allocMappedBuf(DevCpu const& host, TExtent const& extent) -> BufCpu { ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; @@ -249,12 +249,11 @@ namespace alpaka::trait template struct GetPtrDev, DevGenericSycl> { - ALPAKA_FN_HOST static auto getPtrDev(BufCpu const& buf, DevGenericSycl const&) - -> TElem const* + static auto getPtrDev(BufCpu const& buf, DevGenericSycl const&) -> TElem const* { return getPtrNative(buf); } - ALPAKA_FN_HOST static auto getPtrDev(BufCpu& buf, DevGenericSycl const&) -> TElem* + static auto getPtrDev(BufCpu& buf, DevGenericSycl const&) -> TElem* { return getPtrNative(buf); } diff --git a/include/alpaka/pltf/PltfGenericSycl.hpp b/include/alpaka/pltf/PltfGenericSycl.hpp index 914b3c7d80cb..d2a33b959b70 100644 --- a/include/alpaka/pltf/PltfGenericSycl.hpp +++ b/include/alpaka/pltf/PltfGenericSycl.hpp @@ -1,4 +1,4 @@ -/* Copyright 2023 Jan Stephan, Luca Ferragina +/* Copyright 2023 Jan Stephan, Luca Ferragina, Aurora Perego * SPDX-License-Identifier: MPL-2.0 */ @@ -195,7 +195,6 @@ namespace alpaka::trait # endif std::cout << "Aspects: " << '\n'; - std::cout.flush(); # if defined(BOOST_COMP_ICPX) # if BOOST_COMP_ICPX >= BOOST_VERSION_NUMBER(53, 2, 0) diff --git a/include/alpaka/rand/RandGenericSycl.hpp b/include/alpaka/rand/RandGenericSycl.hpp index 7f4d3a3e1c65..befaa5d7005a 100644 --- a/include/alpaka/rand/RandGenericSycl.hpp +++ b/include/alpaka/rand/RandGenericSycl.hpp @@ -4,12 +4,12 @@ #pragma once -#if defined(ALPAKA_ACC_SYCL_ENABLED) +#include "alpaka/core/BoostPredef.hpp" +#include "alpaka/core/Concepts.hpp" +#include "alpaka/dev/DevGenericSycl.hpp" +#include "alpaka/rand/Traits.hpp" -# include -# include -# include -# include +#ifdef ALPAKA_ACC_SYCL_ENABLED // Backend specific imports. # include @@ -38,14 +38,13 @@ namespace alpaka::rand { //! The SYCL rand implementation. template - class RandGenericSycl : public concepts::Implements> + struct RandGenericSycl : concepts::Implements> { - public: - RandGenericSycl(sycl::nd_item my_item) : m_item{my_item} + explicit RandGenericSycl(sycl::nd_item my_item) : m_item_rand{my_item} { } - sycl::nd_item m_item; + sycl::nd_item m_item_rand; }; # if !defined(ALPAKA_HOST_ONLY) @@ -53,15 +52,11 @@ namespace alpaka::rand { //! The SYCL random number floating point normal distribution. template - class NormalReal; + struct NormalReal; - //! The SYCL random number floating point uniform distribution. + //! The SYCL random number uniform distribution. template - class UniformReal; - - //! The SYCL random number integer uniform distribution. - template - class UniformUint; + struct Uniform; } // namespace distribution::sycl_rand namespace engine::sycl_rand @@ -77,17 +72,15 @@ namespace alpaka::rand Minstd(RandGenericSycl rand, std::uint32_t const& seed) { - oneapi::dpl::minstd_rand engine(seed, rand.m_item.get_global_linear_id()); + oneapi::dpl::minstd_rand engine(seed, rand.m_item_rand.get_global_linear_id()); rng_engine = engine; } private: template - friend class distribution::sycl_rand::NormalReal; - template - friend class distribution::sycl_rand::UniformReal; + friend struct distribution::sycl_rand::NormalReal; template - friend class distribution::sycl_rand::UniformUint; + friend struct distribution::sycl_rand::Uniform; oneapi::dpl::minstd_rand rng_engine; @@ -112,83 +105,40 @@ namespace alpaka::rand namespace distribution::sycl_rand { - //! The SYCL random number float normal distribution. - template<> - class NormalReal - { - public: - template - auto operator()(TEngine& engine) -> float - { - // Create float uniform_real_distribution distribution - oneapi::dpl::normal_distribution distr; - - // Generate float random number - return distr(engine.rng_engine); - } - }; //! The SYCL random number double normal distribution. - template<> - class NormalReal + template + struct NormalReal { - public: - template - auto operator()(TEngine& engine) -> double - { - // Create float uniform_real_distribution distribution - oneapi::dpl::normal_distribution distr; - - // Generate float random number - return distr(engine.rng_engine); - } - }; + static_assert(std::is_floating_point_v); - //! The SYCL random number float uniform distribution. - template<> - class UniformReal - { - public: template - auto operator()(TEngine& engine) -> float + auto operator()(TEngine& engine) -> F { - // Create float uniform_real_distribution distribution - oneapi::dpl::uniform_real_distribution distr; - - // Generate float random number + oneapi::dpl::normal_distribution distr; return distr(engine.rng_engine); } }; //! The SYCL random number float uniform distribution. - template<> - class UniformReal + template + struct Uniform { - public: - template - auto operator()(TEngine& engine) -> double - { - // Create float uniform_real_distribution distribution - oneapi::dpl::uniform_real_distribution distr; + static_assert(std::is_floating_point_v || std::is_unsigned_v); - // Generate float random number - return distr(engine.rng_engine); - } - }; - - //! The SYCL random number unsigned integer uniform distribution. - template<> - class UniformUint - { - public: template - auto operator()(TEngine& engine) -> unsigned int + auto operator()(TEngine& engine) -> T { - // Create float uniform_real_distribution distribution - oneapi::dpl::uniform_int_distribution distr; - - // Generate float random number - return distr(engine.rng_engine); + if constexpr(std::is_floating_point_v) + { + oneapi::dpl::uniform_real_distribution distr; + return distr(engine.rng_engine); + } + else + { + oneapi::dpl::uniform_int_distribution distr; + return distr(engine.rng_engine); + } } }; } // namespace distribution::sycl_rand @@ -209,7 +159,7 @@ namespace alpaka::rand template struct CreateUniformReal, T, std::enable_if_t>> { - static auto createUniformReal(RandGenericSycl const& /*rand*/) -> sycl_rand::UniformReal + static auto createUniformReal(RandGenericSycl const& /*rand*/) -> sycl_rand::Uniform { return {}; } @@ -219,7 +169,7 @@ namespace alpaka::rand template struct CreateUniformUint, T, std::enable_if_t>> { - static auto createUniformUint(RandGenericSycl const& /*rand*/) -> sycl_rand::UniformUint + static auto createUniformUint(RandGenericSycl const& /*rand*/) -> sycl_rand::Uniform { return {}; } diff --git a/include/alpaka/warp/WarpGenericSycl.hpp b/include/alpaka/warp/WarpGenericSycl.hpp index 356ee16e2c63..eb4a21a5a801 100644 --- a/include/alpaka/warp/WarpGenericSycl.hpp +++ b/include/alpaka/warp/WarpGenericSycl.hpp @@ -1,4 +1,4 @@ -/* Copyright 2023 Jan Stephan, Luca Ferragina, Andrea Bocci +/* Copyright 2023 Jan Stephan, Luca Ferragina, Andrea Bocci, Aurora Perego * SPDX-License-Identifier: MPL-2.0 */ @@ -19,11 +19,11 @@ namespace alpaka::warp class WarpGenericSycl : public concepts::Implements> { public: - WarpGenericSycl(sycl::nd_item my_item) : m_item{my_item} + WarpGenericSycl(sycl::nd_item my_item) : m_item_warp{my_item} { } - sycl::nd_item m_item; + sycl::nd_item m_item_warp; }; } // namespace alpaka::warp @@ -34,7 +34,7 @@ namespace alpaka::warp::trait { static auto getSize(warp::WarpGenericSycl const& warp) -> std::int32_t { - auto const sub_group = warp.m_item.get_sub_group(); + auto const sub_group = warp.m_item_warp.get_sub_group(); // SYCL sub-groups are always 1D return static_cast(sub_group.get_max_local_range()[0]); } @@ -43,14 +43,18 @@ namespace alpaka::warp::trait template struct Activemask> { - // FIXME This should be std::uint64_t on AMD GCN architectures. + // FIXME This should be std::uint64_t on AMD GCN architectures and on CPU, + // but the former is not targeted in alpaka and CPU case is not supported in SYCL yet. + // Restrict to warpSize <= 32 for now. static auto activemask(warp::WarpGenericSycl const& warp) -> std::uint32_t { // SYCL has no way of querying this. Since sub-group functions have to be executed in convergent code // regions anyway we return the full mask. - auto const sub_group = warp.m_item.get_sub_group(); + auto const sub_group = warp.m_item_warp.get_sub_group(); auto const mask = sycl::ext::oneapi::group_ballot(sub_group, true); - // FIXME This should be std::uint64_t on AMD GCN architectures. + // FIXME This should be std::uint64_t on AMD GCN architectures and on CPU, + // but the former is not targeted in alpaka and CPU case is not supported in SYCL yet. + // Restrict to warpSize <= 32 for now. std::uint32_t bits = 0; mask.extract_bits(bits); return bits; @@ -62,7 +66,7 @@ namespace alpaka::warp::trait { static auto all(warp::WarpGenericSycl const& warp, std::int32_t predicate) -> std::int32_t { - auto const sub_group = warp.m_item.get_sub_group(); + auto const sub_group = warp.m_item_warp.get_sub_group(); return static_cast(sycl::all_of_group(sub_group, static_cast(predicate))); } }; @@ -72,7 +76,7 @@ namespace alpaka::warp::trait { static auto any(warp::WarpGenericSycl const& warp, std::int32_t predicate) -> std::int32_t { - auto const sub_group = warp.m_item.get_sub_group(); + auto const sub_group = warp.m_item_warp.get_sub_group(); return static_cast(sycl::any_of_group(sub_group, static_cast(predicate))); } }; @@ -80,12 +84,16 @@ namespace alpaka::warp::trait template struct Ballot> { - // FIXME This should be std::uint64_t on AMD GCN architectures. + // FIXME This should be std::uint64_t on AMD GCN architectures and on CPU, + // but the former is not targeted in alpaka and CPU case is not supported in SYCL yet. + // Restrict to warpSize <= 32 for now. static auto ballot(warp::WarpGenericSycl const& warp, std::int32_t predicate) -> std::uint32_t { - auto const sub_group = warp.m_item.get_sub_group(); + auto const sub_group = warp.m_item_warp.get_sub_group(); auto const mask = sycl::ext::oneapi::group_ballot(sub_group, static_cast(predicate)); - // FIXME This should be std::uint64_t on AMD GCN architectures. + // FIXME This should be std::uint64_t on AMD GCN architectures and on CPU, + // but the former is not targeted in alpaka and CPU case is not supported in SYCL yet. + // Restrict to warpSize <= 32 for now. std::uint32_t bits = 0; mask.extract_bits(bits); return bits; @@ -108,7 +116,7 @@ namespace alpaka::warp::trait Example: If we assume a sub-group size of 32 and a width of 16 we will receive two subdivisions: The first starts at sub-group index 0 and the second at sub-group index 16. For srcLane = 4 the first subdivision will access the value at sub-group index 4 and the second at sub-group index 20. */ - auto const actual_group = warp.m_item.get_sub_group(); + auto const actual_group = warp.m_item_warp.get_sub_group(); auto const actual_item_id = static_cast(actual_group.get_local_linear_id()); auto const actual_group_id = actual_item_id / width; auto const actual_src_id = static_cast(srcLane + actual_group_id * width); diff --git a/include/alpaka/workdiv/WorkDivGenericSycl.hpp b/include/alpaka/workdiv/WorkDivGenericSycl.hpp index c7a2979815c4..725e1ff78824 100644 --- a/include/alpaka/workdiv/WorkDivGenericSycl.hpp +++ b/include/alpaka/workdiv/WorkDivGenericSycl.hpp @@ -1,4 +1,4 @@ -/* Copyright 2023 Jan Stephan, Luca Ferragina, Andrea Bocci +/* Copyright 2023 Jan Stephan, Luca Ferragina, Andrea Bocci, Aurora Perego * SPDX-License-Identifier: MPL-2.0 */ @@ -25,12 +25,12 @@ namespace alpaka WorkDivGenericSycl(Vec const& threadElemExtent, sycl::nd_item work_item) : m_threadElemExtent{threadElemExtent} - , my_item{work_item} + , m_item_workdiv{work_item} { } Vec const& m_threadElemExtent; - sycl::nd_item my_item; + sycl::nd_item m_item_workdiv; }; } // namespace alpaka @@ -60,19 +60,19 @@ namespace alpaka::trait if constexpr(TDim::value == 0) return Vec{}; else if constexpr(TDim::value == 1) - return Vec{static_cast(workDiv.my_item.get_group_range(0))}; + return Vec{static_cast(workDiv.m_item_workdiv.get_group_range(0))}; else if constexpr(TDim::value == 2) { return Vec{ - static_cast(workDiv.my_item.get_group_range(1)), - static_cast(workDiv.my_item.get_group_range(0))}; + static_cast(workDiv.m_item_workdiv.get_group_range(1)), + static_cast(workDiv.m_item_workdiv.get_group_range(0))}; } else { return Vec{ - static_cast(workDiv.my_item.get_group_range(2)), - static_cast(workDiv.my_item.get_group_range(1)), - static_cast(workDiv.my_item.get_group_range(0))}; + static_cast(workDiv.m_item_workdiv.get_group_range(2)), + static_cast(workDiv.m_item_workdiv.get_group_range(1)), + static_cast(workDiv.m_item_workdiv.get_group_range(0))}; } } }; @@ -87,19 +87,19 @@ namespace alpaka::trait if constexpr(TDim::value == 0) return Vec{}; else if constexpr(TDim::value == 1) - return Vec{static_cast(workDiv.my_item.get_local_range(0))}; + return Vec{static_cast(workDiv.m_item_workdiv.get_local_range(0))}; else if constexpr(TDim::value == 2) { return Vec{ - static_cast(workDiv.my_item.get_local_range(1)), - static_cast(workDiv.my_item.get_local_range(0))}; + static_cast(workDiv.m_item_workdiv.get_local_range(1)), + static_cast(workDiv.m_item_workdiv.get_local_range(0))}; } else { return Vec{ - static_cast(workDiv.my_item.get_local_range(2)), - static_cast(workDiv.my_item.get_local_range(1)), - static_cast(workDiv.my_item.get_local_range(0))}; + static_cast(workDiv.m_item_workdiv.get_local_range(2)), + static_cast(workDiv.m_item_workdiv.get_local_range(1)), + static_cast(workDiv.m_item_workdiv.get_local_range(0))}; } } }; diff --git a/test/unit/warp/src/Activemask.cpp b/test/unit/warp/src/Activemask.cpp index 76431d44c557..8038606a4116 100644 --- a/test/unit/warp/src/Activemask.cpp +++ b/test/unit/warp/src/Activemask.cpp @@ -1,4 +1,4 @@ -/* Copyright 2023 Sergei Bastrakov, Bernhard Manfred Gruber, Jan Stephan, Andrea Bocci +/* Copyright 2023 Sergei Bastrakov, Bernhard Manfred Gruber, Jan Stephan, Andrea Bocci, Aurora Perego * SPDX-License-Identifier: MPL-2.0 */ @@ -54,14 +54,11 @@ struct ActivemaskMultipleThreadWarpTestKernel } }; -namespace alpaka::trait +template +struct alpaka::trait::WarpSize, TAcc> + : std::integral_constant { - template - struct WarpSize, TAcc> - { - static constexpr std::uint32_t warp_size = TWarpSize; - }; -} // namespace alpaka::trait +}; TEMPLATE_LIST_TEST_CASE("activemask", "[warp]", alpaka::test::TestAccs) { @@ -90,7 +87,14 @@ TEMPLATE_LIST_TEST_CASE("activemask", "[warp]", alpaka::test::TestAccs) auto const threadElementExtent = alpaka::Vec::ones(); auto workDiv = typename ExecutionFixture::WorkDiv{gridBlockExtent, blockThreadExtent, threadElementExtent}; auto fixture = ExecutionFixture{workDiv}; - if(warpExtent == 8) + if(warpExtent == 4) + { + for(auto inactiveThreadIdx = 0u; inactiveThreadIdx < warpExtent; inactiveThreadIdx++) + { + CHECK(fixture(ActivemaskMultipleThreadWarpTestKernel<4>{}, inactiveThreadIdx)); + } + } + else if(warpExtent == 8) { for(auto inactiveThreadIdx = 0u; inactiveThreadIdx < warpExtent; inactiveThreadIdx++) { @@ -111,6 +115,13 @@ TEMPLATE_LIST_TEST_CASE("activemask", "[warp]", alpaka::test::TestAccs) CHECK(fixture(ActivemaskMultipleThreadWarpTestKernel<32>{}, inactiveThreadIdx)); } } + else if(warpExtent == 64) + { + for(auto inactiveThreadIdx = 0u; inactiveThreadIdx < warpExtent; inactiveThreadIdx++) + { + CHECK(fixture(ActivemaskMultipleThreadWarpTestKernel<64>{}, inactiveThreadIdx)); + } + } } } } diff --git a/test/unit/warp/src/All.cpp b/test/unit/warp/src/All.cpp index 27468cd8803c..3a61553496be 100644 --- a/test/unit/warp/src/All.cpp +++ b/test/unit/warp/src/All.cpp @@ -1,4 +1,4 @@ -/* Copyright 2023 Sergei Bastrakov, Bernhard Manfred Gruber, Jan Stephan, Andrea Bocci +/* Copyright 2023 Sergei Bastrakov, Bernhard Manfred Gruber, Jan Stephan, Andrea Bocci, Aurora Perego * SPDX-License-Identifier: MPL-2.0 */ @@ -58,14 +58,11 @@ struct AllMultipleThreadWarpTestKernel } }; -namespace alpaka::trait +template +struct alpaka::trait::WarpSize, TAcc> + : std::integral_constant { - template - struct WarpSize, TAcc> - { - static constexpr std::uint32_t warp_size = TWarpSize; - }; -} // namespace alpaka::trait +}; TEMPLATE_LIST_TEST_CASE("all", "[warp]", alpaka::test::TestAccs) { @@ -94,7 +91,11 @@ TEMPLATE_LIST_TEST_CASE("all", "[warp]", alpaka::test::TestAccs) auto const threadElementExtent = alpaka::Vec::ones(); auto workDiv = typename ExecutionFixture::WorkDiv{gridBlockExtent, blockThreadExtent, threadElementExtent}; auto fixture = ExecutionFixture{workDiv}; - if(warpExtent == 8) + if(warpExtent == 4) + { + REQUIRE(fixture(AllMultipleThreadWarpTestKernel<4>{})); + } + else if(warpExtent == 8) { REQUIRE(fixture(AllMultipleThreadWarpTestKernel<8>{})); } @@ -106,6 +107,10 @@ TEMPLATE_LIST_TEST_CASE("all", "[warp]", alpaka::test::TestAccs) { REQUIRE(fixture(AllMultipleThreadWarpTestKernel<32>{})); } + else if(warpExtent == 64) + { + REQUIRE(fixture(AllMultipleThreadWarpTestKernel<64>{})); + } } } } diff --git a/test/unit/warp/src/Any.cpp b/test/unit/warp/src/Any.cpp index a00f303fc0fa..0f5a059d1c48 100644 --- a/test/unit/warp/src/Any.cpp +++ b/test/unit/warp/src/Any.cpp @@ -58,14 +58,11 @@ struct AnyMultipleThreadWarpTestKernel } }; -namespace alpaka::trait +template +struct alpaka::trait::WarpSize, TAcc> + : std::integral_constant { - template - struct WarpSize, TAcc> - { - static constexpr std::uint32_t warp_size = TWarpSize; - }; -} // namespace alpaka::trait +}; TEMPLATE_LIST_TEST_CASE("any", "[warp]", alpaka::test::TestAccs) { @@ -94,7 +91,11 @@ TEMPLATE_LIST_TEST_CASE("any", "[warp]", alpaka::test::TestAccs) auto const threadElementExtent = alpaka::Vec::ones(); auto workDiv = typename ExecutionFixture::WorkDiv{gridBlockExtent, blockThreadExtent, threadElementExtent}; auto fixture = ExecutionFixture{workDiv}; - if(warpExtent == 8) + if(warpExtent == 4) + { + REQUIRE(fixture(AnyMultipleThreadWarpTestKernel<4>{})); + } + else if(warpExtent == 8) { REQUIRE(fixture(AnyMultipleThreadWarpTestKernel<8>{})); } @@ -106,6 +107,10 @@ TEMPLATE_LIST_TEST_CASE("any", "[warp]", alpaka::test::TestAccs) { REQUIRE(fixture(AnyMultipleThreadWarpTestKernel<32>{})); } + else if(warpExtent == 64) + { + REQUIRE(fixture(AnyMultipleThreadWarpTestKernel<64>{})); + } } } } diff --git a/test/unit/warp/src/Ballot.cpp b/test/unit/warp/src/Ballot.cpp index edf8c59cd575..0525928c275c 100644 --- a/test/unit/warp/src/Ballot.cpp +++ b/test/unit/warp/src/Ballot.cpp @@ -1,4 +1,4 @@ -/* Copyright 2023 Sergei Bastrakov, Bernhard Manfred Gruber, Jan Stephan, Andrea Bocci +/* Copyright 2023 Sergei Bastrakov, Bernhard Manfred Gruber, Jan Stephan, Andrea Bocci, Aurora Perego * SPDX-License-Identifier: MPL-2.0 */ @@ -66,14 +66,11 @@ struct BallotMultipleThreadWarpTestKernel } }; -namespace alpaka::trait +template +struct alpaka::trait::WarpSize, TAcc> + : std::integral_constant { - template - struct WarpSize, TAcc> - { - static constexpr std::uint32_t warp_size = TWarpSize; - }; -} // namespace alpaka::trait +}; TEMPLATE_LIST_TEST_CASE("ballot", "[warp]", alpaka::test::TestAccs) { @@ -102,7 +99,11 @@ TEMPLATE_LIST_TEST_CASE("ballot", "[warp]", alpaka::test::TestAccs) auto const threadElementExtent = alpaka::Vec::ones(); auto workDiv = typename ExecutionFixture::WorkDiv{gridBlockExtent, blockThreadExtent, threadElementExtent}; auto fixture = ExecutionFixture{workDiv}; - if(warpExtent == 8) + if(warpExtent == 4) + { + REQUIRE(fixture(BallotMultipleThreadWarpTestKernel<4>{})); + } + else if(warpExtent == 8) { REQUIRE(fixture(BallotMultipleThreadWarpTestKernel<8>{})); } @@ -114,6 +115,10 @@ TEMPLATE_LIST_TEST_CASE("ballot", "[warp]", alpaka::test::TestAccs) { REQUIRE(fixture(BallotMultipleThreadWarpTestKernel<32>{})); } + else if(warpExtent == 64) + { + REQUIRE(fixture(BallotMultipleThreadWarpTestKernel<64>{})); + } } } } diff --git a/test/unit/warp/src/GetSize.cpp b/test/unit/warp/src/GetSize.cpp index 5dbbaa018bba..d8c2b5ce3a84 100644 --- a/test/unit/warp/src/GetSize.cpp +++ b/test/unit/warp/src/GetSize.cpp @@ -1,4 +1,4 @@ -/* Copyright 2022 Sergei Bastrakov, Bernhard Manfred Gruber, Jan Stephan +/* Copyright 2022 Sergei Bastrakov, Bernhard Manfred Gruber, Jan Stephan, Aurora Perego * SPDX-License-Identifier: MPL-2.0 */ @@ -12,6 +12,7 @@ #include +template struct GetSizeTestKernel { ALPAKA_NO_HOST_ACC_WARNING @@ -22,6 +23,11 @@ struct GetSizeTestKernel } }; +template +struct alpaka::trait::WarpSize, TAcc> : std::integral_constant +{ +}; + TEMPLATE_LIST_TEST_CASE("getSize", "[warp]", alpaka::test::TestAccs) { using Acc = TestType; @@ -37,6 +43,26 @@ TEMPLATE_LIST_TEST_CASE("getSize", "[warp]", alpaka::test::TestAccs) [](std::size_t ws) { alpaka::test::KernelExecutionFixture fixture(alpaka::Vec::all(8)); - return fixture(GetSizeTestKernel{}, static_cast(ws)); + if(ws == 4) + { + return fixture(GetSizeTestKernel<4>{}, static_cast(ws)); + } + else if(ws == 8) + { + return fixture(GetSizeTestKernel<8>{}, static_cast(ws)); + } + else if(ws == 16) + { + return fixture(GetSizeTestKernel<16>{}, static_cast(ws)); + } + else if(ws == 32) + { + return fixture(GetSizeTestKernel<32>{}, static_cast(ws)); + } + else if(ws == 64) + { + return fixture(GetSizeTestKernel<64>{}, static_cast(ws)); + } + return fixture(GetSizeTestKernel<0>{}, static_cast(ws)); })); } diff --git a/test/unit/warp/src/Shfl.cpp b/test/unit/warp/src/Shfl.cpp index d9ec5d59ee5d..4a35ba4480a3 100644 --- a/test/unit/warp/src/Shfl.cpp +++ b/test/unit/warp/src/Shfl.cpp @@ -1,4 +1,4 @@ -/* Copyright 2023 David M. Rogers, Jan Stephan, Andrea Bocci +/* Copyright 2023 David M. Rogers, Jan Stephan, Andrea Bocci, Aurora Perego * SPDX-License-Identifier: MPL-2.0 */ @@ -88,14 +88,11 @@ struct ShflMultipleThreadWarpTestKernel } }; -namespace alpaka::trait +template +struct alpaka::trait::WarpSize, TAcc> + : std::integral_constant { - template - struct WarpSize, TAcc> - { - static constexpr std::uint32_t warp_size = TWarpSize; - }; -} // namespace alpaka::trait +}; TEMPLATE_LIST_TEST_CASE("shfl", "[warp]", alpaka::test::TestAccs) { @@ -124,7 +121,11 @@ TEMPLATE_LIST_TEST_CASE("shfl", "[warp]", alpaka::test::TestAccs) auto const threadElementExtent = alpaka::Vec::ones(); auto workDiv = typename ExecutionFixture::WorkDiv{gridBlockExtent, blockThreadExtent, threadElementExtent}; auto fixture = ExecutionFixture{workDiv}; - if(warpExtent == 8) + if(warpExtent == 4) + { + REQUIRE(fixture(ShflMultipleThreadWarpTestKernel<4>{})); + } + else if(warpExtent == 8) { REQUIRE(fixture(ShflMultipleThreadWarpTestKernel<8>{})); } @@ -136,6 +137,10 @@ TEMPLATE_LIST_TEST_CASE("shfl", "[warp]", alpaka::test::TestAccs) { REQUIRE(fixture(ShflMultipleThreadWarpTestKernel<32>{})); } + else if(warpExtent == 64) + { + REQUIRE(fixture(ShflMultipleThreadWarpTestKernel<64>{})); + } } } } From 218ab6e561b5f79305457729687e6a8e849748c1 Mon Sep 17 00:00:00 2001 From: AuroraPerego Date: Mon, 10 Jul 2023 19:01:41 +0200 Subject: [PATCH 09/10] Various fixes related to the SYCL back-end - add the missing specialization of CreateViewPlainPtr for SYCL devices - improve the comments on the ALPAKA_FN_INLINE macro - remove unnecessary ALPAKA_FN_HOST attributes - rename QueueGenericSyclBase::m_impl to m_spQueueImpl, to align with the other back-ends --- include/alpaka/acc/AccFpgaSyclIntel.hpp | 4 +-- include/alpaka/core/Common.hpp | 11 ++++---- include/alpaka/event/EventGenericSycl.hpp | 10 +++---- include/alpaka/mem/buf/sycl/Copy.hpp | 28 +++++-------------- include/alpaka/mem/buf/sycl/Set.hpp | 6 ++-- include/alpaka/mem/view/ViewPlainPtr.hpp | 22 ++++++++++++++- .../queue/sycl/QueueGenericSyclBase.hpp | 18 ++++++------ 7 files changed, 52 insertions(+), 47 deletions(-) diff --git a/include/alpaka/acc/AccFpgaSyclIntel.hpp b/include/alpaka/acc/AccFpgaSyclIntel.hpp index 7ddc7b2b65de..2c71a86c2b90 100644 --- a/include/alpaka/acc/AccFpgaSyclIntel.hpp +++ b/include/alpaka/acc/AccFpgaSyclIntel.hpp @@ -1,4 +1,4 @@ -/* Copyright 2022 Jan Stephan +/* Copyright 2023 Jan Stephan, Aurora Perego * SPDX-License-Identifier: MPL-2.0 */ @@ -45,7 +45,7 @@ namespace alpaka::trait template struct GetAccName> { - ALPAKA_FN_HOST static auto getAccName() -> std::string + static auto getAccName() -> std::string { return "AccFpgaSyclIntel<" + std::to_string(TDim::value) + "," + core::demangled + ">"; } diff --git a/include/alpaka/core/Common.hpp b/include/alpaka/core/Common.hpp index bc7428af08db..6da0b9fbd620 100644 --- a/include/alpaka/core/Common.hpp +++ b/include/alpaka/core/Common.hpp @@ -76,13 +76,12 @@ //! Macro defining the inline function attribute. #if BOOST_LANG_CUDA || BOOST_LANG_HIP # define ALPAKA_FN_INLINE __forceinline__ -#else -# if BOOST_COMP_MSVC || defined(BOOST_COMP_MSVC_EMULATED) +#elif BOOST_COMP_MSVC || defined(BOOST_COMP_MSVC_EMULATED) // TODO: With C++20 [[msvc::forceinline]] can be used. -# define ALPAKA_FN_INLINE __forceinline -# else -# define ALPAKA_FN_INLINE [[gnu::always_inline]] inline -# endif +# define ALPAKA_FN_INLINE __forceinline +#else +// For gcc, clang, and clang-based compilers like Intel icpx +# define ALPAKA_FN_INLINE [[gnu::always_inline]] inline #endif //! This macro defines a variable lying in global accelerator device memory. diff --git a/include/alpaka/event/EventGenericSycl.hpp b/include/alpaka/event/EventGenericSycl.hpp index 3e84baec3810..f761902ddfea 100644 --- a/include/alpaka/event/EventGenericSycl.hpp +++ b/include/alpaka/event/EventGenericSycl.hpp @@ -1,4 +1,4 @@ -/* Copyright 2022 Jan Stephan, Antonio Di Pilato +/* Copyright 2023 Jan Stephan, Antonio Di Pilato, Aurora Perego * SPDX-License-Identifier: MPL-2.0 */ @@ -87,7 +87,7 @@ namespace alpaka::trait { static auto enqueue(QueueGenericSyclNonBlocking& queue, EventGenericSycl& event) { - event.setEvent(queue.m_impl->get_last_event()); + event.setEvent(queue.m_spQueueImpl->get_last_event()); } }; @@ -97,7 +97,7 @@ namespace alpaka::trait { static auto enqueue(QueueGenericSyclBlocking& queue, EventGenericSycl& event) { - event.setEvent(queue.m_impl->get_last_event()); + event.setEvent(queue.m_spQueueImpl->get_last_event()); } }; @@ -120,7 +120,7 @@ namespace alpaka::trait { static auto waiterWaitFor(QueueGenericSyclNonBlocking& queue, EventGenericSycl const& event) { - queue.m_impl->register_dependency(event.getNativeHandle()); + queue.m_spQueueImpl->register_dependency(event.getNativeHandle()); } }; @@ -130,7 +130,7 @@ namespace alpaka::trait { static auto waiterWaitFor(QueueGenericSyclBlocking& queue, EventGenericSycl const& event) { - queue.m_impl->register_dependency(event.getNativeHandle()); + queue.m_spQueueImpl->register_dependency(event.getNativeHandle()); } }; diff --git a/include/alpaka/mem/buf/sycl/Copy.hpp b/include/alpaka/mem/buf/sycl/Copy.hpp index 34527e8478ed..e1d8ce640c32 100644 --- a/include/alpaka/mem/buf/sycl/Copy.hpp +++ b/include/alpaka/mem/buf/sycl/Copy.hpp @@ -59,7 +59,7 @@ namespace alpaka::detail } # if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL - ALPAKA_FN_HOST auto printDebug() const -> void + auto printDebug() const -> void { std::cout << __func__ << " e: " << m_extent << " ewb: " << this->m_extentWidthBytes << " de: " << m_dstExtent << " dptr: " << reinterpret_cast(m_dstMemNative) @@ -93,8 +93,7 @@ namespace alpaka::detail using TaskCopySyclBase::TaskCopySyclBase; - ALPAKA_FN_HOST auto operator()(sycl::queue& queue, std::vector const& requirements) const - -> sycl::event + auto operator()(sycl::queue& queue, std::vector const& requirements) const -> sycl::event { ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; @@ -145,8 +144,7 @@ namespace alpaka::detail using TaskCopySyclBase, TViewDst, TViewSrc, TExtent>::TaskCopySyclBase; using Elem = alpaka::Elem; - ALPAKA_FN_HOST auto operator()(sycl::queue& queue, std::vector const& requirements) const - -> sycl::event + auto operator()(sycl::queue& queue, std::vector const& requirements) const -> sycl::event { ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; @@ -179,10 +177,7 @@ namespace alpaka::detail using Elem = alpaka::Elem; template - ALPAKA_FN_HOST TaskCopySycl( - TViewDstFwd&& viewDst, - TViewSrc const& viewSrc, - [[maybe_unused]] TExtent const& extent) + TaskCopySycl(TViewDstFwd&& viewDst, TViewSrc const& viewSrc, [[maybe_unused]] TExtent const& extent) : m_dstMemNative(reinterpret_cast(getPtrNative(viewDst))) , m_srcMemNative(reinterpret_cast(getPtrNative(viewSrc))) { @@ -211,10 +206,7 @@ namespace alpaka::trait struct CreateTaskMemcpy, DevCpu> { template - ALPAKA_FN_HOST static auto createTaskMemcpy( - TViewDstFwd&& viewDst, - TViewSrc const& viewSrc, - TExtent const& extent) + static auto createTaskMemcpy(TViewDstFwd&& viewDst, TViewSrc const& viewSrc, TExtent const& extent) -> alpaka::detail::TaskCopySycl, TViewSrc, TExtent> { ALPAKA_DEBUG_FULL_LOG_SCOPE; @@ -228,10 +220,7 @@ namespace alpaka::trait struct CreateTaskMemcpy> { template - ALPAKA_FN_HOST static auto createTaskMemcpy( - TViewDstFwd&& viewDst, - TViewSrc const& viewSrc, - TExtent const& extent) + static auto createTaskMemcpy(TViewDstFwd&& viewDst, TViewSrc const& viewSrc, TExtent const& extent) -> alpaka::detail::TaskCopySycl, TViewSrc, TExtent> { ALPAKA_DEBUG_FULL_LOG_SCOPE; @@ -245,10 +234,7 @@ namespace alpaka::trait struct CreateTaskMemcpy, DevGenericSycl> { template - ALPAKA_FN_HOST static auto createTaskMemcpy( - TViewDstFwd&& viewDst, - TViewSrc const& viewSrc, - TExtent const& extent) + static auto createTaskMemcpy(TViewDstFwd&& viewDst, TViewSrc const& viewSrc, TExtent const& extent) -> alpaka::detail::TaskCopySycl, TViewSrc, TExtent> { ALPAKA_DEBUG_FULL_LOG_SCOPE; diff --git a/include/alpaka/mem/buf/sycl/Set.hpp b/include/alpaka/mem/buf/sycl/Set.hpp index da52a9206c46..01899cc18e1b 100644 --- a/include/alpaka/mem/buf/sycl/Set.hpp +++ b/include/alpaka/mem/buf/sycl/Set.hpp @@ -54,7 +54,7 @@ namespace alpaka } # if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL - ALPAKA_FN_HOST auto printDebug() const -> void + auto printDebug() const -> void { std::cout << __func__ << " e: " << this->m_extent << " ewb: " << this->m_extentWidthBytes << " de: " << this->m_dstExtent << " dptr: " << reinterpret_cast(this->m_dstMemNative) @@ -170,7 +170,7 @@ namespace alpaka } # if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL - ALPAKA_FN_HOST auto printDebug() const -> void + auto printDebug() const -> void { std::cout << __func__ << " e: " << Scalar() << " ewb: " << sizeof(Elem) << " de: " << Scalar() << " dptr: " << reinterpret_cast(m_dstMemNative) << " dpitchb: " << Scalar() @@ -203,7 +203,7 @@ namespace alpaka struct CreateTaskMemset> { template - ALPAKA_FN_HOST static auto createTaskMemset(TView& view, std::uint8_t const& byte, TExtent const& extent) + static auto createTaskMemset(TView& view, std::uint8_t const& byte, TExtent const& extent) -> detail::TaskSetSycl { return detail::TaskSetSycl(view, byte, extent); diff --git a/include/alpaka/mem/view/ViewPlainPtr.hpp b/include/alpaka/mem/view/ViewPlainPtr.hpp index 7416385ea068..56fadf03feb9 100644 --- a/include/alpaka/mem/view/ViewPlainPtr.hpp +++ b/include/alpaka/mem/view/ViewPlainPtr.hpp @@ -1,5 +1,5 @@ /* Copyright 2023 Benjamin Worpitz, Matthias Werner, René Widera, Sergei Bastrakov, Bernhard Manfred Gruber, - * Jan Stephan, Andrea Bocci + * Jan Stephan, Andrea Bocci, Aurora Perego * SPDX-License-Identifier: MPL-2.0 */ @@ -229,6 +229,26 @@ namespace alpaka }; #endif +#if defined(ALPAKA_ACC_SYCL_ENABLED) + //! The SYCL device CreateViewPlainPtr trait specialization. + template + struct CreateViewPlainPtr> + { + template + static auto createViewPlainPtr( + DevGenericSycl const& dev, + TElem* pMem, + TExtent const& extent, + TPitch const& pitch) + { + return alpaka::ViewPlainPtr, TElem, alpaka::Dim, alpaka::Idx>( + pMem, + dev, + extent, + pitch); + } + }; +#endif //! The ViewPlainPtr offset get trait specialization. template struct GetOffset> diff --git a/include/alpaka/queue/sycl/QueueGenericSyclBase.hpp b/include/alpaka/queue/sycl/QueueGenericSyclBase.hpp index c529ecfaa2e4..0f847079de95 100644 --- a/include/alpaka/queue/sycl/QueueGenericSyclBase.hpp +++ b/include/alpaka/queue/sycl/QueueGenericSyclBase.hpp @@ -1,4 +1,4 @@ -/* Copyright 2023 Jan Stephan, Antonio Di Pilato, Luca Ferragina, Andrea Bocci +/* Copyright 2023 Jan Stephan, Antonio Di Pilato, Luca Ferragina, Andrea Bocci, Aurora Perego * SPDX-License-Identifier: MPL-2.0 */ @@ -172,16 +172,16 @@ namespace alpaka::detail public: QueueGenericSyclBase(TDev const& dev) : m_dev{dev} - , m_impl{std::make_shared( + , m_spQueueImpl{std::make_shared( dev.getNativeHandle().second, dev.getNativeHandle().first)} { - m_dev.m_impl->register_queue(m_impl); + m_dev.m_impl->register_queue(m_spQueueImpl); } friend auto operator==(QueueGenericSyclBase const& lhs, QueueGenericSyclBase const& rhs) -> bool { - return (lhs.m_dev == rhs.m_dev) && (lhs.m_impl == rhs.m_impl); + return (lhs.m_dev == rhs.m_dev) && (lhs.m_spQueueImpl == rhs.m_spQueueImpl); } friend auto operator!=(QueueGenericSyclBase const& lhs, QueueGenericSyclBase const& rhs) -> bool @@ -191,11 +191,11 @@ namespace alpaka::detail [[nodiscard]] auto getNativeHandle() const noexcept { - return m_impl->getNativeHandle(); + return m_spQueueImpl->getNativeHandle(); } TDev m_dev; - std::shared_ptr m_impl; + std::shared_ptr m_spQueueImpl; }; } // namespace alpaka::detail @@ -239,7 +239,7 @@ namespace alpaka::trait static auto enqueue(detail::QueueGenericSyclBase& queue, TTask const& task) -> void { ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; - queue.m_impl->template enqueue(task); + queue.m_spQueueImpl->template enqueue(task); } }; @@ -250,7 +250,7 @@ namespace alpaka::trait static auto empty(detail::QueueGenericSyclBase const& queue) -> bool { ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; - return queue.m_impl->empty(); + return queue.m_spQueueImpl->empty(); } }; @@ -264,7 +264,7 @@ namespace alpaka::trait static auto currentThreadWaitFor(detail::QueueGenericSyclBase const& queue) -> void { ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; - queue.m_impl->wait(); + queue.m_spQueueImpl->wait(); } }; From c66b7a54c162e848c3c15069a5b008f9480e01d6 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Wed, 12 Jul 2023 18:53:57 +0200 Subject: [PATCH 10/10] Update the main SYCL include file name --- include/alpaka/acc/AccCpuSycl.hpp | 2 +- include/alpaka/acc/AccFpgaSyclIntel.hpp | 2 +- include/alpaka/acc/AccGenericSycl.hpp | 2 +- include/alpaka/atomic/AtomicGenericSycl.hpp | 2 +- .../alpaka/block/shared/dyn/BlockSharedMemDynGenericSycl.hpp | 2 +- include/alpaka/block/shared/st/BlockSharedMemStGenericSycl.hpp | 2 +- include/alpaka/block/sync/BlockSyncGenericSycl.hpp | 2 +- include/alpaka/core/Sycl.hpp | 2 +- include/alpaka/dev/DevGenericSycl.hpp | 2 +- include/alpaka/event/EventGenericSycl.hpp | 2 +- include/alpaka/idx/bt/IdxBtGenericSycl.hpp | 2 +- include/alpaka/idx/gb/IdxGbGenericSycl.hpp | 2 +- include/alpaka/intrinsic/IntrinsicGenericSycl.hpp | 2 +- include/alpaka/kernel/TaskKernelGenericSycl.hpp | 2 +- include/alpaka/math/MathGenericSycl.hpp | 2 +- include/alpaka/mem/buf/BufGenericSycl.hpp | 2 +- include/alpaka/mem/buf/sycl/Accessor.hpp | 2 +- include/alpaka/mem/buf/sycl/Common.hpp | 2 +- include/alpaka/mem/buf/sycl/Copy.hpp | 2 +- include/alpaka/mem/fence/MemFenceGenericSycl.hpp | 2 +- include/alpaka/pltf/PltfCpuSycl.hpp | 2 +- include/alpaka/pltf/PltfFpgaSyclIntel.hpp | 2 +- include/alpaka/pltf/PltfGenericSycl.hpp | 2 +- include/alpaka/pltf/PltfGpuSyclIntel.hpp | 2 +- include/alpaka/queue/sycl/QueueGenericSyclBase.hpp | 2 +- include/alpaka/rand/RandGenericSycl.hpp | 2 +- include/alpaka/warp/WarpGenericSycl.hpp | 2 +- include/alpaka/workdiv/WorkDivGenericSycl.hpp | 2 +- 28 files changed, 28 insertions(+), 28 deletions(-) diff --git a/include/alpaka/acc/AccCpuSycl.hpp b/include/alpaka/acc/AccCpuSycl.hpp index 5cedbea64d8f..e9e5504bae67 100644 --- a/include/alpaka/acc/AccCpuSycl.hpp +++ b/include/alpaka/acc/AccCpuSycl.hpp @@ -23,7 +23,7 @@ #if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_BACKEND_ONEAPI) && defined(ALPAKA_SYCL_ONEAPI_CPU) -# include +# include namespace alpaka { diff --git a/include/alpaka/acc/AccFpgaSyclIntel.hpp b/include/alpaka/acc/AccFpgaSyclIntel.hpp index 2c71a86c2b90..7e84ce6d5e39 100644 --- a/include/alpaka/acc/AccFpgaSyclIntel.hpp +++ b/include/alpaka/acc/AccFpgaSyclIntel.hpp @@ -22,7 +22,7 @@ #if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_BACKEND_ONEAPI) && defined(ALPAKA_SYCL_ONEAPI_FPGA) -# include +# include namespace alpaka { diff --git a/include/alpaka/acc/AccGenericSycl.hpp b/include/alpaka/acc/AccGenericSycl.hpp index ea9560e23b38..34400442c31c 100644 --- a/include/alpaka/acc/AccGenericSycl.hpp +++ b/include/alpaka/acc/AccGenericSycl.hpp @@ -38,7 +38,7 @@ #ifdef ALPAKA_ACC_SYCL_ENABLED -# include +# include namespace alpaka { diff --git a/include/alpaka/atomic/AtomicGenericSycl.hpp b/include/alpaka/atomic/AtomicGenericSycl.hpp index a869248e53b4..8ebf608dc570 100644 --- a/include/alpaka/atomic/AtomicGenericSycl.hpp +++ b/include/alpaka/atomic/AtomicGenericSycl.hpp @@ -14,7 +14,7 @@ #ifdef ALPAKA_ACC_SYCL_ENABLED -# include +# include namespace alpaka { diff --git a/include/alpaka/block/shared/dyn/BlockSharedMemDynGenericSycl.hpp b/include/alpaka/block/shared/dyn/BlockSharedMemDynGenericSycl.hpp index 1e4671ebbf72..4e2af194ddcd 100644 --- a/include/alpaka/block/shared/dyn/BlockSharedMemDynGenericSycl.hpp +++ b/include/alpaka/block/shared/dyn/BlockSharedMemDynGenericSycl.hpp @@ -9,7 +9,7 @@ #include #ifdef ALPAKA_ACC_SYCL_ENABLED -# include +# include namespace alpaka { diff --git a/include/alpaka/block/shared/st/BlockSharedMemStGenericSycl.hpp b/include/alpaka/block/shared/st/BlockSharedMemStGenericSycl.hpp index abda79f7b049..f92df9c051f0 100644 --- a/include/alpaka/block/shared/st/BlockSharedMemStGenericSycl.hpp +++ b/include/alpaka/block/shared/st/BlockSharedMemStGenericSycl.hpp @@ -12,7 +12,7 @@ #ifdef ALPAKA_ACC_SYCL_ENABLED -# include +# include namespace alpaka { diff --git a/include/alpaka/block/sync/BlockSyncGenericSycl.hpp b/include/alpaka/block/sync/BlockSyncGenericSycl.hpp index 87d478eb9c19..67e97493fee4 100644 --- a/include/alpaka/block/sync/BlockSyncGenericSycl.hpp +++ b/include/alpaka/block/sync/BlockSyncGenericSycl.hpp @@ -8,7 +8,7 @@ #ifdef ALPAKA_ACC_SYCL_ENABLED -# include +# include namespace alpaka { diff --git a/include/alpaka/core/Sycl.hpp b/include/alpaka/core/Sycl.hpp index 8ba75ef6ecda..dbc8c1e0f748 100644 --- a/include/alpaka/core/Sycl.hpp +++ b/include/alpaka/core/Sycl.hpp @@ -22,7 +22,7 @@ #ifdef ALPAKA_ACC_SYCL_ENABLED -# include +# include // if SYCL is enabled with the AMD backend the printf will be killed because of missing compiler support # ifdef __AMDGCN__ diff --git a/include/alpaka/dev/DevGenericSycl.hpp b/include/alpaka/dev/DevGenericSycl.hpp index 0a6875a77ba6..85c218365f3d 100644 --- a/include/alpaka/dev/DevGenericSycl.hpp +++ b/include/alpaka/dev/DevGenericSycl.hpp @@ -27,7 +27,7 @@ #ifdef ALPAKA_ACC_SYCL_ENABLED -# include +# include namespace alpaka { diff --git a/include/alpaka/event/EventGenericSycl.hpp b/include/alpaka/event/EventGenericSycl.hpp index f761902ddfea..68011a0247cd 100644 --- a/include/alpaka/event/EventGenericSycl.hpp +++ b/include/alpaka/event/EventGenericSycl.hpp @@ -17,7 +17,7 @@ #ifdef ALPAKA_ACC_SYCL_ENABLED -# include +# include namespace alpaka { diff --git a/include/alpaka/idx/bt/IdxBtGenericSycl.hpp b/include/alpaka/idx/bt/IdxBtGenericSycl.hpp index 9d68deb826a6..54ef78014f1f 100644 --- a/include/alpaka/idx/bt/IdxBtGenericSycl.hpp +++ b/include/alpaka/idx/bt/IdxBtGenericSycl.hpp @@ -12,7 +12,7 @@ #ifdef ALPAKA_ACC_SYCL_ENABLED -# include +# include namespace alpaka::bt { diff --git a/include/alpaka/idx/gb/IdxGbGenericSycl.hpp b/include/alpaka/idx/gb/IdxGbGenericSycl.hpp index e7500c0f6144..42547effd6c3 100644 --- a/include/alpaka/idx/gb/IdxGbGenericSycl.hpp +++ b/include/alpaka/idx/gb/IdxGbGenericSycl.hpp @@ -12,7 +12,7 @@ #ifdef ALPAKA_ACC_SYCL_ENABLED -# include +# include namespace alpaka::gb { diff --git a/include/alpaka/intrinsic/IntrinsicGenericSycl.hpp b/include/alpaka/intrinsic/IntrinsicGenericSycl.hpp index 700442fb4520..395043a9cd95 100644 --- a/include/alpaka/intrinsic/IntrinsicGenericSycl.hpp +++ b/include/alpaka/intrinsic/IntrinsicGenericSycl.hpp @@ -11,7 +11,7 @@ #ifdef ALPAKA_ACC_SYCL_ENABLED -# include +# include namespace alpaka { diff --git a/include/alpaka/kernel/TaskKernelGenericSycl.hpp b/include/alpaka/kernel/TaskKernelGenericSycl.hpp index 289a6f72c819..92eab6fa5221 100644 --- a/include/alpaka/kernel/TaskKernelGenericSycl.hpp +++ b/include/alpaka/kernel/TaskKernelGenericSycl.hpp @@ -36,7 +36,7 @@ # pragma clang diagnostic ignored "-Wunused-parameter" # endif -# include +# include # define LAUNCH_SYCL_KERNEL_IF_SUBGROUP_SIZE_IS(sub_group_size) \ cgh.parallel_for( \ diff --git a/include/alpaka/math/MathGenericSycl.hpp b/include/alpaka/math/MathGenericSycl.hpp index ba99befbf66a..53898f210680 100644 --- a/include/alpaka/math/MathGenericSycl.hpp +++ b/include/alpaka/math/MathGenericSycl.hpp @@ -12,7 +12,7 @@ #ifdef ALPAKA_ACC_SYCL_ENABLED -# include +# include //! The mathematical operation specifics. namespace alpaka::math diff --git a/include/alpaka/mem/buf/BufGenericSycl.hpp b/include/alpaka/mem/buf/BufGenericSycl.hpp index 38241c83773b..c8849cedf7dd 100644 --- a/include/alpaka/mem/buf/BufGenericSycl.hpp +++ b/include/alpaka/mem/buf/BufGenericSycl.hpp @@ -20,7 +20,7 @@ #ifdef ALPAKA_ACC_SYCL_ENABLED -# include +# include namespace alpaka { diff --git a/include/alpaka/mem/buf/sycl/Accessor.hpp b/include/alpaka/mem/buf/sycl/Accessor.hpp index 6d2dec54cb7c..a83cd3d7d8e9 100644 --- a/include/alpaka/mem/buf/sycl/Accessor.hpp +++ b/include/alpaka/mem/buf/sycl/Accessor.hpp @@ -13,7 +13,7 @@ #ifdef ALPAKA_ACC_SYCL_ENABLED -# include +# include namespace alpaka { diff --git a/include/alpaka/mem/buf/sycl/Common.hpp b/include/alpaka/mem/buf/sycl/Common.hpp index 80e9763ebf75..87058bc6ca92 100644 --- a/include/alpaka/mem/buf/sycl/Common.hpp +++ b/include/alpaka/mem/buf/sycl/Common.hpp @@ -12,7 +12,7 @@ #ifdef ALPAKA_ACC_SYCL_ENABLED -# include +# include namespace alpaka::detail { diff --git a/include/alpaka/mem/buf/sycl/Copy.hpp b/include/alpaka/mem/buf/sycl/Copy.hpp index e1d8ce640c32..8e8cf533f747 100644 --- a/include/alpaka/mem/buf/sycl/Copy.hpp +++ b/include/alpaka/mem/buf/sycl/Copy.hpp @@ -22,7 +22,7 @@ #ifdef ALPAKA_ACC_SYCL_ENABLED -# include +# include namespace alpaka::detail { diff --git a/include/alpaka/mem/fence/MemFenceGenericSycl.hpp b/include/alpaka/mem/fence/MemFenceGenericSycl.hpp index c0a308448f8e..0b7559f3d61b 100644 --- a/include/alpaka/mem/fence/MemFenceGenericSycl.hpp +++ b/include/alpaka/mem/fence/MemFenceGenericSycl.hpp @@ -8,7 +8,7 @@ #ifdef ALPAKA_ACC_SYCL_ENABLED -# include +# include namespace alpaka { diff --git a/include/alpaka/pltf/PltfCpuSycl.hpp b/include/alpaka/pltf/PltfCpuSycl.hpp index 40f6855eab1a..c30793dd98a9 100644 --- a/include/alpaka/pltf/PltfCpuSycl.hpp +++ b/include/alpaka/pltf/PltfCpuSycl.hpp @@ -12,7 +12,7 @@ #if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_BACKEND_ONEAPI) && defined(ALPAKA_SYCL_ONEAPI_CPU) -# include +# include namespace alpaka { diff --git a/include/alpaka/pltf/PltfFpgaSyclIntel.hpp b/include/alpaka/pltf/PltfFpgaSyclIntel.hpp index 1d908c832f1b..9b5e1b100373 100644 --- a/include/alpaka/pltf/PltfFpgaSyclIntel.hpp +++ b/include/alpaka/pltf/PltfFpgaSyclIntel.hpp @@ -10,7 +10,7 @@ #if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_BACKEND_ONEAPI) && defined(ALPAKA_SYCL_ONEAPI_FPGA) -# include +# include # include diff --git a/include/alpaka/pltf/PltfGenericSycl.hpp b/include/alpaka/pltf/PltfGenericSycl.hpp index d2a33b959b70..8c67f5f50c31 100644 --- a/include/alpaka/pltf/PltfGenericSycl.hpp +++ b/include/alpaka/pltf/PltfGenericSycl.hpp @@ -20,7 +20,7 @@ #ifdef ALPAKA_ACC_SYCL_ENABLED -# include +# include namespace alpaka { diff --git a/include/alpaka/pltf/PltfGpuSyclIntel.hpp b/include/alpaka/pltf/PltfGpuSyclIntel.hpp index 2e5d7583aaab..9c0e6db1eb97 100644 --- a/include/alpaka/pltf/PltfGpuSyclIntel.hpp +++ b/include/alpaka/pltf/PltfGpuSyclIntel.hpp @@ -12,7 +12,7 @@ #if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_BACKEND_ONEAPI) && defined(ALPAKA_SYCL_ONEAPI_GPU) -# include +# include namespace alpaka { diff --git a/include/alpaka/queue/sycl/QueueGenericSyclBase.hpp b/include/alpaka/queue/sycl/QueueGenericSyclBase.hpp index 0f847079de95..9b08f3de55ca 100644 --- a/include/alpaka/queue/sycl/QueueGenericSyclBase.hpp +++ b/include/alpaka/queue/sycl/QueueGenericSyclBase.hpp @@ -21,7 +21,7 @@ #ifdef ALPAKA_ACC_SYCL_ENABLED -# include +# include namespace alpaka::detail { diff --git a/include/alpaka/rand/RandGenericSycl.hpp b/include/alpaka/rand/RandGenericSycl.hpp index befaa5d7005a..6a85a3a79a5a 100644 --- a/include/alpaka/rand/RandGenericSycl.hpp +++ b/include/alpaka/rand/RandGenericSycl.hpp @@ -12,7 +12,7 @@ #ifdef ALPAKA_ACC_SYCL_ENABLED // Backend specific imports. -# include +# include # if BOOST_COMP_CLANG # pragma clang diagnostic push # pragma clang diagnostic ignored "-Wcast-align" diff --git a/include/alpaka/warp/WarpGenericSycl.hpp b/include/alpaka/warp/WarpGenericSycl.hpp index eb4a21a5a801..b2b6aa8328b5 100644 --- a/include/alpaka/warp/WarpGenericSycl.hpp +++ b/include/alpaka/warp/WarpGenericSycl.hpp @@ -10,7 +10,7 @@ #ifdef ALPAKA_ACC_SYCL_ENABLED -# include +# include namespace alpaka::warp { diff --git a/include/alpaka/workdiv/WorkDivGenericSycl.hpp b/include/alpaka/workdiv/WorkDivGenericSycl.hpp index 725e1ff78824..26e00750e42d 100644 --- a/include/alpaka/workdiv/WorkDivGenericSycl.hpp +++ b/include/alpaka/workdiv/WorkDivGenericSycl.hpp @@ -10,7 +10,7 @@ #ifdef ALPAKA_ACC_SYCL_ENABLED -# include +# include namespace alpaka {