From 11ab2182995b4aa325b037dd1f08cc90b244c4bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Widera?= Date: Fri, 20 Sep 2024 10:37:21 +0200 Subject: [PATCH] Reduce Uniform/Independent-Elements iterator register footprint (#2383) Rewrite `UniformElements` and `IndependentElements` iterators to reduce the register footprint. - avoid multiple returns within a function - reduce the iterator state size by one element --- example/bufferCopy/src/bufferCopy.cpp | 2 +- include/alpaka/exec/IndependentElements.hpp | 50 +++++++++------------ include/alpaka/exec/UniformElements.hpp | 35 +++++++-------- 3 files changed, 38 insertions(+), 49 deletions(-) diff --git a/example/bufferCopy/src/bufferCopy.cpp b/example/bufferCopy/src/bufferCopy.cpp index d07bb032338..7c2f9e164d6 100644 --- a/example/bufferCopy/src/bufferCopy.cpp +++ b/example/bufferCopy/src/bufferCopy.cpp @@ -95,7 +95,7 @@ auto example(TAccTag const&) -> int // Define the work division for kernels to be run on devAcc and devHost using Vec = alpaka::Vec; - Vec const elementsPerThread(Vec::all(static_cast(1))); + Vec const elementsPerThread(Vec::all(static_cast(3))); Vec const elementsPerGrid(Vec::all(static_cast(10))); // Create host and device buffers diff --git a/include/alpaka/exec/IndependentElements.hpp b/include/alpaka/exec/IndependentElements.hpp index 3af342bf249..447fa7ef177 100644 --- a/include/alpaka/exec/IndependentElements.hpp +++ b/include/alpaka/exec/IndependentElements.hpp @@ -146,9 +146,9 @@ namespace alpaka }; private: - const Idx first_; - const Idx stride_; - const Idx extent_; + Idx const first_; + Idx const stride_; + Idx const extent_; }; } // namespace detail @@ -311,11 +311,12 @@ namespace alpaka ALPAKA_FN_ACC inline const_iterator(Idx elements, Idx stride, Idx extent, Idx first) : elements_{elements} - , stride_{stride} + , + // we need to reduce the stride by one element range because index_ is later increased with each + // increment + stride_{stride - elements} , extent_{extent} - , 
first_{std::min(first, extent)} - , index_{first_} - , range_{std::min(first + elements, extent)} + , index_{std::min(first, extent)} { } @@ -328,22 +329,16 @@ namespace alpaka // pre-increment the iterator ALPAKA_FN_ACC inline const_iterator& operator++() { - // increment the index along the elements processed by the current thread + ++indexElem_; ++index_; - if(index_ < range_) - return *this; - - // increment the thread index with the block stride - first_ += stride_; - index_ = first_; - range_ = std::min(first_ + elements_, extent_); - if(index_ < extent_) - return *this; + if(indexElem_ >= elements_) + { + indexElem_ = 0; + index_ += stride_; + } + if(index_ >= extent_) + index_ = extent_; - // the iterator has reached or passed the end of the extent, clamp it to the extent - first_ = extent_; - index_ = extent_; - range_ = extent_; return *this; } @@ -357,7 +352,7 @@ namespace alpaka ALPAKA_FN_ACC inline bool operator==(const_iterator const& other) const { - return (index_ == other.index_) and (first_ == other.first_); + return (*(*this) == *other); } ALPAKA_FN_ACC inline bool operator!=(const_iterator const& other) const @@ -371,16 +366,15 @@ namespace alpaka Idx stride_; Idx extent_; // modified by the pre/post-increment operator - Idx first_; Idx index_; - Idx range_; + Idx indexElem_ = 0; }; private: - const Idx elements_; - const Idx thread_; - const Idx stride_; - const Idx extent_; + Idx const elements_; + Idx const thread_; + Idx const stride_; + Idx const extent_; }; } // namespace detail diff --git a/include/alpaka/exec/UniformElements.hpp b/include/alpaka/exec/UniformElements.hpp index b7f6cd2ee54..2bfbc94ad24 100644 --- a/include/alpaka/exec/UniformElements.hpp +++ b/include/alpaka/exec/UniformElements.hpp @@ -130,11 +130,12 @@ namespace alpaka ALPAKA_FN_ACC inline const_iterator(Idx elements, Idx stride, Idx extent, Idx first) : elements_{elements} - , stride_{stride} + , + // we need to reduce the stride by one element range because index_ is 
later increased with each + // increment + stride_{stride - elements} , extent_{extent} - , first_{std::min(first, extent)} - , index_{first_} - , range_{std::min(first + elements, extent)} + , index_{std::min(first, extent)} { } @@ -148,21 +149,16 @@ namespace alpaka ALPAKA_FN_ACC inline const_iterator& operator++() { // increment the index along the elements processed by the current thread + ++indexElem_; ++index_; - if(index_ < range_) - return *this; - - // increment the thread index with the grid stride - first_ += stride_; - index_ = first_; - range_ = std::min(first_ + elements_, extent_); - if(index_ < extent_) - return *this; + if(indexElem_ >= elements_) + { + indexElem_ = 0; + index_ += stride_; + } + if(index_ >= extent_) + index_ = extent_; - // the iterator has reached or passed the end of the extent, clamp it to the extent - first_ = extent_; - index_ = extent_; - range_ = extent_; return *this; } @@ -176,7 +172,7 @@ namespace alpaka ALPAKA_FN_ACC inline bool operator==(const_iterator const& other) const { - return (index_ == other.index_) and (first_ == other.first_); + return (*(*this) == *other); } ALPAKA_FN_ACC inline bool operator!=(const_iterator const& other) const @@ -190,9 +186,8 @@ namespace alpaka Idx stride_; Idx extent_; // modified by the pre/post-increment operator - Idx first_; Idx index_; - Idx range_; + Idx indexElem_ = 0; }; private: