From bb2123199477ef143c210ee920228c1644ec5fa3 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Wed, 10 Jan 2024 20:50:49 +0100 Subject: [PATCH] Align BabelStream with LLAMA version --- example/babelstream/src/AlpakaStream.cpp | 55 +++++++++++++----------- example/babelstream/src/AlpakaStream.h | 20 ++++----- example/babelstream/src/README.md | 4 +- example/babelstream/src/Stream.h | 4 ++ example/babelstream/src/main.cpp | 4 ++ 5 files changed, 47 insertions(+), 40 deletions(-) diff --git a/example/babelstream/src/AlpakaStream.cpp b/example/babelstream/src/AlpakaStream.cpp index bca66c7cfbee..7f6185494084 100644 --- a/example/babelstream/src/AlpakaStream.cpp +++ b/example/babelstream/src/AlpakaStream.cpp @@ -11,23 +11,26 @@ #include -constexpr auto TBSIZE = 1024; -constexpr auto DOT_NUM_BLOCKS = 256; +namespace +{ + constexpr auto blockSize = 1024; + constexpr auto dotBlockSize = 256; +} // namespace template AlpakaStream::AlpakaStream(Idx arraySize, Idx deviceIndex) : arraySize(arraySize) , devHost(alpaka::getDevByIdx(platformHost, 0)) , devAcc(alpaka::getDevByIdx(platformAcc, deviceIndex)) - , sums(alpaka::allocBuf(devHost, DOT_NUM_BLOCKS)) + , sums(alpaka::allocBuf(devHost, dotBlockSize)) , d_a(alpaka::allocBuf(devAcc, arraySize)) , d_b(alpaka::allocBuf(devAcc, arraySize)) , d_c(alpaka::allocBuf(devAcc, arraySize)) - , d_sum(alpaka::allocBuf(devAcc, DOT_NUM_BLOCKS)) + , d_sum(alpaka::allocBuf(devAcc, dotBlockSize)) , queue(devAcc) { - if(arraySize % TBSIZE != 0) - throw std::runtime_error("Array size must be a multiple of " + std::to_string(TBSIZE)); + if(arraySize % blockSize != 0) + throw std::runtime_error("Array size must be a multiple of " + std::to_string(blockSize)); std::cout << "Using alpaka device " << alpaka::getName(devAcc) << std::endl; } @@ -46,7 +49,7 @@ struct InitKernel template void AlpakaStream::init_arrays(T initA, T initB, T initC) { - auto const workdiv = WorkDiv{arraySize / TBSIZE, TBSIZE, 1}; + auto const workdiv = WorkDiv{arraySize / blockSize, blockSize, 1}; // auto const workdiv = alpaka::getValidWorkDiv(devAcc, arraySize); alpaka::exec( queue, @@ -82,7 +85,7 @@ struct CopyKernel template void AlpakaStream::copy() { - auto const workdiv = WorkDiv{arraySize / TBSIZE, TBSIZE, 1}; + auto const workdiv = WorkDiv{arraySize / blockSize, blockSize, 1}; // auto const workdiv = alpaka::getValidWorkDiv(devAcc, arraySize); alpaka::exec(queue, workdiv, CopyKernel{}, alpaka::getPtrNative(d_a), alpaka::getPtrNative(d_c)); alpaka::wait(queue); @@ -102,7 +105,7 @@ struct MulKernel template void AlpakaStream::mul() { - auto const workdiv = WorkDiv{arraySize / TBSIZE, TBSIZE, 1}; + auto const workdiv = WorkDiv{arraySize / blockSize, blockSize, 1}; // auto const workdiv = alpaka::getValidWorkDiv(devAcc, arraySize); alpaka::exec(queue, workdiv, MulKernel{}, alpaka::getPtrNative(d_b), alpaka::getPtrNative(d_c)); alpaka::wait(queue); @@ -121,7 +124,7 @@ struct AddKernel template void AlpakaStream::add() { - auto const workdiv = WorkDiv{arraySize / TBSIZE, TBSIZE, 1}; + auto const workdiv = WorkDiv{arraySize / blockSize, blockSize, 1}; // auto const workdiv = alpaka::getValidWorkDiv(devAcc, arraySize); alpaka::exec( queue, @@ -147,7 +150,7 @@ struct TriadKernel template void AlpakaStream::triad() { - auto const workdiv = WorkDiv{arraySize / TBSIZE, TBSIZE, 1}; + auto const workdiv = WorkDiv{arraySize / blockSize, blockSize, 1}; // auto const workdiv = alpaka::getValidWorkDiv(devAcc, arraySize); alpaka::exec( queue, @@ -173,7 +176,7 @@ struct NstreamKernel template void AlpakaStream::nstream() { - auto const workdiv = WorkDiv{arraySize / TBSIZE, TBSIZE, 1}; + auto const workdiv = WorkDiv{arraySize / blockSize, blockSize, 1}; // auto const workdiv = alpaka::getValidWorkDiv(devAcc, arraySize); alpaka::exec( queue, @@ -190,37 +193,37 @@ struct DotKernel template ALPAKA_FN_ACC void operator()(TAcc const& acc, T const* a, T const* b, T* sum, int arraySize) const { - // TODO - test if sharedMem bug is affecting performance here - auto& tb_sum = alpaka::declareSharedVar(acc); + // TODO(Jeff Young) - test if sharedMem bug is affecting performance here + auto& tbSum = alpaka::declareSharedVar(acc); auto [i] = alpaka::getIdx(acc); auto const [local_i] = alpaka::getIdx(acc); auto const [totalThreads] = alpaka::getWorkDiv(acc); - T thread_sum = 0; - for(; i < arraySize; i += totalThreads) - thread_sum += a[i] * b[i]; - tb_sum[local_i] = thread_sum; + T threadSum = 0; + for(; i < arraySize; i += totalThreads) // NOLINT(bugprone-infinite-loop) + threadSum += a[i] * b[i]; + tbSum[local_i] = threadSum; auto const [blockDim] = alpaka::getWorkDiv(acc); for(int offset = blockDim / 2; offset > 0; offset /= 2) { alpaka::syncBlockThreads(acc); if(local_i < offset) - tb_sum[local_i] += tb_sum[local_i + offset]; + tbSum[local_i] += tbSum[local_i + offset]; } auto const [blockIdx] = alpaka::getIdx(acc); if(local_i == 0) - sum[blockIdx] = tb_sum[local_i]; + sum[blockIdx] = tbSum[local_i]; } }; template -T AlpakaStream::dot() +auto AlpakaStream::dot() -> T { - auto const workdiv = WorkDiv{DOT_NUM_BLOCKS, TBSIZE, 1}; - // auto const workdiv = alpaka::getValidWorkDiv(devAcc, DOT_NUM_BLOCKS * TBSIZE); + auto const workdiv = WorkDiv{dotBlockSize, blockSize, 1}; + // auto const workdiv = alpaka::getValidWorkDiv(devAcc, dotBlockSize * blockSize); alpaka::exec( queue, workdiv, @@ -234,7 +237,7 @@ T AlpakaStream::dot() alpaka::memcpy(queue, sums, d_sum); T const* sumPtr = alpaka::getPtrNative(sums); // TODO(bgruber): replace by std::reduce, when gcc 9.3 is the baseline - return std::accumulate(sumPtr, sumPtr + DOT_NUM_BLOCKS, T{0}); + return std::accumulate(sumPtr, sumPtr + dotBlockSize, T{0}); } void listDevices() @@ -246,13 +249,13 @@ void listDevices() std::cout << i << ": " << getDeviceName(i) << std::endl; } -std::string getDeviceName(int deviceIndex) +auto getDeviceName(int deviceIndex) -> std::string { auto const platform = alpaka::Platform{}; return alpaka::getName(alpaka::getDevByIdx(platform, deviceIndex)); } -std::string getDeviceDriver(int device) +auto getDeviceDriver([[maybe_unused]] int device) -> std::string { return "Not supported"; } diff --git a/example/babelstream/src/AlpakaStream.h b/example/babelstream/src/AlpakaStream.h index 35ecf5486fb5..ba556b028dba 100644 --- a/example/babelstream/src/AlpakaStream.h +++ b/example/babelstream/src/AlpakaStream.h @@ -28,15 +28,15 @@ struct AlpakaStream : Stream { AlpakaStream(Idx arraySize, Idx deviceIndex); - virtual void copy() override; - virtual void add() override; - virtual void mul() override; - virtual void triad() override; - virtual void nstream() override; - virtual T dot() override; + void copy() override; + void add() override; + void mul() override; + void triad() override; + void nstream() override; + auto dot() -> T override; - virtual void init_arrays(T initA, T initB, T initC) override; - virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + void init_arrays(T initA, T initB, T initC) override; + void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; using PlatformHost = alpaka::PlatformCpu; using DevHost = alpaka::Dev; @@ -61,7 +61,3 @@ struct AlpakaStream : Stream BufAcc d_sum; Queue queue; }; - -void listDevices(); -std::string getDeviceName(int deviceIndex); -std::string getDeviceDriver(int device); diff --git a/example/babelstream/src/README.md b/example/babelstream/src/README.md index 89215e007d19..781cdf31039f 100644 --- a/example/babelstream/src/README.md +++ b/example/babelstream/src/README.md @@ -1,6 +1,6 @@ This is a port of [BabelStream](https://github.com/UoB-HPC/BabelStream) to alpaka. -This work is based on the [cupla port of BabelStream]( https://github.com/jyoung3131/BabelStream) from Jeff Young. -The benchmark driver (main.cpp and Stream.h) is taken from BabelStream. +This work is based on the [cupla port of BabelStream](https://github.com/jyoung3131/BabelStream) from Jeff Young. +The benchmark driver (`main.cpp` and `Stream.h`) is taken from BabelStream. No other backends are available, only alpaka. Thus, there is no need to select a backend, just run the executable. Please refer to the BabelStream documentation of more information on how to run the benchmark. diff --git a/example/babelstream/src/Stream.h b/example/babelstream/src/Stream.h index 2d050972c3c0..d4548428f0bb 100644 --- a/example/babelstream/src/Stream.h +++ b/example/babelstream/src/Stream.h @@ -5,6 +5,8 @@ // For full license terms please see the LICENSE file distributed with this // source code +// NOLINTBEGIN + #pragma once #include @@ -42,3 +44,5 @@ class Stream void listDevices(void); std::string getDeviceName(int const); std::string getDeviceDriver(int const); + +// NOLINTEND diff --git a/example/babelstream/src/main.cpp b/example/babelstream/src/main.cpp index 13bd865b6295..acef1c33a60c 100644 --- a/example/babelstream/src/main.cpp +++ b/example/babelstream/src/main.cpp @@ -5,6 +5,8 @@ // For full license terms please see the LICENSE file distributed with this // source code +// NOLINTBEGIN + #include #include #include @@ -582,3 +584,5 @@ void parseArguments(int argc, char* argv[]) } } } + +// NOLINTEND