Skip to content

Commit

Permalink
Align BabelStream with LLAMA version
Browse files Browse the repository at this point in the history
  • Loading branch information
bernhardmgruber authored and psychocoderHPC committed Jan 11, 2024
1 parent 0562877 commit bb21231
Show file tree
Hide file tree
Showing 5 changed files with 47 additions and 40 deletions.
55 changes: 29 additions & 26 deletions example/babelstream/src/AlpakaStream.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,23 +11,26 @@

#include <numeric>

constexpr auto TBSIZE = 1024;
constexpr auto DOT_NUM_BLOCKS = 256;
namespace
{
constexpr auto blockSize = 1024;
constexpr auto dotBlockSize = 256;
} // namespace

template<typename T>
AlpakaStream<T>::AlpakaStream(Idx arraySize, Idx deviceIndex)
: arraySize(arraySize)
, devHost(alpaka::getDevByIdx(platformHost, 0))
, devAcc(alpaka::getDevByIdx(platformAcc, deviceIndex))
, sums(alpaka::allocBuf<T, Idx>(devHost, DOT_NUM_BLOCKS))
, sums(alpaka::allocBuf<T, Idx>(devHost, dotBlockSize))
, d_a(alpaka::allocBuf<T, Idx>(devAcc, arraySize))
, d_b(alpaka::allocBuf<T, Idx>(devAcc, arraySize))
, d_c(alpaka::allocBuf<T, Idx>(devAcc, arraySize))
, d_sum(alpaka::allocBuf<T, Idx>(devAcc, DOT_NUM_BLOCKS))
, d_sum(alpaka::allocBuf<T, Idx>(devAcc, dotBlockSize))
, queue(devAcc)
{
if(arraySize % TBSIZE != 0)
throw std::runtime_error("Array size must be a multiple of " + std::to_string(TBSIZE));
if(arraySize % blockSize != 0)
throw std::runtime_error("Array size must be a multiple of " + std::to_string(blockSize));
std::cout << "Using alpaka device " << alpaka::getName(devAcc) << std::endl;
}

Expand All @@ -46,7 +49,7 @@ struct InitKernel
template<typename T>
void AlpakaStream<T>::init_arrays(T initA, T initB, T initC)
{
auto const workdiv = WorkDiv{arraySize / TBSIZE, TBSIZE, 1};
auto const workdiv = WorkDiv{arraySize / blockSize, blockSize, 1};
// auto const workdiv = alpaka::getValidWorkDiv(devAcc, arraySize);
alpaka::exec<Acc>(
queue,
Expand Down Expand Up @@ -82,7 +85,7 @@ struct CopyKernel
template<typename T>
void AlpakaStream<T>::copy()
{
auto const workdiv = WorkDiv{arraySize / TBSIZE, TBSIZE, 1};
auto const workdiv = WorkDiv{arraySize / blockSize, blockSize, 1};
// auto const workdiv = alpaka::getValidWorkDiv(devAcc, arraySize);
alpaka::exec<Acc>(queue, workdiv, CopyKernel{}, alpaka::getPtrNative(d_a), alpaka::getPtrNative(d_c));
alpaka::wait(queue);
Expand All @@ -102,7 +105,7 @@ struct MulKernel
template<typename T>
void AlpakaStream<T>::mul()
{
auto const workdiv = WorkDiv{arraySize / TBSIZE, TBSIZE, 1};
auto const workdiv = WorkDiv{arraySize / blockSize, blockSize, 1};
// auto const workdiv = alpaka::getValidWorkDiv(devAcc, arraySize);
alpaka::exec<Acc>(queue, workdiv, MulKernel{}, alpaka::getPtrNative(d_b), alpaka::getPtrNative(d_c));
alpaka::wait(queue);
Expand All @@ -121,7 +124,7 @@ struct AddKernel
template<typename T>
void AlpakaStream<T>::add()
{
auto const workdiv = WorkDiv{arraySize / TBSIZE, TBSIZE, 1};
auto const workdiv = WorkDiv{arraySize / blockSize, blockSize, 1};
// auto const workdiv = alpaka::getValidWorkDiv(devAcc, arraySize);
alpaka::exec<Acc>(
queue,
Expand All @@ -147,7 +150,7 @@ struct TriadKernel
template<typename T>
void AlpakaStream<T>::triad()
{
auto const workdiv = WorkDiv{arraySize / TBSIZE, TBSIZE, 1};
auto const workdiv = WorkDiv{arraySize / blockSize, blockSize, 1};
// auto const workdiv = alpaka::getValidWorkDiv(devAcc, arraySize);
alpaka::exec<Acc>(
queue,
Expand All @@ -173,7 +176,7 @@ struct NstreamKernel
template<typename T>
void AlpakaStream<T>::nstream()
{
auto const workdiv = WorkDiv{arraySize / TBSIZE, TBSIZE, 1};
auto const workdiv = WorkDiv{arraySize / blockSize, blockSize, 1};
// auto const workdiv = alpaka::getValidWorkDiv(devAcc, arraySize);
alpaka::exec<Acc>(
queue,
Expand All @@ -190,37 +193,37 @@ struct DotKernel
template<typename TAcc, typename T>
ALPAKA_FN_ACC void operator()(TAcc const& acc, T const* a, T const* b, T* sum, int arraySize) const
{
// TODO - test if sharedMem bug is affecting performance here
auto& tb_sum = alpaka::declareSharedVar<T[TBSIZE], __COUNTER__>(acc);
// TODO(Jeff Young) - test if sharedMem bug is affecting performance here
auto& tbSum = alpaka::declareSharedVar<T[blockSize], __COUNTER__>(acc);

auto [i] = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
auto const [local_i] = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc);
auto const [totalThreads] = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);

T thread_sum = 0;
for(; i < arraySize; i += totalThreads)
thread_sum += a[i] * b[i];
tb_sum[local_i] = thread_sum;
T threadSum = 0;
for(; i < arraySize; i += totalThreads) // NOLINT(bugprone-infinite-loop)
threadSum += a[i] * b[i];
tbSum[local_i] = threadSum;

auto const [blockDim] = alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc);
for(int offset = blockDim / 2; offset > 0; offset /= 2)
{
alpaka::syncBlockThreads(acc);
if(local_i < offset)
tb_sum[local_i] += tb_sum[local_i + offset];
tbSum[local_i] += tbSum[local_i + offset];
}

auto const [blockIdx] = alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc);
if(local_i == 0)
sum[blockIdx] = tb_sum[local_i];
sum[blockIdx] = tbSum[local_i];
}
};

template<typename T>
T AlpakaStream<T>::dot()
auto AlpakaStream<T>::dot() -> T
{
auto const workdiv = WorkDiv{DOT_NUM_BLOCKS, TBSIZE, 1};
// auto const workdiv = alpaka::getValidWorkDiv(devAcc, DOT_NUM_BLOCKS * TBSIZE);
auto const workdiv = WorkDiv{dotBlockSize, blockSize, 1};
// auto const workdiv = alpaka::getValidWorkDiv(devAcc, dotBlockSize * blockSize);
alpaka::exec<Acc>(
queue,
workdiv,
Expand All @@ -234,7 +237,7 @@ T AlpakaStream<T>::dot()
alpaka::memcpy(queue, sums, d_sum);
T const* sumPtr = alpaka::getPtrNative(sums);
// TODO(bgruber): replace by std::reduce, when gcc 9.3 is the baseline
return std::accumulate(sumPtr, sumPtr + DOT_NUM_BLOCKS, T{0});
return std::accumulate(sumPtr, sumPtr + dotBlockSize, T{0});
}

void listDevices()
Expand All @@ -246,13 +249,13 @@ void listDevices()
std::cout << i << ": " << getDeviceName(i) << std::endl;
}

std::string getDeviceName(int deviceIndex)
auto getDeviceName(int deviceIndex) -> std::string
{
auto const platform = alpaka::Platform<Acc>{};
return alpaka::getName(alpaka::getDevByIdx(platform, deviceIndex));
}

std::string getDeviceDriver(int device)
auto getDeviceDriver([[maybe_unused]] int device) -> std::string
{
return "Not supported";
}
Expand Down
20 changes: 8 additions & 12 deletions example/babelstream/src/AlpakaStream.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,15 +28,15 @@ struct AlpakaStream : Stream<T>
{
AlpakaStream(Idx arraySize, Idx deviceIndex);

virtual void copy() override;
virtual void add() override;
virtual void mul() override;
virtual void triad() override;
virtual void nstream() override;
virtual T dot() override;
void copy() override;
void add() override;
void mul() override;
void triad() override;
void nstream() override;
auto dot() -> T override;

virtual void init_arrays(T initA, T initB, T initC) override;
virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
void init_arrays(T initA, T initB, T initC) override;
void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;

using PlatformHost = alpaka::PlatformCpu;
using DevHost = alpaka::Dev<PlatformHost>;
Expand All @@ -61,7 +61,3 @@ struct AlpakaStream : Stream<T>
BufAcc d_sum;
Queue queue;
};

void listDevices();
std::string getDeviceName(int deviceIndex);
std::string getDeviceDriver(int device);
4 changes: 2 additions & 2 deletions example/babelstream/src/README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
This is a port of [BabelStream](https://github.com/UoB-HPC/BabelStream) to alpaka.
This work is based on the [cupla port of BabelStream]( https://github.com/jyoung3131/BabelStream) from Jeff Young.
The benchmark driver (main.cpp and Stream.h) is taken from BabelStream.
This work is based on the [cupla port of BabelStream](https://github.com/jyoung3131/BabelStream) from Jeff Young.
The benchmark driver (`main.cpp` and `Stream.h`) is taken from BabelStream.
No other backends are available, only alpaka.
Thus, there is no need to select a backend, just run the executable.
Please refer to the BabelStream documentation for more information on how to run the benchmark.
4 changes: 4 additions & 0 deletions example/babelstream/src/Stream.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
// For full license terms please see the LICENSE file distributed with this
// source code

// NOLINTBEGIN

#pragma once

#include <string>
Expand Down Expand Up @@ -42,3 +44,5 @@ class Stream
void listDevices(void);
std::string getDeviceName(int const);
std::string getDeviceDriver(int const);

// NOLINTEND
4 changes: 4 additions & 0 deletions example/babelstream/src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
// For full license terms please see the LICENSE file distributed with this
// source code

// NOLINTBEGIN

#include <algorithm>
#include <chrono>
#include <cmath>
Expand Down Expand Up @@ -582,3 +584,5 @@ void parseArguments(int argc, char* argv[])
}
}
}

// NOLINTEND

0 comments on commit bb21231

Please sign in to comment.