From 53392f4f7a0b5f6348ce128fe2b06cd6f650d4ce Mon Sep 17 00:00:00 2001
From: Tapish <narwal83@hzdr.de>
Date: Tue, 3 Sep 2024 13:33:29 +0200
Subject: [PATCH] Add 2D heat equation example with shared memory and
 validation

---
 example/CMakeLists.txt                        |   1 +
 example/heatEquation2D/CMakeLists.txt         |  67 ++++++
 example/heatEquation2D/src/BoundaryKernel.hpp |  86 +++++++
 example/heatEquation2D/src/StencilKernel.hpp  |  89 +++++++
 .../heatEquation2D/src/analyticalSolution.hpp |  71 ++++++
 example/heatEquation2D/src/heatEquation2D.cpp | 217 ++++++++++++++++++
 example/heatEquation2D/src/helpers.hpp        |  25 ++
 example/heatEquation2D/src/writeImage.hpp     |  38 +++
 8 files changed, 594 insertions(+)
 create mode 100644 example/heatEquation2D/CMakeLists.txt
 create mode 100644 example/heatEquation2D/src/BoundaryKernel.hpp
 create mode 100644 example/heatEquation2D/src/StencilKernel.hpp
 create mode 100644 example/heatEquation2D/src/analyticalSolution.hpp
 create mode 100644 example/heatEquation2D/src/heatEquation2D.cpp
 create mode 100644 example/heatEquation2D/src/helpers.hpp
 create mode 100644 example/heatEquation2D/src/writeImage.hpp

diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt
index d981136dbb2..4d3ae6a1e0f 100644
--- a/example/CMakeLists.txt
+++ b/example/CMakeLists.txt
@@ -22,6 +22,7 @@ add_subdirectory("convolution2D/")
 add_subdirectory("conv2DWithMdspan/")
 add_subdirectory("counterBasedRng/")
 add_subdirectory("heatEquation/")
+add_subdirectory("heatEquation2D/")
 add_subdirectory("helloWorld/")
 add_subdirectory("helloWorldLambda/")
 add_subdirectory("kernelSpecialization/")
diff --git a/example/heatEquation2D/CMakeLists.txt b/example/heatEquation2D/CMakeLists.txt
new file mode 100644
index 00000000000..ab9c5fe65d6
--- /dev/null
+++ b/example/heatEquation2D/CMakeLists.txt
@@ -0,0 +1,67 @@
+#
+# Copyright 2023 Benjamin Worpitz, Jan Stephan
+# SPDX-License-Identifier: ISC
+#
+
+################################################################################
+# Required CMake version.
+
+cmake_minimum_required(VERSION 3.22)
+
+set_property(GLOBAL PROPERTY USE_FOLDERS ON)
+
+################################################################################
+# Project.
+
+set(_TARGET_NAME heatEquation2D)
+
+project(${_TARGET_NAME} LANGUAGES CXX)
+
+
+################################################################################
+# PNGwriter
+################################################################################
+
+# find PNGwriter installation
+find_package(PNGwriter 0.7.0 CONFIG)
+
+if(PNGwriter_FOUND)
+  set(PNGWRITER_ENABLED True)
+else()
+  set(PNGWRITER_ENABLED False)
+endif()
+
+#-------------------------------------------------------------------------------
+# Find alpaka.
+
+if(NOT TARGET alpaka::alpaka)
+    option(alpaka_USE_SOURCE_TREE "Use alpaka's source tree instead of an alpaka installation" OFF)
+
+    if(alpaka_USE_SOURCE_TREE)
+        # Don't build the examples recursively
+        set(alpaka_BUILD_EXAMPLES OFF)
+        add_subdirectory("${CMAKE_CURRENT_LIST_DIR}/../.." "${CMAKE_BINARY_DIR}/alpaka")
+    else()
+        find_package(alpaka REQUIRED)
+    endif()
+endif()
+
+#-------------------------------------------------------------------------------
+# Add executable.
+
+alpaka_add_executable(
+    ${_TARGET_NAME}
+    src/heatEquation2D.cpp)
+target_link_libraries(
+    ${_TARGET_NAME}
+    PUBLIC alpaka::alpaka)
+if(PNGwriter_FOUND)
+    target_link_libraries(
+        ${_TARGET_NAME}
+        PRIVATE PNGwriter::PNGwriter)
+    target_compile_definitions(${_TARGET_NAME} PRIVATE PNGWRITER_ENABLED)
+endif()
+
+set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER example)
+
+add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME})
diff --git a/example/heatEquation2D/src/BoundaryKernel.hpp b/example/heatEquation2D/src/BoundaryKernel.hpp
new file mode 100644
index 00000000000..e2a4d022a7b
--- /dev/null
+++ b/example/heatEquation2D/src/BoundaryKernel.hpp
@@ -0,0 +1,86 @@
+/* Copyright 2024 Tapish Narwal
+ * SPDX-License-Identifier: ISC
+ */
+
+#pragma once
+
+#include "analyticalSolution.hpp"
+#include "helpers.hpp"
+
+#include <alpaka/alpaka.hpp>
+
+//! alpaka version of explicit finite-difference 1d heat equation solver
+//!
+//! Applies boundary conditions
+//! forward difference in t and second-order central difference in x
+//!
+//! \param uBuf grid values of u for each x, y and the current value of t:
+//!                 u(x, y, t)  | t = t_current
+//! \param chunkSize
+//! \param pitch
+//! \param dx step in x
+//! \param dy step in y
+//! \param dt step in t
+struct BoundaryKernel
+{
+    template<typename TAcc, typename TChunk>
+    ALPAKA_FN_ACC auto operator()(
+        TAcc const& acc,
+        double* const uBuf,
+        TChunk const chunkSize,
+        TChunk const pitch,
+        uint32_t step,
+        double const dx,
+        double const dy,
+        double const dt) const -> void
+    {
+        using Dim = alpaka::DimInt<2u>;
+        using Idx = uint32_t;
+
+        // Get extents(dimensions)
+        auto const gridBlockExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc);
+        auto const blockThreadExtent = alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc);
+        auto const numThreadsPerBlock = blockThreadExtent.prod();
+
+        // Get indexes
+        auto const gridBlockIdx = alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc);
+        auto const blockThreadIdx = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc);
+        auto const threadIdx1D = alpaka::mapIdx<1>(blockThreadIdx, blockThreadExtent)[0u];
+        auto const blockStartIdx = gridBlockIdx * chunkSize;
+
+        // Lambda function to apply boundary conditions
+        auto applyBoundary = [&](auto const& globalIdxStart, auto const length, bool isRow)
+        {
+            for(auto i = threadIdx1D; i < length; i += numThreadsPerBlock)
+            {
+                auto idx2D = globalIdxStart + (isRow ? alpaka::Vec<Dim, Idx>{0, i} : alpaka::Vec<Dim, Idx>{i, 0});
+                auto elem = getElementPtr(uBuf, idx2D, pitch);
+                *elem = exactSolution(idx2D[1] * dx, idx2D[0] * dy, step * dt);
+            }
+        };
+
+        // Apply boundary conditions for the top row
+        if(gridBlockIdx[0] == 0)
+        {
+            applyBoundary(blockStartIdx + alpaka::Vec<Dim, Idx>{0, 1}, chunkSize[1], true);
+        }
+
+        // Apply boundary conditions for the bottom row
+        if(gridBlockIdx[0] == gridBlockExtent[0] - 1)
+        {
+            applyBoundary(blockStartIdx + alpaka::Vec<Dim, Idx>{chunkSize[0] + 1, 1}, chunkSize[1], true);
+        }
+
+        // Apply boundary conditions for the left column
+        if(gridBlockIdx[1] == 0)
+        {
+            applyBoundary(blockStartIdx + alpaka::Vec<Dim, Idx>{1, 0}, chunkSize[0], false);
+        }
+
+        // Apply boundary conditions for the right column
+        if(gridBlockIdx[1] == gridBlockExtent[1] - 1)
+        {
+            applyBoundary(blockStartIdx + alpaka::Vec<Dim, Idx>{1, chunkSize[1] + 1}, chunkSize[0], false);
+        }
+    }
+};
diff --git a/example/heatEquation2D/src/StencilKernel.hpp b/example/heatEquation2D/src/StencilKernel.hpp
new file mode 100644
index 00000000000..a6121d3dbb9
--- /dev/null
+++ b/example/heatEquation2D/src/StencilKernel.hpp
@@ -0,0 +1,89 @@
+/* Copyright 2024 Tapish Narwal
+ * SPDX-License-Identifier: ISC
+ */
+
+#pragma once
+
+#include "helpers.hpp"
+
+#include <alpaka/alpaka.hpp>
+
+//! alpaka version of explicit finite-difference 2D heat equation solver
+//!
+//! \tparam T_SharedMemSize1D size of the shared memory box
+//!
+//! Solving equation u_t(x, t) = u_xx(x, t) + u_yy(y, t) using a simple explicit scheme with
+//! forward difference in t and second-order central difference in x and y
+//!
+//! \param uCurrBuf Current buffer with grid values of u for each x, y pair and the current value of t:
+//!                 u(x, y, t) | t = t_current
+//! \param uNextBuf resulting grid values of u for each x, y pair and the next value of t:
+//!              u(x, y, t) | t = t_current + dt
+//! \param chunkSize The size of the chunk or tile that the user divides the problem into. This defines the size of the
+//!                  workload handled by each thread block.
+//! \param pitchCurr The pitch (or stride) in memory corresponding to the TDim grid in the accelerator's memory.
+//!              This is used to calculate memory offsets when accessing elements in the current buffer.
+//! \param pitchNext The pitch used to calculate memory offsets when accessing elements in the next buffer.
+//! \param dx step in x
+//! \param dy step in y
+//! \param dt step in t
+template<size_t T_SharedMemSize1D>
+struct StencilKernel
+{
+    template<typename TAcc, typename TDim, typename TIdx>
+    ALPAKA_FN_ACC auto operator()(
+        TAcc const& acc,
+        double const* const uCurrBuf,
+        double* const uNextBuf,
+        alpaka::Vec<TDim, TIdx> const chunkSize,
+        alpaka::Vec<TDim, TIdx> const pitchCurr,
+        alpaka::Vec<TDim, TIdx> const pitchNext,
+        double const dx,
+        double const dy,
+        double const dt) const -> void
+    {
+        auto& sdata = alpaka::declareSharedVar<double[T_SharedMemSize1D], __COUNTER__>(acc);
+
+        // Get extents(dimensions)
+        auto const blockThreadExtent = alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc);
+        auto const numThreadsPerBlock = blockThreadExtent.prod();
+
+        // Get indexes
+        auto const gridBlockIdx = alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc);
+        auto const blockThreadIdx = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc);
+        auto const threadIdx1D = alpaka::mapIdx<1>(blockThreadIdx, blockThreadExtent)[0u];
+        auto const blockStartIdx = gridBlockIdx * chunkSize;
+
+        constexpr alpaka::Vec<TDim, TIdx> halo{2, 2};
+
+        for(auto i = threadIdx1D; i < T_SharedMemSize1D; i += numThreadsPerBlock)
+        {
+            auto idx2d = alpaka::mapIdx<2>(alpaka::Vec(i), chunkSize + halo);
+            idx2d = idx2d + blockStartIdx;
+            auto elem = getElementPtr(uCurrBuf, idx2d, pitchCurr);
+            sdata[i] = *elem;
+        }
+
+        alpaka::syncBlockThreads(acc);
+
+        // Each kernel executes one element
+        double const rX = dt / (dx * dx);
+        double const rY = dt / (dy * dy);
+
+        // go over only core cells
+        for(auto i = threadIdx1D; i < chunkSize.prod(); i += numThreadsPerBlock)
+        {
+            auto idx2D = alpaka::mapIdx<2>(alpaka::Vec(i), chunkSize);
+            idx2D = idx2D + alpaka::Vec<TDim, TIdx>{1, 1}; // offset for halo above and to the left
+            auto localIdx1D = alpaka::mapIdx<1>(idx2D, chunkSize + halo)[0u];
+
+
+            auto bufIdx = idx2D + blockStartIdx;
+            auto elem = getElementPtr(uNextBuf, bufIdx, pitchNext);
+
+            *elem = sdata[localIdx1D] * (1.0 - 2.0 * rX - 2.0 * rY) + sdata[localIdx1D - 1] * rX
+                    + sdata[localIdx1D + 1] * rX + sdata[localIdx1D - chunkSize[1] - halo[1]] * rY
+                    + sdata[localIdx1D + chunkSize[1] + halo[1]] * rY;
+        }
+    }
+};
diff --git a/example/heatEquation2D/src/analyticalSolution.hpp b/example/heatEquation2D/src/analyticalSolution.hpp
new file mode 100644
index 00000000000..161d37869b3
--- /dev/null
+++ b/example/heatEquation2D/src/analyticalSolution.hpp
@@ -0,0 +1,71 @@
+/* Copyright 2020 Tapish Narwal
+ * SPDX-License-Identifier: ISC
+ */
+
+#pragma once
+
+#include <alpaka/alpaka.hpp>
+
+#include <cmath>
+
+//! Exact solution to the test problem
+//! u_t(x, y, t) = u_xx(x, t) + u_yy(y, t), x in [0, 1], y in [0, 1], t in [0, T]
+//!
+//! \param x value of x
+//! \param x value of y
+//! \param t value of t
+ALPAKA_FN_HOST_ACC auto exactSolution(double const x, double const y, double const t) -> double
+{
+    constexpr double pi = alpaka::math::constants::pi;
+    return std::exp(-pi * pi * t) * (std::sin(pi * x) + std::sin(pi * y));
+}
+
+//! Valdidate calculated solution in the buffer to the analytical solution at t=tMax
+//!
+//! \param buffer buffer holding the solution at t=tMax
+//! \param extent extents of the buffer
+//! \param dx
+//! \param dy
+//! \param tMax time at simulation end
+template<typename T_Buffer, typename T_Extent>
+auto validateSolution(
+    T_Buffer const& buffer,
+    T_Extent const& extent,
+    double const dx,
+    double const dy,
+    double const tMax) -> std::pair<bool, double>
+{
+    // Calculate error
+    double maxError = 0.0;
+    for(uint32_t j = 1; j < extent[0] - 1; ++j)
+    {
+        for(uint32_t i = 0; i < extent[1]; ++i)
+        {
+            auto const error = std::abs(buffer.data()[j * extent[1] + i] - exactSolution(i * dx, j * dy, tMax));
+            maxError = std::max(maxError, error);
+        }
+    }
+
+    constexpr double errorThreshold = 1e-5;
+    return std::make_pair(maxError < errorThreshold, maxError);
+}
+
+//! Initialize the buffer to the analytical solution at t=0
+//!
+//! \param buffer buffer holding the solution at tMax
+//! \param extent extents of the buffer
+//! \param dx
+//! \param dy
+template<typename TBuffer>
+auto initalizeBuffer(TBuffer& buffer, double const dx, double const dy) -> void
+{
+    auto extents = alpaka::getExtents(buffer);
+    // Apply initial conditions for the test problem
+    for(uint32_t j = 0; j < extents[0]; ++j)
+    {
+        for(uint32_t i = 0; i < extents[1]; ++i)
+        {
+            buffer.data()[j * extents[1] + i] = exactSolution(i * dx, j * dy, 0.0);
+        }
+    }
+}
diff --git a/example/heatEquation2D/src/heatEquation2D.cpp b/example/heatEquation2D/src/heatEquation2D.cpp
new file mode 100644
index 00000000000..458a632d2e4
--- /dev/null
+++ b/example/heatEquation2D/src/heatEquation2D.cpp
@@ -0,0 +1,217 @@
+/* Copyright 2020 Benjamin Worpitz, Matthias Werner, Jakob Krude, Sergei Bastrakov, Bernhard Manfred Gruber,
+ * Tapish Narwal
+ * SPDX-License-Identifier: ISC
+ */
+
+#include "BoundaryKernel.hpp"
+#include "StencilKernel.hpp"
+#include "analyticalSolution.hpp"
+
+#ifdef PNGWRITER_ENABLED
+#    include "writeImage.hpp"
+#endif
+
+#include <alpaka/alpaka.hpp>
+#include <alpaka/example/ExecuteForEachAccTag.hpp>
+
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+#include <cstdint>
+#include <iostream>
+#include <utility>
+
+//! Each kernel computes the next step for one point.
+//! Therefore the number of threads should be equal to numNodesX.
+//! Every time step the kernel will be executed numNodesX-times
+//! After every step the curr-buffer will be set to the calculated values
+//! from the next-buffer.
+//!
+//! In standard projects, you typically do not execute the code with any available accelerator.
+//! Instead, a single accelerator is selected once from the active accelerators and the kernels are executed with the
+//! selected accelerator only. If you use the example as the starting point for your project, you can rename the
+//! example() function to main() and move the accelerator tag to the function body.
+template<typename TAccTag>
+auto example(TAccTag const&) -> int
+{
+    // Set Dim and Idx type
+    using Dim = alpaka::DimInt<2u>;
+    using Idx = uint32_t;
+
+    // Define the accelerator
+    using Acc = alpaka::TagToAcc<TAccTag, Dim, Idx>;
+    std::cout << "Using alpaka accelerator: " << alpaka::getAccName<Acc>() << std::endl;
+
+    // Select specific devices
+    auto const platformHost = alpaka::PlatformCpu{};
+    auto const devHost = alpaka::getDevByIdx(platformHost, 0);
+    auto const platformAcc = alpaka::Platform<Acc>{};
+    // get suitable device for this Acc
+    auto const devAcc = alpaka::getDevByIdx(platformAcc, 0);
+
+    // simulation defines
+    // {Y, X}
+    constexpr alpaka::Vec<Dim, Idx> numNodes{64, 64};
+    constexpr alpaka::Vec<Dim, Idx> haloSize{2, 2};
+    constexpr alpaka::Vec<Dim, Idx> extent = numNodes + haloSize;
+
+    constexpr uint32_t numTimeSteps = 100;
+    constexpr double tMax = 0.001;
+    // x, y in [0, 1], t in [0, tMax]
+    constexpr double dx = 1.0 / static_cast<double>(extent[1] - 1);
+    constexpr double dy = 1.0 / static_cast<double>(extent[0] - 1);
+    constexpr double dt = tMax / static_cast<double>(numTimeSteps);
+
+    // Check the stability condition
+    constexpr double r = dt / std::min(dx * dx, dy * dy);
+    if constexpr(r > 0.5)
+    {
+        std::cerr << "Stability condition check failed: dt/min(dx^2,dy^2) = " << r
+                  << ", it is required to be <= 0.5\n";
+        return EXIT_FAILURE;
+    }
+
+    // Initialize host-buffer
+    // This buffer will hold the current values (used for the next step)
+    auto uBufHost = alpaka::allocBuf<double, Idx>(devHost, extent);
+
+    // Accelerator buffer
+    auto uCurrBufAcc = alpaka::allocBuf<double, Idx>(devAcc, extent);
+    auto uNextBufAcc = alpaka::allocBuf<double, Idx>(devAcc, extent);
+
+    auto const pitchCurrAcc{alpaka::getPitchesInBytes(uCurrBufAcc)};
+    auto const pitchNextAcc{alpaka::getPitchesInBytes(uNextBufAcc)};
+
+    // Set buffer to initial conditions
+    initalizeBuffer(uBufHost, dx, dy);
+
+    // Select queue
+    using QueueProperty = alpaka::NonBlocking;
+    using QueueAcc = alpaka::Queue<Acc, QueueProperty>;
+    QueueAcc dumpQueue{devAcc};
+    QueueAcc computeQueue{devAcc};
+
+    // Copy host -> device
+    alpaka::memcpy(computeQueue, uCurrBufAcc, uBufHost);
+    alpaka::wait(computeQueue);
+
+    // Define a workdiv for the given problem
+    constexpr alpaka::Vec<Dim, Idx> elemPerThread{1, 1};
+
+    // Appropriate chunk size to split your problem for your Acc
+    constexpr Idx xSize = 16u;
+    constexpr Idx ySize = 16u;
+    constexpr Idx halo = 2u;
+    constexpr alpaka::Vec<Dim, Idx> chunkSize{ySize, xSize};
+    constexpr auto sharedMemSize = (ySize + halo) * (xSize + halo);
+
+    constexpr alpaka::Vec<Dim, Idx> numChunks{
+        alpaka::core::divCeil(numNodes[0], chunkSize[0]),
+        alpaka::core::divCeil(numNodes[1], chunkSize[1]),
+    };
+
+    assert(
+        numNodes[0] % chunkSize[0] == 0 && numNodes[1] % chunkSize[1] == 0
+        && "Domain must be divisible by chunk size");
+
+    StencilKernel<sharedMemSize> stencilKernel;
+    BoundaryKernel boundaryKernel;
+
+    // Get max threads that can be run in a block for this kernel
+    auto const kernelFunctionAttributes = alpaka::getFunctionAttributes<Acc>(
+        devAcc,
+        stencilKernel,
+        uCurrBufAcc.data(),
+        uNextBufAcc.data(),
+        chunkSize,
+        pitchCurrAcc,
+        pitchNextAcc,
+        dx,
+        dy,
+        dt);
+    auto const maxThreadsPerBlock = kernelFunctionAttributes.maxThreadsPerBlock;
+
+    auto const threadsPerBlock
+        = maxThreadsPerBlock < chunkSize.prod() ? alpaka::Vec<Dim, Idx>{maxThreadsPerBlock, 1} : chunkSize;
+
+    alpaka::WorkDivMembers<Dim, Idx> workDiv_manual{numChunks, threadsPerBlock, elemPerThread};
+
+    // Simulate
+    for(uint32_t step = 1; step <= numTimeSteps; ++step)
+    {
+        // Compute next values
+        alpaka::exec<Acc>(
+            computeQueue,
+            workDiv_manual,
+            stencilKernel,
+            uCurrBufAcc.data(),
+            uNextBufAcc.data(),
+            chunkSize,
+            pitchCurrAcc,
+            pitchNextAcc,
+            dx,
+            dy,
+            dt);
+
+        // Apply boundaries
+        alpaka::exec<Acc>(
+            computeQueue,
+            workDiv_manual,
+            boundaryKernel,
+            uNextBufAcc.data(),
+            chunkSize,
+            pitchNextAcc,
+            step,
+            dx,
+            dy,
+            dt);
+
+#ifdef PNGWRITER_ENABLED
+        if((step - 1) % 100 == 0)
+        {
+            alpaka::memcpy(dumpQueue, uBufHost, uCurrBufAcc);
+            alpaka::wait(dumpQueue);
+            writeImage(step - 1, uBufHost);
+        }
+#endif
+
+        // So we just swap next and curr (shallow copy)
+        std::swap(uNextBufAcc, uCurrBufAcc);
+    }
+
+    // Copy device -> host
+    alpaka::wait(computeQueue);
+    alpaka::memcpy(dumpQueue, uBufHost, uCurrBufAcc);
+    alpaka::wait(dumpQueue);
+
+    // Validate
+    auto const [resultIsCorrect, maxError] = validateSolution(uBufHost, extent, dx, dy, tMax);
+
+    if(resultIsCorrect)
+    {
+        std::cout << "Execution results correct!" << std::endl;
+        return EXIT_SUCCESS;
+    }
+    else
+    {
+        std::cout << "Execution results incorrect: Max error = " << maxError << " (the grid resolution may be too low)"
+                  << std::endl;
+        return EXIT_FAILURE;
+    }
+}
+
+auto main() -> int
+{
+    // Execute the example once for each enabled accelerator.
+    // If you would like to execute it for a single accelerator only you can use the following code.
+    //  \code{.cpp}
+    //  auto tag = TagCpuSerial;
+    //  return example(tag);
+    //  \endcode
+    //
+    // valid tags:
+    //   TagCpuSerial, TagGpuHipRt, TagGpuCudaRt, TagCpuOmp2Blocks, TagCpuTbbBlocks,
+    //   TagCpuOmp2Threads, TagCpuSycl, TagCpuTbbBlocks, TagCpuThreads,
+    //   TagFpgaSyclIntel, TagGenericSycl, TagGpuSyclIntel
+    return alpaka::executeForEachAccTag([=](auto const& tag) { return example(tag); });
+}
diff --git a/example/heatEquation2D/src/helpers.hpp b/example/heatEquation2D/src/helpers.hpp
new file mode 100644
index 00000000000..adac2dfe4f0
--- /dev/null
+++ b/example/heatEquation2D/src/helpers.hpp
@@ -0,0 +1,25 @@
+/* Copyright 2020 Tapish Narwal
+ * SPDX-License-Identifier: ISC
+ */
+
+#pragma once
+
+#include <alpaka/alpaka.hpp>
+
+template<typename T, typename U>
+using const_match = std::conditional_t<std::is_const_v<T>, U const, U>;
+
+//! Helper function to get a pointer to an element in a multidimensional buffer
+//!
+//! \tparam T type of the element
+//! \tparam TDim dimension of the buffer
+//! \tparam TIdx index type
+//! \param ptr pointer to the buffer
+//! \param idx index of the element
+//! \param pitch pitch of the buffer
+template<typename T, typename TDim, typename TIdx>
+ALPAKA_FN_ACC T* getElementPtr(T* ptr, alpaka::Vec<TDim, TIdx> idx, alpaka::Vec<TDim, TIdx> pitch)
+{
+    return reinterpret_cast<T*>(
+        reinterpret_cast<const_match<T, std::byte>*>(ptr) + idx[0] * pitch[0] + idx[1] * pitch[1]);
+}
diff --git a/example/heatEquation2D/src/writeImage.hpp b/example/heatEquation2D/src/writeImage.hpp
new file mode 100644
index 00000000000..dc35867aa48
--- /dev/null
+++ b/example/heatEquation2D/src/writeImage.hpp
@@ -0,0 +1,38 @@
+/* Copyright 2024 Tapish Narwal
+ * SPDX-License-Identifier: ISC
+ */
+
+#pragma once
+
+#include <alpaka/extent/Traits.hpp>
+
+#include <pngwriter.h>
+
+#include <cstdint>
+#include <iomanip>
+#include <sstream>
+
+//! Writes the buffer to a png file
+//!
+//! \param currentStep the current step of the simulation
+//! \param buffer the buffer to write to the file
+template<typename T_Buffer>
+auto writeImage(uint32_t const currentStep, T_Buffer const& buffer) -> void
+{
+    std::stringstream step;
+    step << std::setw(6) << std::setfill('0') << currentStep;
+    std::string filename("heat_" + step.str() + ".png");
+    auto extents = alpaka::getExtents(buffer);
+    pngwriter png{static_cast<int>(extents[1]), static_cast<int>(extents[0]), 0, filename.c_str()};
+    png.setcompressionlevel(9);
+
+    for(uint32_t y = 0; y < extents[0]; ++y)
+    {
+        for(uint32_t x = 0; x < extents[1]; ++x)
+        {
+            auto p = buffer.data()[y * extents[1] + x];
+            png.plot(x + 1, extents[0] - y, p, 0., 1. - p);
+        }
+    }
+    png.close();
+}