From 53392f4f7a0b5f6348ce128fe2b06cd6f650d4ce Mon Sep 17 00:00:00 2001 From: Tapish Date: Tue, 3 Sep 2024 13:33:29 +0200 Subject: [PATCH] Add 2D heat equation example with shared memory and validation --- example/CMakeLists.txt | 1 + example/heatEquation2D/CMakeLists.txt | 67 ++++++ example/heatEquation2D/src/BoundaryKernel.hpp | 86 +++++++ example/heatEquation2D/src/StencilKernel.hpp | 89 +++++++ .../heatEquation2D/src/analyticalSolution.hpp | 71 ++++++ example/heatEquation2D/src/heatEquation2D.cpp | 217 ++++++++++++++++++ example/heatEquation2D/src/helpers.hpp | 25 ++ example/heatEquation2D/src/writeImage.hpp | 38 +++ 8 files changed, 594 insertions(+) create mode 100644 example/heatEquation2D/CMakeLists.txt create mode 100644 example/heatEquation2D/src/BoundaryKernel.hpp create mode 100644 example/heatEquation2D/src/StencilKernel.hpp create mode 100644 example/heatEquation2D/src/analyticalSolution.hpp create mode 100644 example/heatEquation2D/src/heatEquation2D.cpp create mode 100644 example/heatEquation2D/src/helpers.hpp create mode 100644 example/heatEquation2D/src/writeImage.hpp diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt index d981136dbb2..4d3ae6a1e0f 100644 --- a/example/CMakeLists.txt +++ b/example/CMakeLists.txt @@ -22,6 +22,7 @@ add_subdirectory("convolution2D/") add_subdirectory("conv2DWithMdspan/") add_subdirectory("counterBasedRng/") add_subdirectory("heatEquation/") +add_subdirectory("heatEquation2D/") add_subdirectory("helloWorld/") add_subdirectory("helloWorldLambda/") add_subdirectory("kernelSpecialization/") diff --git a/example/heatEquation2D/CMakeLists.txt b/example/heatEquation2D/CMakeLists.txt new file mode 100644 index 00000000000..ab9c5fe65d6 --- /dev/null +++ b/example/heatEquation2D/CMakeLists.txt @@ -0,0 +1,67 @@ +# +# Copyright 2023 Benjamin Worpitz, Jan Stephan +# SPDX-License-Identifier: ISC +# + +################################################################################ +# Required CMake version. + +cmake_minimum_required(VERSION 3.22) + +set_property(GLOBAL PROPERTY USE_FOLDERS ON) + +################################################################################ +# Project. + +set(_TARGET_NAME heatEquation2D) + +project(${_TARGET_NAME} LANGUAGES CXX) + + +################################################################################ +# PNGwriter +################################################################################ + +# find PNGwriter installation +find_package(PNGwriter 0.7.0 CONFIG) + +if(PNGwriter_FOUND) + set(PNGWRITER_ENABLED True) +else() + set(PNGWRITER_ENABLED False) +endif() + +#------------------------------------------------------------------------------- +# Find alpaka. + +if(NOT TARGET alpaka::alpaka) + option(alpaka_USE_SOURCE_TREE "Use alpaka's source tree instead of an alpaka installation" OFF) + + if(alpaka_USE_SOURCE_TREE) + # Don't build the examples recursively + set(alpaka_BUILD_EXAMPLES OFF) + add_subdirectory("${CMAKE_CURRENT_LIST_DIR}/../.." "${CMAKE_BINARY_DIR}/alpaka") + else() + find_package(alpaka REQUIRED) + endif() +endif() + +#------------------------------------------------------------------------------- +# Add executable. + +alpaka_add_executable( + ${_TARGET_NAME} + src/heatEquation2D.cpp) +target_link_libraries( + ${_TARGET_NAME} + PUBLIC alpaka::alpaka) +if(PNGwriter_FOUND) + target_link_libraries( + ${_TARGET_NAME} + PRIVATE PNGwriter::PNGwriter) + target_compile_definitions(${_TARGET_NAME} PRIVATE PNGWRITER_ENABLED) +endif() + +set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER example) + +add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME}) diff --git a/example/heatEquation2D/src/BoundaryKernel.hpp b/example/heatEquation2D/src/BoundaryKernel.hpp new file mode 100644 index 00000000000..e2a4d022a7b --- /dev/null +++ b/example/heatEquation2D/src/BoundaryKernel.hpp @@ -0,0 +1,86 @@ +/* Copyright 2024 Tapish Narwal + * SPDX-License-Identifier: ISC + */ + +#pragma once + +#include "analyticalSolution.hpp" +#include "helpers.hpp" + +#include + +//! alpaka version of explicit finite-difference 1d heat equation solver +//! +//! Applies boundary conditions +//! forward difference in t and second-order central difference in x +//! +//! \param uBuf grid values of u for each x, y and the current value of t: +//! u(x, y, t) | t = t_current +//! \param chunkSize +//! \param pitch +//! \param dx step in x +//! \param dy step in y +//! \param dt step in t +struct BoundaryKernel +{ + template + ALPAKA_FN_ACC auto operator()( + TAcc const& acc, + double* const uBuf, + TChunk const chunkSize, + TChunk const pitch, + uint32_t step, + double const dx, + double const dy, + double const dt) const -> void + { + using Dim = alpaka::DimInt<2u>; + using Idx = uint32_t; + + // Get extents(dimensions) + auto const gridBlockExtent = alpaka::getWorkDiv(acc); + auto const blockThreadExtent = alpaka::getWorkDiv(acc); + auto const numThreadsPerBlock = blockThreadExtent.prod(); + + // Get indexes + auto const gridBlockIdx = alpaka::getIdx(acc); + auto const blockThreadIdx = alpaka::getIdx(acc); + auto const threadIdx1D = alpaka::mapIdx<1>(blockThreadIdx, blockThreadExtent)[0u]; + auto const blockStartIdx = gridBlockIdx * chunkSize; + + // Lambda function to apply boundary conditions + auto applyBoundary = [&](auto const& globalIdxStart, auto const length, bool isRow) + { + for(auto i = threadIdx1D; i < length; i += numThreadsPerBlock) + { + auto idx2D = globalIdxStart + (isRow ? alpaka::Vec{0, i} : alpaka::Vec{i, 0}); + auto elem = getElementPtr(uBuf, idx2D, pitch); + *elem = exactSolution(idx2D[1] * dx, idx2D[0] * dy, step * dt); + } + }; + + // Apply boundary conditions for the top row + if(gridBlockIdx[0] == 0) + { + applyBoundary(blockStartIdx + alpaka::Vec{0, 1}, chunkSize[1], true); + } + + // Apply boundary conditions for the bottom row + if(gridBlockIdx[0] == gridBlockExtent[0] - 1) + { + applyBoundary(blockStartIdx + alpaka::Vec{chunkSize[0] + 1, 1}, chunkSize[1], true); + } + + // Apply boundary conditions for the left column + if(gridBlockIdx[1] == 0) + { + applyBoundary(blockStartIdx + alpaka::Vec{1, 0}, chunkSize[0], false); + } + + // Apply boundary conditions for the right column + if(gridBlockIdx[1] == gridBlockExtent[1] - 1) + { + applyBoundary(blockStartIdx + alpaka::Vec{1, chunkSize[1] + 1}, chunkSize[0], false); + } + } +}; diff --git a/example/heatEquation2D/src/StencilKernel.hpp b/example/heatEquation2D/src/StencilKernel.hpp new file mode 100644 index 00000000000..a6121d3dbb9 --- /dev/null +++ b/example/heatEquation2D/src/StencilKernel.hpp @@ -0,0 +1,89 @@ +/* Copyright 2024 Tapish Narwal + * SPDX-License-Identifier: ISC + */ + +#pragma once + +#include "helpers.hpp" + +#include + +//! alpaka version of explicit finite-difference 2D heat equation solver +//! +//! \tparam T_SharedMemSize1D size of the shared memory box +//! +//! Solving equation u_t(x, t) = u_xx(x, t) + u_yy(y, t) using a simple explicit scheme with +//! forward difference in t and second-order central difference in x and y +//! +//! \param uCurrBuf Current buffer with grid values of u for each x, y pair and the current value of t: +//! u(x, y, t) | t = t_current +//! \param uNextBuf resulting grid values of u for each x, y pair and the next value of t: +//! u(x, y, t) | t = t_current + dt +//! \param chunkSize The size of the chunk or tile that the user divides the problem into. This defines the size of the +//! workload handled by each thread block. +//! \param pitchCurr The pitch (or stride) in memory corresponding to the TDim grid in the accelerator's memory. +//! This is used to calculate memory offsets when accessing elements in the current buffer. +//! \param pitchNext The pitch used to calculate memory offsets when accessing elements in the next buffer. +//! \param dx step in x +//! \param dy step in y +//! \param dt step in t +template +struct StencilKernel +{ + template + ALPAKA_FN_ACC auto operator()( + TAcc const& acc, + double const* const uCurrBuf, + double* const uNextBuf, + alpaka::Vec const chunkSize, + alpaka::Vec const pitchCurr, + alpaka::Vec const pitchNext, + double const dx, + double const dy, + double const dt) const -> void + { + auto& sdata = alpaka::declareSharedVar(acc); + + // Get extents(dimensions) + auto const blockThreadExtent = alpaka::getWorkDiv(acc); + auto const numThreadsPerBlock = blockThreadExtent.prod(); + + // Get indexes + auto const gridBlockIdx = alpaka::getIdx(acc); + auto const blockThreadIdx = alpaka::getIdx(acc); + auto const threadIdx1D = alpaka::mapIdx<1>(blockThreadIdx, blockThreadExtent)[0u]; + auto const blockStartIdx = gridBlockIdx * chunkSize; + + constexpr alpaka::Vec halo{2, 2}; + + for(auto i = threadIdx1D; i < T_SharedMemSize1D; i += numThreadsPerBlock) + { + auto idx2d = alpaka::mapIdx<2>(alpaka::Vec(i), chunkSize + halo); + idx2d = idx2d + blockStartIdx; + auto elem = getElementPtr(uCurrBuf, idx2d, pitchCurr); + sdata[i] = *elem; + } + + alpaka::syncBlockThreads(acc); + + // Each kernel executes one element + double const rX = dt / (dx * dx); + double const rY = dt / (dy * dy); + + // go over only core cells + for(auto i = threadIdx1D; i < chunkSize.prod(); i += numThreadsPerBlock) + { + auto idx2D = alpaka::mapIdx<2>(alpaka::Vec(i), chunkSize); + idx2D = idx2D + alpaka::Vec{1, 1}; // offset for halo above and to the left + auto localIdx1D = alpaka::mapIdx<1>(idx2D, chunkSize + halo)[0u]; + + + auto bufIdx = idx2D + blockStartIdx; + auto elem = getElementPtr(uNextBuf, bufIdx, pitchNext); + + *elem = sdata[localIdx1D] * (1.0 - 2.0 * rX - 2.0 * rY) + sdata[localIdx1D - 1] * rX + + sdata[localIdx1D + 1] * rX + sdata[localIdx1D - chunkSize[1] - halo[1]] * rY + + sdata[localIdx1D + chunkSize[1] + halo[1]] * rY; + } + } +}; diff --git a/example/heatEquation2D/src/analyticalSolution.hpp b/example/heatEquation2D/src/analyticalSolution.hpp new file mode 100644 index 00000000000..161d37869b3 --- /dev/null +++ b/example/heatEquation2D/src/analyticalSolution.hpp @@ -0,0 +1,71 @@ +/* Copyright 2020 Tapish Narwal + * SPDX-License-Identifier: ISC + */ + +#pragma once + +#include + +#include + +//! Exact solution to the test problem +//! u_t(x, y, t) = u_xx(x, t) + u_yy(y, t), x in [0, 1], y in [0, 1], t in [0, T] +//! +//! \param x value of x +//! \param x value of y +//! \param t value of t +ALPAKA_FN_HOST_ACC auto exactSolution(double const x, double const y, double const t) -> double +{ + constexpr double pi = alpaka::math::constants::pi; + return std::exp(-pi * pi * t) * (std::sin(pi * x) + std::sin(pi * y)); +} + +//! Valdidate calculated solution in the buffer to the analytical solution at t=tMax +//! +//! \param buffer buffer holding the solution at t=tMax +//! \param extent extents of the buffer +//! \param dx +//! \param dy +//! \param tMax time at simulation end +template +auto validateSolution( + T_Buffer const& buffer, + T_Extent const& extent, + double const dx, + double const dy, + double const tMax) -> std::pair +{ + // Calculate error + double maxError = 0.0; + for(uint32_t j = 1; j < extent[0] - 1; ++j) + { + for(uint32_t i = 0; i < extent[1]; ++i) + { + auto const error = std::abs(buffer.data()[j * extent[1] + i] - exactSolution(i * dx, j * dy, tMax)); + maxError = std::max(maxError, error); + } + } + + constexpr double errorThreshold = 1e-5; + return std::make_pair(maxError < errorThreshold, maxError); +} + +//! Initialize the buffer to the analytical solution at t=0 +//! +//! \param buffer buffer holding the solution at tMax +//! \param extent extents of the buffer +//! \param dx +//! \param dy +template +auto initalizeBuffer(TBuffer& buffer, double const dx, double const dy) -> void +{ + auto extents = alpaka::getExtents(buffer); + // Apply initial conditions for the test problem + for(uint32_t j = 0; j < extents[0]; ++j) + { + for(uint32_t i = 0; i < extents[1]; ++i) + { + buffer.data()[j * extents[1] + i] = exactSolution(i * dx, j * dy, 0.0); + } + } +} diff --git a/example/heatEquation2D/src/heatEquation2D.cpp b/example/heatEquation2D/src/heatEquation2D.cpp new file mode 100644 index 00000000000..458a632d2e4 --- /dev/null +++ b/example/heatEquation2D/src/heatEquation2D.cpp @@ -0,0 +1,217 @@ +/* Copyright 2020 Benjamin Worpitz, Matthias Werner, Jakob Krude, Sergei Bastrakov, Bernhard Manfred Gruber, + * Tapish Narwal + * SPDX-License-Identifier: ISC + */ + +#include "BoundaryKernel.hpp" +#include "StencilKernel.hpp" +#include "analyticalSolution.hpp" + +#ifdef PNGWRITER_ENABLED +# include "writeImage.hpp" +#endif + +#include +#include + +#include +#include +#include +#include +#include +#include + +//! Each kernel computes the next step for one point. +//! Therefore the number of threads should be equal to numNodesX. +//! Every time step the kernel will be executed numNodesX-times +//! After every step the curr-buffer will be set to the calculated values +//! from the next-buffer. +//! +//! In standard projects, you typically do not execute the code with any available accelerator. +//! Instead, a single accelerator is selected once from the active accelerators and the kernels are executed with the +//! selected accelerator only. If you use the example as the starting point for your project, you can rename the +//! example() function to main() and move the accelerator tag to the function body. +template +auto example(TAccTag const&) -> int +{ + // Set Dim and Idx type + using Dim = alpaka::DimInt<2u>; + using Idx = uint32_t; + + // Define the accelerator + using Acc = alpaka::TagToAcc; + std::cout << "Using alpaka accelerator: " << alpaka::getAccName() << std::endl; + + // Select specific devices + auto const platformHost = alpaka::PlatformCpu{}; + auto const devHost = alpaka::getDevByIdx(platformHost, 0); + auto const platformAcc = alpaka::Platform{}; + // get suitable device for this Acc + auto const devAcc = alpaka::getDevByIdx(platformAcc, 0); + + // simulation defines + // {Y, X} + constexpr alpaka::Vec numNodes{64, 64}; + constexpr alpaka::Vec haloSize{2, 2}; + constexpr alpaka::Vec extent = numNodes + haloSize; + + constexpr uint32_t numTimeSteps = 100; + constexpr double tMax = 0.001; + // x, y in [0, 1], t in [0, tMax] + constexpr double dx = 1.0 / static_cast(extent[1] - 1); + constexpr double dy = 1.0 / static_cast(extent[0] - 1); + constexpr double dt = tMax / static_cast(numTimeSteps); + + // Check the stability condition + constexpr double r = dt / std::min(dx * dx, dy * dy); + if constexpr(r > 0.5) + { + std::cerr << "Stability condition check failed: dt/min(dx^2,dy^2) = " << r + << ", it is required to be <= 0.5\n"; + return EXIT_FAILURE; + } + + // Initialize host-buffer + // This buffer will hold the current values (used for the next step) + auto uBufHost = alpaka::allocBuf(devHost, extent); + + // Accelerator buffer + auto uCurrBufAcc = alpaka::allocBuf(devAcc, extent); + auto uNextBufAcc = alpaka::allocBuf(devAcc, extent); + + auto const pitchCurrAcc{alpaka::getPitchesInBytes(uCurrBufAcc)}; + auto const pitchNextAcc{alpaka::getPitchesInBytes(uNextBufAcc)}; + + // Set buffer to initial conditions + initalizeBuffer(uBufHost, dx, dy); + + // Select queue + using QueueProperty = alpaka::NonBlocking; + using QueueAcc = alpaka::Queue; + QueueAcc dumpQueue{devAcc}; + QueueAcc computeQueue{devAcc}; + + // Copy host -> device + alpaka::memcpy(computeQueue, uCurrBufAcc, uBufHost); + alpaka::wait(computeQueue); + + // Define a workdiv for the given problem + constexpr alpaka::Vec elemPerThread{1, 1}; + + // Appropriate chunk size to split your problem for your Acc + constexpr Idx xSize = 16u; + constexpr Idx ySize = 16u; + constexpr Idx halo = 2u; + constexpr alpaka::Vec chunkSize{ySize, xSize}; + constexpr auto sharedMemSize = (ySize + halo) * (xSize + halo); + + constexpr alpaka::Vec numChunks{ + alpaka::core::divCeil(numNodes[0], chunkSize[0]), + alpaka::core::divCeil(numNodes[1], chunkSize[1]), + }; + + assert( + numNodes[0] % chunkSize[0] == 0 && numNodes[1] % chunkSize[1] == 0 + && "Domain must be divisible by chunk size"); + + StencilKernel stencilKernel; + BoundaryKernel boundaryKernel; + + // Get max threads that can be run in a block for this kernel + auto const kernelFunctionAttributes = alpaka::getFunctionAttributes( + devAcc, + stencilKernel, + uCurrBufAcc.data(), + uNextBufAcc.data(), + chunkSize, + pitchCurrAcc, + pitchNextAcc, + dx, + dy, + dt); + auto const maxThreadsPerBlock = kernelFunctionAttributes.maxThreadsPerBlock; + + auto const threadsPerBlock + = maxThreadsPerBlock < chunkSize.prod() ? alpaka::Vec{maxThreadsPerBlock, 1} : chunkSize; + + alpaka::WorkDivMembers workDiv_manual{numChunks, threadsPerBlock, elemPerThread}; + + // Simulate + for(uint32_t step = 1; step <= numTimeSteps; ++step) + { + // Compute next values + alpaka::exec( + computeQueue, + workDiv_manual, + stencilKernel, + uCurrBufAcc.data(), + uNextBufAcc.data(), + chunkSize, + pitchCurrAcc, + pitchNextAcc, + dx, + dy, + dt); + + // Apply boundaries + alpaka::exec( + computeQueue, + workDiv_manual, + boundaryKernel, + uNextBufAcc.data(), + chunkSize, + pitchNextAcc, + step, + dx, + dy, + dt); + +#ifdef PNGWRITER_ENABLED + if((step - 1) % 100 == 0) + { + alpaka::memcpy(dumpQueue, uBufHost, uCurrBufAcc); + alpaka::wait(dumpQueue); + writeImage(step - 1, uBufHost); + } +#endif + + // So we just swap next and curr (shallow copy) + std::swap(uNextBufAcc, uCurrBufAcc); + } + + // Copy device -> host + alpaka::wait(computeQueue); + alpaka::memcpy(dumpQueue, uBufHost, uCurrBufAcc); + alpaka::wait(dumpQueue); + + // Validate + auto const [resultIsCorrect, maxError] = validateSolution(uBufHost, extent, dx, dy, tMax); + + if(resultIsCorrect) + { + std::cout << "Execution results correct!" << std::endl; + return EXIT_SUCCESS; + } + else + { + std::cout << "Execution results incorrect: Max error = " << maxError << " (the grid resolution may be too low)" + << std::endl; + return EXIT_FAILURE; + } +} + +auto main() -> int +{ + // Execute the example once for each enabled accelerator. + // If you would like to execute it for a single accelerator only you can use the following code. + // \code{.cpp} + // auto tag = TagCpuSerial; + // return example(tag); + // \endcode + // + // valid tags: + // TagCpuSerial, TagGpuHipRt, TagGpuCudaRt, TagCpuOmp2Blocks, TagCpuTbbBlocks, + // TagCpuOmp2Threads, TagCpuSycl, TagCpuTbbBlocks, TagCpuThreads, + // TagFpgaSyclIntel, TagGenericSycl, TagGpuSyclIntel + return alpaka::executeForEachAccTag([=](auto const& tag) { return example(tag); }); +} diff --git a/example/heatEquation2D/src/helpers.hpp b/example/heatEquation2D/src/helpers.hpp new file mode 100644 index 00000000000..adac2dfe4f0 --- /dev/null +++ b/example/heatEquation2D/src/helpers.hpp @@ -0,0 +1,25 @@ +/* Copyright 2020 Tapish Narwal + * SPDX-License-Identifier: ISC + */ + +#pragma once + +#include + +template +using const_match = std::conditional_t, U const, U>; + +//! Helper function to get a pointer to an element in a multidimensional buffer +//! +//! \tparam T type of the element +//! \tparam TDim dimension of the buffer +//! \tparam TIdx index type +//! \param ptr pointer to the buffer +//! \param idx index of the element +//! \param pitch pitch of the buffer +template +ALPAKA_FN_ACC T* getElementPtr(T* ptr, alpaka::Vec idx, alpaka::Vec pitch) +{ + return reinterpret_cast( + reinterpret_cast*>(ptr) + idx[0] * pitch[0] + idx[1] * pitch[1]); +} diff --git a/example/heatEquation2D/src/writeImage.hpp b/example/heatEquation2D/src/writeImage.hpp new file mode 100644 index 00000000000..dc35867aa48 --- /dev/null +++ b/example/heatEquation2D/src/writeImage.hpp @@ -0,0 +1,38 @@ +/* Copyright 2024 Tapish Narwal + * SPDX-License-Identifier: ISC + */ + +#pragma once + +#include + +#include + +#include +#include +#include + +//! Writes the buffer to a png file +//! +//! \param currentStep the current step of the simulation +//! \param buffer the buffer to write to the file +template +auto writeImage(uint32_t const currentStep, T_Buffer const& buffer) -> void +{ + std::stringstream step; + step << std::setw(6) << std::setfill('0') << currentStep; + std::string filename("heat_" + step.str() + ".png"); + auto extents = alpaka::getExtents(buffer); + pngwriter png{static_cast(extents[1]), static_cast(extents[0]), 0, filename.c_str()}; + png.setcompressionlevel(9); + + for(uint32_t y = 0; y < extents[0]; ++y) + { + for(uint32_t x = 0; x < extents[1]; ++x) + { + auto p = buffer.data()[y * extents[1] + x]; + png.plot(x + 1, extents[0] - y, p, 0., 1. - p); + } + } + png.close(); +}