Skip to content

Commit

Permalink
remove KernelBundle, change signature of [get|is]ValidWorkDiv* (#…
Browse files Browse the repository at this point in the history
…2349)

* remove `KernelBundle`, change signature of `getValidWorkDiv*`

- revert introduced `KernelBundle` in #2251
- change signature of `getValidWorkDivForKernel` and `isValidWorkDivKernel`
- reuse old naming `getValidWorkDiv` and `isValidWorkDiv`

* use new interface for `getValidWorkDiv`

* fix cheat sheet
  • Loading branch information
psychocoderHPC authored Aug 15, 2024
1 parent ad77f15 commit d7c459d
Show file tree
Hide file tree
Showing 39 changed files with 367 additions and 400 deletions.
28 changes: 18 additions & 10 deletions docs/source/basic/cheatsheet.rst
Original file line number Diff line number Diff line change
Expand Up @@ -180,21 +180,28 @@ Prepare Kernel Bundle
.. code-block:: c++

HeatEquationKernel heatEqKernel;
// Arguments of KernelBundle: The kernel instance and the kernel arguments
auto const& bundeledKernel = alpaka::KernelBundle(heatEqKernel, pCurrAcc, pNextAcc, numNodesX, dx, dt);

Automatically select a valid kernel launch configuration
.. code-block:: c++

Vec<Dim, Idx> const globalThreadExtent = vectorValue;
Vec<Dim, Idx> const elementsPerThread = vectorValue;

auto autoWorkDiv = getValidWorkDivForKernel<Acc>(
device,
bundeledKernel,
globalThreadExtent, elementsPerThread,
KernelCfg<Acc> const kernelCfg = {
globalThreadExtent,
elementsPerThread,
false,
GridBlockExtentSubDivRestrictions::Unrestricted);
GridBlockExtentSubDivRestrictions::Unrestricted};

auto autoWorkDiv = getValidWorkDiv(
kernelCfg,
device,
heatEqKernel,
pCurrAcc,
pNextAcc,
numNodesX,
dx,
dt);

Manually set a kernel launch configuration
.. code-block:: c++
Expand All @@ -204,9 +211,10 @@ Manually set a kernel launch configuration
Vec<Dim, Idx> const elementsPerThread = vectorValue;

using WorkDiv = WorkDivMembers<Dim, Idx>;
auto manualWorkDiv = WorkDiv{blocksPerGrid,
threadsPerBlock,
elementsPerThread};
auto manualWorkDiv = WorkDiv{
blocksPerGrid,
threadsPerBlock,
elementsPerThread};

Instantiate a kernel and create a task that will run it (does not launch it yet)
.. code-block:: c++
Expand Down
12 changes: 5 additions & 7 deletions example/bufferCopy/src/bufferCopy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ auto example(TAccTag const&) -> int
using Data = std::uint32_t;
constexpr Idx nElementsPerDim = 2;

const Vec extents(Vec::all(static_cast<Idx>(nElementsPerDim)));
Vec const extents(Vec::all(static_cast<Idx>(nElementsPerDim)));

// Allocate host memory buffers
//
Expand Down Expand Up @@ -164,9 +164,8 @@ auto example(TAccTag const&) -> int

FillBufferKernel fillBufferKernel;

auto const& bundeledFillBufferKernel = alpaka::KernelBundle(fillBufferKernel, hostViewPlainPtrMdSpan);
auto const hostWorkDiv
= alpaka::getValidWorkDivForKernel<Host>(devHost, bundeledFillBufferKernel, threadsPerGrid, elementsPerThread);
alpaka::KernelCfg<Host> const hostKernelCfg = {threadsPerGrid, elementsPerThread};
auto const hostWorkDiv = alpaka::getValidWorkDiv(hostKernelCfg, devHost, fillBufferKernel, hostViewPlainPtrMdSpan);

alpaka::exec<Host>(hostQueue, hostWorkDiv, fillBufferKernel,
hostViewPlainPtrMdSpan); // 1st kernel argument
Expand Down Expand Up @@ -203,11 +202,10 @@ auto example(TAccTag const&) -> int
auto deviceBufferMdSpan2 = alpaka::experimental::getMdSpan(deviceBuffer2);

TestBufferKernel testBufferKernel;
auto const& bundeledTestBufferKernel = alpaka::KernelBundle(testBufferKernel, deviceBufferMdSpan1);

// Let alpaka calculate good block and grid sizes given our full problem extent
auto const devWorkDiv
= alpaka::getValidWorkDivForKernel<Acc>(devAcc, bundeledTestBufferKernel, threadsPerGrid, elementsPerThread);
alpaka::KernelCfg<Acc> const devKernelCfg = {threadsPerGrid, elementsPerThread};
auto const devWorkDiv = alpaka::getValidWorkDiv(devKernelCfg, devAcc, testBufferKernel, deviceBufferMdSpan1);

alpaka::exec<Acc>(devQueue, devWorkDiv, testBufferKernel, deviceBufferMdSpan1);
alpaka::exec<Acc>(devQueue, devWorkDiv, testBufferKernel, deviceBufferMdSpan2);
Expand Down
6 changes: 3 additions & 3 deletions example/complex/src/complex.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,10 +58,10 @@ auto example(TAccTag const&) -> int

ComplexKernel complexKernel;

auto const& bundeledKernel = alpaka::KernelBundle(complexKernel);
alpaka::KernelCfg<Acc> const kernelCfg = {threadsPerGrid, elementsPerThread};

// Let alpaka calculate good block and grid sizes given our full problem extent
auto const workDiv
= alpaka::getValidWorkDivForKernel<Acc>(devAcc, bundeledKernel, threadsPerGrid, elementsPerThread);
auto const workDiv = alpaka::getValidWorkDiv(kernelCfg, devAcc, complexKernel);

// Run the kernel
alpaka::exec<Acc>(queue, workDiv, complexKernel);
Expand Down
10 changes: 5 additions & 5 deletions example/conv2DWithMdspan/src/conv2DWithMdspan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -147,16 +147,16 @@ auto example(TAccTag const&) -> int
// Construct kernel object
ConvolutionKernelMdspan2D convolutionKernel2D;

// Make a bundle
auto const& bundeledKernel = alpaka::KernelBundle(
// Let alpaka calculate good block and grid sizes given our full problem extent.
alpaka::KernelCfg<DevAcc> const kernelCfg = {extent, Vec::ones()};
auto const workDiv = alpaka::getValidWorkDiv(
kernelCfg,
devAcc,
convolutionKernel2D,
alpaka::experimental::getMdSpan(bufInputAcc),
alpaka::experimental::getMdSpan(outputDeviceMemory),
alpaka::experimental::getMdSpan(bufFilterAcc));

// Let alpaka calculate good block and grid sizes given our full problem extent.
auto const workDiv = alpaka::getValidWorkDivForKernel<DevAcc>(devAcc, bundeledKernel, extent, Vec::ones());


// Run the kernel, pass 3 arrays as 2D mdspans
alpaka::exec<DevAcc>(
Expand Down
14 changes: 8 additions & 6 deletions example/convolution1D/src/convolution1D.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,8 @@ struct ConvolutionKernel
TElem const* const input,
TElem const* const filter,
TElem* const output,
const std::size_t inputSize,
const std::size_t filterSize) const -> void
std::size_t const inputSize,
std::size_t const filterSize) const -> void
{
auto const globalThreadIdxX = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];

Expand Down Expand Up @@ -140,17 +140,19 @@ auto example(TAccTag const&) -> int
DataType* nativeInputDeviceMemory = std::data(inputDeviceMemory);
DataType* nativeOutputDeviceMemory = std::data(outputDeviceMemory);

auto const& bundeledKernel = alpaka::KernelBundle(
alpaka::KernelCfg<DevAcc> const kernelCfg = {threadsPerGrid, elementsPerThread};

// Let alpaka calculate good block and grid sizes given our full problem extent
auto const workDiv = alpaka::getValidWorkDiv(
kernelCfg,
devAcc,
convolutionKernel,
nativeInputDeviceMemory,
nativeFilterDeviceMemory,
nativeOutputDeviceMemory,
inputSize,
filterSize);

// Let alpaka calculate good block and grid sizes given our full problem extent
auto const workDiv
= alpaka::getValidWorkDivForKernel<DevAcc>(devAcc, bundeledKernel, threadsPerGrid, elementsPerThread);
// Run the kernel
alpaka::exec<DevAcc>(
queue,
Expand Down
20 changes: 11 additions & 9 deletions example/convolution2D/src/convolution2D.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -265,7 +265,7 @@ auto example(TAccTag const&) -> int
alpaka::wait(queueAcc);

// Calculate the allocated width, due to padding it might be larger than the matrix width
auto const intputWidthAllocated = [&]() -> const Idx
auto const intputWidthAllocated = [&]() -> Idx const
{
// Calculate pitch: The size of one line in bytes including padding.
auto const rowPitchInput{alpaka::getPitchesInBytes(bufInputAcc)[0]};
Expand Down Expand Up @@ -294,7 +294,7 @@ auto example(TAccTag const&) -> int
alpaka::wait(queueAcc);

// Calculate the allocated width, due to padding it might be larger than the matrix width
auto const filterWidthAllocated = [&]() -> const Idx
auto const filterWidthAllocated = [&]() -> Idx const
{
// Calculate pitch: The size of one line in bytes including padding.
auto const rowPitchFilter{alpaka::getPitchesInBytes(bufFilterAcc)[0]};
Expand All @@ -305,20 +305,22 @@ auto example(TAccTag const&) -> int
// ConvolutionKernel2DSharedMemory
ConvolutionKernel2DSharedMemory convolutionKernel2D;

auto const& bundeledKernel = alpaka::KernelBundle(
alpaka::KernelCfg<DevAcc> kernelCfg = {extent, Vec::ones()};

// Let alpaka calculate good block and grid sizes given our full problem extent.
auto const workDiv = alpaka::getValidWorkDiv(
kernelCfg,
devAcc,
convolutionKernel2D,
alpaka::getPtrNative(bufInputAcc),
alpaka::getPtrNative(outputDeviceMemory),
std::data(bufInputAcc),
std::data(outputDeviceMemory),
matrixWidth,
matrixHeight,
alpaka::getPtrNative(bufFilterAcc),
std::data(bufFilterAcc),
filterWidth,
intputWidthAllocated,
filterWidthAllocated);

// Let alpaka calculate good block and grid sizes given our full problem extent.
auto const workDiv = alpaka::getValidWorkDivForKernel<DevAcc>(devAcc, bundeledKernel, extent, Vec::ones());

// Run the kernel
alpaka::exec<DevAcc>(
queueAcc,
Expand Down
23 changes: 16 additions & 7 deletions example/counterBasedRng/src/counterBasedRng.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -147,22 +147,31 @@ auto example(TAccTag const&) -> int
BufAcc bufAcc(alpaka::allocBuf<Data, Idx>(devAcc, extent));

CounterBasedRngKernel counterBasedRngKernel;
auto const& bundeledKernel
= alpaka::KernelBundle(counterBasedRngKernel, alpaka::experimental::getMdSpan(bufAcc), key);
auto const& bundeledKernel2
= alpaka::KernelBundle(counterBasedRngKernel, alpaka::experimental::getMdSpan(bufHost), key);

// Let alpaka calculate good block and grid sizes given our full problem extent
auto const workDivAcc = alpaka::getValidWorkDivForKernel<Acc>(devAcc, bundeledKernel, extent, elementsPerThread);
auto const workDivHost
= alpaka::getValidWorkDivForKernel<AccHost>(devHost, bundeledKernel2, extent, elementsPerThreadHost);
alpaka::KernelCfg<Acc> kernerlCfgAccDev = {extent, elementsPerThread};
auto const workDivAcc = alpaka::getValidWorkDiv(
kernerlCfgAccDev,
devAcc,
counterBasedRngKernel,
alpaka::experimental::getMdSpan(bufAcc),
key);

// Create the kernel execution task.
auto const taskKernelAcc = alpaka::createTaskKernel<Acc>(
workDivAcc,
CounterBasedRngKernel(),
alpaka::experimental::getMdSpan(bufAcc),
key);

alpaka::KernelCfg<AccHost> kernerlCfgAccHost = {extent, elementsPerThreadHost};
auto const workDivHost = alpaka::getValidWorkDiv(
kernerlCfgAccHost,
devHost,
counterBasedRngKernel,
alpaka::experimental::getMdSpan(bufHost),
key);

auto const taskKernelHost = alpaka::createTaskKernel<AccHost>(
workDivHost,
CounterBasedRngKernel(),
Expand Down
6 changes: 4 additions & 2 deletions example/heatEquation/src/heatEquation.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -134,9 +134,11 @@ auto example(TAccTag const&) -> int

HeatEquationKernel heatEqKernel;

auto const& bundeledKernel = alpaka::KernelBundle(heatEqKernel, pCurrAcc, pNextAcc, numNodesX, dx, dt);
alpaka::KernelCfg<Acc> const kernelCfg = {extent, elemPerThread};

// Let alpaka calculate good block and grid sizes given our full problem extent
auto const workDiv = alpaka::getValidWorkDivForKernel<Acc>(devAcc, bundeledKernel, extent, elemPerThread);
auto const workDiv
= alpaka::getValidWorkDiv(kernelCfg, devAcc, heatEqKernel, pCurrAcc, pNextAcc, numNodesX, dx, dt);

// Copy host -> device
alpaka::memcpy(queue, uCurrBufAcc, uCurrBufHost);
Expand Down
6 changes: 3 additions & 3 deletions example/helloWorld/src/helloWorld.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -135,10 +135,10 @@ auto example(TAccTag const&) -> int
// argument. So a kernel can be a class or struct, a lambda, etc.
HelloWorldKernel helloWorldKernel;

auto const& bundeledKernel = alpaka::KernelBundle(helloWorldKernel);
alpaka::KernelCfg<Acc> const kernelCfg = {threadsPerGrid, elementsPerThread};

// Let alpaka calculate good block and grid sizes given our full problem extent
auto const workDiv
= alpaka::getValidWorkDivForKernel<Acc>(devAcc, bundeledKernel, threadsPerGrid, elementsPerThread);
auto const workDiv = alpaka::getValidWorkDiv(kernelCfg, devAcc, helloWorldKernel);

// Run the kernel
//
Expand Down
8 changes: 4 additions & 4 deletions example/helloWorldLambda/src/helloWorldLambda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ auto example(TAccTag const&) -> int
auto const threadsPerGrid = Vec{4, 2, 4};


const size_t nExclamationMarks = 10;
size_t const nExclamationMarks = 10;

// Run "Hello World" kernel with a lambda function
//
Expand Down Expand Up @@ -117,10 +117,10 @@ auto example(TAccTag const&) -> int
printf("\n");
};

auto const& bundeledKernel = alpaka::KernelBundle(kernelLambda, nExclamationMarks);
alpaka::KernelCfg<Acc> const kernelCfg = {threadsPerGrid, elementsPerThread};

// Let alpaka calculate good block and grid sizes given our full problem extent
auto const workDiv
= alpaka::getValidWorkDivForKernel<Acc>(devAcc, bundeledKernel, threadsPerGrid, elementsPerThread);
auto const workDiv = alpaka::getValidWorkDiv(kernelCfg, devAcc, kernelLambda, nExclamationMarks);

alpaka::exec<Acc>(queue, workDiv, kernelLambda, nExclamationMarks);
alpaka::wait(queue);
Expand Down
6 changes: 3 additions & 3 deletions example/kernelSpecialization/src/kernelSpecialization.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -81,10 +81,10 @@ auto example(TAccTag const&) -> int
std::size_t const elementsPerThread = 1u;
Kernel kernel;

auto const& bundeledKernel = alpaka::KernelBundle(kernel);
alpaka::KernelCfg<Acc> const kernelCfg = {threadsPerGrid, elementsPerThread};

// Let alpaka calculate good block and grid sizes given our full problem extent
auto const workDiv
= alpaka::getValidWorkDivForKernel<Acc>(devAcc, bundeledKernel, threadsPerGrid, elementsPerThread);
auto const workDiv = alpaka::getValidWorkDiv(kernelCfg, devAcc, kernel);

// Run the kernel
alpaka::exec<Acc>(queue, workDiv, kernel);
Expand Down
13 changes: 4 additions & 9 deletions example/matrixMulWithMdspan/src/matrixMulMdSpan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -147,19 +147,14 @@ auto example(TAccTag const&) -> int
auto mdDevC = alpaka::experimental::getMdSpan(bufDevC);

MatrixMulKernel kernel;
auto const& bundeledKernel = alpaka::KernelBundle(kernel, mdDevA, mdDevB, mdDevC);

// Let alpaka calculate good block and grid sizes given our full problem extent
auto const workDiv = alpaka::getValidWorkDivForKernel<Acc>(
devAcc,
bundeledKernel,
extentC,
Vec::ones(),
false,
alpaka::GridBlockExtentSubDivRestrictions::Unrestricted);
alpaka::KernelCfg<Acc> const kernelCfg
= {extentC, Vec::ones(), false, alpaka::GridBlockExtentSubDivRestrictions::Unrestricted};
auto const workDiv = alpaka::getValidWorkDiv<Acc>(kernelCfg, devAcc, kernel, mdDevA, mdDevB, mdDevC);

// Execute the kernel
alpaka::exec<Acc>(queue, workDiv, MatrixMulKernel{}, mdDevA, mdDevB, mdDevC);
alpaka::exec<Acc>(queue, workDiv, kernel, mdDevA, mdDevB, mdDevC);

// Copy result back to host
alpaka::memcpy(queue, bufHostC, bufDevC);
Expand Down
9 changes: 3 additions & 6 deletions example/monteCarloIntegration/src/monteCarloIntegration.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -112,14 +112,11 @@ auto example(TAccTag const&) -> int
bufHost[0] = 0.0f;
alpaka::memcpy(queue, bufAcc, bufHost);

alpaka::KernelCfg<Acc> const kernelCfg = {Vec(numThreads), Vec(numAlpakaElementsPerThread)};
Kernel kernel;
auto const& bundeledKernel = alpaka::KernelBundle(kernel, numPoints, ptrBufAcc, Function{});

// Let alpaka calculate good block and grid sizes given our full problem extent
auto const workDiv = alpaka::getValidWorkDivForKernel<Acc>(
devAcc,
bundeledKernel,
Vec(numThreads),
Vec(numAlpakaElementsPerThread));
auto const workDiv = alpaka::getValidWorkDiv(kernelCfg, devAcc, kernel, numPoints, ptrBufAcc, Function{});

alpaka::exec<Acc>(queue, workDiv, kernel, numPoints, ptrBufAcc, Function{});
alpaka::memcpy(queue, bufHost, bufAcc);
Expand Down
6 changes: 3 additions & 3 deletions example/openMPSchedule/src/openMPSchedule.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -108,10 +108,10 @@ auto main() -> int
Idx const elementsPerThread = 1u;

OpenMPScheduleDefaultKernel openMPScheduleDefaultKernel;
auto const& bundeledKernel = alpaka::KernelBundle(openMPScheduleDefaultKernel);

// Let alpaka calculate good block and grid sizes given our full problem extent
auto const workDiv
= alpaka::getValidWorkDivForKernel<Acc>(devAcc, bundeledKernel, threadsPerGrid, elementsPerThread);
alpaka::KernelCfg<Acc> kernelCfg = {threadsPerGrid, elementsPerThread};
auto const workDiv = alpaka::getValidWorkDiv(kernelCfg, devAcc, openMPScheduleDefaultKernel);

// Run the kernel setting no schedule explicitly.
std::cout << "OpenMPScheduleDefaultKernel setting no schedule explicitly:\n";
Expand Down
Loading

0 comments on commit d7c459d

Please sign in to comment.