From d7c459d739087df7efa5c622de6306f64a126716 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Widera?= Date: Thu, 15 Aug 2024 12:49:45 +0200 Subject: [PATCH] remove `KernelBundle`, change signature of `[get|is]ValidWorkDiv*` (#2349) * remove `KernelBundle`, change signature of `getValidWorkDiv*` - revert introduced `KernelBundle` in #2251 - change signature of `getValidWorkDivForKernel`,`isValidWorkDivKernel` and `isValidWorkDivKernel` - reuse old naming `getValidWorkDiv` and `isValidWorkDiv` * use new interface for `getValidWorkDiv` * fix cheat sheet --- docs/source/basic/cheatsheet.rst | 28 ++-- example/bufferCopy/src/bufferCopy.cpp | 12 +- example/complex/src/complex.cpp | 6 +- .../conv2DWithMdspan/src/conv2DWithMdspan.cpp | 10 +- example/convolution1D/src/convolution1D.cpp | 14 +- example/convolution2D/src/convolution2D.cpp | 20 +-- .../counterBasedRng/src/counterBasedRng.cpp | 23 ++- example/heatEquation/src/heatEquation.cpp | 6 +- example/helloWorld/src/helloWorld.cpp | 6 +- .../helloWorldLambda/src/helloWorldLambda.cpp | 8 +- .../src/kernelSpecialization.cpp | 6 +- .../src/matrixMulMdSpan.cpp | 13 +- .../src/monteCarloIntegration.cpp | 9 +- example/openMPSchedule/src/openMPSchedule.cpp | 6 +- example/randomCells2D/src/randomCells2D.cpp | 20 ++- .../randomStrategies/src/randomStrategies.cpp | 39 ++--- example/vectorAdd/src/vectorAdd.cpp | 9 +- include/alpaka/alpaka.hpp | 1 - include/alpaka/kernel/KernelBundle.hpp | 58 ------- .../alpaka/kernel/TaskKernelCpuOmp2Blocks.hpp | 11 +- .../kernel/TaskKernelCpuOmp2Threads.hpp | 11 +- include/alpaka/kernel/TaskKernelCpuSerial.hpp | 11 +- include/alpaka/kernel/TaskKernelCpuSycl.hpp | 10 +- .../alpaka/kernel/TaskKernelCpuTbbBlocks.hpp | 11 +- .../alpaka/kernel/TaskKernelCpuThreads.hpp | 11 +- .../alpaka/kernel/TaskKernelFpgaSyclIntel.hpp | 10 +- .../alpaka/kernel/TaskKernelGenericSycl.hpp | 1 - .../alpaka/kernel/TaskKernelGpuSyclIntel.hpp | 10 +- .../kernel/TaskKernelGpuUniformCudaHipRt.hpp | 21 ++- include/alpaka/kernel/Traits.hpp | 61 ++++---- .../alpaka/test/KernelExecutionFixture.hpp | 15 +- include/alpaka/workdiv/WorkDivHelpers.hpp | 146 ++++++++++-------- test/integ/axpy/src/axpy.cpp | 18 ++- test/integ/mandelbrot/src/mandelbrot.cpp | 17 +- test/integ/matMul/src/matMul.cpp | 16 +- test/integ/separableCompilation/src/main.cpp | 12 +- test/integ/sharedMem/src/sharedMem.cpp | 17 +- test/unit/math/src/TestTemplate.hpp | 17 +- .../unit/workDiv/src/WorkDivForKernelTest.cpp | 47 +++--- 39 files changed, 367 insertions(+), 400 deletions(-) delete mode 100644 include/alpaka/kernel/KernelBundle.hpp diff --git a/docs/source/basic/cheatsheet.rst b/docs/source/basic/cheatsheet.rst index 7cd60c4de97e..21ade07e134b 100644 --- a/docs/source/basic/cheatsheet.rst +++ b/docs/source/basic/cheatsheet.rst @@ -180,8 +180,6 @@ Prepare Kernel Bundle .. code-block:: c++ HeatEquationKernel heatEqKernel; - // Arguments of KernelBundle: The kernel instance and the kernel arguments - auto const& bundeledKernel = alpaka::KernelBundle(heatEqKernel, pCurrAcc, pNextAcc, numNodesX, dx, dt); Automatically select a valid kernel launch configuration .. code-block:: c++ @@ -189,12 +187,21 @@ Automatically select a valid kernel launch configuration Vec const globalThreadExtent = vectorValue; Vec const elementsPerThread = vectorValue; - auto autoWorkDiv = getValidWorkDivForKernel( - device, - bundeledKernel, - globalThreadExtent, elementsPerThread, + KernelCfg const kernelCfg = { + globalThreadExtent, + elementsPerThread, false, - GridBlockExtentSubDivRestrictions::Unrestricted); + GridBlockExtentSubDivRestrictions::Unrestricted}; + + auto autoWorkDiv = getValidWorkDiv( + kernelCfg, + device, + heatEqKernel, + pCurrAcc, + pNextAcc, + numNodesX, + dx, + dt); Manually set a kernel launch configuration .. code-block:: c++ @@ -204,9 +211,10 @@ Manually set a kernel launch configuration Vec const elementsPerThread = vectorValue; using WorkDiv = WorkDivMembers; - auto manualWorkDiv = WorkDiv{blocksPerGrid, - threadsPerBlock, - elementsPerThread}; + auto manualWorkDiv = WorkDiv{ + blocksPerGrid, + threadsPerBlock, + elementsPerThread}; Instantiate a kernel and create a task that will run it (does not launch it yet) .. code-block:: c++ diff --git a/example/bufferCopy/src/bufferCopy.cpp b/example/bufferCopy/src/bufferCopy.cpp index 53d9d25e7f84..1c99de879cfb 100644 --- a/example/bufferCopy/src/bufferCopy.cpp +++ b/example/bufferCopy/src/bufferCopy.cpp @@ -118,7 +118,7 @@ auto example(TAccTag const&) -> int using Data = std::uint32_t; constexpr Idx nElementsPerDim = 2; - const Vec extents(Vec::all(static_cast(nElementsPerDim))); + Vec const extents(Vec::all(static_cast(nElementsPerDim))); // Allocate host memory buffers // @@ -164,9 +164,8 @@ auto example(TAccTag const&) -> int FillBufferKernel fillBufferKernel; - auto const& bundeledFillBufferKernel = alpaka::KernelBundle(fillBufferKernel, hostViewPlainPtrMdSpan); - auto const hostWorkDiv - = alpaka::getValidWorkDivForKernel(devHost, bundeledFillBufferKernel, threadsPerGrid, elementsPerThread); + alpaka::KernelCfg const hostKernelCfg = {threadsPerGrid, elementsPerThread}; + auto const hostWorkDiv = alpaka::getValidWorkDiv(hostKernelCfg, devHost, fillBufferKernel, hostViewPlainPtrMdSpan); alpaka::exec(hostQueue, hostWorkDiv, fillBufferKernel, hostViewPlainPtrMdSpan); // 1st kernel argument @@ -203,11 +202,10 @@ auto example(TAccTag const&) -> int auto deviceBufferMdSpan2 = alpaka::experimental::getMdSpan(deviceBuffer2); TestBufferKernel testBufferKernel; - auto const& bundeledTestBufferKernel = alpaka::KernelBundle(testBufferKernel, deviceBufferMdSpan1); // Let alpaka calculate good block and grid sizes given our full problem extent - auto const devWorkDiv - = alpaka::getValidWorkDivForKernel(devAcc, bundeledTestBufferKernel, threadsPerGrid, elementsPerThread); + alpaka::KernelCfg const devKernelCfg = {threadsPerGrid, elementsPerThread}; + auto const devWorkDiv = alpaka::getValidWorkDiv(devKernelCfg, devAcc, testBufferKernel, deviceBufferMdSpan1); alpaka::exec(devQueue, devWorkDiv, testBufferKernel, deviceBufferMdSpan1); alpaka::exec(devQueue, devWorkDiv, testBufferKernel, deviceBufferMdSpan2); diff --git a/example/complex/src/complex.cpp b/example/complex/src/complex.cpp index 7c9b39563460..ece105e845fd 100644 --- a/example/complex/src/complex.cpp +++ b/example/complex/src/complex.cpp @@ -58,10 +58,10 @@ auto example(TAccTag const&) -> int ComplexKernel complexKernel; - auto const& bundeledKernel = alpaka::KernelBundle(complexKernel); + alpaka::KernelCfg const kernelCfg = {threadsPerGrid, elementsPerThread}; + // Let alpaka calculate good block and grid sizes given our full problem extent - auto const workDiv - = alpaka::getValidWorkDivForKernel(devAcc, bundeledKernel, threadsPerGrid, elementsPerThread); + auto const workDiv = alpaka::getValidWorkDiv(kernelCfg, devAcc, complexKernel); // Run the kernel alpaka::exec(queue, workDiv, complexKernel); diff --git a/example/conv2DWithMdspan/src/conv2DWithMdspan.cpp b/example/conv2DWithMdspan/src/conv2DWithMdspan.cpp index 0a8b7d165b7d..698a8d12fa75 100644 --- a/example/conv2DWithMdspan/src/conv2DWithMdspan.cpp +++ b/example/conv2DWithMdspan/src/conv2DWithMdspan.cpp @@ -147,16 +147,16 @@ auto example(TAccTag const&) -> int // Construct kernel object ConvolutionKernelMdspan2D convolutionKernel2D; - // Make a bundle - auto const& bundeledKernel = alpaka::KernelBundle( + // Let alpaka calculate good block and grid sizes given our full problem extent. + alpaka::KernelCfg const kernelCfg = {extent, Vec::ones()}; + auto const workDiv = alpaka::getValidWorkDiv( + kernelCfg, + devAcc, convolutionKernel2D, alpaka::experimental::getMdSpan(bufInputAcc), alpaka::experimental::getMdSpan(outputDeviceMemory), alpaka::experimental::getMdSpan(bufFilterAcc)); - // Let alpaka calculate good block and grid sizes given our full problem extent. - auto const workDiv = alpaka::getValidWorkDivForKernel(devAcc, bundeledKernel, extent, Vec::ones()); - // Run the kernel, pass 3 arrays as 2D mdspans alpaka::exec( diff --git a/example/convolution1D/src/convolution1D.cpp b/example/convolution1D/src/convolution1D.cpp index 098dc8501d09..7aa3012206a7 100644 --- a/example/convolution1D/src/convolution1D.cpp +++ b/example/convolution1D/src/convolution1D.cpp @@ -37,8 +37,8 @@ struct ConvolutionKernel TElem const* const input, TElem const* const filter, TElem* const output, - const std::size_t inputSize, - const std::size_t filterSize) const -> void + std::size_t const inputSize, + std::size_t const filterSize) const -> void { auto const globalThreadIdxX = alpaka::getIdx(acc)[0]; @@ -140,7 +140,12 @@ auto example(TAccTag const&) -> int DataType* nativeInputDeviceMemory = std::data(inputDeviceMemory); DataType* nativeOutputDeviceMemory = std::data(outputDeviceMemory); - auto const& bundeledKernel = alpaka::KernelBundle( + alpaka::KernelCfg const kernelCfg = {threadsPerGrid, elementsPerThread}; + + // Let alpaka calculate good block and grid sizes given our full problem extent + auto const workDiv = alpaka::getValidWorkDiv( + kernelCfg, + devAcc, convolutionKernel, nativeInputDeviceMemory, nativeFilterDeviceMemory, @@ -148,9 +153,6 @@ auto example(TAccTag const&) -> int inputSize, filterSize); - // Let alpaka calculate good block and grid sizes given our full problem extent - auto const workDiv - = alpaka::getValidWorkDivForKernel(devAcc, bundeledKernel, threadsPerGrid, elementsPerThread); // Run the kernel alpaka::exec( queue, diff --git a/example/convolution2D/src/convolution2D.cpp b/example/convolution2D/src/convolution2D.cpp index 2c8a6b28d850..87f618c7380e 100644 --- a/example/convolution2D/src/convolution2D.cpp +++ b/example/convolution2D/src/convolution2D.cpp @@ -265,7 +265,7 @@ auto example(TAccTag const&) -> int alpaka::wait(queueAcc); // Calculate the allocated width, due to padding it might be larger then the matrix width - auto const intputWidthAllocated = [&]() -> const Idx + auto const intputWidthAllocated = [&]() -> Idx const { // Calculate pitch: The size of one line in bytes including padding. auto const rowPitchInput{alpaka::getPitchesInBytes(bufInputAcc)[0]}; @@ -294,7 +294,7 @@ auto example(TAccTag const&) -> int alpaka::wait(queueAcc); // Calculate the allocated width, due to padding it might be larger then the matrix width - auto const filterWidthAllocated = [&]() -> const Idx + auto const filterWidthAllocated = [&]() -> Idx const { // Calculate pitch: The size of one line in bytes including padding. auto const rowPitchFilter{alpaka::getPitchesInBytes(bufFilterAcc)[0]}; @@ -305,20 +305,22 @@ auto example(TAccTag const&) -> int // ConvolutionKernel2DSharedMemory ConvolutionKernel2DSharedMemory convolutionKernel2D; - auto const& bundeledKernel = alpaka::KernelBundle( + alpaka::KernelCfg kernelCfg = {extent, Vec::ones()}; + + // Let alpaka calculate good block and grid sizes given our full problem extent. + auto const workDiv = alpaka::getValidWorkDiv( + kernelCfg, + devAcc, convolutionKernel2D, - alpaka::getPtrNative(bufInputAcc), - alpaka::getPtrNative(outputDeviceMemory), + std::data(bufInputAcc), + std::data(outputDeviceMemory), matrixWidth, matrixHeight, - alpaka::getPtrNative(bufFilterAcc), + std::data(bufFilterAcc), filterWidth, intputWidthAllocated, filterWidthAllocated); - // Let alpaka calculate good block and grid sizes given our full problem extent. - auto const workDiv = alpaka::getValidWorkDivForKernel(devAcc, bundeledKernel, extent, Vec::ones()); - // Run the kernel alpaka::exec( queueAcc, diff --git a/example/counterBasedRng/src/counterBasedRng.cpp b/example/counterBasedRng/src/counterBasedRng.cpp index 7a9a9abfc7fe..d96ab2b775a2 100644 --- a/example/counterBasedRng/src/counterBasedRng.cpp +++ b/example/counterBasedRng/src/counterBasedRng.cpp @@ -147,15 +147,15 @@ auto example(TAccTag const&) -> int BufAcc bufAcc(alpaka::allocBuf(devAcc, extent)); CounterBasedRngKernel counterBasedRngKernel; - auto const& bundeledKernel - = alpaka::KernelBundle(counterBasedRngKernel, alpaka::experimental::getMdSpan(bufAcc), key); - auto const& bundeledKernel2 - = alpaka::KernelBundle(counterBasedRngKernel, alpaka::experimental::getMdSpan(bufHost), key); // Let alpaka calculate good block and grid sizes given our full problem extent - auto const workDivAcc = alpaka::getValidWorkDivForKernel(devAcc, bundeledKernel, extent, elementsPerThread); - auto const workDivHost - = alpaka::getValidWorkDivForKernel(devHost, bundeledKernel2, extent, elementsPerThreadHost); + alpaka::KernelCfg kernerlCfgAccDev = {extent, elementsPerThread}; + auto const workDivAcc = alpaka::getValidWorkDiv( + kernerlCfgAccDev, + devAcc, + counterBasedRngKernel, + alpaka::experimental::getMdSpan(bufAcc), + key); // Create the kernel execution task. auto const taskKernelAcc = alpaka::createTaskKernel( @@ -163,6 +163,15 @@ auto example(TAccTag const&) -> int CounterBasedRngKernel(), alpaka::experimental::getMdSpan(bufAcc), key); + + alpaka::KernelCfg kernerlCfgAccHost = {extent, elementsPerThreadHost}; + auto const workDivHost = alpaka::getValidWorkDiv( + kernerlCfgAccHost, + devHost, + counterBasedRngKernel, + alpaka::experimental::getMdSpan(bufHost), + key); + auto const taskKernelHost = alpaka::createTaskKernel( workDivHost, CounterBasedRngKernel(), diff --git a/example/heatEquation/src/heatEquation.cpp b/example/heatEquation/src/heatEquation.cpp index df43a4e0ed47..a13b3f00bc26 100644 --- a/example/heatEquation/src/heatEquation.cpp +++ b/example/heatEquation/src/heatEquation.cpp @@ -134,9 +134,11 @@ auto example(TAccTag const&) -> int HeatEquationKernel heatEqKernel; - auto const& bundeledKernel = alpaka::KernelBundle(heatEqKernel, pCurrAcc, pNextAcc, numNodesX, dx, dt); + alpaka::KernelCfg const kernelCfg = {extent, elemPerThread}; + // Let alpaka calculate good block and grid sizes given our full problem extent - auto const workDiv = alpaka::getValidWorkDivForKernel(devAcc, bundeledKernel, extent, elemPerThread); + auto const workDiv + = alpaka::getValidWorkDiv(kernelCfg, devAcc, heatEqKernel, pCurrAcc, pNextAcc, numNodesX, dx, dt); // Copy host -> device alpaka::memcpy(queue, uCurrBufAcc, uCurrBufHost); diff --git a/example/helloWorld/src/helloWorld.cpp b/example/helloWorld/src/helloWorld.cpp index 646df34d7b66..79ad64ae9494 100644 --- a/example/helloWorld/src/helloWorld.cpp +++ b/example/helloWorld/src/helloWorld.cpp @@ -135,10 +135,10 @@ auto example(TAccTag const&) -> int // argument. So a kernel can be a class or struct, a lambda, etc. HelloWorldKernel helloWorldKernel; - auto const& bundeledKernel = alpaka::KernelBundle(helloWorldKernel); + alpaka::KernelCfg const kernelCfg = {threadsPerGrid, elementsPerThread}; + // Let alpaka calculate good block and grid sizes given our full problem extent - auto const workDiv - = alpaka::getValidWorkDivForKernel(devAcc, bundeledKernel, threadsPerGrid, elementsPerThread); + auto const workDiv = alpaka::getValidWorkDiv(kernelCfg, devAcc, helloWorldKernel); // Run the kernel // diff --git a/example/helloWorldLambda/src/helloWorldLambda.cpp b/example/helloWorldLambda/src/helloWorldLambda.cpp index b0e028cea2d7..85d599b63b6a 100644 --- a/example/helloWorldLambda/src/helloWorldLambda.cpp +++ b/example/helloWorldLambda/src/helloWorldLambda.cpp @@ -78,7 +78,7 @@ auto example(TAccTag const&) -> int auto const threadsPerGrid = Vec{4, 2, 4}; - const size_t nExclamationMarks = 10; + size_t const nExclamationMarks = 10; // Run "Hello World" kernel with a lambda function // @@ -117,10 +117,10 @@ auto example(TAccTag const&) -> int printf("\n"); }; - auto const& bundeledKernel = alpaka::KernelBundle(kernelLambda, nExclamationMarks); + alpaka::KernelCfg const kernelCfg = {threadsPerGrid, elementsPerThread}; + // Let alpaka calculate good block and grid sizes given our full problem extent - auto const workDiv - = alpaka::getValidWorkDivForKernel(devAcc, bundeledKernel, threadsPerGrid, elementsPerThread); + auto const workDiv = alpaka::getValidWorkDiv(kernelCfg, devAcc, kernelLambda, nExclamationMarks); alpaka::exec(queue, workDiv, kernelLambda, nExclamationMarks); alpaka::wait(queue); diff --git a/example/kernelSpecialization/src/kernelSpecialization.cpp b/example/kernelSpecialization/src/kernelSpecialization.cpp index 6bb7ccbda79f..899e61ad7f4a 100644 --- a/example/kernelSpecialization/src/kernelSpecialization.cpp +++ b/example/kernelSpecialization/src/kernelSpecialization.cpp @@ -81,10 +81,10 @@ auto example(TAccTag const&) -> int std::size_t const elementsPerThread = 1u; Kernel kernel; - auto const& bundeledKernel = alpaka::KernelBundle(kernel); + alpaka::KernelCfg const kernelCfg = {threadsPerGrid, elementsPerThread}; + // Let alpaka calculate good block and grid sizes given our full problem extent - auto const workDiv - = alpaka::getValidWorkDivForKernel(devAcc, bundeledKernel, threadsPerGrid, elementsPerThread); + auto const workDiv = alpaka::getValidWorkDiv(kernelCfg, devAcc, kernel); // Run the kernel alpaka::exec(queue, workDiv, kernel); diff --git a/example/matrixMulWithMdspan/src/matrixMulMdSpan.cpp b/example/matrixMulWithMdspan/src/matrixMulMdSpan.cpp index 1a5ee577b405..e34dcb2d60fe 100644 --- a/example/matrixMulWithMdspan/src/matrixMulMdSpan.cpp +++ b/example/matrixMulWithMdspan/src/matrixMulMdSpan.cpp @@ -147,19 +147,14 @@ auto example(TAccTag const&) -> int auto mdDevC = alpaka::experimental::getMdSpan(bufDevC); MatrixMulKernel kernel; - auto const& bundeledKernel = alpaka::KernelBundle(kernel, mdDevA, mdDevB, mdDevC); // Let alpaka calculate good block and grid sizes given our full problem extent - auto const workDiv = alpaka::getValidWorkDivForKernel( - devAcc, - bundeledKernel, - extentC, - Vec::ones(), - false, - alpaka::GridBlockExtentSubDivRestrictions::Unrestricted); + alpaka::KernelCfg const kernelCfg + = {extentC, Vec::ones(), false, alpaka::GridBlockExtentSubDivRestrictions::Unrestricted}; + auto const workDiv = alpaka::getValidWorkDiv(kernelCfg, devAcc, kernel, mdDevA, mdDevB, mdDevC); // Execute the kernel - alpaka::exec(queue, workDiv, MatrixMulKernel{}, mdDevA, mdDevB, mdDevC); + alpaka::exec(queue, workDiv, kernel, mdDevA, mdDevB, mdDevC); // Copy result back to host alpaka::memcpy(queue, bufHostC, bufDevC); diff --git a/example/monteCarloIntegration/src/monteCarloIntegration.cpp b/example/monteCarloIntegration/src/monteCarloIntegration.cpp index fd0961979b36..b26cd2af10fa 100644 --- a/example/monteCarloIntegration/src/monteCarloIntegration.cpp +++ b/example/monteCarloIntegration/src/monteCarloIntegration.cpp @@ -112,14 +112,11 @@ auto example(TAccTag const&) -> int bufHost[0] = 0.0f; alpaka::memcpy(queue, bufAcc, bufHost); + alpaka::KernelCfg const kernelCfg = {Vec(numThreads), Vec(numAlpakaElementsPerThread)}; Kernel kernel; - auto const& bundeledKernel = alpaka::KernelBundle(kernel, numPoints, ptrBufAcc, Function{}); + // Let alpaka calculate good block and grid sizes given our full problem extent - auto const workDiv = alpaka::getValidWorkDivForKernel( - devAcc, - bundeledKernel, - Vec(numThreads), - Vec(numAlpakaElementsPerThread)); + auto const workDiv = alpaka::getValidWorkDiv(kernelCfg, devAcc, kernel, numPoints, ptrBufAcc, Function{}); alpaka::exec(queue, workDiv, kernel, numPoints, ptrBufAcc, Function{}); alpaka::memcpy(queue, bufHost, bufAcc); diff --git a/example/openMPSchedule/src/openMPSchedule.cpp b/example/openMPSchedule/src/openMPSchedule.cpp index 1febb42cd685..b2d149bec7b7 100644 --- a/example/openMPSchedule/src/openMPSchedule.cpp +++ b/example/openMPSchedule/src/openMPSchedule.cpp @@ -108,10 +108,10 @@ auto main() -> int Idx const elementsPerThread = 1u; OpenMPScheduleDefaultKernel openMPScheduleDefaultKernel; - auto const& bundeledKernel = alpaka::KernelBundle(openMPScheduleDefaultKernel); + // Let alpaka calculate good block and grid sizes given our full problem extent - auto const workDiv - = alpaka::getValidWorkDivForKernel(devAcc, bundeledKernel, threadsPerGrid, elementsPerThread); + alpaka::KernelCfg kernelCfg = {threadsPerGrid, elementsPerThread}; + auto const workDiv = alpaka::getValidWorkDiv(kernelCfg, devAcc, openMPScheduleDefaultKernel); // Run the kernel setting no schedule explicitly. std::cout << "OpenMPScheduleDefaultKernel setting no schedule explicitly:\n"; diff --git a/example/randomCells2D/src/randomCells2D.cpp b/example/randomCells2D/src/randomCells2D.cpp index b5b45a5ef423..36bc1258d3b0 100644 --- a/example/randomCells2D/src/randomCells2D.cpp +++ b/example/randomCells2D/src/randomCells2D.cpp @@ -202,11 +202,11 @@ auto example(TAccTag const&) -> int auto pitchBufAccRandV = alpaka::getPitchesInBytes(bufAccRandV)[0]; - auto const& bundeledKernelInitRandom - = alpaka::KernelBundle(initRandomKernel, extent, ptrBufAccRandS, pitchBufAccRandS); + alpaka::KernelCfg const kernelCfg = {extent, Vec(perThreadY, perThreadX)}; + // Let alpaka calculate good block and grid sizes given our full problem extent auto const workDivInitRandom - = alpaka::getValidWorkDivForKernel(devAcc, bundeledKernelInitRandom, extent, Vec(perThreadY, perThreadX)); + = alpaka::getValidWorkDiv(kernelCfg, devAcc, initRandomKernel, extent, ptrBufAccRandS, pitchBufAccRandS); alpaka::exec(queue, workDivInitRandom, initRandomKernel, extent, ptrBufAccRandS, pitchBufAccRandS); alpaka::wait(queue); @@ -230,7 +230,12 @@ auto example(TAccTag const&) -> int alpaka::memcpy(queue, bufAccS, bufHostS); RunTimestepKernelSingle runTimestepKernelSingle; - auto const& bundeledKernelRuntimeStep = alpaka::KernelBundle( + alpaka::KernelCfg const runtimeRandomKernelCfg = {extent, Vec(perThreadY, perThreadX)}; + + // Let alpaka calculate good block and grid sizes given our full problem extent + auto const workDivRuntimeStep = alpaka::getValidWorkDiv( + runtimeRandomKernelCfg, + devAcc, runTimestepKernelSingle, extent, ptrBufAccRandS, @@ -238,13 +243,6 @@ auto example(TAccTag const&) -> int pitchBufAccRandS, pitchBufAccS); - // Let alpaka calculate good block and grid sizes given our full problem extent - auto const workDivRuntimeStep = alpaka::getValidWorkDivForKernel( - devAcc, - bundeledKernelRuntimeStep, - extent, - Vec(perThreadY, perThreadX)); - alpaka::exec( queue, workDivRuntimeStep, diff --git a/example/randomStrategies/src/randomStrategies.cpp b/example/randomStrategies/src/randomStrategies.cpp index ea87d290a2c4..6a1940c8b244 100644 --- a/example/randomStrategies/src/randomStrategies.cpp +++ b/example/randomStrategies/src/randomStrategies.cpp @@ -247,20 +247,20 @@ void runStrategy(Box& box) // the initial parameters solely from the thread index - auto const& bundeledKernel = alpaka::KernelBundle( - initRandomKernel, - box.extentRand, - ptrBufAccRand, - static_cast(box.extentResult[0] / box.extentRand[0])); + alpaka::KernelCfg::Acc> kernelCfg + = {box.extentRand, + typename Box::Vec(typename Box::Idx{1}), + false, + alpaka::GridBlockExtentSubDivRestrictions::Unrestricted}; // Let alpaka calculate good block and grid sizes given our full problem extent - auto const workDivRand = alpaka::getValidWorkDivForKernel::Acc>( + auto const workDivRand = alpaka::getValidWorkDiv( + kernelCfg, alpaka::getDevByIdx(box.accPlatform, 0), - bundeledKernel, + initRandomKernel, box.extentRand, - typename Box::Vec(typename Box::Idx{1}), - false, - alpaka::GridBlockExtentSubDivRestrictions::Unrestricted); + ptrBufAccRand, + static_cast(box.extentResult[0] / box.extentRand[0])); alpaka::exec::Acc>( @@ -291,18 +291,21 @@ void runStrategy(Box& box) alpaka::memcpy(box.queue, box.bufAccResult, box.bufHostResult); FillKernel fillKernel; - auto const& bundeledKernelFill - = alpaka::KernelBundle(fillKernel, box.extentResult, ptrBufAccRand, ptrBufAccResult); + alpaka::KernelCfg::Acc> fillKernelCfg + = {box.extentResult, + typename Box::Vec(static_cast::Idx>( + NUM_ROLLS)), // One thread per "point"; each performs NUM_ROLLS "rolls" + false, + alpaka::GridBlockExtentSubDivRestrictions::Unrestricted}; // Let alpaka calculate good block and grid sizes given our full problem extent - auto const workdivResult = alpaka::getValidWorkDivForKernel::Acc>( + auto const workdivResult = alpaka::getValidWorkDiv( + fillKernelCfg, alpaka::getDevByIdx(box.accPlatform, 0), - bundeledKernelFill, + fillKernel, box.extentResult, - typename Box::Vec(static_cast::Idx>( - NUM_ROLLS)), // One thread per "point"; each performs NUM_ROLLS "rolls" - false, - alpaka::GridBlockExtentSubDivRestrictions::Unrestricted); + ptrBufAccRand, + ptrBufAccResult); alpaka::exec::Acc>( diff --git a/example/vectorAdd/src/vectorAdd.cpp b/example/vectorAdd/src/vectorAdd.cpp index a99393fb8b5b..05b3303d7c7b 100644 --- a/example/vectorAdd/src/vectorAdd.cpp +++ b/example/vectorAdd/src/vectorAdd.cpp @@ -130,14 +130,17 @@ auto example(TAccTag const&) -> int // Instantiate the kernel function object VectorAddKernel kernel; - auto const& bundeledKernel = alpaka::KernelBundle( + alpaka::KernelCfg const kernelCfg = {extent, elementsPerThread}; + + // Let alpaka calculate good block and grid sizes given our full problem extent + auto const workDiv = alpaka::getValidWorkDiv( + kernelCfg, + devAcc, kernel, alpaka::getPtrNative(bufAccA), alpaka::getPtrNative(bufAccB), alpaka::getPtrNative(bufAccC), numElements); - // Let alpaka calculate good block and grid sizes given our full problem extent - auto const workDiv = alpaka::getValidWorkDivForKernel(devAcc, bundeledKernel, extent, elementsPerThread); // Create the kernel execution task. auto const taskKernel = alpaka::createTaskKernel( diff --git a/include/alpaka/alpaka.hpp b/include/alpaka/alpaka.hpp index 5f654b32a0a1..e06dede53d48 100644 --- a/include/alpaka/alpaka.hpp +++ b/include/alpaka/alpaka.hpp @@ -112,7 +112,6 @@ #include "alpaka/idx/gb/IdxGbRef.hpp" #include "alpaka/idx/gb/IdxGbUniformCudaHipBuiltIn.hpp" // kernel -#include "alpaka/kernel/KernelBundle.hpp" #include "alpaka/kernel/TaskKernelCpuOmp2Blocks.hpp" #include "alpaka/kernel/TaskKernelCpuOmp2Threads.hpp" #include "alpaka/kernel/TaskKernelCpuSerial.hpp" diff --git a/include/alpaka/kernel/KernelBundle.hpp b/include/alpaka/kernel/KernelBundle.hpp deleted file mode 100644 index 9a7c59b8d005..000000000000 --- a/include/alpaka/kernel/KernelBundle.hpp +++ /dev/null @@ -1,58 +0,0 @@ -/* Copyright 2022 Benjamin Worpitz, Bert Wesarg, René Widera, Sergei Bastrakov, Bernhard Manfred Gruber, Mehmet - * Yusufoglu SPDX-License-Identifier: MPL-2.0 - */ - -#pragma once - -#include -#include - -#include -#include - -namespace alpaka -{ - //! \brief The class used to bind kernel function object and arguments together. Once an instance of this class is - //! created, arguments are not needed to be separately given to functions who need kernel function and arguments. - //! \tparam TKernelFn The kernel function object type. - //! \tparam TArgs Kernel function object invocation argument types as a parameter pack. - template - class KernelBundle - { - public: - //! The function object type - using KernelFn = TKernelFn; - //! Tuple type to encapsulate kernel function argument types and argument values - using ArgTuple = std::tuple>...>; - - // Constructor - KernelBundle(KernelFn kernelFn, TArgs&&... args) - : m_kernelFn(std::move(kernelFn)) - , m_args(std::forward(args)...) - { - } - - private: - KernelFn m_kernelFn; - ArgTuple m_args; // Store the argument types without const and reference - }; - - //! \brief User defined deduction guide with trailing return type. For CTAD during the construction. - //! \tparam TKernelFn The kernel function object type. - //! \tparam TArgs Kernel function object argument types as a parameter pack. - //! \param kernelFn The kernel object -#if BOOST_COMP_CLANG -# pragma clang diagnostic push -# pragma clang diagnostic ignored "-Wdocumentation" // clang does not support the syntax for variadic template - // arguments "args,...". Ignore the error. -#endif - //! \param args,... The kernel invocation arguments. -#if BOOST_COMP_CLANG -# pragma clang diagnostic pop -#endif - //! \return Kernel function bundle. An instance of KernelBundle which consists the kernel function object and its - //! arguments. - template - ALPAKA_FN_HOST KernelBundle(TKernelFn, TArgs&&...) -> KernelBundle; - -} // namespace alpaka diff --git a/include/alpaka/kernel/TaskKernelCpuOmp2Blocks.hpp b/include/alpaka/kernel/TaskKernelCpuOmp2Blocks.hpp index 456f0c42e339..d1e5f4b0574b 100644 --- a/include/alpaka/kernel/TaskKernelCpuOmp2Blocks.hpp +++ b/include/alpaka/kernel/TaskKernelCpuOmp2Blocks.hpp @@ -17,7 +17,6 @@ #include "alpaka/core/OmpSchedule.hpp" #include "alpaka/dev/DevCpu.hpp" #include "alpaka/idx/MapIdx.hpp" -#include "alpaka/kernel/KernelBundle.hpp" #include "alpaka/kernel/KernelFunctionAttributes.hpp" #include "alpaka/kernel/Traits.hpp" #include "alpaka/platform/PlatformCpu.hpp" @@ -957,17 +956,17 @@ namespace alpaka //! \tparam TKernelFn Kernel function object type. //! \tparam TArgs Kernel function object argument types as a parameter pack. template - struct FunctionAttributes, TDev, KernelBundle> + struct FunctionAttributes, TDev, TKernelFn, TArgs...> { //! \param dev The device instance - //! \param kernelBundle Kernel bundeled with it's arguments. The function attributes of this kernel will be - //! determined. Max threads per block is one of the attributes. + //! \param kernelFn The kernel function object which should be executed. + //! \param args The kernel invocation arguments. //! \return KernelFunctionAttributes instance. The default version always returns an instance with zero //! fields. For CPU, the field of max threads allowed by kernel function for the block is 1. ALPAKA_FN_HOST static auto getFunctionAttributes( TDev const& dev, - [[maybe_unused]] KernelBundle const& kernelBundle) - -> alpaka::KernelFunctionAttributes + [[maybe_unused]] TKernelFn const& kernelFn, + [[maybe_unused]] TArgs&&... args) -> alpaka::KernelFunctionAttributes { alpaka::KernelFunctionAttributes kernelFunctionAttributes; diff --git a/include/alpaka/kernel/TaskKernelCpuOmp2Threads.hpp b/include/alpaka/kernel/TaskKernelCpuOmp2Threads.hpp index 8f68e0f1489d..6b08e9693a0e 100644 --- a/include/alpaka/kernel/TaskKernelCpuOmp2Threads.hpp +++ b/include/alpaka/kernel/TaskKernelCpuOmp2Threads.hpp @@ -15,7 +15,6 @@ #include "alpaka/acc/AccCpuOmp2Threads.hpp" #include "alpaka/core/Decay.hpp" #include "alpaka/dev/DevCpu.hpp" -#include "alpaka/kernel/KernelBundle.hpp" #include "alpaka/kernel/KernelFunctionAttributes.hpp" #include "alpaka/kernel/Traits.hpp" #include "alpaka/meta/NdLoop.hpp" @@ -203,17 +202,17 @@ namespace alpaka //! \tparam TKernelFn Kernel function object type. //! \tparam TArgs Kernel function object argument types as a parameter pack. template - struct FunctionAttributes, TDev, KernelBundle> + struct FunctionAttributes, TDev, TKernelFn, TArgs...> { //! \param dev The device instance - //! \param kernelBundle Kernel bundeled with it's arguments. The function attributes of this kernel will be - //! determined. Max threads per block is one of the attributes. + //! \param kernelFn The kernel function object which should be executed. + //! \param args The kernel invocation arguments. //! \return KernelFunctionAttributes instance. The default version always returns an instance with zero //! fields. For CPU, the field of max threads allowed by kernel function for the block is 1. ALPAKA_FN_HOST static auto getFunctionAttributes( TDev const& dev, - [[maybe_unused]] KernelBundle const& kernelBundle) - -> alpaka::KernelFunctionAttributes + [[maybe_unused]] TKernelFn const& kernelFn, + [[maybe_unused]] TArgs&&... args) -> alpaka::KernelFunctionAttributes { alpaka::KernelFunctionAttributes kernelFunctionAttributes; diff --git a/include/alpaka/kernel/TaskKernelCpuSerial.hpp b/include/alpaka/kernel/TaskKernelCpuSerial.hpp index be0d590cc2f0..2889ac3d2a19 100644 --- a/include/alpaka/kernel/TaskKernelCpuSerial.hpp +++ b/include/alpaka/kernel/TaskKernelCpuSerial.hpp @@ -15,7 +15,6 @@ #include "alpaka/acc/AccCpuSerial.hpp" #include "alpaka/core/Decay.hpp" #include "alpaka/dev/DevCpu.hpp" -#include "alpaka/kernel/KernelBundle.hpp" #include "alpaka/kernel/KernelFunctionAttributes.hpp" #include "alpaka/kernel/Traits.hpp" #include "alpaka/meta/NdLoop.hpp" @@ -148,17 +147,17 @@ namespace alpaka //! \tparam TKernelFn Kernel function object type. //! \tparam TArgs Kernel function object argument types as a parameter pack. template - struct FunctionAttributes, TDev, KernelBundle> + struct FunctionAttributes, TDev, TKernelFn, TArgs...> { //! \param dev The device instance - //! \param kernelBundle Kernel bundeled with it's arguments. The function attributes of this kernel will be - //! determined. Max threads per block is one of the attributes. + //! \param kernelFn The kernel function object which should be executed. + //! \param args The kernel invocation arguments. //! \return KernelFunctionAttributes instance. The default version always returns an instance with zero //! fields. For CPU, the field of max threads allowed by kernel function for the block is 1. ALPAKA_FN_HOST static auto getFunctionAttributes( TDev const& dev, - [[maybe_unused]] KernelBundle const& kernelBundle) - -> alpaka::KernelFunctionAttributes + [[maybe_unused]] TKernelFn const& kernelFn, + [[maybe_unused]] TArgs&&... args) -> alpaka::KernelFunctionAttributes { alpaka::KernelFunctionAttributes kernelFunctionAttributes; diff --git a/include/alpaka/kernel/TaskKernelCpuSycl.hpp b/include/alpaka/kernel/TaskKernelCpuSycl.hpp index e41926fd21b5..2287e1852457 100644 --- a/include/alpaka/kernel/TaskKernelCpuSycl.hpp +++ b/include/alpaka/kernel/TaskKernelCpuSycl.hpp @@ -25,17 +25,17 @@ namespace alpaka //! \tparam TKernelFn Kernel function object type. //! \tparam TArgs Kernel function object argument types as a parameter pack. template - struct FunctionAttributes, TDev, KernelBundle> + struct FunctionAttributes, TDev, TKernelFn, TArgs...> { //! \param dev The device instance - //! \param kernelBundle Kernel bundeled with it's arguments. The function attributes of this kernel will be - //! determined. Max threads per block is one of the attributes. + //! \param kernelFn The kernel function object which should be executed. + //! \param args The kernel invocation arguments. //! \return KernelFunctionAttributes instance. The default version always returns an instance with zero //! fields. For CPU, the field of max threads allowed by kernel function for the block is 1. ALPAKA_FN_HOST static auto getFunctionAttributes( TDev const& dev, - [[maybe_unused]] KernelBundle const& kernelBundle) - -> alpaka::KernelFunctionAttributes + [[maybe_unused]] TKernelFn const& kernelFn, + [[maybe_unused]] TArgs&&... args) -> alpaka::KernelFunctionAttributes { alpaka::KernelFunctionAttributes kernelFunctionAttributes; diff --git a/include/alpaka/kernel/TaskKernelCpuTbbBlocks.hpp b/include/alpaka/kernel/TaskKernelCpuTbbBlocks.hpp index 6dd90c3d2ff2..0bc578ccc5a1 100644 --- a/include/alpaka/kernel/TaskKernelCpuTbbBlocks.hpp +++ b/include/alpaka/kernel/TaskKernelCpuTbbBlocks.hpp @@ -16,7 +16,6 @@ #include "alpaka/core/Decay.hpp" #include "alpaka/dev/DevCpu.hpp" #include "alpaka/idx/MapIdx.hpp" -#include "alpaka/kernel/KernelBundle.hpp" #include "alpaka/kernel/KernelFunctionAttributes.hpp" #include "alpaka/kernel/Traits.hpp" #include "alpaka/meta/NdLoop.hpp" @@ -160,17 +159,17 @@ namespace alpaka //! \tparam TKernelFn Kernel function object type. //! \tparam TArgs Kernel function object argument types as a parameter pack. template - struct FunctionAttributes, TDev, KernelBundle> + struct FunctionAttributes, TDev, TKernelFn, TArgs...> { //! \param dev The device instance - //! \param kernelBundle Kernel bundeled with it's arguments. The function attributes of this kernel will be - //! determined. Max threads per block is one of the attributes. + //! \param kernelFn The kernel function object which should be executed. + //! \param args The kernel invocation arguments. //! \return KernelFunctionAttributes instance. The default version always returns an instance with zero //! fields. For CPU, the field of max threads allowed by kernel function for the block is 1. ALPAKA_FN_HOST static auto getFunctionAttributes( TDev const& dev, - [[maybe_unused]] KernelBundle const& kernelBundle) - -> alpaka::KernelFunctionAttributes + [[maybe_unused]] TKernelFn const& kernelFn, + [[maybe_unused]] TArgs&&... args) -> alpaka::KernelFunctionAttributes { alpaka::KernelFunctionAttributes kernelFunctionAttributes; diff --git a/include/alpaka/kernel/TaskKernelCpuThreads.hpp b/include/alpaka/kernel/TaskKernelCpuThreads.hpp index 7f12a3a12201..850b66154dab 100644 --- a/include/alpaka/kernel/TaskKernelCpuThreads.hpp +++ b/include/alpaka/kernel/TaskKernelCpuThreads.hpp @@ -17,7 +17,6 @@ #include "alpaka/core/Decay.hpp" #include "alpaka/core/ThreadPool.hpp" #include "alpaka/dev/DevCpu.hpp" -#include "alpaka/kernel/KernelBundle.hpp" #include "alpaka/kernel/KernelFunctionAttributes.hpp" #include "alpaka/kernel/Traits.hpp" #include "alpaka/meta/NdLoop.hpp" @@ -211,17 +210,17 @@ namespace alpaka //! \tparam TKernelFn Kernel function object type. //! \tparam TArgs Kernel function object argument types as a parameter pack. template - struct FunctionAttributes, TDev, KernelBundle> + struct FunctionAttributes, TDev, TKernelFn, TArgs...> { //! \param dev The device instance - //! \param kernelBundle Kernel bundeled with it's arguments. The function attributes of this kernel will be - //! determined. Max threads per block is one of the attributes. + //! \param kernelFn The kernel function object which should be executed. + //! \param args The kernel invocation arguments. //! \return KernelFunctionAttributes instance. The default version always returns an instance with zero //! fields. For CPU, the field of max threads allowed by kernel function for the block is 1. ALPAKA_FN_HOST static auto getFunctionAttributes( TDev const& dev, - [[maybe_unused]] KernelBundle const& kernelBundle) - -> alpaka::KernelFunctionAttributes + [[maybe_unused]] TKernelFn const& kernelFn, + [[maybe_unused]] TArgs&&... args) -> alpaka::KernelFunctionAttributes { alpaka::KernelFunctionAttributes kernelFunctionAttributes; diff --git a/include/alpaka/kernel/TaskKernelFpgaSyclIntel.hpp b/include/alpaka/kernel/TaskKernelFpgaSyclIntel.hpp index 7afd7bd805b0..6a44b7269e23 100644 --- a/include/alpaka/kernel/TaskKernelFpgaSyclIntel.hpp +++ b/include/alpaka/kernel/TaskKernelFpgaSyclIntel.hpp @@ -26,17 +26,17 @@ namespace alpaka //! \tparam TKernelFn Kernel function object type. //! \tparam TArgs Kernel function object argument types as a parameter pack. template - struct FunctionAttributes, TDev, KernelBundle> + struct FunctionAttributes, TDev, TKernelFn, TArgs...> { //! \param dev The device instance - //! \param kernelBundle Kernel bundeled with it's arguments. The function attributes of this kernel will be - //! determined. Max threads per block is one of the attributes. + //! \param kernelFn The kernel function object which should be executed. + //! \param args The kernel invocation arguments. //! \return KernelFunctionAttributes instance. The default version always returns an instance with zero //! fields. For CPU, the field of max threads allowed by kernel function for the block is 1. ALPAKA_FN_HOST static auto getFunctionAttributes( TDev const& dev, - [[maybe_unused]] KernelBundle const& kernelBundle) - -> alpaka::KernelFunctionAttributes + [[maybe_unused]] TKernelFn const& kernelFn, + [[maybe_unused]] TArgs&&... args) -> alpaka::KernelFunctionAttributes { alpaka::KernelFunctionAttributes kernelFunctionAttributes; diff --git a/include/alpaka/kernel/TaskKernelGenericSycl.hpp b/include/alpaka/kernel/TaskKernelGenericSycl.hpp index 291ae1262103..f913d2ec4d5f 100644 --- a/include/alpaka/kernel/TaskKernelGenericSycl.hpp +++ b/include/alpaka/kernel/TaskKernelGenericSycl.hpp @@ -10,7 +10,6 @@ #include "alpaka/dev/Traits.hpp" #include "alpaka/dim/Traits.hpp" #include "alpaka/idx/Traits.hpp" -#include "alpaka/kernel/KernelBundle.hpp" #include "alpaka/kernel/KernelFunctionAttributes.hpp" #include "alpaka/kernel/SyclSubgroupSize.hpp" #include "alpaka/kernel/Traits.hpp" diff --git a/include/alpaka/kernel/TaskKernelGpuSyclIntel.hpp b/include/alpaka/kernel/TaskKernelGpuSyclIntel.hpp index 57459eb32915..03d90bcb31c2 100644 --- a/include/alpaka/kernel/TaskKernelGpuSyclIntel.hpp +++ b/include/alpaka/kernel/TaskKernelGpuSyclIntel.hpp @@ -26,17 +26,17 @@ namespace alpaka //! \tparam TKernelFn Kernel function object type. //! \tparam TArgs Kernel function object argument types as a parameter pack. template - struct FunctionAttributes, TDev, KernelBundle> + struct FunctionAttributes, TDev, TKernelFn, TArgs...> { //! \param dev The device instance - //! \param kernelBundle Kernel bundeled with it's arguments. The function attributes of this kernel will be - //! determined. Max threads per block is one of the attributes. + //! \param kernelFn The kernel function object which should be executed. + //! \param args The kernel invocation arguments. //! \return KernelFunctionAttributes instance. The default version always returns an instance with zero //! fields. For CPU, the field of max threads allowed by kernel function for the block is 1. ALPAKA_FN_HOST static auto getFunctionAttributes( TDev const& dev, - [[maybe_unused]] KernelBundle const& kernelBundle) - -> alpaka::KernelFunctionAttributes + [[maybe_unused]] TKernelFn const& kernelFn, + [[maybe_unused]] TArgs&&... args) -> alpaka::KernelFunctionAttributes { alpaka::KernelFunctionAttributes kernelFunctionAttributes; diff --git a/include/alpaka/kernel/TaskKernelGpuUniformCudaHipRt.hpp b/include/alpaka/kernel/TaskKernelGpuUniformCudaHipRt.hpp index d07fc4e7834a..53bbaf67529f 100644 --- a/include/alpaka/kernel/TaskKernelGpuUniformCudaHipRt.hpp +++ b/include/alpaka/kernel/TaskKernelGpuUniformCudaHipRt.hpp @@ -17,7 +17,6 @@ #include "alpaka/dev/Traits.hpp" #include "alpaka/dim/Traits.hpp" #include "alpaka/idx/Traits.hpp" -#include "alpaka/kernel/KernelBundle.hpp" #include "alpaka/kernel/KernelFunctionAttributes.hpp" #include "alpaka/kernel/Traits.hpp" #include "alpaka/platform/Traits.hpp" @@ -222,7 +221,7 @@ namespace alpaka # if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL // This checks for a valid work division that is also compliant with the hardware maxima of the // accelerator. - if(!isValidWorkDiv(getDev(queue), task)) + if(!isValidWorkDiv(task, getDev(queue))) { throw std::runtime_error( "The given work division is not valid or not supported by the device of type " @@ -305,17 +304,17 @@ namespace alpaka //! \tparam TKernelFn Kernel function object type. //! \tparam TArgs Kernel function object argument types as a parameter pack. template - struct FunctionAttributes, TDev, KernelBundle> + struct FunctionAttributes, TDev, TKernelFn, TArgs...> { - //! \param kernelBundle Kernel bundeled with it's arguments. The function attributes of this kernel will be - //! determined. Max threads per block is one of the attributes. - //! \return KernelFunctionAttributes instance. For GPU backend, all values are set by calling the - //! corresponding API functions. The default version always returns an instance with zero fields. For CPU, - //! the field of max threads allowed by kernel function for the block is 1. + //! \param dev The device instance + //! \param kernelFn The kernel function object which should be executed. + //! \param args The kernel invocation arguments. + //! \return KernelFunctionAttributes instance. The default version always returns an instance with zero + //! fields. For CPU, the field of max threads allowed by kernel function for the block is 1. ALPAKA_FN_HOST static auto getFunctionAttributes( - TDev const&, - [[maybe_unused]] KernelBundle const& kernelBundle) - -> alpaka::KernelFunctionAttributes + [[maybe_unused]] TDev const& dev, + [[maybe_unused]] TKernelFn const& kernelFn, + [[maybe_unused]] TArgs&&... args) -> alpaka::KernelFunctionAttributes { auto kernelName = alpaka::detail::gpuKernel< TKernelFn, diff --git a/include/alpaka/kernel/Traits.hpp b/include/alpaka/kernel/Traits.hpp index 2047ac1a2b59..c2c0a55b1f7a 100644 --- a/include/alpaka/kernel/Traits.hpp +++ b/include/alpaka/kernel/Traits.hpp @@ -72,18 +72,20 @@ namespace alpaka //! \brief The structure template to access to the functions attributes of a kernel function object. //! \tparam TAcc The accelerator type - //! \tparam TKernelBundle The kernel object type, which includes the kernel function object and it's invocation - //! arguments. - template + //! \tparam TKernelFnObj Kernel function object type. + //! \tparam TArgs Kernel function object argument types as a parameter pack. + template struct FunctionAttributes { - //! \param kernelBundle The kernel object instance, which includes the kernel function object and it's - //! invocation arguments. + //! \param dev The device instance + //! \param kernelFn The kernel function object which should be executed. + //! \param args The kernel invocation arguments. //! \return KernelFunctionAttributes data structure instance. The default version always returns the //! instance with fields which are set to zero. ALPAKA_FN_HOST static auto getFunctionAttributes( - TDev const&, - [[maybe_unused]] TKernelBundle const& kernelBundle) -> alpaka::KernelFunctionAttributes + [[maybe_unused]] TDev const& dev, + [[maybe_unused]] TKernelFnObj const& kernelFn, + [[maybe_unused]] TArgs&&... args) -> alpaka::KernelFunctionAttributes { std::string const str = std::string(__func__) + " function is not specialised for the given arguments.\n"; @@ -164,13 +166,13 @@ namespace alpaka # pragma clang diagnostic ignored \ "-Wdocumentation" // clang does not support the syntax for variadic template arguments "args,..." #endif - //! \tparam TAcc The accelerator type. - //! \param kernelFnObj The kernel object for which the block shared memory size should be calculated. - //! \param blockThreadExtent The block thread extent. - //! \param threadElemExtent The thread element extent. - //! \param args,... The kernel invocation arguments. - //! \return The size of the shared memory allocated for a block in bytes. - //! The default implementation always returns zero. +//! \tparam TAcc The accelerator type. +//! \param kernelFnObj The kernel object for which the block shared memory size should be calculated. +//! \param blockThreadExtent The block thread extent. +//! \param threadElemExtent The thread element extent. +//! \param args,... The kernel invocation arguments. +//! \return The size of the shared memory allocated for a block in bytes. +//! The default implementation always returns zero. #if BOOST_COMP_CLANG # pragma clang diagnostic pop #endif @@ -191,20 +193,21 @@ namespace alpaka //! \tparam TAcc The accelerator type. //! \tparam TDev The device type. - //! \tparam TKernelBundle The kernel object type, which includes the kernel function object and it's invocation - //! arguments. //! \param dev The device instance - //! \param kernelBundle The kernel object, which includes the kernel function object and it's invocation - //! arguments. + //! \param kernelFnObj The kernel function object which should be executed. + //! \param args The kernel invocation arguments. //! \return KernelFunctionAttributes instance. Instance is filled with values returned by the accelerator API //! depending on the specific kernel. The default version always returns the instance with fields which are set to //! zero. ALPAKA_NO_HOST_ACC_WARNING - template - ALPAKA_FN_HOST auto getFunctionAttributes(TDev const& dev, TKernelBundle const& kernelBundle) + template + ALPAKA_FN_HOST auto getFunctionAttributes(TDev const& dev, TKernelFnObj const& kernelFnObj, TArgs&&... args) -> alpaka::KernelFunctionAttributes { - return trait::FunctionAttributes::getFunctionAttributes(dev, kernelBundle); + return trait::FunctionAttributes::getFunctionAttributes( + dev, + kernelFnObj, + std::forward(args)...); } #if BOOST_COMP_CLANG @@ -212,13 +215,13 @@ namespace alpaka # pragma clang diagnostic ignored \ "-Wdocumentation" // clang does not support the syntax for variadic template arguments "args,..." #endif - //! \tparam TAcc The accelerator type. - //! \param kernelFnObj The kernel object for which the block shared memory size should be calculated. - //! \param blockThreadExtent The block thread extent. - //! \param threadElemExtent The thread element extent. - //! \param args,... The kernel invocation arguments. - //! \return The OpenMP schedule information as an alpaka::omp::Schedule object if the kernel specialized the - //! OmpSchedule trait, an object of another type if the kernel didn't specialize the trait. +//! \tparam TAcc The accelerator type. +//! \param kernelFnObj The kernel object for which the block shared memory size should be calculated. +//! \param blockThreadExtent The block thread extent. +//! \param threadElemExtent The thread element extent. +//! \param args,... The kernel invocation arguments. +//! \return The OpenMP schedule information as an alpaka::omp::Schedule object if the kernel specialized the +//! OmpSchedule trait, an object of another type if the kernel didn't specialize the trait. #if BOOST_COMP_CLANG # pragma clang diagnostic pop #endif @@ -313,7 +316,7 @@ namespace alpaka template inline constexpr bool isKernelTriviallyCopyable = IsKernelTriviallyCopyable::value; - //! @} +//! @} //! Creates a kernel execution task. //! diff --git a/include/alpaka/test/KernelExecutionFixture.hpp b/include/alpaka/test/KernelExecutionFixture.hpp index 66d14b5b63e3..0e59344497ed 100644 --- a/include/alpaka/test/KernelExecutionFixture.hpp +++ b/include/alpaka/test/KernelExecutionFixture.hpp @@ -69,19 +69,16 @@ namespace alpaka::test memset(m_queue, bufAccResult, static_cast(true)); - auto bundeledKernel = alpaka::KernelBundle( - kernelFnObj, - getPtrNative(bufAccResult), - std::forward(args)...); - + alpaka::KernelCfg const kernelCfg = {m_extent, Vec::ones()}; // set workdiv if it is not before if(m_workDiv == WorkDiv{Vec::all(0), Vec::all(0), Vec::all(0)}) - m_workDiv = alpaka::getValidWorkDivForKernel>( + m_workDiv = alpaka::getValidWorkDiv( + kernelCfg, m_device, - bundeledKernel, - m_extent, - Vec::ones()); + kernelFnObj, + getPtrNative(bufAccResult), + std::forward(args)...); exec(m_queue, m_workDiv, kernelFnObj, getPtrNative(bufAccResult), std::forward(args)...); diff --git a/include/alpaka/workdiv/WorkDivHelpers.hpp b/include/alpaka/workdiv/WorkDivHelpers.hpp index 6ac433c2c0ee..b1585c75bb82 100644 --- a/include/alpaka/workdiv/WorkDivHelpers.hpp +++ b/include/alpaka/workdiv/WorkDivHelpers.hpp @@ -10,7 +10,6 @@ #include "alpaka/core/Utility.hpp" #include "alpaka/dev/Traits.hpp" #include "alpaka/extent/Traits.hpp" -#include "alpaka/kernel/KernelBundle.hpp" #include "alpaka/kernel/KernelFunctionAttributes.hpp" #include "alpaka/kernel/Traits.hpp" #include "alpaka/vec/Vec.hpp" @@ -304,87 +303,103 @@ namespace alpaka return WorkDivMembers(gridBlockExtent, blockThreadExtent, clippedThreadElemExtent); } - //! \tparam TDev The type of the device. - //! \tparam TKernelBundle The type of the bundle of kernel and the arguments. Kernel is used to get number of - //! threads per block, this number could be less than or equal to the number of threads per block according to - //! device properties. + //! Kernel start configuration to determine a valid work division + //! //! \tparam TGridElemExtent The type of the grid element extent. //! \tparam TThreadElemExtent The type of the thread element extent. - //! \param dev The device the work division should be valid for. - //! \param kernelBundle An instance of a class consisting Kernel function and its arguments - //! \param gridElemExtent The full extent of elements in the grid. - //! \param threadElemExtents the number of elements computed per thread. - //! \param blockThreadMustDivideGridThreadExtent If this is true, the grid thread extent will be multiples of the - //! corresponding block thread extent. - //! NOTE: If this is true and gridThreadExtent is prime (or otherwise bad chosen) in a dimension, the block - //! thread extent will be one in this dimension. - //! \param gridBlockExtentSubDivRestrictions The grid block extent subdivision restrictions. - //! \return The work division. template< typename TAcc, - typename TDev, - typename TKernelBundle, typename TGridElemExtent = alpaka::Vec, Idx>, typename TThreadElemExtent = alpaka::Vec, Idx>> - ALPAKA_FN_HOST auto getValidWorkDivForKernel( - [[maybe_unused]] TDev const& dev, - TKernelBundle const& kernelBundle, - [[maybe_unused]] TGridElemExtent const& gridElemExtent = alpaka::Vec, Idx>::ones(), - [[maybe_unused]] TThreadElemExtent const& threadElemExtents = alpaka::Vec, Idx>::ones(), - [[maybe_unused]] bool blockThreadMustDivideGridThreadExtent = true, - [[maybe_unused]] GridBlockExtentSubDivRestrictions gridBlockExtentSubDivRestrictions - = GridBlockExtentSubDivRestrictions::Unrestricted) -> WorkDivMembers, Idx> + struct KernelCfg { - using Acc = TAcc; + //! The full extent of elements in the grid. + TGridElemExtent const gridElemExtent = alpaka::Vec, Idx>::ones(); + //! The number of elements computed per thread. + TThreadElemExtent const threadElemExtents = alpaka::Vec, Idx>::ones(); + //! If this is true, the grid thread extent will be multiples of + //! the corresponding block thread extent. + //! NOTE: If this is true and gridThreadExtent is prime (or otherwise bad chosen) in a dimension, the block + //! thread extent will be one in this dimension. + bool blockThreadMustDivideGridThreadExtent = true; + //! The grid block extent subdivision restrictions. + GridBlockExtentSubDivRestrictions gridBlockExtentSubDivRestrictions + = GridBlockExtentSubDivRestrictions::Unrestricted; static_assert( - Dim::value == Dim::value, + Dim::value == Dim::value, "The dimension of Acc and the dimension of TGridElemExtent have to be identical!"); static_assert( - Dim::value == Dim::value, + Dim::value == Dim::value, "The dimension of Acc and the dimension of TThreadElemExtent have to be identical!"); static_assert( - std::is_same_v, Idx>, + std::is_same_v, Idx>, "The idx type of Acc and the idx type of TGridElemExtent have to be identical!"); static_assert( - std::is_same_v, Idx>, + std::is_same_v, Idx>, "The idx type of Acc and the idx type of TThreadElemExtent have to be identical!"); + }; + + //! \tparam TDev The type of the device. + //! \tparam TGridElemExtent The type of the grid element extent. + //! \tparam TThreadElemExtent The type of the thread element extent. + //! \param dev The device the work division should be valid for. + //! \param kernelFnObj The kernel function object which should be executed. + //! \param args The kernel invocation arguments. + //! \return The work division for the accelerator based on the kernel and argument types + template< + typename TAcc, + typename TDev, + typename TGridElemExtent, + typename TThreadElemExtent, + typename TKernelFnObj, + typename... TArgs> + ALPAKA_FN_HOST auto getValidWorkDiv( + KernelCfg const& kernelCfg, + [[maybe_unused]] TDev const& dev, + TKernelFnObj const& kernelFnObj, + TArgs&&... args) -> WorkDivMembers, Idx> + { + using Acc = TAcc; // Get max number of threads per block depending on the kernel function attributes. // For GPU backend; number of registers used by the kernel, local and shared memory usage of the kernel // determines the max number of threads per block. This number could be equal or less than the max number of // threads per block defined by device properties. - auto const kernelFunctionAttributes = getFunctionAttributes(dev, kernelBundle); + auto const kernelFunctionAttributes + = getFunctionAttributes(dev, kernelFnObj, std::forward(args)...); auto const threadsPerBlock = kernelFunctionAttributes.maxThreadsPerBlock; if constexpr(Dim::value == 0) { auto const zero = Vec, Idx>{}; - ALPAKA_ASSERT(gridElemExtent == zero); - ALPAKA_ASSERT(threadElemExtents == zero); + ALPAKA_ASSERT(kernelCfg.gridElemExtent == zero); + ALPAKA_ASSERT(kernelCfg.threadElemExtents == zero); return WorkDivMembers, Idx>{zero, zero, zero}; } else return subDivideGridElems( - getExtents(gridElemExtent), - getExtents(threadElemExtents), + getExtents(kernelCfg.gridElemExtent), + getExtents(kernelCfg.threadElemExtents), getAccDevProps(dev), static_cast>(threadsPerBlock), - blockThreadMustDivideGridThreadExtent, - gridBlockExtentSubDivRestrictions); + kernelCfg.blockThreadMustDivideGridThreadExtent, + kernelCfg.gridBlockExtentSubDivRestrictions); using V [[maybe_unused]] = Vec, Idx>; ALPAKA_UNREACHABLE(WorkDivMembers, Idx>{V{}, V{}, V{}}); } + //! Checks if the work division is supported + //! + //! \tparam TWorkDiv The type of the work division. //! \tparam TDim The dimensionality of the accelerator device properties. //! \tparam TIdx The idx type of the accelerator device properties. - //! \tparam TWorkDiv The type of the work division. - //! \param accDevProps The maxima for the work division. //! \param workDiv The work division to test for validity. + //! \param accDevProps The maxima for the work division. //! \return If the work division is valid for the given accelerator device properties. - template - ALPAKA_FN_HOST auto isValidWorkDiv(AccDevProps const& accDevProps, TWorkDiv const& workDiv) -> bool + template + ALPAKA_FN_HOST auto isValidWorkDiv(TWorkDiv const& workDiv, AccDevProps const& accDevProps) -> bool { // Get the extents of grid, blocks and threads of the work division to check. auto const gridBlockExtent = getWorkDiv(workDiv); @@ -428,21 +443,23 @@ namespace alpaka return true; } + //! Checks if the work division is supported + //! + //! \tparam TWorkDiv The type of the work division. //! \tparam TDim The dimensionality of the accelerator device properties. //! \tparam TIdx The idx type of the accelerator device properties. - //! \tparam TWorkDiv The type of the work division. + //! \param workDiv The work division to test for validity. //! \param accDevProps The maxima for the work division. //! \param kernelFunctionAttributes Kernel attributes, including the maximum number of threads per block that can //! be used by this kernel on the given device. This number can be equal to or smaller than the the number of //! threads per block supported by the device. - //! \param workDiv The work division to test for validity. //! \return Returns true if the work division is valid for the given accelerator device properties and for the //! given kernel. Otherwise returns false. - template - ALPAKA_FN_HOST auto isValidWorkDivKernel( + template + ALPAKA_FN_HOST auto isValidWorkDiv( + TWorkDiv const& workDiv, AccDevProps const& accDevProps, - KernelFunctionAttributes const& kernelFunctionAttributes, - TWorkDiv const& workDiv) -> bool + KernelFunctionAttributes const& kernelFunctionAttributes) -> bool { // Get the extents of grid, blocks and threads of the work division to check. auto const gridBlockExtent = getWorkDiv(workDiv); @@ -491,33 +508,38 @@ namespace alpaka return true; } + //! Checks if the work division is supported for the kernel on the device + //! //! \tparam TAcc The accelerator to test the validity on. //! \tparam TDev The type of the device. - //! \tparam TKernelBundle The type of the bundle of kernel and the arguments. //! \tparam TWorkDiv The type of work division to test for validity. - //! \param dev The device to test the work division for validity on. - //! \param kernelBundle An instance of a class consisting Kernel function and its arguments. //! \param workDiv The work division to test for validity. - //! \return Returns the value of isValidWorkDivKernel function. - template - ALPAKA_FN_HOST auto isValidWorkDivKernel( + //! \param dev The device to test the work division for validity on. + //! \param kernelFnObj The kernel function object which should be executed. + //! \param args The kernel invocation arguments. + //! \return Returns the value of isValidWorkDiv function. + template + ALPAKA_FN_HOST auto isValidWorkDiv( + TWorkDiv const& workDiv, TDev const& dev, - TKernelBundle const& kernelBundle, - TWorkDiv const& workDiv) -> bool + TKernelFnObj const& kernelFnObj, + TArgs&&... args) -> bool { - return isValidWorkDivKernel( + return isValidWorkDiv( + workDiv, getAccDevProps(dev), - getFunctionAttributes(dev, kernelBundle), - workDiv); + getFunctionAttributes(dev, kernelFnObj, std::forward(args)...)); } + //! Checks if the work division is supported by the device + //! //! \tparam TAcc The accelerator to test the validity on. - //! \param dev The device to test the work division for validity on. //! \param workDiv The work division to test for validity. + //! \param dev The device to test the work division for validity on. //! \return If the work division is valid on this accelerator. - template - ALPAKA_FN_HOST auto isValidWorkDiv(TDev const& dev, TWorkDiv const& workDiv) -> bool + template + ALPAKA_FN_HOST auto isValidWorkDiv(TWorkDiv const& workDiv, TDev const& dev) -> bool { - return isValidWorkDiv(getAccDevProps(dev), workDiv); + return isValidWorkDiv(workDiv, getAccDevProps(dev)); } } // namespace alpaka diff --git a/test/integ/axpy/src/axpy.cpp b/test/integ/axpy/src/axpy.cpp index 4553dba458f9..67b73f579504 100644 --- a/test/integ/axpy/src/axpy.cpp +++ b/test/integ/axpy/src/axpy.cpp @@ -147,16 +147,18 @@ TEMPLATE_LIST_TEST_CASE("axpy", "[axpy]", TestAccs) #endif - auto const& bundeledKernel - = alpaka::KernelBundle(kernel, numElements, alpha, std::data(memBufAccX), std::data(memBufAccY)); + alpaka::KernelCfg const kernelCfg + = {extent, static_cast(3u), false, alpaka::GridBlockExtentSubDivRestrictions::Unrestricted}; + // Let alpaka calculate good block and grid sizes given our full problem extent - auto const workDiv = alpaka::getValidWorkDivForKernel( + auto const workDiv = alpaka::getValidWorkDiv( + kernelCfg, devAcc, - bundeledKernel, - extent, - static_cast(3u), - false, - alpaka::GridBlockExtentSubDivRestrictions::Unrestricted); + kernel, + numElements, + alpha, + std::data(memBufAccX), + std::data(memBufAccY)); std::cout << "AxpyKernel(" << " numElements:" << numElements << ", accelerator: " << alpaka::getAccName() diff --git a/test/integ/mandelbrot/src/mandelbrot.cpp b/test/integ/mandelbrot/src/mandelbrot.cpp index 6424b3e986e1..58ba9c6b0abc 100644 --- a/test/integ/mandelbrot/src/mandelbrot.cpp +++ b/test/integ/mandelbrot/src/mandelbrot.cpp @@ -307,7 +307,13 @@ TEMPLATE_LIST_TEST_CASE("mandelbrot", "[mandelbrot]", TestAccs) auto const [rowPitch, _] = alpaka::getPitchesInBytes(bufColorAcc); CHECK(rowPitch % sizeof(Val) == 0); - auto const& bundeledKernel = alpaka::KernelBundle( + alpaka::KernelCfg const kernelCfg + = {extent, alpaka::Vec::ones(), false, alpaka::GridBlockExtentSubDivRestrictions::Unrestricted}; + + // Let alpaka calculate good block and grid sizes given our full problem extent + auto const workDiv = alpaka::getValidWorkDiv( + kernelCfg, + devAcc, kernel, std::data(bufColorAcc), numRows, @@ -318,15 +324,6 @@ TEMPLATE_LIST_TEST_CASE("mandelbrot", "[mandelbrot]", TestAccs) fMinI, fMaxI, maxIterations); - // Let alpaka calculate good block and grid sizes given our full problem extent - auto const workDiv = alpaka::getValidWorkDivForKernel( - devAcc, - bundeledKernel, - extent, - alpaka::Vec::ones(), - false, - alpaka::GridBlockExtentSubDivRestrictions::Unrestricted); - std::cout << "MandelbrotKernel(" << " numRows:" << numRows << ", numCols:" << numCols << ", maxIterations:" << maxIterations diff --git a/test/integ/matMul/src/matMul.cpp b/test/integ/matMul/src/matMul.cpp index 41e2e4f9cdb0..dd87cfe489d9 100644 --- a/test/integ/matMul/src/matMul.cpp +++ b/test/integ/matMul/src/matMul.cpp @@ -243,8 +243,12 @@ TEMPLATE_LIST_TEST_CASE("matMul", "[matMul]", TestAccs) std::cout << "pitchesB " << alpaka::getPitchesInBytes(bufBAcc) << " ldb: " << ldb << "\n"; std::cout << "pitchesC " << alpaka::getPitchesInBytes(bufCAcc) << " ldc: " << ldc << "\n"; - - auto const& bundeledKernel = alpaka::KernelBundle( + // Let alpaka calculate good block and grid sizes given our full problem extent + alpaka::KernelCfg const kernelCfg + = {extentC, alpaka::Vec::ones(), false, alpaka::GridBlockExtentSubDivRestrictions::EqualExtent}; + auto const workDiv = alpaka::getValidWorkDiv( + kernelCfg, + devAcc, kernel, m, n, @@ -257,14 +261,6 @@ TEMPLATE_LIST_TEST_CASE("matMul", "[matMul]", TestAccs) static_cast(1), std::data(bufCAcc), ldc); - // Let alpaka calculate good block and grid sizes given our full problem extent - auto const workDiv = alpaka::getValidWorkDivForKernel( - devAcc, - bundeledKernel, - extentC, - alpaka::Vec::ones(), - false, - alpaka::GridBlockExtentSubDivRestrictions::EqualExtent); std::cout << "MatMulKernel(" diff --git a/test/integ/separableCompilation/src/main.cpp b/test/integ/separableCompilation/src/main.cpp index 3fb4f3245682..70739d1010c2 100644 --- a/test/integ/separableCompilation/src/main.cpp +++ b/test/integ/separableCompilation/src/main.cpp @@ -111,10 +111,16 @@ TEMPLATE_LIST_TEST_CASE("separableCompilation", "[separableCompilation]", TestAc alpaka::memcpy(queueAcc, memBufAccA, memBufHostA); alpaka::memcpy(queueAcc, memBufAccB, memBufHostB); - auto const& bundeledKernel - = alpaka::KernelBundle(kernel, memBufAccA.data(), memBufAccB.data(), memBufAccC.data(), numElements); // Let alpaka calculate good block and grid sizes given our full problem extent - auto const workDiv = alpaka::getValidWorkDivForKernel(devAcc, bundeledKernel, extent, static_cast(3u)); + alpaka::KernelCfg const kernelCfg = {extent, static_cast(3u)}; + auto const workDiv = alpaka::getValidWorkDiv( + kernelCfg, + devAcc, + kernel, + memBufAccA.data(), + memBufAccB.data(), + memBufAccC.data(), + numElements); std::cout << alpaka::core::demangled << "(" << "accelerator: " << alpaka::getAccName() << ", workDiv: " << workDiv diff --git a/test/integ/sharedMem/src/sharedMem.cpp b/test/integ/sharedMem/src/sharedMem.cpp index 0377f623a5e1..855e094688bb 100644 --- a/test/integ/sharedMem/src/sharedMem.cpp +++ b/test/integ/sharedMem/src/sharedMem.cpp @@ -131,18 +131,15 @@ TEMPLATE_LIST_TEST_CASE("sharedMem", "[sharedMem]", TestAccs) auto blockRetValuesDummy = alpaka::allocBuf(devAcc, static_cast(1)); - // Kernel input during the runtim of kernel will be different and is chosen to depend on workdiv. - // Therefore initially a workdiv is needed to find the parameter. Therefore in kernel bundle, we can not use the + + // Kernel input during the runtime of kernel will be different and is chosen to depend on workdiv. + // Therefore, initially a workdiv is needed to find the parameter. Therefore, in kernel bundle, we can not use the // real input for the buffer pointer. - auto const& bundeledKernel = alpaka::KernelBundle(kernel, std::data(blockRetValuesDummy)); + // Let alpaka calculate good block and grid sizes given our full problem extent - auto const workDiv = alpaka::getValidWorkDivForKernel( - devAcc, - bundeledKernel, - numElements, - static_cast(1u), - false, - alpaka::GridBlockExtentSubDivRestrictions::Unrestricted); + alpaka::KernelCfg const kernelCfg + = {numElements, static_cast(1u), false, alpaka::GridBlockExtentSubDivRestrictions::Unrestricted}; + auto const workDiv = alpaka::getValidWorkDiv(kernelCfg, devAcc, kernel, std::data(blockRetValuesDummy)); std::cout << "SharedMemKernel(" << " accelerator: " << alpaka::getAccName() diff --git a/test/unit/math/src/TestTemplate.hpp b/test/unit/math/src/TestTemplate.hpp index 7d45010c2d4e..3fdec6f69f14 100644 --- a/test/unit/math/src/TestTemplate.hpp +++ b/test/unit/math/src/TestTemplate.hpp @@ -98,17 +98,16 @@ namespace mathtest Args args{devAcc}; Results results{devAcc}; - - auto const& bundeledKernel - = alpaka::KernelBundle(kernel, results.pDevBuffer, wrappedFunctor, args.pDevBuffer); // Let alpaka calculate good block and grid sizes given our full problem extent - auto const workDiv = alpaka::getValidWorkDivForKernel( + alpaka::KernelCfg const kernelCfg + = {sizeExtent, elementsPerThread, false, alpaka::GridBlockExtentSubDivRestrictions::Unrestricted}; + auto const workDiv = alpaka::getValidWorkDiv( + kernelCfg, devAcc, - bundeledKernel, - sizeExtent, - elementsPerThread, - false, - alpaka::GridBlockExtentSubDivRestrictions::Unrestricted); + kernel, + results.pDevBuffer, + wrappedFunctor, + args.pDevBuffer); // SETUP COMPLETED. diff --git a/test/unit/workDiv/src/WorkDivForKernelTest.cpp b/test/unit/workDiv/src/WorkDivForKernelTest.cpp index f6de4a020776..88bdcdfe8c7c 100644 --- a/test/unit/workDiv/src/WorkDivForKernelTest.cpp +++ b/test/unit/workDiv/src/WorkDivForKernelTest.cpp @@ -9,7 +9,6 @@ #include #include #include -#include #include #include #include @@ -75,7 +74,7 @@ struct TestKernelWithManyRegisters using TestAccs = alpaka::test::EnabledAccs, std::uint32_t>; -TEMPLATE_LIST_TEST_CASE("getValidWorkDivForKernel.1D", "[workDivKernel]", TestAccs) +TEMPLATE_LIST_TEST_CASE("getValidWorkDiv.1D", "[workDivKernel]", TestAccs) { using Acc = TestType; using Idx = alpaka::Idx; @@ -86,18 +85,17 @@ TEMPLATE_LIST_TEST_CASE("getValidWorkDivForKernel.1D", "[workDivKernel]", TestAc auto const dev = alpaka::getDevByIdx(platform, 0); TestKernelWithManyRegisters kernel; - auto const kernelBundle = alpaka::KernelBundle(kernel, 200ul); // Get the device properties and hard limits auto const props = alpaka::getAccDevProps(dev); Idx const threadsPerGridTestValue = props.m_blockThreadCountMax * props.m_gridBlockCountMax; - // Test the getValidWorkDivForKernel function for threadsPerGridTestValue threads per grid. - auto const workDiv - = alpaka::getValidWorkDivForKernel(dev, kernelBundle, Vec{threadsPerGridTestValue}, Vec{1}); + // Test the getValidWorkDiv function for threadsPerGridTestValue threads per grid. + alpaka::KernelCfg const kernelCfg = {Vec{threadsPerGridTestValue}, Vec{1}}; + auto const workDiv = alpaka::getValidWorkDiv(kernelCfg, dev, kernel, 200ul); - // Test the isValidWorkDivKernel function - CHECK(alpaka::isValidWorkDivKernel(dev, kernelBundle, workDiv)); + // Test the isValidWorkDiv function + CHECK(alpaka::isValidWorkDiv(workDiv, dev, kernel, 200ul)); // Get calculated threads per block from the workDiv that was found by examining the kernel function. Idx const threadsPerBlock = workDiv.m_blockThreadExtent.prod(); @@ -110,15 +108,15 @@ TEMPLATE_LIST_TEST_CASE("getValidWorkDivForKernel.1D", "[workDivKernel]", TestAc // Check that using the maximum number of threads per block is valid. auto const validWorkDiv = WorkDiv{Vec{threadsPerGridTestValue / threadsPerBlock}, Vec{threadsPerBlock}, Vec{1}}; - CHECK(alpaka::isValidWorkDivKernel(dev, kernelBundle, validWorkDiv)); + CHECK(alpaka::isValidWorkDiv(validWorkDiv, dev, kernel, 200ul)); // Check that using too many threads per block is not valid. auto const invalidThreads = WorkDiv{Vec{1}, Vec{2 * threadsPerBlockLimit}, Vec{1}}; - CHECK(not alpaka::isValidWorkDivKernel(dev, kernelBundle, invalidThreads)); + CHECK(not alpaka::isValidWorkDiv(invalidThreads, dev, kernel, 200ul)); // Check that a work division with a single block, thread and element is always valid auto const serialWorkDiv = WorkDiv{Vec{1}, Vec{1}, Vec{1}}; - CHECK(alpaka::isValidWorkDivKernel(dev, kernelBundle, serialWorkDiv)); + CHECK(alpaka::isValidWorkDiv(serialWorkDiv, dev, kernel, 200ul)); // Some accelerators support only one thread per block: if constexpr(alpaka::isSingleThreadAcc) @@ -129,7 +127,7 @@ TEMPLATE_LIST_TEST_CASE("getValidWorkDivForKernel.1D", "[workDivKernel]", TestAc // Check that a work division with more than one thread per block is not valid. auto const parallelWorkDiv = WorkDiv{Vec{1}, Vec{2}, Vec{1}}; - CHECK(not alpaka::isValidWorkDivKernel(dev, kernelBundle, parallelWorkDiv)); + CHECK(not alpaka::isValidWorkDiv(parallelWorkDiv, dev, kernel, 200ul)); } // Check the maxDynamicSharedSizeBytes for CPU backends @@ -142,14 +140,14 @@ TEMPLATE_LIST_TEST_CASE("getValidWorkDivForKernel.1D", "[workDivKernel]", TestAc alpaka::TagCpuTbbBlocks>) { int const maxDynamicSharedSizeBytes - = alpaka::getFunctionAttributes(dev, kernelBundle).maxDynamicSharedSizeBytes; + = alpaka::getFunctionAttributes(dev, kernel, 200ul).maxDynamicSharedSizeBytes; CHECK(maxDynamicSharedSizeBytes == static_cast(alpaka::BlockSharedDynMemberAllocKiB * 1024)); } } using TestAccs2D = alpaka::test::EnabledAccs, std::uint32_t>; -TEMPLATE_LIST_TEST_CASE("getValidWorkDivForKernel.2D", "[workDivKernel]", TestAccs2D) +TEMPLATE_LIST_TEST_CASE("getValidWorkDiv.2D", "[workDivKernel]", TestAccs2D) { using Acc = TestType; using Idx = alpaka::Idx; @@ -160,18 +158,17 @@ TEMPLATE_LIST_TEST_CASE("getValidWorkDivForKernel.2D", "[workDivKernel]", TestAc auto const dev = alpaka::getDevByIdx(platform, 0); TestKernelWithManyRegisters kernel; - auto const kernelBundle = alpaka::KernelBundle(kernel, 200ul); // Get the device properties and hard limits auto const props = alpaka::getAccDevProps(dev); Idx const threadsPerGridTestValue = props.m_blockThreadCountMax * props.m_gridBlockCountMax; - // Test getValidWorkDivForKernel function for threadsPerGridTestValue threads per grid. - auto const workDiv - = alpaka::getValidWorkDivForKernel(dev, kernelBundle, Vec{8, threadsPerGridTestValue / 8}, Vec{1, 1}); + // Test getValidWorkDiv function for threadsPerGridTestValue threads per grid. + alpaka::KernelCfg const kernelCfg = {Vec{8, threadsPerGridTestValue / 8}, Vec{1, 1}}; + auto const workDiv = alpaka::getValidWorkDiv(kernelCfg, dev, kernel, 200ul); - // Test the isValidWorkDivKernel function - CHECK(alpaka::isValidWorkDivKernel(dev, kernelBundle, workDiv)); + // Test the isValidWorkDiv function + CHECK(alpaka::isValidWorkDiv(workDiv, dev, kernel, 200ul)); // The valid workdiv values for the kernel may change depending on the GPU type and compiler. // Therefore the generated workdiv is not compared to a specific workdiv in this test. @@ -188,15 +185,15 @@ TEMPLATE_LIST_TEST_CASE("getValidWorkDivForKernel.2D", "[workDivKernel]", TestAc // Check that using the maximum number of threads per block is valid. auto const validWorkDiv = WorkDiv{Vec{8, threadsPerGridTestValue / threadsPerBlock / 8}, Vec{1, threadsPerBlock}, Vec{1, 1}}; - CHECK(alpaka::isValidWorkDivKernel(dev, kernelBundle, validWorkDiv)); + CHECK(alpaka::isValidWorkDiv(validWorkDiv, dev, kernel, 200ul)); // Check that using too many threads per block is not valid. auto const invalidThreads = WorkDiv{Vec{1, 1}, Vec{2, threadsPerBlockLimit}, Vec{1, 1}}; - CHECK(not alpaka::isValidWorkDivKernel(dev, kernelBundle, invalidThreads)); + CHECK(not alpaka::isValidWorkDiv(invalidThreads, dev, kernel, 200ul)); // Check that a work division with a single block, thread and element is always valid auto const serialWorkDiv = WorkDiv{Vec{1, 1}, Vec{1, 1}, Vec{1, 1}}; - CHECK(alpaka::isValidWorkDivKernel(dev, kernelBundle, serialWorkDiv)); + CHECK(alpaka::isValidWorkDiv(serialWorkDiv, dev, kernel, 200ul)); // Some accelerators support only one thread per block: if constexpr(alpaka::isSingleThreadAcc) @@ -207,7 +204,7 @@ TEMPLATE_LIST_TEST_CASE("getValidWorkDivForKernel.2D", "[workDivKernel]", TestAc // Check that a work division with more than one thread per block is not valid. auto const parallelWorkDiv = WorkDiv{Vec{1, 1}, Vec{1, 2}, Vec{1, 1}}; - CHECK(not alpaka::isValidWorkDivKernel(dev, kernelBundle, parallelWorkDiv)); + CHECK(not alpaka::isValidWorkDiv(parallelWorkDiv, dev, kernel, 200ul)); } // Check the maxDynamicSharedSizeBytes for CPU backends @@ -220,7 +217,7 @@ TEMPLATE_LIST_TEST_CASE("getValidWorkDivForKernel.2D", "[workDivKernel]", TestAc alpaka::TagCpuTbbBlocks>) { int const maxDynamicSharedSizeBytes - = alpaka::getFunctionAttributes(dev, kernelBundle).maxDynamicSharedSizeBytes; + = alpaka::getFunctionAttributes(dev, kernel, 200ul).maxDynamicSharedSizeBytes; CHECK(maxDynamicSharedSizeBytes == static_cast(alpaka::BlockSharedDynMemberAllocKiB * 1024)); } }