From d7c459d739087df7efa5c622de6306f64a126716 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ren=C3=A9=20Widera?= <r.widera@hzdr.de>
Date: Thu, 15 Aug 2024 12:49:45 +0200
Subject: [PATCH] remove `KernelBundle`, change signature of
 `[get|is]ValidWorkDiv*` (#2349)

* remove `KernelBundle`, change signature of `getValidWorkDiv*`

- revert the `KernelBundle` introduced in #2251
- change signature of `getValidWorkDivForKernel` and `isValidWorkDivKernel`
- reuse old naming `getValidWorkDiv` and `isValidWorkDiv`

* use new interface for `getValidWorkDiv`

* fix cheat sheet
---
 docs/source/basic/cheatsheet.rst              |  28 ++--
 example/bufferCopy/src/bufferCopy.cpp         |  12 +-
 example/complex/src/complex.cpp               |   6 +-
 .../conv2DWithMdspan/src/conv2DWithMdspan.cpp |  10 +-
 example/convolution1D/src/convolution1D.cpp   |  14 +-
 example/convolution2D/src/convolution2D.cpp   |  20 +--
 .../counterBasedRng/src/counterBasedRng.cpp   |  23 ++-
 example/heatEquation/src/heatEquation.cpp     |   6 +-
 example/helloWorld/src/helloWorld.cpp         |   6 +-
 .../helloWorldLambda/src/helloWorldLambda.cpp |   8 +-
 .../src/kernelSpecialization.cpp              |   6 +-
 .../src/matrixMulMdSpan.cpp                   |  13 +-
 .../src/monteCarloIntegration.cpp             |   9 +-
 example/openMPSchedule/src/openMPSchedule.cpp |   6 +-
 example/randomCells2D/src/randomCells2D.cpp   |  20 ++-
 .../randomStrategies/src/randomStrategies.cpp |  39 ++---
 example/vectorAdd/src/vectorAdd.cpp           |   9 +-
 include/alpaka/alpaka.hpp                     |   1 -
 include/alpaka/kernel/KernelBundle.hpp        |  58 -------
 .../alpaka/kernel/TaskKernelCpuOmp2Blocks.hpp |  11 +-
 .../kernel/TaskKernelCpuOmp2Threads.hpp       |  11 +-
 include/alpaka/kernel/TaskKernelCpuSerial.hpp |  11 +-
 include/alpaka/kernel/TaskKernelCpuSycl.hpp   |  10 +-
 .../alpaka/kernel/TaskKernelCpuTbbBlocks.hpp  |  11 +-
 .../alpaka/kernel/TaskKernelCpuThreads.hpp    |  11 +-
 .../alpaka/kernel/TaskKernelFpgaSyclIntel.hpp |  10 +-
 .../alpaka/kernel/TaskKernelGenericSycl.hpp   |   1 -
 .../alpaka/kernel/TaskKernelGpuSyclIntel.hpp  |  10 +-
 .../kernel/TaskKernelGpuUniformCudaHipRt.hpp  |  21 ++-
 include/alpaka/kernel/Traits.hpp              |  61 ++++----
 .../alpaka/test/KernelExecutionFixture.hpp    |  15 +-
 include/alpaka/workdiv/WorkDivHelpers.hpp     | 146 ++++++++++--------
 test/integ/axpy/src/axpy.cpp                  |  18 ++-
 test/integ/mandelbrot/src/mandelbrot.cpp      |  17 +-
 test/integ/matMul/src/matMul.cpp              |  16 +-
 test/integ/separableCompilation/src/main.cpp  |  12 +-
 test/integ/sharedMem/src/sharedMem.cpp        |  17 +-
 test/unit/math/src/TestTemplate.hpp           |  17 +-
 .../unit/workDiv/src/WorkDivForKernelTest.cpp |  47 +++---
 39 files changed, 367 insertions(+), 400 deletions(-)
 delete mode 100644 include/alpaka/kernel/KernelBundle.hpp
diff --git a/docs/source/basic/cheatsheet.rst b/docs/source/basic/cheatsheet.rst
index 7cd60c4de97e..21ade07e134b 100644
--- a/docs/source/basic/cheatsheet.rst
+++ b/docs/source/basic/cheatsheet.rst
@@ -180,8 +180,6 @@ Prepare Kernel Bundle
   .. code-block:: c++
 
      HeatEquationKernel heatEqKernel;
-     // Arguments of KernelBundle: The kernel instance and the kernel arguments
-     auto const& bundeledKernel = alpaka::KernelBundle(heatEqKernel, pCurrAcc, pNextAcc, numNodesX, dx, dt);
 
 Automatically select a valid kernel launch configuration
   .. code-block:: c++
@@ -189,12 +187,21 @@ Automatically select a valid kernel launch configuration
      Vec<Dim, Idx> const globalThreadExtent = vectorValue;
      Vec<Dim, Idx> const elementsPerThread = vectorValue;
 
-     auto autoWorkDiv = getValidWorkDivForKernel<Acc>(
-       device,
-       bundeledKernel,
-       globalThreadExtent, elementsPerThread,
+     KernelCfg<Acc> const kernelCfg = {
+       globalThreadExtent,
+       elementsPerThread,
        false,
-       GridBlockExtentSubDivRestrictions::Unrestricted);
+       GridBlockExtentSubDivRestrictions::Unrestricted};
+
+     auto autoWorkDiv = getValidWorkDiv(
+       kernelCfg,
+       device,
+       heatEqKernel,
+       pCurrAcc,
+       pNextAcc,
+       numNodesX,
+       dx,
+       dt);
 
 Manually set a kernel launch configuration
   .. code-block:: c++
@@ -204,9 +211,10 @@ Manually set a kernel launch configuration
      Vec<Dim, Idx> const elementsPerThread = vectorValue;
 
      using WorkDiv = WorkDivMembers<Dim, Idx>;
-     auto manualWorkDiv = WorkDiv{blocksPerGrid,
-                                  threadsPerBlock,
-				  elementsPerThread};
+     auto manualWorkDiv = WorkDiv{
+       blocksPerGrid,
+       threadsPerBlock,
+       elementsPerThread};
 
 Instantiate a kernel and create a task that will run it (does not launch it yet)
   .. code-block:: c++
diff --git a/example/bufferCopy/src/bufferCopy.cpp b/example/bufferCopy/src/bufferCopy.cpp
index 53d9d25e7f84..1c99de879cfb 100644
--- a/example/bufferCopy/src/bufferCopy.cpp
+++ b/example/bufferCopy/src/bufferCopy.cpp
@@ -118,7 +118,7 @@ auto example(TAccTag const&) -> int
     using Data = std::uint32_t;
     constexpr Idx nElementsPerDim = 2;
 
-    const Vec extents(Vec::all(static_cast<Idx>(nElementsPerDim)));
+    Vec const extents(Vec::all(static_cast<Idx>(nElementsPerDim)));
 
     // Allocate host memory buffers
     //
@@ -164,9 +164,8 @@ auto example(TAccTag const&) -> int
 
     FillBufferKernel fillBufferKernel;
 
-    auto const& bundeledFillBufferKernel = alpaka::KernelBundle(fillBufferKernel, hostViewPlainPtrMdSpan);
-    auto const hostWorkDiv
-        = alpaka::getValidWorkDivForKernel<Host>(devHost, bundeledFillBufferKernel, threadsPerGrid, elementsPerThread);
+    alpaka::KernelCfg<Host> const hostKernelCfg = {threadsPerGrid, elementsPerThread};
+    auto const hostWorkDiv = alpaka::getValidWorkDiv(hostKernelCfg, devHost, fillBufferKernel, hostViewPlainPtrMdSpan);
 
     alpaka::exec<Host>(hostQueue, hostWorkDiv, fillBufferKernel,
                        hostViewPlainPtrMdSpan); // 1st kernel argument
@@ -203,11 +202,10 @@ auto example(TAccTag const&) -> int
     auto deviceBufferMdSpan2 = alpaka::experimental::getMdSpan(deviceBuffer2);
 
     TestBufferKernel testBufferKernel;
-    auto const& bundeledTestBufferKernel = alpaka::KernelBundle(testBufferKernel, deviceBufferMdSpan1);
 
     // Let alpaka calculate good block and grid sizes given our full problem extent
-    auto const devWorkDiv
-        = alpaka::getValidWorkDivForKernel<Acc>(devAcc, bundeledTestBufferKernel, threadsPerGrid, elementsPerThread);
+    alpaka::KernelCfg<Acc> const devKernelCfg = {threadsPerGrid, elementsPerThread};
+    auto const devWorkDiv = alpaka::getValidWorkDiv(devKernelCfg, devAcc, testBufferKernel, deviceBufferMdSpan1);
 
     alpaka::exec<Acc>(devQueue, devWorkDiv, testBufferKernel, deviceBufferMdSpan1);
     alpaka::exec<Acc>(devQueue, devWorkDiv, testBufferKernel, deviceBufferMdSpan2);
diff --git a/example/complex/src/complex.cpp b/example/complex/src/complex.cpp
index 7c9b39563460..ece105e845fd 100644
--- a/example/complex/src/complex.cpp
+++ b/example/complex/src/complex.cpp
@@ -58,10 +58,10 @@ auto example(TAccTag const&) -> int
 
     ComplexKernel complexKernel;
 
-    auto const& bundeledKernel = alpaka::KernelBundle(complexKernel);
+    alpaka::KernelCfg<Acc> const kernelCfg = {threadsPerGrid, elementsPerThread};
+
     // Let alpaka calculate good block and grid sizes given our full problem extent
-    auto const workDiv
-        = alpaka::getValidWorkDivForKernel<Acc>(devAcc, bundeledKernel, threadsPerGrid, elementsPerThread);
+    auto const workDiv = alpaka::getValidWorkDiv(kernelCfg, devAcc, complexKernel);
 
     // Run the kernel
     alpaka::exec<Acc>(queue, workDiv, complexKernel);
diff --git a/example/conv2DWithMdspan/src/conv2DWithMdspan.cpp b/example/conv2DWithMdspan/src/conv2DWithMdspan.cpp
index 0a8b7d165b7d..698a8d12fa75 100644
--- a/example/conv2DWithMdspan/src/conv2DWithMdspan.cpp
+++ b/example/conv2DWithMdspan/src/conv2DWithMdspan.cpp
@@ -147,16 +147,16 @@ auto example(TAccTag const&) -> int
     //  Construct kernel object
     ConvolutionKernelMdspan2D convolutionKernel2D;
 
-    // Make a bundle
-    auto const& bundeledKernel = alpaka::KernelBundle(
+    //   Let alpaka calculate good block and grid sizes given our full problem extent.
+    alpaka::KernelCfg<DevAcc> const kernelCfg = {extent, Vec::ones()};
+    auto const workDiv = alpaka::getValidWorkDiv(
+        kernelCfg,
+        devAcc,
         convolutionKernel2D,
         alpaka::experimental::getMdSpan(bufInputAcc),
         alpaka::experimental::getMdSpan(outputDeviceMemory),
         alpaka::experimental::getMdSpan(bufFilterAcc));
 
-    //   Let alpaka calculate good block and grid sizes given our full problem extent.
-    auto const workDiv = alpaka::getValidWorkDivForKernel<DevAcc>(devAcc, bundeledKernel, extent, Vec::ones());
-
 
     // Run the kernel, pass 3 arrays as 2D mdspans
     alpaka::exec<DevAcc>(
diff --git a/example/convolution1D/src/convolution1D.cpp b/example/convolution1D/src/convolution1D.cpp
index 098dc8501d09..7aa3012206a7 100644
--- a/example/convolution1D/src/convolution1D.cpp
+++ b/example/convolution1D/src/convolution1D.cpp
@@ -37,8 +37,8 @@ struct ConvolutionKernel
         TElem const* const input,
         TElem const* const filter,
         TElem* const output,
-        const std::size_t inputSize,
-        const std::size_t filterSize) const -> void
+        std::size_t const inputSize,
+        std::size_t const filterSize) const -> void
     {
         auto const globalThreadIdxX = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];
 
@@ -140,7 +140,12 @@ auto example(TAccTag const&) -> int
     DataType* nativeInputDeviceMemory = std::data(inputDeviceMemory);
     DataType* nativeOutputDeviceMemory = std::data(outputDeviceMemory);
 
-    auto const& bundeledKernel = alpaka::KernelBundle(
+    alpaka::KernelCfg<DevAcc> const kernelCfg = {threadsPerGrid, elementsPerThread};
+
+    // Let alpaka calculate good block and grid sizes given our full problem extent
+    auto const workDiv = alpaka::getValidWorkDiv(
+        kernelCfg,
+        devAcc,
         convolutionKernel,
         nativeInputDeviceMemory,
         nativeFilterDeviceMemory,
@@ -148,9 +153,6 @@ auto example(TAccTag const&) -> int
         inputSize,
         filterSize);
 
-    // Let alpaka calculate good block and grid sizes given our full problem extent
-    auto const workDiv
-        = alpaka::getValidWorkDivForKernel<DevAcc>(devAcc, bundeledKernel, threadsPerGrid, elementsPerThread);
     // Run the kernel
     alpaka::exec<DevAcc>(
         queue,
diff --git a/example/convolution2D/src/convolution2D.cpp b/example/convolution2D/src/convolution2D.cpp
index 2c8a6b28d850..87f618c7380e 100644
--- a/example/convolution2D/src/convolution2D.cpp
+++ b/example/convolution2D/src/convolution2D.cpp
@@ -265,7 +265,7 @@ auto example(TAccTag const&) -> int
     alpaka::wait(queueAcc);
 
     // Calculate the allocated width, due to padding it might be larger then the matrix width
-    auto const intputWidthAllocated = [&]() -> const Idx
+    auto const intputWidthAllocated = [&]() -> Idx const
     {
         // Calculate pitch: The size of one line in bytes including padding.
         auto const rowPitchInput{alpaka::getPitchesInBytes(bufInputAcc)[0]};
@@ -294,7 +294,7 @@ auto example(TAccTag const&) -> int
     alpaka::wait(queueAcc);
 
     // Calculate the allocated width, due to padding it might be larger then the matrix width
-    auto const filterWidthAllocated = [&]() -> const Idx
+    auto const filterWidthAllocated = [&]() -> Idx const
     {
         // Calculate pitch: The size of one line in bytes including padding.
         auto const rowPitchFilter{alpaka::getPitchesInBytes(bufFilterAcc)[0]};
@@ -305,20 +305,22 @@ auto example(TAccTag const&) -> int
     //  ConvolutionKernel2DSharedMemory
     ConvolutionKernel2DSharedMemory convolutionKernel2D;
 
-    auto const& bundeledKernel = alpaka::KernelBundle(
+    alpaka::KernelCfg<DevAcc> kernelCfg = {extent, Vec::ones()};
+
+    //   Let alpaka calculate good block and grid sizes given our full problem extent.
+    auto const workDiv = alpaka::getValidWorkDiv(
+        kernelCfg,
+        devAcc,
         convolutionKernel2D,
-        alpaka::getPtrNative(bufInputAcc),
-        alpaka::getPtrNative(outputDeviceMemory),
+        std::data(bufInputAcc),
+        std::data(outputDeviceMemory),
         matrixWidth,
         matrixHeight,
-        alpaka::getPtrNative(bufFilterAcc),
+        std::data(bufFilterAcc),
         filterWidth,
         intputWidthAllocated,
         filterWidthAllocated);
 
-    //   Let alpaka calculate good block and grid sizes given our full problem extent.
-    auto const workDiv = alpaka::getValidWorkDivForKernel<DevAcc>(devAcc, bundeledKernel, extent, Vec::ones());
-
     // Run the kernel
     alpaka::exec<DevAcc>(
         queueAcc,
diff --git a/example/counterBasedRng/src/counterBasedRng.cpp b/example/counterBasedRng/src/counterBasedRng.cpp
index 7a9a9abfc7fe..d96ab2b775a2 100644
--- a/example/counterBasedRng/src/counterBasedRng.cpp
+++ b/example/counterBasedRng/src/counterBasedRng.cpp
@@ -147,15 +147,15 @@ auto example(TAccTag const&) -> int
     BufAcc bufAcc(alpaka::allocBuf<Data, Idx>(devAcc, extent));
 
     CounterBasedRngKernel counterBasedRngKernel;
-    auto const& bundeledKernel
-        = alpaka::KernelBundle(counterBasedRngKernel, alpaka::experimental::getMdSpan(bufAcc), key);
-    auto const& bundeledKernel2
-        = alpaka::KernelBundle(counterBasedRngKernel, alpaka::experimental::getMdSpan(bufHost), key);
 
     // Let alpaka calculate good block and grid sizes given our full problem extent
-    auto const workDivAcc = alpaka::getValidWorkDivForKernel<Acc>(devAcc, bundeledKernel, extent, elementsPerThread);
-    auto const workDivHost
-        = alpaka::getValidWorkDivForKernel<AccHost>(devHost, bundeledKernel2, extent, elementsPerThreadHost);
+    alpaka::KernelCfg<Acc> kernerlCfgAccDev = {extent, elementsPerThread};
+    auto const workDivAcc = alpaka::getValidWorkDiv(
+        kernerlCfgAccDev,
+        devAcc,
+        counterBasedRngKernel,
+        alpaka::experimental::getMdSpan(bufAcc),
+        key);
 
     // Create the kernel execution task.
     auto const taskKernelAcc = alpaka::createTaskKernel<Acc>(
@@ -163,6 +163,15 @@ auto example(TAccTag const&) -> int
         CounterBasedRngKernel(),
         alpaka::experimental::getMdSpan(bufAcc),
         key);
+
+    alpaka::KernelCfg<AccHost> kernerlCfgAccHost = {extent, elementsPerThreadHost};
+    auto const workDivHost = alpaka::getValidWorkDiv(
+        kernerlCfgAccHost,
+        devHost,
+        counterBasedRngKernel,
+        alpaka::experimental::getMdSpan(bufHost),
+        key);
+
     auto const taskKernelHost = alpaka::createTaskKernel<AccHost>(
         workDivHost,
         CounterBasedRngKernel(),
diff --git a/example/heatEquation/src/heatEquation.cpp b/example/heatEquation/src/heatEquation.cpp
index df43a4e0ed47..a13b3f00bc26 100644
--- a/example/heatEquation/src/heatEquation.cpp
+++ b/example/heatEquation/src/heatEquation.cpp
@@ -134,9 +134,11 @@ auto example(TAccTag const&) -> int
 
     HeatEquationKernel heatEqKernel;
 
-    auto const& bundeledKernel = alpaka::KernelBundle(heatEqKernel, pCurrAcc, pNextAcc, numNodesX, dx, dt);
+    alpaka::KernelCfg<Acc> const kernelCfg = {extent, elemPerThread};
+
     // Let alpaka calculate good block and grid sizes given our full problem extent
-    auto const workDiv = alpaka::getValidWorkDivForKernel<Acc>(devAcc, bundeledKernel, extent, elemPerThread);
+    auto const workDiv
+        = alpaka::getValidWorkDiv(kernelCfg, devAcc, heatEqKernel, pCurrAcc, pNextAcc, numNodesX, dx, dt);
 
     // Copy host -> device
     alpaka::memcpy(queue, uCurrBufAcc, uCurrBufHost);
diff --git a/example/helloWorld/src/helloWorld.cpp b/example/helloWorld/src/helloWorld.cpp
index 646df34d7b66..79ad64ae9494 100644
--- a/example/helloWorld/src/helloWorld.cpp
+++ b/example/helloWorld/src/helloWorld.cpp
@@ -135,10 +135,10 @@ auto example(TAccTag const&) -> int
     // argument. So a kernel can be a class or struct, a lambda, etc.
     HelloWorldKernel helloWorldKernel;
 
-    auto const& bundeledKernel = alpaka::KernelBundle(helloWorldKernel);
+    alpaka::KernelCfg<Acc> const kernelCfg = {threadsPerGrid, elementsPerThread};
+
     // Let alpaka calculate good block and grid sizes given our full problem extent
-    auto const workDiv
-        = alpaka::getValidWorkDivForKernel<Acc>(devAcc, bundeledKernel, threadsPerGrid, elementsPerThread);
+    auto const workDiv = alpaka::getValidWorkDiv(kernelCfg, devAcc, helloWorldKernel);
 
     // Run the kernel
     //
diff --git a/example/helloWorldLambda/src/helloWorldLambda.cpp b/example/helloWorldLambda/src/helloWorldLambda.cpp
index b0e028cea2d7..85d599b63b6a 100644
--- a/example/helloWorldLambda/src/helloWorldLambda.cpp
+++ b/example/helloWorldLambda/src/helloWorldLambda.cpp
@@ -78,7 +78,7 @@ auto example(TAccTag const&) -> int
     auto const threadsPerGrid = Vec{4, 2, 4};
 
 
-    const size_t nExclamationMarks = 10;
+    size_t const nExclamationMarks = 10;
 
     // Run "Hello World" kernel with a lambda function
     //
@@ -117,10 +117,10 @@ auto example(TAccTag const&) -> int
         printf("\n");
     };
 
-    auto const& bundeledKernel = alpaka::KernelBundle(kernelLambda, nExclamationMarks);
+    alpaka::KernelCfg<Acc> const kernelCfg = {threadsPerGrid, elementsPerThread};
+
     // Let alpaka calculate good block and grid sizes given our full problem extent
-    auto const workDiv
-        = alpaka::getValidWorkDivForKernel<Acc>(devAcc, bundeledKernel, threadsPerGrid, elementsPerThread);
+    auto const workDiv = alpaka::getValidWorkDiv(kernelCfg, devAcc, kernelLambda, nExclamationMarks);
 
     alpaka::exec<Acc>(queue, workDiv, kernelLambda, nExclamationMarks);
     alpaka::wait(queue);
diff --git a/example/kernelSpecialization/src/kernelSpecialization.cpp b/example/kernelSpecialization/src/kernelSpecialization.cpp
index 6bb7ccbda79f..899e61ad7f4a 100644
--- a/example/kernelSpecialization/src/kernelSpecialization.cpp
+++ b/example/kernelSpecialization/src/kernelSpecialization.cpp
@@ -81,10 +81,10 @@ auto example(TAccTag const&) -> int
     std::size_t const elementsPerThread = 1u;
     Kernel kernel;
 
-    auto const& bundeledKernel = alpaka::KernelBundle(kernel);
+    alpaka::KernelCfg<Acc> const kernelCfg = {threadsPerGrid, elementsPerThread};
+
     // Let alpaka calculate good block and grid sizes given our full problem extent
-    auto const workDiv
-        = alpaka::getValidWorkDivForKernel<Acc>(devAcc, bundeledKernel, threadsPerGrid, elementsPerThread);
+    auto const workDiv = alpaka::getValidWorkDiv(kernelCfg, devAcc, kernel);
 
     // Run the kernel
     alpaka::exec<Acc>(queue, workDiv, kernel);
diff --git a/example/matrixMulWithMdspan/src/matrixMulMdSpan.cpp b/example/matrixMulWithMdspan/src/matrixMulMdSpan.cpp
index 1a5ee577b405..e34dcb2d60fe 100644
--- a/example/matrixMulWithMdspan/src/matrixMulMdSpan.cpp
+++ b/example/matrixMulWithMdspan/src/matrixMulMdSpan.cpp
@@ -147,19 +147,14 @@ auto example(TAccTag const&) -> int
     auto mdDevC = alpaka::experimental::getMdSpan(bufDevC);
 
     MatrixMulKernel kernel;
-    auto const& bundeledKernel = alpaka::KernelBundle(kernel, mdDevA, mdDevB, mdDevC);
 
     // Let alpaka calculate good block and grid sizes given our full problem extent
-    auto const workDiv = alpaka::getValidWorkDivForKernel<Acc>(
-        devAcc,
-        bundeledKernel,
-        extentC,
-        Vec::ones(),
-        false,
-        alpaka::GridBlockExtentSubDivRestrictions::Unrestricted);
+    alpaka::KernelCfg<Acc> const kernelCfg
+        = {extentC, Vec::ones(), false, alpaka::GridBlockExtentSubDivRestrictions::Unrestricted};
+    auto const workDiv = alpaka::getValidWorkDiv<Acc>(kernelCfg, devAcc, kernel, mdDevA, mdDevB, mdDevC);
 
     // Execute the kernel
-    alpaka::exec<Acc>(queue, workDiv, MatrixMulKernel{}, mdDevA, mdDevB, mdDevC);
+    alpaka::exec<Acc>(queue, workDiv, kernel, mdDevA, mdDevB, mdDevC);
 
     // Copy result back to host
     alpaka::memcpy(queue, bufHostC, bufDevC);
diff --git a/example/monteCarloIntegration/src/monteCarloIntegration.cpp b/example/monteCarloIntegration/src/monteCarloIntegration.cpp
index fd0961979b36..b26cd2af10fa 100644
--- a/example/monteCarloIntegration/src/monteCarloIntegration.cpp
+++ b/example/monteCarloIntegration/src/monteCarloIntegration.cpp
@@ -112,14 +112,11 @@ auto example(TAccTag const&) -> int
     bufHost[0] = 0.0f;
     alpaka::memcpy(queue, bufAcc, bufHost);
 
+    alpaka::KernelCfg<Acc> const kernelCfg = {Vec(numThreads), Vec(numAlpakaElementsPerThread)};
     Kernel kernel;
-    auto const& bundeledKernel = alpaka::KernelBundle(kernel, numPoints, ptrBufAcc, Function{});
+
     // Let alpaka calculate good block and grid sizes given our full problem extent
-    auto const workDiv = alpaka::getValidWorkDivForKernel<Acc>(
-        devAcc,
-        bundeledKernel,
-        Vec(numThreads),
-        Vec(numAlpakaElementsPerThread));
+    auto const workDiv = alpaka::getValidWorkDiv(kernelCfg, devAcc, kernel, numPoints, ptrBufAcc, Function{});
 
     alpaka::exec<Acc>(queue, workDiv, kernel, numPoints, ptrBufAcc, Function{});
     alpaka::memcpy(queue, bufHost, bufAcc);
diff --git a/example/openMPSchedule/src/openMPSchedule.cpp b/example/openMPSchedule/src/openMPSchedule.cpp
index 1febb42cd685..b2d149bec7b7 100644
--- a/example/openMPSchedule/src/openMPSchedule.cpp
+++ b/example/openMPSchedule/src/openMPSchedule.cpp
@@ -108,10 +108,10 @@ auto main() -> int
     Idx const elementsPerThread = 1u;
 
     OpenMPScheduleDefaultKernel openMPScheduleDefaultKernel;
-    auto const& bundeledKernel = alpaka::KernelBundle(openMPScheduleDefaultKernel);
+
     // Let alpaka calculate good block and grid sizes given our full problem extent
-    auto const workDiv
-        = alpaka::getValidWorkDivForKernel<Acc>(devAcc, bundeledKernel, threadsPerGrid, elementsPerThread);
+    alpaka::KernelCfg<Acc> kernelCfg = {threadsPerGrid, elementsPerThread};
+    auto const workDiv = alpaka::getValidWorkDiv(kernelCfg, devAcc, openMPScheduleDefaultKernel);
 
     // Run the kernel setting no schedule explicitly.
     std::cout << "OpenMPScheduleDefaultKernel setting no schedule explicitly:\n";
diff --git a/example/randomCells2D/src/randomCells2D.cpp b/example/randomCells2D/src/randomCells2D.cpp
index b5b45a5ef423..36bc1258d3b0 100644
--- a/example/randomCells2D/src/randomCells2D.cpp
+++ b/example/randomCells2D/src/randomCells2D.cpp
@@ -202,11 +202,11 @@ auto example(TAccTag const&) -> int
 
     auto pitchBufAccRandV = alpaka::getPitchesInBytes(bufAccRandV)[0];
 
-    auto const& bundeledKernelInitRandom
-        = alpaka::KernelBundle(initRandomKernel, extent, ptrBufAccRandS, pitchBufAccRandS);
+    alpaka::KernelCfg<Acc> const kernelCfg = {extent, Vec(perThreadY, perThreadX)};
+
     // Let alpaka calculate good block and grid sizes given our full problem extent
     auto const workDivInitRandom
-        = alpaka::getValidWorkDivForKernel<Acc>(devAcc, bundeledKernelInitRandom, extent, Vec(perThreadY, perThreadX));
+        = alpaka::getValidWorkDiv(kernelCfg, devAcc, initRandomKernel, extent, ptrBufAccRandS, pitchBufAccRandS);
 
     alpaka::exec<Acc>(queue, workDivInitRandom, initRandomKernel, extent, ptrBufAccRandS, pitchBufAccRandS);
     alpaka::wait(queue);
@@ -230,7 +230,12 @@ auto example(TAccTag const&) -> int
     alpaka::memcpy(queue, bufAccS, bufHostS);
     RunTimestepKernelSingle runTimestepKernelSingle;
 
-    auto const& bundeledKernelRuntimeStep = alpaka::KernelBundle(
+    alpaka::KernelCfg<Acc> const runtimeRandomKernelCfg = {extent, Vec(perThreadY, perThreadX)};
+
+    // Let alpaka calculate good block and grid sizes given our full problem extent
+    auto const workDivRuntimeStep = alpaka::getValidWorkDiv(
+        runtimeRandomKernelCfg,
+        devAcc,
         runTimestepKernelSingle,
         extent,
         ptrBufAccRandS,
@@ -238,13 +243,6 @@ auto example(TAccTag const&) -> int
         pitchBufAccRandS,
         pitchBufAccS);
 
-    // Let alpaka calculate good block and grid sizes given our full problem extent
-    auto const workDivRuntimeStep = alpaka::getValidWorkDivForKernel<Acc>(
-        devAcc,
-        bundeledKernelRuntimeStep,
-        extent,
-        Vec(perThreadY, perThreadX));
-
     alpaka::exec<Acc>(
         queue,
         workDivRuntimeStep,
diff --git a/example/randomStrategies/src/randomStrategies.cpp b/example/randomStrategies/src/randomStrategies.cpp
index ea87d290a2c4..6a1940c8b244 100644
--- a/example/randomStrategies/src/randomStrategies.cpp
+++ b/example/randomStrategies/src/randomStrategies.cpp
@@ -247,20 +247,20 @@ void runStrategy(Box<TAccTag>& box)
     // the initial parameters solely from the thread index
 
 
-    auto const& bundeledKernel = alpaka::KernelBundle(
-        initRandomKernel,
-        box.extentRand,
-        ptrBufAccRand,
-        static_cast<unsigned>(box.extentResult[0] / box.extentRand[0]));
+    alpaka::KernelCfg<typename Box<TAccTag>::Acc> kernelCfg
+        = {box.extentRand,
+           typename Box<TAccTag>::Vec(typename Box<TAccTag>::Idx{1}),
+           false,
+           alpaka::GridBlockExtentSubDivRestrictions::Unrestricted};
 
     // Let alpaka calculate good block and grid sizes given our full problem extent
-    auto const workDivRand = alpaka::getValidWorkDivForKernel<typename Box<TAccTag>::Acc>(
+    auto const workDivRand = alpaka::getValidWorkDiv(
+        kernelCfg,
         alpaka::getDevByIdx(box.accPlatform, 0),
-        bundeledKernel,
+        initRandomKernel,
         box.extentRand,
-        typename Box<TAccTag>::Vec(typename Box<TAccTag>::Idx{1}),
-        false,
-        alpaka::GridBlockExtentSubDivRestrictions::Unrestricted);
+        ptrBufAccRand,
+        static_cast<unsigned>(box.extentResult[0] / box.extentRand[0]));
 
 
     alpaka::exec<typename Box<TAccTag>::Acc>(
@@ -291,18 +291,21 @@ void runStrategy(Box<TAccTag>& box)
     alpaka::memcpy(box.queue, box.bufAccResult, box.bufHostResult);
     FillKernel fillKernel;
 
-    auto const& bundeledKernelFill
-        = alpaka::KernelBundle(fillKernel, box.extentResult, ptrBufAccRand, ptrBufAccResult);
+    alpaka::KernelCfg<typename Box<TAccTag>::Acc> fillKernelCfg
+        = {box.extentResult,
+           typename Box<TAccTag>::Vec(static_cast<typename Box<TAccTag>::Idx>(
+               NUM_ROLLS)), // One thread per "point"; each performs NUM_ROLLS "rolls"
+           false,
+           alpaka::GridBlockExtentSubDivRestrictions::Unrestricted};
 
     // Let alpaka calculate good block and grid sizes given our full problem extent
-    auto const workdivResult = alpaka::getValidWorkDivForKernel<typename Box<TAccTag>::Acc>(
+    auto const workdivResult = alpaka::getValidWorkDiv(
+        fillKernelCfg,
         alpaka::getDevByIdx(box.accPlatform, 0),
-        bundeledKernelFill,
+        fillKernel,
         box.extentResult,
-        typename Box<TAccTag>::Vec(static_cast<typename Box<TAccTag>::Idx>(
-            NUM_ROLLS)), // One thread per "point"; each performs NUM_ROLLS "rolls"
-        false,
-        alpaka::GridBlockExtentSubDivRestrictions::Unrestricted);
+        ptrBufAccRand,
+        ptrBufAccResult);
 
 
     alpaka::exec<typename Box<TAccTag>::Acc>(
diff --git a/example/vectorAdd/src/vectorAdd.cpp b/example/vectorAdd/src/vectorAdd.cpp
index a99393fb8b5b..05b3303d7c7b 100644
--- a/example/vectorAdd/src/vectorAdd.cpp
+++ b/example/vectorAdd/src/vectorAdd.cpp
@@ -130,14 +130,17 @@ auto example(TAccTag const&) -> int
     // Instantiate the kernel function object
     VectorAddKernel kernel;
 
-    auto const& bundeledKernel = alpaka::KernelBundle(
+    alpaka::KernelCfg<Acc> const kernelCfg = {extent, elementsPerThread};
+
+    // Let alpaka calculate good block and grid sizes given our full problem extent
+    auto const workDiv = alpaka::getValidWorkDiv(
+        kernelCfg,
+        devAcc,
         kernel,
         alpaka::getPtrNative(bufAccA),
         alpaka::getPtrNative(bufAccB),
         alpaka::getPtrNative(bufAccC),
         numElements);
-    // Let alpaka calculate good block and grid sizes given our full problem extent
-    auto const workDiv = alpaka::getValidWorkDivForKernel<Acc>(devAcc, bundeledKernel, extent, elementsPerThread);
 
     // Create the kernel execution task.
     auto const taskKernel = alpaka::createTaskKernel<Acc>(
diff --git a/include/alpaka/alpaka.hpp b/include/alpaka/alpaka.hpp
index 5f654b32a0a1..e06dede53d48 100644
--- a/include/alpaka/alpaka.hpp
+++ b/include/alpaka/alpaka.hpp
@@ -112,7 +112,6 @@
 #include "alpaka/idx/gb/IdxGbRef.hpp"
 #include "alpaka/idx/gb/IdxGbUniformCudaHipBuiltIn.hpp"
 // kernel
-#include "alpaka/kernel/KernelBundle.hpp"
 #include "alpaka/kernel/TaskKernelCpuOmp2Blocks.hpp"
 #include "alpaka/kernel/TaskKernelCpuOmp2Threads.hpp"
 #include "alpaka/kernel/TaskKernelCpuSerial.hpp"
diff --git a/include/alpaka/kernel/KernelBundle.hpp b/include/alpaka/kernel/KernelBundle.hpp
deleted file mode 100644
index 9a7c59b8d005..000000000000
--- a/include/alpaka/kernel/KernelBundle.hpp
+++ /dev/null
@@ -1,58 +0,0 @@
-/* Copyright 2022 Benjamin Worpitz, Bert Wesarg, René Widera, Sergei Bastrakov, Bernhard Manfred Gruber, Mehmet
- * Yusufoglu SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include <alpaka/core/Common.hpp>
-#include <alpaka/core/RemoveRestrict.hpp>
-
-#include <tuple>
-#include <type_traits>
-
-namespace alpaka
-{
-    //! \brief The class used to bind kernel function object and arguments together. Once an instance of this class is
-    //! created, arguments are not needed to be separately given to functions who need kernel function and arguments.
-    //! \tparam TKernelFn The kernel function object type.
-    //! \tparam TArgs Kernel function object invocation argument types as a parameter pack.
-    template<typename TKernelFn, typename... TArgs>
-    class KernelBundle
-    {
-    public:
-        //! The function object type
-        using KernelFn = TKernelFn;
-        //! Tuple type to encapsulate kernel function argument types and argument values
-        using ArgTuple = std::tuple<remove_restrict_t<std::decay_t<TArgs>>...>;
-
-        // Constructor
-        KernelBundle(KernelFn kernelFn, TArgs&&... args)
-            : m_kernelFn(std::move(kernelFn))
-            , m_args(std::forward<TArgs>(args)...)
-        {
-        }
-
-    private:
-        KernelFn m_kernelFn;
-        ArgTuple m_args; // Store the argument types without const and reference
-    };
-
-    //! \brief User defined deduction guide with trailing return type. For CTAD during the construction.
-    //! \tparam TKernelFn The kernel function object type.
-    //! \tparam TArgs Kernel function object argument types as a parameter pack.
-    //! \param kernelFn The kernel object
-#if BOOST_COMP_CLANG
-#    pragma clang diagnostic push
-#    pragma clang diagnostic ignored "-Wdocumentation" // clang does not support the syntax for variadic template
-                                                       // arguments "args,...". Ignore the error.
-#endif
-    //! \param args,... The kernel invocation arguments.
-#if BOOST_COMP_CLANG
-#    pragma clang diagnostic pop
-#endif
-    //! \return Kernel function bundle. An instance of KernelBundle which consists the kernel function object and its
-    //! arguments.
-    template<typename TKernelFn, typename... TArgs>
-    ALPAKA_FN_HOST KernelBundle(TKernelFn, TArgs&&...) -> KernelBundle<TKernelFn, TArgs...>;
-
-} // namespace alpaka
diff --git a/include/alpaka/kernel/TaskKernelCpuOmp2Blocks.hpp b/include/alpaka/kernel/TaskKernelCpuOmp2Blocks.hpp
index 456f0c42e339..d1e5f4b0574b 100644
--- a/include/alpaka/kernel/TaskKernelCpuOmp2Blocks.hpp
+++ b/include/alpaka/kernel/TaskKernelCpuOmp2Blocks.hpp
@@ -17,7 +17,6 @@
 #include "alpaka/core/OmpSchedule.hpp"
 #include "alpaka/dev/DevCpu.hpp"
 #include "alpaka/idx/MapIdx.hpp"
-#include "alpaka/kernel/KernelBundle.hpp"
 #include "alpaka/kernel/KernelFunctionAttributes.hpp"
 #include "alpaka/kernel/Traits.hpp"
 #include "alpaka/platform/PlatformCpu.hpp"
@@ -957,17 +956,17 @@ namespace alpaka
         //! \tparam TKernelFn Kernel function object type.
         //! \tparam TArgs Kernel function object argument types as a parameter pack.
         template<typename TDev, typename TDim, typename TIdx, typename TKernelFn, typename... TArgs>
-        struct FunctionAttributes<AccCpuOmp2Blocks<TDim, TIdx>, TDev, KernelBundle<TKernelFn, TArgs...>>
+        struct FunctionAttributes<AccCpuOmp2Blocks<TDim, TIdx>, TDev, TKernelFn, TArgs...>
         {
             //! \param dev The device instance
-            //! \param kernelBundle Kernel bundeled with it's arguments. The function attributes of this kernel will be
-            //! determined. Max threads per block is one of the attributes.
+            //! \param kernelFn The kernel function object which should be executed.
+            //! \param args The kernel invocation arguments.
             //! \return KernelFunctionAttributes instance. The default version always returns an instance with zero
             //! fields. For CPU, the field of max threads allowed by kernel function for the block is 1.
             ALPAKA_FN_HOST static auto getFunctionAttributes(
                 TDev const& dev,
-                [[maybe_unused]] KernelBundle<TKernelFn, TArgs...> const& kernelBundle)
-                -> alpaka::KernelFunctionAttributes
+                [[maybe_unused]] TKernelFn const& kernelFn,
+                [[maybe_unused]] TArgs&&... args) -> alpaka::KernelFunctionAttributes
             {
                 alpaka::KernelFunctionAttributes kernelFunctionAttributes;
 
diff --git a/include/alpaka/kernel/TaskKernelCpuOmp2Threads.hpp b/include/alpaka/kernel/TaskKernelCpuOmp2Threads.hpp
index 8f68e0f1489d..6b08e9693a0e 100644
--- a/include/alpaka/kernel/TaskKernelCpuOmp2Threads.hpp
+++ b/include/alpaka/kernel/TaskKernelCpuOmp2Threads.hpp
@@ -15,7 +15,6 @@
 #include "alpaka/acc/AccCpuOmp2Threads.hpp"
 #include "alpaka/core/Decay.hpp"
 #include "alpaka/dev/DevCpu.hpp"
-#include "alpaka/kernel/KernelBundle.hpp"
 #include "alpaka/kernel/KernelFunctionAttributes.hpp"
 #include "alpaka/kernel/Traits.hpp"
 #include "alpaka/meta/NdLoop.hpp"
@@ -203,17 +202,17 @@ namespace alpaka
         //! \tparam TKernelFn Kernel function object type.
         //! \tparam TArgs Kernel function object argument types as a parameter pack.
         template<typename TDev, typename TDim, typename TIdx, typename TKernelFn, typename... TArgs>
-        struct FunctionAttributes<AccCpuOmp2Threads<TDim, TIdx>, TDev, KernelBundle<TKernelFn, TArgs...>>
+        struct FunctionAttributes<AccCpuOmp2Threads<TDim, TIdx>, TDev, TKernelFn, TArgs...>
         {
             //! \param dev The device instance
-            //! \param kernelBundle Kernel bundeled with it's arguments. The function attributes of this kernel will be
-            //! determined. Max threads per block is one of the attributes.
+            //! \param kernelFn The kernel function object which should be executed.
+            //! \param args The kernel invocation arguments.
             //! \return KernelFunctionAttributes instance. The default version always returns an instance with zero
             //! fields. For CPU, the field of max threads allowed by kernel function for the block is 1.
             ALPAKA_FN_HOST static auto getFunctionAttributes(
                 TDev const& dev,
-                [[maybe_unused]] KernelBundle<TKernelFn, TArgs...> const& kernelBundle)
-                -> alpaka::KernelFunctionAttributes
+                [[maybe_unused]] TKernelFn const& kernelFn,
+                [[maybe_unused]] TArgs&&... args) -> alpaka::KernelFunctionAttributes
             {
                 alpaka::KernelFunctionAttributes kernelFunctionAttributes;
 
diff --git a/include/alpaka/kernel/TaskKernelCpuSerial.hpp b/include/alpaka/kernel/TaskKernelCpuSerial.hpp
index be0d590cc2f0..2889ac3d2a19 100644
--- a/include/alpaka/kernel/TaskKernelCpuSerial.hpp
+++ b/include/alpaka/kernel/TaskKernelCpuSerial.hpp
@@ -15,7 +15,6 @@
 #include "alpaka/acc/AccCpuSerial.hpp"
 #include "alpaka/core/Decay.hpp"
 #include "alpaka/dev/DevCpu.hpp"
-#include "alpaka/kernel/KernelBundle.hpp"
 #include "alpaka/kernel/KernelFunctionAttributes.hpp"
 #include "alpaka/kernel/Traits.hpp"
 #include "alpaka/meta/NdLoop.hpp"
@@ -148,17 +147,17 @@ namespace alpaka
         //! \tparam TKernelFn Kernel function object type.
         //! \tparam TArgs Kernel function object argument types as a parameter pack.
         template<typename TDev, typename TDim, typename TIdx, typename TKernelFn, typename... TArgs>
-        struct FunctionAttributes<AccCpuSerial<TDim, TIdx>, TDev, KernelBundle<TKernelFn, TArgs...>>
+        struct FunctionAttributes<AccCpuSerial<TDim, TIdx>, TDev, TKernelFn, TArgs...>
         {
             //! \param dev The device instance
-            //! \param kernelBundle Kernel bundeled with it's arguments. The function attributes of this kernel will be
-            //! determined. Max threads per block is one of the attributes.
+            //! \param kernelFn The kernel function object which should be executed.
+            //! \param args The kernel invocation arguments.
             //! \return KernelFunctionAttributes instance. The default version always returns an instance with zero
             //! fields. For CPU, the field of max threads allowed by kernel function for the block is 1.
             ALPAKA_FN_HOST static auto getFunctionAttributes(
                 TDev const& dev,
-                [[maybe_unused]] KernelBundle<TKernelFn, TArgs...> const& kernelBundle)
-                -> alpaka::KernelFunctionAttributes
+                [[maybe_unused]] TKernelFn const& kernelFn,
+                [[maybe_unused]] TArgs&&... args) -> alpaka::KernelFunctionAttributes
             {
                 alpaka::KernelFunctionAttributes kernelFunctionAttributes;
 
diff --git a/include/alpaka/kernel/TaskKernelCpuSycl.hpp b/include/alpaka/kernel/TaskKernelCpuSycl.hpp
index e41926fd21b5..2287e1852457 100644
--- a/include/alpaka/kernel/TaskKernelCpuSycl.hpp
+++ b/include/alpaka/kernel/TaskKernelCpuSycl.hpp
@@ -25,17 +25,17 @@ namespace alpaka
         //! \tparam TKernelFn Kernel function object type.
         //! \tparam TArgs Kernel function object argument types as a parameter pack.
         template<typename TDev, typename TDim, typename TIdx, typename TKernelFn, typename... TArgs>
-        struct FunctionAttributes<AccCpuSycl<TDim, TIdx>, TDev, KernelBundle<TKernelFn, TArgs...>>
+        struct FunctionAttributes<AccCpuSycl<TDim, TIdx>, TDev, TKernelFn, TArgs...>
         {
             //! \param dev The device instance
-            //! \param kernelBundle Kernel bundeled with it's arguments. The function attributes of this kernel will be
-            //! determined. Max threads per block is one of the attributes.
+            //! \param kernelFn The kernel function object which should be executed.
+            //! \param args The kernel invocation arguments.
             //! \return KernelFunctionAttributes instance. The default version always returns an instance with zero
             //! fields. For CPU, the field of max threads allowed by kernel function for the block is 1.
             ALPAKA_FN_HOST static auto getFunctionAttributes(
                 TDev const& dev,
-                [[maybe_unused]] KernelBundle<TKernelFn, TArgs...> const& kernelBundle)
-                -> alpaka::KernelFunctionAttributes
+                [[maybe_unused]] TKernelFn const& kernelFn,
+                [[maybe_unused]] TArgs&&... args) -> alpaka::KernelFunctionAttributes
             {
                 alpaka::KernelFunctionAttributes kernelFunctionAttributes;
 
diff --git a/include/alpaka/kernel/TaskKernelCpuTbbBlocks.hpp b/include/alpaka/kernel/TaskKernelCpuTbbBlocks.hpp
index 6dd90c3d2ff2..0bc578ccc5a1 100644
--- a/include/alpaka/kernel/TaskKernelCpuTbbBlocks.hpp
+++ b/include/alpaka/kernel/TaskKernelCpuTbbBlocks.hpp
@@ -16,7 +16,6 @@
 #include "alpaka/core/Decay.hpp"
 #include "alpaka/dev/DevCpu.hpp"
 #include "alpaka/idx/MapIdx.hpp"
-#include "alpaka/kernel/KernelBundle.hpp"
 #include "alpaka/kernel/KernelFunctionAttributes.hpp"
 #include "alpaka/kernel/Traits.hpp"
 #include "alpaka/meta/NdLoop.hpp"
@@ -160,17 +159,17 @@ namespace alpaka
         //! \tparam TKernelFn Kernel function object type.
         //! \tparam TArgs Kernel function object argument types as a parameter pack.
         template<typename TDev, typename TDim, typename TIdx, typename TKernelFn, typename... TArgs>
-        struct FunctionAttributes<AccCpuTbbBlocks<TDim, TIdx>, TDev, KernelBundle<TKernelFn, TArgs...>>
+        struct FunctionAttributes<AccCpuTbbBlocks<TDim, TIdx>, TDev, TKernelFn, TArgs...>
         {
             //! \param dev The device instance
-            //! \param kernelBundle Kernel bundeled with it's arguments. The function attributes of this kernel will be
-            //! determined. Max threads per block is one of the attributes.
+            //! \param kernelFn The kernel function object which should be executed.
+            //! \param args The kernel invocation arguments.
             //! \return KernelFunctionAttributes instance. The default version always returns an instance with zero
             //! fields. For CPU, the field of max threads allowed by kernel function for the block is 1.
             ALPAKA_FN_HOST static auto getFunctionAttributes(
                 TDev const& dev,
-                [[maybe_unused]] KernelBundle<TKernelFn, TArgs...> const& kernelBundle)
-                -> alpaka::KernelFunctionAttributes
+                [[maybe_unused]] TKernelFn const& kernelFn,
+                [[maybe_unused]] TArgs&&... args) -> alpaka::KernelFunctionAttributes
             {
                 alpaka::KernelFunctionAttributes kernelFunctionAttributes;
 
diff --git a/include/alpaka/kernel/TaskKernelCpuThreads.hpp b/include/alpaka/kernel/TaskKernelCpuThreads.hpp
index 7f12a3a12201..850b66154dab 100644
--- a/include/alpaka/kernel/TaskKernelCpuThreads.hpp
+++ b/include/alpaka/kernel/TaskKernelCpuThreads.hpp
@@ -17,7 +17,6 @@
 #include "alpaka/core/Decay.hpp"
 #include "alpaka/core/ThreadPool.hpp"
 #include "alpaka/dev/DevCpu.hpp"
-#include "alpaka/kernel/KernelBundle.hpp"
 #include "alpaka/kernel/KernelFunctionAttributes.hpp"
 #include "alpaka/kernel/Traits.hpp"
 #include "alpaka/meta/NdLoop.hpp"
@@ -211,17 +210,17 @@ namespace alpaka
         //! \tparam TKernelFn Kernel function object type.
         //! \tparam TArgs Kernel function object argument types as a parameter pack.
         template<typename TDev, typename TDim, typename TIdx, typename TKernelFn, typename... TArgs>
-        struct FunctionAttributes<AccCpuThreads<TDim, TIdx>, TDev, KernelBundle<TKernelFn, TArgs...>>
+        struct FunctionAttributes<AccCpuThreads<TDim, TIdx>, TDev, TKernelFn, TArgs...>
         {
             //! \param dev The device instance
-            //! \param kernelBundle Kernel bundeled with it's arguments. The function attributes of this kernel will be
-            //! determined. Max threads per block is one of the attributes.
+            //! \param kernelFn The kernel function object which should be executed.
+            //! \param args The kernel invocation arguments.
             //! \return KernelFunctionAttributes instance. The default version always returns an instance with zero
             //! fields. For CPU, the field of max threads allowed by kernel function for the block is 1.
             ALPAKA_FN_HOST static auto getFunctionAttributes(
                 TDev const& dev,
-                [[maybe_unused]] KernelBundle<TKernelFn, TArgs...> const& kernelBundle)
-                -> alpaka::KernelFunctionAttributes
+                [[maybe_unused]] TKernelFn const& kernelFn,
+                [[maybe_unused]] TArgs&&... args) -> alpaka::KernelFunctionAttributes
             {
                 alpaka::KernelFunctionAttributes kernelFunctionAttributes;
 
diff --git a/include/alpaka/kernel/TaskKernelFpgaSyclIntel.hpp b/include/alpaka/kernel/TaskKernelFpgaSyclIntel.hpp
index 7afd7bd805b0..6a44b7269e23 100644
--- a/include/alpaka/kernel/TaskKernelFpgaSyclIntel.hpp
+++ b/include/alpaka/kernel/TaskKernelFpgaSyclIntel.hpp
@@ -26,17 +26,17 @@ namespace alpaka
         //! \tparam TKernelFn Kernel function object type.
         //! \tparam TArgs Kernel function object argument types as a parameter pack.
         template<typename TDev, typename TDim, typename TIdx, typename TKernelFn, typename... TArgs>
-        struct FunctionAttributes<AccFpgaSyclIntel<TDim, TIdx>, TDev, KernelBundle<TKernelFn, TArgs...>>
+        struct FunctionAttributes<AccFpgaSyclIntel<TDim, TIdx>, TDev, TKernelFn, TArgs...>
         {
             //! \param dev The device instance
-            //! \param kernelBundle Kernel bundeled with it's arguments. The function attributes of this kernel will be
-            //! determined. Max threads per block is one of the attributes.
+            //! \param kernelFn The kernel function object which should be executed.
+            //! \param args The kernel invocation arguments.
             //! \return KernelFunctionAttributes instance. The default version always returns an instance with zero
             //! fields. For CPU, the field of max threads allowed by kernel function for the block is 1.
             ALPAKA_FN_HOST static auto getFunctionAttributes(
                 TDev const& dev,
-                [[maybe_unused]] KernelBundle<TKernelFn, TArgs...> const& kernelBundle)
-                -> alpaka::KernelFunctionAttributes
+                [[maybe_unused]] TKernelFn const& kernelFn,
+                [[maybe_unused]] TArgs&&... args) -> alpaka::KernelFunctionAttributes
             {
                 alpaka::KernelFunctionAttributes kernelFunctionAttributes;
 
diff --git a/include/alpaka/kernel/TaskKernelGenericSycl.hpp b/include/alpaka/kernel/TaskKernelGenericSycl.hpp
index 291ae1262103..f913d2ec4d5f 100644
--- a/include/alpaka/kernel/TaskKernelGenericSycl.hpp
+++ b/include/alpaka/kernel/TaskKernelGenericSycl.hpp
@@ -10,7 +10,6 @@
 #include "alpaka/dev/Traits.hpp"
 #include "alpaka/dim/Traits.hpp"
 #include "alpaka/idx/Traits.hpp"
-#include "alpaka/kernel/KernelBundle.hpp"
 #include "alpaka/kernel/KernelFunctionAttributes.hpp"
 #include "alpaka/kernel/SyclSubgroupSize.hpp"
 #include "alpaka/kernel/Traits.hpp"
diff --git a/include/alpaka/kernel/TaskKernelGpuSyclIntel.hpp b/include/alpaka/kernel/TaskKernelGpuSyclIntel.hpp
index 57459eb32915..03d90bcb31c2 100644
--- a/include/alpaka/kernel/TaskKernelGpuSyclIntel.hpp
+++ b/include/alpaka/kernel/TaskKernelGpuSyclIntel.hpp
@@ -26,17 +26,17 @@ namespace alpaka
         //! \tparam TKernelFn Kernel function object type.
         //! \tparam TArgs Kernel function object argument types as a parameter pack.
         template<typename TDev, typename TDim, typename TIdx, typename TKernelFn, typename... TArgs>
-        struct FunctionAttributes<AccGpuSyclIntel<TDim, TIdx>, TDev, KernelBundle<TKernelFn, TArgs...>>
+        struct FunctionAttributes<AccGpuSyclIntel<TDim, TIdx>, TDev, TKernelFn, TArgs...>
         {
             //! \param dev The device instance
-            //! \param kernelBundle Kernel bundeled with it's arguments. The function attributes of this kernel will be
-            //! determined. Max threads per block is one of the attributes.
+            //! \param kernelFn The kernel function object which should be executed.
+            //! \param args The kernel invocation arguments.
             //! \return KernelFunctionAttributes instance. The default version always returns an instance with zero
             //! fields. For CPU, the field of max threads allowed by kernel function for the block is 1.
             ALPAKA_FN_HOST static auto getFunctionAttributes(
                 TDev const& dev,
-                [[maybe_unused]] KernelBundle<TKernelFn, TArgs...> const& kernelBundle)
-                -> alpaka::KernelFunctionAttributes
+                [[maybe_unused]] TKernelFn const& kernelFn,
+                [[maybe_unused]] TArgs&&... args) -> alpaka::KernelFunctionAttributes
             {
                 alpaka::KernelFunctionAttributes kernelFunctionAttributes;
 
diff --git a/include/alpaka/kernel/TaskKernelGpuUniformCudaHipRt.hpp b/include/alpaka/kernel/TaskKernelGpuUniformCudaHipRt.hpp
index d07fc4e7834a..53bbaf67529f 100644
--- a/include/alpaka/kernel/TaskKernelGpuUniformCudaHipRt.hpp
+++ b/include/alpaka/kernel/TaskKernelGpuUniformCudaHipRt.hpp
@@ -17,7 +17,6 @@
 #include "alpaka/dev/Traits.hpp"
 #include "alpaka/dim/Traits.hpp"
 #include "alpaka/idx/Traits.hpp"
-#include "alpaka/kernel/KernelBundle.hpp"
 #include "alpaka/kernel/KernelFunctionAttributes.hpp"
 #include "alpaka/kernel/Traits.hpp"
 #include "alpaka/platform/Traits.hpp"
@@ -222,7 +221,7 @@ namespace alpaka
 #        if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
                 // This checks for a valid work division that is also compliant with the hardware maxima of the
                 // accelerator.
-                if(!isValidWorkDiv<TAcc>(getDev(queue), task))
+                if(!isValidWorkDiv<TAcc>(task, getDev(queue)))
                 {
                     throw std::runtime_error(
                         "The given work division is not valid or not supported by the device of type "
@@ -305,17 +304,17 @@ namespace alpaka
         //! \tparam TKernelFn Kernel function object type.
         //! \tparam TArgs Kernel function object argument types as a parameter pack.
         template<typename TApi, typename TDev, typename TDim, typename TIdx, typename TKernelFn, typename... TArgs>
-        struct FunctionAttributes<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>, TDev, KernelBundle<TKernelFn, TArgs...>>
+        struct FunctionAttributes<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>, TDev, TKernelFn, TArgs...>
         {
-            //! \param kernelBundle Kernel bundeled with it's arguments. The function attributes of this kernel will be
-            //! determined. Max threads per block is one of the attributes.
-            //! \return KernelFunctionAttributes instance. For GPU backend, all values are set by calling the
-            //! corresponding API functions. The default version always returns an instance with zero fields. For CPU,
-            //! the field of max threads allowed by kernel function for the block is 1.
+            //! \param dev The device instance
+            //! \param kernelFn The kernel function object which should be executed.
+            //! \param args The kernel invocation arguments.
+            //! \return KernelFunctionAttributes instance. For GPU backend, all values are set by calling the
+            //! corresponding API functions.
             ALPAKA_FN_HOST static auto getFunctionAttributes(
-                TDev const&,
-                [[maybe_unused]] KernelBundle<TKernelFn, TArgs...> const& kernelBundle)
-                -> alpaka::KernelFunctionAttributes
+                [[maybe_unused]] TDev const& dev,
+                [[maybe_unused]] TKernelFn const& kernelFn,
+                [[maybe_unused]] TArgs&&... args) -> alpaka::KernelFunctionAttributes
             {
                 auto kernelName = alpaka::detail::gpuKernel<
                     TKernelFn,
diff --git a/include/alpaka/kernel/Traits.hpp b/include/alpaka/kernel/Traits.hpp
index 2047ac1a2b59..c2c0a55b1f7a 100644
--- a/include/alpaka/kernel/Traits.hpp
+++ b/include/alpaka/kernel/Traits.hpp
@@ -72,18 +72,20 @@ namespace alpaka
 
         //! \brief The structure template to access to the functions attributes of a kernel function object.
         //! \tparam TAcc The accelerator type
-        //! \tparam TKernelBundle The kernel object type, which includes the kernel function object and it's invocation
-        //! arguments.
-        template<typename TAcc, typename TDev, typename TKernelBundle>
+        //! \tparam TKernelFnObj Kernel function object type.
+        //! \tparam TArgs Kernel function object argument types as a parameter pack.
+        template<typename TAcc, typename TDev, typename TKernelFnObj, typename... TArgs>
         struct FunctionAttributes
         {
-            //! \param kernelBundle The kernel object instance, which includes the kernel function object and it's
-            //! invocation arguments.
+            //! \param dev The device instance
+            //! \param kernelFn The kernel function object which should be executed.
+            //! \param args The kernel invocation arguments.
             //! \return KernelFunctionAttributes data structure instance. The default version always returns the
             //! instance with fields which are set to zero.
             ALPAKA_FN_HOST static auto getFunctionAttributes(
-                TDev const&,
-                [[maybe_unused]] TKernelBundle const& kernelBundle) -> alpaka::KernelFunctionAttributes
+                [[maybe_unused]] TDev const& dev,
+                [[maybe_unused]] TKernelFnObj const& kernelFn,
+                [[maybe_unused]] TArgs&&... args) -> alpaka::KernelFunctionAttributes
             {
                 std::string const str
                     = std::string(__func__) + " function is not specialised for the given arguments.\n";
@@ -164,13 +166,13 @@ namespace alpaka
 #    pragma clang diagnostic ignored                                                                                  \
         "-Wdocumentation" // clang does not support the syntax for variadic template arguments "args,..."
 #endif
-    //! \tparam TAcc The accelerator type.
-    //! \param kernelFnObj The kernel object for which the block shared memory size should be calculated.
-    //! \param blockThreadExtent The block thread extent.
-    //! \param threadElemExtent The thread element extent.
-    //! \param args,... The kernel invocation arguments.
-    //! \return The size of the shared memory allocated for a block in bytes.
-    //! The default implementation always returns zero.
+//! \tparam TAcc The accelerator type.
+//! \param kernelFnObj The kernel object for which the block shared memory size should be calculated.
+//! \param blockThreadExtent The block thread extent.
+//! \param threadElemExtent The thread element extent.
+//! \param args,... The kernel invocation arguments.
+//! \return The size of the shared memory allocated for a block in bytes.
+//! The default implementation always returns zero.
 #if BOOST_COMP_CLANG
 #    pragma clang diagnostic pop
 #endif
@@ -191,20 +193,21 @@ namespace alpaka
 
     //! \tparam TAcc The accelerator type.
     //! \tparam TDev The device type.
-    //! \tparam TKernelBundle The kernel object type, which includes the kernel function object and it's invocation
-    //! arguments.
     //! \param dev The device instance
-    //! \param kernelBundle The kernel object, which includes the kernel function object and it's invocation
-    //! arguments.
+    //! \param kernelFnObj The kernel function object which should be executed.
+    //! \param args The kernel invocation arguments.
     //! \return KernelFunctionAttributes instance. Instance is filled with values returned by the accelerator API
     //! depending on the specific kernel. The default version always returns the instance with fields which are set to
     //! zero.
     ALPAKA_NO_HOST_ACC_WARNING
-    template<typename TAcc, typename TDev, typename TKernelBundle>
-    ALPAKA_FN_HOST auto getFunctionAttributes(TDev const& dev, TKernelBundle const& kernelBundle)
+    template<typename TAcc, typename TDev, typename TKernelFnObj, typename... TArgs>
+    ALPAKA_FN_HOST auto getFunctionAttributes(TDev const& dev, TKernelFnObj const& kernelFnObj, TArgs&&... args)
         -> alpaka::KernelFunctionAttributes
     {
-        return trait::FunctionAttributes<TAcc, TDev, TKernelBundle>::getFunctionAttributes(dev, kernelBundle);
+        return trait::FunctionAttributes<TAcc, TDev, TKernelFnObj, TArgs...>::getFunctionAttributes(
+            dev,
+            kernelFnObj,
+            std::forward<TArgs>(args)...);
     }
 
 #if BOOST_COMP_CLANG
@@ -212,13 +215,13 @@ namespace alpaka
 #    pragma clang diagnostic ignored                                                                                  \
         "-Wdocumentation" // clang does not support the syntax for variadic template arguments "args,..."
 #endif
-    //! \tparam TAcc The accelerator type.
-    //! \param kernelFnObj The kernel object for which the block shared memory size should be calculated.
-    //! \param blockThreadExtent The block thread extent.
-    //! \param threadElemExtent The thread element extent.
-    //! \param args,... The kernel invocation arguments.
-    //! \return The OpenMP schedule information as an alpaka::omp::Schedule object if the kernel specialized the
-    //!         OmpSchedule trait, an object of another type if the kernel didn't specialize the trait.
+//! \tparam TAcc The accelerator type.
+//! \param kernelFnObj The kernel object for which the OpenMP schedule should be returned.
+//! \param blockThreadExtent The block thread extent.
+//! \param threadElemExtent The thread element extent.
+//! \param args,... The kernel invocation arguments.
+//! \return The OpenMP schedule information as an alpaka::omp::Schedule object if the kernel specialized the
+//!         OmpSchedule trait, an object of another type if the kernel didn't specialize the trait.
 #if BOOST_COMP_CLANG
 #    pragma clang diagnostic pop
 #endif
@@ -313,7 +316,7 @@ namespace alpaka
     template<typename T>
     inline constexpr bool isKernelTriviallyCopyable = IsKernelTriviallyCopyable<T>::value;
 
-    //! @}
+//! @}
 
 //! Creates a kernel execution task.
 //!
diff --git a/include/alpaka/test/KernelExecutionFixture.hpp b/include/alpaka/test/KernelExecutionFixture.hpp
index 66d14b5b63e3..0e59344497ed 100644
--- a/include/alpaka/test/KernelExecutionFixture.hpp
+++ b/include/alpaka/test/KernelExecutionFixture.hpp
@@ -69,19 +69,16 @@ namespace alpaka::test
             memset(m_queue, bufAccResult, static_cast<std::uint8_t>(true));
 
 
-            auto bundeledKernel = alpaka::KernelBundle<TKernelFnObj, decltype(getPtrNative(bufAccResult)), TArgs...>(
-                kernelFnObj,
-                getPtrNative(bufAccResult),
-                std::forward<TArgs>(args)...);
-
+            alpaka::KernelCfg<Acc> const kernelCfg = {m_extent, Vec<Dim, Idx>::ones()};
 
             // set workdiv if it is not before
             if(m_workDiv == WorkDiv{Vec<Dim, Idx>::all(0), Vec<Dim, Idx>::all(0), Vec<Dim, Idx>::all(0)})
-                m_workDiv = alpaka::getValidWorkDivForKernel<Acc, Dev<Acc>>(
+                m_workDiv = alpaka::getValidWorkDiv(
+                    kernelCfg,
                     m_device,
-                    bundeledKernel,
-                    m_extent,
-                    Vec<Dim, Idx>::ones());
+                    kernelFnObj,
+                    getPtrNative(bufAccResult),
+                    std::forward<TArgs>(args)...);
 
             exec<Acc>(m_queue, m_workDiv, kernelFnObj, getPtrNative(bufAccResult), std::forward<TArgs>(args)...);
 
diff --git a/include/alpaka/workdiv/WorkDivHelpers.hpp b/include/alpaka/workdiv/WorkDivHelpers.hpp
index 6ac433c2c0ee..b1585c75bb82 100644
--- a/include/alpaka/workdiv/WorkDivHelpers.hpp
+++ b/include/alpaka/workdiv/WorkDivHelpers.hpp
@@ -10,7 +10,6 @@
 #include "alpaka/core/Utility.hpp"
 #include "alpaka/dev/Traits.hpp"
 #include "alpaka/extent/Traits.hpp"
-#include "alpaka/kernel/KernelBundle.hpp"
 #include "alpaka/kernel/KernelFunctionAttributes.hpp"
 #include "alpaka/kernel/Traits.hpp"
 #include "alpaka/vec/Vec.hpp"
@@ -304,87 +303,103 @@ namespace alpaka
         return WorkDivMembers<TDim, TIdx>(gridBlockExtent, blockThreadExtent, clippedThreadElemExtent);
     }
 
-    //! \tparam TDev The type of the device.
-    //! \tparam TKernelBundle The type of the bundle of kernel and the arguments. Kernel is used to get number of
-    //! threads per block, this number could be less than or equal to the number of threads per block according to
-    //! device properties.
+    //! Kernel start configuration to determine a valid work division
+    //!
     //! \tparam TGridElemExtent The type of the grid element extent.
     //! \tparam TThreadElemExtent The type of the thread element extent.
-    //! \param dev The device the work division should be valid for.
-    //! \param kernelBundle An instance of a class consisting Kernel function and its arguments
-    //! \param gridElemExtent The full extent of elements in the grid.
-    //! \param threadElemExtents the number of elements computed per thread.
-    //! \param blockThreadMustDivideGridThreadExtent If this is true, the grid thread extent will be multiples of the
-    //! corresponding block thread extent.
-    //!     NOTE: If this is true and gridThreadExtent is prime (or otherwise bad chosen) in a dimension, the block
-    //!     thread extent will be one in this dimension.
-    //! \param gridBlockExtentSubDivRestrictions The grid block extent subdivision restrictions.
-    //! \return The work division.
     template<
         typename TAcc,
-        typename TDev,
-        typename TKernelBundle,
         typename TGridElemExtent = alpaka::Vec<Dim<TAcc>, Idx<TAcc>>,
         typename TThreadElemExtent = alpaka::Vec<Dim<TAcc>, Idx<TAcc>>>
-    ALPAKA_FN_HOST auto getValidWorkDivForKernel(
-        [[maybe_unused]] TDev const& dev,
-        TKernelBundle const& kernelBundle,
-        [[maybe_unused]] TGridElemExtent const& gridElemExtent = alpaka::Vec<Dim<TAcc>, Idx<TAcc>>::ones(),
-        [[maybe_unused]] TThreadElemExtent const& threadElemExtents = alpaka::Vec<Dim<TAcc>, Idx<TAcc>>::ones(),
-        [[maybe_unused]] bool blockThreadMustDivideGridThreadExtent = true,
-        [[maybe_unused]] GridBlockExtentSubDivRestrictions gridBlockExtentSubDivRestrictions
-        = GridBlockExtentSubDivRestrictions::Unrestricted) -> WorkDivMembers<Dim<TAcc>, Idx<TAcc>>
+    struct KernelCfg
     {
-        using Acc = TAcc;
+        //! The full extent of elements in the grid.
+        TGridElemExtent const gridElemExtent = alpaka::Vec<Dim<TAcc>, Idx<TAcc>>::ones();
+        //! The number of elements computed per thread.
+        TThreadElemExtent const threadElemExtents = alpaka::Vec<Dim<TAcc>, Idx<TAcc>>::ones();
+        //! If this is true, the grid thread extent will be multiples of
+        //! the corresponding block thread extent.
+        //!     NOTE: If this is true and gridThreadExtent is prime (or otherwise badly chosen) in a dimension, the
+        //!     block thread extent will be one in this dimension.
+        bool blockThreadMustDivideGridThreadExtent = true;
+        //! The grid block extent subdivision restrictions.
+        GridBlockExtentSubDivRestrictions gridBlockExtentSubDivRestrictions
+            = GridBlockExtentSubDivRestrictions::Unrestricted;
 
         static_assert(
-            Dim<TGridElemExtent>::value == Dim<Acc>::value,
+            Dim<TGridElemExtent>::value == Dim<TAcc>::value,
             "The dimension of Acc and the dimension of TGridElemExtent have to be identical!");
         static_assert(
-            Dim<TThreadElemExtent>::value == Dim<Acc>::value,
+            Dim<TThreadElemExtent>::value == Dim<TAcc>::value,
             "The dimension of Acc and the dimension of TThreadElemExtent have to be identical!");
         static_assert(
-            std::is_same_v<Idx<TGridElemExtent>, Idx<Acc>>,
+            std::is_same_v<Idx<TGridElemExtent>, Idx<TAcc>>,
             "The idx type of Acc and the idx type of TGridElemExtent have to be identical!");
         static_assert(
-            std::is_same_v<Idx<TThreadElemExtent>, Idx<Acc>>,
+            std::is_same_v<Idx<TThreadElemExtent>, Idx<TAcc>>,
             "The idx type of Acc and the idx type of TThreadElemExtent have to be identical!");
+    };
+
+    //! \tparam TDev The type of the device.
+    //! \tparam TGridElemExtent The type of the grid element extent.
+    //! \tparam TThreadElemExtent The type of the thread element extent.
+    //! \param dev The device the work division should be valid for.
+    //! \param kernelFnObj The kernel function object which should be executed.
+    //! \param args The kernel invocation arguments.
+    //! \return The work division for the accelerator based on the kernel and argument types
+    template<
+        typename TAcc,
+        typename TDev,
+        typename TGridElemExtent,
+        typename TThreadElemExtent,
+        typename TKernelFnObj,
+        typename... TArgs>
+    ALPAKA_FN_HOST auto getValidWorkDiv(
+        KernelCfg<TAcc, TGridElemExtent, TThreadElemExtent> const& kernelCfg,
+        [[maybe_unused]] TDev const& dev,
+        TKernelFnObj const& kernelFnObj,
+        TArgs&&... args) -> WorkDivMembers<Dim<TAcc>, Idx<TAcc>>
+    {
+        using Acc = TAcc;
 
         // Get max number of threads per block depending on the kernel function attributes.
         // For GPU backend; number of registers used by the kernel, local and shared memory usage of the kernel
         // determines the max number of threads per block. This number could be equal or less than the max number of
         // threads per block defined by device properties.
-        auto const kernelFunctionAttributes = getFunctionAttributes<Acc>(dev, kernelBundle);
+        auto const kernelFunctionAttributes
+            = getFunctionAttributes<Acc>(dev, kernelFnObj, std::forward<TArgs>(args)...);
         auto const threadsPerBlock = kernelFunctionAttributes.maxThreadsPerBlock;
 
         if constexpr(Dim<TGridElemExtent>::value == 0)
         {
             auto const zero = Vec<DimInt<0>, Idx<Acc>>{};
-            ALPAKA_ASSERT(gridElemExtent == zero);
-            ALPAKA_ASSERT(threadElemExtents == zero);
+            ALPAKA_ASSERT(kernelCfg.gridElemExtent == zero);
+            ALPAKA_ASSERT(kernelCfg.threadElemExtents == zero);
             return WorkDivMembers<DimInt<0>, Idx<Acc>>{zero, zero, zero};
         }
         else
             return subDivideGridElems(
-                getExtents(gridElemExtent),
-                getExtents(threadElemExtents),
+                getExtents(kernelCfg.gridElemExtent),
+                getExtents(kernelCfg.threadElemExtents),
                 getAccDevProps<Acc>(dev),
                 static_cast<Idx<Acc>>(threadsPerBlock),
-                blockThreadMustDivideGridThreadExtent,
-                gridBlockExtentSubDivRestrictions);
+                kernelCfg.blockThreadMustDivideGridThreadExtent,
+                kernelCfg.gridBlockExtentSubDivRestrictions);
 
         using V [[maybe_unused]] = Vec<Dim<TGridElemExtent>, Idx<TGridElemExtent>>;
         ALPAKA_UNREACHABLE(WorkDivMembers<Dim<TGridElemExtent>, Idx<TGridElemExtent>>{V{}, V{}, V{}});
     }
 
+    //! Checks if the work division is supported
+    //!
+    //! \tparam TWorkDiv The type of the work division.
     //! \tparam TDim The dimensionality of the accelerator device properties.
     //! \tparam TIdx The idx type of the accelerator device properties.
-    //! \tparam TWorkDiv The type of the work division.
-    //! \param accDevProps The maxima for the work division.
     //! \param workDiv The work division to test for validity.
+    //! \param accDevProps The maxima for the work division.
     //! \return If the work division is valid for the given accelerator device properties.
-    template<typename TDim, typename TIdx, typename TWorkDiv>
-    ALPAKA_FN_HOST auto isValidWorkDiv(AccDevProps<TDim, TIdx> const& accDevProps, TWorkDiv const& workDiv) -> bool
+    template<typename TWorkDiv, typename TDim, typename TIdx>
+    ALPAKA_FN_HOST auto isValidWorkDiv(TWorkDiv const& workDiv, AccDevProps<TDim, TIdx> const& accDevProps) -> bool
     {
         // Get the extents of grid, blocks and threads of the work division to check.
         auto const gridBlockExtent = getWorkDiv<Grid, Blocks>(workDiv);
@@ -428,21 +443,23 @@ namespace alpaka
         return true;
     }
 
+    //! Checks if the work division is supported
+    //!
+    //! \tparam TWorkDiv The type of the work division.
     //! \tparam TDim The dimensionality of the accelerator device properties.
     //! \tparam TIdx The idx type of the accelerator device properties.
-    //! \tparam TWorkDiv The type of the work division.
+    //! \param workDiv The work division to test for validity.
     //! \param accDevProps The maxima for the work division.
     //! \param kernelFunctionAttributes Kernel attributes, including the maximum number of threads per block that can
     //! be used by this kernel on the given device. This number can be equal to or smaller than the the number of
     //! threads per block supported by the device.
-    //! \param workDiv The work division to test for validity.
     //! \return Returns true if the work division is valid for the given accelerator device properties and for the
     //! given kernel. Otherwise returns false.
-    template<typename TAcc, typename TDim, typename TIdx, typename TWorkDiv>
-    ALPAKA_FN_HOST auto isValidWorkDivKernel(
+    template<typename TAcc, typename TWorkDiv, typename TDim, typename TIdx>
+    ALPAKA_FN_HOST auto isValidWorkDiv(
+        TWorkDiv const& workDiv,
         AccDevProps<TDim, TIdx> const& accDevProps,
-        KernelFunctionAttributes const& kernelFunctionAttributes,
-        TWorkDiv const& workDiv) -> bool
+        KernelFunctionAttributes const& kernelFunctionAttributes) -> bool
     {
         // Get the extents of grid, blocks and threads of the work division to check.
         auto const gridBlockExtent = getWorkDiv<Grid, Blocks>(workDiv);
@@ -491,33 +508,38 @@ namespace alpaka
         return true;
     }
 
+    //! Checks if the work division is supported for the kernel on the device
+    //!
     //! \tparam TAcc The accelerator to test the validity on.
     //! \tparam TDev The type of the device.
-    //! \tparam TKernelBundle The type of the bundle of kernel and the arguments.
     //! \tparam TWorkDiv The type of work division to test for validity.
-    //! \param dev The device to test the work division for validity on.
-    //! \param kernelBundle An instance of a class consisting Kernel function and its arguments.
     //! \param workDiv The work division to test for validity.
-    //! \return Returns the value of isValidWorkDivKernel function.
-    template<typename TAcc, typename TDev, typename TKernelBundle, typename TWorkDiv>
-    ALPAKA_FN_HOST auto isValidWorkDivKernel(
+    //! \param dev The device to test the work division for validity on.
+    //! \param kernelFnObj The kernel function object which should be executed.
+    //! \param args The kernel invocation arguments.
+    //! \return Returns the value of isValidWorkDiv function.
+    template<typename TAcc, typename TWorkDiv, typename TDev, typename TKernelFnObj, typename... TArgs>
+    ALPAKA_FN_HOST auto isValidWorkDiv(
+        TWorkDiv const& workDiv,
         TDev const& dev,
-        TKernelBundle const& kernelBundle,
-        TWorkDiv const& workDiv) -> bool
+        TKernelFnObj const& kernelFnObj,
+        TArgs&&... args) -> bool
     {
-        return isValidWorkDivKernel<TAcc>(
+        return isValidWorkDiv<TAcc>(
+            workDiv,
             getAccDevProps<TAcc>(dev),
-            getFunctionAttributes<TAcc>(dev, kernelBundle),
-            workDiv);
+            getFunctionAttributes<TAcc>(dev, kernelFnObj, std::forward<TArgs>(args)...));
     }
 
+    //! Checks if the work division is supported by the device
+    //!
     //! \tparam TAcc The accelerator to test the validity on.
-    //! \param dev The device to test the work division for validity on.
     //! \param workDiv The work division to test for validity.
+    //! \param dev The device to test the work division for validity on.
     //! \return If the work division is valid on this accelerator.
-    template<typename TAcc, typename TDev, typename TWorkDiv>
-    ALPAKA_FN_HOST auto isValidWorkDiv(TDev const& dev, TWorkDiv const& workDiv) -> bool
+    template<typename TAcc, typename TWorkDiv, typename TDev>
+    ALPAKA_FN_HOST auto isValidWorkDiv(TWorkDiv const& workDiv, TDev const& dev) -> bool
     {
-        return isValidWorkDiv(getAccDevProps<TAcc>(dev), workDiv);
+        return isValidWorkDiv(workDiv, getAccDevProps<TAcc>(dev));
     }
 } // namespace alpaka
diff --git a/test/integ/axpy/src/axpy.cpp b/test/integ/axpy/src/axpy.cpp
index 4553dba458f9..67b73f579504 100644
--- a/test/integ/axpy/src/axpy.cpp
+++ b/test/integ/axpy/src/axpy.cpp
@@ -147,16 +147,18 @@ TEMPLATE_LIST_TEST_CASE("axpy", "[axpy]", TestAccs)
 #endif
 
 
-    auto const& bundeledKernel
-        = alpaka::KernelBundle(kernel, numElements, alpha, std::data(memBufAccX), std::data(memBufAccY));
+    alpaka::KernelCfg<Acc> const kernelCfg
+        = {extent, static_cast<Idx>(3u), false, alpaka::GridBlockExtentSubDivRestrictions::Unrestricted};
+
     // Let alpaka calculate good block and grid sizes given our full problem extent
-    auto const workDiv = alpaka::getValidWorkDivForKernel<Acc>(
+    auto const workDiv = alpaka::getValidWorkDiv(
+        kernelCfg,
         devAcc,
-        bundeledKernel,
-        extent,
-        static_cast<Idx>(3u),
-        false,
-        alpaka::GridBlockExtentSubDivRestrictions::Unrestricted);
+        kernel,
+        numElements,
+        alpha,
+        std::data(memBufAccX),
+        std::data(memBufAccY));
 
     std::cout << "AxpyKernel("
               << " numElements:" << numElements << ", accelerator: " << alpaka::getAccName<Acc>()
diff --git a/test/integ/mandelbrot/src/mandelbrot.cpp b/test/integ/mandelbrot/src/mandelbrot.cpp
index 6424b3e986e1..58ba9c6b0abc 100644
--- a/test/integ/mandelbrot/src/mandelbrot.cpp
+++ b/test/integ/mandelbrot/src/mandelbrot.cpp
@@ -307,7 +307,13 @@ TEMPLATE_LIST_TEST_CASE("mandelbrot", "[mandelbrot]", TestAccs)
     auto const [rowPitch, _] = alpaka::getPitchesInBytes(bufColorAcc);
     CHECK(rowPitch % sizeof(Val) == 0);
 
-    auto const& bundeledKernel = alpaka::KernelBundle(
+    alpaka::KernelCfg<Acc> const kernelCfg
+        = {extent, alpaka::Vec<Dim, Idx>::ones(), false, alpaka::GridBlockExtentSubDivRestrictions::Unrestricted};
+
+    // Let alpaka calculate good block and grid sizes given our full problem extent
+    auto const workDiv = alpaka::getValidWorkDiv(
+        kernelCfg,
+        devAcc,
         kernel,
         std::data(bufColorAcc),
         numRows,
@@ -318,15 +324,6 @@ TEMPLATE_LIST_TEST_CASE("mandelbrot", "[mandelbrot]", TestAccs)
         fMinI,
         fMaxI,
         maxIterations);
-    // Let alpaka calculate good block and grid sizes given our full problem extent
-    auto const workDiv = alpaka::getValidWorkDivForKernel<Acc>(
-        devAcc,
-        bundeledKernel,
-        extent,
-        alpaka::Vec<Dim, Idx>::ones(),
-        false,
-        alpaka::GridBlockExtentSubDivRestrictions::Unrestricted);
-
 
     std::cout << "MandelbrotKernel("
               << " numRows:" << numRows << ", numCols:" << numCols << ", maxIterations:" << maxIterations
diff --git a/test/integ/matMul/src/matMul.cpp b/test/integ/matMul/src/matMul.cpp
index 41e2e4f9cdb0..dd87cfe489d9 100644
--- a/test/integ/matMul/src/matMul.cpp
+++ b/test/integ/matMul/src/matMul.cpp
@@ -243,8 +243,12 @@ TEMPLATE_LIST_TEST_CASE("matMul", "[matMul]", TestAccs)
     std::cout << "pitchesB " << alpaka::getPitchesInBytes(bufBAcc) << " ldb: " << ldb << "\n";
     std::cout << "pitchesC " << alpaka::getPitchesInBytes(bufCAcc) << " ldc: " << ldc << "\n";
 
-
-    auto const& bundeledKernel = alpaka::KernelBundle(
+    // Let alpaka calculate good block and grid sizes given our full problem extent
+    alpaka::KernelCfg<Acc> const kernelCfg
+        = {extentC, alpaka::Vec<Dim, Idx>::ones(), false, alpaka::GridBlockExtentSubDivRestrictions::EqualExtent};
+    auto const workDiv = alpaka::getValidWorkDiv(
+        kernelCfg,
+        devAcc,
         kernel,
         m,
         n,
@@ -257,14 +261,6 @@ TEMPLATE_LIST_TEST_CASE("matMul", "[matMul]", TestAccs)
         static_cast<Val>(1),
         std::data(bufCAcc),
         ldc);
-    // Let alpaka calculate good block and grid sizes given our full problem extent
-    auto const workDiv = alpaka::getValidWorkDivForKernel<Acc>(
-        devAcc,
-        bundeledKernel,
-        extentC,
-        alpaka::Vec<Dim, Idx>::ones(),
-        false,
-        alpaka::GridBlockExtentSubDivRestrictions::EqualExtent);
 
 
     std::cout << "MatMulKernel("
diff --git a/test/integ/separableCompilation/src/main.cpp b/test/integ/separableCompilation/src/main.cpp
index 3fb4f3245682..70739d1010c2 100644
--- a/test/integ/separableCompilation/src/main.cpp
+++ b/test/integ/separableCompilation/src/main.cpp
@@ -111,10 +111,16 @@ TEMPLATE_LIST_TEST_CASE("separableCompilation", "[separableCompilation]", TestAc
     alpaka::memcpy(queueAcc, memBufAccA, memBufHostA);
     alpaka::memcpy(queueAcc, memBufAccB, memBufHostB);
 
-    auto const& bundeledKernel
-        = alpaka::KernelBundle(kernel, memBufAccA.data(), memBufAccB.data(), memBufAccC.data(), numElements);
     // Let alpaka calculate good block and grid sizes given our full problem extent
-    auto const workDiv = alpaka::getValidWorkDivForKernel<Acc>(devAcc, bundeledKernel, extent, static_cast<Idx>(3u));
+    alpaka::KernelCfg<Acc> const kernelCfg = {extent, static_cast<Idx>(3u)};
+    auto const workDiv = alpaka::getValidWorkDiv(
+        kernelCfg,
+        devAcc,
+        kernel,
+        memBufAccA.data(),
+        memBufAccB.data(),
+        memBufAccC.data(),
+        numElements);
 
     std::cout << alpaka::core::demangled<decltype(kernel)> << "("
               << "accelerator: " << alpaka::getAccName<Acc>() << ", workDiv: " << workDiv
diff --git a/test/integ/sharedMem/src/sharedMem.cpp b/test/integ/sharedMem/src/sharedMem.cpp
index 0377f623a5e1..855e094688bb 100644
--- a/test/integ/sharedMem/src/sharedMem.cpp
+++ b/test/integ/sharedMem/src/sharedMem.cpp
@@ -131,18 +131,15 @@ TEMPLATE_LIST_TEST_CASE("sharedMem", "[sharedMem]", TestAccs)
 
 
     auto blockRetValuesDummy = alpaka::allocBuf<Val, Idx>(devAcc, static_cast<Idx>(1));
-    // Kernel input during the runtim of kernel will be different and is chosen to depend on workdiv.
-    // Therefore initially a  workdiv is needed to find the parameter. Therefore in kernel bundle, we can not use the
+
+    // Kernel input during the runtime of kernel will be different and is chosen to depend on workdiv.
+    // Therefore, initially a workdiv is needed to find the parameter, so the getValidWorkDiv call can not use the
     // real input for the buffer pointer.
-    auto const& bundeledKernel = alpaka::KernelBundle(kernel, std::data(blockRetValuesDummy));
+
     // Let alpaka calculate good block and grid sizes given our full problem extent
-    auto const workDiv = alpaka::getValidWorkDivForKernel<Acc>(
-        devAcc,
-        bundeledKernel,
-        numElements,
-        static_cast<Idx>(1u),
-        false,
-        alpaka::GridBlockExtentSubDivRestrictions::Unrestricted);
+    alpaka::KernelCfg<Acc> const kernelCfg
+        = {numElements, static_cast<Idx>(1u), false, alpaka::GridBlockExtentSubDivRestrictions::Unrestricted};
+    auto const workDiv = alpaka::getValidWorkDiv(kernelCfg, devAcc, kernel, std::data(blockRetValuesDummy));
 
     std::cout << "SharedMemKernel("
               << " accelerator: " << alpaka::getAccName<Acc>()
diff --git a/test/unit/math/src/TestTemplate.hpp b/test/unit/math/src/TestTemplate.hpp
index 7d45010c2d4e..3fdec6f69f14 100644
--- a/test/unit/math/src/TestTemplate.hpp
+++ b/test/unit/math/src/TestTemplate.hpp
@@ -98,17 +98,16 @@ namespace mathtest
             Args args{devAcc};
             Results results{devAcc};
 
-
-            auto const& bundeledKernel
-                = alpaka::KernelBundle(kernel, results.pDevBuffer, wrappedFunctor, args.pDevBuffer);
             // Let alpaka calculate good block and grid sizes given our full problem extent
-            auto const workDiv = alpaka::getValidWorkDivForKernel<TAcc>(
+            alpaka::KernelCfg<TAcc> const kernelCfg
+                = {sizeExtent, elementsPerThread, false, alpaka::GridBlockExtentSubDivRestrictions::Unrestricted};
+            auto const workDiv = alpaka::getValidWorkDiv(
+                kernelCfg,
                 devAcc,
-                bundeledKernel,
-                sizeExtent,
-                elementsPerThread,
-                false,
-                alpaka::GridBlockExtentSubDivRestrictions::Unrestricted);
+                kernel,
+                results.pDevBuffer,
+                wrappedFunctor,
+                args.pDevBuffer);
 
             // SETUP COMPLETED.
 
diff --git a/test/unit/workDiv/src/WorkDivForKernelTest.cpp b/test/unit/workDiv/src/WorkDivForKernelTest.cpp
index f6de4a020776..88bdcdfe8c7c 100644
--- a/test/unit/workDiv/src/WorkDivForKernelTest.cpp
+++ b/test/unit/workDiv/src/WorkDivForKernelTest.cpp
@@ -9,7 +9,6 @@
 #include <alpaka/acc/AccDevProps.hpp>
 #include <alpaka/acc/AccGpuUniformCudaHipRt.hpp>
 #include <alpaka/idx/Traits.hpp>
-#include <alpaka/kernel/KernelBundle.hpp>
 #include <alpaka/kernel/KernelFunctionAttributes.hpp>
 #include <alpaka/math/MathStdLib.hpp>
 #include <alpaka/test/acc/TestAccs.hpp>
@@ -75,7 +74,7 @@ struct TestKernelWithManyRegisters
 
 using TestAccs = alpaka::test::EnabledAccs<alpaka::DimInt<1u>, std::uint32_t>;
 
-TEMPLATE_LIST_TEST_CASE("getValidWorkDivForKernel.1D", "[workDivKernel]", TestAccs)
+TEMPLATE_LIST_TEST_CASE("getValidWorkDiv.1D", "[workDivKernel]", TestAccs)
 {
     using Acc = TestType;
     using Idx = alpaka::Idx<Acc>;
@@ -86,18 +85,17 @@ TEMPLATE_LIST_TEST_CASE("getValidWorkDivForKernel.1D", "[workDivKernel]", TestAc
     auto const dev = alpaka::getDevByIdx(platform, 0);
 
     TestKernelWithManyRegisters kernel;
-    auto const kernelBundle = alpaka::KernelBundle(kernel, 200ul);
 
     // Get the device properties and hard limits
     auto const props = alpaka::getAccDevProps<Acc>(dev);
     Idx const threadsPerGridTestValue = props.m_blockThreadCountMax * props.m_gridBlockCountMax;
 
-    // Test the getValidWorkDivForKernel function for threadsPerGridTestValue threads per grid.
-    auto const workDiv
-        = alpaka::getValidWorkDivForKernel<Acc>(dev, kernelBundle, Vec{threadsPerGridTestValue}, Vec{1});
+    // Test the getValidWorkDiv function for threadsPerGridTestValue threads per grid.
+    alpaka::KernelCfg<Acc> const kernelCfg = {Vec{threadsPerGridTestValue}, Vec{1}};
+    auto const workDiv = alpaka::getValidWorkDiv(kernelCfg, dev, kernel, 200ul);
 
-    // Test the isValidWorkDivKernel function
-    CHECK(alpaka::isValidWorkDivKernel<Acc>(dev, kernelBundle, workDiv));
+    // Test the isValidWorkDiv function
+    CHECK(alpaka::isValidWorkDiv<Acc>(workDiv, dev, kernel, 200ul));
 
     // Get calculated threads per block from the workDiv that was found by examining the kernel function.
     Idx const threadsPerBlock = workDiv.m_blockThreadExtent.prod();
@@ -110,15 +108,15 @@ TEMPLATE_LIST_TEST_CASE("getValidWorkDivForKernel.1D", "[workDivKernel]", TestAc
 
     // Check that using the maximum number of threads per block is valid.
     auto const validWorkDiv = WorkDiv{Vec{threadsPerGridTestValue / threadsPerBlock}, Vec{threadsPerBlock}, Vec{1}};
-    CHECK(alpaka::isValidWorkDivKernel<Acc>(dev, kernelBundle, validWorkDiv));
+    CHECK(alpaka::isValidWorkDiv<Acc>(validWorkDiv, dev, kernel, 200ul));
 
     // Check that using too many threads per block is not valid.
     auto const invalidThreads = WorkDiv{Vec{1}, Vec{2 * threadsPerBlockLimit}, Vec{1}};
-    CHECK(not alpaka::isValidWorkDivKernel<Acc>(dev, kernelBundle, invalidThreads));
+    CHECK(not alpaka::isValidWorkDiv<Acc>(invalidThreads, dev, kernel, 200ul));
 
     // Check that a work division with a single block, thread and element is always valid
     auto const serialWorkDiv = WorkDiv{Vec{1}, Vec{1}, Vec{1}};
-    CHECK(alpaka::isValidWorkDivKernel<Acc>(dev, kernelBundle, serialWorkDiv));
+    CHECK(alpaka::isValidWorkDiv<Acc>(serialWorkDiv, dev, kernel, 200ul));
 
     // Some accelerators support only one thread per block:
     if constexpr(alpaka::isSingleThreadAcc<Acc>)
@@ -129,7 +127,7 @@ TEMPLATE_LIST_TEST_CASE("getValidWorkDivForKernel.1D", "[workDivKernel]", TestAc
 
         // Check that a work division with more than one thread per block is not valid.
         auto const parallelWorkDiv = WorkDiv{Vec{1}, Vec{2}, Vec{1}};
-        CHECK(not alpaka::isValidWorkDivKernel<Acc>(dev, kernelBundle, parallelWorkDiv));
+        CHECK(not alpaka::isValidWorkDiv<Acc>(parallelWorkDiv, dev, kernel, 200ul));
     }
 
     // Check the maxDynamicSharedSizeBytes for CPU backends
@@ -142,14 +140,14 @@ TEMPLATE_LIST_TEST_CASE("getValidWorkDivForKernel.1D", "[workDivKernel]", TestAc
                      alpaka::TagCpuTbbBlocks>)
     {
         int const maxDynamicSharedSizeBytes
-            = alpaka::getFunctionAttributes<Acc>(dev, kernelBundle).maxDynamicSharedSizeBytes;
+            = alpaka::getFunctionAttributes<Acc>(dev, kernel, 200ul).maxDynamicSharedSizeBytes;
         CHECK(maxDynamicSharedSizeBytes == static_cast<int>(alpaka::BlockSharedDynMemberAllocKiB * 1024));
     }
 }
 
 using TestAccs2D = alpaka::test::EnabledAccs<alpaka::DimInt<2u>, std::uint32_t>;
 
-TEMPLATE_LIST_TEST_CASE("getValidWorkDivForKernel.2D", "[workDivKernel]", TestAccs2D)
+TEMPLATE_LIST_TEST_CASE("getValidWorkDiv.2D", "[workDivKernel]", TestAccs2D)
 {
     using Acc = TestType;
     using Idx = alpaka::Idx<Acc>;
@@ -160,18 +158,17 @@ TEMPLATE_LIST_TEST_CASE("getValidWorkDivForKernel.2D", "[workDivKernel]", TestAc
     auto const dev = alpaka::getDevByIdx(platform, 0);
 
     TestKernelWithManyRegisters kernel;
-    auto const kernelBundle = alpaka::KernelBundle(kernel, 200ul);
 
     // Get the device properties and hard limits
     auto const props = alpaka::getAccDevProps<Acc>(dev);
     Idx const threadsPerGridTestValue = props.m_blockThreadCountMax * props.m_gridBlockCountMax;
 
-    // Test getValidWorkDivForKernel function for threadsPerGridTestValue threads per grid.
-    auto const workDiv
-        = alpaka::getValidWorkDivForKernel<Acc>(dev, kernelBundle, Vec{8, threadsPerGridTestValue / 8}, Vec{1, 1});
+    // Test getValidWorkDiv function for threadsPerGridTestValue threads per grid.
+    alpaka::KernelCfg<Acc> const kernelCfg = {Vec{8, threadsPerGridTestValue / 8}, Vec{1, 1}};
+    auto const workDiv = alpaka::getValidWorkDiv(kernelCfg, dev, kernel, 200ul);
 
-    // Test the isValidWorkDivKernel function
-    CHECK(alpaka::isValidWorkDivKernel<Acc>(dev, kernelBundle, workDiv));
+    // Test the isValidWorkDiv function
+    CHECK(alpaka::isValidWorkDiv<Acc>(workDiv, dev, kernel, 200ul));
 
     // The valid workdiv values for the kernel may change depending on the GPU type and compiler.
     // Therefore the generated workdiv is not compared to a specific workdiv in this test.
@@ -188,15 +185,15 @@ TEMPLATE_LIST_TEST_CASE("getValidWorkDivForKernel.2D", "[workDivKernel]", TestAc
     // Check that using the maximum number of threads per block is valid.
     auto const validWorkDiv
         = WorkDiv{Vec{8, threadsPerGridTestValue / threadsPerBlock / 8}, Vec{1, threadsPerBlock}, Vec{1, 1}};
-    CHECK(alpaka::isValidWorkDivKernel<Acc>(dev, kernelBundle, validWorkDiv));
+    CHECK(alpaka::isValidWorkDiv<Acc>(validWorkDiv, dev, kernel, 200ul));
 
     // Check that using too many threads per block is not valid.
     auto const invalidThreads = WorkDiv{Vec{1, 1}, Vec{2, threadsPerBlockLimit}, Vec{1, 1}};
-    CHECK(not alpaka::isValidWorkDivKernel<Acc>(dev, kernelBundle, invalidThreads));
+    CHECK(not alpaka::isValidWorkDiv<Acc>(invalidThreads, dev, kernel, 200ul));
 
     // Check that a work division with a single block, thread and element is always valid
     auto const serialWorkDiv = WorkDiv{Vec{1, 1}, Vec{1, 1}, Vec{1, 1}};
-    CHECK(alpaka::isValidWorkDivKernel<Acc>(dev, kernelBundle, serialWorkDiv));
+    CHECK(alpaka::isValidWorkDiv<Acc>(serialWorkDiv, dev, kernel, 200ul));
 
     // Some accelerators support only one thread per block:
     if constexpr(alpaka::isSingleThreadAcc<Acc>)
@@ -207,7 +204,7 @@ TEMPLATE_LIST_TEST_CASE("getValidWorkDivForKernel.2D", "[workDivKernel]", TestAc
 
         // Check that a work division with more than one thread per block is not valid.
         auto const parallelWorkDiv = WorkDiv{Vec{1, 1}, Vec{1, 2}, Vec{1, 1}};
-        CHECK(not alpaka::isValidWorkDivKernel<Acc>(dev, kernelBundle, parallelWorkDiv));
+        CHECK(not alpaka::isValidWorkDiv<Acc>(parallelWorkDiv, dev, kernel, 200ul));
     }
 
     // Check the maxDynamicSharedSizeBytes for CPU backends
@@ -220,7 +217,7 @@ TEMPLATE_LIST_TEST_CASE("getValidWorkDivForKernel.2D", "[workDivKernel]", TestAc
                      alpaka::TagCpuTbbBlocks>)
     {
         int const maxDynamicSharedSizeBytes
-            = alpaka::getFunctionAttributes<Acc>(dev, kernelBundle).maxDynamicSharedSizeBytes;
+            = alpaka::getFunctionAttributes<Acc>(dev, kernel, 200ul).maxDynamicSharedSizeBytes;
         CHECK(maxDynamicSharedSizeBytes == static_cast<int>(alpaka::BlockSharedDynMemberAllocKiB * 1024));
     }
 }