single kernel - multiple batches convolution support (#159)
- enabled through the singleKernelMultipleBatches parameter
- kernel batching is controlled through coordinateFeatures
- the number of input/output systems is controlled through numberBatches
- sample 53 shows the usage of this option (see the configuration sketch below)
DTolm committed Mar 2, 2024
1 parent d753451 commit e2d3d57
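
A minimal configuration sketch of how the new option might be put together, assuming the Vulkan backend. This is not part of the commit: the helper name, concrete sizes, and buffer handles are illustrative assumptions loosely based on the existing convolution samples (sample_50/52); the sample_53 source added by this commit is the authoritative usage.

#include "vkFFT.h"

// Hypothetical sketch (not from this commit): configure a 2D convolution in which one
// kernel with several coordinate components is reused across multiple batches.
// Device, queue, fence, and buffer creation are assumed to be handled by the caller,
// as in the other convolution samples.
VkFFTResult configureSharedKernelConvolution(VkFFTApplication* app,
	VkFFTConfiguration configuration, // device/queue/fence fields already filled in
	VkBuffer* kernelBuffer, uint64_t* kernelBufferSize)
{
	configuration.FFTdim = 2;
	configuration.size[0] = 32;
	configuration.size[1] = 32;
	configuration.performConvolution = 1;
	configuration.singleKernelMultipleBatches = 1; // new option: one kernel, many batches
	configuration.coordinateFeatures = 3;          // kernel batching (per the commit description)
	configuration.numberBatches = 2;               // input/output systems sharing the kernel
	configuration.kernel = kernelBuffer;           // frequency-domain kernel prepared beforehand
	configuration.kernelSize = kernelBufferSize;
	return initializeVkFFT(app, configuration);
}
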
Showing 9 changed files with 399 additions and 52 deletions.
2 changes: 2 additions & 0 deletions CMakeLists.txt
@@ -60,6 +60,7 @@ if(build_VkFFT_FFTW_precision)
benchmark_scripts/vkFFT_scripts/src/sample_50_convolution_VkFFT_single_1d_matrix.cpp
benchmark_scripts/vkFFT_scripts/src/sample_51_convolution_VkFFT_single_3d_matrix_zeropadding_r2c.cpp
benchmark_scripts/vkFFT_scripts/src/sample_52_convolution_VkFFT_single_2d_batched_r2c.cpp
benchmark_scripts/vkFFT_scripts/src/sample_53_convolution_VkFFT_single_2d_Nimages_1kernel.cpp
benchmark_scripts/vkFFT_scripts/src/sample_100_benchmark_VkFFT_single_nd_dct.cpp
benchmark_scripts/vkFFT_scripts/src/sample_101_benchmark_VkFFT_double_nd_dct.cpp
benchmark_scripts/vkFFT_scripts/src/sample_1000_benchmark_VkFFT_single_2_4096.cpp
@@ -85,6 +86,7 @@ else()
benchmark_scripts/vkFFT_scripts/src/sample_50_convolution_VkFFT_single_1d_matrix.cpp
benchmark_scripts/vkFFT_scripts/src/sample_51_convolution_VkFFT_single_3d_matrix_zeropadding_r2c.cpp
benchmark_scripts/vkFFT_scripts/src/sample_52_convolution_VkFFT_single_2d_batched_r2c.cpp
benchmark_scripts/vkFFT_scripts/src/sample_53_convolution_VkFFT_single_2d_Nimages_1kernel.cpp
benchmark_scripts/vkFFT_scripts/src/sample_100_benchmark_VkFFT_single_nd_dct.cpp
benchmark_scripts/vkFFT_scripts/src/sample_101_benchmark_VkFFT_double_nd_dct.cpp
benchmark_scripts/vkFFT_scripts/src/sample_1000_benchmark_VkFFT_single_2_4096.cpp
7 changes: 7 additions & 0 deletions VkFFT_TestSuite.cpp
@@ -82,6 +82,7 @@
#include "sample_50_convolution_VkFFT_single_1d_matrix.h"
#include "sample_51_convolution_VkFFT_single_3d_matrix_zeropadding_r2c.h"
#include "sample_52_convolution_VkFFT_single_2d_batched_r2c.h"
#include "sample_53_convolution_VkFFT_single_2d_Nimages_1kernel.h"

#include "sample_100_benchmark_VkFFT_single_nd_dct.h"
#include "sample_101_benchmark_VkFFT_double_nd_dct.h"
@@ -423,6 +424,11 @@ VkFFTResult launchVkFFT(VkGPU* vkGPU, uint64_t sample_id, bool file_output, FILE
{
resFFT = sample_52_convolution_VkFFT_single_2d_batched_r2c(vkGPU, file_output, output, isCompilerInitialized);
break;
}
case 53:
{
resFFT = sample_53_convolution_VkFFT_single_2d_Nimages_1kernel(vkGPU, file_output, output, isCompilerInitialized);
break;
}
case 110:
{
@@ -638,6 +644,7 @@ int main(int argc, char* argv[])
printf(" 50 - convolution example with identity kernel\n");
printf(" 51 - zeropadding convolution example with identity kernel\n");
printf(" 52 - batched convolution example with identity kernel\n");
printf(" 53 - convolution example with one scaling kernel of three colors, multiple images of three colors\n");
printf(" 110 - VkFFT FFT + iFFT R2R DCT-1 multidimensional benchmark in single precision\n");
printf(" 111 - VkFFT FFT + iFFT R2R DCT-1 multidimensional benchmark in double precision\n");
printf(" 120 - VkFFT FFT + iFFT R2R DCT-2 multidimensional benchmark in single precision\n");
@@ -0,0 +1,4 @@
#include "vkFFT.h"
#include "utils_VkFFT.h"

VkFFTResult sample_53_convolution_VkFFT_single_2d_Nimages_1kernel(VkGPU* vkGPU, uint64_t file_output, FILE* output, uint64_t isCompilerInitialized);

Large diffs are not rendered by default.

104 changes: 53 additions & 51 deletions vkFFT/vkFFT/vkFFT_AppManagement/vkFFT_InitializeApp.h
@@ -1176,56 +1176,6 @@ static inline VkFFTResult setConfigurationVkFFT(VkFFTApplication* app, VkFFTConf
if (inputLaunchConfiguration.kernelOffset != 0) app->configuration.kernelOffset = inputLaunchConfiguration.kernelOffset;
if (inputLaunchConfiguration.specifyOffsetsAtLaunch != 0) app->configuration.specifyOffsetsAtLaunch = inputLaunchConfiguration.specifyOffsetsAtLaunch;
//set optional parameters:
pfUINT checkBufferSizeFor64BitAddressing = 0;
for (pfUINT i = 0; i < app->configuration.bufferNum; i++) {
if (app->configuration.bufferSize)
checkBufferSizeFor64BitAddressing += app->configuration.bufferSize[i];
else {
checkBufferSizeFor64BitAddressing = app->configuration.size[0] * app->configuration.size[1] * app->configuration.size[2] * 8;
if (app->configuration.coordinateFeatures > 0) checkBufferSizeFor64BitAddressing *= app->configuration.coordinateFeatures;
if (app->configuration.numberBatches > 0) checkBufferSizeFor64BitAddressing *= app->configuration.numberBatches;
if (app->configuration.numberKernels > 0) checkBufferSizeFor64BitAddressing *= app->configuration.numberKernels;
if (app->configuration.doublePrecision || app->configuration.quadDoubleDoublePrecisionDoubleMemory) checkBufferSizeFor64BitAddressing *= 2;
if (app->configuration.quadDoubleDoublePrecision) checkBufferSizeFor64BitAddressing *= 4;
}
}
#if(VKFFT_BACKEND==2)
app->configuration.useStrict32BitAddress = 0;
if (checkBufferSizeFor64BitAddressing >= (pfUINT)pow((pfUINT)2, (pfUINT)32)) app->configuration.useStrict32BitAddress = -1;
#endif
if (checkBufferSizeFor64BitAddressing >= (pfUINT)pow((pfUINT)2, (pfUINT)34)) app->configuration.useUint64 = 1;
checkBufferSizeFor64BitAddressing = 0;
for (pfUINT i = 0; i < app->configuration.inputBufferNum; i++) {
if (app->configuration.inputBufferSize)
checkBufferSizeFor64BitAddressing += app->configuration.inputBufferSize[i];
}
#if(VKFFT_BACKEND==2)
if (checkBufferSizeFor64BitAddressing >= (pfUINT)pow((pfUINT)2, (pfUINT)32)) app->configuration.useStrict32BitAddress = -1;
#endif
if (checkBufferSizeFor64BitAddressing >= (pfUINT)pow((pfUINT)2, (pfUINT)34)) app->configuration.useUint64 = 1;

checkBufferSizeFor64BitAddressing = 0;
for (pfUINT i = 0; i < app->configuration.outputBufferNum; i++) {
if (app->configuration.outputBufferSize)
checkBufferSizeFor64BitAddressing += app->configuration.outputBufferSize[i];
}
if (checkBufferSizeFor64BitAddressing >= (pfUINT)pow((pfUINT)2, (pfUINT)34)) app->configuration.useUint64 = 1;

checkBufferSizeFor64BitAddressing = 0;
for (pfUINT i = 0; i < app->configuration.kernelNum; i++) {
if (app->configuration.kernelSize)
checkBufferSizeFor64BitAddressing += app->configuration.kernelSize[i];
}
#if(VKFFT_BACKEND==2)
if (checkBufferSizeFor64BitAddressing >= (pfUINT)pow((pfUINT)2, (pfUINT)32)) app->configuration.useStrict32BitAddress = -1;
// No reason was found to disable strict 32 bit addressing, so enable it
if (app->configuration.useStrict32BitAddress == 0) app->configuration.useStrict32BitAddress = 1;
#endif
if (checkBufferSizeFor64BitAddressing >= (pfUINT)pow((pfUINT)2, (pfUINT)34)) app->configuration.useUint64 = 1;
if (inputLaunchConfiguration.useUint64 != 0) app->configuration.useUint64 = inputLaunchConfiguration.useUint64;
#if(VKFFT_BACKEND==2)
if (inputLaunchConfiguration.useStrict32BitAddress != 0) app->configuration.useStrict32BitAddress = inputLaunchConfiguration.useStrict32BitAddress;
#endif
if (inputLaunchConfiguration.maxThreadsNum != 0) app->configuration.maxThreadsNum = inputLaunchConfiguration.maxThreadsNum;
if (inputLaunchConfiguration.coalescedMemory != 0) app->configuration.coalescedMemory = inputLaunchConfiguration.coalescedMemory;
app->configuration.aimThreads = 128;
@@ -1380,7 +1330,7 @@ static inline VkFFTResult setConfigurationVkFFT(VkFFTApplication* app, VkFFTConf

if (inputLaunchConfiguration.matrixConvolution != 0) app->configuration.matrixConvolution = inputLaunchConfiguration.matrixConvolution;
if (inputLaunchConfiguration.numberKernels != 0) app->configuration.numberKernels = inputLaunchConfiguration.numberKernels;

if (inputLaunchConfiguration.singleKernelMultipleBatches != 0) app->configuration.singleKernelMultipleBatches = inputLaunchConfiguration.singleKernelMultipleBatches;
if (inputLaunchConfiguration.symmetricKernel != 0) app->configuration.symmetricKernel = inputLaunchConfiguration.symmetricKernel;
if (inputLaunchConfiguration.conjugateConvolution != 0) app->configuration.conjugateConvolution = inputLaunchConfiguration.conjugateConvolution;
if (inputLaunchConfiguration.crossPowerSpectrumNormalization != 0) app->configuration.crossPowerSpectrumNormalization = inputLaunchConfiguration.crossPowerSpectrumNormalization;
@@ -1391,6 +1341,58 @@ static inline VkFFTResult setConfigurationVkFFT(VkFFTApplication* app, VkFFTConf
app->configuration.registerBoost4Step = 1;
if (app->configuration.matrixConvolution > 1) app->configuration.coordinateFeatures = app->configuration.matrixConvolution;
}

pfUINT checkBufferSizeFor64BitAddressing = 0;
for (pfUINT i = 0; i < app->configuration.bufferNum; i++) {
if (app->configuration.bufferSize)
checkBufferSizeFor64BitAddressing += app->configuration.bufferSize[i];
else {
checkBufferSizeFor64BitAddressing = app->configuration.size[0] * app->configuration.size[1] * app->configuration.size[2] * 8;
if (app->configuration.coordinateFeatures > 0) checkBufferSizeFor64BitAddressing *= app->configuration.coordinateFeatures;
if (app->configuration.numberBatches > 0) checkBufferSizeFor64BitAddressing *= app->configuration.numberBatches;
if (app->configuration.numberKernels > 0) checkBufferSizeFor64BitAddressing *= app->configuration.numberKernels;
if (app->configuration.doublePrecision || app->configuration.quadDoubleDoublePrecisionDoubleMemory) checkBufferSizeFor64BitAddressing *= 2;
if (app->configuration.quadDoubleDoublePrecision) checkBufferSizeFor64BitAddressing *= 4;
}
}
#if(VKFFT_BACKEND==2)
app->configuration.useStrict32BitAddress = 0;
if (checkBufferSizeFor64BitAddressing >= (pfUINT)pow((pfUINT)2, (pfUINT)32)) app->configuration.useStrict32BitAddress = -1;
#endif
if (checkBufferSizeFor64BitAddressing >= (pfUINT)pow((pfUINT)2, (pfUINT)34)) app->configuration.useUint64 = 1;
checkBufferSizeFor64BitAddressing = 0;
for (pfUINT i = 0; i < app->configuration.inputBufferNum; i++) {
if (app->configuration.inputBufferSize)
checkBufferSizeFor64BitAddressing += app->configuration.inputBufferSize[i];
}
#if(VKFFT_BACKEND==2)
if (checkBufferSizeFor64BitAddressing >= (pfUINT)pow((pfUINT)2, (pfUINT)32)) app->configuration.useStrict32BitAddress = -1;
#endif
if (checkBufferSizeFor64BitAddressing >= (pfUINT)pow((pfUINT)2, (pfUINT)34)) app->configuration.useUint64 = 1;

checkBufferSizeFor64BitAddressing = 0;
for (pfUINT i = 0; i < app->configuration.outputBufferNum; i++) {
if (app->configuration.outputBufferSize)
checkBufferSizeFor64BitAddressing += app->configuration.outputBufferSize[i];
}
if (checkBufferSizeFor64BitAddressing >= (pfUINT)pow((pfUINT)2, (pfUINT)34)) app->configuration.useUint64 = 1;

checkBufferSizeFor64BitAddressing = 0;
for (pfUINT i = 0; i < app->configuration.kernelNum; i++) {
if (app->configuration.kernelSize)
checkBufferSizeFor64BitAddressing += app->configuration.kernelSize[i];
}
#if(VKFFT_BACKEND==2)
if (checkBufferSizeFor64BitAddressing >= (pfUINT)pow((pfUINT)2, (pfUINT)32)) app->configuration.useStrict32BitAddress = -1;
// No reason was found to disable strict 32 bit addressing, so enable it
if (app->configuration.useStrict32BitAddress == 0) app->configuration.useStrict32BitAddress = 1;
#endif
if (checkBufferSizeFor64BitAddressing >= (pfUINT)pow((pfUINT)2, (pfUINT)34)) app->configuration.useUint64 = 1;
if (inputLaunchConfiguration.useUint64 != 0) app->configuration.useUint64 = inputLaunchConfiguration.useUint64;
#if(VKFFT_BACKEND==2)
if (inputLaunchConfiguration.useStrict32BitAddress != 0) app->configuration.useStrict32BitAddress = inputLaunchConfiguration.useStrict32BitAddress;
#endif

app->firstAxis = 0;
app->lastAxis = app->configuration.FFTdim - 1;
for (int i = 0; i < app->configuration.FFTdim; i++) {
@@ -381,7 +381,7 @@ static inline void appendKernelOffset(VkFFTSpecializationConstantsLayout* sc, in
PfMul(sc, &temp_int, &sc->coordinate, &bufferStride[sc->numFFTdims], 0);
PfAdd(sc, &sc->blockInvocationID, &sc->blockInvocationID, &temp_int);
}
if ((sc->numBatches.data.i > 1) || (sc->numKernels.data.i > 1)) {
if (((sc->numBatches.data.i > 1) && (!sc->singleKernelMultipleBatches)) || (sc->numKernels.data.i > 1)) {
if (sc->convolutionStep && (sc->numKernels.data.i > 1)) {
PfMul(sc, &sc->tempInt, &sc->batchID, &sc->inputStride[sc->numFFTdims+1], 0);
PfAdd(sc, &sc->blockInvocationID, &sc->blockInvocationID, &sc->tempInt);
@@ -474,6 +474,7 @@ static inline VkFFTResult VkFFTPlanAxis(VkFFTApplication* app, VkFFTPlan* FFTPla

axis->specializationConstants.numCoordinates = (app->configuration.matrixConvolution > 1) ? 1 : (int)app->configuration.coordinateFeatures;
axis->specializationConstants.matrixConvolution = (int)app->configuration.matrixConvolution;
axis->specializationConstants.singleKernelMultipleBatches = (int)app->configuration.singleKernelMultipleBatches;
axis->specializationConstants.coordinate.type = 31;
axis->specializationConstants.coordinate.data.i = 0;
axis->specializationConstants.batchID.type = 31;
2 changes: 2 additions & 0 deletions vkFFT/vkFFT/vkFFT_PlanManagement/vkFFT_Plans/vkFFT_Plan_R2C.h
@@ -195,6 +195,8 @@ static inline VkFFTResult VkFFTPlanR2CMultiUploadDecomposition(VkFFTApplication*

axis->specializationConstants.numCoordinates = (app->configuration.matrixConvolution > 1) ? 1 : (int)app->configuration.coordinateFeatures;
axis->specializationConstants.matrixConvolution = (int)app->configuration.matrixConvolution;
axis->specializationConstants.singleKernelMultipleBatches = (int)app->configuration.singleKernelMultipleBatches;

for (pfUINT i = 0; i < VKFFT_MAX_FFT_DIMENSIONS; i++) {
axis->specializationConstants.size[i].type = 31;
axis->specializationConstants.size[i].data.i = (pfINT)app->configuration.size[i];
2 changes: 2 additions & 0 deletions vkFFT/vkFFT/vkFFT_Structs/vkFFT_Structs.h
@@ -274,6 +274,7 @@ typedef struct {
pfUINT matrixConvolution; //if equal to 2 perform 2x2, if equal to 3 perform 3x3 matrix-vector convolution. Overrides coordinateFeatures
pfUINT symmetricKernel; //specify if kernel in 2x2 or 3x3 matrix convolution is symmetric
pfUINT numberKernels;// N - only used in convolution step - specify how many kernels were initialized before. Expands one input to multiple (batched) output
pfUINT singleKernelMultipleBatches;// 0 off, 1 - perform convolution with one kernel to multiple (numberBatches) input/output. kernel can still use multiple coordinates for batching (for example if you want to have 3 kernels cycling for 9 systems). Default 0
pfUINT kernelConvolution;// specify if this application is used to create kernel for convolution, so it has the same properties. performConvolution has to be set to 0 for kernel creation

//register overutilization (experimental): (default 0 if not stated otherwise)
@@ -802,6 +803,7 @@ typedef struct {
pfUINT kernelBlockSize;
int numCoordinates;
int matrixConvolution; //if equal to 2 perform 2x2, if equal to 3 perform 3x3 matrix-vector convolution. Overrides coordinateFeatures
int singleKernelMultipleBatches;
PfContainer numBatches;
PfContainer numKernels;
int conjugateConvolution;
