diff --git a/VkFFT_TestSuite.cpp b/VkFFT_TestSuite.cpp index b31392c..7de67d5 100644 --- a/VkFFT_TestSuite.cpp +++ b/VkFFT_TestSuite.cpp @@ -562,7 +562,7 @@ int main(int argc, char* argv[]) version_decomposed[0] = version / 10000; version_decomposed[1] = (version - version_decomposed[0] * 10000) / 100; version_decomposed[2] = (version - version_decomposed[0] * 10000 - version_decomposed[1] * 100); - printf("VkFFT v%d.%d.%d (05-02-2024). Author: Tolmachev Dmitrii\n", version_decomposed[0], version_decomposed[1], version_decomposed[2]); + printf("VkFFT v%d.%d.%d. Author: Tolmachev Dmitrii\n", version_decomposed[0], version_decomposed[1], version_decomposed[2]); #if (VKFFT_BACKEND==0) printf("Vulkan backend\n"); #elif (VKFFT_BACKEND==1) diff --git a/vkFFT/vkFFT.h b/vkFFT/vkFFT.h index da52e52..c3175da 100644 --- a/vkFFT/vkFFT.h +++ b/vkFFT/vkFFT.h @@ -107,6 +107,6 @@ #include "vkFFT/vkFFT_AppManagement/vkFFT_DeleteApp.h" static inline int VkFFTGetVersion() { - return 10304; //X.XX.XX format + return 10305; //X.XX.XX format } #endif diff --git a/vkFFT/vkFFT/vkFFT_AppManagement/vkFFT_DeleteApp.h b/vkFFT/vkFFT/vkFFT_AppManagement/vkFFT_DeleteApp.h index 8bada42..39ebeca 100644 --- a/vkFFT/vkFFT/vkFFT_AppManagement/vkFFT_DeleteApp.h +++ b/vkFFT/vkFFT/vkFFT_AppManagement/vkFFT_DeleteApp.h @@ -436,14 +436,6 @@ static inline void deleteVkFFT(VkFFTApplication* app) { app->configuration.commandQueue = 0; } #elif(VKFFT_BACKEND==5) - if (app->configuration.device) { - free(app->configuration.device); - app->configuration.device = 0; - } - if (app->configuration.queue) { - free(app->configuration.queue); - app->configuration.queue = 0; - } #endif memset(app, 0, sizeof(VkFFTApplication)); } diff --git a/vkFFT/vkFFT/vkFFT_AppManagement/vkFFT_InitializeApp.h b/vkFFT/vkFFT/vkFFT_AppManagement/vkFFT_InitializeApp.h index b194f9a..7f0500f 100644 --- a/vkFFT/vkFFT/vkFFT_AppManagement/vkFFT_InitializeApp.h +++ b/vkFFT/vkFFT/vkFFT_AppManagement/vkFFT_InitializeApp.h @@ -1069,27 +1069,13 
@@ static inline VkFFTResult setConfigurationVkFFT(VkFFTApplication* app, VkFFTConf deleteVkFFT(app); return VKFFT_ERROR_INVALID_DEVICE; } - app->configuration.device = (MTL::Device*)calloc(1, sizeof(MTL::Device)); - if (!app->configuration.device) { - deleteVkFFT(app); - return VKFFT_ERROR_MALLOC_FAILED; - } - else { - app->configuration.device[0] = inputLaunchConfiguration.device[0]; - } + app->configuration.device = inputLaunchConfiguration.device; if (inputLaunchConfiguration.queue == 0) { deleteVkFFT(app); return VKFFT_ERROR_INVALID_QUEUE; } - app->configuration.queue = (MTL::CommandQueue*)calloc(1, sizeof(MTL::CommandQueue)); - if (!app->configuration.queue) { - deleteVkFFT(app); - return VKFFT_ERROR_MALLOC_FAILED; - } - else { - app->configuration.queue[0] = inputLaunchConfiguration.queue[0]; - } + app->configuration.queue = inputLaunchConfiguration.queue; const char dummy_kernel[50] = "kernel void VkFFT_dummy (){}"; const char function_name[20] = "VkFFT_dummy"; @@ -2327,4 +2313,4 @@ static inline VkFFTResult initializeVkFFT(VkFFTApplication* app, VkFFTConfigurat return resFFT; } -#endif +#endif \ No newline at end of file diff --git a/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel0/vkFFT_MemoryManagement/vkFFT_MemoryInitialization/vkFFT_PushConstants.h b/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel0/vkFFT_MemoryManagement/vkFFT_MemoryInitialization/vkFFT_PushConstants.h index b1144a1..8c6ebc6 100644 --- a/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel0/vkFFT_MemoryManagement/vkFFT_MemoryInitialization/vkFFT_PushConstants.h +++ b/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel0/vkFFT_MemoryManagement/vkFFT_MemoryInitialization/vkFFT_PushConstants.h @@ -46,18 +46,9 @@ static inline void appendPushConstants(VkFFTSpecializationConstantsLayout* sc) { sc->tempLen = sprintf(sc->tempStr, "layout(push_constant) uniform PushConsts\n{\n"); PfAppendLine(sc); -#elif(VKFFT_BACKEND==1) +#else sc->tempLen = sprintf(sc->tempStr, " typedef struct {\n"); PfAppendLine(sc); - 
-#elif(VKFFT_BACKEND==2) - sc->tempLen = sprintf(sc->tempStr, " typedef struct {\n"); - PfAppendLine(sc); - -#elif(VKFFT_BACKEND==3) - sc->tempLen = sprintf(sc->tempStr, " typedef struct {\n"); - PfAppendLine(sc); - #endif char tempCopyStr[60]; if (sc->performWorkGroupShift[0]) { @@ -109,22 +100,9 @@ static inline void appendPushConstants(VkFFTSpecializationConstantsLayout* sc) { sc->tempLen = sprintf(sc->tempStr, "} consts;\n\n"); PfAppendLine(sc); -#elif(VKFFT_BACKEND==1) +#else sc->tempLen = sprintf(sc->tempStr, " }PushConsts;\n"); PfAppendLine(sc); - //sc->tempLen = sprintf(sc->tempStr, " __constant__ PushConsts consts;\n"); - //PfAppendLine(sc); -#elif(VKFFT_BACKEND==2) - sc->tempLen = sprintf(sc->tempStr, " }PushConsts;\n"); - PfAppendLine(sc); - - //sc->tempLen = sprintf(sc->tempStr, " __constant__ PushConsts consts;\n"); - //PfAppendLine(sc); - -#elif(VKFFT_BACKEND==3) - sc->tempLen = sprintf(sc->tempStr, " }PushConsts;\n"); - PfAppendLine(sc); - #endif return; } diff --git a/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel0/vkFFT_MemoryManagement/vkFFT_MemoryInitialization/vkFFT_SharedMemory.h b/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel0/vkFFT_MemoryManagement/vkFFT_MemoryInitialization/vkFFT_SharedMemory.h index 47d7de0..be5e8a0 100644 --- a/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel0/vkFFT_MemoryManagement/vkFFT_MemoryInitialization/vkFFT_SharedMemory.h +++ b/vkFFT/vkFFT/vkFFT_CodeGen/vkFFT_KernelsLevel0/vkFFT_MemoryManagement/vkFFT_MemoryInitialization/vkFFT_SharedMemory.h @@ -155,7 +155,7 @@ static inline void appendSharedMemoryVkFFT(VkFFTSpecializationConstantsLayout* s } break; } - case 1: case 2: //grouped_c2c + single_c2c_strided + case 1: case 2: //grouped_c2c + single_c2c_strided + axisSwapped { pfUINT shift = (sc->fftDim.data.i < (sc->numSharedBanks / 2)) ? 
(sc->numSharedBanks / 2) / sc->fftDim.data.i : 1; sc->sharedStrideReadWriteConflict.type = 31; @@ -168,7 +168,7 @@ static inline void appendSharedMemoryVkFFT(VkFFTSpecializationConstantsLayout* s sc->sharedStride4StepLastAxisConflict.data.i = sc->localSize[0].data.i; sc->maxSharedStride.type = 31; - sc->maxSharedStride.data.i = maxSequenceSharedMemory.data.i / sc->fftDim.data.i;// ((maxSequenceSharedMemory.data.i < sc->sharedStrideReadWriteConflict.data.i* (sc->fftDim.data.i / sc->registerBoost + (pfINT)additionalR2Cshared))) ? sc->localSize[0].data.i : sc->sharedStrideReadWriteConflict.data.i; + sc->maxSharedStride.data.i = maxSequenceSharedMemory.data.i / (sc->fftDim.data.i / sc->registerBoost + additionalR2Cshared);// ((maxSequenceSharedMemory.data.i < sc->sharedStrideReadWriteConflict.data.i* (sc->fftDim.data.i / sc->registerBoost + (pfINT)additionalR2Cshared))) ? sc->localSize[0].data.i : sc->sharedStrideReadWriteConflict.data.i; sc->sharedStrideReadWriteConflict.data.i = (sc->maxSharedStride.data.i == sc->localSize[0].data.i) ? sc->localSize[0].data.i : sc->sharedStrideReadWriteConflict.data.i; sc->sharedStride4StepLastAxisConflict.data.i = (sc->maxSharedStride.data.i == sc->localSize[0].data.i) ? 
sc->localSize[0].data.i : sc->sharedStride4StepLastAxisConflict.data.i; diff --git a/vkFFT/vkFFT/vkFFT_PlanManagement/vkFFT_HostFunctions/vkFFT_AxisBlockSplitter.h b/vkFFT/vkFFT/vkFFT_PlanManagement/vkFFT_HostFunctions/vkFFT_AxisBlockSplitter.h index 1ff430f..ddcfdd6 100644 --- a/vkFFT/vkFFT/vkFFT_PlanManagement/vkFFT_HostFunctions/vkFFT_AxisBlockSplitter.h +++ b/vkFFT/vkFFT/vkFFT_PlanManagement/vkFFT_HostFunctions/vkFFT_AxisBlockSplitter.h @@ -140,6 +140,22 @@ static inline VkFFTResult VkFFTSplitAxisBlock(VkFFTApplication* app, VkFFTPlan* } while ((axis->axisBlock[1] * (axis->specializationConstants.fftDim.data.i / axis->specializationConstants.registerBoost)) > maxSequenceLengthSharedMemory) axis->axisBlock[1] /= 2; + pfUINT additionalR2Cshared = 0; + if ((axis->specializationConstants.performR2C || ((axis->specializationConstants.performDCT == 2) || (axis->specializationConstants.performDST == 2) || (axis->specializationConstants.performDCT == 3) || (axis->specializationConstants.performDST == 3) || (((axis->specializationConstants.performDCT == 4) || (axis->specializationConstants.performDST == 4)) && ((axis->specializationConstants.fft_dim_full.data.i % 2) != 0)))) && (axis->specializationConstants.axis_id == 0) && (!axis->specializationConstants.performR2CmultiUpload) && (!axis->specializationConstants.performR2RmultiUpload)) { + additionalR2Cshared = ((axis->specializationConstants.fft_dim_full.data.i % 2) == 0) ? 
2 : 1; + if ((axis->specializationConstants.performDCT == 2) || (axis->specializationConstants.performDST == 2) || (axis->specializationConstants.performDCT == 3) || (axis->specializationConstants.performDST == 3) || (((axis->specializationConstants.performDCT == 4) || (axis->specializationConstants.performDST == 4)) && ((axis->specializationConstants.fft_dim_full.data.i % 2) != 0))) additionalR2Cshared = 1; + } + if ((axis->specializationConstants.mergeSequencesR2C) && ((!axis->specializationConstants.performR2CmultiUpload) && (!axis->specializationConstants.performR2RmultiUpload) && ((axis->specializationConstants.fft_dim_full.data.i + additionalR2Cshared) <= maxSequenceLengthSharedMemory) && (FFTPlan->actualFFTSizePerAxis[axis_id][1] > 1) && ((FFTPlan->actualPerformR2CPerAxis[axis_id]) || ((((axis->specializationConstants.performDCT == 3) || (axis->specializationConstants.performDST == 3)) || ((axis->specializationConstants.performDCT == 2) || (axis->specializationConstants.performDST == 2)) || ((axis->specializationConstants.performDCT == 1) || (axis->specializationConstants.performDST == 1)) || (((axis->specializationConstants.performDCT == 4) || (axis->specializationConstants.performDST == 4)) && ((app->configuration.size[axis_id] % 2) != 0))) && (axis_id == 0))))) { + if (((axis->specializationConstants.fft_dim_full.data.i + additionalR2Cshared) * axis->axisBlock[1] * axis->specializationConstants.complexSize) > (app->configuration.sharedMemorySize - axis->specializationConstants.additionalRaderSharedSize.data.i * axis->specializationConstants.complexSize)) { + if (axis->axisBlock[1] > maxBatchCoalesced) { + axis->axisBlock[1] = maxBatchCoalesced; + } + else { + axis->specializationConstants.mergeSequencesR2C = 0; + } + } + } + axis->groupedBatch = axis->axisBlock[1]; if (((axis->specializationConstants.fftDim.data.i % 2 == 0) || (axis->axisBlock[0] < app->configuration.numSharedBanks / 4)) && (!(((!axis->specializationConstants.reorderFourStep) || 
(axis->specializationConstants.useBluesteinFFT)) && (FFTPlan->numAxisUploads[0] > 1))) && (axis->axisBlock[1] > 1) && (axis->axisBlock[1] * axis->specializationConstants.fftDim.data.i < maxSequenceLengthSharedMemory) && (!((app->configuration.performZeropadding[0] || app->configuration.performZeropadding[1] || app->configuration.performZeropadding[2])))) { /*#if (VKFFT_BACKEND==0) @@ -424,6 +440,21 @@ static inline VkFFTResult VkFFTSplitAxisBlock(VkFFTApplication* app, VkFFTPlan* } } while ((axis->axisBlock[1] * (axis->specializationConstants.fftDim.data.i / axis->specializationConstants.registerBoost)) > maxSequenceLengthSharedMemory) axis->axisBlock[1] /= 2; + pfUINT additionalR2Cshared = 0; + if ((axis->specializationConstants.performR2C || ((axis->specializationConstants.performDCT == 2) || (axis->specializationConstants.performDST == 2) || (axis->specializationConstants.performDCT == 3) || (axis->specializationConstants.performDST == 3) || (((axis->specializationConstants.performDCT == 4) || (axis->specializationConstants.performDST == 4)) && ((axis->specializationConstants.fft_dim_full.data.i % 2) != 0)))) && (axis->specializationConstants.axis_id == 0) && (!axis->specializationConstants.performR2CmultiUpload) && (!axis->specializationConstants.performR2RmultiUpload)) { + additionalR2Cshared = ((axis->specializationConstants.fft_dim_full.data.i % 2) == 0) ? 
2 : 1; + if ((axis->specializationConstants.performDCT == 2) || (axis->specializationConstants.performDST == 2) || (axis->specializationConstants.performDCT == 3) || (axis->specializationConstants.performDST == 3) || (((axis->specializationConstants.performDCT == 4) || (axis->specializationConstants.performDST == 4)) && ((axis->specializationConstants.fft_dim_full.data.i % 2) != 0))) additionalR2Cshared = 1; + } + if ((axis->specializationConstants.mergeSequencesR2C) && ((!axis->specializationConstants.performR2CmultiUpload) && (!axis->specializationConstants.performR2RmultiUpload) && ((axis->specializationConstants.fft_dim_full.data.i + additionalR2Cshared) <= maxSequenceLengthSharedMemory) && (FFTPlan->actualFFTSizePerAxis[axis_id][1] > 1) && ((FFTPlan->actualPerformR2CPerAxis[axis_id]) || ((((axis->specializationConstants.performDCT == 3) || (axis->specializationConstants.performDST == 3)) || ((axis->specializationConstants.performDCT == 2) || (axis->specializationConstants.performDST == 2)) || ((axis->specializationConstants.performDCT == 1) || (axis->specializationConstants.performDST == 1)) || (((axis->specializationConstants.performDCT == 4) || (axis->specializationConstants.performDST == 4)) && ((app->configuration.size[axis_id] % 2) != 0))) && (axis_id == 0))))) { + if (((axis->specializationConstants.fft_dim_full.data.i + additionalR2Cshared) * axis->axisBlock[1] * axis->specializationConstants.complexSize) > (app->configuration.sharedMemorySize - axis->specializationConstants.additionalRaderSharedSize.data.i * axis->specializationConstants.complexSize)) { + if (axis->axisBlock[1] > maxBatchCoalesced) { + axis->axisBlock[1] = maxBatchCoalesced; + } + else { + axis->specializationConstants.mergeSequencesR2C = 0; + } + } + } axis->groupedBatch = axis->axisBlock[1]; if ((!axis->specializationConstants.useRaderMult) && (axis->axisBlock[1] >= 4) && (((axis->axisBlock[0] & (axis->axisBlock[0]-1))) || (axis->axisBlock[0] <= app->configuration.numSharedBanks / 
2)) && (!(((!axis->specializationConstants.reorderFourStep) || (axis->specializationConstants.useBluesteinFFT)) && (FFTPlan->numAxisUploads[0] > 1))) && (axis->axisBlock[1] > 1) && (axis->axisBlock[1] * axis->specializationConstants.fftDim.data.i < maxSequenceLengthSharedMemory) && (!((app->configuration.performZeropadding[0] || app->configuration.performZeropadding[1] || app->configuration.performZeropadding[2])))) { /*#if (VKFFT_BACKEND==0) diff --git a/vkFFT/vkFFT/vkFFT_PlanManagement/vkFFT_HostFunctions/vkFFT_Scheduler.h b/vkFFT/vkFFT/vkFFT_PlanManagement/vkFFT_HostFunctions/vkFFT_Scheduler.h index 15a2764..56e961d 100644 --- a/vkFFT/vkFFT/vkFFT_PlanManagement/vkFFT_HostFunctions/vkFFT_Scheduler.h +++ b/vkFFT/vkFFT/vkFFT_PlanManagement/vkFFT_HostFunctions/vkFFT_Scheduler.h @@ -391,7 +391,7 @@ static inline VkFFTResult VkFFTConstructRaderTree(VkFFTApplication* app, VkFFTAx } return res; } -static inline VkFFTResult VkFFTOptimizeRaderFFTRegisters(VkFFTRaderContainer* raderContainer, int numRaderPrimes, int fftDim, int* min_registers_per_thread, int* registers_per_thread, int* registers_per_thread_per_radix) { +static inline VkFFTResult VkFFTOptimizeRaderFFTRegisters(VkFFTApplication* app, VkFFTRaderContainer* raderContainer, int numRaderPrimes, int fftDim, int* min_registers_per_thread, int* registers_per_thread, int* registers_per_thread_per_radix) { VkFFTResult res = VKFFT_SUCCESS; for (pfINT i = 0; i < (pfINT)numRaderPrimes; i++) { if (raderContainer[i].type == 0) { @@ -413,7 +413,12 @@ static inline VkFFTResult VkFFTOptimizeRaderFFTRegisters(VkFFTRaderContainer* ra if (raderContainer[i].registers_per_thread_per_radix[j] > raderContainer[i].registers_per_thread) raderContainer[i].registers_per_thread = raderContainer[i].registers_per_thread_per_radix[j]; } } - + for (int j = 0; j < 68; j++) { + if (raderContainer[i].registers_per_thread_per_radix[j] > 0){ + while (raderContainer[i].containerFFTNum * (int)pfceil(raderContainer[i].containerFFTDim / 
(double)raderContainer[i].registers_per_thread_per_radix[j]) > app->configuration.maxThreadsNum) + raderContainer[i].registers_per_thread_per_radix[j] += j; + } + } /*if (raderContainer[i].min_registers_per_thread < min_registers_per_thread[0]) { for (int j = 0; j < 68; j++) { if (raderContainer[i].registers_per_thread_per_radix[j] > 0) { @@ -424,7 +429,7 @@ static inline VkFFTResult VkFFTOptimizeRaderFFTRegisters(VkFFTRaderContainer* ra } } }*/ - if (numRaderPrimes>1){ + /*if (app->configuration.maxThreadsNum < fftDim / min_registers_per_thread[0]) { for (pfINT j = 2; j < 68; j++) { if (raderContainer[i].registers_per_thread_per_radix[j] != 0) { double scaling = (raderContainer[i].containerFFTDim > raderContainer[i].registers_per_thread_per_radix[j]) ? pfceil(raderContainer[i].containerFFTDim / (double)raderContainer[i].registers_per_thread_per_radix[j]) : 1.0 / floor(raderContainer[i].registers_per_thread_per_radix[j] / (double)raderContainer[i].containerFFTDim); @@ -435,7 +440,7 @@ static inline VkFFTResult VkFFTOptimizeRaderFFTRegisters(VkFFTRaderContainer* ra if (raderContainer[i].registers_per_thread_per_radix[j] > raderContainer[i].registers_per_thread) raderContainer[i].registers_per_thread = raderContainer[i].registers_per_thread_per_radix[j]; } } - } + }*/ if (raderContainer[i].registers_per_thread > registers_per_thread[0]) registers_per_thread[0] = raderContainer[i].registers_per_thread; } } @@ -466,7 +471,7 @@ static inline VkFFTResult VkFFTOptimizeRaderFFTRegisters(VkFFTRaderContainer* ra //subprimes optimization for (pfINT i = 0; i < (pfINT)numRaderPrimes; i++) { if (raderContainer[i].numSubPrimes) { - res = VkFFTOptimizeRaderFFTRegisters(raderContainer[i].container, raderContainer[i].numSubPrimes, fftDim, min_registers_per_thread, registers_per_thread, registers_per_thread_per_radix); + res = VkFFTOptimizeRaderFFTRegisters(app, raderContainer[i].container, raderContainer[i].numSubPrimes, fftDim, min_registers_per_thread, registers_per_thread, 
registers_per_thread_per_radix); if (res != VKFFT_SUCCESS) return res; } } @@ -1457,7 +1462,7 @@ static inline VkFFTResult VkFFTScheduler(VkFFTApplication* app, VkFFTPlan* FFTPl min_registers_per_thread = 2; registers_per_thread = 2; } - res = VkFFTOptimizeRaderFFTRegisters(axes[k].specializationConstants.raderContainer, axes[k].specializationConstants.numRaderPrimes, (int)locAxisSplit[k], &min_registers_per_thread, &registers_per_thread, registers_per_thread_per_radix); + res = VkFFTOptimizeRaderFFTRegisters(app, axes[k].specializationConstants.raderContainer, axes[k].specializationConstants.numRaderPrimes, (int)locAxisSplit[k], &min_registers_per_thread, &registers_per_thread, registers_per_thread_per_radix); if (res != VKFFT_SUCCESS) return res; } if ((registerBoost == 4) && (registers_per_thread % 4 != 0)) { @@ -1608,7 +1613,7 @@ static inline VkFFTResult VkFFTScheduler(VkFFTApplication* app, VkFFTPlan* FFTPl } //second optimizer pass if (axes[k].specializationConstants.numRaderPrimes) { - res = VkFFTOptimizeRaderFFTRegisters(axes[k].specializationConstants.raderContainer, axes[k].specializationConstants.numRaderPrimes, (int)locAxisSplit[k], &min_registers_per_thread, &registers_per_thread, registers_per_thread_per_radix); + res = VkFFTOptimizeRaderFFTRegisters(app, axes[k].specializationConstants.raderContainer, axes[k].specializationConstants.numRaderPrimes, (int)locAxisSplit[k], &min_registers_per_thread, &registers_per_thread, registers_per_thread_per_radix); if (res != VKFFT_SUCCESS) return res; } for (int i = 2; i < 68; i++) {