Skip to content

Commit

Permalink
Bugfixes
Browse files Browse the repository at this point in the history
-Fix for Metal and Level Zero push constants
-Fix for Metal device/queue passing
-Fix for Rader primes that use more threads than the hardware limit (vincefn/pyvkfft#39 (comment))
-Fix for incorrect shared memory allocation in merged R2C case (vincefn/pyvkfft#39 (comment))
  • Loading branch information
DTolm committed Sep 27, 2024
1 parent ae94053 commit 79f7b7a
Show file tree
Hide file tree
Showing 8 changed files with 52 additions and 60 deletions.
2 changes: 1 addition & 1 deletion VkFFT_TestSuite.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -562,7 +562,7 @@ int main(int argc, char* argv[])
version_decomposed[0] = version / 10000;
version_decomposed[1] = (version - version_decomposed[0] * 10000) / 100;
version_decomposed[2] = (version - version_decomposed[0] * 10000 - version_decomposed[1] * 100);
printf("VkFFT v%d.%d.%d (05-02-2024). Author: Tolmachev Dmitrii\n", version_decomposed[0], version_decomposed[1], version_decomposed[2]);
printf("VkFFT v%d.%d.%d. Author: Tolmachev Dmitrii\n", version_decomposed[0], version_decomposed[1], version_decomposed[2]);
#if (VKFFT_BACKEND==0)
printf("Vulkan backend\n");
#elif (VKFFT_BACKEND==1)
Expand Down
2 changes: 1 addition & 1 deletion vkFFT/vkFFT.h
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,6 @@
#include "vkFFT/vkFFT_AppManagement/vkFFT_DeleteApp.h"

/* Returns the VkFFT library version encoded as a single integer in
 * X.XX.XX format (e.g. 10305 == v1.3.5). Bumped with each release;
 * callers decompose it as version/10000, (version/100)%100, version%100. */
static inline int VkFFTGetVersion() {
	return 10305; //X.XX.XX format
}
#endif
8 changes: 0 additions & 8 deletions vkFFT/vkFFT/vkFFT_AppManagement/vkFFT_DeleteApp.h
Original file line number Diff line number Diff line change
Expand Up @@ -436,14 +436,6 @@ static inline void deleteVkFFT(VkFFTApplication* app) {
app->configuration.commandQueue = 0;
}
#elif(VKFFT_BACKEND==5)
if (app->configuration.device) {
free(app->configuration.device);
app->configuration.device = 0;
}
if (app->configuration.queue) {
free(app->configuration.queue);
app->configuration.queue = 0;
}
#endif
memset(app, 0, sizeof(VkFFTApplication));
}
Expand Down
20 changes: 3 additions & 17 deletions vkFFT/vkFFT/vkFFT_AppManagement/vkFFT_InitializeApp.h
Original file line number Diff line number Diff line change
Expand Up @@ -1069,27 +1069,13 @@ static inline VkFFTResult setConfigurationVkFFT(VkFFTApplication* app, VkFFTConf
deleteVkFFT(app);
return VKFFT_ERROR_INVALID_DEVICE;
}
app->configuration.device = (MTL::Device*)calloc(1, sizeof(MTL::Device));
if (!app->configuration.device) {
deleteVkFFT(app);
return VKFFT_ERROR_MALLOC_FAILED;
}
else {
app->configuration.device[0] = inputLaunchConfiguration.device[0];
}
app->configuration.device = inputLaunchConfiguration.device;

if (inputLaunchConfiguration.queue == 0) {
deleteVkFFT(app);
return VKFFT_ERROR_INVALID_QUEUE;
}
app->configuration.queue = (MTL::CommandQueue*)calloc(1, sizeof(MTL::CommandQueue));
if (!app->configuration.queue) {
deleteVkFFT(app);
return VKFFT_ERROR_MALLOC_FAILED;
}
else {
app->configuration.queue[0] = inputLaunchConfiguration.queue[0];
}
app->configuration.queue = inputLaunchConfiguration.queue;

const char dummy_kernel[50] = "kernel void VkFFT_dummy (){}";
const char function_name[20] = "VkFFT_dummy";
Expand Down Expand Up @@ -2327,4 +2313,4 @@ static inline VkFFTResult initializeVkFFT(VkFFTApplication* app, VkFFTConfigurat
return resFFT;
}

#endif
#endif
Original file line number Diff line number Diff line change
Expand Up @@ -46,18 +46,9 @@ static inline void appendPushConstants(VkFFTSpecializationConstantsLayout* sc) {
sc->tempLen = sprintf(sc->tempStr, "layout(push_constant) uniform PushConsts\n{\n");
PfAppendLine(sc);

#elif(VKFFT_BACKEND==1)
#else
sc->tempLen = sprintf(sc->tempStr, " typedef struct {\n");
PfAppendLine(sc);

#elif(VKFFT_BACKEND==2)
sc->tempLen = sprintf(sc->tempStr, " typedef struct {\n");
PfAppendLine(sc);

#elif(VKFFT_BACKEND==3)
sc->tempLen = sprintf(sc->tempStr, " typedef struct {\n");
PfAppendLine(sc);

#endif
char tempCopyStr[60];
if (sc->performWorkGroupShift[0]) {
Expand Down Expand Up @@ -109,22 +100,9 @@ static inline void appendPushConstants(VkFFTSpecializationConstantsLayout* sc) {
sc->tempLen = sprintf(sc->tempStr, "} consts;\n\n");
PfAppendLine(sc);

#elif(VKFFT_BACKEND==1)
#else
sc->tempLen = sprintf(sc->tempStr, " }PushConsts;\n");
PfAppendLine(sc);
//sc->tempLen = sprintf(sc->tempStr, " __constant__ PushConsts consts;\n");
//PfAppendLine(sc);
#elif(VKFFT_BACKEND==2)
sc->tempLen = sprintf(sc->tempStr, " }PushConsts;\n");
PfAppendLine(sc);

//sc->tempLen = sprintf(sc->tempStr, " __constant__ PushConsts consts;\n");
//PfAppendLine(sc);

#elif(VKFFT_BACKEND==3)
sc->tempLen = sprintf(sc->tempStr, " }PushConsts;\n");
PfAppendLine(sc);

#endif
return;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@ static inline void appendSharedMemoryVkFFT(VkFFTSpecializationConstantsLayout* s
}
break;
}
case 1: case 2: //grouped_c2c + single_c2c_strided
case 1: case 2: //grouped_c2c + single_c2c_strided + axisSwapped
{
pfUINT shift = (sc->fftDim.data.i < (sc->numSharedBanks / 2)) ? (sc->numSharedBanks / 2) / sc->fftDim.data.i : 1;
sc->sharedStrideReadWriteConflict.type = 31;
Expand All @@ -168,7 +168,7 @@ static inline void appendSharedMemoryVkFFT(VkFFTSpecializationConstantsLayout* s
sc->sharedStride4StepLastAxisConflict.data.i = sc->localSize[0].data.i;

sc->maxSharedStride.type = 31;
sc->maxSharedStride.data.i = maxSequenceSharedMemory.data.i / sc->fftDim.data.i;// ((maxSequenceSharedMemory.data.i < sc->sharedStrideReadWriteConflict.data.i* (sc->fftDim.data.i / sc->registerBoost + (pfINT)additionalR2Cshared))) ? sc->localSize[0].data.i : sc->sharedStrideReadWriteConflict.data.i;
sc->maxSharedStride.data.i = maxSequenceSharedMemory.data.i / (sc->fftDim.data.i / sc->registerBoost + additionalR2Cshared);// ((maxSequenceSharedMemory.data.i < sc->sharedStrideReadWriteConflict.data.i* (sc->fftDim.data.i / sc->registerBoost + (pfINT)additionalR2Cshared))) ? sc->localSize[0].data.i : sc->sharedStrideReadWriteConflict.data.i;
sc->sharedStrideReadWriteConflict.data.i = (sc->maxSharedStride.data.i == sc->localSize[0].data.i) ? sc->localSize[0].data.i : sc->sharedStrideReadWriteConflict.data.i;
sc->sharedStride4StepLastAxisConflict.data.i = (sc->maxSharedStride.data.i == sc->localSize[0].data.i) ? sc->localSize[0].data.i : sc->sharedStride4StepLastAxisConflict.data.i;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,22 @@ static inline VkFFTResult VkFFTSplitAxisBlock(VkFFTApplication* app, VkFFTPlan*
}
while ((axis->axisBlock[1] * (axis->specializationConstants.fftDim.data.i / axis->specializationConstants.registerBoost)) > maxSequenceLengthSharedMemory) axis->axisBlock[1] /= 2;

pfUINT additionalR2Cshared = 0;
if ((axis->specializationConstants.performR2C || ((axis->specializationConstants.performDCT == 2) || (axis->specializationConstants.performDST == 2) || (axis->specializationConstants.performDCT == 3) || (axis->specializationConstants.performDST == 3) || (((axis->specializationConstants.performDCT == 4) || (axis->specializationConstants.performDST == 4)) && ((axis->specializationConstants.fft_dim_full.data.i % 2) != 0)))) && (axis->specializationConstants.axis_id == 0) && (!axis->specializationConstants.performR2CmultiUpload) && (!axis->specializationConstants.performR2RmultiUpload)) {
additionalR2Cshared = ((axis->specializationConstants.fft_dim_full.data.i % 2) == 0) ? 2 : 1;
if ((axis->specializationConstants.performDCT == 2) || (axis->specializationConstants.performDST == 2) || (axis->specializationConstants.performDCT == 3) || (axis->specializationConstants.performDST == 3) || (((axis->specializationConstants.performDCT == 4) || (axis->specializationConstants.performDST == 4)) && ((axis->specializationConstants.fft_dim_full.data.i % 2) != 0))) additionalR2Cshared = 1;
}
if ((axis->specializationConstants.mergeSequencesR2C) && ((!axis->specializationConstants.performR2CmultiUpload) && (!axis->specializationConstants.performR2RmultiUpload) && ((axis->specializationConstants.fft_dim_full.data.i + additionalR2Cshared) <= maxSequenceLengthSharedMemory) && (FFTPlan->actualFFTSizePerAxis[axis_id][1] > 1) && ((FFTPlan->actualPerformR2CPerAxis[axis_id]) || ((((axis->specializationConstants.performDCT == 3) || (axis->specializationConstants.performDST == 3)) || ((axis->specializationConstants.performDCT == 2) || (axis->specializationConstants.performDST == 2)) || ((axis->specializationConstants.performDCT == 1) || (axis->specializationConstants.performDST == 1)) || (((axis->specializationConstants.performDCT == 4) || (axis->specializationConstants.performDST == 4)) && ((app->configuration.size[axis_id] % 2) != 0))) && (axis_id == 0))))) {
if (((axis->specializationConstants.fft_dim_full.data.i + additionalR2Cshared) * axis->axisBlock[1] * axis->specializationConstants.complexSize) > (app->configuration.sharedMemorySize - axis->specializationConstants.additionalRaderSharedSize.data.i * axis->specializationConstants.complexSize)) {
if (axis->axisBlock[1] > maxBatchCoalesced) {
axis->axisBlock[1] = maxBatchCoalesced;
}
else {
axis->specializationConstants.mergeSequencesR2C = 0;
}
}
}

axis->groupedBatch = axis->axisBlock[1];
if (((axis->specializationConstants.fftDim.data.i % 2 == 0) || (axis->axisBlock[0] < app->configuration.numSharedBanks / 4)) && (!(((!axis->specializationConstants.reorderFourStep) || (axis->specializationConstants.useBluesteinFFT)) && (FFTPlan->numAxisUploads[0] > 1))) && (axis->axisBlock[1] > 1) && (axis->axisBlock[1] * axis->specializationConstants.fftDim.data.i < maxSequenceLengthSharedMemory) && (!((app->configuration.performZeropadding[0] || app->configuration.performZeropadding[1] || app->configuration.performZeropadding[2])))) {
/*#if (VKFFT_BACKEND==0)
Expand Down Expand Up @@ -424,6 +440,21 @@ static inline VkFFTResult VkFFTSplitAxisBlock(VkFFTApplication* app, VkFFTPlan*
}
}
while ((axis->axisBlock[1] * (axis->specializationConstants.fftDim.data.i / axis->specializationConstants.registerBoost)) > maxSequenceLengthSharedMemory) axis->axisBlock[1] /= 2;
pfUINT additionalR2Cshared = 0;
if ((axis->specializationConstants.performR2C || ((axis->specializationConstants.performDCT == 2) || (axis->specializationConstants.performDST == 2) || (axis->specializationConstants.performDCT == 3) || (axis->specializationConstants.performDST == 3) || (((axis->specializationConstants.performDCT == 4) || (axis->specializationConstants.performDST == 4)) && ((axis->specializationConstants.fft_dim_full.data.i % 2) != 0)))) && (axis->specializationConstants.axis_id == 0) && (!axis->specializationConstants.performR2CmultiUpload) && (!axis->specializationConstants.performR2RmultiUpload)) {
additionalR2Cshared = ((axis->specializationConstants.fft_dim_full.data.i % 2) == 0) ? 2 : 1;
if ((axis->specializationConstants.performDCT == 2) || (axis->specializationConstants.performDST == 2) || (axis->specializationConstants.performDCT == 3) || (axis->specializationConstants.performDST == 3) || (((axis->specializationConstants.performDCT == 4) || (axis->specializationConstants.performDST == 4)) && ((axis->specializationConstants.fft_dim_full.data.i % 2) != 0))) additionalR2Cshared = 1;
}
if ((axis->specializationConstants.mergeSequencesR2C) && ((!axis->specializationConstants.performR2CmultiUpload) && (!axis->specializationConstants.performR2RmultiUpload) && ((axis->specializationConstants.fft_dim_full.data.i + additionalR2Cshared) <= maxSequenceLengthSharedMemory) && (FFTPlan->actualFFTSizePerAxis[axis_id][1] > 1) && ((FFTPlan->actualPerformR2CPerAxis[axis_id]) || ((((axis->specializationConstants.performDCT == 3) || (axis->specializationConstants.performDST == 3)) || ((axis->specializationConstants.performDCT == 2) || (axis->specializationConstants.performDST == 2)) || ((axis->specializationConstants.performDCT == 1) || (axis->specializationConstants.performDST == 1)) || (((axis->specializationConstants.performDCT == 4) || (axis->specializationConstants.performDST == 4)) && ((app->configuration.size[axis_id] % 2) != 0))) && (axis_id == 0))))) {
if (((axis->specializationConstants.fft_dim_full.data.i + additionalR2Cshared) * axis->axisBlock[1] * axis->specializationConstants.complexSize) > (app->configuration.sharedMemorySize - axis->specializationConstants.additionalRaderSharedSize.data.i * axis->specializationConstants.complexSize)) {
if (axis->axisBlock[1] > maxBatchCoalesced) {
axis->axisBlock[1] = maxBatchCoalesced;
}
else {
axis->specializationConstants.mergeSequencesR2C = 0;
}
}
}
axis->groupedBatch = axis->axisBlock[1];
if ((!axis->specializationConstants.useRaderMult) && (axis->axisBlock[1] >= 4) && (((axis->axisBlock[0] & (axis->axisBlock[0]-1))) || (axis->axisBlock[0] <= app->configuration.numSharedBanks / 2)) && (!(((!axis->specializationConstants.reorderFourStep) || (axis->specializationConstants.useBluesteinFFT)) && (FFTPlan->numAxisUploads[0] > 1))) && (axis->axisBlock[1] > 1) && (axis->axisBlock[1] * axis->specializationConstants.fftDim.data.i < maxSequenceLengthSharedMemory) && (!((app->configuration.performZeropadding[0] || app->configuration.performZeropadding[1] || app->configuration.performZeropadding[2])))) {
/*#if (VKFFT_BACKEND==0)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -391,7 +391,7 @@ static inline VkFFTResult VkFFTConstructRaderTree(VkFFTApplication* app, VkFFTAx
}
return res;
}
static inline VkFFTResult VkFFTOptimizeRaderFFTRegisters(VkFFTRaderContainer* raderContainer, int numRaderPrimes, int fftDim, int* min_registers_per_thread, int* registers_per_thread, int* registers_per_thread_per_radix) {
static inline VkFFTResult VkFFTOptimizeRaderFFTRegisters(VkFFTApplication* app, VkFFTRaderContainer* raderContainer, int numRaderPrimes, int fftDim, int* min_registers_per_thread, int* registers_per_thread, int* registers_per_thread_per_radix) {
VkFFTResult res = VKFFT_SUCCESS;
for (pfINT i = 0; i < (pfINT)numRaderPrimes; i++) {
if (raderContainer[i].type == 0) {
Expand All @@ -413,7 +413,12 @@ static inline VkFFTResult VkFFTOptimizeRaderFFTRegisters(VkFFTRaderContainer* ra
if (raderContainer[i].registers_per_thread_per_radix[j] > raderContainer[i].registers_per_thread) raderContainer[i].registers_per_thread = raderContainer[i].registers_per_thread_per_radix[j];
}
}

for (int j = 0; j < 68; j++) {
if (raderContainer[i].registers_per_thread_per_radix[j] > 0){
while (raderContainer[i].containerFFTNum * (int)pfceil(raderContainer[i].containerFFTDim / (double)raderContainer[i].registers_per_thread_per_radix[j]) > app->configuration.maxThreadsNum)
raderContainer[i].registers_per_thread_per_radix[j] += j;
}
}
/*if (raderContainer[i].min_registers_per_thread < min_registers_per_thread[0]) {
for (int j = 0; j < 68; j++) {
if (raderContainer[i].registers_per_thread_per_radix[j] > 0) {
Expand All @@ -424,7 +429,7 @@ static inline VkFFTResult VkFFTOptimizeRaderFFTRegisters(VkFFTRaderContainer* ra
}
}
}*/
if (numRaderPrimes>1){
/*if (app->configuration.maxThreadsNum < fftDim / min_registers_per_thread[0]) {
for (pfINT j = 2; j < 68; j++) {
if (raderContainer[i].registers_per_thread_per_radix[j] != 0) {
double scaling = (raderContainer[i].containerFFTDim > raderContainer[i].registers_per_thread_per_radix[j]) ? pfceil(raderContainer[i].containerFFTDim / (double)raderContainer[i].registers_per_thread_per_radix[j]) : 1.0 / floor(raderContainer[i].registers_per_thread_per_radix[j] / (double)raderContainer[i].containerFFTDim);
Expand All @@ -435,7 +440,7 @@ static inline VkFFTResult VkFFTOptimizeRaderFFTRegisters(VkFFTRaderContainer* ra
if (raderContainer[i].registers_per_thread_per_radix[j] > raderContainer[i].registers_per_thread) raderContainer[i].registers_per_thread = raderContainer[i].registers_per_thread_per_radix[j];
}
}
}
}*/
if (raderContainer[i].registers_per_thread > registers_per_thread[0]) registers_per_thread[0] = raderContainer[i].registers_per_thread;
}
}
Expand Down Expand Up @@ -466,7 +471,7 @@ static inline VkFFTResult VkFFTOptimizeRaderFFTRegisters(VkFFTRaderContainer* ra
//subprimes optimization
for (pfINT i = 0; i < (pfINT)numRaderPrimes; i++) {
if (raderContainer[i].numSubPrimes) {
res = VkFFTOptimizeRaderFFTRegisters(raderContainer[i].container, raderContainer[i].numSubPrimes, fftDim, min_registers_per_thread, registers_per_thread, registers_per_thread_per_radix);
res = VkFFTOptimizeRaderFFTRegisters(app, raderContainer[i].container, raderContainer[i].numSubPrimes, fftDim, min_registers_per_thread, registers_per_thread, registers_per_thread_per_radix);
if (res != VKFFT_SUCCESS) return res;
}
}
Expand Down Expand Up @@ -1457,7 +1462,7 @@ static inline VkFFTResult VkFFTScheduler(VkFFTApplication* app, VkFFTPlan* FFTPl
min_registers_per_thread = 2;
registers_per_thread = 2;
}
res = VkFFTOptimizeRaderFFTRegisters(axes[k].specializationConstants.raderContainer, axes[k].specializationConstants.numRaderPrimes, (int)locAxisSplit[k], &min_registers_per_thread, &registers_per_thread, registers_per_thread_per_radix);
res = VkFFTOptimizeRaderFFTRegisters(app, axes[k].specializationConstants.raderContainer, axes[k].specializationConstants.numRaderPrimes, (int)locAxisSplit[k], &min_registers_per_thread, &registers_per_thread, registers_per_thread_per_radix);
if (res != VKFFT_SUCCESS) return res;
}
if ((registerBoost == 4) && (registers_per_thread % 4 != 0)) {
Expand Down Expand Up @@ -1608,7 +1613,7 @@ static inline VkFFTResult VkFFTScheduler(VkFFTApplication* app, VkFFTPlan* FFTPl
}
//second optimizer pass
if (axes[k].specializationConstants.numRaderPrimes) {
res = VkFFTOptimizeRaderFFTRegisters(axes[k].specializationConstants.raderContainer, axes[k].specializationConstants.numRaderPrimes, (int)locAxisSplit[k], &min_registers_per_thread, &registers_per_thread, registers_per_thread_per_radix);
res = VkFFTOptimizeRaderFFTRegisters(app, axes[k].specializationConstants.raderContainer, axes[k].specializationConstants.numRaderPrimes, (int)locAxisSplit[k], &min_registers_per_thread, &registers_per_thread, registers_per_thread_per_radix);
if (res != VKFFT_SUCCESS) return res;
}
for (int i = 2; i < 68; i++) {
Expand Down

0 comments on commit 79f7b7a

Please sign in to comment.