Skip to content

Commit

Permalink
New register assignment logic
Browse files Browse the repository at this point in the history
-Implemented radix codelets up to 47.
-Implemented composite radix codelets for arbitrary composite stage sizes.
-Implemented new register assignment logic, aimed at optimizing shared memory transfers, register usage and warp utilization.
-Performance improvements for all system sizes - please report regressions if they happen (especially for vendors other than Nvidia and AMD).
-VkFFT now makes a local copy of the contents of all double pointers passed to it (#184, #185)
-Fixed the locale setting for the code generator (vincefn/pyvkfft#38)
  • Loading branch information
DTolm committed Sep 23, 2024
1 parent 4ac61b9 commit 9a96811
Show file tree
Hide file tree
Showing 30 changed files with 3,675 additions and 3,891 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

//ROCM parts
#include "hip/hip_runtime.h"
#include <hipfft.h>
#include <hipfft/hipfft.h>

void launch_precision_rocFFT_double(void* inputC, void* output_rocFFT, int device_id, uint64_t* dims)
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

//ROCM parts
#include "hip/hip_runtime.h"
#include <hipfft.h>
#include <hipfft/hipfft.h>

void launch_precision_rocFFT_r2c(void* inputC, void* output_rocFFT, int device_id, uint64_t* dims)
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

//ROCM parts
#include "hip/hip_runtime.h"
#include <hipfft.h>
#include <hipfft/hipfft.h>

void launch_precision_rocFFT_single(void* inputC, void* output_rocFFT, int device_id, uint64_t* dims)
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

//ROCM parts
#include "hip/hip_runtime.h"
#include <hipfft.h>
#include <hipfft/hipfft.h>

#define GROUP 1

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

//ROCM parts
#include "hip/hip_runtime.h"
#include <hipfft.h>
#include <hipfft/hipfft.h>

#define GROUP 1

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

//ROCM parts
#include "hip/hip_runtime.h"
#include <hipfft.h>
#include <hipfft/hipfft.h>

#define GROUP 1

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

//ROCM parts
#include "hip/hip_runtime.h"
#include <hipfft.h>
#include <hipfft/hipfft.h>

#define GROUP 1

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

//ROCM parts
#include "hip/hip_runtime.h"
#include <hipfft.h>
#include <hipfft/hipfft.h>

#define GROUP 1

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

//ROCM parts
#include "hip/hip_runtime.h"
#include <hipfft.h>
#include <hipfft/hipfft.h>

#define GROUP 1

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

//ROCM parts
#include "hip/hip_runtime.h"
#include <hipfft.h>
#include <hipfft/hipfft.h>

#define GROUP 1

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

//ROCM parts
#include "hip/hip_runtime.h"
#include <hipfft.h>
#include <hipfft/hipfft.h>

#define GROUP 1

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

//ROCM parts
#include "hip/hip_runtime.h"
#include <hipfft.h>
#include <hipfft/hipfft.h>

#define GROUP 1

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

//ROCM parts
#include "hip/hip_runtime.h"
#include <hipfft.h>
#include <hipfft/hipfft.h>

#include "user_benchmark_rocFFT.h"

Expand Down
219 changes: 171 additions & 48 deletions vkFFT/vkFFT/vkFFT_AppManagement/vkFFT_DeleteApp.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,46 +29,20 @@ static inline void deleteVkFFT(VkFFTApplication* app) {
if (app == 0) {
return;
}
#if(VKFFT_BACKEND==0)
if (app->configuration.isCompilerInitialized) {
glslang_finalize_process();
app->configuration.isCompilerInitialized = 0;
}
#elif(VKFFT_BACKEND==1)
if (app->configuration.num_streams > 1) {
cudaError_t res_t = cudaSuccess;
for (pfUINT i = 0; i < app->configuration.num_streams; i++) {
if (app->configuration.stream_event[i] != 0) {
res_t = cudaEventDestroy(app->configuration.stream_event[i]);
if (res_t == cudaSuccess) app->configuration.stream_event[i] = 0;
}
}
if (app->configuration.stream_event != 0) {
free(app->configuration.stream_event);
app->configuration.stream_event = 0;
}
}
#elif(VKFFT_BACKEND==2)
if (app->configuration.num_streams > 1) {
hipError_t res_t = hipSuccess;
for (pfUINT i = 0; i < app->configuration.num_streams; i++) {
if (app->configuration.stream_event[i] != 0) {
res_t = hipEventDestroy(app->configuration.stream_event[i]);
if (res_t == hipSuccess) app->configuration.stream_event[i] = 0;
}
}
if (app->configuration.stream_event != 0) {
free(app->configuration.stream_event);
app->configuration.stream_event = 0;
}
}
#endif
if (app->numRaderFFTPrimes) {
for (pfUINT i = 0; i < app->numRaderFFTPrimes; i++) {
free(app->raderFFTkernel[i]);
app->raderFFTkernel[i] = 0;
}
}
if (app->configuration.bufferSize != 0) {
free(app->configuration.bufferSize);
app->configuration.bufferSize = 0;
}
if (app->configuration.buffer != 0) {
free((void*)app->configuration.buffer);
app->configuration.buffer = 0;
}
if (!app->configuration.userTempBuffer) {
if (app->configuration.allocateTempBuffer && (app->configuration.tempBuffer != 0)) {
app->configuration.allocateTempBuffer = 0;
Expand Down Expand Up @@ -110,14 +84,52 @@ static inline void deleteVkFFT(VkFFTApplication* app) {
((MTL::Buffer*)app->configuration.tempBuffer[0])->release();
}
#endif
if (app->configuration.tempBuffer != 0) {
free(app->configuration.tempBuffer);
app->configuration.tempBuffer = 0;
}
}
if (app->configuration.tempBufferSize != 0) {
free(app->configuration.tempBufferSize);
app->configuration.tempBufferSize = 0;
}
if (app->configuration.tempBufferSize != 0) {
free(app->configuration.tempBufferSize);
app->configuration.tempBufferSize = 0;
}
if (app->configuration.tempBuffer != 0) {
free(app->configuration.tempBuffer);
app->configuration.tempBuffer = 0;
}
if (app->configuration.isInputFormatted) {
if (app->configuration.inputBufferSize != 0) {
free(app->configuration.inputBufferSize);
app->configuration.inputBufferSize = 0;
}
if (app->configuration.inputBuffer != 0) {
free((void*)app->configuration.inputBuffer);
app->configuration.inputBuffer = 0;
}
}
else {
app->configuration.inputBufferSize = 0;
app->configuration.inputBuffer = 0;
}
if (app->configuration.isOutputFormatted) {
if (app->configuration.outputBufferSize != 0) {
free(app->configuration.outputBufferSize);
app->configuration.outputBufferSize = 0;
}
if (app->configuration.outputBuffer != 0) {
free((void*)app->configuration.outputBuffer);
app->configuration.outputBuffer = 0;
}
}
else {
app->configuration.outputBufferSize = 0;
app->configuration.outputBuffer = 0;
}
if (app->configuration.performConvolution) {
if (app->configuration.kernelSize != 0) {
free(app->configuration.kernelSize);
app->configuration.kernelSize = 0;
}
if (app->configuration.kernel != 0) {
free((void*)app->configuration.kernel);
app->configuration.kernel = 0;
}
}
for (pfUINT i = 0; i < app->configuration.FFTdim; i++) {
Expand Down Expand Up @@ -312,16 +324,127 @@ static inline void deleteVkFFT(VkFFTApplication* app) {
}
}
}
if (app->configuration.autoCustomBluesteinPaddingPattern) {
if (app->configuration.primeSizes != 0) {
free(app->configuration.primeSizes);
app->configuration.primeSizes = 0;
if (app->configuration.primeSizes != 0) {
free(app->configuration.primeSizes);
app->configuration.primeSizes = 0;
}
if (app->configuration.paddedSizes != 0) {
free(app->configuration.paddedSizes);
app->configuration.paddedSizes = 0;
}
#if(VKFFT_BACKEND==0)
if (app->configuration.isCompilerInitialized) {
glslang_finalize_process();
app->configuration.isCompilerInitialized = 0;
}
if (app->configuration.physicalDevice) {
free(app->configuration.physicalDevice);
app->configuration.physicalDevice = 0;
}
if (app->configuration.device) {
free(app->configuration.device);
app->configuration.device = 0;
}
if (app->configuration.queue) {
free(app->configuration.queue);
app->configuration.queue = 0;
}
if (app->configuration.commandPool) {
free(app->configuration.commandPool);
app->configuration.commandPool = 0;
}
if (app->configuration.fence) {
free(app->configuration.fence);
app->configuration.fence = 0;
}
if (app->configuration.pipelineCache != 0) {
free(app->configuration.pipelineCache);
app->configuration.pipelineCache = 0;
}
if (app->configuration.stagingBuffer != 0) {
free(app->configuration.stagingBuffer);
app->configuration.stagingBuffer = 0;
}
if (app->configuration.stagingBufferMemory != 0) {
free(app->configuration.stagingBufferMemory);
app->configuration.stagingBufferMemory = 0;
}
#elif(VKFFT_BACKEND==1)
if (app->configuration.device) {
free(app->configuration.device);
app->configuration.device = 0;
}
if (app->configuration.stream) {
free(app->configuration.stream);
app->configuration.stream = 0;
}
if (app->configuration.num_streams > 1) {
cudaError_t res_t = cudaSuccess;
for (pfUINT i = 0; i < app->configuration.num_streams; i++) {
if (app->configuration.stream_event[i] != 0) {
res_t = cudaEventDestroy(app->configuration.stream_event[i]);
if (res_t == cudaSuccess) app->configuration.stream_event[i] = 0;
}
}
if (app->configuration.stream_event != 0) {
free(app->configuration.stream_event);
app->configuration.stream_event = 0;
}
}
#elif(VKFFT_BACKEND==2)
if (app->configuration.device) {
free(app->configuration.device);
app->configuration.device = 0;
}
if (app->configuration.stream) {
free(app->configuration.stream);
app->configuration.stream = 0;
}
if (app->configuration.num_streams > 1) {
hipError_t res_t = hipSuccess;
for (pfUINT i = 0; i < app->configuration.num_streams; i++) {
if (app->configuration.stream_event[i] != 0) {
res_t = hipEventDestroy(app->configuration.stream_event[i]);
if (res_t == hipSuccess) app->configuration.stream_event[i] = 0;
}
}
if (app->configuration.paddedSizes != 0) {
free(app->configuration.paddedSizes);
app->configuration.paddedSizes = 0;
if (app->configuration.stream_event != 0) {
free(app->configuration.stream_event);
app->configuration.stream_event = 0;
}
}
#elif(VKFFT_BACKEND==3)
if (app->configuration.device) {
free(app->configuration.device);
app->configuration.device = 0;
}
if (app->configuration.context) {
free(app->configuration.context);
app->configuration.context = 0;
}
#elif(VKFFT_BACKEND==4)
if (app->configuration.device) {
free(app->configuration.device);
app->configuration.device = 0;
}
if (app->configuration.context) {
free(app->configuration.context);
app->configuration.context = 0;
}
if (app->configuration.commandQueue) {
free(app->configuration.commandQueue);
app->configuration.commandQueue = 0;
}
#elif(VKFFT_BACKEND==5)
if (app->configuration.device) {
free(app->configuration.device);
app->configuration.device = 0;
}
if (app->configuration.queue) {
free(app->configuration.queue);
app->configuration.queue = 0;
}
#endif
memset(app, 0, sizeof(VkFFTApplication));
}
#endif
Loading

0 comments on commit 9a96811

Please sign in to comment.