Skip to content

Commit 3fa0c21

Browse files
authored
VkFFT v1.3.2 release
-Added double-double support in VkFFT. Requires CPU initialization in full quad precision, so only supports gcc with quadmath dependency for now. Potentially possible to add full FP128 support or some other FP128 library (like mpir) in the future. -Data has to be stored in double-double before VkFFT kernel calls (no fp128<->double-double conversion on the GPU yet). -Full 1e-32 precision, but same range as FP64. See "Library for Double-Double and Quad-Double Arithmetic" by Y. Hida for more information on double-double. -Double-double requires FMA contraction to be disabled (due to ab-cd contraction rounding mismatch). Doesn't work on Vulkan as I haven't found how to do that yet. -Added DST I-IV support. -Fixed warnings (#138) -Added proper check for app to be zero before initializeVkFFT call and zeroing on deletion (#134) -Added an option to provide a staging buffer in the application and VkGPU handle (#129) -Added guards for build type (#128) -Changed default innermost stride for real buffers in out-of-place R2C from size[0]+2 to size[0] (#139) -Allow specifying glslang version (#135) -Improved instruction count and accuracy for radix-7. -Fixed missing deallocation calls for the inverse Bluestein axes. Fixed the buffer layout size in Vulkan in some cases. -Refactored the code generator and container struct layout for better handling of complex numbers (-5k loc). -Added more precision tests and benchmarks.
2 parents 116bf7f + f9b0ac9 commit 3fa0c21

File tree

75 files changed

+10826
-11514
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

75 files changed

+10826
-11514
lines changed

CMakeLists.txt

Lines changed: 43 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,15 @@
11
cmake_minimum_required(VERSION 3.11)
22
project(VkFFT_TestSuite)
3+
4+
if(NOT CMAKE_BUILD_TYPE)
35
set(CMAKE_CONFIGURATION_TYPES "Release" CACHE STRING "" FORCE)
46
set(CMAKE_BUILD_TYPE "Release" CACHE STRING "" FORCE)
7+
endif()
8+
9+
if (NOT DEFINED GLSLANG_GIT_TAG)
10+
set(GLSLANG_GIT_TAG "12.3.1")
11+
endif()
12+
513
include(FetchContent)
614
set(VKFFT_BACKEND 0 CACHE STRING "0 - Vulkan, 1 - CUDA, 2 - HIP, 3 - OpenCL, 4 - Level Zero, 5 - Metal")
715

@@ -18,7 +26,8 @@ else()
1826
endif()
1927

2028
option(build_VkFFT_FFTW_precision "Build VkFFT FFTW precision comparison" OFF)
21-
option(VkFFT_use_FP128_Bluestein_RaderFFT "Use FP128 for Bluestein and Rader FFT kernel calculations. Currently requires FP128 FFT library, like FFTWl" OFF)
29+
option(VkFFT_use_FP128_Bluestein_RaderFFT "Use LD for Bluestein and Rader FFT kernel calculations. Currently requires LD FFT library, like FFTWl, will be reworked" OFF)
30+
option(VkFFT_use_FP128_double_double "Build VkFFT quad double-double" OFF)
2231

2332
if (MSVC)
2433
set_property(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY VS_STARTUP_PROJECT ${PROJECT_NAME})
@@ -37,6 +46,7 @@ if(build_VkFFT_FFTW_precision)
3746
benchmark_scripts/vkFFT_scripts/src/sample_6_benchmark_VkFFT_single_r2c.cpp
3847
benchmark_scripts/vkFFT_scripts/src/sample_7_benchmark_VkFFT_single_Bluestein.cpp
3948
benchmark_scripts/vkFFT_scripts/src/sample_8_benchmark_VkFFT_double_Bluestein.cpp
49+
benchmark_scripts/vkFFT_scripts/src/sample_9_benchmark_VkFFT_quadDoubleDouble.cpp
4050
benchmark_scripts/vkFFT_scripts/src/sample_10_benchmark_VkFFT_single_multipleBuffers.cpp
4151
benchmark_scripts/vkFFT_scripts/src/sample_11_precision_VkFFT_single.cpp
4252
benchmark_scripts/vkFFT_scripts/src/sample_12_precision_VkFFT_double.cpp
@@ -46,14 +56,17 @@ if(build_VkFFT_FFTW_precision)
4656
benchmark_scripts/vkFFT_scripts/src/sample_16_precision_VkFFT_single_dct.cpp
4757
benchmark_scripts/vkFFT_scripts/src/sample_17_precision_VkFFT_double_dct.cpp
4858
benchmark_scripts/vkFFT_scripts/src/sample_18_precision_VkFFT_double_nonPow2.cpp
59+
benchmark_scripts/vkFFT_scripts/src/sample_19_precision_VkFFT_quadDoubleDouble_nonPow2.cpp
4960
benchmark_scripts/vkFFT_scripts/src/sample_50_convolution_VkFFT_single_1d_matrix.cpp
5061
benchmark_scripts/vkFFT_scripts/src/sample_51_convolution_VkFFT_single_3d_matrix_zeropadding_r2c.cpp
5162
benchmark_scripts/vkFFT_scripts/src/sample_52_convolution_VkFFT_single_2d_batched_r2c.cpp
5263
benchmark_scripts/vkFFT_scripts/src/sample_100_benchmark_VkFFT_single_nd_dct.cpp
5364
benchmark_scripts/vkFFT_scripts/src/sample_101_benchmark_VkFFT_double_nd_dct.cpp
54-
benchmark_scripts/vkFFT_scripts/src/sample_1000_VkFFT_single_2_4096.cpp
65+
benchmark_scripts/vkFFT_scripts/src/sample_1000_benchmark_VkFFT_single_2_4096.cpp
5566
benchmark_scripts/vkFFT_scripts/src/sample_1001_benchmark_VkFFT_double_2_4096.cpp
56-
benchmark_scripts/vkFFT_scripts/src/sample_1003_benchmark_VkFFT_single_3d_2_512.cpp)
67+
benchmark_scripts/vkFFT_scripts/src/sample_1002_benchmark_VkFFT_half_2_4096.cpp
68+
benchmark_scripts/vkFFT_scripts/src/sample_1003_benchmark_VkFFT_single_3d_2_512.cpp
69+
benchmark_scripts/vkFFT_scripts/src/sample_1004_benchmark_VkFFT_quadDoubleDouble_2_4096.cpp)
5770
else()
5871
add_executable(${PROJECT_NAME} VkFFT_TestSuite.cpp
5972
benchmark_scripts/vkFFT_scripts/src/utils_VkFFT.cpp
@@ -67,15 +80,18 @@ else()
6780
benchmark_scripts/vkFFT_scripts/src/sample_6_benchmark_VkFFT_single_r2c.cpp
6881
benchmark_scripts/vkFFT_scripts/src/sample_7_benchmark_VkFFT_single_Bluestein.cpp
6982
benchmark_scripts/vkFFT_scripts/src/sample_8_benchmark_VkFFT_double_Bluestein.cpp
83+
benchmark_scripts/vkFFT_scripts/src/sample_9_benchmark_VkFFT_quadDoubleDouble.cpp
7084
benchmark_scripts/vkFFT_scripts/src/sample_10_benchmark_VkFFT_single_multipleBuffers.cpp
7185
benchmark_scripts/vkFFT_scripts/src/sample_50_convolution_VkFFT_single_1d_matrix.cpp
7286
benchmark_scripts/vkFFT_scripts/src/sample_51_convolution_VkFFT_single_3d_matrix_zeropadding_r2c.cpp
7387
benchmark_scripts/vkFFT_scripts/src/sample_52_convolution_VkFFT_single_2d_batched_r2c.cpp
7488
benchmark_scripts/vkFFT_scripts/src/sample_100_benchmark_VkFFT_single_nd_dct.cpp
7589
benchmark_scripts/vkFFT_scripts/src/sample_101_benchmark_VkFFT_double_nd_dct.cpp
76-
benchmark_scripts/vkFFT_scripts/src/sample_1000_VkFFT_single_2_4096.cpp
90+
benchmark_scripts/vkFFT_scripts/src/sample_1000_benchmark_VkFFT_single_2_4096.cpp
7791
benchmark_scripts/vkFFT_scripts/src/sample_1001_benchmark_VkFFT_double_2_4096.cpp
78-
benchmark_scripts/vkFFT_scripts/src/sample_1003_benchmark_VkFFT_single_3d_2_512.cpp)
92+
benchmark_scripts/vkFFT_scripts/src/sample_1002_benchmark_VkFFT_half_2_4096.cpp
93+
benchmark_scripts/vkFFT_scripts/src/sample_1003_benchmark_VkFFT_single_3d_2_512.cpp
94+
benchmark_scripts/vkFFT_scripts/src/sample_1004_benchmark_VkFFT_quadDoubleDouble_2_4096.cpp)
7995
endif()
8096
target_compile_features(${PROJECT_NAME} PUBLIC cxx_std_11)
8197
add_definitions(-DVKFFT_BACKEND=${VKFFT_BACKEND})
@@ -135,10 +151,15 @@ target_compile_definitions(${PROJECT_NAME} PUBLIC -DVK_API_VERSION=11)#10 - Vulk
135151
if(VkFFT_use_FP128_Bluestein_RaderFFT)
136152
target_compile_definitions(${PROJECT_NAME} PUBLIC -DVkFFT_use_FP128_Bluestein_RaderFFT)
137153
endif()
154+
if(VkFFT_use_FP128_double_double)
155+
target_compile_definitions(${PROJECT_NAME} PUBLIC -DVKFFT_USE_DOUBLEDOUBLE_FP128)
156+
target_link_libraries(${PROJECT_NAME} PUBLIC quadmath)
157+
endif()
138158
if(${VKFFT_BACKEND} EQUAL 0)
159+
set(ENABLE_OPT 0)
139160
FetchContent_Declare(
140161
glslang-main
141-
GIT_TAG "origin/main"
162+
GIT_TAG ${GLSLANG_GIT_TAG}
142163
GIT_REPOSITORY https://github.com/KhronosGroup/glslang
143164
SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/glslang-main
144165
)
@@ -150,12 +171,12 @@ if(${VKFFT_BACKEND} EQUAL 0)
150171
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/glslang-main)
151172
endif()
152173

153-
target_include_directories(${PROJECT_NAME} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vkFFT/)
154174
add_library(VkFFT INTERFACE)
175+
target_include_directories(VkFFT INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}/vkFFT/)
155176
target_compile_definitions(VkFFT INTERFACE -DVKFFT_BACKEND=${VKFFT_BACKEND})
156177

157-
target_include_directories(${PROJECT_NAME} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/half_lib/)
158178
add_library(half INTERFACE)
179+
target_include_directories(half INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}/half_lib/)
159180

160181
target_include_directories(${PROJECT_NAME} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/benchmark_scripts/vkFFT_scripts/include/)
161182

@@ -165,6 +186,7 @@ elseif(${VKFFT_BACKEND} EQUAL 1)
165186
find_library(CUDA_NVRTC_LIB libnvrtc nvrtc HINTS "${CUDA_TOOLKIT_ROOT_DIR}/lib64" "${LIBNVRTC_LIBRARY_DIR}" "${CUDA_TOOLKIT_ROOT_DIR}/lib/x64" /usr/lib64 /usr/local/cuda/lib64)
166187
add_definitions(-DCUDA_TOOLKIT_ROOT_DIR="${CUDA_TOOLKIT_ROOT_DIR}")
167188
target_link_libraries(${PROJECT_NAME} PUBLIC ${CUDA_LIBRARIES} cuda ${CUDA_NVRTC_LIB} VkFFT half)
189+
target_include_directories(${PROJECT_NAME} PUBLIC ${CUDA_INCLUDE_DIRS})
168190
elseif(${VKFFT_BACKEND} EQUAL 2)
169191
target_link_libraries(${PROJECT_NAME} PUBLIC hip::host VkFFT half)
170192
elseif(${VKFFT_BACKEND} EQUAL 3)
@@ -194,6 +216,16 @@ if(build_VkFFT_FFTW_precision OR VkFFT_use_FP128_Bluestein_RaderFFT)
194216
NO_DEFAULT_PATH
195217
)
196218
target_include_directories(${PROJECT_NAME} PUBLIC ${FFTW_INCLUDES})
219+
if(VkFFT_use_FP128_double_double)
220+
find_library(
221+
FFTWQ_LIB
222+
NAMES "libfftw3q" "fftw3q"
223+
PATHS ${FFTW3_LIB_DIR}
224+
PATH_SUFFIXES "lib" "lib64"
225+
NO_DEFAULT_PATH
226+
)
227+
target_link_libraries (${PROJECT_NAME} PUBLIC ${FFTWQ_LIB})
228+
endif()
197229
if(VkFFT_use_FP128_Bluestein_RaderFFT)
198230
find_library(
199231
FFTWL_LIB
@@ -202,10 +234,9 @@ if(VkFFT_use_FP128_Bluestein_RaderFFT)
202234
PATH_SUFFIXES "lib" "lib64"
203235
NO_DEFAULT_PATH
204236
)
205-
target_link_libraries (${PROJECT_NAME} PUBLIC ${FFTW_LIB} ${FFTWL_LIB})
206-
else()
207-
target_link_libraries (${PROJECT_NAME} PUBLIC ${FFTW_LIB})
237+
target_link_libraries (${PROJECT_NAME} PUBLIC ${FFTWL_LIB})
208238
endif()
239+
target_link_libraries (${PROJECT_NAME} PUBLIC ${FFTW_LIB})
209240
endif()
210241

211242
if(build_VkFFT_cuFFT_benchmark)
@@ -253,6 +284,7 @@ if(build_VkFFT_cuFFT_benchmark)
253284
-gencode arch=compute_80,code=compute_80
254285
-gencode arch=compute_86,code=compute_86>")
255286
target_include_directories(cuFFT_scripts PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/benchmark_scripts/cuFFT_scripts/include)
287+
target_include_directories(cuFFT_scripts PUBLIC ${CUDA_INCLUDE_DIRS})
256288
set_target_properties(cuFFT_scripts PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
257289
set_target_properties(cuFFT_scripts PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON)
258290
target_link_libraries(${PROJECT_NAME} PUBLIC cuFFT_scripts)

README.md

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ VkFFT is an efficient GPU-accelerated multidimensional Fast Fourier Transform li
1010
- Radix-2/3/4/5/7/8/11/13 FFT. Sequences using radix 3, 5, 7, 11 and 13 have comparable performance to that of powers of 2.
1111
- Rader's FFT algorithm for primes from 17 up to max shared memory length (~10000). Inlined and done without additional memory transfers.
1212
- Bluestein's FFT algorithm for all other sequences. Full coverage of C2C range, single upload (2^12, 2^12, 2^12) for R2C/C2R/R2R. Optimized to have as few memory transfers as possible by using zero padding and merged convolution support of VkFFT.
13-
- Single, double and half precision support. Double precision uses CPU-generated LUT tables. Half precision still does all computations in single and only uses half precision to store data.
13+
- Single, double, half and quad (double-double) precision support. Double and quad precision use CPU-generated lookup tables (LUTs). Half precision still does all computations in single and only uses half precision to store data.
1414
- All transformations are performed in-place with no performance loss. Out-of-place transforms are supported by selecting different input/output buffers.
1515
- No additional transposition uploads. Note: Data can be reshuffled after the Four Step FFT algorithm with an additional buffer (for big sequences). Doesn't matter for convolutions - they return to the input ordering (saves memory).
1616
- Complex to complex (C2C), real to complex (R2C), complex to real (C2R) transformations and real to real (R2R) Discrete Cosine Transformations of types I, II, III and IV. R2R, R2C and C2R are optimized to run up to 2x faster than C2C and take 2x less memory.
@@ -33,19 +33,19 @@ Include the vkFFT.h file and glslang compiler. Provide the library with correctl
3333
For single and double precision, Vulkan 1.0 is required. For half precision, Vulkan 1.1 is required.
3434

3535
CUDA/HIP:
36-
Include the vkFFT.h file and make sure your system has NVRTC/HIPRTC built. Provide the library with correctly chosen VKFFT_BACKEND definition. Only single/double precision for now.\
36+
Include the vkFFT.h file and make sure your system has NVRTC/HIPRTC built. Provide the library with correctly chosen VKFFT_BACKEND definition.\
3737
To build CUDA/HIP version of the benchmark, replace VKFFT_BACKEND in CMakeLists (line 5) with the correct one and optionally enable FFTW. VKFFT_BACKEND=1 for CUDA, VKFFT_BACKEND=2 for HIP.
3838

3939
OpenCL:
40-
Include the vkFFT.h file. Provide the library with correctly chosen VKFFT_BACKEND definition. Only single/double precision for now.\
40+
Include the vkFFT.h file. Provide the library with correctly chosen VKFFT_BACKEND definition.\
4141
To build OpenCL version of the benchmark, replace VKFFT_BACKEND in CMakeLists (line 5) with the value 3 and optionally enable FFTW.
4242

4343
Level Zero:
44-
Include the vkFFT.h file. Provide the library with correctly chosen VKFFT_BACKEND definition. Clang and llvm-spirv must be valid system calls. Only single/double precision for now.\
44+
Include the vkFFT.h file. Provide the library with correctly chosen VKFFT_BACKEND definition. Clang and llvm-spirv must be valid system calls.\
4545
To build Level Zero version of the benchmark, replace VKFFT_BACKEND in CMakeLists (line 5) with the value 4 and optionally enable FFTW.
4646

4747
Metal:
48-
Include the vkFFT.h file. Provide the library with correctly chosen VKFFT_BACKEND definition. VkFFT uses metal-cpp as a C++ bindings to Apple's libraries - Foundation.hpp, QuartzCore.hpp and Metal.hpp. Only single precision.\
48+
Include the vkFFT.h file. Provide the library with correctly chosen VKFFT_BACKEND definition. VkFFT uses metal-cpp as C++ bindings to Apple's libraries - Foundation.hpp, QuartzCore.hpp and Metal.hpp.\
4949
To build Metal version of the benchmark, replace VKFFT_BACKEND in CMakeLists (line 5) with the value 5 and optionally enable FFTW.
5050

5151
## Command-line interface

0 commit comments

Comments
 (0)