Skip to content

Commit

Permalink
Documentation update for the release
Browse files Browse the repository at this point in the history
  • Loading branch information
DTolm committed Jan 8, 2024
1 parent 1b8962b commit c7e8d9f
Show file tree
Hide file tree
Showing 5 changed files with 95 additions and 33 deletions.
2 changes: 1 addition & 1 deletion VkFFT_TestSuite.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -562,7 +562,7 @@ int main(int argc, char* argv[])
version_decomposed[0] = version / 10000;
version_decomposed[1] = (version - version_decomposed[0] * 10000) / 100;
version_decomposed[2] = (version - version_decomposed[0] * 10000 - version_decomposed[1] * 100);
printf("VkFFT v%d.%d.%d (23-10-2023). Author: Tolmachev Dmitrii\n", version_decomposed[0], version_decomposed[1], version_decomposed[2]);
printf("VkFFT v%d.%d.%d (08-01-2024). Author: Tolmachev Dmitrii\n", version_decomposed[0], version_decomposed[1], version_decomposed[2]);
#if (VKFFT_BACKEND==0)
printf("Vulkan backend\n");
#elif (VKFFT_BACKEND==1)
Expand Down
69 changes: 57 additions & 12 deletions documentation/VkFFT_API_guide.lyx
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,7 @@ vspace{1cm}

{
\backslash
large October 2023, version 1.3.2
large January 2024, version 1.3.3
\backslash
par}
\end_layout
Expand Down Expand Up @@ -467,12 +467,12 @@ target_compile_definitions(${PROJECT_NAME} PUBLIC -DVK_API_VERSION=11)#10
\begin_layout Plain Layout

target_include_directories(${PROJECT_NAME} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/gl
slang-master/glslang/Include/)
slang-main/glslang/Include/)
\end_layout

\begin_layout Plain Layout

add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/glslang-master)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/glslang-main)
\end_layout

\begin_layout Plain Layout
Expand Down Expand Up @@ -556,8 +556,8 @@ enable_language(CUDA)

\begin_layout Plain Layout

set_property(TARGET ${PROJECT_NAME} PROPERTY CUDA_ARCHITECTURES 35 60 70
75 80 86)
set_property(TARGET ${PROJECT_NAME} PROPERTY CUDA_ARCHITECTURES 60 70 75
80 86)
\end_layout

\begin_layout Plain Layout
Expand Down Expand Up @@ -919,6 +919,14 @@ find_path(
\begin_layout Plain Layout

PATH_SUFFIXES "include"
\begin_inset Quotes eld
\end_inset

level_zero
\begin_inset Quotes erd
\end_inset


\end_layout

\begin_layout Plain Layout
Expand Down Expand Up @@ -1733,15 +1741,21 @@ For even sequences there exists an easy mapping between R2C/C2R FFTs and
is applied.
\end_layout

\begin_layout Standard
VkFFT also implements a general R2C/C2R algorithm that computes R2C as a
C2C of the same length with imaginary part set to zero.
The memory layout of it is optimized to reduce memory footprint.
\end_layout

\begin_layout Subsubsection
R2R Discrete Cosine/Sine Transforms
\end_layout

\begin_layout Standard
There exist many different mappings between DCT and FFT.
DSTs are reformulated as DCTs inside the VkFFT, so they use the same algorithms.
As of now, VkFFT has the following algorithms implemented (all single-upload
for now):
As of now, VkFFT has the following algorithms implemented (both single
and multiple uploads):
\end_layout

\begin_layout Itemize
Expand All @@ -1767,6 +1781,16 @@ DCT-IV - for even sizes, mapping between R2R and C2C sequence of half-length.
the imaginary part to the next FFT sequence)).
\end_layout

\begin_layout Standard
The single upload versions (that can fit in shared memory) of these algorithms
have all mappings done in shared memory of the compute unit.
For multiple upload sequence sizes, special callback versions of these
algorithms are created that work similarly to callbacks in CUDA (though more
sophisticated, as these algorithms can use multiple input values).
The callback functionality has not yet been exposed to the user for general
applications and its design is open to discussion.
\end_layout

\begin_layout Subsubsection
Register overutilization
\end_layout
Expand Down Expand Up @@ -2552,6 +2576,12 @@ VKFFT_ERROR_EMPTY_app = 2015, // app pointer is zero

\begin_layout Plain Layout

VKFFT_ERROR_INVALID_user_tempBuffer_too_small = 2016, // user provided tempBuffe
r is not sufficient for VkFFT intermediate calculations
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_UNSUPPORTED_RADIX = 3001, // VkFFT has encountered unsupported
radix (more than 13) during decomposition and Bluestein's FFT fallback
did not work
Expand All @@ -2566,13 +2596,13 @@ VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH = 3002, // VkFFT can not do this sequence
\begin_layout Plain Layout

VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH_R2C = 3003, // VkFFT can not do this
sequence length currently - odd multi-upload R2C/C2R FFTs
sequence length currently - should no longer be thrown
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH_R2R = 3004, // VkFFT can not do this
sequence length currently - multi-upload R2R transforms
sequence length currently - should no longer be thrown
\end_layout

\begin_layout Plain Layout
Expand Down Expand Up @@ -3874,7 +3904,7 @@ pfUINT omitDimension[VKFFT_MAX_FFT_DIMENSIONS]; // Disable FFT for this

\begin_layout Plain Layout

pfUINT performBandwidthBoost; // Try to reduce coalsesced number by a factor
int performBandwidthBoost; // Try to reduce coalesced number by a factor
of X to get bigger sequence in one upload for strided axes.
Default: -1 for DCT/DST, 2 for Bluestein's algorithm (or -1 if DCT/DST),
0 otherwise
Expand Down Expand Up @@ -3946,7 +3976,17 @@ pfUINT performDST; // Perform DST transformation (X - DST type, 1-4)
\begin_layout Plain Layout

pfUINT disableMergeSequencesR2C; // Disable merging of two real sequences
to reduce calculations (0 - off, 1 - on)
to reduce calculations (0 - off, 1 - on)
\end_layout

\begin_layout Plain Layout

pfUINT forceCallbackVersionRealTransforms; // Force callback version of
R2C and R2R algorithms for all use cases (0 - off, 1 - on)
\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout
Expand Down Expand Up @@ -5057,7 +5097,7 @@ uint64_t useUint64 - forces 64-bit addressing in generated kernels.
\end_layout

\begin_layout Standard
uint64_t performBandwidthBoost - try to reduce coalsesced number by a factor
int performBandwidthBoost - try to reduce coalesced number by a factor
of X to get bigger sequence in one upload for strided axes.
Default: -1(inf) for DCT and DST, 2 for Bluestein's algorithm (or -1 if
DCT and DST), 0 otherwise
Expand All @@ -5072,6 +5112,11 @@ uint64_t disableMergeSequencesR2C - disable the optimization that performs
Optional parameter.
\end_layout

\begin_layout Standard
uint64_t forceCallbackVersionRealTransforms - force callback version of
R2C and R2R algorithms for all use cases (0 - off, 1 - on)
\end_layout

\begin_layout Standard
uint64_t disableReorderFourStep - disables unshuffling of the Four Step
FFT algorithm (last transposition of data).
Expand Down
Binary file modified documentation/VkFFT_API_guide.pdf
Binary file not shown.
49 changes: 33 additions & 16 deletions documentation/VkFFT_API_guide.tex
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
%% LyX 2.3.7 created this file. For more info, see http://www.lyx.org/.
%% LyX 2.3.4.3 created this file. For more info, see http://www.lyx.org/.
%% Do not edit unless you really know what you are doing.
\documentclass[12pt,english]{article}
\usepackage{amsmath}
Expand Down Expand Up @@ -44,7 +44,7 @@
{\Large Dmitrii Tolmachev\par}

\vspace{1cm}
{\large October 2023, version 1.3.2\par}
{\large January 2024, version 1.3.3\par}
\end{titlepage}

\newpage{}
Expand Down Expand Up @@ -118,8 +118,8 @@ \subsection{Installing VkFFT}
\begin{minted}[tabsize=4,obeytabs,breaklines]{cmake}
find_package(Vulkan REQUIRED)
target_compile_definitions(${PROJECT_NAME} PUBLIC -DVK_API_VERSION=11)#10 - Vulkan 1.0, 11 - Vulkan 1.1, 12 - Vulkan 1.2
target_include_directories(${PROJECT_NAME} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/glslang-master/glslang/Include/)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/glslang-master)
target_include_directories(${PROJECT_NAME} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/glslang-main/glslang/Include/)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/glslang-main)

target_include_directories(${PROJECT_NAME} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vkFFT/)
add_library(VkFFT INTERFACE)
Expand All @@ -132,7 +132,7 @@ \subsection{Installing VkFFT}
\begin{minted}[tabsize=4,obeytabs,breaklines]{cmake}
find_package(CUDA 9.0 REQUIRED)
enable_language(CUDA)
set_property(TARGET ${PROJECT_NAME} PROPERTY CUDA_ARCHITECTURES 35 60 70 75 80 86)
set_property(TARGET ${PROJECT_NAME} PROPERTY CUDA_ARCHITECTURES 60 70 75 80 86)
target_compile_options(${PROJECT_NAME} PUBLIC "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:
-DVKFFT_BACKEND=${VKFFT_BACKEND}
-gencode arch=compute_60,code=compute_60
Expand Down Expand Up @@ -193,7 +193,7 @@ \subsection{Installing VkFFT}
LevelZero_INCLUDES
NAMES "ze_api.h"
PATHS ${LevelZero_INCLUDE_DIR}
PATH_SUFFIXES "include"
PATH_SUFFIXES "include" "level_zero"
NO_DEFAULT_PATH
)
target_include_directories(${PROJECT_NAME} PUBLIC ${LevelZero_INCLUDES})
Expand Down Expand Up @@ -572,12 +572,16 @@ \subsubsection{R2C/C2R multi-upload FFT algorithm}
done with the help of the Four-Step FFT algorithm. When FFT is done,
separate post-processing for R2C/pre-processing for C2R is applied.
VkFFT also implements a general R2C/C2R algorithm that computes R2C
as a C2C of the same length with imaginary part set to zero. The memory
layout of it is optimized to reduce memory footprint.
\subsubsection{R2R Discrete Cosine/Sine Transforms}
There exist many different mappings between DCT and FFT. DSTs are
reformulated as DCTs inside the VkFFT, so they use the same algorithms.
As of now, VkFFT has the following algorithms implemented (all single-upload
for now):
As of now, VkFFT has the following algorithms implemented (both single
and multiple uploads):
\begin{itemize}
\item DCT-I - mapping between R2R and C2C of the $2N-2$ length. For non-strided
axis can use an optimization similar to the R2C/C2R multidimensional
Expand All @@ -590,6 +594,13 @@ \subsubsection{R2R Discrete Cosine/Sine Transforms}
axis can use an optimization similar to the R2C/C2R multidimensional
case (setting the imaginary part to the next FFT sequence)).
\end{itemize}
The single upload versions (that can fit in shared memory) of these
algorithms have all mappings done in shared memory of the compute
unit. For multiple upload sequence sizes, special callback versions
of these algorithms are created that work similarly to callbacks in
CUDA (though more sophisticated, as these algorithms can use multiple
input values). The callback functionality has not yet been exposed
to the user for general applications and its design is open to discussion.
\subsubsection{Register overutilization}
Expand Down Expand Up @@ -973,10 +984,11 @@ \subsection{Return value VkFFTResult}
VKFFT_ERROR_EMPTY_applicationString = 2013, // loadApplicationString is zero when loadApplicationFromString is enabled
VKFFT_ERROR_EMPRY_useCustomBluesteinPaddingPattern_arrays = 2014, // pointers to primeSizes or paddedSizes arrays are zero when useCustomBluesteinPaddingPattern is enabled
VKFFT_ERROR_EMPTY_app = 2015, // app pointer is zero
VKFFT_ERROR_INVALID_user_tempBuffer_too_small = 2016, // user provided tempBuffer is not sufficient for VkFFT intermediate calculations
VKFFT_ERROR_UNSUPPORTED_RADIX = 3001, // VkFFT has encountered unsupported radix (more than 13) during decomposition and Bluestein's FFT fallback did not work
VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH = 3002, // VkFFT can not do this sequence length currently - it requires more than three-upload Four step FFT
VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH_R2C = 3003, // VkFFT can not do this sequence length currently - odd multi-upload R2C/C2R FFTs
VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH_R2R = 3004, // VkFFT can not do this sequence length currently - multi-upload R2R transforms
VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH_R2C = 3003, // VkFFT can not do this sequence length currently - should no longer be thrown
VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH_R2R = 3004, // VkFFT can not do this sequence length currently - should no longer be thrown
VKFFT_ERROR_UNSUPPORTED_FFT_OMIT = 3005, // VkFFT can not omit sequences in convolution calculations and R2C/C2R case
VKFFT_ERROR_FAILED_TO_ALLOCATE = 4001, // VkFFT failed to allocate GPU memory
VKFFT_ERROR_FAILED_TO_MAP_MEMORY = 4002, // 4002-4052 are handlers for errors of used backend APIs. They may indicate a driver failure. If they are thrown - report to the GitHub repo
Expand Down Expand Up @@ -1266,7 +1278,7 @@ \subsection{VkFFT configuration}
pfUINT numberBatches; // N - used to perform multiple batches of initial data. Default 1
pfUINT useUint64; // Use 64-bit addressing mode in generated kernels
pfUINT omitDimension[VKFFT_MAX_FFT_DIMENSIONS]; // Disable FFT for this dimension (0 - FFT enabled, 1 - FFT disabled). Default 0. Doesn't work for R2C first axis for now. Doesn't work with convolutions.
pfUINT performBandwidthBoost; // Try to reduce coalsesced number by a factor of X to get bigger sequence in one upload for strided axes. Default: -1 for DCT/DST, 2 for Bluestein's algorithm (or -1 if DCT/DST), 0 otherwise
int performBandwidthBoost; // Try to reduce coalesced number by a factor of X to get bigger sequence in one upload for strided axes. Default: -1 for DCT/DST, 2 for Bluestein's algorithm (or -1 if DCT/DST), 0 otherwise
pfUINT doublePrecision; // Perform calculations in double precision (0 - off, 1 - on).
pfUINT quadDoubleDoublePrecision; // Perform calculations in double-double emulation of quad precision (0 - off, 1 - on).
Expand All @@ -1278,7 +1290,9 @@ \subsection{VkFFT configuration}
pfUINT performR2C; // Perform R2C/C2R decomposition (0 - off, 1 - on)
pfUINT performDCT; // Perform DCT transformation (X - DCT type, 1-4)
pfUINT performDST; // Perform DST transformation (X - DST type, 1-4)
pfUINT disableMergeSequencesR2C; // Disable merging of two real sequences to reduce calculations (0 - off, 1 - on)
pfUINT disableMergeSequencesR2C; // Disable merging of two real sequences to reduce calculations (0 - off, 1 - on)
pfUINT forceCallbackVersionRealTransforms; // Force callback version of R2C and R2R algorithms for all use cases (0 - off, 1 - on)
pfUINT normalize; // Normalize inverse transform (0 - off, 1 - on)
pfUINT disableReorderFourStep; // Disables unshuffling of Four step algorithm. Requires tempbuffer allocation (0 - off, 1 - on)
pfINT useLUT; // Switches from calculating sincos to using precomputed LUT tables (0 - off, 1 - on). Configured by initialization routine
Expand Down Expand Up @@ -1645,17 +1659,20 @@ \subsubsection{Advanced parameters (code will work fine without using them)}
work in Vulkan API (use multiple buffer binding). Default 0, set to
1 to enable. Optional parameter.
uint64\_t performBandwidthBoost - try to reduce coalsesced number
by a factor of X to get bigger sequence in one upload for strided
axes. Default: -1(inf) for DCT and DST, 2 for Bluestein's algorithm
(or -1 if DCT and DST), 0 otherwise
int performBandwidthBoost - try to reduce coalesced number by a factor
of X to get bigger sequence in one upload for strided axes. Default:
-1(inf) for DCT and DST, 2 for Bluestein's algorithm (or -1 if DCT
and DST), 0 otherwise
uint64\_t disableMergeSequencesR2C - disable the optimization that
performs merging of two real sequences to reduce calculations (in
R2C/C2R and R2R). If enabled, calculations will be performed by simply
setting the imaginary component to zero. Default 0, set to 1 to enable.
Optional parameter.
uint64\_t forceCallbackVersionRealTransforms - force callback version
of R2C and R2R algorithms for all use cases (0 - off, 1 - on)
uint64\_t disableReorderFourStep - disables unshuffling of the Four
Step FFT algorithm (last transposition of data). With this option
enabled, tempBuffer will not be needed (unless it is required by Bluestein's
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -97,14 +97,14 @@ static inline void appendPushConstants(VkFFTSpecializationConstantsLayout* sc) {
#elif(VKFFT_BACKEND==1)
sc->tempLen = sprintf(sc->tempStr, " }PushConsts;\n");
PfAppendLine(sc);
sc->tempLen = sprintf(sc->tempStr, " __constant__ PushConsts consts;\n");
PfAppendLine(sc);
//sc->tempLen = sprintf(sc->tempStr, " __constant__ PushConsts consts;\n");
//PfAppendLine(sc);
#elif(VKFFT_BACKEND==2)
sc->tempLen = sprintf(sc->tempStr, " }PushConsts;\n");
PfAppendLine(sc);

sc->tempLen = sprintf(sc->tempStr, " __constant__ PushConsts consts;\n");
PfAppendLine(sc);
//sc->tempLen = sprintf(sc->tempStr, " __constant__ PushConsts consts;\n");
//PfAppendLine(sc);

#elif(VKFFT_BACKEND==3)
sc->tempLen = sprintf(sc->tempStr, " }PushConsts;\n");
Expand Down

0 comments on commit c7e8d9f

Please sign in to comment.