Documentation update for the release

DTolm · Jan 8, 2024 · c7e8d9f · c7e8d9f
1 parent 1b8962b
commit c7e8d9f
Show file tree

Hide file tree

Showing 5 changed files with 95 additions and 33 deletions.
diff --git a/VkFFT_TestSuite.cpp b/VkFFT_TestSuite.cpp
@@ -562,7 +562,7 @@ int main(int argc, char* argv[])
 		version_decomposed[0] = version / 10000;
 		version_decomposed[1] = (version - version_decomposed[0] * 10000) / 100;
 		version_decomposed[2] = (version - version_decomposed[0] * 10000 - version_decomposed[1] * 100);
-		printf("VkFFT v%d.%d.%d (23-10-2023). Author: Tolmachev Dmitrii\n", version_decomposed[0], version_decomposed[1], version_decomposed[2]);
+		printf("VkFFT v%d.%d.%d (08-01-2024). Author: Tolmachev Dmitrii\n", version_decomposed[0], version_decomposed[1], version_decomposed[2]);
 #if (VKFFT_BACKEND==0)
 		printf("Vulkan backend\n");
 #elif (VKFFT_BACKEND==1)

diff --git a/documentation/VkFFT_API_guide.lyx b/documentation/VkFFT_API_guide.lyx
@@ -192,7 +192,7 @@ vspace{1cm}
 
 {
 \backslash
-large October 2023, version 1.3.2
+large January 2024, version 1.3.3
 \backslash
 par} 
 \end_layout
@@ -467,12 +467,12 @@ target_compile_definitions(${PROJECT_NAME} PUBLIC -DVK_API_VERSION=11)#10
 \begin_layout Plain Layout
 
 target_include_directories(${PROJECT_NAME} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/gl
-slang-master/glslang/Include/) 
+slang-main/glslang/Include/) 
 \end_layout
 
 \begin_layout Plain Layout
 
-add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/glslang-master)
+add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/glslang-main)
 \end_layout
 
 \begin_layout Plain Layout
@@ -556,8 +556,8 @@ enable_language(CUDA)
 
 \begin_layout Plain Layout
 
-set_property(TARGET ${PROJECT_NAME} PROPERTY CUDA_ARCHITECTURES 35 60 70
- 75 80 86) 	
+set_property(TARGET ${PROJECT_NAME} PROPERTY CUDA_ARCHITECTURES 60 70 75
+ 80 86) 	
 \end_layout
 
 \begin_layout Plain Layout
@@ -919,6 +919,14 @@ find_path(
 \begin_layout Plain Layout
 
 	PATH_SUFFIXES "include" 
+\begin_inset Quotes eld
+\end_inset
+
+level_zero
+\begin_inset Quotes erd
+\end_inset
+
+
 \end_layout
 
 \begin_layout Plain Layout
@@ -1733,15 +1741,21 @@ For even sequences there exists an easy mapping between R2C/C2R FFTs and
  is applied.
 \end_layout
 
+\begin_layout Standard
+VkFFT also implements a general R2C/C2R algorithm that computes R2C as a
+ C2C of the same length with imaginary part set to zero.
+ The memory layout of it is optimized to reduce memory footprint.
+\end_layout
+
 \begin_layout Subsubsection
 R2R Discrete Cosine/Sine Transforms
 \end_layout
 
 \begin_layout Standard
 There exist many different mappings between DCT and FFT.
  DSTs are reformulated as DCTs inside the VkFFT, so they use the same algorithms.
- As of now, VkFFT has the following algorithms implemented (all single-upload
- for now):
+ As of now, VkFFT has the following algorithms implemented (both single
+ and multiple uploads):
 \end_layout
 
 \begin_layout Itemize
@@ -1767,6 +1781,16 @@ DCT-IV - for even sizes, mapping between R2R and C2C sequence of half-length.
  the imaginary part to the next FFT sequence)).
 \end_layout
 
+\begin_layout Standard
+The single upload versions (that can fit in shared memory) of these algorithms
+ have all mappings done in shared memory of the compute unit.
+ For multiple upload sequence sizes, special callback versions of these
+ algorithms are created that work similarly to callbacks in CUDA (though more
+ sophisticated as these algorithms can use multiple input values).
+ The callback functionality has not yet been exposed to the user for general
+ applications and its design is open to discussion.
+\end_layout
+
 \begin_layout Subsubsection
 Register overutilization
 \end_layout
@@ -2552,6 +2576,12 @@ VKFFT_ERROR_EMPTY_app = 2015,	// app pointer is zero
 
 \begin_layout Plain Layout
 
+VKFFT_ERROR_INVALID_user_tempBuffer_too_small = 2016,	// user provided tempBuffe
+r is not sufficient for VkFFT intermediate calculations
+\end_layout
+
+\begin_layout Plain Layout
+
 VKFFT_ERROR_UNSUPPORTED_RADIX = 3001,	// VkFFT has encountered unsupported
  radix (more than 13) during decomposition and Bluestein's FFT fallback
  did not work
@@ -2566,13 +2596,13 @@ VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH = 3002,	// VkFFT can not do this sequence
 \begin_layout Plain Layout
 
 VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH_R2C = 3003,	// VkFFT can not do this
- sequence length currently - odd multi-upload R2C/C2R FFTs
+ sequence length currently - should no longer be thrown
 \end_layout
 
 \begin_layout Plain Layout
 
 VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH_R2R = 3004,	// VkFFT can not do this
- sequence length currently - multi-upload R2R transforms
+ sequence length currently - should no longer be thrown
 \end_layout
 
 \begin_layout Plain Layout
@@ -3874,7 +3904,7 @@ pfUINT omitDimension[VKFFT_MAX_FFT_DIMENSIONS];	// Disable FFT for this
 
 \begin_layout Plain Layout
 
-pfUINT performBandwidthBoost; // Try to reduce coalsesced number by a factor
+int performBandwidthBoost; // Try to reduce coalesced number by a factor
  of X to get bigger sequence in one upload for strided axes.
  Default: -1 for DCT/DST, 2 for Bluestein's algorithm (or -1 if DCT/DST),
  0 otherwise  
@@ -3946,7 +3976,17 @@ pfUINT performDST;	// Perform DST transformation (X - DST type, 1-4)
 \begin_layout Plain Layout
 
 pfUINT disableMergeSequencesR2C;	// Disable merging of two real sequences
- to reduce calculations (0 - off, 1 - on) 
+ to reduce calculations (0 - off, 1 - on)
+\end_layout
+
+\begin_layout Plain Layout
+
+pfUINT forceCallbackVersionRealTransforms;	// Force callback version of
+ R2C and R2R algorithms for all usecases (0 - off, 1 - on) 
+\end_layout
+
+\begin_layout Plain Layout
+
 \end_layout
 
 \begin_layout Plain Layout
@@ -5057,7 +5097,7 @@ uint64_t useUint64 - forces 64-bit addressing in generated kernels.
 \end_layout
 
 \begin_layout Standard
-uint64_t performBandwidthBoost - try to reduce coalsesced number by a factor
+int performBandwidthBoost - try to reduce coalesced number by a factor
  of X to get bigger sequence in one upload for strided axes.
  Default: -1(inf) for DCT and DST, 2 for Bluestein's algorithm (or -1 if
  DCT and DST), 0 otherwise 
@@ -5072,6 +5112,11 @@ uint64_t disableMergeSequencesR2C - disable the optimization that performs
  Optional parameter.
 \end_layout
 
+\begin_layout Standard
+uint64_t forceCallbackVersionRealTransforms - force callback version of
+ R2C and R2R algorithms for all usecases (0 - off, 1 - on) 
+\end_layout
+
 \begin_layout Standard
 uint64_t disableReorderFourStep - disables unshuffling of the Four Step
  FFT algorithm (last transposition of data).

diff --git a/documentation/VkFFT_API_guide.pdf b/documentation/VkFFT_API_guide.pdf
diff --git a/documentation/VkFFT_API_guide.tex b/documentation/VkFFT_API_guide.tex
@@ -1,4 +1,4 @@
-%% LyX 2.3.7 created this file.  For more info, see http://www.lyx.org/.
+%% LyX 2.3.4.3 created this file.  For more info, see http://www.lyx.org/.
 %% Do not edit unless you really know what you are doing.
 \documentclass[12pt,english]{article}
 \usepackage{amsmath}
@@ -44,7 +44,7 @@
 {\Large Dmitrii Tolmachev\par} 	
 
 \vspace{1cm} 	
-{\large October 2023, version 1.3.2\par} 
+{\large January 2024, version 1.3.3\par} 
 \end{titlepage}
 
 \newpage{}
@@ -118,8 +118,8 @@ \subsection{Installing VkFFT}
 \begin{minted}[tabsize=4,obeytabs,breaklines]{cmake}
 find_package(Vulkan REQUIRED)
 target_compile_definitions(${PROJECT_NAME} PUBLIC -DVK_API_VERSION=11)#10 - Vulkan 1.0, 11 - Vulkan 1.1, 12 - Vulkan 1.2 
-target_include_directories(${PROJECT_NAME} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/glslang-master/glslang/Include/) 
-add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/glslang-master)
+target_include_directories(${PROJECT_NAME} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/glslang-main/glslang/Include/) 
+add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/glslang-main)
 
 target_include_directories(${PROJECT_NAME} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vkFFT/)
 add_library(VkFFT INTERFACE)
@@ -132,7 +132,7 @@ \subsection{Installing VkFFT}
 \begin{minted}[tabsize=4,obeytabs,breaklines]{cmake}
 find_package(CUDA 9.0 REQUIRED) 	
 enable_language(CUDA) 	
-set_property(TARGET ${PROJECT_NAME} PROPERTY CUDA_ARCHITECTURES 35 60 70 75 80 86) 	
+set_property(TARGET ${PROJECT_NAME} PROPERTY CUDA_ARCHITECTURES 60 70 75 80 86) 	
 target_compile_options(${PROJECT_NAME} PUBLIC "$<$<COMPILE_LANGUAGE:CUDA>:SHELL: 	
 	-DVKFFT_BACKEND=${VKFFT_BACKEND} 	
 	-gencode arch=compute_60,code=compute_60
@@ -193,7 +193,7 @@ \subsection{Installing VkFFT}
 	LevelZero_INCLUDES
 	NAMES "ze_api.h"
 	PATHS ${LevelZero_INCLUDE_DIR}
-	PATH_SUFFIXES "include" 
+	PATH_SUFFIXES "include" "level_zero"
 	NO_DEFAULT_PATH
   )
 target_include_directories(${PROJECT_NAME} PUBLIC ${LevelZero_INCLUDES})
@@ -572,12 +572,16 @@ \subsubsection{R2C/C2R multi-upload FFT algorithm}
 done with the help of the Four-Step FFT algorithm. When FFT is done,
 separate post-processing for R2C/pre-processing for C2R is applied.
 
+VkFFT also implements a general R2C/C2R algorithm that computes R2C
+as a C2C of the same length with imaginary part set to zero. The memory
+layout of it is optimized to reduce memory footprint.
+
 \subsubsection{R2R Discrete Cosine/Sine Transforms}
 
 There exist many different mappings between DCT and FFT. DSTs are
 reformulated as DCTs inside the VkFFT, so they use the same algorithms.
-As of now, VkFFT has the following algorithms implemented (all single-upload
-for now):
+As of now, VkFFT has the following algorithms implemented (both single
+and multiple uploads):
 \begin{itemize}
 \item DCT-I - mapping between R2R and C2C of the $2N-2$ length. For non-strided
 axis can use an optimization similar to the R2C/C2R multidimensional
@@ -590,6 +594,13 @@ \subsubsection{R2R Discrete Cosine/Sine Transforms}
 axis can use an optimization similar to the R2C/C2R multidimensional
 case (setting the imaginary part to the next FFT sequence)).
 \end{itemize}
+The single upload versions (that can fit in shared memory) of these
+algorithms have all mappings done in shared memory of the compute
+unit. For multiple upload sequence sizes, special callback versions
+of these algorithms are created that work similarly to callbacks in
+CUDA (though more sophisticated as these algorithms can use multiple
+input values). The callback functionality has not yet been exposed
+to the user for general applications and its design is open to discussion.
 
 \subsubsection{Register overutilization}
 
@@ -973,10 +984,11 @@ \subsection{Return value VkFFTResult}
 VKFFT_ERROR_EMPTY_applicationString = 2013,	// loadApplicationString is zero when loadApplicationFromString is enabled
 VKFFT_ERROR_EMPRY_useCustomBluesteinPaddingPattern_arrays = 2014,	// pointers to primeSizes or paddedSizes arrays are zero when useCustomBluesteinPaddingPattern is enabled	
 VKFFT_ERROR_EMPTY_app = 2015,	// app pointer is zero
+VKFFT_ERROR_INVALID_user_tempBuffer_too_small = 2016,	// user provided tempBuffer is not sufficient for VkFFT intermediate calculations
 VKFFT_ERROR_UNSUPPORTED_RADIX = 3001,	// VkFFT has encountered unsupported radix (more than 13) during decomposition and Bluestein's FFT fallback did not work
 VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH = 3002,	// VkFFT can not do this sequence length currently - it requires more than three-upload Four step FFT
-VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH_R2C = 3003,	// VkFFT can not do this sequence length currently - odd multi-upload R2C/C2R FFTs
-VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH_R2R = 3004,	// VkFFT can not do this sequence length currently - multi-upload R2R transforms
+VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH_R2C = 3003,	// VkFFT can not do this sequence length currently - should no longer be thrown
+VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH_R2R = 3004,	// VkFFT can not do this sequence length currently - should no longer be thrown
 VKFFT_ERROR_UNSUPPORTED_FFT_OMIT = 3005,	// VkFFT can not omit sequences in convolution calculations and R2C/C2R case
 VKFFT_ERROR_FAILED_TO_ALLOCATE = 4001,	// VkFFT failed to allocate GPU memory
 VKFFT_ERROR_FAILED_TO_MAP_MEMORY = 4002,	// 4002-4052 are handlers for errors of used backend APIs. They may indicate a driver failure. If they are thrown - report to the GitHub repo
@@ -1266,7 +1278,7 @@ \subsection{VkFFT configuration}
 pfUINT numberBatches;	// N - used to perform multiple batches of initial data. Default 1
 pfUINT useUint64;	// Use 64-bit addressing mode in generated kernels
 pfUINT omitDimension[VKFFT_MAX_FFT_DIMENSIONS];	// Disable FFT for this dimension (0 - FFT enabled, 1 - FFT disabled). Default 0. Doesn't work for R2C first axis for now. Doesn't work with convolutions.
-pfUINT performBandwidthBoost; // Try to reduce coalsesced number by a factor of X to get bigger sequence in one upload for strided axes. Default: -1 for DCT/DST, 2 for Bluestein's algorithm (or -1 if DCT/DST), 0 otherwise  
+int performBandwidthBoost; // Try to reduce coalesced number by a factor of X to get bigger sequence in one upload for strided axes. Default: -1 for DCT/DST, 2 for Bluestein's algorithm (or -1 if DCT/DST), 0 otherwise  
 
 pfUINT doublePrecision;	// Perform calculations in double precision (0 - off, 1 - on).
 pfUINT quadDoubleDoublePrecision; // Perform calculations in double-double emulation of quad precision (0 - off, 1 - on).
@@ -1278,7 +1290,9 @@ \subsection{VkFFT configuration}
 pfUINT performR2C;	// Perform R2C/C2R decomposition (0 - off, 1 - on)
 pfUINT performDCT;	// Perform DCT transformation (X - DCT type, 1-4)
 pfUINT performDST;	// Perform DST transformation (X - DST type, 1-4)
-pfUINT disableMergeSequencesR2C;	// Disable merging of two real sequences to reduce calculations (0 - off, 1 - on) 
+pfUINT disableMergeSequencesR2C;	// Disable merging of two real sequences to reduce calculations (0 - off, 1 - on)
+pfUINT forceCallbackVersionRealTransforms;	// Force callback version of R2C and R2R algorithms for all usecases (0 - off, 1 - on) 
+
 pfUINT normalize;	// Normalize inverse transform (0 - off, 1 - on)
 pfUINT disableReorderFourStep;	// Disables unshuffling of Four step algorithm. Requires tempbuffer allocation (0 - off, 1 - on)
 pfINT useLUT;	// Switches from calculating sincos to using precomputed LUT tables (0 - off, 1 - on). Configured by initialization routine
@@ -1645,17 +1659,20 @@ \subsubsection{Advanced parameters (code will work fine without using them)}
 work in Vulkan API (use multiple buffer binding). Default 0, set to
 1 to enable. Optional parameter.
 
-uint64\_t performBandwidthBoost - try to reduce coalsesced number
-by a factor of X to get bigger sequence in one upload for strided
-axes. Default: -1(inf) for DCT and DST, 2 for Bluestein's algorithm
-(or -1 if DCT and DST), 0 otherwise 
+int performBandwidthBoost - try to reduce coalesced number by a factor
+of X to get bigger sequence in one upload for strided axes. Default:
+-1(inf) for DCT and DST, 2 for Bluestein's algorithm (or -1 if DCT
+and DST), 0 otherwise 
 
 uint64\_t disableMergeSequencesR2C - disable the optimization that
 performs merging of two real sequences to reduce calculations (in
 R2C/C2R and R2R). If enabled, calculations will be performed by simply
 setting the imaginary component to zero. Default 0, set to 1 to enable.
 Optional parameter.
 
+uint64\_t forceCallbackVersionRealTransforms - force callback version
+of R2C and R2R algorithms for all usecases (0 - off, 1 - on) 
+
 uint64\_t disableReorderFourStep - disables unshuffling of the Four
 Step FFT algorithm (last transposition of data). With this option
 enabled, tempBuffer will not be needed (unless it is required by Bluestein's

diff --git a/...FFT_KernelsLevel0/vkFFT_MemoryManagement/vkFFT_MemoryInitialization/vkFFT_PushConstants.h b/...FFT_KernelsLevel0/vkFFT_MemoryManagement/vkFFT_MemoryInitialization/vkFFT_PushConstants.h
@@ -97,14 +97,14 @@ static inline void appendPushConstants(VkFFTSpecializationConstantsLayout* sc) {
 #elif(VKFFT_BACKEND==1)
 	sc->tempLen = sprintf(sc->tempStr, "	}PushConsts;\n");
 	PfAppendLine(sc);
-	sc->tempLen = sprintf(sc->tempStr, "	__constant__ PushConsts consts;\n");
-	PfAppendLine(sc);
+	//sc->tempLen = sprintf(sc->tempStr, "	__constant__ PushConsts consts;\n");
+	//PfAppendLine(sc);
 #elif(VKFFT_BACKEND==2)
 	sc->tempLen = sprintf(sc->tempStr, "	}PushConsts;\n");
 	PfAppendLine(sc);
 
-	sc->tempLen = sprintf(sc->tempStr, "	__constant__ PushConsts consts;\n");
-	PfAppendLine(sc);
+	//sc->tempLen = sprintf(sc->tempStr, "	__constant__ PushConsts consts;\n");
+	//PfAppendLine(sc);
 
 #elif(VKFFT_BACKEND==3)
 	sc->tempLen = sprintf(sc->tempStr, "	}PushConsts;\n");