Bugfixes

-Issue #50 should be partially resolved -Issue #51 should be resolved -Fixed ROCm 4.5 build problems with new version of HIPRTC -Updated Half version -Fixed some missing error handling and tempStr deallocation
DTolm · Nov 19, 2021 · 2ce8588 · 2ce8588
1 parent 16ede62
commit 2ce8588
Show file tree

Hide file tree

Showing 4 changed files with 104 additions and 58 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -87,6 +87,7 @@ else()
 endif()
 	set_property(TARGET ${PROJECT_NAME} PROPERTY CUDA_ARCHITECTURES 35 60 70 75 80 86)
 	target_compile_options(${PROJECT_NAME} PUBLIC "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:
+	-std=c++11
 	-DVKFFT_BACKEND=${VKFFT_BACKEND}
 	-gencode arch=compute_35,code=compute_35
 	-gencode arch=compute_60,code=compute_60
@@ -99,6 +100,7 @@ endif()
 elseif(${VKFFT_BACKEND} EQUAL 2)
 	list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
 	find_package(hip)
+	#target_compile_definitions(${PROJECT_NAME} PUBLIC -DVKFFT_OLD_ROCM) #ROCm versions before 4.5 needed kernel include of hiprtc
 elseif(${VKFFT_BACKEND} EQUAL 3)
 	find_package(OpenCL REQUIRED)
 endif()
@@ -190,6 +192,7 @@ if(build_VkFFT_cuFFT_benchmark)
 	set_property(TARGET cuFFT_scripts PROPERTY CUDA_ARCHITECTURES 35 60 70 75 80 86)
 	CUDA_ADD_CUFFT_TO_TARGET(cuFFT_scripts)
 	target_compile_options(cuFFT_scripts PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:
+	-std=c++11
 	-gencode arch=compute_35,code=compute_35
 	-gencode arch=compute_60,code=compute_60
 	-gencode arch=compute_70,code=compute_70 
@@ -205,7 +208,7 @@ if(build_VkFFT_rocFFT_benchmark)
 	add_definitions(-DUSE_rocFFT)
 	list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
 	find_package(hip)
-    	find_package(rocfft)
+    find_package(hipfft)
 
 	if(build_VkFFT_FFTW_precision)
 		add_library(rocFFT_scripts STATIC
@@ -236,6 +239,6 @@ if(build_VkFFT_rocFFT_benchmark)
 		benchmark_scripts/rocFFT_scripts/src/sample_1003_benchmark_rocFFT_single_3d_2_512.cpp)
 	endif()
 	target_include_directories(rocFFT_scripts PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/benchmark_scripts/rocFFT_scripts/include)
-	target_link_libraries(rocFFT_scripts PRIVATE hip::host roc::rocfft)
+	target_link_libraries(rocFFT_scripts PRIVATE hip::host hip::hipfft)
 	target_link_libraries(${PROJECT_NAME} PUBLIC rocFFT_scripts)
 endif()
diff --git a/Vulkan_FFT.cpp b/Vulkan_FFT.cpp
@@ -439,7 +439,7 @@ int main(int argc, char* argv[])
 		version_decomposed[0] = version / 10000;
 		version_decomposed[1] = (version - version_decomposed[0] * 10000) / 100;
 		version_decomposed[2] = (version - version_decomposed[0] * 10000 - version_decomposed[1] * 100);
-		printf("VkFFT v%d.%d.%d (19-10-2021). Author: Tolmachev Dmitrii\n", version_decomposed[0], version_decomposed[1], version_decomposed[2]);
+		printf("VkFFT v%d.%d.%d (19-11-2021). Author: Tolmachev Dmitrii\n", version_decomposed[0], version_decomposed[1], version_decomposed[2]);
 #if (VKFFT_BACKEND==0)
 		printf("Vulkan backend\n");
 #elif (VKFFT_BACKEND==1)

diff --git a/half_lib/half.hpp b/half_lib/half.hpp
@@ -1,6 +1,6 @@
 // half - IEEE 754-based half-precision floating-point library.
 //
-// Copyright (c) 2012-2019 Christian Rau <[email protected]>
+// Copyright (c) 2012-2021 Christian Rau <[email protected]>
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation 
 // files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
@@ -14,7 +14,7 @@
 // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 
 // ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
-// Version 2.1.0
+// Version 2.2.0
 
 /// \file
 /// Main header file for half-precision functionality.
@@ -266,9 +266,6 @@
 #if HALF_ENABLE_CPP11_HASH
 	#include <functional>
 #endif
-#if HALF_ENABLE_F16C_INTRINSICS
-	#include <immintrin.h>
-#endif
 
 
 #ifndef HALF_ENABLE_F16C_INTRINSICS
@@ -280,6 +277,9 @@
 	/// Unless predefined it will be enabled automatically when the `__F16C__` symbol is defined, which some compilers do on supporting platforms.
 	#define HALF_ENABLE_F16C_INTRINSICS __F16C__
 #endif
+#if HALF_ENABLE_F16C_INTRINSICS
+	#include <immintrin.h>
+#endif
 
 #ifdef HALF_DOXYGEN_ONLY
 /// Type for internal floating-point computations.
@@ -869,12 +869,12 @@ namespace half_float
 
 		/// Convert fixed point to half-precision floating-point.
 		/// \tparam R rounding mode to use
-		/// \tparam F number of fractional bits (at least 11)
+		/// \tparam F number of fractional bits in [11,31]
 		/// \tparam S `true` for signed, `false` for unsigned
 		/// \tparam N `true` for additional normalization step, `false` if already normalized to 1.F
 		/// \tparam I `true` to always raise INEXACT exception, `false` to raise only for rounded results
 		/// \param m mantissa in Q1.F fixed point format
-		/// \param exp exponent
+		/// \param exp biased exponent - 1
 		/// \param sign half-precision value with sign bit only
 		/// \param s sticky bit (or of all but the most significant already discarded bits)
 		/// \return value converted to half-precision
@@ -1676,34 +1676,34 @@ namespace half_float
 
 		/// Postprocessing for binary exponential.
 		/// \tparam R rounding mode to use
-		/// \tparam I `true` to always raise INEXACT exception, `false` to raise only for rounded results
-		/// \param m mantissa as Q1.31
+		/// \param m fractional part of as Q0.31
 		/// \param exp absolute value of unbiased exponent
 		/// \param esign sign of actual exponent
 		/// \param sign sign bit of result
+		/// \param n number of BKM iterations (at most 32)
 		/// \return value converted to half-precision
 		/// \exception FE_OVERFLOW on overflows
 		/// \exception FE_UNDERFLOW on underflows
 		/// \exception FE_INEXACT if value had to be rounded or \a I is `true`
-		template<std::float_round_style R,bool I> unsigned int exp2_post(uint32 m, int exp, bool esign, unsigned int sign = 0)
+		template<std::float_round_style R> unsigned int exp2_post(uint32 m, int exp, bool esign, unsigned int sign = 0, unsigned int n = 32)
 		{
-			int s = 0;
 			if(esign)
 			{
-				if(m > 0x80000000)
-				{
-					m = divide64(0x80000000, m, s);
-					++exp;
-				}
-				if(exp > 25)
+				exp = -exp - (m!=0);
+				if(exp < -25)
 					return underflow<R>(sign);
-				else if(exp == 25)
-					return rounded<R,I>(sign, 1, (m&0x7FFFFFFF)!=0);
-				exp = -exp;
+				else if(exp == -25)
+					return rounded<R,false>(sign, 1, m!=0);
 			}
 			else if(exp > 15)
 				return overflow<R>(sign);
-			return fixed2half<R,31,false,false,I>(m, exp+14, sign, s);
+			if(!m)
+				return sign | (((exp+=15)>0) ? (exp<<10) : check_underflow(0x200>>-exp));
+			m = exp2(m, n);
+			int s = 0;
+			if(esign)
+				m = divide64(0x80000000, m, s);
+			return fixed2half<R,31,false,false,true>(m, exp+14, sign, s);
 		}
 
 		/// Postprocessing for binary logarithm.
@@ -1737,7 +1737,7 @@ namespace half_float
 		/// Hypotenuse square root and postprocessing.
 		/// \tparam R rounding mode to use
 		/// \param r mantissa as Q2.30
-		/// \param exp unbiased exponent
+		/// \param exp biased exponent
 		/// \return square root converted to half-precision
 		/// \exception FE_OVERFLOW on overflows
 		/// \exception FE_UNDERFLOW on underflows
@@ -2201,6 +2201,7 @@ namespace half_float
 		friend half log2(half);
 		friend half log1p(half);
 		friend half sqrt(half);
+		friend half rsqrt(half);
 		friend half cbrt(half);
 		friend half hypot(half, half);
 		friend half hypot(half, half, half);
@@ -2937,15 +2938,14 @@ namespace half_float
 	#ifdef HALF_ARITHMETIC_TYPE
 		return half(detail::binary, detail::float2half<half::round_style>(std::exp(detail::half2float<detail::internal_t>(arg.data_))));
 	#else
-		int abs = arg.data_ & 0x7FFF;
+		int abs = arg.data_ & 0x7FFF, e = (abs>>10) + (abs<=0x3FF), exp;
 		if(!abs)
 			return half(detail::binary, 0x3C00);
 		if(abs >= 0x7C00)
 			return half(detail::binary, (abs==0x7C00) ? (0x7C00&((arg.data_>>15)-1U)) : detail::signal(arg.data_));
 		if(abs >= 0x4C80)
 			return half(detail::binary, (arg.data_&0x8000) ? detail::underflow<half::round_style>() : detail::overflow<half::round_style>());
 		detail::uint32 m = detail::multiply64(static_cast<detail::uint32>((abs&0x3FF)+((abs>0x3FF)<<10))<<21, 0xB8AA3B29);
-		int e = (abs>>10) + (abs<=0x3FF), exp;
 		if(e < 14)
 		{
 			exp = 0;
@@ -2956,7 +2956,7 @@ namespace half_float
 			exp = m >> (45-e);
 			m = (m<<(e-14)) & 0x7FFFFFFF;
 		}
-		return half(detail::binary, detail::exp2_post<half::round_style,true>(detail::exp2(m, 26), exp, (arg.data_&0x8000)!=0));
+		return half(detail::binary, detail::exp2_post<half::round_style>(m, exp, (arg.data_&0x8000)!=0, 0, 26));
 	#endif
 	}
 
@@ -2973,25 +2973,15 @@ namespace half_float
 	#if defined(HALF_ARITHMETIC_TYPE) && HALF_ENABLE_CPP11_CMATH
 		return half(detail::binary, detail::float2half<half::round_style>(std::exp2(detail::half2float<detail::internal_t>(arg.data_))));
 	#else
-		int abs = arg.data_ & 0x7FFF;
+		int abs = arg.data_ & 0x7FFF, e = (abs>>10) + (abs<=0x3FF), exp = (abs&0x3FF) + ((abs>0x3FF)<<10);
 		if(!abs)
 			return half(detail::binary, 0x3C00);
 		if(abs >= 0x7C00)
 			return half(detail::binary, (abs==0x7C00) ? (0x7C00&((arg.data_>>15)-1U)) : detail::signal(arg.data_));
 		if(abs >= 0x4E40)
 			return half(detail::binary, (arg.data_&0x8000) ? detail::underflow<half::round_style>() : detail::overflow<half::round_style>());
-		int e = (abs>>10) + (abs<=0x3FF), exp = (abs&0x3FF) + ((abs>0x3FF)<<10);
-		detail::uint32 m = detail::exp2((static_cast<detail::uint32>(exp)<<(6+e))&0x7FFFFFFF, 28);
-		exp >>= 25 - e;
-		if(m == 0x80000000)
-		{
-			if(arg.data_&0x8000)
-				exp = -exp;
-			else if(exp > 15)
-				return half(detail::binary, detail::overflow<half::round_style>());
-			return half(detail::binary, detail::fixed2half<half::round_style,31,false,false,false>(m, exp+14));
-		}
-		return half(detail::binary, detail::exp2_post<half::round_style,true>(m, exp, (arg.data_&0x8000)!=0));
+		return half(detail::binary, detail::exp2_post<half::round_style>(
+			(static_cast<detail::uint32>(exp)<<(6+e))&0x7FFFFFFF, exp>>(25-e), (arg.data_&0x8000)!=0, 0, 28));
 	#endif
 	}
 
@@ -3009,15 +2999,14 @@ namespace half_float
 	#if defined(HALF_ARITHMETIC_TYPE) && HALF_ENABLE_CPP11_CMATH
 		return half(detail::binary, detail::float2half<half::round_style>(std::expm1(detail::half2float<detail::internal_t>(arg.data_))));
 	#else
-		unsigned int abs = arg.data_ & 0x7FFF, sign = arg.data_ & 0x8000;
+		unsigned int abs = arg.data_ & 0x7FFF, sign = arg.data_ & 0x8000, e = (abs>>10) + (abs<=0x3FF), exp;
 		if(!abs)
 			return arg;
 		if(abs >= 0x7C00)
 			return half(detail::binary, (abs==0x7C00) ? (0x7C00+(sign>>1)) : detail::signal(arg.data_));
 		if(abs >= 0x4A00)
 			return half(detail::binary, (arg.data_&0x8000) ? detail::rounded<half::round_style,true>(0xBBFF, 1, 1) : detail::overflow<half::round_style>());
 		detail::uint32 m = detail::multiply64(static_cast<detail::uint32>((abs&0x3FF)+((abs>0x3FF)<<10))<<21, 0xB8AA3B29);
-		int e = (abs>>10) + (abs<=0x3FF), exp;
 		if(e < 14)
 		{
 			exp = 0;
@@ -3213,7 +3202,7 @@ namespace half_float
 	/// \param arg function argument
 	/// \return square root of \a arg
 	/// \exception FE_INVALID for signaling NaN and negative arguments
-	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	/// \exception FE_INEXACT according to rounding
 	inline half sqrt(half arg)
 	{
 	#ifdef HALF_ARITHMETIC_TYPE
@@ -3228,14 +3217,50 @@ namespace half_float
 	#endif
 	}
 
+	/// Inverse square root.
+	/// This function is exact to rounding for all rounding modes and thus generally more accurate than directly computing 
+	/// 1 / sqrt(\a arg) in half-precision, in addition to also being faster.
+	/// \param arg function argument
+	/// \return reciprocal of square root of \a arg
+	/// \exception FE_INVALID for signaling NaN and negative arguments
+	/// \exception FE_INEXACT according to rounding
+	inline half rsqrt(half arg)
+	{
+	#ifdef HALF_ARITHMETIC_TYPE
+		return half(detail::binary, detail::float2half<half::round_style>(detail::internal_t(1)/std::sqrt(detail::half2float<detail::internal_t>(arg.data_))));
+	#else
+		unsigned int abs = arg.data_ & 0x7FFF, bias = 0x4000;
+		if(!abs || arg.data_ >= 0x7C00)
+			return half(detail::binary,	(abs>0x7C00) ? detail::signal(arg.data_) : (arg.data_>0x8000) ?
+										detail::invalid() : !abs ? detail::pole(arg.data_&0x8000) : 0);
+		for(; abs<0x400; abs<<=1,bias-=0x400) ;
+		unsigned int frac = (abs+=bias) & 0x7FF;
+		if(frac == 0x400)
+			return half(detail::binary, 0x7A00-(abs>>1));
+		if((half::round_style == std::round_to_nearest && (frac == 0x3FE || frac == 0x76C)) ||
+		   (half::round_style != std::round_to_nearest && (frac == 0x15A || frac == 0x3FC || frac == 0x401 || frac == 0x402 || frac == 0x67B)))
+			return pow(arg, half(detail::binary, 0xB800));
+		detail::uint32 f = 0x17376 - abs, mx = (abs&0x3FF) | 0x400, my = ((f>>1)&0x3FF) | 0x400, mz = my * my;
+		int expy = (f>>11) - 31, expx = 32 - (abs>>10), i = mz >> 21;
+		for(mz=0x60000000-(((mz>>i)*mx)>>(expx-2*expy-i)); mz<0x40000000; mz<<=1,--expy) ;
+		i = (my*=mz>>10) >> 31;
+		expy += i;
+		my = (my>>(20+i)) + 1;
+		i = (mz=my*my) >> 21;
+		for(mz=0x60000000-(((mz>>i)*mx)>>(expx-2*expy-i)); mz<0x40000000; mz<<=1,--expy) ;
+		i = (my*=(mz>>10)+1) >> 31;
+		return half(detail::binary, detail::fixed2half<half::round_style,30,false,false,true>(my>>i, expy+i+14));
+	#endif
+	}
+
 	/// Cubic root.
 	/// This function is exact to rounding for all rounding modes.
 	///
 	/// **See also:** Documentation for [std::cbrt](https://en.cppreference.com/w/cpp/numeric/math/cbrt).
 	/// \param arg function argument
 	/// \return cubic root of \a arg
 	/// \exception FE_INVALID for signaling NaN
-	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	/// \exception FE_INEXACT according to rounding
 	inline half cbrt(half arg)
 	{
 	#if defined(HALF_ARITHMETIC_TYPE) && HALF_ENABLE_CPP11_CMATH
@@ -3419,12 +3444,13 @@ namespace half_float
 			return half(detail::binary, detail::invalid());
 		if(x.data_ == 0xBC00)
 			return half(detail::binary, sign|0x3C00);
-		if(y.data_ == 0x3800)
-			return sqrt(x);
-		if(y.data_ == 0x3C00)
-			return half(detail::binary, detail::check_underflow(x.data_));
-		if(y.data_ == 0x4000)
-			return x * x;
+		switch(y.data_)
+		{
+			case 0x3800: return sqrt(x);
+			case 0x3C00: return half(detail::binary, detail::check_underflow(x.data_));
+			case 0x4000: return x * x;
+			case 0xBC00: return half(detail::binary, 0x3C00) / x;
+		}
 		for(; absx<0x400; absx<<=1,--exp) ;
 		detail::uint32 ilog = exp + (absx>>10), msign = detail::sign_mask(ilog), f, m = 
 			(((ilog<<27)+((detail::log2(static_cast<detail::uint32>((absx&0x3FF)|0x400)<<20)+8)>>4))^msign) - msign;
@@ -3444,7 +3470,7 @@ namespace half_float
 			f = (m<<exp) & 0x7FFFFFFF;
 			exp = m >> (31-exp);
 		}
-		return half(detail::binary, detail::exp2_post<half::round_style,false>(detail::exp2(f), exp, ((msign&1)^(y.data_>>15))!=0, sign));
+		return half(detail::binary, detail::exp2_post<half::round_style>(f, exp, ((msign&1)^(y.data_>>15))!=0, sign));
 	#endif
 	}