// src/jit/stubs/semifloat.c

#include "freestanding.h"

#ifdef __wasm__
#include <wasm_simd128.h>
#else
#include "wasm_simd128_polyfill.h"
#endif

// Rounding-error estimate for c = a + b, per f64 lane.
// Evaluates (a - (c - b)) + (b - (c - a)); the nextafter_* helpers only
// compare the result against 0.0.
static inline v128_t sum_residue(v128_t a, v128_t b, v128_t c) {
	v128_t c_minus_b = wasm_f64x2_sub(c, b);
	v128_t c_minus_a = wasm_f64x2_sub(c, a);
	v128_t err_a = wasm_f64x2_sub(a, c_minus_b);
	v128_t err_b = wasm_f64x2_sub(b, c_minus_a);
	return wasm_f64x2_add(err_a, err_b);
}

// preconditions
// - no NaNs, no subnormals
// - source registers: always finite
// 1. finite - no infinities (register type F, additive instructions)
// 2. zero   - no zero, no negatives, positive infinity (register type E, multiplicative instructions)

// `c` can be zero, when zero is generated it is ALWAYS a perfectly rounded operation (generated by x - x == 0)
// that means there is no need for special cases, as it'll be masked out by a (false) `to_round`

// toward negative infinity
//
// Lanes with a negative residue `res` are moved one step down by editing the
// raw 64-bit pattern of `c`; every other lane of `c` is returned unchanged.
static inline v128_t nextafter_1_finite(v128_t res, v128_t c) {
	v128_t zero = wasm_f64x2_const(0.0, 0.0);
	v128_t needs_step = wasm_f64x2_lt(res, zero);

	// the compare mask is all-ones (-1) where c > 0.0, so OR-ing in 1 gives
	// a step of -1 for positive c and +1 for every other lane
	v128_t c_positive = wasm_f64x2_gt(c, zero);
	v128_t step = wasm_v128_or(c_positive, wasm_i64x2_const(1, 1));
	v128_t stepped = wasm_i64x2_add(c, step);

	return wasm_v128_bitselect(stepped, c, needs_step);
}

// toward positive infinity
//
// Lanes with a positive residue `res` are moved one step up by editing the
// raw 64-bit pattern of `c`; every other lane of `c` is returned unchanged.
static inline v128_t nextafter_2_finite(v128_t res, v128_t c) {
	v128_t zero = wasm_f64x2_const(0.0, 0.0);
	v128_t needs_step = wasm_f64x2_gt(res, zero);

	// the compare mask is all-ones (-1) where c < 0.0, so OR-ing in 1 gives
	// a step of -1 for negative c and +1 for every other lane
	v128_t c_negative = wasm_f64x2_lt(c, zero);
	v128_t step = wasm_v128_or(c_negative, wasm_i64x2_const(1, 1));
	v128_t stepped = wasm_i64x2_add(c, step);

	return wasm_v128_bitselect(stepped, c, needs_step);
}

// toward zero
//
// A lane is adjusted only when `res` and `c` have strictly opposite signs;
// the adjustment subtracts 1 from the raw 64-bit pattern of `c`.
static inline v128_t nextafter_3_finite(v128_t res, v128_t c) {
	v128_t zero = wasm_f64x2_const(0.0, 0.0);

	v128_t res_pos = wasm_f64x2_gt(res, zero);
	v128_t c_neg = wasm_f64x2_lt(c, zero);
	v128_t res_neg = wasm_f64x2_lt(res, zero);
	v128_t c_pos = wasm_f64x2_gt(c, zero);

	// (res > 0.0 && c < 0.0) || (res < 0.0 && c > 0.0)
	v128_t opposite_signs = wasm_v128_or(wasm_v128_and(res_pos, c_neg), wasm_v128_and(res_neg, c_pos));

	v128_t shrunk = wasm_i64x2_sub(c, wasm_i64x2_const(1, 1));
	return wasm_v128_bitselect(shrunk, c, opposite_signs);
}

// toward negative infinity
//
// Selects the stepped bit pattern in lanes where `res` < 0.0 and the
// untouched `c` elsewhere.
static inline v128_t nextafter_1_nozero(v128_t res, v128_t c) {
	v128_t zero = wasm_f64x2_const(0.0, 0.0);

	// step is -1 where c > 0.0 (all-ones mask | 1), otherwise +1
	v128_t step = wasm_v128_or(wasm_f64x2_gt(c, zero), wasm_i64x2_const(1, 1));
	v128_t stepped = wasm_i64x2_add(c, step);

	v128_t res_negative = wasm_f64x2_lt(res, zero);
	return wasm_v128_bitselect(stepped, c, res_negative);
}

// toward positive infinity
//
// Selects the stepped bit pattern in lanes where `res` > 0.0 and the
// untouched `c` elsewhere.
static inline v128_t nextafter_2_nozero(v128_t res, v128_t c) {
	v128_t zero = wasm_f64x2_const(0.0, 0.0);

	// step is -1 where c < 0.0 (all-ones mask | 1), otherwise +1
	v128_t step = wasm_v128_or(wasm_f64x2_lt(c, zero), wasm_i64x2_const(1, 1));
	v128_t stepped = wasm_i64x2_add(c, step);

	v128_t res_positive = wasm_f64x2_gt(res, zero);
	return wasm_v128_bitselect(stepped, c, res_positive);
}

// toward zero
//
// In lanes where `res` < 0.0 the raw 64-bit pattern of `c` is decremented by
// one; other lanes return `c` as given.
static inline v128_t nextafter_3_nozero(v128_t res, v128_t c) {
	v128_t res_negative = wasm_f64x2_lt(res, wasm_f64x2_const(0.0, 0.0));
	v128_t one = wasm_i64x2_const(1, 1);
	v128_t shrunk = wasm_i64x2_sub(c, one);

	return wasm_v128_bitselect(shrunk, c, res_negative);
}

// dest + src per f64 lane, with no rounding correction applied.
WASM_EXPORT("fadd_0")
v128_t fadd_0(v128_t dest, v128_t src) {
	return wasm_f64x2_add(dest, src);
}

// dest + src per f64 lane, corrected toward negative infinity.
WASM_EXPORT("fadd_1")
v128_t fadd_1(v128_t dest, v128_t src) {
	v128_t sum = wasm_f64x2_add(dest, src);
	return nextafter_1_finite(sum_residue(dest, src, sum), sum);
}

// dest + src per f64 lane, corrected toward positive infinity.
WASM_EXPORT("fadd_2")
v128_t fadd_2(v128_t dest, v128_t src) {
	v128_t sum = wasm_f64x2_add(dest, src);
	return nextafter_2_finite(sum_residue(dest, src, sum), sum);
}

// dest + src per f64 lane, corrected toward zero.
WASM_EXPORT("fadd_3")
v128_t fadd_3(v128_t dest, v128_t src) {
	v128_t sum = wasm_f64x2_add(dest, src);
	return nextafter_3_finite(sum_residue(dest, src, sum), sum);
}

// dest - src per f64 lane, with no rounding correction applied.
WASM_EXPORT("fsub_0")
v128_t fsub_0(v128_t dest, v128_t src) {
	return wasm_f64x2_sub(dest, src);
}

// dest - src per f64 lane, corrected toward negative infinity.
// The residue is taken as for the sum dest + (-src).
WASM_EXPORT("fsub_1")
v128_t fsub_1(v128_t dest, v128_t src) {
	v128_t diff = wasm_f64x2_sub(dest, src);
	v128_t neg_src = wasm_f64x2_neg(src);
	return nextafter_1_finite(sum_residue(dest, neg_src, diff), diff);
}

// dest - src per f64 lane, corrected toward positive infinity.
// The residue is taken as for the sum dest + (-src).
WASM_EXPORT("fsub_2")
v128_t fsub_2(v128_t dest, v128_t src) {
	v128_t diff = wasm_f64x2_sub(dest, src);
	v128_t neg_src = wasm_f64x2_neg(src);
	return nextafter_2_finite(sum_residue(dest, neg_src, diff), diff);
}

// dest - src per f64 lane, corrected toward zero.
// The residue is taken as for the sum dest + (-src).
WASM_EXPORT("fsub_3")
v128_t fsub_3(v128_t dest, v128_t src) {
	v128_t diff = wasm_f64x2_sub(dest, src);
	v128_t neg_src = wasm_f64x2_neg(src);
	return nextafter_3_finite(sum_residue(dest, neg_src, diff), diff);
}

// Returns p + (x - p) with p = x * 134217729.0 (2^27 + 1), per f64 lane.
// mul_residue uses the result as the high part of x and x minus it as the
// low part.
static inline v128_t upper_half(v128_t x) {
	v128_t scaled = wasm_f64x2_mul(x, wasm_f64x2_const(134217729.0, 134217729.0));
	v128_t diff = wasm_f64x2_sub(x, scaled);
	return wasm_f64x2_add(scaled, diff);
}

// Estimates a*b - c per f64 lane without a fused multiply-add.
// Both factors are split with upper_half(), the four partial products are
// accumulated into a leading term plus an error term, and c is subtracted
// from the leading term before the error term is added back.
static inline v128_t mul_residue(v128_t a, v128_t b, v128_t c) {
	v128_t a_hi = upper_half(a);
	v128_t b_hi = upper_half(b);
	v128_t a_lo = wasm_f64x2_sub(a, a_hi);
	v128_t b_lo = wasm_f64x2_sub(b, b_hi);

	v128_t hi_hi = wasm_f64x2_mul(a_hi, b_hi);
	v128_t hi_lo = wasm_f64x2_mul(a_hi, b_lo);
	v128_t lo_hi = wasm_f64x2_mul(a_lo, b_hi);
	v128_t lo_lo = wasm_f64x2_mul(a_lo, b_lo);
	v128_t cross = wasm_f64x2_add(hi_lo, lo_hi);

	// leading part of the product and what was lost while forming it
	v128_t prod = wasm_f64x2_add(hi_hi, cross);
	v128_t prod_err = wasm_f64x2_add(wasm_f64x2_sub(hi_hi, prod), cross);
	prod_err = wasm_f64x2_add(prod_err, lo_lo);

	v128_t lead_minus_c = wasm_f64x2_sub(prod, c); // a*b - c
	return wasm_f64x2_add(prod_err, lead_minus_c);
}

// Residue a*b - c per f64 lane, computed with one (relaxed) fused multiply-add.
//
// wasm_f64x2_relaxed_madd(x, y, z) evaluates x*y + z, so the two factors go
// first and the negated `c` is the addend. Passing (-c, b, a) instead would
// yield a - c*b, which is not the residue the fmul/fdiv/fsqrt *_fma_* callers
// expect.
static inline v128_t mul_residue_fma(v128_t a, v128_t b, v128_t c) {
	return wasm_f64x2_relaxed_madd(a, b, wasm_f64x2_neg(c)); // a*b - c
}

// reference:
// f64x2            [  mant    exp+s ][  mant    exp+s ]
// i32x4            [  lo   ][  hi   ][  lo   ][  hi   ]
// wasm_i32x4_const(         ,        ,        ,        )

// assume no NaN, subnormal, inf, or zero
//
// Bit-level frexp on both f64 lanes.
// Returns `x` with its exponent field overwritten by 0x3fe (biased 1022);
// sign and mantissa bits are kept.
// `*eptr` receives the original exponent field minus 1022. The value stays
// in place at bit 52 of each 64-bit lane; it is NOT shifted down to a plain
// integer, which is the form ldexp_reg_e_nozero_noinf takes.
static inline v128_t frexp_reg_e_nozero_noinf(v128_t x, v128_t *eptr) {
	v128_t exp_mask = wasm_i32x4_const(0, 0x7ff00000, 0, 0x7ff00000);
	v128_t keep_mask = wasm_i32x4_const(0xFFFFFFFF, 0x800fffff, 0xFFFFFFFF, 0x800fffff);
	v128_t exp_1022 = wasm_i32x4_const(0, 0x3fe00000, 0, 0x3fe00000);
	v128_t bias = wasm_i64x2_const(UINT64_C(1022) << 52, UINT64_C(1022) << 52);

	*eptr = wasm_i64x2_sub(wasm_v128_and(x, exp_mask), bias);

	return wasm_v128_or(wasm_v128_and(x, keep_mask), exp_1022);
}

// assume no NaN, subnormal, inf, or zero
//
// Bit-level ldexp on both f64 lanes.
// `n` is an exponent delta already positioned at bit 52 of each 64-bit lane
// (the form produced by frexp_reg_e_nozero_noinf). It is added to the
// exponent field of `x`, and the sum is OR-ed back onto the sign and mantissa
// bits of `x`.
static inline v128_t ldexp_reg_e_nozero_noinf(v128_t x, v128_t n) {
	v128_t exp_mask = wasm_i32x4_const(0, 0x7ff00000, 0, 0x7ff00000);
	v128_t keep_mask = wasm_i32x4_const(0xFFFFFFFF, 0x800fffff, 0xFFFFFFFF, 0x800fffff);

	v128_t new_exp = wasm_i64x2_add(wasm_v128_and(x, exp_mask), n);
	v128_t sign_mant = wasm_v128_and(x, keep_mask);

	return wasm_v128_or(sign_mant, new_exp);
}

// infinities absolutely unlikely. 0.003414% of all cases
// TODO: it seems like this branch doesn't do much to influence the performance

// dest * src per f64 lane, with no rounding correction applied.
WASM_EXPORT("fmul_0")
v128_t fmul_0(v128_t dest, v128_t src) {
	return wasm_f64x2_mul(dest, src);
}

// dest * src per f64 lane, corrected toward negative infinity (non-FMA path).
// Operands and product are rescaled to a common exponent before the residue
// is taken with mul_residue.
WASM_EXPORT("fmul_1")
v128_t fmul_1(v128_t dest, v128_t src) {
	v128_t inf = wasm_f64x2_const(INFINITY, INFINITY);
	v128_t product = wasm_f64x2_mul(dest, src);

	v128_t exp_dest, exp_src;
	v128_t dest_m = frexp_reg_e_nozero_noinf(dest, &exp_dest);
	v128_t src_m = frexp_reg_e_nozero_noinf(src, &exp_src);
	v128_t unscale = wasm_i64x2_sub(wasm_i64x2_neg(exp_dest), exp_src); // -exp_dest - exp_src
	v128_t product_m = ldexp_reg_e_nozero_noinf(product, unscale);
	v128_t res = mul_residue(dest_m, src_m, product_m);

	v128_t overflowed = wasm_f64x2_eq(product, inf);

	if (unlikely(wasm_v128_any_true(overflowed))) {
		// For +inf products the computed residue is replaced by a fixed one:
		//   dest finite (fin * fin overflowed): -1.0, so the result is stepped down
		//   dest already +inf (inf * fin):       1.0, so the result stays +inf
		v128_t dest_finite = wasm_f64x2_ne(dest, inf);
		v128_t forced = wasm_v128_bitselect(wasm_f64x2_const(-1.0, -1.0), wasm_f64x2_const(1.0, 1.0), dest_finite);
		res = wasm_v128_bitselect(forced, res, overflowed);
	}

	return nextafter_1_nozero(res, product);
}

// dest * src per f64 lane, corrected toward positive infinity (non-FMA path).
WASM_EXPORT("fmul_2")
v128_t fmul_2(v128_t dest, v128_t src) {
	v128_t product = wasm_f64x2_mul(dest, src);

	v128_t exp_dest, exp_src;
	v128_t dest_m = frexp_reg_e_nozero_noinf(dest, &exp_dest);
	v128_t src_m = frexp_reg_e_nozero_noinf(src, &exp_src);
	v128_t unscale = wasm_i64x2_sub(wasm_i64x2_neg(exp_dest), exp_src); // -exp_dest - exp_src
	v128_t product_m = ldexp_reg_e_nozero_noinf(product, unscale);
	v128_t res = mul_residue(dest_m, src_m, product_m);

	v128_t overflowed = wasm_f64x2_eq(product, wasm_f64x2_const(INFINITY, INFINITY));

	if (unlikely(wasm_v128_any_true(overflowed))) {
		// fin * fin = inf and inf * fin = inf both keep +inf in this mode:
		// a residue of -1.0 never satisfies the res > 0.0 test in
		// nextafter_2_nozero, so no step is taken
		res = wasm_v128_bitselect(wasm_f64x2_const(-1.0, -1.0), res, overflowed);
	}

	return nextafter_2_nozero(res, product);
}

// dest * src per f64 lane, corrected toward zero (non-FMA path).
WASM_EXPORT("fmul_3")
v128_t fmul_3(v128_t dest, v128_t src) {
	v128_t inf = wasm_f64x2_const(INFINITY, INFINITY);
	v128_t product = wasm_f64x2_mul(dest, src);

	v128_t exp_dest, exp_src;
	v128_t dest_m = frexp_reg_e_nozero_noinf(dest, &exp_dest);
	v128_t src_m = frexp_reg_e_nozero_noinf(src, &exp_src);
	v128_t unscale = wasm_i64x2_sub(wasm_i64x2_neg(exp_dest), exp_src); // -exp_dest - exp_src
	v128_t product_m = ldexp_reg_e_nozero_noinf(product, unscale);
	v128_t res = mul_residue(dest_m, src_m, product_m);

	v128_t overflowed = wasm_f64x2_eq(product, inf);

	if (unlikely(wasm_v128_any_true(overflowed))) {
		// For +inf products the computed residue is replaced by a fixed one:
		//   dest finite (fin * fin overflowed): -1.0, so the result is stepped down
		//   dest already +inf (inf * fin):       0.0, so the result stays +inf
		v128_t dest_finite = wasm_f64x2_ne(dest, inf);
		v128_t forced = wasm_v128_bitselect(wasm_f64x2_const(-1.0, -1.0), wasm_f64x2_const(0.0, 0.0), dest_finite);
		res = wasm_v128_bitselect(forced, res, overflowed);
	}

	return nextafter_3_nozero(res, product);
}

// dest * src per f64 lane, corrected toward negative infinity; the residue
// comes from mul_residue_fma.
WASM_EXPORT("fmul_fma_1")
v128_t fmul_fma_1(v128_t dest, v128_t src) {
	v128_t product = wasm_f64x2_mul(dest, src);
	return nextafter_1_nozero(mul_residue_fma(dest, src, product), product);
}

// dest * src per f64 lane, corrected toward positive infinity; the residue
// comes from mul_residue_fma.
WASM_EXPORT("fmul_fma_2")
v128_t fmul_fma_2(v128_t dest, v128_t src) {
	v128_t product = wasm_f64x2_mul(dest, src);
	return nextafter_2_nozero(mul_residue_fma(dest, src, product), product);
}

// dest * src per f64 lane, corrected toward zero; the residue comes from
// mul_residue_fma.
WASM_EXPORT("fmul_fma_3")
v128_t fmul_fma_3(v128_t dest, v128_t src) {
	v128_t product = wasm_f64x2_mul(dest, src);
	return nextafter_3_nozero(mul_residue_fma(dest, src, product), product);
}

// dest / src per f64 lane, with no rounding correction applied.
WASM_EXPORT("fdiv_0")
v128_t fdiv_0(v128_t dest, v128_t src) {
	return wasm_f64x2_div(dest, src);
}

// dest / src per f64 lane, corrected toward negative infinity (non-FMA path).
// The residue is quotient * src - dest on rescaled values; it is negated
// before being handed to the nextafter helper.
WASM_EXPORT("fdiv_1")
v128_t fdiv_1(v128_t dest, v128_t src) {
	v128_t inf = wasm_f64x2_const(INFINITY, INFINITY);
	v128_t quotient = wasm_f64x2_div(dest, src);

	v128_t exp_dest, exp_src;
	v128_t dest_m = frexp_reg_e_nozero_noinf(dest, &exp_dest);
	v128_t src_m = frexp_reg_e_nozero_noinf(src, &exp_src);
	v128_t unscale = wasm_i64x2_add(wasm_i64x2_neg(exp_dest), exp_src); // -exp_dest + exp_src
	v128_t quotient_m = ldexp_reg_e_nozero_noinf(quotient, unscale);
	v128_t res = mul_residue(quotient_m, src_m, dest_m);

	v128_t overflowed = wasm_f64x2_eq(quotient, inf);

	if (unlikely(wasm_v128_any_true(overflowed))) {
		// For +inf quotients the computed residue is replaced by a fixed one
		// (it is negated below):
		//   dest finite (fin / fin overflowed): 1.0, so the result is stepped down
		//   dest already +inf (inf / fin):      0.0, so the result stays +inf
		v128_t dest_finite = wasm_f64x2_ne(dest, inf);
		v128_t forced = wasm_v128_bitselect(wasm_f64x2_const(1.0, 1.0), wasm_f64x2_const(0.0, 0.0), dest_finite);
		res = wasm_v128_bitselect(forced, res, overflowed);
	}

	return nextafter_1_nozero(wasm_f64x2_neg(res), quotient);
}

// dest / src per f64 lane, corrected toward positive infinity (non-FMA path).
WASM_EXPORT("fdiv_2")
v128_t fdiv_2(v128_t dest, v128_t src) {
	v128_t quotient = wasm_f64x2_div(dest, src);

	v128_t exp_dest, exp_src;
	v128_t dest_m = frexp_reg_e_nozero_noinf(dest, &exp_dest);
	v128_t src_m = frexp_reg_e_nozero_noinf(src, &exp_src);
	v128_t unscale = wasm_i64x2_add(wasm_i64x2_neg(exp_dest), exp_src); // -exp_dest + exp_src
	v128_t quotient_m = ldexp_reg_e_nozero_noinf(quotient, unscale);
	v128_t res = mul_residue(quotient_m, src_m, dest_m);

	v128_t overflowed = wasm_f64x2_eq(quotient, wasm_f64x2_const(INFINITY, INFINITY));

	if (unlikely(wasm_v128_any_true(overflowed))) {
		// fin / fin = inf and inf / fin = inf both keep +inf in this mode:
		// a residue of 0.0 (negated below) never satisfies the res > 0.0 test
		// in nextafter_2_nozero, so no step is taken
		res = wasm_v128_bitselect(wasm_f64x2_const(0.0, 0.0), res, overflowed);
	}

	return nextafter_2_nozero(wasm_f64x2_neg(res), quotient);
}

// dest / src per f64 lane, corrected toward zero (non-FMA path).
// The residue is quotient * src - dest on rescaled values; it is negated
// before being handed to the nextafter helper.
WASM_EXPORT("fdiv_3")
v128_t fdiv_3(v128_t dest, v128_t src) {
	v128_t inf = wasm_f64x2_const(INFINITY, INFINITY);
	v128_t quotient = wasm_f64x2_div(dest, src);

	v128_t exp_dest, exp_src;
	v128_t dest_m = frexp_reg_e_nozero_noinf(dest, &exp_dest);
	v128_t src_m = frexp_reg_e_nozero_noinf(src, &exp_src);
	v128_t unscale = wasm_i64x2_add(wasm_i64x2_neg(exp_dest), exp_src); // -exp_dest + exp_src
	v128_t quotient_m = ldexp_reg_e_nozero_noinf(quotient, unscale);
	v128_t res = mul_residue(quotient_m, src_m, dest_m);

	v128_t overflowed = wasm_f64x2_eq(quotient, inf);

	if (unlikely(wasm_v128_any_true(overflowed))) {
		// For +inf quotients the computed residue is replaced by a fixed one
		// (it is negated below):
		//   dest finite (fin / fin overflowed): 1.0, so the result is stepped down
		//   dest already +inf (inf / fin):      0.0, so the result stays +inf
		v128_t dest_finite = wasm_f64x2_ne(dest, inf);
		v128_t forced = wasm_v128_bitselect(wasm_f64x2_const(1.0, 1.0), wasm_f64x2_const(0.0, 0.0), dest_finite);
		res = wasm_v128_bitselect(forced, res, overflowed);
	}

	return nextafter_3_nozero(wasm_f64x2_neg(res), quotient);
}

// dest / src per f64 lane, corrected toward negative infinity.
// The residue quotient * src - dest comes from mul_residue_fma and is negated
// before the nextafter helper sees it.
WASM_EXPORT("fdiv_fma_1")
v128_t fdiv_fma_1(v128_t dest, v128_t src) {
	v128_t quotient = wasm_f64x2_div(dest, src);
	v128_t back_err = mul_residue_fma(quotient, src, dest);
	return nextafter_1_nozero(wasm_f64x2_neg(back_err), quotient);
}

// dest / src per f64 lane, corrected toward positive infinity.
// The residue quotient * src - dest comes from mul_residue_fma and is negated
// before the nextafter helper sees it.
WASM_EXPORT("fdiv_fma_2")
v128_t fdiv_fma_2(v128_t dest, v128_t src) {
	v128_t quotient = wasm_f64x2_div(dest, src);
	v128_t back_err = mul_residue_fma(quotient, src, dest);
	return nextafter_2_nozero(wasm_f64x2_neg(back_err), quotient);
}

// dest / src per f64 lane, corrected toward zero.
// The residue quotient * src - dest comes from mul_residue_fma and is negated
// before the nextafter helper sees it.
WASM_EXPORT("fdiv_fma_3")
v128_t fdiv_fma_3(v128_t dest, v128_t src) {
	v128_t quotient = wasm_f64x2_div(dest, src);
	v128_t back_err = mul_residue_fma(quotient, src, dest);
	return nextafter_3_nozero(wasm_f64x2_neg(back_err), quotient);
}

// sqrt(dest) per f64 lane, with no rounding correction applied.
WASM_EXPORT("fsqrt_0")
v128_t fsqrt_0(v128_t dest) {
	return wasm_f64x2_sqrt(dest);
}

// sqrt(dest) per f64 lane, corrected toward negative infinity (non-FMA path).
// The residue is root * root - dest on rescaled values; it is negated before
// being handed to the nextafter helper.
WASM_EXPORT("fsqrt_1")
v128_t fsqrt_1(v128_t dest) {
	v128_t root = wasm_f64x2_sqrt(dest);

	v128_t exp_root;
	v128_t root_m = frexp_reg_e_nozero_noinf(root, &exp_root);
	v128_t unscale = wasm_i64x2_mul(wasm_i64x2_const(-2, -2), exp_root); // -2 * exp_root
	v128_t dest_m = ldexp_reg_e_nozero_noinf(dest, unscale);
	v128_t res = mul_residue(root_m, root_m, dest_m);

	v128_t root_is_inf = wasm_f64x2_eq(root, wasm_f64x2_const(INFINITY, INFINITY));

	if (unlikely(wasm_v128_any_true(root_is_inf))) {
		// sqrt(inf) = inf: force a zero residue so the lane is not stepped
		res = wasm_v128_bitselect(wasm_f64x2_const(0.0, 0.0), res, root_is_inf);
	}

	return nextafter_1_nozero(wasm_f64x2_neg(res), root);
}

// sqrt(dest) per f64 lane, corrected toward positive infinity (non-FMA path).
// The residue is root * root - dest on rescaled values; it is negated before
// being handed to the nextafter helper.
WASM_EXPORT("fsqrt_2")
v128_t fsqrt_2(v128_t dest) {
	v128_t root = wasm_f64x2_sqrt(dest);

	v128_t exp_root;
	v128_t root_m = frexp_reg_e_nozero_noinf(root, &exp_root);
	v128_t unscale = wasm_i64x2_mul(wasm_i64x2_const(-2, -2), exp_root); // -2 * exp_root
	v128_t dest_m = ldexp_reg_e_nozero_noinf(dest, unscale);
	v128_t res = mul_residue(root_m, root_m, dest_m);

	v128_t root_is_inf = wasm_f64x2_eq(root, wasm_f64x2_const(INFINITY, INFINITY));

	if (unlikely(wasm_v128_any_true(root_is_inf))) {
		// sqrt(inf) = inf: force a zero residue so the lane is not stepped
		res = wasm_v128_bitselect(wasm_f64x2_const(0.0, 0.0), res, root_is_inf);
	}

	return nextafter_2_nozero(wasm_f64x2_neg(res), root);
}

// sqrt(dest) per f64 lane, corrected toward zero (non-FMA path).
// The residue is root * root - dest on rescaled values; it is negated before
// being handed to the nextafter helper.
WASM_EXPORT("fsqrt_3")
v128_t fsqrt_3(v128_t dest) {
	v128_t root = wasm_f64x2_sqrt(dest);

	v128_t exp_root;
	v128_t root_m = frexp_reg_e_nozero_noinf(root, &exp_root);
	v128_t unscale = wasm_i64x2_mul(wasm_i64x2_const(-2, -2), exp_root); // -2 * exp_root
	v128_t dest_m = ldexp_reg_e_nozero_noinf(dest, unscale);
	v128_t res = mul_residue(root_m, root_m, dest_m);

	v128_t root_is_inf = wasm_f64x2_eq(root, wasm_f64x2_const(INFINITY, INFINITY));

	if (unlikely(wasm_v128_any_true(root_is_inf))) {
		// sqrt(inf) = inf: force a zero residue so the lane is not stepped
		res = wasm_v128_bitselect(wasm_f64x2_const(0.0, 0.0), res, root_is_inf);
	}

	return nextafter_3_nozero(wasm_f64x2_neg(res), root);
}

// sqrt(dest) per f64 lane, corrected toward negative infinity.
// The residue root * root - dest comes from mul_residue_fma and is negated
// before the nextafter helper sees it.
WASM_EXPORT("fsqrt_fma_1")
v128_t fsqrt_fma_1(v128_t dest) {
	v128_t root = wasm_f64x2_sqrt(dest);
	v128_t back_err = mul_residue_fma(root, root, dest);
	return nextafter_1_nozero(wasm_f64x2_neg(back_err), root);
}

// sqrt(dest) per f64 lane, corrected toward positive infinity.
// The residue root * root - dest comes from mul_residue_fma and is negated
// before the nextafter helper sees it.
WASM_EXPORT("fsqrt_fma_2")
v128_t fsqrt_fma_2(v128_t dest) {
	v128_t root = wasm_f64x2_sqrt(dest);
	v128_t back_err = mul_residue_fma(root, root, dest);
	return nextafter_2_nozero(wasm_f64x2_neg(back_err), root);
}

// sqrt(dest) per f64 lane, corrected toward zero.
// The residue root * root - dest comes from mul_residue_fma and is negated
// before the nextafter helper sees it.
WASM_EXPORT("fsqrt_fma_3")
v128_t fsqrt_fma_3(v128_t dest) {
	v128_t root = wasm_f64x2_sqrt(dest);
	v128_t back_err = mul_residue_fma(root, root, dest);
	return nextafter_3_nozero(wasm_f64x2_neg(back_err), root);
}