
Commit 110b8d9

Merge pull request #627 from robertknight/simd-xor
Implement bitwise NOT and XOR in new SIMD API
2 parents: b373abd + e51d1cc

7 files changed: 198 additions, 31 deletions


rten-simd/src/safe.rs (10 additions, 1 deletion)
```diff
@@ -173,8 +173,17 @@ macro_rules! assert_simd_eq {
     };
 }
 
+/// Test that two [`Simd`] vectors are not equal according to a [`PartialEq`]
+/// comparison of their array representations.
+#[cfg(test)]
+macro_rules! assert_simd_ne {
+    ($x:expr, $y:expr) => {
+        assert_ne!($x.to_array(), $y.to_array());
+    };
+}
+
 #[cfg(test)]
-pub(crate) use assert_simd_eq;
+pub(crate) use {assert_simd_eq, assert_simd_ne};
 
 #[cfg(test)]
 mod tests {
```
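To make the new macro's behavior concrete, here is a self-contained sketch with a toy `Vec4` type standing in for a real [`Simd`] vector. `Vec4` is invented for this illustration and is not part of rten-simd; all the macro requires is a `to_array` method whose output supports `PartialEq`:

```rust
// Toy stand-in for a Simd vector (not part of rten-simd).
#[derive(Copy, Clone)]
struct Vec4([i32; 4]);

impl Vec4 {
    fn to_array(self) -> [i32; 4] {
        self.0
    }
}

// Same expansion as the macro added in this commit.
macro_rules! assert_simd_ne {
    ($x:expr, $y:expr) => {
        assert_ne!($x.to_array(), $y.to_array());
    };
}

fn main() {
    let x = Vec4([0, 1, 2, 3]);
    let y = Vec4([0, 1, 2, 4]);
    assert_simd_ne!(x, y); // passes: the vectors differ in the last lane
}
```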

rten-simd/src/safe/arch/aarch64.rs (28 additions, 6 deletions)
```diff
@@ -6,12 +6,12 @@ use std::arch::aarch64::{
     vcgeq_u16, vcgeq_u8, vcgtq_f32, vcgtq_s16, vcgtq_s32, vcgtq_s8, vcgtq_u16, vcgtq_u8, vcleq_f32,
     vcleq_s16, vcleq_s8, vcleq_u16, vcleq_u8, vcltq_f32, vcltq_s16, vcltq_s8, vcltq_u16, vcltq_u8,
     vcombine_s16, vcombine_u8, vcvtnq_s32_f32, vcvtq_s32_f32, vdivq_f32, vdupq_n_f32, vdupq_n_s16,
-    vdupq_n_s32, vdupq_n_s8, vdupq_n_u16, vdupq_n_u8, vfmaq_f32, vget_low_s16, vget_low_s8,
-    vld1q_f32, vld1q_s16, vld1q_s32, vld1q_s8, vld1q_u16, vld1q_u32, vld1q_u8, vmaxq_f32,
-    vminq_f32, vmovl_high_s16, vmovl_high_s8, vmovl_s16, vmovl_s8, vmulq_f32, vmulq_s16, vmulq_s32,
-    vmulq_s8, vmulq_u16, vmulq_u8, vnegq_f32, vnegq_s16, vnegq_s32, vnegq_s8, vqmovn_s32,
-    vqmovun_s16, vshlq_n_s16, vshlq_n_s32, vshlq_n_s8, vst1q_f32, vst1q_s16, vst1q_s32, vst1q_s8,
-    vst1q_u16, vst1q_u8, vsubq_f32, vsubq_s16, vsubq_s32, vsubq_s8, vsubq_u16, vsubq_u8,
+    vdupq_n_s32, vdupq_n_s8, vdupq_n_u16, vdupq_n_u8, veorq_u32, vfmaq_f32, vget_low_s16,
+    vget_low_s8, vld1q_f32, vld1q_s16, vld1q_s32, vld1q_s8, vld1q_u16, vld1q_u32, vld1q_u8,
+    vmaxq_f32, vminq_f32, vmovl_high_s16, vmovl_high_s8, vmovl_s16, vmovl_s8, vmulq_f32, vmulq_s16,
+    vmulq_s32, vmulq_s8, vmulq_u16, vmulq_u8, vmvnq_u32, vnegq_f32, vnegq_s16, vnegq_s32, vnegq_s8,
+    vqmovn_s32, vqmovun_s16, vshlq_n_s16, vshlq_n_s32, vshlq_n_s8, vst1q_f32, vst1q_s16, vst1q_s32,
+    vst1q_s8, vst1q_u16, vst1q_u8, vsubq_f32, vsubq_s16, vsubq_s32, vsubq_s8, vsubq_u16, vsubq_u8,
     vzip1q_s16, vzip1q_s8, vzip2q_s16, vzip2q_s8,
 };
 use std::mem::transmute;
@@ -113,6 +113,28 @@ macro_rules! simd_ops_common {
         fn mask_ops(self) -> impl MaskOps<$mask> {
             self
         }
+
+        // Since bitwise ops work on individual bits, we can use the same
+        // implementation regardless of numeric type.
+
+        #[inline]
+        fn xor(self, x: $simd, y: $simd) -> $simd {
+            unsafe {
+                let x = transmute::<$simd, uint32x4_t>(x);
+                let y = transmute::<$simd, uint32x4_t>(y);
+                let tmp = veorq_u32(x, y);
+                transmute::<uint32x4_t, $simd>(tmp)
+            }
+        }
+
+        #[inline]
+        fn not(self, x: $simd) -> $simd {
+            unsafe {
+                let x = transmute::<$simd, uint32x4_t>(x);
+                let tmp = vmvnq_u32(x);
+                transmute::<uint32x4_t, $simd>(tmp)
+            }
+        }
     };
 }
```
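NEON exposes bitwise XOR (`veorq_*`) and NOT (`vmvnq_*`) only on integer vectors, so the implementation above first reinterprets each 128-bit vector as `uint32x4_t`; the cast costs nothing at runtime. A standalone sketch of the same trick using NEON's `vreinterpretq` intrinsics instead of `transmute` (illustration only, not the crate's code):

```rust
// XOR on f32 lanes is just XOR on their bits; the vreinterpretq casts
// compile to nothing.
#[cfg(target_arch = "aarch64")]
fn xor_f32x4(
    x: std::arch::aarch64::float32x4_t,
    y: std::arch::aarch64::float32x4_t,
) -> std::arch::aarch64::float32x4_t {
    use std::arch::aarch64::{veorq_u32, vreinterpretq_f32_u32, vreinterpretq_u32_f32};
    unsafe { vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(x), vreinterpretq_u32_f32(y))) }
}
```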

rten-simd/src/safe/arch/generic.rs (26 additions, 0 deletions)
```diff
@@ -225,8 +225,32 @@ macro_rules! simd_ops_common {
     };
 }
 
+macro_rules! simd_int_ops_common {
+    ($simd:ty) => {
+        #[inline]
+        fn xor(self, x: $simd, y: $simd) -> $simd {
+            array::from_fn(|i| x.0[i] ^ y.0[i]).into()
+        }
+
+        #[inline]
+        fn not(self, x: $simd) -> $simd {
+            array::from_fn(|i| !x.0[i]).into()
+        }
+    };
+}
+
 unsafe impl NumOps<F32x4> for GenericIsa {
     simd_ops_common!(F32x4, f32, 4, M32);
+
+    #[inline]
+    fn xor(self, x: F32x4, y: F32x4) -> F32x4 {
+        array::from_fn(|i| f32::from_bits(x.0[i].to_bits() ^ y.0[i].to_bits())).into()
+    }
+
+    #[inline]
+    fn not(self, x: F32x4) -> F32x4 {
+        array::from_fn(|i| f32::from_bits(!x.0[i].to_bits())).into()
+    }
 }
 
 impl FloatOps<F32x4> for GenericIsa {
@@ -267,6 +291,7 @@ macro_rules! impl_simd_signed_int_ops {
     ($simd:ident, $elem:ty, $len:expr, $mask:ident) => {
         unsafe impl NumOps<$simd> for GenericIsa {
             simd_ops_common!($simd, $elem, $len, $mask);
+            simd_int_ops_common!($simd);
         }
 
         impl SignedIntOps<$simd> for GenericIsa {
@@ -334,6 +359,7 @@ macro_rules! impl_simd_unsigned_int_ops {
     ($simd:ident, $elem:ty, $len:expr, $mask:ident) => {
         unsafe impl NumOps<$simd> for GenericIsa {
             simd_ops_common!($simd, $elem, $len, $mask);
+            simd_int_ops_common!($simd);
        }
    };
 }
```
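Rust defines `^` and `!` for integers but not for `f32`, so the float fallback round-trips through `to_bits`/`from_bits`. A scalar sketch of the same semantics:

```rust
// Scalar analogue of the generic F32x4 fallback: bitwise ops on f32 act on
// the raw bit pattern and never involve float arithmetic.
fn f32_xor(x: f32, y: f32) -> f32 {
    f32::from_bits(x.to_bits() ^ y.to_bits())
}

fn f32_not(x: f32) -> f32 {
    f32::from_bits(!x.to_bits())
}

fn main() {
    // XOR of identical bit patterns yields all-zero bits, i.e. +0.0.
    assert_eq!(f32_xor(3.5, 3.5).to_bits(), 0);
    // NOT is an involution on the bit pattern.
    assert_eq!(f32_not(f32_not(3.5)), 3.5);
}
```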

rten-simd/src/safe/arch/wasm32.rs (12 additions, 1 deletion)
```diff
@@ -9,7 +9,8 @@ use std::arch::wasm32::{
     i8x16_neg, i8x16_shl, i8x16_shuffle, i8x16_splat, i8x16_sub, u16x8_add, u16x8_eq,
     u16x8_extmul_high_u8x16, u16x8_extmul_low_u8x16, u16x8_ge, u16x8_gt, u16x8_mul, u16x8_splat,
     u16x8_sub, u8x16_add, u8x16_eq, u8x16_ge, u8x16_gt, u8x16_narrow_i16x8, u8x16_shuffle,
-    u8x16_splat, u8x16_sub, v128, v128_and, v128_bitselect, v128_load, v128_store,
+    u8x16_splat, u8x16_sub, v128, v128_and, v128_bitselect, v128_load, v128_not, v128_store,
+    v128_xor,
 };
 use std::mem::transmute;
@@ -141,6 +142,16 @@ macro_rules! simd_ops_common {
         fn select(self, x: $simd, y: $simd, mask: <$simd as Simd>::Mask) -> $simd {
             $simd(v128_bitselect(x.0, y.0, mask.0))
         }
+
+        #[inline]
+        fn xor(self, x: $simd, y: $simd) -> $simd {
+            v128_xor(x.0, y.0).into()
+        }
+
+        #[inline]
+        fn not(self, x: $simd) -> $simd {
+            v128_not(x.0).into()
+        }
     };
 }
```
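WebAssembly's `v128` type is untyped, so a single pair of intrinsics covers float and integer vectors alike and no reinterpretation is needed. A minimal sketch (build with `-Ctarget-feature=+simd128`):

```rust
// v128_not flips every bit of the 128-bit vector, regardless of how its
// lanes are later interpreted.
#[cfg(target_arch = "wasm32")]
fn toggle_all_bits(x: std::arch::wasm32::v128) -> std::arch::wasm32::v128 {
    std::arch::wasm32::v128_not(x)
}
```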

rten-simd/src/safe/arch/x86_64/avx2.rs (47 additions, 17 deletions)
```diff
@@ -1,22 +1,22 @@
 use std::arch::x86_64::{
     __m128i, __m256, __m256i, _mm256_add_epi16, _mm256_add_epi32, _mm256_add_epi8, _mm256_add_ps,
-    _mm256_and_ps, _mm256_and_si256, _mm256_andnot_ps, _mm256_blendv_epi8, _mm256_blendv_ps,
-    _mm256_castps256_ps128, _mm256_castsi256_si128, _mm256_cmp_ps, _mm256_cmpeq_epi16,
-    _mm256_cmpeq_epi32, _mm256_cmpeq_epi8, _mm256_cmpgt_epi16, _mm256_cmpgt_epi32,
-    _mm256_cmpgt_epi8, _mm256_cvtepi16_epi32, _mm256_cvtepi8_epi16, _mm256_cvtepu8_epi16,
-    _mm256_cvtps_epi32, _mm256_cvttps_epi32, _mm256_div_ps, _mm256_extractf128_ps,
-    _mm256_extracti128_si256, _mm256_fmadd_ps, _mm256_insertf128_si256, _mm256_loadu_ps,
-    _mm256_loadu_si256, _mm256_maskload_epi32, _mm256_maskload_ps, _mm256_maskstore_epi32,
-    _mm256_maskstore_ps, _mm256_max_ps, _mm256_min_ps, _mm256_movemask_epi8, _mm256_mul_ps,
-    _mm256_mullo_epi16, _mm256_mullo_epi32, _mm256_or_si256, _mm256_packs_epi32,
-    _mm256_packus_epi16, _mm256_permute2x128_si256, _mm256_permute4x64_epi64, _mm256_set1_epi16,
-    _mm256_set1_epi32, _mm256_set1_epi8, _mm256_set1_ps, _mm256_setr_m128i, _mm256_setzero_si256,
-    _mm256_slli_epi16, _mm256_slli_epi32, _mm256_storeu_ps, _mm256_storeu_si256, _mm256_sub_epi16,
-    _mm256_sub_epi32, _mm256_sub_epi8, _mm256_sub_ps, _mm256_unpackhi_epi16, _mm256_unpackhi_epi8,
-    _mm256_unpacklo_epi16, _mm256_unpacklo_epi8, _mm256_xor_ps, _mm256_xor_si256, _mm_add_ps,
-    _mm_cvtss_f32, _mm_movehl_ps, _mm_prefetch, _mm_setr_epi8, _mm_shuffle_epi8, _mm_shuffle_ps,
-    _mm_unpacklo_epi64, _CMP_EQ_OQ, _CMP_GE_OQ, _CMP_GT_OQ, _CMP_LE_OQ, _CMP_LT_OQ, _MM_HINT_ET0,
-    _MM_HINT_T0,
+    _mm256_and_ps, _mm256_and_si256, _mm256_andnot_ps, _mm256_andnot_si256, _mm256_blendv_epi8,
+    _mm256_blendv_ps, _mm256_castps256_ps128, _mm256_castsi256_si128, _mm256_cmp_ps,
+    _mm256_cmpeq_epi16, _mm256_cmpeq_epi32, _mm256_cmpeq_epi8, _mm256_cmpgt_epi16,
+    _mm256_cmpgt_epi32, _mm256_cmpgt_epi8, _mm256_cvtepi16_epi32, _mm256_cvtepi8_epi16,
+    _mm256_cvtepu8_epi16, _mm256_cvtps_epi32, _mm256_cvttps_epi32, _mm256_div_ps,
+    _mm256_extractf128_ps, _mm256_extracti128_si256, _mm256_fmadd_ps, _mm256_insertf128_si256,
+    _mm256_loadu_ps, _mm256_loadu_si256, _mm256_maskload_epi32, _mm256_maskload_ps,
+    _mm256_maskstore_epi32, _mm256_maskstore_ps, _mm256_max_ps, _mm256_min_ps,
+    _mm256_movemask_epi8, _mm256_mul_ps, _mm256_mullo_epi16, _mm256_mullo_epi32, _mm256_or_si256,
+    _mm256_packs_epi32, _mm256_packus_epi16, _mm256_permute2x128_si256, _mm256_permute4x64_epi64,
+    _mm256_set1_epi16, _mm256_set1_epi32, _mm256_set1_epi8, _mm256_set1_ps, _mm256_setr_m128i,
+    _mm256_setzero_si256, _mm256_slli_epi16, _mm256_slli_epi32, _mm256_storeu_ps,
+    _mm256_storeu_si256, _mm256_sub_epi16, _mm256_sub_epi32, _mm256_sub_epi8, _mm256_sub_ps,
+    _mm256_unpackhi_epi16, _mm256_unpackhi_epi8, _mm256_unpacklo_epi16, _mm256_unpacklo_epi8,
+    _mm256_xor_ps, _mm256_xor_si256, _mm_add_ps, _mm_cvtss_f32, _mm_movehl_ps, _mm_prefetch,
+    _mm_setr_epi8, _mm_shuffle_epi8, _mm_shuffle_ps, _mm_unpacklo_epi64, _CMP_EQ_OQ, _CMP_GE_OQ,
+    _CMP_GT_OQ, _CMP_LE_OQ, _CMP_LT_OQ, _MM_HINT_ET0, _MM_HINT_T0,
 };
 use std::is_x86_feature_detected;
 use std::mem::transmute;
@@ -116,6 +116,20 @@ macro_rules! simd_ops_common {
     };
 }
 
+macro_rules! simd_int_ops_common {
+    ($simd:ty) => {
+        #[inline]
+        fn xor(self, x: $simd, y: $simd) -> $simd {
+            unsafe { _mm256_xor_si256(x.0, y.0) }.into()
+        }
+
+        #[inline]
+        fn not(self, x: $simd) -> $simd {
+            unsafe { _mm256_andnot_si256(x.0, _mm256_set1_epi8(-1)) }.into()
+        }
+    };
+}
+
 unsafe impl NumOps<F32x8> for Avx2Isa {
     simd_ops_common!(F32x8, F32x8);
```
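AVX2 has no dedicated vector NOT instruction, so `not` relies on the AND-NOT identity: `_mm256_andnot_si256(a, b)` computes `!a & b`, which reduces to `!a` when `b` is all ones. A standalone sketch of the identity:

```rust
// !x == andnot(x, all_ones); _mm256_set1_epi8(-1) produces the all-ones vector.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn not_256(x: std::arch::x86_64::__m256i) -> std::arch::x86_64::__m256i {
    use std::arch::x86_64::{_mm256_andnot_si256, _mm256_set1_epi8};
    _mm256_andnot_si256(x, _mm256_set1_epi8(-1))
}
```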

```diff
@@ -180,6 +194,17 @@
         unsafe { _mm256_max_ps(x.0, y.0) }.into()
     }
 
+    #[inline]
+    fn xor(self, x: F32x8, y: F32x8) -> F32x8 {
+        unsafe { _mm256_xor_ps(x.0, y.0) }.into()
+    }
+
+    #[inline]
+    fn not(self, x: F32x8) -> F32x8 {
+        let all_ones: F32x8 = self.splat(f32::from_bits(0xFFFFFFFF));
+        unsafe { _mm256_andnot_ps(x.0, all_ones.0) }.into()
+    }
+
     #[inline]
     fn splat(self, x: f32) -> F32x8 {
         unsafe { _mm256_set1_ps(x) }.into()
```
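For `F32x8` the commit uses the `_ps` variants rather than reinterpreting to `__m256i`; a plausible motivation (an inference, not stated in the commit) is keeping the data in the floating-point execution domain. A classic use of float-domain XOR, sketched assuming AVX support:

```rust
// Negate eight f32 lanes by XOR-ing with the sign-bit mask: -0.0 is
// 0x80000000 in every lane. Illustration only, not part of the commit.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx")]
unsafe fn negate8(x: std::arch::x86_64::__m256) -> std::arch::x86_64::__m256 {
    use std::arch::x86_64::{_mm256_set1_ps, _mm256_xor_ps};
    _mm256_xor_ps(x, _mm256_set1_ps(-0.0))
}
```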
```diff
@@ -259,6 +284,7 @@
 
 unsafe impl NumOps<I32x8> for Avx2Isa {
     simd_ops_common!(I32x8, I32x8);
+    simd_int_ops_common!(I32x8);
 
     #[inline]
     fn first_n_mask(self, n: usize) -> I32x8 {
@@ -362,6 +388,7 @@
 
 unsafe impl NumOps<I16x16> for Avx2Isa {
     simd_ops_common!(I16x16, I16x16);
+    simd_int_ops_common!(I16x16);
 
     #[inline]
     fn first_n_mask(self, n: usize) -> I16x16 {
@@ -506,6 +533,7 @@
 
 unsafe impl NumOps<I8x32> for Avx2Isa {
     simd_ops_common!(I8x32, I8x32);
+    simd_int_ops_common!(I8x32);
 
     #[inline]
     fn first_n_mask(self, n: usize) -> I8x32 {
@@ -647,6 +675,7 @@
 
 unsafe impl NumOps<U8x32> for Avx2Isa {
     simd_ops_common!(U8x32, I8x32);
+    simd_int_ops_common!(U8x32);
 
     #[inline]
     fn first_n_mask(self, n: usize) -> I8x32 {
@@ -755,6 +784,7 @@
 
 unsafe impl NumOps<U16x16> for Avx2Isa {
     simd_ops_common!(U16x16, I16x16);
+    simd_int_ops_common!(U16x16);
 
     #[inline]
     fn first_n_mask(self, n: usize) -> I16x16 {
```

rten-simd/src/safe/arch/x86_64/avx512.rs (33 additions, 3 deletions)
```diff
@@ -1,6 +1,6 @@
 use std::arch::x86_64::{
     __m512, __m512i, __mmask16, __mmask32, __mmask64, _mm512_add_epi16, _mm512_add_epi32,
-    _mm512_add_epi8, _mm512_add_ps, _mm512_andnot_ps, _mm512_castsi256_si512,
+    _mm512_add_epi8, _mm512_add_ps, _mm512_andnot_ps, _mm512_andnot_si512, _mm512_castsi256_si512,
     _mm512_cmp_epi16_mask, _mm512_cmp_epi32_mask, _mm512_cmp_epu16_mask, _mm512_cmp_ps_mask,
     _mm512_cmpeq_epi8_mask, _mm512_cmpeq_epu8_mask, _mm512_cmpge_epi8_mask, _mm512_cmpge_epu8_mask,
     _mm512_cmpgt_epi8_mask, _mm512_cmpgt_epu8_mask, _mm512_cvtepi16_epi32, _mm512_cvtepi16_epi8,
@@ -16,8 +16,8 @@ use std::arch::x86_64::{
     _mm512_setzero_si512, _mm512_sllv_epi16, _mm512_sllv_epi32, _mm512_storeu_ps,
     _mm512_storeu_si512, _mm512_sub_epi16, _mm512_sub_epi32, _mm512_sub_epi8, _mm512_sub_ps,
     _mm512_unpackhi_epi16, _mm512_unpackhi_epi8, _mm512_unpacklo_epi16, _mm512_unpacklo_epi8,
-    _mm512_xor_ps, _mm_prefetch, _CMP_EQ_OQ, _CMP_GE_OQ, _CMP_GT_OQ, _CMP_LE_OQ, _CMP_LT_OQ,
-    _MM_CMPINT_EQ, _MM_CMPINT_NLE, _MM_CMPINT_NLT, _MM_HINT_ET0, _MM_HINT_T0,
+    _mm512_xor_ps, _mm512_xor_si512, _mm_prefetch, _CMP_EQ_OQ, _CMP_GE_OQ, _CMP_GT_OQ, _CMP_LE_OQ,
+    _CMP_LT_OQ, _MM_CMPINT_EQ, _MM_CMPINT_NLE, _MM_CMPINT_NLT, _MM_HINT_ET0, _MM_HINT_T0,
 };
 use std::mem::transmute;
@@ -125,6 +125,20 @@ macro_rules! simd_ops_common {
     };
 }
 
+macro_rules! simd_int_ops_common {
+    ($simd:ty) => {
+        #[inline]
+        fn xor(self, x: $simd, y: $simd) -> $simd {
+            unsafe { _mm512_xor_si512(x.0, y.0) }.into()
+        }
+
+        #[inline]
+        fn not(self, x: $simd) -> $simd {
+            unsafe { _mm512_andnot_si512(x.0, _mm512_set1_epi8(-1)) }.into()
+        }
+    };
+}
+
 unsafe impl NumOps<F32x16> for Avx512Isa {
     simd_ops_common!(F32x16, __mmask16);
```

```diff
@@ -183,6 +197,17 @@
         unsafe { _mm512_max_ps(x.0, y.0) }.into()
     }
 
+    #[inline]
+    fn xor(self, x: F32x16, y: F32x16) -> F32x16 {
+        unsafe { _mm512_xor_ps(x.0, y.0) }.into()
+    }
+
+    #[inline]
+    fn not(self, x: F32x16) -> F32x16 {
+        let all_ones: F32x16 = self.splat(f32::from_bits(0xFFFFFFFF));
+        unsafe { _mm512_andnot_ps(x.0, all_ones.0) }.into()
+    }
+
     #[inline]
     fn splat(self, x: f32) -> F32x16 {
         unsafe { _mm512_set1_ps(x) }.into()
```
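Note that the all-ones constant here (and in the AVX2 version above) is built as `f32::from_bits(0xFFFFFFFF)`, which is a NaN bit pattern. That is harmless: `andnot_ps` operates on raw bits, and the constant never passes through float arithmetic. A quick scalar check:

```rust
// The all-ones f32 constant is a NaN, but only its bits matter.
fn main() {
    let all_ones = f32::from_bits(0xFFFF_FFFF);
    assert!(all_ones.is_nan());
    assert_eq!(all_ones.to_bits(), u32::MAX);
}
```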
```diff
@@ -250,6 +275,7 @@ impl FloatOps<F32x16> for Avx512Isa {
 
 unsafe impl NumOps<I32x16> for Avx512Isa {
     simd_ops_common!(I32x16, __mmask16);
+    simd_int_ops_common!(I32x16);
 
     #[inline]
     fn add(self, x: I32x16, y: I32x16) -> I32x16 {
@@ -343,6 +369,7 @@
 
 unsafe impl NumOps<I16x32> for Avx512Isa {
     simd_ops_common!(I16x32, __mmask32);
+    simd_int_ops_common!(I16x32);
 
     #[inline]
     fn add(self, x: I16x32, y: I16x32) -> I16x32 {
@@ -463,6 +490,7 @@
 
 unsafe impl NumOps<I8x64> for Avx512Isa {
     simd_ops_common!(I8x64, __mmask64);
+    simd_int_ops_common!(I8x64);
 
     #[inline]
     fn add(self, x: I8x64, y: I8x64) -> I8x64 {
@@ -581,6 +609,7 @@
 
 unsafe impl NumOps<U8x64> for Avx512Isa {
     simd_ops_common!(U8x64, __mmask64);
+    simd_int_ops_common!(U8x64);
 
     #[inline]
     fn add(self, x: U8x64, y: U8x64) -> U8x64 {
@@ -728,6 +757,7 @@
 
 unsafe impl NumOps<U16x32> for Avx512Isa {
     simd_ops_common!(U16x32, __mmask32);
+    simd_int_ops_common!(U16x32);
 
     #[inline]
     fn add(self, x: U16x32, y: U16x32) -> U16x32 {
```
