From e81a6fc46b4ae48fbf7cf666829defcb4aea3f6b Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Wed, 27 May 2026 10:28:35 +0100 Subject: [PATCH] PoC: remove unsafe from generated calls to intrinsics --- fearless_simd/src/generated/avx2.rs | 2829 ++++++++++--------------- fearless_simd/src/generated/neon.rs | 921 ++++---- fearless_simd/src/generated/sse4_2.rs | 1117 +++++----- fearless_simd_gen/src/mk_neon.rs | 174 +- fearless_simd_gen/src/mk_x86.rs | 672 +++--- fearless_simd_gen/src/ops.rs | 183 +- 6 files changed, 2725 insertions(+), 3171 deletions(-) diff --git a/fearless_simd/src/generated/avx2.rs b/fearless_simd/src/generated/avx2.rs index 48e545a20..44e460991 100644 --- a/fearless_simd/src/generated/avx2.rs +++ b/fearless_simd/src/generated/avx2.rs @@ -98,7 +98,8 @@ impl Simd for Avx2 { } #[inline(always)] fn splat_f32x4(self, val: f32) -> f32x4 { - unsafe { _mm_set1_ps(val).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , val : f32) -> f32x4 < Avx2 > { _mm_set1_ps (val) . simd_into (token) } } + kernel(self, val) } #[inline(always)] fn load_array_f32x4(self, val: [f32; 4usize]) -> f32x4 { @@ -177,78 +178,93 @@ impl Simd for Avx2 { } #[inline(always)] fn abs_f32x4(self, a: f32x4) -> f32x4 { - unsafe { _mm_andnot_ps(_mm_set1_ps(-0.0), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f32x4 < Avx2 >) -> f32x4 < Avx2 > { _mm_andnot_ps (_mm_set1_ps (- 0.0) , a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn neg_f32x4(self, a: f32x4) -> f32x4 { - unsafe { _mm_xor_ps(a.into(), _mm_set1_ps(-0.0)).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f32x4 < Avx2 >) -> f32x4 < Avx2 > { _mm_xor_ps (a . into () , _mm_set1_ps (- 0.0)) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn sqrt_f32x4(self, a: f32x4) -> f32x4 { - unsafe { _mm_sqrt_ps(a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f32x4 < Avx2 >) -> f32x4 < Avx2 > { _mm_sqrt_ps (a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn approximate_recip_f32x4(self, a: f32x4) -> f32x4 { - unsafe { _mm_rcp_ps(a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f32x4 < Avx2 >) -> f32x4 < Avx2 > { _mm_rcp_ps (a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn add_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - unsafe { _mm_add_ps(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f32x4 < Avx2 > , b : f32x4 < Avx2 >) -> f32x4 < Avx2 > { _mm_add_ps (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn sub_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - unsafe { _mm_sub_ps(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f32x4 < Avx2 > , b : f32x4 < Avx2 >) -> f32x4 < Avx2 > { _mm_sub_ps (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn mul_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - unsafe { _mm_mul_ps(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f32x4 < Avx2 > , b : f32x4 < Avx2 >) -> f32x4 < Avx2 > { _mm_mul_ps (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn div_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - unsafe { _mm_div_ps(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f32x4 < Avx2 > , b : f32x4 < Avx2 >) -> f32x4 < Avx2 > { _mm_div_ps (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn copysign_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - unsafe { - let mask = _mm_set1_ps(-0.0); - _mm_or_ps(_mm_and_ps(mask, b.into()), _mm_andnot_ps(mask, a.into())).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f32x4 < Avx2 > , b : f32x4 < Avx2 >) -> f32x4 < Avx2 > { let mask = _mm_set1_ps (- 0.0) ; _mm_or_ps (_mm_and_ps (mask , b . into ()) , _mm_andnot_ps (mask , a . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_eq_f32x4(self, a: f32x4, b: f32x4) -> mask32x4 { - unsafe { _mm_castps_si128(_mm_cmpeq_ps(a.into(), b.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f32x4 < Avx2 > , b : f32x4 < Avx2 >) -> mask32x4 < Avx2 > { _mm_castps_si128 (_mm_cmpeq_ps (a . into () , b . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_lt_f32x4(self, a: f32x4, b: f32x4) -> mask32x4 { - unsafe { _mm_castps_si128(_mm_cmplt_ps(a.into(), b.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f32x4 < Avx2 > , b : f32x4 < Avx2 >) -> mask32x4 < Avx2 > { _mm_castps_si128 (_mm_cmplt_ps (a . into () , b . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_le_f32x4(self, a: f32x4, b: f32x4) -> mask32x4 { - unsafe { _mm_castps_si128(_mm_cmple_ps(a.into(), b.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f32x4 < Avx2 > , b : f32x4 < Avx2 >) -> mask32x4 < Avx2 > { _mm_castps_si128 (_mm_cmple_ps (a . into () , b . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_ge_f32x4(self, a: f32x4, b: f32x4) -> mask32x4 { - unsafe { _mm_castps_si128(_mm_cmpge_ps(a.into(), b.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f32x4 < Avx2 > , b : f32x4 < Avx2 >) -> mask32x4 < Avx2 > { _mm_castps_si128 (_mm_cmpge_ps (a . into () , b . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_gt_f32x4(self, a: f32x4, b: f32x4) -> mask32x4 { - unsafe { _mm_castps_si128(_mm_cmpgt_ps(a.into(), b.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f32x4 < Avx2 > , b : f32x4 < Avx2 >) -> mask32x4 < Avx2 > { _mm_castps_si128 (_mm_cmpgt_ps (a . into () , b . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn zip_low_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - unsafe { _mm_unpacklo_ps(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f32x4 < Avx2 > , b : f32x4 < Avx2 >) -> f32x4 < Avx2 > { _mm_unpacklo_ps (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn zip_high_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - unsafe { _mm_unpackhi_ps(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f32x4 < Avx2 > , b : f32x4 < Avx2 >) -> f32x4 < Avx2 > { _mm_unpackhi_ps (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn unzip_low_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - unsafe { _mm_shuffle_ps::<0b10_00_10_00>(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f32x4 < Avx2 > , b : f32x4 < Avx2 >) -> f32x4 < Avx2 > { _mm_shuffle_ps :: < 0b10_00_10_00 > (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn unzip_high_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - unsafe { _mm_shuffle_ps::<0b11_01_11_01>(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f32x4 < Avx2 > , b : f32x4 < Avx2 >) -> f32x4 < Avx2 > { _mm_shuffle_ps :: < 0b11_01_11_01 > (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn interleave_f32x4(self, a: f32x4, b: f32x4) -> (f32x4, f32x4) { @@ -260,54 +276,48 @@ impl Simd for Avx2 { } #[inline(always)] fn max_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - unsafe { _mm_max_ps(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f32x4 < Avx2 > , b : f32x4 < Avx2 >) -> f32x4 < Avx2 > { _mm_max_ps (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn min_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - unsafe { _mm_min_ps(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f32x4 < Avx2 > , b : f32x4 < Avx2 >) -> f32x4 < Avx2 > { _mm_min_ps (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn max_precise_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - unsafe { - let intermediate = _mm_max_ps(a.into(), b.into()); - let b_is_nan = _mm_cmpunord_ps(b.into(), b.into()); - _mm_blendv_ps(intermediate, a.into(), b_is_nan).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f32x4 < Avx2 > , b : f32x4 < Avx2 >) -> f32x4 < Avx2 > { let intermediate = _mm_max_ps (a . into () , b . into ()) ; let b_is_nan = _mm_cmpunord_ps (b . into () , b . into ()) ; _mm_blendv_ps (intermediate , a . into () , b_is_nan) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn min_precise_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - unsafe { - let intermediate = _mm_min_ps(a.into(), b.into()); - let b_is_nan = _mm_cmpunord_ps(b.into(), b.into()); - _mm_blendv_ps(intermediate, a.into(), b_is_nan).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f32x4 < Avx2 > , b : f32x4 < Avx2 >) -> f32x4 < Avx2 > { let intermediate = _mm_min_ps (a . into () , b . into ()) ; let b_is_nan = _mm_cmpunord_ps (b . into () , b . into ()) ; _mm_blendv_ps (intermediate , a . into () , b_is_nan) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn mul_add_f32x4(self, a: f32x4, b: f32x4, c: f32x4) -> f32x4 { - unsafe { _mm_fmadd_ps(a.into(), b.into(), c.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f32x4 < Avx2 > , b : f32x4 < Avx2 > , c : f32x4 < Avx2 >) -> f32x4 < Avx2 > { _mm_fmadd_ps (a . into () , b . into () , c . into ()) . simd_into (token) } } + kernel(self, a, b, c) } #[inline(always)] fn mul_sub_f32x4(self, a: f32x4, b: f32x4, c: f32x4) -> f32x4 { - unsafe { _mm_fmsub_ps(a.into(), b.into(), c.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f32x4 < Avx2 > , b : f32x4 < Avx2 > , c : f32x4 < Avx2 >) -> f32x4 < Avx2 > { _mm_fmsub_ps (a . into () , b . into () , c . into ()) . simd_into (token) } } + kernel(self, a, b, c) } #[inline(always)] fn floor_f32x4(self, a: f32x4) -> f32x4 { - unsafe { - _mm_round_ps::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f32x4 < Avx2 >) -> f32x4 < Avx2 > { _mm_round_ps :: < { _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC } > (a . into () ,) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn ceil_f32x4(self, a: f32x4) -> f32x4 { - unsafe { - _mm_round_ps::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f32x4 < Avx2 >) -> f32x4 < Avx2 > { _mm_round_ps :: < { _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC } > (a . into () ,) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn round_ties_even_f32x4(self, a: f32x4) -> f32x4 { - unsafe { - _mm_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a.into()) - .simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f32x4 < Avx2 >) -> f32x4 < Avx2 > { _mm_round_ps :: < { _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC } > (a . into () ,) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn fract_f32x4(self, a: f32x4) -> f32x4 { @@ -315,96 +325,63 @@ impl Simd for Avx2 { } #[inline(always)] fn trunc_f32x4(self, a: f32x4) -> f32x4 { - unsafe { - _mm_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f32x4 < Avx2 >) -> f32x4 < Avx2 > { _mm_round_ps :: < { _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC } > (a . into () ,) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn select_f32x4(self, a: mask32x4, b: f32x4, c: f32x4) -> f32x4 { - unsafe { _mm_blendv_ps(c.into(), b.into(), _mm_castsi128_ps(a.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask32x4 < Avx2 > , b : f32x4 < Avx2 > , c : f32x4 < Avx2 >) -> f32x4 < Avx2 > { _mm_blendv_ps (c . into () , b . into () , _mm_castsi128_ps (a . into ())) . simd_into (token) } } + kernel(self, a, b, c) } #[inline(always)] fn combine_f32x4(self, a: f32x4, b: f32x4) -> f32x8 { - unsafe { _mm256_setr_m128(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f32x4 < Avx2 > , b : f32x4 < Avx2 >) -> f32x8 < Avx2 > { _mm256_setr_m128 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn reinterpret_f64_f32x4(self, a: f32x4) -> f64x2 { - unsafe { _mm_castps_pd(a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f32x4 < Avx2 >) -> f64x2 < Avx2 > { _mm_castps_pd (a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn reinterpret_i32_f32x4(self, a: f32x4) -> i32x4 { - unsafe { _mm_castps_si128(a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f32x4 < Avx2 >) -> i32x4 < Avx2 > { _mm_castps_si128 (a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn reinterpret_u8_f32x4(self, a: f32x4) -> u8x16 { - unsafe { _mm_castps_si128(a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f32x4 < Avx2 >) -> u8x16 < Avx2 > { _mm_castps_si128 (a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn reinterpret_u32_f32x4(self, a: f32x4) -> u32x4 { - unsafe { _mm_castps_si128(a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f32x4 < Avx2 >) -> u32x4 < Avx2 > { _mm_castps_si128 (a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn cvt_u32_f32x4(self, a: f32x4) -> u32x4 { - unsafe { - let mut converted = _mm_cvttps_epi32(a.into()); - let in_range = _mm_cmplt_ps(a.into(), _mm_set1_ps(2147483648.0)); - let all_in_range = _mm_movemask_ps(in_range) == 0b1111; - if !all_in_range { - let excess = _mm_sub_ps(a.into(), _mm_set1_ps(2147483648.0)); - let excess_converted = _mm_cvttps_epi32(_mm_andnot_ps(in_range, excess)); - converted = _mm_add_epi32(converted, excess_converted); - } - converted.simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f32x4 < Avx2 >) -> u32x4 < Avx2 > { let mut converted = _mm_cvttps_epi32 (a . into ()) ; let in_range = _mm_cmplt_ps (a . into () , _mm_set1_ps (2147483648.0)) ; let all_in_range = _mm_movemask_ps (in_range) == 0b1111 ; if ! all_in_range { let excess = _mm_sub_ps (a . into () , _mm_set1_ps (2147483648.0)) ; let excess_converted = _mm_cvttps_epi32 (_mm_andnot_ps (in_range , excess)) ; converted = _mm_add_epi32 (converted , excess_converted) ; } converted . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn cvt_u32_precise_f32x4(self, a: f32x4) -> u32x4 { - unsafe { - let a = _mm_max_ps(a.into(), _mm_setzero_ps()); - let mut converted = _mm_cvttps_epi32(a); - let in_range = _mm_cmplt_ps(a, _mm_set1_ps(2147483648.0)); - let all_in_range = _mm_movemask_ps(in_range) == 0b1111; - if !all_in_range { - let exceeds_unsigned_range = - _mm_castps_si128(_mm_cmplt_ps(_mm_set1_ps(4294967040.0), a)); - let excess = _mm_sub_ps(a, _mm_set1_ps(2147483648.0)); - let excess_converted = _mm_cvttps_epi32(_mm_andnot_ps(in_range, excess)); - converted = _mm_add_epi32(converted, excess_converted); - converted = _mm_blendv_epi8( - converted, - _mm_set1_epi32(u32::MAX.cast_signed()), - exceeds_unsigned_range, - ); - } - converted.simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f32x4 < Avx2 >) -> u32x4 < Avx2 > { let a = _mm_max_ps (a . into () , _mm_setzero_ps ()) ; let mut converted = _mm_cvttps_epi32 (a) ; let in_range = _mm_cmplt_ps (a , _mm_set1_ps (2147483648.0)) ; let all_in_range = _mm_movemask_ps (in_range) == 0b1111 ; if ! all_in_range { let exceeds_unsigned_range = _mm_castps_si128 (_mm_cmplt_ps (_mm_set1_ps (4294967040.0) , a)) ; let excess = _mm_sub_ps (a , _mm_set1_ps (2147483648.0)) ; let excess_converted = _mm_cvttps_epi32 (_mm_andnot_ps (in_range , excess)) ; converted = _mm_add_epi32 (converted , excess_converted) ; converted = _mm_blendv_epi8 (converted , _mm_set1_epi32 (u32 :: MAX . cast_signed ()) , exceeds_unsigned_range) ; } converted . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn cvt_i32_f32x4(self, a: f32x4) -> i32x4 { - unsafe { _mm_cvttps_epi32(a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f32x4 < Avx2 >) -> i32x4 < Avx2 > { _mm_cvttps_epi32 (a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn cvt_i32_precise_f32x4(self, a: f32x4) -> i32x4 { - unsafe { - let a = a.into(); - let mut converted = _mm_cvttps_epi32(a); - let in_range = _mm_cmplt_ps(a, _mm_set1_ps(2147483648.0)); - let all_in_range = _mm_movemask_ps(in_range) == 0b1111; - if !all_in_range { - converted = _mm_blendv_epi8( - _mm_set1_epi32(i32::MAX), - converted, - _mm_castps_si128(in_range), - ); - let is_not_nan = _mm_castps_si128(_mm_cmpord_ps(a, a)); - converted = _mm_and_si128(converted, is_not_nan); - } - converted.simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f32x4 < Avx2 >) -> i32x4 < Avx2 > { let a = a . into () ; let mut converted = _mm_cvttps_epi32 (a) ; let in_range = _mm_cmplt_ps (a , _mm_set1_ps (2147483648.0)) ; let all_in_range = _mm_movemask_ps (in_range) == 0b1111 ; if ! all_in_range { converted = _mm_blendv_epi8 (_mm_set1_epi32 (i32 :: MAX) , converted , _mm_castps_si128 (in_range)) ; let is_not_nan = _mm_castps_si128 (_mm_cmpord_ps (a , a)) ; converted = _mm_and_si128 (converted , is_not_nan) ; } converted . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn splat_i8x16(self, val: i8) -> i8x16 { - unsafe { _mm_set1_epi8(val).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , val : i8) -> i8x16 < Avx2 > { _mm_set1_epi8 (val) . simd_into (token) } } + kernel(self, val) } #[inline(always)] fn load_array_i8x16(self, val: [i8; 16usize]) -> i8x16 { @@ -483,36 +460,33 @@ impl Simd for Avx2 { } #[inline(always)] fn add_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - unsafe { _mm_add_epi8(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i8x16 < Avx2 > , b : i8x16 < Avx2 >) -> i8x16 < Avx2 > { _mm_add_epi8 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn sub_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - unsafe { _mm_sub_epi8(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i8x16 < Avx2 > , b : i8x16 < Avx2 >) -> i8x16 < Avx2 > { _mm_sub_epi8 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn mul_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - unsafe { - let dst_even = _mm_mullo_epi16(a.into(), b.into()); - let dst_odd = - _mm_mullo_epi16(_mm_srli_epi16::<8>(a.into()), _mm_srli_epi16::<8>(b.into())); - _mm_or_si128( - _mm_slli_epi16(dst_odd, 8), - _mm_and_si128(dst_even, _mm_set1_epi16(0xFF)), - ) - .simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i8x16 < Avx2 > , b : i8x16 < Avx2 >) -> i8x16 < Avx2 > { let dst_even = _mm_mullo_epi16 (a . into () , b . into ()) ; let dst_odd = _mm_mullo_epi16 (_mm_srli_epi16 :: < 8 > (a . into ()) , _mm_srli_epi16 :: < 8 > (b . into ())) ; _mm_or_si128 (_mm_slli_epi16 (dst_odd , 8) , _mm_and_si128 (dst_even , _mm_set1_epi16 (0xFF))) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn and_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i8x16 < Avx2 > , b : i8x16 < Avx2 >) -> i8x16 < Avx2 > { _mm_and_si128 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn or_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i8x16 < Avx2 > , b : i8x16 < Avx2 >) -> i8x16 < Avx2 > { _mm_or_si128 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn xor_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i8x16 < Avx2 > , b : i8x16 < Avx2 >) -> i8x16 < Avx2 > { _mm_xor_si128 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn not_i8x16(self, a: i8x16) -> i8x16 { @@ -520,15 +494,8 @@ impl Simd for Avx2 { } #[inline(always)] fn shl_i8x16(self, a: i8x16, shift: u32) -> i8x16 { - unsafe { - let val = a.into(); - let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); - let lo_16 = _mm_unpacklo_epi8(val, _mm_cmpgt_epi8(_mm_setzero_si128(), val)); - let hi_16 = _mm_unpackhi_epi8(val, _mm_cmpgt_epi8(_mm_setzero_si128(), val)); - let lo_shifted = _mm_sll_epi16(lo_16, shift_count); - let hi_shifted = _mm_sll_epi16(hi_16, shift_count); - _mm_packs_epi16(lo_shifted, hi_shifted).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i8x16 < Avx2 > , shift : u32) -> i8x16 < Avx2 > { let val = a . into () ; let shift_count = _mm_cvtsi32_si128 (shift . cast_signed ()) ; let lo_16 = _mm_unpacklo_epi8 (val , _mm_cmpgt_epi8 (_mm_setzero_si128 () , val)) ; let hi_16 = _mm_unpackhi_epi8 (val , _mm_cmpgt_epi8 (_mm_setzero_si128 () , val)) ; let lo_shifted = _mm_sll_epi16 (lo_16 , shift_count) ; let hi_shifted = _mm_sll_epi16 (hi_16 , shift_count) ; _mm_packs_epi16 (lo_shifted , hi_shifted) . simd_into (token) } } + kernel(self, a, shift) } #[inline(always)] fn shlv_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { @@ -536,15 +503,8 @@ impl Simd for Avx2 { } #[inline(always)] fn shr_i8x16(self, a: i8x16, shift: u32) -> i8x16 { - unsafe { - let val = a.into(); - let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); - let lo_16 = _mm_unpacklo_epi8(val, _mm_cmpgt_epi8(_mm_setzero_si128(), val)); - let hi_16 = _mm_unpackhi_epi8(val, _mm_cmpgt_epi8(_mm_setzero_si128(), val)); - let lo_shifted = _mm_sra_epi16(lo_16, shift_count); - let hi_shifted = _mm_sra_epi16(hi_16, shift_count); - _mm_packs_epi16(lo_shifted, hi_shifted).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i8x16 < Avx2 > , shift : u32) -> i8x16 < Avx2 > { let val = a . into () ; let shift_count = _mm_cvtsi32_si128 (shift . cast_signed ()) ; let lo_16 = _mm_unpacklo_epi8 (val , _mm_cmpgt_epi8 (_mm_setzero_si128 () , val)) ; let hi_16 = _mm_unpackhi_epi8 (val , _mm_cmpgt_epi8 (_mm_setzero_si128 () , val)) ; let lo_shifted = _mm_sra_epi16 (lo_16 , shift_count) ; let hi_shifted = _mm_sra_epi16 (hi_16 , shift_count) ; _mm_packs_epi16 (lo_shifted , hi_shifted) . simd_into (token) } } + kernel(self, a, shift) } #[inline(always)] fn shrv_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { @@ -552,49 +512,48 @@ impl Simd for Avx2 { } #[inline(always)] fn simd_eq_i8x16(self, a: i8x16, b: i8x16) -> mask8x16 { - unsafe { _mm_cmpeq_epi8(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i8x16 < Avx2 > , b : i8x16 < Avx2 >) -> mask8x16 < Avx2 > { _mm_cmpeq_epi8 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_lt_i8x16(self, a: i8x16, b: i8x16) -> mask8x16 { - unsafe { _mm_cmpgt_epi8(b.into(), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i8x16 < Avx2 > , b : i8x16 < Avx2 >) -> mask8x16 < Avx2 > { _mm_cmpgt_epi8 (b . into () , a . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_le_i8x16(self, a: i8x16, b: i8x16) -> mask8x16 { - unsafe { _mm_cmpeq_epi8(_mm_min_epi8(a.into(), b.into()), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i8x16 < Avx2 > , b : i8x16 < Avx2 >) -> mask8x16 < Avx2 > { _mm_cmpeq_epi8 (_mm_min_epi8 (a . into () , b . into ()) , a . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_ge_i8x16(self, a: i8x16, b: i8x16) -> mask8x16 { - unsafe { _mm_cmpeq_epi8(_mm_max_epi8(a.into(), b.into()), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i8x16 < Avx2 > , b : i8x16 < Avx2 >) -> mask8x16 < Avx2 > { _mm_cmpeq_epi8 (_mm_max_epi8 (a . into () , b . into ()) , a . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_gt_i8x16(self, a: i8x16, b: i8x16) -> mask8x16 { - unsafe { _mm_cmpgt_epi8(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i8x16 < Avx2 > , b : i8x16 < Avx2 >) -> mask8x16 < Avx2 > { _mm_cmpgt_epi8 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn zip_low_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - unsafe { _mm_unpacklo_epi8(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i8x16 < Avx2 > , b : i8x16 < Avx2 >) -> i8x16 < Avx2 > { _mm_unpacklo_epi8 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn zip_high_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - unsafe { _mm_unpackhi_epi8(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i8x16 < Avx2 > , b : i8x16 < Avx2 >) -> i8x16 < Avx2 > { _mm_unpackhi_epi8 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn unzip_low_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - unsafe { - let mask = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); - let t1 = _mm_shuffle_epi8(a.into(), mask); - let t2 = _mm_shuffle_epi8(b.into(), mask); - _mm_unpacklo_epi64(t1, t2).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i8x16 < Avx2 > , b : i8x16 < Avx2 >) -> i8x16 < Avx2 > { let mask = _mm_setr_epi8 (0 , 2 , 4 , 6 , 8 , 10 , 12 , 14 , 1 , 3 , 5 , 7 , 9 , 11 , 13 , 15) ; let t1 = _mm_shuffle_epi8 (a . into () , mask) ; let t2 = _mm_shuffle_epi8 (b . into () , mask) ; _mm_unpacklo_epi64 (t1 , t2) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn unzip_high_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - unsafe { - let mask = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); - let t1 = _mm_shuffle_epi8(a.into(), mask); - let t2 = _mm_shuffle_epi8(b.into(), mask); - _mm_unpackhi_epi64(t1, t2).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i8x16 < Avx2 > , b : i8x16 < Avx2 >) -> i8x16 < Avx2 > { let mask = _mm_setr_epi8 (0 , 2 , 4 , 6 , 8 , 10 , 12 , 14 , 1 , 3 , 5 , 7 , 9 , 11 , 13 , 15) ; let t1 = _mm_shuffle_epi8 (a . into () , mask) ; let t2 = _mm_shuffle_epi8 (b . into () , mask) ; _mm_unpackhi_epi64 (t1 , t2) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn interleave_i8x16(self, a: i8x16, b: i8x16) -> (i8x16, i8x16) { @@ -606,35 +565,43 @@ impl Simd for Avx2 { } #[inline(always)] fn select_i8x16(self, a: mask8x16, b: i8x16, c: i8x16) -> i8x16 { - unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask8x16 < Avx2 > , b : i8x16 < Avx2 > , c : i8x16 < Avx2 >) -> i8x16 < Avx2 > { _mm_blendv_epi8 (c . into () , b . into () , a . into ()) . simd_into (token) } } + kernel(self, a, b, c) } #[inline(always)] fn min_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - unsafe { _mm_min_epi8(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i8x16 < Avx2 > , b : i8x16 < Avx2 >) -> i8x16 < Avx2 > { _mm_min_epi8 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn max_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - unsafe { _mm_max_epi8(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i8x16 < Avx2 > , b : i8x16 < Avx2 >) -> i8x16 < Avx2 > { _mm_max_epi8 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn combine_i8x16(self, a: i8x16, b: i8x16) -> i8x32 { - unsafe { _mm256_setr_m128i(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i8x16 < Avx2 > , b : i8x16 < Avx2 >) -> i8x32 < Avx2 > { _mm256_setr_m128i (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn neg_i8x16(self, a: i8x16) -> i8x16 { - unsafe { _mm_sub_epi8(_mm_setzero_si128(), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i8x16 < Avx2 >) -> i8x16 < Avx2 > { _mm_sub_epi8 (_mm_setzero_si128 () , a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn reinterpret_u8_i8x16(self, a: i8x16) -> u8x16 { - __m128i::from(a).simd_into(self) + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i8x16 < Avx2 >) -> u8x16 < Avx2 > { __m128i :: from (a) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn reinterpret_u32_i8x16(self, a: i8x16) -> u32x4 { - __m128i::from(a).simd_into(self) + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i8x16 < Avx2 >) -> u32x4 < Avx2 > { __m128i :: from (a) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn splat_u8x16(self, val: u8) -> u8x16 { - unsafe { _mm_set1_epi8(val.cast_signed()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , val : u8) -> u8x16 < Avx2 > { _mm_set1_epi8 (val . cast_signed ()) . simd_into (token) } } + kernel(self, val) } #[inline(always)] fn load_array_u8x16(self, val: [u8; 16usize]) -> u8x16 { @@ -713,36 +680,33 @@ impl Simd for Avx2 { } #[inline(always)] fn add_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - unsafe { _mm_add_epi8(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u8x16 < Avx2 > , b : u8x16 < Avx2 >) -> u8x16 < Avx2 > { _mm_add_epi8 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn sub_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - unsafe { _mm_sub_epi8(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u8x16 < Avx2 > , b : u8x16 < Avx2 >) -> u8x16 < Avx2 > { _mm_sub_epi8 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn mul_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - unsafe { - let dst_even = _mm_mullo_epi16(a.into(), b.into()); - let dst_odd = - _mm_mullo_epi16(_mm_srli_epi16::<8>(a.into()), _mm_srli_epi16::<8>(b.into())); - _mm_or_si128( - _mm_slli_epi16(dst_odd, 8), - _mm_and_si128(dst_even, _mm_set1_epi16(0xFF)), - ) - .simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u8x16 < Avx2 > , b : u8x16 < Avx2 >) -> u8x16 < Avx2 > { let dst_even = _mm_mullo_epi16 (a . into () , b . into ()) ; let dst_odd = _mm_mullo_epi16 (_mm_srli_epi16 :: < 8 > (a . into ()) , _mm_srli_epi16 :: < 8 > (b . into ())) ; _mm_or_si128 (_mm_slli_epi16 (dst_odd , 8) , _mm_and_si128 (dst_even , _mm_set1_epi16 (0xFF))) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn and_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u8x16 < Avx2 > , b : u8x16 < Avx2 >) -> u8x16 < Avx2 > { _mm_and_si128 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn or_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u8x16 < Avx2 > , b : u8x16 < Avx2 >) -> u8x16 < Avx2 > { _mm_or_si128 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn xor_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u8x16 < Avx2 > , b : u8x16 < Avx2 >) -> u8x16 < Avx2 > { _mm_xor_si128 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn not_u8x16(self, a: u8x16) -> u8x16 { @@ -750,15 +714,8 @@ impl Simd for Avx2 { } #[inline(always)] fn shl_u8x16(self, a: u8x16, shift: u32) -> u8x16 { - unsafe { - let val = a.into(); - let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); - let lo_16 = _mm_unpacklo_epi8(val, _mm_setzero_si128()); - let hi_16 = _mm_unpackhi_epi8(val, _mm_setzero_si128()); - let lo_shifted = _mm_sll_epi16(lo_16, shift_count); - let hi_shifted = _mm_sll_epi16(hi_16, shift_count); - _mm_packus_epi16(lo_shifted, hi_shifted).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u8x16 < Avx2 > , shift : u32) -> u8x16 < Avx2 > { let val = a . into () ; let shift_count = _mm_cvtsi32_si128 (shift . cast_signed ()) ; let lo_16 = _mm_unpacklo_epi8 (val , _mm_setzero_si128 ()) ; let hi_16 = _mm_unpackhi_epi8 (val , _mm_setzero_si128 ()) ; let lo_shifted = _mm_sll_epi16 (lo_16 , shift_count) ; let hi_shifted = _mm_sll_epi16 (hi_16 , shift_count) ; _mm_packus_epi16 (lo_shifted , hi_shifted) . simd_into (token) } } + kernel(self, a, shift) } #[inline(always)] fn shlv_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { @@ -766,15 +723,8 @@ impl Simd for Avx2 { } #[inline(always)] fn shr_u8x16(self, a: u8x16, shift: u32) -> u8x16 { - unsafe { - let val = a.into(); - let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); - let lo_16 = _mm_unpacklo_epi8(val, _mm_setzero_si128()); - let hi_16 = _mm_unpackhi_epi8(val, _mm_setzero_si128()); - let lo_shifted = _mm_srl_epi16(lo_16, shift_count); - let hi_shifted = _mm_srl_epi16(hi_16, shift_count); - _mm_packus_epi16(lo_shifted, hi_shifted).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u8x16 < Avx2 > , shift : u32) -> u8x16 < Avx2 > { let val = a . into () ; let shift_count = _mm_cvtsi32_si128 (shift . cast_signed ()) ; let lo_16 = _mm_unpacklo_epi8 (val , _mm_setzero_si128 ()) ; let hi_16 = _mm_unpackhi_epi8 (val , _mm_setzero_si128 ()) ; let lo_shifted = _mm_srl_epi16 (lo_16 , shift_count) ; let hi_shifted = _mm_srl_epi16 (hi_16 , shift_count) ; _mm_packus_epi16 (lo_shifted , hi_shifted) . simd_into (token) } } + kernel(self, a, shift) } #[inline(always)] fn shrv_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { @@ -782,59 +732,48 @@ impl Simd for Avx2 { } #[inline(always)] fn simd_eq_u8x16(self, a: u8x16, b: u8x16) -> mask8x16 { - unsafe { _mm_cmpeq_epi8(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u8x16 < Avx2 > , b : u8x16 < Avx2 >) -> mask8x16 < Avx2 > { _mm_cmpeq_epi8 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_lt_u8x16(self, a: u8x16, b: u8x16) -> mask8x16 { - unsafe { - let sign_bit = _mm_set1_epi8(0x80u8.cast_signed()); - let a_signed = _mm_xor_si128(a.into(), sign_bit); - let b_signed = _mm_xor_si128(b.into(), sign_bit); - _mm_cmpgt_epi8(b_signed, a_signed).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u8x16 < Avx2 > , b : u8x16 < Avx2 >) -> mask8x16 < Avx2 > { let sign_bit = _mm_set1_epi8 (0x80u8 . cast_signed ()) ; let a_signed = _mm_xor_si128 (a . into () , sign_bit) ; let b_signed = _mm_xor_si128 (b . into () , sign_bit) ; _mm_cmpgt_epi8 (b_signed , a_signed) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_le_u8x16(self, a: u8x16, b: u8x16) -> mask8x16 { - unsafe { _mm_cmpeq_epi8(_mm_min_epu8(a.into(), b.into()), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u8x16 < Avx2 > , b : u8x16 < Avx2 >) -> mask8x16 < Avx2 > { _mm_cmpeq_epi8 (_mm_min_epu8 (a . into () , b . into ()) , a . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_ge_u8x16(self, a: u8x16, b: u8x16) -> mask8x16 { - unsafe { _mm_cmpeq_epi8(_mm_max_epu8(a.into(), b.into()), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u8x16 < Avx2 > , b : u8x16 < Avx2 >) -> mask8x16 < Avx2 > { _mm_cmpeq_epi8 (_mm_max_epu8 (a . into () , b . into ()) , a . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_gt_u8x16(self, a: u8x16, b: u8x16) -> mask8x16 { - unsafe { - let sign_bit = _mm_set1_epi8(0x80u8.cast_signed()); - let a_signed = _mm_xor_si128(a.into(), sign_bit); - let b_signed = _mm_xor_si128(b.into(), sign_bit); - _mm_cmpgt_epi8(a_signed, b_signed).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u8x16 < Avx2 > , b : u8x16 < Avx2 >) -> mask8x16 < Avx2 > { let sign_bit = _mm_set1_epi8 (0x80u8 . cast_signed ()) ; let a_signed = _mm_xor_si128 (a . into () , sign_bit) ; let b_signed = _mm_xor_si128 (b . into () , sign_bit) ; _mm_cmpgt_epi8 (a_signed , b_signed) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn zip_low_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - unsafe { _mm_unpacklo_epi8(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u8x16 < Avx2 > , b : u8x16 < Avx2 >) -> u8x16 < Avx2 > { _mm_unpacklo_epi8 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn zip_high_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - unsafe { _mm_unpackhi_epi8(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u8x16 < Avx2 > , b : u8x16 < Avx2 >) -> u8x16 < Avx2 > { _mm_unpackhi_epi8 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn unzip_low_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - unsafe { - let mask = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); - let t1 = _mm_shuffle_epi8(a.into(), mask); - let t2 = _mm_shuffle_epi8(b.into(), mask); - _mm_unpacklo_epi64(t1, t2).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u8x16 < Avx2 > , b : u8x16 < Avx2 >) -> u8x16 < Avx2 > { let mask = _mm_setr_epi8 (0 , 2 , 4 , 6 , 8 , 10 , 12 , 14 , 1 , 3 , 5 , 7 , 9 , 11 , 13 , 15) ; let t1 = _mm_shuffle_epi8 (a . into () , mask) ; let t2 = _mm_shuffle_epi8 (b . into () , mask) ; _mm_unpacklo_epi64 (t1 , t2) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn unzip_high_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - unsafe { - let mask = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); - let t1 = _mm_shuffle_epi8(a.into(), mask); - let t2 = _mm_shuffle_epi8(b.into(), mask); - _mm_unpackhi_epi64(t1, t2).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u8x16 < Avx2 > , b : u8x16 < Avx2 >) -> u8x16 < Avx2 > { let mask = _mm_setr_epi8 (0 , 2 , 4 , 6 , 8 , 10 , 12 , 14 , 1 , 3 , 5 , 7 , 9 , 11 , 13 , 15) ; let t1 = _mm_shuffle_epi8 (a . into () , mask) ; let t2 = _mm_shuffle_epi8 (b . into () , mask) ; _mm_unpackhi_epi64 (t1 , t2) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn interleave_u8x16(self, a: u8x16, b: u8x16) -> (u8x16, u8x16) { @@ -846,34 +785,38 @@ impl Simd for Avx2 { } #[inline(always)] fn select_u8x16(self, a: mask8x16, b: u8x16, c: u8x16) -> u8x16 { - unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask8x16 < Avx2 > , b : u8x16 < Avx2 > , c : u8x16 < Avx2 >) -> u8x16 < Avx2 > { _mm_blendv_epi8 (c . into () , b . into () , a . into ()) . simd_into (token) } } + kernel(self, a, b, c) } #[inline(always)] fn min_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - unsafe { _mm_min_epu8(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u8x16 < Avx2 > , b : u8x16 < Avx2 >) -> u8x16 < Avx2 > { _mm_min_epu8 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn max_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - unsafe { _mm_max_epu8(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u8x16 < Avx2 > , b : u8x16 < Avx2 >) -> u8x16 < Avx2 > { _mm_max_epu8 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn combine_u8x16(self, a: u8x16, b: u8x16) -> u8x32 { - unsafe { _mm256_setr_m128i(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u8x16 < Avx2 > , b : u8x16 < Avx2 >) -> u8x32 < Avx2 > { _mm256_setr_m128i (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn widen_u8x16(self, a: u8x16) -> u16x16 { - unsafe { _mm256_cvtepu8_epi16(a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u8x16 < Avx2 >) -> u16x16 < Avx2 > { _mm256_cvtepu8_epi16 (a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn reinterpret_u32_u8x16(self, a: u8x16) -> u32x4 { - __m128i::from(a).simd_into(self) + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u8x16 < Avx2 >) -> u32x4 < Avx2 > { __m128i :: from (a) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn splat_mask8x16(self, val: bool) -> mask8x16 { - unsafe { - let val: i8 = if val { !0 } else { 0 }; - _mm_set1_epi8(val).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , val : bool) -> mask8x16 < Avx2 > { let val : i8 = if val { ! 0 } else { 0 } ; _mm_set1_epi8 (val) . simd_into (token) } } + kernel(self, val) } #[inline(always)] fn load_array_mask8x16(self, val: [i8; 16usize]) -> mask8x16 { @@ -888,35 +831,28 @@ impl Simd for Avx2 { } #[inline(always)] fn from_bitmask_mask8x16(self, bits: u64) -> mask8x16 { - unsafe { - { - let bit_bytes = _mm_cvtsi32_si128(bits as i32); - let bit_bytes = _mm_shuffle_epi8( - bit_bytes, - _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1), - ); - let bit_mask = - _mm_setr_epi8(1, 2, 4, 8, 16, 32, 64, -128, 1, 2, 4, 8, 16, 32, 64, -128); - _mm_cmpeq_epi8(_mm_and_si128(bit_bytes, bit_mask), bit_mask) - } - .simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , bits : u64) -> mask8x16 < Avx2 > { { let bit_bytes = _mm_cvtsi32_si128 (bits as i32) ; let bit_bytes = _mm_shuffle_epi8 (bit_bytes , _mm_setr_epi8 (0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1)) ; let bit_mask = _mm_setr_epi8 (1 , 2 , 4 , 8 , 16 , 32 , 64 , - 128 , 1 , 2 , 4 , 8 , 16 , 32 , 64 , - 128) ; _mm_cmpeq_epi8 (_mm_and_si128 (bit_bytes , bit_mask) , bit_mask) } . simd_into (token) } } + kernel(self, bits) } #[inline(always)] fn to_bitmask_mask8x16(self, a: mask8x16) -> u64 { - unsafe { _mm_movemask_epi8(a.into()) as u32 as u64 } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask8x16 < Avx2 >) -> u64 { _mm_movemask_epi8 (a . into ()) as u32 as u64 } } + kernel(self, a) } #[inline(always)] fn and_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x16 { - unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask8x16 < Avx2 > , b : mask8x16 < Avx2 >) -> mask8x16 < Avx2 > { _mm_and_si128 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn or_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x16 { - unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask8x16 < Avx2 > , b : mask8x16 < Avx2 >) -> mask8x16 < Avx2 > { _mm_or_si128 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn xor_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x16 { - unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask8x16 < Avx2 > , b : mask8x16 < Avx2 >) -> mask8x16 < Avx2 > { _mm_xor_si128 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn not_mask8x16(self, a: mask8x16) -> mask8x16 { @@ -929,35 +865,43 @@ impl Simd for Avx2 { b: mask8x16, c: mask8x16, ) -> mask8x16 { - unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask8x16 < Avx2 > , b : mask8x16 < Avx2 > , c : mask8x16 < Avx2 >) -> mask8x16 < Avx2 > { _mm_blendv_epi8 (c . into () , b . into () , a . into ()) . simd_into (token) } } + kernel(self, a, b, c) } #[inline(always)] fn simd_eq_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x16 { - unsafe { _mm_cmpeq_epi8(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask8x16 < Avx2 > , b : mask8x16 < Avx2 >) -> mask8x16 < Avx2 > { _mm_cmpeq_epi8 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn any_true_mask8x16(self, a: mask8x16) -> bool { - unsafe { _mm_movemask_epi8(a.into()) as u32 != 0 } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask8x16 < Avx2 >) -> bool { _mm_movemask_epi8 (a . into ()) as u32 != 0 } } + kernel(self, a) } #[inline(always)] fn all_true_mask8x16(self, a: mask8x16) -> bool { - unsafe { _mm_movemask_epi8(a.into()) as u32 == 0xffff } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask8x16 < Avx2 >) -> bool { _mm_movemask_epi8 (a . into ()) as u32 == 0xffff } } + kernel(self, a) } #[inline(always)] fn any_false_mask8x16(self, a: mask8x16) -> bool { - unsafe { _mm_movemask_epi8(a.into()) as u32 != 0xffff } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask8x16 < Avx2 >) -> bool { _mm_movemask_epi8 (a . into ()) as u32 != 0xffff } } + kernel(self, a) } #[inline(always)] fn all_false_mask8x16(self, a: mask8x16) -> bool { - unsafe { _mm_movemask_epi8(a.into()) as u32 == 0 } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask8x16 < Avx2 >) -> bool { _mm_movemask_epi8 (a . into ()) as u32 == 0 } } + kernel(self, a) } #[inline(always)] fn combine_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x32 { - unsafe { _mm256_setr_m128i(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask8x16 < Avx2 > , b : mask8x16 < Avx2 >) -> mask8x32 < Avx2 > { _mm256_setr_m128i (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn splat_i16x8(self, val: i16) -> i16x8 { - unsafe { _mm_set1_epi16(val).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , val : i16) -> i16x8 < Avx2 > { _mm_set1_epi16 (val) . simd_into (token) } } + kernel(self, val) } #[inline(always)] fn load_array_i16x8(self, val: [i16; 8usize]) -> i16x8 { @@ -1036,27 +980,33 @@ impl Simd for Avx2 { } #[inline(always)] fn add_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - unsafe { _mm_add_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i16x8 < Avx2 > , b : i16x8 < Avx2 >) -> i16x8 < Avx2 > { _mm_add_epi16 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn sub_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - unsafe { _mm_sub_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i16x8 < Avx2 > , b : i16x8 < Avx2 >) -> i16x8 < Avx2 > { _mm_sub_epi16 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn mul_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - unsafe { _mm_mullo_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i16x8 < Avx2 > , b : i16x8 < Avx2 >) -> i16x8 < Avx2 > { _mm_mullo_epi16 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn and_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i16x8 < Avx2 > , b : i16x8 < Avx2 >) -> i16x8 < Avx2 > { _mm_and_si128 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn or_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i16x8 < Avx2 > , b : i16x8 < Avx2 >) -> i16x8 < Avx2 > { _mm_or_si128 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn xor_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i16x8 < Avx2 > , b : i16x8 < Avx2 >) -> i16x8 < Avx2 > { _mm_xor_si128 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn not_i16x8(self, a: i16x8) -> i16x8 { @@ -1064,7 +1014,8 @@ impl Simd for Avx2 { } #[inline(always)] fn shl_i16x8(self, a: i16x8, shift: u32) -> i16x8 { - unsafe { _mm_sll_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i16x8 < Avx2 > , shift : u32) -> i16x8 < Avx2 > { _mm_sll_epi16 (a . into () , _mm_cvtsi32_si128 (shift . cast_signed ())) . simd_into (token) } } + kernel(self, a, shift) } #[inline(always)] fn shlv_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { @@ -1072,7 +1023,8 @@ impl Simd for Avx2 { } #[inline(always)] fn shr_i16x8(self, a: i16x8, shift: u32) -> i16x8 { - unsafe { _mm_sra_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i16x8 < Avx2 > , shift : u32) -> i16x8 < Avx2 > { _mm_sra_epi16 (a . into () , _mm_cvtsi32_si128 (shift . cast_signed ())) . simd_into (token) } } + kernel(self, a, shift) } #[inline(always)] fn shrv_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { @@ -1080,49 +1032,48 @@ impl Simd for Avx2 { } #[inline(always)] fn simd_eq_i16x8(self, a: i16x8, b: i16x8) -> mask16x8 { - unsafe { _mm_cmpeq_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i16x8 < Avx2 > , b : i16x8 < Avx2 >) -> mask16x8 < Avx2 > { _mm_cmpeq_epi16 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_lt_i16x8(self, a: i16x8, b: i16x8) -> mask16x8 { - unsafe { _mm_cmpgt_epi16(b.into(), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i16x8 < Avx2 > , b : i16x8 < Avx2 >) -> mask16x8 < Avx2 > { _mm_cmpgt_epi16 (b . into () , a . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_le_i16x8(self, a: i16x8, b: i16x8) -> mask16x8 { - unsafe { _mm_cmpeq_epi16(_mm_min_epi16(a.into(), b.into()), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i16x8 < Avx2 > , b : i16x8 < Avx2 >) -> mask16x8 < Avx2 > { _mm_cmpeq_epi16 (_mm_min_epi16 (a . into () , b . into ()) , a . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_ge_i16x8(self, a: i16x8, b: i16x8) -> mask16x8 { - unsafe { _mm_cmpeq_epi16(_mm_max_epi16(a.into(), b.into()), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i16x8 < Avx2 > , b : i16x8 < Avx2 >) -> mask16x8 < Avx2 > { _mm_cmpeq_epi16 (_mm_max_epi16 (a . into () , b . into ()) , a . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_gt_i16x8(self, a: i16x8, b: i16x8) -> mask16x8 { - unsafe { _mm_cmpgt_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i16x8 < Avx2 > , b : i16x8 < Avx2 >) -> mask16x8 < Avx2 > { _mm_cmpgt_epi16 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn zip_low_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - unsafe { _mm_unpacklo_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i16x8 < Avx2 > , b : i16x8 < Avx2 >) -> i16x8 < Avx2 > { _mm_unpacklo_epi16 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn zip_high_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - unsafe { _mm_unpackhi_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i16x8 < Avx2 > , b : i16x8 < Avx2 >) -> i16x8 < Avx2 > { _mm_unpackhi_epi16 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn unzip_low_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - unsafe { - let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15); - let t1 = _mm_shuffle_epi8(a.into(), mask); - let t2 = _mm_shuffle_epi8(b.into(), mask); - _mm_unpacklo_epi64(t1, t2).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i16x8 < Avx2 > , b : i16x8 < Avx2 >) -> i16x8 < Avx2 > { let mask = _mm_setr_epi8 (0 , 1 , 4 , 5 , 8 , 9 , 12 , 13 , 2 , 3 , 6 , 7 , 10 , 11 , 14 , 15) ; let t1 = _mm_shuffle_epi8 (a . into () , mask) ; let t2 = _mm_shuffle_epi8 (b . into () , mask) ; _mm_unpacklo_epi64 (t1 , t2) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn unzip_high_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - unsafe { - let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15); - let t1 = _mm_shuffle_epi8(a.into(), mask); - let t2 = _mm_shuffle_epi8(b.into(), mask); - _mm_unpackhi_epi64(t1, t2).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i16x8 < Avx2 > , b : i16x8 < Avx2 >) -> i16x8 < Avx2 > { let mask = _mm_setr_epi8 (0 , 1 , 4 , 5 , 8 , 9 , 12 , 13 , 2 , 3 , 6 , 7 , 10 , 11 , 14 , 15) ; let t1 = _mm_shuffle_epi8 (a . into () , mask) ; let t2 = _mm_shuffle_epi8 (b . into () , mask) ; _mm_unpackhi_epi64 (t1 , t2) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn interleave_i16x8(self, a: i16x8, b: i16x8) -> (i16x8, i16x8) { @@ -1134,35 +1085,43 @@ impl Simd for Avx2 { } #[inline(always)] fn select_i16x8(self, a: mask16x8, b: i16x8, c: i16x8) -> i16x8 { - unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask16x8 < Avx2 > , b : i16x8 < Avx2 > , c : i16x8 < Avx2 >) -> i16x8 < Avx2 > { _mm_blendv_epi8 (c . into () , b . into () , a . into ()) . simd_into (token) } } + kernel(self, a, b, c) } #[inline(always)] fn min_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - unsafe { _mm_min_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i16x8 < Avx2 > , b : i16x8 < Avx2 >) -> i16x8 < Avx2 > { _mm_min_epi16 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn max_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - unsafe { _mm_max_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i16x8 < Avx2 > , b : i16x8 < Avx2 >) -> i16x8 < Avx2 > { _mm_max_epi16 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn combine_i16x8(self, a: i16x8, b: i16x8) -> i16x16 { - unsafe { _mm256_setr_m128i(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i16x8 < Avx2 > , b : i16x8 < Avx2 >) -> i16x16 < Avx2 > { _mm256_setr_m128i (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn neg_i16x8(self, a: i16x8) -> i16x8 { - unsafe { _mm_sub_epi16(_mm_setzero_si128(), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i16x8 < Avx2 >) -> i16x8 < Avx2 > { _mm_sub_epi16 (_mm_setzero_si128 () , a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn reinterpret_u8_i16x8(self, a: i16x8) -> u8x16 { - __m128i::from(a).simd_into(self) + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i16x8 < Avx2 >) -> u8x16 < Avx2 > { __m128i :: from (a) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn reinterpret_u32_i16x8(self, a: i16x8) -> u32x4 { - __m128i::from(a).simd_into(self) + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i16x8 < Avx2 >) -> u32x4 < Avx2 > { __m128i :: from (a) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn splat_u16x8(self, val: u16) -> u16x8 { - unsafe { _mm_set1_epi16(val.cast_signed()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , val : u16) -> u16x8 < Avx2 > { _mm_set1_epi16 (val . cast_signed ()) . simd_into (token) } } + kernel(self, val) } #[inline(always)] fn load_array_u16x8(self, val: [u16; 8usize]) -> u16x8 { @@ -1241,27 +1200,33 @@ impl Simd for Avx2 { } #[inline(always)] fn add_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - unsafe { _mm_add_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u16x8 < Avx2 > , b : u16x8 < Avx2 >) -> u16x8 < Avx2 > { _mm_add_epi16 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn sub_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - unsafe { _mm_sub_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u16x8 < Avx2 > , b : u16x8 < Avx2 >) -> u16x8 < Avx2 > { _mm_sub_epi16 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn mul_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - unsafe { _mm_mullo_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u16x8 < Avx2 > , b : u16x8 < Avx2 >) -> u16x8 < Avx2 > { _mm_mullo_epi16 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn and_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u16x8 < Avx2 > , b : u16x8 < Avx2 >) -> u16x8 < Avx2 > { _mm_and_si128 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn or_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u16x8 < Avx2 > , b : u16x8 < Avx2 >) -> u16x8 < Avx2 > { _mm_or_si128 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn xor_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u16x8 < Avx2 > , b : u16x8 < Avx2 >) -> u16x8 < Avx2 > { _mm_xor_si128 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn not_u16x8(self, a: u16x8) -> u16x8 { @@ -1269,7 +1234,8 @@ impl Simd for Avx2 { } #[inline(always)] fn shl_u16x8(self, a: u16x8, shift: u32) -> u16x8 { - unsafe { _mm_sll_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u16x8 < Avx2 > , shift : u32) -> u16x8 < Avx2 > { _mm_sll_epi16 (a . into () , _mm_cvtsi32_si128 (shift . cast_signed ())) . simd_into (token) } } + kernel(self, a, shift) } #[inline(always)] fn shlv_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { @@ -1277,7 +1243,8 @@ impl Simd for Avx2 { } #[inline(always)] fn shr_u16x8(self, a: u16x8, shift: u32) -> u16x8 { - unsafe { _mm_srl_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u16x8 < Avx2 > , shift : u32) -> u16x8 < Avx2 > { _mm_srl_epi16 (a . into () , _mm_cvtsi32_si128 (shift . cast_signed ())) . simd_into (token) } } + kernel(self, a, shift) } #[inline(always)] fn shrv_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { @@ -1285,59 +1252,48 @@ impl Simd for Avx2 { } #[inline(always)] fn simd_eq_u16x8(self, a: u16x8, b: u16x8) -> mask16x8 { - unsafe { _mm_cmpeq_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u16x8 < Avx2 > , b : u16x8 < Avx2 >) -> mask16x8 < Avx2 > { _mm_cmpeq_epi16 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_lt_u16x8(self, a: u16x8, b: u16x8) -> mask16x8 { - unsafe { - let sign_bit = _mm_set1_epi16(0x8000u16.cast_signed()); - let a_signed = _mm_xor_si128(a.into(), sign_bit); - let b_signed = _mm_xor_si128(b.into(), sign_bit); - _mm_cmpgt_epi16(b_signed, a_signed).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u16x8 < Avx2 > , b : u16x8 < Avx2 >) -> mask16x8 < Avx2 > { let sign_bit = _mm_set1_epi16 (0x8000u16 . cast_signed ()) ; let a_signed = _mm_xor_si128 (a . into () , sign_bit) ; let b_signed = _mm_xor_si128 (b . into () , sign_bit) ; _mm_cmpgt_epi16 (b_signed , a_signed) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_le_u16x8(self, a: u16x8, b: u16x8) -> mask16x8 { - unsafe { _mm_cmpeq_epi16(_mm_min_epu16(a.into(), b.into()), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u16x8 < Avx2 > , b : u16x8 < Avx2 >) -> mask16x8 < Avx2 > { _mm_cmpeq_epi16 (_mm_min_epu16 (a . into () , b . into ()) , a . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_ge_u16x8(self, a: u16x8, b: u16x8) -> mask16x8 { - unsafe { _mm_cmpeq_epi16(_mm_max_epu16(a.into(), b.into()), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u16x8 < Avx2 > , b : u16x8 < Avx2 >) -> mask16x8 < Avx2 > { _mm_cmpeq_epi16 (_mm_max_epu16 (a . into () , b . into ()) , a . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_gt_u16x8(self, a: u16x8, b: u16x8) -> mask16x8 { - unsafe { - let sign_bit = _mm_set1_epi16(0x8000u16.cast_signed()); - let a_signed = _mm_xor_si128(a.into(), sign_bit); - let b_signed = _mm_xor_si128(b.into(), sign_bit); - _mm_cmpgt_epi16(a_signed, b_signed).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u16x8 < Avx2 > , b : u16x8 < Avx2 >) -> mask16x8 < Avx2 > { let sign_bit = _mm_set1_epi16 (0x8000u16 . cast_signed ()) ; let a_signed = _mm_xor_si128 (a . into () , sign_bit) ; let b_signed = _mm_xor_si128 (b . into () , sign_bit) ; _mm_cmpgt_epi16 (a_signed , b_signed) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn zip_low_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - unsafe { _mm_unpacklo_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u16x8 < Avx2 > , b : u16x8 < Avx2 >) -> u16x8 < Avx2 > { _mm_unpacklo_epi16 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn zip_high_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - unsafe { _mm_unpackhi_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u16x8 < Avx2 > , b : u16x8 < Avx2 >) -> u16x8 < Avx2 > { _mm_unpackhi_epi16 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn unzip_low_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - unsafe { - let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15); - let t1 = _mm_shuffle_epi8(a.into(), mask); - let t2 = _mm_shuffle_epi8(b.into(), mask); - _mm_unpacklo_epi64(t1, t2).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u16x8 < Avx2 > , b : u16x8 < Avx2 >) -> u16x8 < Avx2 > { let mask = _mm_setr_epi8 (0 , 1 , 4 , 5 , 8 , 9 , 12 , 13 , 2 , 3 , 6 , 7 , 10 , 11 , 14 , 15) ; let t1 = _mm_shuffle_epi8 (a . into () , mask) ; let t2 = _mm_shuffle_epi8 (b . into () , mask) ; _mm_unpacklo_epi64 (t1 , t2) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn unzip_high_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - unsafe { - let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15); - let t1 = _mm_shuffle_epi8(a.into(), mask); - let t2 = _mm_shuffle_epi8(b.into(), mask); - _mm_unpackhi_epi64(t1, t2).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u16x8 < Avx2 > , b : u16x8 < Avx2 >) -> u16x8 < Avx2 > { let mask = _mm_setr_epi8 (0 , 1 , 4 , 5 , 8 , 9 , 12 , 13 , 2 , 3 , 6 , 7 , 10 , 11 , 14 , 15) ; let t1 = _mm_shuffle_epi8 (a . into () , mask) ; let t2 = _mm_shuffle_epi8 (b . into () , mask) ; _mm_unpackhi_epi64 (t1 , t2) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn interleave_u16x8(self, a: u16x8, b: u16x8) -> (u16x8, u16x8) { @@ -1349,34 +1305,38 @@ impl Simd for Avx2 { } #[inline(always)] fn select_u16x8(self, a: mask16x8, b: u16x8, c: u16x8) -> u16x8 { - unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask16x8 < Avx2 > , b : u16x8 < Avx2 > , c : u16x8 < Avx2 >) -> u16x8 < Avx2 > { _mm_blendv_epi8 (c . into () , b . into () , a . into ()) . simd_into (token) } } + kernel(self, a, b, c) } #[inline(always)] fn min_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - unsafe { _mm_min_epu16(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u16x8 < Avx2 > , b : u16x8 < Avx2 >) -> u16x8 < Avx2 > { _mm_min_epu16 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn max_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - unsafe { _mm_max_epu16(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u16x8 < Avx2 > , b : u16x8 < Avx2 >) -> u16x8 < Avx2 > { _mm_max_epu16 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn combine_u16x8(self, a: u16x8, b: u16x8) -> u16x16 { - unsafe { _mm256_setr_m128i(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u16x8 < Avx2 > , b : u16x8 < Avx2 >) -> u16x16 < Avx2 > { _mm256_setr_m128i (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn reinterpret_u8_u16x8(self, a: u16x8) -> u8x16 { - __m128i::from(a).simd_into(self) + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u16x8 < Avx2 >) -> u8x16 < Avx2 > { __m128i :: from (a) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn reinterpret_u32_u16x8(self, a: u16x8) -> u32x4 { - __m128i::from(a).simd_into(self) + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u16x8 < Avx2 >) -> u32x4 < Avx2 > { __m128i :: from (a) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn splat_mask16x8(self, val: bool) -> mask16x8 { - unsafe { - let val: i16 = if val { !0 } else { 0 }; - _mm_set1_epi16(val).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , val : bool) -> mask16x8 < Avx2 > { let val : i16 = if val { ! 0 } else { 0 } ; _mm_set1_epi16 (val) . simd_into (token) } } + kernel(self, val) } #[inline(always)] fn load_array_mask16x8(self, val: [i16; 8usize]) -> mask16x8 { @@ -1391,35 +1351,28 @@ impl Simd for Avx2 { } #[inline(always)] fn from_bitmask_mask16x8(self, bits: u64) -> mask16x8 { - unsafe { - { - let bit_lanes = _mm_set1_epi16(bits as i16); - let bit_mask = _mm_setr_epi16(1, 2, 4, 8, 16, 32, 64, 128); - _mm_cmpeq_epi16(_mm_and_si128(bit_lanes, bit_mask), bit_mask) - } - .simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , bits : u64) -> mask16x8 < Avx2 > { { let bit_lanes = _mm_set1_epi16 (bits as i16) ; let bit_mask = _mm_setr_epi16 (1 , 2 , 4 , 8 , 16 , 32 , 64 , 128) ; _mm_cmpeq_epi16 (_mm_and_si128 (bit_lanes , bit_mask) , bit_mask) } . simd_into (token) } } + kernel(self, bits) } #[inline(always)] fn to_bitmask_mask16x8(self, a: mask16x8) -> u64 { - unsafe { - { - let packed = _mm_packs_epi16(a.into(), a.into()); - _mm_movemask_epi8(packed) as u8 as u64 - } - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask16x8 < Avx2 >) -> u64 { { let packed = _mm_packs_epi16 (a . into () , a . into ()) ; _mm_movemask_epi8 (packed) as u8 as u64 } } } + kernel(self, a) } #[inline(always)] fn and_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x8 { - unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask16x8 < Avx2 > , b : mask16x8 < Avx2 >) -> mask16x8 < Avx2 > { _mm_and_si128 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn or_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x8 { - unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask16x8 < Avx2 > , b : mask16x8 < Avx2 >) -> mask16x8 < Avx2 > { _mm_or_si128 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn xor_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x8 { - unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask16x8 < Avx2 > , b : mask16x8 < Avx2 >) -> mask16x8 < Avx2 > { _mm_xor_si128 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn not_mask16x8(self, a: mask16x8) -> mask16x8 { @@ -1432,35 +1385,43 @@ impl Simd for Avx2 { b: mask16x8, c: mask16x8, ) -> mask16x8 { - unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask16x8 < Avx2 > , b : mask16x8 < Avx2 > , c : mask16x8 < Avx2 >) -> mask16x8 < Avx2 > { _mm_blendv_epi8 (c . into () , b . into () , a . into ()) . simd_into (token) } } + kernel(self, a, b, c) } #[inline(always)] fn simd_eq_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x8 { - unsafe { _mm_cmpeq_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask16x8 < Avx2 > , b : mask16x8 < Avx2 >) -> mask16x8 < Avx2 > { _mm_cmpeq_epi16 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn any_true_mask16x8(self, a: mask16x8) -> bool { - unsafe { _mm_movemask_epi8(a.into()) as u32 != 0 } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask16x8 < Avx2 >) -> bool { _mm_movemask_epi8 (a . into ()) as u32 != 0 } } + kernel(self, a) } #[inline(always)] fn all_true_mask16x8(self, a: mask16x8) -> bool { - unsafe { _mm_movemask_epi8(a.into()) as u32 == 0xffff } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask16x8 < Avx2 >) -> bool { _mm_movemask_epi8 (a . into ()) as u32 == 0xffff } } + kernel(self, a) } #[inline(always)] fn any_false_mask16x8(self, a: mask16x8) -> bool { - unsafe { _mm_movemask_epi8(a.into()) as u32 != 0xffff } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask16x8 < Avx2 >) -> bool { _mm_movemask_epi8 (a . into ()) as u32 != 0xffff } } + kernel(self, a) } #[inline(always)] fn all_false_mask16x8(self, a: mask16x8) -> bool { - unsafe { _mm_movemask_epi8(a.into()) as u32 == 0 } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask16x8 < Avx2 >) -> bool { _mm_movemask_epi8 (a . into ()) as u32 == 0 } } + kernel(self, a) } #[inline(always)] fn combine_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x16 { - unsafe { _mm256_setr_m128i(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask16x8 < Avx2 > , b : mask16x8 < Avx2 >) -> mask16x16 < Avx2 > { _mm256_setr_m128i (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn splat_i32x4(self, val: i32) -> i32x4 { - unsafe { _mm_set1_epi32(val).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , val : i32) -> i32x4 < Avx2 > { _mm_set1_epi32 (val) . simd_into (token) } } + kernel(self, val) } #[inline(always)] fn load_array_i32x4(self, val: [i32; 4usize]) -> i32x4 { @@ -1539,27 +1500,33 @@ impl Simd for Avx2 { } #[inline(always)] fn add_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - unsafe { _mm_add_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i32x4 < Avx2 > , b : i32x4 < Avx2 >) -> i32x4 < Avx2 > { _mm_add_epi32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn sub_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - unsafe { _mm_sub_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i32x4 < Avx2 > , b : i32x4 < Avx2 >) -> i32x4 < Avx2 > { _mm_sub_epi32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn mul_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - unsafe { _mm_mullo_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i32x4 < Avx2 > , b : i32x4 < Avx2 >) -> i32x4 < Avx2 > { _mm_mullo_epi32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn and_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i32x4 < Avx2 > , b : i32x4 < Avx2 >) -> i32x4 < Avx2 > { _mm_and_si128 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn or_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i32x4 < Avx2 > , b : i32x4 < Avx2 >) -> i32x4 < Avx2 > { _mm_or_si128 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn xor_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i32x4 < Avx2 > , b : i32x4 < Avx2 >) -> i32x4 < Avx2 > { _mm_xor_si128 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn not_i32x4(self, a: i32x4) -> i32x4 { @@ -1567,63 +1534,68 @@ impl Simd for Avx2 { } #[inline(always)] fn shl_i32x4(self, a: i32x4, shift: u32) -> i32x4 { - unsafe { _mm_sll_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i32x4 < Avx2 > , shift : u32) -> i32x4 < Avx2 > { _mm_sll_epi32 (a . into () , _mm_cvtsi32_si128 (shift . cast_signed ())) . simd_into (token) } } + kernel(self, a, shift) } #[inline(always)] fn shlv_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - unsafe { _mm_sllv_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i32x4 < Avx2 > , b : i32x4 < Avx2 >) -> i32x4 < Avx2 > { _mm_sllv_epi32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn shr_i32x4(self, a: i32x4, shift: u32) -> i32x4 { - unsafe { _mm_sra_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i32x4 < Avx2 > , shift : u32) -> i32x4 < Avx2 > { _mm_sra_epi32 (a . into () , _mm_cvtsi32_si128 (shift . cast_signed ())) . simd_into (token) } } + kernel(self, a, shift) } #[inline(always)] fn shrv_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - unsafe { _mm_srav_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i32x4 < Avx2 > , b : i32x4 < Avx2 >) -> i32x4 < Avx2 > { _mm_srav_epi32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_eq_i32x4(self, a: i32x4, b: i32x4) -> mask32x4 { - unsafe { _mm_cmpeq_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i32x4 < Avx2 > , b : i32x4 < Avx2 >) -> mask32x4 < Avx2 > { _mm_cmpeq_epi32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_lt_i32x4(self, a: i32x4, b: i32x4) -> mask32x4 { - unsafe { _mm_cmpgt_epi32(b.into(), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i32x4 < Avx2 > , b : i32x4 < Avx2 >) -> mask32x4 < Avx2 > { _mm_cmpgt_epi32 (b . into () , a . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_le_i32x4(self, a: i32x4, b: i32x4) -> mask32x4 { - unsafe { _mm_cmpeq_epi32(_mm_min_epi32(a.into(), b.into()), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i32x4 < Avx2 > , b : i32x4 < Avx2 >) -> mask32x4 < Avx2 > { _mm_cmpeq_epi32 (_mm_min_epi32 (a . into () , b . into ()) , a . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_ge_i32x4(self, a: i32x4, b: i32x4) -> mask32x4 { - unsafe { _mm_cmpeq_epi32(_mm_max_epi32(a.into(), b.into()), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i32x4 < Avx2 > , b : i32x4 < Avx2 >) -> mask32x4 < Avx2 > { _mm_cmpeq_epi32 (_mm_max_epi32 (a . into () , b . into ()) , a . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_gt_i32x4(self, a: i32x4, b: i32x4) -> mask32x4 { - unsafe { _mm_cmpgt_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i32x4 < Avx2 > , b : i32x4 < Avx2 >) -> mask32x4 < Avx2 > { _mm_cmpgt_epi32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn zip_low_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - unsafe { _mm_unpacklo_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i32x4 < Avx2 > , b : i32x4 < Avx2 >) -> i32x4 < Avx2 > { _mm_unpacklo_epi32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn zip_high_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - unsafe { _mm_unpackhi_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i32x4 < Avx2 > , b : i32x4 < Avx2 >) -> i32x4 < Avx2 > { _mm_unpackhi_epi32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn unzip_low_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - unsafe { - let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into()); - let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into()); - _mm_unpacklo_epi64(t1, t2).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i32x4 < Avx2 > , b : i32x4 < Avx2 >) -> i32x4 < Avx2 > { let t1 = _mm_shuffle_epi32 :: < 0b11_01_10_00 > (a . into ()) ; let t2 = _mm_shuffle_epi32 :: < 0b11_01_10_00 > (b . into ()) ; _mm_unpacklo_epi64 (t1 , t2) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn unzip_high_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - unsafe { - let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into()); - let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into()); - _mm_unpackhi_epi64(t1, t2).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i32x4 < Avx2 > , b : i32x4 < Avx2 >) -> i32x4 < Avx2 > { let t1 = _mm_shuffle_epi32 :: < 0b11_01_10_00 > (a . into ()) ; let t2 = _mm_shuffle_epi32 :: < 0b11_01_10_00 > (b . into ()) ; _mm_unpackhi_epi64 (t1 , t2) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn interleave_i32x4(self, a: i32x4, b: i32x4) -> (i32x4, i32x4) { @@ -1635,39 +1607,48 @@ impl Simd for Avx2 { } #[inline(always)] fn select_i32x4(self, a: mask32x4, b: i32x4, c: i32x4) -> i32x4 { - unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask32x4 < Avx2 > , b : i32x4 < Avx2 > , c : i32x4 < Avx2 >) -> i32x4 < Avx2 > { _mm_blendv_epi8 (c . into () , b . into () , a . into ()) . simd_into (token) } } + kernel(self, a, b, c) } #[inline(always)] fn min_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - unsafe { _mm_min_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i32x4 < Avx2 > , b : i32x4 < Avx2 >) -> i32x4 < Avx2 > { _mm_min_epi32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn max_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - unsafe { _mm_max_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i32x4 < Avx2 > , b : i32x4 < Avx2 >) -> i32x4 < Avx2 > { _mm_max_epi32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn combine_i32x4(self, a: i32x4, b: i32x4) -> i32x8 { - unsafe { _mm256_setr_m128i(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i32x4 < Avx2 > , b : i32x4 < Avx2 >) -> i32x8 < Avx2 > { _mm256_setr_m128i (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn neg_i32x4(self, a: i32x4) -> i32x4 { - unsafe { _mm_sub_epi32(_mm_setzero_si128(), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i32x4 < Avx2 >) -> i32x4 < Avx2 > { _mm_sub_epi32 (_mm_setzero_si128 () , a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn reinterpret_u8_i32x4(self, a: i32x4) -> u8x16 { - __m128i::from(a).simd_into(self) + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i32x4 < Avx2 >) -> u8x16 < Avx2 > { __m128i :: from (a) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn reinterpret_u32_i32x4(self, a: i32x4) -> u32x4 { - __m128i::from(a).simd_into(self) + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i32x4 < Avx2 >) -> u32x4 < Avx2 > { __m128i :: from (a) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn cvt_f32_i32x4(self, a: i32x4) -> f32x4 { - unsafe { _mm_cvtepi32_ps(a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i32x4 < Avx2 >) -> f32x4 < Avx2 > { _mm_cvtepi32_ps (a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn splat_u32x4(self, val: u32) -> u32x4 { - unsafe { _mm_set1_epi32(val.cast_signed()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , val : u32) -> u32x4 < Avx2 > { _mm_set1_epi32 (val . cast_signed ()) . simd_into (token) } } + kernel(self, val) } #[inline(always)] fn load_array_u32x4(self, val: [u32; 4usize]) -> u32x4 { @@ -1746,27 +1727,33 @@ impl Simd for Avx2 { } #[inline(always)] fn add_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - unsafe { _mm_add_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u32x4 < Avx2 > , b : u32x4 < Avx2 >) -> u32x4 < Avx2 > { _mm_add_epi32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn sub_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - unsafe { _mm_sub_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u32x4 < Avx2 > , b : u32x4 < Avx2 >) -> u32x4 < Avx2 > { _mm_sub_epi32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn mul_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - unsafe { _mm_mullo_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u32x4 < Avx2 > , b : u32x4 < Avx2 >) -> u32x4 < Avx2 > { _mm_mullo_epi32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn and_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u32x4 < Avx2 > , b : u32x4 < Avx2 >) -> u32x4 < Avx2 > { _mm_and_si128 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn or_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u32x4 < Avx2 > , b : u32x4 < Avx2 >) -> u32x4 < Avx2 > { _mm_or_si128 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn xor_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u32x4 < Avx2 > , b : u32x4 < Avx2 >) -> u32x4 < Avx2 > { _mm_xor_si128 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn not_u32x4(self, a: u32x4) -> u32x4 { @@ -1774,73 +1761,68 @@ impl Simd for Avx2 { } #[inline(always)] fn shl_u32x4(self, a: u32x4, shift: u32) -> u32x4 { - unsafe { _mm_sll_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u32x4 < Avx2 > , shift : u32) -> u32x4 < Avx2 > { _mm_sll_epi32 (a . into () , _mm_cvtsi32_si128 (shift . cast_signed ())) . simd_into (token) } } + kernel(self, a, shift) } #[inline(always)] fn shlv_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - unsafe { _mm_sllv_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u32x4 < Avx2 > , b : u32x4 < Avx2 >) -> u32x4 < Avx2 > { _mm_sllv_epi32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn shr_u32x4(self, a: u32x4, shift: u32) -> u32x4 { - unsafe { _mm_srl_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u32x4 < Avx2 > , shift : u32) -> u32x4 < Avx2 > { _mm_srl_epi32 (a . into () , _mm_cvtsi32_si128 (shift . cast_signed ())) . simd_into (token) } } + kernel(self, a, shift) } #[inline(always)] fn shrv_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - unsafe { _mm_srlv_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u32x4 < Avx2 > , b : u32x4 < Avx2 >) -> u32x4 < Avx2 > { _mm_srlv_epi32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_eq_u32x4(self, a: u32x4, b: u32x4) -> mask32x4 { - unsafe { _mm_cmpeq_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u32x4 < Avx2 > , b : u32x4 < Avx2 >) -> mask32x4 < Avx2 > { _mm_cmpeq_epi32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_lt_u32x4(self, a: u32x4, b: u32x4) -> mask32x4 { - unsafe { - let sign_bit = _mm_set1_epi32(0x80000000u32.cast_signed()); - let a_signed = _mm_xor_si128(a.into(), sign_bit); - let b_signed = _mm_xor_si128(b.into(), sign_bit); - _mm_cmpgt_epi32(b_signed, a_signed).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u32x4 < Avx2 > , b : u32x4 < Avx2 >) -> mask32x4 < Avx2 > { let sign_bit = _mm_set1_epi32 (0x80000000u32 . cast_signed ()) ; let a_signed = _mm_xor_si128 (a . into () , sign_bit) ; let b_signed = _mm_xor_si128 (b . into () , sign_bit) ; _mm_cmpgt_epi32 (b_signed , a_signed) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_le_u32x4(self, a: u32x4, b: u32x4) -> mask32x4 { - unsafe { _mm_cmpeq_epi32(_mm_min_epu32(a.into(), b.into()), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u32x4 < Avx2 > , b : u32x4 < Avx2 >) -> mask32x4 < Avx2 > { _mm_cmpeq_epi32 (_mm_min_epu32 (a . into () , b . into ()) , a . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_ge_u32x4(self, a: u32x4, b: u32x4) -> mask32x4 { - unsafe { _mm_cmpeq_epi32(_mm_max_epu32(a.into(), b.into()), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u32x4 < Avx2 > , b : u32x4 < Avx2 >) -> mask32x4 < Avx2 > { _mm_cmpeq_epi32 (_mm_max_epu32 (a . into () , b . into ()) , a . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_gt_u32x4(self, a: u32x4, b: u32x4) -> mask32x4 { - unsafe { - let sign_bit = _mm_set1_epi32(0x80000000u32.cast_signed()); - let a_signed = _mm_xor_si128(a.into(), sign_bit); - let b_signed = _mm_xor_si128(b.into(), sign_bit); - _mm_cmpgt_epi32(a_signed, b_signed).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u32x4 < Avx2 > , b : u32x4 < Avx2 >) -> mask32x4 < Avx2 > { let sign_bit = _mm_set1_epi32 (0x80000000u32 . cast_signed ()) ; let a_signed = _mm_xor_si128 (a . into () , sign_bit) ; let b_signed = _mm_xor_si128 (b . into () , sign_bit) ; _mm_cmpgt_epi32 (a_signed , b_signed) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn zip_low_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - unsafe { _mm_unpacklo_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u32x4 < Avx2 > , b : u32x4 < Avx2 >) -> u32x4 < Avx2 > { _mm_unpacklo_epi32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn zip_high_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - unsafe { _mm_unpackhi_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u32x4 < Avx2 > , b : u32x4 < Avx2 >) -> u32x4 < Avx2 > { _mm_unpackhi_epi32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn unzip_low_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - unsafe { - let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into()); - let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into()); - _mm_unpacklo_epi64(t1, t2).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u32x4 < Avx2 > , b : u32x4 < Avx2 >) -> u32x4 < Avx2 > { let t1 = _mm_shuffle_epi32 :: < 0b11_01_10_00 > (a . into ()) ; let t2 = _mm_shuffle_epi32 :: < 0b11_01_10_00 > (b . into ()) ; _mm_unpacklo_epi64 (t1 , t2) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn unzip_high_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - unsafe { - let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into()); - let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into()); - _mm_unpackhi_epi64(t1, t2).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u32x4 < Avx2 > , b : u32x4 < Avx2 >) -> u32x4 < Avx2 > { let t1 = _mm_shuffle_epi32 :: < 0b11_01_10_00 > (a . into ()) ; let t2 = _mm_shuffle_epi32 :: < 0b11_01_10_00 > (b . into ()) ; _mm_unpackhi_epi64 (t1 , t2) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn interleave_u32x4(self, a: u32x4, b: u32x4) -> (u32x4, u32x4) { @@ -1852,44 +1834,38 @@ impl Simd for Avx2 { } #[inline(always)] fn select_u32x4(self, a: mask32x4, b: u32x4, c: u32x4) -> u32x4 { - unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask32x4 < Avx2 > , b : u32x4 < Avx2 > , c : u32x4 < Avx2 >) -> u32x4 < Avx2 > { _mm_blendv_epi8 (c . into () , b . into () , a . into ()) . simd_into (token) } } + kernel(self, a, b, c) } #[inline(always)] fn min_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - unsafe { _mm_min_epu32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u32x4 < Avx2 > , b : u32x4 < Avx2 >) -> u32x4 < Avx2 > { _mm_min_epu32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn max_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - unsafe { _mm_max_epu32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u32x4 < Avx2 > , b : u32x4 < Avx2 >) -> u32x4 < Avx2 > { _mm_max_epu32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn combine_u32x4(self, a: u32x4, b: u32x4) -> u32x8 { - unsafe { _mm256_setr_m128i(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u32x4 < Avx2 > , b : u32x4 < Avx2 >) -> u32x8 < Avx2 > { _mm256_setr_m128i (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn reinterpret_u8_u32x4(self, a: u32x4) -> u8x16 { - __m128i::from(a).simd_into(self) + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u32x4 < Avx2 >) -> u8x16 < Avx2 > { __m128i :: from (a) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn cvt_f32_u32x4(self, a: u32x4) -> f32x4 { - unsafe { - let a = a.into(); - let lo = _mm_blend_epi16::<0xAA>(a, _mm_set1_epi32(0x4B000000)); - let hi = _mm_blend_epi16::<0xAA>(_mm_srli_epi32::<16>(a), _mm_set1_epi32(0x53000000)); - let fhi = _mm_sub_ps( - _mm_castsi128_ps(hi), - _mm_set1_ps(f32::from_bits(0x53000080)), - ); - let result = _mm_add_ps(_mm_castsi128_ps(lo), fhi); - result.simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u32x4 < Avx2 >) -> f32x4 < Avx2 > { let a = a . into () ; let lo = _mm_blend_epi16 :: < 0xAA > (a , _mm_set1_epi32 (0x4B000000)) ; let hi = _mm_blend_epi16 :: < 0xAA > (_mm_srli_epi32 :: < 16 > (a) , _mm_set1_epi32 (0x53000000)) ; let fhi = _mm_sub_ps (_mm_castsi128_ps (hi) , _mm_set1_ps (f32 :: from_bits (0x53000080))) ; let result = _mm_add_ps (_mm_castsi128_ps (lo) , fhi) ; result . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn splat_mask32x4(self, val: bool) -> mask32x4 { - unsafe { - let val: i32 = if val { !0 } else { 0 }; - _mm_set1_epi32(val).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , val : bool) -> mask32x4 < Avx2 > { let val : i32 = if val { ! 0 } else { 0 } ; _mm_set1_epi32 (val) . simd_into (token) } } + kernel(self, val) } #[inline(always)] fn load_array_mask32x4(self, val: [i32; 4usize]) -> mask32x4 { @@ -1904,30 +1880,28 @@ impl Simd for Avx2 { } #[inline(always)] fn from_bitmask_mask32x4(self, bits: u64) -> mask32x4 { - unsafe { - { - let bit_lanes = _mm_set1_epi32(bits as i32); - let bit_mask = _mm_setr_epi32(1, 2, 4, 8); - _mm_cmpeq_epi32(_mm_and_si128(bit_lanes, bit_mask), bit_mask) - } - .simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , bits : u64) -> mask32x4 < Avx2 > { { let bit_lanes = _mm_set1_epi32 (bits as i32) ; let bit_mask = _mm_setr_epi32 (1 , 2 , 4 , 8) ; _mm_cmpeq_epi32 (_mm_and_si128 (bit_lanes , bit_mask) , bit_mask) } . simd_into (token) } } + kernel(self, bits) } #[inline(always)] fn to_bitmask_mask32x4(self, a: mask32x4) -> u64 { - unsafe { _mm_movemask_ps(_mm_castsi128_ps(a.into())) as u32 as u64 } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask32x4 < Avx2 >) -> u64 { _mm_movemask_ps (_mm_castsi128_ps (a . into ())) as u32 as u64 } } + kernel(self, a) } #[inline(always)] fn and_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x4 { - unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask32x4 < Avx2 > , b : mask32x4 < Avx2 >) -> mask32x4 < Avx2 > { _mm_and_si128 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn or_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x4 { - unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask32x4 < Avx2 > , b : mask32x4 < Avx2 >) -> mask32x4 < Avx2 > { _mm_or_si128 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn xor_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x4 { - unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask32x4 < Avx2 > , b : mask32x4 < Avx2 >) -> mask32x4 < Avx2 > { _mm_xor_si128 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn not_mask32x4(self, a: mask32x4) -> mask32x4 { @@ -1940,35 +1914,43 @@ impl Simd for Avx2 { b: mask32x4, c: mask32x4, ) -> mask32x4 { - unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask32x4 < Avx2 > , b : mask32x4 < Avx2 > , c : mask32x4 < Avx2 >) -> mask32x4 < Avx2 > { _mm_blendv_epi8 (c . into () , b . into () , a . into ()) . simd_into (token) } } + kernel(self, a, b, c) } #[inline(always)] fn simd_eq_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x4 { - unsafe { _mm_cmpeq_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask32x4 < Avx2 > , b : mask32x4 < Avx2 >) -> mask32x4 < Avx2 > { _mm_cmpeq_epi32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn any_true_mask32x4(self, a: mask32x4) -> bool { - unsafe { _mm_movemask_ps(_mm_castsi128_ps(a.into())) as u32 != 0 } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask32x4 < Avx2 >) -> bool { _mm_movemask_ps (_mm_castsi128_ps (a . into ())) as u32 != 0 } } + kernel(self, a) } #[inline(always)] fn all_true_mask32x4(self, a: mask32x4) -> bool { - unsafe { _mm_movemask_ps(_mm_castsi128_ps(a.into())) as u32 == 0b1111 } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask32x4 < Avx2 >) -> bool { _mm_movemask_ps (_mm_castsi128_ps (a . into ())) as u32 == 0b1111 } } + kernel(self, a) } #[inline(always)] fn any_false_mask32x4(self, a: mask32x4) -> bool { - unsafe { _mm_movemask_ps(_mm_castsi128_ps(a.into())) as u32 != 0b1111 } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask32x4 < Avx2 >) -> bool { _mm_movemask_ps (_mm_castsi128_ps (a . into ())) as u32 != 0b1111 } } + kernel(self, a) } #[inline(always)] fn all_false_mask32x4(self, a: mask32x4) -> bool { - unsafe { _mm_movemask_ps(_mm_castsi128_ps(a.into())) as u32 == 0 } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask32x4 < Avx2 >) -> bool { _mm_movemask_ps (_mm_castsi128_ps (a . into ())) as u32 == 0 } } + kernel(self, a) } #[inline(always)] fn combine_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x8 { - unsafe { _mm256_setr_m128i(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask32x4 < Avx2 > , b : mask32x4 < Avx2 >) -> mask32x8 < Avx2 > { _mm256_setr_m128i (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn splat_f64x2(self, val: f64) -> f64x2 { - unsafe { _mm_set1_pd(val).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , val : f64) -> f64x2 < Avx2 > { _mm_set1_pd (val) . simd_into (token) } } + kernel(self, val) } #[inline(always)] fn load_array_f64x2(self, val: [f64; 2usize]) -> f64x2 { @@ -2047,15 +2029,18 @@ impl Simd for Avx2 { } #[inline(always)] fn abs_f64x2(self, a: f64x2) -> f64x2 { - unsafe { _mm_andnot_pd(_mm_set1_pd(-0.0), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f64x2 < Avx2 >) -> f64x2 < Avx2 > { _mm_andnot_pd (_mm_set1_pd (- 0.0) , a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn neg_f64x2(self, a: f64x2) -> f64x2 { - unsafe { _mm_xor_pd(a.into(), _mm_set1_pd(-0.0)).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f64x2 < Avx2 >) -> f64x2 < Avx2 > { _mm_xor_pd (a . into () , _mm_set1_pd (- 0.0)) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn sqrt_f64x2(self, a: f64x2) -> f64x2 { - unsafe { _mm_sqrt_pd(a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f64x2 < Avx2 >) -> f64x2 < Avx2 > { _mm_sqrt_pd (a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn approximate_recip_f64x2(self, a: f64x2) -> f64x2 { @@ -2063,62 +2048,73 @@ impl Simd for Avx2 { } #[inline(always)] fn add_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - unsafe { _mm_add_pd(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f64x2 < Avx2 > , b : f64x2 < Avx2 >) -> f64x2 < Avx2 > { _mm_add_pd (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn sub_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - unsafe { _mm_sub_pd(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f64x2 < Avx2 > , b : f64x2 < Avx2 >) -> f64x2 < Avx2 > { _mm_sub_pd (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn mul_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - unsafe { _mm_mul_pd(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f64x2 < Avx2 > , b : f64x2 < Avx2 >) -> f64x2 < Avx2 > { _mm_mul_pd (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn div_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - unsafe { _mm_div_pd(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f64x2 < Avx2 > , b : f64x2 < Avx2 >) -> f64x2 < Avx2 > { _mm_div_pd (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn copysign_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - unsafe { - let mask = _mm_set1_pd(-0.0); - _mm_or_pd(_mm_and_pd(mask, b.into()), _mm_andnot_pd(mask, a.into())).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f64x2 < Avx2 > , b : f64x2 < Avx2 >) -> f64x2 < Avx2 > { let mask = _mm_set1_pd (- 0.0) ; _mm_or_pd (_mm_and_pd (mask , b . into ()) , _mm_andnot_pd (mask , a . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_eq_f64x2(self, a: f64x2, b: f64x2) -> mask64x2 { - unsafe { _mm_castpd_si128(_mm_cmpeq_pd(a.into(), b.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f64x2 < Avx2 > , b : f64x2 < Avx2 >) -> mask64x2 < Avx2 > { _mm_castpd_si128 (_mm_cmpeq_pd (a . into () , b . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_lt_f64x2(self, a: f64x2, b: f64x2) -> mask64x2 { - unsafe { _mm_castpd_si128(_mm_cmplt_pd(a.into(), b.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f64x2 < Avx2 > , b : f64x2 < Avx2 >) -> mask64x2 < Avx2 > { _mm_castpd_si128 (_mm_cmplt_pd (a . into () , b . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_le_f64x2(self, a: f64x2, b: f64x2) -> mask64x2 { - unsafe { _mm_castpd_si128(_mm_cmple_pd(a.into(), b.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f64x2 < Avx2 > , b : f64x2 < Avx2 >) -> mask64x2 < Avx2 > { _mm_castpd_si128 (_mm_cmple_pd (a . into () , b . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_ge_f64x2(self, a: f64x2, b: f64x2) -> mask64x2 { - unsafe { _mm_castpd_si128(_mm_cmpge_pd(a.into(), b.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f64x2 < Avx2 > , b : f64x2 < Avx2 >) -> mask64x2 < Avx2 > { _mm_castpd_si128 (_mm_cmpge_pd (a . into () , b . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_gt_f64x2(self, a: f64x2, b: f64x2) -> mask64x2 { - unsafe { _mm_castpd_si128(_mm_cmpgt_pd(a.into(), b.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f64x2 < Avx2 > , b : f64x2 < Avx2 >) -> mask64x2 < Avx2 > { _mm_castpd_si128 (_mm_cmpgt_pd (a . into () , b . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn zip_low_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - unsafe { _mm_unpacklo_pd(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f64x2 < Avx2 > , b : f64x2 < Avx2 >) -> f64x2 < Avx2 > { _mm_unpacklo_pd (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn zip_high_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - unsafe { _mm_unpackhi_pd(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f64x2 < Avx2 > , b : f64x2 < Avx2 >) -> f64x2 < Avx2 > { _mm_unpackhi_pd (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn unzip_low_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - unsafe { _mm_shuffle_pd::<0b00>(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f64x2 < Avx2 > , b : f64x2 < Avx2 >) -> f64x2 < Avx2 > { _mm_shuffle_pd :: < 0b00 > (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn unzip_high_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - unsafe { _mm_shuffle_pd::<0b11>(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f64x2 < Avx2 > , b : f64x2 < Avx2 >) -> f64x2 < Avx2 > { _mm_shuffle_pd :: < 0b11 > (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn interleave_f64x2(self, a: f64x2, b: f64x2) -> (f64x2, f64x2) { @@ -2130,54 +2126,48 @@ impl Simd for Avx2 { } #[inline(always)] fn max_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - unsafe { _mm_max_pd(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f64x2 < Avx2 > , b : f64x2 < Avx2 >) -> f64x2 < Avx2 > { _mm_max_pd (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn min_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - unsafe { _mm_min_pd(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f64x2 < Avx2 > , b : f64x2 < Avx2 >) -> f64x2 < Avx2 > { _mm_min_pd (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn max_precise_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - unsafe { - let intermediate = _mm_max_pd(a.into(), b.into()); - let b_is_nan = _mm_cmpunord_pd(b.into(), b.into()); - _mm_blendv_pd(intermediate, a.into(), b_is_nan).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f64x2 < Avx2 > , b : f64x2 < Avx2 >) -> f64x2 < Avx2 > { let intermediate = _mm_max_pd (a . into () , b . into ()) ; let b_is_nan = _mm_cmpunord_pd (b . into () , b . into ()) ; _mm_blendv_pd (intermediate , a . into () , b_is_nan) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn min_precise_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - unsafe { - let intermediate = _mm_min_pd(a.into(), b.into()); - let b_is_nan = _mm_cmpunord_pd(b.into(), b.into()); - _mm_blendv_pd(intermediate, a.into(), b_is_nan).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f64x2 < Avx2 > , b : f64x2 < Avx2 >) -> f64x2 < Avx2 > { let intermediate = _mm_min_pd (a . into () , b . into ()) ; let b_is_nan = _mm_cmpunord_pd (b . into () , b . into ()) ; _mm_blendv_pd (intermediate , a . into () , b_is_nan) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn mul_add_f64x2(self, a: f64x2, b: f64x2, c: f64x2) -> f64x2 { - unsafe { _mm_fmadd_pd(a.into(), b.into(), c.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f64x2 < Avx2 > , b : f64x2 < Avx2 > , c : f64x2 < Avx2 >) -> f64x2 < Avx2 > { _mm_fmadd_pd (a . into () , b . into () , c . into ()) . simd_into (token) } } + kernel(self, a, b, c) } #[inline(always)] fn mul_sub_f64x2(self, a: f64x2, b: f64x2, c: f64x2) -> f64x2 { - unsafe { _mm_fmsub_pd(a.into(), b.into(), c.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f64x2 < Avx2 > , b : f64x2 < Avx2 > , c : f64x2 < Avx2 >) -> f64x2 < Avx2 > { _mm_fmsub_pd (a . into () , b . into () , c . into ()) . simd_into (token) } } + kernel(self, a, b, c) } #[inline(always)] fn floor_f64x2(self, a: f64x2) -> f64x2 { - unsafe { - _mm_round_pd::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f64x2 < Avx2 >) -> f64x2 < Avx2 > { _mm_round_pd :: < { _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC } > (a . into () ,) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn ceil_f64x2(self, a: f64x2) -> f64x2 { - unsafe { - _mm_round_pd::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f64x2 < Avx2 >) -> f64x2 < Avx2 > { _mm_round_pd :: < { _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC } > (a . into () ,) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn round_ties_even_f64x2(self, a: f64x2) -> f64x2 { - unsafe { - _mm_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a.into()) - .simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f64x2 < Avx2 >) -> f64x2 < Avx2 > { _mm_round_pd :: < { _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC } > (a . into () ,) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn fract_f64x2(self, a: f64x2) -> f64x2 { @@ -2185,28 +2175,28 @@ impl Simd for Avx2 { } #[inline(always)] fn trunc_f64x2(self, a: f64x2) -> f64x2 { - unsafe { - _mm_round_pd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f64x2 < Avx2 >) -> f64x2 < Avx2 > { _mm_round_pd :: < { _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC } > (a . into () ,) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn select_f64x2(self, a: mask64x2, b: f64x2, c: f64x2) -> f64x2 { - unsafe { _mm_blendv_pd(c.into(), b.into(), _mm_castsi128_pd(a.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask64x2 < Avx2 > , b : f64x2 < Avx2 > , c : f64x2 < Avx2 >) -> f64x2 < Avx2 > { _mm_blendv_pd (c . into () , b . into () , _mm_castsi128_pd (a . into ())) . simd_into (token) } } + kernel(self, a, b, c) } #[inline(always)] fn combine_f64x2(self, a: f64x2, b: f64x2) -> f64x4 { - unsafe { _mm256_setr_m128d(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f64x2 < Avx2 > , b : f64x2 < Avx2 >) -> f64x4 < Avx2 > { _mm256_setr_m128d (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn reinterpret_f32_f64x2(self, a: f64x2) -> f32x4 { - unsafe { _mm_castpd_ps(a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f64x2 < Avx2 >) -> f32x4 < Avx2 > { _mm_castpd_ps (a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn splat_mask64x2(self, val: bool) -> mask64x2 { - unsafe { - let val: i64 = if val { !0 } else { 0 }; - _mm_set1_epi64x(val).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , val : bool) -> mask64x2 < Avx2 > { let val : i64 = if val { ! 0 } else { 0 } ; _mm_set1_epi64x (val) . simd_into (token) } } + kernel(self, val) } #[inline(always)] fn load_array_mask64x2(self, val: [i64; 2usize]) -> mask64x2 { @@ -2221,30 +2211,28 @@ impl Simd for Avx2 { } #[inline(always)] fn from_bitmask_mask64x2(self, bits: u64) -> mask64x2 { - unsafe { - { - let bit_lanes = _mm_set1_epi64x(bits.cast_signed()); - let bit_mask = _mm_set_epi64x(2, 1); - _mm_cmpeq_epi64(_mm_and_si128(bit_lanes, bit_mask), bit_mask) - } - .simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , bits : u64) -> mask64x2 < Avx2 > { { let bit_lanes = _mm_set1_epi64x (bits . cast_signed ()) ; let bit_mask = _mm_set_epi64x (2 , 1) ; _mm_cmpeq_epi64 (_mm_and_si128 (bit_lanes , bit_mask) , bit_mask) } . simd_into (token) } } + kernel(self, bits) } #[inline(always)] fn to_bitmask_mask64x2(self, a: mask64x2) -> u64 { - unsafe { _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 as u64 } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask64x2 < Avx2 >) -> u64 { _mm_movemask_pd (_mm_castsi128_pd (a . into ())) as u32 as u64 } } + kernel(self, a) } #[inline(always)] fn and_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { - unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask64x2 < Avx2 > , b : mask64x2 < Avx2 >) -> mask64x2 < Avx2 > { _mm_and_si128 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn or_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { - unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask64x2 < Avx2 > , b : mask64x2 < Avx2 >) -> mask64x2 < Avx2 > { _mm_or_si128 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn xor_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { - unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask64x2 < Avx2 > , b : mask64x2 < Avx2 >) -> mask64x2 < Avx2 > { _mm_xor_si128 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn not_mask64x2(self, a: mask64x2) -> mask64x2 { @@ -2257,35 +2245,43 @@ impl Simd for Avx2 { b: mask64x2, c: mask64x2, ) -> mask64x2 { - unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask64x2 < Avx2 > , b : mask64x2 < Avx2 > , c : mask64x2 < Avx2 >) -> mask64x2 < Avx2 > { _mm_blendv_epi8 (c . into () , b . into () , a . into ()) . simd_into (token) } } + kernel(self, a, b, c) } #[inline(always)] fn simd_eq_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { - unsafe { _mm_cmpeq_epi64(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask64x2 < Avx2 > , b : mask64x2 < Avx2 >) -> mask64x2 < Avx2 > { _mm_cmpeq_epi64 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn any_true_mask64x2(self, a: mask64x2) -> bool { - unsafe { _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 != 0 } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask64x2 < Avx2 >) -> bool { _mm_movemask_pd (_mm_castsi128_pd (a . into ())) as u32 != 0 } } + kernel(self, a) } #[inline(always)] fn all_true_mask64x2(self, a: mask64x2) -> bool { - unsafe { _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 == 0b11 } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask64x2 < Avx2 >) -> bool { _mm_movemask_pd (_mm_castsi128_pd (a . into ())) as u32 == 0b11 } } + kernel(self, a) } #[inline(always)] fn any_false_mask64x2(self, a: mask64x2) -> bool { - unsafe { _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 != 0b11 } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask64x2 < Avx2 >) -> bool { _mm_movemask_pd (_mm_castsi128_pd (a . into ())) as u32 != 0b11 } } + kernel(self, a) } #[inline(always)] fn all_false_mask64x2(self, a: mask64x2) -> bool { - unsafe { _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 == 0 } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask64x2 < Avx2 >) -> bool { _mm_movemask_pd (_mm_castsi128_pd (a . into ())) as u32 == 0 } } + kernel(self, a) } #[inline(always)] fn combine_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x4 { - unsafe { _mm256_setr_m128i(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask64x2 < Avx2 > , b : mask64x2 < Avx2 >) -> mask64x4 < Avx2 > { _mm256_setr_m128i (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn splat_f32x8(self, val: f32) -> f32x8 { - unsafe { _mm256_set1_ps(val).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , val : f32) -> f32x8 < Avx2 > { _mm256_set1_ps (val) . simd_into (token) } } + kernel(self, val) } #[inline(always)] fn load_array_f32x8(self, val: [f32; 8usize]) -> f32x8 { @@ -2377,173 +2373,148 @@ impl Simd for Avx2 { } #[inline(always)] fn abs_f32x8(self, a: f32x8) -> f32x8 { - unsafe { _mm256_andnot_ps(_mm256_set1_ps(-0.0), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f32x8 < Avx2 >) -> f32x8 < Avx2 > { _mm256_andnot_ps (_mm256_set1_ps (- 0.0) , a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn neg_f32x8(self, a: f32x8) -> f32x8 { - unsafe { _mm256_xor_ps(a.into(), _mm256_set1_ps(-0.0)).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f32x8 < Avx2 >) -> f32x8 < Avx2 > { _mm256_xor_ps (a . into () , _mm256_set1_ps (- 0.0)) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn sqrt_f32x8(self, a: f32x8) -> f32x8 { - unsafe { _mm256_sqrt_ps(a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f32x8 < Avx2 >) -> f32x8 < Avx2 > { _mm256_sqrt_ps (a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn approximate_recip_f32x8(self, a: f32x8) -> f32x8 { - unsafe { _mm256_rcp_ps(a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f32x8 < Avx2 >) -> f32x8 < Avx2 > { _mm256_rcp_ps (a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn add_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { - unsafe { _mm256_add_ps(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f32x8 < Avx2 > , b : f32x8 < Avx2 >) -> f32x8 < Avx2 > { _mm256_add_ps (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn sub_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { - unsafe { _mm256_sub_ps(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f32x8 < Avx2 > , b : f32x8 < Avx2 >) -> f32x8 < Avx2 > { _mm256_sub_ps (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn mul_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { - unsafe { _mm256_mul_ps(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f32x8 < Avx2 > , b : f32x8 < Avx2 >) -> f32x8 < Avx2 > { _mm256_mul_ps (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn div_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { - unsafe { _mm256_div_ps(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f32x8 < Avx2 > , b : f32x8 < Avx2 >) -> f32x8 < Avx2 > { _mm256_div_ps (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn copysign_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { - unsafe { - let mask = _mm256_set1_ps(-0.0); - _mm256_or_ps( - _mm256_and_ps(mask, b.into()), - _mm256_andnot_ps(mask, a.into()), - ) - .simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f32x8 < Avx2 > , b : f32x8 < Avx2 >) -> f32x8 < Avx2 > { let mask = _mm256_set1_ps (- 0.0) ; _mm256_or_ps (_mm256_and_ps (mask , b . into ()) , _mm256_andnot_ps (mask , a . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_eq_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { - unsafe { _mm256_castps_si256(_mm256_cmp_ps::<0i32>(a.into(), b.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f32x8 < Avx2 > , b : f32x8 < Avx2 >) -> mask32x8 < Avx2 > { _mm256_castps_si256 (_mm256_cmp_ps :: < 0i32 > (a . into () , b . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_lt_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { - unsafe { _mm256_castps_si256(_mm256_cmp_ps::<17i32>(a.into(), b.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f32x8 < Avx2 > , b : f32x8 < Avx2 >) -> mask32x8 < Avx2 > { _mm256_castps_si256 (_mm256_cmp_ps :: < 17i32 > (a . into () , b . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_le_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { - unsafe { _mm256_castps_si256(_mm256_cmp_ps::<18i32>(a.into(), b.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f32x8 < Avx2 > , b : f32x8 < Avx2 >) -> mask32x8 < Avx2 > { _mm256_castps_si256 (_mm256_cmp_ps :: < 18i32 > (a . into () , b . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_ge_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { - unsafe { _mm256_castps_si256(_mm256_cmp_ps::<29i32>(a.into(), b.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f32x8 < Avx2 > , b : f32x8 < Avx2 >) -> mask32x8 < Avx2 > { _mm256_castps_si256 (_mm256_cmp_ps :: < 29i32 > (a . into () , b . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_gt_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { - unsafe { _mm256_castps_si256(_mm256_cmp_ps::<30i32>(a.into(), b.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f32x8 < Avx2 > , b : f32x8 < Avx2 >) -> mask32x8 < Avx2 > { _mm256_castps_si256 (_mm256_cmp_ps :: < 30i32 > (a . into () , b . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn zip_low_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { - unsafe { - let lo = _mm256_unpacklo_ps(a.into(), b.into()); - let hi = _mm256_unpackhi_ps(a.into(), b.into()); - _mm256_permute2f128_ps::<0b0010_0000>(lo, hi).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f32x8 < Avx2 > , b : f32x8 < Avx2 >) -> f32x8 < Avx2 > { let lo = _mm256_unpacklo_ps (a . into () , b . into ()) ; let hi = _mm256_unpackhi_ps (a . into () , b . into ()) ; _mm256_permute2f128_ps :: < 0b0010_0000 > (lo , hi) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn zip_high_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { - unsafe { - let lo = _mm256_unpacklo_ps(a.into(), b.into()); - let hi = _mm256_unpackhi_ps(a.into(), b.into()); - _mm256_permute2f128_ps::<0b0011_0001>(lo, hi).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f32x8 < Avx2 > , b : f32x8 < Avx2 >) -> f32x8 < Avx2 > { let lo = _mm256_unpacklo_ps (a . into () , b . into ()) ; let hi = _mm256_unpackhi_ps (a . into () , b . into ()) ; _mm256_permute2f128_ps :: < 0b0011_0001 > (lo , hi) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn unzip_low_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { - unsafe { - let t1 = _mm256_permutevar8x32_ps(a.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)); - let t2 = _mm256_permutevar8x32_ps(b.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)); - _mm256_permute2f128_ps::<0b0010_0000>(t1, t2).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f32x8 < Avx2 > , b : f32x8 < Avx2 >) -> f32x8 < Avx2 > { let t1 = _mm256_permutevar8x32_ps (a . into () , _mm256_setr_epi32 (0 , 2 , 4 , 6 , 1 , 3 , 5 , 7)) ; let t2 = _mm256_permutevar8x32_ps (b . into () , _mm256_setr_epi32 (0 , 2 , 4 , 6 , 1 , 3 , 5 , 7)) ; _mm256_permute2f128_ps :: < 0b0010_0000 > (t1 , t2) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn unzip_high_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { - unsafe { - let t1 = _mm256_permutevar8x32_ps(a.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)); - let t2 = _mm256_permutevar8x32_ps(b.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)); - _mm256_permute2f128_ps::<0b0011_0001>(t1, t2).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f32x8 < Avx2 > , b : f32x8 < Avx2 >) -> f32x8 < Avx2 > { let t1 = _mm256_permutevar8x32_ps (a . into () , _mm256_setr_epi32 (0 , 2 , 4 , 6 , 1 , 3 , 5 , 7)) ; let t2 = _mm256_permutevar8x32_ps (b . into () , _mm256_setr_epi32 (0 , 2 , 4 , 6 , 1 , 3 , 5 , 7)) ; _mm256_permute2f128_ps :: < 0b0011_0001 > (t1 , t2) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn interleave_f32x8(self, a: f32x8, b: f32x8) -> (f32x8, f32x8) { - unsafe { - let lo = _mm256_unpacklo_ps(a.into(), b.into()); - let hi = _mm256_unpackhi_ps(a.into(), b.into()); - ( - _mm256_permute2f128_ps::<0b0010_0000>(lo, hi).simd_into(self), - _mm256_permute2f128_ps::<0b0011_0001>(lo, hi).simd_into(self), - ) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f32x8 < Avx2 > , b : f32x8 < Avx2 >) -> (f32x8 < Avx2 > , f32x8 < Avx2 >) { let lo = _mm256_unpacklo_ps (a . into () , b . into ()) ; let hi = _mm256_unpackhi_ps (a . into () , b . into ()) ; (_mm256_permute2f128_ps :: < 0b0010_0000 > (lo , hi) . simd_into (token) , _mm256_permute2f128_ps :: < 0b0011_0001 > (lo , hi) . simd_into (token) ,) } } + kernel(self, a, b) } #[inline(always)] fn deinterleave_f32x8(self, a: f32x8, b: f32x8) -> (f32x8, f32x8) { - unsafe { - let t1 = _mm256_permutevar8x32_ps(a.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)); - let t2 = _mm256_permutevar8x32_ps(b.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)); - ( - _mm256_permute2f128_ps::<0b0010_0000>(t1, t2).simd_into(self), - _mm256_permute2f128_ps::<0b0011_0001>(t1, t2).simd_into(self), - ) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f32x8 < Avx2 > , b : f32x8 < Avx2 >) -> (f32x8 < Avx2 > , f32x8 < Avx2 >) { let t1 = _mm256_permutevar8x32_ps (a . into () , _mm256_setr_epi32 (0 , 2 , 4 , 6 , 1 , 3 , 5 , 7)) ; let t2 = _mm256_permutevar8x32_ps (b . into () , _mm256_setr_epi32 (0 , 2 , 4 , 6 , 1 , 3 , 5 , 7)) ; (_mm256_permute2f128_ps :: < 0b0010_0000 > (t1 , t2) . simd_into (token) , _mm256_permute2f128_ps :: < 0b0011_0001 > (t1 , t2) . simd_into (token) ,) } } + kernel(self, a, b) } #[inline(always)] fn max_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { - unsafe { _mm256_max_ps(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f32x8 < Avx2 > , b : f32x8 < Avx2 >) -> f32x8 < Avx2 > { _mm256_max_ps (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn min_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { - unsafe { _mm256_min_ps(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f32x8 < Avx2 > , b : f32x8 < Avx2 >) -> f32x8 < Avx2 > { _mm256_min_ps (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn max_precise_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { - unsafe { - let intermediate = _mm256_max_ps(a.into(), b.into()); - let b_is_nan = _mm256_cmp_ps::<3i32>(b.into(), b.into()); - _mm256_blendv_ps(intermediate, a.into(), b_is_nan).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f32x8 < Avx2 > , b : f32x8 < Avx2 >) -> f32x8 < Avx2 > { let intermediate = _mm256_max_ps (a . into () , b . into ()) ; let b_is_nan = _mm256_cmp_ps :: < 3i32 > (b . into () , b . into ()) ; _mm256_blendv_ps (intermediate , a . into () , b_is_nan) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn min_precise_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { - unsafe { - let intermediate = _mm256_min_ps(a.into(), b.into()); - let b_is_nan = _mm256_cmp_ps::<3i32>(b.into(), b.into()); - _mm256_blendv_ps(intermediate, a.into(), b_is_nan).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f32x8 < Avx2 > , b : f32x8 < Avx2 >) -> f32x8 < Avx2 > { let intermediate = _mm256_min_ps (a . into () , b . into ()) ; let b_is_nan = _mm256_cmp_ps :: < 3i32 > (b . into () , b . into ()) ; _mm256_blendv_ps (intermediate , a . into () , b_is_nan) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn mul_add_f32x8(self, a: f32x8, b: f32x8, c: f32x8) -> f32x8 { - unsafe { _mm256_fmadd_ps(a.into(), b.into(), c.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f32x8 < Avx2 > , b : f32x8 < Avx2 > , c : f32x8 < Avx2 >) -> f32x8 < Avx2 > { _mm256_fmadd_ps (a . into () , b . into () , c . into ()) . simd_into (token) } } + kernel(self, a, b, c) } #[inline(always)] fn mul_sub_f32x8(self, a: f32x8, b: f32x8, c: f32x8) -> f32x8 { - unsafe { _mm256_fmsub_ps(a.into(), b.into(), c.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f32x8 < Avx2 > , b : f32x8 < Avx2 > , c : f32x8 < Avx2 >) -> f32x8 < Avx2 > { _mm256_fmsub_ps (a . into () , b . into () , c . into ()) . simd_into (token) } } + kernel(self, a, b, c) } #[inline(always)] fn floor_f32x8(self, a: f32x8) -> f32x8 { - unsafe { - _mm256_round_ps::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into()) - .simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f32x8 < Avx2 >) -> f32x8 < Avx2 > { _mm256_round_ps :: < { _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC } > (a . into () ,) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn ceil_f32x8(self, a: f32x8) -> f32x8 { - unsafe { - _mm256_round_ps::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a.into()) - .simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f32x8 < Avx2 >) -> f32x8 < Avx2 > { _mm256_round_ps :: < { _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC } > (a . into () ,) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn round_ties_even_f32x8(self, a: f32x8) -> f32x8 { - unsafe { - _mm256_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a.into()) - .simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f32x8 < Avx2 >) -> f32x8 < Avx2 > { _mm256_round_ps :: < { _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC } > (a . into () ,) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn fract_f32x8(self, a: f32x8) -> f32x8 { @@ -2551,15 +2522,13 @@ impl Simd for Avx2 { } #[inline(always)] fn trunc_f32x8(self, a: f32x8) -> f32x8 { - unsafe { - _mm256_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f32x8 < Avx2 >) -> f32x8 < Avx2 > { _mm256_round_ps :: < { _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC } > (a . into () ,) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn select_f32x8(self, a: mask32x8, b: f32x8, c: f32x8) -> f32x8 { - unsafe { - _mm256_blendv_ps(c.into(), b.into(), _mm256_castsi256_ps(a.into())).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask32x8 < Avx2 > , b : f32x8 < Avx2 > , c : f32x8 < Avx2 >) -> f32x8 < Avx2 > { _mm256_blendv_ps (c . into () , b . into () , _mm256_castsi256_ps (a . into ())) . simd_into (token) } } + kernel(self, a, b, c) } #[inline(always)] fn combine_f32x8(self, a: f32x8, b: f32x8) -> f32x16 { @@ -2570,91 +2539,53 @@ impl Simd for Avx2 { } #[inline(always)] fn split_f32x8(self, a: f32x8) -> (f32x4, f32x4) { - unsafe { - ( - _mm256_extractf128_ps::<0>(a.into()).simd_into(self), - _mm256_extractf128_ps::<1>(a.into()).simd_into(self), - ) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f32x8 < Avx2 >) -> (f32x4 < Avx2 > , f32x4 < Avx2 >) { (_mm256_extractf128_ps :: < 0 > (a . into ()) . simd_into (token) , _mm256_extractf128_ps :: < 1 > (a . into ()) . simd_into (token) ,) } } + kernel(self, a) } #[inline(always)] fn reinterpret_f64_f32x8(self, a: f32x8) -> f64x4 { - unsafe { _mm256_castps_pd(a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f32x8 < Avx2 >) -> f64x4 < Avx2 > { _mm256_castps_pd (a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn reinterpret_i32_f32x8(self, a: f32x8) -> i32x8 { - unsafe { _mm256_castps_si256(a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f32x8 < Avx2 >) -> i32x8 < Avx2 > { _mm256_castps_si256 (a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn reinterpret_u8_f32x8(self, a: f32x8) -> u8x32 { - unsafe { _mm256_castps_si256(a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f32x8 < Avx2 >) -> u8x32 < Avx2 > { _mm256_castps_si256 (a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn reinterpret_u32_f32x8(self, a: f32x8) -> u32x8 { - unsafe { _mm256_castps_si256(a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f32x8 < Avx2 >) -> u32x8 < Avx2 > { _mm256_castps_si256 (a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn cvt_u32_f32x8(self, a: f32x8) -> u32x8 { - unsafe { - let mut converted = _mm256_cvttps_epi32(a.into()); - let in_range = _mm256_cmp_ps::<17i32>(a.into(), _mm256_set1_ps(2147483648.0)); - let all_in_range = _mm256_movemask_ps(in_range) == 0b11111111; - if !all_in_range { - let excess = _mm256_sub_ps(a.into(), _mm256_set1_ps(2147483648.0)); - let excess_converted = _mm256_cvttps_epi32(_mm256_andnot_ps(in_range, excess)); - converted = _mm256_add_epi32(converted, excess_converted); - } - converted.simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f32x8 < Avx2 >) -> u32x8 < Avx2 > { let mut converted = _mm256_cvttps_epi32 (a . into ()) ; let in_range = _mm256_cmp_ps :: < 17i32 > (a . into () , _mm256_set1_ps (2147483648.0)) ; let all_in_range = _mm256_movemask_ps (in_range) == 0b11111111 ; if ! all_in_range { let excess = _mm256_sub_ps (a . into () , _mm256_set1_ps (2147483648.0)) ; let excess_converted = _mm256_cvttps_epi32 (_mm256_andnot_ps (in_range , excess)) ; converted = _mm256_add_epi32 (converted , excess_converted) ; } converted . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn cvt_u32_precise_f32x8(self, a: f32x8) -> u32x8 { - unsafe { - let a = _mm256_max_ps(a.into(), _mm256_setzero_ps()); - let mut converted = _mm256_cvttps_epi32(a); - let in_range = _mm256_cmp_ps::<17i32>(a, _mm256_set1_ps(2147483648.0)); - let all_in_range = _mm256_movemask_ps(in_range) == 0b11111111; - if !all_in_range { - let exceeds_unsigned_range = - _mm256_castps_si256(_mm256_cmp_ps::<17i32>(_mm256_set1_ps(4294967040.0), a)); - let excess = _mm256_sub_ps(a, _mm256_set1_ps(2147483648.0)); - let excess_converted = _mm256_cvttps_epi32(_mm256_andnot_ps(in_range, excess)); - converted = _mm256_add_epi32(converted, excess_converted); - converted = _mm256_blendv_epi8( - converted, - _mm256_set1_epi32(u32::MAX.cast_signed()), - exceeds_unsigned_range, - ); - } - converted.simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f32x8 < Avx2 >) -> u32x8 < Avx2 > { let a = _mm256_max_ps (a . into () , _mm256_setzero_ps ()) ; let mut converted = _mm256_cvttps_epi32 (a) ; let in_range = _mm256_cmp_ps :: < 17i32 > (a , _mm256_set1_ps (2147483648.0)) ; let all_in_range = _mm256_movemask_ps (in_range) == 0b11111111 ; if ! all_in_range { let exceeds_unsigned_range = _mm256_castps_si256 (_mm256_cmp_ps :: < 17i32 > (_mm256_set1_ps (4294967040.0) , a)) ; let excess = _mm256_sub_ps (a , _mm256_set1_ps (2147483648.0)) ; let excess_converted = _mm256_cvttps_epi32 (_mm256_andnot_ps (in_range , excess)) ; converted = _mm256_add_epi32 (converted , excess_converted) ; converted = _mm256_blendv_epi8 (converted , _mm256_set1_epi32 (u32 :: MAX . cast_signed ()) , exceeds_unsigned_range) ; } converted . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn cvt_i32_f32x8(self, a: f32x8) -> i32x8 { - unsafe { _mm256_cvttps_epi32(a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f32x8 < Avx2 >) -> i32x8 < Avx2 > { _mm256_cvttps_epi32 (a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn cvt_i32_precise_f32x8(self, a: f32x8) -> i32x8 { - unsafe { - let a = a.into(); - let mut converted = _mm256_cvttps_epi32(a); - let in_range = _mm256_cmp_ps::<17i32>(a, _mm256_set1_ps(2147483648.0)); - let all_in_range = _mm256_movemask_ps(in_range) == 0b11111111; - if !all_in_range { - converted = _mm256_blendv_epi8( - _mm256_set1_epi32(i32::MAX), - converted, - _mm256_castps_si256(in_range), - ); - let is_not_nan = _mm256_castps_si256(_mm256_cmp_ps::<7i32>(a, a)); - converted = _mm256_and_si256(converted, is_not_nan); - } - converted.simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f32x8 < Avx2 >) -> i32x8 < Avx2 > { let a = a . into () ; let mut converted = _mm256_cvttps_epi32 (a) ; let in_range = _mm256_cmp_ps :: < 17i32 > (a , _mm256_set1_ps (2147483648.0)) ; let all_in_range = _mm256_movemask_ps (in_range) == 0b11111111 ; if ! all_in_range { converted = _mm256_blendv_epi8 (_mm256_set1_epi32 (i32 :: MAX) , converted , _mm256_castps_si256 (in_range)) ; let is_not_nan = _mm256_castps_si256 (_mm256_cmp_ps :: < 7i32 > (a , a)) ; converted = _mm256_and_si256 (converted , is_not_nan) ; } converted . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn splat_i8x32(self, val: i8) -> i8x32 { - unsafe { _mm256_set1_epi8(val).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , val : i8) -> i8x32 < Avx2 > { _mm256_set1_epi8 (val) . simd_into (token) } } + kernel(self, val) } #[inline(always)] fn load_array_i8x32(self, val: [i8; 32usize]) -> i8x32 { @@ -2746,38 +2677,33 @@ impl Simd for Avx2 { } #[inline(always)] fn add_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { - unsafe { _mm256_add_epi8(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i8x32 < Avx2 > , b : i8x32 < Avx2 >) -> i8x32 < Avx2 > { _mm256_add_epi8 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn sub_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { - unsafe { _mm256_sub_epi8(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i8x32 < Avx2 > , b : i8x32 < Avx2 >) -> i8x32 < Avx2 > { _mm256_sub_epi8 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn mul_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { - unsafe { - let dst_even = _mm256_mullo_epi16(a.into(), b.into()); - let dst_odd = _mm256_mullo_epi16( - _mm256_srli_epi16::<8>(a.into()), - _mm256_srli_epi16::<8>(b.into()), - ); - _mm256_or_si256( - _mm256_slli_epi16(dst_odd, 8), - _mm256_and_si256(dst_even, _mm256_set1_epi16(0xFF)), - ) - .simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i8x32 < Avx2 > , b : i8x32 < Avx2 >) -> i8x32 < Avx2 > { let dst_even = _mm256_mullo_epi16 (a . into () , b . into ()) ; let dst_odd = _mm256_mullo_epi16 (_mm256_srli_epi16 :: < 8 > (a . into ()) , _mm256_srli_epi16 :: < 8 > (b . into ())) ; _mm256_or_si256 (_mm256_slli_epi16 (dst_odd , 8) , _mm256_and_si256 (dst_even , _mm256_set1_epi16 (0xFF))) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn and_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { - unsafe { _mm256_and_si256(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i8x32 < Avx2 > , b : i8x32 < Avx2 >) -> i8x32 < Avx2 > { _mm256_and_si256 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn or_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { - unsafe { _mm256_or_si256(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i8x32 < Avx2 > , b : i8x32 < Avx2 >) -> i8x32 < Avx2 > { _mm256_or_si256 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn xor_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { - unsafe { _mm256_xor_si256(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i8x32 < Avx2 > , b : i8x32 < Avx2 >) -> i8x32 < Avx2 > { _mm256_xor_si256 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn not_i8x32(self, a: i8x32) -> i8x32 { @@ -2785,15 +2711,8 @@ impl Simd for Avx2 { } #[inline(always)] fn shl_i8x32(self, a: i8x32, shift: u32) -> i8x32 { - unsafe { - let val = a.into(); - let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); - let lo_16 = _mm256_unpacklo_epi8(val, _mm256_cmpgt_epi8(_mm256_setzero_si256(), val)); - let hi_16 = _mm256_unpackhi_epi8(val, _mm256_cmpgt_epi8(_mm256_setzero_si256(), val)); - let lo_shifted = _mm256_sll_epi16(lo_16, shift_count); - let hi_shifted = _mm256_sll_epi16(hi_16, shift_count); - _mm256_packs_epi16(lo_shifted, hi_shifted).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i8x32 < Avx2 > , shift : u32) -> i8x32 < Avx2 > { let val = a . into () ; let shift_count = _mm_cvtsi32_si128 (shift . cast_signed ()) ; let lo_16 = _mm256_unpacklo_epi8 (val , _mm256_cmpgt_epi8 (_mm256_setzero_si256 () , val)) ; let hi_16 = _mm256_unpackhi_epi8 (val , _mm256_cmpgt_epi8 (_mm256_setzero_si256 () , val)) ; let lo_shifted = _mm256_sll_epi16 (lo_16 , shift_count) ; let hi_shifted = _mm256_sll_epi16 (hi_16 , shift_count) ; _mm256_packs_epi16 (lo_shifted , hi_shifted) . simd_into (token) } } + kernel(self, a, shift) } #[inline(always)] fn shlv_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { @@ -2801,15 +2720,8 @@ impl Simd for Avx2 { } #[inline(always)] fn shr_i8x32(self, a: i8x32, shift: u32) -> i8x32 { - unsafe { - let val = a.into(); - let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); - let lo_16 = _mm256_unpacklo_epi8(val, _mm256_cmpgt_epi8(_mm256_setzero_si256(), val)); - let hi_16 = _mm256_unpackhi_epi8(val, _mm256_cmpgt_epi8(_mm256_setzero_si256(), val)); - let lo_shifted = _mm256_sra_epi16(lo_16, shift_count); - let hi_shifted = _mm256_sra_epi16(hi_16, shift_count); - _mm256_packs_epi16(lo_shifted, hi_shifted).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i8x32 < Avx2 > , shift : u32) -> i8x32 < Avx2 > { let val = a . into () ; let shift_count = _mm_cvtsi32_si128 (shift . cast_signed ()) ; let lo_16 = _mm256_unpacklo_epi8 (val , _mm256_cmpgt_epi8 (_mm256_setzero_si256 () , val)) ; let hi_16 = _mm256_unpackhi_epi8 (val , _mm256_cmpgt_epi8 (_mm256_setzero_si256 () , val)) ; let lo_shifted = _mm256_sra_epi16 (lo_16 , shift_count) ; let hi_shifted = _mm256_sra_epi16 (hi_16 , shift_count) ; _mm256_packs_epi16 (lo_shifted , hi_shifted) . simd_into (token) } } + kernel(self, a, shift) } #[inline(always)] fn shrv_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { @@ -2817,125 +2729,73 @@ impl Simd for Avx2 { } #[inline(always)] fn simd_eq_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { - unsafe { _mm256_cmpeq_epi8(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i8x32 < Avx2 > , b : i8x32 < Avx2 >) -> mask8x32 < Avx2 > { _mm256_cmpeq_epi8 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_lt_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { - unsafe { _mm256_cmpgt_epi8(b.into(), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i8x32 < Avx2 > , b : i8x32 < Avx2 >) -> mask8x32 < Avx2 > { _mm256_cmpgt_epi8 (b . into () , a . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_le_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { - unsafe { _mm256_cmpeq_epi8(_mm256_min_epi8(a.into(), b.into()), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i8x32 < Avx2 > , b : i8x32 < Avx2 >) -> mask8x32 < Avx2 > { _mm256_cmpeq_epi8 (_mm256_min_epi8 (a . into () , b . into ()) , a . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_ge_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { - unsafe { _mm256_cmpeq_epi8(_mm256_max_epi8(a.into(), b.into()), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i8x32 < Avx2 > , b : i8x32 < Avx2 >) -> mask8x32 < Avx2 > { _mm256_cmpeq_epi8 (_mm256_max_epi8 (a . into () , b . into ()) , a . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_gt_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { - unsafe { _mm256_cmpgt_epi8(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i8x32 < Avx2 > , b : i8x32 < Avx2 >) -> mask8x32 < Avx2 > { _mm256_cmpgt_epi8 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn zip_low_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { - unsafe { - let lo = _mm256_unpacklo_epi8(a.into(), b.into()); - let hi = _mm256_unpackhi_epi8(a.into(), b.into()); - _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i8x32 < Avx2 > , b : i8x32 < Avx2 >) -> i8x32 < Avx2 > { let lo = _mm256_unpacklo_epi8 (a . into () , b . into ()) ; let hi = _mm256_unpackhi_epi8 (a . into () , b . into ()) ; _mm256_permute2x128_si256 :: < 0b0010_0000 > (lo , hi) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn zip_high_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { - unsafe { - let lo = _mm256_unpacklo_epi8(a.into(), b.into()); - let hi = _mm256_unpackhi_epi8(a.into(), b.into()); - _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i8x32 < Avx2 > , b : i8x32 < Avx2 >) -> i8x32 < Avx2 > { let lo = _mm256_unpacklo_epi8 (a . into () , b . into ()) ; let hi = _mm256_unpackhi_epi8 (a . into () , b . into ()) ; _mm256_permute2x128_si256 :: < 0b0011_0001 > (lo , hi) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn unzip_low_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { - unsafe { - let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( - a.into(), - _mm256_setr_epi8( - 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, - 14, 1, 3, 5, 7, 9, 11, 13, 15, - ), - )); - let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( - b.into(), - _mm256_setr_epi8( - 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, - 14, 1, 3, 5, 7, 9, 11, 13, 15, - ), - )); - _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i8x32 < Avx2 > , b : i8x32 < Avx2 >) -> i8x32 < Avx2 > { let t1 = _mm256_permute4x64_epi64 :: < 0b11_01_10_00 > (_mm256_shuffle_epi8 (a . into () , _mm256_setr_epi8 (0 , 2 , 4 , 6 , 8 , 10 , 12 , 14 , 1 , 3 , 5 , 7 , 9 , 11 , 13 , 15 , 0 , 2 , 4 , 6 , 8 , 10 , 12 , 14 , 1 , 3 , 5 , 7 , 9 , 11 , 13 , 15)) ,) ; let t2 = _mm256_permute4x64_epi64 :: < 0b11_01_10_00 > (_mm256_shuffle_epi8 (b . into () , _mm256_setr_epi8 (0 , 2 , 4 , 6 , 8 , 10 , 12 , 14 , 1 , 3 , 5 , 7 , 9 , 11 , 13 , 15 , 0 , 2 , 4 , 6 , 8 , 10 , 12 , 14 , 1 , 3 , 5 , 7 , 9 , 11 , 13 , 15)) ,) ; _mm256_permute2x128_si256 :: < 0b0010_0000 > (t1 , t2) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn unzip_high_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { - unsafe { - let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( - a.into(), - _mm256_setr_epi8( - 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, - 14, 1, 3, 5, 7, 9, 11, 13, 15, - ), - )); - let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( - b.into(), - _mm256_setr_epi8( - 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, - 14, 1, 3, 5, 7, 9, 11, 13, 15, - ), - )); - _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i8x32 < Avx2 > , b : i8x32 < Avx2 >) -> i8x32 < Avx2 > { let t1 = _mm256_permute4x64_epi64 :: < 0b11_01_10_00 > (_mm256_shuffle_epi8 (a . into () , _mm256_setr_epi8 (0 , 2 , 4 , 6 , 8 , 10 , 12 , 14 , 1 , 3 , 5 , 7 , 9 , 11 , 13 , 15 , 0 , 2 , 4 , 6 , 8 , 10 , 12 , 14 , 1 , 3 , 5 , 7 , 9 , 11 , 13 , 15)) ,) ; let t2 = _mm256_permute4x64_epi64 :: < 0b11_01_10_00 > (_mm256_shuffle_epi8 (b . into () , _mm256_setr_epi8 (0 , 2 , 4 , 6 , 8 , 10 , 12 , 14 , 1 , 3 , 5 , 7 , 9 , 11 , 13 , 15 , 0 , 2 , 4 , 6 , 8 , 10 , 12 , 14 , 1 , 3 , 5 , 7 , 9 , 11 , 13 , 15)) ,) ; _mm256_permute2x128_si256 :: < 0b0011_0001 > (t1 , t2) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn interleave_i8x32(self, a: i8x32, b: i8x32) -> (i8x32, i8x32) { - unsafe { - let lo = _mm256_unpacklo_epi8(a.into(), b.into()); - let hi = _mm256_unpackhi_epi8(a.into(), b.into()); - ( - _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(self), - _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(self), - ) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i8x32 < Avx2 > , b : i8x32 < Avx2 >) -> (i8x32 < Avx2 > , i8x32 < Avx2 >) { let lo = _mm256_unpacklo_epi8 (a . into () , b . into ()) ; let hi = _mm256_unpackhi_epi8 (a . into () , b . into ()) ; (_mm256_permute2x128_si256 :: < 0b0010_0000 > (lo , hi) . simd_into (token) , _mm256_permute2x128_si256 :: < 0b0011_0001 > (lo , hi) . simd_into (token) ,) } } + kernel(self, a, b) } #[inline(always)] fn deinterleave_i8x32(self, a: i8x32, b: i8x32) -> (i8x32, i8x32) { - unsafe { - let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( - a.into(), - _mm256_setr_epi8( - 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, - 14, 1, 3, 5, 7, 9, 11, 13, 15, - ), - )); - let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( - b.into(), - _mm256_setr_epi8( - 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, - 14, 1, 3, 5, 7, 9, 11, 13, 15, - ), - )); - ( - _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(self), - _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(self), - ) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i8x32 < Avx2 > , b : i8x32 < Avx2 >) -> (i8x32 < Avx2 > , i8x32 < Avx2 >) { let t1 = _mm256_permute4x64_epi64 :: < 0b11_01_10_00 > (_mm256_shuffle_epi8 (a . into () , _mm256_setr_epi8 (0 , 2 , 4 , 6 , 8 , 10 , 12 , 14 , 1 , 3 , 5 , 7 , 9 , 11 , 13 , 15 , 0 , 2 , 4 , 6 , 8 , 10 , 12 , 14 , 1 , 3 , 5 , 7 , 9 , 11 , 13 , 15)) ,) ; let t2 = _mm256_permute4x64_epi64 :: < 0b11_01_10_00 > (_mm256_shuffle_epi8 (b . into () , _mm256_setr_epi8 (0 , 2 , 4 , 6 , 8 , 10 , 12 , 14 , 1 , 3 , 5 , 7 , 9 , 11 , 13 , 15 , 0 , 2 , 4 , 6 , 8 , 10 , 12 , 14 , 1 , 3 , 5 , 7 , 9 , 11 , 13 , 15)) ,) ; (_mm256_permute2x128_si256 :: < 0b0010_0000 > (t1 , t2) . simd_into (token) , _mm256_permute2x128_si256 :: < 0b0011_0001 > (t1 , t2) . simd_into (token) ,) } } + kernel(self, a, b) } #[inline(always)] fn select_i8x32(self, a: mask8x32, b: i8x32, c: i8x32) -> i8x32 { - unsafe { _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask8x32 < Avx2 > , b : i8x32 < Avx2 > , c : i8x32 < Avx2 >) -> i8x32 < Avx2 > { _mm256_blendv_epi8 (c . into () , b . into () , a . into ()) . simd_into (token) } } + kernel(self, a, b, c) } #[inline(always)] fn min_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { - unsafe { _mm256_min_epi8(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i8x32 < Avx2 > , b : i8x32 < Avx2 >) -> i8x32 < Avx2 > { _mm256_min_epi8 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn max_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { - unsafe { _mm256_max_epi8(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i8x32 < Avx2 > , b : i8x32 < Avx2 >) -> i8x32 < Avx2 > { _mm256_max_epi8 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn combine_i8x32(self, a: i8x32, b: i8x32) -> i8x64 { @@ -2946,28 +2806,28 @@ impl Simd for Avx2 { } #[inline(always)] fn split_i8x32(self, a: i8x32) -> (i8x16, i8x16) { - unsafe { - ( - _mm256_extracti128_si256::<0>(a.into()).simd_into(self), - _mm256_extracti128_si256::<1>(a.into()).simd_into(self), - ) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i8x32 < Avx2 >) -> (i8x16 < Avx2 > , i8x16 < Avx2 >) { (_mm256_extracti128_si256 :: < 0 > (a . into ()) . simd_into (token) , _mm256_extracti128_si256 :: < 1 > (a . into ()) . simd_into (token) ,) } } + kernel(self, a) } #[inline(always)] fn neg_i8x32(self, a: i8x32) -> i8x32 { - unsafe { _mm256_sub_epi8(_mm256_setzero_si256(), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i8x32 < Avx2 >) -> i8x32 < Avx2 > { _mm256_sub_epi8 (_mm256_setzero_si256 () , a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn reinterpret_u8_i8x32(self, a: i8x32) -> u8x32 { - __m256i::from(a).simd_into(self) + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i8x32 < Avx2 >) -> u8x32 < Avx2 > { __m256i :: from (a) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn reinterpret_u32_i8x32(self, a: i8x32) -> u32x8 { - __m256i::from(a).simd_into(self) + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i8x32 < Avx2 >) -> u32x8 < Avx2 > { __m256i :: from (a) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn splat_u8x32(self, val: u8) -> u8x32 { - unsafe { _mm256_set1_epi8(val.cast_signed()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , val : u8) -> u8x32 < Avx2 > { _mm256_set1_epi8 (val . cast_signed ()) . simd_into (token) } } + kernel(self, val) } #[inline(always)] fn load_array_u8x32(self, val: [u8; 32usize]) -> u8x32 { @@ -3059,38 +2919,33 @@ impl Simd for Avx2 { } #[inline(always)] fn add_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { - unsafe { _mm256_add_epi8(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u8x32 < Avx2 > , b : u8x32 < Avx2 >) -> u8x32 < Avx2 > { _mm256_add_epi8 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn sub_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { - unsafe { _mm256_sub_epi8(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u8x32 < Avx2 > , b : u8x32 < Avx2 >) -> u8x32 < Avx2 > { _mm256_sub_epi8 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn mul_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { - unsafe { - let dst_even = _mm256_mullo_epi16(a.into(), b.into()); - let dst_odd = _mm256_mullo_epi16( - _mm256_srli_epi16::<8>(a.into()), - _mm256_srli_epi16::<8>(b.into()), - ); - _mm256_or_si256( - _mm256_slli_epi16(dst_odd, 8), - _mm256_and_si256(dst_even, _mm256_set1_epi16(0xFF)), - ) - .simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u8x32 < Avx2 > , b : u8x32 < Avx2 >) -> u8x32 < Avx2 > { let dst_even = _mm256_mullo_epi16 (a . into () , b . into ()) ; let dst_odd = _mm256_mullo_epi16 (_mm256_srli_epi16 :: < 8 > (a . into ()) , _mm256_srli_epi16 :: < 8 > (b . into ())) ; _mm256_or_si256 (_mm256_slli_epi16 (dst_odd , 8) , _mm256_and_si256 (dst_even , _mm256_set1_epi16 (0xFF))) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn and_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { - unsafe { _mm256_and_si256(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u8x32 < Avx2 > , b : u8x32 < Avx2 >) -> u8x32 < Avx2 > { _mm256_and_si256 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn or_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { - unsafe { _mm256_or_si256(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u8x32 < Avx2 > , b : u8x32 < Avx2 >) -> u8x32 < Avx2 > { _mm256_or_si256 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn xor_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { - unsafe { _mm256_xor_si256(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u8x32 < Avx2 > , b : u8x32 < Avx2 >) -> u8x32 < Avx2 > { _mm256_xor_si256 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn not_u8x32(self, a: u8x32) -> u8x32 { @@ -3098,15 +2953,8 @@ impl Simd for Avx2 { } #[inline(always)] fn shl_u8x32(self, a: u8x32, shift: u32) -> u8x32 { - unsafe { - let val = a.into(); - let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); - let lo_16 = _mm256_unpacklo_epi8(val, _mm256_setzero_si256()); - let hi_16 = _mm256_unpackhi_epi8(val, _mm256_setzero_si256()); - let lo_shifted = _mm256_sll_epi16(lo_16, shift_count); - let hi_shifted = _mm256_sll_epi16(hi_16, shift_count); - _mm256_packus_epi16(lo_shifted, hi_shifted).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u8x32 < Avx2 > , shift : u32) -> u8x32 < Avx2 > { let val = a . into () ; let shift_count = _mm_cvtsi32_si128 (shift . cast_signed ()) ; let lo_16 = _mm256_unpacklo_epi8 (val , _mm256_setzero_si256 ()) ; let hi_16 = _mm256_unpackhi_epi8 (val , _mm256_setzero_si256 ()) ; let lo_shifted = _mm256_sll_epi16 (lo_16 , shift_count) ; let hi_shifted = _mm256_sll_epi16 (hi_16 , shift_count) ; _mm256_packus_epi16 (lo_shifted , hi_shifted) . simd_into (token) } } + kernel(self, a, shift) } #[inline(always)] fn shlv_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { @@ -3114,15 +2962,8 @@ impl Simd for Avx2 { } #[inline(always)] fn shr_u8x32(self, a: u8x32, shift: u32) -> u8x32 { - unsafe { - let val = a.into(); - let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); - let lo_16 = _mm256_unpacklo_epi8(val, _mm256_setzero_si256()); - let hi_16 = _mm256_unpackhi_epi8(val, _mm256_setzero_si256()); - let lo_shifted = _mm256_srl_epi16(lo_16, shift_count); - let hi_shifted = _mm256_srl_epi16(hi_16, shift_count); - _mm256_packus_epi16(lo_shifted, hi_shifted).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u8x32 < Avx2 > , shift : u32) -> u8x32 < Avx2 > { let val = a . into () ; let shift_count = _mm_cvtsi32_si128 (shift . cast_signed ()) ; let lo_16 = _mm256_unpacklo_epi8 (val , _mm256_setzero_si256 ()) ; let hi_16 = _mm256_unpackhi_epi8 (val , _mm256_setzero_si256 ()) ; let lo_shifted = _mm256_srl_epi16 (lo_16 , shift_count) ; let hi_shifted = _mm256_srl_epi16 (hi_16 , shift_count) ; _mm256_packus_epi16 (lo_shifted , hi_shifted) . simd_into (token) } } + kernel(self, a, shift) } #[inline(always)] fn shrv_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { @@ -3130,135 +2971,73 @@ impl Simd for Avx2 { } #[inline(always)] fn simd_eq_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { - unsafe { _mm256_cmpeq_epi8(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u8x32 < Avx2 > , b : u8x32 < Avx2 >) -> mask8x32 < Avx2 > { _mm256_cmpeq_epi8 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_lt_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { - unsafe { - let sign_bit = _mm256_set1_epi8(0x80u8.cast_signed()); - let a_signed = _mm256_xor_si256(a.into(), sign_bit); - let b_signed = _mm256_xor_si256(b.into(), sign_bit); - _mm256_cmpgt_epi8(b_signed, a_signed).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u8x32 < Avx2 > , b : u8x32 < Avx2 >) -> mask8x32 < Avx2 > { let sign_bit = _mm256_set1_epi8 (0x80u8 . cast_signed ()) ; let a_signed = _mm256_xor_si256 (a . into () , sign_bit) ; let b_signed = _mm256_xor_si256 (b . into () , sign_bit) ; _mm256_cmpgt_epi8 (b_signed , a_signed) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_le_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { - unsafe { _mm256_cmpeq_epi8(_mm256_min_epu8(a.into(), b.into()), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u8x32 < Avx2 > , b : u8x32 < Avx2 >) -> mask8x32 < Avx2 > { _mm256_cmpeq_epi8 (_mm256_min_epu8 (a . into () , b . into ()) , a . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_ge_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { - unsafe { _mm256_cmpeq_epi8(_mm256_max_epu8(a.into(), b.into()), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u8x32 < Avx2 > , b : u8x32 < Avx2 >) -> mask8x32 < Avx2 > { _mm256_cmpeq_epi8 (_mm256_max_epu8 (a . into () , b . into ()) , a . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_gt_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { - unsafe { - let sign_bit = _mm256_set1_epi8(0x80u8.cast_signed()); - let a_signed = _mm256_xor_si256(a.into(), sign_bit); - let b_signed = _mm256_xor_si256(b.into(), sign_bit); - _mm256_cmpgt_epi8(a_signed, b_signed).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u8x32 < Avx2 > , b : u8x32 < Avx2 >) -> mask8x32 < Avx2 > { let sign_bit = _mm256_set1_epi8 (0x80u8 . cast_signed ()) ; let a_signed = _mm256_xor_si256 (a . into () , sign_bit) ; let b_signed = _mm256_xor_si256 (b . into () , sign_bit) ; _mm256_cmpgt_epi8 (a_signed , b_signed) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn zip_low_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { - unsafe { - let lo = _mm256_unpacklo_epi8(a.into(), b.into()); - let hi = _mm256_unpackhi_epi8(a.into(), b.into()); - _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u8x32 < Avx2 > , b : u8x32 < Avx2 >) -> u8x32 < Avx2 > { let lo = _mm256_unpacklo_epi8 (a . into () , b . into ()) ; let hi = _mm256_unpackhi_epi8 (a . into () , b . into ()) ; _mm256_permute2x128_si256 :: < 0b0010_0000 > (lo , hi) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn zip_high_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { - unsafe { - let lo = _mm256_unpacklo_epi8(a.into(), b.into()); - let hi = _mm256_unpackhi_epi8(a.into(), b.into()); - _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u8x32 < Avx2 > , b : u8x32 < Avx2 >) -> u8x32 < Avx2 > { let lo = _mm256_unpacklo_epi8 (a . into () , b . into ()) ; let hi = _mm256_unpackhi_epi8 (a . into () , b . into ()) ; _mm256_permute2x128_si256 :: < 0b0011_0001 > (lo , hi) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn unzip_low_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { - unsafe { - let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( - a.into(), - _mm256_setr_epi8( - 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, - 14, 1, 3, 5, 7, 9, 11, 13, 15, - ), - )); - let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( - b.into(), - _mm256_setr_epi8( - 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, - 14, 1, 3, 5, 7, 9, 11, 13, 15, - ), - )); - _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u8x32 < Avx2 > , b : u8x32 < Avx2 >) -> u8x32 < Avx2 > { let t1 = _mm256_permute4x64_epi64 :: < 0b11_01_10_00 > (_mm256_shuffle_epi8 (a . into () , _mm256_setr_epi8 (0 , 2 , 4 , 6 , 8 , 10 , 12 , 14 , 1 , 3 , 5 , 7 , 9 , 11 , 13 , 15 , 0 , 2 , 4 , 6 , 8 , 10 , 12 , 14 , 1 , 3 , 5 , 7 , 9 , 11 , 13 , 15)) ,) ; let t2 = _mm256_permute4x64_epi64 :: < 0b11_01_10_00 > (_mm256_shuffle_epi8 (b . into () , _mm256_setr_epi8 (0 , 2 , 4 , 6 , 8 , 10 , 12 , 14 , 1 , 3 , 5 , 7 , 9 , 11 , 13 , 15 , 0 , 2 , 4 , 6 , 8 , 10 , 12 , 14 , 1 , 3 , 5 , 7 , 9 , 11 , 13 , 15)) ,) ; _mm256_permute2x128_si256 :: < 0b0010_0000 > (t1 , t2) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn unzip_high_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { - unsafe { - let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( - a.into(), - _mm256_setr_epi8( - 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, - 14, 1, 3, 5, 7, 9, 11, 13, 15, - ), - )); - let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( - b.into(), - _mm256_setr_epi8( - 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, - 14, 1, 3, 5, 7, 9, 11, 13, 15, - ), - )); - _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u8x32 < Avx2 > , b : u8x32 < Avx2 >) -> u8x32 < Avx2 > { let t1 = _mm256_permute4x64_epi64 :: < 0b11_01_10_00 > (_mm256_shuffle_epi8 (a . into () , _mm256_setr_epi8 (0 , 2 , 4 , 6 , 8 , 10 , 12 , 14 , 1 , 3 , 5 , 7 , 9 , 11 , 13 , 15 , 0 , 2 , 4 , 6 , 8 , 10 , 12 , 14 , 1 , 3 , 5 , 7 , 9 , 11 , 13 , 15)) ,) ; let t2 = _mm256_permute4x64_epi64 :: < 0b11_01_10_00 > (_mm256_shuffle_epi8 (b . into () , _mm256_setr_epi8 (0 , 2 , 4 , 6 , 8 , 10 , 12 , 14 , 1 , 3 , 5 , 7 , 9 , 11 , 13 , 15 , 0 , 2 , 4 , 6 , 8 , 10 , 12 , 14 , 1 , 3 , 5 , 7 , 9 , 11 , 13 , 15)) ,) ; _mm256_permute2x128_si256 :: < 0b0011_0001 > (t1 , t2) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn interleave_u8x32(self, a: u8x32, b: u8x32) -> (u8x32, u8x32) { - unsafe { - let lo = _mm256_unpacklo_epi8(a.into(), b.into()); - let hi = _mm256_unpackhi_epi8(a.into(), b.into()); - ( - _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(self), - _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(self), - ) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u8x32 < Avx2 > , b : u8x32 < Avx2 >) -> (u8x32 < Avx2 > , u8x32 < Avx2 >) { let lo = _mm256_unpacklo_epi8 (a . into () , b . into ()) ; let hi = _mm256_unpackhi_epi8 (a . into () , b . into ()) ; (_mm256_permute2x128_si256 :: < 0b0010_0000 > (lo , hi) . simd_into (token) , _mm256_permute2x128_si256 :: < 0b0011_0001 > (lo , hi) . simd_into (token) ,) } } + kernel(self, a, b) } #[inline(always)] fn deinterleave_u8x32(self, a: u8x32, b: u8x32) -> (u8x32, u8x32) { - unsafe { - let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( - a.into(), - _mm256_setr_epi8( - 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, - 14, 1, 3, 5, 7, 9, 11, 13, 15, - ), - )); - let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( - b.into(), - _mm256_setr_epi8( - 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, - 14, 1, 3, 5, 7, 9, 11, 13, 15, - ), - )); - ( - _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(self), - _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(self), - ) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u8x32 < Avx2 > , b : u8x32 < Avx2 >) -> (u8x32 < Avx2 > , u8x32 < Avx2 >) { let t1 = _mm256_permute4x64_epi64 :: < 0b11_01_10_00 > (_mm256_shuffle_epi8 (a . into () , _mm256_setr_epi8 (0 , 2 , 4 , 6 , 8 , 10 , 12 , 14 , 1 , 3 , 5 , 7 , 9 , 11 , 13 , 15 , 0 , 2 , 4 , 6 , 8 , 10 , 12 , 14 , 1 , 3 , 5 , 7 , 9 , 11 , 13 , 15)) ,) ; let t2 = _mm256_permute4x64_epi64 :: < 0b11_01_10_00 > (_mm256_shuffle_epi8 (b . into () , _mm256_setr_epi8 (0 , 2 , 4 , 6 , 8 , 10 , 12 , 14 , 1 , 3 , 5 , 7 , 9 , 11 , 13 , 15 , 0 , 2 , 4 , 6 , 8 , 10 , 12 , 14 , 1 , 3 , 5 , 7 , 9 , 11 , 13 , 15)) ,) ; (_mm256_permute2x128_si256 :: < 0b0010_0000 > (t1 , t2) . simd_into (token) , _mm256_permute2x128_si256 :: < 0b0011_0001 > (t1 , t2) . simd_into (token) ,) } } + kernel(self, a, b) } #[inline(always)] fn select_u8x32(self, a: mask8x32, b: u8x32, c: u8x32) -> u8x32 { - unsafe { _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask8x32 < Avx2 > , b : u8x32 < Avx2 > , c : u8x32 < Avx2 >) -> u8x32 < Avx2 > { _mm256_blendv_epi8 (c . into () , b . into () , a . into ()) . simd_into (token) } } + kernel(self, a, b, c) } #[inline(always)] fn min_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { - unsafe { _mm256_min_epu8(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u8x32 < Avx2 > , b : u8x32 < Avx2 >) -> u8x32 < Avx2 > { _mm256_min_epu8 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn max_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { - unsafe { _mm256_max_epu8(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u8x32 < Avx2 > , b : u8x32 < Avx2 >) -> u8x32 < Avx2 > { _mm256_max_epu8 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn combine_u8x32(self, a: u8x32, b: u8x32) -> u8x64 { @@ -3269,32 +3048,23 @@ impl Simd for Avx2 { } #[inline(always)] fn split_u8x32(self, a: u8x32) -> (u8x16, u8x16) { - unsafe { - ( - _mm256_extracti128_si256::<0>(a.into()).simd_into(self), - _mm256_extracti128_si256::<1>(a.into()).simd_into(self), - ) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u8x32 < Avx2 >) -> (u8x16 < Avx2 > , u8x16 < Avx2 >) { (_mm256_extracti128_si256 :: < 0 > (a . into ()) . simd_into (token) , _mm256_extracti128_si256 :: < 1 > (a . into ()) . simd_into (token) ,) } } + kernel(self, a) } #[inline(always)] fn widen_u8x32(self, a: u8x32) -> u16x32 { - unsafe { - let (a0, a1) = self.split_u8x32(a); - let high = _mm256_cvtepu8_epi16(a0.into()).simd_into(self); - let low = _mm256_cvtepu8_epi16(a1.into()).simd_into(self); - self.combine_u16x16(high, low) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u8x32 < Avx2 >) -> u16x32 < Avx2 > { let (a0 , a1) = token . split_u8x32 (a) ; let high = _mm256_cvtepu8_epi16 (a0 . into ()) . simd_into (token) ; let low = _mm256_cvtepu8_epi16 (a1 . into ()) . simd_into (token) ; token . combine_u16x16 (high , low) } } + kernel(self, a) } #[inline(always)] fn reinterpret_u32_u8x32(self, a: u8x32) -> u32x8 { - __m256i::from(a).simd_into(self) + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u8x32 < Avx2 >) -> u32x8 < Avx2 > { __m256i :: from (a) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn splat_mask8x32(self, val: bool) -> mask8x32 { - unsafe { - let val: i8 = if val { !0 } else { 0 }; - _mm256_set1_epi8(val).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , val : bool) -> mask8x32 < Avx2 > { let val : i8 = if val { ! 0 } else { 0 } ; _mm256_set1_epi8 (val) . simd_into (token) } } + kernel(self, val) } #[inline(always)] fn load_array_mask8x32(self, val: [i8; 32usize]) -> mask8x32 { @@ -3309,40 +3079,28 @@ impl Simd for Avx2 { } #[inline(always)] fn from_bitmask_mask8x32(self, bits: u64) -> mask8x32 { - unsafe { - { - let bit_bytes = _mm256_broadcastsi128_si256(_mm_cvtsi32_si128(bits as i32)); - let bit_bytes = _mm256_shuffle_epi8( - bit_bytes, - _mm256_setr_epi8( - 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, - 3, 3, 3, 3, 3, 3, 3, - ), - ); - let bit_mask = _mm256_setr_epi8( - 1, 2, 4, 8, 16, 32, 64, -128, 1, 2, 4, 8, 16, 32, 64, -128, 1, 2, 4, 8, 16, 32, - 64, -128, 1, 2, 4, 8, 16, 32, 64, -128, - ); - _mm256_cmpeq_epi8(_mm256_and_si256(bit_bytes, bit_mask), bit_mask) - } - .simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , bits : u64) -> mask8x32 < Avx2 > { { let bit_bytes = _mm256_broadcastsi128_si256 (_mm_cvtsi32_si128 (bits as i32)) ; let bit_bytes = _mm256_shuffle_epi8 (bit_bytes , _mm256_setr_epi8 (0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3)) ; let bit_mask = _mm256_setr_epi8 (1 , 2 , 4 , 8 , 16 , 32 , 64 , - 128 , 1 , 2 , 4 , 8 , 16 , 32 , 64 , - 128 , 1 , 2 , 4 , 8 , 16 , 32 , 64 , - 128 , 1 , 2 , 4 , 8 , 16 , 32 , 64 , - 128) ; _mm256_cmpeq_epi8 (_mm256_and_si256 (bit_bytes , bit_mask) , bit_mask) } . simd_into (token) } } + kernel(self, bits) } #[inline(always)] fn to_bitmask_mask8x32(self, a: mask8x32) -> u64 { - unsafe { _mm256_movemask_epi8(a.into()) as u32 as u64 } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask8x32 < Avx2 >) -> u64 { _mm256_movemask_epi8 (a . into ()) as u32 as u64 } } + kernel(self, a) } #[inline(always)] fn and_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { - unsafe { _mm256_and_si256(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask8x32 < Avx2 > , b : mask8x32 < Avx2 >) -> mask8x32 < Avx2 > { _mm256_and_si256 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn or_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { - unsafe { _mm256_or_si256(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask8x32 < Avx2 > , b : mask8x32 < Avx2 >) -> mask8x32 < Avx2 > { _mm256_or_si256 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn xor_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { - unsafe { _mm256_xor_si256(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask8x32 < Avx2 > , b : mask8x32 < Avx2 >) -> mask8x32 < Avx2 > { _mm256_xor_si256 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn not_mask8x32(self, a: mask8x32) -> mask8x32 { @@ -3355,27 +3113,33 @@ impl Simd for Avx2 { b: mask8x32, c: mask8x32, ) -> mask8x32 { - unsafe { _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask8x32 < Avx2 > , b : mask8x32 < Avx2 > , c : mask8x32 < Avx2 >) -> mask8x32 < Avx2 > { _mm256_blendv_epi8 (c . into () , b . into () , a . into ()) . simd_into (token) } } + kernel(self, a, b, c) } #[inline(always)] fn simd_eq_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { - unsafe { _mm256_cmpeq_epi8(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask8x32 < Avx2 > , b : mask8x32 < Avx2 >) -> mask8x32 < Avx2 > { _mm256_cmpeq_epi8 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn any_true_mask8x32(self, a: mask8x32) -> bool { - unsafe { _mm256_movemask_epi8(a.into()) as u32 != 0 } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask8x32 < Avx2 >) -> bool { _mm256_movemask_epi8 (a . into ()) as u32 != 0 } } + kernel(self, a) } #[inline(always)] fn all_true_mask8x32(self, a: mask8x32) -> bool { - unsafe { _mm256_movemask_epi8(a.into()) as u32 == 0xffffffff } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask8x32 < Avx2 >) -> bool { _mm256_movemask_epi8 (a . into ()) as u32 == 0xffffffff } } + kernel(self, a) } #[inline(always)] fn any_false_mask8x32(self, a: mask8x32) -> bool { - unsafe { _mm256_movemask_epi8(a.into()) as u32 != 0xffffffff } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask8x32 < Avx2 >) -> bool { _mm256_movemask_epi8 (a . into ()) as u32 != 0xffffffff } } + kernel(self, a) } #[inline(always)] fn all_false_mask8x32(self, a: mask8x32) -> bool { - unsafe { _mm256_movemask_epi8(a.into()) as u32 == 0 } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask8x32 < Avx2 >) -> bool { _mm256_movemask_epi8 (a . into ()) as u32 == 0 } } + kernel(self, a) } #[inline(always)] fn combine_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x64 { @@ -3386,16 +3150,13 @@ impl Simd for Avx2 { } #[inline(always)] fn split_mask8x32(self, a: mask8x32) -> (mask8x16, mask8x16) { - unsafe { - ( - _mm256_extracti128_si256::<0>(a.into()).simd_into(self), - _mm256_extracti128_si256::<1>(a.into()).simd_into(self), - ) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask8x32 < Avx2 >) -> (mask8x16 < Avx2 > , mask8x16 < Avx2 >) { (_mm256_extracti128_si256 :: < 0 > (a . into ()) . simd_into (token) , _mm256_extracti128_si256 :: < 1 > (a . into ()) . simd_into (token) ,) } } + kernel(self, a) } #[inline(always)] fn splat_i16x16(self, val: i16) -> i16x16 { - unsafe { _mm256_set1_epi16(val).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , val : i16) -> i16x16 < Avx2 > { _mm256_set1_epi16 (val) . simd_into (token) } } + kernel(self, val) } #[inline(always)] fn load_array_i16x16(self, val: [i16; 16usize]) -> i16x16 { @@ -3487,27 +3248,33 @@ impl Simd for Avx2 { } #[inline(always)] fn add_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - unsafe { _mm256_add_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i16x16 < Avx2 > , b : i16x16 < Avx2 >) -> i16x16 < Avx2 > { _mm256_add_epi16 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn sub_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - unsafe { _mm256_sub_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i16x16 < Avx2 > , b : i16x16 < Avx2 >) -> i16x16 < Avx2 > { _mm256_sub_epi16 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn mul_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - unsafe { _mm256_mullo_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i16x16 < Avx2 > , b : i16x16 < Avx2 >) -> i16x16 < Avx2 > { _mm256_mullo_epi16 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn and_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - unsafe { _mm256_and_si256(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i16x16 < Avx2 > , b : i16x16 < Avx2 >) -> i16x16 < Avx2 > { _mm256_and_si256 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn or_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - unsafe { _mm256_or_si256(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i16x16 < Avx2 > , b : i16x16 < Avx2 >) -> i16x16 < Avx2 > { _mm256_or_si256 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn xor_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - unsafe { _mm256_xor_si256(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i16x16 < Avx2 > , b : i16x16 < Avx2 >) -> i16x16 < Avx2 > { _mm256_xor_si256 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn not_i16x16(self, a: i16x16) -> i16x16 { @@ -3515,9 +3282,8 @@ impl Simd for Avx2 { } #[inline(always)] fn shl_i16x16(self, a: i16x16, shift: u32) -> i16x16 { - unsafe { - _mm256_sll_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i16x16 < Avx2 > , shift : u32) -> i16x16 < Avx2 > { _mm256_sll_epi16 (a . into () , _mm_cvtsi32_si128 (shift . cast_signed ())) . simd_into (token) } } + kernel(self, a, shift) } #[inline(always)] fn shlv_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { @@ -3525,9 +3291,8 @@ impl Simd for Avx2 { } #[inline(always)] fn shr_i16x16(self, a: i16x16, shift: u32) -> i16x16 { - unsafe { - _mm256_sra_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i16x16 < Avx2 > , shift : u32) -> i16x16 < Avx2 > { _mm256_sra_epi16 (a . into () , _mm_cvtsi32_si128 (shift . cast_signed ())) . simd_into (token) } } + kernel(self, a, shift) } #[inline(always)] fn shrv_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { @@ -3535,129 +3300,73 @@ impl Simd for Avx2 { } #[inline(always)] fn simd_eq_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { - unsafe { _mm256_cmpeq_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i16x16 < Avx2 > , b : i16x16 < Avx2 >) -> mask16x16 < Avx2 > { _mm256_cmpeq_epi16 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_lt_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { - unsafe { _mm256_cmpgt_epi16(b.into(), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i16x16 < Avx2 > , b : i16x16 < Avx2 >) -> mask16x16 < Avx2 > { _mm256_cmpgt_epi16 (b . into () , a . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_le_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { - unsafe { - _mm256_cmpeq_epi16(_mm256_min_epi16(a.into(), b.into()), a.into()).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i16x16 < Avx2 > , b : i16x16 < Avx2 >) -> mask16x16 < Avx2 > { _mm256_cmpeq_epi16 (_mm256_min_epi16 (a . into () , b . into ()) , a . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_ge_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { - unsafe { - _mm256_cmpeq_epi16(_mm256_max_epi16(a.into(), b.into()), a.into()).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i16x16 < Avx2 > , b : i16x16 < Avx2 >) -> mask16x16 < Avx2 > { _mm256_cmpeq_epi16 (_mm256_max_epi16 (a . into () , b . into ()) , a . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_gt_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { - unsafe { _mm256_cmpgt_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i16x16 < Avx2 > , b : i16x16 < Avx2 >) -> mask16x16 < Avx2 > { _mm256_cmpgt_epi16 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn zip_low_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - unsafe { - let lo = _mm256_unpacklo_epi16(a.into(), b.into()); - let hi = _mm256_unpackhi_epi16(a.into(), b.into()); - _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i16x16 < Avx2 > , b : i16x16 < Avx2 >) -> i16x16 < Avx2 > { let lo = _mm256_unpacklo_epi16 (a . into () , b . into ()) ; let hi = _mm256_unpackhi_epi16 (a . into () , b . into ()) ; _mm256_permute2x128_si256 :: < 0b0010_0000 > (lo , hi) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn zip_high_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - unsafe { - let lo = _mm256_unpacklo_epi16(a.into(), b.into()); - let hi = _mm256_unpackhi_epi16(a.into(), b.into()); - _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i16x16 < Avx2 > , b : i16x16 < Avx2 >) -> i16x16 < Avx2 > { let lo = _mm256_unpacklo_epi16 (a . into () , b . into ()) ; let hi = _mm256_unpackhi_epi16 (a . into () , b . into ()) ; _mm256_permute2x128_si256 :: < 0b0011_0001 > (lo , hi) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn unzip_low_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - unsafe { - let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( - a.into(), - _mm256_setr_epi8( - 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, 13, - 2, 3, 6, 7, 10, 11, 14, 15, - ), - )); - let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( - b.into(), - _mm256_setr_epi8( - 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, 13, - 2, 3, 6, 7, 10, 11, 14, 15, - ), - )); - _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i16x16 < Avx2 > , b : i16x16 < Avx2 >) -> i16x16 < Avx2 > { let t1 = _mm256_permute4x64_epi64 :: < 0b11_01_10_00 > (_mm256_shuffle_epi8 (a . into () , _mm256_setr_epi8 (0 , 1 , 4 , 5 , 8 , 9 , 12 , 13 , 2 , 3 , 6 , 7 , 10 , 11 , 14 , 15 , 0 , 1 , 4 , 5 , 8 , 9 , 12 , 13 , 2 , 3 , 6 , 7 , 10 , 11 , 14 , 15)) ,) ; let t2 = _mm256_permute4x64_epi64 :: < 0b11_01_10_00 > (_mm256_shuffle_epi8 (b . into () , _mm256_setr_epi8 (0 , 1 , 4 , 5 , 8 , 9 , 12 , 13 , 2 , 3 , 6 , 7 , 10 , 11 , 14 , 15 , 0 , 1 , 4 , 5 , 8 , 9 , 12 , 13 , 2 , 3 , 6 , 7 , 10 , 11 , 14 , 15)) ,) ; _mm256_permute2x128_si256 :: < 0b0010_0000 > (t1 , t2) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn unzip_high_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - unsafe { - let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( - a.into(), - _mm256_setr_epi8( - 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, 13, - 2, 3, 6, 7, 10, 11, 14, 15, - ), - )); - let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( - b.into(), - _mm256_setr_epi8( - 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, 13, - 2, 3, 6, 7, 10, 11, 14, 15, - ), - )); - _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i16x16 < Avx2 > , b : i16x16 < Avx2 >) -> i16x16 < Avx2 > { let t1 = _mm256_permute4x64_epi64 :: < 0b11_01_10_00 > (_mm256_shuffle_epi8 (a . into () , _mm256_setr_epi8 (0 , 1 , 4 , 5 , 8 , 9 , 12 , 13 , 2 , 3 , 6 , 7 , 10 , 11 , 14 , 15 , 0 , 1 , 4 , 5 , 8 , 9 , 12 , 13 , 2 , 3 , 6 , 7 , 10 , 11 , 14 , 15)) ,) ; let t2 = _mm256_permute4x64_epi64 :: < 0b11_01_10_00 > (_mm256_shuffle_epi8 (b . into () , _mm256_setr_epi8 (0 , 1 , 4 , 5 , 8 , 9 , 12 , 13 , 2 , 3 , 6 , 7 , 10 , 11 , 14 , 15 , 0 , 1 , 4 , 5 , 8 , 9 , 12 , 13 , 2 , 3 , 6 , 7 , 10 , 11 , 14 , 15)) ,) ; _mm256_permute2x128_si256 :: < 0b0011_0001 > (t1 , t2) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn interleave_i16x16(self, a: i16x16, b: i16x16) -> (i16x16, i16x16) { - unsafe { - let lo = _mm256_unpacklo_epi16(a.into(), b.into()); - let hi = _mm256_unpackhi_epi16(a.into(), b.into()); - ( - _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(self), - _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(self), - ) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i16x16 < Avx2 > , b : i16x16 < Avx2 >) -> (i16x16 < Avx2 > , i16x16 < Avx2 >) { let lo = _mm256_unpacklo_epi16 (a . into () , b . into ()) ; let hi = _mm256_unpackhi_epi16 (a . into () , b . into ()) ; (_mm256_permute2x128_si256 :: < 0b0010_0000 > (lo , hi) . simd_into (token) , _mm256_permute2x128_si256 :: < 0b0011_0001 > (lo , hi) . simd_into (token) ,) } } + kernel(self, a, b) } #[inline(always)] fn deinterleave_i16x16(self, a: i16x16, b: i16x16) -> (i16x16, i16x16) { - unsafe { - let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( - a.into(), - _mm256_setr_epi8( - 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, 13, - 2, 3, 6, 7, 10, 11, 14, 15, - ), - )); - let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( - b.into(), - _mm256_setr_epi8( - 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, 13, - 2, 3, 6, 7, 10, 11, 14, 15, - ), - )); - ( - _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(self), - _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(self), - ) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i16x16 < Avx2 > , b : i16x16 < Avx2 >) -> (i16x16 < Avx2 > , i16x16 < Avx2 >) { let t1 = _mm256_permute4x64_epi64 :: < 0b11_01_10_00 > (_mm256_shuffle_epi8 (a . into () , _mm256_setr_epi8 (0 , 1 , 4 , 5 , 8 , 9 , 12 , 13 , 2 , 3 , 6 , 7 , 10 , 11 , 14 , 15 , 0 , 1 , 4 , 5 , 8 , 9 , 12 , 13 , 2 , 3 , 6 , 7 , 10 , 11 , 14 , 15)) ,) ; let t2 = _mm256_permute4x64_epi64 :: < 0b11_01_10_00 > (_mm256_shuffle_epi8 (b . into () , _mm256_setr_epi8 (0 , 1 , 4 , 5 , 8 , 9 , 12 , 13 , 2 , 3 , 6 , 7 , 10 , 11 , 14 , 15 , 0 , 1 , 4 , 5 , 8 , 9 , 12 , 13 , 2 , 3 , 6 , 7 , 10 , 11 , 14 , 15)) ,) ; (_mm256_permute2x128_si256 :: < 0b0010_0000 > (t1 , t2) . simd_into (token) , _mm256_permute2x128_si256 :: < 0b0011_0001 > (t1 , t2) . simd_into (token) ,) } } + kernel(self, a, b) } #[inline(always)] fn select_i16x16(self, a: mask16x16, b: i16x16, c: i16x16) -> i16x16 { - unsafe { _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask16x16 < Avx2 > , b : i16x16 < Avx2 > , c : i16x16 < Avx2 >) -> i16x16 < Avx2 > { _mm256_blendv_epi8 (c . into () , b . into () , a . into ()) . simd_into (token) } } + kernel(self, a, b, c) } #[inline(always)] fn min_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - unsafe { _mm256_min_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i16x16 < Avx2 > , b : i16x16 < Avx2 >) -> i16x16 < Avx2 > { _mm256_min_epi16 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn max_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - unsafe { _mm256_max_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i16x16 < Avx2 > , b : i16x16 < Avx2 >) -> i16x16 < Avx2 > { _mm256_max_epi16 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn combine_i16x16(self, a: i16x16, b: i16x16) -> i16x32 { @@ -3668,28 +3377,28 @@ impl Simd for Avx2 { } #[inline(always)] fn split_i16x16(self, a: i16x16) -> (i16x8, i16x8) { - unsafe { - ( - _mm256_extracti128_si256::<0>(a.into()).simd_into(self), - _mm256_extracti128_si256::<1>(a.into()).simd_into(self), - ) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i16x16 < Avx2 >) -> (i16x8 < Avx2 > , i16x8 < Avx2 >) { (_mm256_extracti128_si256 :: < 0 > (a . into ()) . simd_into (token) , _mm256_extracti128_si256 :: < 1 > (a . into ()) . simd_into (token) ,) } } + kernel(self, a) } #[inline(always)] fn neg_i16x16(self, a: i16x16) -> i16x16 { - unsafe { _mm256_sub_epi16(_mm256_setzero_si256(), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i16x16 < Avx2 >) -> i16x16 < Avx2 > { _mm256_sub_epi16 (_mm256_setzero_si256 () , a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn reinterpret_u8_i16x16(self, a: i16x16) -> u8x32 { - __m256i::from(a).simd_into(self) + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i16x16 < Avx2 >) -> u8x32 < Avx2 > { __m256i :: from (a) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn reinterpret_u32_i16x16(self, a: i16x16) -> u32x8 { - __m256i::from(a).simd_into(self) + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i16x16 < Avx2 >) -> u32x8 < Avx2 > { __m256i :: from (a) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn splat_u16x16(self, val: u16) -> u16x16 { - unsafe { _mm256_set1_epi16(val.cast_signed()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , val : u16) -> u16x16 < Avx2 > { _mm256_set1_epi16 (val . cast_signed ()) . simd_into (token) } } + kernel(self, val) } #[inline(always)] fn load_array_u16x16(self, val: [u16; 16usize]) -> u16x16 { @@ -3781,27 +3490,33 @@ impl Simd for Avx2 { } #[inline(always)] fn add_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { - unsafe { _mm256_add_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u16x16 < Avx2 > , b : u16x16 < Avx2 >) -> u16x16 < Avx2 > { _mm256_add_epi16 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn sub_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { - unsafe { _mm256_sub_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u16x16 < Avx2 > , b : u16x16 < Avx2 >) -> u16x16 < Avx2 > { _mm256_sub_epi16 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn mul_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { - unsafe { _mm256_mullo_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u16x16 < Avx2 > , b : u16x16 < Avx2 >) -> u16x16 < Avx2 > { _mm256_mullo_epi16 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn and_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { - unsafe { _mm256_and_si256(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u16x16 < Avx2 > , b : u16x16 < Avx2 >) -> u16x16 < Avx2 > { _mm256_and_si256 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn or_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { - unsafe { _mm256_or_si256(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u16x16 < Avx2 > , b : u16x16 < Avx2 >) -> u16x16 < Avx2 > { _mm256_or_si256 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn xor_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { - unsafe { _mm256_xor_si256(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u16x16 < Avx2 > , b : u16x16 < Avx2 >) -> u16x16 < Avx2 > { _mm256_xor_si256 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn not_u16x16(self, a: u16x16) -> u16x16 { @@ -3809,9 +3524,8 @@ impl Simd for Avx2 { } #[inline(always)] fn shl_u16x16(self, a: u16x16, shift: u32) -> u16x16 { - unsafe { - _mm256_sll_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u16x16 < Avx2 > , shift : u32) -> u16x16 < Avx2 > { _mm256_sll_epi16 (a . into () , _mm_cvtsi32_si128 (shift . cast_signed ())) . simd_into (token) } } + kernel(self, a, shift) } #[inline(always)] fn shlv_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { @@ -3819,9 +3533,8 @@ impl Simd for Avx2 { } #[inline(always)] fn shr_u16x16(self, a: u16x16, shift: u32) -> u16x16 { - unsafe { - _mm256_srl_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u16x16 < Avx2 > , shift : u32) -> u16x16 < Avx2 > { _mm256_srl_epi16 (a . into () , _mm_cvtsi32_si128 (shift . cast_signed ())) . simd_into (token) } } + kernel(self, a, shift) } #[inline(always)] fn shrv_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { @@ -3829,139 +3542,73 @@ impl Simd for Avx2 { } #[inline(always)] fn simd_eq_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { - unsafe { _mm256_cmpeq_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u16x16 < Avx2 > , b : u16x16 < Avx2 >) -> mask16x16 < Avx2 > { _mm256_cmpeq_epi16 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_lt_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { - unsafe { - let sign_bit = _mm256_set1_epi16(0x8000u16.cast_signed()); - let a_signed = _mm256_xor_si256(a.into(), sign_bit); - let b_signed = _mm256_xor_si256(b.into(), sign_bit); - _mm256_cmpgt_epi16(b_signed, a_signed).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u16x16 < Avx2 > , b : u16x16 < Avx2 >) -> mask16x16 < Avx2 > { let sign_bit = _mm256_set1_epi16 (0x8000u16 . cast_signed ()) ; let a_signed = _mm256_xor_si256 (a . into () , sign_bit) ; let b_signed = _mm256_xor_si256 (b . into () , sign_bit) ; _mm256_cmpgt_epi16 (b_signed , a_signed) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_le_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { - unsafe { - _mm256_cmpeq_epi16(_mm256_min_epu16(a.into(), b.into()), a.into()).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u16x16 < Avx2 > , b : u16x16 < Avx2 >) -> mask16x16 < Avx2 > { _mm256_cmpeq_epi16 (_mm256_min_epu16 (a . into () , b . into ()) , a . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_ge_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { - unsafe { - _mm256_cmpeq_epi16(_mm256_max_epu16(a.into(), b.into()), a.into()).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u16x16 < Avx2 > , b : u16x16 < Avx2 >) -> mask16x16 < Avx2 > { _mm256_cmpeq_epi16 (_mm256_max_epu16 (a . into () , b . into ()) , a . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_gt_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { - unsafe { - let sign_bit = _mm256_set1_epi16(0x8000u16.cast_signed()); - let a_signed = _mm256_xor_si256(a.into(), sign_bit); - let b_signed = _mm256_xor_si256(b.into(), sign_bit); - _mm256_cmpgt_epi16(a_signed, b_signed).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u16x16 < Avx2 > , b : u16x16 < Avx2 >) -> mask16x16 < Avx2 > { let sign_bit = _mm256_set1_epi16 (0x8000u16 . cast_signed ()) ; let a_signed = _mm256_xor_si256 (a . into () , sign_bit) ; let b_signed = _mm256_xor_si256 (b . into () , sign_bit) ; _mm256_cmpgt_epi16 (a_signed , b_signed) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn zip_low_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { - unsafe { - let lo = _mm256_unpacklo_epi16(a.into(), b.into()); - let hi = _mm256_unpackhi_epi16(a.into(), b.into()); - _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u16x16 < Avx2 > , b : u16x16 < Avx2 >) -> u16x16 < Avx2 > { let lo = _mm256_unpacklo_epi16 (a . into () , b . into ()) ; let hi = _mm256_unpackhi_epi16 (a . into () , b . into ()) ; _mm256_permute2x128_si256 :: < 0b0010_0000 > (lo , hi) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn zip_high_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { - unsafe { - let lo = _mm256_unpacklo_epi16(a.into(), b.into()); - let hi = _mm256_unpackhi_epi16(a.into(), b.into()); - _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u16x16 < Avx2 > , b : u16x16 < Avx2 >) -> u16x16 < Avx2 > { let lo = _mm256_unpacklo_epi16 (a . into () , b . into ()) ; let hi = _mm256_unpackhi_epi16 (a . into () , b . into ()) ; _mm256_permute2x128_si256 :: < 0b0011_0001 > (lo , hi) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn unzip_low_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { - unsafe { - let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( - a.into(), - _mm256_setr_epi8( - 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, 13, - 2, 3, 6, 7, 10, 11, 14, 15, - ), - )); - let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( - b.into(), - _mm256_setr_epi8( - 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, 13, - 2, 3, 6, 7, 10, 11, 14, 15, - ), - )); - _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u16x16 < Avx2 > , b : u16x16 < Avx2 >) -> u16x16 < Avx2 > { let t1 = _mm256_permute4x64_epi64 :: < 0b11_01_10_00 > (_mm256_shuffle_epi8 (a . into () , _mm256_setr_epi8 (0 , 1 , 4 , 5 , 8 , 9 , 12 , 13 , 2 , 3 , 6 , 7 , 10 , 11 , 14 , 15 , 0 , 1 , 4 , 5 , 8 , 9 , 12 , 13 , 2 , 3 , 6 , 7 , 10 , 11 , 14 , 15)) ,) ; let t2 = _mm256_permute4x64_epi64 :: < 0b11_01_10_00 > (_mm256_shuffle_epi8 (b . into () , _mm256_setr_epi8 (0 , 1 , 4 , 5 , 8 , 9 , 12 , 13 , 2 , 3 , 6 , 7 , 10 , 11 , 14 , 15 , 0 , 1 , 4 , 5 , 8 , 9 , 12 , 13 , 2 , 3 , 6 , 7 , 10 , 11 , 14 , 15)) ,) ; _mm256_permute2x128_si256 :: < 0b0010_0000 > (t1 , t2) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn unzip_high_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { - unsafe { - let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( - a.into(), - _mm256_setr_epi8( - 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, 13, - 2, 3, 6, 7, 10, 11, 14, 15, - ), - )); - let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( - b.into(), - _mm256_setr_epi8( - 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, 13, - 2, 3, 6, 7, 10, 11, 14, 15, - ), - )); - _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u16x16 < Avx2 > , b : u16x16 < Avx2 >) -> u16x16 < Avx2 > { let t1 = _mm256_permute4x64_epi64 :: < 0b11_01_10_00 > (_mm256_shuffle_epi8 (a . into () , _mm256_setr_epi8 (0 , 1 , 4 , 5 , 8 , 9 , 12 , 13 , 2 , 3 , 6 , 7 , 10 , 11 , 14 , 15 , 0 , 1 , 4 , 5 , 8 , 9 , 12 , 13 , 2 , 3 , 6 , 7 , 10 , 11 , 14 , 15)) ,) ; let t2 = _mm256_permute4x64_epi64 :: < 0b11_01_10_00 > (_mm256_shuffle_epi8 (b . into () , _mm256_setr_epi8 (0 , 1 , 4 , 5 , 8 , 9 , 12 , 13 , 2 , 3 , 6 , 7 , 10 , 11 , 14 , 15 , 0 , 1 , 4 , 5 , 8 , 9 , 12 , 13 , 2 , 3 , 6 , 7 , 10 , 11 , 14 , 15)) ,) ; _mm256_permute2x128_si256 :: < 0b0011_0001 > (t1 , t2) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn interleave_u16x16(self, a: u16x16, b: u16x16) -> (u16x16, u16x16) { - unsafe { - let lo = _mm256_unpacklo_epi16(a.into(), b.into()); - let hi = _mm256_unpackhi_epi16(a.into(), b.into()); - ( - _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(self), - _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(self), - ) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u16x16 < Avx2 > , b : u16x16 < Avx2 >) -> (u16x16 < Avx2 > , u16x16 < Avx2 >) { let lo = _mm256_unpacklo_epi16 (a . into () , b . into ()) ; let hi = _mm256_unpackhi_epi16 (a . into () , b . into ()) ; (_mm256_permute2x128_si256 :: < 0b0010_0000 > (lo , hi) . simd_into (token) , _mm256_permute2x128_si256 :: < 0b0011_0001 > (lo , hi) . simd_into (token) ,) } } + kernel(self, a, b) } #[inline(always)] fn deinterleave_u16x16(self, a: u16x16, b: u16x16) -> (u16x16, u16x16) { - unsafe { - let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( - a.into(), - _mm256_setr_epi8( - 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, 13, - 2, 3, 6, 7, 10, 11, 14, 15, - ), - )); - let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( - b.into(), - _mm256_setr_epi8( - 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, 13, - 2, 3, 6, 7, 10, 11, 14, 15, - ), - )); - ( - _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(self), - _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(self), - ) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u16x16 < Avx2 > , b : u16x16 < Avx2 >) -> (u16x16 < Avx2 > , u16x16 < Avx2 >) { let t1 = _mm256_permute4x64_epi64 :: < 0b11_01_10_00 > (_mm256_shuffle_epi8 (a . into () , _mm256_setr_epi8 (0 , 1 , 4 , 5 , 8 , 9 , 12 , 13 , 2 , 3 , 6 , 7 , 10 , 11 , 14 , 15 , 0 , 1 , 4 , 5 , 8 , 9 , 12 , 13 , 2 , 3 , 6 , 7 , 10 , 11 , 14 , 15)) ,) ; let t2 = _mm256_permute4x64_epi64 :: < 0b11_01_10_00 > (_mm256_shuffle_epi8 (b . into () , _mm256_setr_epi8 (0 , 1 , 4 , 5 , 8 , 9 , 12 , 13 , 2 , 3 , 6 , 7 , 10 , 11 , 14 , 15 , 0 , 1 , 4 , 5 , 8 , 9 , 12 , 13 , 2 , 3 , 6 , 7 , 10 , 11 , 14 , 15)) ,) ; (_mm256_permute2x128_si256 :: < 0b0010_0000 > (t1 , t2) . simd_into (token) , _mm256_permute2x128_si256 :: < 0b0011_0001 > (t1 , t2) . simd_into (token) ,) } } + kernel(self, a, b) } #[inline(always)] fn select_u16x16(self, a: mask16x16, b: u16x16, c: u16x16) -> u16x16 { - unsafe { _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask16x16 < Avx2 > , b : u16x16 < Avx2 > , c : u16x16 < Avx2 >) -> u16x16 < Avx2 > { _mm256_blendv_epi8 (c . into () , b . into () , a . into ()) . simd_into (token) } } + kernel(self, a, b, c) } #[inline(always)] fn min_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { - unsafe { _mm256_min_epu16(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u16x16 < Avx2 > , b : u16x16 < Avx2 >) -> u16x16 < Avx2 > { _mm256_min_epu16 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn max_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { - unsafe { _mm256_max_epu16(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u16x16 < Avx2 > , b : u16x16 < Avx2 >) -> u16x16 < Avx2 > { _mm256_max_epu16 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn combine_u16x16(self, a: u16x16, b: u16x16) -> u16x32 { @@ -3972,39 +3619,28 @@ impl Simd for Avx2 { } #[inline(always)] fn split_u16x16(self, a: u16x16) -> (u16x8, u16x8) { - unsafe { - ( - _mm256_extracti128_si256::<0>(a.into()).simd_into(self), - _mm256_extracti128_si256::<1>(a.into()).simd_into(self), - ) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u16x16 < Avx2 >) -> (u16x8 < Avx2 > , u16x8 < Avx2 >) { (_mm256_extracti128_si256 :: < 0 > (a . into ()) . simd_into (token) , _mm256_extracti128_si256 :: < 1 > (a . into ()) . simd_into (token) ,) } } + kernel(self, a) } #[inline(always)] fn narrow_u16x16(self, a: u16x16) -> u8x16 { - unsafe { - let mask = _mm256_setr_epi8( - 0, 2, 4, 6, 8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, -1, 0, 2, 4, 6, 8, 10, 12, - 14, -1, -1, -1, -1, -1, -1, -1, -1, - ); - let shuffled = _mm256_shuffle_epi8(a.into(), mask); - let packed = _mm256_permute4x64_epi64::<0b11_01_10_00>(shuffled); - _mm256_castsi256_si128(packed).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u16x16 < Avx2 >) -> u8x16 < Avx2 > { let mask = _mm256_setr_epi8 (0 , 2 , 4 , 6 , 8 , 10 , 12 , 14 , - 1 , - 1 , - 1 , - 1 , - 1 , - 1 , - 1 , - 1 , 0 , 2 , 4 , 6 , 8 , 10 , 12 , 14 , - 1 , - 1 , - 1 , - 1 , - 1 , - 1 , - 1 , - 1) ; let shuffled = _mm256_shuffle_epi8 (a . into () , mask) ; let packed = _mm256_permute4x64_epi64 :: < 0b11_01_10_00 > (shuffled) ; _mm256_castsi256_si128 (packed) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn reinterpret_u8_u16x16(self, a: u16x16) -> u8x32 { - __m256i::from(a).simd_into(self) + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u16x16 < Avx2 >) -> u8x32 < Avx2 > { __m256i :: from (a) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn reinterpret_u32_u16x16(self, a: u16x16) -> u32x8 { - __m256i::from(a).simd_into(self) + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u16x16 < Avx2 >) -> u32x8 < Avx2 > { __m256i :: from (a) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn splat_mask16x16(self, val: bool) -> mask16x16 { - unsafe { - let val: i16 = if val { !0 } else { 0 }; - _mm256_set1_epi16(val).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , val : bool) -> mask16x16 < Avx2 > { let val : i16 = if val { ! 0 } else { 0 } ; _mm256_set1_epi16 (val) . simd_into (token) } } + kernel(self, val) } #[inline(always)] fn load_array_mask16x16(self, val: [i16; 16usize]) -> mask16x16 { @@ -4019,38 +3655,28 @@ impl Simd for Avx2 { } #[inline(always)] fn from_bitmask_mask16x16(self, bits: u64) -> mask16x16 { - unsafe { - { - let bit_lanes = _mm256_set1_epi16(bits as i16); - let bit_mask = _mm256_setr_epi16( - 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, -32768, - ); - _mm256_cmpeq_epi16(_mm256_and_si256(bit_lanes, bit_mask), bit_mask) - } - .simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , bits : u64) -> mask16x16 < Avx2 > { { let bit_lanes = _mm256_set1_epi16 (bits as i16) ; let bit_mask = _mm256_setr_epi16 (1 , 2 , 4 , 8 , 16 , 32 , 64 , 128 , 256 , 512 , 1024 , 2048 , 4096 , 8192 , 16384 , - 32768) ; _mm256_cmpeq_epi16 (_mm256_and_si256 (bit_lanes , bit_mask) , bit_mask) } . simd_into (token) } } + kernel(self, bits) } #[inline(always)] fn to_bitmask_mask16x16(self, a: mask16x16) -> u64 { - unsafe { - { - let halves: [__m128i; 2usize] = crate::transmute::checked_transmute_copy(&a.val.0); - let packed = _mm_packs_epi16(halves[0], halves[1]); - _mm_movemask_epi8(packed) as u32 as u64 - } - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask16x16 < Avx2 >) -> u64 { { let halves : [__m128i ; 2usize] = crate :: transmute :: checked_transmute_copy (& a . val . 0) ; let packed = _mm_packs_epi16 (halves [0] , halves [1]) ; _mm_movemask_epi8 (packed) as u32 as u64 } } } + kernel(self, a) } #[inline(always)] fn and_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { - unsafe { _mm256_and_si256(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask16x16 < Avx2 > , b : mask16x16 < Avx2 >) -> mask16x16 < Avx2 > { _mm256_and_si256 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn or_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { - unsafe { _mm256_or_si256(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask16x16 < Avx2 > , b : mask16x16 < Avx2 >) -> mask16x16 < Avx2 > { _mm256_or_si256 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn xor_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { - unsafe { _mm256_xor_si256(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask16x16 < Avx2 > , b : mask16x16 < Avx2 >) -> mask16x16 < Avx2 > { _mm256_xor_si256 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn not_mask16x16(self, a: mask16x16) -> mask16x16 { @@ -4063,27 +3689,33 @@ impl Simd for Avx2 { b: mask16x16, c: mask16x16, ) -> mask16x16 { - unsafe { _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask16x16 < Avx2 > , b : mask16x16 < Avx2 > , c : mask16x16 < Avx2 >) -> mask16x16 < Avx2 > { _mm256_blendv_epi8 (c . into () , b . into () , a . into ()) . simd_into (token) } } + kernel(self, a, b, c) } #[inline(always)] fn simd_eq_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { - unsafe { _mm256_cmpeq_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask16x16 < Avx2 > , b : mask16x16 < Avx2 >) -> mask16x16 < Avx2 > { _mm256_cmpeq_epi16 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn any_true_mask16x16(self, a: mask16x16) -> bool { - unsafe { _mm256_movemask_epi8(a.into()) as u32 != 0 } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask16x16 < Avx2 >) -> bool { _mm256_movemask_epi8 (a . into ()) as u32 != 0 } } + kernel(self, a) } #[inline(always)] fn all_true_mask16x16(self, a: mask16x16) -> bool { - unsafe { _mm256_movemask_epi8(a.into()) as u32 == 0xffffffff } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask16x16 < Avx2 >) -> bool { _mm256_movemask_epi8 (a . into ()) as u32 == 0xffffffff } } + kernel(self, a) } #[inline(always)] fn any_false_mask16x16(self, a: mask16x16) -> bool { - unsafe { _mm256_movemask_epi8(a.into()) as u32 != 0xffffffff } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask16x16 < Avx2 >) -> bool { _mm256_movemask_epi8 (a . into ()) as u32 != 0xffffffff } } + kernel(self, a) } #[inline(always)] fn all_false_mask16x16(self, a: mask16x16) -> bool { - unsafe { _mm256_movemask_epi8(a.into()) as u32 == 0 } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask16x16 < Avx2 >) -> bool { _mm256_movemask_epi8 (a . into ()) as u32 == 0 } } + kernel(self, a) } #[inline(always)] fn combine_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x32 { @@ -4094,16 +3726,13 @@ impl Simd for Avx2 { } #[inline(always)] fn split_mask16x16(self, a: mask16x16) -> (mask16x8, mask16x8) { - unsafe { - ( - _mm256_extracti128_si256::<0>(a.into()).simd_into(self), - _mm256_extracti128_si256::<1>(a.into()).simd_into(self), - ) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask16x16 < Avx2 >) -> (mask16x8 < Avx2 > , mask16x8 < Avx2 >) { (_mm256_extracti128_si256 :: < 0 > (a . into ()) . simd_into (token) , _mm256_extracti128_si256 :: < 1 > (a . into ()) . simd_into (token) ,) } } + kernel(self, a) } #[inline(always)] fn splat_i32x8(self, val: i32) -> i32x8 { - unsafe { _mm256_set1_epi32(val).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , val : i32) -> i32x8 < Avx2 > { _mm256_set1_epi32 (val) . simd_into (token) } } + kernel(self, val) } #[inline(always)] fn load_array_i32x8(self, val: [i32; 8usize]) -> i32x8 { @@ -4195,27 +3824,33 @@ impl Simd for Avx2 { } #[inline(always)] fn add_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - unsafe { _mm256_add_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i32x8 < Avx2 > , b : i32x8 < Avx2 >) -> i32x8 < Avx2 > { _mm256_add_epi32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn sub_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - unsafe { _mm256_sub_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i32x8 < Avx2 > , b : i32x8 < Avx2 >) -> i32x8 < Avx2 > { _mm256_sub_epi32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn mul_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - unsafe { _mm256_mullo_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i32x8 < Avx2 > , b : i32x8 < Avx2 >) -> i32x8 < Avx2 > { _mm256_mullo_epi32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn and_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - unsafe { _mm256_and_si256(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i32x8 < Avx2 > , b : i32x8 < Avx2 >) -> i32x8 < Avx2 > { _mm256_and_si256 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn or_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - unsafe { _mm256_or_si256(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i32x8 < Avx2 > , b : i32x8 < Avx2 >) -> i32x8 < Avx2 > { _mm256_or_si256 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn xor_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - unsafe { _mm256_xor_si256(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i32x8 < Avx2 > , b : i32x8 < Avx2 >) -> i32x8 < Avx2 > { _mm256_xor_si256 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn not_i32x8(self, a: i32x8) -> i32x8 { @@ -4223,119 +3858,93 @@ impl Simd for Avx2 { } #[inline(always)] fn shl_i32x8(self, a: i32x8, shift: u32) -> i32x8 { - unsafe { - _mm256_sll_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i32x8 < Avx2 > , shift : u32) -> i32x8 < Avx2 > { _mm256_sll_epi32 (a . into () , _mm_cvtsi32_si128 (shift . cast_signed ())) . simd_into (token) } } + kernel(self, a, shift) } #[inline(always)] fn shlv_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - unsafe { _mm256_sllv_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i32x8 < Avx2 > , b : i32x8 < Avx2 >) -> i32x8 < Avx2 > { _mm256_sllv_epi32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn shr_i32x8(self, a: i32x8, shift: u32) -> i32x8 { - unsafe { - _mm256_sra_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i32x8 < Avx2 > , shift : u32) -> i32x8 < Avx2 > { _mm256_sra_epi32 (a . into () , _mm_cvtsi32_si128 (shift . cast_signed ())) . simd_into (token) } } + kernel(self, a, shift) } #[inline(always)] fn shrv_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - unsafe { _mm256_srav_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i32x8 < Avx2 > , b : i32x8 < Avx2 >) -> i32x8 < Avx2 > { _mm256_srav_epi32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_eq_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { - unsafe { _mm256_cmpeq_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i32x8 < Avx2 > , b : i32x8 < Avx2 >) -> mask32x8 < Avx2 > { _mm256_cmpeq_epi32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_lt_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { - unsafe { _mm256_cmpgt_epi32(b.into(), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i32x8 < Avx2 > , b : i32x8 < Avx2 >) -> mask32x8 < Avx2 > { _mm256_cmpgt_epi32 (b . into () , a . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_le_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { - unsafe { - _mm256_cmpeq_epi32(_mm256_min_epi32(a.into(), b.into()), a.into()).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i32x8 < Avx2 > , b : i32x8 < Avx2 >) -> mask32x8 < Avx2 > { _mm256_cmpeq_epi32 (_mm256_min_epi32 (a . into () , b . into ()) , a . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_ge_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { - unsafe { - _mm256_cmpeq_epi32(_mm256_max_epi32(a.into(), b.into()), a.into()).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i32x8 < Avx2 > , b : i32x8 < Avx2 >) -> mask32x8 < Avx2 > { _mm256_cmpeq_epi32 (_mm256_max_epi32 (a . into () , b . into ()) , a . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_gt_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { - unsafe { _mm256_cmpgt_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i32x8 < Avx2 > , b : i32x8 < Avx2 >) -> mask32x8 < Avx2 > { _mm256_cmpgt_epi32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn zip_low_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - unsafe { - let lo = _mm256_unpacklo_epi32(a.into(), b.into()); - let hi = _mm256_unpackhi_epi32(a.into(), b.into()); - _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i32x8 < Avx2 > , b : i32x8 < Avx2 >) -> i32x8 < Avx2 > { let lo = _mm256_unpacklo_epi32 (a . into () , b . into ()) ; let hi = _mm256_unpackhi_epi32 (a . into () , b . into ()) ; _mm256_permute2x128_si256 :: < 0b0010_0000 > (lo , hi) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn zip_high_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - unsafe { - let lo = _mm256_unpacklo_epi32(a.into(), b.into()); - let hi = _mm256_unpackhi_epi32(a.into(), b.into()); - _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i32x8 < Avx2 > , b : i32x8 < Avx2 >) -> i32x8 < Avx2 > { let lo = _mm256_unpacklo_epi32 (a . into () , b . into ()) ; let hi = _mm256_unpackhi_epi32 (a . into () , b . into ()) ; _mm256_permute2x128_si256 :: < 0b0011_0001 > (lo , hi) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn unzip_low_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - unsafe { - let t1 = - _mm256_permutevar8x32_epi32(a.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)); - let t2 = - _mm256_permutevar8x32_epi32(b.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)); - _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i32x8 < Avx2 > , b : i32x8 < Avx2 >) -> i32x8 < Avx2 > { let t1 = _mm256_permutevar8x32_epi32 (a . into () , _mm256_setr_epi32 (0 , 2 , 4 , 6 , 1 , 3 , 5 , 7)) ; let t2 = _mm256_permutevar8x32_epi32 (b . into () , _mm256_setr_epi32 (0 , 2 , 4 , 6 , 1 , 3 , 5 , 7)) ; _mm256_permute2x128_si256 :: < 0b0010_0000 > (t1 , t2) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn unzip_high_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - unsafe { - let t1 = - _mm256_permutevar8x32_epi32(a.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)); - let t2 = - _mm256_permutevar8x32_epi32(b.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)); - _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i32x8 < Avx2 > , b : i32x8 < Avx2 >) -> i32x8 < Avx2 > { let t1 = _mm256_permutevar8x32_epi32 (a . into () , _mm256_setr_epi32 (0 , 2 , 4 , 6 , 1 , 3 , 5 , 7)) ; let t2 = _mm256_permutevar8x32_epi32 (b . into () , _mm256_setr_epi32 (0 , 2 , 4 , 6 , 1 , 3 , 5 , 7)) ; _mm256_permute2x128_si256 :: < 0b0011_0001 > (t1 , t2) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn interleave_i32x8(self, a: i32x8, b: i32x8) -> (i32x8, i32x8) { - unsafe { - let lo = _mm256_unpacklo_epi32(a.into(), b.into()); - let hi = _mm256_unpackhi_epi32(a.into(), b.into()); - ( - _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(self), - _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(self), - ) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i32x8 < Avx2 > , b : i32x8 < Avx2 >) -> (i32x8 < Avx2 > , i32x8 < Avx2 >) { let lo = _mm256_unpacklo_epi32 (a . into () , b . into ()) ; let hi = _mm256_unpackhi_epi32 (a . into () , b . into ()) ; (_mm256_permute2x128_si256 :: < 0b0010_0000 > (lo , hi) . simd_into (token) , _mm256_permute2x128_si256 :: < 0b0011_0001 > (lo , hi) . simd_into (token) ,) } } + kernel(self, a, b) } #[inline(always)] fn deinterleave_i32x8(self, a: i32x8, b: i32x8) -> (i32x8, i32x8) { - unsafe { - let t1 = - _mm256_permutevar8x32_epi32(a.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)); - let t2 = - _mm256_permutevar8x32_epi32(b.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)); - ( - _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(self), - _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(self), - ) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i32x8 < Avx2 > , b : i32x8 < Avx2 >) -> (i32x8 < Avx2 > , i32x8 < Avx2 >) { let t1 = _mm256_permutevar8x32_epi32 (a . into () , _mm256_setr_epi32 (0 , 2 , 4 , 6 , 1 , 3 , 5 , 7)) ; let t2 = _mm256_permutevar8x32_epi32 (b . into () , _mm256_setr_epi32 (0 , 2 , 4 , 6 , 1 , 3 , 5 , 7)) ; (_mm256_permute2x128_si256 :: < 0b0010_0000 > (t1 , t2) . simd_into (token) , _mm256_permute2x128_si256 :: < 0b0011_0001 > (t1 , t2) . simd_into (token) ,) } } + kernel(self, a, b) } #[inline(always)] fn select_i32x8(self, a: mask32x8, b: i32x8, c: i32x8) -> i32x8 { - unsafe { _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask32x8 < Avx2 > , b : i32x8 < Avx2 > , c : i32x8 < Avx2 >) -> i32x8 < Avx2 > { _mm256_blendv_epi8 (c . into () , b . into () , a . into ()) . simd_into (token) } } + kernel(self, a, b, c) } #[inline(always)] fn min_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - unsafe { _mm256_min_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i32x8 < Avx2 > , b : i32x8 < Avx2 >) -> i32x8 < Avx2 > { _mm256_min_epi32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn max_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - unsafe { _mm256_max_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i32x8 < Avx2 > , b : i32x8 < Avx2 >) -> i32x8 < Avx2 > { _mm256_max_epi32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn combine_i32x8(self, a: i32x8, b: i32x8) -> i32x16 { @@ -4346,32 +3955,33 @@ impl Simd for Avx2 { } #[inline(always)] fn split_i32x8(self, a: i32x8) -> (i32x4, i32x4) { - unsafe { - ( - _mm256_extracti128_si256::<0>(a.into()).simd_into(self), - _mm256_extracti128_si256::<1>(a.into()).simd_into(self), - ) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i32x8 < Avx2 >) -> (i32x4 < Avx2 > , i32x4 < Avx2 >) { (_mm256_extracti128_si256 :: < 0 > (a . into ()) . simd_into (token) , _mm256_extracti128_si256 :: < 1 > (a . into ()) . simd_into (token) ,) } } + kernel(self, a) } #[inline(always)] fn neg_i32x8(self, a: i32x8) -> i32x8 { - unsafe { _mm256_sub_epi32(_mm256_setzero_si256(), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i32x8 < Avx2 >) -> i32x8 < Avx2 > { _mm256_sub_epi32 (_mm256_setzero_si256 () , a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn reinterpret_u8_i32x8(self, a: i32x8) -> u8x32 { - __m256i::from(a).simd_into(self) + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i32x8 < Avx2 >) -> u8x32 < Avx2 > { __m256i :: from (a) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn reinterpret_u32_i32x8(self, a: i32x8) -> u32x8 { - __m256i::from(a).simd_into(self) + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i32x8 < Avx2 >) -> u32x8 < Avx2 > { __m256i :: from (a) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn cvt_f32_i32x8(self, a: i32x8) -> f32x8 { - unsafe { _mm256_cvtepi32_ps(a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : i32x8 < Avx2 >) -> f32x8 < Avx2 > { _mm256_cvtepi32_ps (a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn splat_u32x8(self, val: u32) -> u32x8 { - unsafe { _mm256_set1_epi32(val.cast_signed()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , val : u32) -> u32x8 < Avx2 > { _mm256_set1_epi32 (val . cast_signed ()) . simd_into (token) } } + kernel(self, val) } #[inline(always)] fn load_array_u32x8(self, val: [u32; 8usize]) -> u32x8 { @@ -4463,27 +4073,33 @@ impl Simd for Avx2 { } #[inline(always)] fn add_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { - unsafe { _mm256_add_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u32x8 < Avx2 > , b : u32x8 < Avx2 >) -> u32x8 < Avx2 > { _mm256_add_epi32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn sub_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { - unsafe { _mm256_sub_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u32x8 < Avx2 > , b : u32x8 < Avx2 >) -> u32x8 < Avx2 > { _mm256_sub_epi32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn mul_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { - unsafe { _mm256_mullo_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u32x8 < Avx2 > , b : u32x8 < Avx2 >) -> u32x8 < Avx2 > { _mm256_mullo_epi32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn and_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { - unsafe { _mm256_and_si256(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u32x8 < Avx2 > , b : u32x8 < Avx2 >) -> u32x8 < Avx2 > { _mm256_and_si256 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn or_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { - unsafe { _mm256_or_si256(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u32x8 < Avx2 > , b : u32x8 < Avx2 >) -> u32x8 < Avx2 > { _mm256_or_si256 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn xor_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { - unsafe { _mm256_xor_si256(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u32x8 < Avx2 > , b : u32x8 < Avx2 >) -> u32x8 < Avx2 > { _mm256_xor_si256 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn not_u32x8(self, a: u32x8) -> u32x8 { @@ -4491,129 +4107,93 @@ impl Simd for Avx2 { } #[inline(always)] fn shl_u32x8(self, a: u32x8, shift: u32) -> u32x8 { - unsafe { - _mm256_sll_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u32x8 < Avx2 > , shift : u32) -> u32x8 < Avx2 > { _mm256_sll_epi32 (a . into () , _mm_cvtsi32_si128 (shift . cast_signed ())) . simd_into (token) } } + kernel(self, a, shift) } #[inline(always)] fn shlv_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { - unsafe { _mm256_sllv_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u32x8 < Avx2 > , b : u32x8 < Avx2 >) -> u32x8 < Avx2 > { _mm256_sllv_epi32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn shr_u32x8(self, a: u32x8, shift: u32) -> u32x8 { - unsafe { - _mm256_srl_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u32x8 < Avx2 > , shift : u32) -> u32x8 < Avx2 > { _mm256_srl_epi32 (a . into () , _mm_cvtsi32_si128 (shift . cast_signed ())) . simd_into (token) } } + kernel(self, a, shift) } #[inline(always)] fn shrv_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { - unsafe { _mm256_srlv_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u32x8 < Avx2 > , b : u32x8 < Avx2 >) -> u32x8 < Avx2 > { _mm256_srlv_epi32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_eq_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { - unsafe { _mm256_cmpeq_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u32x8 < Avx2 > , b : u32x8 < Avx2 >) -> mask32x8 < Avx2 > { _mm256_cmpeq_epi32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_lt_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { - unsafe { - let sign_bit = _mm256_set1_epi32(0x80000000u32.cast_signed()); - let a_signed = _mm256_xor_si256(a.into(), sign_bit); - let b_signed = _mm256_xor_si256(b.into(), sign_bit); - _mm256_cmpgt_epi32(b_signed, a_signed).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u32x8 < Avx2 > , b : u32x8 < Avx2 >) -> mask32x8 < Avx2 > { let sign_bit = _mm256_set1_epi32 (0x80000000u32 . cast_signed ()) ; let a_signed = _mm256_xor_si256 (a . into () , sign_bit) ; let b_signed = _mm256_xor_si256 (b . into () , sign_bit) ; _mm256_cmpgt_epi32 (b_signed , a_signed) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_le_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { - unsafe { - _mm256_cmpeq_epi32(_mm256_min_epu32(a.into(), b.into()), a.into()).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u32x8 < Avx2 > , b : u32x8 < Avx2 >) -> mask32x8 < Avx2 > { _mm256_cmpeq_epi32 (_mm256_min_epu32 (a . into () , b . into ()) , a . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_ge_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { - unsafe { - _mm256_cmpeq_epi32(_mm256_max_epu32(a.into(), b.into()), a.into()).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u32x8 < Avx2 > , b : u32x8 < Avx2 >) -> mask32x8 < Avx2 > { _mm256_cmpeq_epi32 (_mm256_max_epu32 (a . into () , b . into ()) , a . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_gt_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { - unsafe { - let sign_bit = _mm256_set1_epi32(0x80000000u32.cast_signed()); - let a_signed = _mm256_xor_si256(a.into(), sign_bit); - let b_signed = _mm256_xor_si256(b.into(), sign_bit); - _mm256_cmpgt_epi32(a_signed, b_signed).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u32x8 < Avx2 > , b : u32x8 < Avx2 >) -> mask32x8 < Avx2 > { let sign_bit = _mm256_set1_epi32 (0x80000000u32 . cast_signed ()) ; let a_signed = _mm256_xor_si256 (a . into () , sign_bit) ; let b_signed = _mm256_xor_si256 (b . into () , sign_bit) ; _mm256_cmpgt_epi32 (a_signed , b_signed) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn zip_low_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { - unsafe { - let lo = _mm256_unpacklo_epi32(a.into(), b.into()); - let hi = _mm256_unpackhi_epi32(a.into(), b.into()); - _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u32x8 < Avx2 > , b : u32x8 < Avx2 >) -> u32x8 < Avx2 > { let lo = _mm256_unpacklo_epi32 (a . into () , b . into ()) ; let hi = _mm256_unpackhi_epi32 (a . into () , b . into ()) ; _mm256_permute2x128_si256 :: < 0b0010_0000 > (lo , hi) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn zip_high_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { - unsafe { - let lo = _mm256_unpacklo_epi32(a.into(), b.into()); - let hi = _mm256_unpackhi_epi32(a.into(), b.into()); - _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u32x8 < Avx2 > , b : u32x8 < Avx2 >) -> u32x8 < Avx2 > { let lo = _mm256_unpacklo_epi32 (a . into () , b . into ()) ; let hi = _mm256_unpackhi_epi32 (a . into () , b . into ()) ; _mm256_permute2x128_si256 :: < 0b0011_0001 > (lo , hi) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn unzip_low_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { - unsafe { - let t1 = - _mm256_permutevar8x32_epi32(a.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)); - let t2 = - _mm256_permutevar8x32_epi32(b.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)); - _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u32x8 < Avx2 > , b : u32x8 < Avx2 >) -> u32x8 < Avx2 > { let t1 = _mm256_permutevar8x32_epi32 (a . into () , _mm256_setr_epi32 (0 , 2 , 4 , 6 , 1 , 3 , 5 , 7)) ; let t2 = _mm256_permutevar8x32_epi32 (b . into () , _mm256_setr_epi32 (0 , 2 , 4 , 6 , 1 , 3 , 5 , 7)) ; _mm256_permute2x128_si256 :: < 0b0010_0000 > (t1 , t2) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn unzip_high_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { - unsafe { - let t1 = - _mm256_permutevar8x32_epi32(a.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)); - let t2 = - _mm256_permutevar8x32_epi32(b.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)); - _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u32x8 < Avx2 > , b : u32x8 < Avx2 >) -> u32x8 < Avx2 > { let t1 = _mm256_permutevar8x32_epi32 (a . into () , _mm256_setr_epi32 (0 , 2 , 4 , 6 , 1 , 3 , 5 , 7)) ; let t2 = _mm256_permutevar8x32_epi32 (b . into () , _mm256_setr_epi32 (0 , 2 , 4 , 6 , 1 , 3 , 5 , 7)) ; _mm256_permute2x128_si256 :: < 0b0011_0001 > (t1 , t2) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn interleave_u32x8(self, a: u32x8, b: u32x8) -> (u32x8, u32x8) { - unsafe { - let lo = _mm256_unpacklo_epi32(a.into(), b.into()); - let hi = _mm256_unpackhi_epi32(a.into(), b.into()); - ( - _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(self), - _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(self), - ) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u32x8 < Avx2 > , b : u32x8 < Avx2 >) -> (u32x8 < Avx2 > , u32x8 < Avx2 >) { let lo = _mm256_unpacklo_epi32 (a . into () , b . into ()) ; let hi = _mm256_unpackhi_epi32 (a . into () , b . into ()) ; (_mm256_permute2x128_si256 :: < 0b0010_0000 > (lo , hi) . simd_into (token) , _mm256_permute2x128_si256 :: < 0b0011_0001 > (lo , hi) . simd_into (token) ,) } } + kernel(self, a, b) } #[inline(always)] fn deinterleave_u32x8(self, a: u32x8, b: u32x8) -> (u32x8, u32x8) { - unsafe { - let t1 = - _mm256_permutevar8x32_epi32(a.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)); - let t2 = - _mm256_permutevar8x32_epi32(b.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)); - ( - _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(self), - _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(self), - ) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u32x8 < Avx2 > , b : u32x8 < Avx2 >) -> (u32x8 < Avx2 > , u32x8 < Avx2 >) { let t1 = _mm256_permutevar8x32_epi32 (a . into () , _mm256_setr_epi32 (0 , 2 , 4 , 6 , 1 , 3 , 5 , 7)) ; let t2 = _mm256_permutevar8x32_epi32 (b . into () , _mm256_setr_epi32 (0 , 2 , 4 , 6 , 1 , 3 , 5 , 7)) ; (_mm256_permute2x128_si256 :: < 0b0010_0000 > (t1 , t2) . simd_into (token) , _mm256_permute2x128_si256 :: < 0b0011_0001 > (t1 , t2) . simd_into (token) ,) } } + kernel(self, a, b) } #[inline(always)] fn select_u32x8(self, a: mask32x8, b: u32x8, c: u32x8) -> u32x8 { - unsafe { _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask32x8 < Avx2 > , b : u32x8 < Avx2 > , c : u32x8 < Avx2 >) -> u32x8 < Avx2 > { _mm256_blendv_epi8 (c . into () , b . into () , a . into ()) . simd_into (token) } } + kernel(self, a, b, c) } #[inline(always)] fn min_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { - unsafe { _mm256_min_epu32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u32x8 < Avx2 > , b : u32x8 < Avx2 >) -> u32x8 < Avx2 > { _mm256_min_epu32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn max_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { - unsafe { _mm256_max_epu32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u32x8 < Avx2 > , b : u32x8 < Avx2 >) -> u32x8 < Avx2 > { _mm256_max_epu32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn combine_u32x8(self, a: u32x8, b: u32x8) -> u32x16 { @@ -4624,40 +4204,23 @@ impl Simd for Avx2 { } #[inline(always)] fn split_u32x8(self, a: u32x8) -> (u32x4, u32x4) { - unsafe { - ( - _mm256_extracti128_si256::<0>(a.into()).simd_into(self), - _mm256_extracti128_si256::<1>(a.into()).simd_into(self), - ) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u32x8 < Avx2 >) -> (u32x4 < Avx2 > , u32x4 < Avx2 >) { (_mm256_extracti128_si256 :: < 0 > (a . into ()) . simd_into (token) , _mm256_extracti128_si256 :: < 1 > (a . into ()) . simd_into (token) ,) } } + kernel(self, a) } #[inline(always)] fn reinterpret_u8_u32x8(self, a: u32x8) -> u8x32 { - __m256i::from(a).simd_into(self) + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u32x8 < Avx2 >) -> u8x32 < Avx2 > { __m256i :: from (a) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn cvt_f32_u32x8(self, a: u32x8) -> f32x8 { - unsafe { - let a = a.into(); - let lo = _mm256_blend_epi16::<0xAA>(a, _mm256_set1_epi32(0x4B000000)); - let hi = _mm256_blend_epi16::<0xAA>( - _mm256_srli_epi32::<16>(a), - _mm256_set1_epi32(0x53000000), - ); - let fhi = _mm256_sub_ps( - _mm256_castsi256_ps(hi), - _mm256_set1_ps(f32::from_bits(0x53000080)), - ); - let result = _mm256_add_ps(_mm256_castsi256_ps(lo), fhi); - result.simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u32x8 < Avx2 >) -> f32x8 < Avx2 > { let a = a . into () ; let lo = _mm256_blend_epi16 :: < 0xAA > (a , _mm256_set1_epi32 (0x4B000000)) ; let hi = _mm256_blend_epi16 :: < 0xAA > (_mm256_srli_epi32 :: < 16 > (a) , _mm256_set1_epi32 (0x53000000)) ; let fhi = _mm256_sub_ps (_mm256_castsi256_ps (hi) , _mm256_set1_ps (f32 :: from_bits (0x53000080))) ; let result = _mm256_add_ps (_mm256_castsi256_ps (lo) , fhi) ; result . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn splat_mask32x8(self, val: bool) -> mask32x8 { - unsafe { - let val: i32 = if val { !0 } else { 0 }; - _mm256_set1_epi32(val).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , val : bool) -> mask32x8 < Avx2 > { let val : i32 = if val { ! 0 } else { 0 } ; _mm256_set1_epi32 (val) . simd_into (token) } } + kernel(self, val) } #[inline(always)] fn load_array_mask32x8(self, val: [i32; 8usize]) -> mask32x8 { @@ -4672,30 +4235,28 @@ impl Simd for Avx2 { } #[inline(always)] fn from_bitmask_mask32x8(self, bits: u64) -> mask32x8 { - unsafe { - { - let bit_lanes = _mm256_set1_epi32(bits as i32); - let bit_mask = _mm256_setr_epi32(1, 2, 4, 8, 16, 32, 64, 128); - _mm256_cmpeq_epi32(_mm256_and_si256(bit_lanes, bit_mask), bit_mask) - } - .simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , bits : u64) -> mask32x8 < Avx2 > { { let bit_lanes = _mm256_set1_epi32 (bits as i32) ; let bit_mask = _mm256_setr_epi32 (1 , 2 , 4 , 8 , 16 , 32 , 64 , 128) ; _mm256_cmpeq_epi32 (_mm256_and_si256 (bit_lanes , bit_mask) , bit_mask) } . simd_into (token) } } + kernel(self, bits) } #[inline(always)] fn to_bitmask_mask32x8(self, a: mask32x8) -> u64 { - unsafe { _mm256_movemask_ps(_mm256_castsi256_ps(a.into())) as u32 as u64 } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask32x8 < Avx2 >) -> u64 { _mm256_movemask_ps (_mm256_castsi256_ps (a . into ())) as u32 as u64 } } + kernel(self, a) } #[inline(always)] fn and_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { - unsafe { _mm256_and_si256(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask32x8 < Avx2 > , b : mask32x8 < Avx2 >) -> mask32x8 < Avx2 > { _mm256_and_si256 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn or_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { - unsafe { _mm256_or_si256(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask32x8 < Avx2 > , b : mask32x8 < Avx2 >) -> mask32x8 < Avx2 > { _mm256_or_si256 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn xor_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { - unsafe { _mm256_xor_si256(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask32x8 < Avx2 > , b : mask32x8 < Avx2 >) -> mask32x8 < Avx2 > { _mm256_xor_si256 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn not_mask32x8(self, a: mask32x8) -> mask32x8 { @@ -4708,27 +4269,33 @@ impl Simd for Avx2 { b: mask32x8, c: mask32x8, ) -> mask32x8 { - unsafe { _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask32x8 < Avx2 > , b : mask32x8 < Avx2 > , c : mask32x8 < Avx2 >) -> mask32x8 < Avx2 > { _mm256_blendv_epi8 (c . into () , b . into () , a . into ()) . simd_into (token) } } + kernel(self, a, b, c) } #[inline(always)] fn simd_eq_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { - unsafe { _mm256_cmpeq_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask32x8 < Avx2 > , b : mask32x8 < Avx2 >) -> mask32x8 < Avx2 > { _mm256_cmpeq_epi32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn any_true_mask32x8(self, a: mask32x8) -> bool { - unsafe { _mm256_movemask_ps(_mm256_castsi256_ps(a.into())) as u32 != 0 } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask32x8 < Avx2 >) -> bool { _mm256_movemask_ps (_mm256_castsi256_ps (a . into ())) as u32 != 0 } } + kernel(self, a) } #[inline(always)] fn all_true_mask32x8(self, a: mask32x8) -> bool { - unsafe { _mm256_movemask_ps(_mm256_castsi256_ps(a.into())) as u32 == 0b11111111 } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask32x8 < Avx2 >) -> bool { _mm256_movemask_ps (_mm256_castsi256_ps (a . into ())) as u32 == 0b11111111 } } + kernel(self, a) } #[inline(always)] fn any_false_mask32x8(self, a: mask32x8) -> bool { - unsafe { _mm256_movemask_ps(_mm256_castsi256_ps(a.into())) as u32 != 0b11111111 } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask32x8 < Avx2 >) -> bool { _mm256_movemask_ps (_mm256_castsi256_ps (a . into ())) as u32 != 0b11111111 } } + kernel(self, a) } #[inline(always)] fn all_false_mask32x8(self, a: mask32x8) -> bool { - unsafe { _mm256_movemask_ps(_mm256_castsi256_ps(a.into())) as u32 == 0 } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask32x8 < Avx2 >) -> bool { _mm256_movemask_ps (_mm256_castsi256_ps (a . into ())) as u32 == 0 } } + kernel(self, a) } #[inline(always)] fn combine_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x16 { @@ -4739,16 +4306,13 @@ impl Simd for Avx2 { } #[inline(always)] fn split_mask32x8(self, a: mask32x8) -> (mask32x4, mask32x4) { - unsafe { - ( - _mm256_extracti128_si256::<0>(a.into()).simd_into(self), - _mm256_extracti128_si256::<1>(a.into()).simd_into(self), - ) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask32x8 < Avx2 >) -> (mask32x4 < Avx2 > , mask32x4 < Avx2 >) { (_mm256_extracti128_si256 :: < 0 > (a . into ()) . simd_into (token) , _mm256_extracti128_si256 :: < 1 > (a . into ()) . simd_into (token) ,) } } + kernel(self, a) } #[inline(always)] fn splat_f64x4(self, val: f64) -> f64x4 { - unsafe { _mm256_set1_pd(val).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , val : f64) -> f64x4 < Avx2 > { _mm256_set1_pd (val) . simd_into (token) } } + kernel(self, val) } #[inline(always)] fn load_array_f64x4(self, val: [f64; 4usize]) -> f64x4 { @@ -4840,15 +4404,18 @@ impl Simd for Avx2 { } #[inline(always)] fn abs_f64x4(self, a: f64x4) -> f64x4 { - unsafe { _mm256_andnot_pd(_mm256_set1_pd(-0.0), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f64x4 < Avx2 >) -> f64x4 < Avx2 > { _mm256_andnot_pd (_mm256_set1_pd (- 0.0) , a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn neg_f64x4(self, a: f64x4) -> f64x4 { - unsafe { _mm256_xor_pd(a.into(), _mm256_set1_pd(-0.0)).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f64x4 < Avx2 >) -> f64x4 < Avx2 > { _mm256_xor_pd (a . into () , _mm256_set1_pd (- 0.0)) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn sqrt_f64x4(self, a: f64x4) -> f64x4 { - unsafe { _mm256_sqrt_pd(a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f64x4 < Avx2 >) -> f64x4 < Avx2 > { _mm256_sqrt_pd (a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn approximate_recip_f64x4(self, a: f64x4) -> f64x4 { @@ -4856,157 +4423,128 @@ impl Simd for Avx2 { } #[inline(always)] fn add_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - unsafe { _mm256_add_pd(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f64x4 < Avx2 > , b : f64x4 < Avx2 >) -> f64x4 < Avx2 > { _mm256_add_pd (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn sub_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - unsafe { _mm256_sub_pd(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f64x4 < Avx2 > , b : f64x4 < Avx2 >) -> f64x4 < Avx2 > { _mm256_sub_pd (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn mul_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - unsafe { _mm256_mul_pd(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f64x4 < Avx2 > , b : f64x4 < Avx2 >) -> f64x4 < Avx2 > { _mm256_mul_pd (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn div_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - unsafe { _mm256_div_pd(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f64x4 < Avx2 > , b : f64x4 < Avx2 >) -> f64x4 < Avx2 > { _mm256_div_pd (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn copysign_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - unsafe { - let mask = _mm256_set1_pd(-0.0); - _mm256_or_pd( - _mm256_and_pd(mask, b.into()), - _mm256_andnot_pd(mask, a.into()), - ) - .simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f64x4 < Avx2 > , b : f64x4 < Avx2 >) -> f64x4 < Avx2 > { let mask = _mm256_set1_pd (- 0.0) ; _mm256_or_pd (_mm256_and_pd (mask , b . into ()) , _mm256_andnot_pd (mask , a . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_eq_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { - unsafe { _mm256_castpd_si256(_mm256_cmp_pd::<0i32>(a.into(), b.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f64x4 < Avx2 > , b : f64x4 < Avx2 >) -> mask64x4 < Avx2 > { _mm256_castpd_si256 (_mm256_cmp_pd :: < 0i32 > (a . into () , b . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_lt_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { - unsafe { _mm256_castpd_si256(_mm256_cmp_pd::<17i32>(a.into(), b.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f64x4 < Avx2 > , b : f64x4 < Avx2 >) -> mask64x4 < Avx2 > { _mm256_castpd_si256 (_mm256_cmp_pd :: < 17i32 > (a . into () , b . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_le_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { - unsafe { _mm256_castpd_si256(_mm256_cmp_pd::<18i32>(a.into(), b.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f64x4 < Avx2 > , b : f64x4 < Avx2 >) -> mask64x4 < Avx2 > { _mm256_castpd_si256 (_mm256_cmp_pd :: < 18i32 > (a . into () , b . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_ge_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { - unsafe { _mm256_castpd_si256(_mm256_cmp_pd::<29i32>(a.into(), b.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f64x4 < Avx2 > , b : f64x4 < Avx2 >) -> mask64x4 < Avx2 > { _mm256_castpd_si256 (_mm256_cmp_pd :: < 29i32 > (a . into () , b . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_gt_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { - unsafe { _mm256_castpd_si256(_mm256_cmp_pd::<30i32>(a.into(), b.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f64x4 < Avx2 > , b : f64x4 < Avx2 >) -> mask64x4 < Avx2 > { _mm256_castpd_si256 (_mm256_cmp_pd :: < 30i32 > (a . into () , b . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn zip_low_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - unsafe { - let lo = _mm256_unpacklo_pd(a.into(), b.into()); - let hi = _mm256_unpackhi_pd(a.into(), b.into()); - _mm256_permute2f128_pd::<0b0010_0000>(lo, hi).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f64x4 < Avx2 > , b : f64x4 < Avx2 >) -> f64x4 < Avx2 > { let lo = _mm256_unpacklo_pd (a . into () , b . into ()) ; let hi = _mm256_unpackhi_pd (a . into () , b . into ()) ; _mm256_permute2f128_pd :: < 0b0010_0000 > (lo , hi) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn zip_high_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - unsafe { - let lo = _mm256_unpacklo_pd(a.into(), b.into()); - let hi = _mm256_unpackhi_pd(a.into(), b.into()); - _mm256_permute2f128_pd::<0b0011_0001>(lo, hi).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f64x4 < Avx2 > , b : f64x4 < Avx2 >) -> f64x4 < Avx2 > { let lo = _mm256_unpacklo_pd (a . into () , b . into ()) ; let hi = _mm256_unpackhi_pd (a . into () , b . into ()) ; _mm256_permute2f128_pd :: < 0b0011_0001 > (lo , hi) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn unzip_low_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - unsafe { - let t1 = _mm256_permute4x64_pd::<0b11_01_10_00>(a.into()); - let t2 = _mm256_permute4x64_pd::<0b11_01_10_00>(b.into()); - _mm256_permute2f128_pd::<0b0010_0000>(t1, t2).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f64x4 < Avx2 > , b : f64x4 < Avx2 >) -> f64x4 < Avx2 > { let t1 = _mm256_permute4x64_pd :: < 0b11_01_10_00 > (a . into ()) ; let t2 = _mm256_permute4x64_pd :: < 0b11_01_10_00 > (b . into ()) ; _mm256_permute2f128_pd :: < 0b0010_0000 > (t1 , t2) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn unzip_high_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - unsafe { - let t1 = _mm256_permute4x64_pd::<0b11_01_10_00>(a.into()); - let t2 = _mm256_permute4x64_pd::<0b11_01_10_00>(b.into()); - _mm256_permute2f128_pd::<0b0011_0001>(t1, t2).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f64x4 < Avx2 > , b : f64x4 < Avx2 >) -> f64x4 < Avx2 > { let t1 = _mm256_permute4x64_pd :: < 0b11_01_10_00 > (a . into ()) ; let t2 = _mm256_permute4x64_pd :: < 0b11_01_10_00 > (b . into ()) ; _mm256_permute2f128_pd :: < 0b0011_0001 > (t1 , t2) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn interleave_f64x4(self, a: f64x4, b: f64x4) -> (f64x4, f64x4) { - unsafe { - let lo = _mm256_unpacklo_pd(a.into(), b.into()); - let hi = _mm256_unpackhi_pd(a.into(), b.into()); - ( - _mm256_permute2f128_pd::<0b0010_0000>(lo, hi).simd_into(self), - _mm256_permute2f128_pd::<0b0011_0001>(lo, hi).simd_into(self), - ) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f64x4 < Avx2 > , b : f64x4 < Avx2 >) -> (f64x4 < Avx2 > , f64x4 < Avx2 >) { let lo = _mm256_unpacklo_pd (a . into () , b . into ()) ; let hi = _mm256_unpackhi_pd (a . into () , b . into ()) ; (_mm256_permute2f128_pd :: < 0b0010_0000 > (lo , hi) . simd_into (token) , _mm256_permute2f128_pd :: < 0b0011_0001 > (lo , hi) . simd_into (token) ,) } } + kernel(self, a, b) } #[inline(always)] fn deinterleave_f64x4(self, a: f64x4, b: f64x4) -> (f64x4, f64x4) { - unsafe { - let t1 = _mm256_permute4x64_pd::<0b11_01_10_00>(a.into()); - let t2 = _mm256_permute4x64_pd::<0b11_01_10_00>(b.into()); - ( - _mm256_permute2f128_pd::<0b0010_0000>(t1, t2).simd_into(self), - _mm256_permute2f128_pd::<0b0011_0001>(t1, t2).simd_into(self), - ) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f64x4 < Avx2 > , b : f64x4 < Avx2 >) -> (f64x4 < Avx2 > , f64x4 < Avx2 >) { let t1 = _mm256_permute4x64_pd :: < 0b11_01_10_00 > (a . into ()) ; let t2 = _mm256_permute4x64_pd :: < 0b11_01_10_00 > (b . into ()) ; (_mm256_permute2f128_pd :: < 0b0010_0000 > (t1 , t2) . simd_into (token) , _mm256_permute2f128_pd :: < 0b0011_0001 > (t1 , t2) . simd_into (token) ,) } } + kernel(self, a, b) } #[inline(always)] fn max_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - unsafe { _mm256_max_pd(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f64x4 < Avx2 > , b : f64x4 < Avx2 >) -> f64x4 < Avx2 > { _mm256_max_pd (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn min_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - unsafe { _mm256_min_pd(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f64x4 < Avx2 > , b : f64x4 < Avx2 >) -> f64x4 < Avx2 > { _mm256_min_pd (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn max_precise_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - unsafe { - let intermediate = _mm256_max_pd(a.into(), b.into()); - let b_is_nan = _mm256_cmp_pd::<3i32>(b.into(), b.into()); - _mm256_blendv_pd(intermediate, a.into(), b_is_nan).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f64x4 < Avx2 > , b : f64x4 < Avx2 >) -> f64x4 < Avx2 > { let intermediate = _mm256_max_pd (a . into () , b . into ()) ; let b_is_nan = _mm256_cmp_pd :: < 3i32 > (b . into () , b . into ()) ; _mm256_blendv_pd (intermediate , a . into () , b_is_nan) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn min_precise_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - unsafe { - let intermediate = _mm256_min_pd(a.into(), b.into()); - let b_is_nan = _mm256_cmp_pd::<3i32>(b.into(), b.into()); - _mm256_blendv_pd(intermediate, a.into(), b_is_nan).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f64x4 < Avx2 > , b : f64x4 < Avx2 >) -> f64x4 < Avx2 > { let intermediate = _mm256_min_pd (a . into () , b . into ()) ; let b_is_nan = _mm256_cmp_pd :: < 3i32 > (b . into () , b . into ()) ; _mm256_blendv_pd (intermediate , a . into () , b_is_nan) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn mul_add_f64x4(self, a: f64x4, b: f64x4, c: f64x4) -> f64x4 { - unsafe { _mm256_fmadd_pd(a.into(), b.into(), c.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f64x4 < Avx2 > , b : f64x4 < Avx2 > , c : f64x4 < Avx2 >) -> f64x4 < Avx2 > { _mm256_fmadd_pd (a . into () , b . into () , c . into ()) . simd_into (token) } } + kernel(self, a, b, c) } #[inline(always)] fn mul_sub_f64x4(self, a: f64x4, b: f64x4, c: f64x4) -> f64x4 { - unsafe { _mm256_fmsub_pd(a.into(), b.into(), c.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f64x4 < Avx2 > , b : f64x4 < Avx2 > , c : f64x4 < Avx2 >) -> f64x4 < Avx2 > { _mm256_fmsub_pd (a . into () , b . into () , c . into ()) . simd_into (token) } } + kernel(self, a, b, c) } #[inline(always)] fn floor_f64x4(self, a: f64x4) -> f64x4 { - unsafe { - _mm256_round_pd::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into()) - .simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f64x4 < Avx2 >) -> f64x4 < Avx2 > { _mm256_round_pd :: < { _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC } > (a . into () ,) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn ceil_f64x4(self, a: f64x4) -> f64x4 { - unsafe { - _mm256_round_pd::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a.into()) - .simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f64x4 < Avx2 >) -> f64x4 < Avx2 > { _mm256_round_pd :: < { _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC } > (a . into () ,) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn round_ties_even_f64x4(self, a: f64x4) -> f64x4 { - unsafe { - _mm256_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a.into()) - .simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f64x4 < Avx2 >) -> f64x4 < Avx2 > { _mm256_round_pd :: < { _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC } > (a . into () ,) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn fract_f64x4(self, a: f64x4) -> f64x4 { @@ -5014,15 +4552,13 @@ impl Simd for Avx2 { } #[inline(always)] fn trunc_f64x4(self, a: f64x4) -> f64x4 { - unsafe { - _mm256_round_pd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f64x4 < Avx2 >) -> f64x4 < Avx2 > { _mm256_round_pd :: < { _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC } > (a . into () ,) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn select_f64x4(self, a: mask64x4, b: f64x4, c: f64x4) -> f64x4 { - unsafe { - _mm256_blendv_pd(c.into(), b.into(), _mm256_castsi256_pd(a.into())).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask64x4 < Avx2 > , b : f64x4 < Avx2 > , c : f64x4 < Avx2 >) -> f64x4 < Avx2 > { _mm256_blendv_pd (c . into () , b . into () , _mm256_castsi256_pd (a . into ())) . simd_into (token) } } + kernel(self, a, b, c) } #[inline(always)] fn combine_f64x4(self, a: f64x4, b: f64x4) -> f64x8 { @@ -5033,23 +4569,18 @@ impl Simd for Avx2 { } #[inline(always)] fn split_f64x4(self, a: f64x4) -> (f64x2, f64x2) { - unsafe { - ( - _mm256_extractf128_pd::<0>(a.into()).simd_into(self), - _mm256_extractf128_pd::<1>(a.into()).simd_into(self), - ) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f64x4 < Avx2 >) -> (f64x2 < Avx2 > , f64x2 < Avx2 >) { (_mm256_extractf128_pd :: < 0 > (a . into ()) . simd_into (token) , _mm256_extractf128_pd :: < 1 > (a . into ()) . simd_into (token) ,) } } + kernel(self, a) } #[inline(always)] fn reinterpret_f32_f64x4(self, a: f64x4) -> f32x8 { - unsafe { _mm256_castpd_ps(a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : f64x4 < Avx2 >) -> f32x8 < Avx2 > { _mm256_castpd_ps (a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn splat_mask64x4(self, val: bool) -> mask64x4 { - unsafe { - let val: i64 = if val { !0 } else { 0 }; - _mm256_set1_epi64x(val).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , val : bool) -> mask64x4 < Avx2 > { let val : i64 = if val { ! 0 } else { 0 } ; _mm256_set1_epi64x (val) . simd_into (token) } } + kernel(self, val) } #[inline(always)] fn load_array_mask64x4(self, val: [i64; 4usize]) -> mask64x4 { @@ -5064,30 +4595,28 @@ impl Simd for Avx2 { } #[inline(always)] fn from_bitmask_mask64x4(self, bits: u64) -> mask64x4 { - unsafe { - { - let bit_lanes = _mm256_set1_epi64x(bits.cast_signed()); - let bit_mask = _mm256_set_epi64x(8, 4, 2, 1); - _mm256_cmpeq_epi64(_mm256_and_si256(bit_lanes, bit_mask), bit_mask) - } - .simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , bits : u64) -> mask64x4 < Avx2 > { { let bit_lanes = _mm256_set1_epi64x (bits . cast_signed ()) ; let bit_mask = _mm256_set_epi64x (8 , 4 , 2 , 1) ; _mm256_cmpeq_epi64 (_mm256_and_si256 (bit_lanes , bit_mask) , bit_mask) } . simd_into (token) } } + kernel(self, bits) } #[inline(always)] fn to_bitmask_mask64x4(self, a: mask64x4) -> u64 { - unsafe { _mm256_movemask_pd(_mm256_castsi256_pd(a.into())) as u32 as u64 } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask64x4 < Avx2 >) -> u64 { _mm256_movemask_pd (_mm256_castsi256_pd (a . into ())) as u32 as u64 } } + kernel(self, a) } #[inline(always)] fn and_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { - unsafe { _mm256_and_si256(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask64x4 < Avx2 > , b : mask64x4 < Avx2 >) -> mask64x4 < Avx2 > { _mm256_and_si256 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn or_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { - unsafe { _mm256_or_si256(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask64x4 < Avx2 > , b : mask64x4 < Avx2 >) -> mask64x4 < Avx2 > { _mm256_or_si256 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn xor_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { - unsafe { _mm256_xor_si256(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask64x4 < Avx2 > , b : mask64x4 < Avx2 >) -> mask64x4 < Avx2 > { _mm256_xor_si256 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn not_mask64x4(self, a: mask64x4) -> mask64x4 { @@ -5100,27 +4629,33 @@ impl Simd for Avx2 { b: mask64x4, c: mask64x4, ) -> mask64x4 { - unsafe { _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask64x4 < Avx2 > , b : mask64x4 < Avx2 > , c : mask64x4 < Avx2 >) -> mask64x4 < Avx2 > { _mm256_blendv_epi8 (c . into () , b . into () , a . into ()) . simd_into (token) } } + kernel(self, a, b, c) } #[inline(always)] fn simd_eq_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { - unsafe { _mm256_cmpeq_epi64(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask64x4 < Avx2 > , b : mask64x4 < Avx2 >) -> mask64x4 < Avx2 > { _mm256_cmpeq_epi64 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn any_true_mask64x4(self, a: mask64x4) -> bool { - unsafe { _mm256_movemask_pd(_mm256_castsi256_pd(a.into())) as u32 != 0 } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask64x4 < Avx2 >) -> bool { _mm256_movemask_pd (_mm256_castsi256_pd (a . into ())) as u32 != 0 } } + kernel(self, a) } #[inline(always)] fn all_true_mask64x4(self, a: mask64x4) -> bool { - unsafe { _mm256_movemask_pd(_mm256_castsi256_pd(a.into())) as u32 == 0b1111 } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask64x4 < Avx2 >) -> bool { _mm256_movemask_pd (_mm256_castsi256_pd (a . into ())) as u32 == 0b1111 } } + kernel(self, a) } #[inline(always)] fn any_false_mask64x4(self, a: mask64x4) -> bool { - unsafe { _mm256_movemask_pd(_mm256_castsi256_pd(a.into())) as u32 != 0b1111 } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask64x4 < Avx2 >) -> bool { _mm256_movemask_pd (_mm256_castsi256_pd (a . into ())) as u32 != 0b1111 } } + kernel(self, a) } #[inline(always)] fn all_false_mask64x4(self, a: mask64x4) -> bool { - unsafe { _mm256_movemask_pd(_mm256_castsi256_pd(a.into())) as u32 == 0 } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask64x4 < Avx2 >) -> bool { _mm256_movemask_pd (_mm256_castsi256_pd (a . into ())) as u32 == 0 } } + kernel(self, a) } #[inline(always)] fn combine_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x8 { @@ -5131,12 +4666,8 @@ impl Simd for Avx2 { } #[inline(always)] fn split_mask64x4(self, a: mask64x4) -> (mask64x2, mask64x2) { - unsafe { - ( - _mm256_extracti128_si256::<0>(a.into()).simd_into(self), - _mm256_extracti128_si256::<1>(a.into()).simd_into(self), - ) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask64x4 < Avx2 >) -> (mask64x2 < Avx2 > , mask64x2 < Avx2 >) { (_mm256_extracti128_si256 :: < 0 > (a . into ()) . simd_into (token) , _mm256_extracti128_si256 :: < 1 > (a . into ()) . simd_into (token) ,) } } + kernel(self, a) } #[inline(always)] fn splat_f32x16(self, val: f32) -> f32x16 { @@ -6187,40 +5718,8 @@ impl Simd for Avx2 { } #[inline(always)] fn from_bitmask_mask8x64(self, bits: u64) -> mask8x64 { - unsafe { - { - let bit_bytes = _mm256_set1_epi64x(bits.cast_signed()); - let bit_mask = _mm256_setr_epi8( - 1, 2, 4, 8, 16, 32, 64, -128, 1, 2, 4, 8, 16, 32, 64, -128, 1, 2, 4, 8, 16, 32, - 64, -128, 1, 2, 4, 8, 16, 32, 64, -128, - ); - mask8x64 { - val: crate::support::Aligned512([ - { - let bit_bytes = _mm256_shuffle_epi8( - bit_bytes, - _mm256_setr_epi8( - 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, - 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, - ), - ); - _mm256_cmpeq_epi8(_mm256_and_si256(bit_bytes, bit_mask), bit_mask) - }, - { - let bit_bytes = _mm256_shuffle_epi8( - bit_bytes, - _mm256_setr_epi8( - 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, - 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, - ), - ); - _mm256_cmpeq_epi8(_mm256_and_si256(bit_bytes, bit_mask), bit_mask) - }, - ]), - simd: self, - } - } - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , bits : u64) -> mask8x64 < Avx2 > { { let bit_bytes = _mm256_set1_epi64x (bits . cast_signed ()) ; let bit_mask = _mm256_setr_epi8 (1 , 2 , 4 , 8 , 16 , 32 , 64 , - 128 , 1 , 2 , 4 , 8 , 16 , 32 , 64 , - 128 , 1 , 2 , 4 , 8 , 16 , 32 , 64 , - 128 , 1 , 2 , 4 , 8 , 16 , 32 , 64 , - 128) ; mask8x64 { val : crate :: support :: Aligned512 ([{ let bit_bytes = _mm256_shuffle_epi8 (bit_bytes , _mm256_setr_epi8 (0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3)) ; _mm256_cmpeq_epi8 (_mm256_and_si256 (bit_bytes , bit_mask) , bit_mask) } , { let bit_bytes = _mm256_shuffle_epi8 (bit_bytes , _mm256_setr_epi8 (4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 5 , 5 , 5 , 5 , 5 , 5 , 5 , 5 , 6 , 6 , 6 , 6 , 6 , 6 , 6 , 6 , 7 , 7 , 7 , 7 , 7 , 7 , 7 , 7)) ; _mm256_cmpeq_epi8 (_mm256_and_si256 (bit_bytes , bit_mask) , bit_mask) }]) , simd : token , } } } } + kernel(self, bits) } #[inline(always)] fn to_bitmask_mask8x64(self, a: mask8x64) -> u64 { @@ -6923,16 +6422,8 @@ impl Simd for Avx2 { } #[inline(always)] fn narrow_u16x32(self, a: u16x32) -> u8x32 { - let (a, b) = self.split_u16x32(a); - unsafe { - let mask = _mm256_set1_epi16(0xFF); - let lo_masked = _mm256_and_si256(a.into(), mask); - let hi_masked = _mm256_and_si256(b.into(), mask); - let result = _mm256_permute4x64_epi64::<0b_11_01_10_00>(_mm256_packus_epi16( - lo_masked, hi_masked, - )); - result.simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : u16x32 < Avx2 >) -> u8x32 < Avx2 > { let (a , b) = token . split_u16x32 (a) ; let mask = _mm256_set1_epi16 (0xFF) ; let lo_masked = _mm256_and_si256 (a . into () , mask) ; let hi_masked = _mm256_and_si256 (b . into () , mask) ; let result = _mm256_permute4x64_epi64 :: < 0b_11_01_10_00 > (_mm256_packus_epi16 (lo_masked , hi_masked)) ; result . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn reinterpret_u8_u16x32(self, a: u16x32) -> u8x64 { @@ -6974,15 +6465,8 @@ impl Simd for Avx2 { } #[inline(always)] fn to_bitmask_mask16x32(self, a: mask16x32) -> u64 { - unsafe { - { - let lo = _mm256_movemask_epi8(a.val.0[0]) as u32; - let hi = _mm256_movemask_epi8(a.val.0[1]) as u32; - let lo = _pext_u32(lo, 0x5555_5555u32) as u64; - let hi = _pext_u32(hi, 0x5555_5555u32) as u64; - lo | (hi << 16usize) - } - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , a : mask16x32 < Avx2 >) -> u64 { { let lo = _mm256_movemask_epi8 (a . val . 0 [0]) as u32 ; let hi = _mm256_movemask_epi8 (a . val . 0 [1]) as u32 ; let lo = _pext_u32 (lo , 0x5555_5555u32) as u64 ; let hi = _pext_u32 (hi , 0x5555_5555u32) as u64 ; lo | (hi << 16usize) } } } + kernel(self, a) } #[inline(always)] fn and_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { @@ -7687,25 +7171,8 @@ impl Simd for Avx2 { } #[inline(always)] fn from_bitmask_mask32x16(self, bits: u64) -> mask32x16 { - unsafe { - { - let bit_lanes = _mm256_set1_epi32(bits as i32); - mask32x16 { - val: crate::support::Aligned512([ - { - let bit_mask = _mm256_setr_epi32(1, 2, 4, 8, 16, 32, 64, 128); - _mm256_cmpeq_epi32(_mm256_and_si256(bit_lanes, bit_mask), bit_mask) - }, - { - let bit_mask = - _mm256_setr_epi32(256, 512, 1024, 2048, 4096, 8192, 16384, 32768); - _mm256_cmpeq_epi32(_mm256_and_si256(bit_lanes, bit_mask), bit_mask) - }, - ]), - simd: self, - } - } - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , bits : u64) -> mask32x16 < Avx2 > { { let bit_lanes = _mm256_set1_epi32 (bits as i32) ; mask32x16 { val : crate :: support :: Aligned512 ([{ let bit_mask = _mm256_setr_epi32 (1 , 2 , 4 , 8 , 16 , 32 , 64 , 128) ; _mm256_cmpeq_epi32 (_mm256_and_si256 (bit_lanes , bit_mask) , bit_mask) } , { let bit_mask = _mm256_setr_epi32 (256 , 512 , 1024 , 2048 , 4096 , 8192 , 16384 , 32768) ; _mm256_cmpeq_epi32 (_mm256_and_si256 (bit_lanes , bit_mask) , bit_mask) }]) , simd : token , } } } } + kernel(self, bits) } #[inline(always)] fn to_bitmask_mask32x16(self, a: mask32x16) -> u64 { @@ -8133,24 +7600,8 @@ impl Simd for Avx2 { } #[inline(always)] fn from_bitmask_mask64x8(self, bits: u64) -> mask64x8 { - unsafe { - { - let bit_lanes = _mm256_set1_epi64x(bits.cast_signed()); - mask64x8 { - val: crate::support::Aligned512([ - { - let bit_mask = _mm256_set_epi64x(8, 4, 2, 1); - _mm256_cmpeq_epi64(_mm256_and_si256(bit_lanes, bit_mask), bit_mask) - }, - { - let bit_mask = _mm256_set_epi64x(128, 64, 32, 16); - _mm256_cmpeq_epi64(_mm256_and_si256(bit_lanes, bit_mask), bit_mask) - }, - ]), - simd: self, - } - } - } + crate::kernel! { # [inline (always)] fn kernel (token : Avx2 , bits : u64) -> mask64x8 < Avx2 > { { let bit_lanes = _mm256_set1_epi64x (bits . cast_signed ()) ; mask64x8 { val : crate :: support :: Aligned512 ([{ let bit_mask = _mm256_set_epi64x (8 , 4 , 2 , 1) ; _mm256_cmpeq_epi64 (_mm256_and_si256 (bit_lanes , bit_mask) , bit_mask) } , { let bit_mask = _mm256_set_epi64x (128 , 64 , 32 , 16) ; _mm256_cmpeq_epi64 (_mm256_and_si256 (bit_lanes , bit_mask) , bit_mask) }]) , simd : token , } } } } + kernel(self, bits) } #[inline(always)] fn to_bitmask_mask64x8(self, a: mask64x8) -> u64 { diff --git a/fearless_simd/src/generated/neon.rs b/fearless_simd/src/generated/neon.rs index d52d105d0..5f37cf333 100644 --- a/fearless_simd/src/generated/neon.rs +++ b/fearless_simd/src/generated/neon.rs @@ -88,7 +88,8 @@ impl Simd for Neon { } #[inline(always)] fn splat_f32x4(self, val: f32) -> f32x4 { - unsafe { vdupq_n_f32(val).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , val : f32) -> f32x4 < Neon > { vdupq_n_f32 (val) . simd_into (token) } } + kernel(self, val) } #[inline(always)] fn load_array_f32x4(self, val: [f32; 4usize]) -> f32x4 { @@ -167,86 +168,93 @@ impl Simd for Neon { } #[inline(always)] fn abs_f32x4(self, a: f32x4) -> f32x4 { - unsafe { vabsq_f32(a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : f32x4 < Neon >) -> f32x4 < Neon > { vabsq_f32 (a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn neg_f32x4(self, a: f32x4) -> f32x4 { - unsafe { vnegq_f32(a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : f32x4 < Neon >) -> f32x4 < Neon > { vnegq_f32 (a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn sqrt_f32x4(self, a: f32x4) -> f32x4 { - unsafe { vsqrtq_f32(a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : f32x4 < Neon >) -> f32x4 < Neon > { vsqrtq_f32 (a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn approximate_recip_f32x4(self, a: f32x4) -> f32x4 { - unsafe { vrecpeq_f32(a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : f32x4 < Neon >) -> f32x4 < Neon > { vrecpeq_f32 (a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn add_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - unsafe { vaddq_f32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : f32x4 < Neon > , b : f32x4 < Neon >) -> f32x4 < Neon > { vaddq_f32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn sub_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - unsafe { vsubq_f32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : f32x4 < Neon > , b : f32x4 < Neon >) -> f32x4 < Neon > { vsubq_f32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn mul_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - unsafe { vmulq_f32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : f32x4 < Neon > , b : f32x4 < Neon >) -> f32x4 < Neon > { vmulq_f32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn div_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - unsafe { vdivq_f32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : f32x4 < Neon > , b : f32x4 < Neon >) -> f32x4 < Neon > { vdivq_f32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn copysign_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - unsafe { - let sign_mask = vdupq_n_u32(1 << 31); - vbslq_f32(sign_mask, b.into(), a.into()).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : f32x4 < Neon > , b : f32x4 < Neon >) -> f32x4 < Neon > { let sign_mask = vdupq_n_u32 (1 << 31) ; vbslq_f32 (sign_mask , b . into () , a . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_eq_f32x4(self, a: f32x4, b: f32x4) -> mask32x4 { - unsafe { vreinterpretq_s32_u32(vceqq_f32(a.into(), b.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : f32x4 < Neon > , b : f32x4 < Neon >) -> mask32x4 < Neon > { vreinterpretq_s32_u32 (vceqq_f32 (a . into () , b . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_lt_f32x4(self, a: f32x4, b: f32x4) -> mask32x4 { - unsafe { vreinterpretq_s32_u32(vcltq_f32(a.into(), b.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : f32x4 < Neon > , b : f32x4 < Neon >) -> mask32x4 < Neon > { vreinterpretq_s32_u32 (vcltq_f32 (a . into () , b . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_le_f32x4(self, a: f32x4, b: f32x4) -> mask32x4 { - unsafe { vreinterpretq_s32_u32(vcleq_f32(a.into(), b.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : f32x4 < Neon > , b : f32x4 < Neon >) -> mask32x4 < Neon > { vreinterpretq_s32_u32 (vcleq_f32 (a . into () , b . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_ge_f32x4(self, a: f32x4, b: f32x4) -> mask32x4 { - unsafe { vreinterpretq_s32_u32(vcgeq_f32(a.into(), b.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : f32x4 < Neon > , b : f32x4 < Neon >) -> mask32x4 < Neon > { vreinterpretq_s32_u32 (vcgeq_f32 (a . into () , b . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_gt_f32x4(self, a: f32x4, b: f32x4) -> mask32x4 { - unsafe { vreinterpretq_s32_u32(vcgtq_f32(a.into(), b.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : f32x4 < Neon > , b : f32x4 < Neon >) -> mask32x4 < Neon > { vreinterpretq_s32_u32 (vcgtq_f32 (a . into () , b . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn zip_low_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - let x = a.into(); - let y = b.into(); - unsafe { vzip1q_f32(x, y).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : f32x4 < Neon > , b : f32x4 < Neon >) -> f32x4 < Neon > { let x = a . into () ; let y = b . into () ; vzip1q_f32 (x , y) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn zip_high_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - let x = a.into(); - let y = b.into(); - unsafe { vzip2q_f32(x, y).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : f32x4 < Neon > , b : f32x4 < Neon >) -> f32x4 < Neon > { let x = a . into () ; let y = b . into () ; vzip2q_f32 (x , y) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn unzip_low_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - let x = a.into(); - let y = b.into(); - unsafe { vuzp1q_f32(x, y).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : f32x4 < Neon > , b : f32x4 < Neon >) -> f32x4 < Neon > { let x = a . into () ; let y = b . into () ; vuzp1q_f32 (x , y) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn unzip_high_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - let x = a.into(); - let y = b.into(); - unsafe { vuzp2q_f32(x, y).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : f32x4 < Neon > , b : f32x4 < Neon >) -> f32x4 < Neon > { let x = a . into () ; let y = b . into () ; vuzp2q_f32 (x , y) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn interleave_f32x4(self, a: f32x4, b: f32x4) -> (f32x4, f32x4) { @@ -258,55 +266,63 @@ impl Simd for Neon { } #[inline(always)] fn max_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - unsafe { vmaxq_f32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : f32x4 < Neon > , b : f32x4 < Neon >) -> f32x4 < Neon > { vmaxq_f32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn min_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - unsafe { vminq_f32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : f32x4 < Neon > , b : f32x4 < Neon >) -> f32x4 < Neon > { vminq_f32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn max_precise_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - unsafe { vmaxnmq_f32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : f32x4 < Neon > , b : f32x4 < Neon >) -> f32x4 < Neon > { vmaxnmq_f32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn min_precise_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - unsafe { vminnmq_f32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : f32x4 < Neon > , b : f32x4 < Neon >) -> f32x4 < Neon > { vminnmq_f32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn mul_add_f32x4(self, a: f32x4, b: f32x4, c: f32x4) -> f32x4 { - unsafe { vfmaq_f32(c.into(), b.into(), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : f32x4 < Neon > , b : f32x4 < Neon > , c : f32x4 < Neon >) -> f32x4 < Neon > { vfmaq_f32 (c . into () , b . into () , a . into ()) . simd_into (token) } } + kernel(self, a, b, c) } #[inline(always)] fn mul_sub_f32x4(self, a: f32x4, b: f32x4, c: f32x4) -> f32x4 { - unsafe { vnegq_f32(vfmsq_f32(c.into(), b.into(), a.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : f32x4 < Neon > , b : f32x4 < Neon > , c : f32x4 < Neon >) -> f32x4 < Neon > { vnegq_f32 (vfmsq_f32 (c . into () , b . into () , a . into ())) . simd_into (token) } } + kernel(self, a, b, c) } #[inline(always)] fn floor_f32x4(self, a: f32x4) -> f32x4 { - unsafe { vrndmq_f32(a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : f32x4 < Neon >) -> f32x4 < Neon > { vrndmq_f32 (a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn ceil_f32x4(self, a: f32x4) -> f32x4 { - unsafe { vrndpq_f32(a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : f32x4 < Neon >) -> f32x4 < Neon > { vrndpq_f32 (a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn round_ties_even_f32x4(self, a: f32x4) -> f32x4 { - unsafe { vrndnq_f32(a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : f32x4 < Neon >) -> f32x4 < Neon > { vrndnq_f32 (a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn fract_f32x4(self, a: f32x4) -> f32x4 { - unsafe { - let c1 = vcvtq_s32_f32(a.into()); - let c2 = vcvtq_f32_s32(c1); - vsubq_f32(a.into(), c2).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : f32x4 < Neon >) -> f32x4 < Neon > { let c1 = vcvtq_s32_f32 (a . into ()) ; let c2 = vcvtq_f32_s32 (c1) ; vsubq_f32 (a . into () , c2) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn trunc_f32x4(self, a: f32x4) -> f32x4 { - unsafe { vrndq_f32(a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : f32x4 < Neon >) -> f32x4 < Neon > { vrndq_f32 (a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn select_f32x4(self, a: mask32x4, b: f32x4, c: f32x4) -> f32x4 { - unsafe { vbslq_f32(vreinterpretq_u32_s32(a.into()), b.into(), c.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : mask32x4 < Neon > , b : f32x4 < Neon > , c : f32x4 < Neon >) -> f32x4 < Neon > { vbslq_f32 (vreinterpretq_u32_s32 (a . into ()) , b . into () , c . into ()) . simd_into (token) } } + kernel(self, a, b, c) } #[inline(always)] fn combine_f32x4(self, a: f32x4, b: f32x4) -> f32x8 { @@ -317,23 +333,28 @@ impl Simd for Neon { } #[inline(always)] fn reinterpret_f64_f32x4(self, a: f32x4) -> f64x2 { - unsafe { vreinterpretq_f64_f32(a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : f32x4 < Neon >) -> f64x2 < Neon > { vreinterpretq_f64_f32 (a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn reinterpret_i32_f32x4(self, a: f32x4) -> i32x4 { - unsafe { vreinterpretq_s32_f32(a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : f32x4 < Neon >) -> i32x4 < Neon > { vreinterpretq_s32_f32 (a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn reinterpret_u8_f32x4(self, a: f32x4) -> u8x16 { - unsafe { vreinterpretq_u8_f32(a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : f32x4 < Neon >) -> u8x16 < Neon > { vreinterpretq_u8_f32 (a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn reinterpret_u32_f32x4(self, a: f32x4) -> u32x4 { - unsafe { vreinterpretq_u32_f32(a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : f32x4 < Neon >) -> u32x4 < Neon > { vreinterpretq_u32_f32 (a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn cvt_u32_f32x4(self, a: f32x4) -> u32x4 { - unsafe { vcvtq_u32_f32(a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : f32x4 < Neon >) -> u32x4 < Neon > { vcvtq_u32_f32 (a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn cvt_u32_precise_f32x4(self, a: f32x4) -> u32x4 { @@ -341,7 +362,8 @@ impl Simd for Neon { } #[inline(always)] fn cvt_i32_f32x4(self, a: f32x4) -> i32x4 { - unsafe { vcvtq_s32_f32(a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : f32x4 < Neon >) -> i32x4 < Neon > { vcvtq_s32_f32 (a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn cvt_i32_precise_f32x4(self, a: f32x4) -> i32x4 { @@ -349,7 +371,8 @@ impl Simd for Neon { } #[inline(always)] fn splat_i8x16(self, val: i8) -> i8x16 { - unsafe { vdupq_n_s8(val).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , val : i8) -> i8x16 < Neon > { vdupq_n_s8 (val) . simd_into (token) } } + kernel(self, val) } #[inline(always)] fn load_array_i8x16(self, val: [i8; 16usize]) -> i8x16 { @@ -428,91 +451,103 @@ impl Simd for Neon { } #[inline(always)] fn add_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - unsafe { vaddq_s8(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : i8x16 < Neon > , b : i8x16 < Neon >) -> i8x16 < Neon > { vaddq_s8 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn sub_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - unsafe { vsubq_s8(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : i8x16 < Neon > , b : i8x16 < Neon >) -> i8x16 < Neon > { vsubq_s8 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn mul_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - unsafe { vmulq_s8(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : i8x16 < Neon > , b : i8x16 < Neon >) -> i8x16 < Neon > { vmulq_s8 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn and_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - unsafe { vandq_s8(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : i8x16 < Neon > , b : i8x16 < Neon >) -> i8x16 < Neon > { vandq_s8 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn or_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - unsafe { vorrq_s8(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : i8x16 < Neon > , b : i8x16 < Neon >) -> i8x16 < Neon > { vorrq_s8 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn xor_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - unsafe { veorq_s8(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : i8x16 < Neon > , b : i8x16 < Neon >) -> i8x16 < Neon > { veorq_s8 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn not_i8x16(self, a: i8x16) -> i8x16 { - unsafe { vmvnq_s8(a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : i8x16 < Neon >) -> i8x16 < Neon > { vmvnq_s8 (a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn shl_i8x16(self, a: i8x16, shift: u32) -> i8x16 { - unsafe { vshlq_s8(a.into(), vdupq_n_s8(shift as i8)).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : i8x16 < Neon > , shift : u32) -> i8x16 < Neon > { vshlq_s8 (a . into () , vdupq_n_s8 (shift as i8)) . simd_into (token) } } + kernel(self, a, shift) } #[inline(always)] fn shlv_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - unsafe { vshlq_s8(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : i8x16 < Neon > , b : i8x16 < Neon >) -> i8x16 < Neon > { vshlq_s8 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn shr_i8x16(self, a: i8x16, shift: u32) -> i8x16 { - unsafe { vshlq_s8(a.into(), vdupq_n_s8(-(shift as i8))).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : i8x16 < Neon > , shift : u32) -> i8x16 < Neon > { vshlq_s8 (a . into () , vdupq_n_s8 (- (shift as i8))) . simd_into (token) } } + kernel(self, a, shift) } #[inline(always)] fn shrv_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - unsafe { vshlq_s8(a.into(), vnegq_s8(b.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : i8x16 < Neon > , b : i8x16 < Neon >) -> i8x16 < Neon > { vshlq_s8 (a . into () , vnegq_s8 (b . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_eq_i8x16(self, a: i8x16, b: i8x16) -> mask8x16 { - unsafe { vreinterpretq_s8_u8(vceqq_s8(a.into(), b.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : i8x16 < Neon > , b : i8x16 < Neon >) -> mask8x16 < Neon > { vreinterpretq_s8_u8 (vceqq_s8 (a . into () , b . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_lt_i8x16(self, a: i8x16, b: i8x16) -> mask8x16 { - unsafe { vreinterpretq_s8_u8(vcltq_s8(a.into(), b.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : i8x16 < Neon > , b : i8x16 < Neon >) -> mask8x16 < Neon > { vreinterpretq_s8_u8 (vcltq_s8 (a . into () , b . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_le_i8x16(self, a: i8x16, b: i8x16) -> mask8x16 { - unsafe { vreinterpretq_s8_u8(vcleq_s8(a.into(), b.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : i8x16 < Neon > , b : i8x16 < Neon >) -> mask8x16 < Neon > { vreinterpretq_s8_u8 (vcleq_s8 (a . into () , b . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_ge_i8x16(self, a: i8x16, b: i8x16) -> mask8x16 { - unsafe { vreinterpretq_s8_u8(vcgeq_s8(a.into(), b.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : i8x16 < Neon > , b : i8x16 < Neon >) -> mask8x16 < Neon > { vreinterpretq_s8_u8 (vcgeq_s8 (a . into () , b . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_gt_i8x16(self, a: i8x16, b: i8x16) -> mask8x16 { - unsafe { vreinterpretq_s8_u8(vcgtq_s8(a.into(), b.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : i8x16 < Neon > , b : i8x16 < Neon >) -> mask8x16 < Neon > { vreinterpretq_s8_u8 (vcgtq_s8 (a . into () , b . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn zip_low_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - let x = a.into(); - let y = b.into(); - unsafe { vzip1q_s8(x, y).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : i8x16 < Neon > , b : i8x16 < Neon >) -> i8x16 < Neon > { let x = a . into () ; let y = b . into () ; vzip1q_s8 (x , y) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn zip_high_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - let x = a.into(); - let y = b.into(); - unsafe { vzip2q_s8(x, y).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : i8x16 < Neon > , b : i8x16 < Neon >) -> i8x16 < Neon > { let x = a . into () ; let y = b . into () ; vzip2q_s8 (x , y) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn unzip_low_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - let x = a.into(); - let y = b.into(); - unsafe { vuzp1q_s8(x, y).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : i8x16 < Neon > , b : i8x16 < Neon >) -> i8x16 < Neon > { let x = a . into () ; let y = b . into () ; vuzp1q_s8 (x , y) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn unzip_high_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - let x = a.into(); - let y = b.into(); - unsafe { vuzp2q_s8(x, y).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : i8x16 < Neon > , b : i8x16 < Neon >) -> i8x16 < Neon > { let x = a . into () ; let y = b . into () ; vuzp2q_s8 (x , y) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn interleave_i8x16(self, a: i8x16, b: i8x16) -> (i8x16, i8x16) { @@ -524,15 +559,18 @@ impl Simd for Neon { } #[inline(always)] fn select_i8x16(self, a: mask8x16, b: i8x16, c: i8x16) -> i8x16 { - unsafe { vbslq_s8(vreinterpretq_u8_s8(a.into()), b.into(), c.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : mask8x16 < Neon > , b : i8x16 < Neon > , c : i8x16 < Neon >) -> i8x16 < Neon > { vbslq_s8 (vreinterpretq_u8_s8 (a . into ()) , b . into () , c . into ()) . simd_into (token) } } + kernel(self, a, b, c) } #[inline(always)] fn min_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - unsafe { vminq_s8(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : i8x16 < Neon > , b : i8x16 < Neon >) -> i8x16 < Neon > { vminq_s8 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn max_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - unsafe { vmaxq_s8(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : i8x16 < Neon > , b : i8x16 < Neon >) -> i8x16 < Neon > { vmaxq_s8 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn combine_i8x16(self, a: i8x16, b: i8x16) -> i8x32 { @@ -543,19 +581,23 @@ impl Simd for Neon { } #[inline(always)] fn neg_i8x16(self, a: i8x16) -> i8x16 { - unsafe { vnegq_s8(a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : i8x16 < Neon >) -> i8x16 < Neon > { vnegq_s8 (a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn reinterpret_u8_i8x16(self, a: i8x16) -> u8x16 { - unsafe { vreinterpretq_u8_s8(a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : i8x16 < Neon >) -> u8x16 < Neon > { vreinterpretq_u8_s8 (a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn reinterpret_u32_i8x16(self, a: i8x16) -> u32x4 { - unsafe { vreinterpretq_u32_s8(a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : i8x16 < Neon >) -> u32x4 < Neon > { vreinterpretq_u32_s8 (a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn splat_u8x16(self, val: u8) -> u8x16 { - unsafe { vdupq_n_u8(val).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , val : u8) -> u8x16 < Neon > { vdupq_n_u8 (val) . simd_into (token) } } + kernel(self, val) } #[inline(always)] fn load_array_u8x16(self, val: [u8; 16usize]) -> u8x16 { @@ -634,91 +676,103 @@ impl Simd for Neon { } #[inline(always)] fn add_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - unsafe { vaddq_u8(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : u8x16 < Neon > , b : u8x16 < Neon >) -> u8x16 < Neon > { vaddq_u8 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn sub_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - unsafe { vsubq_u8(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : u8x16 < Neon > , b : u8x16 < Neon >) -> u8x16 < Neon > { vsubq_u8 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn mul_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - unsafe { vmulq_u8(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : u8x16 < Neon > , b : u8x16 < Neon >) -> u8x16 < Neon > { vmulq_u8 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn and_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - unsafe { vandq_u8(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : u8x16 < Neon > , b : u8x16 < Neon >) -> u8x16 < Neon > { vandq_u8 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn or_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - unsafe { vorrq_u8(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : u8x16 < Neon > , b : u8x16 < Neon >) -> u8x16 < Neon > { vorrq_u8 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn xor_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - unsafe { veorq_u8(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : u8x16 < Neon > , b : u8x16 < Neon >) -> u8x16 < Neon > { veorq_u8 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn not_u8x16(self, a: u8x16) -> u8x16 { - unsafe { vmvnq_u8(a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : u8x16 < Neon >) -> u8x16 < Neon > { vmvnq_u8 (a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn shl_u8x16(self, a: u8x16, shift: u32) -> u8x16 { - unsafe { vshlq_u8(a.into(), vdupq_n_s8(shift as i8)).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : u8x16 < Neon > , shift : u32) -> u8x16 < Neon > { vshlq_u8 (a . into () , vdupq_n_s8 (shift as i8)) . simd_into (token) } } + kernel(self, a, shift) } #[inline(always)] fn shlv_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - unsafe { vshlq_u8(a.into(), vreinterpretq_s8_u8(b.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : u8x16 < Neon > , b : u8x16 < Neon >) -> u8x16 < Neon > { vshlq_u8 (a . into () , vreinterpretq_s8_u8 (b . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn shr_u8x16(self, a: u8x16, shift: u32) -> u8x16 { - unsafe { vshlq_u8(a.into(), vdupq_n_s8(-(shift as i8))).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : u8x16 < Neon > , shift : u32) -> u8x16 < Neon > { vshlq_u8 (a . into () , vdupq_n_s8 (- (shift as i8))) . simd_into (token) } } + kernel(self, a, shift) } #[inline(always)] fn shrv_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - unsafe { vshlq_u8(a.into(), vnegq_s8(vreinterpretq_s8_u8(b.into()))).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : u8x16 < Neon > , b : u8x16 < Neon >) -> u8x16 < Neon > { vshlq_u8 (a . into () , vnegq_s8 (vreinterpretq_s8_u8 (b . into ()))) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_eq_u8x16(self, a: u8x16, b: u8x16) -> mask8x16 { - unsafe { vreinterpretq_s8_u8(vceqq_u8(a.into(), b.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : u8x16 < Neon > , b : u8x16 < Neon >) -> mask8x16 < Neon > { vreinterpretq_s8_u8 (vceqq_u8 (a . into () , b . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_lt_u8x16(self, a: u8x16, b: u8x16) -> mask8x16 { - unsafe { vreinterpretq_s8_u8(vcltq_u8(a.into(), b.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : u8x16 < Neon > , b : u8x16 < Neon >) -> mask8x16 < Neon > { vreinterpretq_s8_u8 (vcltq_u8 (a . into () , b . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_le_u8x16(self, a: u8x16, b: u8x16) -> mask8x16 { - unsafe { vreinterpretq_s8_u8(vcleq_u8(a.into(), b.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : u8x16 < Neon > , b : u8x16 < Neon >) -> mask8x16 < Neon > { vreinterpretq_s8_u8 (vcleq_u8 (a . into () , b . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_ge_u8x16(self, a: u8x16, b: u8x16) -> mask8x16 { - unsafe { vreinterpretq_s8_u8(vcgeq_u8(a.into(), b.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : u8x16 < Neon > , b : u8x16 < Neon >) -> mask8x16 < Neon > { vreinterpretq_s8_u8 (vcgeq_u8 (a . into () , b . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_gt_u8x16(self, a: u8x16, b: u8x16) -> mask8x16 { - unsafe { vreinterpretq_s8_u8(vcgtq_u8(a.into(), b.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : u8x16 < Neon > , b : u8x16 < Neon >) -> mask8x16 < Neon > { vreinterpretq_s8_u8 (vcgtq_u8 (a . into () , b . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn zip_low_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - let x = a.into(); - let y = b.into(); - unsafe { vzip1q_u8(x, y).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : u8x16 < Neon > , b : u8x16 < Neon >) -> u8x16 < Neon > { let x = a . into () ; let y = b . into () ; vzip1q_u8 (x , y) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn zip_high_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - let x = a.into(); - let y = b.into(); - unsafe { vzip2q_u8(x, y).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : u8x16 < Neon > , b : u8x16 < Neon >) -> u8x16 < Neon > { let x = a . into () ; let y = b . into () ; vzip2q_u8 (x , y) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn unzip_low_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - let x = a.into(); - let y = b.into(); - unsafe { vuzp1q_u8(x, y).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : u8x16 < Neon > , b : u8x16 < Neon >) -> u8x16 < Neon > { let x = a . into () ; let y = b . into () ; vuzp1q_u8 (x , y) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn unzip_high_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - let x = a.into(); - let y = b.into(); - unsafe { vuzp2q_u8(x, y).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : u8x16 < Neon > , b : u8x16 < Neon >) -> u8x16 < Neon > { let x = a . into () ; let y = b . into () ; vuzp2q_u8 (x , y) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn interleave_u8x16(self, a: u8x16, b: u8x16) -> (u8x16, u8x16) { @@ -730,15 +784,18 @@ impl Simd for Neon { } #[inline(always)] fn select_u8x16(self, a: mask8x16, b: u8x16, c: u8x16) -> u8x16 { - unsafe { vbslq_u8(vreinterpretq_u8_s8(a.into()), b.into(), c.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : mask8x16 < Neon > , b : u8x16 < Neon > , c : u8x16 < Neon >) -> u8x16 < Neon > { vbslq_u8 (vreinterpretq_u8_s8 (a . into ()) , b . into () , c . into ()) . simd_into (token) } } + kernel(self, a, b, c) } #[inline(always)] fn min_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - unsafe { vminq_u8(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : u8x16 < Neon > , b : u8x16 < Neon >) -> u8x16 < Neon > { vminq_u8 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn max_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - unsafe { vmaxq_u8(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : u8x16 < Neon > , b : u8x16 < Neon >) -> u8x16 < Neon > { vmaxq_u8 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn combine_u8x16(self, a: u8x16, b: u8x16) -> u8x32 { @@ -749,22 +806,18 @@ impl Simd for Neon { } #[inline(always)] fn widen_u8x16(self, a: u8x16) -> u16x16 { - unsafe { - let low = vmovl_u8(vget_low_u8(a.into())); - let high = vmovl_u8(vget_high_u8(a.into())); - uint16x8x2_t(low, high).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : u8x16 < Neon >) -> u16x16 < Neon > { let low = vmovl_u8 (vget_low_u8 (a . into ())) ; let high = vmovl_u8 (vget_high_u8 (a . into ())) ; uint16x8x2_t (low , high) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn reinterpret_u32_u8x16(self, a: u8x16) -> u32x4 { - unsafe { vreinterpretq_u32_u8(a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : u8x16 < Neon >) -> u32x4 < Neon > { vreinterpretq_u32_u8 (a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn splat_mask8x16(self, val: bool) -> mask8x16 { - unsafe { - let val: i8 = if val { !0 } else { 0 }; - vdupq_n_s8(val).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , val : bool) -> mask8x16 < Neon > { let val : i8 = if val { ! 0 } else { 0 } ; vdupq_n_s8 (val) . simd_into (token) } } + kernel(self, val) } #[inline(always)] fn load_array_mask8x16(self, val: [i8; 16usize]) -> mask8x16 { @@ -805,19 +858,23 @@ impl Simd for Neon { } #[inline(always)] fn and_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x16 { - unsafe { vandq_s8(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : mask8x16 < Neon > , b : mask8x16 < Neon >) -> mask8x16 < Neon > { vandq_s8 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn or_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x16 { - unsafe { vorrq_s8(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : mask8x16 < Neon > , b : mask8x16 < Neon >) -> mask8x16 < Neon > { vorrq_s8 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn xor_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x16 { - unsafe { veorq_s8(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : mask8x16 < Neon > , b : mask8x16 < Neon >) -> mask8x16 < Neon > { veorq_s8 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn not_mask8x16(self, a: mask8x16) -> mask8x16 { - unsafe { vmvnq_s8(a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : mask8x16 < Neon >) -> mask8x16 < Neon > { vmvnq_s8 (a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn select_mask8x16( @@ -826,27 +883,33 @@ impl Simd for Neon { b: mask8x16, c: mask8x16, ) -> mask8x16 { - unsafe { vbslq_s8(vreinterpretq_u8_s8(a.into()), b.into(), c.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : mask8x16 < Neon > , b : mask8x16 < Neon > , c : mask8x16 < Neon >) -> mask8x16 < Neon > { vbslq_s8 (vreinterpretq_u8_s8 (a . into ()) , b . into () , c . into ()) . simd_into (token) } } + kernel(self, a, b, c) } #[inline(always)] fn simd_eq_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x16 { - unsafe { vreinterpretq_s8_u8(vceqq_s8(a.into(), b.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : mask8x16 < Neon > , b : mask8x16 < Neon >) -> mask8x16 < Neon > { vreinterpretq_s8_u8 (vceqq_s8 (a . into () , b . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn any_true_mask8x16(self, a: mask8x16) -> bool { - unsafe { vmaxvq_u32(vreinterpretq_u32_s8(a.into())) != 0 } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : mask8x16 < Neon >) -> bool { vmaxvq_u32 (vreinterpretq_u32_s8 (a . into ())) != 0 } } + kernel(self, a) } #[inline(always)] fn all_true_mask8x16(self, a: mask8x16) -> bool { - unsafe { vminvq_u32(vreinterpretq_u32_s8(a.into())) == 0xffffffff } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : mask8x16 < Neon >) -> bool { vminvq_u32 (vreinterpretq_u32_s8 (a . into ())) == 0xffffffff } } + kernel(self, a) } #[inline(always)] fn any_false_mask8x16(self, a: mask8x16) -> bool { - unsafe { vminvq_u32(vreinterpretq_u32_s8(a.into())) != 0xffffffff } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : mask8x16 < Neon >) -> bool { vminvq_u32 (vreinterpretq_u32_s8 (a . into ())) != 0xffffffff } } + kernel(self, a) } #[inline(always)] fn all_false_mask8x16(self, a: mask8x16) -> bool { - unsafe { vmaxvq_u32(vreinterpretq_u32_s8(a.into())) == 0 } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : mask8x16 < Neon >) -> bool { vmaxvq_u32 (vreinterpretq_u32_s8 (a . into ())) == 0 } } + kernel(self, a) } #[inline(always)] fn combine_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x32 { @@ -857,7 +920,8 @@ impl Simd for Neon { } #[inline(always)] fn splat_i16x8(self, val: i16) -> i16x8 { - unsafe { vdupq_n_s16(val).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , val : i16) -> i16x8 < Neon > { vdupq_n_s16 (val) . simd_into (token) } } + kernel(self, val) } #[inline(always)] fn load_array_i16x8(self, val: [i16; 8usize]) -> i16x8 { @@ -936,91 +1000,103 @@ impl Simd for Neon { } #[inline(always)] fn add_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - unsafe { vaddq_s16(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : i16x8 < Neon > , b : i16x8 < Neon >) -> i16x8 < Neon > { vaddq_s16 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn sub_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - unsafe { vsubq_s16(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : i16x8 < Neon > , b : i16x8 < Neon >) -> i16x8 < Neon > { vsubq_s16 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn mul_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - unsafe { vmulq_s16(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : i16x8 < Neon > , b : i16x8 < Neon >) -> i16x8 < Neon > { vmulq_s16 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn and_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - unsafe { vandq_s16(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : i16x8 < Neon > , b : i16x8 < Neon >) -> i16x8 < Neon > { vandq_s16 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn or_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - unsafe { vorrq_s16(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : i16x8 < Neon > , b : i16x8 < Neon >) -> i16x8 < Neon > { vorrq_s16 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn xor_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - unsafe { veorq_s16(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : i16x8 < Neon > , b : i16x8 < Neon >) -> i16x8 < Neon > { veorq_s16 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn not_i16x8(self, a: i16x8) -> i16x8 { - unsafe { vmvnq_s16(a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : i16x8 < Neon >) -> i16x8 < Neon > { vmvnq_s16 (a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn shl_i16x8(self, a: i16x8, shift: u32) -> i16x8 { - unsafe { vshlq_s16(a.into(), vdupq_n_s16(shift as i16)).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : i16x8 < Neon > , shift : u32) -> i16x8 < Neon > { vshlq_s16 (a . into () , vdupq_n_s16 (shift as i16)) . simd_into (token) } } + kernel(self, a, shift) } #[inline(always)] fn shlv_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - unsafe { vshlq_s16(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : i16x8 < Neon > , b : i16x8 < Neon >) -> i16x8 < Neon > { vshlq_s16 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn shr_i16x8(self, a: i16x8, shift: u32) -> i16x8 { - unsafe { vshlq_s16(a.into(), vdupq_n_s16(-(shift as i16))).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : i16x8 < Neon > , shift : u32) -> i16x8 < Neon > { vshlq_s16 (a . into () , vdupq_n_s16 (- (shift as i16))) . simd_into (token) } } + kernel(self, a, shift) } #[inline(always)] fn shrv_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - unsafe { vshlq_s16(a.into(), vnegq_s16(b.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : i16x8 < Neon > , b : i16x8 < Neon >) -> i16x8 < Neon > { vshlq_s16 (a . into () , vnegq_s16 (b . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_eq_i16x8(self, a: i16x8, b: i16x8) -> mask16x8 { - unsafe { vreinterpretq_s16_u16(vceqq_s16(a.into(), b.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : i16x8 < Neon > , b : i16x8 < Neon >) -> mask16x8 < Neon > { vreinterpretq_s16_u16 (vceqq_s16 (a . into () , b . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_lt_i16x8(self, a: i16x8, b: i16x8) -> mask16x8 { - unsafe { vreinterpretq_s16_u16(vcltq_s16(a.into(), b.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : i16x8 < Neon > , b : i16x8 < Neon >) -> mask16x8 < Neon > { vreinterpretq_s16_u16 (vcltq_s16 (a . into () , b . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_le_i16x8(self, a: i16x8, b: i16x8) -> mask16x8 { - unsafe { vreinterpretq_s16_u16(vcleq_s16(a.into(), b.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : i16x8 < Neon > , b : i16x8 < Neon >) -> mask16x8 < Neon > { vreinterpretq_s16_u16 (vcleq_s16 (a . into () , b . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_ge_i16x8(self, a: i16x8, b: i16x8) -> mask16x8 { - unsafe { vreinterpretq_s16_u16(vcgeq_s16(a.into(), b.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : i16x8 < Neon > , b : i16x8 < Neon >) -> mask16x8 < Neon > { vreinterpretq_s16_u16 (vcgeq_s16 (a . into () , b . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_gt_i16x8(self, a: i16x8, b: i16x8) -> mask16x8 { - unsafe { vreinterpretq_s16_u16(vcgtq_s16(a.into(), b.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : i16x8 < Neon > , b : i16x8 < Neon >) -> mask16x8 < Neon > { vreinterpretq_s16_u16 (vcgtq_s16 (a . into () , b . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn zip_low_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - let x = a.into(); - let y = b.into(); - unsafe { vzip1q_s16(x, y).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : i16x8 < Neon > , b : i16x8 < Neon >) -> i16x8 < Neon > { let x = a . into () ; let y = b . into () ; vzip1q_s16 (x , y) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn zip_high_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - let x = a.into(); - let y = b.into(); - unsafe { vzip2q_s16(x, y).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : i16x8 < Neon > , b : i16x8 < Neon >) -> i16x8 < Neon > { let x = a . into () ; let y = b . into () ; vzip2q_s16 (x , y) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn unzip_low_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - let x = a.into(); - let y = b.into(); - unsafe { vuzp1q_s16(x, y).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : i16x8 < Neon > , b : i16x8 < Neon >) -> i16x8 < Neon > { let x = a . into () ; let y = b . into () ; vuzp1q_s16 (x , y) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn unzip_high_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - let x = a.into(); - let y = b.into(); - unsafe { vuzp2q_s16(x, y).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : i16x8 < Neon > , b : i16x8 < Neon >) -> i16x8 < Neon > { let x = a . into () ; let y = b . into () ; vuzp2q_s16 (x , y) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn interleave_i16x8(self, a: i16x8, b: i16x8) -> (i16x8, i16x8) { @@ -1032,15 +1108,18 @@ impl Simd for Neon { } #[inline(always)] fn select_i16x8(self, a: mask16x8, b: i16x8, c: i16x8) -> i16x8 { - unsafe { vbslq_s16(vreinterpretq_u16_s16(a.into()), b.into(), c.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : mask16x8 < Neon > , b : i16x8 < Neon > , c : i16x8 < Neon >) -> i16x8 < Neon > { vbslq_s16 (vreinterpretq_u16_s16 (a . into ()) , b . into () , c . into ()) . simd_into (token) } } + kernel(self, a, b, c) } #[inline(always)] fn min_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - unsafe { vminq_s16(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : i16x8 < Neon > , b : i16x8 < Neon >) -> i16x8 < Neon > { vminq_s16 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn max_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - unsafe { vmaxq_s16(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : i16x8 < Neon > , b : i16x8 < Neon >) -> i16x8 < Neon > { vmaxq_s16 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn combine_i16x8(self, a: i16x8, b: i16x8) -> i16x16 { @@ -1051,19 +1130,23 @@ impl Simd for Neon { } #[inline(always)] fn neg_i16x8(self, a: i16x8) -> i16x8 { - unsafe { vnegq_s16(a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : i16x8 < Neon >) -> i16x8 < Neon > { vnegq_s16 (a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn reinterpret_u8_i16x8(self, a: i16x8) -> u8x16 { - unsafe { vreinterpretq_u8_s16(a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : i16x8 < Neon >) -> u8x16 < Neon > { vreinterpretq_u8_s16 (a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn reinterpret_u32_i16x8(self, a: i16x8) -> u32x4 { - unsafe { vreinterpretq_u32_s16(a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : i16x8 < Neon >) -> u32x4 < Neon > { vreinterpretq_u32_s16 (a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn splat_u16x8(self, val: u16) -> u16x8 { - unsafe { vdupq_n_u16(val).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , val : u16) -> u16x8 < Neon > { vdupq_n_u16 (val) . simd_into (token) } } + kernel(self, val) } #[inline(always)] fn load_array_u16x8(self, val: [u16; 8usize]) -> u16x8 { @@ -1142,91 +1225,103 @@ impl Simd for Neon { } #[inline(always)] fn add_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - unsafe { vaddq_u16(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : u16x8 < Neon > , b : u16x8 < Neon >) -> u16x8 < Neon > { vaddq_u16 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn sub_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - unsafe { vsubq_u16(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : u16x8 < Neon > , b : u16x8 < Neon >) -> u16x8 < Neon > { vsubq_u16 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn mul_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - unsafe { vmulq_u16(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : u16x8 < Neon > , b : u16x8 < Neon >) -> u16x8 < Neon > { vmulq_u16 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn and_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - unsafe { vandq_u16(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : u16x8 < Neon > , b : u16x8 < Neon >) -> u16x8 < Neon > { vandq_u16 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn or_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - unsafe { vorrq_u16(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : u16x8 < Neon > , b : u16x8 < Neon >) -> u16x8 < Neon > { vorrq_u16 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn xor_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - unsafe { veorq_u16(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : u16x8 < Neon > , b : u16x8 < Neon >) -> u16x8 < Neon > { veorq_u16 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn not_u16x8(self, a: u16x8) -> u16x8 { - unsafe { vmvnq_u16(a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : u16x8 < Neon >) -> u16x8 < Neon > { vmvnq_u16 (a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn shl_u16x8(self, a: u16x8, shift: u32) -> u16x8 { - unsafe { vshlq_u16(a.into(), vdupq_n_s16(shift as i16)).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : u16x8 < Neon > , shift : u32) -> u16x8 < Neon > { vshlq_u16 (a . into () , vdupq_n_s16 (shift as i16)) . simd_into (token) } } + kernel(self, a, shift) } #[inline(always)] fn shlv_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - unsafe { vshlq_u16(a.into(), vreinterpretq_s16_u16(b.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : u16x8 < Neon > , b : u16x8 < Neon >) -> u16x8 < Neon > { vshlq_u16 (a . into () , vreinterpretq_s16_u16 (b . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn shr_u16x8(self, a: u16x8, shift: u32) -> u16x8 { - unsafe { vshlq_u16(a.into(), vdupq_n_s16(-(shift as i16))).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : u16x8 < Neon > , shift : u32) -> u16x8 < Neon > { vshlq_u16 (a . into () , vdupq_n_s16 (- (shift as i16))) . simd_into (token) } } + kernel(self, a, shift) } #[inline(always)] fn shrv_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - unsafe { vshlq_u16(a.into(), vnegq_s16(vreinterpretq_s16_u16(b.into()))).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : u16x8 < Neon > , b : u16x8 < Neon >) -> u16x8 < Neon > { vshlq_u16 (a . into () , vnegq_s16 (vreinterpretq_s16_u16 (b . into ()))) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_eq_u16x8(self, a: u16x8, b: u16x8) -> mask16x8 { - unsafe { vreinterpretq_s16_u16(vceqq_u16(a.into(), b.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : u16x8 < Neon > , b : u16x8 < Neon >) -> mask16x8 < Neon > { vreinterpretq_s16_u16 (vceqq_u16 (a . into () , b . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_lt_u16x8(self, a: u16x8, b: u16x8) -> mask16x8 { - unsafe { vreinterpretq_s16_u16(vcltq_u16(a.into(), b.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : u16x8 < Neon > , b : u16x8 < Neon >) -> mask16x8 < Neon > { vreinterpretq_s16_u16 (vcltq_u16 (a . into () , b . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_le_u16x8(self, a: u16x8, b: u16x8) -> mask16x8 { - unsafe { vreinterpretq_s16_u16(vcleq_u16(a.into(), b.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : u16x8 < Neon > , b : u16x8 < Neon >) -> mask16x8 < Neon > { vreinterpretq_s16_u16 (vcleq_u16 (a . into () , b . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_ge_u16x8(self, a: u16x8, b: u16x8) -> mask16x8 { - unsafe { vreinterpretq_s16_u16(vcgeq_u16(a.into(), b.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : u16x8 < Neon > , b : u16x8 < Neon >) -> mask16x8 < Neon > { vreinterpretq_s16_u16 (vcgeq_u16 (a . into () , b . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_gt_u16x8(self, a: u16x8, b: u16x8) -> mask16x8 { - unsafe { vreinterpretq_s16_u16(vcgtq_u16(a.into(), b.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : u16x8 < Neon > , b : u16x8 < Neon >) -> mask16x8 < Neon > { vreinterpretq_s16_u16 (vcgtq_u16 (a . into () , b . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn zip_low_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - let x = a.into(); - let y = b.into(); - unsafe { vzip1q_u16(x, y).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : u16x8 < Neon > , b : u16x8 < Neon >) -> u16x8 < Neon > { let x = a . into () ; let y = b . into () ; vzip1q_u16 (x , y) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn zip_high_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - let x = a.into(); - let y = b.into(); - unsafe { vzip2q_u16(x, y).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : u16x8 < Neon > , b : u16x8 < Neon >) -> u16x8 < Neon > { let x = a . into () ; let y = b . into () ; vzip2q_u16 (x , y) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn unzip_low_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - let x = a.into(); - let y = b.into(); - unsafe { vuzp1q_u16(x, y).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : u16x8 < Neon > , b : u16x8 < Neon >) -> u16x8 < Neon > { let x = a . into () ; let y = b . into () ; vuzp1q_u16 (x , y) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn unzip_high_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - let x = a.into(); - let y = b.into(); - unsafe { vuzp2q_u16(x, y).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : u16x8 < Neon > , b : u16x8 < Neon >) -> u16x8 < Neon > { let x = a . into () ; let y = b . into () ; vuzp2q_u16 (x , y) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn interleave_u16x8(self, a: u16x8, b: u16x8) -> (u16x8, u16x8) { @@ -1238,15 +1333,18 @@ impl Simd for Neon { } #[inline(always)] fn select_u16x8(self, a: mask16x8, b: u16x8, c: u16x8) -> u16x8 { - unsafe { vbslq_u16(vreinterpretq_u16_s16(a.into()), b.into(), c.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : mask16x8 < Neon > , b : u16x8 < Neon > , c : u16x8 < Neon >) -> u16x8 < Neon > { vbslq_u16 (vreinterpretq_u16_s16 (a . into ()) , b . into () , c . into ()) . simd_into (token) } } + kernel(self, a, b, c) } #[inline(always)] fn min_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - unsafe { vminq_u16(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : u16x8 < Neon > , b : u16x8 < Neon >) -> u16x8 < Neon > { vminq_u16 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn max_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - unsafe { vmaxq_u16(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : u16x8 < Neon > , b : u16x8 < Neon >) -> u16x8 < Neon > { vmaxq_u16 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn combine_u16x8(self, a: u16x8, b: u16x8) -> u16x16 { @@ -1257,18 +1355,18 @@ impl Simd for Neon { } #[inline(always)] fn reinterpret_u8_u16x8(self, a: u16x8) -> u8x16 { - unsafe { vreinterpretq_u8_u16(a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : u16x8 < Neon >) -> u8x16 < Neon > { vreinterpretq_u8_u16 (a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn reinterpret_u32_u16x8(self, a: u16x8) -> u32x4 { - unsafe { vreinterpretq_u32_u16(a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : u16x8 < Neon >) -> u32x4 < Neon > { vreinterpretq_u32_u16 (a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn splat_mask16x8(self, val: bool) -> mask16x8 { - unsafe { - let val: i16 = if val { !0 } else { 0 }; - vdupq_n_s16(val).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , val : bool) -> mask16x8 < Neon > { let val : i16 = if val { ! 0 } else { 0 } ; vdupq_n_s16 (val) . simd_into (token) } } + kernel(self, val) } #[inline(always)] fn load_array_mask16x8(self, val: [i16; 8usize]) -> mask16x8 { @@ -1300,19 +1398,23 @@ impl Simd for Neon { } #[inline(always)] fn and_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x8 { - unsafe { vandq_s16(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : mask16x8 < Neon > , b : mask16x8 < Neon >) -> mask16x8 < Neon > { vandq_s16 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn or_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x8 { - unsafe { vorrq_s16(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : mask16x8 < Neon > , b : mask16x8 < Neon >) -> mask16x8 < Neon > { vorrq_s16 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn xor_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x8 { - unsafe { veorq_s16(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : mask16x8 < Neon > , b : mask16x8 < Neon >) -> mask16x8 < Neon > { veorq_s16 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn not_mask16x8(self, a: mask16x8) -> mask16x8 { - unsafe { vmvnq_s16(a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : mask16x8 < Neon >) -> mask16x8 < Neon > { vmvnq_s16 (a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn select_mask16x8( @@ -1321,27 +1423,33 @@ impl Simd for Neon { b: mask16x8, c: mask16x8, ) -> mask16x8 { - unsafe { vbslq_s16(vreinterpretq_u16_s16(a.into()), b.into(), c.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : mask16x8 < Neon > , b : mask16x8 < Neon > , c : mask16x8 < Neon >) -> mask16x8 < Neon > { vbslq_s16 (vreinterpretq_u16_s16 (a . into ()) , b . into () , c . into ()) . simd_into (token) } } + kernel(self, a, b, c) } #[inline(always)] fn simd_eq_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x8 { - unsafe { vreinterpretq_s16_u16(vceqq_s16(a.into(), b.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : mask16x8 < Neon > , b : mask16x8 < Neon >) -> mask16x8 < Neon > { vreinterpretq_s16_u16 (vceqq_s16 (a . into () , b . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn any_true_mask16x8(self, a: mask16x8) -> bool { - unsafe { vmaxvq_u32(vreinterpretq_u32_s16(a.into())) != 0 } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : mask16x8 < Neon >) -> bool { vmaxvq_u32 (vreinterpretq_u32_s16 (a . into ())) != 0 } } + kernel(self, a) } #[inline(always)] fn all_true_mask16x8(self, a: mask16x8) -> bool { - unsafe { vminvq_u32(vreinterpretq_u32_s16(a.into())) == 0xffffffff } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : mask16x8 < Neon >) -> bool { vminvq_u32 (vreinterpretq_u32_s16 (a . into ())) == 0xffffffff } } + kernel(self, a) } #[inline(always)] fn any_false_mask16x8(self, a: mask16x8) -> bool { - unsafe { vminvq_u32(vreinterpretq_u32_s16(a.into())) != 0xffffffff } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : mask16x8 < Neon >) -> bool { vminvq_u32 (vreinterpretq_u32_s16 (a . into ())) != 0xffffffff } } + kernel(self, a) } #[inline(always)] fn all_false_mask16x8(self, a: mask16x8) -> bool { - unsafe { vmaxvq_u32(vreinterpretq_u32_s16(a.into())) == 0 } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : mask16x8 < Neon >) -> bool { vmaxvq_u32 (vreinterpretq_u32_s16 (a . into ())) == 0 } } + kernel(self, a) } #[inline(always)] fn combine_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x16 { @@ -1352,7 +1460,8 @@ impl Simd for Neon { } #[inline(always)] fn splat_i32x4(self, val: i32) -> i32x4 { - unsafe { vdupq_n_s32(val).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , val : i32) -> i32x4 < Neon > { vdupq_n_s32 (val) . simd_into (token) } } + kernel(self, val) } #[inline(always)] fn load_array_i32x4(self, val: [i32; 4usize]) -> i32x4 { @@ -1431,91 +1540,103 @@ impl Simd for Neon { } #[inline(always)] fn add_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - unsafe { vaddq_s32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : i32x4 < Neon > , b : i32x4 < Neon >) -> i32x4 < Neon > { vaddq_s32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn sub_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - unsafe { vsubq_s32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : i32x4 < Neon > , b : i32x4 < Neon >) -> i32x4 < Neon > { vsubq_s32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn mul_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - unsafe { vmulq_s32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : i32x4 < Neon > , b : i32x4 < Neon >) -> i32x4 < Neon > { vmulq_s32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn and_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - unsafe { vandq_s32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : i32x4 < Neon > , b : i32x4 < Neon >) -> i32x4 < Neon > { vandq_s32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn or_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - unsafe { vorrq_s32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : i32x4 < Neon > , b : i32x4 < Neon >) -> i32x4 < Neon > { vorrq_s32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn xor_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - unsafe { veorq_s32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : i32x4 < Neon > , b : i32x4 < Neon >) -> i32x4 < Neon > { veorq_s32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn not_i32x4(self, a: i32x4) -> i32x4 { - unsafe { vmvnq_s32(a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : i32x4 < Neon >) -> i32x4 < Neon > { vmvnq_s32 (a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn shl_i32x4(self, a: i32x4, shift: u32) -> i32x4 { - unsafe { vshlq_s32(a.into(), vdupq_n_s32(shift.cast_signed())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : i32x4 < Neon > , shift : u32) -> i32x4 < Neon > { vshlq_s32 (a . into () , vdupq_n_s32 (shift . cast_signed ())) . simd_into (token) } } + kernel(self, a, shift) } #[inline(always)] fn shlv_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - unsafe { vshlq_s32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : i32x4 < Neon > , b : i32x4 < Neon >) -> i32x4 < Neon > { vshlq_s32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn shr_i32x4(self, a: i32x4, shift: u32) -> i32x4 { - unsafe { vshlq_s32(a.into(), vdupq_n_s32(-shift.cast_signed())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : i32x4 < Neon > , shift : u32) -> i32x4 < Neon > { vshlq_s32 (a . into () , vdupq_n_s32 (- shift . cast_signed ())) . simd_into (token) } } + kernel(self, a, shift) } #[inline(always)] fn shrv_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - unsafe { vshlq_s32(a.into(), vnegq_s32(b.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : i32x4 < Neon > , b : i32x4 < Neon >) -> i32x4 < Neon > { vshlq_s32 (a . into () , vnegq_s32 (b . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_eq_i32x4(self, a: i32x4, b: i32x4) -> mask32x4 { - unsafe { vreinterpretq_s32_u32(vceqq_s32(a.into(), b.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : i32x4 < Neon > , b : i32x4 < Neon >) -> mask32x4 < Neon > { vreinterpretq_s32_u32 (vceqq_s32 (a . into () , b . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_lt_i32x4(self, a: i32x4, b: i32x4) -> mask32x4 { - unsafe { vreinterpretq_s32_u32(vcltq_s32(a.into(), b.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : i32x4 < Neon > , b : i32x4 < Neon >) -> mask32x4 < Neon > { vreinterpretq_s32_u32 (vcltq_s32 (a . into () , b . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_le_i32x4(self, a: i32x4, b: i32x4) -> mask32x4 { - unsafe { vreinterpretq_s32_u32(vcleq_s32(a.into(), b.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : i32x4 < Neon > , b : i32x4 < Neon >) -> mask32x4 < Neon > { vreinterpretq_s32_u32 (vcleq_s32 (a . into () , b . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_ge_i32x4(self, a: i32x4, b: i32x4) -> mask32x4 { - unsafe { vreinterpretq_s32_u32(vcgeq_s32(a.into(), b.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : i32x4 < Neon > , b : i32x4 < Neon >) -> mask32x4 < Neon > { vreinterpretq_s32_u32 (vcgeq_s32 (a . into () , b . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_gt_i32x4(self, a: i32x4, b: i32x4) -> mask32x4 { - unsafe { vreinterpretq_s32_u32(vcgtq_s32(a.into(), b.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : i32x4 < Neon > , b : i32x4 < Neon >) -> mask32x4 < Neon > { vreinterpretq_s32_u32 (vcgtq_s32 (a . into () , b . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn zip_low_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - let x = a.into(); - let y = b.into(); - unsafe { vzip1q_s32(x, y).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : i32x4 < Neon > , b : i32x4 < Neon >) -> i32x4 < Neon > { let x = a . into () ; let y = b . into () ; vzip1q_s32 (x , y) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn zip_high_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - let x = a.into(); - let y = b.into(); - unsafe { vzip2q_s32(x, y).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : i32x4 < Neon > , b : i32x4 < Neon >) -> i32x4 < Neon > { let x = a . into () ; let y = b . into () ; vzip2q_s32 (x , y) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn unzip_low_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - let x = a.into(); - let y = b.into(); - unsafe { vuzp1q_s32(x, y).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : i32x4 < Neon > , b : i32x4 < Neon >) -> i32x4 < Neon > { let x = a . into () ; let y = b . into () ; vuzp1q_s32 (x , y) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn unzip_high_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - let x = a.into(); - let y = b.into(); - unsafe { vuzp2q_s32(x, y).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : i32x4 < Neon > , b : i32x4 < Neon >) -> i32x4 < Neon > { let x = a . into () ; let y = b . into () ; vuzp2q_s32 (x , y) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn interleave_i32x4(self, a: i32x4, b: i32x4) -> (i32x4, i32x4) { @@ -1527,15 +1648,18 @@ impl Simd for Neon { } #[inline(always)] fn select_i32x4(self, a: mask32x4, b: i32x4, c: i32x4) -> i32x4 { - unsafe { vbslq_s32(vreinterpretq_u32_s32(a.into()), b.into(), c.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : mask32x4 < Neon > , b : i32x4 < Neon > , c : i32x4 < Neon >) -> i32x4 < Neon > { vbslq_s32 (vreinterpretq_u32_s32 (a . into ()) , b . into () , c . into ()) . simd_into (token) } } + kernel(self, a, b, c) } #[inline(always)] fn min_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - unsafe { vminq_s32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : i32x4 < Neon > , b : i32x4 < Neon >) -> i32x4 < Neon > { vminq_s32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn max_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - unsafe { vmaxq_s32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : i32x4 < Neon > , b : i32x4 < Neon >) -> i32x4 < Neon > { vmaxq_s32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn combine_i32x4(self, a: i32x4, b: i32x4) -> i32x8 { @@ -1546,23 +1670,28 @@ impl Simd for Neon { } #[inline(always)] fn neg_i32x4(self, a: i32x4) -> i32x4 { - unsafe { vnegq_s32(a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : i32x4 < Neon >) -> i32x4 < Neon > { vnegq_s32 (a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn reinterpret_u8_i32x4(self, a: i32x4) -> u8x16 { - unsafe { vreinterpretq_u8_s32(a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : i32x4 < Neon >) -> u8x16 < Neon > { vreinterpretq_u8_s32 (a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn reinterpret_u32_i32x4(self, a: i32x4) -> u32x4 { - unsafe { vreinterpretq_u32_s32(a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : i32x4 < Neon >) -> u32x4 < Neon > { vreinterpretq_u32_s32 (a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn cvt_f32_i32x4(self, a: i32x4) -> f32x4 { - unsafe { vcvtq_f32_s32(a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : i32x4 < Neon >) -> f32x4 < Neon > { vcvtq_f32_s32 (a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn splat_u32x4(self, val: u32) -> u32x4 { - unsafe { vdupq_n_u32(val).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , val : u32) -> u32x4 < Neon > { vdupq_n_u32 (val) . simd_into (token) } } + kernel(self, val) } #[inline(always)] fn load_array_u32x4(self, val: [u32; 4usize]) -> u32x4 { @@ -1641,91 +1770,103 @@ impl Simd for Neon { } #[inline(always)] fn add_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - unsafe { vaddq_u32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : u32x4 < Neon > , b : u32x4 < Neon >) -> u32x4 < Neon > { vaddq_u32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn sub_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - unsafe { vsubq_u32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : u32x4 < Neon > , b : u32x4 < Neon >) -> u32x4 < Neon > { vsubq_u32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn mul_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - unsafe { vmulq_u32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : u32x4 < Neon > , b : u32x4 < Neon >) -> u32x4 < Neon > { vmulq_u32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn and_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - unsafe { vandq_u32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : u32x4 < Neon > , b : u32x4 < Neon >) -> u32x4 < Neon > { vandq_u32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn or_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - unsafe { vorrq_u32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : u32x4 < Neon > , b : u32x4 < Neon >) -> u32x4 < Neon > { vorrq_u32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn xor_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - unsafe { veorq_u32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : u32x4 < Neon > , b : u32x4 < Neon >) -> u32x4 < Neon > { veorq_u32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn not_u32x4(self, a: u32x4) -> u32x4 { - unsafe { vmvnq_u32(a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : u32x4 < Neon >) -> u32x4 < Neon > { vmvnq_u32 (a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn shl_u32x4(self, a: u32x4, shift: u32) -> u32x4 { - unsafe { vshlq_u32(a.into(), vdupq_n_s32(shift.cast_signed())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : u32x4 < Neon > , shift : u32) -> u32x4 < Neon > { vshlq_u32 (a . into () , vdupq_n_s32 (shift . cast_signed ())) . simd_into (token) } } + kernel(self, a, shift) } #[inline(always)] fn shlv_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - unsafe { vshlq_u32(a.into(), vreinterpretq_s32_u32(b.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : u32x4 < Neon > , b : u32x4 < Neon >) -> u32x4 < Neon > { vshlq_u32 (a . into () , vreinterpretq_s32_u32 (b . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn shr_u32x4(self, a: u32x4, shift: u32) -> u32x4 { - unsafe { vshlq_u32(a.into(), vdupq_n_s32(-shift.cast_signed())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : u32x4 < Neon > , shift : u32) -> u32x4 < Neon > { vshlq_u32 (a . into () , vdupq_n_s32 (- shift . cast_signed ())) . simd_into (token) } } + kernel(self, a, shift) } #[inline(always)] fn shrv_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - unsafe { vshlq_u32(a.into(), vnegq_s32(vreinterpretq_s32_u32(b.into()))).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : u32x4 < Neon > , b : u32x4 < Neon >) -> u32x4 < Neon > { vshlq_u32 (a . into () , vnegq_s32 (vreinterpretq_s32_u32 (b . into ()))) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_eq_u32x4(self, a: u32x4, b: u32x4) -> mask32x4 { - unsafe { vreinterpretq_s32_u32(vceqq_u32(a.into(), b.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : u32x4 < Neon > , b : u32x4 < Neon >) -> mask32x4 < Neon > { vreinterpretq_s32_u32 (vceqq_u32 (a . into () , b . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_lt_u32x4(self, a: u32x4, b: u32x4) -> mask32x4 { - unsafe { vreinterpretq_s32_u32(vcltq_u32(a.into(), b.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : u32x4 < Neon > , b : u32x4 < Neon >) -> mask32x4 < Neon > { vreinterpretq_s32_u32 (vcltq_u32 (a . into () , b . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_le_u32x4(self, a: u32x4, b: u32x4) -> mask32x4 { - unsafe { vreinterpretq_s32_u32(vcleq_u32(a.into(), b.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : u32x4 < Neon > , b : u32x4 < Neon >) -> mask32x4 < Neon > { vreinterpretq_s32_u32 (vcleq_u32 (a . into () , b . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_ge_u32x4(self, a: u32x4, b: u32x4) -> mask32x4 { - unsafe { vreinterpretq_s32_u32(vcgeq_u32(a.into(), b.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : u32x4 < Neon > , b : u32x4 < Neon >) -> mask32x4 < Neon > { vreinterpretq_s32_u32 (vcgeq_u32 (a . into () , b . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_gt_u32x4(self, a: u32x4, b: u32x4) -> mask32x4 { - unsafe { vreinterpretq_s32_u32(vcgtq_u32(a.into(), b.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : u32x4 < Neon > , b : u32x4 < Neon >) -> mask32x4 < Neon > { vreinterpretq_s32_u32 (vcgtq_u32 (a . into () , b . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn zip_low_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - let x = a.into(); - let y = b.into(); - unsafe { vzip1q_u32(x, y).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : u32x4 < Neon > , b : u32x4 < Neon >) -> u32x4 < Neon > { let x = a . into () ; let y = b . into () ; vzip1q_u32 (x , y) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn zip_high_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - let x = a.into(); - let y = b.into(); - unsafe { vzip2q_u32(x, y).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : u32x4 < Neon > , b : u32x4 < Neon >) -> u32x4 < Neon > { let x = a . into () ; let y = b . into () ; vzip2q_u32 (x , y) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn unzip_low_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - let x = a.into(); - let y = b.into(); - unsafe { vuzp1q_u32(x, y).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : u32x4 < Neon > , b : u32x4 < Neon >) -> u32x4 < Neon > { let x = a . into () ; let y = b . into () ; vuzp1q_u32 (x , y) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn unzip_high_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - let x = a.into(); - let y = b.into(); - unsafe { vuzp2q_u32(x, y).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : u32x4 < Neon > , b : u32x4 < Neon >) -> u32x4 < Neon > { let x = a . into () ; let y = b . into () ; vuzp2q_u32 (x , y) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn interleave_u32x4(self, a: u32x4, b: u32x4) -> (u32x4, u32x4) { @@ -1737,15 +1878,18 @@ impl Simd for Neon { } #[inline(always)] fn select_u32x4(self, a: mask32x4, b: u32x4, c: u32x4) -> u32x4 { - unsafe { vbslq_u32(vreinterpretq_u32_s32(a.into()), b.into(), c.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : mask32x4 < Neon > , b : u32x4 < Neon > , c : u32x4 < Neon >) -> u32x4 < Neon > { vbslq_u32 (vreinterpretq_u32_s32 (a . into ()) , b . into () , c . into ()) . simd_into (token) } } + kernel(self, a, b, c) } #[inline(always)] fn min_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - unsafe { vminq_u32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : u32x4 < Neon > , b : u32x4 < Neon >) -> u32x4 < Neon > { vminq_u32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn max_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - unsafe { vmaxq_u32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : u32x4 < Neon > , b : u32x4 < Neon >) -> u32x4 < Neon > { vmaxq_u32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn combine_u32x4(self, a: u32x4, b: u32x4) -> u32x8 { @@ -1756,18 +1900,18 @@ impl Simd for Neon { } #[inline(always)] fn reinterpret_u8_u32x4(self, a: u32x4) -> u8x16 { - unsafe { vreinterpretq_u8_u32(a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : u32x4 < Neon >) -> u8x16 < Neon > { vreinterpretq_u8_u32 (a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn cvt_f32_u32x4(self, a: u32x4) -> f32x4 { - unsafe { vcvtq_f32_u32(a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : u32x4 < Neon >) -> f32x4 < Neon > { vcvtq_f32_u32 (a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn splat_mask32x4(self, val: bool) -> mask32x4 { - unsafe { - let val: i32 = if val { !0 } else { 0 }; - vdupq_n_s32(val).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , val : bool) -> mask32x4 < Neon > { let val : i32 = if val { ! 0 } else { 0 } ; vdupq_n_s32 (val) . simd_into (token) } } + kernel(self, val) } #[inline(always)] fn load_array_mask32x4(self, val: [i32; 4usize]) -> mask32x4 { @@ -1799,19 +1943,23 @@ impl Simd for Neon { } #[inline(always)] fn and_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x4 { - unsafe { vandq_s32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : mask32x4 < Neon > , b : mask32x4 < Neon >) -> mask32x4 < Neon > { vandq_s32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn or_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x4 { - unsafe { vorrq_s32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : mask32x4 < Neon > , b : mask32x4 < Neon >) -> mask32x4 < Neon > { vorrq_s32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn xor_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x4 { - unsafe { veorq_s32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : mask32x4 < Neon > , b : mask32x4 < Neon >) -> mask32x4 < Neon > { veorq_s32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn not_mask32x4(self, a: mask32x4) -> mask32x4 { - unsafe { vmvnq_s32(a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : mask32x4 < Neon >) -> mask32x4 < Neon > { vmvnq_s32 (a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn select_mask32x4( @@ -1820,27 +1968,33 @@ impl Simd for Neon { b: mask32x4, c: mask32x4, ) -> mask32x4 { - unsafe { vbslq_s32(vreinterpretq_u32_s32(a.into()), b.into(), c.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : mask32x4 < Neon > , b : mask32x4 < Neon > , c : mask32x4 < Neon >) -> mask32x4 < Neon > { vbslq_s32 (vreinterpretq_u32_s32 (a . into ()) , b . into () , c . into ()) . simd_into (token) } } + kernel(self, a, b, c) } #[inline(always)] fn simd_eq_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x4 { - unsafe { vreinterpretq_s32_u32(vceqq_s32(a.into(), b.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : mask32x4 < Neon > , b : mask32x4 < Neon >) -> mask32x4 < Neon > { vreinterpretq_s32_u32 (vceqq_s32 (a . into () , b . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn any_true_mask32x4(self, a: mask32x4) -> bool { - unsafe { vmaxvq_u32(vreinterpretq_u32_s32(a.into())) != 0 } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : mask32x4 < Neon >) -> bool { vmaxvq_u32 (vreinterpretq_u32_s32 (a . into ())) != 0 } } + kernel(self, a) } #[inline(always)] fn all_true_mask32x4(self, a: mask32x4) -> bool { - unsafe { vminvq_u32(vreinterpretq_u32_s32(a.into())) == 0xffffffff } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : mask32x4 < Neon >) -> bool { vminvq_u32 (vreinterpretq_u32_s32 (a . into ())) == 0xffffffff } } + kernel(self, a) } #[inline(always)] fn any_false_mask32x4(self, a: mask32x4) -> bool { - unsafe { vminvq_u32(vreinterpretq_u32_s32(a.into())) != 0xffffffff } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : mask32x4 < Neon >) -> bool { vminvq_u32 (vreinterpretq_u32_s32 (a . into ())) != 0xffffffff } } + kernel(self, a) } #[inline(always)] fn all_false_mask32x4(self, a: mask32x4) -> bool { - unsafe { vmaxvq_u32(vreinterpretq_u32_s32(a.into())) == 0 } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : mask32x4 < Neon >) -> bool { vmaxvq_u32 (vreinterpretq_u32_s32 (a . into ())) == 0 } } + kernel(self, a) } #[inline(always)] fn combine_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x8 { @@ -1851,7 +2005,8 @@ impl Simd for Neon { } #[inline(always)] fn splat_f64x2(self, val: f64) -> f64x2 { - unsafe { vdupq_n_f64(val).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , val : f64) -> f64x2 < Neon > { vdupq_n_f64 (val) . simd_into (token) } } + kernel(self, val) } #[inline(always)] fn load_array_f64x2(self, val: [f64; 2usize]) -> f64x2 { @@ -1930,86 +2085,93 @@ impl Simd for Neon { } #[inline(always)] fn abs_f64x2(self, a: f64x2) -> f64x2 { - unsafe { vabsq_f64(a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : f64x2 < Neon >) -> f64x2 < Neon > { vabsq_f64 (a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn neg_f64x2(self, a: f64x2) -> f64x2 { - unsafe { vnegq_f64(a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : f64x2 < Neon >) -> f64x2 < Neon > { vnegq_f64 (a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn sqrt_f64x2(self, a: f64x2) -> f64x2 { - unsafe { vsqrtq_f64(a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : f64x2 < Neon >) -> f64x2 < Neon > { vsqrtq_f64 (a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn approximate_recip_f64x2(self, a: f64x2) -> f64x2 { - unsafe { vrecpeq_f64(a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : f64x2 < Neon >) -> f64x2 < Neon > { vrecpeq_f64 (a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn add_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - unsafe { vaddq_f64(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : f64x2 < Neon > , b : f64x2 < Neon >) -> f64x2 < Neon > { vaddq_f64 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn sub_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - unsafe { vsubq_f64(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : f64x2 < Neon > , b : f64x2 < Neon >) -> f64x2 < Neon > { vsubq_f64 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn mul_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - unsafe { vmulq_f64(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : f64x2 < Neon > , b : f64x2 < Neon >) -> f64x2 < Neon > { vmulq_f64 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn div_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - unsafe { vdivq_f64(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : f64x2 < Neon > , b : f64x2 < Neon >) -> f64x2 < Neon > { vdivq_f64 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn copysign_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - unsafe { - let sign_mask = vdupq_n_u64(1 << 63); - vbslq_f64(sign_mask, b.into(), a.into()).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : f64x2 < Neon > , b : f64x2 < Neon >) -> f64x2 < Neon > { let sign_mask = vdupq_n_u64 (1 << 63) ; vbslq_f64 (sign_mask , b . into () , a . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_eq_f64x2(self, a: f64x2, b: f64x2) -> mask64x2 { - unsafe { vreinterpretq_s64_u64(vceqq_f64(a.into(), b.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : f64x2 < Neon > , b : f64x2 < Neon >) -> mask64x2 < Neon > { vreinterpretq_s64_u64 (vceqq_f64 (a . into () , b . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_lt_f64x2(self, a: f64x2, b: f64x2) -> mask64x2 { - unsafe { vreinterpretq_s64_u64(vcltq_f64(a.into(), b.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : f64x2 < Neon > , b : f64x2 < Neon >) -> mask64x2 < Neon > { vreinterpretq_s64_u64 (vcltq_f64 (a . into () , b . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_le_f64x2(self, a: f64x2, b: f64x2) -> mask64x2 { - unsafe { vreinterpretq_s64_u64(vcleq_f64(a.into(), b.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : f64x2 < Neon > , b : f64x2 < Neon >) -> mask64x2 < Neon > { vreinterpretq_s64_u64 (vcleq_f64 (a . into () , b . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_ge_f64x2(self, a: f64x2, b: f64x2) -> mask64x2 { - unsafe { vreinterpretq_s64_u64(vcgeq_f64(a.into(), b.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : f64x2 < Neon > , b : f64x2 < Neon >) -> mask64x2 < Neon > { vreinterpretq_s64_u64 (vcgeq_f64 (a . into () , b . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_gt_f64x2(self, a: f64x2, b: f64x2) -> mask64x2 { - unsafe { vreinterpretq_s64_u64(vcgtq_f64(a.into(), b.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : f64x2 < Neon > , b : f64x2 < Neon >) -> mask64x2 < Neon > { vreinterpretq_s64_u64 (vcgtq_f64 (a . into () , b . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn zip_low_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - let x = a.into(); - let y = b.into(); - unsafe { vzip1q_f64(x, y).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : f64x2 < Neon > , b : f64x2 < Neon >) -> f64x2 < Neon > { let x = a . into () ; let y = b . into () ; vzip1q_f64 (x , y) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn zip_high_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - let x = a.into(); - let y = b.into(); - unsafe { vzip2q_f64(x, y).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : f64x2 < Neon > , b : f64x2 < Neon >) -> f64x2 < Neon > { let x = a . into () ; let y = b . into () ; vzip2q_f64 (x , y) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn unzip_low_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - let x = a.into(); - let y = b.into(); - unsafe { vuzp1q_f64(x, y).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : f64x2 < Neon > , b : f64x2 < Neon >) -> f64x2 < Neon > { let x = a . into () ; let y = b . into () ; vuzp1q_f64 (x , y) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn unzip_high_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - let x = a.into(); - let y = b.into(); - unsafe { vuzp2q_f64(x, y).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : f64x2 < Neon > , b : f64x2 < Neon >) -> f64x2 < Neon > { let x = a . into () ; let y = b . into () ; vuzp2q_f64 (x , y) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn interleave_f64x2(self, a: f64x2, b: f64x2) -> (f64x2, f64x2) { @@ -2021,55 +2183,63 @@ impl Simd for Neon { } #[inline(always)] fn max_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - unsafe { vmaxq_f64(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : f64x2 < Neon > , b : f64x2 < Neon >) -> f64x2 < Neon > { vmaxq_f64 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn min_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - unsafe { vminq_f64(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : f64x2 < Neon > , b : f64x2 < Neon >) -> f64x2 < Neon > { vminq_f64 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn max_precise_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - unsafe { vmaxnmq_f64(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : f64x2 < Neon > , b : f64x2 < Neon >) -> f64x2 < Neon > { vmaxnmq_f64 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn min_precise_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - unsafe { vminnmq_f64(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : f64x2 < Neon > , b : f64x2 < Neon >) -> f64x2 < Neon > { vminnmq_f64 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn mul_add_f64x2(self, a: f64x2, b: f64x2, c: f64x2) -> f64x2 { - unsafe { vfmaq_f64(c.into(), b.into(), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : f64x2 < Neon > , b : f64x2 < Neon > , c : f64x2 < Neon >) -> f64x2 < Neon > { vfmaq_f64 (c . into () , b . into () , a . into ()) . simd_into (token) } } + kernel(self, a, b, c) } #[inline(always)] fn mul_sub_f64x2(self, a: f64x2, b: f64x2, c: f64x2) -> f64x2 { - unsafe { vnegq_f64(vfmsq_f64(c.into(), b.into(), a.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : f64x2 < Neon > , b : f64x2 < Neon > , c : f64x2 < Neon >) -> f64x2 < Neon > { vnegq_f64 (vfmsq_f64 (c . into () , b . into () , a . into ())) . simd_into (token) } } + kernel(self, a, b, c) } #[inline(always)] fn floor_f64x2(self, a: f64x2) -> f64x2 { - unsafe { vrndmq_f64(a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : f64x2 < Neon >) -> f64x2 < Neon > { vrndmq_f64 (a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn ceil_f64x2(self, a: f64x2) -> f64x2 { - unsafe { vrndpq_f64(a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : f64x2 < Neon >) -> f64x2 < Neon > { vrndpq_f64 (a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn round_ties_even_f64x2(self, a: f64x2) -> f64x2 { - unsafe { vrndnq_f64(a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : f64x2 < Neon >) -> f64x2 < Neon > { vrndnq_f64 (a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn fract_f64x2(self, a: f64x2) -> f64x2 { - unsafe { - let c1 = vcvtq_s64_f64(a.into()); - let c2 = vcvtq_f64_s64(c1); - vsubq_f64(a.into(), c2).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : f64x2 < Neon >) -> f64x2 < Neon > { let c1 = vcvtq_s64_f64 (a . into ()) ; let c2 = vcvtq_f64_s64 (c1) ; vsubq_f64 (a . into () , c2) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn trunc_f64x2(self, a: f64x2) -> f64x2 { - unsafe { vrndq_f64(a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : f64x2 < Neon >) -> f64x2 < Neon > { vrndq_f64 (a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn select_f64x2(self, a: mask64x2, b: f64x2, c: f64x2) -> f64x2 { - unsafe { vbslq_f64(vreinterpretq_u64_s64(a.into()), b.into(), c.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : mask64x2 < Neon > , b : f64x2 < Neon > , c : f64x2 < Neon >) -> f64x2 < Neon > { vbslq_f64 (vreinterpretq_u64_s64 (a . into ()) , b . into () , c . into ()) . simd_into (token) } } + kernel(self, a, b, c) } #[inline(always)] fn combine_f64x2(self, a: f64x2, b: f64x2) -> f64x4 { @@ -2080,14 +2250,13 @@ impl Simd for Neon { } #[inline(always)] fn reinterpret_f32_f64x2(self, a: f64x2) -> f32x4 { - unsafe { vreinterpretq_f32_f64(a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : f64x2 < Neon >) -> f32x4 < Neon > { vreinterpretq_f32_f64 (a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn splat_mask64x2(self, val: bool) -> mask64x2 { - unsafe { - let val: i64 = if val { !0 } else { 0 }; - vdupq_n_s64(val).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , val : bool) -> mask64x2 < Neon > { let val : i64 = if val { ! 0 } else { 0 } ; vdupq_n_s64 (val) . simd_into (token) } } + kernel(self, val) } #[inline(always)] fn load_array_mask64x2(self, val: [i64; 2usize]) -> mask64x2 { @@ -2119,19 +2288,23 @@ impl Simd for Neon { } #[inline(always)] fn and_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { - unsafe { vandq_s64(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : mask64x2 < Neon > , b : mask64x2 < Neon >) -> mask64x2 < Neon > { vandq_s64 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn or_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { - unsafe { vorrq_s64(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : mask64x2 < Neon > , b : mask64x2 < Neon >) -> mask64x2 < Neon > { vorrq_s64 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn xor_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { - unsafe { veorq_s64(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : mask64x2 < Neon > , b : mask64x2 < Neon >) -> mask64x2 < Neon > { veorq_s64 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn not_mask64x2(self, a: mask64x2) -> mask64x2 { - unsafe { vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_s64(a.into()))).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : mask64x2 < Neon >) -> mask64x2 < Neon > { vreinterpretq_s64_s32 (vmvnq_s32 (vreinterpretq_s32_s64 (a . into ()))) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn select_mask64x2( @@ -2140,27 +2313,33 @@ impl Simd for Neon { b: mask64x2, c: mask64x2, ) -> mask64x2 { - unsafe { vbslq_s64(vreinterpretq_u64_s64(a.into()), b.into(), c.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : mask64x2 < Neon > , b : mask64x2 < Neon > , c : mask64x2 < Neon >) -> mask64x2 < Neon > { vbslq_s64 (vreinterpretq_u64_s64 (a . into ()) , b . into () , c . into ()) . simd_into (token) } } + kernel(self, a, b, c) } #[inline(always)] fn simd_eq_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { - unsafe { vreinterpretq_s64_u64(vceqq_s64(a.into(), b.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : mask64x2 < Neon > , b : mask64x2 < Neon >) -> mask64x2 < Neon > { vreinterpretq_s64_u64 (vceqq_s64 (a . into () , b . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn any_true_mask64x2(self, a: mask64x2) -> bool { - unsafe { vmaxvq_u32(vreinterpretq_u32_s64(a.into())) != 0 } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : mask64x2 < Neon >) -> bool { vmaxvq_u32 (vreinterpretq_u32_s64 (a . into ())) != 0 } } + kernel(self, a) } #[inline(always)] fn all_true_mask64x2(self, a: mask64x2) -> bool { - unsafe { vminvq_u32(vreinterpretq_u32_s64(a.into())) == 0xffffffff } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : mask64x2 < Neon >) -> bool { vminvq_u32 (vreinterpretq_u32_s64 (a . into ())) == 0xffffffff } } + kernel(self, a) } #[inline(always)] fn any_false_mask64x2(self, a: mask64x2) -> bool { - unsafe { vminvq_u32(vreinterpretq_u32_s64(a.into())) != 0xffffffff } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : mask64x2 < Neon >) -> bool { vminvq_u32 (vreinterpretq_u32_s64 (a . into ())) != 0xffffffff } } + kernel(self, a) } #[inline(always)] fn all_false_mask64x2(self, a: mask64x2) -> bool { - unsafe { vmaxvq_u32(vreinterpretq_u32_s64(a.into())) == 0 } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : mask64x2 < Neon >) -> bool { vmaxvq_u32 (vreinterpretq_u32_s64 (a . into ())) == 0 } } + kernel(self, a) } #[inline(always)] fn combine_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x4 { @@ -3891,12 +4070,8 @@ impl Simd for Neon { } #[inline(always)] fn narrow_u16x16(self, a: u16x16) -> u8x16 { - unsafe { - let converted: uint16x8x2_t = a.into(); - let low = vmovn_u16(converted.0); - let high = vmovn_u16(converted.1); - vcombine_u8(low, high).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Neon , a : u16x16 < Neon >) -> u8x16 < Neon > { let converted : uint16x8x2_t = a . into () ; let low = vmovn_u16 (converted . 0) ; let high = vmovn_u16 (converted . 1) ; vcombine_u8 (low , high) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn reinterpret_u8_u16x16(self, a: u16x16) -> u8x32 { diff --git a/fearless_simd/src/generated/sse4_2.rs b/fearless_simd/src/generated/sse4_2.rs index 6a0420aee..21fe306b4 100644 --- a/fearless_simd/src/generated/sse4_2.rs +++ b/fearless_simd/src/generated/sse4_2.rs @@ -124,7 +124,8 @@ impl Simd for Sse4_2 { } #[inline(always)] fn splat_f32x4(self, val: f32) -> f32x4 { - unsafe { _mm_set1_ps(val).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , val : f32) -> f32x4 < Sse4_2 > { _mm_set1_ps (val) . simd_into (token) } } + kernel(self, val) } #[inline(always)] fn load_array_f32x4(self, val: [f32; 4usize]) -> f32x4 { @@ -203,78 +204,93 @@ impl Simd for Sse4_2 { } #[inline(always)] fn abs_f32x4(self, a: f32x4) -> f32x4 { - unsafe { _mm_andnot_ps(_mm_set1_ps(-0.0), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : f32x4 < Sse4_2 >) -> f32x4 < Sse4_2 > { _mm_andnot_ps (_mm_set1_ps (- 0.0) , a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn neg_f32x4(self, a: f32x4) -> f32x4 { - unsafe { _mm_xor_ps(a.into(), _mm_set1_ps(-0.0)).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : f32x4 < Sse4_2 >) -> f32x4 < Sse4_2 > { _mm_xor_ps (a . into () , _mm_set1_ps (- 0.0)) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn sqrt_f32x4(self, a: f32x4) -> f32x4 { - unsafe { _mm_sqrt_ps(a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : f32x4 < Sse4_2 >) -> f32x4 < Sse4_2 > { _mm_sqrt_ps (a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn approximate_recip_f32x4(self, a: f32x4) -> f32x4 { - unsafe { _mm_rcp_ps(a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : f32x4 < Sse4_2 >) -> f32x4 < Sse4_2 > { _mm_rcp_ps (a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn add_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - unsafe { _mm_add_ps(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : f32x4 < Sse4_2 > , b : f32x4 < Sse4_2 >) -> f32x4 < Sse4_2 > { _mm_add_ps (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn sub_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - unsafe { _mm_sub_ps(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : f32x4 < Sse4_2 > , b : f32x4 < Sse4_2 >) -> f32x4 < Sse4_2 > { _mm_sub_ps (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn mul_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - unsafe { _mm_mul_ps(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : f32x4 < Sse4_2 > , b : f32x4 < Sse4_2 >) -> f32x4 < Sse4_2 > { _mm_mul_ps (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn div_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - unsafe { _mm_div_ps(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : f32x4 < Sse4_2 > , b : f32x4 < Sse4_2 >) -> f32x4 < Sse4_2 > { _mm_div_ps (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn copysign_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - unsafe { - let mask = _mm_set1_ps(-0.0); - _mm_or_ps(_mm_and_ps(mask, b.into()), _mm_andnot_ps(mask, a.into())).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : f32x4 < Sse4_2 > , b : f32x4 < Sse4_2 >) -> f32x4 < Sse4_2 > { let mask = _mm_set1_ps (- 0.0) ; _mm_or_ps (_mm_and_ps (mask , b . into ()) , _mm_andnot_ps (mask , a . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_eq_f32x4(self, a: f32x4, b: f32x4) -> mask32x4 { - unsafe { _mm_castps_si128(_mm_cmpeq_ps(a.into(), b.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : f32x4 < Sse4_2 > , b : f32x4 < Sse4_2 >) -> mask32x4 < Sse4_2 > { _mm_castps_si128 (_mm_cmpeq_ps (a . into () , b . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_lt_f32x4(self, a: f32x4, b: f32x4) -> mask32x4 { - unsafe { _mm_castps_si128(_mm_cmplt_ps(a.into(), b.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : f32x4 < Sse4_2 > , b : f32x4 < Sse4_2 >) -> mask32x4 < Sse4_2 > { _mm_castps_si128 (_mm_cmplt_ps (a . into () , b . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_le_f32x4(self, a: f32x4, b: f32x4) -> mask32x4 { - unsafe { _mm_castps_si128(_mm_cmple_ps(a.into(), b.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : f32x4 < Sse4_2 > , b : f32x4 < Sse4_2 >) -> mask32x4 < Sse4_2 > { _mm_castps_si128 (_mm_cmple_ps (a . into () , b . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_ge_f32x4(self, a: f32x4, b: f32x4) -> mask32x4 { - unsafe { _mm_castps_si128(_mm_cmpge_ps(a.into(), b.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : f32x4 < Sse4_2 > , b : f32x4 < Sse4_2 >) -> mask32x4 < Sse4_2 > { _mm_castps_si128 (_mm_cmpge_ps (a . into () , b . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_gt_f32x4(self, a: f32x4, b: f32x4) -> mask32x4 { - unsafe { _mm_castps_si128(_mm_cmpgt_ps(a.into(), b.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : f32x4 < Sse4_2 > , b : f32x4 < Sse4_2 >) -> mask32x4 < Sse4_2 > { _mm_castps_si128 (_mm_cmpgt_ps (a . into () , b . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn zip_low_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - unsafe { _mm_unpacklo_ps(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : f32x4 < Sse4_2 > , b : f32x4 < Sse4_2 >) -> f32x4 < Sse4_2 > { _mm_unpacklo_ps (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn zip_high_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - unsafe { _mm_unpackhi_ps(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : f32x4 < Sse4_2 > , b : f32x4 < Sse4_2 >) -> f32x4 < Sse4_2 > { _mm_unpackhi_ps (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn unzip_low_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - unsafe { _mm_shuffle_ps::<0b10_00_10_00>(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : f32x4 < Sse4_2 > , b : f32x4 < Sse4_2 >) -> f32x4 < Sse4_2 > { _mm_shuffle_ps :: < 0b10_00_10_00 > (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn unzip_high_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - unsafe { _mm_shuffle_ps::<0b11_01_11_01>(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : f32x4 < Sse4_2 > , b : f32x4 < Sse4_2 >) -> f32x4 < Sse4_2 > { _mm_shuffle_ps :: < 0b11_01_11_01 > (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn interleave_f32x4(self, a: f32x4, b: f32x4) -> (f32x4, f32x4) { @@ -286,27 +302,23 @@ impl Simd for Sse4_2 { } #[inline(always)] fn max_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - unsafe { _mm_max_ps(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : f32x4 < Sse4_2 > , b : f32x4 < Sse4_2 >) -> f32x4 < Sse4_2 > { _mm_max_ps (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn min_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - unsafe { _mm_min_ps(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : f32x4 < Sse4_2 > , b : f32x4 < Sse4_2 >) -> f32x4 < Sse4_2 > { _mm_min_ps (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn max_precise_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - unsafe { - let intermediate = _mm_max_ps(a.into(), b.into()); - let b_is_nan = _mm_cmpunord_ps(b.into(), b.into()); - _mm_blendv_ps(intermediate, a.into(), b_is_nan).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : f32x4 < Sse4_2 > , b : f32x4 < Sse4_2 >) -> f32x4 < Sse4_2 > { let intermediate = _mm_max_ps (a . into () , b . into ()) ; let b_is_nan = _mm_cmpunord_ps (b . into () , b . into ()) ; _mm_blendv_ps (intermediate , a . into () , b_is_nan) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn min_precise_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - unsafe { - let intermediate = _mm_min_ps(a.into(), b.into()); - let b_is_nan = _mm_cmpunord_ps(b.into(), b.into()); - _mm_blendv_ps(intermediate, a.into(), b_is_nan).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : f32x4 < Sse4_2 > , b : f32x4 < Sse4_2 >) -> f32x4 < Sse4_2 > { let intermediate = _mm_min_ps (a . into () , b . into ()) ; let b_is_nan = _mm_cmpunord_ps (b . into () , b . into ()) ; _mm_blendv_ps (intermediate , a . into () , b_is_nan) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn mul_add_f32x4(self, a: f32x4, b: f32x4, c: f32x4) -> f32x4 { @@ -318,22 +330,18 @@ impl Simd for Sse4_2 { } #[inline(always)] fn floor_f32x4(self, a: f32x4) -> f32x4 { - unsafe { - _mm_round_ps::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : f32x4 < Sse4_2 >) -> f32x4 < Sse4_2 > { _mm_round_ps :: < { _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC } > (a . into () ,) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn ceil_f32x4(self, a: f32x4) -> f32x4 { - unsafe { - _mm_round_ps::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : f32x4 < Sse4_2 >) -> f32x4 < Sse4_2 > { _mm_round_ps :: < { _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC } > (a . into () ,) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn round_ties_even_f32x4(self, a: f32x4) -> f32x4 { - unsafe { - _mm_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a.into()) - .simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : f32x4 < Sse4_2 >) -> f32x4 < Sse4_2 > { _mm_round_ps :: < { _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC } > (a . into () ,) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn fract_f32x4(self, a: f32x4) -> f32x4 { @@ -341,13 +349,13 @@ impl Simd for Sse4_2 { } #[inline(always)] fn trunc_f32x4(self, a: f32x4) -> f32x4 { - unsafe { - _mm_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : f32x4 < Sse4_2 >) -> f32x4 < Sse4_2 > { _mm_round_ps :: < { _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC } > (a . into () ,) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn select_f32x4(self, a: mask32x4, b: f32x4, c: f32x4) -> f32x4 { - unsafe { _mm_blendv_ps(c.into(), b.into(), _mm_castsi128_ps(a.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : mask32x4 < Sse4_2 > , b : f32x4 < Sse4_2 > , c : f32x4 < Sse4_2 >) -> f32x4 < Sse4_2 > { _mm_blendv_ps (c . into () , b . into () , _mm_castsi128_ps (a . into ())) . simd_into (token) } } + kernel(self, a, b, c) } #[inline(always)] fn combine_f32x4(self, a: f32x4, b: f32x4) -> f32x8 { @@ -358,82 +366,48 @@ impl Simd for Sse4_2 { } #[inline(always)] fn reinterpret_f64_f32x4(self, a: f32x4) -> f64x2 { - unsafe { _mm_castps_pd(a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : f32x4 < Sse4_2 >) -> f64x2 < Sse4_2 > { _mm_castps_pd (a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn reinterpret_i32_f32x4(self, a: f32x4) -> i32x4 { - unsafe { _mm_castps_si128(a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : f32x4 < Sse4_2 >) -> i32x4 < Sse4_2 > { _mm_castps_si128 (a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn reinterpret_u8_f32x4(self, a: f32x4) -> u8x16 { - unsafe { _mm_castps_si128(a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : f32x4 < Sse4_2 >) -> u8x16 < Sse4_2 > { _mm_castps_si128 (a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn reinterpret_u32_f32x4(self, a: f32x4) -> u32x4 { - unsafe { _mm_castps_si128(a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : f32x4 < Sse4_2 >) -> u32x4 < Sse4_2 > { _mm_castps_si128 (a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn cvt_u32_f32x4(self, a: f32x4) -> u32x4 { - unsafe { - let mut converted = _mm_cvttps_epi32(a.into()); - let in_range = _mm_cmplt_ps(a.into(), _mm_set1_ps(2147483648.0)); - let all_in_range = _mm_movemask_ps(in_range) == 0b1111; - if !all_in_range { - let excess = _mm_sub_ps(a.into(), _mm_set1_ps(2147483648.0)); - let excess_converted = _mm_cvttps_epi32(_mm_andnot_ps(in_range, excess)); - converted = _mm_add_epi32(converted, excess_converted); - } - converted.simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : f32x4 < Sse4_2 >) -> u32x4 < Sse4_2 > { let mut converted = _mm_cvttps_epi32 (a . into ()) ; let in_range = _mm_cmplt_ps (a . into () , _mm_set1_ps (2147483648.0)) ; let all_in_range = _mm_movemask_ps (in_range) == 0b1111 ; if ! all_in_range { let excess = _mm_sub_ps (a . into () , _mm_set1_ps (2147483648.0)) ; let excess_converted = _mm_cvttps_epi32 (_mm_andnot_ps (in_range , excess)) ; converted = _mm_add_epi32 (converted , excess_converted) ; } converted . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn cvt_u32_precise_f32x4(self, a: f32x4) -> u32x4 { - unsafe { - let a = _mm_max_ps(a.into(), _mm_setzero_ps()); - let mut converted = _mm_cvttps_epi32(a); - let in_range = _mm_cmplt_ps(a, _mm_set1_ps(2147483648.0)); - let all_in_range = _mm_movemask_ps(in_range) == 0b1111; - if !all_in_range { - let exceeds_unsigned_range = - _mm_castps_si128(_mm_cmplt_ps(_mm_set1_ps(4294967040.0), a)); - let excess = _mm_sub_ps(a, _mm_set1_ps(2147483648.0)); - let excess_converted = _mm_cvttps_epi32(_mm_andnot_ps(in_range, excess)); - converted = _mm_add_epi32(converted, excess_converted); - converted = _mm_blendv_epi8( - converted, - _mm_set1_epi32(u32::MAX.cast_signed()), - exceeds_unsigned_range, - ); - } - converted.simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : f32x4 < Sse4_2 >) -> u32x4 < Sse4_2 > { let a = _mm_max_ps (a . into () , _mm_setzero_ps ()) ; let mut converted = _mm_cvttps_epi32 (a) ; let in_range = _mm_cmplt_ps (a , _mm_set1_ps (2147483648.0)) ; let all_in_range = _mm_movemask_ps (in_range) == 0b1111 ; if ! all_in_range { let exceeds_unsigned_range = _mm_castps_si128 (_mm_cmplt_ps (_mm_set1_ps (4294967040.0) , a)) ; let excess = _mm_sub_ps (a , _mm_set1_ps (2147483648.0)) ; let excess_converted = _mm_cvttps_epi32 (_mm_andnot_ps (in_range , excess)) ; converted = _mm_add_epi32 (converted , excess_converted) ; converted = _mm_blendv_epi8 (converted , _mm_set1_epi32 (u32 :: MAX . cast_signed ()) , exceeds_unsigned_range) ; } converted . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn cvt_i32_f32x4(self, a: f32x4) -> i32x4 { - unsafe { _mm_cvttps_epi32(a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : f32x4 < Sse4_2 >) -> i32x4 < Sse4_2 > { _mm_cvttps_epi32 (a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn cvt_i32_precise_f32x4(self, a: f32x4) -> i32x4 { - unsafe { - let a = a.into(); - let mut converted = _mm_cvttps_epi32(a); - let in_range = _mm_cmplt_ps(a, _mm_set1_ps(2147483648.0)); - let all_in_range = _mm_movemask_ps(in_range) == 0b1111; - if !all_in_range { - converted = _mm_blendv_epi8( - _mm_set1_epi32(i32::MAX), - converted, - _mm_castps_si128(in_range), - ); - let is_not_nan = _mm_castps_si128(_mm_cmpord_ps(a, a)); - converted = _mm_and_si128(converted, is_not_nan); - } - converted.simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : f32x4 < Sse4_2 >) -> i32x4 < Sse4_2 > { let a = a . into () ; let mut converted = _mm_cvttps_epi32 (a) ; let in_range = _mm_cmplt_ps (a , _mm_set1_ps (2147483648.0)) ; let all_in_range = _mm_movemask_ps (in_range) == 0b1111 ; if ! all_in_range { converted = _mm_blendv_epi8 (_mm_set1_epi32 (i32 :: MAX) , converted , _mm_castps_si128 (in_range)) ; let is_not_nan = _mm_castps_si128 (_mm_cmpord_ps (a , a)) ; converted = _mm_and_si128 (converted , is_not_nan) ; } converted . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn splat_i8x16(self, val: i8) -> i8x16 { - unsafe { _mm_set1_epi8(val).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , val : i8) -> i8x16 < Sse4_2 > { _mm_set1_epi8 (val) . simd_into (token) } } + kernel(self, val) } #[inline(always)] fn load_array_i8x16(self, val: [i8; 16usize]) -> i8x16 { @@ -512,36 +486,33 @@ impl Simd for Sse4_2 { } #[inline(always)] fn add_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - unsafe { _mm_add_epi8(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : i8x16 < Sse4_2 > , b : i8x16 < Sse4_2 >) -> i8x16 < Sse4_2 > { _mm_add_epi8 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn sub_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - unsafe { _mm_sub_epi8(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : i8x16 < Sse4_2 > , b : i8x16 < Sse4_2 >) -> i8x16 < Sse4_2 > { _mm_sub_epi8 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn mul_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - unsafe { - let dst_even = _mm_mullo_epi16(a.into(), b.into()); - let dst_odd = - _mm_mullo_epi16(_mm_srli_epi16::<8>(a.into()), _mm_srli_epi16::<8>(b.into())); - _mm_or_si128( - _mm_slli_epi16(dst_odd, 8), - _mm_and_si128(dst_even, _mm_set1_epi16(0xFF)), - ) - .simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : i8x16 < Sse4_2 > , b : i8x16 < Sse4_2 >) -> i8x16 < Sse4_2 > { let dst_even = _mm_mullo_epi16 (a . into () , b . into ()) ; let dst_odd = _mm_mullo_epi16 (_mm_srli_epi16 :: < 8 > (a . into ()) , _mm_srli_epi16 :: < 8 > (b . into ())) ; _mm_or_si128 (_mm_slli_epi16 (dst_odd , 8) , _mm_and_si128 (dst_even , _mm_set1_epi16 (0xFF))) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn and_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : i8x16 < Sse4_2 > , b : i8x16 < Sse4_2 >) -> i8x16 < Sse4_2 > { _mm_and_si128 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn or_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : i8x16 < Sse4_2 > , b : i8x16 < Sse4_2 >) -> i8x16 < Sse4_2 > { _mm_or_si128 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn xor_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : i8x16 < Sse4_2 > , b : i8x16 < Sse4_2 >) -> i8x16 < Sse4_2 > { _mm_xor_si128 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn not_i8x16(self, a: i8x16) -> i8x16 { @@ -549,15 +520,8 @@ impl Simd for Sse4_2 { } #[inline(always)] fn shl_i8x16(self, a: i8x16, shift: u32) -> i8x16 { - unsafe { - let val = a.into(); - let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); - let lo_16 = _mm_unpacklo_epi8(val, _mm_cmpgt_epi8(_mm_setzero_si128(), val)); - let hi_16 = _mm_unpackhi_epi8(val, _mm_cmpgt_epi8(_mm_setzero_si128(), val)); - let lo_shifted = _mm_sll_epi16(lo_16, shift_count); - let hi_shifted = _mm_sll_epi16(hi_16, shift_count); - _mm_packs_epi16(lo_shifted, hi_shifted).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : i8x16 < Sse4_2 > , shift : u32) -> i8x16 < Sse4_2 > { let val = a . into () ; let shift_count = _mm_cvtsi32_si128 (shift . cast_signed ()) ; let lo_16 = _mm_unpacklo_epi8 (val , _mm_cmpgt_epi8 (_mm_setzero_si128 () , val)) ; let hi_16 = _mm_unpackhi_epi8 (val , _mm_cmpgt_epi8 (_mm_setzero_si128 () , val)) ; let lo_shifted = _mm_sll_epi16 (lo_16 , shift_count) ; let hi_shifted = _mm_sll_epi16 (hi_16 , shift_count) ; _mm_packs_epi16 (lo_shifted , hi_shifted) . simd_into (token) } } + kernel(self, a, shift) } #[inline(always)] fn shlv_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { @@ -565,15 +529,8 @@ impl Simd for Sse4_2 { } #[inline(always)] fn shr_i8x16(self, a: i8x16, shift: u32) -> i8x16 { - unsafe { - let val = a.into(); - let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); - let lo_16 = _mm_unpacklo_epi8(val, _mm_cmpgt_epi8(_mm_setzero_si128(), val)); - let hi_16 = _mm_unpackhi_epi8(val, _mm_cmpgt_epi8(_mm_setzero_si128(), val)); - let lo_shifted = _mm_sra_epi16(lo_16, shift_count); - let hi_shifted = _mm_sra_epi16(hi_16, shift_count); - _mm_packs_epi16(lo_shifted, hi_shifted).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : i8x16 < Sse4_2 > , shift : u32) -> i8x16 < Sse4_2 > { let val = a . into () ; let shift_count = _mm_cvtsi32_si128 (shift . cast_signed ()) ; let lo_16 = _mm_unpacklo_epi8 (val , _mm_cmpgt_epi8 (_mm_setzero_si128 () , val)) ; let hi_16 = _mm_unpackhi_epi8 (val , _mm_cmpgt_epi8 (_mm_setzero_si128 () , val)) ; let lo_shifted = _mm_sra_epi16 (lo_16 , shift_count) ; let hi_shifted = _mm_sra_epi16 (hi_16 , shift_count) ; _mm_packs_epi16 (lo_shifted , hi_shifted) . simd_into (token) } } + kernel(self, a, shift) } #[inline(always)] fn shrv_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { @@ -581,49 +538,48 @@ impl Simd for Sse4_2 { } #[inline(always)] fn simd_eq_i8x16(self, a: i8x16, b: i8x16) -> mask8x16 { - unsafe { _mm_cmpeq_epi8(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : i8x16 < Sse4_2 > , b : i8x16 < Sse4_2 >) -> mask8x16 < Sse4_2 > { _mm_cmpeq_epi8 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_lt_i8x16(self, a: i8x16, b: i8x16) -> mask8x16 { - unsafe { _mm_cmpgt_epi8(b.into(), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : i8x16 < Sse4_2 > , b : i8x16 < Sse4_2 >) -> mask8x16 < Sse4_2 > { _mm_cmpgt_epi8 (b . into () , a . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_le_i8x16(self, a: i8x16, b: i8x16) -> mask8x16 { - unsafe { _mm_cmpeq_epi8(_mm_min_epi8(a.into(), b.into()), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : i8x16 < Sse4_2 > , b : i8x16 < Sse4_2 >) -> mask8x16 < Sse4_2 > { _mm_cmpeq_epi8 (_mm_min_epi8 (a . into () , b . into ()) , a . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_ge_i8x16(self, a: i8x16, b: i8x16) -> mask8x16 { - unsafe { _mm_cmpeq_epi8(_mm_max_epi8(a.into(), b.into()), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : i8x16 < Sse4_2 > , b : i8x16 < Sse4_2 >) -> mask8x16 < Sse4_2 > { _mm_cmpeq_epi8 (_mm_max_epi8 (a . into () , b . into ()) , a . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_gt_i8x16(self, a: i8x16, b: i8x16) -> mask8x16 { - unsafe { _mm_cmpgt_epi8(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : i8x16 < Sse4_2 > , b : i8x16 < Sse4_2 >) -> mask8x16 < Sse4_2 > { _mm_cmpgt_epi8 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn zip_low_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - unsafe { _mm_unpacklo_epi8(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : i8x16 < Sse4_2 > , b : i8x16 < Sse4_2 >) -> i8x16 < Sse4_2 > { _mm_unpacklo_epi8 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn zip_high_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - unsafe { _mm_unpackhi_epi8(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : i8x16 < Sse4_2 > , b : i8x16 < Sse4_2 >) -> i8x16 < Sse4_2 > { _mm_unpackhi_epi8 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn unzip_low_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - unsafe { - let mask = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); - let t1 = _mm_shuffle_epi8(a.into(), mask); - let t2 = _mm_shuffle_epi8(b.into(), mask); - _mm_unpacklo_epi64(t1, t2).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : i8x16 < Sse4_2 > , b : i8x16 < Sse4_2 >) -> i8x16 < Sse4_2 > { let mask = _mm_setr_epi8 (0 , 2 , 4 , 6 , 8 , 10 , 12 , 14 , 1 , 3 , 5 , 7 , 9 , 11 , 13 , 15) ; let t1 = _mm_shuffle_epi8 (a . into () , mask) ; let t2 = _mm_shuffle_epi8 (b . into () , mask) ; _mm_unpacklo_epi64 (t1 , t2) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn unzip_high_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - unsafe { - let mask = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); - let t1 = _mm_shuffle_epi8(a.into(), mask); - let t2 = _mm_shuffle_epi8(b.into(), mask); - _mm_unpackhi_epi64(t1, t2).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : i8x16 < Sse4_2 > , b : i8x16 < Sse4_2 >) -> i8x16 < Sse4_2 > { let mask = _mm_setr_epi8 (0 , 2 , 4 , 6 , 8 , 10 , 12 , 14 , 1 , 3 , 5 , 7 , 9 , 11 , 13 , 15) ; let t1 = _mm_shuffle_epi8 (a . into () , mask) ; let t2 = _mm_shuffle_epi8 (b . into () , mask) ; _mm_unpackhi_epi64 (t1 , t2) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn interleave_i8x16(self, a: i8x16, b: i8x16) -> (i8x16, i8x16) { @@ -635,15 +591,18 @@ impl Simd for Sse4_2 { } #[inline(always)] fn select_i8x16(self, a: mask8x16, b: i8x16, c: i8x16) -> i8x16 { - unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : mask8x16 < Sse4_2 > , b : i8x16 < Sse4_2 > , c : i8x16 < Sse4_2 >) -> i8x16 < Sse4_2 > { _mm_blendv_epi8 (c . into () , b . into () , a . into ()) . simd_into (token) } } + kernel(self, a, b, c) } #[inline(always)] fn min_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - unsafe { _mm_min_epi8(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : i8x16 < Sse4_2 > , b : i8x16 < Sse4_2 >) -> i8x16 < Sse4_2 > { _mm_min_epi8 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn max_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - unsafe { _mm_max_epi8(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : i8x16 < Sse4_2 > , b : i8x16 < Sse4_2 >) -> i8x16 < Sse4_2 > { _mm_max_epi8 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn combine_i8x16(self, a: i8x16, b: i8x16) -> i8x32 { @@ -654,19 +613,23 @@ impl Simd for Sse4_2 { } #[inline(always)] fn neg_i8x16(self, a: i8x16) -> i8x16 { - unsafe { _mm_sub_epi8(_mm_setzero_si128(), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : i8x16 < Sse4_2 >) -> i8x16 < Sse4_2 > { _mm_sub_epi8 (_mm_setzero_si128 () , a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn reinterpret_u8_i8x16(self, a: i8x16) -> u8x16 { - __m128i::from(a).simd_into(self) + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : i8x16 < Sse4_2 >) -> u8x16 < Sse4_2 > { __m128i :: from (a) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn reinterpret_u32_i8x16(self, a: i8x16) -> u32x4 { - __m128i::from(a).simd_into(self) + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : i8x16 < Sse4_2 >) -> u32x4 < Sse4_2 > { __m128i :: from (a) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn splat_u8x16(self, val: u8) -> u8x16 { - unsafe { _mm_set1_epi8(val.cast_signed()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , val : u8) -> u8x16 < Sse4_2 > { _mm_set1_epi8 (val . cast_signed ()) . simd_into (token) } } + kernel(self, val) } #[inline(always)] fn load_array_u8x16(self, val: [u8; 16usize]) -> u8x16 { @@ -745,36 +708,33 @@ impl Simd for Sse4_2 { } #[inline(always)] fn add_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - unsafe { _mm_add_epi8(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : u8x16 < Sse4_2 > , b : u8x16 < Sse4_2 >) -> u8x16 < Sse4_2 > { _mm_add_epi8 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn sub_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - unsafe { _mm_sub_epi8(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : u8x16 < Sse4_2 > , b : u8x16 < Sse4_2 >) -> u8x16 < Sse4_2 > { _mm_sub_epi8 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn mul_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - unsafe { - let dst_even = _mm_mullo_epi16(a.into(), b.into()); - let dst_odd = - _mm_mullo_epi16(_mm_srli_epi16::<8>(a.into()), _mm_srli_epi16::<8>(b.into())); - _mm_or_si128( - _mm_slli_epi16(dst_odd, 8), - _mm_and_si128(dst_even, _mm_set1_epi16(0xFF)), - ) - .simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : u8x16 < Sse4_2 > , b : u8x16 < Sse4_2 >) -> u8x16 < Sse4_2 > { let dst_even = _mm_mullo_epi16 (a . into () , b . into ()) ; let dst_odd = _mm_mullo_epi16 (_mm_srli_epi16 :: < 8 > (a . into ()) , _mm_srli_epi16 :: < 8 > (b . into ())) ; _mm_or_si128 (_mm_slli_epi16 (dst_odd , 8) , _mm_and_si128 (dst_even , _mm_set1_epi16 (0xFF))) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn and_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : u8x16 < Sse4_2 > , b : u8x16 < Sse4_2 >) -> u8x16 < Sse4_2 > { _mm_and_si128 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn or_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : u8x16 < Sse4_2 > , b : u8x16 < Sse4_2 >) -> u8x16 < Sse4_2 > { _mm_or_si128 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn xor_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : u8x16 < Sse4_2 > , b : u8x16 < Sse4_2 >) -> u8x16 < Sse4_2 > { _mm_xor_si128 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn not_u8x16(self, a: u8x16) -> u8x16 { @@ -782,15 +742,8 @@ impl Simd for Sse4_2 { } #[inline(always)] fn shl_u8x16(self, a: u8x16, shift: u32) -> u8x16 { - unsafe { - let val = a.into(); - let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); - let lo_16 = _mm_unpacklo_epi8(val, _mm_setzero_si128()); - let hi_16 = _mm_unpackhi_epi8(val, _mm_setzero_si128()); - let lo_shifted = _mm_sll_epi16(lo_16, shift_count); - let hi_shifted = _mm_sll_epi16(hi_16, shift_count); - _mm_packus_epi16(lo_shifted, hi_shifted).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : u8x16 < Sse4_2 > , shift : u32) -> u8x16 < Sse4_2 > { let val = a . into () ; let shift_count = _mm_cvtsi32_si128 (shift . cast_signed ()) ; let lo_16 = _mm_unpacklo_epi8 (val , _mm_setzero_si128 ()) ; let hi_16 = _mm_unpackhi_epi8 (val , _mm_setzero_si128 ()) ; let lo_shifted = _mm_sll_epi16 (lo_16 , shift_count) ; let hi_shifted = _mm_sll_epi16 (hi_16 , shift_count) ; _mm_packus_epi16 (lo_shifted , hi_shifted) . simd_into (token) } } + kernel(self, a, shift) } #[inline(always)] fn shlv_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { @@ -798,15 +751,8 @@ impl Simd for Sse4_2 { } #[inline(always)] fn shr_u8x16(self, a: u8x16, shift: u32) -> u8x16 { - unsafe { - let val = a.into(); - let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); - let lo_16 = _mm_unpacklo_epi8(val, _mm_setzero_si128()); - let hi_16 = _mm_unpackhi_epi8(val, _mm_setzero_si128()); - let lo_shifted = _mm_srl_epi16(lo_16, shift_count); - let hi_shifted = _mm_srl_epi16(hi_16, shift_count); - _mm_packus_epi16(lo_shifted, hi_shifted).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : u8x16 < Sse4_2 > , shift : u32) -> u8x16 < Sse4_2 > { let val = a . into () ; let shift_count = _mm_cvtsi32_si128 (shift . cast_signed ()) ; let lo_16 = _mm_unpacklo_epi8 (val , _mm_setzero_si128 ()) ; let hi_16 = _mm_unpackhi_epi8 (val , _mm_setzero_si128 ()) ; let lo_shifted = _mm_srl_epi16 (lo_16 , shift_count) ; let hi_shifted = _mm_srl_epi16 (hi_16 , shift_count) ; _mm_packus_epi16 (lo_shifted , hi_shifted) . simd_into (token) } } + kernel(self, a, shift) } #[inline(always)] fn shrv_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { @@ -814,59 +760,48 @@ impl Simd for Sse4_2 { } #[inline(always)] fn simd_eq_u8x16(self, a: u8x16, b: u8x16) -> mask8x16 { - unsafe { _mm_cmpeq_epi8(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : u8x16 < Sse4_2 > , b : u8x16 < Sse4_2 >) -> mask8x16 < Sse4_2 > { _mm_cmpeq_epi8 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_lt_u8x16(self, a: u8x16, b: u8x16) -> mask8x16 { - unsafe { - let sign_bit = _mm_set1_epi8(0x80u8.cast_signed()); - let a_signed = _mm_xor_si128(a.into(), sign_bit); - let b_signed = _mm_xor_si128(b.into(), sign_bit); - _mm_cmpgt_epi8(b_signed, a_signed).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : u8x16 < Sse4_2 > , b : u8x16 < Sse4_2 >) -> mask8x16 < Sse4_2 > { let sign_bit = _mm_set1_epi8 (0x80u8 . cast_signed ()) ; let a_signed = _mm_xor_si128 (a . into () , sign_bit) ; let b_signed = _mm_xor_si128 (b . into () , sign_bit) ; _mm_cmpgt_epi8 (b_signed , a_signed) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_le_u8x16(self, a: u8x16, b: u8x16) -> mask8x16 { - unsafe { _mm_cmpeq_epi8(_mm_min_epu8(a.into(), b.into()), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : u8x16 < Sse4_2 > , b : u8x16 < Sse4_2 >) -> mask8x16 < Sse4_2 > { _mm_cmpeq_epi8 (_mm_min_epu8 (a . into () , b . into ()) , a . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_ge_u8x16(self, a: u8x16, b: u8x16) -> mask8x16 { - unsafe { _mm_cmpeq_epi8(_mm_max_epu8(a.into(), b.into()), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : u8x16 < Sse4_2 > , b : u8x16 < Sse4_2 >) -> mask8x16 < Sse4_2 > { _mm_cmpeq_epi8 (_mm_max_epu8 (a . into () , b . into ()) , a . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_gt_u8x16(self, a: u8x16, b: u8x16) -> mask8x16 { - unsafe { - let sign_bit = _mm_set1_epi8(0x80u8.cast_signed()); - let a_signed = _mm_xor_si128(a.into(), sign_bit); - let b_signed = _mm_xor_si128(b.into(), sign_bit); - _mm_cmpgt_epi8(a_signed, b_signed).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : u8x16 < Sse4_2 > , b : u8x16 < Sse4_2 >) -> mask8x16 < Sse4_2 > { let sign_bit = _mm_set1_epi8 (0x80u8 . cast_signed ()) ; let a_signed = _mm_xor_si128 (a . into () , sign_bit) ; let b_signed = _mm_xor_si128 (b . into () , sign_bit) ; _mm_cmpgt_epi8 (a_signed , b_signed) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn zip_low_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - unsafe { _mm_unpacklo_epi8(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : u8x16 < Sse4_2 > , b : u8x16 < Sse4_2 >) -> u8x16 < Sse4_2 > { _mm_unpacklo_epi8 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn zip_high_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - unsafe { _mm_unpackhi_epi8(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : u8x16 < Sse4_2 > , b : u8x16 < Sse4_2 >) -> u8x16 < Sse4_2 > { _mm_unpackhi_epi8 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn unzip_low_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - unsafe { - let mask = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); - let t1 = _mm_shuffle_epi8(a.into(), mask); - let t2 = _mm_shuffle_epi8(b.into(), mask); - _mm_unpacklo_epi64(t1, t2).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : u8x16 < Sse4_2 > , b : u8x16 < Sse4_2 >) -> u8x16 < Sse4_2 > { let mask = _mm_setr_epi8 (0 , 2 , 4 , 6 , 8 , 10 , 12 , 14 , 1 , 3 , 5 , 7 , 9 , 11 , 13 , 15) ; let t1 = _mm_shuffle_epi8 (a . into () , mask) ; let t2 = _mm_shuffle_epi8 (b . into () , mask) ; _mm_unpacklo_epi64 (t1 , t2) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn unzip_high_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - unsafe { - let mask = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); - let t1 = _mm_shuffle_epi8(a.into(), mask); - let t2 = _mm_shuffle_epi8(b.into(), mask); - _mm_unpackhi_epi64(t1, t2).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : u8x16 < Sse4_2 > , b : u8x16 < Sse4_2 >) -> u8x16 < Sse4_2 > { let mask = _mm_setr_epi8 (0 , 2 , 4 , 6 , 8 , 10 , 12 , 14 , 1 , 3 , 5 , 7 , 9 , 11 , 13 , 15) ; let t1 = _mm_shuffle_epi8 (a . into () , mask) ; let t2 = _mm_shuffle_epi8 (b . into () , mask) ; _mm_unpackhi_epi64 (t1 , t2) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn interleave_u8x16(self, a: u8x16, b: u8x16) -> (u8x16, u8x16) { @@ -878,15 +813,18 @@ impl Simd for Sse4_2 { } #[inline(always)] fn select_u8x16(self, a: mask8x16, b: u8x16, c: u8x16) -> u8x16 { - unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : mask8x16 < Sse4_2 > , b : u8x16 < Sse4_2 > , c : u8x16 < Sse4_2 >) -> u8x16 < Sse4_2 > { _mm_blendv_epi8 (c . into () , b . into () , a . into ()) . simd_into (token) } } + kernel(self, a, b, c) } #[inline(always)] fn min_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - unsafe { _mm_min_epu8(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : u8x16 < Sse4_2 > , b : u8x16 < Sse4_2 >) -> u8x16 < Sse4_2 > { _mm_min_epu8 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn max_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - unsafe { _mm_max_epu8(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : u8x16 < Sse4_2 > , b : u8x16 < Sse4_2 >) -> u8x16 < Sse4_2 > { _mm_max_epu8 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn combine_u8x16(self, a: u8x16, b: u8x16) -> u8x32 { @@ -897,23 +835,18 @@ impl Simd for Sse4_2 { } #[inline(always)] fn widen_u8x16(self, a: u8x16) -> u16x16 { - unsafe { - let raw = a.into(); - let high = _mm_cvtepu8_epi16(raw).simd_into(self); - let low = _mm_cvtepu8_epi16(_mm_srli_si128::<8>(raw)).simd_into(self); - self.combine_u16x8(high, low) - } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : u8x16 < Sse4_2 >) -> u16x16 < Sse4_2 > { let raw = a . into () ; let high = _mm_cvtepu8_epi16 (raw) . simd_into (token) ; let low = _mm_cvtepu8_epi16 (_mm_srli_si128 :: < 8 > (raw)) . simd_into (token) ; token . combine_u16x8 (high , low) } } + kernel(self, a) } #[inline(always)] fn reinterpret_u32_u8x16(self, a: u8x16) -> u32x4 { - __m128i::from(a).simd_into(self) + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : u8x16 < Sse4_2 >) -> u32x4 < Sse4_2 > { __m128i :: from (a) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn splat_mask8x16(self, val: bool) -> mask8x16 { - unsafe { - let val: i8 = if val { !0 } else { 0 }; - _mm_set1_epi8(val).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , val : bool) -> mask8x16 < Sse4_2 > { let val : i8 = if val { ! 0 } else { 0 } ; _mm_set1_epi8 (val) . simd_into (token) } } + kernel(self, val) } #[inline(always)] fn load_array_mask8x16(self, val: [i8; 16usize]) -> mask8x16 { @@ -928,35 +861,28 @@ impl Simd for Sse4_2 { } #[inline(always)] fn from_bitmask_mask8x16(self, bits: u64) -> mask8x16 { - unsafe { - { - let bit_bytes = _mm_cvtsi32_si128(bits as i32); - let bit_bytes = _mm_shuffle_epi8( - bit_bytes, - _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1), - ); - let bit_mask = - _mm_setr_epi8(1, 2, 4, 8, 16, 32, 64, -128, 1, 2, 4, 8, 16, 32, 64, -128); - _mm_cmpeq_epi8(_mm_and_si128(bit_bytes, bit_mask), bit_mask) - } - .simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , bits : u64) -> mask8x16 < Sse4_2 > { { let bit_bytes = _mm_cvtsi32_si128 (bits as i32) ; let bit_bytes = _mm_shuffle_epi8 (bit_bytes , _mm_setr_epi8 (0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1)) ; let bit_mask = _mm_setr_epi8 (1 , 2 , 4 , 8 , 16 , 32 , 64 , - 128 , 1 , 2 , 4 , 8 , 16 , 32 , 64 , - 128) ; _mm_cmpeq_epi8 (_mm_and_si128 (bit_bytes , bit_mask) , bit_mask) } . simd_into (token) } } + kernel(self, bits) } #[inline(always)] fn to_bitmask_mask8x16(self, a: mask8x16) -> u64 { - unsafe { _mm_movemask_epi8(a.into()) as u32 as u64 } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : mask8x16 < Sse4_2 >) -> u64 { _mm_movemask_epi8 (a . into ()) as u32 as u64 } } + kernel(self, a) } #[inline(always)] fn and_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x16 { - unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : mask8x16 < Sse4_2 > , b : mask8x16 < Sse4_2 >) -> mask8x16 < Sse4_2 > { _mm_and_si128 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn or_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x16 { - unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : mask8x16 < Sse4_2 > , b : mask8x16 < Sse4_2 >) -> mask8x16 < Sse4_2 > { _mm_or_si128 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn xor_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x16 { - unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : mask8x16 < Sse4_2 > , b : mask8x16 < Sse4_2 >) -> mask8x16 < Sse4_2 > { _mm_xor_si128 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn not_mask8x16(self, a: mask8x16) -> mask8x16 { @@ -969,27 +895,33 @@ impl Simd for Sse4_2 { b: mask8x16, c: mask8x16, ) -> mask8x16 { - unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : mask8x16 < Sse4_2 > , b : mask8x16 < Sse4_2 > , c : mask8x16 < Sse4_2 >) -> mask8x16 < Sse4_2 > { _mm_blendv_epi8 (c . into () , b . into () , a . into ()) . simd_into (token) } } + kernel(self, a, b, c) } #[inline(always)] fn simd_eq_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x16 { - unsafe { _mm_cmpeq_epi8(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : mask8x16 < Sse4_2 > , b : mask8x16 < Sse4_2 >) -> mask8x16 < Sse4_2 > { _mm_cmpeq_epi8 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn any_true_mask8x16(self, a: mask8x16) -> bool { - unsafe { _mm_movemask_epi8(a.into()) as u32 != 0 } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : mask8x16 < Sse4_2 >) -> bool { _mm_movemask_epi8 (a . into ()) as u32 != 0 } } + kernel(self, a) } #[inline(always)] fn all_true_mask8x16(self, a: mask8x16) -> bool { - unsafe { _mm_movemask_epi8(a.into()) as u32 == 0xffff } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : mask8x16 < Sse4_2 >) -> bool { _mm_movemask_epi8 (a . into ()) as u32 == 0xffff } } + kernel(self, a) } #[inline(always)] fn any_false_mask8x16(self, a: mask8x16) -> bool { - unsafe { _mm_movemask_epi8(a.into()) as u32 != 0xffff } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : mask8x16 < Sse4_2 >) -> bool { _mm_movemask_epi8 (a . into ()) as u32 != 0xffff } } + kernel(self, a) } #[inline(always)] fn all_false_mask8x16(self, a: mask8x16) -> bool { - unsafe { _mm_movemask_epi8(a.into()) as u32 == 0 } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : mask8x16 < Sse4_2 >) -> bool { _mm_movemask_epi8 (a . into ()) as u32 == 0 } } + kernel(self, a) } #[inline(always)] fn combine_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x32 { @@ -1000,7 +932,8 @@ impl Simd for Sse4_2 { } #[inline(always)] fn splat_i16x8(self, val: i16) -> i16x8 { - unsafe { _mm_set1_epi16(val).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , val : i16) -> i16x8 < Sse4_2 > { _mm_set1_epi16 (val) . simd_into (token) } } + kernel(self, val) } #[inline(always)] fn load_array_i16x8(self, val: [i16; 8usize]) -> i16x8 { @@ -1079,27 +1012,33 @@ impl Simd for Sse4_2 { } #[inline(always)] fn add_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - unsafe { _mm_add_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : i16x8 < Sse4_2 > , b : i16x8 < Sse4_2 >) -> i16x8 < Sse4_2 > { _mm_add_epi16 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn sub_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - unsafe { _mm_sub_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : i16x8 < Sse4_2 > , b : i16x8 < Sse4_2 >) -> i16x8 < Sse4_2 > { _mm_sub_epi16 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn mul_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - unsafe { _mm_mullo_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : i16x8 < Sse4_2 > , b : i16x8 < Sse4_2 >) -> i16x8 < Sse4_2 > { _mm_mullo_epi16 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn and_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : i16x8 < Sse4_2 > , b : i16x8 < Sse4_2 >) -> i16x8 < Sse4_2 > { _mm_and_si128 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn or_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : i16x8 < Sse4_2 > , b : i16x8 < Sse4_2 >) -> i16x8 < Sse4_2 > { _mm_or_si128 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn xor_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : i16x8 < Sse4_2 > , b : i16x8 < Sse4_2 >) -> i16x8 < Sse4_2 > { _mm_xor_si128 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn not_i16x8(self, a: i16x8) -> i16x8 { @@ -1107,7 +1046,8 @@ impl Simd for Sse4_2 { } #[inline(always)] fn shl_i16x8(self, a: i16x8, shift: u32) -> i16x8 { - unsafe { _mm_sll_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : i16x8 < Sse4_2 > , shift : u32) -> i16x8 < Sse4_2 > { _mm_sll_epi16 (a . into () , _mm_cvtsi32_si128 (shift . cast_signed ())) . simd_into (token) } } + kernel(self, a, shift) } #[inline(always)] fn shlv_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { @@ -1115,7 +1055,8 @@ impl Simd for Sse4_2 { } #[inline(always)] fn shr_i16x8(self, a: i16x8, shift: u32) -> i16x8 { - unsafe { _mm_sra_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : i16x8 < Sse4_2 > , shift : u32) -> i16x8 < Sse4_2 > { _mm_sra_epi16 (a . into () , _mm_cvtsi32_si128 (shift . cast_signed ())) . simd_into (token) } } + kernel(self, a, shift) } #[inline(always)] fn shrv_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { @@ -1123,49 +1064,48 @@ impl Simd for Sse4_2 { } #[inline(always)] fn simd_eq_i16x8(self, a: i16x8, b: i16x8) -> mask16x8 { - unsafe { _mm_cmpeq_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : i16x8 < Sse4_2 > , b : i16x8 < Sse4_2 >) -> mask16x8 < Sse4_2 > { _mm_cmpeq_epi16 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_lt_i16x8(self, a: i16x8, b: i16x8) -> mask16x8 { - unsafe { _mm_cmpgt_epi16(b.into(), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : i16x8 < Sse4_2 > , b : i16x8 < Sse4_2 >) -> mask16x8 < Sse4_2 > { _mm_cmpgt_epi16 (b . into () , a . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_le_i16x8(self, a: i16x8, b: i16x8) -> mask16x8 { - unsafe { _mm_cmpeq_epi16(_mm_min_epi16(a.into(), b.into()), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : i16x8 < Sse4_2 > , b : i16x8 < Sse4_2 >) -> mask16x8 < Sse4_2 > { _mm_cmpeq_epi16 (_mm_min_epi16 (a . into () , b . into ()) , a . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_ge_i16x8(self, a: i16x8, b: i16x8) -> mask16x8 { - unsafe { _mm_cmpeq_epi16(_mm_max_epi16(a.into(), b.into()), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : i16x8 < Sse4_2 > , b : i16x8 < Sse4_2 >) -> mask16x8 < Sse4_2 > { _mm_cmpeq_epi16 (_mm_max_epi16 (a . into () , b . into ()) , a . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_gt_i16x8(self, a: i16x8, b: i16x8) -> mask16x8 { - unsafe { _mm_cmpgt_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : i16x8 < Sse4_2 > , b : i16x8 < Sse4_2 >) -> mask16x8 < Sse4_2 > { _mm_cmpgt_epi16 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn zip_low_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - unsafe { _mm_unpacklo_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : i16x8 < Sse4_2 > , b : i16x8 < Sse4_2 >) -> i16x8 < Sse4_2 > { _mm_unpacklo_epi16 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn zip_high_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - unsafe { _mm_unpackhi_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : i16x8 < Sse4_2 > , b : i16x8 < Sse4_2 >) -> i16x8 < Sse4_2 > { _mm_unpackhi_epi16 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn unzip_low_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - unsafe { - let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15); - let t1 = _mm_shuffle_epi8(a.into(), mask); - let t2 = _mm_shuffle_epi8(b.into(), mask); - _mm_unpacklo_epi64(t1, t2).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : i16x8 < Sse4_2 > , b : i16x8 < Sse4_2 >) -> i16x8 < Sse4_2 > { let mask = _mm_setr_epi8 (0 , 1 , 4 , 5 , 8 , 9 , 12 , 13 , 2 , 3 , 6 , 7 , 10 , 11 , 14 , 15) ; let t1 = _mm_shuffle_epi8 (a . into () , mask) ; let t2 = _mm_shuffle_epi8 (b . into () , mask) ; _mm_unpacklo_epi64 (t1 , t2) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn unzip_high_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - unsafe { - let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15); - let t1 = _mm_shuffle_epi8(a.into(), mask); - let t2 = _mm_shuffle_epi8(b.into(), mask); - _mm_unpackhi_epi64(t1, t2).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : i16x8 < Sse4_2 > , b : i16x8 < Sse4_2 >) -> i16x8 < Sse4_2 > { let mask = _mm_setr_epi8 (0 , 1 , 4 , 5 , 8 , 9 , 12 , 13 , 2 , 3 , 6 , 7 , 10 , 11 , 14 , 15) ; let t1 = _mm_shuffle_epi8 (a . into () , mask) ; let t2 = _mm_shuffle_epi8 (b . into () , mask) ; _mm_unpackhi_epi64 (t1 , t2) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn interleave_i16x8(self, a: i16x8, b: i16x8) -> (i16x8, i16x8) { @@ -1177,15 +1117,18 @@ impl Simd for Sse4_2 { } #[inline(always)] fn select_i16x8(self, a: mask16x8, b: i16x8, c: i16x8) -> i16x8 { - unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : mask16x8 < Sse4_2 > , b : i16x8 < Sse4_2 > , c : i16x8 < Sse4_2 >) -> i16x8 < Sse4_2 > { _mm_blendv_epi8 (c . into () , b . into () , a . into ()) . simd_into (token) } } + kernel(self, a, b, c) } #[inline(always)] fn min_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - unsafe { _mm_min_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : i16x8 < Sse4_2 > , b : i16x8 < Sse4_2 >) -> i16x8 < Sse4_2 > { _mm_min_epi16 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn max_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - unsafe { _mm_max_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : i16x8 < Sse4_2 > , b : i16x8 < Sse4_2 >) -> i16x8 < Sse4_2 > { _mm_max_epi16 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn combine_i16x8(self, a: i16x8, b: i16x8) -> i16x16 { @@ -1196,19 +1139,23 @@ impl Simd for Sse4_2 { } #[inline(always)] fn neg_i16x8(self, a: i16x8) -> i16x8 { - unsafe { _mm_sub_epi16(_mm_setzero_si128(), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : i16x8 < Sse4_2 >) -> i16x8 < Sse4_2 > { _mm_sub_epi16 (_mm_setzero_si128 () , a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn reinterpret_u8_i16x8(self, a: i16x8) -> u8x16 { - __m128i::from(a).simd_into(self) + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : i16x8 < Sse4_2 >) -> u8x16 < Sse4_2 > { __m128i :: from (a) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn reinterpret_u32_i16x8(self, a: i16x8) -> u32x4 { - __m128i::from(a).simd_into(self) + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : i16x8 < Sse4_2 >) -> u32x4 < Sse4_2 > { __m128i :: from (a) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn splat_u16x8(self, val: u16) -> u16x8 { - unsafe { _mm_set1_epi16(val.cast_signed()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , val : u16) -> u16x8 < Sse4_2 > { _mm_set1_epi16 (val . cast_signed ()) . simd_into (token) } } + kernel(self, val) } #[inline(always)] fn load_array_u16x8(self, val: [u16; 8usize]) -> u16x8 { @@ -1287,27 +1234,33 @@ impl Simd for Sse4_2 { } #[inline(always)] fn add_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - unsafe { _mm_add_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : u16x8 < Sse4_2 > , b : u16x8 < Sse4_2 >) -> u16x8 < Sse4_2 > { _mm_add_epi16 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn sub_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - unsafe { _mm_sub_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : u16x8 < Sse4_2 > , b : u16x8 < Sse4_2 >) -> u16x8 < Sse4_2 > { _mm_sub_epi16 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn mul_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - unsafe { _mm_mullo_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : u16x8 < Sse4_2 > , b : u16x8 < Sse4_2 >) -> u16x8 < Sse4_2 > { _mm_mullo_epi16 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn and_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : u16x8 < Sse4_2 > , b : u16x8 < Sse4_2 >) -> u16x8 < Sse4_2 > { _mm_and_si128 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn or_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : u16x8 < Sse4_2 > , b : u16x8 < Sse4_2 >) -> u16x8 < Sse4_2 > { _mm_or_si128 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn xor_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : u16x8 < Sse4_2 > , b : u16x8 < Sse4_2 >) -> u16x8 < Sse4_2 > { _mm_xor_si128 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn not_u16x8(self, a: u16x8) -> u16x8 { @@ -1315,7 +1268,8 @@ impl Simd for Sse4_2 { } #[inline(always)] fn shl_u16x8(self, a: u16x8, shift: u32) -> u16x8 { - unsafe { _mm_sll_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : u16x8 < Sse4_2 > , shift : u32) -> u16x8 < Sse4_2 > { _mm_sll_epi16 (a . into () , _mm_cvtsi32_si128 (shift . cast_signed ())) . simd_into (token) } } + kernel(self, a, shift) } #[inline(always)] fn shlv_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { @@ -1323,7 +1277,8 @@ impl Simd for Sse4_2 { } #[inline(always)] fn shr_u16x8(self, a: u16x8, shift: u32) -> u16x8 { - unsafe { _mm_srl_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : u16x8 < Sse4_2 > , shift : u32) -> u16x8 < Sse4_2 > { _mm_srl_epi16 (a . into () , _mm_cvtsi32_si128 (shift . cast_signed ())) . simd_into (token) } } + kernel(self, a, shift) } #[inline(always)] fn shrv_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { @@ -1331,59 +1286,48 @@ impl Simd for Sse4_2 { } #[inline(always)] fn simd_eq_u16x8(self, a: u16x8, b: u16x8) -> mask16x8 { - unsafe { _mm_cmpeq_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : u16x8 < Sse4_2 > , b : u16x8 < Sse4_2 >) -> mask16x8 < Sse4_2 > { _mm_cmpeq_epi16 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_lt_u16x8(self, a: u16x8, b: u16x8) -> mask16x8 { - unsafe { - let sign_bit = _mm_set1_epi16(0x8000u16.cast_signed()); - let a_signed = _mm_xor_si128(a.into(), sign_bit); - let b_signed = _mm_xor_si128(b.into(), sign_bit); - _mm_cmpgt_epi16(b_signed, a_signed).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : u16x8 < Sse4_2 > , b : u16x8 < Sse4_2 >) -> mask16x8 < Sse4_2 > { let sign_bit = _mm_set1_epi16 (0x8000u16 . cast_signed ()) ; let a_signed = _mm_xor_si128 (a . into () , sign_bit) ; let b_signed = _mm_xor_si128 (b . into () , sign_bit) ; _mm_cmpgt_epi16 (b_signed , a_signed) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_le_u16x8(self, a: u16x8, b: u16x8) -> mask16x8 { - unsafe { _mm_cmpeq_epi16(_mm_min_epu16(a.into(), b.into()), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : u16x8 < Sse4_2 > , b : u16x8 < Sse4_2 >) -> mask16x8 < Sse4_2 > { _mm_cmpeq_epi16 (_mm_min_epu16 (a . into () , b . into ()) , a . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_ge_u16x8(self, a: u16x8, b: u16x8) -> mask16x8 { - unsafe { _mm_cmpeq_epi16(_mm_max_epu16(a.into(), b.into()), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : u16x8 < Sse4_2 > , b : u16x8 < Sse4_2 >) -> mask16x8 < Sse4_2 > { _mm_cmpeq_epi16 (_mm_max_epu16 (a . into () , b . into ()) , a . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_gt_u16x8(self, a: u16x8, b: u16x8) -> mask16x8 { - unsafe { - let sign_bit = _mm_set1_epi16(0x8000u16.cast_signed()); - let a_signed = _mm_xor_si128(a.into(), sign_bit); - let b_signed = _mm_xor_si128(b.into(), sign_bit); - _mm_cmpgt_epi16(a_signed, b_signed).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : u16x8 < Sse4_2 > , b : u16x8 < Sse4_2 >) -> mask16x8 < Sse4_2 > { let sign_bit = _mm_set1_epi16 (0x8000u16 . cast_signed ()) ; let a_signed = _mm_xor_si128 (a . into () , sign_bit) ; let b_signed = _mm_xor_si128 (b . into () , sign_bit) ; _mm_cmpgt_epi16 (a_signed , b_signed) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn zip_low_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - unsafe { _mm_unpacklo_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : u16x8 < Sse4_2 > , b : u16x8 < Sse4_2 >) -> u16x8 < Sse4_2 > { _mm_unpacklo_epi16 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn zip_high_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - unsafe { _mm_unpackhi_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : u16x8 < Sse4_2 > , b : u16x8 < Sse4_2 >) -> u16x8 < Sse4_2 > { _mm_unpackhi_epi16 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn unzip_low_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - unsafe { - let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15); - let t1 = _mm_shuffle_epi8(a.into(), mask); - let t2 = _mm_shuffle_epi8(b.into(), mask); - _mm_unpacklo_epi64(t1, t2).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : u16x8 < Sse4_2 > , b : u16x8 < Sse4_2 >) -> u16x8 < Sse4_2 > { let mask = _mm_setr_epi8 (0 , 1 , 4 , 5 , 8 , 9 , 12 , 13 , 2 , 3 , 6 , 7 , 10 , 11 , 14 , 15) ; let t1 = _mm_shuffle_epi8 (a . into () , mask) ; let t2 = _mm_shuffle_epi8 (b . into () , mask) ; _mm_unpacklo_epi64 (t1 , t2) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn unzip_high_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - unsafe { - let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15); - let t1 = _mm_shuffle_epi8(a.into(), mask); - let t2 = _mm_shuffle_epi8(b.into(), mask); - _mm_unpackhi_epi64(t1, t2).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : u16x8 < Sse4_2 > , b : u16x8 < Sse4_2 >) -> u16x8 < Sse4_2 > { let mask = _mm_setr_epi8 (0 , 1 , 4 , 5 , 8 , 9 , 12 , 13 , 2 , 3 , 6 , 7 , 10 , 11 , 14 , 15) ; let t1 = _mm_shuffle_epi8 (a . into () , mask) ; let t2 = _mm_shuffle_epi8 (b . into () , mask) ; _mm_unpackhi_epi64 (t1 , t2) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn interleave_u16x8(self, a: u16x8, b: u16x8) -> (u16x8, u16x8) { @@ -1395,15 +1339,18 @@ impl Simd for Sse4_2 { } #[inline(always)] fn select_u16x8(self, a: mask16x8, b: u16x8, c: u16x8) -> u16x8 { - unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : mask16x8 < Sse4_2 > , b : u16x8 < Sse4_2 > , c : u16x8 < Sse4_2 >) -> u16x8 < Sse4_2 > { _mm_blendv_epi8 (c . into () , b . into () , a . into ()) . simd_into (token) } } + kernel(self, a, b, c) } #[inline(always)] fn min_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - unsafe { _mm_min_epu16(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : u16x8 < Sse4_2 > , b : u16x8 < Sse4_2 >) -> u16x8 < Sse4_2 > { _mm_min_epu16 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn max_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - unsafe { _mm_max_epu16(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : u16x8 < Sse4_2 > , b : u16x8 < Sse4_2 >) -> u16x8 < Sse4_2 > { _mm_max_epu16 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn combine_u16x8(self, a: u16x8, b: u16x8) -> u16x16 { @@ -1414,18 +1361,18 @@ impl Simd for Sse4_2 { } #[inline(always)] fn reinterpret_u8_u16x8(self, a: u16x8) -> u8x16 { - __m128i::from(a).simd_into(self) + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : u16x8 < Sse4_2 >) -> u8x16 < Sse4_2 > { __m128i :: from (a) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn reinterpret_u32_u16x8(self, a: u16x8) -> u32x4 { - __m128i::from(a).simd_into(self) + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : u16x8 < Sse4_2 >) -> u32x4 < Sse4_2 > { __m128i :: from (a) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn splat_mask16x8(self, val: bool) -> mask16x8 { - unsafe { - let val: i16 = if val { !0 } else { 0 }; - _mm_set1_epi16(val).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , val : bool) -> mask16x8 < Sse4_2 > { let val : i16 = if val { ! 0 } else { 0 } ; _mm_set1_epi16 (val) . simd_into (token) } } + kernel(self, val) } #[inline(always)] fn load_array_mask16x8(self, val: [i16; 8usize]) -> mask16x8 { @@ -1440,35 +1387,28 @@ impl Simd for Sse4_2 { } #[inline(always)] fn from_bitmask_mask16x8(self, bits: u64) -> mask16x8 { - unsafe { - { - let bit_lanes = _mm_set1_epi16(bits as i16); - let bit_mask = _mm_setr_epi16(1, 2, 4, 8, 16, 32, 64, 128); - _mm_cmpeq_epi16(_mm_and_si128(bit_lanes, bit_mask), bit_mask) - } - .simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , bits : u64) -> mask16x8 < Sse4_2 > { { let bit_lanes = _mm_set1_epi16 (bits as i16) ; let bit_mask = _mm_setr_epi16 (1 , 2 , 4 , 8 , 16 , 32 , 64 , 128) ; _mm_cmpeq_epi16 (_mm_and_si128 (bit_lanes , bit_mask) , bit_mask) } . simd_into (token) } } + kernel(self, bits) } #[inline(always)] fn to_bitmask_mask16x8(self, a: mask16x8) -> u64 { - unsafe { - { - let packed = _mm_packs_epi16(a.into(), a.into()); - _mm_movemask_epi8(packed) as u8 as u64 - } - } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : mask16x8 < Sse4_2 >) -> u64 { { let packed = _mm_packs_epi16 (a . into () , a . into ()) ; _mm_movemask_epi8 (packed) as u8 as u64 } } } + kernel(self, a) } #[inline(always)] fn and_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x8 { - unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : mask16x8 < Sse4_2 > , b : mask16x8 < Sse4_2 >) -> mask16x8 < Sse4_2 > { _mm_and_si128 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn or_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x8 { - unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : mask16x8 < Sse4_2 > , b : mask16x8 < Sse4_2 >) -> mask16x8 < Sse4_2 > { _mm_or_si128 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn xor_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x8 { - unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : mask16x8 < Sse4_2 > , b : mask16x8 < Sse4_2 >) -> mask16x8 < Sse4_2 > { _mm_xor_si128 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn not_mask16x8(self, a: mask16x8) -> mask16x8 { @@ -1481,27 +1421,33 @@ impl Simd for Sse4_2 { b: mask16x8, c: mask16x8, ) -> mask16x8 { - unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : mask16x8 < Sse4_2 > , b : mask16x8 < Sse4_2 > , c : mask16x8 < Sse4_2 >) -> mask16x8 < Sse4_2 > { _mm_blendv_epi8 (c . into () , b . into () , a . into ()) . simd_into (token) } } + kernel(self, a, b, c) } #[inline(always)] fn simd_eq_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x8 { - unsafe { _mm_cmpeq_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : mask16x8 < Sse4_2 > , b : mask16x8 < Sse4_2 >) -> mask16x8 < Sse4_2 > { _mm_cmpeq_epi16 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn any_true_mask16x8(self, a: mask16x8) -> bool { - unsafe { _mm_movemask_epi8(a.into()) as u32 != 0 } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : mask16x8 < Sse4_2 >) -> bool { _mm_movemask_epi8 (a . into ()) as u32 != 0 } } + kernel(self, a) } #[inline(always)] fn all_true_mask16x8(self, a: mask16x8) -> bool { - unsafe { _mm_movemask_epi8(a.into()) as u32 == 0xffff } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : mask16x8 < Sse4_2 >) -> bool { _mm_movemask_epi8 (a . into ()) as u32 == 0xffff } } + kernel(self, a) } #[inline(always)] fn any_false_mask16x8(self, a: mask16x8) -> bool { - unsafe { _mm_movemask_epi8(a.into()) as u32 != 0xffff } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : mask16x8 < Sse4_2 >) -> bool { _mm_movemask_epi8 (a . into ()) as u32 != 0xffff } } + kernel(self, a) } #[inline(always)] fn all_false_mask16x8(self, a: mask16x8) -> bool { - unsafe { _mm_movemask_epi8(a.into()) as u32 == 0 } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : mask16x8 < Sse4_2 >) -> bool { _mm_movemask_epi8 (a . into ()) as u32 == 0 } } + kernel(self, a) } #[inline(always)] fn combine_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x16 { @@ -1512,7 +1458,8 @@ impl Simd for Sse4_2 { } #[inline(always)] fn splat_i32x4(self, val: i32) -> i32x4 { - unsafe { _mm_set1_epi32(val).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , val : i32) -> i32x4 < Sse4_2 > { _mm_set1_epi32 (val) . simd_into (token) } } + kernel(self, val) } #[inline(always)] fn load_array_i32x4(self, val: [i32; 4usize]) -> i32x4 { @@ -1591,27 +1538,33 @@ impl Simd for Sse4_2 { } #[inline(always)] fn add_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - unsafe { _mm_add_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : i32x4 < Sse4_2 > , b : i32x4 < Sse4_2 >) -> i32x4 < Sse4_2 > { _mm_add_epi32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn sub_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - unsafe { _mm_sub_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : i32x4 < Sse4_2 > , b : i32x4 < Sse4_2 >) -> i32x4 < Sse4_2 > { _mm_sub_epi32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn mul_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - unsafe { _mm_mullo_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : i32x4 < Sse4_2 > , b : i32x4 < Sse4_2 >) -> i32x4 < Sse4_2 > { _mm_mullo_epi32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn and_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : i32x4 < Sse4_2 > , b : i32x4 < Sse4_2 >) -> i32x4 < Sse4_2 > { _mm_and_si128 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn or_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : i32x4 < Sse4_2 > , b : i32x4 < Sse4_2 >) -> i32x4 < Sse4_2 > { _mm_or_si128 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn xor_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : i32x4 < Sse4_2 > , b : i32x4 < Sse4_2 >) -> i32x4 < Sse4_2 > { _mm_xor_si128 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn not_i32x4(self, a: i32x4) -> i32x4 { @@ -1619,7 +1572,8 @@ impl Simd for Sse4_2 { } #[inline(always)] fn shl_i32x4(self, a: i32x4, shift: u32) -> i32x4 { - unsafe { _mm_sll_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : i32x4 < Sse4_2 > , shift : u32) -> i32x4 < Sse4_2 > { _mm_sll_epi32 (a . into () , _mm_cvtsi32_si128 (shift . cast_signed ())) . simd_into (token) } } + kernel(self, a, shift) } #[inline(always)] fn shlv_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { @@ -1627,7 +1581,8 @@ impl Simd for Sse4_2 { } #[inline(always)] fn shr_i32x4(self, a: i32x4, shift: u32) -> i32x4 { - unsafe { _mm_sra_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : i32x4 < Sse4_2 > , shift : u32) -> i32x4 < Sse4_2 > { _mm_sra_epi32 (a . into () , _mm_cvtsi32_si128 (shift . cast_signed ())) . simd_into (token) } } + kernel(self, a, shift) } #[inline(always)] fn shrv_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { @@ -1635,47 +1590,48 @@ impl Simd for Sse4_2 { } #[inline(always)] fn simd_eq_i32x4(self, a: i32x4, b: i32x4) -> mask32x4 { - unsafe { _mm_cmpeq_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : i32x4 < Sse4_2 > , b : i32x4 < Sse4_2 >) -> mask32x4 < Sse4_2 > { _mm_cmpeq_epi32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_lt_i32x4(self, a: i32x4, b: i32x4) -> mask32x4 { - unsafe { _mm_cmpgt_epi32(b.into(), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : i32x4 < Sse4_2 > , b : i32x4 < Sse4_2 >) -> mask32x4 < Sse4_2 > { _mm_cmpgt_epi32 (b . into () , a . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_le_i32x4(self, a: i32x4, b: i32x4) -> mask32x4 { - unsafe { _mm_cmpeq_epi32(_mm_min_epi32(a.into(), b.into()), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : i32x4 < Sse4_2 > , b : i32x4 < Sse4_2 >) -> mask32x4 < Sse4_2 > { _mm_cmpeq_epi32 (_mm_min_epi32 (a . into () , b . into ()) , a . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_ge_i32x4(self, a: i32x4, b: i32x4) -> mask32x4 { - unsafe { _mm_cmpeq_epi32(_mm_max_epi32(a.into(), b.into()), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : i32x4 < Sse4_2 > , b : i32x4 < Sse4_2 >) -> mask32x4 < Sse4_2 > { _mm_cmpeq_epi32 (_mm_max_epi32 (a . into () , b . into ()) , a . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_gt_i32x4(self, a: i32x4, b: i32x4) -> mask32x4 { - unsafe { _mm_cmpgt_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : i32x4 < Sse4_2 > , b : i32x4 < Sse4_2 >) -> mask32x4 < Sse4_2 > { _mm_cmpgt_epi32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn zip_low_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - unsafe { _mm_unpacklo_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : i32x4 < Sse4_2 > , b : i32x4 < Sse4_2 >) -> i32x4 < Sse4_2 > { _mm_unpacklo_epi32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn zip_high_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - unsafe { _mm_unpackhi_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : i32x4 < Sse4_2 > , b : i32x4 < Sse4_2 >) -> i32x4 < Sse4_2 > { _mm_unpackhi_epi32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn unzip_low_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - unsafe { - let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into()); - let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into()); - _mm_unpacklo_epi64(t1, t2).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : i32x4 < Sse4_2 > , b : i32x4 < Sse4_2 >) -> i32x4 < Sse4_2 > { let t1 = _mm_shuffle_epi32 :: < 0b11_01_10_00 > (a . into ()) ; let t2 = _mm_shuffle_epi32 :: < 0b11_01_10_00 > (b . into ()) ; _mm_unpacklo_epi64 (t1 , t2) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn unzip_high_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - unsafe { - let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into()); - let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into()); - _mm_unpackhi_epi64(t1, t2).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : i32x4 < Sse4_2 > , b : i32x4 < Sse4_2 >) -> i32x4 < Sse4_2 > { let t1 = _mm_shuffle_epi32 :: < 0b11_01_10_00 > (a . into ()) ; let t2 = _mm_shuffle_epi32 :: < 0b11_01_10_00 > (b . into ()) ; _mm_unpackhi_epi64 (t1 , t2) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn interleave_i32x4(self, a: i32x4, b: i32x4) -> (i32x4, i32x4) { @@ -1687,15 +1643,18 @@ impl Simd for Sse4_2 { } #[inline(always)] fn select_i32x4(self, a: mask32x4, b: i32x4, c: i32x4) -> i32x4 { - unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : mask32x4 < Sse4_2 > , b : i32x4 < Sse4_2 > , c : i32x4 < Sse4_2 >) -> i32x4 < Sse4_2 > { _mm_blendv_epi8 (c . into () , b . into () , a . into ()) . simd_into (token) } } + kernel(self, a, b, c) } #[inline(always)] fn min_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - unsafe { _mm_min_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : i32x4 < Sse4_2 > , b : i32x4 < Sse4_2 >) -> i32x4 < Sse4_2 > { _mm_min_epi32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn max_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - unsafe { _mm_max_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : i32x4 < Sse4_2 > , b : i32x4 < Sse4_2 >) -> i32x4 < Sse4_2 > { _mm_max_epi32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn combine_i32x4(self, a: i32x4, b: i32x4) -> i32x8 { @@ -1706,23 +1665,28 @@ impl Simd for Sse4_2 { } #[inline(always)] fn neg_i32x4(self, a: i32x4) -> i32x4 { - unsafe { _mm_sub_epi32(_mm_setzero_si128(), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : i32x4 < Sse4_2 >) -> i32x4 < Sse4_2 > { _mm_sub_epi32 (_mm_setzero_si128 () , a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn reinterpret_u8_i32x4(self, a: i32x4) -> u8x16 { - __m128i::from(a).simd_into(self) + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : i32x4 < Sse4_2 >) -> u8x16 < Sse4_2 > { __m128i :: from (a) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn reinterpret_u32_i32x4(self, a: i32x4) -> u32x4 { - __m128i::from(a).simd_into(self) + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : i32x4 < Sse4_2 >) -> u32x4 < Sse4_2 > { __m128i :: from (a) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn cvt_f32_i32x4(self, a: i32x4) -> f32x4 { - unsafe { _mm_cvtepi32_ps(a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : i32x4 < Sse4_2 >) -> f32x4 < Sse4_2 > { _mm_cvtepi32_ps (a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn splat_u32x4(self, val: u32) -> u32x4 { - unsafe { _mm_set1_epi32(val.cast_signed()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , val : u32) -> u32x4 < Sse4_2 > { _mm_set1_epi32 (val . cast_signed ()) . simd_into (token) } } + kernel(self, val) } #[inline(always)] fn load_array_u32x4(self, val: [u32; 4usize]) -> u32x4 { @@ -1801,27 +1765,33 @@ impl Simd for Sse4_2 { } #[inline(always)] fn add_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - unsafe { _mm_add_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : u32x4 < Sse4_2 > , b : u32x4 < Sse4_2 >) -> u32x4 < Sse4_2 > { _mm_add_epi32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn sub_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - unsafe { _mm_sub_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : u32x4 < Sse4_2 > , b : u32x4 < Sse4_2 >) -> u32x4 < Sse4_2 > { _mm_sub_epi32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn mul_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - unsafe { _mm_mullo_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : u32x4 < Sse4_2 > , b : u32x4 < Sse4_2 >) -> u32x4 < Sse4_2 > { _mm_mullo_epi32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn and_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : u32x4 < Sse4_2 > , b : u32x4 < Sse4_2 >) -> u32x4 < Sse4_2 > { _mm_and_si128 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn or_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : u32x4 < Sse4_2 > , b : u32x4 < Sse4_2 >) -> u32x4 < Sse4_2 > { _mm_or_si128 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn xor_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : u32x4 < Sse4_2 > , b : u32x4 < Sse4_2 >) -> u32x4 < Sse4_2 > { _mm_xor_si128 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn not_u32x4(self, a: u32x4) -> u32x4 { @@ -1829,7 +1799,8 @@ impl Simd for Sse4_2 { } #[inline(always)] fn shl_u32x4(self, a: u32x4, shift: u32) -> u32x4 { - unsafe { _mm_sll_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : u32x4 < Sse4_2 > , shift : u32) -> u32x4 < Sse4_2 > { _mm_sll_epi32 (a . into () , _mm_cvtsi32_si128 (shift . cast_signed ())) . simd_into (token) } } + kernel(self, a, shift) } #[inline(always)] fn shlv_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { @@ -1837,7 +1808,8 @@ impl Simd for Sse4_2 { } #[inline(always)] fn shr_u32x4(self, a: u32x4, shift: u32) -> u32x4 { - unsafe { _mm_srl_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : u32x4 < Sse4_2 > , shift : u32) -> u32x4 < Sse4_2 > { _mm_srl_epi32 (a . into () , _mm_cvtsi32_si128 (shift . cast_signed ())) . simd_into (token) } } + kernel(self, a, shift) } #[inline(always)] fn shrv_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { @@ -1845,57 +1817,48 @@ impl Simd for Sse4_2 { } #[inline(always)] fn simd_eq_u32x4(self, a: u32x4, b: u32x4) -> mask32x4 { - unsafe { _mm_cmpeq_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : u32x4 < Sse4_2 > , b : u32x4 < Sse4_2 >) -> mask32x4 < Sse4_2 > { _mm_cmpeq_epi32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_lt_u32x4(self, a: u32x4, b: u32x4) -> mask32x4 { - unsafe { - let sign_bit = _mm_set1_epi32(0x80000000u32.cast_signed()); - let a_signed = _mm_xor_si128(a.into(), sign_bit); - let b_signed = _mm_xor_si128(b.into(), sign_bit); - _mm_cmpgt_epi32(b_signed, a_signed).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : u32x4 < Sse4_2 > , b : u32x4 < Sse4_2 >) -> mask32x4 < Sse4_2 > { let sign_bit = _mm_set1_epi32 (0x80000000u32 . cast_signed ()) ; let a_signed = _mm_xor_si128 (a . into () , sign_bit) ; let b_signed = _mm_xor_si128 (b . into () , sign_bit) ; _mm_cmpgt_epi32 (b_signed , a_signed) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_le_u32x4(self, a: u32x4, b: u32x4) -> mask32x4 { - unsafe { _mm_cmpeq_epi32(_mm_min_epu32(a.into(), b.into()), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : u32x4 < Sse4_2 > , b : u32x4 < Sse4_2 >) -> mask32x4 < Sse4_2 > { _mm_cmpeq_epi32 (_mm_min_epu32 (a . into () , b . into ()) , a . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_ge_u32x4(self, a: u32x4, b: u32x4) -> mask32x4 { - unsafe { _mm_cmpeq_epi32(_mm_max_epu32(a.into(), b.into()), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : u32x4 < Sse4_2 > , b : u32x4 < Sse4_2 >) -> mask32x4 < Sse4_2 > { _mm_cmpeq_epi32 (_mm_max_epu32 (a . into () , b . into ()) , a . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_gt_u32x4(self, a: u32x4, b: u32x4) -> mask32x4 { - unsafe { - let sign_bit = _mm_set1_epi32(0x80000000u32.cast_signed()); - let a_signed = _mm_xor_si128(a.into(), sign_bit); - let b_signed = _mm_xor_si128(b.into(), sign_bit); - _mm_cmpgt_epi32(a_signed, b_signed).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : u32x4 < Sse4_2 > , b : u32x4 < Sse4_2 >) -> mask32x4 < Sse4_2 > { let sign_bit = _mm_set1_epi32 (0x80000000u32 . cast_signed ()) ; let a_signed = _mm_xor_si128 (a . into () , sign_bit) ; let b_signed = _mm_xor_si128 (b . into () , sign_bit) ; _mm_cmpgt_epi32 (a_signed , b_signed) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn zip_low_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - unsafe { _mm_unpacklo_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : u32x4 < Sse4_2 > , b : u32x4 < Sse4_2 >) -> u32x4 < Sse4_2 > { _mm_unpacklo_epi32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn zip_high_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - unsafe { _mm_unpackhi_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : u32x4 < Sse4_2 > , b : u32x4 < Sse4_2 >) -> u32x4 < Sse4_2 > { _mm_unpackhi_epi32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn unzip_low_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - unsafe { - let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into()); - let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into()); - _mm_unpacklo_epi64(t1, t2).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : u32x4 < Sse4_2 > , b : u32x4 < Sse4_2 >) -> u32x4 < Sse4_2 > { let t1 = _mm_shuffle_epi32 :: < 0b11_01_10_00 > (a . into ()) ; let t2 = _mm_shuffle_epi32 :: < 0b11_01_10_00 > (b . into ()) ; _mm_unpacklo_epi64 (t1 , t2) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn unzip_high_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - unsafe { - let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into()); - let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into()); - _mm_unpackhi_epi64(t1, t2).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : u32x4 < Sse4_2 > , b : u32x4 < Sse4_2 >) -> u32x4 < Sse4_2 > { let t1 = _mm_shuffle_epi32 :: < 0b11_01_10_00 > (a . into ()) ; let t2 = _mm_shuffle_epi32 :: < 0b11_01_10_00 > (b . into ()) ; _mm_unpackhi_epi64 (t1 , t2) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn interleave_u32x4(self, a: u32x4, b: u32x4) -> (u32x4, u32x4) { @@ -1907,15 +1870,18 @@ impl Simd for Sse4_2 { } #[inline(always)] fn select_u32x4(self, a: mask32x4, b: u32x4, c: u32x4) -> u32x4 { - unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : mask32x4 < Sse4_2 > , b : u32x4 < Sse4_2 > , c : u32x4 < Sse4_2 >) -> u32x4 < Sse4_2 > { _mm_blendv_epi8 (c . into () , b . into () , a . into ()) . simd_into (token) } } + kernel(self, a, b, c) } #[inline(always)] fn min_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - unsafe { _mm_min_epu32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : u32x4 < Sse4_2 > , b : u32x4 < Sse4_2 >) -> u32x4 < Sse4_2 > { _mm_min_epu32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn max_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - unsafe { _mm_max_epu32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : u32x4 < Sse4_2 > , b : u32x4 < Sse4_2 >) -> u32x4 < Sse4_2 > { _mm_max_epu32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn combine_u32x4(self, a: u32x4, b: u32x4) -> u32x8 { @@ -1926,28 +1892,18 @@ impl Simd for Sse4_2 { } #[inline(always)] fn reinterpret_u8_u32x4(self, a: u32x4) -> u8x16 { - __m128i::from(a).simd_into(self) + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : u32x4 < Sse4_2 >) -> u8x16 < Sse4_2 > { __m128i :: from (a) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn cvt_f32_u32x4(self, a: u32x4) -> f32x4 { - unsafe { - let a = a.into(); - let lo = _mm_blend_epi16::<0xAA>(a, _mm_set1_epi32(0x4B000000)); - let hi = _mm_blend_epi16::<0xAA>(_mm_srli_epi32::<16>(a), _mm_set1_epi32(0x53000000)); - let fhi = _mm_sub_ps( - _mm_castsi128_ps(hi), - _mm_set1_ps(f32::from_bits(0x53000080)), - ); - let result = _mm_add_ps(_mm_castsi128_ps(lo), fhi); - result.simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : u32x4 < Sse4_2 >) -> f32x4 < Sse4_2 > { let a = a . into () ; let lo = _mm_blend_epi16 :: < 0xAA > (a , _mm_set1_epi32 (0x4B000000)) ; let hi = _mm_blend_epi16 :: < 0xAA > (_mm_srli_epi32 :: < 16 > (a) , _mm_set1_epi32 (0x53000000)) ; let fhi = _mm_sub_ps (_mm_castsi128_ps (hi) , _mm_set1_ps (f32 :: from_bits (0x53000080))) ; let result = _mm_add_ps (_mm_castsi128_ps (lo) , fhi) ; result . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn splat_mask32x4(self, val: bool) -> mask32x4 { - unsafe { - let val: i32 = if val { !0 } else { 0 }; - _mm_set1_epi32(val).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , val : bool) -> mask32x4 < Sse4_2 > { let val : i32 = if val { ! 0 } else { 0 } ; _mm_set1_epi32 (val) . simd_into (token) } } + kernel(self, val) } #[inline(always)] fn load_array_mask32x4(self, val: [i32; 4usize]) -> mask32x4 { @@ -1962,30 +1918,28 @@ impl Simd for Sse4_2 { } #[inline(always)] fn from_bitmask_mask32x4(self, bits: u64) -> mask32x4 { - unsafe { - { - let bit_lanes = _mm_set1_epi32(bits as i32); - let bit_mask = _mm_setr_epi32(1, 2, 4, 8); - _mm_cmpeq_epi32(_mm_and_si128(bit_lanes, bit_mask), bit_mask) - } - .simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , bits : u64) -> mask32x4 < Sse4_2 > { { let bit_lanes = _mm_set1_epi32 (bits as i32) ; let bit_mask = _mm_setr_epi32 (1 , 2 , 4 , 8) ; _mm_cmpeq_epi32 (_mm_and_si128 (bit_lanes , bit_mask) , bit_mask) } . simd_into (token) } } + kernel(self, bits) } #[inline(always)] fn to_bitmask_mask32x4(self, a: mask32x4) -> u64 { - unsafe { _mm_movemask_ps(_mm_castsi128_ps(a.into())) as u32 as u64 } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : mask32x4 < Sse4_2 >) -> u64 { _mm_movemask_ps (_mm_castsi128_ps (a . into ())) as u32 as u64 } } + kernel(self, a) } #[inline(always)] fn and_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x4 { - unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : mask32x4 < Sse4_2 > , b : mask32x4 < Sse4_2 >) -> mask32x4 < Sse4_2 > { _mm_and_si128 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn or_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x4 { - unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : mask32x4 < Sse4_2 > , b : mask32x4 < Sse4_2 >) -> mask32x4 < Sse4_2 > { _mm_or_si128 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn xor_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x4 { - unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : mask32x4 < Sse4_2 > , b : mask32x4 < Sse4_2 >) -> mask32x4 < Sse4_2 > { _mm_xor_si128 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn not_mask32x4(self, a: mask32x4) -> mask32x4 { @@ -1998,27 +1952,33 @@ impl Simd for Sse4_2 { b: mask32x4, c: mask32x4, ) -> mask32x4 { - unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : mask32x4 < Sse4_2 > , b : mask32x4 < Sse4_2 > , c : mask32x4 < Sse4_2 >) -> mask32x4 < Sse4_2 > { _mm_blendv_epi8 (c . into () , b . into () , a . into ()) . simd_into (token) } } + kernel(self, a, b, c) } #[inline(always)] fn simd_eq_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x4 { - unsafe { _mm_cmpeq_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : mask32x4 < Sse4_2 > , b : mask32x4 < Sse4_2 >) -> mask32x4 < Sse4_2 > { _mm_cmpeq_epi32 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn any_true_mask32x4(self, a: mask32x4) -> bool { - unsafe { _mm_movemask_ps(_mm_castsi128_ps(a.into())) as u32 != 0 } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : mask32x4 < Sse4_2 >) -> bool { _mm_movemask_ps (_mm_castsi128_ps (a . into ())) as u32 != 0 } } + kernel(self, a) } #[inline(always)] fn all_true_mask32x4(self, a: mask32x4) -> bool { - unsafe { _mm_movemask_ps(_mm_castsi128_ps(a.into())) as u32 == 0b1111 } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : mask32x4 < Sse4_2 >) -> bool { _mm_movemask_ps (_mm_castsi128_ps (a . into ())) as u32 == 0b1111 } } + kernel(self, a) } #[inline(always)] fn any_false_mask32x4(self, a: mask32x4) -> bool { - unsafe { _mm_movemask_ps(_mm_castsi128_ps(a.into())) as u32 != 0b1111 } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : mask32x4 < Sse4_2 >) -> bool { _mm_movemask_ps (_mm_castsi128_ps (a . into ())) as u32 != 0b1111 } } + kernel(self, a) } #[inline(always)] fn all_false_mask32x4(self, a: mask32x4) -> bool { - unsafe { _mm_movemask_ps(_mm_castsi128_ps(a.into())) as u32 == 0 } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : mask32x4 < Sse4_2 >) -> bool { _mm_movemask_ps (_mm_castsi128_ps (a . into ())) as u32 == 0 } } + kernel(self, a) } #[inline(always)] fn combine_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x8 { @@ -2029,7 +1989,8 @@ impl Simd for Sse4_2 { } #[inline(always)] fn splat_f64x2(self, val: f64) -> f64x2 { - unsafe { _mm_set1_pd(val).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , val : f64) -> f64x2 < Sse4_2 > { _mm_set1_pd (val) . simd_into (token) } } + kernel(self, val) } #[inline(always)] fn load_array_f64x2(self, val: [f64; 2usize]) -> f64x2 { @@ -2108,15 +2069,18 @@ impl Simd for Sse4_2 { } #[inline(always)] fn abs_f64x2(self, a: f64x2) -> f64x2 { - unsafe { _mm_andnot_pd(_mm_set1_pd(-0.0), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : f64x2 < Sse4_2 >) -> f64x2 < Sse4_2 > { _mm_andnot_pd (_mm_set1_pd (- 0.0) , a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn neg_f64x2(self, a: f64x2) -> f64x2 { - unsafe { _mm_xor_pd(a.into(), _mm_set1_pd(-0.0)).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : f64x2 < Sse4_2 >) -> f64x2 < Sse4_2 > { _mm_xor_pd (a . into () , _mm_set1_pd (- 0.0)) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn sqrt_f64x2(self, a: f64x2) -> f64x2 { - unsafe { _mm_sqrt_pd(a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : f64x2 < Sse4_2 >) -> f64x2 < Sse4_2 > { _mm_sqrt_pd (a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn approximate_recip_f64x2(self, a: f64x2) -> f64x2 { @@ -2124,62 +2088,73 @@ impl Simd for Sse4_2 { } #[inline(always)] fn add_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - unsafe { _mm_add_pd(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : f64x2 < Sse4_2 > , b : f64x2 < Sse4_2 >) -> f64x2 < Sse4_2 > { _mm_add_pd (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn sub_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - unsafe { _mm_sub_pd(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : f64x2 < Sse4_2 > , b : f64x2 < Sse4_2 >) -> f64x2 < Sse4_2 > { _mm_sub_pd (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn mul_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - unsafe { _mm_mul_pd(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : f64x2 < Sse4_2 > , b : f64x2 < Sse4_2 >) -> f64x2 < Sse4_2 > { _mm_mul_pd (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn div_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - unsafe { _mm_div_pd(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : f64x2 < Sse4_2 > , b : f64x2 < Sse4_2 >) -> f64x2 < Sse4_2 > { _mm_div_pd (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn copysign_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - unsafe { - let mask = _mm_set1_pd(-0.0); - _mm_or_pd(_mm_and_pd(mask, b.into()), _mm_andnot_pd(mask, a.into())).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : f64x2 < Sse4_2 > , b : f64x2 < Sse4_2 >) -> f64x2 < Sse4_2 > { let mask = _mm_set1_pd (- 0.0) ; _mm_or_pd (_mm_and_pd (mask , b . into ()) , _mm_andnot_pd (mask , a . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_eq_f64x2(self, a: f64x2, b: f64x2) -> mask64x2 { - unsafe { _mm_castpd_si128(_mm_cmpeq_pd(a.into(), b.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : f64x2 < Sse4_2 > , b : f64x2 < Sse4_2 >) -> mask64x2 < Sse4_2 > { _mm_castpd_si128 (_mm_cmpeq_pd (a . into () , b . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_lt_f64x2(self, a: f64x2, b: f64x2) -> mask64x2 { - unsafe { _mm_castpd_si128(_mm_cmplt_pd(a.into(), b.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : f64x2 < Sse4_2 > , b : f64x2 < Sse4_2 >) -> mask64x2 < Sse4_2 > { _mm_castpd_si128 (_mm_cmplt_pd (a . into () , b . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_le_f64x2(self, a: f64x2, b: f64x2) -> mask64x2 { - unsafe { _mm_castpd_si128(_mm_cmple_pd(a.into(), b.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : f64x2 < Sse4_2 > , b : f64x2 < Sse4_2 >) -> mask64x2 < Sse4_2 > { _mm_castpd_si128 (_mm_cmple_pd (a . into () , b . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_ge_f64x2(self, a: f64x2, b: f64x2) -> mask64x2 { - unsafe { _mm_castpd_si128(_mm_cmpge_pd(a.into(), b.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : f64x2 < Sse4_2 > , b : f64x2 < Sse4_2 >) -> mask64x2 < Sse4_2 > { _mm_castpd_si128 (_mm_cmpge_pd (a . into () , b . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn simd_gt_f64x2(self, a: f64x2, b: f64x2) -> mask64x2 { - unsafe { _mm_castpd_si128(_mm_cmpgt_pd(a.into(), b.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : f64x2 < Sse4_2 > , b : f64x2 < Sse4_2 >) -> mask64x2 < Sse4_2 > { _mm_castpd_si128 (_mm_cmpgt_pd (a . into () , b . into ())) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn zip_low_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - unsafe { _mm_unpacklo_pd(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : f64x2 < Sse4_2 > , b : f64x2 < Sse4_2 >) -> f64x2 < Sse4_2 > { _mm_unpacklo_pd (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn zip_high_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - unsafe { _mm_unpackhi_pd(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : f64x2 < Sse4_2 > , b : f64x2 < Sse4_2 >) -> f64x2 < Sse4_2 > { _mm_unpackhi_pd (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn unzip_low_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - unsafe { _mm_shuffle_pd::<0b00>(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : f64x2 < Sse4_2 > , b : f64x2 < Sse4_2 >) -> f64x2 < Sse4_2 > { _mm_shuffle_pd :: < 0b00 > (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn unzip_high_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - unsafe { _mm_shuffle_pd::<0b11>(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : f64x2 < Sse4_2 > , b : f64x2 < Sse4_2 >) -> f64x2 < Sse4_2 > { _mm_shuffle_pd :: < 0b11 > (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn interleave_f64x2(self, a: f64x2, b: f64x2) -> (f64x2, f64x2) { @@ -2191,27 +2166,23 @@ impl Simd for Sse4_2 { } #[inline(always)] fn max_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - unsafe { _mm_max_pd(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : f64x2 < Sse4_2 > , b : f64x2 < Sse4_2 >) -> f64x2 < Sse4_2 > { _mm_max_pd (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn min_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - unsafe { _mm_min_pd(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : f64x2 < Sse4_2 > , b : f64x2 < Sse4_2 >) -> f64x2 < Sse4_2 > { _mm_min_pd (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn max_precise_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - unsafe { - let intermediate = _mm_max_pd(a.into(), b.into()); - let b_is_nan = _mm_cmpunord_pd(b.into(), b.into()); - _mm_blendv_pd(intermediate, a.into(), b_is_nan).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : f64x2 < Sse4_2 > , b : f64x2 < Sse4_2 >) -> f64x2 < Sse4_2 > { let intermediate = _mm_max_pd (a . into () , b . into ()) ; let b_is_nan = _mm_cmpunord_pd (b . into () , b . into ()) ; _mm_blendv_pd (intermediate , a . into () , b_is_nan) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn min_precise_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - unsafe { - let intermediate = _mm_min_pd(a.into(), b.into()); - let b_is_nan = _mm_cmpunord_pd(b.into(), b.into()); - _mm_blendv_pd(intermediate, a.into(), b_is_nan).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : f64x2 < Sse4_2 > , b : f64x2 < Sse4_2 >) -> f64x2 < Sse4_2 > { let intermediate = _mm_min_pd (a . into () , b . into ()) ; let b_is_nan = _mm_cmpunord_pd (b . into () , b . into ()) ; _mm_blendv_pd (intermediate , a . into () , b_is_nan) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn mul_add_f64x2(self, a: f64x2, b: f64x2, c: f64x2) -> f64x2 { @@ -2223,22 +2194,18 @@ impl Simd for Sse4_2 { } #[inline(always)] fn floor_f64x2(self, a: f64x2) -> f64x2 { - unsafe { - _mm_round_pd::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : f64x2 < Sse4_2 >) -> f64x2 < Sse4_2 > { _mm_round_pd :: < { _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC } > (a . into () ,) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn ceil_f64x2(self, a: f64x2) -> f64x2 { - unsafe { - _mm_round_pd::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : f64x2 < Sse4_2 >) -> f64x2 < Sse4_2 > { _mm_round_pd :: < { _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC } > (a . into () ,) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn round_ties_even_f64x2(self, a: f64x2) -> f64x2 { - unsafe { - _mm_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a.into()) - .simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : f64x2 < Sse4_2 >) -> f64x2 < Sse4_2 > { _mm_round_pd :: < { _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC } > (a . into () ,) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn fract_f64x2(self, a: f64x2) -> f64x2 { @@ -2246,13 +2213,13 @@ impl Simd for Sse4_2 { } #[inline(always)] fn trunc_f64x2(self, a: f64x2) -> f64x2 { - unsafe { - _mm_round_pd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : f64x2 < Sse4_2 >) -> f64x2 < Sse4_2 > { _mm_round_pd :: < { _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC } > (a . into () ,) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn select_f64x2(self, a: mask64x2, b: f64x2, c: f64x2) -> f64x2 { - unsafe { _mm_blendv_pd(c.into(), b.into(), _mm_castsi128_pd(a.into())).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : mask64x2 < Sse4_2 > , b : f64x2 < Sse4_2 > , c : f64x2 < Sse4_2 >) -> f64x2 < Sse4_2 > { _mm_blendv_pd (c . into () , b . into () , _mm_castsi128_pd (a . into ())) . simd_into (token) } } + kernel(self, a, b, c) } #[inline(always)] fn combine_f64x2(self, a: f64x2, b: f64x2) -> f64x4 { @@ -2263,14 +2230,13 @@ impl Simd for Sse4_2 { } #[inline(always)] fn reinterpret_f32_f64x2(self, a: f64x2) -> f32x4 { - unsafe { _mm_castpd_ps(a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : f64x2 < Sse4_2 >) -> f32x4 < Sse4_2 > { _mm_castpd_ps (a . into ()) . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn splat_mask64x2(self, val: bool) -> mask64x2 { - unsafe { - let val: i64 = if val { !0 } else { 0 }; - _mm_set1_epi64x(val).simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , val : bool) -> mask64x2 < Sse4_2 > { let val : i64 = if val { ! 0 } else { 0 } ; _mm_set1_epi64x (val) . simd_into (token) } } + kernel(self, val) } #[inline(always)] fn load_array_mask64x2(self, val: [i64; 2usize]) -> mask64x2 { @@ -2285,30 +2251,28 @@ impl Simd for Sse4_2 { } #[inline(always)] fn from_bitmask_mask64x2(self, bits: u64) -> mask64x2 { - unsafe { - { - let bit_lanes = _mm_set1_epi64x(bits.cast_signed()); - let bit_mask = _mm_set_epi64x(2, 1); - _mm_cmpeq_epi64(_mm_and_si128(bit_lanes, bit_mask), bit_mask) - } - .simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , bits : u64) -> mask64x2 < Sse4_2 > { { let bit_lanes = _mm_set1_epi64x (bits . cast_signed ()) ; let bit_mask = _mm_set_epi64x (2 , 1) ; _mm_cmpeq_epi64 (_mm_and_si128 (bit_lanes , bit_mask) , bit_mask) } . simd_into (token) } } + kernel(self, bits) } #[inline(always)] fn to_bitmask_mask64x2(self, a: mask64x2) -> u64 { - unsafe { _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 as u64 } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : mask64x2 < Sse4_2 >) -> u64 { _mm_movemask_pd (_mm_castsi128_pd (a . into ())) as u32 as u64 } } + kernel(self, a) } #[inline(always)] fn and_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { - unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : mask64x2 < Sse4_2 > , b : mask64x2 < Sse4_2 >) -> mask64x2 < Sse4_2 > { _mm_and_si128 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn or_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { - unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : mask64x2 < Sse4_2 > , b : mask64x2 < Sse4_2 >) -> mask64x2 < Sse4_2 > { _mm_or_si128 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn xor_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { - unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : mask64x2 < Sse4_2 > , b : mask64x2 < Sse4_2 >) -> mask64x2 < Sse4_2 > { _mm_xor_si128 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn not_mask64x2(self, a: mask64x2) -> mask64x2 { @@ -2321,27 +2285,33 @@ impl Simd for Sse4_2 { b: mask64x2, c: mask64x2, ) -> mask64x2 { - unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : mask64x2 < Sse4_2 > , b : mask64x2 < Sse4_2 > , c : mask64x2 < Sse4_2 >) -> mask64x2 < Sse4_2 > { _mm_blendv_epi8 (c . into () , b . into () , a . into ()) . simd_into (token) } } + kernel(self, a, b, c) } #[inline(always)] fn simd_eq_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { - unsafe { _mm_cmpeq_epi64(a.into(), b.into()).simd_into(self) } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : mask64x2 < Sse4_2 > , b : mask64x2 < Sse4_2 >) -> mask64x2 < Sse4_2 > { _mm_cmpeq_epi64 (a . into () , b . into ()) . simd_into (token) } } + kernel(self, a, b) } #[inline(always)] fn any_true_mask64x2(self, a: mask64x2) -> bool { - unsafe { _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 != 0 } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : mask64x2 < Sse4_2 >) -> bool { _mm_movemask_pd (_mm_castsi128_pd (a . into ())) as u32 != 0 } } + kernel(self, a) } #[inline(always)] fn all_true_mask64x2(self, a: mask64x2) -> bool { - unsafe { _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 == 0b11 } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : mask64x2 < Sse4_2 >) -> bool { _mm_movemask_pd (_mm_castsi128_pd (a . into ())) as u32 == 0b11 } } + kernel(self, a) } #[inline(always)] fn any_false_mask64x2(self, a: mask64x2) -> bool { - unsafe { _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 != 0b11 } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : mask64x2 < Sse4_2 >) -> bool { _mm_movemask_pd (_mm_castsi128_pd (a . into ())) as u32 != 0b11 } } + kernel(self, a) } #[inline(always)] fn all_false_mask64x2(self, a: mask64x2) -> bool { - unsafe { _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 == 0 } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : mask64x2 < Sse4_2 >) -> bool { _mm_movemask_pd (_mm_castsi128_pd (a . into ())) as u32 == 0 } } + kernel(self, a) } #[inline(always)] fn combine_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x4 { @@ -3960,14 +3930,8 @@ impl Simd for Sse4_2 { } #[inline(always)] fn narrow_u16x16(self, a: u16x16) -> u8x16 { - let (a, b) = self.split_u16x16(a); - unsafe { - let mask = _mm_set1_epi16(0xFF); - let lo_masked = _mm_and_si128(a.into(), mask); - let hi_masked = _mm_and_si128(b.into(), mask); - let result = _mm_packus_epi16(lo_masked, hi_masked); - result.simd_into(self) - } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : u16x16 < Sse4_2 >) -> u8x16 < Sse4_2 > { let (a , b) = token . split_u16x16 (a) ; let mask = _mm_set1_epi16 (0xFF) ; let lo_masked = _mm_and_si128 (a . into () , mask) ; let hi_masked = _mm_and_si128 (b . into () , mask) ; let result = _mm_packus_epi16 (lo_masked , hi_masked) ; result . simd_into (token) } } + kernel(self, a) } #[inline(always)] fn reinterpret_u8_u16x16(self, a: u16x16) -> u8x32 { @@ -4006,12 +3970,8 @@ impl Simd for Sse4_2 { } #[inline(always)] fn to_bitmask_mask16x16(self, a: mask16x16) -> u64 { - unsafe { - { - let packed = _mm_packs_epi16(a.val.0[0], a.val.0[1]); - _mm_movemask_epi8(packed) as u32 as u64 - } - } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : mask16x16 < Sse4_2 >) -> u64 { { let packed = _mm_packs_epi16 (a . val . 0 [0] , a . val . 0 [1]) ; _mm_movemask_epi8 (packed) as u32 as u64 } } } + kernel(self, a) } #[inline(always)] fn and_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { @@ -6270,46 +6230,8 @@ impl Simd for Sse4_2 { } #[inline(always)] fn from_bitmask_mask8x64(self, bits: u64) -> mask8x64 { - unsafe { - { - let bit_bytes = _mm_set1_epi64x(bits.cast_signed()); - let bit_mask = - _mm_setr_epi8(1, 2, 4, 8, 16, 32, 64, -128, 1, 2, 4, 8, 16, 32, 64, -128); - mask8x64 { - val: crate::support::Aligned512([ - { - let bit_bytes = _mm_shuffle_epi8( - bit_bytes, - _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1), - ); - _mm_cmpeq_epi8(_mm_and_si128(bit_bytes, bit_mask), bit_mask) - }, - { - let bit_bytes = _mm_shuffle_epi8( - bit_bytes, - _mm_setr_epi8(2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3), - ); - _mm_cmpeq_epi8(_mm_and_si128(bit_bytes, bit_mask), bit_mask) - }, - { - let bit_bytes = _mm_shuffle_epi8( - bit_bytes, - _mm_setr_epi8(4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5), - ); - _mm_cmpeq_epi8(_mm_and_si128(bit_bytes, bit_mask), bit_mask) - }, - { - let bit_bytes = _mm_shuffle_epi8( - bit_bytes, - _mm_setr_epi8(6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7), - ); - _mm_cmpeq_epi8(_mm_and_si128(bit_bytes, bit_mask), bit_mask) - }, - ]), - simd: self, - } - } - } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , bits : u64) -> mask8x64 < Sse4_2 > { { let bit_bytes = _mm_set1_epi64x (bits . cast_signed ()) ; let bit_mask = _mm_setr_epi8 (1 , 2 , 4 , 8 , 16 , 32 , 64 , - 128 , 1 , 2 , 4 , 8 , 16 , 32 , 64 , - 128) ; mask8x64 { val : crate :: support :: Aligned512 ([{ let bit_bytes = _mm_shuffle_epi8 (bit_bytes , _mm_setr_epi8 (0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1)) ; _mm_cmpeq_epi8 (_mm_and_si128 (bit_bytes , bit_mask) , bit_mask) } , { let bit_bytes = _mm_shuffle_epi8 (bit_bytes , _mm_setr_epi8 (2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3)) ; _mm_cmpeq_epi8 (_mm_and_si128 (bit_bytes , bit_mask) , bit_mask) } , { let bit_bytes = _mm_shuffle_epi8 (bit_bytes , _mm_setr_epi8 (4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 5 , 5 , 5 , 5 , 5 , 5 , 5 , 5)) ; _mm_cmpeq_epi8 (_mm_and_si128 (bit_bytes , bit_mask) , bit_mask) } , { let bit_bytes = _mm_shuffle_epi8 (bit_bytes , _mm_setr_epi8 (6 , 6 , 6 , 6 , 6 , 6 , 6 , 6 , 7 , 7 , 7 , 7 , 7 , 7 , 7 , 7)) ; _mm_cmpeq_epi8 (_mm_and_si128 (bit_bytes , bit_mask) , bit_mask) }]) , simd : token , } } } } + kernel(self, bits) } #[inline(always)] fn to_bitmask_mask8x64(self, a: mask8x64) -> u64 { @@ -7055,15 +6977,8 @@ impl Simd for Sse4_2 { } #[inline(always)] fn to_bitmask_mask16x32(self, a: mask16x32) -> u64 { - unsafe { - { - let lo = _mm_packs_epi16(a.val.0[0], a.val.0[1]); - let hi = _mm_packs_epi16(a.val.0[2], a.val.0[3]); - let lo = _mm_movemask_epi8(lo) as u32 as u64; - let hi = _mm_movemask_epi8(hi) as u32 as u64; - lo | (hi << 16usize) - } - } + crate::kernel! { # [inline (always)] fn kernel (token : Sse4_2 , a : mask16x32 < Sse4_2 >) -> u64 { { let lo = _mm_packs_epi16 (a . val . 0 [0] , a . val . 0 [1]) ; let hi = _mm_packs_epi16 (a . val . 0 [2] , a . val . 0 [3]) ; let lo = _mm_movemask_epi8 (lo) as u32 as u64 ; let hi = _mm_movemask_epi8 (hi) as u32 as u64 ; lo | (hi << 16usize) } } } + kernel(self, a) } #[inline(always)] fn and_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { diff --git a/fearless_simd_gen/src/mk_neon.rs b/fearless_simd_gen/src/mk_neon.rs index 9765c06df..3d624a227 100644 --- a/fearless_simd_gen/src/mk_neon.rs +++ b/fearless_simd_gen/src/mk_neon.rs @@ -83,14 +83,14 @@ impl Level for Neon { OpSig::Splat => { let expr = neon::expr(method, vec_ty, &[quote! { val }]); let normalize_mask = integer_lane_mask_splat_arg(vec_ty); - quote! { - #method_sig { - unsafe { - #normalize_mask - #expr.simd_into(self) - } - } - } + op.simd_trait_kernel_method( + self.token(), + vec_ty, + quote! { + #normalize_mask + #expr.simd_into(self) + }, + ) } OpSig::Shift => { let dup_type = vec_ty.cast(ScalarType::Int); @@ -109,26 +109,14 @@ impl Level for Neon { vec_ty, &[quote! { a.into() }, quote! { #dup_intrinsic ( #shift ) }], ); - quote! { - #method_sig { - unsafe { - #expr.simd_into(self) - } - } - } + op.simd_trait_kernel_method(self.token(), vec_ty, quote! { #expr.simd_into(self) }) } OpSig::Unary => { let args = [quote! { a.into() }]; let expr = neon::expr(method, vec_ty, &args); - quote! { - #method_sig { - unsafe { - #expr.simd_into(self) - } - } - } + op.simd_trait_kernel_method(self.token(), vec_ty, quote! { #expr.simd_into(self) }) } OpSig::LoadInterleaved { block_size, @@ -187,17 +175,17 @@ impl Level for Neon { let id2 = Ident::new(&format!("vcombine_{}", target_scalar_ty), Span::call_site()); - quote! { - #method_sig { - unsafe { - let converted: #arch = a.into(); - let low = #id1(converted.0); - let high = #id1(converted.1); + op.simd_trait_kernel_method( + self.token(), + vec_ty, + quote! { + let converted: #arch = a.into(); + let low = #id1(converted.0); + let high = #id1(converted.1); - #id2(low, high).simd_into(self) - } - } - } + #id2(low, high).simd_into(self) + }, + ) } else { let arch = self.arch_ty(&target_ty); let id1 = Ident::new(&format!("vmovl_{}", vec_scalar_ty), Span::call_site()); @@ -205,16 +193,16 @@ impl Level for Neon { let id3 = Ident::new(&format!("vget_high_{}", vec_scalar_ty), Span::call_site()); - quote! { - #method_sig { - unsafe { - let low = #id1(#id2(a.into())); - let high = #id1(#id3(a.into())); + op.simd_trait_kernel_method( + self.token(), + vec_ty, + quote! { + let low = #id1(#id2(a.into())); + let high = #id1(#id3(a.into())); - #arch(low, high).simd_into(self) - } - } - } + #arch(low, high).simd_into(self) + }, + ) } } OpSig::Binary => { @@ -263,13 +251,7 @@ impl Level for Neon { } }; - quote! { - #method_sig { - unsafe { - #expr - } - } - } + op.simd_trait_kernel_method(self.token(), vec_ty, expr) } OpSig::Ternary => { let args = match method { @@ -291,13 +273,7 @@ impl Level for Neon { let neg = simple_intrinsic("vneg", vec_ty); expr = quote! { #neg(#expr) }; } - quote! { - #method_sig { - unsafe { - #expr.simd_into(self) - } - } - } + op.simd_trait_kernel_method(self.token(), vec_ty, quote! { #expr.simd_into(self) }) } OpSig::Compare => { let args = [quote! { a.into() }, quote! { b.into() }]; @@ -306,13 +282,11 @@ impl Level for Neon { let scalar_bits = vec_ty.scalar_bits; let reinterpret_str = format!("vreinterpret{opt_q}_s{scalar_bits}_u{scalar_bits}"); let reinterpret = Ident::new(&reinterpret_str, Span::call_site()); - quote! { - #method_sig { - unsafe { - #reinterpret(#expr).simd_into(self) - } - } - } + op.simd_trait_kernel_method( + self.token(), + vec_ty, + quote! { #reinterpret(#expr).simd_into(self) }, + ) } OpSig::Select => { let opt_q = neon::opt_q(vec_ty); @@ -320,13 +294,11 @@ impl Level for Neon { let reinterpret_str = format!("vreinterpret{opt_q}_u{scalar_bits}_s{scalar_bits}"); let reinterpret = Ident::new(&reinterpret_str, Span::call_site()); let vbsl = simple_intrinsic("vbsl", vec_ty); - quote! { - #method_sig { - unsafe { - #vbsl(#reinterpret(a.into()), b.into(), c.into()).simd_into(self) - } - } - } + op.simd_trait_kernel_method( + self.token(), + vec_ty, + quote! { #vbsl(#reinterpret(a.into()), b.into(), c.into()).simd_into(self) }, + ) } OpSig::Combine { combined_ty } => { let combined_wrapper = combined_ty.aligned_wrapper(); @@ -375,28 +347,28 @@ impl Level for Neon { OpSig::Zip { select_low } => { let neon = if select_low { "vzip1" } else { "vzip2" }; let zip = simple_intrinsic(neon, vec_ty); - quote! { - #method_sig { + op.simd_trait_kernel_method( + self.token(), + vec_ty, + quote! { let x = a.into(); let y = b.into(); - unsafe { - #zip(x, y).simd_into(self) - } - } - } + #zip(x, y).simd_into(self) + }, + ) } OpSig::Unzip { select_even } => { let neon = if select_even { "vuzp1" } else { "vuzp2" }; let zip = simple_intrinsic(neon, vec_ty); - quote! { - #method_sig { + op.simd_trait_kernel_method( + self.token(), + vec_ty, + quote! { let x = a.into(); let y = b.into(); - unsafe { - #zip(x, y).simd_into(self) - } - } - } + #zip(x, y).simd_into(self) + }, + ) } OpSig::Slide { granularity } => { use SlideGranularity::*; @@ -480,13 +452,11 @@ impl Level for Neon { } else { let to_ty = &vec_ty.reinterpret(target_ty, scalar_bits); let neon = cvt_intrinsic("vcvt", to_ty, vec_ty); - quote! { - #method_sig { - unsafe { - #neon(a.into()).simd_into(self) - } - } - } + op.simd_trait_kernel_method( + self.token(), + vec_ty, + quote! { #neon(a.into()).simd_into(self) }, + ) } } OpSig::Reinterpret { @@ -497,13 +467,11 @@ impl Level for Neon { let to_ty = vec_ty.reinterpret(target_ty, scalar_bits); let neon = cvt_intrinsic("vreinterpret", &to_ty, vec_ty); - quote! { - #method_sig { - unsafe { - #neon(a.into()).simd_into(self) - } - } - } + op.simd_trait_kernel_method( + self.token(), + vec_ty, + quote! { #neon(a.into()).simd_into(self) }, + ) } else { quote! {} } @@ -522,13 +490,11 @@ impl Level for Neon { let u32_ty = vec_ty.reinterpret(ScalarType::Unsigned, 32); let min_max = simple_intrinsic(reduction, &u32_ty); let reinterpret = format_ident!("vreinterpretq_u32_s{}", vec_ty.scalar_bits); - quote! { - #method_sig { - unsafe { - #min_max(#reinterpret(a.into())) #target - } - } - } + op.simd_trait_kernel_method( + self.token(), + vec_ty, + quote! { #min_max(#reinterpret(a.into())) #target }, + ) } OpSig::MaskFromBitmask => self.handle_mask_from_bitmask(method_sig, vec_ty), OpSig::MaskToBitmask => self.handle_mask_to_bitmask(method_sig, vec_ty), diff --git a/fearless_simd_gen/src/mk_x86.rs b/fearless_simd_gen/src/mk_x86.rs index eea35754c..6eb8b2c6a 100644 --- a/fearless_simd_gen/src/mk_x86.rs +++ b/fearless_simd_gen/src/mk_x86.rs @@ -186,36 +186,36 @@ impl Level for X86 { let method_sig = op.simd_trait_method_sig(vec_ty); match sig { - OpSig::Splat => self.handle_splat(method_sig, vec_ty), - OpSig::Compare => self.handle_compare(method_sig, method, vec_ty), - OpSig::Unary => self.handle_unary(method_sig, method, vec_ty), + OpSig::Splat => self.handle_splat(op, vec_ty), + OpSig::Compare => self.handle_compare(op, method, vec_ty), + OpSig::Unary => self.handle_unary(op, method_sig, method, vec_ty), OpSig::WidenNarrow { target_ty } => { - self.handle_widen_narrow(method_sig, method, vec_ty, target_ty) - } - OpSig::Binary => self.handle_binary(method_sig, method, vec_ty), - OpSig::Shift => self.handle_shift(method_sig, method, vec_ty), - OpSig::Ternary => self.handle_ternary(method_sig, method, vec_ty), - OpSig::Select => self.handle_select(method_sig, vec_ty), - OpSig::Combine { combined_ty } => self.handle_combine(method_sig, vec_ty, &combined_ty), - OpSig::Split { half_ty } => self.handle_split(method_sig, vec_ty, &half_ty), - OpSig::Zip { select_low } => self.handle_zip(method_sig, vec_ty, select_low), - OpSig::Unzip { select_even } => self.handle_unzip(method_sig, vec_ty, select_even), + self.handle_widen_narrow(op, method, vec_ty, target_ty) + } + OpSig::Binary => self.handle_binary(op, method, vec_ty), + OpSig::Shift => self.handle_shift(op, method, vec_ty), + OpSig::Ternary => self.handle_ternary(op, method_sig, method, vec_ty), + OpSig::Select => self.handle_select(op, vec_ty), + OpSig::Combine { combined_ty } => self.handle_combine(op, vec_ty, &combined_ty), + OpSig::Split { half_ty } => self.handle_split(op, vec_ty, &half_ty), + OpSig::Zip { select_low } => self.handle_zip(op, vec_ty, select_low), + OpSig::Unzip { select_even } => self.handle_unzip(op, vec_ty, select_even), OpSig::Slide { granularity } => self.handle_slide(method_sig, vec_ty, granularity), OpSig::Cvt { target_ty, scalar_bits, precise, - } => self.handle_cvt(method_sig, vec_ty, target_ty, scalar_bits, precise), + } => self.handle_cvt(op, vec_ty, target_ty, scalar_bits, precise), OpSig::Reinterpret { target_ty, scalar_bits, - } => self.handle_reinterpret(self, method_sig, vec_ty, target_ty, scalar_bits), + } => self.handle_reinterpret(self, op, vec_ty, target_ty, scalar_bits), OpSig::MaskReduce { quantifier, condition, - } => self.handle_mask_reduce(method_sig, vec_ty, quantifier, condition), - OpSig::MaskFromBitmask => self.handle_mask_from_bitmask(method_sig, vec_ty), - OpSig::MaskToBitmask => self.handle_mask_to_bitmask(method_sig, vec_ty), + } => self.handle_mask_reduce(op, vec_ty, quantifier, condition), + OpSig::MaskFromBitmask => self.handle_mask_from_bitmask(op, vec_ty), + OpSig::MaskToBitmask => self.handle_mask_to_bitmask(op, vec_ty), OpSig::LoadInterleaved { block_size, block_count, @@ -233,8 +233,8 @@ impl Level for X86 { OpSig::StoreArray => generic_store_array(method_sig, vec_ty), OpSig::FromBytes => generic_from_bytes(method_sig, vec_ty), OpSig::ToBytes => generic_to_bytes(method_sig, vec_ty), - OpSig::Interleave => self.handle_interleave(method_sig, vec_ty), - OpSig::Deinterleave => self.handle_deinterleave(method_sig, vec_ty), + OpSig::Interleave => self.handle_interleave(op, vec_ty), + OpSig::Deinterleave => self.handle_deinterleave(op, vec_ty), } } } @@ -594,21 +594,21 @@ fn signed_literal(value: u64, bits: u32) -> TokenStream { } impl X86 { - pub(crate) fn handle_splat(&self, method_sig: TokenStream, vec_ty: &VecType) -> TokenStream { + pub(crate) fn handle_splat(&self, op: Op, vec_ty: &VecType) -> TokenStream { let intrinsic = set1_intrinsic(vec_ty); let cast = match vec_ty.scalar { ScalarType::Unsigned => quote!(.cast_signed()), _ => quote!(), }; let normalize_mask = integer_lane_mask_splat_arg(vec_ty); - quote! { - #method_sig { - unsafe { - #normalize_mask - #intrinsic(val #cast).simd_into(self) - } - } - } + op.simd_trait_kernel_method( + self.token(), + vec_ty, + quote! { + #normalize_mask + #intrinsic(val #cast).simd_into(self) + }, + ) } fn has_specialized_mask_from_bitmask(&self, vec_ty: &VecType) -> bool { @@ -634,11 +634,7 @@ impl X86 { vec_ty.scalar == ScalarType::Mask && vec_ty.scalar_bits == 16 } - pub(crate) fn handle_mask_from_bitmask( - &self, - method_sig: TokenStream, - vec_ty: &VecType, - ) -> TokenStream { + pub(crate) fn handle_mask_from_bitmask(&self, op: Op, vec_ty: &VecType) -> TokenStream { assert_eq!( vec_ty.scalar, ScalarType::Mask, @@ -647,24 +643,12 @@ impl X86 { if self.has_wide_byte_mask_from_bitmask(vec_ty) { let expr = mask_from_bitmask_wide_bytes(self.native_width(), vec_ty); - return quote! { - #method_sig { - unsafe { - #expr - } - } - }; + return op.simd_trait_kernel_method(self.token(), vec_ty, expr); } if self.has_wide_avx2_mask_from_bitmask(vec_ty) { let expr = mask_from_bitmask_wide_avx2(vec_ty); - return quote! { - #method_sig { - unsafe { - #expr - } - } - }; + return op.simd_trait_kernel_method(self.token(), vec_ty, expr); } let expr = match vec_ty.scalar_bits { @@ -683,20 +667,10 @@ impl X86 { _ => unreachable!(), }; - quote! { - #method_sig { - unsafe { - #expr - } - } - } + op.simd_trait_kernel_method(self.token(), vec_ty, expr) } - pub(crate) fn handle_mask_to_bitmask( - &self, - method_sig: TokenStream, - vec_ty: &VecType, - ) -> TokenStream { + pub(crate) fn handle_mask_to_bitmask(&self, op: Op, vec_ty: &VecType) -> TokenStream { assert_eq!( vec_ty.scalar, ScalarType::Mask, @@ -707,21 +681,15 @@ impl X86 { 8 => { let bits_ty = vec_ty.reinterpret(ScalarType::Int, 8); let movemask = simple_intrinsic("movemask", &bits_ty); - quote! { - #method_sig { - unsafe { #movemask(a.into()) as u32 as u64 } - } - } + op.simd_trait_kernel_method( + self.token(), + vec_ty, + quote! { #movemask(a.into()) as u32 as u64 }, + ) } 16 => { let bits = mask_to_bitmask_words(self.native_width(), vec_ty); - quote! { - #method_sig { - unsafe { - #bits - } - } - } + op.simd_trait_kernel_method(self.token(), vec_ty, bits) } 32 | 64 => { let float_ty = vec_ty.cast(ScalarType::Float); @@ -733,22 +701,17 @@ impl X86 { vec_ty.scalar_bits, vec_ty.n_bits(), ); - quote! { - #method_sig { - unsafe { #movemask(#cast(a.into())) as u32 as u64 } - } - } + op.simd_trait_kernel_method( + self.token(), + vec_ty, + quote! { #movemask(#cast(a.into())) as u32 as u64 }, + ) } _ => unreachable!(), } } - pub(crate) fn handle_compare( - &self, - method_sig: TokenStream, - method: &str, - vec_ty: &VecType, - ) -> TokenStream { + pub(crate) fn handle_compare(&self, op: Op, method: &str, vec_ty: &VecType) -> TokenStream { let args = [quote! { a.into() }, quote! { b.into() }]; let expr = if vec_ty.scalar != ScalarType::Float { @@ -817,15 +780,12 @@ impl X86 { quote! { #ident(#compare_op(a.into(), b.into())) } }; - quote! { - #method_sig { - unsafe { #expr.simd_into(self) } - } - } + op.simd_trait_kernel_method(self.token(), vec_ty, quote! { #expr.simd_into(self) }) } pub(crate) fn handle_unary( &self, + op: Op, method_sig: TokenStream, method: &str, vec_ty: &VecType, @@ -865,18 +825,14 @@ impl X86 { _ => { let args = [quote! { a.into() }]; let expr = x86::expr(method, vec_ty, &args); - quote! { - #method_sig { - unsafe { #expr.simd_into(self) } - } - } + op.simd_trait_kernel_method(self.token(), vec_ty, quote! { #expr.simd_into(self) }) } } } pub(crate) fn handle_widen_narrow( &self, - method_sig: TokenStream, + op: Op, method: &str, vec_ty: &VecType, target_ty: VecType, @@ -893,9 +849,7 @@ impl X86 { dst_width, ); quote! { - unsafe { - #extend(a.into()).simd_into(self) - } + #extend(a.into()).simd_into(self) } } (Self::Avx2, 512, 256) => { @@ -911,12 +865,10 @@ impl X86 { ); let split = generic_op_name("split", vec_ty); quote! { - unsafe { - let (a0, a1) = self.#split(a); - let high = #extend(a0.into()).simd_into(self); - let low = #extend(a1.into()).simd_into(self); - self.#combine(high, low) - } + let (a0, a1) = self.#split(a); + let high = #extend(a0.into()).simd_into(self); + let low = #extend(a1.into()).simd_into(self); + self.#combine(high, low) } } (Self::Sse4_2, 256, 128) => { @@ -931,14 +883,12 @@ impl X86 { &vec_ty.reinterpret(vec_ty.scalar, vec_ty.scalar_bits * 2), ); quote! { - unsafe { - let raw = a.into(); - let high = #extend(raw).simd_into(self); - // Shift by 8 since we want to get the higher part into the - // lower position. - let low = #extend(_mm_srli_si128::<8>(raw)).simd_into(self); - self.#combine(high, low) - } + let raw = a.into(); + let high = #extend(raw).simd_into(self); + // Shift by 8 since we want to get the higher part into the + // lower position. + let low = #extend(_mm_srli_si128::<8>(raw)).simd_into(self); + self.#combine(high, low) } } _ => unimplemented!(), @@ -954,14 +904,12 @@ impl X86 { _ => unimplemented!(), }; quote! { - unsafe { - let mask = _mm256_setr_epi8(#mask, #mask); + let mask = _mm256_setr_epi8(#mask, #mask); - let shuffled = _mm256_shuffle_epi8(a.into(), mask); - let packed = _mm256_permute4x64_epi64::<0b11_01_10_00>(shuffled); + let shuffled = _mm256_shuffle_epi8(a.into(), mask); + let packed = _mm256_permute4x64_epi64::<0b11_01_10_00>(shuffled); - _mm256_castsi256_si128(packed).simd_into(self) - } + _mm256_castsi256_si128(packed).simd_into(self) } } (Self::Avx2, 256, 512) => { @@ -978,17 +926,15 @@ impl X86 { let split = generic_op_name("split", vec_ty); quote! { let (a, b) = self.#split(a); - unsafe { - // Note that AVX2 only has an intrinsic for saturating cast, - // but not wrapping. - let mask = #mask(0xFF); - let lo_masked = _mm256_and_si256(a.into(), mask); - let hi_masked = _mm256_and_si256(b.into(), mask); - // The 256-bit version of packus_epi16 operates lane-wise, so we need to arrange things - // properly afterwards. - let result = _mm256_permute4x64_epi64::<0b_11_01_10_00>(#pack(lo_masked, hi_masked)); - result.simd_into(self) - } + // Note that AVX2 only has an intrinsic for saturating cast, + // but not wrapping. + let mask = #mask(0xFF); + let lo_masked = _mm256_and_si256(a.into(), mask); + let hi_masked = _mm256_and_si256(b.into(), mask); + // The 256-bit version of packus_epi16 operates lane-wise, so we need to arrange things + // properly afterwards. + let result = _mm256_permute4x64_epi64::<0b_11_01_10_00>(#pack(lo_masked, hi_masked)); + result.simd_into(self) } } (Self::Sse4_2, 128, 256) => { @@ -1005,14 +951,12 @@ impl X86 { let split = generic_op_name("split", vec_ty); quote! { let (a, b) = self.#split(a); - unsafe { - // Below AVX-512. we only have an intrinsic for saturating cast, but not wrapping. - let mask = #mask(0xFF); - let lo_masked = _mm_and_si128(a.into(), mask); - let hi_masked = _mm_and_si128(b.into(), mask); - let result = #pack(lo_masked, hi_masked); - result.simd_into(self) - } + // Below AVX-512. we only have an intrinsic for saturating cast, but not wrapping. + let mask = #mask(0xFF); + let lo_masked = _mm_and_si128(a.into(), mask); + let hi_masked = _mm_and_si128(b.into(), mask); + let result = #pack(lo_masked, hi_masked); + result.simd_into(self) } } _ => unimplemented!(), @@ -1021,19 +965,11 @@ impl X86 { _ => unreachable!(), }; - quote! { - #method_sig { - #expr - } - } + op.simd_trait_kernel_method(self.token(), vec_ty, expr) } - pub(crate) fn handle_binary( - &self, - method_sig: TokenStream, - method: &str, - vec_ty: &VecType, - ) -> TokenStream { + pub(crate) fn handle_binary(&self, op: Op, method: &str, vec_ty: &VecType) -> TokenStream { + let method_sig = op.simd_trait_method_sig(vec_ty); let body = match method { "mul" if vec_ty.scalar_bits == 8 => { // https://stackoverflow.com/questions/8193601/sse-multiplication-16-x-uint8-t @@ -1044,12 +980,10 @@ impl X86 { let slli = intrinsic_ident("slli", "epi16", vec_ty.n_bits()); let srli = intrinsic_ident("srli", "epi16", vec_ty.n_bits()); quote! { - unsafe { - let dst_even = #mullo(a.into(), b.into()); - let dst_odd = #mullo(#srli::<8>(a.into()), #srli::<8>(b.into())); + let dst_even = #mullo(a.into(), b.into()); + let dst_odd = #mullo(#srli::<8>(a.into()), #srli::<8>(b.into())); - #or(#slli(dst_odd, 8), #and(dst_even, #set1(0xFF))).simd_into(self) - } + #or(#slli(dst_odd, 8), #and(dst_even, #set1(0xFF))).simd_into(self) } } "shlv" | "shrv" if *self == Self::Avx2 && vec_ty.scalar_bits >= 32 => { @@ -1062,7 +996,7 @@ impl X86 { }; let intrinsic = intrinsic_ident(name, suffix, vec_ty.n_bits()); quote! { - unsafe { #intrinsic(a.into(), b.into()).simd_into(self) } + #intrinsic(a.into(), b.into()).simd_into(self) } } // SSE2 has shift operations, but they shift every lane by the same amount, so we can't use them here. @@ -1072,25 +1006,25 @@ impl X86 { let args = [quote! { a.into() }, quote! { b.into() }]; let expr = x86::expr(method, vec_ty, &args); quote! { - unsafe { #expr.simd_into(self) } + #expr.simd_into(self) } } }; - quote! { - #method_sig { - #body + match method { + "shlv" | "shrv" if !(*self == Self::Avx2 && vec_ty.scalar_bits >= 32) => { + quote! { + #method_sig { + #body + } + } } + _ => op.simd_trait_kernel_method(self.token(), vec_ty, body), } } - pub(crate) fn handle_shift( - &self, - method_sig: TokenStream, - method: &str, - vec_ty: &VecType, - ) -> TokenStream { - let op = match (method, vec_ty.scalar) { + pub(crate) fn handle_shift(&self, op: Op, method: &str, vec_ty: &VecType) -> TokenStream { + let shift_op = match (method, vec_ty.scalar) { ("shr", ScalarType::Unsigned) => "srl", ("shr", ScalarType::Int) => "sra", ("shl", _) => "sll", @@ -1098,7 +1032,7 @@ impl X86 { }; let ty_bits = vec_ty.n_bits(); let suffix = op_suffix(vec_ty.scalar, vec_ty.scalar_bits.max(16), false); - let shift_intrinsic = intrinsic_ident(op, suffix, ty_bits); + let shift_intrinsic = intrinsic_ident(shift_op, suffix, ty_bits); if vec_ty.scalar_bits == 8 { // x86 doesn't have shifting for 8-bit, so we first convert into 16-bit, shift, and then back to 8-bit. @@ -1124,33 +1058,34 @@ impl X86 { let extend_intrinsic_hi = extend_expr(unpack_hi); let pack_intrinsic = pack_intrinsic(16, vec_ty.scalar == ScalarType::Int, ty_bits); - quote! { - #method_sig { - unsafe { - let val = a.into(); - let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); + op.simd_trait_kernel_method( + self.token(), + vec_ty, + quote! { + let val = a.into(); + let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); - let lo_16 = #extend_intrinsic_lo; - let hi_16 = #extend_intrinsic_hi; + let lo_16 = #extend_intrinsic_lo; + let hi_16 = #extend_intrinsic_hi; - let lo_shifted = #shift_intrinsic(lo_16, shift_count); - let hi_shifted = #shift_intrinsic(hi_16, shift_count); + let lo_shifted = #shift_intrinsic(lo_16, shift_count); + let hi_shifted = #shift_intrinsic(hi_16, shift_count); - #pack_intrinsic(lo_shifted, hi_shifted).simd_into(self) - } - } - } + #pack_intrinsic(lo_shifted, hi_shifted).simd_into(self) + }, + ) } else { - quote! { - #method_sig { - unsafe { #shift_intrinsic(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) } - } - } + op.simd_trait_kernel_method( + self.token(), + vec_ty, + quote! { #shift_intrinsic(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) }, + ) } } pub(crate) fn handle_ternary( &self, + op: Op, method_sig: TokenStream, method: &str, vec_ty: &VecType, @@ -1158,19 +1093,19 @@ impl X86 { match method { "mul_add" if *self == Self::Avx2 => { let intrinsic = simple_intrinsic("fmadd", vec_ty); - quote! { - #method_sig { - unsafe { #intrinsic(a.into(), b.into(), c.into()).simd_into(self) } - } - } + op.simd_trait_kernel_method( + self.token(), + vec_ty, + quote! { #intrinsic(a.into(), b.into(), c.into()).simd_into(self) }, + ) } "mul_sub" if *self == Self::Avx2 => { let intrinsic = simple_intrinsic("fmsub", vec_ty); - quote! { - #method_sig { - unsafe { #intrinsic(a.into(), b.into(), c.into()).simd_into(self) } - } - } + op.simd_trait_kernel_method( + self.token(), + vec_ty, + quote! { #intrinsic(a.into(), b.into(), c.into()).simd_into(self) }, + ) } "mul_add" => { quote! { @@ -1194,16 +1129,12 @@ impl X86 { ]; let expr = x86::expr(method, vec_ty, &args); - quote! { - #method_sig { - #expr.simd_into(self) - } - } + op.simd_trait_kernel_method(self.token(), vec_ty, quote! { #expr.simd_into(self) }) } } } - pub(crate) fn handle_select(&self, method_sig: TokenStream, vec_ty: &VecType) -> TokenStream { + pub(crate) fn handle_select(&self, op: Op, vec_ty: &VecType) -> TokenStream { // Our select ops' argument order is mask, a, b; Intel's intrinsics are b, a, mask let args = [ quote! { c.into() }, @@ -1224,43 +1155,35 @@ impl X86 { ]; let expr = x86::expr("select", vec_ty, &args); - quote! { - #method_sig { - unsafe { #expr.simd_into(self) } - } - } + op.simd_trait_kernel_method(self.token(), vec_ty, quote! { #expr.simd_into(self) }) } - pub(crate) fn handle_split( - &self, - method_sig: TokenStream, - vec_ty: &VecType, - half_ty: &VecType, - ) -> TokenStream { + pub(crate) fn handle_split(&self, op: Op, vec_ty: &VecType, half_ty: &VecType) -> TokenStream { if *self == Self::Avx2 && half_ty.n_bits() == 128 { let extract_op = match vec_ty.scalar { ScalarType::Float => "extractf128", _ => "extracti128", }; let extract_intrinsic = intrinsic_ident(extract_op, coarse_type(vec_ty), 256); - quote! { - #method_sig { - unsafe { - ( - #extract_intrinsic::<0>(a.into()).simd_into(self), - #extract_intrinsic::<1>(a.into()).simd_into(self), - ) - } - } - } + op.simd_trait_kernel_method( + self.token(), + vec_ty, + quote! { + ( + #extract_intrinsic::<0>(a.into()).simd_into(self), + #extract_intrinsic::<1>(a.into()).simd_into(self), + ) + }, + ) } else { + let method_sig = op.simd_trait_method_sig(vec_ty); generic_block_split(method_sig, half_ty, self.max_block_size()) } } pub(crate) fn handle_combine( &self, - method_sig: TokenStream, + op: Op, vec_ty: &VecType, combined_ty: &VecType, ) -> TokenStream { @@ -1271,24 +1194,18 @@ impl X86 { _ => "m128i", }; let set_intrinsic = intrinsic_ident("setr", suffix, 256); - quote! { - #method_sig { - unsafe { - #set_intrinsic(a.into(), b.into()).simd_into(self) - } - } - } + op.simd_trait_kernel_method( + self.token(), + vec_ty, + quote! { #set_intrinsic(a.into(), b.into()).simd_into(self) }, + ) } else { + let method_sig = op.simd_trait_method_sig(vec_ty); generic_block_combine(method_sig, combined_ty, self.max_block_size()) } } - pub(crate) fn handle_zip( - &self, - method_sig: TokenStream, - vec_ty: &VecType, - select_low: bool, - ) -> TokenStream { + pub(crate) fn handle_zip(&self, op: Op, vec_ty: &VecType, select_low: bool) -> TokenStream { let expr = match vec_ty.n_bits() { 128 => { let op = if select_low { "unpacklo" } else { "unpackhi" }; @@ -1296,7 +1213,7 @@ impl X86 { let suffix = op_suffix(vec_ty.scalar, vec_ty.scalar_bits, false); let unpack_intrinsic = intrinsic_ident(op, suffix, vec_ty.n_bits()); quote! { - unsafe { #unpack_intrinsic(a.into(), b.into()).simd_into(self) } + #unpack_intrinsic(a.into(), b.into()).simd_into(self) } } 256 => { @@ -1319,29 +1236,19 @@ impl X86 { ); quote! { - unsafe { - let lo = #lo(a.into(), b.into()); - let hi = #hi(a.into(), b.into()); + let lo = #lo(a.into(), b.into()); + let hi = #hi(a.into(), b.into()); - #shuffle::<#shuffle_immediate>(lo, hi).simd_into(self) - } + #shuffle::<#shuffle_immediate>(lo, hi).simd_into(self) } } _ => unreachable!(), }; - quote! { - #method_sig { - #expr - } - } + op.simd_trait_kernel_method(self.token(), vec_ty, expr) } - pub(crate) fn handle_interleave( - &self, - method_sig: TokenStream, - vec_ty: &VecType, - ) -> TokenStream { + pub(crate) fn handle_interleave(&self, op: Op, vec_ty: &VecType) -> TokenStream { match vec_ty.n_bits() { 256 => { // Optimized path: compute unpacklo and unpackhi once, then use permute2f128 to @@ -1358,24 +1265,25 @@ impl X86 { coarse_type(vec_ty), 256, ); - quote! { - #method_sig { - unsafe { - let lo = #lo(a.into(), b.into()); - let hi = #hi(a.into(), b.into()); - ( - #shuffle::<0b0010_0000>(lo, hi).simd_into(self), - #shuffle::<0b0011_0001>(lo, hi).simd_into(self), - ) - } - } - } + op.simd_trait_kernel_method( + self.token(), + vec_ty, + quote! { + let lo = #lo(a.into(), b.into()); + let hi = #hi(a.into(), b.into()); + ( + #shuffle::<0b0010_0000>(lo, hi).simd_into(self), + #shuffle::<0b0011_0001>(lo, hi).simd_into(self), + ) + }, + ) } _ => { // For 128-bit vectors, zip_low/zip_high are single instructions (unpacklo/unpackhi), // so there's no redundancy in calling them separately. let zip_low = generic_op_name("zip_low", vec_ty); let zip_high = generic_op_name("zip_high", vec_ty); + let method_sig = op.simd_trait_method_sig(vec_ty); quote! { #method_sig { (self.#zip_low(a, b), self.#zip_high(a, b)) @@ -1385,11 +1293,7 @@ impl X86 { } } - pub(crate) fn handle_deinterleave( - &self, - method_sig: TokenStream, - vec_ty: &VecType, - ) -> TokenStream { + pub(crate) fn handle_deinterleave(&self, op: Op, vec_ty: &VecType) -> TokenStream { match vec_ty.n_bits() { 256 => { // Optimized path: compute the per-input shuffles once, then use permute2f128 / @@ -1397,24 +1301,25 @@ impl X86 { // the redundant shuffle operations that occur when unzip_low and unzip_high are // called separately. let (t1, t2, shuffle) = self.unzip256_intermediates(vec_ty); - quote! { - #method_sig { - unsafe { - let t1 = #t1; - let t2 = #t2; - ( - #shuffle::<0b0010_0000>(t1, t2).simd_into(self), - #shuffle::<0b0011_0001>(t1, t2).simd_into(self), - ) - } - } - } + op.simd_trait_kernel_method( + self.token(), + vec_ty, + quote! { + let t1 = #t1; + let t2 = #t2; + ( + #shuffle::<0b0010_0000>(t1, t2).simd_into(self), + #shuffle::<0b0011_0001>(t1, t2).simd_into(self), + ) + }, + ) } _ => { // For 128-bit vectors, unzip_low/unzip_high are cheap, so there's no // redundancy in calling them separately. let unzip_low = generic_op_name("unzip_low", vec_ty); let unzip_high = generic_op_name("unzip_high", vec_ty); + let method_sig = op.simd_trait_method_sig(vec_ty); quote! { #method_sig { (self.#unzip_low(a, b), self.#unzip_high(a, b)) @@ -1476,12 +1381,7 @@ impl X86 { (t1, t2, shuffle) } - pub(crate) fn handle_unzip( - &self, - method_sig: TokenStream, - vec_ty: &VecType, - select_even: bool, - ) -> TokenStream { + pub(crate) fn handle_unzip(&self, op: Op, vec_ty: &VecType, select_even: bool) -> TokenStream { let expr = match (vec_ty.scalar, vec_ty.n_bits(), vec_ty.scalar_bits) { (ScalarType::Float, 128, _) => { // 128-bit shuffle of floats or doubles; there are built-in SSE intrinsics for this @@ -1496,7 +1396,7 @@ impl X86 { _ => unimplemented!(), }; - quote! { unsafe { #intrinsic::<#mask>(a.into(), b.into()).simd_into(self) } } + quote! { #intrinsic::<#mask>(a.into(), b.into()).simd_into(self) } } (ScalarType::Int | ScalarType::Mask | ScalarType::Unsigned, 128, 32) => { // 128-bit shuffle of 32-bit integers; unlike with floats, there is no single shuffle instruction that @@ -1505,11 +1405,9 @@ impl X86 { let intrinsic = intrinsic_ident(op, "epi64", vec_ty.n_bits()); quote! { - unsafe { - let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into()); - let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into()); - #intrinsic(t1, t2).simd_into(self) - } + let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into()); + let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into()); + #intrinsic(t1, t2).simd_into(self) } } (ScalarType::Int | ScalarType::Mask | ScalarType::Unsigned, 128, 16 | 8) => { @@ -1535,13 +1433,11 @@ impl X86 { let unpack_epi64 = intrinsic_ident(op, "epi64", vec_ty.n_bits()); quote! { - unsafe { - let mask = #mask_reg; + let mask = #mask_reg; - let t1 = #shuffle_epi8(a.into(), mask); - let t2 = #shuffle_epi8(b.into(), mask); - #unpack_epi64(t1, t2).simd_into(self) - } + let t1 = #shuffle_epi8(a.into(), mask); + let t2 = #shuffle_epi8(b.into(), mask); + #unpack_epi64(t1, t2).simd_into(self) } } (_, 256, _) => { @@ -1553,21 +1449,15 @@ impl X86 { }; quote! { - unsafe { - let t1 = #t1; - let t2 = #t2; - #shuffle::<#shuffle_immediate>(t1, t2).simd_into(self) - } + let t1 = #t1; + let t2 = #t2; + #shuffle::<#shuffle_immediate>(t1, t2).simd_into(self) } } _ => unimplemented!(), }; - quote! { - #method_sig { - #expr - } - } + op.simd_trait_kernel_method(self.token(), vec_ty, expr) } pub(crate) fn handle_slide( @@ -1631,7 +1521,7 @@ impl X86 { pub(crate) fn handle_cvt( &self, - method_sig: TokenStream, + op: Op, vec_ty: &VecType, target_scalar: ScalarType, target_scalar_bits: usize, @@ -1673,81 +1563,73 @@ impl X86 { match (target_scalar, precise) { (ScalarType::Int, false) => { quote! { - unsafe { - #convert(a.into()).simd_into(self) - } + #convert(a.into()).simd_into(self) } } (ScalarType::Unsigned, false) => { quote! { - unsafe { - let mut converted = #convert(a.into()); - - // In the common case where everything is in range of an i32, we don't need to do anything else. - let in_range = #cmplt(a.into(), #set1_float(2147483648.0)); - let all_in_range = #movemask(in_range) == #all_ones; + let mut converted = #convert(a.into()); - if !all_in_range { - // Add any excess (beyond the maximum value) - let excess = #sub_float(a.into(), #set1_float(2147483648.0)); - let excess_converted = #convert(#andnot(in_range, excess)); - converted = #add_int(converted, excess_converted); - } + // In the common case where everything is in range of an i32, we don't need to do anything else. + let in_range = #cmplt(a.into(), #set1_float(2147483648.0)); + let all_in_range = #movemask(in_range) == #all_ones; - converted.simd_into(self) + if !all_in_range { + // Add any excess (beyond the maximum value) + let excess = #sub_float(a.into(), #set1_float(2147483648.0)); + let excess_converted = #convert(#andnot(in_range, excess)); + converted = #add_int(converted, excess_converted); } + + converted.simd_into(self) } } (ScalarType::Int, true) => { quote! { - unsafe { - let a = a.into(); - - let mut converted = #convert(a); - - // In the common case where everything is in range, we don't need to do anything else. - let in_range = #cmplt(a, #set1_float(2147483648.0)); - let all_in_range = #movemask(in_range) == #all_ones; - - if !all_in_range { - // If we are above i32::MAX (2147483647), clamp to it. - converted = #blend(#set1_int(i32::MAX), converted, #cast_to_int(in_range)); - // Set NaN to 0. Using `and` seems slightly faster than `blend`. - let is_not_nan = #cast_to_int(#cmpord(a, a)); - converted = #and(converted, is_not_nan); - // We don't need to handle negative overflow because Intel's "invalid result" sentinel - // value is -2147483648, which is what we want anyway. - } - - converted.simd_into(self) + let a = a.into(); + + let mut converted = #convert(a); + + // In the common case where everything is in range, we don't need to do anything else. + let in_range = #cmplt(a, #set1_float(2147483648.0)); + let all_in_range = #movemask(in_range) == #all_ones; + + if !all_in_range { + // If we are above i32::MAX (2147483647), clamp to it. + converted = #blend(#set1_int(i32::MAX), converted, #cast_to_int(in_range)); + // Set NaN to 0. Using `and` seems slightly faster than `blend`. + let is_not_nan = #cast_to_int(#cmpord(a, a)); + converted = #and(converted, is_not_nan); + // We don't need to handle negative overflow because Intel's "invalid result" sentinel + // value is -2147483648, which is what we want anyway. } + + converted.simd_into(self) } } (ScalarType::Unsigned, true) => { quote! { - unsafe { - // Clamp out-of-range values (and NaN) to 0. Intel's `_mm_max_ps` always takes the second - // operand if the first is NaN. - let a = #max(a.into(), #set0()); - let mut converted = #convert(a); - - // In the common case where everything is in range of an i32, we don't need to do anything else. - let in_range = #cmplt(a, #set1_float(2147483648.0)); - let all_in_range = #movemask(in_range) == #all_ones; - - if !all_in_range { - let exceeds_unsigned_range = #cast_to_int(#cmplt(#set1_float(4294967040.0), a)); - // Add any excess (beyond the maximum value) - let excess = #sub_float(a, #set1_float(2147483648.0)); - let excess_converted = #convert(#andnot(in_range, excess)); - - // Clamp to u32::MAX. - converted = #add_int(converted, excess_converted); - converted = #blend(converted, #set1_int(u32::MAX.cast_signed()), exceeds_unsigned_range); - } - - converted.simd_into(self) + // Clamp out-of-range values (and NaN) to 0. Intel's `_mm_max_ps` always takes the second + // operand if the first is NaN. + let a = #max(a.into(), #set0()); + let mut converted = #convert(a); + + // In the common case where everything is in range of an i32, we don't need to do anything else. + let in_range = #cmplt(a, #set1_float(2147483648.0)); + let all_in_range = #movemask(in_range) == #all_ones; + + if !all_in_range { + let exceeds_unsigned_range = #cast_to_int(#cmplt(#set1_float(4294967040.0), a)); + // Add any excess (beyond the maximum value) + let excess = #sub_float(a, #set1_float(2147483648.0)); + let excess_converted = #convert(#andnot(in_range, excess)); + + // Clamp to u32::MAX. + converted = #add_int(converted, excess_converted); + converted = #blend(converted, #set1_int(u32::MAX.cast_signed()), exceeds_unsigned_range); } + + converted.simd_into(self) } } _ => unreachable!(), @@ -1761,9 +1643,7 @@ impl X86 { let target_ty = vec_ty.reinterpret(target_scalar, target_scalar_bits); let intrinsic = simple_intrinsic("cvtepi32", &target_ty); quote! { - unsafe { - #intrinsic(a.into()).simd_into(self) - } + #intrinsic(a.into()).simd_into(self) } } (ScalarType::Unsigned, ScalarType::Float) => { @@ -1791,32 +1671,26 @@ impl X86 { // https://github.com/llvm/llvm-project/blob/6f8e87b9d097c5ef631f24d2eb2f34eb31b54d3b/llvm/lib/Target/X86/X86ISelLowering.cpp // (The file is too big for GitHub to show a preview, so no line numbers.) quote! { - unsafe { - let a = a.into(); - let lo = #blend::<0xAA>(a, #set1_int(0x4B000000)); - let hi = #blend::<0xAA>(#srli::<16>(a), #set1_int(0x53000000)); + let a = a.into(); + let lo = #blend::<0xAA>(a, #set1_int(0x4B000000)); + let hi = #blend::<0xAA>(#srli::<16>(a), #set1_int(0x53000000)); - let fhi = #sub_float(#cast_to_float(hi), #set1_float(f32::from_bits(0x53000080))); - let result = #add_float(#cast_to_float(lo), fhi); + let fhi = #sub_float(#cast_to_float(hi), #set1_float(f32::from_bits(0x53000080))); + let result = #add_float(#cast_to_float(lo), fhi); - result.simd_into(self) - } + result.simd_into(self) } } _ => unimplemented!(), }; - quote! { - #method_sig { - #expr - } - } + op.simd_trait_kernel_method(self.token(), vec_ty, expr) } pub(crate) fn handle_reinterpret( &self, level: &impl Level, - method_sig: TokenStream, + op: Op, vec_ty: &VecType, target_ty: ScalarType, scalar_bits: usize, @@ -1829,11 +1703,11 @@ impl X86 { if coarse_type(vec_ty) == coarse_type(&dst_ty) { let arch_ty = level.arch_ty(vec_ty); - quote! { - #method_sig { - #arch_ty::from(a).simd_into(self) - } - } + op.simd_trait_kernel_method( + self.token(), + vec_ty, + quote! { #arch_ty::from(a).simd_into(self) }, + ) } else { let ident = cast_ident( vec_ty.scalar, @@ -1842,19 +1716,17 @@ impl X86 { scalar_bits, vec_ty.n_bits(), ); - quote! { - #method_sig { - unsafe { - #ident(a.into()).simd_into(self) - } - } - } + op.simd_trait_kernel_method( + self.token(), + vec_ty, + quote! { #ident(a.into()).simd_into(self) }, + ) } } pub(crate) fn handle_mask_reduce( &self, - method_sig: TokenStream, + method_op: Op, vec_ty: &VecType, quantifier: Quantifier, condition: bool, @@ -1908,13 +1780,7 @@ impl X86 { (Quantifier::All, false) => quote! { == 0 }, }; - quote! { - #method_sig { - unsafe { - #movemask as u32 #op - } - } - } + method_op.simd_trait_kernel_method(self.token(), vec_ty, quote! { #movemask as u32 #op }) } pub(crate) fn handle_load_interleaved( diff --git a/fearless_simd_gen/src/ops.rs b/fearless_simd_gen/src/ops.rs index c1129e6be..498eabd14 100644 --- a/fearless_simd_gen/src/ops.rs +++ b/fearless_simd_gen/src/ops.rs @@ -2,7 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 OR MIT use anyhow::{Context, anyhow}; -use proc_macro2::{Ident, Span, TokenStream}; +use proc_macro2::{Group, Ident, Span, TokenStream, TokenTree}; use quote::{format_ident, quote}; use std::fmt::Write; @@ -336,6 +336,167 @@ impl Op { } } + /// Generate a `Simd` trait method that delegates its body to a local `kernel!` function. + /// + /// The generated method keeps the trait signature using `Self`, while the local kernel uses + /// the concrete SIMD token type required by `kernel!`. Const-generic operations are rejected + /// because `kernel!` currently only accepts plain non-generic functions. + pub(crate) fn simd_trait_kernel_method( + &self, + level: Ident, + vec_ty: &VecType, + body: TokenStream, + ) -> TokenStream { + assert!( + !matches!(self.sig, OpSig::Slide { .. }), + "kernel! does not support const-generic methods" + ); + + let method_sig = self.simd_trait_method_sig(vec_ty); + let token = Ident::new("token", Span::call_site()); + let kernel_body = replace_ident(body, "self", &token); + let (arg_decls, call_args, ret) = self.simd_trait_kernel_sig_parts(&level, vec_ty); + + quote! { + #method_sig { + crate::kernel! { + #[inline(always)] + fn kernel(#token: #level #(, #arg_decls)*) -> #ret { + #kernel_body + } + } + + kernel(self #(, #call_args)*) + } + } + } + + /// Build the concrete argument declarations, forwarding arguments, and return type for a generated kernel. + /// + /// This mirrors [`Op::simd_trait_method_sig`], but substitutes the concrete SIMD level token for `Self`. + fn simd_trait_kernel_sig_parts( + &self, + level: &Ident, + vec_ty: &VecType, + ) -> (Vec, Vec, TokenStream) { + let ty = vec_ty.rust(); + let arg_names = self + .sig + .simd_trait_arg_names() + .iter() + .map(|n| Ident::new(n, Span::call_site())) + .collect::>(); + let vec = quote! { #ty<#level> }; + + let (arg_tys, ret) = match &self.sig { + OpSig::Splat => { + let arg_ty = splat_arg_ty(vec_ty); + (vec![arg_ty], vec) + } + OpSig::LoadInterleaved { + block_size, + block_count, + } => { + let arg_ty = load_interleaved_arg_ty(*block_size, *block_count, vec_ty); + (vec![arg_ty], vec) + } + OpSig::StoreInterleaved { + block_size, + block_count, + } => { + let arg_ty = store_interleaved_arg_ty(*block_size, *block_count, vec_ty); + (vec![vec.clone(), arg_ty], quote! { () }) + } + OpSig::Compare => { + let result = vec_ty.mask_ty().rust(); + (vec![vec.clone(), vec.clone()], quote! { #result<#level> }) + } + OpSig::Split { half_ty } => { + let result = half_ty.rust(); + (vec![vec], quote! { (#result<#level>, #result<#level>) }) + } + OpSig::Combine { combined_ty } => { + let result = combined_ty.rust(); + (vec![vec.clone(), vec], quote! { #result<#level> }) + } + OpSig::Unary => (vec![vec.clone()], vec), + OpSig::Binary | OpSig::Zip { .. } | OpSig::Unzip { .. } => { + (vec![vec.clone(), vec.clone()], vec) + } + OpSig::Interleave | OpSig::Deinterleave => { + (vec![vec.clone(), vec.clone()], quote! { (#vec, #vec) }) + } + OpSig::Slide { .. } => unreachable!("checked by caller"), + OpSig::Cvt { + target_ty, + scalar_bits, + .. + } + | OpSig::Reinterpret { + target_ty, + scalar_bits, + } => { + let result = vec_ty.reinterpret(*target_ty, *scalar_bits).rust(); + (vec![vec], quote! { #result<#level> }) + } + OpSig::WidenNarrow { target_ty } => { + let result = target_ty.rust(); + (vec![vec], quote! { #result<#level> }) + } + OpSig::MaskReduce { .. } => (vec![vec], quote! { bool }), + OpSig::MaskFromBitmask => (vec![quote! { u64 }], vec), + OpSig::MaskToBitmask => (vec![vec], quote! { u64 }), + OpSig::Shift => (vec![vec.clone(), quote! { u32 }], vec), + OpSig::Ternary => (vec![vec.clone(), vec.clone(), vec.clone()], vec), + OpSig::Select => { + let mask_ty = vec_ty.mask_ty().rust(); + ( + vec![quote! { #mask_ty<#level> }, vec.clone(), vec.clone()], + vec, + ) + } + OpSig::FromArray { kind } => { + let ref_tok = kind.token(); + let rust_scalar = vec_ty.scalar.rust(vec_ty.scalar_bits); + let len = vec_ty.len; + let array_ty = quote! { [#rust_scalar; #len] }; + (vec![quote! { #ref_tok #array_ty }], vec) + } + OpSig::AsArray { kind } => { + let ref_tok = kind.token(); + let rust_scalar = vec_ty.scalar.rust(vec_ty.scalar_bits); + let len = vec_ty.len; + let array_ty = quote! { [#rust_scalar; #len] }; + ( + vec![quote! { #ref_tok #vec }], + quote! { #ref_tok #array_ty }, + ) + } + OpSig::StoreArray => { + let rust_scalar = vec_ty.scalar.rust(vec_ty.scalar_bits); + let len = vec_ty.len; + let array_ty = quote! { [#rust_scalar; #len] }; + (vec![vec, quote! { &mut #array_ty }], quote! { () }) + } + OpSig::FromBytes => { + let bytes_ty = vec_ty.reinterpret(ScalarType::Unsigned, 8).rust(); + (vec![quote! { #bytes_ty<#level> }], vec) + } + OpSig::ToBytes => { + let bytes_ty = vec_ty.reinterpret(ScalarType::Unsigned, 8).rust(); + (vec![vec], quote! { #bytes_ty<#level> }) + } + }; + + let arg_decls = arg_names + .iter() + .zip(arg_tys) + .map(|(name, ty)| quote! { #name: #ty }) + .collect(); + + (arg_decls, arg_names, ret) + } + pub(crate) fn vec_trait_method_sig(&self) -> Option { let arg_names = self .sig @@ -457,6 +618,26 @@ impl Op { } } +/// Replace all identifiers named `from` in a token stream with `to`, recursing into token groups. +/// +/// This is used to turn generated method bodies that mention `self` into kernel bodies that mention +/// the concrete token parameter instead. +fn replace_ident(stream: TokenStream, from: &str, to: &Ident) -> TokenStream { + stream + .into_iter() + .map(|tree| match tree { + TokenTree::Group(group) => { + let mut new_group = + Group::new(group.delimiter(), replace_ident(group.stream(), from, to)); + new_group.set_span(group.span()); + TokenTree::Group(new_group) + } + TokenTree::Ident(ident) if ident.to_string() == from => TokenTree::Ident(to.clone()), + tree => tree, + }) + .collect() +} + fn splat_arg_ty(vec_ty: &VecType) -> TokenStream { if vec_ty.scalar == ScalarType::Mask { quote! { bool }