diff --git a/fearless_simd/src/generated/avx2.rs b/fearless_simd/src/generated/avx2.rs index c0bb5300..216c6562 100644 --- a/fearless_simd/src/generated/avx2.rs +++ b/fearless_simd/src/generated/avx2.rs @@ -98,7 +98,13 @@ impl Simd for Avx2 { } #[inline(always)] fn splat_f32x4(self, val: f32) -> f32x4 { - unsafe { _mm_set1_ps(val).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, val: f32) -> f32x4 { + _mm_set1_ps(val).simd_into(token) + } + ); + kernel(self, val) } #[inline(always)] fn load_array_f32x4(self, val: [f32; 4usize]) -> f32x4 { @@ -171,78 +177,185 @@ impl Simd for Avx2 { } #[inline(always)] fn abs_f32x4(self, a: f32x4) -> f32x4 { - unsafe { _mm_andnot_ps(_mm_set1_ps(-0.0), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x4) -> f32x4 { + _mm_andnot_ps(_mm_set1_ps(-0.0), a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn neg_f32x4(self, a: f32x4) -> f32x4 { - unsafe { _mm_xor_ps(a.into(), _mm_set1_ps(-0.0)).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x4) -> f32x4 { + _mm_xor_ps(a.into(), _mm_set1_ps(-0.0)).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn sqrt_f32x4(self, a: f32x4) -> f32x4 { - unsafe { _mm_sqrt_ps(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x4) -> f32x4 { + _mm_sqrt_ps(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn approximate_recip_f32x4(self, a: f32x4) -> f32x4 { - unsafe { _mm_rcp_ps(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x4) -> f32x4 { + _mm_rcp_ps(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn add_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - unsafe { _mm_add_ps(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x4, b: f32x4) -> f32x4 { + _mm_add_ps(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn sub_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - unsafe { _mm_sub_ps(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x4, b: f32x4) -> f32x4 { + _mm_sub_ps(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn mul_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - unsafe { _mm_mul_ps(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x4, b: f32x4) -> f32x4 { + _mm_mul_ps(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn div_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - unsafe { _mm_div_ps(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x4, b: f32x4) -> f32x4 { + _mm_div_ps(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn copysign_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - unsafe { - let mask = _mm_set1_ps(-0.0); - _mm_or_ps(_mm_and_ps(mask, b.into()), _mm_andnot_ps(mask, a.into())).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x4, b: f32x4) -> f32x4 { + let mask = _mm_set1_ps(-0.0); + _mm_or_ps(_mm_and_ps(mask, b.into()), _mm_andnot_ps(mask, a.into())) + .simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_eq_f32x4(self, a: f32x4, b: f32x4) -> mask32x4 { - unsafe { _mm_castps_si128(_mm_cmpeq_ps(a.into(), b.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x4, b: f32x4) -> mask32x4 { + _mm_castps_si128(_mm_cmpeq_ps(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_lt_f32x4(self, a: f32x4, b: f32x4) -> mask32x4 { - unsafe { _mm_castps_si128(_mm_cmplt_ps(a.into(), b.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x4, b: f32x4) -> mask32x4 { + _mm_castps_si128(_mm_cmplt_ps(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_le_f32x4(self, a: f32x4, b: f32x4) -> mask32x4 { - unsafe { _mm_castps_si128(_mm_cmple_ps(a.into(), b.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x4, b: f32x4) -> mask32x4 { + _mm_castps_si128(_mm_cmple_ps(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_ge_f32x4(self, a: f32x4, b: f32x4) -> mask32x4 { - unsafe { _mm_castps_si128(_mm_cmpge_ps(a.into(), b.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x4, b: f32x4) -> mask32x4 { + _mm_castps_si128(_mm_cmpge_ps(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_gt_f32x4(self, a: f32x4, b: f32x4) -> mask32x4 { - unsafe { _mm_castps_si128(_mm_cmpgt_ps(a.into(), b.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x4, b: f32x4) -> mask32x4 { + _mm_castps_si128(_mm_cmpgt_ps(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn zip_low_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - unsafe { _mm_unpacklo_ps(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x4, b: f32x4) -> f32x4 { + _mm_unpacklo_ps(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn zip_high_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - unsafe { _mm_unpackhi_ps(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x4, b: f32x4) -> f32x4 { + _mm_unpackhi_ps(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_low_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - unsafe { _mm_shuffle_ps::<0b10_00_10_00>(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x4, b: f32x4) -> f32x4 { + _mm_shuffle_ps::<0b10_00_10_00>(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_high_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - unsafe { _mm_shuffle_ps::<0b11_01_11_01>(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x4, b: f32x4) -> f32x4 { + _mm_shuffle_ps::<0b11_01_11_01>(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn interleave_f32x4(self, a: f32x4, b: f32x4) -> (f32x4, f32x4) { @@ -254,54 +367,100 @@ impl Simd for Avx2 { } #[inline(always)] fn max_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - unsafe { _mm_max_ps(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x4, b: f32x4) -> f32x4 { + _mm_max_ps(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn min_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - unsafe { _mm_min_ps(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x4, b: f32x4) -> f32x4 { + _mm_min_ps(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn max_precise_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - unsafe { - let intermediate = _mm_max_ps(a.into(), b.into()); - let b_is_nan = _mm_cmpunord_ps(b.into(), b.into()); - _mm_blendv_ps(intermediate, a.into(), b_is_nan).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x4, b: f32x4) -> f32x4 { + let intermediate = _mm_max_ps(a.into(), b.into()); + let b_is_nan = _mm_cmpunord_ps(b.into(), b.into()); + _mm_blendv_ps(intermediate, a.into(), b_is_nan).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn min_precise_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - unsafe { - let intermediate = _mm_min_ps(a.into(), b.into()); - let b_is_nan = _mm_cmpunord_ps(b.into(), b.into()); - _mm_blendv_ps(intermediate, a.into(), b_is_nan).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x4, b: f32x4) -> f32x4 { + let intermediate = _mm_min_ps(a.into(), b.into()); + let b_is_nan = _mm_cmpunord_ps(b.into(), b.into()); + _mm_blendv_ps(intermediate, a.into(), b_is_nan).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn mul_add_f32x4(self, a: f32x4, b: f32x4, c: f32x4) -> f32x4 { - unsafe { _mm_fmadd_ps(a.into(), b.into(), c.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x4, b: f32x4, c: f32x4) -> f32x4 { + _mm_fmadd_ps(a.into(), b.into(), c.into()).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] fn mul_sub_f32x4(self, a: f32x4, b: f32x4, c: f32x4) -> f32x4 { - unsafe { _mm_fmsub_ps(a.into(), b.into(), c.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x4, b: f32x4, c: f32x4) -> f32x4 { + _mm_fmsub_ps(a.into(), b.into(), c.into()).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] fn floor_f32x4(self, a: f32x4) -> f32x4 { - unsafe { - _mm_round_ps::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x4) -> f32x4 { + _mm_round_ps::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into()) + .simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn ceil_f32x4(self, a: f32x4) -> f32x4 { - unsafe { - _mm_round_ps::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x4) -> f32x4 { + _mm_round_ps::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a.into()) + .simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn round_ties_even_f32x4(self, a: f32x4) -> f32x4 { - unsafe { - _mm_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a.into()) - .simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x4) -> f32x4 { + _mm_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a.into()) + .simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn fract_f32x4(self, a: f32x4) -> f32x4 { @@ -309,96 +468,166 @@ impl Simd for Avx2 { } #[inline(always)] fn trunc_f32x4(self, a: f32x4) -> f32x4 { - unsafe { - _mm_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x4) -> f32x4 { + _mm_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into()) + .simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn select_f32x4(self, a: mask32x4, b: f32x4, c: f32x4) -> f32x4 { - unsafe { _mm_blendv_ps(c.into(), b.into(), _mm_castsi128_ps(a.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx2, + a: mask32x4, + b: f32x4, + c: f32x4, + ) -> f32x4 { + _mm_blendv_ps(c.into(), b.into(), _mm_castsi128_ps(a.into())).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] fn combine_f32x4(self, a: f32x4, b: f32x4) -> f32x8 { - unsafe { _mm256_setr_m128(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x4, b: f32x4) -> f32x8 { + _mm256_setr_m128(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn reinterpret_f64_f32x4(self, a: f32x4) -> f64x2 { - unsafe { _mm_castps_pd(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x4) -> f64x2 { + _mm_castps_pd(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn reinterpret_i32_f32x4(self, a: f32x4) -> i32x4 { - unsafe { _mm_castps_si128(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x4) -> i32x4 { + _mm_castps_si128(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn reinterpret_u8_f32x4(self, a: f32x4) -> u8x16 { - unsafe { _mm_castps_si128(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x4) -> u8x16 { + _mm_castps_si128(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn reinterpret_u32_f32x4(self, a: f32x4) -> u32x4 { - unsafe { _mm_castps_si128(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x4) -> u32x4 { + _mm_castps_si128(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn cvt_u32_f32x4(self, a: f32x4) -> u32x4 { - unsafe { - let mut converted = _mm_cvttps_epi32(a.into()); - let in_range = _mm_cmplt_ps(a.into(), _mm_set1_ps(2147483648.0)); - let all_in_range = _mm_movemask_ps(in_range) == 0b1111; - if !all_in_range { - let excess = _mm_sub_ps(a.into(), _mm_set1_ps(2147483648.0)); - let excess_converted = _mm_cvttps_epi32(_mm_andnot_ps(in_range, excess)); - converted = _mm_add_epi32(converted, excess_converted); + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x4) -> u32x4 { + let mut converted = _mm_cvttps_epi32(a.into()); + let in_range = _mm_cmplt_ps(a.into(), _mm_set1_ps(2147483648.0)); + let all_in_range = _mm_movemask_ps(in_range) == 0b1111; + if !all_in_range { + let excess = _mm_sub_ps(a.into(), _mm_set1_ps(2147483648.0)); + let excess_converted = _mm_cvttps_epi32(_mm_andnot_ps(in_range, excess)); + converted = _mm_add_epi32(converted, excess_converted); + } + converted.simd_into(token) } - converted.simd_into(self) - } + ); + kernel(self, a) } #[inline(always)] fn cvt_u32_precise_f32x4(self, a: f32x4) -> u32x4 { - unsafe { - let a = _mm_max_ps(a.into(), _mm_setzero_ps()); - let mut converted = _mm_cvttps_epi32(a); - let in_range = _mm_cmplt_ps(a, _mm_set1_ps(2147483648.0)); - let all_in_range = _mm_movemask_ps(in_range) == 0b1111; - if !all_in_range { - let exceeds_unsigned_range = - _mm_castps_si128(_mm_cmplt_ps(_mm_set1_ps(4294967040.0), a)); - let excess = _mm_sub_ps(a, _mm_set1_ps(2147483648.0)); - let excess_converted = _mm_cvttps_epi32(_mm_andnot_ps(in_range, excess)); - converted = _mm_add_epi32(converted, excess_converted); - converted = _mm_blendv_epi8( - converted, - _mm_set1_epi32(u32::MAX.cast_signed()), - exceeds_unsigned_range, - ); + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x4) -> u32x4 { + let a = _mm_max_ps(a.into(), _mm_setzero_ps()); + let mut converted = _mm_cvttps_epi32(a); + let in_range = _mm_cmplt_ps(a, _mm_set1_ps(2147483648.0)); + let all_in_range = _mm_movemask_ps(in_range) == 0b1111; + if !all_in_range { + let exceeds_unsigned_range = + _mm_castps_si128(_mm_cmplt_ps(_mm_set1_ps(4294967040.0), a)); + let excess = _mm_sub_ps(a, _mm_set1_ps(2147483648.0)); + let excess_converted = _mm_cvttps_epi32(_mm_andnot_ps(in_range, excess)); + converted = _mm_add_epi32(converted, excess_converted); + converted = _mm_blendv_epi8( + converted, + _mm_set1_epi32(u32::MAX.cast_signed()), + exceeds_unsigned_range, + ); + } + converted.simd_into(token) } - converted.simd_into(self) - } + ); + kernel(self, a) } #[inline(always)] fn cvt_i32_f32x4(self, a: f32x4) -> i32x4 { - unsafe { _mm_cvttps_epi32(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x4) -> i32x4 { + _mm_cvttps_epi32(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn cvt_i32_precise_f32x4(self, a: f32x4) -> i32x4 { - unsafe { - let a = a.into(); - let mut converted = _mm_cvttps_epi32(a); - let in_range = _mm_cmplt_ps(a, _mm_set1_ps(2147483648.0)); - let all_in_range = _mm_movemask_ps(in_range) == 0b1111; - if !all_in_range { - converted = _mm_blendv_epi8( - _mm_set1_epi32(i32::MAX), - converted, - _mm_castps_si128(in_range), - ); - let is_not_nan = _mm_castps_si128(_mm_cmpord_ps(a, a)); - converted = _mm_and_si128(converted, is_not_nan); + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x4) -> i32x4 { + let a = a.into(); + let mut converted = _mm_cvttps_epi32(a); + let in_range = _mm_cmplt_ps(a, _mm_set1_ps(2147483648.0)); + let all_in_range = _mm_movemask_ps(in_range) == 0b1111; + if !all_in_range { + converted = _mm_blendv_epi8( + _mm_set1_epi32(i32::MAX), + converted, + _mm_castps_si128(in_range), + ); + let is_not_nan = _mm_castps_si128(_mm_cmpord_ps(a, a)); + converted = _mm_and_si128(converted, is_not_nan); + } + converted.simd_into(token) } - converted.simd_into(self) - } + ); + kernel(self, a) } #[inline(always)] fn splat_i8x16(self, val: i8) -> i8x16 { - unsafe { _mm_set1_epi8(val).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, val: i8) -> i8x16 { + _mm_set1_epi8(val).simd_into(token) + } + ); + kernel(self, val) } #[inline(always)] fn load_array_i8x16(self, val: [i8; 16usize]) -> i8x16 { @@ -471,36 +700,70 @@ impl Simd for Avx2 { } #[inline(always)] fn add_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - unsafe { _mm_add_epi8(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i8x16, b: i8x16) -> i8x16 { + _mm_add_epi8(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn sub_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - unsafe { _mm_sub_epi8(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i8x16, b: i8x16) -> i8x16 { + _mm_sub_epi8(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn mul_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - unsafe { - let dst_even = _mm_mullo_epi16(a.into(), b.into()); - let dst_odd = - _mm_mullo_epi16(_mm_srli_epi16::<8>(a.into()), _mm_srli_epi16::<8>(b.into())); - _mm_or_si128( - _mm_slli_epi16(dst_odd, 8), - _mm_and_si128(dst_even, _mm_set1_epi16(0xFF)), - ) - .simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i8x16, b: i8x16) -> i8x16 { + let dst_even = _mm_mullo_epi16(a.into(), b.into()); + let dst_odd = + _mm_mullo_epi16(_mm_srli_epi16::<8>(a.into()), _mm_srli_epi16::<8>(b.into())); + _mm_or_si128( + _mm_slli_epi16(dst_odd, 8), + _mm_and_si128(dst_even, _mm_set1_epi16(0xFF)), + ) + .simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn and_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i8x16, b: i8x16) -> i8x16 { + _mm_and_si128(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn or_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i8x16, b: i8x16) -> i8x16 { + _mm_or_si128(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn xor_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i8x16, b: i8x16) -> i8x16 { + _mm_xor_si128(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn not_i8x16(self, a: i8x16) -> i8x16 { @@ -508,15 +771,19 @@ impl Simd for Avx2 { } #[inline(always)] fn shl_i8x16(self, a: i8x16, shift: u32) -> i8x16 { - unsafe { - let val = a.into(); - let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); - let lo_16 = _mm_unpacklo_epi8(val, _mm_cmpgt_epi8(_mm_setzero_si128(), val)); - let hi_16 = _mm_unpackhi_epi8(val, _mm_cmpgt_epi8(_mm_setzero_si128(), val)); - let lo_shifted = _mm_sll_epi16(lo_16, shift_count); - let hi_shifted = _mm_sll_epi16(hi_16, shift_count); - _mm_packs_epi16(lo_shifted, hi_shifted).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i8x16, shift: u32) -> i8x16 { + let val = a.into(); + let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); + let lo_16 = _mm_unpacklo_epi8(val, _mm_cmpgt_epi8(_mm_setzero_si128(), val)); + let hi_16 = _mm_unpackhi_epi8(val, _mm_cmpgt_epi8(_mm_setzero_si128(), val)); + let lo_shifted = _mm_sll_epi16(lo_16, shift_count); + let hi_shifted = _mm_sll_epi16(hi_16, shift_count); + _mm_packs_epi16(lo_shifted, hi_shifted).simd_into(token) + } + ); + kernel(self, a, shift) } #[inline(always)] fn shlv_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { @@ -524,15 +791,19 @@ impl Simd for Avx2 { } #[inline(always)] fn shr_i8x16(self, a: i8x16, shift: u32) -> i8x16 { - unsafe { - let val = a.into(); - let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); - let lo_16 = _mm_unpacklo_epi8(val, _mm_cmpgt_epi8(_mm_setzero_si128(), val)); - let hi_16 = _mm_unpackhi_epi8(val, _mm_cmpgt_epi8(_mm_setzero_si128(), val)); - let lo_shifted = _mm_sra_epi16(lo_16, shift_count); - let hi_shifted = _mm_sra_epi16(hi_16, shift_count); - _mm_packs_epi16(lo_shifted, hi_shifted).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i8x16, shift: u32) -> i8x16 { + let val = a.into(); + let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); + let lo_16 = _mm_unpacklo_epi8(val, _mm_cmpgt_epi8(_mm_setzero_si128(), val)); + let hi_16 = _mm_unpackhi_epi8(val, _mm_cmpgt_epi8(_mm_setzero_si128(), val)); + let lo_shifted = _mm_sra_epi16(lo_16, shift_count); + let hi_shifted = _mm_sra_epi16(hi_16, shift_count); + _mm_packs_epi16(lo_shifted, hi_shifted).simd_into(token) + } + ); + kernel(self, a, shift) } #[inline(always)] fn shrv_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { @@ -540,49 +811,99 @@ impl Simd for Avx2 { } #[inline(always)] fn simd_eq_i8x16(self, a: i8x16, b: i8x16) -> mask8x16 { - unsafe { _mm_cmpeq_epi8(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i8x16, b: i8x16) -> mask8x16 { + _mm_cmpeq_epi8(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_lt_i8x16(self, a: i8x16, b: i8x16) -> mask8x16 { - unsafe { _mm_cmpgt_epi8(b.into(), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i8x16, b: i8x16) -> mask8x16 { + _mm_cmpgt_epi8(b.into(), a.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_le_i8x16(self, a: i8x16, b: i8x16) -> mask8x16 { - unsafe { _mm_cmpeq_epi8(_mm_min_epi8(a.into(), b.into()), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i8x16, b: i8x16) -> mask8x16 { + _mm_cmpeq_epi8(_mm_min_epi8(a.into(), b.into()), a.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_ge_i8x16(self, a: i8x16, b: i8x16) -> mask8x16 { - unsafe { _mm_cmpeq_epi8(_mm_max_epi8(a.into(), b.into()), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i8x16, b: i8x16) -> mask8x16 { + _mm_cmpeq_epi8(_mm_max_epi8(a.into(), b.into()), a.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_gt_i8x16(self, a: i8x16, b: i8x16) -> mask8x16 { - unsafe { _mm_cmpgt_epi8(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i8x16, b: i8x16) -> mask8x16 { + _mm_cmpgt_epi8(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn zip_low_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - unsafe { _mm_unpacklo_epi8(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i8x16, b: i8x16) -> i8x16 { + _mm_unpacklo_epi8(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn zip_high_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - unsafe { _mm_unpackhi_epi8(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i8x16, b: i8x16) -> i8x16 { + _mm_unpackhi_epi8(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_low_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - unsafe { - let mask = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); - let t1 = _mm_shuffle_epi8(a.into(), mask); - let t2 = _mm_shuffle_epi8(b.into(), mask); - _mm_unpacklo_epi64(t1, t2).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i8x16, b: i8x16) -> i8x16 { + let mask = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); + let t1 = _mm_shuffle_epi8(a.into(), mask); + let t2 = _mm_shuffle_epi8(b.into(), mask); + _mm_unpacklo_epi64(t1, t2).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_high_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - unsafe { - let mask = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); - let t1 = _mm_shuffle_epi8(a.into(), mask); - let t2 = _mm_shuffle_epi8(b.into(), mask); - _mm_unpackhi_epi64(t1, t2).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i8x16, b: i8x16) -> i8x16 { + let mask = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); + let t1 = _mm_shuffle_epi8(a.into(), mask); + let t2 = _mm_shuffle_epi8(b.into(), mask); + _mm_unpackhi_epi64(t1, t2).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn interleave_i8x16(self, a: i8x16, b: i8x16) -> (i8x16, i8x16) { @@ -594,35 +915,88 @@ impl Simd for Avx2 { } #[inline(always)] fn select_i8x16(self, a: mask8x16, b: i8x16, c: i8x16) -> i8x16 { - unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx2, + a: mask8x16, + b: i8x16, + c: i8x16, + ) -> i8x16 { + _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] fn min_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - unsafe { _mm_min_epi8(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i8x16, b: i8x16) -> i8x16 { + _mm_min_epi8(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn max_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - unsafe { _mm_max_epi8(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i8x16, b: i8x16) -> i8x16 { + _mm_max_epi8(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn combine_i8x16(self, a: i8x16, b: i8x16) -> i8x32 { - unsafe { _mm256_setr_m128i(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i8x16, b: i8x16) -> i8x32 { + _mm256_setr_m128i(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn neg_i8x16(self, a: i8x16) -> i8x16 { - unsafe { _mm_sub_epi8(_mm_setzero_si128(), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i8x16) -> i8x16 { + _mm_sub_epi8(_mm_setzero_si128(), a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn reinterpret_u8_i8x16(self, a: i8x16) -> u8x16 { - __m128i::from(a).simd_into(self) + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i8x16) -> u8x16 { + __m128i::from(a).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn reinterpret_u32_i8x16(self, a: i8x16) -> u32x4 { - __m128i::from(a).simd_into(self) + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i8x16) -> u32x4 { + __m128i::from(a).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn splat_u8x16(self, val: u8) -> u8x16 { - unsafe { _mm_set1_epi8(val.cast_signed()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, val: u8) -> u8x16 { + _mm_set1_epi8(val.cast_signed()).simd_into(token) + } + ); + kernel(self, val) } #[inline(always)] fn load_array_u8x16(self, val: [u8; 16usize]) -> u8x16 { @@ -695,36 +1069,70 @@ impl Simd for Avx2 { } #[inline(always)] fn add_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - unsafe { _mm_add_epi8(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u8x16, b: u8x16) -> u8x16 { + _mm_add_epi8(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn sub_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - unsafe { _mm_sub_epi8(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u8x16, b: u8x16) -> u8x16 { + _mm_sub_epi8(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn mul_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - unsafe { - let dst_even = _mm_mullo_epi16(a.into(), b.into()); - let dst_odd = - _mm_mullo_epi16(_mm_srli_epi16::<8>(a.into()), _mm_srli_epi16::<8>(b.into())); - _mm_or_si128( - _mm_slli_epi16(dst_odd, 8), - _mm_and_si128(dst_even, _mm_set1_epi16(0xFF)), - ) - .simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u8x16, b: u8x16) -> u8x16 { + let dst_even = _mm_mullo_epi16(a.into(), b.into()); + let dst_odd = + _mm_mullo_epi16(_mm_srli_epi16::<8>(a.into()), _mm_srli_epi16::<8>(b.into())); + _mm_or_si128( + _mm_slli_epi16(dst_odd, 8), + _mm_and_si128(dst_even, _mm_set1_epi16(0xFF)), + ) + .simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn and_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u8x16, b: u8x16) -> u8x16 { + _mm_and_si128(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn or_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u8x16, b: u8x16) -> u8x16 { + _mm_or_si128(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn xor_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u8x16, b: u8x16) -> u8x16 { + _mm_xor_si128(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn not_u8x16(self, a: u8x16) -> u8x16 { @@ -732,15 +1140,19 @@ impl Simd for Avx2 { } #[inline(always)] fn shl_u8x16(self, a: u8x16, shift: u32) -> u8x16 { - unsafe { - let val = a.into(); - let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); - let lo_16 = _mm_unpacklo_epi8(val, _mm_setzero_si128()); - let hi_16 = _mm_unpackhi_epi8(val, _mm_setzero_si128()); - let lo_shifted = _mm_sll_epi16(lo_16, shift_count); - let hi_shifted = _mm_sll_epi16(hi_16, shift_count); - _mm_packus_epi16(lo_shifted, hi_shifted).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u8x16, shift: u32) -> u8x16 { + let val = a.into(); + let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); + let lo_16 = _mm_unpacklo_epi8(val, _mm_setzero_si128()); + let hi_16 = _mm_unpackhi_epi8(val, _mm_setzero_si128()); + let lo_shifted = _mm_sll_epi16(lo_16, shift_count); + let hi_shifted = _mm_sll_epi16(hi_16, shift_count); + _mm_packus_epi16(lo_shifted, hi_shifted).simd_into(token) + } + ); + kernel(self, a, shift) } #[inline(always)] fn shlv_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { @@ -748,15 +1160,19 @@ impl Simd for Avx2 { } #[inline(always)] fn shr_u8x16(self, a: u8x16, shift: u32) -> u8x16 { - unsafe { - let val = a.into(); - let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); - let lo_16 = _mm_unpacklo_epi8(val, _mm_setzero_si128()); - let hi_16 = _mm_unpackhi_epi8(val, _mm_setzero_si128()); - let lo_shifted = _mm_srl_epi16(lo_16, shift_count); - let hi_shifted = _mm_srl_epi16(hi_16, shift_count); - _mm_packus_epi16(lo_shifted, hi_shifted).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u8x16, shift: u32) -> u8x16 { + let val = a.into(); + let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); + let lo_16 = _mm_unpacklo_epi8(val, _mm_setzero_si128()); + let hi_16 = _mm_unpackhi_epi8(val, _mm_setzero_si128()); + let lo_shifted = _mm_srl_epi16(lo_16, shift_count); + let hi_shifted = _mm_srl_epi16(hi_16, shift_count); + _mm_packus_epi16(lo_shifted, hi_shifted).simd_into(token) + } + ); + kernel(self, a, shift) } #[inline(always)] fn shrv_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { @@ -764,59 +1180,105 @@ impl Simd for Avx2 { } #[inline(always)] fn simd_eq_u8x16(self, a: u8x16, b: u8x16) -> mask8x16 { - unsafe { _mm_cmpeq_epi8(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u8x16, b: u8x16) -> mask8x16 { + _mm_cmpeq_epi8(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_lt_u8x16(self, a: u8x16, b: u8x16) -> mask8x16 { - unsafe { - let sign_bit = _mm_set1_epi8(0x80u8.cast_signed()); - let a_signed = _mm_xor_si128(a.into(), sign_bit); - let b_signed = _mm_xor_si128(b.into(), sign_bit); - _mm_cmpgt_epi8(b_signed, a_signed).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u8x16, b: u8x16) -> mask8x16 { + let sign_bit = _mm_set1_epi8(0x80u8.cast_signed()); + let a_signed = _mm_xor_si128(a.into(), sign_bit); + let b_signed = _mm_xor_si128(b.into(), sign_bit); + _mm_cmpgt_epi8(b_signed, a_signed).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_le_u8x16(self, a: u8x16, b: u8x16) -> mask8x16 { - unsafe { _mm_cmpeq_epi8(_mm_min_epu8(a.into(), b.into()), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u8x16, b: u8x16) -> mask8x16 { + _mm_cmpeq_epi8(_mm_min_epu8(a.into(), b.into()), a.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_ge_u8x16(self, a: u8x16, b: u8x16) -> mask8x16 { - unsafe { _mm_cmpeq_epi8(_mm_max_epu8(a.into(), b.into()), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u8x16, b: u8x16) -> mask8x16 { + _mm_cmpeq_epi8(_mm_max_epu8(a.into(), b.into()), a.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_gt_u8x16(self, a: u8x16, b: u8x16) -> mask8x16 { - unsafe { - let sign_bit = _mm_set1_epi8(0x80u8.cast_signed()); - let a_signed = _mm_xor_si128(a.into(), sign_bit); - let b_signed = _mm_xor_si128(b.into(), sign_bit); - _mm_cmpgt_epi8(a_signed, b_signed).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u8x16, b: u8x16) -> mask8x16 { + let sign_bit = _mm_set1_epi8(0x80u8.cast_signed()); + let a_signed = _mm_xor_si128(a.into(), sign_bit); + let b_signed = _mm_xor_si128(b.into(), sign_bit); + _mm_cmpgt_epi8(a_signed, b_signed).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn zip_low_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - unsafe { _mm_unpacklo_epi8(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u8x16, b: u8x16) -> u8x16 { + _mm_unpacklo_epi8(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn zip_high_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - unsafe { _mm_unpackhi_epi8(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u8x16, b: u8x16) -> u8x16 { + _mm_unpackhi_epi8(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_low_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - unsafe { - let mask = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); - let t1 = _mm_shuffle_epi8(a.into(), mask); - let t2 = _mm_shuffle_epi8(b.into(), mask); - _mm_unpacklo_epi64(t1, t2).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u8x16, b: u8x16) -> u8x16 { + let mask = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); + let t1 = _mm_shuffle_epi8(a.into(), mask); + let t2 = _mm_shuffle_epi8(b.into(), mask); + _mm_unpacklo_epi64(t1, t2).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_high_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - unsafe { - let mask = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); - let t1 = _mm_shuffle_epi8(a.into(), mask); - let t2 = _mm_shuffle_epi8(b.into(), mask); - _mm_unpackhi_epi64(t1, t2).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u8x16, b: u8x16) -> u8x16 { + let mask = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); + let t1 = _mm_shuffle_epi8(a.into(), mask); + let t2 = _mm_shuffle_epi8(b.into(), mask); + _mm_unpackhi_epi64(t1, t2).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn interleave_u8x16(self, a: u8x16, b: u8x16) -> (u8x16, u8x16) { @@ -828,34 +1290,79 @@ impl Simd for Avx2 { } #[inline(always)] fn select_u8x16(self, a: mask8x16, b: u8x16, c: u8x16) -> u8x16 { - unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx2, + a: mask8x16, + b: u8x16, + c: u8x16, + ) -> u8x16 { + _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] fn min_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - unsafe { _mm_min_epu8(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u8x16, b: u8x16) -> u8x16 { + _mm_min_epu8(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn max_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - unsafe { _mm_max_epu8(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u8x16, b: u8x16) -> u8x16 { + _mm_max_epu8(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn combine_u8x16(self, a: u8x16, b: u8x16) -> u8x32 { - unsafe { _mm256_setr_m128i(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u8x16, b: u8x16) -> u8x32 { + _mm256_setr_m128i(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn widen_u8x16(self, a: u8x16) -> u16x16 { - unsafe { _mm256_cvtepu8_epi16(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u8x16) -> u16x16 { + _mm256_cvtepu8_epi16(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn reinterpret_u32_u8x16(self, a: u8x16) -> u32x4 { - __m128i::from(a).simd_into(self) + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u8x16) -> u32x4 { + __m128i::from(a).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn splat_mask8x16(self, val: bool) -> mask8x16 { - unsafe { - let val: i8 = if val { !0 } else { 0 }; - _mm_set1_epi8(val).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, val: bool) -> mask8x16 { + let val: i8 = if val { !0 } else { 0 }; + _mm_set1_epi8(val).simd_into(token) + } + ); + kernel(self, val) } #[inline(always)] fn load_array_mask8x16(self, val: [i8; 16usize]) -> mask8x16 { @@ -870,35 +1377,63 @@ impl Simd for Avx2 { } #[inline(always)] fn from_bitmask_mask8x16(self, bits: u64) -> mask8x16 { - unsafe { - { - let bit_bytes = _mm_cvtsi32_si128(bits as i32); - let bit_bytes = _mm_shuffle_epi8( - bit_bytes, - _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1), - ); - let bit_mask = - _mm_setr_epi8(1, 2, 4, 8, 16, 32, 64, -128, 1, 2, 4, 8, 16, 32, 64, -128); - _mm_cmpeq_epi8(_mm_and_si128(bit_bytes, bit_mask), bit_mask) + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, bits: u64) -> mask8x16 { + { + let bit_bytes = _mm_cvtsi32_si128(bits as i32); + let bit_bytes = _mm_shuffle_epi8( + bit_bytes, + _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1), + ); + let bit_mask = + _mm_setr_epi8(1, 2, 4, 8, 16, 32, 64, -128, 1, 2, 4, 8, 16, 32, 64, -128); + _mm_cmpeq_epi8(_mm_and_si128(bit_bytes, bit_mask), bit_mask) + } + .simd_into(token) } - .simd_into(self) - } + ); + kernel(self, bits) } #[inline(always)] fn to_bitmask_mask8x16(self, a: mask8x16) -> u64 { - unsafe { _mm_movemask_epi8(a.into()) as u32 as u64 } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask8x16) -> u64 { + _mm_movemask_epi8(a.into()) as u32 as u64 + } + ); + kernel(self, a) } #[inline(always)] fn and_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x16 { - unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask8x16, b: mask8x16) -> mask8x16 { + _mm_and_si128(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn or_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x16 { - unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask8x16, b: mask8x16) -> mask8x16 { + _mm_or_si128(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn xor_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x16 { - unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask8x16, b: mask8x16) -> mask8x16 { + _mm_xor_si128(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn not_mask8x16(self, a: mask8x16) -> mask8x16 { @@ -911,35 +1446,88 @@ impl Simd for Avx2 { b: mask8x16, c: mask8x16, ) -> mask8x16 { - unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx2, + a: mask8x16, + b: mask8x16, + c: mask8x16, + ) -> mask8x16 { + _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] fn simd_eq_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x16 { - unsafe { _mm_cmpeq_epi8(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask8x16, b: mask8x16) -> mask8x16 { + _mm_cmpeq_epi8(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn any_true_mask8x16(self, a: mask8x16) -> bool { - unsafe { _mm_movemask_epi8(a.into()) as u32 != 0 } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask8x16) -> bool { + _mm_movemask_epi8(a.into()) as u32 != 0 + } + ); + kernel(self, a) } #[inline(always)] fn all_true_mask8x16(self, a: mask8x16) -> bool { - unsafe { _mm_movemask_epi8(a.into()) as u32 == 0xffff } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask8x16) -> bool { + _mm_movemask_epi8(a.into()) as u32 == 0xffff + } + ); + kernel(self, a) } #[inline(always)] fn any_false_mask8x16(self, a: mask8x16) -> bool { - unsafe { _mm_movemask_epi8(a.into()) as u32 != 0xffff } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask8x16) -> bool { + _mm_movemask_epi8(a.into()) as u32 != 0xffff + } + ); + kernel(self, a) } #[inline(always)] fn all_false_mask8x16(self, a: mask8x16) -> bool { - unsafe { _mm_movemask_epi8(a.into()) as u32 == 0 } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask8x16) -> bool { + _mm_movemask_epi8(a.into()) as u32 == 0 + } + ); + kernel(self, a) } #[inline(always)] fn combine_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x32 { - unsafe { _mm256_setr_m128i(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask8x16, b: mask8x16) -> mask8x32 { + _mm256_setr_m128i(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn splat_i16x8(self, val: i16) -> i16x8 { - unsafe { _mm_set1_epi16(val).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, val: i16) -> i16x8 { + _mm_set1_epi16(val).simd_into(token) + } + ); + kernel(self, val) } #[inline(always)] fn load_array_i16x8(self, val: [i16; 8usize]) -> i16x8 { @@ -1012,27 +1600,63 @@ impl Simd for Avx2 { } #[inline(always)] fn add_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - unsafe { _mm_add_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i16x8, b: i16x8) -> i16x8 { + _mm_add_epi16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn sub_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - unsafe { _mm_sub_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i16x8, b: i16x8) -> i16x8 { + _mm_sub_epi16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn mul_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - unsafe { _mm_mullo_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i16x8, b: i16x8) -> i16x8 { + _mm_mullo_epi16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn and_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i16x8, b: i16x8) -> i16x8 { + _mm_and_si128(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn or_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i16x8, b: i16x8) -> i16x8 { + _mm_or_si128(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn xor_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i16x8, b: i16x8) -> i16x8 { + _mm_xor_si128(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn not_i16x8(self, a: i16x8) -> i16x8 { @@ -1040,7 +1664,13 @@ impl Simd for Avx2 { } #[inline(always)] fn shl_i16x8(self, a: i16x8, shift: u32) -> i16x8 { - unsafe { _mm_sll_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i16x8, shift: u32) -> i16x8 { + _mm_sll_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) + } + ); + kernel(self, a, shift) } #[inline(always)] fn shlv_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { @@ -1048,7 +1678,13 @@ impl Simd for Avx2 { } #[inline(always)] fn shr_i16x8(self, a: i16x8, shift: u32) -> i16x8 { - unsafe { _mm_sra_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i16x8, shift: u32) -> i16x8 { + _mm_sra_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) + } + ); + kernel(self, a, shift) } #[inline(always)] fn shrv_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { @@ -1056,49 +1692,99 @@ impl Simd for Avx2 { } #[inline(always)] fn simd_eq_i16x8(self, a: i16x8, b: i16x8) -> mask16x8 { - unsafe { _mm_cmpeq_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i16x8, b: i16x8) -> mask16x8 { + _mm_cmpeq_epi16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_lt_i16x8(self, a: i16x8, b: i16x8) -> mask16x8 { - unsafe { _mm_cmpgt_epi16(b.into(), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i16x8, b: i16x8) -> mask16x8 { + _mm_cmpgt_epi16(b.into(), a.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_le_i16x8(self, a: i16x8, b: i16x8) -> mask16x8 { - unsafe { _mm_cmpeq_epi16(_mm_min_epi16(a.into(), b.into()), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i16x8, b: i16x8) -> mask16x8 { + _mm_cmpeq_epi16(_mm_min_epi16(a.into(), b.into()), a.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_ge_i16x8(self, a: i16x8, b: i16x8) -> mask16x8 { - unsafe { _mm_cmpeq_epi16(_mm_max_epi16(a.into(), b.into()), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i16x8, b: i16x8) -> mask16x8 { + _mm_cmpeq_epi16(_mm_max_epi16(a.into(), b.into()), a.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_gt_i16x8(self, a: i16x8, b: i16x8) -> mask16x8 { - unsafe { _mm_cmpgt_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i16x8, b: i16x8) -> mask16x8 { + _mm_cmpgt_epi16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn zip_low_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - unsafe { _mm_unpacklo_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i16x8, b: i16x8) -> i16x8 { + _mm_unpacklo_epi16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn zip_high_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - unsafe { _mm_unpackhi_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i16x8, b: i16x8) -> i16x8 { + _mm_unpackhi_epi16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_low_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - unsafe { - let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15); - let t1 = _mm_shuffle_epi8(a.into(), mask); - let t2 = _mm_shuffle_epi8(b.into(), mask); - _mm_unpacklo_epi64(t1, t2).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i16x8, b: i16x8) -> i16x8 { + let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15); + let t1 = _mm_shuffle_epi8(a.into(), mask); + let t2 = _mm_shuffle_epi8(b.into(), mask); + _mm_unpacklo_epi64(t1, t2).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_high_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - unsafe { - let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15); - let t1 = _mm_shuffle_epi8(a.into(), mask); - let t2 = _mm_shuffle_epi8(b.into(), mask); - _mm_unpackhi_epi64(t1, t2).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i16x8, b: i16x8) -> i16x8 { + let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15); + let t1 = _mm_shuffle_epi8(a.into(), mask); + let t2 = _mm_shuffle_epi8(b.into(), mask); + _mm_unpackhi_epi64(t1, t2).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn interleave_i16x8(self, a: i16x8, b: i16x8) -> (i16x8, i16x8) { @@ -1110,35 +1796,88 @@ impl Simd for Avx2 { } #[inline(always)] fn select_i16x8(self, a: mask16x8, b: i16x8, c: i16x8) -> i16x8 { - unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx2, + a: mask16x8, + b: i16x8, + c: i16x8, + ) -> i16x8 { + _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] fn min_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - unsafe { _mm_min_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i16x8, b: i16x8) -> i16x8 { + _mm_min_epi16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn max_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - unsafe { _mm_max_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i16x8, b: i16x8) -> i16x8 { + _mm_max_epi16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn combine_i16x8(self, a: i16x8, b: i16x8) -> i16x16 { - unsafe { _mm256_setr_m128i(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i16x8, b: i16x8) -> i16x16 { + _mm256_setr_m128i(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn neg_i16x8(self, a: i16x8) -> i16x8 { - unsafe { _mm_sub_epi16(_mm_setzero_si128(), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i16x8) -> i16x8 { + _mm_sub_epi16(_mm_setzero_si128(), a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn reinterpret_u8_i16x8(self, a: i16x8) -> u8x16 { - __m128i::from(a).simd_into(self) + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i16x8) -> u8x16 { + __m128i::from(a).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn reinterpret_u32_i16x8(self, a: i16x8) -> u32x4 { - __m128i::from(a).simd_into(self) + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i16x8) -> u32x4 { + __m128i::from(a).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn splat_u16x8(self, val: u16) -> u16x8 { - unsafe { _mm_set1_epi16(val.cast_signed()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, val: u16) -> u16x8 { + _mm_set1_epi16(val.cast_signed()).simd_into(token) + } + ); + kernel(self, val) } #[inline(always)] fn load_array_u16x8(self, val: [u16; 8usize]) -> u16x8 { @@ -1211,27 +1950,63 @@ impl Simd for Avx2 { } #[inline(always)] fn add_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - unsafe { _mm_add_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u16x8, b: u16x8) -> u16x8 { + _mm_add_epi16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn sub_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - unsafe { _mm_sub_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u16x8, b: u16x8) -> u16x8 { + _mm_sub_epi16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn mul_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - unsafe { _mm_mullo_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u16x8, b: u16x8) -> u16x8 { + _mm_mullo_epi16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn and_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u16x8, b: u16x8) -> u16x8 { + _mm_and_si128(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn or_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u16x8, b: u16x8) -> u16x8 { + _mm_or_si128(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn xor_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u16x8, b: u16x8) -> u16x8 { + _mm_xor_si128(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn not_u16x8(self, a: u16x8) -> u16x8 { @@ -1239,7 +2014,13 @@ impl Simd for Avx2 { } #[inline(always)] fn shl_u16x8(self, a: u16x8, shift: u32) -> u16x8 { - unsafe { _mm_sll_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u16x8, shift: u32) -> u16x8 { + _mm_sll_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) + } + ); + kernel(self, a, shift) } #[inline(always)] fn shlv_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { @@ -1247,7 +2028,13 @@ impl Simd for Avx2 { } #[inline(always)] fn shr_u16x8(self, a: u16x8, shift: u32) -> u16x8 { - unsafe { _mm_srl_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u16x8, shift: u32) -> u16x8 { + _mm_srl_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) + } + ); + kernel(self, a, shift) } #[inline(always)] fn shrv_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { @@ -1255,59 +2042,105 @@ impl Simd for Avx2 { } #[inline(always)] fn simd_eq_u16x8(self, a: u16x8, b: u16x8) -> mask16x8 { - unsafe { _mm_cmpeq_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u16x8, b: u16x8) -> mask16x8 { + _mm_cmpeq_epi16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_lt_u16x8(self, a: u16x8, b: u16x8) -> mask16x8 { - unsafe { - let sign_bit = _mm_set1_epi16(0x8000u16.cast_signed()); - let a_signed = _mm_xor_si128(a.into(), sign_bit); - let b_signed = _mm_xor_si128(b.into(), sign_bit); - _mm_cmpgt_epi16(b_signed, a_signed).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u16x8, b: u16x8) -> mask16x8 { + let sign_bit = _mm_set1_epi16(0x8000u16.cast_signed()); + let a_signed = _mm_xor_si128(a.into(), sign_bit); + let b_signed = _mm_xor_si128(b.into(), sign_bit); + _mm_cmpgt_epi16(b_signed, a_signed).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_le_u16x8(self, a: u16x8, b: u16x8) -> mask16x8 { - unsafe { _mm_cmpeq_epi16(_mm_min_epu16(a.into(), b.into()), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u16x8, b: u16x8) -> mask16x8 { + _mm_cmpeq_epi16(_mm_min_epu16(a.into(), b.into()), a.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_ge_u16x8(self, a: u16x8, b: u16x8) -> mask16x8 { - unsafe { _mm_cmpeq_epi16(_mm_max_epu16(a.into(), b.into()), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u16x8, b: u16x8) -> mask16x8 { + _mm_cmpeq_epi16(_mm_max_epu16(a.into(), b.into()), a.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_gt_u16x8(self, a: u16x8, b: u16x8) -> mask16x8 { - unsafe { - let sign_bit = _mm_set1_epi16(0x8000u16.cast_signed()); - let a_signed = _mm_xor_si128(a.into(), sign_bit); - let b_signed = _mm_xor_si128(b.into(), sign_bit); - _mm_cmpgt_epi16(a_signed, b_signed).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u16x8, b: u16x8) -> mask16x8 { + let sign_bit = _mm_set1_epi16(0x8000u16.cast_signed()); + let a_signed = _mm_xor_si128(a.into(), sign_bit); + let b_signed = _mm_xor_si128(b.into(), sign_bit); + _mm_cmpgt_epi16(a_signed, b_signed).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn zip_low_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - unsafe { _mm_unpacklo_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u16x8, b: u16x8) -> u16x8 { + _mm_unpacklo_epi16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn zip_high_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - unsafe { _mm_unpackhi_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u16x8, b: u16x8) -> u16x8 { + _mm_unpackhi_epi16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_low_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - unsafe { - let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15); - let t1 = _mm_shuffle_epi8(a.into(), mask); - let t2 = _mm_shuffle_epi8(b.into(), mask); - _mm_unpacklo_epi64(t1, t2).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u16x8, b: u16x8) -> u16x8 { + let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15); + let t1 = _mm_shuffle_epi8(a.into(), mask); + let t2 = _mm_shuffle_epi8(b.into(), mask); + _mm_unpacklo_epi64(t1, t2).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_high_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - unsafe { - let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15); - let t1 = _mm_shuffle_epi8(a.into(), mask); - let t2 = _mm_shuffle_epi8(b.into(), mask); - _mm_unpackhi_epi64(t1, t2).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u16x8, b: u16x8) -> u16x8 { + let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15); + let t1 = _mm_shuffle_epi8(a.into(), mask); + let t2 = _mm_shuffle_epi8(b.into(), mask); + _mm_unpackhi_epi64(t1, t2).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn interleave_u16x8(self, a: u16x8, b: u16x8) -> (u16x8, u16x8) { @@ -1319,34 +2152,79 @@ impl Simd for Avx2 { } #[inline(always)] fn select_u16x8(self, a: mask16x8, b: u16x8, c: u16x8) -> u16x8 { - unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx2, + a: mask16x8, + b: u16x8, + c: u16x8, + ) -> u16x8 { + _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] fn min_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - unsafe { _mm_min_epu16(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u16x8, b: u16x8) -> u16x8 { + _mm_min_epu16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn max_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - unsafe { _mm_max_epu16(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u16x8, b: u16x8) -> u16x8 { + _mm_max_epu16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn combine_u16x8(self, a: u16x8, b: u16x8) -> u16x16 { - unsafe { _mm256_setr_m128i(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u16x8, b: u16x8) -> u16x16 { + _mm256_setr_m128i(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn reinterpret_u8_u16x8(self, a: u16x8) -> u8x16 { - __m128i::from(a).simd_into(self) + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u16x8) -> u8x16 { + __m128i::from(a).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn reinterpret_u32_u16x8(self, a: u16x8) -> u32x4 { - __m128i::from(a).simd_into(self) + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u16x8) -> u32x4 { + __m128i::from(a).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn splat_mask16x8(self, val: bool) -> mask16x8 { - unsafe { - let val: i16 = if val { !0 } else { 0 }; - _mm_set1_epi16(val).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, val: bool) -> mask16x8 { + let val: i16 = if val { !0 } else { 0 }; + _mm_set1_epi16(val).simd_into(token) + } + ); + kernel(self, val) } #[inline(always)] fn load_array_mask16x8(self, val: [i16; 8usize]) -> mask16x8 { @@ -1361,35 +2239,61 @@ impl Simd for Avx2 { } #[inline(always)] fn from_bitmask_mask16x8(self, bits: u64) -> mask16x8 { - unsafe { - { - let bit_lanes = _mm_set1_epi16(bits as i16); - let bit_mask = _mm_setr_epi16(1, 2, 4, 8, 16, 32, 64, 128); - _mm_cmpeq_epi16(_mm_and_si128(bit_lanes, bit_mask), bit_mask) + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, bits: u64) -> mask16x8 { + { + let bit_lanes = _mm_set1_epi16(bits as i16); + let bit_mask = _mm_setr_epi16(1, 2, 4, 8, 16, 32, 64, 128); + _mm_cmpeq_epi16(_mm_and_si128(bit_lanes, bit_mask), bit_mask) + } + .simd_into(token) } - .simd_into(self) - } + ); + kernel(self, bits) } #[inline(always)] fn to_bitmask_mask16x8(self, a: mask16x8) -> u64 { - unsafe { - { - let packed = _mm_packs_epi16(a.into(), a.into()); - _mm_movemask_epi8(packed) as u8 as u64 + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask16x8) -> u64 { + { + let packed = _mm_packs_epi16(a.into(), a.into()); + _mm_movemask_epi8(packed) as u8 as u64 + } } - } + ); + kernel(self, a) } #[inline(always)] fn and_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x8 { - unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask16x8, b: mask16x8) -> mask16x8 { + _mm_and_si128(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn or_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x8 { - unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask16x8, b: mask16x8) -> mask16x8 { + _mm_or_si128(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn xor_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x8 { - unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask16x8, b: mask16x8) -> mask16x8 { + _mm_xor_si128(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn not_mask16x8(self, a: mask16x8) -> mask16x8 { @@ -1402,35 +2306,88 @@ impl Simd for Avx2 { b: mask16x8, c: mask16x8, ) -> mask16x8 { - unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx2, + a: mask16x8, + b: mask16x8, + c: mask16x8, + ) -> mask16x8 { + _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] fn simd_eq_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x8 { - unsafe { _mm_cmpeq_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask16x8, b: mask16x8) -> mask16x8 { + _mm_cmpeq_epi16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn any_true_mask16x8(self, a: mask16x8) -> bool { - unsafe { _mm_movemask_epi8(a.into()) as u32 != 0 } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask16x8) -> bool { + _mm_movemask_epi8(a.into()) as u32 != 0 + } + ); + kernel(self, a) } #[inline(always)] fn all_true_mask16x8(self, a: mask16x8) -> bool { - unsafe { _mm_movemask_epi8(a.into()) as u32 == 0xffff } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask16x8) -> bool { + _mm_movemask_epi8(a.into()) as u32 == 0xffff + } + ); + kernel(self, a) } #[inline(always)] fn any_false_mask16x8(self, a: mask16x8) -> bool { - unsafe { _mm_movemask_epi8(a.into()) as u32 != 0xffff } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask16x8) -> bool { + _mm_movemask_epi8(a.into()) as u32 != 0xffff + } + ); + kernel(self, a) } #[inline(always)] fn all_false_mask16x8(self, a: mask16x8) -> bool { - unsafe { _mm_movemask_epi8(a.into()) as u32 == 0 } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask16x8) -> bool { + _mm_movemask_epi8(a.into()) as u32 == 0 + } + ); + kernel(self, a) } #[inline(always)] fn combine_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x16 { - unsafe { _mm256_setr_m128i(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask16x8, b: mask16x8) -> mask16x16 { + _mm256_setr_m128i(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn splat_i32x4(self, val: i32) -> i32x4 { - unsafe { _mm_set1_epi32(val).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, val: i32) -> i32x4 { + _mm_set1_epi32(val).simd_into(token) + } + ); + kernel(self, val) } #[inline(always)] fn load_array_i32x4(self, val: [i32; 4usize]) -> i32x4 { @@ -1503,27 +2460,63 @@ impl Simd for Avx2 { } #[inline(always)] fn add_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - unsafe { _mm_add_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i32x4, b: i32x4) -> i32x4 { + _mm_add_epi32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn sub_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - unsafe { _mm_sub_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i32x4, b: i32x4) -> i32x4 { + _mm_sub_epi32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn mul_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - unsafe { _mm_mullo_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i32x4, b: i32x4) -> i32x4 { + _mm_mullo_epi32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn and_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i32x4, b: i32x4) -> i32x4 { + _mm_and_si128(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn or_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i32x4, b: i32x4) -> i32x4 { + _mm_or_si128(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn xor_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i32x4, b: i32x4) -> i32x4 { + _mm_xor_si128(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn not_i32x4(self, a: i32x4) -> i32x4 { @@ -1531,63 +2524,137 @@ impl Simd for Avx2 { } #[inline(always)] fn shl_i32x4(self, a: i32x4, shift: u32) -> i32x4 { - unsafe { _mm_sll_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i32x4, shift: u32) -> i32x4 { + _mm_sll_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) + } + ); + kernel(self, a, shift) } #[inline(always)] fn shlv_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - unsafe { _mm_sllv_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i32x4, b: i32x4) -> i32x4 { + _mm_sllv_epi32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn shr_i32x4(self, a: i32x4, shift: u32) -> i32x4 { - unsafe { _mm_sra_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i32x4, shift: u32) -> i32x4 { + _mm_sra_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) + } + ); + kernel(self, a, shift) } #[inline(always)] fn shrv_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - unsafe { _mm_srav_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i32x4, b: i32x4) -> i32x4 { + _mm_srav_epi32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_eq_i32x4(self, a: i32x4, b: i32x4) -> mask32x4 { - unsafe { _mm_cmpeq_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i32x4, b: i32x4) -> mask32x4 { + _mm_cmpeq_epi32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_lt_i32x4(self, a: i32x4, b: i32x4) -> mask32x4 { - unsafe { _mm_cmpgt_epi32(b.into(), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i32x4, b: i32x4) -> mask32x4 { + _mm_cmpgt_epi32(b.into(), a.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_le_i32x4(self, a: i32x4, b: i32x4) -> mask32x4 { - unsafe { _mm_cmpeq_epi32(_mm_min_epi32(a.into(), b.into()), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i32x4, b: i32x4) -> mask32x4 { + _mm_cmpeq_epi32(_mm_min_epi32(a.into(), b.into()), a.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_ge_i32x4(self, a: i32x4, b: i32x4) -> mask32x4 { - unsafe { _mm_cmpeq_epi32(_mm_max_epi32(a.into(), b.into()), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i32x4, b: i32x4) -> mask32x4 { + _mm_cmpeq_epi32(_mm_max_epi32(a.into(), b.into()), a.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_gt_i32x4(self, a: i32x4, b: i32x4) -> mask32x4 { - unsafe { _mm_cmpgt_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i32x4, b: i32x4) -> mask32x4 { + _mm_cmpgt_epi32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn zip_low_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - unsafe { _mm_unpacklo_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i32x4, b: i32x4) -> i32x4 { + _mm_unpacklo_epi32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn zip_high_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - unsafe { _mm_unpackhi_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i32x4, b: i32x4) -> i32x4 { + _mm_unpackhi_epi32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_low_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - unsafe { - let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into()); - let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into()); - _mm_unpacklo_epi64(t1, t2).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i32x4, b: i32x4) -> i32x4 { + let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into()); + let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into()); + _mm_unpacklo_epi64(t1, t2).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_high_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - unsafe { - let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into()); - let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into()); - _mm_unpackhi_epi64(t1, t2).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i32x4, b: i32x4) -> i32x4 { + let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into()); + let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into()); + _mm_unpackhi_epi64(t1, t2).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn interleave_i32x4(self, a: i32x4, b: i32x4) -> (i32x4, i32x4) { @@ -1599,39 +2666,98 @@ impl Simd for Avx2 { } #[inline(always)] fn select_i32x4(self, a: mask32x4, b: i32x4, c: i32x4) -> i32x4 { - unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx2, + a: mask32x4, + b: i32x4, + c: i32x4, + ) -> i32x4 { + _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] fn min_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - unsafe { _mm_min_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i32x4, b: i32x4) -> i32x4 { + _mm_min_epi32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn max_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - unsafe { _mm_max_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i32x4, b: i32x4) -> i32x4 { + _mm_max_epi32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn combine_i32x4(self, a: i32x4, b: i32x4) -> i32x8 { - unsafe { _mm256_setr_m128i(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i32x4, b: i32x4) -> i32x8 { + _mm256_setr_m128i(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn neg_i32x4(self, a: i32x4) -> i32x4 { - unsafe { _mm_sub_epi32(_mm_setzero_si128(), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i32x4) -> i32x4 { + _mm_sub_epi32(_mm_setzero_si128(), a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn reinterpret_u8_i32x4(self, a: i32x4) -> u8x16 { - __m128i::from(a).simd_into(self) + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i32x4) -> u8x16 { + __m128i::from(a).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn reinterpret_u32_i32x4(self, a: i32x4) -> u32x4 { - __m128i::from(a).simd_into(self) + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i32x4) -> u32x4 { + __m128i::from(a).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn cvt_f32_i32x4(self, a: i32x4) -> f32x4 { - unsafe { _mm_cvtepi32_ps(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i32x4) -> f32x4 { + _mm_cvtepi32_ps(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn splat_u32x4(self, val: u32) -> u32x4 { - unsafe { _mm_set1_epi32(val.cast_signed()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, val: u32) -> u32x4 { + _mm_set1_epi32(val.cast_signed()).simd_into(token) + } + ); + kernel(self, val) } #[inline(always)] fn load_array_u32x4(self, val: [u32; 4usize]) -> u32x4 { @@ -1704,27 +2830,63 @@ impl Simd for Avx2 { } #[inline(always)] fn add_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - unsafe { _mm_add_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u32x4, b: u32x4) -> u32x4 { + _mm_add_epi32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn sub_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - unsafe { _mm_sub_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u32x4, b: u32x4) -> u32x4 { + _mm_sub_epi32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn mul_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - unsafe { _mm_mullo_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u32x4, b: u32x4) -> u32x4 { + _mm_mullo_epi32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn and_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u32x4, b: u32x4) -> u32x4 { + _mm_and_si128(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn or_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u32x4, b: u32x4) -> u32x4 { + _mm_or_si128(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn xor_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u32x4, b: u32x4) -> u32x4 { + _mm_xor_si128(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn not_u32x4(self, a: u32x4) -> u32x4 { @@ -1732,73 +2894,143 @@ impl Simd for Avx2 { } #[inline(always)] fn shl_u32x4(self, a: u32x4, shift: u32) -> u32x4 { - unsafe { _mm_sll_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u32x4, shift: u32) -> u32x4 { + _mm_sll_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) + } + ); + kernel(self, a, shift) } #[inline(always)] fn shlv_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - unsafe { _mm_sllv_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u32x4, b: u32x4) -> u32x4 { + _mm_sllv_epi32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn shr_u32x4(self, a: u32x4, shift: u32) -> u32x4 { - unsafe { _mm_srl_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u32x4, shift: u32) -> u32x4 { + _mm_srl_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) + } + ); + kernel(self, a, shift) } #[inline(always)] fn shrv_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - unsafe { _mm_srlv_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u32x4, b: u32x4) -> u32x4 { + _mm_srlv_epi32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_eq_u32x4(self, a: u32x4, b: u32x4) -> mask32x4 { - unsafe { _mm_cmpeq_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u32x4, b: u32x4) -> mask32x4 { + _mm_cmpeq_epi32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_lt_u32x4(self, a: u32x4, b: u32x4) -> mask32x4 { - unsafe { - let sign_bit = _mm_set1_epi32(0x80000000u32.cast_signed()); - let a_signed = _mm_xor_si128(a.into(), sign_bit); - let b_signed = _mm_xor_si128(b.into(), sign_bit); - _mm_cmpgt_epi32(b_signed, a_signed).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u32x4, b: u32x4) -> mask32x4 { + let sign_bit = _mm_set1_epi32(0x80000000u32.cast_signed()); + let a_signed = _mm_xor_si128(a.into(), sign_bit); + let b_signed = _mm_xor_si128(b.into(), sign_bit); + _mm_cmpgt_epi32(b_signed, a_signed).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_le_u32x4(self, a: u32x4, b: u32x4) -> mask32x4 { - unsafe { _mm_cmpeq_epi32(_mm_min_epu32(a.into(), b.into()), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u32x4, b: u32x4) -> mask32x4 { + _mm_cmpeq_epi32(_mm_min_epu32(a.into(), b.into()), a.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_ge_u32x4(self, a: u32x4, b: u32x4) -> mask32x4 { - unsafe { _mm_cmpeq_epi32(_mm_max_epu32(a.into(), b.into()), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u32x4, b: u32x4) -> mask32x4 { + _mm_cmpeq_epi32(_mm_max_epu32(a.into(), b.into()), a.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_gt_u32x4(self, a: u32x4, b: u32x4) -> mask32x4 { - unsafe { - let sign_bit = _mm_set1_epi32(0x80000000u32.cast_signed()); - let a_signed = _mm_xor_si128(a.into(), sign_bit); - let b_signed = _mm_xor_si128(b.into(), sign_bit); - _mm_cmpgt_epi32(a_signed, b_signed).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u32x4, b: u32x4) -> mask32x4 { + let sign_bit = _mm_set1_epi32(0x80000000u32.cast_signed()); + let a_signed = _mm_xor_si128(a.into(), sign_bit); + let b_signed = _mm_xor_si128(b.into(), sign_bit); + _mm_cmpgt_epi32(a_signed, b_signed).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn zip_low_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - unsafe { _mm_unpacklo_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u32x4, b: u32x4) -> u32x4 { + _mm_unpacklo_epi32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn zip_high_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - unsafe { _mm_unpackhi_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u32x4, b: u32x4) -> u32x4 { + _mm_unpackhi_epi32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_low_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - unsafe { - let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into()); - let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into()); - _mm_unpacklo_epi64(t1, t2).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u32x4, b: u32x4) -> u32x4 { + let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into()); + let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into()); + _mm_unpacklo_epi64(t1, t2).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_high_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - unsafe { - let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into()); - let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into()); - _mm_unpackhi_epi64(t1, t2).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u32x4, b: u32x4) -> u32x4 { + let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into()); + let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into()); + _mm_unpackhi_epi64(t1, t2).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn interleave_u32x4(self, a: u32x4, b: u32x4) -> (u32x4, u32x4) { @@ -1810,44 +3042,88 @@ impl Simd for Avx2 { } #[inline(always)] fn select_u32x4(self, a: mask32x4, b: u32x4, c: u32x4) -> u32x4 { - unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx2, + a: mask32x4, + b: u32x4, + c: u32x4, + ) -> u32x4 { + _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] fn min_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - unsafe { _mm_min_epu32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u32x4, b: u32x4) -> u32x4 { + _mm_min_epu32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn max_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - unsafe { _mm_max_epu32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u32x4, b: u32x4) -> u32x4 { + _mm_max_epu32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn combine_u32x4(self, a: u32x4, b: u32x4) -> u32x8 { - unsafe { _mm256_setr_m128i(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u32x4, b: u32x4) -> u32x8 { + _mm256_setr_m128i(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn reinterpret_u8_u32x4(self, a: u32x4) -> u8x16 { - __m128i::from(a).simd_into(self) + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u32x4) -> u8x16 { + __m128i::from(a).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn cvt_f32_u32x4(self, a: u32x4) -> f32x4 { - unsafe { - let a = a.into(); - let lo = _mm_blend_epi16::<0xAA>(a, _mm_set1_epi32(0x4B000000)); - let hi = _mm_blend_epi16::<0xAA>(_mm_srli_epi32::<16>(a), _mm_set1_epi32(0x53000000)); - let fhi = _mm_sub_ps( - _mm_castsi128_ps(hi), - _mm_set1_ps(f32::from_bits(0x53000080)), - ); - let result = _mm_add_ps(_mm_castsi128_ps(lo), fhi); - result.simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u32x4) -> f32x4 { + let a = a.into(); + let lo = _mm_blend_epi16::<0xAA>(a, _mm_set1_epi32(0x4B000000)); + let hi = + _mm_blend_epi16::<0xAA>(_mm_srli_epi32::<16>(a), _mm_set1_epi32(0x53000000)); + let fhi = _mm_sub_ps( + _mm_castsi128_ps(hi), + _mm_set1_ps(f32::from_bits(0x53000080)), + ); + let result = _mm_add_ps(_mm_castsi128_ps(lo), fhi); + result.simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn splat_mask32x4(self, val: bool) -> mask32x4 { - unsafe { - let val: i32 = if val { !0 } else { 0 }; - _mm_set1_epi32(val).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, val: bool) -> mask32x4 { + let val: i32 = if val { !0 } else { 0 }; + _mm_set1_epi32(val).simd_into(token) + } + ); + kernel(self, val) } #[inline(always)] fn load_array_mask32x4(self, val: [i32; 4usize]) -> mask32x4 { @@ -1862,30 +3138,58 @@ impl Simd for Avx2 { } #[inline(always)] fn from_bitmask_mask32x4(self, bits: u64) -> mask32x4 { - unsafe { - { - let bit_lanes = _mm_set1_epi32(bits as i32); - let bit_mask = _mm_setr_epi32(1, 2, 4, 8); - _mm_cmpeq_epi32(_mm_and_si128(bit_lanes, bit_mask), bit_mask) + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, bits: u64) -> mask32x4 { + { + let bit_lanes = _mm_set1_epi32(bits as i32); + let bit_mask = _mm_setr_epi32(1, 2, 4, 8); + _mm_cmpeq_epi32(_mm_and_si128(bit_lanes, bit_mask), bit_mask) + } + .simd_into(token) } - .simd_into(self) - } + ); + kernel(self, bits) } #[inline(always)] fn to_bitmask_mask32x4(self, a: mask32x4) -> u64 { - unsafe { _mm_movemask_ps(_mm_castsi128_ps(a.into())) as u32 as u64 } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask32x4) -> u64 { + _mm_movemask_ps(_mm_castsi128_ps(a.into())) as u32 as u64 + } + ); + kernel(self, a) } #[inline(always)] fn and_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x4 { - unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask32x4, b: mask32x4) -> mask32x4 { + _mm_and_si128(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn or_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x4 { - unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask32x4, b: mask32x4) -> mask32x4 { + _mm_or_si128(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn xor_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x4 { - unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask32x4, b: mask32x4) -> mask32x4 { + _mm_xor_si128(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn not_mask32x4(self, a: mask32x4) -> mask32x4 { @@ -1898,35 +3202,88 @@ impl Simd for Avx2 { b: mask32x4, c: mask32x4, ) -> mask32x4 { - unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx2, + a: mask32x4, + b: mask32x4, + c: mask32x4, + ) -> mask32x4 { + _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] fn simd_eq_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x4 { - unsafe { _mm_cmpeq_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask32x4, b: mask32x4) -> mask32x4 { + _mm_cmpeq_epi32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn any_true_mask32x4(self, a: mask32x4) -> bool { - unsafe { _mm_movemask_ps(_mm_castsi128_ps(a.into())) as u32 != 0 } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask32x4) -> bool { + _mm_movemask_ps(_mm_castsi128_ps(a.into())) as u32 != 0 + } + ); + kernel(self, a) } #[inline(always)] fn all_true_mask32x4(self, a: mask32x4) -> bool { - unsafe { _mm_movemask_ps(_mm_castsi128_ps(a.into())) as u32 == 0b1111 } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask32x4) -> bool { + _mm_movemask_ps(_mm_castsi128_ps(a.into())) as u32 == 0b1111 + } + ); + kernel(self, a) } #[inline(always)] fn any_false_mask32x4(self, a: mask32x4) -> bool { - unsafe { _mm_movemask_ps(_mm_castsi128_ps(a.into())) as u32 != 0b1111 } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask32x4) -> bool { + _mm_movemask_ps(_mm_castsi128_ps(a.into())) as u32 != 0b1111 + } + ); + kernel(self, a) } #[inline(always)] fn all_false_mask32x4(self, a: mask32x4) -> bool { - unsafe { _mm_movemask_ps(_mm_castsi128_ps(a.into())) as u32 == 0 } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask32x4) -> bool { + _mm_movemask_ps(_mm_castsi128_ps(a.into())) as u32 == 0 + } + ); + kernel(self, a) } #[inline(always)] fn combine_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x8 { - unsafe { _mm256_setr_m128i(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask32x4, b: mask32x4) -> mask32x8 { + _mm256_setr_m128i(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn splat_f64x2(self, val: f64) -> f64x2 { - unsafe { _mm_set1_pd(val).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, val: f64) -> f64x2 { + _mm_set1_pd(val).simd_into(token) + } + ); + kernel(self, val) } #[inline(always)] fn load_array_f64x2(self, val: [f64; 2usize]) -> f64x2 { @@ -1999,15 +3356,33 @@ impl Simd for Avx2 { } #[inline(always)] fn abs_f64x2(self, a: f64x2) -> f64x2 { - unsafe { _mm_andnot_pd(_mm_set1_pd(-0.0), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f64x2) -> f64x2 { + _mm_andnot_pd(_mm_set1_pd(-0.0), a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn neg_f64x2(self, a: f64x2) -> f64x2 { - unsafe { _mm_xor_pd(a.into(), _mm_set1_pd(-0.0)).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f64x2) -> f64x2 { + _mm_xor_pd(a.into(), _mm_set1_pd(-0.0)).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn sqrt_f64x2(self, a: f64x2) -> f64x2 { - unsafe { _mm_sqrt_pd(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f64x2) -> f64x2 { + _mm_sqrt_pd(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn approximate_recip_f64x2(self, a: f64x2) -> f64x2 { @@ -2015,62 +3390,145 @@ impl Simd for Avx2 { } #[inline(always)] fn add_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - unsafe { _mm_add_pd(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f64x2, b: f64x2) -> f64x2 { + _mm_add_pd(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn sub_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - unsafe { _mm_sub_pd(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f64x2, b: f64x2) -> f64x2 { + _mm_sub_pd(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn mul_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - unsafe { _mm_mul_pd(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f64x2, b: f64x2) -> f64x2 { + _mm_mul_pd(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn div_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - unsafe { _mm_div_pd(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f64x2, b: f64x2) -> f64x2 { + _mm_div_pd(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn copysign_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - unsafe { - let mask = _mm_set1_pd(-0.0); - _mm_or_pd(_mm_and_pd(mask, b.into()), _mm_andnot_pd(mask, a.into())).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f64x2, b: f64x2) -> f64x2 { + let mask = _mm_set1_pd(-0.0); + _mm_or_pd(_mm_and_pd(mask, b.into()), _mm_andnot_pd(mask, a.into())) + .simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_eq_f64x2(self, a: f64x2, b: f64x2) -> mask64x2 { - unsafe { _mm_castpd_si128(_mm_cmpeq_pd(a.into(), b.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f64x2, b: f64x2) -> mask64x2 { + _mm_castpd_si128(_mm_cmpeq_pd(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_lt_f64x2(self, a: f64x2, b: f64x2) -> mask64x2 { - unsafe { _mm_castpd_si128(_mm_cmplt_pd(a.into(), b.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f64x2, b: f64x2) -> mask64x2 { + _mm_castpd_si128(_mm_cmplt_pd(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_le_f64x2(self, a: f64x2, b: f64x2) -> mask64x2 { - unsafe { _mm_castpd_si128(_mm_cmple_pd(a.into(), b.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f64x2, b: f64x2) -> mask64x2 { + _mm_castpd_si128(_mm_cmple_pd(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_ge_f64x2(self, a: f64x2, b: f64x2) -> mask64x2 { - unsafe { _mm_castpd_si128(_mm_cmpge_pd(a.into(), b.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f64x2, b: f64x2) -> mask64x2 { + _mm_castpd_si128(_mm_cmpge_pd(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_gt_f64x2(self, a: f64x2, b: f64x2) -> mask64x2 { - unsafe { _mm_castpd_si128(_mm_cmpgt_pd(a.into(), b.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f64x2, b: f64x2) -> mask64x2 { + _mm_castpd_si128(_mm_cmpgt_pd(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn zip_low_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - unsafe { _mm_unpacklo_pd(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f64x2, b: f64x2) -> f64x2 { + _mm_unpacklo_pd(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn zip_high_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - unsafe { _mm_unpackhi_pd(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f64x2, b: f64x2) -> f64x2 { + _mm_unpackhi_pd(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_low_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - unsafe { _mm_shuffle_pd::<0b00>(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f64x2, b: f64x2) -> f64x2 { + _mm_shuffle_pd::<0b00>(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_high_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - unsafe { _mm_shuffle_pd::<0b11>(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f64x2, b: f64x2) -> f64x2 { + _mm_shuffle_pd::<0b11>(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn interleave_f64x2(self, a: f64x2, b: f64x2) -> (f64x2, f64x2) { @@ -2082,54 +3540,100 @@ impl Simd for Avx2 { } #[inline(always)] fn max_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - unsafe { _mm_max_pd(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f64x2, b: f64x2) -> f64x2 { + _mm_max_pd(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn min_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - unsafe { _mm_min_pd(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f64x2, b: f64x2) -> f64x2 { + _mm_min_pd(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn max_precise_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - unsafe { - let intermediate = _mm_max_pd(a.into(), b.into()); - let b_is_nan = _mm_cmpunord_pd(b.into(), b.into()); - _mm_blendv_pd(intermediate, a.into(), b_is_nan).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f64x2, b: f64x2) -> f64x2 { + let intermediate = _mm_max_pd(a.into(), b.into()); + let b_is_nan = _mm_cmpunord_pd(b.into(), b.into()); + _mm_blendv_pd(intermediate, a.into(), b_is_nan).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn min_precise_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - unsafe { - let intermediate = _mm_min_pd(a.into(), b.into()); - let b_is_nan = _mm_cmpunord_pd(b.into(), b.into()); - _mm_blendv_pd(intermediate, a.into(), b_is_nan).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f64x2, b: f64x2) -> f64x2 { + let intermediate = _mm_min_pd(a.into(), b.into()); + let b_is_nan = _mm_cmpunord_pd(b.into(), b.into()); + _mm_blendv_pd(intermediate, a.into(), b_is_nan).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn mul_add_f64x2(self, a: f64x2, b: f64x2, c: f64x2) -> f64x2 { - unsafe { _mm_fmadd_pd(a.into(), b.into(), c.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f64x2, b: f64x2, c: f64x2) -> f64x2 { + _mm_fmadd_pd(a.into(), b.into(), c.into()).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] fn mul_sub_f64x2(self, a: f64x2, b: f64x2, c: f64x2) -> f64x2 { - unsafe { _mm_fmsub_pd(a.into(), b.into(), c.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f64x2, b: f64x2, c: f64x2) -> f64x2 { + _mm_fmsub_pd(a.into(), b.into(), c.into()).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] fn floor_f64x2(self, a: f64x2) -> f64x2 { - unsafe { - _mm_round_pd::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f64x2) -> f64x2 { + _mm_round_pd::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into()) + .simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn ceil_f64x2(self, a: f64x2) -> f64x2 { - unsafe { - _mm_round_pd::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f64x2) -> f64x2 { + _mm_round_pd::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a.into()) + .simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn round_ties_even_f64x2(self, a: f64x2) -> f64x2 { - unsafe { - _mm_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a.into()) - .simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f64x2) -> f64x2 { + _mm_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a.into()) + .simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn fract_f64x2(self, a: f64x2) -> f64x2 { @@ -2137,28 +3641,60 @@ impl Simd for Avx2 { } #[inline(always)] fn trunc_f64x2(self, a: f64x2) -> f64x2 { - unsafe { - _mm_round_pd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f64x2) -> f64x2 { + _mm_round_pd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into()) + .simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn select_f64x2(self, a: mask64x2, b: f64x2, c: f64x2) -> f64x2 { - unsafe { _mm_blendv_pd(c.into(), b.into(), _mm_castsi128_pd(a.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx2, + a: mask64x2, + b: f64x2, + c: f64x2, + ) -> f64x2 { + _mm_blendv_pd(c.into(), b.into(), _mm_castsi128_pd(a.into())).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] fn combine_f64x2(self, a: f64x2, b: f64x2) -> f64x4 { - unsafe { _mm256_setr_m128d(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f64x2, b: f64x2) -> f64x4 { + _mm256_setr_m128d(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn reinterpret_f32_f64x2(self, a: f64x2) -> f32x4 { - unsafe { _mm_castpd_ps(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f64x2) -> f32x4 { + _mm_castpd_ps(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn splat_mask64x2(self, val: bool) -> mask64x2 { - unsafe { - let val: i64 = if val { !0 } else { 0 }; - _mm_set1_epi64x(val).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, val: bool) -> mask64x2 { + let val: i64 = if val { !0 } else { 0 }; + _mm_set1_epi64x(val).simd_into(token) + } + ); + kernel(self, val) } #[inline(always)] fn load_array_mask64x2(self, val: [i64; 2usize]) -> mask64x2 { @@ -2173,30 +3709,58 @@ impl Simd for Avx2 { } #[inline(always)] fn from_bitmask_mask64x2(self, bits: u64) -> mask64x2 { - unsafe { - { - let bit_lanes = _mm_set1_epi64x(bits.cast_signed()); - let bit_mask = _mm_set_epi64x(2, 1); - _mm_cmpeq_epi64(_mm_and_si128(bit_lanes, bit_mask), bit_mask) + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, bits: u64) -> mask64x2 { + { + let bit_lanes = _mm_set1_epi64x(bits.cast_signed()); + let bit_mask = _mm_set_epi64x(2, 1); + _mm_cmpeq_epi64(_mm_and_si128(bit_lanes, bit_mask), bit_mask) + } + .simd_into(token) } - .simd_into(self) - } + ); + kernel(self, bits) } #[inline(always)] fn to_bitmask_mask64x2(self, a: mask64x2) -> u64 { - unsafe { _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 as u64 } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask64x2) -> u64 { + _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 as u64 + } + ); + kernel(self, a) } #[inline(always)] fn and_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { - unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask64x2, b: mask64x2) -> mask64x2 { + _mm_and_si128(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn or_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { - unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask64x2, b: mask64x2) -> mask64x2 { + _mm_or_si128(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn xor_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { - unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask64x2, b: mask64x2) -> mask64x2 { + _mm_xor_si128(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn not_mask64x2(self, a: mask64x2) -> mask64x2 { @@ -2209,35 +3773,88 @@ impl Simd for Avx2 { b: mask64x2, c: mask64x2, ) -> mask64x2 { - unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx2, + a: mask64x2, + b: mask64x2, + c: mask64x2, + ) -> mask64x2 { + _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] fn simd_eq_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { - unsafe { _mm_cmpeq_epi64(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask64x2, b: mask64x2) -> mask64x2 { + _mm_cmpeq_epi64(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn any_true_mask64x2(self, a: mask64x2) -> bool { - unsafe { _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 != 0 } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask64x2) -> bool { + _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 != 0 + } + ); + kernel(self, a) } #[inline(always)] fn all_true_mask64x2(self, a: mask64x2) -> bool { - unsafe { _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 == 0b11 } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask64x2) -> bool { + _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 == 0b11 + } + ); + kernel(self, a) } #[inline(always)] fn any_false_mask64x2(self, a: mask64x2) -> bool { - unsafe { _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 != 0b11 } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask64x2) -> bool { + _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 != 0b11 + } + ); + kernel(self, a) } #[inline(always)] fn all_false_mask64x2(self, a: mask64x2) -> bool { - unsafe { _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 == 0 } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask64x2) -> bool { + _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 == 0 + } + ); + kernel(self, a) } #[inline(always)] fn combine_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x4 { - unsafe { _mm256_setr_m128i(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask64x2, b: mask64x2) -> mask64x4 { + _mm256_setr_m128i(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn splat_f32x8(self, val: f32) -> f32x8 { - unsafe { _mm256_set1_ps(val).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, val: f32) -> f32x8 { + _mm256_set1_ps(val).simd_into(token) + } + ); + kernel(self, val) } #[inline(always)] fn load_array_f32x8(self, val: [f32; 8usize]) -> f32x8 { @@ -2323,173 +3940,329 @@ impl Simd for Avx2 { } #[inline(always)] fn abs_f32x8(self, a: f32x8) -> f32x8 { - unsafe { _mm256_andnot_ps(_mm256_set1_ps(-0.0), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x8) -> f32x8 { + _mm256_andnot_ps(_mm256_set1_ps(-0.0), a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn neg_f32x8(self, a: f32x8) -> f32x8 { - unsafe { _mm256_xor_ps(a.into(), _mm256_set1_ps(-0.0)).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x8) -> f32x8 { + _mm256_xor_ps(a.into(), _mm256_set1_ps(-0.0)).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn sqrt_f32x8(self, a: f32x8) -> f32x8 { - unsafe { _mm256_sqrt_ps(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x8) -> f32x8 { + _mm256_sqrt_ps(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn approximate_recip_f32x8(self, a: f32x8) -> f32x8 { - unsafe { _mm256_rcp_ps(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x8) -> f32x8 { + _mm256_rcp_ps(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn add_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { - unsafe { _mm256_add_ps(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x8, b: f32x8) -> f32x8 { + _mm256_add_ps(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn sub_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { - unsafe { _mm256_sub_ps(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x8, b: f32x8) -> f32x8 { + _mm256_sub_ps(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn mul_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { - unsafe { _mm256_mul_ps(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x8, b: f32x8) -> f32x8 { + _mm256_mul_ps(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn div_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { - unsafe { _mm256_div_ps(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x8, b: f32x8) -> f32x8 { + _mm256_div_ps(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn copysign_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { - unsafe { - let mask = _mm256_set1_ps(-0.0); - _mm256_or_ps( - _mm256_and_ps(mask, b.into()), - _mm256_andnot_ps(mask, a.into()), - ) - .simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x8, b: f32x8) -> f32x8 { + let mask = _mm256_set1_ps(-0.0); + _mm256_or_ps( + _mm256_and_ps(mask, b.into()), + _mm256_andnot_ps(mask, a.into()), + ) + .simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_eq_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { - unsafe { _mm256_castps_si256(_mm256_cmp_ps::<0i32>(a.into(), b.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x8, b: f32x8) -> mask32x8 { + _mm256_castps_si256(_mm256_cmp_ps::<0i32>(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_lt_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { - unsafe { _mm256_castps_si256(_mm256_cmp_ps::<17i32>(a.into(), b.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x8, b: f32x8) -> mask32x8 { + _mm256_castps_si256(_mm256_cmp_ps::<17i32>(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_le_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { - unsafe { _mm256_castps_si256(_mm256_cmp_ps::<18i32>(a.into(), b.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x8, b: f32x8) -> mask32x8 { + _mm256_castps_si256(_mm256_cmp_ps::<18i32>(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_ge_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { - unsafe { _mm256_castps_si256(_mm256_cmp_ps::<29i32>(a.into(), b.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x8, b: f32x8) -> mask32x8 { + _mm256_castps_si256(_mm256_cmp_ps::<29i32>(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_gt_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { - unsafe { _mm256_castps_si256(_mm256_cmp_ps::<30i32>(a.into(), b.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x8, b: f32x8) -> mask32x8 { + _mm256_castps_si256(_mm256_cmp_ps::<30i32>(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn zip_low_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { - unsafe { - let lo = _mm256_unpacklo_ps(a.into(), b.into()); - let hi = _mm256_unpackhi_ps(a.into(), b.into()); - _mm256_permute2f128_ps::<0b0010_0000>(lo, hi).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x8, b: f32x8) -> f32x8 { + let lo = _mm256_unpacklo_ps(a.into(), b.into()); + let hi = _mm256_unpackhi_ps(a.into(), b.into()); + _mm256_permute2f128_ps::<0b0010_0000>(lo, hi).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn zip_high_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { - unsafe { - let lo = _mm256_unpacklo_ps(a.into(), b.into()); - let hi = _mm256_unpackhi_ps(a.into(), b.into()); - _mm256_permute2f128_ps::<0b0011_0001>(lo, hi).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x8, b: f32x8) -> f32x8 { + let lo = _mm256_unpacklo_ps(a.into(), b.into()); + let hi = _mm256_unpackhi_ps(a.into(), b.into()); + _mm256_permute2f128_ps::<0b0011_0001>(lo, hi).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_low_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { - unsafe { - let t1 = _mm256_permutevar8x32_ps(a.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)); - let t2 = _mm256_permutevar8x32_ps(b.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)); - _mm256_permute2f128_ps::<0b0010_0000>(t1, t2).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x8, b: f32x8) -> f32x8 { + let t1 = + _mm256_permutevar8x32_ps(a.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)); + let t2 = + _mm256_permutevar8x32_ps(b.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)); + _mm256_permute2f128_ps::<0b0010_0000>(t1, t2).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_high_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { - unsafe { - let t1 = _mm256_permutevar8x32_ps(a.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)); - let t2 = _mm256_permutevar8x32_ps(b.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)); - _mm256_permute2f128_ps::<0b0011_0001>(t1, t2).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x8, b: f32x8) -> f32x8 { + let t1 = + _mm256_permutevar8x32_ps(a.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)); + let t2 = + _mm256_permutevar8x32_ps(b.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)); + _mm256_permute2f128_ps::<0b0011_0001>(t1, t2).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn interleave_f32x8(self, a: f32x8, b: f32x8) -> (f32x8, f32x8) { - unsafe { - let lo = _mm256_unpacklo_ps(a.into(), b.into()); - let hi = _mm256_unpackhi_ps(a.into(), b.into()); - ( - _mm256_permute2f128_ps::<0b0010_0000>(lo, hi).simd_into(self), - _mm256_permute2f128_ps::<0b0011_0001>(lo, hi).simd_into(self), - ) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x8, b: f32x8) -> (f32x8, f32x8) { + let lo = _mm256_unpacklo_ps(a.into(), b.into()); + let hi = _mm256_unpackhi_ps(a.into(), b.into()); + ( + _mm256_permute2f128_ps::<0b0010_0000>(lo, hi).simd_into(token), + _mm256_permute2f128_ps::<0b0011_0001>(lo, hi).simd_into(token), + ) + } + ); + kernel(self, a, b) } #[inline(always)] fn deinterleave_f32x8(self, a: f32x8, b: f32x8) -> (f32x8, f32x8) { - unsafe { - let t1 = _mm256_permutevar8x32_ps(a.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)); - let t2 = _mm256_permutevar8x32_ps(b.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)); - ( - _mm256_permute2f128_ps::<0b0010_0000>(t1, t2).simd_into(self), - _mm256_permute2f128_ps::<0b0011_0001>(t1, t2).simd_into(self), - ) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x8, b: f32x8) -> (f32x8, f32x8) { + let t1 = + _mm256_permutevar8x32_ps(a.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)); + let t2 = + _mm256_permutevar8x32_ps(b.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)); + ( + _mm256_permute2f128_ps::<0b0010_0000>(t1, t2).simd_into(token), + _mm256_permute2f128_ps::<0b0011_0001>(t1, t2).simd_into(token), + ) + } + ); + kernel(self, a, b) } #[inline(always)] fn max_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { - unsafe { _mm256_max_ps(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x8, b: f32x8) -> f32x8 { + _mm256_max_ps(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn min_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { - unsafe { _mm256_min_ps(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x8, b: f32x8) -> f32x8 { + _mm256_min_ps(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn max_precise_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { - unsafe { - let intermediate = _mm256_max_ps(a.into(), b.into()); - let b_is_nan = _mm256_cmp_ps::<3i32>(b.into(), b.into()); - _mm256_blendv_ps(intermediate, a.into(), b_is_nan).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x8, b: f32x8) -> f32x8 { + let intermediate = _mm256_max_ps(a.into(), b.into()); + let b_is_nan = _mm256_cmp_ps::<3i32>(b.into(), b.into()); + _mm256_blendv_ps(intermediate, a.into(), b_is_nan).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn min_precise_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { - unsafe { - let intermediate = _mm256_min_ps(a.into(), b.into()); - let b_is_nan = _mm256_cmp_ps::<3i32>(b.into(), b.into()); - _mm256_blendv_ps(intermediate, a.into(), b_is_nan).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x8, b: f32x8) -> f32x8 { + let intermediate = _mm256_min_ps(a.into(), b.into()); + let b_is_nan = _mm256_cmp_ps::<3i32>(b.into(), b.into()); + _mm256_blendv_ps(intermediate, a.into(), b_is_nan).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn mul_add_f32x8(self, a: f32x8, b: f32x8, c: f32x8) -> f32x8 { - unsafe { _mm256_fmadd_ps(a.into(), b.into(), c.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x8, b: f32x8, c: f32x8) -> f32x8 { + _mm256_fmadd_ps(a.into(), b.into(), c.into()).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] fn mul_sub_f32x8(self, a: f32x8, b: f32x8, c: f32x8) -> f32x8 { - unsafe { _mm256_fmsub_ps(a.into(), b.into(), c.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x8, b: f32x8, c: f32x8) -> f32x8 { + _mm256_fmsub_ps(a.into(), b.into(), c.into()).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] fn floor_f32x8(self, a: f32x8) -> f32x8 { - unsafe { - _mm256_round_ps::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into()) - .simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x8) -> f32x8 { + _mm256_round_ps::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into()) + .simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn ceil_f32x8(self, a: f32x8) -> f32x8 { - unsafe { - _mm256_round_ps::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a.into()) - .simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x8) -> f32x8 { + _mm256_round_ps::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a.into()) + .simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn round_ties_even_f32x8(self, a: f32x8) -> f32x8 { - unsafe { - _mm256_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a.into()) - .simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x8) -> f32x8 { + _mm256_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a.into()) + .simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn fract_f32x8(self, a: f32x8) -> f32x8 { @@ -2497,15 +4270,29 @@ impl Simd for Avx2 { } #[inline(always)] fn trunc_f32x8(self, a: f32x8) -> f32x8 { - unsafe { - _mm256_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x8) -> f32x8 { + _mm256_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into()) + .simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn select_f32x8(self, a: mask32x8, b: f32x8, c: f32x8) -> f32x8 { - unsafe { - _mm256_blendv_ps(c.into(), b.into(), _mm256_castsi256_ps(a.into())).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx2, + a: mask32x8, + b: f32x8, + c: f32x8, + ) -> f32x8 { + _mm256_blendv_ps(c.into(), b.into(), _mm256_castsi256_ps(a.into())).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] fn combine_f32x8(self, a: f32x8, b: f32x8) -> f32x16 { @@ -2516,91 +4303,145 @@ impl Simd for Avx2 { } #[inline(always)] fn split_f32x8(self, a: f32x8) -> (f32x4, f32x4) { - unsafe { - ( - _mm256_extractf128_ps::<0>(a.into()).simd_into(self), - _mm256_extractf128_ps::<1>(a.into()).simd_into(self), - ) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x8) -> (f32x4, f32x4) { + ( + _mm256_extractf128_ps::<0>(a.into()).simd_into(token), + _mm256_extractf128_ps::<1>(a.into()).simd_into(token), + ) + } + ); + kernel(self, a) } #[inline(always)] fn reinterpret_f64_f32x8(self, a: f32x8) -> f64x4 { - unsafe { _mm256_castps_pd(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x8) -> f64x4 { + _mm256_castps_pd(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn reinterpret_i32_f32x8(self, a: f32x8) -> i32x8 { - unsafe { _mm256_castps_si256(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x8) -> i32x8 { + _mm256_castps_si256(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn reinterpret_u8_f32x8(self, a: f32x8) -> u8x32 { - unsafe { _mm256_castps_si256(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x8) -> u8x32 { + _mm256_castps_si256(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn reinterpret_u32_f32x8(self, a: f32x8) -> u32x8 { - unsafe { _mm256_castps_si256(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x8) -> u32x8 { + _mm256_castps_si256(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn cvt_u32_f32x8(self, a: f32x8) -> u32x8 { - unsafe { - let mut converted = _mm256_cvttps_epi32(a.into()); - let in_range = _mm256_cmp_ps::<17i32>(a.into(), _mm256_set1_ps(2147483648.0)); - let all_in_range = _mm256_movemask_ps(in_range) == 0b11111111; - if !all_in_range { - let excess = _mm256_sub_ps(a.into(), _mm256_set1_ps(2147483648.0)); - let excess_converted = _mm256_cvttps_epi32(_mm256_andnot_ps(in_range, excess)); - converted = _mm256_add_epi32(converted, excess_converted); + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x8) -> u32x8 { + let mut converted = _mm256_cvttps_epi32(a.into()); + let in_range = _mm256_cmp_ps::<17i32>(a.into(), _mm256_set1_ps(2147483648.0)); + let all_in_range = _mm256_movemask_ps(in_range) == 0b11111111; + if !all_in_range { + let excess = _mm256_sub_ps(a.into(), _mm256_set1_ps(2147483648.0)); + let excess_converted = _mm256_cvttps_epi32(_mm256_andnot_ps(in_range, excess)); + converted = _mm256_add_epi32(converted, excess_converted); + } + converted.simd_into(token) } - converted.simd_into(self) - } + ); + kernel(self, a) } #[inline(always)] fn cvt_u32_precise_f32x8(self, a: f32x8) -> u32x8 { - unsafe { - let a = _mm256_max_ps(a.into(), _mm256_setzero_ps()); - let mut converted = _mm256_cvttps_epi32(a); - let in_range = _mm256_cmp_ps::<17i32>(a, _mm256_set1_ps(2147483648.0)); - let all_in_range = _mm256_movemask_ps(in_range) == 0b11111111; - if !all_in_range { - let exceeds_unsigned_range = - _mm256_castps_si256(_mm256_cmp_ps::<17i32>(_mm256_set1_ps(4294967040.0), a)); - let excess = _mm256_sub_ps(a, _mm256_set1_ps(2147483648.0)); - let excess_converted = _mm256_cvttps_epi32(_mm256_andnot_ps(in_range, excess)); - converted = _mm256_add_epi32(converted, excess_converted); - converted = _mm256_blendv_epi8( - converted, - _mm256_set1_epi32(u32::MAX.cast_signed()), - exceeds_unsigned_range, - ); + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x8) -> u32x8 { + let a = _mm256_max_ps(a.into(), _mm256_setzero_ps()); + let mut converted = _mm256_cvttps_epi32(a); + let in_range = _mm256_cmp_ps::<17i32>(a, _mm256_set1_ps(2147483648.0)); + let all_in_range = _mm256_movemask_ps(in_range) == 0b11111111; + if !all_in_range { + let exceeds_unsigned_range = _mm256_castps_si256(_mm256_cmp_ps::<17i32>( + _mm256_set1_ps(4294967040.0), + a, + )); + let excess = _mm256_sub_ps(a, _mm256_set1_ps(2147483648.0)); + let excess_converted = _mm256_cvttps_epi32(_mm256_andnot_ps(in_range, excess)); + converted = _mm256_add_epi32(converted, excess_converted); + converted = _mm256_blendv_epi8( + converted, + _mm256_set1_epi32(u32::MAX.cast_signed()), + exceeds_unsigned_range, + ); + } + converted.simd_into(token) } - converted.simd_into(self) - } + ); + kernel(self, a) } #[inline(always)] fn cvt_i32_f32x8(self, a: f32x8) -> i32x8 { - unsafe { _mm256_cvttps_epi32(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x8) -> i32x8 { + _mm256_cvttps_epi32(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn cvt_i32_precise_f32x8(self, a: f32x8) -> i32x8 { - unsafe { - let a = a.into(); - let mut converted = _mm256_cvttps_epi32(a); - let in_range = _mm256_cmp_ps::<17i32>(a, _mm256_set1_ps(2147483648.0)); - let all_in_range = _mm256_movemask_ps(in_range) == 0b11111111; - if !all_in_range { - converted = _mm256_blendv_epi8( - _mm256_set1_epi32(i32::MAX), - converted, - _mm256_castps_si256(in_range), - ); - let is_not_nan = _mm256_castps_si256(_mm256_cmp_ps::<7i32>(a, a)); - converted = _mm256_and_si256(converted, is_not_nan); + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x8) -> i32x8 { + let a = a.into(); + let mut converted = _mm256_cvttps_epi32(a); + let in_range = _mm256_cmp_ps::<17i32>(a, _mm256_set1_ps(2147483648.0)); + let all_in_range = _mm256_movemask_ps(in_range) == 0b11111111; + if !all_in_range { + converted = _mm256_blendv_epi8( + _mm256_set1_epi32(i32::MAX), + converted, + _mm256_castps_si256(in_range), + ); + let is_not_nan = _mm256_castps_si256(_mm256_cmp_ps::<7i32>(a, a)); + converted = _mm256_and_si256(converted, is_not_nan); + } + converted.simd_into(token) } - converted.simd_into(self) - } + ); + kernel(self, a) } #[inline(always)] fn splat_i8x32(self, val: i8) -> i8x32 { - unsafe { _mm256_set1_epi8(val).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, val: i8) -> i8x32 { + _mm256_set1_epi8(val).simd_into(token) + } + ); + kernel(self, val) } #[inline(always)] fn load_array_i8x32(self, val: [i8; 32usize]) -> i8x32 { @@ -2686,38 +4527,72 @@ impl Simd for Avx2 { } #[inline(always)] fn add_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { - unsafe { _mm256_add_epi8(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i8x32, b: i8x32) -> i8x32 { + _mm256_add_epi8(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn sub_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { - unsafe { _mm256_sub_epi8(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i8x32, b: i8x32) -> i8x32 { + _mm256_sub_epi8(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn mul_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { - unsafe { - let dst_even = _mm256_mullo_epi16(a.into(), b.into()); - let dst_odd = _mm256_mullo_epi16( - _mm256_srli_epi16::<8>(a.into()), - _mm256_srli_epi16::<8>(b.into()), - ); - _mm256_or_si256( - _mm256_slli_epi16(dst_odd, 8), - _mm256_and_si256(dst_even, _mm256_set1_epi16(0xFF)), - ) - .simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i8x32, b: i8x32) -> i8x32 { + let dst_even = _mm256_mullo_epi16(a.into(), b.into()); + let dst_odd = _mm256_mullo_epi16( + _mm256_srli_epi16::<8>(a.into()), + _mm256_srli_epi16::<8>(b.into()), + ); + _mm256_or_si256( + _mm256_slli_epi16(dst_odd, 8), + _mm256_and_si256(dst_even, _mm256_set1_epi16(0xFF)), + ) + .simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn and_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { - unsafe { _mm256_and_si256(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i8x32, b: i8x32) -> i8x32 { + _mm256_and_si256(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn or_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { - unsafe { _mm256_or_si256(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i8x32, b: i8x32) -> i8x32 { + _mm256_or_si256(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn xor_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { - unsafe { _mm256_xor_si256(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i8x32, b: i8x32) -> i8x32 { + _mm256_xor_si256(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn not_i8x32(self, a: i8x32) -> i8x32 { @@ -2725,15 +4600,21 @@ impl Simd for Avx2 { } #[inline(always)] fn shl_i8x32(self, a: i8x32, shift: u32) -> i8x32 { - unsafe { - let val = a.into(); - let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); - let lo_16 = _mm256_unpacklo_epi8(val, _mm256_cmpgt_epi8(_mm256_setzero_si256(), val)); - let hi_16 = _mm256_unpackhi_epi8(val, _mm256_cmpgt_epi8(_mm256_setzero_si256(), val)); - let lo_shifted = _mm256_sll_epi16(lo_16, shift_count); - let hi_shifted = _mm256_sll_epi16(hi_16, shift_count); - _mm256_packs_epi16(lo_shifted, hi_shifted).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i8x32, shift: u32) -> i8x32 { + let val = a.into(); + let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); + let lo_16 = + _mm256_unpacklo_epi8(val, _mm256_cmpgt_epi8(_mm256_setzero_si256(), val)); + let hi_16 = + _mm256_unpackhi_epi8(val, _mm256_cmpgt_epi8(_mm256_setzero_si256(), val)); + let lo_shifted = _mm256_sll_epi16(lo_16, shift_count); + let hi_shifted = _mm256_sll_epi16(hi_16, shift_count); + _mm256_packs_epi16(lo_shifted, hi_shifted).simd_into(token) + } + ); + kernel(self, a, shift) } #[inline(always)] fn shlv_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { @@ -2741,15 +4622,21 @@ impl Simd for Avx2 { } #[inline(always)] fn shr_i8x32(self, a: i8x32, shift: u32) -> i8x32 { - unsafe { - let val = a.into(); - let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); - let lo_16 = _mm256_unpacklo_epi8(val, _mm256_cmpgt_epi8(_mm256_setzero_si256(), val)); - let hi_16 = _mm256_unpackhi_epi8(val, _mm256_cmpgt_epi8(_mm256_setzero_si256(), val)); - let lo_shifted = _mm256_sra_epi16(lo_16, shift_count); - let hi_shifted = _mm256_sra_epi16(hi_16, shift_count); - _mm256_packs_epi16(lo_shifted, hi_shifted).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i8x32, shift: u32) -> i8x32 { + let val = a.into(); + let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); + let lo_16 = + _mm256_unpacklo_epi8(val, _mm256_cmpgt_epi8(_mm256_setzero_si256(), val)); + let hi_16 = + _mm256_unpackhi_epi8(val, _mm256_cmpgt_epi8(_mm256_setzero_si256(), val)); + let lo_shifted = _mm256_sra_epi16(lo_16, shift_count); + let hi_shifted = _mm256_sra_epi16(hi_16, shift_count); + _mm256_packs_epi16(lo_shifted, hi_shifted).simd_into(token) + } + ); + kernel(self, a, shift) } #[inline(always)] fn shrv_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { @@ -2757,125 +4644,202 @@ impl Simd for Avx2 { } #[inline(always)] fn simd_eq_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { - unsafe { _mm256_cmpeq_epi8(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i8x32, b: i8x32) -> mask8x32 { + _mm256_cmpeq_epi8(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_lt_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { - unsafe { _mm256_cmpgt_epi8(b.into(), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i8x32, b: i8x32) -> mask8x32 { + _mm256_cmpgt_epi8(b.into(), a.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_le_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { - unsafe { _mm256_cmpeq_epi8(_mm256_min_epi8(a.into(), b.into()), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i8x32, b: i8x32) -> mask8x32 { + _mm256_cmpeq_epi8(_mm256_min_epi8(a.into(), b.into()), a.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_ge_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { - unsafe { _mm256_cmpeq_epi8(_mm256_max_epi8(a.into(), b.into()), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i8x32, b: i8x32) -> mask8x32 { + _mm256_cmpeq_epi8(_mm256_max_epi8(a.into(), b.into()), a.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_gt_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { - unsafe { _mm256_cmpgt_epi8(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i8x32, b: i8x32) -> mask8x32 { + _mm256_cmpgt_epi8(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn zip_low_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { - unsafe { - let lo = _mm256_unpacklo_epi8(a.into(), b.into()); - let hi = _mm256_unpackhi_epi8(a.into(), b.into()); - _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i8x32, b: i8x32) -> i8x32 { + let lo = _mm256_unpacklo_epi8(a.into(), b.into()); + let hi = _mm256_unpackhi_epi8(a.into(), b.into()); + _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn zip_high_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { - unsafe { - let lo = _mm256_unpacklo_epi8(a.into(), b.into()); - let hi = _mm256_unpackhi_epi8(a.into(), b.into()); - _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i8x32, b: i8x32) -> i8x32 { + let lo = _mm256_unpacklo_epi8(a.into(), b.into()); + let hi = _mm256_unpackhi_epi8(a.into(), b.into()); + _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_low_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { - unsafe { - let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( - a.into(), - _mm256_setr_epi8( - 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, - 14, 1, 3, 5, 7, 9, 11, 13, 15, - ), - )); - let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( - b.into(), - _mm256_setr_epi8( - 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, - 14, 1, 3, 5, 7, 9, 11, 13, 15, - ), - )); - _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i8x32, b: i8x32) -> i8x32 { + let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( + a.into(), + _mm256_setr_epi8( + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, + 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, + ), + )); + let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( + b.into(), + _mm256_setr_epi8( + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, + 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, + ), + )); + _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_high_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { - unsafe { - let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( - a.into(), - _mm256_setr_epi8( - 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, - 14, 1, 3, 5, 7, 9, 11, 13, 15, - ), - )); - let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( - b.into(), - _mm256_setr_epi8( - 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, - 14, 1, 3, 5, 7, 9, 11, 13, 15, - ), - )); - _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(self) - } - } - #[inline(always)] - fn interleave_i8x32(self, a: i8x32, b: i8x32) -> (i8x32, i8x32) { - unsafe { - let lo = _mm256_unpacklo_epi8(a.into(), b.into()); - let hi = _mm256_unpackhi_epi8(a.into(), b.into()); - ( - _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(self), - _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(self), - ) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i8x32, b: i8x32) -> i8x32 { + let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( + a.into(), + _mm256_setr_epi8( + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, + 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, + ), + )); + let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( + b.into(), + _mm256_setr_epi8( + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, + 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, + ), + )); + _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn interleave_i8x32(self, a: i8x32, b: i8x32) -> (i8x32, i8x32) { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i8x32, b: i8x32) -> (i8x32, i8x32) { + let lo = _mm256_unpacklo_epi8(a.into(), b.into()); + let hi = _mm256_unpackhi_epi8(a.into(), b.into()); + ( + _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(token), + _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(token), + ) + } + ); + kernel(self, a, b) } #[inline(always)] fn deinterleave_i8x32(self, a: i8x32, b: i8x32) -> (i8x32, i8x32) { - unsafe { - let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( - a.into(), - _mm256_setr_epi8( - 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, - 14, 1, 3, 5, 7, 9, 11, 13, 15, - ), - )); - let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( - b.into(), - _mm256_setr_epi8( - 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, - 14, 1, 3, 5, 7, 9, 11, 13, 15, - ), - )); - ( - _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(self), - _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(self), - ) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i8x32, b: i8x32) -> (i8x32, i8x32) { + let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( + a.into(), + _mm256_setr_epi8( + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, + 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, + ), + )); + let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( + b.into(), + _mm256_setr_epi8( + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, + 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, + ), + )); + ( + _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(token), + _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(token), + ) + } + ); + kernel(self, a, b) } #[inline(always)] fn select_i8x32(self, a: mask8x32, b: i8x32, c: i8x32) -> i8x32 { - unsafe { _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx2, + a: mask8x32, + b: i8x32, + c: i8x32, + ) -> i8x32 { + _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] fn min_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { - unsafe { _mm256_min_epi8(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i8x32, b: i8x32) -> i8x32 { + _mm256_min_epi8(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn max_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { - unsafe { _mm256_max_epi8(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i8x32, b: i8x32) -> i8x32 { + _mm256_max_epi8(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn combine_i8x32(self, a: i8x32, b: i8x32) -> i8x64 { @@ -2886,28 +4850,56 @@ impl Simd for Avx2 { } #[inline(always)] fn split_i8x32(self, a: i8x32) -> (i8x16, i8x16) { - unsafe { - ( - _mm256_extracti128_si256::<0>(a.into()).simd_into(self), - _mm256_extracti128_si256::<1>(a.into()).simd_into(self), - ) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i8x32) -> (i8x16, i8x16) { + ( + _mm256_extracti128_si256::<0>(a.into()).simd_into(token), + _mm256_extracti128_si256::<1>(a.into()).simd_into(token), + ) + } + ); + kernel(self, a) } #[inline(always)] fn neg_i8x32(self, a: i8x32) -> i8x32 { - unsafe { _mm256_sub_epi8(_mm256_setzero_si256(), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i8x32) -> i8x32 { + _mm256_sub_epi8(_mm256_setzero_si256(), a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn reinterpret_u8_i8x32(self, a: i8x32) -> u8x32 { - __m256i::from(a).simd_into(self) + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i8x32) -> u8x32 { + __m256i::from(a).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn reinterpret_u32_i8x32(self, a: i8x32) -> u32x8 { - __m256i::from(a).simd_into(self) + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i8x32) -> u32x8 { + __m256i::from(a).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn splat_u8x32(self, val: u8) -> u8x32 { - unsafe { _mm256_set1_epi8(val.cast_signed()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, val: u8) -> u8x32 { + _mm256_set1_epi8(val.cast_signed()).simd_into(token) + } + ); + kernel(self, val) } #[inline(always)] fn load_array_u8x32(self, val: [u8; 32usize]) -> u8x32 { @@ -2993,38 +4985,72 @@ impl Simd for Avx2 { } #[inline(always)] fn add_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { - unsafe { _mm256_add_epi8(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u8x32, b: u8x32) -> u8x32 { + _mm256_add_epi8(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn sub_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { - unsafe { _mm256_sub_epi8(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u8x32, b: u8x32) -> u8x32 { + _mm256_sub_epi8(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn mul_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { - unsafe { - let dst_even = _mm256_mullo_epi16(a.into(), b.into()); - let dst_odd = _mm256_mullo_epi16( - _mm256_srli_epi16::<8>(a.into()), - _mm256_srli_epi16::<8>(b.into()), - ); - _mm256_or_si256( - _mm256_slli_epi16(dst_odd, 8), - _mm256_and_si256(dst_even, _mm256_set1_epi16(0xFF)), - ) - .simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u8x32, b: u8x32) -> u8x32 { + let dst_even = _mm256_mullo_epi16(a.into(), b.into()); + let dst_odd = _mm256_mullo_epi16( + _mm256_srli_epi16::<8>(a.into()), + _mm256_srli_epi16::<8>(b.into()), + ); + _mm256_or_si256( + _mm256_slli_epi16(dst_odd, 8), + _mm256_and_si256(dst_even, _mm256_set1_epi16(0xFF)), + ) + .simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn and_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { - unsafe { _mm256_and_si256(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u8x32, b: u8x32) -> u8x32 { + _mm256_and_si256(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn or_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { - unsafe { _mm256_or_si256(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u8x32, b: u8x32) -> u8x32 { + _mm256_or_si256(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn xor_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { - unsafe { _mm256_xor_si256(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u8x32, b: u8x32) -> u8x32 { + _mm256_xor_si256(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn not_u8x32(self, a: u8x32) -> u8x32 { @@ -3032,15 +5058,19 @@ impl Simd for Avx2 { } #[inline(always)] fn shl_u8x32(self, a: u8x32, shift: u32) -> u8x32 { - unsafe { - let val = a.into(); - let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); - let lo_16 = _mm256_unpacklo_epi8(val, _mm256_setzero_si256()); - let hi_16 = _mm256_unpackhi_epi8(val, _mm256_setzero_si256()); - let lo_shifted = _mm256_sll_epi16(lo_16, shift_count); - let hi_shifted = _mm256_sll_epi16(hi_16, shift_count); - _mm256_packus_epi16(lo_shifted, hi_shifted).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u8x32, shift: u32) -> u8x32 { + let val = a.into(); + let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); + let lo_16 = _mm256_unpacklo_epi8(val, _mm256_setzero_si256()); + let hi_16 = _mm256_unpackhi_epi8(val, _mm256_setzero_si256()); + let lo_shifted = _mm256_sll_epi16(lo_16, shift_count); + let hi_shifted = _mm256_sll_epi16(hi_16, shift_count); + _mm256_packus_epi16(lo_shifted, hi_shifted).simd_into(token) + } + ); + kernel(self, a, shift) } #[inline(always)] fn shlv_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { @@ -3048,15 +5078,19 @@ impl Simd for Avx2 { } #[inline(always)] fn shr_u8x32(self, a: u8x32, shift: u32) -> u8x32 { - unsafe { - let val = a.into(); - let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); - let lo_16 = _mm256_unpacklo_epi8(val, _mm256_setzero_si256()); - let hi_16 = _mm256_unpackhi_epi8(val, _mm256_setzero_si256()); - let lo_shifted = _mm256_srl_epi16(lo_16, shift_count); - let hi_shifted = _mm256_srl_epi16(hi_16, shift_count); - _mm256_packus_epi16(lo_shifted, hi_shifted).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u8x32, shift: u32) -> u8x32 { + let val = a.into(); + let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); + let lo_16 = _mm256_unpacklo_epi8(val, _mm256_setzero_si256()); + let hi_16 = _mm256_unpackhi_epi8(val, _mm256_setzero_si256()); + let lo_shifted = _mm256_srl_epi16(lo_16, shift_count); + let hi_shifted = _mm256_srl_epi16(hi_16, shift_count); + _mm256_packus_epi16(lo_shifted, hi_shifted).simd_into(token) + } + ); + kernel(self, a, shift) } #[inline(always)] fn shrv_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { @@ -3064,135 +5098,208 @@ impl Simd for Avx2 { } #[inline(always)] fn simd_eq_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { - unsafe { _mm256_cmpeq_epi8(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u8x32, b: u8x32) -> mask8x32 { + _mm256_cmpeq_epi8(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_lt_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { - unsafe { - let sign_bit = _mm256_set1_epi8(0x80u8.cast_signed()); - let a_signed = _mm256_xor_si256(a.into(), sign_bit); - let b_signed = _mm256_xor_si256(b.into(), sign_bit); - _mm256_cmpgt_epi8(b_signed, a_signed).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u8x32, b: u8x32) -> mask8x32 { + let sign_bit = _mm256_set1_epi8(0x80u8.cast_signed()); + let a_signed = _mm256_xor_si256(a.into(), sign_bit); + let b_signed = _mm256_xor_si256(b.into(), sign_bit); + _mm256_cmpgt_epi8(b_signed, a_signed).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_le_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { - unsafe { _mm256_cmpeq_epi8(_mm256_min_epu8(a.into(), b.into()), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u8x32, b: u8x32) -> mask8x32 { + _mm256_cmpeq_epi8(_mm256_min_epu8(a.into(), b.into()), a.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_ge_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { - unsafe { _mm256_cmpeq_epi8(_mm256_max_epu8(a.into(), b.into()), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u8x32, b: u8x32) -> mask8x32 { + _mm256_cmpeq_epi8(_mm256_max_epu8(a.into(), b.into()), a.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_gt_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { - unsafe { - let sign_bit = _mm256_set1_epi8(0x80u8.cast_signed()); - let a_signed = _mm256_xor_si256(a.into(), sign_bit); - let b_signed = _mm256_xor_si256(b.into(), sign_bit); - _mm256_cmpgt_epi8(a_signed, b_signed).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u8x32, b: u8x32) -> mask8x32 { + let sign_bit = _mm256_set1_epi8(0x80u8.cast_signed()); + let a_signed = _mm256_xor_si256(a.into(), sign_bit); + let b_signed = _mm256_xor_si256(b.into(), sign_bit); + _mm256_cmpgt_epi8(a_signed, b_signed).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn zip_low_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { - unsafe { - let lo = _mm256_unpacklo_epi8(a.into(), b.into()); - let hi = _mm256_unpackhi_epi8(a.into(), b.into()); - _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u8x32, b: u8x32) -> u8x32 { + let lo = _mm256_unpacklo_epi8(a.into(), b.into()); + let hi = _mm256_unpackhi_epi8(a.into(), b.into()); + _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn zip_high_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { - unsafe { - let lo = _mm256_unpacklo_epi8(a.into(), b.into()); - let hi = _mm256_unpackhi_epi8(a.into(), b.into()); - _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u8x32, b: u8x32) -> u8x32 { + let lo = _mm256_unpacklo_epi8(a.into(), b.into()); + let hi = _mm256_unpackhi_epi8(a.into(), b.into()); + _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_low_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { - unsafe { - let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( - a.into(), - _mm256_setr_epi8( - 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, - 14, 1, 3, 5, 7, 9, 11, 13, 15, - ), - )); - let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( - b.into(), - _mm256_setr_epi8( - 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, - 14, 1, 3, 5, 7, 9, 11, 13, 15, - ), - )); - _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u8x32, b: u8x32) -> u8x32 { + let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( + a.into(), + _mm256_setr_epi8( + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, + 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, + ), + )); + let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( + b.into(), + _mm256_setr_epi8( + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, + 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, + ), + )); + _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_high_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { - unsafe { - let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( - a.into(), - _mm256_setr_epi8( - 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, - 14, 1, 3, 5, 7, 9, 11, 13, 15, - ), - )); - let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( - b.into(), - _mm256_setr_epi8( - 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, - 14, 1, 3, 5, 7, 9, 11, 13, 15, - ), - )); - _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u8x32, b: u8x32) -> u8x32 { + let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( + a.into(), + _mm256_setr_epi8( + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, + 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, + ), + )); + let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( + b.into(), + _mm256_setr_epi8( + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, + 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, + ), + )); + _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn interleave_u8x32(self, a: u8x32, b: u8x32) -> (u8x32, u8x32) { - unsafe { - let lo = _mm256_unpacklo_epi8(a.into(), b.into()); - let hi = _mm256_unpackhi_epi8(a.into(), b.into()); - ( - _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(self), - _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(self), - ) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u8x32, b: u8x32) -> (u8x32, u8x32) { + let lo = _mm256_unpacklo_epi8(a.into(), b.into()); + let hi = _mm256_unpackhi_epi8(a.into(), b.into()); + ( + _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(token), + _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(token), + ) + } + ); + kernel(self, a, b) } #[inline(always)] fn deinterleave_u8x32(self, a: u8x32, b: u8x32) -> (u8x32, u8x32) { - unsafe { - let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( - a.into(), - _mm256_setr_epi8( - 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, - 14, 1, 3, 5, 7, 9, 11, 13, 15, - ), - )); - let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( - b.into(), - _mm256_setr_epi8( - 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, - 14, 1, 3, 5, 7, 9, 11, 13, 15, - ), - )); - ( - _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(self), - _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(self), - ) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u8x32, b: u8x32) -> (u8x32, u8x32) { + let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( + a.into(), + _mm256_setr_epi8( + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, + 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, + ), + )); + let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( + b.into(), + _mm256_setr_epi8( + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, + 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, + ), + )); + ( + _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(token), + _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(token), + ) + } + ); + kernel(self, a, b) } #[inline(always)] fn select_u8x32(self, a: mask8x32, b: u8x32, c: u8x32) -> u8x32 { - unsafe { _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx2, + a: mask8x32, + b: u8x32, + c: u8x32, + ) -> u8x32 { + _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] fn min_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { - unsafe { _mm256_min_epu8(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u8x32, b: u8x32) -> u8x32 { + _mm256_min_epu8(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn max_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { - unsafe { _mm256_max_epu8(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u8x32, b: u8x32) -> u8x32 { + _mm256_max_epu8(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn combine_u8x32(self, a: u8x32, b: u8x32) -> u8x64 { @@ -3203,32 +5310,50 @@ impl Simd for Avx2 { } #[inline(always)] fn split_u8x32(self, a: u8x32) -> (u8x16, u8x16) { - unsafe { - ( - _mm256_extracti128_si256::<0>(a.into()).simd_into(self), - _mm256_extracti128_si256::<1>(a.into()).simd_into(self), - ) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u8x32) -> (u8x16, u8x16) { + ( + _mm256_extracti128_si256::<0>(a.into()).simd_into(token), + _mm256_extracti128_si256::<1>(a.into()).simd_into(token), + ) + } + ); + kernel(self, a) } #[inline(always)] fn widen_u8x32(self, a: u8x32) -> u16x32 { - unsafe { - let (a0, a1) = self.split_u8x32(a); - let high = _mm256_cvtepu8_epi16(a0.into()).simd_into(self); - let low = _mm256_cvtepu8_epi16(a1.into()).simd_into(self); - self.combine_u16x16(high, low) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u8x32) -> u16x32 { + let (a0, a1) = token.split_u8x32(a); + let high = _mm256_cvtepu8_epi16(a0.into()).simd_into(token); + let low = _mm256_cvtepu8_epi16(a1.into()).simd_into(token); + token.combine_u16x16(high, low) + } + ); + kernel(self, a) } #[inline(always)] fn reinterpret_u32_u8x32(self, a: u8x32) -> u32x8 { - __m256i::from(a).simd_into(self) + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u8x32) -> u32x8 { + __m256i::from(a).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn splat_mask8x32(self, val: bool) -> mask8x32 { - unsafe { - let val: i8 = if val { !0 } else { 0 }; - _mm256_set1_epi8(val).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, val: bool) -> mask8x32 { + let val: i8 = if val { !0 } else { 0 }; + _mm256_set1_epi8(val).simd_into(token) + } + ); + kernel(self, val) } #[inline(always)] fn load_array_mask8x32(self, val: [i8; 32usize]) -> mask8x32 { @@ -3243,40 +5368,68 @@ impl Simd for Avx2 { } #[inline(always)] fn from_bitmask_mask8x32(self, bits: u64) -> mask8x32 { - unsafe { - { - let bit_bytes = _mm256_broadcastsi128_si256(_mm_cvtsi32_si128(bits as i32)); - let bit_bytes = _mm256_shuffle_epi8( - bit_bytes, - _mm256_setr_epi8( - 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, - 3, 3, 3, 3, 3, 3, 3, - ), - ); - let bit_mask = _mm256_setr_epi8( - 1, 2, 4, 8, 16, 32, 64, -128, 1, 2, 4, 8, 16, 32, 64, -128, 1, 2, 4, 8, 16, 32, - 64, -128, 1, 2, 4, 8, 16, 32, 64, -128, - ); - _mm256_cmpeq_epi8(_mm256_and_si256(bit_bytes, bit_mask), bit_mask) + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, bits: u64) -> mask8x32 { + { + let bit_bytes = _mm256_broadcastsi128_si256(_mm_cvtsi32_si128(bits as i32)); + let bit_bytes = _mm256_shuffle_epi8( + bit_bytes, + _mm256_setr_epi8( + 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 3, 3, 3, + ), + ); + let bit_mask = _mm256_setr_epi8( + 1, 2, 4, 8, 16, 32, 64, -128, 1, 2, 4, 8, 16, 32, 64, -128, 1, 2, 4, 8, 16, + 32, 64, -128, 1, 2, 4, 8, 16, 32, 64, -128, + ); + _mm256_cmpeq_epi8(_mm256_and_si256(bit_bytes, bit_mask), bit_mask) + } + .simd_into(token) } - .simd_into(self) - } + ); + kernel(self, bits) } #[inline(always)] fn to_bitmask_mask8x32(self, a: mask8x32) -> u64 { - unsafe { _mm256_movemask_epi8(a.into()) as u32 as u64 } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask8x32) -> u64 { + _mm256_movemask_epi8(a.into()) as u32 as u64 + } + ); + kernel(self, a) } #[inline(always)] fn and_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { - unsafe { _mm256_and_si256(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask8x32, b: mask8x32) -> mask8x32 { + _mm256_and_si256(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn or_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { - unsafe { _mm256_or_si256(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask8x32, b: mask8x32) -> mask8x32 { + _mm256_or_si256(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn xor_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { - unsafe { _mm256_xor_si256(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask8x32, b: mask8x32) -> mask8x32 { + _mm256_xor_si256(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn not_mask8x32(self, a: mask8x32) -> mask8x32 { @@ -3289,27 +5442,68 @@ impl Simd for Avx2 { b: mask8x32, c: mask8x32, ) -> mask8x32 { - unsafe { _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx2, + a: mask8x32, + b: mask8x32, + c: mask8x32, + ) -> mask8x32 { + _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] fn simd_eq_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { - unsafe { _mm256_cmpeq_epi8(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask8x32, b: mask8x32) -> mask8x32 { + _mm256_cmpeq_epi8(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn any_true_mask8x32(self, a: mask8x32) -> bool { - unsafe { _mm256_movemask_epi8(a.into()) as u32 != 0 } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask8x32) -> bool { + _mm256_movemask_epi8(a.into()) as u32 != 0 + } + ); + kernel(self, a) } #[inline(always)] fn all_true_mask8x32(self, a: mask8x32) -> bool { - unsafe { _mm256_movemask_epi8(a.into()) as u32 == 0xffffffff } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask8x32) -> bool { + _mm256_movemask_epi8(a.into()) as u32 == 0xffffffff + } + ); + kernel(self, a) } #[inline(always)] fn any_false_mask8x32(self, a: mask8x32) -> bool { - unsafe { _mm256_movemask_epi8(a.into()) as u32 != 0xffffffff } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask8x32) -> bool { + _mm256_movemask_epi8(a.into()) as u32 != 0xffffffff + } + ); + kernel(self, a) } #[inline(always)] fn all_false_mask8x32(self, a: mask8x32) -> bool { - unsafe { _mm256_movemask_epi8(a.into()) as u32 == 0 } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask8x32) -> bool { + _mm256_movemask_epi8(a.into()) as u32 == 0 + } + ); + kernel(self, a) } #[inline(always)] fn combine_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x64 { @@ -3320,16 +5514,26 @@ impl Simd for Avx2 { } #[inline(always)] fn split_mask8x32(self, a: mask8x32) -> (mask8x16, mask8x16) { - unsafe { - ( - _mm256_extracti128_si256::<0>(a.into()).simd_into(self), - _mm256_extracti128_si256::<1>(a.into()).simd_into(self), - ) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask8x32) -> (mask8x16, mask8x16) { + ( + _mm256_extracti128_si256::<0>(a.into()).simd_into(token), + _mm256_extracti128_si256::<1>(a.into()).simd_into(token), + ) + } + ); + kernel(self, a) } #[inline(always)] fn splat_i16x16(self, val: i16) -> i16x16 { - unsafe { _mm256_set1_epi16(val).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, val: i16) -> i16x16 { + _mm256_set1_epi16(val).simd_into(token) + } + ); + kernel(self, val) } #[inline(always)] fn load_array_i16x16(self, val: [i16; 16usize]) -> i16x16 { @@ -3415,27 +5619,63 @@ impl Simd for Avx2 { } #[inline(always)] fn add_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - unsafe { _mm256_add_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i16x16, b: i16x16) -> i16x16 { + _mm256_add_epi16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn sub_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - unsafe { _mm256_sub_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i16x16, b: i16x16) -> i16x16 { + _mm256_sub_epi16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn mul_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - unsafe { _mm256_mullo_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i16x16, b: i16x16) -> i16x16 { + _mm256_mullo_epi16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn and_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - unsafe { _mm256_and_si256(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i16x16, b: i16x16) -> i16x16 { + _mm256_and_si256(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn or_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - unsafe { _mm256_or_si256(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i16x16, b: i16x16) -> i16x16 { + _mm256_or_si256(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn xor_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - unsafe { _mm256_xor_si256(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i16x16, b: i16x16) -> i16x16 { + _mm256_xor_si256(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn not_i16x16(self, a: i16x16) -> i16x16 { @@ -3443,9 +5683,13 @@ impl Simd for Avx2 { } #[inline(always)] fn shl_i16x16(self, a: i16x16, shift: u32) -> i16x16 { - unsafe { - _mm256_sll_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i16x16, shift: u32) -> i16x16 { + _mm256_sll_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) + } + ); + kernel(self, a, shift) } #[inline(always)] fn shlv_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { @@ -3453,9 +5697,13 @@ impl Simd for Avx2 { } #[inline(always)] fn shr_i16x16(self, a: i16x16, shift: u32) -> i16x16 { - unsafe { - _mm256_sra_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i16x16, shift: u32) -> i16x16 { + _mm256_sra_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) + } + ); + kernel(self, a, shift) } #[inline(always)] fn shrv_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { @@ -3463,129 +5711,210 @@ impl Simd for Avx2 { } #[inline(always)] fn simd_eq_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { - unsafe { _mm256_cmpeq_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i16x16, b: i16x16) -> mask16x16 { + _mm256_cmpeq_epi16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_lt_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { - unsafe { _mm256_cmpgt_epi16(b.into(), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i16x16, b: i16x16) -> mask16x16 { + _mm256_cmpgt_epi16(b.into(), a.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_le_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { - unsafe { - _mm256_cmpeq_epi16(_mm256_min_epi16(a.into(), b.into()), a.into()).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i16x16, b: i16x16) -> mask16x16 { + _mm256_cmpeq_epi16(_mm256_min_epi16(a.into(), b.into()), a.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_ge_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { - unsafe { - _mm256_cmpeq_epi16(_mm256_max_epi16(a.into(), b.into()), a.into()).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i16x16, b: i16x16) -> mask16x16 { + _mm256_cmpeq_epi16(_mm256_max_epi16(a.into(), b.into()), a.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_gt_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { - unsafe { _mm256_cmpgt_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i16x16, b: i16x16) -> mask16x16 { + _mm256_cmpgt_epi16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn zip_low_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - unsafe { - let lo = _mm256_unpacklo_epi16(a.into(), b.into()); - let hi = _mm256_unpackhi_epi16(a.into(), b.into()); - _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i16x16, b: i16x16) -> i16x16 { + let lo = _mm256_unpacklo_epi16(a.into(), b.into()); + let hi = _mm256_unpackhi_epi16(a.into(), b.into()); + _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn zip_high_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - unsafe { - let lo = _mm256_unpacklo_epi16(a.into(), b.into()); - let hi = _mm256_unpackhi_epi16(a.into(), b.into()); - _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i16x16, b: i16x16) -> i16x16 { + let lo = _mm256_unpacklo_epi16(a.into(), b.into()); + let hi = _mm256_unpackhi_epi16(a.into(), b.into()); + _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_low_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - unsafe { - let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( - a.into(), - _mm256_setr_epi8( - 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, 13, - 2, 3, 6, 7, 10, 11, 14, 15, - ), - )); - let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( - b.into(), - _mm256_setr_epi8( - 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, 13, - 2, 3, 6, 7, 10, 11, 14, 15, - ), - )); - _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i16x16, b: i16x16) -> i16x16 { + let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( + a.into(), + _mm256_setr_epi8( + 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, + 13, 2, 3, 6, 7, 10, 11, 14, 15, + ), + )); + let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( + b.into(), + _mm256_setr_epi8( + 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, + 13, 2, 3, 6, 7, 10, 11, 14, 15, + ), + )); + _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_high_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - unsafe { - let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( - a.into(), - _mm256_setr_epi8( - 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, 13, - 2, 3, 6, 7, 10, 11, 14, 15, - ), - )); - let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( - b.into(), - _mm256_setr_epi8( - 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, 13, - 2, 3, 6, 7, 10, 11, 14, 15, - ), - )); - _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i16x16, b: i16x16) -> i16x16 { + let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( + a.into(), + _mm256_setr_epi8( + 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, + 13, 2, 3, 6, 7, 10, 11, 14, 15, + ), + )); + let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( + b.into(), + _mm256_setr_epi8( + 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, + 13, 2, 3, 6, 7, 10, 11, 14, 15, + ), + )); + _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn interleave_i16x16(self, a: i16x16, b: i16x16) -> (i16x16, i16x16) { - unsafe { - let lo = _mm256_unpacklo_epi16(a.into(), b.into()); - let hi = _mm256_unpackhi_epi16(a.into(), b.into()); - ( - _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(self), - _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(self), - ) - } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx2, + a: i16x16, + b: i16x16, + ) -> (i16x16, i16x16) { + let lo = _mm256_unpacklo_epi16(a.into(), b.into()); + let hi = _mm256_unpackhi_epi16(a.into(), b.into()); + ( + _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(token), + _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(token), + ) + } + ); + kernel(self, a, b) } #[inline(always)] fn deinterleave_i16x16(self, a: i16x16, b: i16x16) -> (i16x16, i16x16) { - unsafe { - let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( - a.into(), - _mm256_setr_epi8( - 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, 13, - 2, 3, 6, 7, 10, 11, 14, 15, - ), - )); - let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( - b.into(), - _mm256_setr_epi8( - 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, 13, - 2, 3, 6, 7, 10, 11, 14, 15, - ), - )); - ( - _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(self), - _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(self), - ) - } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx2, + a: i16x16, + b: i16x16, + ) -> (i16x16, i16x16) { + let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( + a.into(), + _mm256_setr_epi8( + 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, + 13, 2, 3, 6, 7, 10, 11, 14, 15, + ), + )); + let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( + b.into(), + _mm256_setr_epi8( + 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, + 13, 2, 3, 6, 7, 10, 11, 14, 15, + ), + )); + ( + _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(token), + _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(token), + ) + } + ); + kernel(self, a, b) } #[inline(always)] fn select_i16x16(self, a: mask16x16, b: i16x16, c: i16x16) -> i16x16 { - unsafe { _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx2, + a: mask16x16, + b: i16x16, + c: i16x16, + ) -> i16x16 { + _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] fn min_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - unsafe { _mm256_min_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i16x16, b: i16x16) -> i16x16 { + _mm256_min_epi16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn max_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - unsafe { _mm256_max_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i16x16, b: i16x16) -> i16x16 { + _mm256_max_epi16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn combine_i16x16(self, a: i16x16, b: i16x16) -> i16x32 { @@ -3596,28 +5925,56 @@ impl Simd for Avx2 { } #[inline(always)] fn split_i16x16(self, a: i16x16) -> (i16x8, i16x8) { - unsafe { - ( - _mm256_extracti128_si256::<0>(a.into()).simd_into(self), - _mm256_extracti128_si256::<1>(a.into()).simd_into(self), - ) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i16x16) -> (i16x8, i16x8) { + ( + _mm256_extracti128_si256::<0>(a.into()).simd_into(token), + _mm256_extracti128_si256::<1>(a.into()).simd_into(token), + ) + } + ); + kernel(self, a) } #[inline(always)] fn neg_i16x16(self, a: i16x16) -> i16x16 { - unsafe { _mm256_sub_epi16(_mm256_setzero_si256(), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i16x16) -> i16x16 { + _mm256_sub_epi16(_mm256_setzero_si256(), a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn reinterpret_u8_i16x16(self, a: i16x16) -> u8x32 { - __m256i::from(a).simd_into(self) + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i16x16) -> u8x32 { + __m256i::from(a).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn reinterpret_u32_i16x16(self, a: i16x16) -> u32x8 { - __m256i::from(a).simd_into(self) + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i16x16) -> u32x8 { + __m256i::from(a).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn splat_u16x16(self, val: u16) -> u16x16 { - unsafe { _mm256_set1_epi16(val.cast_signed()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, val: u16) -> u16x16 { + _mm256_set1_epi16(val.cast_signed()).simd_into(token) + } + ); + kernel(self, val) } #[inline(always)] fn load_array_u16x16(self, val: [u16; 16usize]) -> u16x16 { @@ -3703,27 +6060,63 @@ impl Simd for Avx2 { } #[inline(always)] fn add_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { - unsafe { _mm256_add_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u16x16, b: u16x16) -> u16x16 { + _mm256_add_epi16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn sub_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { - unsafe { _mm256_sub_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u16x16, b: u16x16) -> u16x16 { + _mm256_sub_epi16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn mul_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { - unsafe { _mm256_mullo_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u16x16, b: u16x16) -> u16x16 { + _mm256_mullo_epi16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn and_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { - unsafe { _mm256_and_si256(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u16x16, b: u16x16) -> u16x16 { + _mm256_and_si256(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn or_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { - unsafe { _mm256_or_si256(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u16x16, b: u16x16) -> u16x16 { + _mm256_or_si256(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn xor_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { - unsafe { _mm256_xor_si256(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u16x16, b: u16x16) -> u16x16 { + _mm256_xor_si256(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn not_u16x16(self, a: u16x16) -> u16x16 { @@ -3731,9 +6124,13 @@ impl Simd for Avx2 { } #[inline(always)] fn shl_u16x16(self, a: u16x16, shift: u32) -> u16x16 { - unsafe { - _mm256_sll_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u16x16, shift: u32) -> u16x16 { + _mm256_sll_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) + } + ); + kernel(self, a, shift) } #[inline(always)] fn shlv_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { @@ -3741,9 +6138,13 @@ impl Simd for Avx2 { } #[inline(always)] fn shr_u16x16(self, a: u16x16, shift: u32) -> u16x16 { - unsafe { - _mm256_srl_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u16x16, shift: u32) -> u16x16 { + _mm256_srl_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) + } + ); + kernel(self, a, shift) } #[inline(always)] fn shrv_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { @@ -3751,139 +6152,216 @@ impl Simd for Avx2 { } #[inline(always)] fn simd_eq_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { - unsafe { _mm256_cmpeq_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u16x16, b: u16x16) -> mask16x16 { + _mm256_cmpeq_epi16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_lt_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { - unsafe { - let sign_bit = _mm256_set1_epi16(0x8000u16.cast_signed()); - let a_signed = _mm256_xor_si256(a.into(), sign_bit); - let b_signed = _mm256_xor_si256(b.into(), sign_bit); - _mm256_cmpgt_epi16(b_signed, a_signed).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u16x16, b: u16x16) -> mask16x16 { + let sign_bit = _mm256_set1_epi16(0x8000u16.cast_signed()); + let a_signed = _mm256_xor_si256(a.into(), sign_bit); + let b_signed = _mm256_xor_si256(b.into(), sign_bit); + _mm256_cmpgt_epi16(b_signed, a_signed).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_le_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { - unsafe { - _mm256_cmpeq_epi16(_mm256_min_epu16(a.into(), b.into()), a.into()).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u16x16, b: u16x16) -> mask16x16 { + _mm256_cmpeq_epi16(_mm256_min_epu16(a.into(), b.into()), a.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_ge_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { - unsafe { - _mm256_cmpeq_epi16(_mm256_max_epu16(a.into(), b.into()), a.into()).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u16x16, b: u16x16) -> mask16x16 { + _mm256_cmpeq_epi16(_mm256_max_epu16(a.into(), b.into()), a.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_gt_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { - unsafe { - let sign_bit = _mm256_set1_epi16(0x8000u16.cast_signed()); - let a_signed = _mm256_xor_si256(a.into(), sign_bit); - let b_signed = _mm256_xor_si256(b.into(), sign_bit); - _mm256_cmpgt_epi16(a_signed, b_signed).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u16x16, b: u16x16) -> mask16x16 { + let sign_bit = _mm256_set1_epi16(0x8000u16.cast_signed()); + let a_signed = _mm256_xor_si256(a.into(), sign_bit); + let b_signed = _mm256_xor_si256(b.into(), sign_bit); + _mm256_cmpgt_epi16(a_signed, b_signed).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn zip_low_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { - unsafe { - let lo = _mm256_unpacklo_epi16(a.into(), b.into()); - let hi = _mm256_unpackhi_epi16(a.into(), b.into()); - _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u16x16, b: u16x16) -> u16x16 { + let lo = _mm256_unpacklo_epi16(a.into(), b.into()); + let hi = _mm256_unpackhi_epi16(a.into(), b.into()); + _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn zip_high_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { - unsafe { - let lo = _mm256_unpacklo_epi16(a.into(), b.into()); - let hi = _mm256_unpackhi_epi16(a.into(), b.into()); - _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u16x16, b: u16x16) -> u16x16 { + let lo = _mm256_unpacklo_epi16(a.into(), b.into()); + let hi = _mm256_unpackhi_epi16(a.into(), b.into()); + _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_low_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { - unsafe { - let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( - a.into(), - _mm256_setr_epi8( - 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, 13, - 2, 3, 6, 7, 10, 11, 14, 15, - ), - )); - let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( - b.into(), - _mm256_setr_epi8( - 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, 13, - 2, 3, 6, 7, 10, 11, 14, 15, - ), - )); - _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u16x16, b: u16x16) -> u16x16 { + let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( + a.into(), + _mm256_setr_epi8( + 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, + 13, 2, 3, 6, 7, 10, 11, 14, 15, + ), + )); + let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( + b.into(), + _mm256_setr_epi8( + 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, + 13, 2, 3, 6, 7, 10, 11, 14, 15, + ), + )); + _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_high_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { - unsafe { - let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( - a.into(), - _mm256_setr_epi8( - 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, 13, - 2, 3, 6, 7, 10, 11, 14, 15, - ), - )); - let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( - b.into(), - _mm256_setr_epi8( - 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, 13, - 2, 3, 6, 7, 10, 11, 14, 15, - ), - )); - _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u16x16, b: u16x16) -> u16x16 { + let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( + a.into(), + _mm256_setr_epi8( + 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, + 13, 2, 3, 6, 7, 10, 11, 14, 15, + ), + )); + let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( + b.into(), + _mm256_setr_epi8( + 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, + 13, 2, 3, 6, 7, 10, 11, 14, 15, + ), + )); + _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn interleave_u16x16(self, a: u16x16, b: u16x16) -> (u16x16, u16x16) { - unsafe { - let lo = _mm256_unpacklo_epi16(a.into(), b.into()); - let hi = _mm256_unpackhi_epi16(a.into(), b.into()); - ( - _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(self), - _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(self), - ) - } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx2, + a: u16x16, + b: u16x16, + ) -> (u16x16, u16x16) { + let lo = _mm256_unpacklo_epi16(a.into(), b.into()); + let hi = _mm256_unpackhi_epi16(a.into(), b.into()); + ( + _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(token), + _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(token), + ) + } + ); + kernel(self, a, b) } #[inline(always)] fn deinterleave_u16x16(self, a: u16x16, b: u16x16) -> (u16x16, u16x16) { - unsafe { - let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( - a.into(), - _mm256_setr_epi8( - 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, 13, - 2, 3, 6, 7, 10, 11, 14, 15, - ), - )); - let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( - b.into(), - _mm256_setr_epi8( - 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, 13, - 2, 3, 6, 7, 10, 11, 14, 15, - ), - )); - ( - _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(self), - _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(self), - ) - } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx2, + a: u16x16, + b: u16x16, + ) -> (u16x16, u16x16) { + let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( + a.into(), + _mm256_setr_epi8( + 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, + 13, 2, 3, 6, 7, 10, 11, 14, 15, + ), + )); + let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( + b.into(), + _mm256_setr_epi8( + 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, + 13, 2, 3, 6, 7, 10, 11, 14, 15, + ), + )); + ( + _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(token), + _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(token), + ) + } + ); + kernel(self, a, b) } #[inline(always)] fn select_u16x16(self, a: mask16x16, b: u16x16, c: u16x16) -> u16x16 { - unsafe { _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx2, + a: mask16x16, + b: u16x16, + c: u16x16, + ) -> u16x16 { + _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] fn min_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { - unsafe { _mm256_min_epu16(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u16x16, b: u16x16) -> u16x16 { + _mm256_min_epu16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn max_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { - unsafe { _mm256_max_epu16(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u16x16, b: u16x16) -> u16x16 { + _mm256_max_epu16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn combine_u16x16(self, a: u16x16, b: u16x16) -> u16x32 { @@ -3894,39 +6372,63 @@ impl Simd for Avx2 { } #[inline(always)] fn split_u16x16(self, a: u16x16) -> (u16x8, u16x8) { - unsafe { - ( - _mm256_extracti128_si256::<0>(a.into()).simd_into(self), - _mm256_extracti128_si256::<1>(a.into()).simd_into(self), - ) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u16x16) -> (u16x8, u16x8) { + ( + _mm256_extracti128_si256::<0>(a.into()).simd_into(token), + _mm256_extracti128_si256::<1>(a.into()).simd_into(token), + ) + } + ); + kernel(self, a) } #[inline(always)] fn narrow_u16x16(self, a: u16x16) -> u8x16 { - unsafe { - let mask = _mm256_setr_epi8( - 0, 2, 4, 6, 8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, -1, 0, 2, 4, 6, 8, 10, 12, - 14, -1, -1, -1, -1, -1, -1, -1, -1, - ); - let shuffled = _mm256_shuffle_epi8(a.into(), mask); - let packed = _mm256_permute4x64_epi64::<0b11_01_10_00>(shuffled); - _mm256_castsi256_si128(packed).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u16x16) -> u8x16 { + let mask = _mm256_setr_epi8( + 0, 2, 4, 6, 8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, -1, 0, 2, 4, 6, 8, 10, + 12, 14, -1, -1, -1, -1, -1, -1, -1, -1, + ); + let shuffled = _mm256_shuffle_epi8(a.into(), mask); + let packed = _mm256_permute4x64_epi64::<0b11_01_10_00>(shuffled); + _mm256_castsi256_si128(packed).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn reinterpret_u8_u16x16(self, a: u16x16) -> u8x32 { - __m256i::from(a).simd_into(self) + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u16x16) -> u8x32 { + __m256i::from(a).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn reinterpret_u32_u16x16(self, a: u16x16) -> u32x8 { - __m256i::from(a).simd_into(self) + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u16x16) -> u32x8 { + __m256i::from(a).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn splat_mask16x16(self, val: bool) -> mask16x16 { - unsafe { - let val: i16 = if val { !0 } else { 0 }; - _mm256_set1_epi16(val).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, val: bool) -> mask16x16 { + let val: i16 = if val { !0 } else { 0 }; + _mm256_set1_epi16(val).simd_into(token) + } + ); + kernel(self, val) } #[inline(always)] fn load_array_mask16x16(self, val: [i16; 16usize]) -> mask16x16 { @@ -3941,38 +6443,66 @@ impl Simd for Avx2 { } #[inline(always)] fn from_bitmask_mask16x16(self, bits: u64) -> mask16x16 { - unsafe { - { - let bit_lanes = _mm256_set1_epi16(bits as i16); - let bit_mask = _mm256_setr_epi16( - 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, -32768, - ); - _mm256_cmpeq_epi16(_mm256_and_si256(bit_lanes, bit_mask), bit_mask) + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, bits: u64) -> mask16x16 { + { + let bit_lanes = _mm256_set1_epi16(bits as i16); + let bit_mask = _mm256_setr_epi16( + 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, + -32768, + ); + _mm256_cmpeq_epi16(_mm256_and_si256(bit_lanes, bit_mask), bit_mask) + } + .simd_into(token) } - .simd_into(self) - } + ); + kernel(self, bits) } #[inline(always)] fn to_bitmask_mask16x16(self, a: mask16x16) -> u64 { - unsafe { - { - let halves: [__m128i; 2usize] = crate::transmute::checked_transmute_copy(&a.val.0); - let packed = _mm_packs_epi16(halves[0], halves[1]); - _mm_movemask_epi8(packed) as u32 as u64 + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask16x16) -> u64 { + { + let halves: [__m128i; 2usize] = + crate::transmute::checked_transmute_copy(&a.val.0); + let packed = _mm_packs_epi16(halves[0], halves[1]); + _mm_movemask_epi8(packed) as u32 as u64 + } } - } + ); + kernel(self, a) } #[inline(always)] fn and_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { - unsafe { _mm256_and_si256(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask16x16, b: mask16x16) -> mask16x16 { + _mm256_and_si256(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn or_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { - unsafe { _mm256_or_si256(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask16x16, b: mask16x16) -> mask16x16 { + _mm256_or_si256(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn xor_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { - unsafe { _mm256_xor_si256(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask16x16, b: mask16x16) -> mask16x16 { + _mm256_xor_si256(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn not_mask16x16(self, a: mask16x16) -> mask16x16 { @@ -3985,27 +6515,68 @@ impl Simd for Avx2 { b: mask16x16, c: mask16x16, ) -> mask16x16 { - unsafe { _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx2, + a: mask16x16, + b: mask16x16, + c: mask16x16, + ) -> mask16x16 { + _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] fn simd_eq_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { - unsafe { _mm256_cmpeq_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask16x16, b: mask16x16) -> mask16x16 { + _mm256_cmpeq_epi16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn any_true_mask16x16(self, a: mask16x16) -> bool { - unsafe { _mm256_movemask_epi8(a.into()) as u32 != 0 } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask16x16) -> bool { + _mm256_movemask_epi8(a.into()) as u32 != 0 + } + ); + kernel(self, a) } #[inline(always)] fn all_true_mask16x16(self, a: mask16x16) -> bool { - unsafe { _mm256_movemask_epi8(a.into()) as u32 == 0xffffffff } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask16x16) -> bool { + _mm256_movemask_epi8(a.into()) as u32 == 0xffffffff + } + ); + kernel(self, a) } #[inline(always)] fn any_false_mask16x16(self, a: mask16x16) -> bool { - unsafe { _mm256_movemask_epi8(a.into()) as u32 != 0xffffffff } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask16x16) -> bool { + _mm256_movemask_epi8(a.into()) as u32 != 0xffffffff + } + ); + kernel(self, a) } #[inline(always)] fn all_false_mask16x16(self, a: mask16x16) -> bool { - unsafe { _mm256_movemask_epi8(a.into()) as u32 == 0 } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask16x16) -> bool { + _mm256_movemask_epi8(a.into()) as u32 == 0 + } + ); + kernel(self, a) } #[inline(always)] fn combine_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x32 { @@ -4016,16 +6587,26 @@ impl Simd for Avx2 { } #[inline(always)] fn split_mask16x16(self, a: mask16x16) -> (mask16x8, mask16x8) { - unsafe { - ( - _mm256_extracti128_si256::<0>(a.into()).simd_into(self), - _mm256_extracti128_si256::<1>(a.into()).simd_into(self), - ) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask16x16) -> (mask16x8, mask16x8) { + ( + _mm256_extracti128_si256::<0>(a.into()).simd_into(token), + _mm256_extracti128_si256::<1>(a.into()).simd_into(token), + ) + } + ); + kernel(self, a) } #[inline(always)] fn splat_i32x8(self, val: i32) -> i32x8 { - unsafe { _mm256_set1_epi32(val).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, val: i32) -> i32x8 { + _mm256_set1_epi32(val).simd_into(token) + } + ); + kernel(self, val) } #[inline(always)] fn load_array_i32x8(self, val: [i32; 8usize]) -> i32x8 { @@ -4111,27 +6692,63 @@ impl Simd for Avx2 { } #[inline(always)] fn add_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - unsafe { _mm256_add_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i32x8, b: i32x8) -> i32x8 { + _mm256_add_epi32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn sub_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - unsafe { _mm256_sub_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i32x8, b: i32x8) -> i32x8 { + _mm256_sub_epi32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn mul_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - unsafe { _mm256_mullo_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i32x8, b: i32x8) -> i32x8 { + _mm256_mullo_epi32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn and_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - unsafe { _mm256_and_si256(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i32x8, b: i32x8) -> i32x8 { + _mm256_and_si256(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn or_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - unsafe { _mm256_or_si256(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i32x8, b: i32x8) -> i32x8 { + _mm256_or_si256(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn xor_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - unsafe { _mm256_xor_si256(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i32x8, b: i32x8) -> i32x8 { + _mm256_xor_si256(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn not_i32x8(self, a: i32x8) -> i32x8 { @@ -4139,119 +6756,224 @@ impl Simd for Avx2 { } #[inline(always)] fn shl_i32x8(self, a: i32x8, shift: u32) -> i32x8 { - unsafe { - _mm256_sll_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i32x8, shift: u32) -> i32x8 { + _mm256_sll_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) + } + ); + kernel(self, a, shift) } #[inline(always)] fn shlv_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - unsafe { _mm256_sllv_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i32x8, b: i32x8) -> i32x8 { + _mm256_sllv_epi32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn shr_i32x8(self, a: i32x8, shift: u32) -> i32x8 { - unsafe { - _mm256_sra_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i32x8, shift: u32) -> i32x8 { + _mm256_sra_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) + } + ); + kernel(self, a, shift) } #[inline(always)] fn shrv_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - unsafe { _mm256_srav_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i32x8, b: i32x8) -> i32x8 { + _mm256_srav_epi32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_eq_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { - unsafe { _mm256_cmpeq_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i32x8, b: i32x8) -> mask32x8 { + _mm256_cmpeq_epi32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_lt_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { - unsafe { _mm256_cmpgt_epi32(b.into(), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i32x8, b: i32x8) -> mask32x8 { + _mm256_cmpgt_epi32(b.into(), a.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_le_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { - unsafe { - _mm256_cmpeq_epi32(_mm256_min_epi32(a.into(), b.into()), a.into()).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i32x8, b: i32x8) -> mask32x8 { + _mm256_cmpeq_epi32(_mm256_min_epi32(a.into(), b.into()), a.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_ge_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { - unsafe { - _mm256_cmpeq_epi32(_mm256_max_epi32(a.into(), b.into()), a.into()).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i32x8, b: i32x8) -> mask32x8 { + _mm256_cmpeq_epi32(_mm256_max_epi32(a.into(), b.into()), a.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_gt_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { - unsafe { _mm256_cmpgt_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i32x8, b: i32x8) -> mask32x8 { + _mm256_cmpgt_epi32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn zip_low_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - unsafe { - let lo = _mm256_unpacklo_epi32(a.into(), b.into()); - let hi = _mm256_unpackhi_epi32(a.into(), b.into()); - _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i32x8, b: i32x8) -> i32x8 { + let lo = _mm256_unpacklo_epi32(a.into(), b.into()); + let hi = _mm256_unpackhi_epi32(a.into(), b.into()); + _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn zip_high_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - unsafe { - let lo = _mm256_unpacklo_epi32(a.into(), b.into()); - let hi = _mm256_unpackhi_epi32(a.into(), b.into()); - _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i32x8, b: i32x8) -> i32x8 { + let lo = _mm256_unpacklo_epi32(a.into(), b.into()); + let hi = _mm256_unpackhi_epi32(a.into(), b.into()); + _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_low_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - unsafe { - let t1 = - _mm256_permutevar8x32_epi32(a.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)); - let t2 = - _mm256_permutevar8x32_epi32(b.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)); - _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i32x8, b: i32x8) -> i32x8 { + let t1 = _mm256_permutevar8x32_epi32( + a.into(), + _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7), + ); + let t2 = _mm256_permutevar8x32_epi32( + b.into(), + _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7), + ); + _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_high_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - unsafe { - let t1 = - _mm256_permutevar8x32_epi32(a.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)); - let t2 = - _mm256_permutevar8x32_epi32(b.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)); - _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i32x8, b: i32x8) -> i32x8 { + let t1 = _mm256_permutevar8x32_epi32( + a.into(), + _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7), + ); + let t2 = _mm256_permutevar8x32_epi32( + b.into(), + _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7), + ); + _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn interleave_i32x8(self, a: i32x8, b: i32x8) -> (i32x8, i32x8) { - unsafe { - let lo = _mm256_unpacklo_epi32(a.into(), b.into()); - let hi = _mm256_unpackhi_epi32(a.into(), b.into()); - ( - _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(self), - _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(self), - ) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i32x8, b: i32x8) -> (i32x8, i32x8) { + let lo = _mm256_unpacklo_epi32(a.into(), b.into()); + let hi = _mm256_unpackhi_epi32(a.into(), b.into()); + ( + _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(token), + _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(token), + ) + } + ); + kernel(self, a, b) } #[inline(always)] fn deinterleave_i32x8(self, a: i32x8, b: i32x8) -> (i32x8, i32x8) { - unsafe { - let t1 = - _mm256_permutevar8x32_epi32(a.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)); - let t2 = - _mm256_permutevar8x32_epi32(b.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)); - ( - _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(self), - _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(self), - ) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i32x8, b: i32x8) -> (i32x8, i32x8) { + let t1 = _mm256_permutevar8x32_epi32( + a.into(), + _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7), + ); + let t2 = _mm256_permutevar8x32_epi32( + b.into(), + _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7), + ); + ( + _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(token), + _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(token), + ) + } + ); + kernel(self, a, b) } #[inline(always)] fn select_i32x8(self, a: mask32x8, b: i32x8, c: i32x8) -> i32x8 { - unsafe { _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx2, + a: mask32x8, + b: i32x8, + c: i32x8, + ) -> i32x8 { + _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] fn min_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - unsafe { _mm256_min_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i32x8, b: i32x8) -> i32x8 { + _mm256_min_epi32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn max_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - unsafe { _mm256_max_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i32x8, b: i32x8) -> i32x8 { + _mm256_max_epi32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn combine_i32x8(self, a: i32x8, b: i32x8) -> i32x16 { @@ -4262,32 +6984,66 @@ impl Simd for Avx2 { } #[inline(always)] fn split_i32x8(self, a: i32x8) -> (i32x4, i32x4) { - unsafe { - ( - _mm256_extracti128_si256::<0>(a.into()).simd_into(self), - _mm256_extracti128_si256::<1>(a.into()).simd_into(self), - ) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i32x8) -> (i32x4, i32x4) { + ( + _mm256_extracti128_si256::<0>(a.into()).simd_into(token), + _mm256_extracti128_si256::<1>(a.into()).simd_into(token), + ) + } + ); + kernel(self, a) } #[inline(always)] fn neg_i32x8(self, a: i32x8) -> i32x8 { - unsafe { _mm256_sub_epi32(_mm256_setzero_si256(), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i32x8) -> i32x8 { + _mm256_sub_epi32(_mm256_setzero_si256(), a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn reinterpret_u8_i32x8(self, a: i32x8) -> u8x32 { - __m256i::from(a).simd_into(self) + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i32x8) -> u8x32 { + __m256i::from(a).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn reinterpret_u32_i32x8(self, a: i32x8) -> u32x8 { - __m256i::from(a).simd_into(self) + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i32x8) -> u32x8 { + __m256i::from(a).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn cvt_f32_i32x8(self, a: i32x8) -> f32x8 { - unsafe { _mm256_cvtepi32_ps(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i32x8) -> f32x8 { + _mm256_cvtepi32_ps(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn splat_u32x8(self, val: u32) -> u32x8 { - unsafe { _mm256_set1_epi32(val.cast_signed()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, val: u32) -> u32x8 { + _mm256_set1_epi32(val.cast_signed()).simd_into(token) + } + ); + kernel(self, val) } #[inline(always)] fn load_array_u32x8(self, val: [u32; 8usize]) -> u32x8 { @@ -4373,27 +7129,63 @@ impl Simd for Avx2 { } #[inline(always)] fn add_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { - unsafe { _mm256_add_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u32x8, b: u32x8) -> u32x8 { + _mm256_add_epi32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn sub_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { - unsafe { _mm256_sub_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u32x8, b: u32x8) -> u32x8 { + _mm256_sub_epi32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn mul_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { - unsafe { _mm256_mullo_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u32x8, b: u32x8) -> u32x8 { + _mm256_mullo_epi32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn and_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { - unsafe { _mm256_and_si256(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u32x8, b: u32x8) -> u32x8 { + _mm256_and_si256(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn or_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { - unsafe { _mm256_or_si256(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u32x8, b: u32x8) -> u32x8 { + _mm256_or_si256(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn xor_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { - unsafe { _mm256_xor_si256(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u32x8, b: u32x8) -> u32x8 { + _mm256_xor_si256(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn not_u32x8(self, a: u32x8) -> u32x8 { @@ -4401,129 +7193,230 @@ impl Simd for Avx2 { } #[inline(always)] fn shl_u32x8(self, a: u32x8, shift: u32) -> u32x8 { - unsafe { - _mm256_sll_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u32x8, shift: u32) -> u32x8 { + _mm256_sll_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) + } + ); + kernel(self, a, shift) } #[inline(always)] fn shlv_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { - unsafe { _mm256_sllv_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u32x8, b: u32x8) -> u32x8 { + _mm256_sllv_epi32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn shr_u32x8(self, a: u32x8, shift: u32) -> u32x8 { - unsafe { - _mm256_srl_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u32x8, shift: u32) -> u32x8 { + _mm256_srl_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) + } + ); + kernel(self, a, shift) } #[inline(always)] fn shrv_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { - unsafe { _mm256_srlv_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u32x8, b: u32x8) -> u32x8 { + _mm256_srlv_epi32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_eq_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { - unsafe { _mm256_cmpeq_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u32x8, b: u32x8) -> mask32x8 { + _mm256_cmpeq_epi32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_lt_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { - unsafe { - let sign_bit = _mm256_set1_epi32(0x80000000u32.cast_signed()); - let a_signed = _mm256_xor_si256(a.into(), sign_bit); - let b_signed = _mm256_xor_si256(b.into(), sign_bit); - _mm256_cmpgt_epi32(b_signed, a_signed).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u32x8, b: u32x8) -> mask32x8 { + let sign_bit = _mm256_set1_epi32(0x80000000u32.cast_signed()); + let a_signed = _mm256_xor_si256(a.into(), sign_bit); + let b_signed = _mm256_xor_si256(b.into(), sign_bit); + _mm256_cmpgt_epi32(b_signed, a_signed).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_le_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { - unsafe { - _mm256_cmpeq_epi32(_mm256_min_epu32(a.into(), b.into()), a.into()).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u32x8, b: u32x8) -> mask32x8 { + _mm256_cmpeq_epi32(_mm256_min_epu32(a.into(), b.into()), a.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_ge_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { - unsafe { - _mm256_cmpeq_epi32(_mm256_max_epu32(a.into(), b.into()), a.into()).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u32x8, b: u32x8) -> mask32x8 { + _mm256_cmpeq_epi32(_mm256_max_epu32(a.into(), b.into()), a.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_gt_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { - unsafe { - let sign_bit = _mm256_set1_epi32(0x80000000u32.cast_signed()); - let a_signed = _mm256_xor_si256(a.into(), sign_bit); - let b_signed = _mm256_xor_si256(b.into(), sign_bit); - _mm256_cmpgt_epi32(a_signed, b_signed).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u32x8, b: u32x8) -> mask32x8 { + let sign_bit = _mm256_set1_epi32(0x80000000u32.cast_signed()); + let a_signed = _mm256_xor_si256(a.into(), sign_bit); + let b_signed = _mm256_xor_si256(b.into(), sign_bit); + _mm256_cmpgt_epi32(a_signed, b_signed).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn zip_low_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { - unsafe { - let lo = _mm256_unpacklo_epi32(a.into(), b.into()); - let hi = _mm256_unpackhi_epi32(a.into(), b.into()); - _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u32x8, b: u32x8) -> u32x8 { + let lo = _mm256_unpacklo_epi32(a.into(), b.into()); + let hi = _mm256_unpackhi_epi32(a.into(), b.into()); + _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn zip_high_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { - unsafe { - let lo = _mm256_unpacklo_epi32(a.into(), b.into()); - let hi = _mm256_unpackhi_epi32(a.into(), b.into()); - _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u32x8, b: u32x8) -> u32x8 { + let lo = _mm256_unpacklo_epi32(a.into(), b.into()); + let hi = _mm256_unpackhi_epi32(a.into(), b.into()); + _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_low_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { - unsafe { - let t1 = - _mm256_permutevar8x32_epi32(a.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)); - let t2 = - _mm256_permutevar8x32_epi32(b.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)); - _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u32x8, b: u32x8) -> u32x8 { + let t1 = _mm256_permutevar8x32_epi32( + a.into(), + _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7), + ); + let t2 = _mm256_permutevar8x32_epi32( + b.into(), + _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7), + ); + _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_high_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { - unsafe { - let t1 = - _mm256_permutevar8x32_epi32(a.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)); - let t2 = - _mm256_permutevar8x32_epi32(b.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)); - _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u32x8, b: u32x8) -> u32x8 { + let t1 = _mm256_permutevar8x32_epi32( + a.into(), + _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7), + ); + let t2 = _mm256_permutevar8x32_epi32( + b.into(), + _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7), + ); + _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn interleave_u32x8(self, a: u32x8, b: u32x8) -> (u32x8, u32x8) { - unsafe { - let lo = _mm256_unpacklo_epi32(a.into(), b.into()); - let hi = _mm256_unpackhi_epi32(a.into(), b.into()); - ( - _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(self), - _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(self), - ) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u32x8, b: u32x8) -> (u32x8, u32x8) { + let lo = _mm256_unpacklo_epi32(a.into(), b.into()); + let hi = _mm256_unpackhi_epi32(a.into(), b.into()); + ( + _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(token), + _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(token), + ) + } + ); + kernel(self, a, b) } #[inline(always)] fn deinterleave_u32x8(self, a: u32x8, b: u32x8) -> (u32x8, u32x8) { - unsafe { - let t1 = - _mm256_permutevar8x32_epi32(a.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)); - let t2 = - _mm256_permutevar8x32_epi32(b.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)); - ( - _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(self), - _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(self), - ) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u32x8, b: u32x8) -> (u32x8, u32x8) { + let t1 = _mm256_permutevar8x32_epi32( + a.into(), + _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7), + ); + let t2 = _mm256_permutevar8x32_epi32( + b.into(), + _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7), + ); + ( + _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(token), + _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(token), + ) + } + ); + kernel(self, a, b) } #[inline(always)] fn select_u32x8(self, a: mask32x8, b: u32x8, c: u32x8) -> u32x8 { - unsafe { _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx2, + a: mask32x8, + b: u32x8, + c: u32x8, + ) -> u32x8 { + _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] fn min_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { - unsafe { _mm256_min_epu32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u32x8, b: u32x8) -> u32x8 { + _mm256_min_epu32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn max_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { - unsafe { _mm256_max_epu32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u32x8, b: u32x8) -> u32x8 { + _mm256_max_epu32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn combine_u32x8(self, a: u32x8, b: u32x8) -> u32x16 { @@ -4534,40 +7427,58 @@ impl Simd for Avx2 { } #[inline(always)] fn split_u32x8(self, a: u32x8) -> (u32x4, u32x4) { - unsafe { - ( - _mm256_extracti128_si256::<0>(a.into()).simd_into(self), - _mm256_extracti128_si256::<1>(a.into()).simd_into(self), - ) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u32x8) -> (u32x4, u32x4) { + ( + _mm256_extracti128_si256::<0>(a.into()).simd_into(token), + _mm256_extracti128_si256::<1>(a.into()).simd_into(token), + ) + } + ); + kernel(self, a) } #[inline(always)] fn reinterpret_u8_u32x8(self, a: u32x8) -> u8x32 { - __m256i::from(a).simd_into(self) + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u32x8) -> u8x32 { + __m256i::from(a).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn cvt_f32_u32x8(self, a: u32x8) -> f32x8 { - unsafe { - let a = a.into(); - let lo = _mm256_blend_epi16::<0xAA>(a, _mm256_set1_epi32(0x4B000000)); - let hi = _mm256_blend_epi16::<0xAA>( - _mm256_srli_epi32::<16>(a), - _mm256_set1_epi32(0x53000000), - ); - let fhi = _mm256_sub_ps( - _mm256_castsi256_ps(hi), - _mm256_set1_ps(f32::from_bits(0x53000080)), - ); - let result = _mm256_add_ps(_mm256_castsi256_ps(lo), fhi); - result.simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u32x8) -> f32x8 { + let a = a.into(); + let lo = _mm256_blend_epi16::<0xAA>(a, _mm256_set1_epi32(0x4B000000)); + let hi = _mm256_blend_epi16::<0xAA>( + _mm256_srli_epi32::<16>(a), + _mm256_set1_epi32(0x53000000), + ); + let fhi = _mm256_sub_ps( + _mm256_castsi256_ps(hi), + _mm256_set1_ps(f32::from_bits(0x53000080)), + ); + let result = _mm256_add_ps(_mm256_castsi256_ps(lo), fhi); + result.simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn splat_mask32x8(self, val: bool) -> mask32x8 { - unsafe { - let val: i32 = if val { !0 } else { 0 }; - _mm256_set1_epi32(val).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, val: bool) -> mask32x8 { + let val: i32 = if val { !0 } else { 0 }; + _mm256_set1_epi32(val).simd_into(token) + } + ); + kernel(self, val) } #[inline(always)] fn load_array_mask32x8(self, val: [i32; 8usize]) -> mask32x8 { @@ -4582,30 +7493,58 @@ impl Simd for Avx2 { } #[inline(always)] fn from_bitmask_mask32x8(self, bits: u64) -> mask32x8 { - unsafe { - { - let bit_lanes = _mm256_set1_epi32(bits as i32); - let bit_mask = _mm256_setr_epi32(1, 2, 4, 8, 16, 32, 64, 128); - _mm256_cmpeq_epi32(_mm256_and_si256(bit_lanes, bit_mask), bit_mask) + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, bits: u64) -> mask32x8 { + { + let bit_lanes = _mm256_set1_epi32(bits as i32); + let bit_mask = _mm256_setr_epi32(1, 2, 4, 8, 16, 32, 64, 128); + _mm256_cmpeq_epi32(_mm256_and_si256(bit_lanes, bit_mask), bit_mask) + } + .simd_into(token) } - .simd_into(self) - } + ); + kernel(self, bits) } #[inline(always)] fn to_bitmask_mask32x8(self, a: mask32x8) -> u64 { - unsafe { _mm256_movemask_ps(_mm256_castsi256_ps(a.into())) as u32 as u64 } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask32x8) -> u64 { + _mm256_movemask_ps(_mm256_castsi256_ps(a.into())) as u32 as u64 + } + ); + kernel(self, a) } #[inline(always)] fn and_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { - unsafe { _mm256_and_si256(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask32x8, b: mask32x8) -> mask32x8 { + _mm256_and_si256(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn or_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { - unsafe { _mm256_or_si256(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask32x8, b: mask32x8) -> mask32x8 { + _mm256_or_si256(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn xor_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { - unsafe { _mm256_xor_si256(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask32x8, b: mask32x8) -> mask32x8 { + _mm256_xor_si256(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn not_mask32x8(self, a: mask32x8) -> mask32x8 { @@ -4618,27 +7557,68 @@ impl Simd for Avx2 { b: mask32x8, c: mask32x8, ) -> mask32x8 { - unsafe { _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx2, + a: mask32x8, + b: mask32x8, + c: mask32x8, + ) -> mask32x8 { + _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] fn simd_eq_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { - unsafe { _mm256_cmpeq_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask32x8, b: mask32x8) -> mask32x8 { + _mm256_cmpeq_epi32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn any_true_mask32x8(self, a: mask32x8) -> bool { - unsafe { _mm256_movemask_ps(_mm256_castsi256_ps(a.into())) as u32 != 0 } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask32x8) -> bool { + _mm256_movemask_ps(_mm256_castsi256_ps(a.into())) as u32 != 0 + } + ); + kernel(self, a) } #[inline(always)] fn all_true_mask32x8(self, a: mask32x8) -> bool { - unsafe { _mm256_movemask_ps(_mm256_castsi256_ps(a.into())) as u32 == 0b11111111 } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask32x8) -> bool { + _mm256_movemask_ps(_mm256_castsi256_ps(a.into())) as u32 == 0b11111111 + } + ); + kernel(self, a) } #[inline(always)] fn any_false_mask32x8(self, a: mask32x8) -> bool { - unsafe { _mm256_movemask_ps(_mm256_castsi256_ps(a.into())) as u32 != 0b11111111 } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask32x8) -> bool { + _mm256_movemask_ps(_mm256_castsi256_ps(a.into())) as u32 != 0b11111111 + } + ); + kernel(self, a) } #[inline(always)] fn all_false_mask32x8(self, a: mask32x8) -> bool { - unsafe { _mm256_movemask_ps(_mm256_castsi256_ps(a.into())) as u32 == 0 } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask32x8) -> bool { + _mm256_movemask_ps(_mm256_castsi256_ps(a.into())) as u32 == 0 + } + ); + kernel(self, a) } #[inline(always)] fn combine_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x16 { @@ -4649,16 +7629,26 @@ impl Simd for Avx2 { } #[inline(always)] fn split_mask32x8(self, a: mask32x8) -> (mask32x4, mask32x4) { - unsafe { - ( - _mm256_extracti128_si256::<0>(a.into()).simd_into(self), - _mm256_extracti128_si256::<1>(a.into()).simd_into(self), - ) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask32x8) -> (mask32x4, mask32x4) { + ( + _mm256_extracti128_si256::<0>(a.into()).simd_into(token), + _mm256_extracti128_si256::<1>(a.into()).simd_into(token), + ) + } + ); + kernel(self, a) } #[inline(always)] fn splat_f64x4(self, val: f64) -> f64x4 { - unsafe { _mm256_set1_pd(val).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, val: f64) -> f64x4 { + _mm256_set1_pd(val).simd_into(token) + } + ); + kernel(self, val) } #[inline(always)] fn load_array_f64x4(self, val: [f64; 4usize]) -> f64x4 { @@ -4744,15 +7734,33 @@ impl Simd for Avx2 { } #[inline(always)] fn abs_f64x4(self, a: f64x4) -> f64x4 { - unsafe { _mm256_andnot_pd(_mm256_set1_pd(-0.0), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f64x4) -> f64x4 { + _mm256_andnot_pd(_mm256_set1_pd(-0.0), a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn neg_f64x4(self, a: f64x4) -> f64x4 { - unsafe { _mm256_xor_pd(a.into(), _mm256_set1_pd(-0.0)).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f64x4) -> f64x4 { + _mm256_xor_pd(a.into(), _mm256_set1_pd(-0.0)).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn sqrt_f64x4(self, a: f64x4) -> f64x4 { - unsafe { _mm256_sqrt_pd(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f64x4) -> f64x4 { + _mm256_sqrt_pd(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn approximate_recip_f64x4(self, a: f64x4) -> f64x4 { @@ -4760,157 +7768,283 @@ impl Simd for Avx2 { } #[inline(always)] fn add_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - unsafe { _mm256_add_pd(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f64x4, b: f64x4) -> f64x4 { + _mm256_add_pd(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn sub_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - unsafe { _mm256_sub_pd(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f64x4, b: f64x4) -> f64x4 { + _mm256_sub_pd(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn mul_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - unsafe { _mm256_mul_pd(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f64x4, b: f64x4) -> f64x4 { + _mm256_mul_pd(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn div_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - unsafe { _mm256_div_pd(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f64x4, b: f64x4) -> f64x4 { + _mm256_div_pd(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn copysign_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - unsafe { - let mask = _mm256_set1_pd(-0.0); - _mm256_or_pd( - _mm256_and_pd(mask, b.into()), - _mm256_andnot_pd(mask, a.into()), - ) - .simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f64x4, b: f64x4) -> f64x4 { + let mask = _mm256_set1_pd(-0.0); + _mm256_or_pd( + _mm256_and_pd(mask, b.into()), + _mm256_andnot_pd(mask, a.into()), + ) + .simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_eq_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { - unsafe { _mm256_castpd_si256(_mm256_cmp_pd::<0i32>(a.into(), b.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f64x4, b: f64x4) -> mask64x4 { + _mm256_castpd_si256(_mm256_cmp_pd::<0i32>(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_lt_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { - unsafe { _mm256_castpd_si256(_mm256_cmp_pd::<17i32>(a.into(), b.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f64x4, b: f64x4) -> mask64x4 { + _mm256_castpd_si256(_mm256_cmp_pd::<17i32>(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_le_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { - unsafe { _mm256_castpd_si256(_mm256_cmp_pd::<18i32>(a.into(), b.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f64x4, b: f64x4) -> mask64x4 { + _mm256_castpd_si256(_mm256_cmp_pd::<18i32>(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_ge_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { - unsafe { _mm256_castpd_si256(_mm256_cmp_pd::<29i32>(a.into(), b.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f64x4, b: f64x4) -> mask64x4 { + _mm256_castpd_si256(_mm256_cmp_pd::<29i32>(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_gt_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { - unsafe { _mm256_castpd_si256(_mm256_cmp_pd::<30i32>(a.into(), b.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f64x4, b: f64x4) -> mask64x4 { + _mm256_castpd_si256(_mm256_cmp_pd::<30i32>(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn zip_low_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - unsafe { - let lo = _mm256_unpacklo_pd(a.into(), b.into()); - let hi = _mm256_unpackhi_pd(a.into(), b.into()); - _mm256_permute2f128_pd::<0b0010_0000>(lo, hi).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f64x4, b: f64x4) -> f64x4 { + let lo = _mm256_unpacklo_pd(a.into(), b.into()); + let hi = _mm256_unpackhi_pd(a.into(), b.into()); + _mm256_permute2f128_pd::<0b0010_0000>(lo, hi).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn zip_high_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - unsafe { - let lo = _mm256_unpacklo_pd(a.into(), b.into()); - let hi = _mm256_unpackhi_pd(a.into(), b.into()); - _mm256_permute2f128_pd::<0b0011_0001>(lo, hi).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f64x4, b: f64x4) -> f64x4 { + let lo = _mm256_unpacklo_pd(a.into(), b.into()); + let hi = _mm256_unpackhi_pd(a.into(), b.into()); + _mm256_permute2f128_pd::<0b0011_0001>(lo, hi).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_low_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - unsafe { - let t1 = _mm256_permute4x64_pd::<0b11_01_10_00>(a.into()); - let t2 = _mm256_permute4x64_pd::<0b11_01_10_00>(b.into()); - _mm256_permute2f128_pd::<0b0010_0000>(t1, t2).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f64x4, b: f64x4) -> f64x4 { + let t1 = _mm256_permute4x64_pd::<0b11_01_10_00>(a.into()); + let t2 = _mm256_permute4x64_pd::<0b11_01_10_00>(b.into()); + _mm256_permute2f128_pd::<0b0010_0000>(t1, t2).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_high_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - unsafe { - let t1 = _mm256_permute4x64_pd::<0b11_01_10_00>(a.into()); - let t2 = _mm256_permute4x64_pd::<0b11_01_10_00>(b.into()); - _mm256_permute2f128_pd::<0b0011_0001>(t1, t2).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f64x4, b: f64x4) -> f64x4 { + let t1 = _mm256_permute4x64_pd::<0b11_01_10_00>(a.into()); + let t2 = _mm256_permute4x64_pd::<0b11_01_10_00>(b.into()); + _mm256_permute2f128_pd::<0b0011_0001>(t1, t2).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn interleave_f64x4(self, a: f64x4, b: f64x4) -> (f64x4, f64x4) { - unsafe { - let lo = _mm256_unpacklo_pd(a.into(), b.into()); - let hi = _mm256_unpackhi_pd(a.into(), b.into()); - ( - _mm256_permute2f128_pd::<0b0010_0000>(lo, hi).simd_into(self), - _mm256_permute2f128_pd::<0b0011_0001>(lo, hi).simd_into(self), - ) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f64x4, b: f64x4) -> (f64x4, f64x4) { + let lo = _mm256_unpacklo_pd(a.into(), b.into()); + let hi = _mm256_unpackhi_pd(a.into(), b.into()); + ( + _mm256_permute2f128_pd::<0b0010_0000>(lo, hi).simd_into(token), + _mm256_permute2f128_pd::<0b0011_0001>(lo, hi).simd_into(token), + ) + } + ); + kernel(self, a, b) } #[inline(always)] fn deinterleave_f64x4(self, a: f64x4, b: f64x4) -> (f64x4, f64x4) { - unsafe { - let t1 = _mm256_permute4x64_pd::<0b11_01_10_00>(a.into()); - let t2 = _mm256_permute4x64_pd::<0b11_01_10_00>(b.into()); - ( - _mm256_permute2f128_pd::<0b0010_0000>(t1, t2).simd_into(self), - _mm256_permute2f128_pd::<0b0011_0001>(t1, t2).simd_into(self), - ) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f64x4, b: f64x4) -> (f64x4, f64x4) { + let t1 = _mm256_permute4x64_pd::<0b11_01_10_00>(a.into()); + let t2 = _mm256_permute4x64_pd::<0b11_01_10_00>(b.into()); + ( + _mm256_permute2f128_pd::<0b0010_0000>(t1, t2).simd_into(token), + _mm256_permute2f128_pd::<0b0011_0001>(t1, t2).simd_into(token), + ) + } + ); + kernel(self, a, b) } #[inline(always)] fn max_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - unsafe { _mm256_max_pd(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f64x4, b: f64x4) -> f64x4 { + _mm256_max_pd(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn min_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - unsafe { _mm256_min_pd(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f64x4, b: f64x4) -> f64x4 { + _mm256_min_pd(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn max_precise_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - unsafe { - let intermediate = _mm256_max_pd(a.into(), b.into()); - let b_is_nan = _mm256_cmp_pd::<3i32>(b.into(), b.into()); - _mm256_blendv_pd(intermediate, a.into(), b_is_nan).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f64x4, b: f64x4) -> f64x4 { + let intermediate = _mm256_max_pd(a.into(), b.into()); + let b_is_nan = _mm256_cmp_pd::<3i32>(b.into(), b.into()); + _mm256_blendv_pd(intermediate, a.into(), b_is_nan).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn min_precise_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - unsafe { - let intermediate = _mm256_min_pd(a.into(), b.into()); - let b_is_nan = _mm256_cmp_pd::<3i32>(b.into(), b.into()); - _mm256_blendv_pd(intermediate, a.into(), b_is_nan).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f64x4, b: f64x4) -> f64x4 { + let intermediate = _mm256_min_pd(a.into(), b.into()); + let b_is_nan = _mm256_cmp_pd::<3i32>(b.into(), b.into()); + _mm256_blendv_pd(intermediate, a.into(), b_is_nan).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn mul_add_f64x4(self, a: f64x4, b: f64x4, c: f64x4) -> f64x4 { - unsafe { _mm256_fmadd_pd(a.into(), b.into(), c.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f64x4, b: f64x4, c: f64x4) -> f64x4 { + _mm256_fmadd_pd(a.into(), b.into(), c.into()).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] fn mul_sub_f64x4(self, a: f64x4, b: f64x4, c: f64x4) -> f64x4 { - unsafe { _mm256_fmsub_pd(a.into(), b.into(), c.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f64x4, b: f64x4, c: f64x4) -> f64x4 { + _mm256_fmsub_pd(a.into(), b.into(), c.into()).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] fn floor_f64x4(self, a: f64x4) -> f64x4 { - unsafe { - _mm256_round_pd::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into()) - .simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f64x4) -> f64x4 { + _mm256_round_pd::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into()) + .simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn ceil_f64x4(self, a: f64x4) -> f64x4 { - unsafe { - _mm256_round_pd::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a.into()) - .simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f64x4) -> f64x4 { + _mm256_round_pd::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a.into()) + .simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn round_ties_even_f64x4(self, a: f64x4) -> f64x4 { - unsafe { - _mm256_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a.into()) - .simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f64x4) -> f64x4 { + _mm256_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a.into()) + .simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn fract_f64x4(self, a: f64x4) -> f64x4 { @@ -4918,15 +8052,29 @@ impl Simd for Avx2 { } #[inline(always)] fn trunc_f64x4(self, a: f64x4) -> f64x4 { - unsafe { - _mm256_round_pd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f64x4) -> f64x4 { + _mm256_round_pd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into()) + .simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn select_f64x4(self, a: mask64x4, b: f64x4, c: f64x4) -> f64x4 { - unsafe { - _mm256_blendv_pd(c.into(), b.into(), _mm256_castsi256_pd(a.into())).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx2, + a: mask64x4, + b: f64x4, + c: f64x4, + ) -> f64x4 { + _mm256_blendv_pd(c.into(), b.into(), _mm256_castsi256_pd(a.into())).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] fn combine_f64x4(self, a: f64x4, b: f64x4) -> f64x8 { @@ -4937,23 +8085,37 @@ impl Simd for Avx2 { } #[inline(always)] fn split_f64x4(self, a: f64x4) -> (f64x2, f64x2) { - unsafe { - ( - _mm256_extractf128_pd::<0>(a.into()).simd_into(self), - _mm256_extractf128_pd::<1>(a.into()).simd_into(self), - ) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f64x4) -> (f64x2, f64x2) { + ( + _mm256_extractf128_pd::<0>(a.into()).simd_into(token), + _mm256_extractf128_pd::<1>(a.into()).simd_into(token), + ) + } + ); + kernel(self, a) } #[inline(always)] fn reinterpret_f32_f64x4(self, a: f64x4) -> f32x8 { - unsafe { _mm256_castpd_ps(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f64x4) -> f32x8 { + _mm256_castpd_ps(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn splat_mask64x4(self, val: bool) -> mask64x4 { - unsafe { - let val: i64 = if val { !0 } else { 0 }; - _mm256_set1_epi64x(val).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, val: bool) -> mask64x4 { + let val: i64 = if val { !0 } else { 0 }; + _mm256_set1_epi64x(val).simd_into(token) + } + ); + kernel(self, val) } #[inline(always)] fn load_array_mask64x4(self, val: [i64; 4usize]) -> mask64x4 { @@ -4968,30 +8130,58 @@ impl Simd for Avx2 { } #[inline(always)] fn from_bitmask_mask64x4(self, bits: u64) -> mask64x4 { - unsafe { - { - let bit_lanes = _mm256_set1_epi64x(bits.cast_signed()); - let bit_mask = _mm256_set_epi64x(8, 4, 2, 1); - _mm256_cmpeq_epi64(_mm256_and_si256(bit_lanes, bit_mask), bit_mask) + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, bits: u64) -> mask64x4 { + { + let bit_lanes = _mm256_set1_epi64x(bits.cast_signed()); + let bit_mask = _mm256_set_epi64x(8, 4, 2, 1); + _mm256_cmpeq_epi64(_mm256_and_si256(bit_lanes, bit_mask), bit_mask) + } + .simd_into(token) } - .simd_into(self) - } + ); + kernel(self, bits) } #[inline(always)] fn to_bitmask_mask64x4(self, a: mask64x4) -> u64 { - unsafe { _mm256_movemask_pd(_mm256_castsi256_pd(a.into())) as u32 as u64 } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask64x4) -> u64 { + _mm256_movemask_pd(_mm256_castsi256_pd(a.into())) as u32 as u64 + } + ); + kernel(self, a) } #[inline(always)] fn and_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { - unsafe { _mm256_and_si256(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask64x4, b: mask64x4) -> mask64x4 { + _mm256_and_si256(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn or_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { - unsafe { _mm256_or_si256(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask64x4, b: mask64x4) -> mask64x4 { + _mm256_or_si256(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn xor_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { - unsafe { _mm256_xor_si256(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask64x4, b: mask64x4) -> mask64x4 { + _mm256_xor_si256(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn not_mask64x4(self, a: mask64x4) -> mask64x4 { @@ -5004,27 +8194,68 @@ impl Simd for Avx2 { b: mask64x4, c: mask64x4, ) -> mask64x4 { - unsafe { _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx2, + a: mask64x4, + b: mask64x4, + c: mask64x4, + ) -> mask64x4 { + _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] fn simd_eq_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { - unsafe { _mm256_cmpeq_epi64(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask64x4, b: mask64x4) -> mask64x4 { + _mm256_cmpeq_epi64(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn any_true_mask64x4(self, a: mask64x4) -> bool { - unsafe { _mm256_movemask_pd(_mm256_castsi256_pd(a.into())) as u32 != 0 } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask64x4) -> bool { + _mm256_movemask_pd(_mm256_castsi256_pd(a.into())) as u32 != 0 + } + ); + kernel(self, a) } #[inline(always)] fn all_true_mask64x4(self, a: mask64x4) -> bool { - unsafe { _mm256_movemask_pd(_mm256_castsi256_pd(a.into())) as u32 == 0b1111 } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask64x4) -> bool { + _mm256_movemask_pd(_mm256_castsi256_pd(a.into())) as u32 == 0b1111 + } + ); + kernel(self, a) } #[inline(always)] fn any_false_mask64x4(self, a: mask64x4) -> bool { - unsafe { _mm256_movemask_pd(_mm256_castsi256_pd(a.into())) as u32 != 0b1111 } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask64x4) -> bool { + _mm256_movemask_pd(_mm256_castsi256_pd(a.into())) as u32 != 0b1111 + } + ); + kernel(self, a) } #[inline(always)] fn all_false_mask64x4(self, a: mask64x4) -> bool { - unsafe { _mm256_movemask_pd(_mm256_castsi256_pd(a.into())) as u32 == 0 } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask64x4) -> bool { + _mm256_movemask_pd(_mm256_castsi256_pd(a.into())) as u32 == 0 + } + ); + kernel(self, a) } #[inline(always)] fn combine_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x8 { @@ -5035,12 +8266,16 @@ impl Simd for Avx2 { } #[inline(always)] fn split_mask64x4(self, a: mask64x4) -> (mask64x2, mask64x2) { - unsafe { - ( - _mm256_extracti128_si256::<0>(a.into()).simd_into(self), - _mm256_extracti128_si256::<1>(a.into()).simd_into(self), - ) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask64x4) -> (mask64x2, mask64x2) { + ( + _mm256_extracti128_si256::<0>(a.into()).simd_into(token), + _mm256_extracti128_si256::<1>(a.into()).simd_into(token), + ) + } + ); + kernel(self, a) } #[inline(always)] fn splat_f32x16(self, val: f32) -> f32x16 { @@ -6073,40 +9308,44 @@ impl Simd for Avx2 { } #[inline(always)] fn from_bitmask_mask8x64(self, bits: u64) -> mask8x64 { - unsafe { - { - let bit_bytes = _mm256_set1_epi64x(bits.cast_signed()); - let bit_mask = _mm256_setr_epi8( - 1, 2, 4, 8, 16, 32, 64, -128, 1, 2, 4, 8, 16, 32, 64, -128, 1, 2, 4, 8, 16, 32, - 64, -128, 1, 2, 4, 8, 16, 32, 64, -128, - ); - mask8x64 { - val: crate::support::Aligned512([ - { - let bit_bytes = _mm256_shuffle_epi8( - bit_bytes, - _mm256_setr_epi8( - 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, - 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, - ), - ); - _mm256_cmpeq_epi8(_mm256_and_si256(bit_bytes, bit_mask), bit_mask) - }, - { - let bit_bytes = _mm256_shuffle_epi8( - bit_bytes, - _mm256_setr_epi8( - 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, - 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, - ), - ); - _mm256_cmpeq_epi8(_mm256_and_si256(bit_bytes, bit_mask), bit_mask) - }, - ]), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, bits: u64) -> mask8x64 { + { + let bit_bytes = _mm256_set1_epi64x(bits.cast_signed()); + let bit_mask = _mm256_setr_epi8( + 1, 2, 4, 8, 16, 32, 64, -128, 1, 2, 4, 8, 16, 32, 64, -128, 1, 2, 4, 8, 16, + 32, 64, -128, 1, 2, 4, 8, 16, 32, 64, -128, + ); + mask8x64 { + val: crate::support::Aligned512([ + { + let bit_bytes = _mm256_shuffle_epi8( + bit_bytes, + _mm256_setr_epi8( + 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, + 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, + ), + ); + _mm256_cmpeq_epi8(_mm256_and_si256(bit_bytes, bit_mask), bit_mask) + }, + { + let bit_bytes = _mm256_shuffle_epi8( + bit_bytes, + _mm256_setr_epi8( + 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, + 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, + ), + ); + _mm256_cmpeq_epi8(_mm256_and_si256(bit_bytes, bit_mask), bit_mask) + }, + ]), + simd: token, + } } } - } + ); + kernel(self, bits) } #[inline(always)] fn to_bitmask_mask8x64(self, a: mask8x64) -> u64 { @@ -6797,16 +10036,20 @@ impl Simd for Avx2 { } #[inline(always)] fn narrow_u16x32(self, a: u16x32) -> u8x32 { - let (a, b) = self.split_u16x32(a); - unsafe { - let mask = _mm256_set1_epi16(0xFF); - let lo_masked = _mm256_and_si256(a.into(), mask); - let hi_masked = _mm256_and_si256(b.into(), mask); - let result = _mm256_permute4x64_epi64::<0b_11_01_10_00>(_mm256_packus_epi16( - lo_masked, hi_masked, - )); - result.simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u16x32) -> u8x32 { + let (a, b) = token.split_u16x32(a); + let mask = _mm256_set1_epi16(0xFF); + let lo_masked = _mm256_and_si256(a.into(), mask); + let hi_masked = _mm256_and_si256(b.into(), mask); + let result = _mm256_permute4x64_epi64::<0b_11_01_10_00>(_mm256_packus_epi16( + lo_masked, hi_masked, + )); + result.simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn reinterpret_u8_u16x32(self, a: u16x32) -> u8x64 { @@ -6848,15 +10091,19 @@ impl Simd for Avx2 { } #[inline(always)] fn to_bitmask_mask16x32(self, a: mask16x32) -> u64 { - unsafe { - { - let lo = _mm256_movemask_epi8(a.val.0[0]) as u32; - let hi = _mm256_movemask_epi8(a.val.0[1]) as u32; - let lo = _pext_u32(lo, 0x5555_5555u32) as u64; - let hi = _pext_u32(hi, 0x5555_5555u32) as u64; - lo | (hi << 16usize) + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask16x32) -> u64 { + { + let lo = _mm256_movemask_epi8(a.val.0[0]) as u32; + let hi = _mm256_movemask_epi8(a.val.0[1]) as u32; + let lo = _pext_u32(lo, 0x5555_5555u32) as u64; + let hi = _pext_u32(hi, 0x5555_5555u32) as u64; + lo | (hi << 16usize) + } } - } + ); + kernel(self, a) } #[inline(always)] fn and_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { @@ -7549,25 +10796,30 @@ impl Simd for Avx2 { } #[inline(always)] fn from_bitmask_mask32x16(self, bits: u64) -> mask32x16 { - unsafe { - { - let bit_lanes = _mm256_set1_epi32(bits as i32); - mask32x16 { - val: crate::support::Aligned512([ - { - let bit_mask = _mm256_setr_epi32(1, 2, 4, 8, 16, 32, 64, 128); - _mm256_cmpeq_epi32(_mm256_and_si256(bit_lanes, bit_mask), bit_mask) - }, - { - let bit_mask = - _mm256_setr_epi32(256, 512, 1024, 2048, 4096, 8192, 16384, 32768); - _mm256_cmpeq_epi32(_mm256_and_si256(bit_lanes, bit_mask), bit_mask) - }, - ]), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, bits: u64) -> mask32x16 { + { + let bit_lanes = _mm256_set1_epi32(bits as i32); + mask32x16 { + val: crate::support::Aligned512([ + { + let bit_mask = _mm256_setr_epi32(1, 2, 4, 8, 16, 32, 64, 128); + _mm256_cmpeq_epi32(_mm256_and_si256(bit_lanes, bit_mask), bit_mask) + }, + { + let bit_mask = _mm256_setr_epi32( + 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, + ); + _mm256_cmpeq_epi32(_mm256_and_si256(bit_lanes, bit_mask), bit_mask) + }, + ]), + simd: token, + } } } - } + ); + kernel(self, bits) } #[inline(always)] fn to_bitmask_mask32x16(self, a: mask32x16) -> u64 { @@ -7989,24 +11241,28 @@ impl Simd for Avx2 { } #[inline(always)] fn from_bitmask_mask64x8(self, bits: u64) -> mask64x8 { - unsafe { - { - let bit_lanes = _mm256_set1_epi64x(bits.cast_signed()); - mask64x8 { - val: crate::support::Aligned512([ - { - let bit_mask = _mm256_set_epi64x(8, 4, 2, 1); - _mm256_cmpeq_epi64(_mm256_and_si256(bit_lanes, bit_mask), bit_mask) - }, - { - let bit_mask = _mm256_set_epi64x(128, 64, 32, 16); - _mm256_cmpeq_epi64(_mm256_and_si256(bit_lanes, bit_mask), bit_mask) - }, - ]), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, bits: u64) -> mask64x8 { + { + let bit_lanes = _mm256_set1_epi64x(bits.cast_signed()); + mask64x8 { + val: crate::support::Aligned512([ + { + let bit_mask = _mm256_set_epi64x(8, 4, 2, 1); + _mm256_cmpeq_epi64(_mm256_and_si256(bit_lanes, bit_mask), bit_mask) + }, + { + let bit_mask = _mm256_set_epi64x(128, 64, 32, 16); + _mm256_cmpeq_epi64(_mm256_and_si256(bit_lanes, bit_mask), bit_mask) + }, + ]), + simd: token, + } } } - } + ); + kernel(self, bits) } #[inline(always)] fn to_bitmask_mask64x8(self, a: mask64x8) -> u64 { diff --git a/fearless_simd/src/generated/neon.rs b/fearless_simd/src/generated/neon.rs index 25a544b8..a0f289a2 100644 --- a/fearless_simd/src/generated/neon.rs +++ b/fearless_simd/src/generated/neon.rs @@ -88,7 +88,13 @@ impl Simd for Neon { } #[inline(always)] fn splat_f32x4(self, val: f32) -> f32x4 { - unsafe { vdupq_n_f32(val).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, val: f32) -> f32x4 { + vdupq_n_f32(val).simd_into(token) + } + ); + kernel(self, val) } #[inline(always)] fn load_array_f32x4(self, val: [f32; 4usize]) -> f32x4 { @@ -161,86 +167,192 @@ impl Simd for Neon { } #[inline(always)] fn abs_f32x4(self, a: f32x4) -> f32x4 { - unsafe { vabsq_f32(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: f32x4) -> f32x4 { + vabsq_f32(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn neg_f32x4(self, a: f32x4) -> f32x4 { - unsafe { vnegq_f32(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: f32x4) -> f32x4 { + vnegq_f32(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn sqrt_f32x4(self, a: f32x4) -> f32x4 { - unsafe { vsqrtq_f32(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: f32x4) -> f32x4 { + vsqrtq_f32(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn approximate_recip_f32x4(self, a: f32x4) -> f32x4 { - unsafe { vrecpeq_f32(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: f32x4) -> f32x4 { + vrecpeq_f32(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn add_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - unsafe { vaddq_f32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: f32x4, b: f32x4) -> f32x4 { + vaddq_f32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn sub_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - unsafe { vsubq_f32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: f32x4, b: f32x4) -> f32x4 { + vsubq_f32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn mul_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - unsafe { vmulq_f32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: f32x4, b: f32x4) -> f32x4 { + vmulq_f32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn div_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - unsafe { vdivq_f32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: f32x4, b: f32x4) -> f32x4 { + vdivq_f32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn copysign_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - unsafe { - let sign_mask = vdupq_n_u32(1 << 31); - vbslq_f32(sign_mask, b.into(), a.into()).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: f32x4, b: f32x4) -> f32x4 { + let sign_mask = vdupq_n_u32(1 << 31); + vbslq_f32(sign_mask, b.into(), a.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_eq_f32x4(self, a: f32x4, b: f32x4) -> mask32x4 { - unsafe { vreinterpretq_s32_u32(vceqq_f32(a.into(), b.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: f32x4, b: f32x4) -> mask32x4 { + vreinterpretq_s32_u32(vceqq_f32(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_lt_f32x4(self, a: f32x4, b: f32x4) -> mask32x4 { - unsafe { vreinterpretq_s32_u32(vcltq_f32(a.into(), b.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: f32x4, b: f32x4) -> mask32x4 { + vreinterpretq_s32_u32(vcltq_f32(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_le_f32x4(self, a: f32x4, b: f32x4) -> mask32x4 { - unsafe { vreinterpretq_s32_u32(vcleq_f32(a.into(), b.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: f32x4, b: f32x4) -> mask32x4 { + vreinterpretq_s32_u32(vcleq_f32(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_ge_f32x4(self, a: f32x4, b: f32x4) -> mask32x4 { - unsafe { vreinterpretq_s32_u32(vcgeq_f32(a.into(), b.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: f32x4, b: f32x4) -> mask32x4 { + vreinterpretq_s32_u32(vcgeq_f32(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_gt_f32x4(self, a: f32x4, b: f32x4) -> mask32x4 { - unsafe { vreinterpretq_s32_u32(vcgtq_f32(a.into(), b.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: f32x4, b: f32x4) -> mask32x4 { + vreinterpretq_s32_u32(vcgtq_f32(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn zip_low_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - let x = a.into(); - let y = b.into(); - unsafe { vzip1q_f32(x, y).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: f32x4, b: f32x4) -> f32x4 { + let x = a.into(); + let y = b.into(); + vzip1q_f32(x, y).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn zip_high_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - let x = a.into(); - let y = b.into(); - unsafe { vzip2q_f32(x, y).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: f32x4, b: f32x4) -> f32x4 { + let x = a.into(); + let y = b.into(); + vzip2q_f32(x, y).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_low_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - let x = a.into(); - let y = b.into(); - unsafe { vuzp1q_f32(x, y).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: f32x4, b: f32x4) -> f32x4 { + let x = a.into(); + let y = b.into(); + vuzp1q_f32(x, y).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_high_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - let x = a.into(); - let y = b.into(); - unsafe { vuzp2q_f32(x, y).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: f32x4, b: f32x4) -> f32x4 { + let x = a.into(); + let y = b.into(); + vuzp2q_f32(x, y).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn interleave_f32x4(self, a: f32x4, b: f32x4) -> (f32x4, f32x4) { @@ -252,55 +364,130 @@ impl Simd for Neon { } #[inline(always)] fn max_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - unsafe { vmaxq_f32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: f32x4, b: f32x4) -> f32x4 { + vmaxq_f32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn min_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - unsafe { vminq_f32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: f32x4, b: f32x4) -> f32x4 { + vminq_f32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn max_precise_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - unsafe { vmaxnmq_f32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: f32x4, b: f32x4) -> f32x4 { + vmaxnmq_f32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn min_precise_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - unsafe { vminnmq_f32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: f32x4, b: f32x4) -> f32x4 { + vminnmq_f32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn mul_add_f32x4(self, a: f32x4, b: f32x4, c: f32x4) -> f32x4 { - unsafe { vfmaq_f32(c.into(), b.into(), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: f32x4, b: f32x4, c: f32x4) -> f32x4 { + vfmaq_f32(c.into(), b.into(), a.into()).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] fn mul_sub_f32x4(self, a: f32x4, b: f32x4, c: f32x4) -> f32x4 { - unsafe { vnegq_f32(vfmsq_f32(c.into(), b.into(), a.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: f32x4, b: f32x4, c: f32x4) -> f32x4 { + vnegq_f32(vfmsq_f32(c.into(), b.into(), a.into())).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] fn floor_f32x4(self, a: f32x4) -> f32x4 { - unsafe { vrndmq_f32(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: f32x4) -> f32x4 { + vrndmq_f32(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn ceil_f32x4(self, a: f32x4) -> f32x4 { - unsafe { vrndpq_f32(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: f32x4) -> f32x4 { + vrndpq_f32(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn round_ties_even_f32x4(self, a: f32x4) -> f32x4 { - unsafe { vrndnq_f32(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: f32x4) -> f32x4 { + vrndnq_f32(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn fract_f32x4(self, a: f32x4) -> f32x4 { - unsafe { - let c1 = vcvtq_s32_f32(a.into()); - let c2 = vcvtq_f32_s32(c1); - vsubq_f32(a.into(), c2).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: f32x4) -> f32x4 { + let c1 = vcvtq_s32_f32(a.into()); + let c2 = vcvtq_f32_s32(c1); + vsubq_f32(a.into(), c2).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn trunc_f32x4(self, a: f32x4) -> f32x4 { - unsafe { vrndq_f32(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: f32x4) -> f32x4 { + vrndq_f32(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn select_f32x4(self, a: mask32x4, b: f32x4, c: f32x4) -> f32x4 { - unsafe { vbslq_f32(vreinterpretq_u32_s32(a.into()), b.into(), c.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Neon, + a: mask32x4, + b: f32x4, + c: f32x4, + ) -> f32x4 { + vbslq_f32(vreinterpretq_u32_s32(a.into()), b.into(), c.into()).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] fn combine_f32x4(self, a: f32x4, b: f32x4) -> f32x8 { @@ -311,23 +498,53 @@ impl Simd for Neon { } #[inline(always)] fn reinterpret_f64_f32x4(self, a: f32x4) -> f64x2 { - unsafe { vreinterpretq_f64_f32(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: f32x4) -> f64x2 { + vreinterpretq_f64_f32(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn reinterpret_i32_f32x4(self, a: f32x4) -> i32x4 { - unsafe { vreinterpretq_s32_f32(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: f32x4) -> i32x4 { + vreinterpretq_s32_f32(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn reinterpret_u8_f32x4(self, a: f32x4) -> u8x16 { - unsafe { vreinterpretq_u8_f32(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: f32x4) -> u8x16 { + vreinterpretq_u8_f32(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn reinterpret_u32_f32x4(self, a: f32x4) -> u32x4 { - unsafe { vreinterpretq_u32_f32(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: f32x4) -> u32x4 { + vreinterpretq_u32_f32(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn cvt_u32_f32x4(self, a: f32x4) -> u32x4 { - unsafe { vcvtq_u32_f32(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: f32x4) -> u32x4 { + vcvtq_u32_f32(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn cvt_u32_precise_f32x4(self, a: f32x4) -> u32x4 { @@ -335,7 +552,13 @@ impl Simd for Neon { } #[inline(always)] fn cvt_i32_f32x4(self, a: f32x4) -> i32x4 { - unsafe { vcvtq_s32_f32(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: f32x4) -> i32x4 { + vcvtq_s32_f32(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn cvt_i32_precise_f32x4(self, a: f32x4) -> i32x4 { @@ -343,7 +566,13 @@ impl Simd for Neon { } #[inline(always)] fn splat_i8x16(self, val: i8) -> i8x16 { - unsafe { vdupq_n_s8(val).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, val: i8) -> i8x16 { + vdupq_n_s8(val).simd_into(token) + } + ); + kernel(self, val) } #[inline(always)] fn load_array_i8x16(self, val: [i8; 16usize]) -> i8x16 { @@ -416,91 +645,211 @@ impl Simd for Neon { } #[inline(always)] fn add_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - unsafe { vaddq_s8(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i8x16, b: i8x16) -> i8x16 { + vaddq_s8(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn sub_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - unsafe { vsubq_s8(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i8x16, b: i8x16) -> i8x16 { + vsubq_s8(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn mul_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - unsafe { vmulq_s8(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i8x16, b: i8x16) -> i8x16 { + vmulq_s8(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn and_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - unsafe { vandq_s8(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i8x16, b: i8x16) -> i8x16 { + vandq_s8(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn or_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - unsafe { vorrq_s8(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i8x16, b: i8x16) -> i8x16 { + vorrq_s8(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn xor_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - unsafe { veorq_s8(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i8x16, b: i8x16) -> i8x16 { + veorq_s8(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn not_i8x16(self, a: i8x16) -> i8x16 { - unsafe { vmvnq_s8(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i8x16) -> i8x16 { + vmvnq_s8(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn shl_i8x16(self, a: i8x16, shift: u32) -> i8x16 { - unsafe { vshlq_s8(a.into(), vdupq_n_s8(shift as i8)).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i8x16, shift: u32) -> i8x16 { + vshlq_s8(a.into(), vdupq_n_s8(shift as i8)).simd_into(token) + } + ); + kernel(self, a, shift) } #[inline(always)] fn shlv_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - unsafe { vshlq_s8(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i8x16, b: i8x16) -> i8x16 { + vshlq_s8(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn shr_i8x16(self, a: i8x16, shift: u32) -> i8x16 { - unsafe { vshlq_s8(a.into(), vdupq_n_s8(-(shift as i8))).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i8x16, shift: u32) -> i8x16 { + vshlq_s8(a.into(), vdupq_n_s8(-(shift as i8))).simd_into(token) + } + ); + kernel(self, a, shift) } #[inline(always)] fn shrv_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - unsafe { vshlq_s8(a.into(), vnegq_s8(b.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i8x16, b: i8x16) -> i8x16 { + vshlq_s8(a.into(), vnegq_s8(b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_eq_i8x16(self, a: i8x16, b: i8x16) -> mask8x16 { - unsafe { vreinterpretq_s8_u8(vceqq_s8(a.into(), b.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i8x16, b: i8x16) -> mask8x16 { + vreinterpretq_s8_u8(vceqq_s8(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_lt_i8x16(self, a: i8x16, b: i8x16) -> mask8x16 { - unsafe { vreinterpretq_s8_u8(vcltq_s8(a.into(), b.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i8x16, b: i8x16) -> mask8x16 { + vreinterpretq_s8_u8(vcltq_s8(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_le_i8x16(self, a: i8x16, b: i8x16) -> mask8x16 { - unsafe { vreinterpretq_s8_u8(vcleq_s8(a.into(), b.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i8x16, b: i8x16) -> mask8x16 { + vreinterpretq_s8_u8(vcleq_s8(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_ge_i8x16(self, a: i8x16, b: i8x16) -> mask8x16 { - unsafe { vreinterpretq_s8_u8(vcgeq_s8(a.into(), b.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i8x16, b: i8x16) -> mask8x16 { + vreinterpretq_s8_u8(vcgeq_s8(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_gt_i8x16(self, a: i8x16, b: i8x16) -> mask8x16 { - unsafe { vreinterpretq_s8_u8(vcgtq_s8(a.into(), b.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i8x16, b: i8x16) -> mask8x16 { + vreinterpretq_s8_u8(vcgtq_s8(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn zip_low_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - let x = a.into(); - let y = b.into(); - unsafe { vzip1q_s8(x, y).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i8x16, b: i8x16) -> i8x16 { + let x = a.into(); + let y = b.into(); + vzip1q_s8(x, y).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn zip_high_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - let x = a.into(); - let y = b.into(); - unsafe { vzip2q_s8(x, y).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i8x16, b: i8x16) -> i8x16 { + let x = a.into(); + let y = b.into(); + vzip2q_s8(x, y).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_low_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - let x = a.into(); - let y = b.into(); - unsafe { vuzp1q_s8(x, y).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i8x16, b: i8x16) -> i8x16 { + let x = a.into(); + let y = b.into(); + vuzp1q_s8(x, y).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_high_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - let x = a.into(); - let y = b.into(); - unsafe { vuzp2q_s8(x, y).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i8x16, b: i8x16) -> i8x16 { + let x = a.into(); + let y = b.into(); + vuzp2q_s8(x, y).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn interleave_i8x16(self, a: i8x16, b: i8x16) -> (i8x16, i8x16) { @@ -512,15 +861,38 @@ impl Simd for Neon { } #[inline(always)] fn select_i8x16(self, a: mask8x16, b: i8x16, c: i8x16) -> i8x16 { - unsafe { vbslq_s8(vreinterpretq_u8_s8(a.into()), b.into(), c.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Neon, + a: mask8x16, + b: i8x16, + c: i8x16, + ) -> i8x16 { + vbslq_s8(vreinterpretq_u8_s8(a.into()), b.into(), c.into()).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] fn min_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - unsafe { vminq_s8(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i8x16, b: i8x16) -> i8x16 { + vminq_s8(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn max_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - unsafe { vmaxq_s8(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i8x16, b: i8x16) -> i8x16 { + vmaxq_s8(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn combine_i8x16(self, a: i8x16, b: i8x16) -> i8x32 { @@ -531,19 +903,43 @@ impl Simd for Neon { } #[inline(always)] fn neg_i8x16(self, a: i8x16) -> i8x16 { - unsafe { vnegq_s8(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i8x16) -> i8x16 { + vnegq_s8(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn reinterpret_u8_i8x16(self, a: i8x16) -> u8x16 { - unsafe { vreinterpretq_u8_s8(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i8x16) -> u8x16 { + vreinterpretq_u8_s8(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn reinterpret_u32_i8x16(self, a: i8x16) -> u32x4 { - unsafe { vreinterpretq_u32_s8(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i8x16) -> u32x4 { + vreinterpretq_u32_s8(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn splat_u8x16(self, val: u8) -> u8x16 { - unsafe { vdupq_n_u8(val).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, val: u8) -> u8x16 { + vdupq_n_u8(val).simd_into(token) + } + ); + kernel(self, val) } #[inline(always)] fn load_array_u8x16(self, val: [u8; 16usize]) -> u8x16 { @@ -616,91 +1012,211 @@ impl Simd for Neon { } #[inline(always)] fn add_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - unsafe { vaddq_u8(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u8x16, b: u8x16) -> u8x16 { + vaddq_u8(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn sub_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - unsafe { vsubq_u8(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u8x16, b: u8x16) -> u8x16 { + vsubq_u8(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn mul_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - unsafe { vmulq_u8(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u8x16, b: u8x16) -> u8x16 { + vmulq_u8(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn and_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - unsafe { vandq_u8(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u8x16, b: u8x16) -> u8x16 { + vandq_u8(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn or_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - unsafe { vorrq_u8(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u8x16, b: u8x16) -> u8x16 { + vorrq_u8(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn xor_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - unsafe { veorq_u8(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u8x16, b: u8x16) -> u8x16 { + veorq_u8(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn not_u8x16(self, a: u8x16) -> u8x16 { - unsafe { vmvnq_u8(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u8x16) -> u8x16 { + vmvnq_u8(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn shl_u8x16(self, a: u8x16, shift: u32) -> u8x16 { - unsafe { vshlq_u8(a.into(), vdupq_n_s8(shift as i8)).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u8x16, shift: u32) -> u8x16 { + vshlq_u8(a.into(), vdupq_n_s8(shift as i8)).simd_into(token) + } + ); + kernel(self, a, shift) } #[inline(always)] fn shlv_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - unsafe { vshlq_u8(a.into(), vreinterpretq_s8_u8(b.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u8x16, b: u8x16) -> u8x16 { + vshlq_u8(a.into(), vreinterpretq_s8_u8(b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn shr_u8x16(self, a: u8x16, shift: u32) -> u8x16 { - unsafe { vshlq_u8(a.into(), vdupq_n_s8(-(shift as i8))).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u8x16, shift: u32) -> u8x16 { + vshlq_u8(a.into(), vdupq_n_s8(-(shift as i8))).simd_into(token) + } + ); + kernel(self, a, shift) } #[inline(always)] fn shrv_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - unsafe { vshlq_u8(a.into(), vnegq_s8(vreinterpretq_s8_u8(b.into()))).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u8x16, b: u8x16) -> u8x16 { + vshlq_u8(a.into(), vnegq_s8(vreinterpretq_s8_u8(b.into()))).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_eq_u8x16(self, a: u8x16, b: u8x16) -> mask8x16 { - unsafe { vreinterpretq_s8_u8(vceqq_u8(a.into(), b.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u8x16, b: u8x16) -> mask8x16 { + vreinterpretq_s8_u8(vceqq_u8(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_lt_u8x16(self, a: u8x16, b: u8x16) -> mask8x16 { - unsafe { vreinterpretq_s8_u8(vcltq_u8(a.into(), b.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u8x16, b: u8x16) -> mask8x16 { + vreinterpretq_s8_u8(vcltq_u8(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_le_u8x16(self, a: u8x16, b: u8x16) -> mask8x16 { - unsafe { vreinterpretq_s8_u8(vcleq_u8(a.into(), b.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u8x16, b: u8x16) -> mask8x16 { + vreinterpretq_s8_u8(vcleq_u8(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_ge_u8x16(self, a: u8x16, b: u8x16) -> mask8x16 { - unsafe { vreinterpretq_s8_u8(vcgeq_u8(a.into(), b.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u8x16, b: u8x16) -> mask8x16 { + vreinterpretq_s8_u8(vcgeq_u8(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_gt_u8x16(self, a: u8x16, b: u8x16) -> mask8x16 { - unsafe { vreinterpretq_s8_u8(vcgtq_u8(a.into(), b.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u8x16, b: u8x16) -> mask8x16 { + vreinterpretq_s8_u8(vcgtq_u8(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn zip_low_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - let x = a.into(); - let y = b.into(); - unsafe { vzip1q_u8(x, y).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u8x16, b: u8x16) -> u8x16 { + let x = a.into(); + let y = b.into(); + vzip1q_u8(x, y).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn zip_high_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - let x = a.into(); - let y = b.into(); - unsafe { vzip2q_u8(x, y).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u8x16, b: u8x16) -> u8x16 { + let x = a.into(); + let y = b.into(); + vzip2q_u8(x, y).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_low_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - let x = a.into(); - let y = b.into(); - unsafe { vuzp1q_u8(x, y).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u8x16, b: u8x16) -> u8x16 { + let x = a.into(); + let y = b.into(); + vuzp1q_u8(x, y).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_high_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - let x = a.into(); - let y = b.into(); - unsafe { vuzp2q_u8(x, y).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u8x16, b: u8x16) -> u8x16 { + let x = a.into(); + let y = b.into(); + vuzp2q_u8(x, y).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn interleave_u8x16(self, a: u8x16, b: u8x16) -> (u8x16, u8x16) { @@ -712,15 +1228,38 @@ impl Simd for Neon { } #[inline(always)] fn select_u8x16(self, a: mask8x16, b: u8x16, c: u8x16) -> u8x16 { - unsafe { vbslq_u8(vreinterpretq_u8_s8(a.into()), b.into(), c.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Neon, + a: mask8x16, + b: u8x16, + c: u8x16, + ) -> u8x16 { + vbslq_u8(vreinterpretq_u8_s8(a.into()), b.into(), c.into()).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] fn min_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - unsafe { vminq_u8(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u8x16, b: u8x16) -> u8x16 { + vminq_u8(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn max_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - unsafe { vmaxq_u8(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u8x16, b: u8x16) -> u8x16 { + vmaxq_u8(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn combine_u8x16(self, a: u8x16, b: u8x16) -> u8x32 { @@ -731,22 +1270,36 @@ impl Simd for Neon { } #[inline(always)] fn widen_u8x16(self, a: u8x16) -> u16x16 { - unsafe { - let low = vmovl_u8(vget_low_u8(a.into())); - let high = vmovl_u8(vget_high_u8(a.into())); - uint16x8x2_t(low, high).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u8x16) -> u16x16 { + let low = vmovl_u8(vget_low_u8(a.into())); + let high = vmovl_u8(vget_high_u8(a.into())); + uint16x8x2_t(low, high).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn reinterpret_u32_u8x16(self, a: u8x16) -> u32x4 { - unsafe { vreinterpretq_u32_u8(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u8x16) -> u32x4 { + vreinterpretq_u32_u8(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn splat_mask8x16(self, val: bool) -> mask8x16 { - unsafe { - let val: i8 = if val { !0 } else { 0 }; - vdupq_n_s8(val).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, val: bool) -> mask8x16 { + let val: i8 = if val { !0 } else { 0 }; + vdupq_n_s8(val).simd_into(token) + } + ); + kernel(self, val) } #[inline(always)] fn load_array_mask8x16(self, val: [i8; 16usize]) -> mask8x16 { @@ -790,19 +1343,43 @@ impl Simd for Neon { } #[inline(always)] fn and_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x16 { - unsafe { vandq_s8(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: mask8x16, b: mask8x16) -> mask8x16 { + vandq_s8(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn or_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x16 { - unsafe { vorrq_s8(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: mask8x16, b: mask8x16) -> mask8x16 { + vorrq_s8(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn xor_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x16 { - unsafe { veorq_s8(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: mask8x16, b: mask8x16) -> mask8x16 { + veorq_s8(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn not_mask8x16(self, a: mask8x16) -> mask8x16 { - unsafe { vmvnq_s8(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: mask8x16) -> mask8x16 { + vmvnq_s8(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn select_mask8x16( @@ -811,27 +1388,68 @@ impl Simd for Neon { b: mask8x16, c: mask8x16, ) -> mask8x16 { - unsafe { vbslq_s8(vreinterpretq_u8_s8(a.into()), b.into(), c.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Neon, + a: mask8x16, + b: mask8x16, + c: mask8x16, + ) -> mask8x16 { + vbslq_s8(vreinterpretq_u8_s8(a.into()), b.into(), c.into()).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] fn simd_eq_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x16 { - unsafe { vreinterpretq_s8_u8(vceqq_s8(a.into(), b.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: mask8x16, b: mask8x16) -> mask8x16 { + vreinterpretq_s8_u8(vceqq_s8(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn any_true_mask8x16(self, a: mask8x16) -> bool { - unsafe { vmaxvq_u32(vreinterpretq_u32_s8(a.into())) != 0 } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: mask8x16) -> bool { + vmaxvq_u32(vreinterpretq_u32_s8(a.into())) != 0 + } + ); + kernel(self, a) } #[inline(always)] fn all_true_mask8x16(self, a: mask8x16) -> bool { - unsafe { vminvq_u32(vreinterpretq_u32_s8(a.into())) == 0xffffffff } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: mask8x16) -> bool { + vminvq_u32(vreinterpretq_u32_s8(a.into())) == 0xffffffff + } + ); + kernel(self, a) } #[inline(always)] fn any_false_mask8x16(self, a: mask8x16) -> bool { - unsafe { vminvq_u32(vreinterpretq_u32_s8(a.into())) != 0xffffffff } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: mask8x16) -> bool { + vminvq_u32(vreinterpretq_u32_s8(a.into())) != 0xffffffff + } + ); + kernel(self, a) } #[inline(always)] fn all_false_mask8x16(self, a: mask8x16) -> bool { - unsafe { vmaxvq_u32(vreinterpretq_u32_s8(a.into())) == 0 } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: mask8x16) -> bool { + vmaxvq_u32(vreinterpretq_u32_s8(a.into())) == 0 + } + ); + kernel(self, a) } #[inline(always)] fn combine_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x32 { @@ -842,7 +1460,13 @@ impl Simd for Neon { } #[inline(always)] fn splat_i16x8(self, val: i16) -> i16x8 { - unsafe { vdupq_n_s16(val).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, val: i16) -> i16x8 { + vdupq_n_s16(val).simd_into(token) + } + ); + kernel(self, val) } #[inline(always)] fn load_array_i16x8(self, val: [i16; 8usize]) -> i16x8 { @@ -915,91 +1539,211 @@ impl Simd for Neon { } #[inline(always)] fn add_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - unsafe { vaddq_s16(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i16x8, b: i16x8) -> i16x8 { + vaddq_s16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn sub_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - unsafe { vsubq_s16(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i16x8, b: i16x8) -> i16x8 { + vsubq_s16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn mul_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - unsafe { vmulq_s16(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i16x8, b: i16x8) -> i16x8 { + vmulq_s16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn and_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - unsafe { vandq_s16(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i16x8, b: i16x8) -> i16x8 { + vandq_s16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn or_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - unsafe { vorrq_s16(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i16x8, b: i16x8) -> i16x8 { + vorrq_s16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn xor_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - unsafe { veorq_s16(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i16x8, b: i16x8) -> i16x8 { + veorq_s16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn not_i16x8(self, a: i16x8) -> i16x8 { - unsafe { vmvnq_s16(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i16x8) -> i16x8 { + vmvnq_s16(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn shl_i16x8(self, a: i16x8, shift: u32) -> i16x8 { - unsafe { vshlq_s16(a.into(), vdupq_n_s16(shift as i16)).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i16x8, shift: u32) -> i16x8 { + vshlq_s16(a.into(), vdupq_n_s16(shift as i16)).simd_into(token) + } + ); + kernel(self, a, shift) } #[inline(always)] fn shlv_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - unsafe { vshlq_s16(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i16x8, b: i16x8) -> i16x8 { + vshlq_s16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn shr_i16x8(self, a: i16x8, shift: u32) -> i16x8 { - unsafe { vshlq_s16(a.into(), vdupq_n_s16(-(shift as i16))).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i16x8, shift: u32) -> i16x8 { + vshlq_s16(a.into(), vdupq_n_s16(-(shift as i16))).simd_into(token) + } + ); + kernel(self, a, shift) } #[inline(always)] fn shrv_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - unsafe { vshlq_s16(a.into(), vnegq_s16(b.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i16x8, b: i16x8) -> i16x8 { + vshlq_s16(a.into(), vnegq_s16(b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_eq_i16x8(self, a: i16x8, b: i16x8) -> mask16x8 { - unsafe { vreinterpretq_s16_u16(vceqq_s16(a.into(), b.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i16x8, b: i16x8) -> mask16x8 { + vreinterpretq_s16_u16(vceqq_s16(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_lt_i16x8(self, a: i16x8, b: i16x8) -> mask16x8 { - unsafe { vreinterpretq_s16_u16(vcltq_s16(a.into(), b.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i16x8, b: i16x8) -> mask16x8 { + vreinterpretq_s16_u16(vcltq_s16(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_le_i16x8(self, a: i16x8, b: i16x8) -> mask16x8 { - unsafe { vreinterpretq_s16_u16(vcleq_s16(a.into(), b.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i16x8, b: i16x8) -> mask16x8 { + vreinterpretq_s16_u16(vcleq_s16(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_ge_i16x8(self, a: i16x8, b: i16x8) -> mask16x8 { - unsafe { vreinterpretq_s16_u16(vcgeq_s16(a.into(), b.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i16x8, b: i16x8) -> mask16x8 { + vreinterpretq_s16_u16(vcgeq_s16(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_gt_i16x8(self, a: i16x8, b: i16x8) -> mask16x8 { - unsafe { vreinterpretq_s16_u16(vcgtq_s16(a.into(), b.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i16x8, b: i16x8) -> mask16x8 { + vreinterpretq_s16_u16(vcgtq_s16(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn zip_low_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - let x = a.into(); - let y = b.into(); - unsafe { vzip1q_s16(x, y).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i16x8, b: i16x8) -> i16x8 { + let x = a.into(); + let y = b.into(); + vzip1q_s16(x, y).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn zip_high_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - let x = a.into(); - let y = b.into(); - unsafe { vzip2q_s16(x, y).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i16x8, b: i16x8) -> i16x8 { + let x = a.into(); + let y = b.into(); + vzip2q_s16(x, y).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_low_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - let x = a.into(); - let y = b.into(); - unsafe { vuzp1q_s16(x, y).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i16x8, b: i16x8) -> i16x8 { + let x = a.into(); + let y = b.into(); + vuzp1q_s16(x, y).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_high_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - let x = a.into(); - let y = b.into(); - unsafe { vuzp2q_s16(x, y).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i16x8, b: i16x8) -> i16x8 { + let x = a.into(); + let y = b.into(); + vuzp2q_s16(x, y).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn interleave_i16x8(self, a: i16x8, b: i16x8) -> (i16x8, i16x8) { @@ -1011,15 +1755,38 @@ impl Simd for Neon { } #[inline(always)] fn select_i16x8(self, a: mask16x8, b: i16x8, c: i16x8) -> i16x8 { - unsafe { vbslq_s16(vreinterpretq_u16_s16(a.into()), b.into(), c.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Neon, + a: mask16x8, + b: i16x8, + c: i16x8, + ) -> i16x8 { + vbslq_s16(vreinterpretq_u16_s16(a.into()), b.into(), c.into()).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] fn min_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - unsafe { vminq_s16(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i16x8, b: i16x8) -> i16x8 { + vminq_s16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn max_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - unsafe { vmaxq_s16(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i16x8, b: i16x8) -> i16x8 { + vmaxq_s16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn combine_i16x8(self, a: i16x8, b: i16x8) -> i16x16 { @@ -1030,19 +1797,43 @@ impl Simd for Neon { } #[inline(always)] fn neg_i16x8(self, a: i16x8) -> i16x8 { - unsafe { vnegq_s16(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i16x8) -> i16x8 { + vnegq_s16(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn reinterpret_u8_i16x8(self, a: i16x8) -> u8x16 { - unsafe { vreinterpretq_u8_s16(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i16x8) -> u8x16 { + vreinterpretq_u8_s16(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn reinterpret_u32_i16x8(self, a: i16x8) -> u32x4 { - unsafe { vreinterpretq_u32_s16(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i16x8) -> u32x4 { + vreinterpretq_u32_s16(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn splat_u16x8(self, val: u16) -> u16x8 { - unsafe { vdupq_n_u16(val).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, val: u16) -> u16x8 { + vdupq_n_u16(val).simd_into(token) + } + ); + kernel(self, val) } #[inline(always)] fn load_array_u16x8(self, val: [u16; 8usize]) -> u16x8 { @@ -1115,91 +1906,211 @@ impl Simd for Neon { } #[inline(always)] fn add_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - unsafe { vaddq_u16(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u16x8, b: u16x8) -> u16x8 { + vaddq_u16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn sub_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - unsafe { vsubq_u16(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u16x8, b: u16x8) -> u16x8 { + vsubq_u16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn mul_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - unsafe { vmulq_u16(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u16x8, b: u16x8) -> u16x8 { + vmulq_u16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn and_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - unsafe { vandq_u16(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u16x8, b: u16x8) -> u16x8 { + vandq_u16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn or_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - unsafe { vorrq_u16(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u16x8, b: u16x8) -> u16x8 { + vorrq_u16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn xor_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - unsafe { veorq_u16(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u16x8, b: u16x8) -> u16x8 { + veorq_u16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn not_u16x8(self, a: u16x8) -> u16x8 { - unsafe { vmvnq_u16(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u16x8) -> u16x8 { + vmvnq_u16(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn shl_u16x8(self, a: u16x8, shift: u32) -> u16x8 { - unsafe { vshlq_u16(a.into(), vdupq_n_s16(shift as i16)).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u16x8, shift: u32) -> u16x8 { + vshlq_u16(a.into(), vdupq_n_s16(shift as i16)).simd_into(token) + } + ); + kernel(self, a, shift) } #[inline(always)] fn shlv_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - unsafe { vshlq_u16(a.into(), vreinterpretq_s16_u16(b.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u16x8, b: u16x8) -> u16x8 { + vshlq_u16(a.into(), vreinterpretq_s16_u16(b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn shr_u16x8(self, a: u16x8, shift: u32) -> u16x8 { - unsafe { vshlq_u16(a.into(), vdupq_n_s16(-(shift as i16))).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u16x8, shift: u32) -> u16x8 { + vshlq_u16(a.into(), vdupq_n_s16(-(shift as i16))).simd_into(token) + } + ); + kernel(self, a, shift) } #[inline(always)] fn shrv_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - unsafe { vshlq_u16(a.into(), vnegq_s16(vreinterpretq_s16_u16(b.into()))).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u16x8, b: u16x8) -> u16x8 { + vshlq_u16(a.into(), vnegq_s16(vreinterpretq_s16_u16(b.into()))).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_eq_u16x8(self, a: u16x8, b: u16x8) -> mask16x8 { - unsafe { vreinterpretq_s16_u16(vceqq_u16(a.into(), b.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u16x8, b: u16x8) -> mask16x8 { + vreinterpretq_s16_u16(vceqq_u16(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_lt_u16x8(self, a: u16x8, b: u16x8) -> mask16x8 { - unsafe { vreinterpretq_s16_u16(vcltq_u16(a.into(), b.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u16x8, b: u16x8) -> mask16x8 { + vreinterpretq_s16_u16(vcltq_u16(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_le_u16x8(self, a: u16x8, b: u16x8) -> mask16x8 { - unsafe { vreinterpretq_s16_u16(vcleq_u16(a.into(), b.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u16x8, b: u16x8) -> mask16x8 { + vreinterpretq_s16_u16(vcleq_u16(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_ge_u16x8(self, a: u16x8, b: u16x8) -> mask16x8 { - unsafe { vreinterpretq_s16_u16(vcgeq_u16(a.into(), b.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u16x8, b: u16x8) -> mask16x8 { + vreinterpretq_s16_u16(vcgeq_u16(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_gt_u16x8(self, a: u16x8, b: u16x8) -> mask16x8 { - unsafe { vreinterpretq_s16_u16(vcgtq_u16(a.into(), b.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u16x8, b: u16x8) -> mask16x8 { + vreinterpretq_s16_u16(vcgtq_u16(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn zip_low_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - let x = a.into(); - let y = b.into(); - unsafe { vzip1q_u16(x, y).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u16x8, b: u16x8) -> u16x8 { + let x = a.into(); + let y = b.into(); + vzip1q_u16(x, y).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn zip_high_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - let x = a.into(); - let y = b.into(); - unsafe { vzip2q_u16(x, y).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u16x8, b: u16x8) -> u16x8 { + let x = a.into(); + let y = b.into(); + vzip2q_u16(x, y).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_low_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - let x = a.into(); - let y = b.into(); - unsafe { vuzp1q_u16(x, y).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u16x8, b: u16x8) -> u16x8 { + let x = a.into(); + let y = b.into(); + vuzp1q_u16(x, y).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_high_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - let x = a.into(); - let y = b.into(); - unsafe { vuzp2q_u16(x, y).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u16x8, b: u16x8) -> u16x8 { + let x = a.into(); + let y = b.into(); + vuzp2q_u16(x, y).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn interleave_u16x8(self, a: u16x8, b: u16x8) -> (u16x8, u16x8) { @@ -1211,15 +2122,38 @@ impl Simd for Neon { } #[inline(always)] fn select_u16x8(self, a: mask16x8, b: u16x8, c: u16x8) -> u16x8 { - unsafe { vbslq_u16(vreinterpretq_u16_s16(a.into()), b.into(), c.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Neon, + a: mask16x8, + b: u16x8, + c: u16x8, + ) -> u16x8 { + vbslq_u16(vreinterpretq_u16_s16(a.into()), b.into(), c.into()).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] fn min_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - unsafe { vminq_u16(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u16x8, b: u16x8) -> u16x8 { + vminq_u16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn max_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - unsafe { vmaxq_u16(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u16x8, b: u16x8) -> u16x8 { + vmaxq_u16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn combine_u16x8(self, a: u16x8, b: u16x8) -> u16x16 { @@ -1230,18 +2164,34 @@ impl Simd for Neon { } #[inline(always)] fn reinterpret_u8_u16x8(self, a: u16x8) -> u8x16 { - unsafe { vreinterpretq_u8_u16(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u16x8) -> u8x16 { + vreinterpretq_u8_u16(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn reinterpret_u32_u16x8(self, a: u16x8) -> u32x4 { - unsafe { vreinterpretq_u32_u16(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u16x8) -> u32x4 { + vreinterpretq_u32_u16(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn splat_mask16x8(self, val: bool) -> mask16x8 { - unsafe { - let val: i16 = if val { !0 } else { 0 }; - vdupq_n_s16(val).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, val: bool) -> mask16x8 { + let val: i16 = if val { !0 } else { 0 }; + vdupq_n_s16(val).simd_into(token) + } + ); + kernel(self, val) } #[inline(always)] fn load_array_mask16x8(self, val: [i16; 8usize]) -> mask16x8 { @@ -1277,19 +2227,43 @@ impl Simd for Neon { } #[inline(always)] fn and_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x8 { - unsafe { vandq_s16(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: mask16x8, b: mask16x8) -> mask16x8 { + vandq_s16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn or_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x8 { - unsafe { vorrq_s16(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: mask16x8, b: mask16x8) -> mask16x8 { + vorrq_s16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn xor_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x8 { - unsafe { veorq_s16(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: mask16x8, b: mask16x8) -> mask16x8 { + veorq_s16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn not_mask16x8(self, a: mask16x8) -> mask16x8 { - unsafe { vmvnq_s16(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: mask16x8) -> mask16x8 { + vmvnq_s16(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn select_mask16x8( @@ -1298,27 +2272,68 @@ impl Simd for Neon { b: mask16x8, c: mask16x8, ) -> mask16x8 { - unsafe { vbslq_s16(vreinterpretq_u16_s16(a.into()), b.into(), c.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Neon, + a: mask16x8, + b: mask16x8, + c: mask16x8, + ) -> mask16x8 { + vbslq_s16(vreinterpretq_u16_s16(a.into()), b.into(), c.into()).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] fn simd_eq_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x8 { - unsafe { vreinterpretq_s16_u16(vceqq_s16(a.into(), b.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: mask16x8, b: mask16x8) -> mask16x8 { + vreinterpretq_s16_u16(vceqq_s16(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn any_true_mask16x8(self, a: mask16x8) -> bool { - unsafe { vmaxvq_u32(vreinterpretq_u32_s16(a.into())) != 0 } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: mask16x8) -> bool { + vmaxvq_u32(vreinterpretq_u32_s16(a.into())) != 0 + } + ); + kernel(self, a) } #[inline(always)] fn all_true_mask16x8(self, a: mask16x8) -> bool { - unsafe { vminvq_u32(vreinterpretq_u32_s16(a.into())) == 0xffffffff } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: mask16x8) -> bool { + vminvq_u32(vreinterpretq_u32_s16(a.into())) == 0xffffffff + } + ); + kernel(self, a) } #[inline(always)] fn any_false_mask16x8(self, a: mask16x8) -> bool { - unsafe { vminvq_u32(vreinterpretq_u32_s16(a.into())) != 0xffffffff } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: mask16x8) -> bool { + vminvq_u32(vreinterpretq_u32_s16(a.into())) != 0xffffffff + } + ); + kernel(self, a) } #[inline(always)] fn all_false_mask16x8(self, a: mask16x8) -> bool { - unsafe { vmaxvq_u32(vreinterpretq_u32_s16(a.into())) == 0 } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: mask16x8) -> bool { + vmaxvq_u32(vreinterpretq_u32_s16(a.into())) == 0 + } + ); + kernel(self, a) } #[inline(always)] fn combine_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x16 { @@ -1329,7 +2344,13 @@ impl Simd for Neon { } #[inline(always)] fn splat_i32x4(self, val: i32) -> i32x4 { - unsafe { vdupq_n_s32(val).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, val: i32) -> i32x4 { + vdupq_n_s32(val).simd_into(token) + } + ); + kernel(self, val) } #[inline(always)] fn load_array_i32x4(self, val: [i32; 4usize]) -> i32x4 { @@ -1402,91 +2423,211 @@ impl Simd for Neon { } #[inline(always)] fn add_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - unsafe { vaddq_s32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i32x4, b: i32x4) -> i32x4 { + vaddq_s32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn sub_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - unsafe { vsubq_s32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i32x4, b: i32x4) -> i32x4 { + vsubq_s32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn mul_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - unsafe { vmulq_s32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i32x4, b: i32x4) -> i32x4 { + vmulq_s32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn and_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - unsafe { vandq_s32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i32x4, b: i32x4) -> i32x4 { + vandq_s32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn or_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - unsafe { vorrq_s32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i32x4, b: i32x4) -> i32x4 { + vorrq_s32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn xor_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - unsafe { veorq_s32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i32x4, b: i32x4) -> i32x4 { + veorq_s32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn not_i32x4(self, a: i32x4) -> i32x4 { - unsafe { vmvnq_s32(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i32x4) -> i32x4 { + vmvnq_s32(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn shl_i32x4(self, a: i32x4, shift: u32) -> i32x4 { - unsafe { vshlq_s32(a.into(), vdupq_n_s32(shift.cast_signed())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i32x4, shift: u32) -> i32x4 { + vshlq_s32(a.into(), vdupq_n_s32(shift.cast_signed())).simd_into(token) + } + ); + kernel(self, a, shift) } #[inline(always)] fn shlv_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - unsafe { vshlq_s32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i32x4, b: i32x4) -> i32x4 { + vshlq_s32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn shr_i32x4(self, a: i32x4, shift: u32) -> i32x4 { - unsafe { vshlq_s32(a.into(), vdupq_n_s32(-shift.cast_signed())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i32x4, shift: u32) -> i32x4 { + vshlq_s32(a.into(), vdupq_n_s32(-shift.cast_signed())).simd_into(token) + } + ); + kernel(self, a, shift) } #[inline(always)] fn shrv_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - unsafe { vshlq_s32(a.into(), vnegq_s32(b.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i32x4, b: i32x4) -> i32x4 { + vshlq_s32(a.into(), vnegq_s32(b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_eq_i32x4(self, a: i32x4, b: i32x4) -> mask32x4 { - unsafe { vreinterpretq_s32_u32(vceqq_s32(a.into(), b.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i32x4, b: i32x4) -> mask32x4 { + vreinterpretq_s32_u32(vceqq_s32(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_lt_i32x4(self, a: i32x4, b: i32x4) -> mask32x4 { - unsafe { vreinterpretq_s32_u32(vcltq_s32(a.into(), b.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i32x4, b: i32x4) -> mask32x4 { + vreinterpretq_s32_u32(vcltq_s32(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_le_i32x4(self, a: i32x4, b: i32x4) -> mask32x4 { - unsafe { vreinterpretq_s32_u32(vcleq_s32(a.into(), b.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i32x4, b: i32x4) -> mask32x4 { + vreinterpretq_s32_u32(vcleq_s32(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_ge_i32x4(self, a: i32x4, b: i32x4) -> mask32x4 { - unsafe { vreinterpretq_s32_u32(vcgeq_s32(a.into(), b.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i32x4, b: i32x4) -> mask32x4 { + vreinterpretq_s32_u32(vcgeq_s32(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_gt_i32x4(self, a: i32x4, b: i32x4) -> mask32x4 { - unsafe { vreinterpretq_s32_u32(vcgtq_s32(a.into(), b.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i32x4, b: i32x4) -> mask32x4 { + vreinterpretq_s32_u32(vcgtq_s32(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn zip_low_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - let x = a.into(); - let y = b.into(); - unsafe { vzip1q_s32(x, y).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i32x4, b: i32x4) -> i32x4 { + let x = a.into(); + let y = b.into(); + vzip1q_s32(x, y).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn zip_high_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - let x = a.into(); - let y = b.into(); - unsafe { vzip2q_s32(x, y).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i32x4, b: i32x4) -> i32x4 { + let x = a.into(); + let y = b.into(); + vzip2q_s32(x, y).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_low_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - let x = a.into(); - let y = b.into(); - unsafe { vuzp1q_s32(x, y).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i32x4, b: i32x4) -> i32x4 { + let x = a.into(); + let y = b.into(); + vuzp1q_s32(x, y).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_high_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - let x = a.into(); - let y = b.into(); - unsafe { vuzp2q_s32(x, y).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i32x4, b: i32x4) -> i32x4 { + let x = a.into(); + let y = b.into(); + vuzp2q_s32(x, y).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn interleave_i32x4(self, a: i32x4, b: i32x4) -> (i32x4, i32x4) { @@ -1498,15 +2639,38 @@ impl Simd for Neon { } #[inline(always)] fn select_i32x4(self, a: mask32x4, b: i32x4, c: i32x4) -> i32x4 { - unsafe { vbslq_s32(vreinterpretq_u32_s32(a.into()), b.into(), c.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Neon, + a: mask32x4, + b: i32x4, + c: i32x4, + ) -> i32x4 { + vbslq_s32(vreinterpretq_u32_s32(a.into()), b.into(), c.into()).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] fn min_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - unsafe { vminq_s32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i32x4, b: i32x4) -> i32x4 { + vminq_s32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn max_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - unsafe { vmaxq_s32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i32x4, b: i32x4) -> i32x4 { + vmaxq_s32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn combine_i32x4(self, a: i32x4, b: i32x4) -> i32x8 { @@ -1517,23 +2681,53 @@ impl Simd for Neon { } #[inline(always)] fn neg_i32x4(self, a: i32x4) -> i32x4 { - unsafe { vnegq_s32(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i32x4) -> i32x4 { + vnegq_s32(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn reinterpret_u8_i32x4(self, a: i32x4) -> u8x16 { - unsafe { vreinterpretq_u8_s32(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i32x4) -> u8x16 { + vreinterpretq_u8_s32(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn reinterpret_u32_i32x4(self, a: i32x4) -> u32x4 { - unsafe { vreinterpretq_u32_s32(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i32x4) -> u32x4 { + vreinterpretq_u32_s32(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn cvt_f32_i32x4(self, a: i32x4) -> f32x4 { - unsafe { vcvtq_f32_s32(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i32x4) -> f32x4 { + vcvtq_f32_s32(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn splat_u32x4(self, val: u32) -> u32x4 { - unsafe { vdupq_n_u32(val).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, val: u32) -> u32x4 { + vdupq_n_u32(val).simd_into(token) + } + ); + kernel(self, val) } #[inline(always)] fn load_array_u32x4(self, val: [u32; 4usize]) -> u32x4 { @@ -1606,91 +2800,211 @@ impl Simd for Neon { } #[inline(always)] fn add_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - unsafe { vaddq_u32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u32x4, b: u32x4) -> u32x4 { + vaddq_u32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn sub_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - unsafe { vsubq_u32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u32x4, b: u32x4) -> u32x4 { + vsubq_u32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn mul_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - unsafe { vmulq_u32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u32x4, b: u32x4) -> u32x4 { + vmulq_u32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn and_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - unsafe { vandq_u32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u32x4, b: u32x4) -> u32x4 { + vandq_u32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn or_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - unsafe { vorrq_u32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u32x4, b: u32x4) -> u32x4 { + vorrq_u32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn xor_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - unsafe { veorq_u32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u32x4, b: u32x4) -> u32x4 { + veorq_u32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn not_u32x4(self, a: u32x4) -> u32x4 { - unsafe { vmvnq_u32(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u32x4) -> u32x4 { + vmvnq_u32(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn shl_u32x4(self, a: u32x4, shift: u32) -> u32x4 { - unsafe { vshlq_u32(a.into(), vdupq_n_s32(shift.cast_signed())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u32x4, shift: u32) -> u32x4 { + vshlq_u32(a.into(), vdupq_n_s32(shift.cast_signed())).simd_into(token) + } + ); + kernel(self, a, shift) } #[inline(always)] fn shlv_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - unsafe { vshlq_u32(a.into(), vreinterpretq_s32_u32(b.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u32x4, b: u32x4) -> u32x4 { + vshlq_u32(a.into(), vreinterpretq_s32_u32(b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn shr_u32x4(self, a: u32x4, shift: u32) -> u32x4 { - unsafe { vshlq_u32(a.into(), vdupq_n_s32(-shift.cast_signed())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u32x4, shift: u32) -> u32x4 { + vshlq_u32(a.into(), vdupq_n_s32(-shift.cast_signed())).simd_into(token) + } + ); + kernel(self, a, shift) } #[inline(always)] fn shrv_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - unsafe { vshlq_u32(a.into(), vnegq_s32(vreinterpretq_s32_u32(b.into()))).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u32x4, b: u32x4) -> u32x4 { + vshlq_u32(a.into(), vnegq_s32(vreinterpretq_s32_u32(b.into()))).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_eq_u32x4(self, a: u32x4, b: u32x4) -> mask32x4 { - unsafe { vreinterpretq_s32_u32(vceqq_u32(a.into(), b.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u32x4, b: u32x4) -> mask32x4 { + vreinterpretq_s32_u32(vceqq_u32(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_lt_u32x4(self, a: u32x4, b: u32x4) -> mask32x4 { - unsafe { vreinterpretq_s32_u32(vcltq_u32(a.into(), b.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u32x4, b: u32x4) -> mask32x4 { + vreinterpretq_s32_u32(vcltq_u32(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_le_u32x4(self, a: u32x4, b: u32x4) -> mask32x4 { - unsafe { vreinterpretq_s32_u32(vcleq_u32(a.into(), b.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u32x4, b: u32x4) -> mask32x4 { + vreinterpretq_s32_u32(vcleq_u32(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_ge_u32x4(self, a: u32x4, b: u32x4) -> mask32x4 { - unsafe { vreinterpretq_s32_u32(vcgeq_u32(a.into(), b.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u32x4, b: u32x4) -> mask32x4 { + vreinterpretq_s32_u32(vcgeq_u32(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_gt_u32x4(self, a: u32x4, b: u32x4) -> mask32x4 { - unsafe { vreinterpretq_s32_u32(vcgtq_u32(a.into(), b.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u32x4, b: u32x4) -> mask32x4 { + vreinterpretq_s32_u32(vcgtq_u32(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn zip_low_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - let x = a.into(); - let y = b.into(); - unsafe { vzip1q_u32(x, y).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u32x4, b: u32x4) -> u32x4 { + let x = a.into(); + let y = b.into(); + vzip1q_u32(x, y).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn zip_high_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - let x = a.into(); - let y = b.into(); - unsafe { vzip2q_u32(x, y).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u32x4, b: u32x4) -> u32x4 { + let x = a.into(); + let y = b.into(); + vzip2q_u32(x, y).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_low_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - let x = a.into(); - let y = b.into(); - unsafe { vuzp1q_u32(x, y).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u32x4, b: u32x4) -> u32x4 { + let x = a.into(); + let y = b.into(); + vuzp1q_u32(x, y).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_high_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - let x = a.into(); - let y = b.into(); - unsafe { vuzp2q_u32(x, y).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u32x4, b: u32x4) -> u32x4 { + let x = a.into(); + let y = b.into(); + vuzp2q_u32(x, y).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn interleave_u32x4(self, a: u32x4, b: u32x4) -> (u32x4, u32x4) { @@ -1702,15 +3016,38 @@ impl Simd for Neon { } #[inline(always)] fn select_u32x4(self, a: mask32x4, b: u32x4, c: u32x4) -> u32x4 { - unsafe { vbslq_u32(vreinterpretq_u32_s32(a.into()), b.into(), c.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Neon, + a: mask32x4, + b: u32x4, + c: u32x4, + ) -> u32x4 { + vbslq_u32(vreinterpretq_u32_s32(a.into()), b.into(), c.into()).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] fn min_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - unsafe { vminq_u32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u32x4, b: u32x4) -> u32x4 { + vminq_u32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn max_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - unsafe { vmaxq_u32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u32x4, b: u32x4) -> u32x4 { + vmaxq_u32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn combine_u32x4(self, a: u32x4, b: u32x4) -> u32x8 { @@ -1721,18 +3058,34 @@ impl Simd for Neon { } #[inline(always)] fn reinterpret_u8_u32x4(self, a: u32x4) -> u8x16 { - unsafe { vreinterpretq_u8_u32(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u32x4) -> u8x16 { + vreinterpretq_u8_u32(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn cvt_f32_u32x4(self, a: u32x4) -> f32x4 { - unsafe { vcvtq_f32_u32(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u32x4) -> f32x4 { + vcvtq_f32_u32(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn splat_mask32x4(self, val: bool) -> mask32x4 { - unsafe { - let val: i32 = if val { !0 } else { 0 }; - vdupq_n_s32(val).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, val: bool) -> mask32x4 { + let val: i32 = if val { !0 } else { 0 }; + vdupq_n_s32(val).simd_into(token) + } + ); + kernel(self, val) } #[inline(always)] fn load_array_mask32x4(self, val: [i32; 4usize]) -> mask32x4 { @@ -1766,19 +3119,43 @@ impl Simd for Neon { } #[inline(always)] fn and_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x4 { - unsafe { vandq_s32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: mask32x4, b: mask32x4) -> mask32x4 { + vandq_s32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn or_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x4 { - unsafe { vorrq_s32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: mask32x4, b: mask32x4) -> mask32x4 { + vorrq_s32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn xor_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x4 { - unsafe { veorq_s32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: mask32x4, b: mask32x4) -> mask32x4 { + veorq_s32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn not_mask32x4(self, a: mask32x4) -> mask32x4 { - unsafe { vmvnq_s32(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: mask32x4) -> mask32x4 { + vmvnq_s32(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn select_mask32x4( @@ -1787,27 +3164,68 @@ impl Simd for Neon { b: mask32x4, c: mask32x4, ) -> mask32x4 { - unsafe { vbslq_s32(vreinterpretq_u32_s32(a.into()), b.into(), c.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Neon, + a: mask32x4, + b: mask32x4, + c: mask32x4, + ) -> mask32x4 { + vbslq_s32(vreinterpretq_u32_s32(a.into()), b.into(), c.into()).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] fn simd_eq_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x4 { - unsafe { vreinterpretq_s32_u32(vceqq_s32(a.into(), b.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: mask32x4, b: mask32x4) -> mask32x4 { + vreinterpretq_s32_u32(vceqq_s32(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn any_true_mask32x4(self, a: mask32x4) -> bool { - unsafe { vmaxvq_u32(vreinterpretq_u32_s32(a.into())) != 0 } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: mask32x4) -> bool { + vmaxvq_u32(vreinterpretq_u32_s32(a.into())) != 0 + } + ); + kernel(self, a) } #[inline(always)] fn all_true_mask32x4(self, a: mask32x4) -> bool { - unsafe { vminvq_u32(vreinterpretq_u32_s32(a.into())) == 0xffffffff } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: mask32x4) -> bool { + vminvq_u32(vreinterpretq_u32_s32(a.into())) == 0xffffffff + } + ); + kernel(self, a) } #[inline(always)] fn any_false_mask32x4(self, a: mask32x4) -> bool { - unsafe { vminvq_u32(vreinterpretq_u32_s32(a.into())) != 0xffffffff } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: mask32x4) -> bool { + vminvq_u32(vreinterpretq_u32_s32(a.into())) != 0xffffffff + } + ); + kernel(self, a) } #[inline(always)] fn all_false_mask32x4(self, a: mask32x4) -> bool { - unsafe { vmaxvq_u32(vreinterpretq_u32_s32(a.into())) == 0 } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: mask32x4) -> bool { + vmaxvq_u32(vreinterpretq_u32_s32(a.into())) == 0 + } + ); + kernel(self, a) } #[inline(always)] fn combine_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x8 { @@ -1818,7 +3236,13 @@ impl Simd for Neon { } #[inline(always)] fn splat_f64x2(self, val: f64) -> f64x2 { - unsafe { vdupq_n_f64(val).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, val: f64) -> f64x2 { + vdupq_n_f64(val).simd_into(token) + } + ); + kernel(self, val) } #[inline(always)] fn load_array_f64x2(self, val: [f64; 2usize]) -> f64x2 { @@ -1891,86 +3315,192 @@ impl Simd for Neon { } #[inline(always)] fn abs_f64x2(self, a: f64x2) -> f64x2 { - unsafe { vabsq_f64(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: f64x2) -> f64x2 { + vabsq_f64(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn neg_f64x2(self, a: f64x2) -> f64x2 { - unsafe { vnegq_f64(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: f64x2) -> f64x2 { + vnegq_f64(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn sqrt_f64x2(self, a: f64x2) -> f64x2 { - unsafe { vsqrtq_f64(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: f64x2) -> f64x2 { + vsqrtq_f64(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn approximate_recip_f64x2(self, a: f64x2) -> f64x2 { - unsafe { vrecpeq_f64(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: f64x2) -> f64x2 { + vrecpeq_f64(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn add_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - unsafe { vaddq_f64(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: f64x2, b: f64x2) -> f64x2 { + vaddq_f64(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn sub_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - unsafe { vsubq_f64(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: f64x2, b: f64x2) -> f64x2 { + vsubq_f64(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn mul_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - unsafe { vmulq_f64(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: f64x2, b: f64x2) -> f64x2 { + vmulq_f64(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn div_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - unsafe { vdivq_f64(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: f64x2, b: f64x2) -> f64x2 { + vdivq_f64(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn copysign_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - unsafe { - let sign_mask = vdupq_n_u64(1 << 63); - vbslq_f64(sign_mask, b.into(), a.into()).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: f64x2, b: f64x2) -> f64x2 { + let sign_mask = vdupq_n_u64(1 << 63); + vbslq_f64(sign_mask, b.into(), a.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_eq_f64x2(self, a: f64x2, b: f64x2) -> mask64x2 { - unsafe { vreinterpretq_s64_u64(vceqq_f64(a.into(), b.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: f64x2, b: f64x2) -> mask64x2 { + vreinterpretq_s64_u64(vceqq_f64(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_lt_f64x2(self, a: f64x2, b: f64x2) -> mask64x2 { - unsafe { vreinterpretq_s64_u64(vcltq_f64(a.into(), b.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: f64x2, b: f64x2) -> mask64x2 { + vreinterpretq_s64_u64(vcltq_f64(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_le_f64x2(self, a: f64x2, b: f64x2) -> mask64x2 { - unsafe { vreinterpretq_s64_u64(vcleq_f64(a.into(), b.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: f64x2, b: f64x2) -> mask64x2 { + vreinterpretq_s64_u64(vcleq_f64(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_ge_f64x2(self, a: f64x2, b: f64x2) -> mask64x2 { - unsafe { vreinterpretq_s64_u64(vcgeq_f64(a.into(), b.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: f64x2, b: f64x2) -> mask64x2 { + vreinterpretq_s64_u64(vcgeq_f64(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_gt_f64x2(self, a: f64x2, b: f64x2) -> mask64x2 { - unsafe { vreinterpretq_s64_u64(vcgtq_f64(a.into(), b.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: f64x2, b: f64x2) -> mask64x2 { + vreinterpretq_s64_u64(vcgtq_f64(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn zip_low_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - let x = a.into(); - let y = b.into(); - unsafe { vzip1q_f64(x, y).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: f64x2, b: f64x2) -> f64x2 { + let x = a.into(); + let y = b.into(); + vzip1q_f64(x, y).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn zip_high_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - let x = a.into(); - let y = b.into(); - unsafe { vzip2q_f64(x, y).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: f64x2, b: f64x2) -> f64x2 { + let x = a.into(); + let y = b.into(); + vzip2q_f64(x, y).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_low_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - let x = a.into(); - let y = b.into(); - unsafe { vuzp1q_f64(x, y).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: f64x2, b: f64x2) -> f64x2 { + let x = a.into(); + let y = b.into(); + vuzp1q_f64(x, y).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_high_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - let x = a.into(); - let y = b.into(); - unsafe { vuzp2q_f64(x, y).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: f64x2, b: f64x2) -> f64x2 { + let x = a.into(); + let y = b.into(); + vuzp2q_f64(x, y).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn interleave_f64x2(self, a: f64x2, b: f64x2) -> (f64x2, f64x2) { @@ -1982,55 +3512,130 @@ impl Simd for Neon { } #[inline(always)] fn max_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - unsafe { vmaxq_f64(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: f64x2, b: f64x2) -> f64x2 { + vmaxq_f64(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn min_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - unsafe { vminq_f64(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: f64x2, b: f64x2) -> f64x2 { + vminq_f64(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn max_precise_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - unsafe { vmaxnmq_f64(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: f64x2, b: f64x2) -> f64x2 { + vmaxnmq_f64(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn min_precise_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - unsafe { vminnmq_f64(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: f64x2, b: f64x2) -> f64x2 { + vminnmq_f64(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn mul_add_f64x2(self, a: f64x2, b: f64x2, c: f64x2) -> f64x2 { - unsafe { vfmaq_f64(c.into(), b.into(), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: f64x2, b: f64x2, c: f64x2) -> f64x2 { + vfmaq_f64(c.into(), b.into(), a.into()).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] fn mul_sub_f64x2(self, a: f64x2, b: f64x2, c: f64x2) -> f64x2 { - unsafe { vnegq_f64(vfmsq_f64(c.into(), b.into(), a.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: f64x2, b: f64x2, c: f64x2) -> f64x2 { + vnegq_f64(vfmsq_f64(c.into(), b.into(), a.into())).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] fn floor_f64x2(self, a: f64x2) -> f64x2 { - unsafe { vrndmq_f64(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: f64x2) -> f64x2 { + vrndmq_f64(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn ceil_f64x2(self, a: f64x2) -> f64x2 { - unsafe { vrndpq_f64(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: f64x2) -> f64x2 { + vrndpq_f64(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn round_ties_even_f64x2(self, a: f64x2) -> f64x2 { - unsafe { vrndnq_f64(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: f64x2) -> f64x2 { + vrndnq_f64(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn fract_f64x2(self, a: f64x2) -> f64x2 { - unsafe { - let c1 = vcvtq_s64_f64(a.into()); - let c2 = vcvtq_f64_s64(c1); - vsubq_f64(a.into(), c2).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: f64x2) -> f64x2 { + let c1 = vcvtq_s64_f64(a.into()); + let c2 = vcvtq_f64_s64(c1); + vsubq_f64(a.into(), c2).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn trunc_f64x2(self, a: f64x2) -> f64x2 { - unsafe { vrndq_f64(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: f64x2) -> f64x2 { + vrndq_f64(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn select_f64x2(self, a: mask64x2, b: f64x2, c: f64x2) -> f64x2 { - unsafe { vbslq_f64(vreinterpretq_u64_s64(a.into()), b.into(), c.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Neon, + a: mask64x2, + b: f64x2, + c: f64x2, + ) -> f64x2 { + vbslq_f64(vreinterpretq_u64_s64(a.into()), b.into(), c.into()).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] fn combine_f64x2(self, a: f64x2, b: f64x2) -> f64x4 { @@ -2041,14 +3646,24 @@ impl Simd for Neon { } #[inline(always)] fn reinterpret_f32_f64x2(self, a: f64x2) -> f32x4 { - unsafe { vreinterpretq_f32_f64(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: f64x2) -> f32x4 { + vreinterpretq_f32_f64(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn splat_mask64x2(self, val: bool) -> mask64x2 { - unsafe { - let val: i64 = if val { !0 } else { 0 }; - vdupq_n_s64(val).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, val: bool) -> mask64x2 { + let val: i64 = if val { !0 } else { 0 }; + vdupq_n_s64(val).simd_into(token) + } + ); + kernel(self, val) } #[inline(always)] fn load_array_mask64x2(self, val: [i64; 2usize]) -> mask64x2 { @@ -2080,19 +3695,43 @@ impl Simd for Neon { } #[inline(always)] fn and_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { - unsafe { vandq_s64(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: mask64x2, b: mask64x2) -> mask64x2 { + vandq_s64(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn or_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { - unsafe { vorrq_s64(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: mask64x2, b: mask64x2) -> mask64x2 { + vorrq_s64(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn xor_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { - unsafe { veorq_s64(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: mask64x2, b: mask64x2) -> mask64x2 { + veorq_s64(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn not_mask64x2(self, a: mask64x2) -> mask64x2 { - unsafe { vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_s64(a.into()))).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: mask64x2) -> mask64x2 { + vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_s64(a.into()))).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn select_mask64x2( @@ -2101,27 +3740,68 @@ impl Simd for Neon { b: mask64x2, c: mask64x2, ) -> mask64x2 { - unsafe { vbslq_s64(vreinterpretq_u64_s64(a.into()), b.into(), c.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Neon, + a: mask64x2, + b: mask64x2, + c: mask64x2, + ) -> mask64x2 { + vbslq_s64(vreinterpretq_u64_s64(a.into()), b.into(), c.into()).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] fn simd_eq_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { - unsafe { vreinterpretq_s64_u64(vceqq_s64(a.into(), b.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: mask64x2, b: mask64x2) -> mask64x2 { + vreinterpretq_s64_u64(vceqq_s64(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn any_true_mask64x2(self, a: mask64x2) -> bool { - unsafe { vmaxvq_u32(vreinterpretq_u32_s64(a.into())) != 0 } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: mask64x2) -> bool { + vmaxvq_u32(vreinterpretq_u32_s64(a.into())) != 0 + } + ); + kernel(self, a) } #[inline(always)] fn all_true_mask64x2(self, a: mask64x2) -> bool { - unsafe { vminvq_u32(vreinterpretq_u32_s64(a.into())) == 0xffffffff } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: mask64x2) -> bool { + vminvq_u32(vreinterpretq_u32_s64(a.into())) == 0xffffffff + } + ); + kernel(self, a) } #[inline(always)] fn any_false_mask64x2(self, a: mask64x2) -> bool { - unsafe { vminvq_u32(vreinterpretq_u32_s64(a.into())) != 0xffffffff } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: mask64x2) -> bool { + vminvq_u32(vreinterpretq_u32_s64(a.into())) != 0xffffffff + } + ); + kernel(self, a) } #[inline(always)] fn all_false_mask64x2(self, a: mask64x2) -> bool { - unsafe { vmaxvq_u32(vreinterpretq_u32_s64(a.into())) == 0 } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: mask64x2) -> bool { + vmaxvq_u32(vreinterpretq_u32_s64(a.into())) == 0 + } + ); + kernel(self, a) } #[inline(always)] fn combine_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x4 { @@ -3822,12 +5502,16 @@ impl Simd for Neon { } #[inline(always)] fn narrow_u16x16(self, a: u16x16) -> u8x16 { - unsafe { - let converted: uint16x8x2_t = a.into(); - let low = vmovn_u16(converted.0); - let high = vmovn_u16(converted.1); - vcombine_u8(low, high).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u16x16) -> u8x16 { + let converted: uint16x8x2_t = a.into(); + let low = vmovn_u16(converted.0); + let high = vmovn_u16(converted.1); + vcombine_u8(low, high).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn reinterpret_u8_u16x16(self, a: u16x16) -> u8x32 { diff --git a/fearless_simd/src/generated/sse4_2.rs b/fearless_simd/src/generated/sse4_2.rs index 3d4b7180..47b81a4d 100644 --- a/fearless_simd/src/generated/sse4_2.rs +++ b/fearless_simd/src/generated/sse4_2.rs @@ -124,7 +124,13 @@ impl Simd for Sse4_2 { } #[inline(always)] fn splat_f32x4(self, val: f32) -> f32x4 { - unsafe { _mm_set1_ps(val).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, val: f32) -> f32x4 { + _mm_set1_ps(val).simd_into(token) + } + ); + kernel(self, val) } #[inline(always)] fn load_array_f32x4(self, val: [f32; 4usize]) -> f32x4 { @@ -197,78 +203,185 @@ impl Simd for Sse4_2 { } #[inline(always)] fn abs_f32x4(self, a: f32x4) -> f32x4 { - unsafe { _mm_andnot_ps(_mm_set1_ps(-0.0), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: f32x4) -> f32x4 { + _mm_andnot_ps(_mm_set1_ps(-0.0), a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn neg_f32x4(self, a: f32x4) -> f32x4 { - unsafe { _mm_xor_ps(a.into(), _mm_set1_ps(-0.0)).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: f32x4) -> f32x4 { + _mm_xor_ps(a.into(), _mm_set1_ps(-0.0)).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn sqrt_f32x4(self, a: f32x4) -> f32x4 { - unsafe { _mm_sqrt_ps(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: f32x4) -> f32x4 { + _mm_sqrt_ps(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn approximate_recip_f32x4(self, a: f32x4) -> f32x4 { - unsafe { _mm_rcp_ps(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: f32x4) -> f32x4 { + _mm_rcp_ps(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn add_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - unsafe { _mm_add_ps(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: f32x4, b: f32x4) -> f32x4 { + _mm_add_ps(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn sub_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - unsafe { _mm_sub_ps(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: f32x4, b: f32x4) -> f32x4 { + _mm_sub_ps(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn mul_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - unsafe { _mm_mul_ps(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: f32x4, b: f32x4) -> f32x4 { + _mm_mul_ps(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn div_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - unsafe { _mm_div_ps(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: f32x4, b: f32x4) -> f32x4 { + _mm_div_ps(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn copysign_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - unsafe { - let mask = _mm_set1_ps(-0.0); - _mm_or_ps(_mm_and_ps(mask, b.into()), _mm_andnot_ps(mask, a.into())).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: f32x4, b: f32x4) -> f32x4 { + let mask = _mm_set1_ps(-0.0); + _mm_or_ps(_mm_and_ps(mask, b.into()), _mm_andnot_ps(mask, a.into())) + .simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_eq_f32x4(self, a: f32x4, b: f32x4) -> mask32x4 { - unsafe { _mm_castps_si128(_mm_cmpeq_ps(a.into(), b.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: f32x4, b: f32x4) -> mask32x4 { + _mm_castps_si128(_mm_cmpeq_ps(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_lt_f32x4(self, a: f32x4, b: f32x4) -> mask32x4 { - unsafe { _mm_castps_si128(_mm_cmplt_ps(a.into(), b.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: f32x4, b: f32x4) -> mask32x4 { + _mm_castps_si128(_mm_cmplt_ps(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_le_f32x4(self, a: f32x4, b: f32x4) -> mask32x4 { - unsafe { _mm_castps_si128(_mm_cmple_ps(a.into(), b.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: f32x4, b: f32x4) -> mask32x4 { + _mm_castps_si128(_mm_cmple_ps(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_ge_f32x4(self, a: f32x4, b: f32x4) -> mask32x4 { - unsafe { _mm_castps_si128(_mm_cmpge_ps(a.into(), b.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: f32x4, b: f32x4) -> mask32x4 { + _mm_castps_si128(_mm_cmpge_ps(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_gt_f32x4(self, a: f32x4, b: f32x4) -> mask32x4 { - unsafe { _mm_castps_si128(_mm_cmpgt_ps(a.into(), b.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: f32x4, b: f32x4) -> mask32x4 { + _mm_castps_si128(_mm_cmpgt_ps(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn zip_low_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - unsafe { _mm_unpacklo_ps(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: f32x4, b: f32x4) -> f32x4 { + _mm_unpacklo_ps(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn zip_high_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - unsafe { _mm_unpackhi_ps(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: f32x4, b: f32x4) -> f32x4 { + _mm_unpackhi_ps(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_low_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - unsafe { _mm_shuffle_ps::<0b10_00_10_00>(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: f32x4, b: f32x4) -> f32x4 { + _mm_shuffle_ps::<0b10_00_10_00>(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_high_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - unsafe { _mm_shuffle_ps::<0b11_01_11_01>(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: f32x4, b: f32x4) -> f32x4 { + _mm_shuffle_ps::<0b11_01_11_01>(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn interleave_f32x4(self, a: f32x4, b: f32x4) -> (f32x4, f32x4) { @@ -280,27 +393,47 @@ impl Simd for Sse4_2 { } #[inline(always)] fn max_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - unsafe { _mm_max_ps(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: f32x4, b: f32x4) -> f32x4 { + _mm_max_ps(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn min_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - unsafe { _mm_min_ps(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: f32x4, b: f32x4) -> f32x4 { + _mm_min_ps(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn max_precise_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - unsafe { - let intermediate = _mm_max_ps(a.into(), b.into()); - let b_is_nan = _mm_cmpunord_ps(b.into(), b.into()); - _mm_blendv_ps(intermediate, a.into(), b_is_nan).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: f32x4, b: f32x4) -> f32x4 { + let intermediate = _mm_max_ps(a.into(), b.into()); + let b_is_nan = _mm_cmpunord_ps(b.into(), b.into()); + _mm_blendv_ps(intermediate, a.into(), b_is_nan).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn min_precise_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - unsafe { - let intermediate = _mm_min_ps(a.into(), b.into()); - let b_is_nan = _mm_cmpunord_ps(b.into(), b.into()); - _mm_blendv_ps(intermediate, a.into(), b_is_nan).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: f32x4, b: f32x4) -> f32x4 { + let intermediate = _mm_min_ps(a.into(), b.into()); + let b_is_nan = _mm_cmpunord_ps(b.into(), b.into()); + _mm_blendv_ps(intermediate, a.into(), b_is_nan).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn mul_add_f32x4(self, a: f32x4, b: f32x4, c: f32x4) -> f32x4 { @@ -312,22 +445,36 @@ impl Simd for Sse4_2 { } #[inline(always)] fn floor_f32x4(self, a: f32x4) -> f32x4 { - unsafe { - _mm_round_ps::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: f32x4) -> f32x4 { + _mm_round_ps::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into()) + .simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn ceil_f32x4(self, a: f32x4) -> f32x4 { - unsafe { - _mm_round_ps::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: f32x4) -> f32x4 { + _mm_round_ps::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a.into()) + .simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn round_ties_even_f32x4(self, a: f32x4) -> f32x4 { - unsafe { - _mm_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a.into()) - .simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: f32x4) -> f32x4 { + _mm_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a.into()) + .simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn fract_f32x4(self, a: f32x4) -> f32x4 { @@ -335,13 +482,29 @@ impl Simd for Sse4_2 { } #[inline(always)] fn trunc_f32x4(self, a: f32x4) -> f32x4 { - unsafe { - _mm_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: f32x4) -> f32x4 { + _mm_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into()) + .simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn select_f32x4(self, a: mask32x4, b: f32x4, c: f32x4) -> f32x4 { - unsafe { _mm_blendv_ps(c.into(), b.into(), _mm_castsi128_ps(a.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Sse4_2, + a: mask32x4, + b: f32x4, + c: f32x4, + ) -> f32x4 { + _mm_blendv_ps(c.into(), b.into(), _mm_castsi128_ps(a.into())).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] fn combine_f32x4(self, a: f32x4, b: f32x4) -> f32x8 { @@ -352,82 +515,130 @@ impl Simd for Sse4_2 { } #[inline(always)] fn reinterpret_f64_f32x4(self, a: f32x4) -> f64x2 { - unsafe { _mm_castps_pd(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: f32x4) -> f64x2 { + _mm_castps_pd(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn reinterpret_i32_f32x4(self, a: f32x4) -> i32x4 { - unsafe { _mm_castps_si128(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: f32x4) -> i32x4 { + _mm_castps_si128(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn reinterpret_u8_f32x4(self, a: f32x4) -> u8x16 { - unsafe { _mm_castps_si128(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: f32x4) -> u8x16 { + _mm_castps_si128(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn reinterpret_u32_f32x4(self, a: f32x4) -> u32x4 { - unsafe { _mm_castps_si128(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: f32x4) -> u32x4 { + _mm_castps_si128(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn cvt_u32_f32x4(self, a: f32x4) -> u32x4 { - unsafe { - let mut converted = _mm_cvttps_epi32(a.into()); - let in_range = _mm_cmplt_ps(a.into(), _mm_set1_ps(2147483648.0)); - let all_in_range = _mm_movemask_ps(in_range) == 0b1111; - if !all_in_range { - let excess = _mm_sub_ps(a.into(), _mm_set1_ps(2147483648.0)); - let excess_converted = _mm_cvttps_epi32(_mm_andnot_ps(in_range, excess)); - converted = _mm_add_epi32(converted, excess_converted); + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: f32x4) -> u32x4 { + let mut converted = _mm_cvttps_epi32(a.into()); + let in_range = _mm_cmplt_ps(a.into(), _mm_set1_ps(2147483648.0)); + let all_in_range = _mm_movemask_ps(in_range) == 0b1111; + if !all_in_range { + let excess = _mm_sub_ps(a.into(), _mm_set1_ps(2147483648.0)); + let excess_converted = _mm_cvttps_epi32(_mm_andnot_ps(in_range, excess)); + converted = _mm_add_epi32(converted, excess_converted); + } + converted.simd_into(token) } - converted.simd_into(self) - } + ); + kernel(self, a) } #[inline(always)] fn cvt_u32_precise_f32x4(self, a: f32x4) -> u32x4 { - unsafe { - let a = _mm_max_ps(a.into(), _mm_setzero_ps()); - let mut converted = _mm_cvttps_epi32(a); - let in_range = _mm_cmplt_ps(a, _mm_set1_ps(2147483648.0)); - let all_in_range = _mm_movemask_ps(in_range) == 0b1111; - if !all_in_range { - let exceeds_unsigned_range = - _mm_castps_si128(_mm_cmplt_ps(_mm_set1_ps(4294967040.0), a)); - let excess = _mm_sub_ps(a, _mm_set1_ps(2147483648.0)); - let excess_converted = _mm_cvttps_epi32(_mm_andnot_ps(in_range, excess)); - converted = _mm_add_epi32(converted, excess_converted); - converted = _mm_blendv_epi8( - converted, - _mm_set1_epi32(u32::MAX.cast_signed()), - exceeds_unsigned_range, - ); + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: f32x4) -> u32x4 { + let a = _mm_max_ps(a.into(), _mm_setzero_ps()); + let mut converted = _mm_cvttps_epi32(a); + let in_range = _mm_cmplt_ps(a, _mm_set1_ps(2147483648.0)); + let all_in_range = _mm_movemask_ps(in_range) == 0b1111; + if !all_in_range { + let exceeds_unsigned_range = + _mm_castps_si128(_mm_cmplt_ps(_mm_set1_ps(4294967040.0), a)); + let excess = _mm_sub_ps(a, _mm_set1_ps(2147483648.0)); + let excess_converted = _mm_cvttps_epi32(_mm_andnot_ps(in_range, excess)); + converted = _mm_add_epi32(converted, excess_converted); + converted = _mm_blendv_epi8( + converted, + _mm_set1_epi32(u32::MAX.cast_signed()), + exceeds_unsigned_range, + ); + } + converted.simd_into(token) } - converted.simd_into(self) - } + ); + kernel(self, a) } #[inline(always)] fn cvt_i32_f32x4(self, a: f32x4) -> i32x4 { - unsafe { _mm_cvttps_epi32(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: f32x4) -> i32x4 { + _mm_cvttps_epi32(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn cvt_i32_precise_f32x4(self, a: f32x4) -> i32x4 { - unsafe { - let a = a.into(); - let mut converted = _mm_cvttps_epi32(a); - let in_range = _mm_cmplt_ps(a, _mm_set1_ps(2147483648.0)); - let all_in_range = _mm_movemask_ps(in_range) == 0b1111; - if !all_in_range { - converted = _mm_blendv_epi8( - _mm_set1_epi32(i32::MAX), - converted, - _mm_castps_si128(in_range), - ); - let is_not_nan = _mm_castps_si128(_mm_cmpord_ps(a, a)); - converted = _mm_and_si128(converted, is_not_nan); + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: f32x4) -> i32x4 { + let a = a.into(); + let mut converted = _mm_cvttps_epi32(a); + let in_range = _mm_cmplt_ps(a, _mm_set1_ps(2147483648.0)); + let all_in_range = _mm_movemask_ps(in_range) == 0b1111; + if !all_in_range { + converted = _mm_blendv_epi8( + _mm_set1_epi32(i32::MAX), + converted, + _mm_castps_si128(in_range), + ); + let is_not_nan = _mm_castps_si128(_mm_cmpord_ps(a, a)); + converted = _mm_and_si128(converted, is_not_nan); + } + converted.simd_into(token) } - converted.simd_into(self) - } + ); + kernel(self, a) } #[inline(always)] fn splat_i8x16(self, val: i8) -> i8x16 { - unsafe { _mm_set1_epi8(val).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, val: i8) -> i8x16 { + _mm_set1_epi8(val).simd_into(token) + } + ); + kernel(self, val) } #[inline(always)] fn load_array_i8x16(self, val: [i8; 16usize]) -> i8x16 { @@ -500,36 +711,70 @@ impl Simd for Sse4_2 { } #[inline(always)] fn add_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - unsafe { _mm_add_epi8(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: i8x16, b: i8x16) -> i8x16 { + _mm_add_epi8(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn sub_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - unsafe { _mm_sub_epi8(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: i8x16, b: i8x16) -> i8x16 { + _mm_sub_epi8(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn mul_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - unsafe { - let dst_even = _mm_mullo_epi16(a.into(), b.into()); - let dst_odd = - _mm_mullo_epi16(_mm_srli_epi16::<8>(a.into()), _mm_srli_epi16::<8>(b.into())); - _mm_or_si128( - _mm_slli_epi16(dst_odd, 8), - _mm_and_si128(dst_even, _mm_set1_epi16(0xFF)), - ) - .simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: i8x16, b: i8x16) -> i8x16 { + let dst_even = _mm_mullo_epi16(a.into(), b.into()); + let dst_odd = + _mm_mullo_epi16(_mm_srli_epi16::<8>(a.into()), _mm_srli_epi16::<8>(b.into())); + _mm_or_si128( + _mm_slli_epi16(dst_odd, 8), + _mm_and_si128(dst_even, _mm_set1_epi16(0xFF)), + ) + .simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn and_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: i8x16, b: i8x16) -> i8x16 { + _mm_and_si128(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn or_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: i8x16, b: i8x16) -> i8x16 { + _mm_or_si128(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn xor_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: i8x16, b: i8x16) -> i8x16 { + _mm_xor_si128(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn not_i8x16(self, a: i8x16) -> i8x16 { @@ -537,15 +782,19 @@ impl Simd for Sse4_2 { } #[inline(always)] fn shl_i8x16(self, a: i8x16, shift: u32) -> i8x16 { - unsafe { - let val = a.into(); - let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); - let lo_16 = _mm_unpacklo_epi8(val, _mm_cmpgt_epi8(_mm_setzero_si128(), val)); - let hi_16 = _mm_unpackhi_epi8(val, _mm_cmpgt_epi8(_mm_setzero_si128(), val)); - let lo_shifted = _mm_sll_epi16(lo_16, shift_count); - let hi_shifted = _mm_sll_epi16(hi_16, shift_count); - _mm_packs_epi16(lo_shifted, hi_shifted).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: i8x16, shift: u32) -> i8x16 { + let val = a.into(); + let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); + let lo_16 = _mm_unpacklo_epi8(val, _mm_cmpgt_epi8(_mm_setzero_si128(), val)); + let hi_16 = _mm_unpackhi_epi8(val, _mm_cmpgt_epi8(_mm_setzero_si128(), val)); + let lo_shifted = _mm_sll_epi16(lo_16, shift_count); + let hi_shifted = _mm_sll_epi16(hi_16, shift_count); + _mm_packs_epi16(lo_shifted, hi_shifted).simd_into(token) + } + ); + kernel(self, a, shift) } #[inline(always)] fn shlv_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { @@ -553,15 +802,19 @@ impl Simd for Sse4_2 { } #[inline(always)] fn shr_i8x16(self, a: i8x16, shift: u32) -> i8x16 { - unsafe { - let val = a.into(); - let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); - let lo_16 = _mm_unpacklo_epi8(val, _mm_cmpgt_epi8(_mm_setzero_si128(), val)); - let hi_16 = _mm_unpackhi_epi8(val, _mm_cmpgt_epi8(_mm_setzero_si128(), val)); - let lo_shifted = _mm_sra_epi16(lo_16, shift_count); - let hi_shifted = _mm_sra_epi16(hi_16, shift_count); - _mm_packs_epi16(lo_shifted, hi_shifted).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: i8x16, shift: u32) -> i8x16 { + let val = a.into(); + let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); + let lo_16 = _mm_unpacklo_epi8(val, _mm_cmpgt_epi8(_mm_setzero_si128(), val)); + let hi_16 = _mm_unpackhi_epi8(val, _mm_cmpgt_epi8(_mm_setzero_si128(), val)); + let lo_shifted = _mm_sra_epi16(lo_16, shift_count); + let hi_shifted = _mm_sra_epi16(hi_16, shift_count); + _mm_packs_epi16(lo_shifted, hi_shifted).simd_into(token) + } + ); + kernel(self, a, shift) } #[inline(always)] fn shrv_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { @@ -569,49 +822,99 @@ impl Simd for Sse4_2 { } #[inline(always)] fn simd_eq_i8x16(self, a: i8x16, b: i8x16) -> mask8x16 { - unsafe { _mm_cmpeq_epi8(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: i8x16, b: i8x16) -> mask8x16 { + _mm_cmpeq_epi8(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_lt_i8x16(self, a: i8x16, b: i8x16) -> mask8x16 { - unsafe { _mm_cmpgt_epi8(b.into(), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: i8x16, b: i8x16) -> mask8x16 { + _mm_cmpgt_epi8(b.into(), a.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_le_i8x16(self, a: i8x16, b: i8x16) -> mask8x16 { - unsafe { _mm_cmpeq_epi8(_mm_min_epi8(a.into(), b.into()), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: i8x16, b: i8x16) -> mask8x16 { + _mm_cmpeq_epi8(_mm_min_epi8(a.into(), b.into()), a.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_ge_i8x16(self, a: i8x16, b: i8x16) -> mask8x16 { - unsafe { _mm_cmpeq_epi8(_mm_max_epi8(a.into(), b.into()), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: i8x16, b: i8x16) -> mask8x16 { + _mm_cmpeq_epi8(_mm_max_epi8(a.into(), b.into()), a.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_gt_i8x16(self, a: i8x16, b: i8x16) -> mask8x16 { - unsafe { _mm_cmpgt_epi8(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: i8x16, b: i8x16) -> mask8x16 { + _mm_cmpgt_epi8(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn zip_low_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - unsafe { _mm_unpacklo_epi8(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: i8x16, b: i8x16) -> i8x16 { + _mm_unpacklo_epi8(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn zip_high_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - unsafe { _mm_unpackhi_epi8(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: i8x16, b: i8x16) -> i8x16 { + _mm_unpackhi_epi8(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_low_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - unsafe { - let mask = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); - let t1 = _mm_shuffle_epi8(a.into(), mask); - let t2 = _mm_shuffle_epi8(b.into(), mask); - _mm_unpacklo_epi64(t1, t2).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: i8x16, b: i8x16) -> i8x16 { + let mask = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); + let t1 = _mm_shuffle_epi8(a.into(), mask); + let t2 = _mm_shuffle_epi8(b.into(), mask); + _mm_unpacklo_epi64(t1, t2).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_high_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - unsafe { - let mask = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); - let t1 = _mm_shuffle_epi8(a.into(), mask); - let t2 = _mm_shuffle_epi8(b.into(), mask); - _mm_unpackhi_epi64(t1, t2).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: i8x16, b: i8x16) -> i8x16 { + let mask = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); + let t1 = _mm_shuffle_epi8(a.into(), mask); + let t2 = _mm_shuffle_epi8(b.into(), mask); + _mm_unpackhi_epi64(t1, t2).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn interleave_i8x16(self, a: i8x16, b: i8x16) -> (i8x16, i8x16) { @@ -623,15 +926,38 @@ impl Simd for Sse4_2 { } #[inline(always)] fn select_i8x16(self, a: mask8x16, b: i8x16, c: i8x16) -> i8x16 { - unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Sse4_2, + a: mask8x16, + b: i8x16, + c: i8x16, + ) -> i8x16 { + _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] fn min_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - unsafe { _mm_min_epi8(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: i8x16, b: i8x16) -> i8x16 { + _mm_min_epi8(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn max_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - unsafe { _mm_max_epi8(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: i8x16, b: i8x16) -> i8x16 { + _mm_max_epi8(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn combine_i8x16(self, a: i8x16, b: i8x16) -> i8x32 { @@ -642,19 +968,43 @@ impl Simd for Sse4_2 { } #[inline(always)] fn neg_i8x16(self, a: i8x16) -> i8x16 { - unsafe { _mm_sub_epi8(_mm_setzero_si128(), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: i8x16) -> i8x16 { + _mm_sub_epi8(_mm_setzero_si128(), a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn reinterpret_u8_i8x16(self, a: i8x16) -> u8x16 { - __m128i::from(a).simd_into(self) + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: i8x16) -> u8x16 { + __m128i::from(a).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn reinterpret_u32_i8x16(self, a: i8x16) -> u32x4 { - __m128i::from(a).simd_into(self) + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: i8x16) -> u32x4 { + __m128i::from(a).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn splat_u8x16(self, val: u8) -> u8x16 { - unsafe { _mm_set1_epi8(val.cast_signed()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, val: u8) -> u8x16 { + _mm_set1_epi8(val.cast_signed()).simd_into(token) + } + ); + kernel(self, val) } #[inline(always)] fn load_array_u8x16(self, val: [u8; 16usize]) -> u8x16 { @@ -727,36 +1077,70 @@ impl Simd for Sse4_2 { } #[inline(always)] fn add_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - unsafe { _mm_add_epi8(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u8x16, b: u8x16) -> u8x16 { + _mm_add_epi8(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn sub_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - unsafe { _mm_sub_epi8(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u8x16, b: u8x16) -> u8x16 { + _mm_sub_epi8(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn mul_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - unsafe { - let dst_even = _mm_mullo_epi16(a.into(), b.into()); - let dst_odd = - _mm_mullo_epi16(_mm_srli_epi16::<8>(a.into()), _mm_srli_epi16::<8>(b.into())); - _mm_or_si128( - _mm_slli_epi16(dst_odd, 8), - _mm_and_si128(dst_even, _mm_set1_epi16(0xFF)), - ) - .simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u8x16, b: u8x16) -> u8x16 { + let dst_even = _mm_mullo_epi16(a.into(), b.into()); + let dst_odd = + _mm_mullo_epi16(_mm_srli_epi16::<8>(a.into()), _mm_srli_epi16::<8>(b.into())); + _mm_or_si128( + _mm_slli_epi16(dst_odd, 8), + _mm_and_si128(dst_even, _mm_set1_epi16(0xFF)), + ) + .simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn and_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u8x16, b: u8x16) -> u8x16 { + _mm_and_si128(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn or_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u8x16, b: u8x16) -> u8x16 { + _mm_or_si128(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn xor_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u8x16, b: u8x16) -> u8x16 { + _mm_xor_si128(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn not_u8x16(self, a: u8x16) -> u8x16 { @@ -764,15 +1148,19 @@ impl Simd for Sse4_2 { } #[inline(always)] fn shl_u8x16(self, a: u8x16, shift: u32) -> u8x16 { - unsafe { - let val = a.into(); - let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); - let lo_16 = _mm_unpacklo_epi8(val, _mm_setzero_si128()); - let hi_16 = _mm_unpackhi_epi8(val, _mm_setzero_si128()); - let lo_shifted = _mm_sll_epi16(lo_16, shift_count); - let hi_shifted = _mm_sll_epi16(hi_16, shift_count); - _mm_packus_epi16(lo_shifted, hi_shifted).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u8x16, shift: u32) -> u8x16 { + let val = a.into(); + let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); + let lo_16 = _mm_unpacklo_epi8(val, _mm_setzero_si128()); + let hi_16 = _mm_unpackhi_epi8(val, _mm_setzero_si128()); + let lo_shifted = _mm_sll_epi16(lo_16, shift_count); + let hi_shifted = _mm_sll_epi16(hi_16, shift_count); + _mm_packus_epi16(lo_shifted, hi_shifted).simd_into(token) + } + ); + kernel(self, a, shift) } #[inline(always)] fn shlv_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { @@ -780,15 +1168,19 @@ impl Simd for Sse4_2 { } #[inline(always)] fn shr_u8x16(self, a: u8x16, shift: u32) -> u8x16 { - unsafe { - let val = a.into(); - let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); - let lo_16 = _mm_unpacklo_epi8(val, _mm_setzero_si128()); - let hi_16 = _mm_unpackhi_epi8(val, _mm_setzero_si128()); - let lo_shifted = _mm_srl_epi16(lo_16, shift_count); - let hi_shifted = _mm_srl_epi16(hi_16, shift_count); - _mm_packus_epi16(lo_shifted, hi_shifted).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u8x16, shift: u32) -> u8x16 { + let val = a.into(); + let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); + let lo_16 = _mm_unpacklo_epi8(val, _mm_setzero_si128()); + let hi_16 = _mm_unpackhi_epi8(val, _mm_setzero_si128()); + let lo_shifted = _mm_srl_epi16(lo_16, shift_count); + let hi_shifted = _mm_srl_epi16(hi_16, shift_count); + _mm_packus_epi16(lo_shifted, hi_shifted).simd_into(token) + } + ); + kernel(self, a, shift) } #[inline(always)] fn shrv_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { @@ -796,59 +1188,105 @@ impl Simd for Sse4_2 { } #[inline(always)] fn simd_eq_u8x16(self, a: u8x16, b: u8x16) -> mask8x16 { - unsafe { _mm_cmpeq_epi8(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u8x16, b: u8x16) -> mask8x16 { + _mm_cmpeq_epi8(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_lt_u8x16(self, a: u8x16, b: u8x16) -> mask8x16 { - unsafe { - let sign_bit = _mm_set1_epi8(0x80u8.cast_signed()); - let a_signed = _mm_xor_si128(a.into(), sign_bit); - let b_signed = _mm_xor_si128(b.into(), sign_bit); - _mm_cmpgt_epi8(b_signed, a_signed).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u8x16, b: u8x16) -> mask8x16 { + let sign_bit = _mm_set1_epi8(0x80u8.cast_signed()); + let a_signed = _mm_xor_si128(a.into(), sign_bit); + let b_signed = _mm_xor_si128(b.into(), sign_bit); + _mm_cmpgt_epi8(b_signed, a_signed).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_le_u8x16(self, a: u8x16, b: u8x16) -> mask8x16 { - unsafe { _mm_cmpeq_epi8(_mm_min_epu8(a.into(), b.into()), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u8x16, b: u8x16) -> mask8x16 { + _mm_cmpeq_epi8(_mm_min_epu8(a.into(), b.into()), a.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_ge_u8x16(self, a: u8x16, b: u8x16) -> mask8x16 { - unsafe { _mm_cmpeq_epi8(_mm_max_epu8(a.into(), b.into()), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u8x16, b: u8x16) -> mask8x16 { + _mm_cmpeq_epi8(_mm_max_epu8(a.into(), b.into()), a.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_gt_u8x16(self, a: u8x16, b: u8x16) -> mask8x16 { - unsafe { - let sign_bit = _mm_set1_epi8(0x80u8.cast_signed()); - let a_signed = _mm_xor_si128(a.into(), sign_bit); - let b_signed = _mm_xor_si128(b.into(), sign_bit); - _mm_cmpgt_epi8(a_signed, b_signed).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u8x16, b: u8x16) -> mask8x16 { + let sign_bit = _mm_set1_epi8(0x80u8.cast_signed()); + let a_signed = _mm_xor_si128(a.into(), sign_bit); + let b_signed = _mm_xor_si128(b.into(), sign_bit); + _mm_cmpgt_epi8(a_signed, b_signed).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn zip_low_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - unsafe { _mm_unpacklo_epi8(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u8x16, b: u8x16) -> u8x16 { + _mm_unpacklo_epi8(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn zip_high_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - unsafe { _mm_unpackhi_epi8(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u8x16, b: u8x16) -> u8x16 { + _mm_unpackhi_epi8(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_low_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - unsafe { - let mask = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); - let t1 = _mm_shuffle_epi8(a.into(), mask); - let t2 = _mm_shuffle_epi8(b.into(), mask); - _mm_unpacklo_epi64(t1, t2).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u8x16, b: u8x16) -> u8x16 { + let mask = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); + let t1 = _mm_shuffle_epi8(a.into(), mask); + let t2 = _mm_shuffle_epi8(b.into(), mask); + _mm_unpacklo_epi64(t1, t2).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_high_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - unsafe { - let mask = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); - let t1 = _mm_shuffle_epi8(a.into(), mask); - let t2 = _mm_shuffle_epi8(b.into(), mask); - _mm_unpackhi_epi64(t1, t2).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u8x16, b: u8x16) -> u8x16 { + let mask = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); + let t1 = _mm_shuffle_epi8(a.into(), mask); + let t2 = _mm_shuffle_epi8(b.into(), mask); + _mm_unpackhi_epi64(t1, t2).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn interleave_u8x16(self, a: u8x16, b: u8x16) -> (u8x16, u8x16) { @@ -860,15 +1298,38 @@ impl Simd for Sse4_2 { } #[inline(always)] fn select_u8x16(self, a: mask8x16, b: u8x16, c: u8x16) -> u8x16 { - unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Sse4_2, + a: mask8x16, + b: u8x16, + c: u8x16, + ) -> u8x16 { + _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] fn min_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - unsafe { _mm_min_epu8(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u8x16, b: u8x16) -> u8x16 { + _mm_min_epu8(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn max_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - unsafe { _mm_max_epu8(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u8x16, b: u8x16) -> u8x16 { + _mm_max_epu8(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn combine_u8x16(self, a: u8x16, b: u8x16) -> u8x32 { @@ -879,23 +1340,37 @@ impl Simd for Sse4_2 { } #[inline(always)] fn widen_u8x16(self, a: u8x16) -> u16x16 { - unsafe { - let raw = a.into(); - let high = _mm_cvtepu8_epi16(raw).simd_into(self); - let low = _mm_cvtepu8_epi16(_mm_srli_si128::<8>(raw)).simd_into(self); - self.combine_u16x8(high, low) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u8x16) -> u16x16 { + let raw = a.into(); + let high = _mm_cvtepu8_epi16(raw).simd_into(token); + let low = _mm_cvtepu8_epi16(_mm_srli_si128::<8>(raw)).simd_into(token); + token.combine_u16x8(high, low) + } + ); + kernel(self, a) } #[inline(always)] fn reinterpret_u32_u8x16(self, a: u8x16) -> u32x4 { - __m128i::from(a).simd_into(self) + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u8x16) -> u32x4 { + __m128i::from(a).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn splat_mask8x16(self, val: bool) -> mask8x16 { - unsafe { - let val: i8 = if val { !0 } else { 0 }; - _mm_set1_epi8(val).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, val: bool) -> mask8x16 { + let val: i8 = if val { !0 } else { 0 }; + _mm_set1_epi8(val).simd_into(token) + } + ); + kernel(self, val) } #[inline(always)] fn load_array_mask8x16(self, val: [i8; 16usize]) -> mask8x16 { @@ -910,35 +1385,63 @@ impl Simd for Sse4_2 { } #[inline(always)] fn from_bitmask_mask8x16(self, bits: u64) -> mask8x16 { - unsafe { - { - let bit_bytes = _mm_cvtsi32_si128(bits as i32); - let bit_bytes = _mm_shuffle_epi8( - bit_bytes, - _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1), - ); - let bit_mask = - _mm_setr_epi8(1, 2, 4, 8, 16, 32, 64, -128, 1, 2, 4, 8, 16, 32, 64, -128); - _mm_cmpeq_epi8(_mm_and_si128(bit_bytes, bit_mask), bit_mask) + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, bits: u64) -> mask8x16 { + { + let bit_bytes = _mm_cvtsi32_si128(bits as i32); + let bit_bytes = _mm_shuffle_epi8( + bit_bytes, + _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1), + ); + let bit_mask = + _mm_setr_epi8(1, 2, 4, 8, 16, 32, 64, -128, 1, 2, 4, 8, 16, 32, 64, -128); + _mm_cmpeq_epi8(_mm_and_si128(bit_bytes, bit_mask), bit_mask) + } + .simd_into(token) } - .simd_into(self) - } + ); + kernel(self, bits) } #[inline(always)] fn to_bitmask_mask8x16(self, a: mask8x16) -> u64 { - unsafe { _mm_movemask_epi8(a.into()) as u32 as u64 } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: mask8x16) -> u64 { + _mm_movemask_epi8(a.into()) as u32 as u64 + } + ); + kernel(self, a) } #[inline(always)] fn and_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x16 { - unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: mask8x16, b: mask8x16) -> mask8x16 { + _mm_and_si128(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn or_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x16 { - unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: mask8x16, b: mask8x16) -> mask8x16 { + _mm_or_si128(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn xor_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x16 { - unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: mask8x16, b: mask8x16) -> mask8x16 { + _mm_xor_si128(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn not_mask8x16(self, a: mask8x16) -> mask8x16 { @@ -951,27 +1454,68 @@ impl Simd for Sse4_2 { b: mask8x16, c: mask8x16, ) -> mask8x16 { - unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Sse4_2, + a: mask8x16, + b: mask8x16, + c: mask8x16, + ) -> mask8x16 { + _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] fn simd_eq_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x16 { - unsafe { _mm_cmpeq_epi8(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: mask8x16, b: mask8x16) -> mask8x16 { + _mm_cmpeq_epi8(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn any_true_mask8x16(self, a: mask8x16) -> bool { - unsafe { _mm_movemask_epi8(a.into()) as u32 != 0 } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: mask8x16) -> bool { + _mm_movemask_epi8(a.into()) as u32 != 0 + } + ); + kernel(self, a) } #[inline(always)] fn all_true_mask8x16(self, a: mask8x16) -> bool { - unsafe { _mm_movemask_epi8(a.into()) as u32 == 0xffff } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: mask8x16) -> bool { + _mm_movemask_epi8(a.into()) as u32 == 0xffff + } + ); + kernel(self, a) } #[inline(always)] fn any_false_mask8x16(self, a: mask8x16) -> bool { - unsafe { _mm_movemask_epi8(a.into()) as u32 != 0xffff } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: mask8x16) -> bool { + _mm_movemask_epi8(a.into()) as u32 != 0xffff + } + ); + kernel(self, a) } #[inline(always)] fn all_false_mask8x16(self, a: mask8x16) -> bool { - unsafe { _mm_movemask_epi8(a.into()) as u32 == 0 } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: mask8x16) -> bool { + _mm_movemask_epi8(a.into()) as u32 == 0 + } + ); + kernel(self, a) } #[inline(always)] fn combine_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x32 { @@ -982,7 +1526,13 @@ impl Simd for Sse4_2 { } #[inline(always)] fn splat_i16x8(self, val: i16) -> i16x8 { - unsafe { _mm_set1_epi16(val).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, val: i16) -> i16x8 { + _mm_set1_epi16(val).simd_into(token) + } + ); + kernel(self, val) } #[inline(always)] fn load_array_i16x8(self, val: [i16; 8usize]) -> i16x8 { @@ -1055,27 +1605,63 @@ impl Simd for Sse4_2 { } #[inline(always)] fn add_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - unsafe { _mm_add_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: i16x8, b: i16x8) -> i16x8 { + _mm_add_epi16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn sub_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - unsafe { _mm_sub_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: i16x8, b: i16x8) -> i16x8 { + _mm_sub_epi16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn mul_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - unsafe { _mm_mullo_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: i16x8, b: i16x8) -> i16x8 { + _mm_mullo_epi16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn and_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: i16x8, b: i16x8) -> i16x8 { + _mm_and_si128(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn or_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: i16x8, b: i16x8) -> i16x8 { + _mm_or_si128(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn xor_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: i16x8, b: i16x8) -> i16x8 { + _mm_xor_si128(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn not_i16x8(self, a: i16x8) -> i16x8 { @@ -1083,7 +1669,13 @@ impl Simd for Sse4_2 { } #[inline(always)] fn shl_i16x8(self, a: i16x8, shift: u32) -> i16x8 { - unsafe { _mm_sll_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: i16x8, shift: u32) -> i16x8 { + _mm_sll_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) + } + ); + kernel(self, a, shift) } #[inline(always)] fn shlv_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { @@ -1091,7 +1683,13 @@ impl Simd for Sse4_2 { } #[inline(always)] fn shr_i16x8(self, a: i16x8, shift: u32) -> i16x8 { - unsafe { _mm_sra_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: i16x8, shift: u32) -> i16x8 { + _mm_sra_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) + } + ); + kernel(self, a, shift) } #[inline(always)] fn shrv_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { @@ -1099,49 +1697,99 @@ impl Simd for Sse4_2 { } #[inline(always)] fn simd_eq_i16x8(self, a: i16x8, b: i16x8) -> mask16x8 { - unsafe { _mm_cmpeq_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: i16x8, b: i16x8) -> mask16x8 { + _mm_cmpeq_epi16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_lt_i16x8(self, a: i16x8, b: i16x8) -> mask16x8 { - unsafe { _mm_cmpgt_epi16(b.into(), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: i16x8, b: i16x8) -> mask16x8 { + _mm_cmpgt_epi16(b.into(), a.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_le_i16x8(self, a: i16x8, b: i16x8) -> mask16x8 { - unsafe { _mm_cmpeq_epi16(_mm_min_epi16(a.into(), b.into()), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: i16x8, b: i16x8) -> mask16x8 { + _mm_cmpeq_epi16(_mm_min_epi16(a.into(), b.into()), a.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_ge_i16x8(self, a: i16x8, b: i16x8) -> mask16x8 { - unsafe { _mm_cmpeq_epi16(_mm_max_epi16(a.into(), b.into()), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: i16x8, b: i16x8) -> mask16x8 { + _mm_cmpeq_epi16(_mm_max_epi16(a.into(), b.into()), a.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_gt_i16x8(self, a: i16x8, b: i16x8) -> mask16x8 { - unsafe { _mm_cmpgt_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: i16x8, b: i16x8) -> mask16x8 { + _mm_cmpgt_epi16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn zip_low_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - unsafe { _mm_unpacklo_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: i16x8, b: i16x8) -> i16x8 { + _mm_unpacklo_epi16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn zip_high_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - unsafe { _mm_unpackhi_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: i16x8, b: i16x8) -> i16x8 { + _mm_unpackhi_epi16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_low_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - unsafe { - let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15); - let t1 = _mm_shuffle_epi8(a.into(), mask); - let t2 = _mm_shuffle_epi8(b.into(), mask); - _mm_unpacklo_epi64(t1, t2).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: i16x8, b: i16x8) -> i16x8 { + let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15); + let t1 = _mm_shuffle_epi8(a.into(), mask); + let t2 = _mm_shuffle_epi8(b.into(), mask); + _mm_unpacklo_epi64(t1, t2).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_high_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - unsafe { - let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15); - let t1 = _mm_shuffle_epi8(a.into(), mask); - let t2 = _mm_shuffle_epi8(b.into(), mask); - _mm_unpackhi_epi64(t1, t2).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: i16x8, b: i16x8) -> i16x8 { + let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15); + let t1 = _mm_shuffle_epi8(a.into(), mask); + let t2 = _mm_shuffle_epi8(b.into(), mask); + _mm_unpackhi_epi64(t1, t2).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn interleave_i16x8(self, a: i16x8, b: i16x8) -> (i16x8, i16x8) { @@ -1153,15 +1801,38 @@ impl Simd for Sse4_2 { } #[inline(always)] fn select_i16x8(self, a: mask16x8, b: i16x8, c: i16x8) -> i16x8 { - unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Sse4_2, + a: mask16x8, + b: i16x8, + c: i16x8, + ) -> i16x8 { + _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] fn min_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - unsafe { _mm_min_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: i16x8, b: i16x8) -> i16x8 { + _mm_min_epi16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn max_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - unsafe { _mm_max_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: i16x8, b: i16x8) -> i16x8 { + _mm_max_epi16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn combine_i16x8(self, a: i16x8, b: i16x8) -> i16x16 { @@ -1172,19 +1843,43 @@ impl Simd for Sse4_2 { } #[inline(always)] fn neg_i16x8(self, a: i16x8) -> i16x8 { - unsafe { _mm_sub_epi16(_mm_setzero_si128(), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: i16x8) -> i16x8 { + _mm_sub_epi16(_mm_setzero_si128(), a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn reinterpret_u8_i16x8(self, a: i16x8) -> u8x16 { - __m128i::from(a).simd_into(self) + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: i16x8) -> u8x16 { + __m128i::from(a).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn reinterpret_u32_i16x8(self, a: i16x8) -> u32x4 { - __m128i::from(a).simd_into(self) + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: i16x8) -> u32x4 { + __m128i::from(a).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn splat_u16x8(self, val: u16) -> u16x8 { - unsafe { _mm_set1_epi16(val.cast_signed()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, val: u16) -> u16x8 { + _mm_set1_epi16(val.cast_signed()).simd_into(token) + } + ); + kernel(self, val) } #[inline(always)] fn load_array_u16x8(self, val: [u16; 8usize]) -> u16x8 { @@ -1257,27 +1952,63 @@ impl Simd for Sse4_2 { } #[inline(always)] fn add_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - unsafe { _mm_add_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u16x8, b: u16x8) -> u16x8 { + _mm_add_epi16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn sub_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - unsafe { _mm_sub_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u16x8, b: u16x8) -> u16x8 { + _mm_sub_epi16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn mul_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - unsafe { _mm_mullo_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u16x8, b: u16x8) -> u16x8 { + _mm_mullo_epi16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn and_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u16x8, b: u16x8) -> u16x8 { + _mm_and_si128(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn or_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u16x8, b: u16x8) -> u16x8 { + _mm_or_si128(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn xor_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u16x8, b: u16x8) -> u16x8 { + _mm_xor_si128(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn not_u16x8(self, a: u16x8) -> u16x8 { @@ -1285,7 +2016,13 @@ impl Simd for Sse4_2 { } #[inline(always)] fn shl_u16x8(self, a: u16x8, shift: u32) -> u16x8 { - unsafe { _mm_sll_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u16x8, shift: u32) -> u16x8 { + _mm_sll_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) + } + ); + kernel(self, a, shift) } #[inline(always)] fn shlv_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { @@ -1293,7 +2030,13 @@ impl Simd for Sse4_2 { } #[inline(always)] fn shr_u16x8(self, a: u16x8, shift: u32) -> u16x8 { - unsafe { _mm_srl_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u16x8, shift: u32) -> u16x8 { + _mm_srl_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) + } + ); + kernel(self, a, shift) } #[inline(always)] fn shrv_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { @@ -1301,59 +2044,105 @@ impl Simd for Sse4_2 { } #[inline(always)] fn simd_eq_u16x8(self, a: u16x8, b: u16x8) -> mask16x8 { - unsafe { _mm_cmpeq_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u16x8, b: u16x8) -> mask16x8 { + _mm_cmpeq_epi16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_lt_u16x8(self, a: u16x8, b: u16x8) -> mask16x8 { - unsafe { - let sign_bit = _mm_set1_epi16(0x8000u16.cast_signed()); - let a_signed = _mm_xor_si128(a.into(), sign_bit); - let b_signed = _mm_xor_si128(b.into(), sign_bit); - _mm_cmpgt_epi16(b_signed, a_signed).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u16x8, b: u16x8) -> mask16x8 { + let sign_bit = _mm_set1_epi16(0x8000u16.cast_signed()); + let a_signed = _mm_xor_si128(a.into(), sign_bit); + let b_signed = _mm_xor_si128(b.into(), sign_bit); + _mm_cmpgt_epi16(b_signed, a_signed).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_le_u16x8(self, a: u16x8, b: u16x8) -> mask16x8 { - unsafe { _mm_cmpeq_epi16(_mm_min_epu16(a.into(), b.into()), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u16x8, b: u16x8) -> mask16x8 { + _mm_cmpeq_epi16(_mm_min_epu16(a.into(), b.into()), a.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_ge_u16x8(self, a: u16x8, b: u16x8) -> mask16x8 { - unsafe { _mm_cmpeq_epi16(_mm_max_epu16(a.into(), b.into()), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u16x8, b: u16x8) -> mask16x8 { + _mm_cmpeq_epi16(_mm_max_epu16(a.into(), b.into()), a.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_gt_u16x8(self, a: u16x8, b: u16x8) -> mask16x8 { - unsafe { - let sign_bit = _mm_set1_epi16(0x8000u16.cast_signed()); - let a_signed = _mm_xor_si128(a.into(), sign_bit); - let b_signed = _mm_xor_si128(b.into(), sign_bit); - _mm_cmpgt_epi16(a_signed, b_signed).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u16x8, b: u16x8) -> mask16x8 { + let sign_bit = _mm_set1_epi16(0x8000u16.cast_signed()); + let a_signed = _mm_xor_si128(a.into(), sign_bit); + let b_signed = _mm_xor_si128(b.into(), sign_bit); + _mm_cmpgt_epi16(a_signed, b_signed).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn zip_low_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - unsafe { _mm_unpacklo_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u16x8, b: u16x8) -> u16x8 { + _mm_unpacklo_epi16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn zip_high_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - unsafe { _mm_unpackhi_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u16x8, b: u16x8) -> u16x8 { + _mm_unpackhi_epi16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_low_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - unsafe { - let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15); - let t1 = _mm_shuffle_epi8(a.into(), mask); - let t2 = _mm_shuffle_epi8(b.into(), mask); - _mm_unpacklo_epi64(t1, t2).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u16x8, b: u16x8) -> u16x8 { + let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15); + let t1 = _mm_shuffle_epi8(a.into(), mask); + let t2 = _mm_shuffle_epi8(b.into(), mask); + _mm_unpacklo_epi64(t1, t2).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_high_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - unsafe { - let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15); - let t1 = _mm_shuffle_epi8(a.into(), mask); - let t2 = _mm_shuffle_epi8(b.into(), mask); - _mm_unpackhi_epi64(t1, t2).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u16x8, b: u16x8) -> u16x8 { + let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15); + let t1 = _mm_shuffle_epi8(a.into(), mask); + let t2 = _mm_shuffle_epi8(b.into(), mask); + _mm_unpackhi_epi64(t1, t2).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn interleave_u16x8(self, a: u16x8, b: u16x8) -> (u16x8, u16x8) { @@ -1365,15 +2154,38 @@ impl Simd for Sse4_2 { } #[inline(always)] fn select_u16x8(self, a: mask16x8, b: u16x8, c: u16x8) -> u16x8 { - unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Sse4_2, + a: mask16x8, + b: u16x8, + c: u16x8, + ) -> u16x8 { + _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] fn min_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - unsafe { _mm_min_epu16(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u16x8, b: u16x8) -> u16x8 { + _mm_min_epu16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn max_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - unsafe { _mm_max_epu16(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u16x8, b: u16x8) -> u16x8 { + _mm_max_epu16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn combine_u16x8(self, a: u16x8, b: u16x8) -> u16x16 { @@ -1384,18 +2196,34 @@ impl Simd for Sse4_2 { } #[inline(always)] fn reinterpret_u8_u16x8(self, a: u16x8) -> u8x16 { - __m128i::from(a).simd_into(self) + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u16x8) -> u8x16 { + __m128i::from(a).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn reinterpret_u32_u16x8(self, a: u16x8) -> u32x4 { - __m128i::from(a).simd_into(self) + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u16x8) -> u32x4 { + __m128i::from(a).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn splat_mask16x8(self, val: bool) -> mask16x8 { - unsafe { - let val: i16 = if val { !0 } else { 0 }; - _mm_set1_epi16(val).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, val: bool) -> mask16x8 { + let val: i16 = if val { !0 } else { 0 }; + _mm_set1_epi16(val).simd_into(token) + } + ); + kernel(self, val) } #[inline(always)] fn load_array_mask16x8(self, val: [i16; 8usize]) -> mask16x8 { @@ -1410,35 +2238,61 @@ impl Simd for Sse4_2 { } #[inline(always)] fn from_bitmask_mask16x8(self, bits: u64) -> mask16x8 { - unsafe { - { - let bit_lanes = _mm_set1_epi16(bits as i16); - let bit_mask = _mm_setr_epi16(1, 2, 4, 8, 16, 32, 64, 128); - _mm_cmpeq_epi16(_mm_and_si128(bit_lanes, bit_mask), bit_mask) + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, bits: u64) -> mask16x8 { + { + let bit_lanes = _mm_set1_epi16(bits as i16); + let bit_mask = _mm_setr_epi16(1, 2, 4, 8, 16, 32, 64, 128); + _mm_cmpeq_epi16(_mm_and_si128(bit_lanes, bit_mask), bit_mask) + } + .simd_into(token) } - .simd_into(self) - } + ); + kernel(self, bits) } #[inline(always)] fn to_bitmask_mask16x8(self, a: mask16x8) -> u64 { - unsafe { - { - let packed = _mm_packs_epi16(a.into(), a.into()); - _mm_movemask_epi8(packed) as u8 as u64 + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: mask16x8) -> u64 { + { + let packed = _mm_packs_epi16(a.into(), a.into()); + _mm_movemask_epi8(packed) as u8 as u64 + } } - } + ); + kernel(self, a) } #[inline(always)] fn and_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x8 { - unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: mask16x8, b: mask16x8) -> mask16x8 { + _mm_and_si128(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn or_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x8 { - unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: mask16x8, b: mask16x8) -> mask16x8 { + _mm_or_si128(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn xor_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x8 { - unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: mask16x8, b: mask16x8) -> mask16x8 { + _mm_xor_si128(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn not_mask16x8(self, a: mask16x8) -> mask16x8 { @@ -1451,27 +2305,68 @@ impl Simd for Sse4_2 { b: mask16x8, c: mask16x8, ) -> mask16x8 { - unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Sse4_2, + a: mask16x8, + b: mask16x8, + c: mask16x8, + ) -> mask16x8 { + _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] fn simd_eq_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x8 { - unsafe { _mm_cmpeq_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: mask16x8, b: mask16x8) -> mask16x8 { + _mm_cmpeq_epi16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn any_true_mask16x8(self, a: mask16x8) -> bool { - unsafe { _mm_movemask_epi8(a.into()) as u32 != 0 } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: mask16x8) -> bool { + _mm_movemask_epi8(a.into()) as u32 != 0 + } + ); + kernel(self, a) } #[inline(always)] fn all_true_mask16x8(self, a: mask16x8) -> bool { - unsafe { _mm_movemask_epi8(a.into()) as u32 == 0xffff } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: mask16x8) -> bool { + _mm_movemask_epi8(a.into()) as u32 == 0xffff + } + ); + kernel(self, a) } #[inline(always)] fn any_false_mask16x8(self, a: mask16x8) -> bool { - unsafe { _mm_movemask_epi8(a.into()) as u32 != 0xffff } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: mask16x8) -> bool { + _mm_movemask_epi8(a.into()) as u32 != 0xffff + } + ); + kernel(self, a) } #[inline(always)] fn all_false_mask16x8(self, a: mask16x8) -> bool { - unsafe { _mm_movemask_epi8(a.into()) as u32 == 0 } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: mask16x8) -> bool { + _mm_movemask_epi8(a.into()) as u32 == 0 + } + ); + kernel(self, a) } #[inline(always)] fn combine_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x16 { @@ -1482,7 +2377,13 @@ impl Simd for Sse4_2 { } #[inline(always)] fn splat_i32x4(self, val: i32) -> i32x4 { - unsafe { _mm_set1_epi32(val).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, val: i32) -> i32x4 { + _mm_set1_epi32(val).simd_into(token) + } + ); + kernel(self, val) } #[inline(always)] fn load_array_i32x4(self, val: [i32; 4usize]) -> i32x4 { @@ -1555,27 +2456,63 @@ impl Simd for Sse4_2 { } #[inline(always)] fn add_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - unsafe { _mm_add_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: i32x4, b: i32x4) -> i32x4 { + _mm_add_epi32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn sub_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - unsafe { _mm_sub_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: i32x4, b: i32x4) -> i32x4 { + _mm_sub_epi32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn mul_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - unsafe { _mm_mullo_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: i32x4, b: i32x4) -> i32x4 { + _mm_mullo_epi32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn and_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: i32x4, b: i32x4) -> i32x4 { + _mm_and_si128(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn or_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: i32x4, b: i32x4) -> i32x4 { + _mm_or_si128(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn xor_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: i32x4, b: i32x4) -> i32x4 { + _mm_xor_si128(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn not_i32x4(self, a: i32x4) -> i32x4 { @@ -1583,7 +2520,13 @@ impl Simd for Sse4_2 { } #[inline(always)] fn shl_i32x4(self, a: i32x4, shift: u32) -> i32x4 { - unsafe { _mm_sll_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: i32x4, shift: u32) -> i32x4 { + _mm_sll_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) + } + ); + kernel(self, a, shift) } #[inline(always)] fn shlv_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { @@ -1591,7 +2534,13 @@ impl Simd for Sse4_2 { } #[inline(always)] fn shr_i32x4(self, a: i32x4, shift: u32) -> i32x4 { - unsafe { _mm_sra_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: i32x4, shift: u32) -> i32x4 { + _mm_sra_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) + } + ); + kernel(self, a, shift) } #[inline(always)] fn shrv_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { @@ -1599,47 +2548,97 @@ impl Simd for Sse4_2 { } #[inline(always)] fn simd_eq_i32x4(self, a: i32x4, b: i32x4) -> mask32x4 { - unsafe { _mm_cmpeq_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: i32x4, b: i32x4) -> mask32x4 { + _mm_cmpeq_epi32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_lt_i32x4(self, a: i32x4, b: i32x4) -> mask32x4 { - unsafe { _mm_cmpgt_epi32(b.into(), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: i32x4, b: i32x4) -> mask32x4 { + _mm_cmpgt_epi32(b.into(), a.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_le_i32x4(self, a: i32x4, b: i32x4) -> mask32x4 { - unsafe { _mm_cmpeq_epi32(_mm_min_epi32(a.into(), b.into()), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: i32x4, b: i32x4) -> mask32x4 { + _mm_cmpeq_epi32(_mm_min_epi32(a.into(), b.into()), a.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_ge_i32x4(self, a: i32x4, b: i32x4) -> mask32x4 { - unsafe { _mm_cmpeq_epi32(_mm_max_epi32(a.into(), b.into()), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: i32x4, b: i32x4) -> mask32x4 { + _mm_cmpeq_epi32(_mm_max_epi32(a.into(), b.into()), a.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_gt_i32x4(self, a: i32x4, b: i32x4) -> mask32x4 { - unsafe { _mm_cmpgt_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: i32x4, b: i32x4) -> mask32x4 { + _mm_cmpgt_epi32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn zip_low_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - unsafe { _mm_unpacklo_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: i32x4, b: i32x4) -> i32x4 { + _mm_unpacklo_epi32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn zip_high_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - unsafe { _mm_unpackhi_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: i32x4, b: i32x4) -> i32x4 { + _mm_unpackhi_epi32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_low_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - unsafe { - let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into()); - let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into()); - _mm_unpacklo_epi64(t1, t2).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: i32x4, b: i32x4) -> i32x4 { + let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into()); + let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into()); + _mm_unpacklo_epi64(t1, t2).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_high_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - unsafe { - let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into()); - let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into()); - _mm_unpackhi_epi64(t1, t2).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: i32x4, b: i32x4) -> i32x4 { + let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into()); + let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into()); + _mm_unpackhi_epi64(t1, t2).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn interleave_i32x4(self, a: i32x4, b: i32x4) -> (i32x4, i32x4) { @@ -1651,15 +2650,38 @@ impl Simd for Sse4_2 { } #[inline(always)] fn select_i32x4(self, a: mask32x4, b: i32x4, c: i32x4) -> i32x4 { - unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Sse4_2, + a: mask32x4, + b: i32x4, + c: i32x4, + ) -> i32x4 { + _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] fn min_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - unsafe { _mm_min_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: i32x4, b: i32x4) -> i32x4 { + _mm_min_epi32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn max_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - unsafe { _mm_max_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: i32x4, b: i32x4) -> i32x4 { + _mm_max_epi32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn combine_i32x4(self, a: i32x4, b: i32x4) -> i32x8 { @@ -1670,23 +2692,53 @@ impl Simd for Sse4_2 { } #[inline(always)] fn neg_i32x4(self, a: i32x4) -> i32x4 { - unsafe { _mm_sub_epi32(_mm_setzero_si128(), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: i32x4) -> i32x4 { + _mm_sub_epi32(_mm_setzero_si128(), a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn reinterpret_u8_i32x4(self, a: i32x4) -> u8x16 { - __m128i::from(a).simd_into(self) + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: i32x4) -> u8x16 { + __m128i::from(a).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn reinterpret_u32_i32x4(self, a: i32x4) -> u32x4 { - __m128i::from(a).simd_into(self) + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: i32x4) -> u32x4 { + __m128i::from(a).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn cvt_f32_i32x4(self, a: i32x4) -> f32x4 { - unsafe { _mm_cvtepi32_ps(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: i32x4) -> f32x4 { + _mm_cvtepi32_ps(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn splat_u32x4(self, val: u32) -> u32x4 { - unsafe { _mm_set1_epi32(val.cast_signed()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, val: u32) -> u32x4 { + _mm_set1_epi32(val.cast_signed()).simd_into(token) + } + ); + kernel(self, val) } #[inline(always)] fn load_array_u32x4(self, val: [u32; 4usize]) -> u32x4 { @@ -1759,27 +2811,63 @@ impl Simd for Sse4_2 { } #[inline(always)] fn add_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - unsafe { _mm_add_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u32x4, b: u32x4) -> u32x4 { + _mm_add_epi32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn sub_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - unsafe { _mm_sub_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u32x4, b: u32x4) -> u32x4 { + _mm_sub_epi32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn mul_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - unsafe { _mm_mullo_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u32x4, b: u32x4) -> u32x4 { + _mm_mullo_epi32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn and_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u32x4, b: u32x4) -> u32x4 { + _mm_and_si128(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn or_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u32x4, b: u32x4) -> u32x4 { + _mm_or_si128(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn xor_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u32x4, b: u32x4) -> u32x4 { + _mm_xor_si128(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn not_u32x4(self, a: u32x4) -> u32x4 { @@ -1787,7 +2875,13 @@ impl Simd for Sse4_2 { } #[inline(always)] fn shl_u32x4(self, a: u32x4, shift: u32) -> u32x4 { - unsafe { _mm_sll_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u32x4, shift: u32) -> u32x4 { + _mm_sll_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) + } + ); + kernel(self, a, shift) } #[inline(always)] fn shlv_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { @@ -1795,7 +2889,13 @@ impl Simd for Sse4_2 { } #[inline(always)] fn shr_u32x4(self, a: u32x4, shift: u32) -> u32x4 { - unsafe { _mm_srl_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u32x4, shift: u32) -> u32x4 { + _mm_srl_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) + } + ); + kernel(self, a, shift) } #[inline(always)] fn shrv_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { @@ -1803,57 +2903,103 @@ impl Simd for Sse4_2 { } #[inline(always)] fn simd_eq_u32x4(self, a: u32x4, b: u32x4) -> mask32x4 { - unsafe { _mm_cmpeq_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u32x4, b: u32x4) -> mask32x4 { + _mm_cmpeq_epi32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_lt_u32x4(self, a: u32x4, b: u32x4) -> mask32x4 { - unsafe { - let sign_bit = _mm_set1_epi32(0x80000000u32.cast_signed()); - let a_signed = _mm_xor_si128(a.into(), sign_bit); - let b_signed = _mm_xor_si128(b.into(), sign_bit); - _mm_cmpgt_epi32(b_signed, a_signed).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u32x4, b: u32x4) -> mask32x4 { + let sign_bit = _mm_set1_epi32(0x80000000u32.cast_signed()); + let a_signed = _mm_xor_si128(a.into(), sign_bit); + let b_signed = _mm_xor_si128(b.into(), sign_bit); + _mm_cmpgt_epi32(b_signed, a_signed).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_le_u32x4(self, a: u32x4, b: u32x4) -> mask32x4 { - unsafe { _mm_cmpeq_epi32(_mm_min_epu32(a.into(), b.into()), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u32x4, b: u32x4) -> mask32x4 { + _mm_cmpeq_epi32(_mm_min_epu32(a.into(), b.into()), a.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_ge_u32x4(self, a: u32x4, b: u32x4) -> mask32x4 { - unsafe { _mm_cmpeq_epi32(_mm_max_epu32(a.into(), b.into()), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u32x4, b: u32x4) -> mask32x4 { + _mm_cmpeq_epi32(_mm_max_epu32(a.into(), b.into()), a.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_gt_u32x4(self, a: u32x4, b: u32x4) -> mask32x4 { - unsafe { - let sign_bit = _mm_set1_epi32(0x80000000u32.cast_signed()); - let a_signed = _mm_xor_si128(a.into(), sign_bit); - let b_signed = _mm_xor_si128(b.into(), sign_bit); - _mm_cmpgt_epi32(a_signed, b_signed).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u32x4, b: u32x4) -> mask32x4 { + let sign_bit = _mm_set1_epi32(0x80000000u32.cast_signed()); + let a_signed = _mm_xor_si128(a.into(), sign_bit); + let b_signed = _mm_xor_si128(b.into(), sign_bit); + _mm_cmpgt_epi32(a_signed, b_signed).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn zip_low_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - unsafe { _mm_unpacklo_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u32x4, b: u32x4) -> u32x4 { + _mm_unpacklo_epi32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn zip_high_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - unsafe { _mm_unpackhi_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u32x4, b: u32x4) -> u32x4 { + _mm_unpackhi_epi32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_low_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - unsafe { - let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into()); - let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into()); - _mm_unpacklo_epi64(t1, t2).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u32x4, b: u32x4) -> u32x4 { + let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into()); + let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into()); + _mm_unpacklo_epi64(t1, t2).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_high_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - unsafe { - let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into()); - let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into()); - _mm_unpackhi_epi64(t1, t2).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u32x4, b: u32x4) -> u32x4 { + let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into()); + let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into()); + _mm_unpackhi_epi64(t1, t2).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn interleave_u32x4(self, a: u32x4, b: u32x4) -> (u32x4, u32x4) { @@ -1865,15 +3011,38 @@ impl Simd for Sse4_2 { } #[inline(always)] fn select_u32x4(self, a: mask32x4, b: u32x4, c: u32x4) -> u32x4 { - unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Sse4_2, + a: mask32x4, + b: u32x4, + c: u32x4, + ) -> u32x4 { + _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] fn min_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - unsafe { _mm_min_epu32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u32x4, b: u32x4) -> u32x4 { + _mm_min_epu32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn max_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - unsafe { _mm_max_epu32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u32x4, b: u32x4) -> u32x4 { + _mm_max_epu32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn combine_u32x4(self, a: u32x4, b: u32x4) -> u32x8 { @@ -1884,28 +3053,43 @@ impl Simd for Sse4_2 { } #[inline(always)] fn reinterpret_u8_u32x4(self, a: u32x4) -> u8x16 { - __m128i::from(a).simd_into(self) + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u32x4) -> u8x16 { + __m128i::from(a).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn cvt_f32_u32x4(self, a: u32x4) -> f32x4 { - unsafe { - let a = a.into(); - let lo = _mm_blend_epi16::<0xAA>(a, _mm_set1_epi32(0x4B000000)); - let hi = _mm_blend_epi16::<0xAA>(_mm_srli_epi32::<16>(a), _mm_set1_epi32(0x53000000)); - let fhi = _mm_sub_ps( - _mm_castsi128_ps(hi), - _mm_set1_ps(f32::from_bits(0x53000080)), - ); - let result = _mm_add_ps(_mm_castsi128_ps(lo), fhi); - result.simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u32x4) -> f32x4 { + let a = a.into(); + let lo = _mm_blend_epi16::<0xAA>(a, _mm_set1_epi32(0x4B000000)); + let hi = + _mm_blend_epi16::<0xAA>(_mm_srli_epi32::<16>(a), _mm_set1_epi32(0x53000000)); + let fhi = _mm_sub_ps( + _mm_castsi128_ps(hi), + _mm_set1_ps(f32::from_bits(0x53000080)), + ); + let result = _mm_add_ps(_mm_castsi128_ps(lo), fhi); + result.simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn splat_mask32x4(self, val: bool) -> mask32x4 { - unsafe { - let val: i32 = if val { !0 } else { 0 }; - _mm_set1_epi32(val).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, val: bool) -> mask32x4 { + let val: i32 = if val { !0 } else { 0 }; + _mm_set1_epi32(val).simd_into(token) + } + ); + kernel(self, val) } #[inline(always)] fn load_array_mask32x4(self, val: [i32; 4usize]) -> mask32x4 { @@ -1920,30 +3104,58 @@ impl Simd for Sse4_2 { } #[inline(always)] fn from_bitmask_mask32x4(self, bits: u64) -> mask32x4 { - unsafe { - { - let bit_lanes = _mm_set1_epi32(bits as i32); - let bit_mask = _mm_setr_epi32(1, 2, 4, 8); - _mm_cmpeq_epi32(_mm_and_si128(bit_lanes, bit_mask), bit_mask) + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, bits: u64) -> mask32x4 { + { + let bit_lanes = _mm_set1_epi32(bits as i32); + let bit_mask = _mm_setr_epi32(1, 2, 4, 8); + _mm_cmpeq_epi32(_mm_and_si128(bit_lanes, bit_mask), bit_mask) + } + .simd_into(token) } - .simd_into(self) - } + ); + kernel(self, bits) } #[inline(always)] fn to_bitmask_mask32x4(self, a: mask32x4) -> u64 { - unsafe { _mm_movemask_ps(_mm_castsi128_ps(a.into())) as u32 as u64 } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: mask32x4) -> u64 { + _mm_movemask_ps(_mm_castsi128_ps(a.into())) as u32 as u64 + } + ); + kernel(self, a) } #[inline(always)] fn and_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x4 { - unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: mask32x4, b: mask32x4) -> mask32x4 { + _mm_and_si128(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn or_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x4 { - unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: mask32x4, b: mask32x4) -> mask32x4 { + _mm_or_si128(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn xor_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x4 { - unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: mask32x4, b: mask32x4) -> mask32x4 { + _mm_xor_si128(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn not_mask32x4(self, a: mask32x4) -> mask32x4 { @@ -1956,27 +3168,68 @@ impl Simd for Sse4_2 { b: mask32x4, c: mask32x4, ) -> mask32x4 { - unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Sse4_2, + a: mask32x4, + b: mask32x4, + c: mask32x4, + ) -> mask32x4 { + _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] fn simd_eq_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x4 { - unsafe { _mm_cmpeq_epi32(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: mask32x4, b: mask32x4) -> mask32x4 { + _mm_cmpeq_epi32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn any_true_mask32x4(self, a: mask32x4) -> bool { - unsafe { _mm_movemask_ps(_mm_castsi128_ps(a.into())) as u32 != 0 } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: mask32x4) -> bool { + _mm_movemask_ps(_mm_castsi128_ps(a.into())) as u32 != 0 + } + ); + kernel(self, a) } #[inline(always)] fn all_true_mask32x4(self, a: mask32x4) -> bool { - unsafe { _mm_movemask_ps(_mm_castsi128_ps(a.into())) as u32 == 0b1111 } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: mask32x4) -> bool { + _mm_movemask_ps(_mm_castsi128_ps(a.into())) as u32 == 0b1111 + } + ); + kernel(self, a) } #[inline(always)] fn any_false_mask32x4(self, a: mask32x4) -> bool { - unsafe { _mm_movemask_ps(_mm_castsi128_ps(a.into())) as u32 != 0b1111 } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: mask32x4) -> bool { + _mm_movemask_ps(_mm_castsi128_ps(a.into())) as u32 != 0b1111 + } + ); + kernel(self, a) } #[inline(always)] fn all_false_mask32x4(self, a: mask32x4) -> bool { - unsafe { _mm_movemask_ps(_mm_castsi128_ps(a.into())) as u32 == 0 } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: mask32x4) -> bool { + _mm_movemask_ps(_mm_castsi128_ps(a.into())) as u32 == 0 + } + ); + kernel(self, a) } #[inline(always)] fn combine_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x8 { @@ -1987,7 +3240,13 @@ impl Simd for Sse4_2 { } #[inline(always)] fn splat_f64x2(self, val: f64) -> f64x2 { - unsafe { _mm_set1_pd(val).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, val: f64) -> f64x2 { + _mm_set1_pd(val).simd_into(token) + } + ); + kernel(self, val) } #[inline(always)] fn load_array_f64x2(self, val: [f64; 2usize]) -> f64x2 { @@ -2060,15 +3319,33 @@ impl Simd for Sse4_2 { } #[inline(always)] fn abs_f64x2(self, a: f64x2) -> f64x2 { - unsafe { _mm_andnot_pd(_mm_set1_pd(-0.0), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: f64x2) -> f64x2 { + _mm_andnot_pd(_mm_set1_pd(-0.0), a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn neg_f64x2(self, a: f64x2) -> f64x2 { - unsafe { _mm_xor_pd(a.into(), _mm_set1_pd(-0.0)).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: f64x2) -> f64x2 { + _mm_xor_pd(a.into(), _mm_set1_pd(-0.0)).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn sqrt_f64x2(self, a: f64x2) -> f64x2 { - unsafe { _mm_sqrt_pd(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: f64x2) -> f64x2 { + _mm_sqrt_pd(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn approximate_recip_f64x2(self, a: f64x2) -> f64x2 { @@ -2076,62 +3353,145 @@ impl Simd for Sse4_2 { } #[inline(always)] fn add_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - unsafe { _mm_add_pd(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: f64x2, b: f64x2) -> f64x2 { + _mm_add_pd(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn sub_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - unsafe { _mm_sub_pd(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: f64x2, b: f64x2) -> f64x2 { + _mm_sub_pd(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn mul_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - unsafe { _mm_mul_pd(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: f64x2, b: f64x2) -> f64x2 { + _mm_mul_pd(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn div_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - unsafe { _mm_div_pd(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: f64x2, b: f64x2) -> f64x2 { + _mm_div_pd(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn copysign_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - unsafe { - let mask = _mm_set1_pd(-0.0); - _mm_or_pd(_mm_and_pd(mask, b.into()), _mm_andnot_pd(mask, a.into())).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: f64x2, b: f64x2) -> f64x2 { + let mask = _mm_set1_pd(-0.0); + _mm_or_pd(_mm_and_pd(mask, b.into()), _mm_andnot_pd(mask, a.into())) + .simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_eq_f64x2(self, a: f64x2, b: f64x2) -> mask64x2 { - unsafe { _mm_castpd_si128(_mm_cmpeq_pd(a.into(), b.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: f64x2, b: f64x2) -> mask64x2 { + _mm_castpd_si128(_mm_cmpeq_pd(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_lt_f64x2(self, a: f64x2, b: f64x2) -> mask64x2 { - unsafe { _mm_castpd_si128(_mm_cmplt_pd(a.into(), b.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: f64x2, b: f64x2) -> mask64x2 { + _mm_castpd_si128(_mm_cmplt_pd(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_le_f64x2(self, a: f64x2, b: f64x2) -> mask64x2 { - unsafe { _mm_castpd_si128(_mm_cmple_pd(a.into(), b.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: f64x2, b: f64x2) -> mask64x2 { + _mm_castpd_si128(_mm_cmple_pd(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_ge_f64x2(self, a: f64x2, b: f64x2) -> mask64x2 { - unsafe { _mm_castpd_si128(_mm_cmpge_pd(a.into(), b.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: f64x2, b: f64x2) -> mask64x2 { + _mm_castpd_si128(_mm_cmpge_pd(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_gt_f64x2(self, a: f64x2, b: f64x2) -> mask64x2 { - unsafe { _mm_castpd_si128(_mm_cmpgt_pd(a.into(), b.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: f64x2, b: f64x2) -> mask64x2 { + _mm_castpd_si128(_mm_cmpgt_pd(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn zip_low_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - unsafe { _mm_unpacklo_pd(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: f64x2, b: f64x2) -> f64x2 { + _mm_unpacklo_pd(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn zip_high_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - unsafe { _mm_unpackhi_pd(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: f64x2, b: f64x2) -> f64x2 { + _mm_unpackhi_pd(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_low_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - unsafe { _mm_shuffle_pd::<0b00>(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: f64x2, b: f64x2) -> f64x2 { + _mm_shuffle_pd::<0b00>(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_high_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - unsafe { _mm_shuffle_pd::<0b11>(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: f64x2, b: f64x2) -> f64x2 { + _mm_shuffle_pd::<0b11>(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn interleave_f64x2(self, a: f64x2, b: f64x2) -> (f64x2, f64x2) { @@ -2143,27 +3503,47 @@ impl Simd for Sse4_2 { } #[inline(always)] fn max_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - unsafe { _mm_max_pd(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: f64x2, b: f64x2) -> f64x2 { + _mm_max_pd(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn min_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - unsafe { _mm_min_pd(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: f64x2, b: f64x2) -> f64x2 { + _mm_min_pd(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn max_precise_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - unsafe { - let intermediate = _mm_max_pd(a.into(), b.into()); - let b_is_nan = _mm_cmpunord_pd(b.into(), b.into()); - _mm_blendv_pd(intermediate, a.into(), b_is_nan).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: f64x2, b: f64x2) -> f64x2 { + let intermediate = _mm_max_pd(a.into(), b.into()); + let b_is_nan = _mm_cmpunord_pd(b.into(), b.into()); + _mm_blendv_pd(intermediate, a.into(), b_is_nan).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn min_precise_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - unsafe { - let intermediate = _mm_min_pd(a.into(), b.into()); - let b_is_nan = _mm_cmpunord_pd(b.into(), b.into()); - _mm_blendv_pd(intermediate, a.into(), b_is_nan).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: f64x2, b: f64x2) -> f64x2 { + let intermediate = _mm_min_pd(a.into(), b.into()); + let b_is_nan = _mm_cmpunord_pd(b.into(), b.into()); + _mm_blendv_pd(intermediate, a.into(), b_is_nan).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn mul_add_f64x2(self, a: f64x2, b: f64x2, c: f64x2) -> f64x2 { @@ -2175,22 +3555,36 @@ impl Simd for Sse4_2 { } #[inline(always)] fn floor_f64x2(self, a: f64x2) -> f64x2 { - unsafe { - _mm_round_pd::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: f64x2) -> f64x2 { + _mm_round_pd::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into()) + .simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn ceil_f64x2(self, a: f64x2) -> f64x2 { - unsafe { - _mm_round_pd::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: f64x2) -> f64x2 { + _mm_round_pd::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a.into()) + .simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn round_ties_even_f64x2(self, a: f64x2) -> f64x2 { - unsafe { - _mm_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a.into()) - .simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: f64x2) -> f64x2 { + _mm_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a.into()) + .simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn fract_f64x2(self, a: f64x2) -> f64x2 { @@ -2198,13 +3592,29 @@ impl Simd for Sse4_2 { } #[inline(always)] fn trunc_f64x2(self, a: f64x2) -> f64x2 { - unsafe { - _mm_round_pd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: f64x2) -> f64x2 { + _mm_round_pd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into()) + .simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn select_f64x2(self, a: mask64x2, b: f64x2, c: f64x2) -> f64x2 { - unsafe { _mm_blendv_pd(c.into(), b.into(), _mm_castsi128_pd(a.into())).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Sse4_2, + a: mask64x2, + b: f64x2, + c: f64x2, + ) -> f64x2 { + _mm_blendv_pd(c.into(), b.into(), _mm_castsi128_pd(a.into())).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] fn combine_f64x2(self, a: f64x2, b: f64x2) -> f64x4 { @@ -2215,14 +3625,24 @@ impl Simd for Sse4_2 { } #[inline(always)] fn reinterpret_f32_f64x2(self, a: f64x2) -> f32x4 { - unsafe { _mm_castpd_ps(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: f64x2) -> f32x4 { + _mm_castpd_ps(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn splat_mask64x2(self, val: bool) -> mask64x2 { - unsafe { - let val: i64 = if val { !0 } else { 0 }; - _mm_set1_epi64x(val).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, val: bool) -> mask64x2 { + let val: i64 = if val { !0 } else { 0 }; + _mm_set1_epi64x(val).simd_into(token) + } + ); + kernel(self, val) } #[inline(always)] fn load_array_mask64x2(self, val: [i64; 2usize]) -> mask64x2 { @@ -2237,30 +3657,58 @@ impl Simd for Sse4_2 { } #[inline(always)] fn from_bitmask_mask64x2(self, bits: u64) -> mask64x2 { - unsafe { - { - let bit_lanes = _mm_set1_epi64x(bits.cast_signed()); - let bit_mask = _mm_set_epi64x(2, 1); - _mm_cmpeq_epi64(_mm_and_si128(bit_lanes, bit_mask), bit_mask) + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, bits: u64) -> mask64x2 { + { + let bit_lanes = _mm_set1_epi64x(bits.cast_signed()); + let bit_mask = _mm_set_epi64x(2, 1); + _mm_cmpeq_epi64(_mm_and_si128(bit_lanes, bit_mask), bit_mask) + } + .simd_into(token) } - .simd_into(self) - } + ); + kernel(self, bits) } #[inline(always)] fn to_bitmask_mask64x2(self, a: mask64x2) -> u64 { - unsafe { _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 as u64 } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: mask64x2) -> u64 { + _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 as u64 + } + ); + kernel(self, a) } #[inline(always)] fn and_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { - unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: mask64x2, b: mask64x2) -> mask64x2 { + _mm_and_si128(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn or_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { - unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: mask64x2, b: mask64x2) -> mask64x2 { + _mm_or_si128(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn xor_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { - unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: mask64x2, b: mask64x2) -> mask64x2 { + _mm_xor_si128(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn not_mask64x2(self, a: mask64x2) -> mask64x2 { @@ -2273,27 +3721,68 @@ impl Simd for Sse4_2 { b: mask64x2, c: mask64x2, ) -> mask64x2 { - unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Sse4_2, + a: mask64x2, + b: mask64x2, + c: mask64x2, + ) -> mask64x2 { + _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] fn simd_eq_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { - unsafe { _mm_cmpeq_epi64(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: mask64x2, b: mask64x2) -> mask64x2 { + _mm_cmpeq_epi64(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn any_true_mask64x2(self, a: mask64x2) -> bool { - unsafe { _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 != 0 } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: mask64x2) -> bool { + _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 != 0 + } + ); + kernel(self, a) } #[inline(always)] fn all_true_mask64x2(self, a: mask64x2) -> bool { - unsafe { _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 == 0b11 } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: mask64x2) -> bool { + _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 == 0b11 + } + ); + kernel(self, a) } #[inline(always)] fn any_false_mask64x2(self, a: mask64x2) -> bool { - unsafe { _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 != 0b11 } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: mask64x2) -> bool { + _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 != 0b11 + } + ); + kernel(self, a) } #[inline(always)] fn all_false_mask64x2(self, a: mask64x2) -> bool { - unsafe { _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 == 0 } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: mask64x2) -> bool { + _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 == 0 + } + ); + kernel(self, a) } #[inline(always)] fn combine_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x4 { @@ -3882,14 +5371,18 @@ impl Simd for Sse4_2 { } #[inline(always)] fn narrow_u16x16(self, a: u16x16) -> u8x16 { - let (a, b) = self.split_u16x16(a); - unsafe { - let mask = _mm_set1_epi16(0xFF); - let lo_masked = _mm_and_si128(a.into(), mask); - let hi_masked = _mm_and_si128(b.into(), mask); - let result = _mm_packus_epi16(lo_masked, hi_masked); - result.simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u16x16) -> u8x16 { + let (a, b) = token.split_u16x16(a); + let mask = _mm_set1_epi16(0xFF); + let lo_masked = _mm_and_si128(a.into(), mask); + let hi_masked = _mm_and_si128(b.into(), mask); + let result = _mm_packus_epi16(lo_masked, hi_masked); + result.simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn reinterpret_u8_u16x16(self, a: u16x16) -> u8x32 { @@ -3928,12 +5421,16 @@ impl Simd for Sse4_2 { } #[inline(always)] fn to_bitmask_mask16x16(self, a: mask16x16) -> u64 { - unsafe { - { - let packed = _mm_packs_epi16(a.val.0[0], a.val.0[1]); - _mm_movemask_epi8(packed) as u32 as u64 + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: mask16x16) -> u64 { + { + let packed = _mm_packs_epi16(a.val.0[0], a.val.0[1]); + _mm_movemask_epi8(packed) as u32 as u64 + } } - } + ); + kernel(self, a) } #[inline(always)] fn and_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { @@ -6156,46 +7653,50 @@ impl Simd for Sse4_2 { } #[inline(always)] fn from_bitmask_mask8x64(self, bits: u64) -> mask8x64 { - unsafe { - { - let bit_bytes = _mm_set1_epi64x(bits.cast_signed()); - let bit_mask = - _mm_setr_epi8(1, 2, 4, 8, 16, 32, 64, -128, 1, 2, 4, 8, 16, 32, 64, -128); - mask8x64 { - val: crate::support::Aligned512([ - { - let bit_bytes = _mm_shuffle_epi8( - bit_bytes, - _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1), - ); - _mm_cmpeq_epi8(_mm_and_si128(bit_bytes, bit_mask), bit_mask) - }, - { - let bit_bytes = _mm_shuffle_epi8( - bit_bytes, - _mm_setr_epi8(2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3), - ); - _mm_cmpeq_epi8(_mm_and_si128(bit_bytes, bit_mask), bit_mask) - }, - { - let bit_bytes = _mm_shuffle_epi8( - bit_bytes, - _mm_setr_epi8(4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5), - ); - _mm_cmpeq_epi8(_mm_and_si128(bit_bytes, bit_mask), bit_mask) - }, - { - let bit_bytes = _mm_shuffle_epi8( - bit_bytes, - _mm_setr_epi8(6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7), - ); - _mm_cmpeq_epi8(_mm_and_si128(bit_bytes, bit_mask), bit_mask) - }, - ]), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, bits: u64) -> mask8x64 { + { + let bit_bytes = _mm_set1_epi64x(bits.cast_signed()); + let bit_mask = + _mm_setr_epi8(1, 2, 4, 8, 16, 32, 64, -128, 1, 2, 4, 8, 16, 32, 64, -128); + mask8x64 { + val: crate::support::Aligned512([ + { + let bit_bytes = _mm_shuffle_epi8( + bit_bytes, + _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1), + ); + _mm_cmpeq_epi8(_mm_and_si128(bit_bytes, bit_mask), bit_mask) + }, + { + let bit_bytes = _mm_shuffle_epi8( + bit_bytes, + _mm_setr_epi8(2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3), + ); + _mm_cmpeq_epi8(_mm_and_si128(bit_bytes, bit_mask), bit_mask) + }, + { + let bit_bytes = _mm_shuffle_epi8( + bit_bytes, + _mm_setr_epi8(4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5), + ); + _mm_cmpeq_epi8(_mm_and_si128(bit_bytes, bit_mask), bit_mask) + }, + { + let bit_bytes = _mm_shuffle_epi8( + bit_bytes, + _mm_setr_epi8(6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7), + ); + _mm_cmpeq_epi8(_mm_and_si128(bit_bytes, bit_mask), bit_mask) + }, + ]), + simd: token, + } } } - } + ); + kernel(self, bits) } #[inline(always)] fn to_bitmask_mask8x64(self, a: mask8x64) -> u64 { @@ -6929,15 +8430,19 @@ impl Simd for Sse4_2 { } #[inline(always)] fn to_bitmask_mask16x32(self, a: mask16x32) -> u64 { - unsafe { - { - let lo = _mm_packs_epi16(a.val.0[0], a.val.0[1]); - let hi = _mm_packs_epi16(a.val.0[2], a.val.0[3]); - let lo = _mm_movemask_epi8(lo) as u32 as u64; - let hi = _mm_movemask_epi8(hi) as u32 as u64; - lo | (hi << 16usize) + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: mask16x32) -> u64 { + { + let lo = _mm_packs_epi16(a.val.0[0], a.val.0[1]); + let hi = _mm_packs_epi16(a.val.0[2], a.val.0[3]); + let lo = _mm_movemask_epi8(lo) as u32 as u64; + let hi = _mm_movemask_epi8(hi) as u32 as u64; + lo | (hi << 16usize) + } } - } + ); + kernel(self, a) } #[inline(always)] fn and_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { diff --git a/fearless_simd_gen/src/mk_neon.rs b/fearless_simd_gen/src/mk_neon.rs index 129d4052..7acc32d0 100644 --- a/fearless_simd_gen/src/mk_neon.rs +++ b/fearless_simd_gen/src/mk_neon.rs @@ -83,14 +83,14 @@ impl Level for Neon { OpSig::Splat => { let expr = neon::expr(method, vec_ty, &[quote! { val }]); let normalize_mask = integer_lane_mask_splat_arg(vec_ty); - quote! { - #method_sig { - unsafe { - #normalize_mask - #expr.simd_into(self) - } - } - } + op.simd_trait_kernel_method( + self.token(), + vec_ty, + quote! { + #normalize_mask + #expr.simd_into(self) + }, + ) } OpSig::Shift => { let dup_type = vec_ty.cast(ScalarType::Int); @@ -109,26 +109,14 @@ impl Level for Neon { vec_ty, &[quote! { a.into() }, quote! { #dup_intrinsic ( #shift ) }], ); - quote! { - #method_sig { - unsafe { - #expr.simd_into(self) - } - } - } + op.simd_trait_kernel_method(self.token(), vec_ty, quote! { #expr.simd_into(self) }) } OpSig::Unary => { let args = [quote! { a.into() }]; let expr = neon::expr(method, vec_ty, &args); - quote! { - #method_sig { - unsafe { - #expr.simd_into(self) - } - } - } + op.simd_trait_kernel_method(self.token(), vec_ty, quote! { #expr.simd_into(self) }) } OpSig::LoadInterleaved { block_size, @@ -187,17 +175,17 @@ impl Level for Neon { let id2 = Ident::new(&format!("vcombine_{}", target_scalar_ty), Span::call_site()); - quote! { - #method_sig { - unsafe { - let converted: #arch = a.into(); - let low = #id1(converted.0); - let high = #id1(converted.1); + op.simd_trait_kernel_method( + self.token(), + vec_ty, + quote! { + let converted: #arch = a.into(); + let low = #id1(converted.0); + let high = #id1(converted.1); - #id2(low, high).simd_into(self) - } - } - } + #id2(low, high).simd_into(self) + }, + ) } else { let arch = self.arch_ty(&target_ty); let id1 = Ident::new(&format!("vmovl_{}", vec_scalar_ty), Span::call_site()); @@ -205,16 +193,16 @@ impl Level for Neon { let id3 = Ident::new(&format!("vget_high_{}", vec_scalar_ty), Span::call_site()); - quote! { - #method_sig { - unsafe { - let low = #id1(#id2(a.into())); - let high = #id1(#id3(a.into())); + op.simd_trait_kernel_method( + self.token(), + vec_ty, + quote! { + let low = #id1(#id2(a.into())); + let high = #id1(#id3(a.into())); - #arch(low, high).simd_into(self) - } - } - } + #arch(low, high).simd_into(self) + }, + ) } } OpSig::Binary => { @@ -263,13 +251,7 @@ impl Level for Neon { } }; - quote! { - #method_sig { - unsafe { - #expr - } - } - } + op.simd_trait_kernel_method(self.token(), vec_ty, expr) } OpSig::Ternary => { let args = match method { @@ -291,13 +273,7 @@ impl Level for Neon { let neg = simple_intrinsic("vneg", vec_ty); expr = quote! { #neg(#expr) }; } - quote! { - #method_sig { - unsafe { - #expr.simd_into(self) - } - } - } + op.simd_trait_kernel_method(self.token(), vec_ty, quote! { #expr.simd_into(self) }) } OpSig::Compare => { let args = [quote! { a.into() }, quote! { b.into() }]; @@ -306,13 +282,11 @@ impl Level for Neon { let scalar_bits = vec_ty.scalar_bits; let reinterpret_str = format!("vreinterpret{opt_q}_s{scalar_bits}_u{scalar_bits}"); let reinterpret = Ident::new(&reinterpret_str, Span::call_site()); - quote! { - #method_sig { - unsafe { - #reinterpret(#expr).simd_into(self) - } - } - } + op.simd_trait_kernel_method( + self.token(), + vec_ty, + quote! { #reinterpret(#expr).simd_into(self) }, + ) } OpSig::Select => { let opt_q = neon::opt_q(vec_ty); @@ -320,13 +294,11 @@ impl Level for Neon { let reinterpret_str = format!("vreinterpret{opt_q}_u{scalar_bits}_s{scalar_bits}"); let reinterpret = Ident::new(&reinterpret_str, Span::call_site()); let vbsl = simple_intrinsic("vbsl", vec_ty); - quote! { - #method_sig { - unsafe { - #vbsl(#reinterpret(a.into()), b.into(), c.into()).simd_into(self) - } - } - } + op.simd_trait_kernel_method( + self.token(), + vec_ty, + quote! { #vbsl(#reinterpret(a.into()), b.into(), c.into()).simd_into(self) }, + ) } OpSig::Combine { combined_ty } => { let combined_wrapper = combined_ty.aligned_wrapper(); @@ -375,28 +347,28 @@ impl Level for Neon { OpSig::Zip { select_low } => { let neon = if select_low { "vzip1" } else { "vzip2" }; let zip = simple_intrinsic(neon, vec_ty); - quote! { - #method_sig { + op.simd_trait_kernel_method( + self.token(), + vec_ty, + quote! { let x = a.into(); let y = b.into(); - unsafe { - #zip(x, y).simd_into(self) - } - } - } + #zip(x, y).simd_into(self) + }, + ) } OpSig::Unzip { select_even } => { let neon = if select_even { "vuzp1" } else { "vuzp2" }; let zip = simple_intrinsic(neon, vec_ty); - quote! { - #method_sig { + op.simd_trait_kernel_method( + self.token(), + vec_ty, + quote! { let x = a.into(); let y = b.into(); - unsafe { - #zip(x, y).simd_into(self) - } - } - } + #zip(x, y).simd_into(self) + }, + ) } OpSig::Slide { granularity } => { use SlideGranularity::*; @@ -480,13 +452,11 @@ impl Level for Neon { } else { let to_ty = &vec_ty.reinterpret(target_ty, scalar_bits); let neon = cvt_intrinsic("vcvt", to_ty, vec_ty); - quote! { - #method_sig { - unsafe { - #neon(a.into()).simd_into(self) - } - } - } + op.simd_trait_kernel_method( + self.token(), + vec_ty, + quote! { #neon(a.into()).simd_into(self) }, + ) } } OpSig::Reinterpret { @@ -497,13 +467,11 @@ impl Level for Neon { let to_ty = vec_ty.reinterpret(target_ty, scalar_bits); let neon = cvt_intrinsic("vreinterpret", &to_ty, vec_ty); - quote! { - #method_sig { - unsafe { - #neon(a.into()).simd_into(self) - } - } - } + op.simd_trait_kernel_method( + self.token(), + vec_ty, + quote! { #neon(a.into()).simd_into(self) }, + ) } else { quote! {} } @@ -522,13 +490,11 @@ impl Level for Neon { let u32_ty = vec_ty.reinterpret(ScalarType::Unsigned, 32); let min_max = simple_intrinsic(reduction, &u32_ty); let reinterpret = format_ident!("vreinterpretq_u32_s{}", vec_ty.scalar_bits); - quote! { - #method_sig { - unsafe { - #min_max(#reinterpret(a.into())) #target - } - } - } + op.simd_trait_kernel_method( + self.token(), + vec_ty, + quote! { #min_max(#reinterpret(a.into())) #target }, + ) } OpSig::MaskFromBitmask => self.handle_mask_from_bitmask(method_sig, vec_ty), OpSig::MaskToBitmask => self.handle_mask_to_bitmask(method_sig, vec_ty), diff --git a/fearless_simd_gen/src/mk_x86.rs b/fearless_simd_gen/src/mk_x86.rs index eea35754..6eb8b2c6 100644 --- a/fearless_simd_gen/src/mk_x86.rs +++ b/fearless_simd_gen/src/mk_x86.rs @@ -186,36 +186,36 @@ impl Level for X86 { let method_sig = op.simd_trait_method_sig(vec_ty); match sig { - OpSig::Splat => self.handle_splat(method_sig, vec_ty), - OpSig::Compare => self.handle_compare(method_sig, method, vec_ty), - OpSig::Unary => self.handle_unary(method_sig, method, vec_ty), + OpSig::Splat => self.handle_splat(op, vec_ty), + OpSig::Compare => self.handle_compare(op, method, vec_ty), + OpSig::Unary => self.handle_unary(op, method_sig, method, vec_ty), OpSig::WidenNarrow { target_ty } => { - self.handle_widen_narrow(method_sig, method, vec_ty, target_ty) - } - OpSig::Binary => self.handle_binary(method_sig, method, vec_ty), - OpSig::Shift => self.handle_shift(method_sig, method, vec_ty), - OpSig::Ternary => self.handle_ternary(method_sig, method, vec_ty), - OpSig::Select => self.handle_select(method_sig, vec_ty), - OpSig::Combine { combined_ty } => self.handle_combine(method_sig, vec_ty, &combined_ty), - OpSig::Split { half_ty } => self.handle_split(method_sig, vec_ty, &half_ty), - OpSig::Zip { select_low } => self.handle_zip(method_sig, vec_ty, select_low), - OpSig::Unzip { select_even } => self.handle_unzip(method_sig, vec_ty, select_even), + self.handle_widen_narrow(op, method, vec_ty, target_ty) + } + OpSig::Binary => self.handle_binary(op, method, vec_ty), + OpSig::Shift => self.handle_shift(op, method, vec_ty), + OpSig::Ternary => self.handle_ternary(op, method_sig, method, vec_ty), + OpSig::Select => self.handle_select(op, vec_ty), + OpSig::Combine { combined_ty } => self.handle_combine(op, vec_ty, &combined_ty), + OpSig::Split { half_ty } => self.handle_split(op, vec_ty, &half_ty), + OpSig::Zip { select_low } => self.handle_zip(op, vec_ty, select_low), + OpSig::Unzip { select_even } => self.handle_unzip(op, vec_ty, select_even), OpSig::Slide { granularity } => self.handle_slide(method_sig, vec_ty, granularity), OpSig::Cvt { target_ty, scalar_bits, precise, - } => self.handle_cvt(method_sig, vec_ty, target_ty, scalar_bits, precise), + } => self.handle_cvt(op, vec_ty, target_ty, scalar_bits, precise), OpSig::Reinterpret { target_ty, scalar_bits, - } => self.handle_reinterpret(self, method_sig, vec_ty, target_ty, scalar_bits), + } => self.handle_reinterpret(self, op, vec_ty, target_ty, scalar_bits), OpSig::MaskReduce { quantifier, condition, - } => self.handle_mask_reduce(method_sig, vec_ty, quantifier, condition), - OpSig::MaskFromBitmask => self.handle_mask_from_bitmask(method_sig, vec_ty), - OpSig::MaskToBitmask => self.handle_mask_to_bitmask(method_sig, vec_ty), + } => self.handle_mask_reduce(op, vec_ty, quantifier, condition), + OpSig::MaskFromBitmask => self.handle_mask_from_bitmask(op, vec_ty), + OpSig::MaskToBitmask => self.handle_mask_to_bitmask(op, vec_ty), OpSig::LoadInterleaved { block_size, block_count, @@ -233,8 +233,8 @@ impl Level for X86 { OpSig::StoreArray => generic_store_array(method_sig, vec_ty), OpSig::FromBytes => generic_from_bytes(method_sig, vec_ty), OpSig::ToBytes => generic_to_bytes(method_sig, vec_ty), - OpSig::Interleave => self.handle_interleave(method_sig, vec_ty), - OpSig::Deinterleave => self.handle_deinterleave(method_sig, vec_ty), + OpSig::Interleave => self.handle_interleave(op, vec_ty), + OpSig::Deinterleave => self.handle_deinterleave(op, vec_ty), } } } @@ -594,21 +594,21 @@ fn signed_literal(value: u64, bits: u32) -> TokenStream { } impl X86 { - pub(crate) fn handle_splat(&self, method_sig: TokenStream, vec_ty: &VecType) -> TokenStream { + pub(crate) fn handle_splat(&self, op: Op, vec_ty: &VecType) -> TokenStream { let intrinsic = set1_intrinsic(vec_ty); let cast = match vec_ty.scalar { ScalarType::Unsigned => quote!(.cast_signed()), _ => quote!(), }; let normalize_mask = integer_lane_mask_splat_arg(vec_ty); - quote! { - #method_sig { - unsafe { - #normalize_mask - #intrinsic(val #cast).simd_into(self) - } - } - } + op.simd_trait_kernel_method( + self.token(), + vec_ty, + quote! { + #normalize_mask + #intrinsic(val #cast).simd_into(self) + }, + ) } fn has_specialized_mask_from_bitmask(&self, vec_ty: &VecType) -> bool { @@ -634,11 +634,7 @@ impl X86 { vec_ty.scalar == ScalarType::Mask && vec_ty.scalar_bits == 16 } - pub(crate) fn handle_mask_from_bitmask( - &self, - method_sig: TokenStream, - vec_ty: &VecType, - ) -> TokenStream { + pub(crate) fn handle_mask_from_bitmask(&self, op: Op, vec_ty: &VecType) -> TokenStream { assert_eq!( vec_ty.scalar, ScalarType::Mask, @@ -647,24 +643,12 @@ impl X86 { if self.has_wide_byte_mask_from_bitmask(vec_ty) { let expr = mask_from_bitmask_wide_bytes(self.native_width(), vec_ty); - return quote! { - #method_sig { - unsafe { - #expr - } - } - }; + return op.simd_trait_kernel_method(self.token(), vec_ty, expr); } if self.has_wide_avx2_mask_from_bitmask(vec_ty) { let expr = mask_from_bitmask_wide_avx2(vec_ty); - return quote! { - #method_sig { - unsafe { - #expr - } - } - }; + return op.simd_trait_kernel_method(self.token(), vec_ty, expr); } let expr = match vec_ty.scalar_bits { @@ -683,20 +667,10 @@ impl X86 { _ => unreachable!(), }; - quote! { - #method_sig { - unsafe { - #expr - } - } - } + op.simd_trait_kernel_method(self.token(), vec_ty, expr) } - pub(crate) fn handle_mask_to_bitmask( - &self, - method_sig: TokenStream, - vec_ty: &VecType, - ) -> TokenStream { + pub(crate) fn handle_mask_to_bitmask(&self, op: Op, vec_ty: &VecType) -> TokenStream { assert_eq!( vec_ty.scalar, ScalarType::Mask, @@ -707,21 +681,15 @@ impl X86 { 8 => { let bits_ty = vec_ty.reinterpret(ScalarType::Int, 8); let movemask = simple_intrinsic("movemask", &bits_ty); - quote! { - #method_sig { - unsafe { #movemask(a.into()) as u32 as u64 } - } - } + op.simd_trait_kernel_method( + self.token(), + vec_ty, + quote! { #movemask(a.into()) as u32 as u64 }, + ) } 16 => { let bits = mask_to_bitmask_words(self.native_width(), vec_ty); - quote! { - #method_sig { - unsafe { - #bits - } - } - } + op.simd_trait_kernel_method(self.token(), vec_ty, bits) } 32 | 64 => { let float_ty = vec_ty.cast(ScalarType::Float); @@ -733,22 +701,17 @@ impl X86 { vec_ty.scalar_bits, vec_ty.n_bits(), ); - quote! { - #method_sig { - unsafe { #movemask(#cast(a.into())) as u32 as u64 } - } - } + op.simd_trait_kernel_method( + self.token(), + vec_ty, + quote! { #movemask(#cast(a.into())) as u32 as u64 }, + ) } _ => unreachable!(), } } - pub(crate) fn handle_compare( - &self, - method_sig: TokenStream, - method: &str, - vec_ty: &VecType, - ) -> TokenStream { + pub(crate) fn handle_compare(&self, op: Op, method: &str, vec_ty: &VecType) -> TokenStream { let args = [quote! { a.into() }, quote! { b.into() }]; let expr = if vec_ty.scalar != ScalarType::Float { @@ -817,15 +780,12 @@ impl X86 { quote! { #ident(#compare_op(a.into(), b.into())) } }; - quote! { - #method_sig { - unsafe { #expr.simd_into(self) } - } - } + op.simd_trait_kernel_method(self.token(), vec_ty, quote! { #expr.simd_into(self) }) } pub(crate) fn handle_unary( &self, + op: Op, method_sig: TokenStream, method: &str, vec_ty: &VecType, @@ -865,18 +825,14 @@ impl X86 { _ => { let args = [quote! { a.into() }]; let expr = x86::expr(method, vec_ty, &args); - quote! { - #method_sig { - unsafe { #expr.simd_into(self) } - } - } + op.simd_trait_kernel_method(self.token(), vec_ty, quote! { #expr.simd_into(self) }) } } } pub(crate) fn handle_widen_narrow( &self, - method_sig: TokenStream, + op: Op, method: &str, vec_ty: &VecType, target_ty: VecType, @@ -893,9 +849,7 @@ impl X86 { dst_width, ); quote! { - unsafe { - #extend(a.into()).simd_into(self) - } + #extend(a.into()).simd_into(self) } } (Self::Avx2, 512, 256) => { @@ -911,12 +865,10 @@ impl X86 { ); let split = generic_op_name("split", vec_ty); quote! { - unsafe { - let (a0, a1) = self.#split(a); - let high = #extend(a0.into()).simd_into(self); - let low = #extend(a1.into()).simd_into(self); - self.#combine(high, low) - } + let (a0, a1) = self.#split(a); + let high = #extend(a0.into()).simd_into(self); + let low = #extend(a1.into()).simd_into(self); + self.#combine(high, low) } } (Self::Sse4_2, 256, 128) => { @@ -931,14 +883,12 @@ impl X86 { &vec_ty.reinterpret(vec_ty.scalar, vec_ty.scalar_bits * 2), ); quote! { - unsafe { - let raw = a.into(); - let high = #extend(raw).simd_into(self); - // Shift by 8 since we want to get the higher part into the - // lower position. - let low = #extend(_mm_srli_si128::<8>(raw)).simd_into(self); - self.#combine(high, low) - } + let raw = a.into(); + let high = #extend(raw).simd_into(self); + // Shift by 8 since we want to get the higher part into the + // lower position. + let low = #extend(_mm_srli_si128::<8>(raw)).simd_into(self); + self.#combine(high, low) } } _ => unimplemented!(), @@ -954,14 +904,12 @@ impl X86 { _ => unimplemented!(), }; quote! { - unsafe { - let mask = _mm256_setr_epi8(#mask, #mask); + let mask = _mm256_setr_epi8(#mask, #mask); - let shuffled = _mm256_shuffle_epi8(a.into(), mask); - let packed = _mm256_permute4x64_epi64::<0b11_01_10_00>(shuffled); + let shuffled = _mm256_shuffle_epi8(a.into(), mask); + let packed = _mm256_permute4x64_epi64::<0b11_01_10_00>(shuffled); - _mm256_castsi256_si128(packed).simd_into(self) - } + _mm256_castsi256_si128(packed).simd_into(self) } } (Self::Avx2, 256, 512) => { @@ -978,17 +926,15 @@ impl X86 { let split = generic_op_name("split", vec_ty); quote! { let (a, b) = self.#split(a); - unsafe { - // Note that AVX2 only has an intrinsic for saturating cast, - // but not wrapping. - let mask = #mask(0xFF); - let lo_masked = _mm256_and_si256(a.into(), mask); - let hi_masked = _mm256_and_si256(b.into(), mask); - // The 256-bit version of packus_epi16 operates lane-wise, so we need to arrange things - // properly afterwards. - let result = _mm256_permute4x64_epi64::<0b_11_01_10_00>(#pack(lo_masked, hi_masked)); - result.simd_into(self) - } + // Note that AVX2 only has an intrinsic for saturating cast, + // but not wrapping. + let mask = #mask(0xFF); + let lo_masked = _mm256_and_si256(a.into(), mask); + let hi_masked = _mm256_and_si256(b.into(), mask); + // The 256-bit version of packus_epi16 operates lane-wise, so we need to arrange things + // properly afterwards. + let result = _mm256_permute4x64_epi64::<0b_11_01_10_00>(#pack(lo_masked, hi_masked)); + result.simd_into(self) } } (Self::Sse4_2, 128, 256) => { @@ -1005,14 +951,12 @@ impl X86 { let split = generic_op_name("split", vec_ty); quote! { let (a, b) = self.#split(a); - unsafe { - // Below AVX-512. we only have an intrinsic for saturating cast, but not wrapping. - let mask = #mask(0xFF); - let lo_masked = _mm_and_si128(a.into(), mask); - let hi_masked = _mm_and_si128(b.into(), mask); - let result = #pack(lo_masked, hi_masked); - result.simd_into(self) - } + // Below AVX-512. we only have an intrinsic for saturating cast, but not wrapping. + let mask = #mask(0xFF); + let lo_masked = _mm_and_si128(a.into(), mask); + let hi_masked = _mm_and_si128(b.into(), mask); + let result = #pack(lo_masked, hi_masked); + result.simd_into(self) } } _ => unimplemented!(), @@ -1021,19 +965,11 @@ impl X86 { _ => unreachable!(), }; - quote! { - #method_sig { - #expr - } - } + op.simd_trait_kernel_method(self.token(), vec_ty, expr) } - pub(crate) fn handle_binary( - &self, - method_sig: TokenStream, - method: &str, - vec_ty: &VecType, - ) -> TokenStream { + pub(crate) fn handle_binary(&self, op: Op, method: &str, vec_ty: &VecType) -> TokenStream { + let method_sig = op.simd_trait_method_sig(vec_ty); let body = match method { "mul" if vec_ty.scalar_bits == 8 => { // https://stackoverflow.com/questions/8193601/sse-multiplication-16-x-uint8-t @@ -1044,12 +980,10 @@ impl X86 { let slli = intrinsic_ident("slli", "epi16", vec_ty.n_bits()); let srli = intrinsic_ident("srli", "epi16", vec_ty.n_bits()); quote! { - unsafe { - let dst_even = #mullo(a.into(), b.into()); - let dst_odd = #mullo(#srli::<8>(a.into()), #srli::<8>(b.into())); + let dst_even = #mullo(a.into(), b.into()); + let dst_odd = #mullo(#srli::<8>(a.into()), #srli::<8>(b.into())); - #or(#slli(dst_odd, 8), #and(dst_even, #set1(0xFF))).simd_into(self) - } + #or(#slli(dst_odd, 8), #and(dst_even, #set1(0xFF))).simd_into(self) } } "shlv" | "shrv" if *self == Self::Avx2 && vec_ty.scalar_bits >= 32 => { @@ -1062,7 +996,7 @@ impl X86 { }; let intrinsic = intrinsic_ident(name, suffix, vec_ty.n_bits()); quote! { - unsafe { #intrinsic(a.into(), b.into()).simd_into(self) } + #intrinsic(a.into(), b.into()).simd_into(self) } } // SSE2 has shift operations, but they shift every lane by the same amount, so we can't use them here. @@ -1072,25 +1006,25 @@ impl X86 { let args = [quote! { a.into() }, quote! { b.into() }]; let expr = x86::expr(method, vec_ty, &args); quote! { - unsafe { #expr.simd_into(self) } + #expr.simd_into(self) } } }; - quote! { - #method_sig { - #body + match method { + "shlv" | "shrv" if !(*self == Self::Avx2 && vec_ty.scalar_bits >= 32) => { + quote! { + #method_sig { + #body + } + } } + _ => op.simd_trait_kernel_method(self.token(), vec_ty, body), } } - pub(crate) fn handle_shift( - &self, - method_sig: TokenStream, - method: &str, - vec_ty: &VecType, - ) -> TokenStream { - let op = match (method, vec_ty.scalar) { + pub(crate) fn handle_shift(&self, op: Op, method: &str, vec_ty: &VecType) -> TokenStream { + let shift_op = match (method, vec_ty.scalar) { ("shr", ScalarType::Unsigned) => "srl", ("shr", ScalarType::Int) => "sra", ("shl", _) => "sll", @@ -1098,7 +1032,7 @@ impl X86 { }; let ty_bits = vec_ty.n_bits(); let suffix = op_suffix(vec_ty.scalar, vec_ty.scalar_bits.max(16), false); - let shift_intrinsic = intrinsic_ident(op, suffix, ty_bits); + let shift_intrinsic = intrinsic_ident(shift_op, suffix, ty_bits); if vec_ty.scalar_bits == 8 { // x86 doesn't have shifting for 8-bit, so we first convert into 16-bit, shift, and then back to 8-bit. @@ -1124,33 +1058,34 @@ impl X86 { let extend_intrinsic_hi = extend_expr(unpack_hi); let pack_intrinsic = pack_intrinsic(16, vec_ty.scalar == ScalarType::Int, ty_bits); - quote! { - #method_sig { - unsafe { - let val = a.into(); - let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); + op.simd_trait_kernel_method( + self.token(), + vec_ty, + quote! { + let val = a.into(); + let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); - let lo_16 = #extend_intrinsic_lo; - let hi_16 = #extend_intrinsic_hi; + let lo_16 = #extend_intrinsic_lo; + let hi_16 = #extend_intrinsic_hi; - let lo_shifted = #shift_intrinsic(lo_16, shift_count); - let hi_shifted = #shift_intrinsic(hi_16, shift_count); + let lo_shifted = #shift_intrinsic(lo_16, shift_count); + let hi_shifted = #shift_intrinsic(hi_16, shift_count); - #pack_intrinsic(lo_shifted, hi_shifted).simd_into(self) - } - } - } + #pack_intrinsic(lo_shifted, hi_shifted).simd_into(self) + }, + ) } else { - quote! { - #method_sig { - unsafe { #shift_intrinsic(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) } - } - } + op.simd_trait_kernel_method( + self.token(), + vec_ty, + quote! { #shift_intrinsic(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) }, + ) } } pub(crate) fn handle_ternary( &self, + op: Op, method_sig: TokenStream, method: &str, vec_ty: &VecType, @@ -1158,19 +1093,19 @@ impl X86 { match method { "mul_add" if *self == Self::Avx2 => { let intrinsic = simple_intrinsic("fmadd", vec_ty); - quote! { - #method_sig { - unsafe { #intrinsic(a.into(), b.into(), c.into()).simd_into(self) } - } - } + op.simd_trait_kernel_method( + self.token(), + vec_ty, + quote! { #intrinsic(a.into(), b.into(), c.into()).simd_into(self) }, + ) } "mul_sub" if *self == Self::Avx2 => { let intrinsic = simple_intrinsic("fmsub", vec_ty); - quote! { - #method_sig { - unsafe { #intrinsic(a.into(), b.into(), c.into()).simd_into(self) } - } - } + op.simd_trait_kernel_method( + self.token(), + vec_ty, + quote! { #intrinsic(a.into(), b.into(), c.into()).simd_into(self) }, + ) } "mul_add" => { quote! { @@ -1194,16 +1129,12 @@ impl X86 { ]; let expr = x86::expr(method, vec_ty, &args); - quote! { - #method_sig { - #expr.simd_into(self) - } - } + op.simd_trait_kernel_method(self.token(), vec_ty, quote! { #expr.simd_into(self) }) } } } - pub(crate) fn handle_select(&self, method_sig: TokenStream, vec_ty: &VecType) -> TokenStream { + pub(crate) fn handle_select(&self, op: Op, vec_ty: &VecType) -> TokenStream { // Our select ops' argument order is mask, a, b; Intel's intrinsics are b, a, mask let args = [ quote! { c.into() }, @@ -1224,43 +1155,35 @@ impl X86 { ]; let expr = x86::expr("select", vec_ty, &args); - quote! { - #method_sig { - unsafe { #expr.simd_into(self) } - } - } + op.simd_trait_kernel_method(self.token(), vec_ty, quote! { #expr.simd_into(self) }) } - pub(crate) fn handle_split( - &self, - method_sig: TokenStream, - vec_ty: &VecType, - half_ty: &VecType, - ) -> TokenStream { + pub(crate) fn handle_split(&self, op: Op, vec_ty: &VecType, half_ty: &VecType) -> TokenStream { if *self == Self::Avx2 && half_ty.n_bits() == 128 { let extract_op = match vec_ty.scalar { ScalarType::Float => "extractf128", _ => "extracti128", }; let extract_intrinsic = intrinsic_ident(extract_op, coarse_type(vec_ty), 256); - quote! { - #method_sig { - unsafe { - ( - #extract_intrinsic::<0>(a.into()).simd_into(self), - #extract_intrinsic::<1>(a.into()).simd_into(self), - ) - } - } - } + op.simd_trait_kernel_method( + self.token(), + vec_ty, + quote! { + ( + #extract_intrinsic::<0>(a.into()).simd_into(self), + #extract_intrinsic::<1>(a.into()).simd_into(self), + ) + }, + ) } else { + let method_sig = op.simd_trait_method_sig(vec_ty); generic_block_split(method_sig, half_ty, self.max_block_size()) } } pub(crate) fn handle_combine( &self, - method_sig: TokenStream, + op: Op, vec_ty: &VecType, combined_ty: &VecType, ) -> TokenStream { @@ -1271,24 +1194,18 @@ impl X86 { _ => "m128i", }; let set_intrinsic = intrinsic_ident("setr", suffix, 256); - quote! { - #method_sig { - unsafe { - #set_intrinsic(a.into(), b.into()).simd_into(self) - } - } - } + op.simd_trait_kernel_method( + self.token(), + vec_ty, + quote! { #set_intrinsic(a.into(), b.into()).simd_into(self) }, + ) } else { + let method_sig = op.simd_trait_method_sig(vec_ty); generic_block_combine(method_sig, combined_ty, self.max_block_size()) } } - pub(crate) fn handle_zip( - &self, - method_sig: TokenStream, - vec_ty: &VecType, - select_low: bool, - ) -> TokenStream { + pub(crate) fn handle_zip(&self, op: Op, vec_ty: &VecType, select_low: bool) -> TokenStream { let expr = match vec_ty.n_bits() { 128 => { let op = if select_low { "unpacklo" } else { "unpackhi" }; @@ -1296,7 +1213,7 @@ impl X86 { let suffix = op_suffix(vec_ty.scalar, vec_ty.scalar_bits, false); let unpack_intrinsic = intrinsic_ident(op, suffix, vec_ty.n_bits()); quote! { - unsafe { #unpack_intrinsic(a.into(), b.into()).simd_into(self) } + #unpack_intrinsic(a.into(), b.into()).simd_into(self) } } 256 => { @@ -1319,29 +1236,19 @@ impl X86 { ); quote! { - unsafe { - let lo = #lo(a.into(), b.into()); - let hi = #hi(a.into(), b.into()); + let lo = #lo(a.into(), b.into()); + let hi = #hi(a.into(), b.into()); - #shuffle::<#shuffle_immediate>(lo, hi).simd_into(self) - } + #shuffle::<#shuffle_immediate>(lo, hi).simd_into(self) } } _ => unreachable!(), }; - quote! { - #method_sig { - #expr - } - } + op.simd_trait_kernel_method(self.token(), vec_ty, expr) } - pub(crate) fn handle_interleave( - &self, - method_sig: TokenStream, - vec_ty: &VecType, - ) -> TokenStream { + pub(crate) fn handle_interleave(&self, op: Op, vec_ty: &VecType) -> TokenStream { match vec_ty.n_bits() { 256 => { // Optimized path: compute unpacklo and unpackhi once, then use permute2f128 to @@ -1358,24 +1265,25 @@ impl X86 { coarse_type(vec_ty), 256, ); - quote! { - #method_sig { - unsafe { - let lo = #lo(a.into(), b.into()); - let hi = #hi(a.into(), b.into()); - ( - #shuffle::<0b0010_0000>(lo, hi).simd_into(self), - #shuffle::<0b0011_0001>(lo, hi).simd_into(self), - ) - } - } - } + op.simd_trait_kernel_method( + self.token(), + vec_ty, + quote! { + let lo = #lo(a.into(), b.into()); + let hi = #hi(a.into(), b.into()); + ( + #shuffle::<0b0010_0000>(lo, hi).simd_into(self), + #shuffle::<0b0011_0001>(lo, hi).simd_into(self), + ) + }, + ) } _ => { // For 128-bit vectors, zip_low/zip_high are single instructions (unpacklo/unpackhi), // so there's no redundancy in calling them separately. let zip_low = generic_op_name("zip_low", vec_ty); let zip_high = generic_op_name("zip_high", vec_ty); + let method_sig = op.simd_trait_method_sig(vec_ty); quote! { #method_sig { (self.#zip_low(a, b), self.#zip_high(a, b)) @@ -1385,11 +1293,7 @@ impl X86 { } } - pub(crate) fn handle_deinterleave( - &self, - method_sig: TokenStream, - vec_ty: &VecType, - ) -> TokenStream { + pub(crate) fn handle_deinterleave(&self, op: Op, vec_ty: &VecType) -> TokenStream { match vec_ty.n_bits() { 256 => { // Optimized path: compute the per-input shuffles once, then use permute2f128 / @@ -1397,24 +1301,25 @@ impl X86 { // the redundant shuffle operations that occur when unzip_low and unzip_high are // called separately. let (t1, t2, shuffle) = self.unzip256_intermediates(vec_ty); - quote! { - #method_sig { - unsafe { - let t1 = #t1; - let t2 = #t2; - ( - #shuffle::<0b0010_0000>(t1, t2).simd_into(self), - #shuffle::<0b0011_0001>(t1, t2).simd_into(self), - ) - } - } - } + op.simd_trait_kernel_method( + self.token(), + vec_ty, + quote! { + let t1 = #t1; + let t2 = #t2; + ( + #shuffle::<0b0010_0000>(t1, t2).simd_into(self), + #shuffle::<0b0011_0001>(t1, t2).simd_into(self), + ) + }, + ) } _ => { // For 128-bit vectors, unzip_low/unzip_high are cheap, so there's no // redundancy in calling them separately. let unzip_low = generic_op_name("unzip_low", vec_ty); let unzip_high = generic_op_name("unzip_high", vec_ty); + let method_sig = op.simd_trait_method_sig(vec_ty); quote! { #method_sig { (self.#unzip_low(a, b), self.#unzip_high(a, b)) @@ -1476,12 +1381,7 @@ impl X86 { (t1, t2, shuffle) } - pub(crate) fn handle_unzip( - &self, - method_sig: TokenStream, - vec_ty: &VecType, - select_even: bool, - ) -> TokenStream { + pub(crate) fn handle_unzip(&self, op: Op, vec_ty: &VecType, select_even: bool) -> TokenStream { let expr = match (vec_ty.scalar, vec_ty.n_bits(), vec_ty.scalar_bits) { (ScalarType::Float, 128, _) => { // 128-bit shuffle of floats or doubles; there are built-in SSE intrinsics for this @@ -1496,7 +1396,7 @@ impl X86 { _ => unimplemented!(), }; - quote! { unsafe { #intrinsic::<#mask>(a.into(), b.into()).simd_into(self) } } + quote! { #intrinsic::<#mask>(a.into(), b.into()).simd_into(self) } } (ScalarType::Int | ScalarType::Mask | ScalarType::Unsigned, 128, 32) => { // 128-bit shuffle of 32-bit integers; unlike with floats, there is no single shuffle instruction that @@ -1505,11 +1405,9 @@ impl X86 { let intrinsic = intrinsic_ident(op, "epi64", vec_ty.n_bits()); quote! { - unsafe { - let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into()); - let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into()); - #intrinsic(t1, t2).simd_into(self) - } + let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into()); + let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into()); + #intrinsic(t1, t2).simd_into(self) } } (ScalarType::Int | ScalarType::Mask | ScalarType::Unsigned, 128, 16 | 8) => { @@ -1535,13 +1433,11 @@ impl X86 { let unpack_epi64 = intrinsic_ident(op, "epi64", vec_ty.n_bits()); quote! { - unsafe { - let mask = #mask_reg; + let mask = #mask_reg; - let t1 = #shuffle_epi8(a.into(), mask); - let t2 = #shuffle_epi8(b.into(), mask); - #unpack_epi64(t1, t2).simd_into(self) - } + let t1 = #shuffle_epi8(a.into(), mask); + let t2 = #shuffle_epi8(b.into(), mask); + #unpack_epi64(t1, t2).simd_into(self) } } (_, 256, _) => { @@ -1553,21 +1449,15 @@ impl X86 { }; quote! { - unsafe { - let t1 = #t1; - let t2 = #t2; - #shuffle::<#shuffle_immediate>(t1, t2).simd_into(self) - } + let t1 = #t1; + let t2 = #t2; + #shuffle::<#shuffle_immediate>(t1, t2).simd_into(self) } } _ => unimplemented!(), }; - quote! { - #method_sig { - #expr - } - } + op.simd_trait_kernel_method(self.token(), vec_ty, expr) } pub(crate) fn handle_slide( @@ -1631,7 +1521,7 @@ impl X86 { pub(crate) fn handle_cvt( &self, - method_sig: TokenStream, + op: Op, vec_ty: &VecType, target_scalar: ScalarType, target_scalar_bits: usize, @@ -1673,81 +1563,73 @@ impl X86 { match (target_scalar, precise) { (ScalarType::Int, false) => { quote! { - unsafe { - #convert(a.into()).simd_into(self) - } + #convert(a.into()).simd_into(self) } } (ScalarType::Unsigned, false) => { quote! { - unsafe { - let mut converted = #convert(a.into()); - - // In the common case where everything is in range of an i32, we don't need to do anything else. - let in_range = #cmplt(a.into(), #set1_float(2147483648.0)); - let all_in_range = #movemask(in_range) == #all_ones; + let mut converted = #convert(a.into()); - if !all_in_range { - // Add any excess (beyond the maximum value) - let excess = #sub_float(a.into(), #set1_float(2147483648.0)); - let excess_converted = #convert(#andnot(in_range, excess)); - converted = #add_int(converted, excess_converted); - } + // In the common case where everything is in range of an i32, we don't need to do anything else. + let in_range = #cmplt(a.into(), #set1_float(2147483648.0)); + let all_in_range = #movemask(in_range) == #all_ones; - converted.simd_into(self) + if !all_in_range { + // Add any excess (beyond the maximum value) + let excess = #sub_float(a.into(), #set1_float(2147483648.0)); + let excess_converted = #convert(#andnot(in_range, excess)); + converted = #add_int(converted, excess_converted); } + + converted.simd_into(self) } } (ScalarType::Int, true) => { quote! { - unsafe { - let a = a.into(); - - let mut converted = #convert(a); - - // In the common case where everything is in range, we don't need to do anything else. - let in_range = #cmplt(a, #set1_float(2147483648.0)); - let all_in_range = #movemask(in_range) == #all_ones; - - if !all_in_range { - // If we are above i32::MAX (2147483647), clamp to it. - converted = #blend(#set1_int(i32::MAX), converted, #cast_to_int(in_range)); - // Set NaN to 0. Using `and` seems slightly faster than `blend`. - let is_not_nan = #cast_to_int(#cmpord(a, a)); - converted = #and(converted, is_not_nan); - // We don't need to handle negative overflow because Intel's "invalid result" sentinel - // value is -2147483648, which is what we want anyway. - } - - converted.simd_into(self) + let a = a.into(); + + let mut converted = #convert(a); + + // In the common case where everything is in range, we don't need to do anything else. + let in_range = #cmplt(a, #set1_float(2147483648.0)); + let all_in_range = #movemask(in_range) == #all_ones; + + if !all_in_range { + // If we are above i32::MAX (2147483647), clamp to it. + converted = #blend(#set1_int(i32::MAX), converted, #cast_to_int(in_range)); + // Set NaN to 0. Using `and` seems slightly faster than `blend`. + let is_not_nan = #cast_to_int(#cmpord(a, a)); + converted = #and(converted, is_not_nan); + // We don't need to handle negative overflow because Intel's "invalid result" sentinel + // value is -2147483648, which is what we want anyway. } + + converted.simd_into(self) } } (ScalarType::Unsigned, true) => { quote! { - unsafe { - // Clamp out-of-range values (and NaN) to 0. Intel's `_mm_max_ps` always takes the second - // operand if the first is NaN. - let a = #max(a.into(), #set0()); - let mut converted = #convert(a); - - // In the common case where everything is in range of an i32, we don't need to do anything else. - let in_range = #cmplt(a, #set1_float(2147483648.0)); - let all_in_range = #movemask(in_range) == #all_ones; - - if !all_in_range { - let exceeds_unsigned_range = #cast_to_int(#cmplt(#set1_float(4294967040.0), a)); - // Add any excess (beyond the maximum value) - let excess = #sub_float(a, #set1_float(2147483648.0)); - let excess_converted = #convert(#andnot(in_range, excess)); - - // Clamp to u32::MAX. - converted = #add_int(converted, excess_converted); - converted = #blend(converted, #set1_int(u32::MAX.cast_signed()), exceeds_unsigned_range); - } - - converted.simd_into(self) + // Clamp out-of-range values (and NaN) to 0. Intel's `_mm_max_ps` always takes the second + // operand if the first is NaN. + let a = #max(a.into(), #set0()); + let mut converted = #convert(a); + + // In the common case where everything is in range of an i32, we don't need to do anything else. + let in_range = #cmplt(a, #set1_float(2147483648.0)); + let all_in_range = #movemask(in_range) == #all_ones; + + if !all_in_range { + let exceeds_unsigned_range = #cast_to_int(#cmplt(#set1_float(4294967040.0), a)); + // Add any excess (beyond the maximum value) + let excess = #sub_float(a, #set1_float(2147483648.0)); + let excess_converted = #convert(#andnot(in_range, excess)); + + // Clamp to u32::MAX. + converted = #add_int(converted, excess_converted); + converted = #blend(converted, #set1_int(u32::MAX.cast_signed()), exceeds_unsigned_range); } + + converted.simd_into(self) } } _ => unreachable!(), @@ -1761,9 +1643,7 @@ impl X86 { let target_ty = vec_ty.reinterpret(target_scalar, target_scalar_bits); let intrinsic = simple_intrinsic("cvtepi32", &target_ty); quote! { - unsafe { - #intrinsic(a.into()).simd_into(self) - } + #intrinsic(a.into()).simd_into(self) } } (ScalarType::Unsigned, ScalarType::Float) => { @@ -1791,32 +1671,26 @@ impl X86 { // https://github.com/llvm/llvm-project/blob/6f8e87b9d097c5ef631f24d2eb2f34eb31b54d3b/llvm/lib/Target/X86/X86ISelLowering.cpp // (The file is too big for GitHub to show a preview, so no line numbers.) quote! { - unsafe { - let a = a.into(); - let lo = #blend::<0xAA>(a, #set1_int(0x4B000000)); - let hi = #blend::<0xAA>(#srli::<16>(a), #set1_int(0x53000000)); + let a = a.into(); + let lo = #blend::<0xAA>(a, #set1_int(0x4B000000)); + let hi = #blend::<0xAA>(#srli::<16>(a), #set1_int(0x53000000)); - let fhi = #sub_float(#cast_to_float(hi), #set1_float(f32::from_bits(0x53000080))); - let result = #add_float(#cast_to_float(lo), fhi); + let fhi = #sub_float(#cast_to_float(hi), #set1_float(f32::from_bits(0x53000080))); + let result = #add_float(#cast_to_float(lo), fhi); - result.simd_into(self) - } + result.simd_into(self) } } _ => unimplemented!(), }; - quote! { - #method_sig { - #expr - } - } + op.simd_trait_kernel_method(self.token(), vec_ty, expr) } pub(crate) fn handle_reinterpret( &self, level: &impl Level, - method_sig: TokenStream, + op: Op, vec_ty: &VecType, target_ty: ScalarType, scalar_bits: usize, @@ -1829,11 +1703,11 @@ impl X86 { if coarse_type(vec_ty) == coarse_type(&dst_ty) { let arch_ty = level.arch_ty(vec_ty); - quote! { - #method_sig { - #arch_ty::from(a).simd_into(self) - } - } + op.simd_trait_kernel_method( + self.token(), + vec_ty, + quote! { #arch_ty::from(a).simd_into(self) }, + ) } else { let ident = cast_ident( vec_ty.scalar, @@ -1842,19 +1716,17 @@ impl X86 { scalar_bits, vec_ty.n_bits(), ); - quote! { - #method_sig { - unsafe { - #ident(a.into()).simd_into(self) - } - } - } + op.simd_trait_kernel_method( + self.token(), + vec_ty, + quote! { #ident(a.into()).simd_into(self) }, + ) } } pub(crate) fn handle_mask_reduce( &self, - method_sig: TokenStream, + method_op: Op, vec_ty: &VecType, quantifier: Quantifier, condition: bool, @@ -1908,13 +1780,7 @@ impl X86 { (Quantifier::All, false) => quote! { == 0 }, }; - quote! { - #method_sig { - unsafe { - #movemask as u32 #op - } - } - } + method_op.simd_trait_kernel_method(self.token(), vec_ty, quote! { #movemask as u32 #op }) } pub(crate) fn handle_load_interleaved( diff --git a/fearless_simd_gen/src/ops.rs b/fearless_simd_gen/src/ops.rs index c1129e6b..d481f14e 100644 --- a/fearless_simd_gen/src/ops.rs +++ b/fearless_simd_gen/src/ops.rs @@ -2,7 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 OR MIT use anyhow::{Context, anyhow}; -use proc_macro2::{Ident, Span, TokenStream}; +use proc_macro2::{Group, Ident, Span, TokenStream, TokenTree}; use quote::{format_ident, quote}; use std::fmt::Write; @@ -336,6 +336,167 @@ impl Op { } } + /// Generate a `Simd` trait method that delegates its body to a local `kernel!` function. + /// + /// The generated method keeps the trait signature using `Self`, while the local kernel uses + /// the concrete SIMD token type required by `kernel!`. Const-generic operations are rejected + /// because `kernel!` currently only accepts plain non-generic functions. + pub(crate) fn simd_trait_kernel_method( + &self, + level: Ident, + vec_ty: &VecType, + body: TokenStream, + ) -> TokenStream { + assert!( + !matches!(self.sig, OpSig::Slide { .. }), + "kernel! does not support const-generic methods" + ); + + let method_sig = self.simd_trait_method_sig(vec_ty); + let token = Ident::new("token", Span::call_site()); + let kernel_body = replace_ident(body, "self", &token); + let (arg_decls, call_args, ret) = self.simd_trait_kernel_sig_parts(&level, vec_ty); + + quote! { + #method_sig { + crate::kernel!( + #[inline(always)] + fn kernel(#token: #level #(, #arg_decls)*) -> #ret { + #kernel_body + } + ); + + kernel(self #(, #call_args)*) + } + } + } + + /// Build the concrete argument declarations, forwarding arguments, and return type for a generated kernel. + /// + /// This mirrors [`Op::simd_trait_method_sig`], but substitutes the concrete SIMD level token for `Self`. + fn simd_trait_kernel_sig_parts( + &self, + level: &Ident, + vec_ty: &VecType, + ) -> (Vec, Vec, TokenStream) { + let ty = vec_ty.rust(); + let arg_names = self + .sig + .simd_trait_arg_names() + .iter() + .map(|n| Ident::new(n, Span::call_site())) + .collect::>(); + let vec = quote! { #ty<#level> }; + + let (arg_tys, ret) = match &self.sig { + OpSig::Splat => { + let arg_ty = splat_arg_ty(vec_ty); + (vec![arg_ty], vec) + } + OpSig::LoadInterleaved { + block_size, + block_count, + } => { + let arg_ty = load_interleaved_arg_ty(*block_size, *block_count, vec_ty); + (vec![arg_ty], vec) + } + OpSig::StoreInterleaved { + block_size, + block_count, + } => { + let arg_ty = store_interleaved_arg_ty(*block_size, *block_count, vec_ty); + (vec![vec.clone(), arg_ty], quote! { () }) + } + OpSig::Compare => { + let result = vec_ty.mask_ty().rust(); + (vec![vec.clone(), vec.clone()], quote! { #result<#level> }) + } + OpSig::Split { half_ty } => { + let result = half_ty.rust(); + (vec![vec], quote! { (#result<#level>, #result<#level>) }) + } + OpSig::Combine { combined_ty } => { + let result = combined_ty.rust(); + (vec![vec.clone(), vec], quote! { #result<#level> }) + } + OpSig::Unary => (vec![vec.clone()], vec), + OpSig::Binary | OpSig::Zip { .. } | OpSig::Unzip { .. } => { + (vec![vec.clone(), vec.clone()], vec) + } + OpSig::Interleave | OpSig::Deinterleave => { + (vec![vec.clone(), vec.clone()], quote! { (#vec, #vec) }) + } + OpSig::Slide { .. } => unreachable!("checked by caller"), + OpSig::Cvt { + target_ty, + scalar_bits, + .. + } + | OpSig::Reinterpret { + target_ty, + scalar_bits, + } => { + let result = vec_ty.reinterpret(*target_ty, *scalar_bits).rust(); + (vec![vec], quote! { #result<#level> }) + } + OpSig::WidenNarrow { target_ty } => { + let result = target_ty.rust(); + (vec![vec], quote! { #result<#level> }) + } + OpSig::MaskReduce { .. } => (vec![vec], quote! { bool }), + OpSig::MaskFromBitmask => (vec![quote! { u64 }], vec), + OpSig::MaskToBitmask => (vec![vec], quote! { u64 }), + OpSig::Shift => (vec![vec.clone(), quote! { u32 }], vec), + OpSig::Ternary => (vec![vec.clone(), vec.clone(), vec.clone()], vec), + OpSig::Select => { + let mask_ty = vec_ty.mask_ty().rust(); + ( + vec![quote! { #mask_ty<#level> }, vec.clone(), vec.clone()], + vec, + ) + } + OpSig::FromArray { kind } => { + let ref_tok = kind.token(); + let rust_scalar = vec_ty.scalar.rust(vec_ty.scalar_bits); + let len = vec_ty.len; + let array_ty = quote! { [#rust_scalar; #len] }; + (vec![quote! { #ref_tok #array_ty }], vec) + } + OpSig::AsArray { kind } => { + let ref_tok = kind.token(); + let rust_scalar = vec_ty.scalar.rust(vec_ty.scalar_bits); + let len = vec_ty.len; + let array_ty = quote! { [#rust_scalar; #len] }; + ( + vec![quote! { #ref_tok #vec }], + quote! { #ref_tok #array_ty }, + ) + } + OpSig::StoreArray => { + let rust_scalar = vec_ty.scalar.rust(vec_ty.scalar_bits); + let len = vec_ty.len; + let array_ty = quote! { [#rust_scalar; #len] }; + (vec![vec, quote! { &mut #array_ty }], quote! { () }) + } + OpSig::FromBytes => { + let bytes_ty = vec_ty.reinterpret(ScalarType::Unsigned, 8).rust(); + (vec![quote! { #bytes_ty<#level> }], vec) + } + OpSig::ToBytes => { + let bytes_ty = vec_ty.reinterpret(ScalarType::Unsigned, 8).rust(); + (vec![vec], quote! { #bytes_ty<#level> }) + } + }; + + let arg_decls = arg_names + .iter() + .zip(arg_tys) + .map(|(name, ty)| quote! { #name: #ty }) + .collect(); + + (arg_decls, arg_names, ret) + } + pub(crate) fn vec_trait_method_sig(&self) -> Option { let arg_names = self .sig @@ -457,6 +618,26 @@ impl Op { } } +/// Replace all identifiers named `from` in a token stream with `to`, recursing into token groups. +/// +/// This is used to turn generated method bodies that mention `self` into kernel bodies that mention +/// the concrete token parameter instead. +fn replace_ident(stream: TokenStream, from: &str, to: &Ident) -> TokenStream { + stream + .into_iter() + .map(|tree| match tree { + TokenTree::Group(group) => { + let mut new_group = + Group::new(group.delimiter(), replace_ident(group.stream(), from, to)); + new_group.set_span(group.span()); + TokenTree::Group(new_group) + } + TokenTree::Ident(ident) if ident.to_string() == from => TokenTree::Ident(to.clone()), + tree => tree, + }) + .collect() +} + fn splat_arg_ty(vec_ty: &VecType) -> TokenStream { if vec_ty.scalar == ScalarType::Mask { quote! { bool }