diff --git a/src/f32x4_.rs b/src/f32x4_.rs index 52a13ee9..8a107282 100644 --- a/src/f32x4_.rs +++ b/src/f32x4_.rs @@ -4,13 +4,13 @@ pick! { if #[cfg(target_feature="sse")] { #[derive(Default, Clone, Copy, PartialEq)] #[repr(C, align(16))] - pub struct f32x4 { sse: m128 } + pub struct f32x4 { pub(crate) sse: m128 } } else if #[cfg(target_feature="simd128")] { use core::arch::wasm32::*; #[derive(Clone, Copy)] #[repr(transparent)] - pub struct f32x4 { simd: v128 } + pub struct f32x4 { pub(crate) simd: v128 } impl Default for f32x4 { fn default() -> Self { @@ -26,7 +26,7 @@ pick! { } else { #[derive(Default, Clone, Copy, PartialEq)] #[repr(C, align(16))] - pub struct f32x4 { arr: [f32;4] } + pub struct f32x4 { pub(crate) arr: [f32;4] } } } @@ -664,14 +664,9 @@ impl f32x4 { /// values you get implementation defined behavior. #[inline] #[must_use] + #[deprecated(since = "0.7.6", note = "use `to_i32x4_round_fast` instead")] pub fn fast_round_int(self) -> i32x4 { - pick! { - if #[cfg(target_feature="sse2")] { - cast(convert_to_i32_m128i_from_m128(self.sse)) - } else { - self.round_int() - } - } + self.to_i32x4_round_fast() } /// Rounds each lane into an integer. This saturates out of range values and @@ -679,27 +674,9 @@ impl f32x4 { /// doesn't handle out of range values or NaNs. #[inline] #[must_use] + #[deprecated(since = "0.7.6", note = "use `to_i32x4_round` instead")] pub fn round_int(self) -> i32x4 { - pick! { - if #[cfg(target_feature="sse2")] { - // Based on: https://github.com/v8/v8/blob/210987a552a2bf2a854b0baa9588a5959ff3979d/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.h#L489-L504 - let non_nan_mask = self.cmp_eq(self); - let non_nan = self & non_nan_mask; - let flip_to_max: i32x4 = cast(self.cmp_ge(Self::splat(2147483648.0))); - let cast: i32x4 = cast(convert_to_i32_m128i_from_m128(non_nan.sse)); - flip_to_max ^ cast - } else if #[cfg(target_feature="simd128")] { - cast(Self { simd: i32x4_trunc_sat_f32x4(f32x4_nearest(self.simd)) }) - } else { - let rounded: [f32; 4] = cast(self.round()); - cast([ - rounded[0] as i32, - rounded[1] as i32, - rounded[2] as i32, - rounded[3] as i32, - ]) - } - } + self.to_i32x4_round() } /// Truncates each lane into an integer. This is a faster implementation than @@ -707,14 +684,9 @@ impl f32x4 { /// values you get implementation defined behavior. #[inline] #[must_use] + #[deprecated(since = "0.7.6", note = "use `to_i32x4_truncate_fast` instead")] pub fn fast_trunc_int(self) -> i32x4 { - pick! { - if #[cfg(target_feature="sse2")] { - cast(truncate_m128_to_m128i(self.sse)) - } else { - self.trunc_int() - } - } + self.to_i32x4_truncate_fast() } /// Truncates each lane into an integer. This saturates out of range values @@ -722,27 +694,9 @@ impl f32x4 { /// that doesn't handle out of range values or NaNs. #[inline] #[must_use] + #[deprecated(since = "0.7.6", note = "use `to_i32x4_truncate` instead")] pub fn trunc_int(self) -> i32x4 { - pick! 
{ - if #[cfg(target_feature="sse2")] { - // Based on: https://github.com/v8/v8/blob/210987a552a2bf2a854b0baa9588a5959ff3979d/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.h#L489-L504 - let non_nan_mask = self.cmp_eq(self); - let non_nan = self & non_nan_mask; - let flip_to_max: i32x4 = cast(self.cmp_ge(Self::splat(2147483648.0))); - let cast: i32x4 = cast(truncate_m128_to_m128i(non_nan.sse)); - flip_to_max ^ cast - } else if #[cfg(target_feature="simd128")] { - cast(Self { simd: i32x4_trunc_sat_f32x4(self.simd) }) - } else { - let n: [f32;4] = cast(self); - cast([ - n[0] as i32, - n[1] as i32, - n[2] as i32, - n[3] as i32, - ]) - } - } + self.to_i32x4_truncate() } #[inline] #[must_use] @@ -1037,7 +991,7 @@ impl f32x4 { // Find quadrant let y = (xa * TWO_OVER_PI).round(); - let q: i32x4 = y.round_int(); + let q: i32x4 = y.to_i32x4_round(); let x = y.mul_neg_add(DP3F, y.mul_neg_add(DP2F, y.mul_neg_add(DP1F, xa))); @@ -1417,7 +1371,7 @@ impl f32x4 { ); let ee = e1 + e2 + e3; - let ei = cast::<_, i32x4>(ee.round_int()); + let ei = cast::<_, i32x4>(ee.to_i32x4_round()); let ej = cast::<_, i32x4>(ei + (cast::<_, i32x4>(z) >> 23)); let overflow = cast::<_, f32x4>(ej.cmp_gt(i32x4::splat(0x0FF))) @@ -1451,7 +1405,7 @@ impl f32x4 { // Y into an integer let yi = y.cmp_eq(y.round()); // Is y odd? - let y_odd = cast::<_, i32x4>(y.round_int() << 31).round_float(); + let y_odd = cast::<_, i32x4>(y.to_i32x4_round() << 31).round_float(); let z1 = yi.blend(z | y_odd, self.cmp_eq(Self::ZERO).blend(z, Self::nan_pow())); @@ -1484,4 +1438,144 @@ impl f32x4 { pub fn as_array_ref(&self) -> &[f32; 4] { cast_ref(self) } + + /// Converts the first two f32 elements within this struct to f64 elements. + /// + /// The remaining elements are discarded. + #[inline] + #[must_use] + pub fn to_f64x2(self) -> f64x2 { + pick! { + if #[cfg(target_feature="sse2")] { + f64x2 { sse: convert_to_m128d_from_lower2_m128(self.sse) } + } else { + let arr = self.to_array(); + f64x2::new([ + f64::from(arr[0]), + f64::from(arr[1]), + ]) + } + } + } + + /// Converts the f32 elements within this struct to f64 elements. + #[inline] + #[must_use] + pub fn to_f64x4(self) -> f64x4 { + pick! { + if #[cfg(target_feature="avx")] { + f64x4 { avx: convert_to_m256d_from_m128(self.sse) } + } else { + let arr = self.to_array(); + f64x4::new([ + f64::from(arr[0]), + f64::from(arr[1]), + f64::from(arr[2]), + f64::from(arr[3]), + ]) + } + } + } + + /// Converts the f32 elements within this struct to i32 elements. + /// + /// The decimal portions of the values are truncated. + /// + /// This is a faster implementation than `to_i32x4_truncate`, + /// but it doesn't handle out of range values or NaNs. For those + /// values you get implementation defined behavior. + #[inline] + #[must_use] + pub fn to_i32x4_truncate_fast(self) -> i32x4 { + pick! { + if #[cfg(target_feature="sse2")] { + i32x4 { sse: truncate_m128_to_m128i(self.sse) } + } else { + self.to_i32x4_truncate() + } + } + } + + /// Converts the f32 elements within this struct to i32 elements. + /// + /// The decimal portions of the values are truncated. + /// + /// This saturates out of range values and turns NaNs into 0. + /// Use `to_i32x4_truncate_fast` for a faster implementation + /// that doesn't handle out of range values or NaNs. + #[inline] + #[must_use] + pub fn to_i32x4_truncate(self) -> i32x4 { + pick! 
{ + if #[cfg(target_feature="sse2")] { + // Based on: https://github.com/v8/v8/blob/210987a552a2bf2a854b0baa9588a5959ff3979d/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.h#L489-L504 + let non_nan_mask = self.cmp_eq(self); + let non_nan = self & non_nan_mask; + let flip_to_max: i32x4 = cast(self.cmp_ge(Self::splat(2147483648.0))); + let cast: i32x4 = cast(truncate_m128_to_m128i(non_nan.sse)); + flip_to_max ^ cast + } else if #[cfg(target_feature="simd128")] { + cast(Self { simd: i32x4_trunc_sat_f32x4(self.simd) }) + } else { + let n: [f32;4] = cast(self); + cast([ + n[0] as i32, + n[1] as i32, + n[2] as i32, + n[3] as i32, + ]) + } + } + } + + /// Converts the f32 elements within this struct to i32 elements. + /// + /// The decimal portions of the values are rounded to the nearest integer. + /// + /// This is a faster implementation than `to_i32x4_round`, + /// but it doesn't handle out of range values or NaNs. For those + /// values you get implementation defined behavior. + #[inline] + #[must_use] + pub fn to_i32x4_round_fast(self) -> i32x4 { + pick! { + if #[cfg(target_feature="sse2")] { + cast(convert_to_i32_m128i_from_m128(self.sse)) + } else { + self.to_i32x4_round() + } + } + } + + /// Converts the f32 elements within this struct to i32 elements. + /// + /// The decimal portions of the values are rounded to the nearest integer. + /// + /// This saturates out of range values and + /// turns NaNs into 0. Use `to_i32x4_round_fast` for a faster implementation that + /// doesn't handle out of range values or NaNs. + #[inline] + #[must_use] + pub fn to_i32x4_round(self) -> i32x4 { + pick! { + if #[cfg(target_feature="sse2")] { + // Based on: https://github.com/v8/v8/blob/210987a552a2bf2a854b0baa9588a5959ff3979d/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.h#L489-L504 + let non_nan_mask = self.cmp_eq(self); + let non_nan = self & non_nan_mask; + let flip_to_max: i32x4 = cast(self.cmp_ge(Self::splat(2147483648.0))); + let cast: i32x4 = cast(convert_to_i32_m128i_from_m128(non_nan.sse)); + flip_to_max ^ cast + } else if #[cfg(target_feature="simd128")] { + cast(Self { simd: i32x4_trunc_sat_f32x4(f32x4_nearest(self.simd)) }) + } else { + let rounded: [f32; 4] = cast(self.round()); + cast([ + rounded[0] as i32, + rounded[1] as i32, + rounded[2] as i32, + rounded[3] as i32, + ]) + } + } + } } diff --git a/src/f32x8_.rs b/src/f32x8_.rs index 070d7f3a..9e85a4d7 100644 --- a/src/f32x8_.rs +++ b/src/f32x8_.rs @@ -4,17 +4,17 @@ pick! { if #[cfg(target_feature="avx")] { #[derive(Default, Clone, Copy, PartialEq)] #[repr(C, align(32))] - pub struct f32x8 { avx: m256 } + pub struct f32x8 { pub(crate) avx: m256 } } else if #[cfg(target_feature="sse2")] { #[derive(Default, Clone, Copy, PartialEq)] #[repr(C, align(32))] - pub struct f32x8 { sse0: m128, sse1: m128 } + pub struct f32x8 { pub(crate) sse0: m128, pub(crate) sse1: m128 } } else if #[cfg(target_feature="simd128")] { use core::arch::wasm32::*; #[derive(Clone, Copy)] #[repr(C, align(32))] - pub struct f32x8 { simd0: v128, simd1: v128 } + pub struct f32x8 { pub(crate) simd0: v128, pub(crate) simd1: v128 } impl Default for f32x8 { fn default() -> Self { @@ -31,7 +31,7 @@ pick! { } else { #[derive(Default, Clone, Copy, PartialEq)] #[repr(C, align(32))] - pub struct f32x8 { arr: [f32;8] } + pub struct f32x8 { pub(crate) arr: [f32;8] } } } @@ -786,16 +786,9 @@ impl f32x8 { /// values you get implementation defined behavior. 
#[inline] #[must_use] + #[deprecated(since = "0.7.6", note = "use `to_i32x8_round_fast` instead")] pub fn fast_round_int(self) -> i32x8 { - pick! { - if #[cfg(target_feature="avx")] { - cast(convert_to_i32_m256i_from_m256(self.avx)) - } else if #[cfg(target_feature="sse2")] { - i32x8 { sse0: convert_to_i32_m128i_from_m128(self.sse0), sse1: convert_to_i32_m128i_from_m128(self.sse1) } - } else { - self.round_int() - } - } + self.to_i32x8_round_fast() } /// Rounds each lane into an integer. This saturates out of range values and @@ -803,41 +796,9 @@ impl f32x8 { /// doesn't handle out of range values or NaNs. #[inline] #[must_use] + #[deprecated(since = "0.7.6", note = "use `to_i32x8_round` instead")] pub fn round_int(self) -> i32x8 { - pick! { - if #[cfg(target_feature="avx")] { - // Based on: https://github.com/v8/v8/blob/210987a552a2bf2a854b0baa9588a5959ff3979d/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.h#L489-L504 - let non_nan_mask = self.cmp_eq(self); - let non_nan = self & non_nan_mask; - let flip_to_max: i32x8 = cast(self.cmp_ge(Self::splat(2147483648.0))); - let cast: i32x8 = cast(convert_to_i32_m256i_from_m256(non_nan.avx)); - flip_to_max ^ cast - } else if #[cfg(target_feature="sse2")] { - // Based on: https://github.com/v8/v8/blob/210987a552a2bf2a854b0baa9588a5959ff3979d/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.h#L489-L504 - let non_nan_mask = self.cmp_eq(self); - let non_nan = self & non_nan_mask; - let flip_to_max: i32x8 = cast(self.cmp_ge(Self::splat(2147483648.0))); - let cast: i32x8 = i32x8 { sse0: convert_to_i32_m128i_from_m128(non_nan.sse0), sse1: convert_to_i32_m128i_from_m128(non_nan.sse1) }; - flip_to_max ^ cast - } else if #[cfg(target_feature="simd128")] { - cast(Self { - simd0: i32x4_trunc_sat_f32x4(f32x4_nearest(self.simd0)), - simd1: i32x4_trunc_sat_f32x4(f32x4_nearest(self.simd1)), - }) - } else { - let rounded: [f32; 8] = cast(self.round()); - cast([ - rounded[0] as i32, - rounded[1] as i32, - rounded[2] as i32, - rounded[3] as i32, - rounded[4] as i32, - rounded[5] as i32, - rounded[6] as i32, - rounded[7] as i32, - ]) - } - } + self.to_i32x8_round() } /// Truncates each lane into an integer. This is a faster implementation than @@ -845,16 +806,9 @@ impl f32x8 { /// values you get implementation defined behavior. #[inline] #[must_use] + #[deprecated(since = "0.7.6", note = "use `to_i32x8_truncate_fast` instead")] pub fn fast_trunc_int(self) -> i32x8 { - pick! { - if #[cfg(all(target_feature="avx"))] { - cast(convert_truncate_to_i32_m256i_from_m256(self.avx)) - } else if #[cfg(target_feature="sse2")] { - i32x8 { sse0: truncate_m128_to_m128i(self.sse0), sse1: truncate_m128_to_m128i(self.sse1) } - } else { - self.trunc_int() - } - } + self.to_i32x8_truncate_fast() } /// Truncates each lane into an integer. This saturates out of range values @@ -862,41 +816,9 @@ impl f32x8 { /// that doesn't handle out of range values or NaNs. #[inline] #[must_use] + #[deprecated(since = "0.7.6", note = "use `to_i32x8_truncate` instead")] pub fn trunc_int(self) -> i32x8 { - pick! 
{ - if #[cfg(target_feature="avx")] { - // Based on: https://github.com/v8/v8/blob/210987a552a2bf2a854b0baa9588a5959ff3979d/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.h#L489-L504 - let non_nan_mask = self.cmp_eq(self); - let non_nan = self & non_nan_mask; - let flip_to_max: i32x8 = cast(self.cmp_ge(Self::splat(2147483648.0))); - let cast: i32x8 = cast(convert_truncate_to_i32_m256i_from_m256(non_nan.avx)); - flip_to_max ^ cast - } else if #[cfg(target_feature="sse2")] { - // Based on: https://github.com/v8/v8/blob/210987a552a2bf2a854b0baa9588a5959ff3979d/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.h#L489-L504 - let non_nan_mask = self.cmp_eq(self); - let non_nan = self & non_nan_mask; - let flip_to_max: i32x8 = cast(self.cmp_ge(Self::splat(2147483648.0))); - let cast: i32x8 = i32x8 { sse0: truncate_m128_to_m128i(non_nan.sse0), sse1: truncate_m128_to_m128i(non_nan.sse1) }; - flip_to_max ^ cast - } else if #[cfg(target_feature="simd128")] { - cast(Self { - simd0: i32x4_trunc_sat_f32x4(self.simd0), - simd1: i32x4_trunc_sat_f32x4(self.simd1), - }) - } else { - let n: [f32; 8] = cast(self); - cast([ - n[0] as i32, - n[1] as i32, - n[2] as i32, - n[3] as i32, - n[4] as i32, - n[5] as i32, - n[6] as i32, - n[7] as i32, - ]) - } - } + self.to_i32x8_truncate() } #[inline] #[must_use] @@ -1200,7 +1122,7 @@ impl f32x8 { // Find quadrant let y = (xa * TWO_OVER_PI).round(); - let q: i32x8 = y.round_int(); + let q: i32x8 = y.to_i32x8_round(); let x = y.mul_neg_add(DP3F, y.mul_neg_add(DP2F, y.mul_neg_add(DP1F, xa))); @@ -1635,7 +1557,7 @@ impl f32x8 { ); let ee = e1 + e2 + e3; - let ei = cast::<_, i32x8>(ee.round_int()); + let ei = cast::<_, i32x8>(ee.to_i32x8_round()); let ej = cast::<_, i32x8>(ei + (cast::<_, i32x8>(z) >> 23)); let overflow = cast::<_, f32x8>(ej.cmp_gt(i32x8::splat(0x0FF))) @@ -1665,7 +1587,7 @@ impl f32x8 { let yi = y.cmp_eq(y.round()); // Is y odd? - let y_odd = cast::<_, i32x8>(y.round_int() << 31).round_float(); + let y_odd = cast::<_, i32x8>(y.to_i32x8_round() << 31).round_float(); let z1 = yi.blend(z | y_odd, self.cmp_eq(Self::ZERO).blend(z, Self::nan_pow())); @@ -1698,6 +1620,140 @@ impl f32x8 { pub fn as_array_ref(&self) -> &[f32; 8] { cast_ref(self) } + + /// Converts the f32 elements within this struct to i32 elements. + /// + /// The decimal portions of the values are truncated. + /// + /// This is a faster implementation than + /// `to_i32x8_truncate`, but it doesn't handle out of range values or NaNs. For those + /// values you get implementation defined behavior. + #[inline] + #[must_use] + pub fn to_i32x8_truncate_fast(self) -> i32x8 { + pick! { + if #[cfg(all(target_feature="avx"))] { + cast(convert_truncate_to_i32_m256i_from_m256(self.avx)) + } else if #[cfg(target_feature="sse2")] { + i32x8 { sse0: truncate_m128_to_m128i(self.sse0), sse1: truncate_m128_to_m128i(self.sse1) } + } else { + self.to_i32x8_truncate() + } + } + } + + /// Converts the f32 elements within this struct to i32 elements. + /// + /// The decimal portions of the values are truncated. + /// + /// This saturates out of range values + /// and turns NaNs into 0. Use `to_i32x8_truncate_fast` for a faster implementation + /// that doesn't handle out of range values or NaNs. + #[inline] + #[must_use] + pub fn to_i32x8_truncate(self) -> i32x8 { + pick! 
{ + if #[cfg(target_feature="avx")] { + // Based on: https://github.com/v8/v8/blob/210987a552a2bf2a854b0baa9588a5959ff3979d/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.h#L489-L504 + let non_nan_mask = self.cmp_eq(self); + let non_nan = self & non_nan_mask; + let flip_to_max: i32x8 = cast(self.cmp_ge(Self::splat(2147483648.0))); + let cast: i32x8 = cast(convert_truncate_to_i32_m256i_from_m256(non_nan.avx)); + flip_to_max ^ cast + } else if #[cfg(target_feature="sse2")] { + // Based on: https://github.com/v8/v8/blob/210987a552a2bf2a854b0baa9588a5959ff3979d/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.h#L489-L504 + let non_nan_mask = self.cmp_eq(self); + let non_nan = self & non_nan_mask; + let flip_to_max: i32x8 = cast(self.cmp_ge(Self::splat(2147483648.0))); + let cast: i32x8 = i32x8 { sse0: truncate_m128_to_m128i(non_nan.sse0), sse1: truncate_m128_to_m128i(non_nan.sse1) }; + flip_to_max ^ cast + } else if #[cfg(target_feature="simd128")] { + cast(Self { + simd0: i32x4_trunc_sat_f32x4(self.simd0), + simd1: i32x4_trunc_sat_f32x4(self.simd1), + }) + } else { + let n: [f32; 8] = cast(self); + cast([ + n[0] as i32, + n[1] as i32, + n[2] as i32, + n[3] as i32, + n[4] as i32, + n[5] as i32, + n[6] as i32, + n[7] as i32, + ]) + } + } + } + + /// Converts the f32 elements within this struct to i32 elements. + /// + /// The decimal portions of the values are rounded to the nearest integer. + /// + /// This is a faster implementation than + /// `to_i32x8_round`, but it doesn't handle out of range values or NaNs. For those + /// values you get implementation defined behavior. + #[inline] + #[must_use] + pub fn to_i32x8_round_fast(self) -> i32x8 { + pick! { + if #[cfg(target_feature="avx")] { + cast(convert_to_i32_m256i_from_m256(self.avx)) + } else if #[cfg(target_feature="sse2")] { + i32x8 { sse0: convert_to_i32_m128i_from_m128(self.sse0), sse1: convert_to_i32_m128i_from_m128(self.sse1) } + } else { + self.to_i32x8_round() + } + } + } + + /// Converts the f32 elements within this struct to i32 elements. + /// + /// The decimal portions of the values are rounded to the nearest integer. + /// + /// This saturates out of range values and + /// turns NaNs into 0. Use `to_i32x8_round_fast` for a faster implementation that + /// doesn't handle out of range values or NaNs. + #[inline] + #[must_use] + pub fn to_i32x8_round(self) -> i32x8 { + pick! 
{ + if #[cfg(target_feature="avx")] { + // Based on: https://github.com/v8/v8/blob/210987a552a2bf2a854b0baa9588a5959ff3979d/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.h#L489-L504 + let non_nan_mask = self.cmp_eq(self); + let non_nan = self & non_nan_mask; + let flip_to_max: i32x8 = cast(self.cmp_ge(Self::splat(2147483648.0))); + let cast: i32x8 = cast(convert_to_i32_m256i_from_m256(non_nan.avx)); + flip_to_max ^ cast + } else if #[cfg(target_feature="sse2")] { + // Based on: https://github.com/v8/v8/blob/210987a552a2bf2a854b0baa9588a5959ff3979d/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.h#L489-L504 + let non_nan_mask = self.cmp_eq(self); + let non_nan = self & non_nan_mask; + let flip_to_max: i32x8 = cast(self.cmp_ge(Self::splat(2147483648.0))); + let cast: i32x8 = i32x8 { sse0: convert_to_i32_m128i_from_m128(non_nan.sse0), sse1: convert_to_i32_m128i_from_m128(non_nan.sse1) }; + flip_to_max ^ cast + } else if #[cfg(target_feature="simd128")] { + cast(Self { + simd0: i32x4_trunc_sat_f32x4(f32x4_nearest(self.simd0)), + simd1: i32x4_trunc_sat_f32x4(f32x4_nearest(self.simd1)), + }) + } else { + let rounded: [f32; 8] = cast(self.round()); + cast([ + rounded[0] as i32, + rounded[1] as i32, + rounded[2] as i32, + rounded[3] as i32, + rounded[4] as i32, + rounded[5] as i32, + rounded[6] as i32, + rounded[7] as i32, + ]) + } + } + } } impl Not for f32x8 { diff --git a/src/f64x2_.rs b/src/f64x2_.rs index 356e46c2..a3085592 100644 --- a/src/f64x2_.rs +++ b/src/f64x2_.rs @@ -4,13 +4,13 @@ pick! { if #[cfg(target_feature="sse2")] { #[derive(Default, Clone, Copy, PartialEq)] #[repr(C, align(16))] - pub struct f64x2 { sse: m128d } + pub struct f64x2 { pub(crate) sse: m128d } } else if #[cfg(target_feature="simd128")] { use core::arch::wasm32::*; #[derive(Clone, Copy)] #[repr(transparent)] - pub struct f64x2 { simd: v128 } + pub struct f64x2 { pub(crate) simd: v128 } impl Default for f64x2 { fn default() -> Self { @@ -26,7 +26,7 @@ pick! { } else { #[derive(Default, Clone, Copy, PartialEq)] #[repr(C, align(16))] - pub struct f64x2 { arr: [f64;2] } + pub struct f64x2 { pub(crate) arr: [f64;2] } } } @@ -599,9 +599,9 @@ impl f64x2 { } #[inline] #[must_use] + #[deprecated(since = "0.7.6", note = "use `to_i64x2_round` instead")] pub fn round_int(self) -> i64x2 { - let rounded: [f64; 2] = cast(self.round()); - cast([rounded[0] as i64, rounded[1] as i64]) + self.to_i64x2_round() } #[inline] #[must_use] @@ -1098,7 +1098,7 @@ impl f64x2 { let xa = self.abs(); let y = (xa * TWO_OVER_PI).round(); - let q = y.round_int(); + let q = y.to_i64x2_round(); let x = y.mul_neg_add(DP3, y.mul_neg_add(DP2, y.mul_neg_add(DP1, xa))); @@ -1459,7 +1459,7 @@ impl f64x2 { polynomial_13m!(x, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13) + f64x2::ONE; let ee = e1 + e2 + e3; - let ei = cast::<_, i64x2>(ee.round_int()); + let ei = cast::<_, i64x2>(ee.to_i64x2_round()); let ej = cast::<_, i64x2>(ei + (cast::<_, i64x2>(z) >> 52)); let overflow = cast::<_, f64x2>(!ej.cmp_lt(i64x2::splat(0x07FF))) @@ -1493,7 +1493,7 @@ impl f64x2 { // Y into an integer let yi = y.cmp_eq(y.round()); // Is y odd? - let y_odd = cast::<_, i64x2>(y.round_int() << 63).round_float(); + let y_odd = cast::<_, i64x2>(y.to_i64x2_round() << 63).round_float(); let z1 = yi.blend(z | y_odd, self.cmp_eq(Self::ZERO).blend(z, Self::nan_pow())); @@ -1527,6 +1527,88 @@ impl f64x2 { pub fn as_array_ref(&self) -> &[f64; 2] { cast_ref(self) } + + /// Converts the f64 elements within this struct to f32 elements. 
+ /// + /// The first two elements will be the downcast values from this struct. + /// The remaining elements will be zero. + #[inline] + #[must_use] + pub fn to_f32x4(self) -> f32x4 { + pick! { + if #[cfg(target_feature="sse2")] { + f32x4 { sse: convert_to_m128_from_m128d(self.sse) } + } else { + let arr = self.to_array(); + f32x4::new([ + arr[0] as f32, + arr[1] as f32, + 0.0f32, + 0.0f32 + ]) + } + } + } + + /// Converts the f64 elements within this struct to i32 elements. + /// + /// The decimal portions of the values are truncated. + /// + /// The first two elements will be the downcast values from this struct. + /// The remaining elements will be zero. + #[inline] + #[must_use] + pub fn to_i32x4_truncate(self) -> i32x4 { + pick! { + if #[cfg(target_feature="sse2")] { + i32x4 { sse: truncate_m128d_to_m128i(self.sse) } + } else { + let arr = self.to_array(); + i32x4::new([ + arr[0] as i32, + arr[1] as i32, + 0i32, + 0i32, + ]) + } + } + } + + /// Converts the f64 elements within this struct to i32 elements. + /// + /// The decimal portions of the values are rounded to the nearest integer. + /// + /// The first two elements will be the downcast values from this struct. + /// The remaining elements will be zero. + #[inline] + #[must_use] + pub fn to_i32x4_round(self) -> i32x4 { + pick! { + if #[cfg(target_feature="sse2")] { + i32x4 { sse: convert_to_i32_m128i_from_m128d(self.sse) } + } else { + let rounded = self.round().to_array(); + i32x4::new([ + rounded[0] as i32, + rounded[1] as i32, + 0i32, + 0i32, + ]) + } + } + } + + /// Converts the f64 elements within this struct to i64 elements. + /// + /// The decimal portions of the values are rounded to the nearest integer. + /// + /// There is no direct SIMD instruction for this, so it may be slower than `to_i32x4_round`. + #[inline] + #[must_use] + pub fn to_i64x2_round(self) -> i64x2 { + let rounded: [f64; 2] = cast(self.round()); + cast([rounded[0] as i64, rounded[1] as i64]) + } } impl Not for f64x2 { diff --git a/src/f64x4_.rs b/src/f64x4_.rs index 6e709ff6..72982e41 100644 --- a/src/f64x4_.rs +++ b/src/f64x4_.rs @@ -4,17 +4,17 @@ pick! { if #[cfg(target_feature="avx")] { #[derive(Default, Clone, Copy, PartialEq)] #[repr(C, align(32))] - pub struct f64x4 { avx: m256d } + pub struct f64x4 { pub(crate) avx: m256d } } else if #[cfg(target_feature="sse2")] { #[derive(Default, Clone, Copy, PartialEq)] #[repr(C, align(32))] - pub struct f64x4 { sse0: m128d, sse1: m128d } + pub struct f64x4 { pub(crate) sse0: m128d, pub(crate) sse1: m128d } } else if #[cfg(target_feature="simd128")] { use core::arch::wasm32::*; #[derive(Clone, Copy)] #[repr(C, align(32))] - pub struct f64x4 { simd0: v128, simd1: v128 } + pub struct f64x4 { pub(crate) simd0: v128, pub(crate) simd1: v128 } impl Default for f64x4 { fn default() -> Self { @@ -31,7 +31,7 @@ pick! 
{ } else { #[derive(Default, Clone, Copy, PartialEq)] #[repr(C, align(32))] - pub struct f64x4 { arr: [f64;4] } + pub struct f64x4 { pub(crate) arr: [f64;4] } } } @@ -694,15 +694,9 @@ impl f64x4 { #[inline] #[must_use] + #[deprecated(since = "0.7.6", note = "use `to_i64x4_round` instead")] pub fn round_int(self) -> i64x4 { - // NOTE:No optimization for this currently available so delegate to LLVM - let rounded: [f64; 4] = cast(self.round()); - cast([ - rounded[0] as i64, - rounded[1] as i64, - rounded[2] as i64, - rounded[3] as i64, - ]) + self.to_i64x4_round() } #[inline] @@ -1213,7 +1207,7 @@ impl f64x4 { let xa = self.abs(); let y = (xa * TWO_OVER_PI).round(); - let q = y.round_int(); + let q = y.to_i64x4_round(); let x = y.mul_neg_add(DP3, y.mul_neg_add(DP2, y.mul_neg_add(DP1, xa))); @@ -1587,7 +1581,7 @@ impl f64x4 { polynomial_13m!(x, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13) + f64x4::ONE; let ee = e1 + e2 + e3; - let ei = cast::<_, i64x4>(ee.round_int()); + let ei = cast::<_, i64x4>(ee.to_i64x4_round()); let ej = cast::<_, i64x4>(ei + (cast::<_, i64x4>(z) >> 52)); let overflow = cast::<_, f64x4>(!ej.cmp_lt(i64x4::splat(0x07FF))) @@ -1622,7 +1616,7 @@ impl f64x4 { // Y into an integer let yi = y.cmp_eq(y.round()); // Is y odd? - let y_odd = cast::<_, i64x4>(y.round_int() << 63).round_float(); + let y_odd = cast::<_, i64x4>(y.to_i64x4_round() << 63).round_float(); let z1 = yi.blend(z | y_odd, self.cmp_eq(Self::ZERO).blend(z, Self::nan_pow())); x_sign.blend(z1, z) @@ -1654,6 +1648,84 @@ impl f64x4 { pub fn as_array_ref(&self) -> &[f64; 4] { cast_ref(self) } + + /// Converts the f64 elements within this struct to f32 elements. + #[inline] + #[must_use] + pub fn to_f32x4(self) -> f32x4 { + pick! { + if #[cfg(target_feature="avx2")] { + f32x4 { sse: convert_to_m128_from_m256d(self.avx) } + } else { + let arr = self.to_array(); + f32x4::new([ + arr[0] as f32, + arr[1] as f32, + arr[2] as f32, + arr[3] as f32, + ]) + } + } + } + + /// Converts the f64 elements within this struct to i32 elements. + /// + /// The decimal portions of the values are truncated. + #[inline] + #[must_use] + pub fn to_i32x4_truncate(self) -> i32x4 { + pick! { + if #[cfg(target_feature="avx2")] { + i32x4 { sse: convert_truncate_to_i32_m128i_from_m256d(self.avx) } + } else { + let arr = self.to_array(); + i32x4::new([ + arr[0] as i32, + arr[1] as i32, + arr[2] as i32, + arr[3] as i32, + ]) + } + } + } + + /// Converts the f64 elements within this struct to i32 elements. + /// + /// The decimal portions of the values are rounded to the nearest integer. + #[inline] + #[must_use] + pub fn to_i32x4_round(self) -> i32x4 { + pick! { + if #[cfg(target_feature="avx2")] { + i32x4 { sse: convert_to_i32_m128i_from_m256d(self.avx) } + } else { + let rounded = self.round().to_array(); + i32x4::new([ + rounded[0] as i32, + rounded[1] as i32, + rounded[2] as i32, + rounded[3] as i32, + ]) + } + } + } + + /// Converts the f64 elements within this struct to i64 elements. + /// + /// The decimal portions of the values are rounded to the nearest integer. + /// + /// There is no direct SIMD instruction for this, so it may be slower than `to_i32x4_round`. + #[inline] + #[must_use] + pub fn to_i64x4_round(self) -> i64x4 { + let rounded: [f64; 4] = cast(self.round()); + cast([ + rounded[0] as i64, + rounded[1] as i64, + rounded[2] as i64, + rounded[3] as i64, + ]) + } } impl Not for f64x4 { diff --git a/src/i16x16_.rs b/src/i16x16_.rs index 59654d65..f691fbb1 100644 --- a/src/i16x16_.rs +++ b/src/i16x16_.rs @@ -4,7 +4,7 @@ pick! 
{ if #[cfg(target_feature="avx2")] { #[derive(Default, Clone, Copy, PartialEq, Eq)] #[repr(C, align(32))] - pub struct i16x16 { avx2: m256i } + pub struct i16x16 { pub(crate) avx2: m256i } } else if #[cfg(target_feature="sse2")] { #[derive(Default, Clone, Copy, PartialEq, Eq)] #[repr(C, align(32))] @@ -14,7 +14,7 @@ pick! { #[derive(Clone, Copy)] #[repr(C, align(32))] - pub struct i16x16 { simd0: v128, simd1: v128 } + pub struct i16x16 { pub(crate) simd0: v128, pub(crate) simd1: v128 } impl Default for i16x16 { fn default() -> Self { @@ -32,7 +32,7 @@ pick! { } else { #[derive(Default, Clone, Copy, PartialEq, Eq)] #[repr(C, align(32))] - pub struct i16x16 { arr: [i16;16] } + pub struct i16x16 { pub(crate) arr: [i16;16] } } } diff --git a/src/i16x8_.rs b/src/i16x8_.rs index f0a13f84..78720635 100644 --- a/src/i16x8_.rs +++ b/src/i16x8_.rs @@ -4,13 +4,13 @@ pick! { if #[cfg(target_feature="sse2")] { #[derive(Default, Clone, Copy, PartialEq, Eq)] #[repr(C, align(16))] - pub struct i16x8 { sse: m128i } + pub struct i16x8 { pub(crate) sse: m128i } } else if #[cfg(target_feature="simd128")] { use core::arch::wasm32::*; #[derive(Clone, Copy)] #[repr(transparent)] - pub struct i16x8 { simd: v128 } + pub struct i16x8 { pub(crate) simd: v128 } impl Default for i16x8 { fn default() -> Self { @@ -28,7 +28,7 @@ pick! { } else { #[derive(Default, Clone, Copy, PartialEq, Eq)] #[repr(C, align(16))] - pub struct i16x8 { arr: [i16;8] } + pub struct i16x8 { pub(crate) arr: [i16;8] } } } @@ -471,4 +471,90 @@ impl i16x8 { pub fn as_array_ref(&self) -> &[i16; 8] { cast_ref(self) } + + /// Converts the first four i16 elements within this struct to i32 elements. + /// + /// The remaining elements will be discarded. + #[inline] + #[must_use] + pub fn to_i32x4(self) -> i32x4 { + pick! { + if #[cfg(target_feature="sse4.1")] { + i32x4 { sse: convert_to_i32_m128i_from_lower4_i16_m128i(self.sse) } + } else { + let arr = self.to_array(); + i32x4::new([ + i32::from(arr[0]), + i32::from(arr[1]), + i32::from(arr[2]), + i32::from(arr[3]), + ]) + } + } + } + + /// Converts the i16 elements within this struct to i32 elements. + #[inline] + #[must_use] + pub fn to_i32x8(self) -> i32x8 { + pick! { + if #[cfg(target_feature="avx2")] { + i32x8 { avx2: convert_to_i32_m256i_from_i16_m128i(self.sse) } + } else { + let arr = self.to_array(); + i32x8::new([ + i32::from(arr[0]), + i32::from(arr[1]), + i32::from(arr[2]), + i32::from(arr[3]), + i32::from(arr[4]), + i32::from(arr[5]), + i32::from(arr[6]), + i32::from(arr[7]), + ]) + } + } + } + + /// Converts the first two i16 elements within this struct to i64 elements. + /// + /// The remaining elements will be discarded. + #[inline] + #[must_use] + pub fn to_i64x2(self) -> i64x2 { + pick! { + if #[cfg(target_feature="sse4.1")] { + // Pretty sure this function is misnamed in the `safe_arch` crate. + // It calls the `_mm_cvtepi16_epi64` intrinsic. + i64x2 { sse: convert_to_i16_m128i_from_lower2_i16_m128i(self.sse) } + } else { + let arr = self.to_array(); + i64x2::new([ + i64::from(arr[0]), + i64::from(arr[1]), + ]) + } + } + } + + /// Converts the first four i16 elements within this struct to i64 elements. + /// + /// The remaining elements will be discarded. + #[inline] + #[must_use] + pub fn to_i64x4(self) -> i64x4 { + pick! 
{ + if #[cfg(target_feature="avx2")] { + i64x4 { avx2: convert_to_i64_m256i_from_lower4_i16_m128i(self.sse) } + } else { + let arr = self.to_array(); + i64x4::new([ + i64::from(arr[0]), + i64::from(arr[1]), + i64::from(arr[2]), + i64::from(arr[3]), + ]) + } + } + } } diff --git a/src/i32x4_.rs b/src/i32x4_.rs index f015a61c..c683df27 100644 --- a/src/i32x4_.rs +++ b/src/i32x4_.rs @@ -4,13 +4,13 @@ pick! { if #[cfg(target_feature="sse2")] { #[derive(Default, Clone, Copy, PartialEq, Eq)] #[repr(C, align(16))] - pub struct i32x4 { sse: m128i } + pub struct i32x4 { pub(crate) sse: m128i } } else if #[cfg(target_feature="simd128")] { use core::arch::wasm32::*; #[derive(Clone, Copy)] #[repr(transparent)] - pub struct i32x4 { simd: v128 } + pub struct i32x4 { pub(crate) simd: v128 } impl Default for i32x4 { fn default() -> Self { @@ -28,7 +28,7 @@ pick! { } else { #[derive(Default, Clone, Copy, PartialEq, Eq)] #[repr(C, align(16))] - pub struct i32x4 { arr: [i32;4] } + pub struct i32x4 { pub(crate) arr: [i32;4] } } } @@ -489,4 +489,99 @@ impl i32x4 { pub fn as_array_ref(&self) -> &[i32; 4] { cast_ref(self) } + + /// Converts the first two i32 elements within this struct to i64 elements. + /// + /// The remaining elements will be discarded. + #[inline] + #[must_use] + pub fn to_i64x2(self) -> i64x2 { + pick! { + if #[cfg(target_feature="sse4.1")] { + i64x2 { sse: convert_to_i64_m128i_from_lower2_i32_m128i(self.sse) } + } else { + let arr = self.to_array(); + i64x2::new([ + i64::from(arr[0]), + i64::from(arr[1]), + ]) + } + } + } + + /// Converts the i32 elements within this struct to i64 elements. + #[inline] + #[must_use] + pub fn to_i64x4(self) -> i64x4 { + pick! { + if #[cfg(target_feature="avx2")] { + i64x4 { avx2: convert_to_i64_m256i_from_i32_m128i(self.sse) } + } else { + let arr = self.to_array(); + i64x4::new([ + i64::from(arr[0]), + i64::from(arr[1]), + i64::from(arr[2]), + i64::from(arr[3]), + ]) + } + } + } + + /// Converts the first two i32 elements within this struct to f64 elements. + /// + /// The remaining elements will be discarded. + #[inline] + #[must_use] + pub fn to_f64x2(self) -> f64x2 { + pick! { + if #[cfg(target_feature="sse2")] { + f64x2 { sse: convert_to_m128d_from_lower2_i32_m128i(self.sse) } + } else { + let arr = self.to_array(); + f64x2::new([ + f64::from(arr[0]), + f64::from(arr[1]), + ]) + } + } + } + + /// Converts the i32 elements within this struct to f64 elements. + #[inline] + #[must_use] + pub fn to_f64x4(self) -> f64x4 { + pick! { + if #[cfg(target_feature="avx")] { + f64x4 { avx: convert_to_m256d_from_i32_m128i(self.sse) } + } else { + let arr = self.to_array(); + f64x4::new([ + f64::from(arr[0]), + f64::from(arr[1]), + f64::from(arr[2]), + f64::from(arr[3]), + ]) + } + } + } + + /// Converts the i32 elements within this struct to f32 elements. + #[inline] + #[must_use] + pub fn to_f32x4(self) -> f32x4 { + pick! { + if #[cfg(target_feature="sse2")] { + f32x4 { sse: convert_to_m128_from_i32_m128i(self.sse) } + } else { + let arr = self.to_array(); + f32x4::new([ + arr[0] as f32, + arr[1] as f32, + arr[2] as f32, + arr[3] as f32, + ]) + } + } + } } diff --git a/src/i32x8_.rs b/src/i32x8_.rs index dcab8712..a4f8ae91 100644 --- a/src/i32x8_.rs +++ b/src/i32x8_.rs @@ -4,7 +4,7 @@ pick! 
{ if #[cfg(target_feature="avx2")] { #[derive(Default, Clone, Copy, PartialEq, Eq)] #[repr(C, align(32))] - pub struct i32x8 { avx2: m256i } + pub struct i32x8 { pub(crate) avx2: m256i } } else if #[cfg(target_feature="sse2")] { #[derive(Default, Clone, Copy, PartialEq, Eq)] #[repr(C, align(32))] @@ -14,7 +14,7 @@ pick! { #[derive(Clone, Copy)] #[repr(C, align(32))] - pub struct i32x8 { simd0: v128, simd1: v128 } + pub struct i32x8 { pub(crate) simd0: v128, pub(crate) simd1: v128 } impl Default for i32x8 { fn default() -> Self { @@ -32,7 +32,7 @@ pick! { } else { #[derive(Default, Clone, Copy, PartialEq, Eq)] #[repr(C, align(32))] - pub struct i32x8 { arr: [i32;8] } + pub struct i32x8 { pub(crate) arr: [i32;8] } } } @@ -586,6 +586,29 @@ impl i32x8 { pub fn as_array_ref(&self) -> &[i32; 8] { cast_ref(self) } + + /// Converts the i32 elements within this struct to f32 elements. + #[inline] + #[must_use] + pub fn to_f32x8(self) -> f32x8 { + pick! { + if #[cfg(target_feature="avx2")] { + f32x8 { avx: convert_to_m256_from_i32_m256i(self.avx2) } + } else { + let arr = self.to_array(); + f32x8::new([ + arr[0] as f32, + arr[1] as f32, + arr[2] as f32, + arr[3] as f32, + arr[4] as f32, + arr[5] as f32, + arr[6] as f32, + arr[7] as f32, + ]) + } + } + } } impl Not for i32x8 { diff --git a/src/i64x2_.rs b/src/i64x2_.rs index 5a3cc18b..e1e0820e 100644 --- a/src/i64x2_.rs +++ b/src/i64x2_.rs @@ -4,13 +4,13 @@ pick! { if #[cfg(target_feature="sse2")] { #[derive(Default, Clone, Copy, PartialEq, Eq)] #[repr(C, align(16))] - pub struct i64x2 { sse: m128i } + pub struct i64x2 { pub(crate) sse: m128i } } else if #[cfg(target_feature="simd128")] { use core::arch::wasm32::*; #[derive(Clone, Copy)] #[repr(transparent)] - pub struct i64x2 { simd: v128 } + pub struct i64x2 { pub(crate) simd: v128 } impl Default for i64x2 { fn default() -> Self { @@ -28,7 +28,7 @@ pick! { } else { #[derive(Default, Clone, Copy, PartialEq, Eq)] #[repr(C, align(16))] - pub struct i64x2 { arr: [i64;2] } + pub struct i64x2 { pub(crate) arr: [i64;2] } } } diff --git a/src/i64x4_.rs b/src/i64x4_.rs index d5aabb13..e7f9cd51 100644 --- a/src/i64x4_.rs +++ b/src/i64x4_.rs @@ -4,17 +4,17 @@ pick! { if #[cfg(target_feature="avx2")] { #[derive(Default, Clone, Copy, PartialEq, Eq)] #[repr(C, align(32))] - pub struct i64x4 { avx2: m256i } + pub struct i64x4 { pub(crate) avx2: m256i } } else if #[cfg(target_feature="sse2")] { #[derive(Default, Clone, Copy, PartialEq, Eq)] #[repr(C, align(32))] - pub struct i64x4 { sse0: m128i, sse1: m128i } + pub struct i64x4 { pub(crate) sse0: m128i, pub(crate) sse1: m128i } } else if #[cfg(target_feature="simd128")] { use core::arch::wasm32::*; #[derive(Clone, Copy)] #[repr(C, align(32))] - pub struct i64x4 { simd0: v128, simd1: v128 } + pub struct i64x4 { pub(crate) simd0: v128, pub(crate) simd1: v128 } impl Default for i64x4 { fn default() -> Self { @@ -32,7 +32,7 @@ pick! { } else { #[derive(Default, Clone, Copy, PartialEq, Eq)] #[repr(C, align(32))] - pub struct i64x4 { arr: [i64;4] } + pub struct i64x4 { pub(crate) arr: [i64;4] } } } diff --git a/src/i8x16_.rs b/src/i8x16_.rs index c924a6a4..7240a3b3 100644 --- a/src/i8x16_.rs +++ b/src/i8x16_.rs @@ -4,13 +4,13 @@ pick! 
{ if #[cfg(target_feature="sse2")] { #[derive(Default, Clone, Copy, PartialEq, Eq)] #[repr(C, align(16))] - pub struct i8x16 { sse: m128i } + pub struct i8x16 { pub(crate) sse: m128i } } else if #[cfg(target_feature="simd128")] { use core::arch::wasm32::*; #[derive(Clone, Copy)] #[repr(transparent)] - pub struct i8x16 { simd: v128 } + pub struct i8x16 { pub(crate) simd: v128 } impl Default for i8x16 { fn default() -> Self { @@ -28,7 +28,7 @@ pick! { } else { #[derive(Default, Clone, Copy, PartialEq, Eq)] #[repr(C, align(16))] - pub struct i8x16 { arr: [i8;16] } + pub struct i8x16 { pub(crate) arr: [i8;16] } } } @@ -488,4 +488,146 @@ impl i8x16 { pub fn as_array_ref(&self) -> &[i8; 16] { cast_ref(self) } + + /// Converts the first eight i8 elements within this struct to i16 elements. + /// + /// The remaining elements will be discarded. + #[inline] + #[must_use] + pub fn to_i16x8(self) -> i16x8 { + pick! { + if #[cfg(target_feature="sse4.1")] { + i16x8 { sse: convert_to_i16_m128i_from_lower8_i8_m128i(self.sse) } + } else { + let arr = self.to_array(); + i16x8::new([ + i16::from(arr[0]), + i16::from(arr[1]), + i16::from(arr[2]), + i16::from(arr[3]), + i16::from(arr[4]), + i16::from(arr[5]), + i16::from(arr[6]), + i16::from(arr[7]), + ]) + } + } + } + + /// Converts the i8 elements within this struct to i16 elements. + #[inline] + #[must_use] + pub fn to_i16x16(self) -> i16x16 { + pick! { + if #[cfg(target_feature="avx2")] { + i16x16 { avx2: convert_to_i16_m256i_from_i8_m128i(self.sse) } + } else { + let arr = self.to_array(); + i16x16::new([ + i16::from(arr[0]), + i16::from(arr[1]), + i16::from(arr[2]), + i16::from(arr[3]), + i16::from(arr[4]), + i16::from(arr[5]), + i16::from(arr[6]), + i16::from(arr[7]), + i16::from(arr[8]), + i16::from(arr[9]), + i16::from(arr[10]), + i16::from(arr[11]), + i16::from(arr[12]), + i16::from(arr[13]), + i16::from(arr[14]), + i16::from(arr[15]), + ]) + } + } + } + + /// Converts the first four i8 elements within this struct to i32 elements. + /// + /// The remaining elements will be discarded. + #[inline] + #[must_use] + pub fn to_i32x4(self) -> i32x4 { + pick! { + if #[cfg(target_feature="sse4.1")] { + i32x4 { sse: convert_to_i32_m128i_from_lower4_i8_m128i(self.sse) } + } else { + let arr = self.to_array(); + i32x4::new([ + i32::from(arr[0]), + i32::from(arr[1]), + i32::from(arr[2]), + i32::from(arr[3]), + ]) + } + } + } + + /// Converts the first eight i8 elements within this struct to i32 elements. + /// + /// The remaining elements will be discarded. + #[inline] + #[must_use] + pub fn to_i32x8(self) -> i32x8 { + pick! { + if #[cfg(target_feature="avx2")] { + i32x8 { avx2: convert_to_i32_m256i_from_lower8_i8_m128i(self.sse) } + } else { + let arr = self.to_array(); + i32x8::new([ + i32::from(arr[0]), + i32::from(arr[1]), + i32::from(arr[2]), + i32::from(arr[3]), + i32::from(arr[4]), + i32::from(arr[5]), + i32::from(arr[6]), + i32::from(arr[7]), + ]) + } + } + } + + /// Converts the first two i8 elements within this struct to i64 elements. + /// + /// The remaining elements will be discarded. + #[inline] + #[must_use] + pub fn to_i64x2(self) -> i64x2 { + pick! { + if #[cfg(target_feature="sse4.1")] { + i64x2 { sse: convert_to_i64_m128i_from_lower2_i8_m128i(self.sse) } + } else { + let arr = self.to_array(); + i64x2::new([ + i64::from(arr[0]), + i64::from(arr[1]), + ]) + } + } + } + + /// Converts the first four i8 elements within this struct to i64 elements. + /// + /// The remaining elements will be discarded. 
+ #[inline] + #[must_use] + pub fn to_i64x4(self) -> i64x4 { + pick! { + if #[cfg(target_feature="avx2")] { + i64x4 { avx2: convert_to_i64_m256i_from_lower4_i8_m128i(self.sse) } + } else { + let arr = self.to_array(); + i64x4::new([ + i64::from(arr[0]), + i64::from(arr[1]), + i64::from(arr[2]), + i64::from(arr[3]), + ]) + } + } + } } diff --git a/src/i8x32_.rs b/src/i8x32_.rs index fdff9ede..31786fd3 100644 --- a/src/i8x32_.rs +++ b/src/i8x32_.rs @@ -4,17 +4,17 @@ pick! { if #[cfg(target_feature="avx2")] { #[derive(Default, Clone, Copy, PartialEq, Eq)] #[repr(C, align(32))] - pub struct i8x32 { avx: m256i } + pub struct i8x32 { pub(crate) avx: m256i } } else if #[cfg(target_feature="sse2")] { #[derive(Default, Clone, Copy, PartialEq, Eq)] #[repr(C, align(32))] - pub struct i8x32 { sse0: m128i, sse1: m128i } + pub struct i8x32 { pub(crate) sse0: m128i, pub(crate) sse1: m128i } } else if #[cfg(target_feature="simd128")] { use core::arch::wasm32::*; #[derive(Clone, Copy)] #[repr(C, align(32))] - pub struct i8x32 { simd0: v128, simd1: v128 } + pub struct i8x32 { pub(crate) simd0: v128, pub(crate) simd1: v128 } impl Default for i8x32 { fn default() -> Self { @@ -32,7 +32,7 @@ pick! { } else { #[derive(Default, Clone, Copy, PartialEq, Eq)] #[repr(C, align(32))] - pub struct i8x32 { arr: [i8;32] } + pub struct i8x32 { pub(crate) arr: [i8;32] } } } diff --git a/src/u16x8_.rs b/src/u16x8_.rs index b800f28e..6d69eba1 100644 --- a/src/u16x8_.rs +++ b/src/u16x8_.rs @@ -4,13 +4,13 @@ pick! { if #[cfg(target_feature="sse2")] { #[derive(Default, Clone, Copy, PartialEq, Eq)] #[repr(C, align(16))] - pub struct u16x8 { sse: m128i } + pub struct u16x8 { pub(crate) sse: m128i } } else if #[cfg(target_feature="simd128")] { use core::arch::wasm32::*; #[derive(Clone, Copy)] #[repr(transparent)] - pub struct u16x8 { simd: v128 } + pub struct u16x8 { pub(crate) simd: v128 } impl Default for u16x8 { fn default() -> Self { @@ -28,7 +28,7 @@ pick! { } else { #[derive(Default, Clone, Copy, PartialEq, Eq)] #[repr(C, align(16))] - pub struct u16x8 { arr: [u16;8] } + pub struct u16x8 { pub(crate) arr: [u16;8] } } } @@ -414,4 +414,172 @@ impl u16x8 { pub fn as_array_ref(&self) -> &[u16; 8] { cast_ref(self) } + + /// Converts the first four u16 elements within this struct to u32 elements. + /// + /// The remaining elements will be discarded. + #[inline] + #[must_use] + pub fn to_u32x4(self) -> u32x4 { + pick! { + if #[cfg(target_feature="sse4.1")] { + u32x4 { sse: convert_to_u32_m128i_from_lower4_u16_m128i(self.sse) } + } else { + let arr = self.to_array(); + u32x4::new([ + u32::from(arr[0]), + u32::from(arr[1]), + u32::from(arr[2]), + u32::from(arr[3]), + ]) + } + } + } + + /// Converts the first four u16 elements within this struct to i32 elements. + /// + /// The remaining elements will be discarded. + #[inline] + #[must_use] + pub fn to_i32x4(self) -> i32x4 { + pick! { + if #[cfg(target_feature="sse4.1")] { + i32x4 { sse: convert_to_u32_m128i_from_lower4_u16_m128i(self.sse) } + } else { + let arr = self.to_array(); + i32x4::new([ + i32::from(arr[0]), + i32::from(arr[1]), + i32::from(arr[2]), + i32::from(arr[3]), + ]) + } + } + } + + /// Converts the u16 elements within this struct to u32 elements. + #[inline] + #[must_use] + pub fn to_u32x8(self) -> u32x8 { + pick! 
{ + if #[cfg(target_feature="avx2")] { + u32x8 { avx2: convert_to_i32_m256i_from_u16_m128i(self.sse) } + } else { + let arr = self.to_array(); + u32x8::new([ + u32::from(arr[0]), + u32::from(arr[1]), + u32::from(arr[2]), + u32::from(arr[3]), + u32::from(arr[4]), + u32::from(arr[5]), + u32::from(arr[6]), + u32::from(arr[7]), + ]) + } + } + } + + /// Converts the u16 elements within this struct to i32 elements. + #[inline] + #[must_use] + pub fn to_i32x8(self) -> i32x8 { + pick! { + if #[cfg(target_feature="avx2")] { + i32x8 { avx2: convert_to_i32_m256i_from_u16_m128i(self.sse) } + } else { + let arr = self.to_array(); + i32x8::new([ + i32::from(arr[0]), + i32::from(arr[1]), + i32::from(arr[2]), + i32::from(arr[3]), + i32::from(arr[4]), + i32::from(arr[5]), + i32::from(arr[6]), + i32::from(arr[7]), + ]) + } + } + } + + /// Converts the first two u16 elements within this struct to u64 elements. + /// + /// The remaining elements will be discarded. + #[inline] + #[must_use] + pub fn to_u64x2(self) -> u64x2 { + pick! { + if #[cfg(target_feature="sse4.1")] { + u64x2 { sse: convert_to_u64_m128i_from_lower2_u16_m128i(self.sse) } + } else { + let arr = self.to_array(); + u64x2::new([ + u64::from(arr[0]), + u64::from(arr[1]), + ]) + } + } + } + + /// Converts the first two u16 elements within this struct to i64 elements. + /// + /// The remaining elements will be discarded. + #[inline] + #[must_use] + pub fn to_i64x2(self) -> i64x2 { + pick! { + if #[cfg(target_feature="sse4.1")] { + i64x2 { sse: convert_to_u64_m128i_from_lower2_u16_m128i(self.sse) } + } else { + let arr = self.to_array(); + i64x2::new([ + i64::from(arr[0]), + i64::from(arr[1]), + ]) + } + } + } + + /// Converts the first four u16 elements within this struct to u64 elements. + /// + /// The remaining elements will be discarded. + #[inline] + #[must_use] + pub fn to_u64x4(self) -> u64x4 { + pick! { + if #[cfg(target_feature="avx2")] { + u64x4 { avx2: convert_to_i64_m256i_from_lower4_u16_m128i(self.sse) } + } else { + let arr = self.to_array(); + u64x4::new([ + u64::from(arr[0]), + u64::from(arr[1]), + u64::from(arr[2]), + u64::from(arr[3]), + ]) + } + } + } + + /// Converts the first four u16 elements within this struct to i64 elements. + /// + /// The remaining elements will be discarded. + #[inline] + #[must_use] + pub fn to_i64x4(self) -> i64x4 { + pick! { + if #[cfg(target_feature="avx2")] { + i64x4 { avx2: convert_to_i64_m256i_from_lower4_u16_m128i(self.sse) } + } else { + let arr = self.to_array(); + i64x4::new([ + i64::from(arr[0]), + i64::from(arr[1]), + i64::from(arr[2]), + i64::from(arr[3]), + ]) + } + } + } } diff --git a/src/u32x4_.rs b/src/u32x4_.rs index f7f673ad..0c06dc3c 100644 --- a/src/u32x4_.rs +++ b/src/u32x4_.rs @@ -4,13 +4,13 @@ pick! { if #[cfg(target_feature="sse2")] { #[derive(Default, Clone, Copy, PartialEq, Eq)] #[repr(C, align(16))] - pub struct u32x4 { sse: m128i } + pub struct u32x4 { pub(crate) sse: m128i } } else if #[cfg(target_feature="simd128")] { use core::arch::wasm32::*; #[derive(Clone, Copy)] #[repr(transparent)] - pub struct u32x4 { simd: v128 } + pub struct u32x4 { pub(crate) simd: v128 } impl Default for u32x4 { fn default() -> Self { @@ -28,7 +28,7 @@ pick! { } else { #[derive(Default, Clone, Copy, PartialEq, Eq)] #[repr(C, align(16))] - pub struct u32x4 { arr: [u32;4] } + pub struct u32x4 { pub(crate) arr: [u32;4] } } } @@ -372,4 +372,137 @@ impl u32x4 { pub fn as_array_ref(&self) -> &[u32; 4] { cast_ref(self) } + + /// Converts the first two u32 elements within this struct to u64 elements. 
+ /// + /// The remaining elements will be discarded. + #[inline] + #[must_use] + pub fn to_u64x2(self) -> u64x2 { + pick! { + if #[cfg(target_feature="sse4.1")] { + u64x2 { sse: convert_to_u64_m128i_from_lower2_u32_m128i(self.sse) } + } else { + let arr = self.to_array(); + u64x2::new([ + u64::from(arr[0]), + u64::from(arr[1]), + ]) + } + } + } + + /// Converts the first two u32 elements within this struct to i64 elements. + /// + /// The remaining elements will be discarded. + #[inline] + #[must_use] + pub fn to_i64x2(self) -> i64x2 { + pick! { + if #[cfg(target_feature="sse4.1")] { + i64x2 { sse: convert_to_u64_m128i_from_lower2_u32_m128i(self.sse) } + } else { + let arr = self.to_array(); + i64x2::new([ + i64::from(arr[0]), + i64::from(arr[1]), + ]) + } + } + } + + /// Converts the u32 elements within this struct to u64 elements. + #[inline] + #[must_use] + pub fn to_u64x4(self) -> u64x4 { + pick! { + if #[cfg(target_feature="avx2")] { + u64x4 { avx2: convert_to_i64_m256i_from_lower4_u16_m128i(self.sse) } + } else { + let arr = self.to_array(); + u64x4::new([ + u64::from(arr[0]), + u64::from(arr[1]), + u64::from(arr[2]), + u64::from(arr[3]), + ]) + } + } + } + + /// Converts the u32 elements within this struct to i64 elements. + #[inline] + #[must_use] + pub fn to_i64x4(self) -> i64x4 { + pick! { + if #[cfg(target_feature="avx2")] { + i64x4 { avx2: convert_to_i64_m256i_from_lower4_u16_m128i(self.sse) } + } else { + let arr = self.to_array(); + i64x4::new([ + i64::from(arr[0]), + i64::from(arr[1]), + i64::from(arr[2]), + i64::from(arr[3]), + ]) + } + } + } + + /// Converts the first two u32 elements within this struct to f64 elements. + /// + /// The remaining elements will be discarded. + #[inline] + #[must_use] + pub fn to_f64x2(self) -> f64x2 { + pick! { + if #[cfg(target_feature="sse2")] { + f64x2 { sse: convert_to_m128d_from_lower2_i32_m128i(self.sse) } + } else { + let arr = self.to_array(); + f64x2::new([ + f64::from(arr[0]), + f64::from(arr[1]), + ]) + } + } + } + + /// Converts the u32 elements within this struct to f64 elements. + #[inline] + #[must_use] + pub fn to_f64x4(self) -> f64x4 { + pick! { + if #[cfg(target_feature="avx")] { + f64x4 { avx: convert_to_m256d_from_i32_m128i(self.sse) } + } else { + let arr = self.to_array(); + f64x4::new([ + f64::from(arr[0]), + f64::from(arr[1]), + f64::from(arr[2]), + f64::from(arr[3]), + ]) + } + } + } + + /// Converts the u32 elements within this struct to f32 elements. + #[inline] + #[must_use] + pub fn to_f32x4(self) -> f32x4 { + pick! { + if #[cfg(target_feature="sse2")] { + f32x4 { sse: convert_to_m128_from_i32_m128i(self.sse) } + } else { + let arr = self.to_array(); + f32x4::new([ + arr[0] as f32, + arr[1] as f32, + arr[2] as f32, + arr[3] as f32, + ]) + } + } + } } diff --git a/src/u32x8_.rs b/src/u32x8_.rs index 4934d8e0..97cd2cf9 100644 --- a/src/u32x8_.rs +++ b/src/u32x8_.rs @@ -4,17 +4,17 @@ pick! 
{ if #[cfg(target_feature="avx2")] { #[derive(Default, Clone, Copy, PartialEq, Eq)] #[repr(C, align(32))] - pub struct u32x8 { avx2: m256i } + pub struct u32x8 { pub(crate) avx2: m256i } } else if #[cfg(target_feature="sse")] { #[derive(Default, Clone, Copy, PartialEq, Eq)] #[repr(C, align(32))] - pub struct u32x8 { sse0: m128i, sse1: m128i } + pub struct u32x8 { pub(crate) sse0: m128i, pub(crate) sse1: m128i } } else if #[cfg(target_feature="simd128")] { use core::arch::wasm32::*; #[derive(Clone, Copy)] #[repr(C, align(32))] - pub struct u32x8 { simd0: v128, simd1: v128 } + pub struct u32x8 { pub(crate) simd0: v128, pub(crate) simd1: v128 } impl Default for u32x8 { fn default() -> Self { @@ -32,7 +32,7 @@ pick! { } else { #[derive(Default, Clone, Copy, PartialEq, Eq)] #[repr(C, align(32))] - pub struct u32x8 { arr: [u32;8] } + pub struct u32x8 { pub(crate) arr: [u32;8] } } } @@ -422,6 +422,29 @@ impl u32x8 { pub fn as_array_ref(&self) -> &[u32; 8] { cast_ref(self) } + + /// Converts the u32 elements within this struct to f32 elements. + #[inline] + #[must_use] + pub fn to_f32x8(self) -> f32x8 { + pick! { + if #[cfg(target_feature="avx2")] { + f32x8 { avx: convert_to_m256_from_i32_m256i(self.avx2) } + } else { + let arr = self.to_array(); + f32x8::new([ + arr[0] as f32, + arr[1] as f32, + arr[2] as f32, + arr[3] as f32, + arr[4] as f32, + arr[5] as f32, + arr[6] as f32, + arr[7] as f32, + ]) + } + } + } } impl Not for u32x8 { diff --git a/src/u64x2_.rs b/src/u64x2_.rs index ad706feb..1a77bc97 100644 --- a/src/u64x2_.rs +++ b/src/u64x2_.rs @@ -4,13 +4,13 @@ pick! { if #[cfg(target_feature="sse2")] { #[derive(Default, Clone, Copy, PartialEq, Eq)] #[repr(C, align(16))] - pub struct u64x2 { sse: m128i } + pub struct u64x2 { pub(crate) sse: m128i } } else if #[cfg(target_feature="simd128")] { use core::arch::wasm32::*; #[derive(Clone, Copy)] #[repr(transparent)] - pub struct u64x2 { simd: v128 } + pub struct u64x2 { pub(crate) simd: v128 } impl Default for u64x2 { fn default() -> Self { @@ -28,7 +28,7 @@ pick! { } else { #[derive(Default, Clone, Copy, PartialEq, Eq)] #[repr(C, align(16))] - pub struct u64x2 { arr: [u64;2] } + pub struct u64x2 { pub(crate) arr: [u64;2] } } } diff --git a/src/u64x4_.rs b/src/u64x4_.rs index 0171e981..1c4f5fd9 100644 --- a/src/u64x4_.rs +++ b/src/u64x4_.rs @@ -4,17 +4,17 @@ pick! { if #[cfg(target_feature="avx2")] { #[derive(Default, Clone, Copy, PartialEq, Eq)] #[repr(C, align(32))] - pub struct u64x4 { avx2: m256i } + pub struct u64x4 { pub(crate) avx2: m256i } } else if #[cfg(target_feature="sse2")] { #[derive(Default, Clone, Copy, PartialEq, Eq)] #[repr(C, align(32))] - pub struct u64x4 { sse0: m128i, sse1: m128i } + pub struct u64x4 { pub(crate) sse0: m128i, pub(crate) sse1: m128i } } else if #[cfg(target_feature="simd128")] { use core::arch::wasm32::*; #[derive(Clone, Copy)] #[repr(C, align(32))] - pub struct u64x4 { simd0: v128, simd1: v128 } + pub struct u64x4 { pub(crate) simd0: v128, pub(crate) simd1: v128 } impl Default for u64x4 { fn default() -> Self { @@ -32,7 +32,7 @@ pick! { } else { #[derive(Default, Clone, Copy, PartialEq, Eq)] #[repr(C, align(32))] - pub struct u64x4 { arr: [u64;4] } + pub struct u64x4 { pub(crate) arr: [u64;4] } } } diff --git a/src/u8x16_.rs b/src/u8x16_.rs index c21ca005..0af3c4c2 100644 --- a/src/u8x16_.rs +++ b/src/u8x16_.rs @@ -4,13 +4,13 @@ pick! 
{ if #[cfg(target_feature="sse2")] { #[derive(Default, Clone, Copy, PartialEq, Eq)] #[repr(C, align(16))] - pub struct u8x16 { sse: m128i } + pub struct u8x16 { pub(crate) sse: m128i } } else if #[cfg(target_feature="simd128")] { use core::arch::wasm32::*; #[derive(Clone, Copy)] #[repr(transparent)] - pub struct u8x16 { simd: v128 } + pub struct u8x16 { pub(crate) simd: v128 } impl Default for u8x16 { fn default() -> Self { @@ -28,7 +28,7 @@ pick! { } else { #[derive(Default, Clone, Copy, PartialEq, Eq)] #[repr(C, align(16))] - pub struct u8x16 { arr: [u8;16] } + pub struct u8x16 { pub(crate) arr: [u8;16] } } } @@ -362,4 +362,265 @@ impl u8x16 { pub fn as_array_ref(&self) -> &[u8; 16] { cast_ref(self) } + + /// Converts the first eight u8 elements within this struct to u16 elements. + /// + /// The remaining elements will be discarded. + #[inline] + #[must_use] + pub fn to_u16x8(self) -> u16x8 { + pick! { + if #[cfg(target_feature="sse4.1")] { + u16x8 { sse: convert_to_u16_m128i_from_lower8_u8_m128i(self.sse) } + } else { + let arr = self.to_array(); + u16x8::new([ + u16::from(arr[0]), + u16::from(arr[1]), + u16::from(arr[2]), + u16::from(arr[3]), + u16::from(arr[4]), + u16::from(arr[5]), + u16::from(arr[6]), + u16::from(arr[7]), + ]) + } + } + } + + /// Converts the first eight u8 elements within this struct to i16 elements. + /// + /// The remaining elements will be discarded. + #[inline] + #[must_use] + pub fn to_i16x8(self) -> i16x8 { + pick! { + if #[cfg(target_feature="sse4.1")] { + i16x8 { sse: convert_to_u16_m128i_from_lower8_u8_m128i(self.sse) } + } else { + let arr = self.to_array(); + i16x8::new([ + i16::from(arr[0]), + i16::from(arr[1]), + i16::from(arr[2]), + i16::from(arr[3]), + i16::from(arr[4]), + i16::from(arr[5]), + i16::from(arr[6]), + i16::from(arr[7]), + ]) + } + } + } + + /// Converts the u8 elements within this struct to i16 elements. + #[inline] + #[must_use] + pub fn to_i16x16(self) -> i16x16 { + pick! { + if #[cfg(target_feature="avx2")] { + i16x16 { avx2: convert_to_i16_m256i_from_u8_m128i(self.sse) } + } else { + let arr = self.to_array(); + i16x16::new([ + i16::from(arr[0]), + i16::from(arr[1]), + i16::from(arr[2]), + i16::from(arr[3]), + i16::from(arr[4]), + i16::from(arr[5]), + i16::from(arr[6]), + i16::from(arr[7]), + i16::from(arr[8]), + i16::from(arr[9]), + i16::from(arr[10]), + i16::from(arr[11]), + i16::from(arr[12]), + i16::from(arr[13]), + i16::from(arr[14]), + i16::from(arr[15]), + ]) + } + } + } + + /// Converts the first four u8 elements within this struct to u32 elements. + /// + /// The remaining elements will be discarded. + #[inline] + #[must_use] + pub fn to_u32x4(self) -> u32x4 { + pick! { + if #[cfg(target_feature="sse4.1")] { + u32x4 { sse: convert_to_u32_m128i_from_lower4_u8_m128i(self.sse) } + } else { + let arr = self.to_array(); + u32x4::new([ + u32::from(arr[0]), + u32::from(arr[1]), + u32::from(arr[2]), + u32::from(arr[3]), + ]) + } + } + } + + /// Converts the first four u8 elements within this struct to i32 elements. + /// + /// The remaining elements will be discarded. + #[inline] + #[must_use] + pub fn to_i32x4(self) -> i32x4 { + pick! { + if #[cfg(target_feature="sse4.1")] { + i32x4 { sse: convert_to_u32_m128i_from_lower4_u8_m128i(self.sse) } + } else { + let arr = self.to_array(); + i32x4::new([ + i32::from(arr[0]), + i32::from(arr[1]), + i32::from(arr[2]), + i32::from(arr[3]), + ]) + } + } + } + + /// Converts the first eight u8 elements within this struct to u32 elements. + /// + /// The remaining elements will be discarded. 
+ #[inline] + #[must_use] + pub fn to_u32x8(self) -> u32x8 { + pick! { + if #[cfg(target_feature="avx2")] { + // This function is named wrong in `safe_arch`. + // It calls `_mm256_cvtepu8_epi32`. + u32x8 { avx2: convert_to_i16_m256i_from_lower8_u8_m128i(self.sse) } + } else { + let arr = self.to_array(); + u32x8::new([ + u32::from(arr[0]), + u32::from(arr[1]), + u32::from(arr[2]), + u32::from(arr[3]), + u32::from(arr[4]), + u32::from(arr[5]), + u32::from(arr[6]), + u32::from(arr[7]), + ]) + } + } + } + + /// Converts the first eight u8 elements within this struct to i32 elements. + /// + /// The remaining elements will be discarded. + #[inline] + #[must_use] + pub fn to_i32x8(self) -> i32x8 { + pick! { + if #[cfg(target_feature="avx2")] { + // This function is named wrong in `safe_arch`. + // It calls `_mm256_cvtepu8_epi32`. + i32x8 { avx2: convert_to_i16_m256i_from_lower8_u8_m128i(self.sse) } + } else { + let arr = self.to_array(); + i32x8::new([ + i32::from(arr[0]), + i32::from(arr[1]), + i32::from(arr[2]), + i32::from(arr[3]), + i32::from(arr[4]), + i32::from(arr[5]), + i32::from(arr[6]), + i32::from(arr[7]), + ]) + } + } + } + + /// Converts the first two u8 elements within this struct to u64 elements. + /// + /// The remaining elements will be discarded. + #[inline] + #[must_use] + pub fn to_u64x2(self) -> u64x2 { + pick! { + if #[cfg(target_feature="sse4.1")] { + u64x2 { sse: convert_to_u64_m128i_from_lower2_u8_m128i(self.sse) } + } else { + let arr = self.to_array(); + u64x2::new([ + u64::from(arr[0]), + u64::from(arr[1]), + ]) + } + } + } + + /// Converts the first two u8 elements within this struct to i64 elements. + /// + /// The remaining elements will be discarded. + #[inline] + #[must_use] + pub fn to_i64x2(self) -> i64x2 { + pick! { + if #[cfg(target_feature="sse4.1")] { + i64x2 { sse: convert_to_u64_m128i_from_lower2_u8_m128i(self.sse) } + } else { + let arr = self.to_array(); + i64x2::new([ + i64::from(arr[0]), + i64::from(arr[1]), + ]) + } + } + } + + /// Converts the first four u8 elements within this struct to u64 elements. + /// + /// The remaining elements will be discarded. + #[inline] + #[must_use] + pub fn to_u64x4(self) -> u64x4 { + pick! { + if #[cfg(target_feature="avx2")] { + // This function is named wrong in `safe_arch`. + // It calls `_mm256_cvtepu8_epi64`. + u64x4 { avx2: convert_to_i16_m256i_from_lower4_u8_m128i(self.sse) } + } else { + let arr = self.to_array(); + u64x4::new([ + u64::from(arr[0]), + u64::from(arr[1]), + u64::from(arr[2]), + u64::from(arr[3]), + ]) + } + } + } + + /// Converts the first four u8 elements within this struct to i64 elements. + /// + /// The remaining elements will be discarded. + #[inline] + #[must_use] + pub fn to_i64x4(self) -> i64x4 { + pick! { + if #[cfg(target_feature="avx2")] { + // This function is named wrong in `safe_arch`. + // It calls `_mm256_cvtepu8_epi64`. + i64x4 { avx2: convert_to_i16_m256i_from_lower4_u8_m128i(self.sse) } + } else { + let arr = self.to_array(); + i64x4::new([ + i64::from(arr[0]), + i64::from(arr[1]), + i64::from(arr[2]), + i64::from(arr[3]), + ]) + } + } + } }
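Usage sketch (not part of the patch): a minimal example of the conversion API introduced above, assuming the patched crate is `wide` (suggested by the `0.7.6` deprecation notes, but the crate name is an assumption) and that this diff has been applied. The method names and their saturation/truncation semantics come from the doc comments in the diff; the input values and the `main` scaffolding are purely illustrative.

  // Hypothetical usage example; `wide` as the crate name is an assumption.
  use wide::*;

  fn main() {
    let v = f32x4::new([1.5, -2.5, f32::NAN, 4.0e9]);

    // Checked conversions: out of range lanes saturate, NaN lanes become 0.
    let rounded: i32x4 = v.to_i32x4_round();
    let truncated: i32x4 = v.to_i32x4_truncate();

    // The `_fast` variants skip that handling, so the NaN and 4.0e9 lanes here
    // produce implementation defined results.
    let rounded_fast: i32x4 = v.to_i32x4_round_fast();

    // Widening float conversion keeps every lane...
    let doubles: f64x4 = v.to_f64x4();
    // ...while conversions that shrink the lane count keep only the lower lanes.
    let lower_two: i64x2 = i32x4::new([1, 2, 3, 4]).to_i64x2();

    println!("{:?}", rounded.to_array());
    println!("{:?}", truncated.to_array());
    println!("{:?}", rounded_fast.to_array());
    println!("{:?}", doubles.to_array());
    println!("{:?}", lower_two.to_array());
  }

The deprecated `round_int` / `trunc_int` style methods delegate to the new `to_*` names, so existing callers keep compiling (with a deprecation warning) through the 0.7.x series.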