| // x86 specific conversion optimizations -*- C++ -*- |
| |
| // Copyright (C) 2020-2021 Free Software Foundation, Inc. |
| // |
| // This file is part of the GNU ISO C++ Library. This library is free |
| // software; you can redistribute it and/or modify it under the |
| // terms of the GNU General Public License as published by the |
| // Free Software Foundation; either version 3, or (at your option) |
| // any later version. |
| |
| // This library is distributed in the hope that it will be useful, |
| // but WITHOUT ANY WARRANTY; without even the implied warranty of |
| // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| // GNU General Public License for more details. |
| |
| // Under Section 7 of GPL version 3, you are granted additional |
| // permissions described in the GCC Runtime Library Exception, version |
| // 3.1, as published by the Free Software Foundation. |
| |
| // You should have received a copy of the GNU General Public License and |
| // a copy of the GCC Runtime Library Exception along with this program; |
| // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see |
| // <http://www.gnu.org/licenses/>. |
| |
| #ifndef _GLIBCXX_EXPERIMENTAL_SIMD_X86_CONVERSIONS_H |
| #define _GLIBCXX_EXPERIMENTAL_SIMD_X86_CONVERSIONS_H |
| |
| #if __cplusplus >= 201703L |
| |
| // work around PR85827 |
| // 1-arg __convert_x86 {{{1 |
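// Converts __v (a vector builtin of _Np elements of type _Tp) to _To (_M
// elements of type _Up). The constexpr flags below classify the conversion
// by register size (x = xmm, y = ymm, z = zmm) and by element type; the
// if-constexpr chain then selects the best intrinsic for the enabled ISA
// extensions and otherwise falls through to the element-wise
// __vector_convert fallback at the end.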
| template <typename _To, typename _V, typename _Traits> |
| _GLIBCXX_SIMD_INTRINSIC _To |
| __convert_x86(_V __v) |
| { |
| static_assert(__is_vector_type_v<_V>); |
| using _Tp = typename _Traits::value_type; |
| constexpr size_t _Np = _Traits::_S_full_size; |
| [[maybe_unused]] const auto __intrin = __to_intrin(__v); |
| using _Up = typename _VectorTraits<_To>::value_type; |
| constexpr size_t _M = _VectorTraits<_To>::_S_full_size; |
| |
| // [xyz]_to_[xyz] {{{2 |
| [[maybe_unused]] constexpr bool __x_to_x |
| = sizeof(__v) <= 16 && sizeof(_To) <= 16; |
| [[maybe_unused]] constexpr bool __x_to_y |
| = sizeof(__v) <= 16 && sizeof(_To) == 32; |
| [[maybe_unused]] constexpr bool __x_to_z |
| = sizeof(__v) <= 16 && sizeof(_To) == 64; |
| [[maybe_unused]] constexpr bool __y_to_x |
| = sizeof(__v) == 32 && sizeof(_To) <= 16; |
| [[maybe_unused]] constexpr bool __y_to_y |
| = sizeof(__v) == 32 && sizeof(_To) == 32; |
| [[maybe_unused]] constexpr bool __y_to_z |
| = sizeof(__v) == 32 && sizeof(_To) == 64; |
| [[maybe_unused]] constexpr bool __z_to_x |
| = sizeof(__v) == 64 && sizeof(_To) <= 16; |
| [[maybe_unused]] constexpr bool __z_to_y |
| = sizeof(__v) == 64 && sizeof(_To) == 32; |
| [[maybe_unused]] constexpr bool __z_to_z |
| = sizeof(__v) == 64 && sizeof(_To) == 64; |
| |
| // iX_to_iX {{{2 |
| [[maybe_unused]] constexpr bool __i_to_i |
| = is_integral_v<_Up> && is_integral_v<_Tp>; |
| [[maybe_unused]] constexpr bool __i8_to_i16 |
| = __i_to_i && sizeof(_Tp) == 1 && sizeof(_Up) == 2; |
| [[maybe_unused]] constexpr bool __i8_to_i32 |
| = __i_to_i && sizeof(_Tp) == 1 && sizeof(_Up) == 4; |
| [[maybe_unused]] constexpr bool __i8_to_i64 |
| = __i_to_i && sizeof(_Tp) == 1 && sizeof(_Up) == 8; |
| [[maybe_unused]] constexpr bool __i16_to_i8 |
| = __i_to_i && sizeof(_Tp) == 2 && sizeof(_Up) == 1; |
| [[maybe_unused]] constexpr bool __i16_to_i32 |
| = __i_to_i && sizeof(_Tp) == 2 && sizeof(_Up) == 4; |
| [[maybe_unused]] constexpr bool __i16_to_i64 |
| = __i_to_i && sizeof(_Tp) == 2 && sizeof(_Up) == 8; |
| [[maybe_unused]] constexpr bool __i32_to_i8 |
| = __i_to_i && sizeof(_Tp) == 4 && sizeof(_Up) == 1; |
| [[maybe_unused]] constexpr bool __i32_to_i16 |
| = __i_to_i && sizeof(_Tp) == 4 && sizeof(_Up) == 2; |
| [[maybe_unused]] constexpr bool __i32_to_i64 |
| = __i_to_i && sizeof(_Tp) == 4 && sizeof(_Up) == 8; |
| [[maybe_unused]] constexpr bool __i64_to_i8 |
| = __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 1; |
| [[maybe_unused]] constexpr bool __i64_to_i16 |
| = __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 2; |
| [[maybe_unused]] constexpr bool __i64_to_i32 |
| = __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 4; |
| |
| // [fsu]X_to_[fsu]X {{{2 |
| // ibw = integral && byte or word, i.e. char and short with any signedness |
| [[maybe_unused]] constexpr bool __s64_to_f32 |
| = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 8 |
| && is_floating_point_v<_Up> && sizeof(_Up) == 4; |
| [[maybe_unused]] constexpr bool __s32_to_f32 |
| = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 4 |
| && is_floating_point_v<_Up> && sizeof(_Up) == 4; |
| [[maybe_unused]] constexpr bool __s16_to_f32 |
| = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 2 |
| && is_floating_point_v<_Up> && sizeof(_Up) == 4; |
| [[maybe_unused]] constexpr bool __s8_to_f32 |
| = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 1 |
| && is_floating_point_v<_Up> && sizeof(_Up) == 4; |
| [[maybe_unused]] constexpr bool __u64_to_f32 |
| = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 8 |
| && is_floating_point_v<_Up> && sizeof(_Up) == 4; |
| [[maybe_unused]] constexpr bool __u32_to_f32 |
| = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 4 |
| && is_floating_point_v<_Up> && sizeof(_Up) == 4; |
| [[maybe_unused]] constexpr bool __u16_to_f32 |
| = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 2 |
| && is_floating_point_v<_Up> && sizeof(_Up) == 4; |
| [[maybe_unused]] constexpr bool __u8_to_f32 |
| = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 1 |
| && is_floating_point_v<_Up> && sizeof(_Up) == 4; |
| [[maybe_unused]] constexpr bool __s64_to_f64 |
| = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 8 |
| && is_floating_point_v<_Up> && sizeof(_Up) == 8; |
| [[maybe_unused]] constexpr bool __s32_to_f64 |
| = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 4 |
| && is_floating_point_v<_Up> && sizeof(_Up) == 8; |
| [[maybe_unused]] constexpr bool __u64_to_f64 |
| = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 8 |
| && is_floating_point_v<_Up> && sizeof(_Up) == 8; |
| [[maybe_unused]] constexpr bool __u32_to_f64 |
| = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 4 |
| && is_floating_point_v<_Up> && sizeof(_Up) == 8; |
| [[maybe_unused]] constexpr bool __f32_to_s64 |
| = is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 8 |
| && is_floating_point_v<_Tp> && sizeof(_Tp) == 4; |
| [[maybe_unused]] constexpr bool __f32_to_s32 |
| = is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 4 |
| && is_floating_point_v<_Tp> && sizeof(_Tp) == 4; |
| [[maybe_unused]] constexpr bool __f32_to_u64 |
| = is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 8 |
| && is_floating_point_v<_Tp> && sizeof(_Tp) == 4; |
| [[maybe_unused]] constexpr bool __f32_to_u32 |
| = is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 4 |
| && is_floating_point_v<_Tp> && sizeof(_Tp) == 4; |
| [[maybe_unused]] constexpr bool __f64_to_s64 |
| = is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 8 |
| && is_floating_point_v<_Tp> && sizeof(_Tp) == 8; |
| [[maybe_unused]] constexpr bool __f64_to_s32 |
| = is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 4 |
| && is_floating_point_v<_Tp> && sizeof(_Tp) == 8; |
| [[maybe_unused]] constexpr bool __f64_to_u64 |
| = is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 8 |
| && is_floating_point_v<_Tp> && sizeof(_Tp) == 8; |
| [[maybe_unused]] constexpr bool __f64_to_u32 |
| = is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 4 |
| && is_floating_point_v<_Tp> && sizeof(_Tp) == 8; |
| [[maybe_unused]] constexpr bool __ibw_to_f32 |
| = is_integral_v<_Tp> && sizeof(_Tp) <= 2 |
| && is_floating_point_v<_Up> && sizeof(_Up) == 4; |
| [[maybe_unused]] constexpr bool __ibw_to_f64 |
| = is_integral_v<_Tp> && sizeof(_Tp) <= 2 |
| && is_floating_point_v<_Up> && sizeof(_Up) == 8; |
| [[maybe_unused]] constexpr bool __f32_to_ibw |
| = is_integral_v<_Up> && sizeof(_Up) <= 2 |
| && is_floating_point_v<_Tp> && sizeof(_Tp) == 4; |
| [[maybe_unused]] constexpr bool __f64_to_ibw |
| = is_integral_v<_Up> && sizeof(_Up) <= 2 |
| && is_floating_point_v<_Tp> && sizeof(_Tp) == 8; |
| [[maybe_unused]] constexpr bool __f32_to_f64 |
| = is_floating_point_v<_Tp> && sizeof(_Tp) == 4 |
| && is_floating_point_v<_Up> && sizeof(_Up) == 8; |
| [[maybe_unused]] constexpr bool __f64_to_f32 |
| = is_floating_point_v<_Tp> && sizeof(_Tp) == 8 |
| && is_floating_point_v<_Up> && sizeof(_Up) == 4; |
| |
| if constexpr (__i_to_i && __y_to_x && !__have_avx2) //{{{2 |
| return __convert_x86<_To>(__lo128(__v), __hi128(__v)); |
| else if constexpr (__i_to_i && __x_to_y && !__have_avx2) //{{{2 |
| return __concat(__convert_x86<__vector_type_t<_Up, _M / 2>>(__v), |
| __convert_x86<__vector_type_t<_Up, _M / 2>>( |
| __extract_part<1, _Np / _M * 2>(__v))); |
| else if constexpr (__i_to_i) //{{{2 |
| { |
| static_assert(__x_to_x || __have_avx2, |
| "integral conversions with ymm registers require AVX2"); |
| static_assert(__have_avx512bw |
| || ((sizeof(_Tp) >= 4 || sizeof(__v) < 64) |
| && (sizeof(_Up) >= 4 || sizeof(_To) < 64)), |
| "8/16-bit integers in zmm registers require AVX512BW"); |
| static_assert((sizeof(__v) < 64 && sizeof(_To) < 64) || __have_avx512f, |
| "integral conversions with ymm registers require AVX2"); |
| } |
| if constexpr (is_floating_point_v<_Tp> == is_floating_point_v<_Up> && //{{{2 |
| sizeof(_Tp) == sizeof(_Up)) |
| { |
| // conversion uses simple bit reinterpretation (or no conversion at all) |
| if constexpr (_Np >= _M) |
| return __intrin_bitcast<_To>(__v); |
| else |
| return __zero_extend(__vector_bitcast<_Up>(__v)); |
| } |
| else if constexpr (_Np < _M && sizeof(_To) > 16) //{{{2 |
    // zero extend (e.g. xmm -> ymm)
| return __zero_extend( |
| __convert_x86<__vector_type_t< |
| _Up, (16 / sizeof(_Up) > _Np) ? 16 / sizeof(_Up) : _Np>>(__v)); |
| else if constexpr (_Np > _M && sizeof(__v) > 16) //{{{2 |
    // partial input (e.g. ymm -> xmm)
| return __convert_x86<_To>(__extract_part<0, _Np / _M>(__v)); |
| else if constexpr (__i64_to_i32) //{{{2 |
| { |
| if constexpr (__x_to_x && __have_avx512vl) |
| return __intrin_bitcast<_To>(_mm_cvtepi64_epi32(__intrin)); |
| else if constexpr (__x_to_x) |
| return __auto_bitcast( |
| _mm_shuffle_ps(__vector_bitcast<float>(__v), __m128(), 8)); |
| else if constexpr (__y_to_x && __have_avx512vl) |
| return __intrin_bitcast<_To>(_mm256_cvtepi64_epi32(__intrin)); |
| else if constexpr (__y_to_x && __have_avx512f) |
| return __intrin_bitcast<_To>( |
| __lo128(_mm512_cvtepi64_epi32(__auto_bitcast(__v)))); |
| else if constexpr (__y_to_x) |
| return __intrin_bitcast<_To>( |
| __lo128(_mm256_permute4x64_epi64(_mm256_shuffle_epi32(__intrin, 8), |
| 0 + 4 * 2))); |
| else if constexpr (__z_to_y) |
| return __intrin_bitcast<_To>(_mm512_cvtepi64_epi32(__intrin)); |
| } |
| else if constexpr (__i64_to_i16) //{{{2 |
| { |
| if constexpr (__x_to_x && __have_avx512vl) |
| return __intrin_bitcast<_To>(_mm_cvtepi64_epi16(__intrin)); |
| else if constexpr (__x_to_x && __have_avx512f) |
| return __intrin_bitcast<_To>( |
| __lo128(_mm512_cvtepi64_epi16(__auto_bitcast(__v)))); |
| else if constexpr (__x_to_x && __have_ssse3) |
| { |
| return __intrin_bitcast<_To>( |
| _mm_shuffle_epi8(__intrin, |
| _mm_setr_epi8(0, 1, 8, 9, -0x80, -0x80, -0x80, |
| -0x80, -0x80, -0x80, -0x80, -0x80, |
| -0x80, -0x80, -0x80, -0x80))); |
| // fallback without SSSE3 |
| } |
| else if constexpr (__y_to_x && __have_avx512vl) |
| return __intrin_bitcast<_To>(_mm256_cvtepi64_epi16(__intrin)); |
| else if constexpr (__y_to_x && __have_avx512f) |
| return __intrin_bitcast<_To>( |
| __lo128(_mm512_cvtepi64_epi16(__auto_bitcast(__v)))); |
| else if constexpr (__y_to_x) |
| { |
| const auto __a = _mm256_shuffle_epi8( |
| __intrin, |
| _mm256_setr_epi8(0, 1, 8, 9, -0x80, -0x80, -0x80, -0x80, -0x80, |
| -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, |
| -0x80, -0x80, -0x80, -0x80, 0, 1, 8, 9, -0x80, |
| -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, |
| -0x80)); |
| return __intrin_bitcast<_To>(__lo128(__a) | __hi128(__a)); |
| } |
| else if constexpr (__z_to_x) |
| return __intrin_bitcast<_To>(_mm512_cvtepi64_epi16(__intrin)); |
| } |
| else if constexpr (__i64_to_i8) //{{{2 |
| { |
| if constexpr (__x_to_x && __have_avx512vl) |
| return __intrin_bitcast<_To>(_mm_cvtepi64_epi8(__intrin)); |
| else if constexpr (__x_to_x && __have_avx512f) |
| return __intrin_bitcast<_To>( |
| __lo128(_mm512_cvtepi64_epi8(__zero_extend(__intrin)))); |
| else if constexpr (__y_to_x && __have_avx512vl) |
| return __intrin_bitcast<_To>(_mm256_cvtepi64_epi8(__intrin)); |
| else if constexpr (__y_to_x && __have_avx512f) |
| return __intrin_bitcast<_To>( |
| _mm512_cvtepi64_epi8(__zero_extend(__intrin))); |
| else if constexpr (__z_to_x) |
| return __intrin_bitcast<_To>(_mm512_cvtepi64_epi8(__intrin)); |
| } |
| else if constexpr (__i32_to_i64) //{{{2 |
| { |
| if constexpr (__have_sse4_1 && __x_to_x) |
| return __intrin_bitcast<_To>(is_signed_v<_Tp> |
| ? _mm_cvtepi32_epi64(__intrin) |
| : _mm_cvtepu32_epi64(__intrin)); |
| else if constexpr (__x_to_x) |
| { |
| return __intrin_bitcast<_To>( |
| _mm_unpacklo_epi32(__intrin, is_signed_v<_Tp> |
| ? _mm_srai_epi32(__intrin, 31) |
| : __m128i())); |
| } |
| else if constexpr (__x_to_y) |
| return __intrin_bitcast<_To>(is_signed_v<_Tp> |
| ? _mm256_cvtepi32_epi64(__intrin) |
| : _mm256_cvtepu32_epi64(__intrin)); |
| else if constexpr (__y_to_z) |
| return __intrin_bitcast<_To>(is_signed_v<_Tp> |
| ? _mm512_cvtepi32_epi64(__intrin) |
| : _mm512_cvtepu32_epi64(__intrin)); |
| } |
| else if constexpr (__i32_to_i16) //{{{2 |
| { |
| if constexpr (__x_to_x && __have_avx512vl) |
| return __intrin_bitcast<_To>(_mm_cvtepi32_epi16(__intrin)); |
| else if constexpr (__x_to_x && __have_avx512f) |
| return __intrin_bitcast<_To>( |
| __lo128(_mm512_cvtepi32_epi16(__auto_bitcast(__v)))); |
| else if constexpr (__x_to_x && __have_ssse3) |
| return __intrin_bitcast<_To>(_mm_shuffle_epi8( |
| __intrin, _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -0x80, -0x80, |
| -0x80, -0x80, -0x80, -0x80, -0x80, -0x80))); |
| else if constexpr (__x_to_x) |
| { |
| auto __a = _mm_unpacklo_epi16(__intrin, __m128i()); // 0o.o 1o.o |
| auto __b = _mm_unpackhi_epi16(__intrin, __m128i()); // 2o.o 3o.o |
| auto __c = _mm_unpacklo_epi16(__a, __b); // 02oo ..oo |
| auto __d = _mm_unpackhi_epi16(__a, __b); // 13oo ..oo |
| return __intrin_bitcast<_To>( |
| _mm_unpacklo_epi16(__c, __d)); // 0123 oooo |
| } |
| else if constexpr (__y_to_x && __have_avx512vl) |
| return __intrin_bitcast<_To>(_mm256_cvtepi32_epi16(__intrin)); |
| else if constexpr (__y_to_x && __have_avx512f) |
| return __intrin_bitcast<_To>( |
| __lo128(_mm512_cvtepi32_epi16(__auto_bitcast(__v)))); |
| else if constexpr (__y_to_x) |
| { |
| auto __a = _mm256_shuffle_epi8( |
| __intrin, |
| _mm256_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -0x80, -0x80, -0x80, |
| -0x80, -0x80, -0x80, -0x80, -0x80, 0, 1, 4, 5, 8, |
| 9, 12, 13, -0x80, -0x80, -0x80, -0x80, -0x80, |
| -0x80, -0x80, -0x80)); |
| return __intrin_bitcast<_To>(__lo128( |
| _mm256_permute4x64_epi64(__a, |
| 0xf8))); // __a[0] __a[2] | __a[3] __a[3] |
| } |
| else if constexpr (__z_to_y) |
| return __intrin_bitcast<_To>(_mm512_cvtepi32_epi16(__intrin)); |
| } |
| else if constexpr (__i32_to_i8) //{{{2 |
| { |
| if constexpr (__x_to_x && __have_avx512vl) |
| return __intrin_bitcast<_To>(_mm_cvtepi32_epi8(__intrin)); |
| else if constexpr (__x_to_x && __have_avx512f) |
| return __intrin_bitcast<_To>( |
| __lo128(_mm512_cvtepi32_epi8(__zero_extend(__intrin)))); |
| else if constexpr (__x_to_x && __have_ssse3) |
| { |
| return __intrin_bitcast<_To>( |
| _mm_shuffle_epi8(__intrin, |
| _mm_setr_epi8(0, 4, 8, 12, -0x80, -0x80, -0x80, |
| -0x80, -0x80, -0x80, -0x80, -0x80, |
| -0x80, -0x80, -0x80, -0x80))); |
| } |
| else if constexpr (__x_to_x) |
| { |
| const auto __a |
| = _mm_unpacklo_epi8(__intrin, __intrin); // 0... .... 1... .... |
| const auto __b |
| = _mm_unpackhi_epi8(__intrin, __intrin); // 2... .... 3... .... |
| const auto __c = _mm_unpacklo_epi8(__a, __b); // 02.. .... .... .... |
| const auto __d = _mm_unpackhi_epi8(__a, __b); // 13.. .... .... .... |
| const auto __e = _mm_unpacklo_epi8(__c, __d); // 0123 .... .... .... |
| return __intrin_bitcast<_To>(__e & _mm_cvtsi32_si128(-1)); |
| } |
| else if constexpr (__y_to_x && __have_avx512vl) |
| return __intrin_bitcast<_To>(_mm256_cvtepi32_epi8(__intrin)); |
| else if constexpr (__y_to_x && __have_avx512f) |
| return __intrin_bitcast<_To>( |
| _mm512_cvtepi32_epi8(__zero_extend(__intrin))); |
| else if constexpr (__z_to_x) |
| return __intrin_bitcast<_To>(_mm512_cvtepi32_epi8(__intrin)); |
| } |
| else if constexpr (__i16_to_i64) //{{{2 |
| { |
| if constexpr (__x_to_x && __have_sse4_1) |
| return __intrin_bitcast<_To>(is_signed_v<_Tp> |
| ? _mm_cvtepi16_epi64(__intrin) |
| : _mm_cvtepu16_epi64(__intrin)); |
| else if constexpr (__x_to_x && is_signed_v<_Tp>) |
| { |
| auto __x = _mm_srai_epi16(__intrin, 15); |
| auto __y = _mm_unpacklo_epi16(__intrin, __x); |
| __x = _mm_unpacklo_epi16(__x, __x); |
| return __intrin_bitcast<_To>(_mm_unpacklo_epi32(__y, __x)); |
| } |
| else if constexpr (__x_to_x) |
| return __intrin_bitcast<_To>( |
| _mm_unpacklo_epi32(_mm_unpacklo_epi16(__intrin, __m128i()), |
| __m128i())); |
| else if constexpr (__x_to_y) |
| return __intrin_bitcast<_To>(is_signed_v<_Tp> |
| ? _mm256_cvtepi16_epi64(__intrin) |
| : _mm256_cvtepu16_epi64(__intrin)); |
| else if constexpr (__x_to_z) |
| return __intrin_bitcast<_To>(is_signed_v<_Tp> |
| ? _mm512_cvtepi16_epi64(__intrin) |
| : _mm512_cvtepu16_epi64(__intrin)); |
| } |
| else if constexpr (__i16_to_i32) //{{{2 |
| { |
| if constexpr (__x_to_x && __have_sse4_1) |
| return __intrin_bitcast<_To>(is_signed_v<_Tp> |
| ? _mm_cvtepi16_epi32(__intrin) |
| : _mm_cvtepu16_epi32(__intrin)); |
| else if constexpr (__x_to_x && is_signed_v<_Tp>) |
| return __intrin_bitcast<_To>( |
| _mm_srai_epi32(_mm_unpacklo_epi16(__intrin, __intrin), 16)); |
| else if constexpr (__x_to_x && is_unsigned_v<_Tp>) |
| return __intrin_bitcast<_To>(_mm_unpacklo_epi16(__intrin, __m128i())); |
| else if constexpr (__x_to_y) |
| return __intrin_bitcast<_To>(is_signed_v<_Tp> |
| ? _mm256_cvtepi16_epi32(__intrin) |
| : _mm256_cvtepu16_epi32(__intrin)); |
| else if constexpr (__y_to_z) |
| return __intrin_bitcast<_To>(is_signed_v<_Tp> |
| ? _mm512_cvtepi16_epi32(__intrin) |
| : _mm512_cvtepu16_epi32(__intrin)); |
| } |
| else if constexpr (__i16_to_i8) //{{{2 |
| { |
| if constexpr (__x_to_x && __have_avx512bw_vl) |
| return __intrin_bitcast<_To>(_mm_cvtepi16_epi8(__intrin)); |
| else if constexpr (__x_to_x && __have_avx512bw) |
| return __intrin_bitcast<_To>( |
| __lo128(_mm512_cvtepi16_epi8(__zero_extend(__intrin)))); |
| else if constexpr (__x_to_x && __have_ssse3) |
| return __intrin_bitcast<_To>(_mm_shuffle_epi8( |
| __intrin, _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, -0x80, -0x80, |
| -0x80, -0x80, -0x80, -0x80, -0x80, -0x80))); |
| else if constexpr (__x_to_x) |
| { |
| auto __a |
| = _mm_unpacklo_epi8(__intrin, __intrin); // 00.. 11.. 22.. 33.. |
| auto __b |
| = _mm_unpackhi_epi8(__intrin, __intrin); // 44.. 55.. 66.. 77.. |
| auto __c = _mm_unpacklo_epi8(__a, __b); // 0404 .... 1515 .... |
| auto __d = _mm_unpackhi_epi8(__a, __b); // 2626 .... 3737 .... |
| auto __e = _mm_unpacklo_epi8(__c, __d); // 0246 0246 .... .... |
| auto __f = _mm_unpackhi_epi8(__c, __d); // 1357 1357 .... .... |
| return __intrin_bitcast<_To>(_mm_unpacklo_epi8(__e, __f)); |
| } |
| else if constexpr (__y_to_x && __have_avx512bw_vl) |
| return __intrin_bitcast<_To>(_mm256_cvtepi16_epi8(__intrin)); |
| else if constexpr (__y_to_x && __have_avx512bw) |
| return __intrin_bitcast<_To>( |
| __lo256(_mm512_cvtepi16_epi8(__zero_extend(__intrin)))); |
| else if constexpr (__y_to_x) |
| { |
| auto __a = _mm256_shuffle_epi8( |
| __intrin, |
| _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, -0x80, -0x80, -0x80, |
| -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, |
| -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, 0, 2, |
| 4, 6, 8, 10, 12, 14)); |
| return __intrin_bitcast<_To>(__lo128(__a) | __hi128(__a)); |
| } |
| else if constexpr (__z_to_y && __have_avx512bw) |
| return __intrin_bitcast<_To>(_mm512_cvtepi16_epi8(__intrin)); |
| else if constexpr (__z_to_y) |
| __assert_unreachable<_Tp>(); |
| } |
| else if constexpr (__i8_to_i64) //{{{2 |
| { |
| if constexpr (__x_to_x && __have_sse4_1) |
| return __intrin_bitcast<_To>(is_signed_v<_Tp> |
| ? _mm_cvtepi8_epi64(__intrin) |
| : _mm_cvtepu8_epi64(__intrin)); |
| else if constexpr (__x_to_x && is_signed_v<_Tp>) |
| { |
| if constexpr (__have_ssse3) |
| { |
| auto __dup = _mm_unpacklo_epi8(__intrin, __intrin); |
| auto __epi16 = _mm_srai_epi16(__dup, 8); |
	      return __intrin_bitcast<_To>(
		_mm_shuffle_epi8(__epi16,
				 _mm_setr_epi8(0, 1, 1, 1, 1, 1, 1, 1, 2, 3,
					       3, 3, 3, 3, 3, 3)));
| } |
| else |
| { |
| auto __x = _mm_unpacklo_epi8(__intrin, __intrin); |
| __x = _mm_unpacklo_epi16(__x, __x); |
| return __intrin_bitcast<_To>( |
| _mm_unpacklo_epi32(_mm_srai_epi32(__x, 24), |
| _mm_srai_epi32(__x, 31))); |
| } |
| } |
| else if constexpr (__x_to_x) |
| { |
| return __intrin_bitcast<_To>(_mm_unpacklo_epi32( |
| _mm_unpacklo_epi16(_mm_unpacklo_epi8(__intrin, __m128i()), |
| __m128i()), |
| __m128i())); |
| } |
| else if constexpr (__x_to_y) |
| return __intrin_bitcast<_To>(is_signed_v<_Tp> |
| ? _mm256_cvtepi8_epi64(__intrin) |
| : _mm256_cvtepu8_epi64(__intrin)); |
| else if constexpr (__x_to_z) |
| return __intrin_bitcast<_To>(is_signed_v<_Tp> |
| ? _mm512_cvtepi8_epi64(__intrin) |
| : _mm512_cvtepu8_epi64(__intrin)); |
| } |
| else if constexpr (__i8_to_i32) //{{{2 |
| { |
| if constexpr (__x_to_x && __have_sse4_1) |
| return __intrin_bitcast<_To>(is_signed_v<_Tp> |
| ? _mm_cvtepi8_epi32(__intrin) |
| : _mm_cvtepu8_epi32(__intrin)); |
| else if constexpr (__x_to_x && is_signed_v<_Tp>) |
| { |
| const auto __x = _mm_unpacklo_epi8(__intrin, __intrin); |
| return __intrin_bitcast<_To>( |
| _mm_srai_epi32(_mm_unpacklo_epi16(__x, __x), 24)); |
| } |
| else if constexpr (__x_to_x && is_unsigned_v<_Tp>) |
| return __intrin_bitcast<_To>( |
| _mm_unpacklo_epi16(_mm_unpacklo_epi8(__intrin, __m128i()), |
| __m128i())); |
| else if constexpr (__x_to_y) |
| return __intrin_bitcast<_To>(is_signed_v<_Tp> |
| ? _mm256_cvtepi8_epi32(__intrin) |
| : _mm256_cvtepu8_epi32(__intrin)); |
| else if constexpr (__x_to_z) |
| return __intrin_bitcast<_To>(is_signed_v<_Tp> |
| ? _mm512_cvtepi8_epi32(__intrin) |
| : _mm512_cvtepu8_epi32(__intrin)); |
| } |
| else if constexpr (__i8_to_i16) //{{{2 |
| { |
| if constexpr (__x_to_x && __have_sse4_1) |
| return __intrin_bitcast<_To>(is_signed_v<_Tp> |
| ? _mm_cvtepi8_epi16(__intrin) |
| : _mm_cvtepu8_epi16(__intrin)); |
| else if constexpr (__x_to_x && is_signed_v<_Tp>) |
| return __intrin_bitcast<_To>( |
| _mm_srai_epi16(_mm_unpacklo_epi8(__intrin, __intrin), 8)); |
| else if constexpr (__x_to_x && is_unsigned_v<_Tp>) |
| return __intrin_bitcast<_To>(_mm_unpacklo_epi8(__intrin, __m128i())); |
| else if constexpr (__x_to_y) |
| return __intrin_bitcast<_To>(is_signed_v<_Tp> |
| ? _mm256_cvtepi8_epi16(__intrin) |
| : _mm256_cvtepu8_epi16(__intrin)); |
| else if constexpr (__y_to_z && __have_avx512bw) |
| return __intrin_bitcast<_To>(is_signed_v<_Tp> |
| ? _mm512_cvtepi8_epi16(__intrin) |
| : _mm512_cvtepu8_epi16(__intrin)); |
| else if constexpr (__y_to_z) |
| __assert_unreachable<_Tp>(); |
| } |
| else if constexpr (__f32_to_s64) //{{{2 |
| { |
| if constexpr (__have_avx512dq_vl && __x_to_x) |
| return __intrin_bitcast<_To>(_mm_cvttps_epi64(__intrin)); |
| else if constexpr (__have_avx512dq_vl && __x_to_y) |
| return __intrin_bitcast<_To>(_mm256_cvttps_epi64(__intrin)); |
| else if constexpr (__have_avx512dq && __y_to_z) |
| return __intrin_bitcast<_To>(_mm512_cvttps_epi64(__intrin)); |
| // else use scalar fallback |
| } |
| else if constexpr (__f32_to_u64) //{{{2 |
| { |
| if constexpr (__have_avx512dq_vl && __x_to_x) |
| return __intrin_bitcast<_To>(_mm_cvttps_epu64(__intrin)); |
| else if constexpr (__have_avx512dq_vl && __x_to_y) |
| return __intrin_bitcast<_To>(_mm256_cvttps_epu64(__intrin)); |
| else if constexpr (__have_avx512dq && __y_to_z) |
| return __intrin_bitcast<_To>(_mm512_cvttps_epu64(__intrin)); |
| // else use scalar fallback |
| } |
| else if constexpr (__f32_to_s32) //{{{2 |
| { |
| if constexpr (__x_to_x || __y_to_y || __z_to_z) |
| { |
| // go to fallback, it does the right thing |
| } |
| else |
| __assert_unreachable<_Tp>(); |
| } |
| else if constexpr (__f32_to_u32) //{{{2 |
| { |
| if constexpr (__have_avx512vl && __x_to_x) |
| return __auto_bitcast(_mm_cvttps_epu32(__intrin)); |
| else if constexpr (__have_avx512f && __x_to_x) |
| return __auto_bitcast( |
| __lo128(_mm512_cvttps_epu32(__auto_bitcast(__v)))); |
| else if constexpr (__have_avx512vl && __y_to_y) |
| return __vector_bitcast<_Up>(_mm256_cvttps_epu32(__intrin)); |
| else if constexpr (__have_avx512f && __y_to_y) |
| return __vector_bitcast<_Up>( |
| __lo256(_mm512_cvttps_epu32(__auto_bitcast(__v)))); |
| else if constexpr (__x_to_x || __y_to_y || __z_to_z) |
| { |
| // go to fallback, it does the right thing. We can't use the |
| // _mm_floor_ps - 0x8000'0000 trick for f32->u32 because it would |
| // discard small input values (only 24 mantissa bits) |
| } |
| else |
| __assert_unreachable<_Tp>(); |
| } |
| else if constexpr (__f32_to_ibw) //{{{2 |
| return __convert_x86<_To>(__convert_x86<__vector_type_t<int, _Np>>(__v)); |
| else if constexpr (__f64_to_s64) //{{{2 |
| { |
| if constexpr (__have_avx512dq_vl && __x_to_x) |
| return __intrin_bitcast<_To>(_mm_cvttpd_epi64(__intrin)); |
| else if constexpr (__have_avx512dq_vl && __y_to_y) |
| return __intrin_bitcast<_To>(_mm256_cvttpd_epi64(__intrin)); |
| else if constexpr (__have_avx512dq && __z_to_z) |
| return __intrin_bitcast<_To>(_mm512_cvttpd_epi64(__intrin)); |
| // else use scalar fallback |
| } |
| else if constexpr (__f64_to_u64) //{{{2 |
| { |
| if constexpr (__have_avx512dq_vl && __x_to_x) |
| return __intrin_bitcast<_To>(_mm_cvttpd_epu64(__intrin)); |
| else if constexpr (__have_avx512dq_vl && __y_to_y) |
| return __intrin_bitcast<_To>(_mm256_cvttpd_epu64(__intrin)); |
| else if constexpr (__have_avx512dq && __z_to_z) |
| return __intrin_bitcast<_To>(_mm512_cvttpd_epu64(__intrin)); |
| // else use scalar fallback |
| } |
| else if constexpr (__f64_to_s32) //{{{2 |
| { |
| if constexpr (__x_to_x) |
| return __intrin_bitcast<_To>(_mm_cvttpd_epi32(__intrin)); |
| else if constexpr (__y_to_x) |
| return __intrin_bitcast<_To>(_mm256_cvttpd_epi32(__intrin)); |
| else if constexpr (__z_to_y) |
| return __intrin_bitcast<_To>(_mm512_cvttpd_epi32(__intrin)); |
| } |
| else if constexpr (__f64_to_u32) //{{{2 |
| { |
| if constexpr (__have_avx512vl && __x_to_x) |
| return __intrin_bitcast<_To>(_mm_cvttpd_epu32(__intrin)); |
| else if constexpr (__have_sse4_1 && __x_to_x) |
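	// Bias trick: floor to an integral value, subtract 2^31 so the
	// result fits signed i32, convert with the signed instruction, then
	// XOR the sign bit to add 2^31 back (mod 2^32).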
| return __vector_bitcast<_Up, _M>( |
| _mm_cvttpd_epi32(_mm_floor_pd(__intrin) - 0x8000'0000u)) |
| ^ 0x8000'0000u; |
| else if constexpr (__x_to_x) |
| { |
	  // use scalar fallback: it's only 2 values to convert; we can't
	  // do much better than scalar decomposition
| } |
| else if constexpr (__have_avx512vl && __y_to_x) |
| return __intrin_bitcast<_To>(_mm256_cvttpd_epu32(__intrin)); |
| else if constexpr (__y_to_x) |
| { |
| return __intrin_bitcast<_To>( |
| __vector_bitcast<_Up>( |
| _mm256_cvttpd_epi32(_mm256_floor_pd(__intrin) - 0x8000'0000u)) |
| ^ 0x8000'0000u); |
| } |
| else if constexpr (__z_to_y) |
| return __intrin_bitcast<_To>(_mm512_cvttpd_epu32(__intrin)); |
| } |
| else if constexpr (__f64_to_ibw) //{{{2 |
| { |
| return __convert_x86<_To>( |
| __convert_x86<__vector_type_t<int, (_Np < 4 ? 4 : _Np)>>(__v)); |
| } |
| else if constexpr (__s64_to_f32) //{{{2 |
| { |
| if constexpr (__x_to_x && __have_avx512dq_vl) |
| return __intrin_bitcast<_To>(_mm_cvtepi64_ps(__intrin)); |
| else if constexpr (__y_to_x && __have_avx512dq_vl) |
| return __intrin_bitcast<_To>(_mm256_cvtepi64_ps(__intrin)); |
| else if constexpr (__z_to_y && __have_avx512dq) |
| return __intrin_bitcast<_To>(_mm512_cvtepi64_ps(__intrin)); |
| else if constexpr (__z_to_y) |
| return __intrin_bitcast<_To>( |
| _mm512_cvtpd_ps(__convert_x86<__vector_type_t<double, 8>>(__v))); |
| } |
| else if constexpr (__u64_to_f32) //{{{2 |
| { |
| if constexpr (__x_to_x && __have_avx512dq_vl) |
| return __intrin_bitcast<_To>(_mm_cvtepu64_ps(__intrin)); |
| else if constexpr (__y_to_x && __have_avx512dq_vl) |
| return __intrin_bitcast<_To>(_mm256_cvtepu64_ps(__intrin)); |
| else if constexpr (__z_to_y && __have_avx512dq) |
| return __intrin_bitcast<_To>(_mm512_cvtepu64_ps(__intrin)); |
| else if constexpr (__z_to_y) |
| { |
| return __intrin_bitcast<_To>( |
| __lo256(_mm512_cvtepu32_ps(__auto_bitcast( |
| _mm512_cvtepi64_epi32(_mm512_srai_epi64(__intrin, 32))))) |
| * 0x100000000LL |
| + __lo256(_mm512_cvtepu32_ps( |
| __auto_bitcast(_mm512_cvtepi64_epi32(__intrin))))); |
| } |
| } |
| else if constexpr (__s32_to_f32) //{{{2 |
| { |
| // use fallback (builtin conversion) |
| } |
| else if constexpr (__u32_to_f32) //{{{2 |
| { |
| if constexpr (__x_to_x && __have_avx512vl) |
| { |
| // use fallback |
| } |
| else if constexpr (__x_to_x && __have_avx512f) |
| return __intrin_bitcast<_To>( |
| __lo128(_mm512_cvtepu32_ps(__auto_bitcast(__v)))); |
| else if constexpr (__x_to_x && (__have_fma || __have_fma4)) |
| // work around PR85819 |
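	// Split the u32 input into its high and low 16-bit halves. Both
	// halves fit a float exactly, so the cvtepi32_ps conversions are
	// exact; with FMA/FMA4 the multiply-add can contract to a single
	// rounding.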
| return __auto_bitcast(0x10000 |
| * _mm_cvtepi32_ps(__to_intrin(__v >> 16)) |
| + _mm_cvtepi32_ps(__to_intrin(__v & 0xffff))); |
| else if constexpr (__y_to_y && __have_avx512vl) |
| { |
| // use fallback |
| } |
| else if constexpr (__y_to_y && __have_avx512f) |
| return __intrin_bitcast<_To>( |
| __lo256(_mm512_cvtepu32_ps(__auto_bitcast(__v)))); |
| else if constexpr (__y_to_y) |
| // work around PR85819 |
| return 0x10000 * _mm256_cvtepi32_ps(__to_intrin(__v >> 16)) |
| + _mm256_cvtepi32_ps(__to_intrin(__v & 0xffff)); |
| // else use fallback (builtin conversion) |
| } |
| else if constexpr (__ibw_to_f32) //{{{2 |
| { |
| if constexpr (_M <= 4 || __have_avx2) |
| return __convert_x86<_To>( |
| __convert_x86<__vector_type_t<int, _M>>(__v)); |
| else |
| { |
| static_assert(__x_to_y); |
| __m128i __a, __b; |
| if constexpr (__have_sse4_1) |
| { |
| __a = sizeof(_Tp) == 2 |
| ? (is_signed_v<_Tp> ? _mm_cvtepi16_epi32(__intrin) |
| : _mm_cvtepu16_epi32(__intrin)) |
| : (is_signed_v<_Tp> ? _mm_cvtepi8_epi32(__intrin) |
| : _mm_cvtepu8_epi32(__intrin)); |
| const auto __w |
| = _mm_shuffle_epi32(__intrin, sizeof(_Tp) == 2 ? 0xee : 0xe9); |
| __b = sizeof(_Tp) == 2 |
| ? (is_signed_v<_Tp> ? _mm_cvtepi16_epi32(__w) |
| : _mm_cvtepu16_epi32(__w)) |
| : (is_signed_v<_Tp> ? _mm_cvtepi8_epi32(__w) |
| : _mm_cvtepu8_epi32(__w)); |
| } |
| else |
| { |
| __m128i __tmp; |
| if constexpr (sizeof(_Tp) == 1) |
| { |
| __tmp = is_signed_v<_Tp> |
| ? _mm_srai_epi16(_mm_unpacklo_epi8(__intrin, |
| __intrin), |
| 8) |
| : _mm_unpacklo_epi8(__intrin, __m128i()); |
| } |
| else |
| { |
| static_assert(sizeof(_Tp) == 2); |
| __tmp = __intrin; |
| } |
| __a = is_signed_v<_Tp> |
| ? _mm_srai_epi32(_mm_unpacklo_epi16(__tmp, __tmp), 16) |
| : _mm_unpacklo_epi16(__tmp, __m128i()); |
| __b = is_signed_v<_Tp> |
| ? _mm_srai_epi32(_mm_unpackhi_epi16(__tmp, __tmp), 16) |
| : _mm_unpackhi_epi16(__tmp, __m128i()); |
| } |
| return __convert_x86<_To>(__vector_bitcast<int>(__a), |
| __vector_bitcast<int>(__b)); |
| } |
| } |
| else if constexpr (__s64_to_f64) //{{{2 |
| { |
| if constexpr (__x_to_x && __have_avx512dq_vl) |
| return __intrin_bitcast<_To>(_mm_cvtepi64_pd(__intrin)); |
| else if constexpr (__y_to_y && __have_avx512dq_vl) |
| return __intrin_bitcast<_To>(_mm256_cvtepi64_pd(__intrin)); |
| else if constexpr (__z_to_z && __have_avx512dq) |
| return __intrin_bitcast<_To>(_mm512_cvtepi64_pd(__intrin)); |
| else if constexpr (__z_to_z) |
| { |
| return __intrin_bitcast<_To>( |
| _mm512_cvtepi32_pd(_mm512_cvtepi64_epi32(__to_intrin(__v >> 32))) |
| * 0x100000000LL |
| + _mm512_cvtepu32_pd(_mm512_cvtepi64_epi32(__intrin))); |
| } |
| } |
| else if constexpr (__u64_to_f64) //{{{2 |
| { |
| if constexpr (__x_to_x && __have_avx512dq_vl) |
| return __intrin_bitcast<_To>(_mm_cvtepu64_pd(__intrin)); |
| else if constexpr (__y_to_y && __have_avx512dq_vl) |
| return __intrin_bitcast<_To>(_mm256_cvtepu64_pd(__intrin)); |
| else if constexpr (__z_to_z && __have_avx512dq) |
| return __intrin_bitcast<_To>(_mm512_cvtepu64_pd(__intrin)); |
| else if constexpr (__z_to_z) |
| { |
| return __intrin_bitcast<_To>( |
| _mm512_cvtepu32_pd(_mm512_cvtepi64_epi32(__to_intrin(__v >> 32))) |
| * 0x100000000LL |
| + _mm512_cvtepu32_pd(_mm512_cvtepi64_epi32(__intrin))); |
| } |
| } |
| else if constexpr (__s32_to_f64) //{{{2 |
| { |
| if constexpr (__x_to_x) |
| return __intrin_bitcast<_To>(_mm_cvtepi32_pd(__intrin)); |
| else if constexpr (__x_to_y) |
| return __intrin_bitcast<_To>(_mm256_cvtepi32_pd(__intrin)); |
| else if constexpr (__y_to_z) |
| return __intrin_bitcast<_To>(_mm512_cvtepi32_pd(__intrin)); |
| } |
| else if constexpr (__u32_to_f64) //{{{2 |
| { |
| if constexpr (__x_to_x && __have_avx512vl) |
| return __intrin_bitcast<_To>(_mm_cvtepu32_pd(__intrin)); |
| else if constexpr (__x_to_x && __have_avx512f) |
| return __intrin_bitcast<_To>( |
| __lo128(_mm512_cvtepu32_pd(__auto_bitcast(__v)))); |
| else if constexpr (__x_to_x) |
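	// XOR of the sign bit maps u32 into i32 range (subtracts 2^31 mod
	// 2^32); adding 2^31 back after the exact i32 -> f64 conversion
	// restores the value.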
| return __intrin_bitcast<_To>( |
| _mm_cvtepi32_pd(__to_intrin(__v ^ 0x8000'0000u)) + 0x8000'0000u); |
| else if constexpr (__x_to_y && __have_avx512vl) |
| return __intrin_bitcast<_To>(_mm256_cvtepu32_pd(__intrin)); |
| else if constexpr (__x_to_y && __have_avx512f) |
| return __intrin_bitcast<_To>( |
| __lo256(_mm512_cvtepu32_pd(__auto_bitcast(__v)))); |
| else if constexpr (__x_to_y) |
| return __intrin_bitcast<_To>( |
| _mm256_cvtepi32_pd(__to_intrin(__v ^ 0x8000'0000u)) + 0x8000'0000u); |
| else if constexpr (__y_to_z) |
| return __intrin_bitcast<_To>(_mm512_cvtepu32_pd(__intrin)); |
| } |
| else if constexpr (__ibw_to_f64) //{{{2 |
| { |
| return __convert_x86<_To>( |
| __convert_x86<__vector_type_t<int, std::max(size_t(4), _M)>>(__v)); |
| } |
| else if constexpr (__f32_to_f64) //{{{2 |
| { |
| if constexpr (__x_to_x) |
| return __intrin_bitcast<_To>(_mm_cvtps_pd(__intrin)); |
| else if constexpr (__x_to_y) |
| return __intrin_bitcast<_To>(_mm256_cvtps_pd(__intrin)); |
| else if constexpr (__y_to_z) |
| return __intrin_bitcast<_To>(_mm512_cvtps_pd(__intrin)); |
| } |
| else if constexpr (__f64_to_f32) //{{{2 |
| { |
| if constexpr (__x_to_x) |
| return __intrin_bitcast<_To>(_mm_cvtpd_ps(__intrin)); |
| else if constexpr (__y_to_x) |
| return __intrin_bitcast<_To>(_mm256_cvtpd_ps(__intrin)); |
| else if constexpr (__z_to_y) |
| return __intrin_bitcast<_To>(_mm512_cvtpd_ps(__intrin)); |
| } |
| else //{{{2 |
| __assert_unreachable<_Tp>(); |
| |
| // fallback:{{{2 |
| return __vector_convert<_To>(__v, make_index_sequence<std::min(_M, _Np)>()); |
| //}}} |
| } |
| |
| // }}} |
| // 2-arg __convert_x86 {{{1 |
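// Converts the concatenation of __v0 and __v1 to _To. Whenever the ISA can
// handle a register as wide as both inputs together, the implementation
// concatenates and defers to the one-argument overload above.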
| template <typename _To, typename _V, typename _Traits> |
| _GLIBCXX_SIMD_INTRINSIC _To |
| __convert_x86(_V __v0, _V __v1) |
| { |
| static_assert(__is_vector_type_v<_V>); |
| using _Tp = typename _Traits::value_type; |
| constexpr size_t _Np = _Traits::_S_full_size; |
| [[maybe_unused]] const auto __i0 = __to_intrin(__v0); |
| [[maybe_unused]] const auto __i1 = __to_intrin(__v1); |
| using _Up = typename _VectorTraits<_To>::value_type; |
| constexpr size_t _M = _VectorTraits<_To>::_S_full_size; |
| |
| static_assert(2 * _Np <= _M, |
| "__v1 would be discarded; use the one-argument " |
| "__convert_x86 overload instead"); |
| |
| // [xyz]_to_[xyz] {{{2 |
| [[maybe_unused]] constexpr bool __x_to_x |
| = sizeof(__v0) <= 16 && sizeof(_To) <= 16; |
| [[maybe_unused]] constexpr bool __x_to_y |
| = sizeof(__v0) <= 16 && sizeof(_To) == 32; |
| [[maybe_unused]] constexpr bool __x_to_z |
| = sizeof(__v0) <= 16 && sizeof(_To) == 64; |
| [[maybe_unused]] constexpr bool __y_to_x |
| = sizeof(__v0) == 32 && sizeof(_To) <= 16; |
| [[maybe_unused]] constexpr bool __y_to_y |
| = sizeof(__v0) == 32 && sizeof(_To) == 32; |
| [[maybe_unused]] constexpr bool __y_to_z |
| = sizeof(__v0) == 32 && sizeof(_To) == 64; |
| [[maybe_unused]] constexpr bool __z_to_x |
| = sizeof(__v0) == 64 && sizeof(_To) <= 16; |
| [[maybe_unused]] constexpr bool __z_to_y |
| = sizeof(__v0) == 64 && sizeof(_To) == 32; |
| [[maybe_unused]] constexpr bool __z_to_z |
| = sizeof(__v0) == 64 && sizeof(_To) == 64; |
| |
| // iX_to_iX {{{2 |
| [[maybe_unused]] constexpr bool __i_to_i |
| = is_integral_v<_Up> && is_integral_v<_Tp>; |
| [[maybe_unused]] constexpr bool __i8_to_i16 |
| = __i_to_i && sizeof(_Tp) == 1 && sizeof(_Up) == 2; |
| [[maybe_unused]] constexpr bool __i8_to_i32 |
| = __i_to_i && sizeof(_Tp) == 1 && sizeof(_Up) == 4; |
| [[maybe_unused]] constexpr bool __i8_to_i64 |
| = __i_to_i && sizeof(_Tp) == 1 && sizeof(_Up) == 8; |
| [[maybe_unused]] constexpr bool __i16_to_i8 |
| = __i_to_i && sizeof(_Tp) == 2 && sizeof(_Up) == 1; |
| [[maybe_unused]] constexpr bool __i16_to_i32 |
| = __i_to_i && sizeof(_Tp) == 2 && sizeof(_Up) == 4; |
| [[maybe_unused]] constexpr bool __i16_to_i64 |
| = __i_to_i && sizeof(_Tp) == 2 && sizeof(_Up) == 8; |
| [[maybe_unused]] constexpr bool __i32_to_i8 |
| = __i_to_i && sizeof(_Tp) == 4 && sizeof(_Up) == 1; |
| [[maybe_unused]] constexpr bool __i32_to_i16 |
| = __i_to_i && sizeof(_Tp) == 4 && sizeof(_Up) == 2; |
| [[maybe_unused]] constexpr bool __i32_to_i64 |
| = __i_to_i && sizeof(_Tp) == 4 && sizeof(_Up) == 8; |
| [[maybe_unused]] constexpr bool __i64_to_i8 |
| = __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 1; |
| [[maybe_unused]] constexpr bool __i64_to_i16 |
| = __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 2; |
| [[maybe_unused]] constexpr bool __i64_to_i32 |
| = __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 4; |
| |
| // [fsu]X_to_[fsu]X {{{2 |
| // ibw = integral && byte or word, i.e. char and short with any signedness |
| [[maybe_unused]] constexpr bool __i64_to_f32 |
| = is_integral_v<_Tp> && sizeof(_Tp) == 8 |
| && is_floating_point_v<_Up> && sizeof(_Up) == 4; |
| [[maybe_unused]] constexpr bool __s32_to_f32 |
| = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 4 |
| && is_floating_point_v<_Up> && sizeof(_Up) == 4; |
| [[maybe_unused]] constexpr bool __s16_to_f32 |
| = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 2 |
| && is_floating_point_v<_Up> && sizeof(_Up) == 4; |
| [[maybe_unused]] constexpr bool __s8_to_f32 |
| = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 1 |
| && is_floating_point_v<_Up> && sizeof(_Up) == 4; |
| [[maybe_unused]] constexpr bool __u32_to_f32 |
| = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 4 |
| && is_floating_point_v<_Up> && sizeof(_Up) == 4; |
| [[maybe_unused]] constexpr bool __u16_to_f32 |
| = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 2 |
| && is_floating_point_v<_Up> && sizeof(_Up) == 4; |
| [[maybe_unused]] constexpr bool __u8_to_f32 |
| = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 1 |
| && is_floating_point_v<_Up> && sizeof(_Up) == 4; |
| [[maybe_unused]] constexpr bool __s64_to_f64 |
| = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 8 |
| && is_floating_point_v<_Up> && sizeof(_Up) == 8; |
| [[maybe_unused]] constexpr bool __s32_to_f64 |
| = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 4 |
| && is_floating_point_v<_Up> && sizeof(_Up) == 8; |
| [[maybe_unused]] constexpr bool __s16_to_f64 |
| = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 2 |
| && is_floating_point_v<_Up> && sizeof(_Up) == 8; |
| [[maybe_unused]] constexpr bool __s8_to_f64 |
| = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 1 |
| && is_floating_point_v<_Up> && sizeof(_Up) == 8; |
| [[maybe_unused]] constexpr bool __u64_to_f64 |
| = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 8 |
| && is_floating_point_v<_Up> && sizeof(_Up) == 8; |
| [[maybe_unused]] constexpr bool __u32_to_f64 |
| = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 4 |
| && is_floating_point_v<_Up> && sizeof(_Up) == 8; |
| [[maybe_unused]] constexpr bool __u16_to_f64 |
| = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 2 |
| && is_floating_point_v<_Up> && sizeof(_Up) == 8; |
| [[maybe_unused]] constexpr bool __u8_to_f64 |
| = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 1 |
| && is_floating_point_v<_Up> && sizeof(_Up) == 8; |
| [[maybe_unused]] constexpr bool __f32_to_s64 |
| = is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 8 |
| && is_floating_point_v<_Tp> && sizeof(_Tp) == 4; |
| [[maybe_unused]] constexpr bool __f32_to_s32 |
| = is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 4 |
| && is_floating_point_v<_Tp> && sizeof(_Tp) == 4; |
| [[maybe_unused]] constexpr bool __f32_to_u64 |
| = is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 8 |
| && is_floating_point_v<_Tp> && sizeof(_Tp) == 4; |
| [[maybe_unused]] constexpr bool __f32_to_u32 |
| = is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 4 |
| && is_floating_point_v<_Tp> && sizeof(_Tp) == 4; |
| [[maybe_unused]] constexpr bool __f64_to_s64 |
| = is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 8 |
| && is_floating_point_v<_Tp> && sizeof(_Tp) == 8; |
| [[maybe_unused]] constexpr bool __f64_to_s32 |
| = is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 4 |
| && is_floating_point_v<_Tp> && sizeof(_Tp) == 8; |
| [[maybe_unused]] constexpr bool __f64_to_u64 |
| = is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 8 |
| && is_floating_point_v<_Tp> && sizeof(_Tp) == 8; |
| [[maybe_unused]] constexpr bool __f64_to_u32 |
| = is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 4 |
| && is_floating_point_v<_Tp> && sizeof(_Tp) == 8; |
| [[maybe_unused]] constexpr bool __f32_to_ibw |
| = is_integral_v<_Up> && sizeof(_Up) <= 2 |
| && is_floating_point_v<_Tp> && sizeof(_Tp) == 4; |
| [[maybe_unused]] constexpr bool __f64_to_ibw |
| = is_integral_v<_Up> && sizeof(_Up) <= 2 |
| && is_floating_point_v<_Tp> && sizeof(_Tp) == 8; |
| [[maybe_unused]] constexpr bool __f32_to_f64 |
| = is_floating_point_v<_Tp> && sizeof(_Tp) == 4 |
| && is_floating_point_v<_Up> && sizeof(_Up) == 8; |
| [[maybe_unused]] constexpr bool __f64_to_f32 |
| = is_floating_point_v<_Tp> && sizeof(_Tp) == 8 |
| && is_floating_point_v<_Up> && sizeof(_Up) == 4; |
| |
| if constexpr (__i_to_i && __y_to_x && !__have_avx2) //{{{2 |
    // e.g. <llong, 4>, <llong, 4> => <short, 8>
| return __convert_x86<_To>(__lo128(__v0), __hi128(__v0), __lo128(__v1), |
| __hi128(__v1)); |
| else if constexpr (__i_to_i) // assert ISA {{{2 |
| { |
| static_assert(__x_to_x || __have_avx2, |
| "integral conversions with ymm registers require AVX2"); |
| static_assert(__have_avx512bw |
| || ((sizeof(_Tp) >= 4 || sizeof(__v0) < 64) |
| && (sizeof(_Up) >= 4 || sizeof(_To) < 64)), |
| "8/16-bit integers in zmm registers require AVX512BW"); |
| static_assert((sizeof(__v0) < 64 && sizeof(_To) < 64) || __have_avx512f, |
| "integral conversions with ymm registers require AVX2"); |
| } |
| // concat => use 1-arg __convert_x86 {{{2 |
| if constexpr (sizeof(__v0) < 16 || (sizeof(__v0) == 16 && __have_avx2) |
| || (sizeof(__v0) == 16 && __have_avx |
| && is_floating_point_v<_Tp>) |
| || (sizeof(__v0) == 32 && __have_avx512f |
| && (sizeof(_Tp) >= 4 || __have_avx512bw))) |
| { |
| // The ISA can handle wider input registers, so concat and use one-arg |
| // implementation. This reduces code duplication considerably. |
| return __convert_x86<_To>(__concat(__v0, __v1)); |
| } |
| else //{{{2 |
| { |
| // conversion using bit reinterpretation (or no conversion at all) |
| // should all go through the concat branch above: |
      static_assert(!(is_floating_point_v<_Tp> == is_floating_point_v<_Up>
		      && sizeof(_Tp) == sizeof(_Up)));
| // handle all zero extension{{{2 |
| if constexpr (2 * _Np < _M && sizeof(_To) > 16) |
| { |
| constexpr size_t Min = 16 / sizeof(_Up); |
| return __zero_extend( |
| __convert_x86< |
| __vector_type_t<_Up, (Min > 2 * _Np) ? Min : 2 * _Np>>(__v0, |
| __v1)); |
| } |
| else if constexpr (__i64_to_i32) //{{{2 |
| { |
| if constexpr (__x_to_x) |
| return __auto_bitcast(_mm_shuffle_ps(__auto_bitcast(__v0), |
| __auto_bitcast(__v1), 0x88)); |
| else if constexpr (__y_to_y) |
| { |
| // AVX512F is not available (would concat otherwise) |
| return __auto_bitcast( |
| __xzyw(_mm256_shuffle_ps(__auto_bitcast(__v0), |
| __auto_bitcast(__v1), 0x88))); |
| // alternative: |
| // const auto v0_abxxcdxx = _mm256_shuffle_epi32(__v0, 8); |
| // const auto v1_efxxghxx = _mm256_shuffle_epi32(__v1, 8); |
| // const auto v_abefcdgh = _mm256_unpacklo_epi64(v0_abxxcdxx, |
| // v1_efxxghxx); return _mm256_permute4x64_epi64(v_abefcdgh, |
| // 0x01 * 0 + 0x04 * 2 + 0x10 * 1 + 0x40 * 3); // abcdefgh |
| } |
| else if constexpr (__z_to_z) |
| return __intrin_bitcast<_To>( |
| __concat(_mm512_cvtepi64_epi32(__i0), |
| _mm512_cvtepi64_epi32(__i1))); |
| } |
| else if constexpr (__i64_to_i16) //{{{2 |
| { |
| if constexpr (__x_to_x) |
| { |
| // AVX2 is not available (would concat otherwise) |
| if constexpr (__have_sse4_1) |
| { |
| return __intrin_bitcast<_To>(_mm_shuffle_epi8( |
| _mm_blend_epi16(__i0, _mm_slli_si128(__i1, 4), 0x44), |
| _mm_setr_epi8(0, 1, 8, 9, 4, 5, 12, 13, -0x80, -0x80, |
| -0x80, -0x80, -0x80, -0x80, -0x80, -0x80))); |
| } |
| else |
| { |
| return __vector_type_t<_Up, _M>{_Up(__v0[0]), _Up(__v0[1]), |
| _Up(__v1[0]), _Up(__v1[1])}; |
| } |
| } |
| else if constexpr (__y_to_x) |
| { |
| auto __a |
| = _mm256_unpacklo_epi16(__i0, __i1); // 04.. .... 26.. .... |
| auto __b |
| = _mm256_unpackhi_epi16(__i0, __i1); // 15.. .... 37.. .... |
| auto __c |
| = _mm256_unpacklo_epi16(__a, __b); // 0145 .... 2367 .... |
| return __intrin_bitcast<_To>( |
| _mm_unpacklo_epi32(__lo128(__c), __hi128(__c))); // 0123 4567 |
| } |
| else if constexpr (__z_to_y) |
| return __intrin_bitcast<_To>( |
| __concat(_mm512_cvtepi64_epi16(__i0), |
| _mm512_cvtepi64_epi16(__i1))); |
| } |
| else if constexpr (__i64_to_i8) //{{{2 |
| { |
| if constexpr (__x_to_x && __have_sse4_1) |
| { |
| return __intrin_bitcast<_To>(_mm_shuffle_epi8( |
| _mm_blend_epi16(__i0, _mm_slli_si128(__i1, 4), 0x44), |
| _mm_setr_epi8(0, 8, 4, 12, -0x80, -0x80, -0x80, -0x80, -0x80, |
| -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, |
| -0x80))); |
| } |
| else if constexpr (__x_to_x && __have_ssse3) |
| { |
| return __intrin_bitcast<_To>(_mm_unpacklo_epi16( |
| _mm_shuffle_epi8( |
| __i0, _mm_setr_epi8(0, 8, -0x80, -0x80, -0x80, -0x80, -0x80, |
| -0x80, -0x80, -0x80, -0x80, -0x80, |
| -0x80, -0x80, -0x80, -0x80)), |
| _mm_shuffle_epi8( |
| __i1, _mm_setr_epi8(0, 8, -0x80, -0x80, -0x80, -0x80, -0x80, |
| -0x80, -0x80, -0x80, -0x80, -0x80, |
| -0x80, -0x80, -0x80, -0x80)))); |
| } |
| else if constexpr (__x_to_x) |
| { |
| return __vector_type_t<_Up, _M>{_Up(__v0[0]), _Up(__v0[1]), |
| _Up(__v1[0]), _Up(__v1[1])}; |
| } |
| else if constexpr (__y_to_x) |
| { |
| const auto __a = _mm256_shuffle_epi8( |
| _mm256_blend_epi32(__i0, _mm256_slli_epi64(__i1, 32), 0xAA), |
| _mm256_setr_epi8(0, 8, -0x80, -0x80, 4, 12, -0x80, -0x80, |
| -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, |
| -0x80, -0x80, -0x80, -0x80, 0, 8, -0x80, |
| -0x80, 4, 12, -0x80, -0x80, -0x80, -0x80, |
| -0x80, -0x80, -0x80, -0x80)); |
| return __intrin_bitcast<_To>(__lo128(__a) | __hi128(__a)); |
| } // __z_to_x uses concat fallback |
| } |
| else if constexpr (__i32_to_i16) //{{{2 |
| { |
| if constexpr (__x_to_x) |
| { |
| // AVX2 is not available (would concat otherwise) |
| if constexpr (__have_sse4_1) |
| { |
| return __intrin_bitcast<_To>(_mm_shuffle_epi8( |
| _mm_blend_epi16(__i0, _mm_slli_si128(__i1, 2), 0xaa), |
| _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, |
| 11, 14, 15))); |
| } |
| else if constexpr (__have_ssse3) |
| { |
| return __intrin_bitcast<_To>( |
| _mm_hadd_epi16(__to_intrin(__v0 << 16), |
| __to_intrin(__v1 << 16))); |
| /* |
| return _mm_unpacklo_epi64( |
| _mm_shuffle_epi8(__i0, _mm_setr_epi8(0, 1, 4, 5, 8, 9, |
| 12, 13, 8, 9, 12, 13, 12, 13, 14, 15)), |
| _mm_shuffle_epi8(__i1, _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, |
| 13, 8, 9, 12, 13, 12, 13, 14, 15))); |
| */ |
| } |
| else |
| { |
| auto __a = _mm_unpacklo_epi16(__i0, __i1); // 04.. 15.. |
| auto __b = _mm_unpackhi_epi16(__i0, __i1); // 26.. 37.. |
| auto __c = _mm_unpacklo_epi16(__a, __b); // 0246 .... |
| auto __d = _mm_unpackhi_epi16(__a, __b); // 1357 .... |
| return __intrin_bitcast<_To>( |
| _mm_unpacklo_epi16(__c, __d)); // 0123 4567 |
| } |
| } |
| else if constexpr (__y_to_y) |
| { |
| const auto __shuf |
| = _mm256_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -0x80, -0x80, |
| -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, |
| 0, 1, 4, 5, 8, 9, 12, 13, -0x80, -0x80, |
| -0x80, -0x80, -0x80, -0x80, -0x80, -0x80); |
| auto __a = _mm256_shuffle_epi8(__i0, __shuf); |
| auto __b = _mm256_shuffle_epi8(__i1, __shuf); |
| return __intrin_bitcast<_To>( |
| __xzyw(_mm256_unpacklo_epi64(__a, __b))); |
| } // __z_to_z uses concat fallback |
| } |
| else if constexpr (__i32_to_i8) //{{{2 |
| { |
| if constexpr (__x_to_x && __have_ssse3) |
| { |
	      const auto __shufmask
		= _mm_setr_epi8(0, 4, 8, 12, -0x80, -0x80, -0x80, -0x80,
				-0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
				-0x80, -0x80);
	      return __intrin_bitcast<_To>(
		_mm_unpacklo_epi32(_mm_shuffle_epi8(__i0, __shufmask),
				   _mm_shuffle_epi8(__i1, __shufmask)));
| } |
| else if constexpr (__x_to_x) |
| { |
| auto __a = _mm_unpacklo_epi8(__i0, __i1); // 04.. .... 15.. .... |
| auto __b = _mm_unpackhi_epi8(__i0, __i1); // 26.. .... 37.. .... |
| auto __c = _mm_unpacklo_epi8(__a, __b); // 0246 .... .... .... |
| auto __d = _mm_unpackhi_epi8(__a, __b); // 1357 .... .... .... |
| auto __e = _mm_unpacklo_epi8(__c, __d); // 0123 4567 .... .... |
| return __intrin_bitcast<_To>(__e & __m128i{-1, 0}); |
| } |
| else if constexpr (__y_to_x) |
| { |
| const auto __a = _mm256_shuffle_epi8( |
| _mm256_blend_epi16(__i0, _mm256_slli_epi32(__i1, 16), 0xAA), |
| _mm256_setr_epi8(0, 4, 8, 12, -0x80, -0x80, -0x80, -0x80, 2, |
| 6, 10, 14, -0x80, -0x80, -0x80, -0x80, -0x80, |
| -0x80, -0x80, -0x80, 0, 4, 8, 12, -0x80, |
| -0x80, -0x80, -0x80, 2, 6, 10, 14)); |
| return __intrin_bitcast<_To>(__lo128(__a) | __hi128(__a)); |
| } // __z_to_y uses concat fallback |
| } |
| else if constexpr (__i16_to_i8) //{{{2 |
| { |
| if constexpr (__x_to_x && __have_ssse3) |
| { |
| const auto __shuf = reinterpret_cast<__m128i>( |
| __vector_type_t<_UChar, 16>{0, 2, 4, 6, 8, 10, 12, 14, 0x80, |
| 0x80, 0x80, 0x80, 0x80, 0x80, |
| 0x80, 0x80}); |
| return __intrin_bitcast<_To>( |
| _mm_unpacklo_epi64(_mm_shuffle_epi8(__i0, __shuf), |
| _mm_shuffle_epi8(__i1, __shuf))); |
| } |
| else if constexpr (__x_to_x) |
| { |
| auto __a = _mm_unpacklo_epi8(__i0, __i1); // 08.. 19.. 2A.. 3B.. |
| auto __b = _mm_unpackhi_epi8(__i0, __i1); // 4C.. 5D.. 6E.. 7F.. |
| auto __c = _mm_unpacklo_epi8(__a, __b); // 048C .... 159D .... |
| auto __d = _mm_unpackhi_epi8(__a, __b); // 26AE .... 37BF .... |
| auto __e = _mm_unpacklo_epi8(__c, __d); // 0246 8ACE .... .... |
| auto __f = _mm_unpackhi_epi8(__c, __d); // 1357 9BDF .... .... |
| return __intrin_bitcast<_To>(_mm_unpacklo_epi8(__e, __f)); |
| } |
| else if constexpr (__y_to_y) |
| { |
| return __intrin_bitcast<_To>(__xzyw(_mm256_shuffle_epi8( |
| (__to_intrin(__v0) & _mm256_set1_epi32(0x00ff00ff)) |
| | _mm256_slli_epi16(__i1, 8), |
| _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, |
| 13, 15, 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, |
| 7, 9, 11, 13, 15)))); |
| } // __z_to_z uses concat fallback |
| } |
| else if constexpr (__i64_to_f32) //{{{2 |
| { |
| if constexpr (__x_to_x) |
| return __make_wrapper<float>(__v0[0], __v0[1], __v1[0], __v1[1]); |
| else if constexpr (__y_to_y) |
| { |
| static_assert(__y_to_y && __have_avx2); |
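	      // hi * 2^32 + mid * 2^16 + lo: the low 32 bits are split into
	      // two 16-bit pieces so that their low bits are not rounded
	      // away when the high part is small (cf. the cancellation note
	      // in the AVX512 branch below). __xzyw restores the element
	      // order left behind by the per-lane unpacks.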
| const auto __a = _mm256_unpacklo_epi32(__i0, __i1); // aeAE cgCG |
| const auto __b = _mm256_unpackhi_epi32(__i0, __i1); // bfBF dhDH |
| const auto __lo32 |
| = _mm256_unpacklo_epi32(__a, __b); // abef cdgh |
| const auto __hi32 = __vector_bitcast< |
| conditional_t<is_signed_v<_Tp>, int, _UInt>>( |
| _mm256_unpackhi_epi32(__a, __b)); // ABEF CDGH |
| const auto __hi |
| = 0x100000000LL |
| * __convert_x86<__vector_type_t<float, 8>>(__hi32); |
| const auto __mid |
| = 0x10000 * _mm256_cvtepi32_ps(_mm256_srli_epi32(__lo32, 16)); |
| const auto __lo |
| = _mm256_cvtepi32_ps(_mm256_set1_epi32(0x0000ffffu) & __lo32); |
| return __xzyw((__hi + __mid) + __lo); |
| } |
| else if constexpr (__z_to_z && __have_avx512dq) |
| { |
| return is_signed_v<_Tp> ? __concat(_mm512_cvtepi64_ps(__i0), |
| _mm512_cvtepi64_ps(__i1)) |
| : __concat(_mm512_cvtepu64_ps(__i0), |
| _mm512_cvtepu64_ps(__i1)); |
| } |
| else if constexpr (__z_to_z && is_signed_v<_Tp>) |
| { |
| const __m512 __hi32 = _mm512_cvtepi32_ps( |
| __concat(_mm512_cvtepi64_epi32(__to_intrin(__v0 >> 32)), |
| _mm512_cvtepi64_epi32(__to_intrin(__v1 >> 32)))); |
| const __m512i __lo32 = __concat(_mm512_cvtepi64_epi32(__i0), |
| _mm512_cvtepi64_epi32(__i1)); |
| // split low 32-bits, because if __hi32 is a small negative |
| // number, the 24-bit mantissa may lose important information if |
| // any of the high 8 bits of __lo32 is set, leading to |
	      // catastrophic cancellation in the FMA
| const __m512 __hi16 |
| = _mm512_cvtepu32_ps(_mm512_set1_epi32(0xffff0000u) & __lo32); |
| const __m512 __lo16 |
| = _mm512_cvtepi32_ps(_mm512_set1_epi32(0x0000ffffu) & __lo32); |
| return (__hi32 * 0x100000000LL + __hi16) + __lo16; |
| } |
| else if constexpr (__z_to_z && is_unsigned_v<_Tp>) |
| { |
| return __intrin_bitcast<_To>( |
| _mm512_cvtepu32_ps(__concat( |
| _mm512_cvtepi64_epi32(_mm512_srai_epi64(__i0, 32)), |
| _mm512_cvtepi64_epi32(_mm512_srai_epi64(__i1, 32)))) |
| * 0x100000000LL |
| + _mm512_cvtepu32_ps(__concat(_mm512_cvtepi64_epi32(__i0), |
| _mm512_cvtepi64_epi32(__i1)))); |
| } |
| } |
| else if constexpr (__f64_to_s32) //{{{2 |
| { |
| // use concat fallback |
| } |
| else if constexpr (__f64_to_u32) //{{{2 |
| { |
| if constexpr (__x_to_x && __have_sse4_1) |
| { |
| return __vector_bitcast<_Up, _M>(_mm_unpacklo_epi64( |
| _mm_cvttpd_epi32(_mm_floor_pd(__i0) - 0x8000'0000u), |
| _mm_cvttpd_epi32(_mm_floor_pd(__i1) - 0x8000'0000u))) |
| ^ 0x8000'0000u; |
| // without SSE4.1 just use the scalar fallback, it's only four |
| // values |
| } |
| else if constexpr (__y_to_y) |
| { |
| return __vector_bitcast<_Up>( |
| __concat(_mm256_cvttpd_epi32(_mm256_floor_pd(__i0) |
| - 0x8000'0000u), |
| _mm256_cvttpd_epi32(_mm256_floor_pd(__i1) |
| - 0x8000'0000u))) |
| ^ 0x8000'0000u; |
| } // __z_to_z uses fallback |
| } |
| else if constexpr (__f64_to_ibw) //{{{2 |
| { |
	  // The one-arg __f64_to_ibw overload goes via _SimdWrapper<int, ?>.
	  // The fallback would go via two independent conversions to
	  // _SimdWrapper<_To> and subsequent interleaving. This is better,
	  // because f64->i32 allows combining __v0 and __v1 into one
	  // register:
	  //if constexpr (__z_to_x || __y_to_x)
	  //{
| return __convert_x86<_To>( |
| __convert_x86<__vector_type_t<int, _Np * 2>>(__v0, __v1)); |
| //} |
| } |
| else if constexpr (__f32_to_ibw) //{{{2 |
| { |
| return __convert_x86<_To>( |
| __convert_x86<__vector_type_t<int, _Np>>(__v0), |
| __convert_x86<__vector_type_t<int, _Np>>(__v1)); |
| } //}}} |
| |
| // fallback: {{{2 |
| if constexpr (sizeof(_To) >= 32) |
| // if _To is ymm or zmm, then _SimdWrapper<_Up, _M / 2> is xmm or ymm |
| return __concat(__convert_x86<__vector_type_t<_Up, _M / 2>>(__v0), |
| __convert_x86<__vector_type_t<_Up, _M / 2>>(__v1)); |
| else if constexpr (sizeof(_To) == 16) |
| { |
| const auto __lo = __to_intrin(__convert_x86<_To>(__v0)); |
| const auto __hi = __to_intrin(__convert_x86<_To>(__v1)); |
| if constexpr (sizeof(_Up) * _Np == 8) |
| { |
| if constexpr (is_floating_point_v<_Up>) |
| return __auto_bitcast( |
| _mm_unpacklo_pd(__vector_bitcast<double>(__lo), |
| __vector_bitcast<double>(__hi))); |
| else |
| return __intrin_bitcast<_To>(_mm_unpacklo_epi64(__lo, __hi)); |
| } |
| else if constexpr (sizeof(_Up) * _Np == 4) |
| { |
| if constexpr (is_floating_point_v<_Up>) |
| return __auto_bitcast( |
| _mm_unpacklo_ps(__vector_bitcast<float>(__lo), |
| __vector_bitcast<float>(__hi))); |
| else |
| return __intrin_bitcast<_To>(_mm_unpacklo_epi32(__lo, __hi)); |
| } |
| else if constexpr (sizeof(_Up) * _Np == 2) |
| return __intrin_bitcast<_To>(_mm_unpacklo_epi16(__lo, __hi)); |
| else |
| __assert_unreachable<_Tp>(); |
| } |
| else |
| return __vector_convert<_To>(__v0, __v1, make_index_sequence<_Np>()); |
| //}}} |
| } |
| } |
| |
| //}}}1 |
| // 4-arg __convert_x86 {{{1 |
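// Converts the concatenation of __v0, __v1, __v2, and __v3 to _To.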
| template <typename _To, typename _V, typename _Traits> |
| _GLIBCXX_SIMD_INTRINSIC _To |
| __convert_x86(_V __v0, _V __v1, _V __v2, _V __v3) |
| { |
| static_assert(__is_vector_type_v<_V>); |
| using _Tp = typename _Traits::value_type; |
| constexpr size_t _Np = _Traits::_S_full_size; |
| [[maybe_unused]] const auto __i0 = __to_intrin(__v0); |
| [[maybe_unused]] const auto __i1 = __to_intrin(__v1); |
| [[maybe_unused]] const auto __i2 = __to_intrin(__v2); |
| [[maybe_unused]] const auto __i3 = __to_intrin(__v3); |
| using _Up = typename _VectorTraits<_To>::value_type; |
| constexpr size_t _M = _VectorTraits<_To>::_S_full_size; |
| |
| static_assert(4 * _Np <= _M, |
| "__v2/__v3 would be discarded; use the two/one-argument " |
| "__convert_x86 overload instead"); |
| |
| // [xyz]_to_[xyz] {{{2 |
| [[maybe_unused]] constexpr bool __x_to_x |
| = sizeof(__v0) <= 16 && sizeof(_To) <= 16; |
| [[maybe_unused]] constexpr bool __x_to_y |
| = sizeof(__v0) <= 16 && sizeof(_To) == 32; |
| [[maybe_unused]] constexpr bool __x_to_z |
| = sizeof(__v0) <= 16 && sizeof(_To) == 64; |
| [[maybe_unused]] constexpr bool __y_to_x |
| = sizeof(__v0) == 32 && sizeof(_To) <= 16; |
| [[maybe_unused]] constexpr bool __y_to_y |
| = sizeof(__v0) == 32 && sizeof(_To) == 32; |
| [[maybe_unused]] constexpr bool __y_to_z |
| = sizeof(__v0) == 32 && sizeof(_To) == 64; |
| [[maybe_unused]] constexpr bool __z_to_x |
| = sizeof(__v0) == 64 && sizeof(_To) <= 16; |
| [[maybe_unused]] constexpr bool __z_to_y |
| = sizeof(__v0) == 64 && sizeof(_To) == 32; |
| [[maybe_unused]] constexpr bool __z_to_z |
| = sizeof(__v0) == 64 && sizeof(_To) == 64; |
| |
| // iX_to_iX {{{2 |
| [[maybe_unused]] constexpr bool __i_to_i |
| = is_integral_v<_Up> && is_integral_v<_Tp>; |
| [[maybe_unused]] constexpr bool __i8_to_i16 |
| = __i_to_i && sizeof(_Tp) == 1 && sizeof(_Up) == 2; |
| [[maybe_unused]] constexpr bool __i8_to_i32 |
| = __i_to_i && sizeof(_Tp) == 1 && sizeof(_Up) == 4; |
| [[maybe_unused]] constexpr bool __i8_to_i64 |
| = __i_to_i && sizeof(_Tp) == 1 && sizeof(_Up) == 8; |
| [[maybe_unused]] constexpr bool __i16_to_i8 |
| = __i_to_i && sizeof(_Tp) == 2 && sizeof(_Up) == 1; |
| [[maybe_unused]] constexpr bool __i16_to_i32 |
| = __i_to_i && sizeof(_Tp) == 2 && sizeof(_Up) == 4; |
| [[maybe_unused]] constexpr bool __i16_to_i64 |
| = __i_to_i && sizeof(_Tp) == 2 && sizeof(_Up) == 8; |
| [[maybe_unused]] constexpr bool __i32_to_i8 |
| = __i_to_i && sizeof(_Tp) == 4 && sizeof(_Up) == 1; |
| [[maybe_unused]] constexpr bool __i32_to_i16 |
| = __i_to_i && sizeof(_Tp) == 4 && sizeof(_Up) == 2; |
| [[maybe_unused]] constexpr bool __i32_to_i64 |
| = __i_to_i && sizeof(_Tp) == 4 && sizeof(_Up) == 8; |
| [[maybe_unused]] constexpr bool __i64_to_i8 |
| = __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 1; |
| [[maybe_unused]] constexpr bool __i64_to_i16 |
| = __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 2; |
| [[maybe_unused]] constexpr bool __i64_to_i32 |
| = __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 4; |
| |
| // [fsu]X_to_[fsu]X {{{2 |
| // ibw = integral && byte or word, i.e. char and short with any signedness |
| [[maybe_unused]] constexpr bool __i64_to_f32 |
| = is_integral_v<_Tp> && sizeof(_Tp) == 8 |
| && is_floating_point_v<_Up> && sizeof(_Up) == 4; |
| [[maybe_unused]] constexpr bool __s32_to_f32 |
| = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 4 |
| && is_floating_point_v<_Up> && sizeof(_Up) == 4; |
| [[maybe_unused]] constexpr bool __s16_to_f32 |
| = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 2 |
| && is_floating_point_v<_Up> && sizeof(_Up) == 4; |
| [[maybe_unused]] constexpr bool __s8_to_f32 |
| = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 1 |
| && is_floating_point_v<_Up> && sizeof(_Up) == 4; |
| [[maybe_unused]] constexpr bool __u32_to_f32 |
| = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 4 |
| && is_floating_point_v<_Up> && sizeof(_Up) == 4; |
| [[maybe_unused]] constexpr bool __u16_to_f32 |
| = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 2 |
| && is_floating_point_v<_Up> && sizeof(_Up) == 4; |
| [[maybe_unused]] constexpr bool __u8_to_f32 |
| = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 1 |
| && is_floating_point_v<_Up> && sizeof(_Up) == 4; |
| [[maybe_unused]] constexpr bool __s64_to_f64 |
| = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 8 |
| && is_floating_point_v<_Up> && sizeof(_Up) == 8; |
| [[maybe_unused]] constexpr bool __s32_to_f64 |
| = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 4 |
| && is_floating_point_v<_Up> && sizeof(_Up) == 8; |
| [[maybe_unused]] constexpr bool __s16_to_f64 |
| = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 2 |
| && is_floating_point_v<_Up> && sizeof(_Up) == 8; |
| [[maybe_unused]] constexpr bool __s8_to_f64 |
| = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 1 |
| && is_floating_point_v<_Up> && sizeof(_Up) == 8; |
| [[maybe_unused]] constexpr bool __u64_to_f64 |
| = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 8 |
| && is_floating_point_v<_Up> && sizeof(_Up) == 8; |
| [[maybe_unused]] constexpr bool __u32_to_f64 |
| = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 4 |
| && is_floating_point_v<_Up> && sizeof(_Up) == 8; |
| [[maybe_unused]] constexpr bool __u16_to_f64 |
| = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 2 |
| && is_floating_point_v<_Up> && sizeof(_Up) == 8; |
| [[maybe_unused]] constexpr bool __u8_to_f64 |
| = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 1 |
| && is_floating_point_v<_Up> && sizeof(_Up) == 8; |
| [[maybe_unused]] constexpr bool __f32_to_s64 |
| = is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 8 |
| && is_floating_point_v<_Tp> && sizeof(_Tp) == 4; |
| [[maybe_unused]] constexpr bool __f32_to_s32 |
| = is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 4 |
| && is_floating_point_v<_Tp> && sizeof(_Tp) == 4; |
| [[maybe_unused]] constexpr bool __f32_to_u64 |
| = is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 8 |
| && is_floating_point_v<_Tp> && sizeof(_Tp) == 4; |
| [[maybe_unused]] constexpr bool __f32_to_u32 |
| = is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 4 |
| && is_floating_point_v<_Tp> && sizeof(_Tp) == 4; |
| [[maybe_unused]] constexpr bool __f64_to_s64 |
| = is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 8 |
| && is_floating_point_v<_Tp> && sizeof(_Tp) == 8; |
| [[maybe_unused]] constexpr bool __f64_to_s32 |
| = is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 4 |
| && is_floating_point_v<_Tp> && sizeof(_Tp) == 8; |
| [[maybe_unused]] constexpr bool __f64_to_u64 |
| = is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 8 |
| && is_floating_point_v<_Tp> && sizeof(_Tp) == 8; |
| [[maybe_unused]] constexpr bool __f64_to_u32 |
| = is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 4 |
| && is_floating_point_v<_Tp> && sizeof(_Tp) == 8; |
| [[maybe_unused]] constexpr bool __f32_to_ibw |
| = is_integral_v<_Up> && sizeof(_Up) <= 2 |
| && is_floating_point_v<_Tp> && sizeof(_Tp) == 4; |
| [[maybe_unused]] constexpr bool __f64_to_ibw |
| = is_integral_v<_Up> && sizeof(_Up) <= 2 |
| && is_floating_point_v<_Tp> && sizeof(_Tp) == 8; |
| [[maybe_unused]] constexpr bool __f32_to_f64 |
| = is_floating_point_v<_Tp> && sizeof(_Tp) == 4 |
| && is_floating_point_v<_Up> && sizeof(_Up) == 8; |
| [[maybe_unused]] constexpr bool __f64_to_f32 |
| = is_floating_point_v<_Tp> && sizeof(_Tp) == 8 |
| && is_floating_point_v<_Up> && sizeof(_Up) == 4; |
| |
| if constexpr (__i_to_i && __y_to_x && !__have_avx2) //{{{2 |
| { |
	// <llong, 4>, <llong, 4>, <llong, 4>, <llong, 4> => <char, 16>
| return __convert_x86<_To>(__lo128(__v0), __hi128(__v0), __lo128(__v1), |
| __hi128(__v1), __lo128(__v2), __hi128(__v2), |
| __lo128(__v3), __hi128(__v3)); |
| } |
| else if constexpr (__i_to_i) // assert ISA {{{2 |
| { |
| static_assert(__x_to_x || __have_avx2, |
| "integral conversions with ymm registers require AVX2"); |
| static_assert(__have_avx512bw |
| || ((sizeof(_Tp) >= 4 || sizeof(__v0) < 64) |
| && (sizeof(_Up) >= 4 || sizeof(_To) < 64)), |
| "8/16-bit integers in zmm registers require AVX512BW"); |
	static_assert((sizeof(__v0) < 64 && sizeof(_To) < 64) || __have_avx512f,
		      "integral conversions with zmm registers require AVX512F");
| } |
| // concat => use 2-arg __convert_x86 {{{2 |
| if constexpr (sizeof(__v0) < 16 || (sizeof(__v0) == 16 && __have_avx2) |
| || (sizeof(__v0) == 16 && __have_avx |
| && is_floating_point_v<_Tp>) |
| || (sizeof(__v0) == 32 && __have_avx512f)) |
| { |
| // The ISA can handle wider input registers, so concat and use two-arg |
| // implementation. This reduces code duplication considerably. |
| return __convert_x86<_To>(__concat(__v0, __v1), __concat(__v2, __v3)); |
| } |
| else //{{{2 |
| { |
| // conversion using bit reinterpretation (or no conversion at all) |
| // should all go through the concat branch above: |
	static_assert(!(is_floating_point_v<_Tp> == is_floating_point_v<_Up>
			  && sizeof(_Tp) == sizeof(_Up)));
	// handle all zero extension {{{2
	// (convert into the smallest possible vector of at least 16 bytes,
	// then zero-extend the result to _To)
| if constexpr (4 * _Np < _M && sizeof(_To) > 16) |
| { |
	    constexpr size_t _Min = 16 / sizeof(_Up);
	    return __zero_extend(
	      __convert_x86<
		__vector_type_t<_Up, (_Min > 4 * _Np) ? _Min : 4 * _Np>>(
| __v0, __v1, __v2, __v3)); |
| } |
| else if constexpr (__i64_to_i16) //{{{2 |
| { |
| if constexpr (__x_to_x && __have_sse4_1) |
| { |
| return __intrin_bitcast<_To>(_mm_shuffle_epi8( |
| _mm_blend_epi16( |
| _mm_blend_epi16(__i0, _mm_slli_si128(__i1, 2), 0x22), |
| _mm_blend_epi16(_mm_slli_si128(__i2, 4), |
| _mm_slli_si128(__i3, 6), 0x88), |
| 0xcc), |
| _mm_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, |
| 14, 15))); |
| } |
| else if constexpr (__y_to_y && __have_avx2) |
| { |
| return __intrin_bitcast<_To>(_mm256_shuffle_epi8( |
| __xzyw(_mm256_blend_epi16( |
| __auto_bitcast( |
| _mm256_shuffle_ps(__vector_bitcast<float>(__v0), |
| __vector_bitcast<float>(__v2), |
| 0x88)), // 0.1. 8.9. 2.3. A.B. |
| __to_intrin(__vector_bitcast<int>(_mm256_shuffle_ps( |
| __vector_bitcast<float>(__v1), |
| __vector_bitcast<float>(__v3), 0x88)) |
| << 16), // .4.5 .C.D .6.7 .E.F |
| 0xaa) // 0415 8C9D 2637 AEBF |
| ), // 0415 2637 8C9D AEBF |
| _mm256_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, |
| 14, 15, 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, |
| 10, 11, 14, 15))); |
| /* |
| auto __a = _mm256_unpacklo_epi16(__v0, __v1); // 04.. .... 26.. |
| .... auto __b = _mm256_unpackhi_epi16(__v0, __v1); // 15.. |
| .... 37.. .... auto __c = _mm256_unpacklo_epi16(__v2, __v3); // |
| 8C.. .... AE.. .... auto __d = _mm256_unpackhi_epi16(__v2, |
| __v3); |
| // 9D.. .... BF.. .... auto __e = _mm256_unpacklo_epi16(__a, |
| __b); |
| // 0145 .... 2367 .... auto __f = _mm256_unpacklo_epi16(__c, |
| __d); |
| // 89CD .... ABEF .... auto __g = _mm256_unpacklo_epi64(__e, |
| __f); |
| // 0145 89CD 2367 ABEF return __concat( |
| _mm_unpacklo_epi32(__lo128(__g), __hi128(__g)), |
| _mm_unpackhi_epi32(__lo128(__g), __hi128(__g))); // 0123 |
| 4567 89AB CDEF |
| */ |
| } // else use fallback |
| } |
| else if constexpr (__i64_to_i8) //{{{2 |
| { |
| if constexpr (__x_to_x) |
| { |
| // TODO: use fallback for now |
| } |
| else if constexpr (__y_to_x) |
| { |
| auto __a |
| = _mm256_srli_epi32(_mm256_slli_epi32(__i0, 24), 24) |
| | _mm256_srli_epi32(_mm256_slli_epi32(__i1, 24), 16) |
| | _mm256_srli_epi32(_mm256_slli_epi32(__i2, 24), 8) |
| | _mm256_slli_epi32( |
| __i3, 24); // 048C .... 159D .... 26AE .... 37BF .... |
| /*return _mm_shuffle_epi8( |
| _mm_blend_epi32(__lo128(__a) << 32, __hi128(__a), 0x5), |
| _mm_setr_epi8(4, 12, 0, 8, 5, 13, 1, 9, 6, 14, 2, 10, 7, 15, |
| 3, 11));*/ |
| auto __b = _mm256_unpackhi_epi64( |
| __a, __a); // 159D .... 159D .... 37BF .... 37BF .... |
| auto __c = _mm256_unpacklo_epi8( |
| __a, __b); // 0145 89CD .... .... 2367 ABEF .... .... |
| return __intrin_bitcast<_To>( |
| _mm_unpacklo_epi16(__lo128(__c), |
| __hi128(__c))); // 0123 4567 89AB CDEF |
| } |
| } |
| else if constexpr (__i32_to_i8) //{{{2 |
| { |
| if constexpr (__x_to_x) |
| { |
| if constexpr (__have_ssse3) |
| { |
| const auto __x0 = __vector_bitcast<_UInt>(__v0) & 0xff; |
| const auto __x1 = (__vector_bitcast<_UInt>(__v1) & 0xff) |
| << 8; |
| const auto __x2 = (__vector_bitcast<_UInt>(__v2) & 0xff) |
| << 16; |
| const auto __x3 = __vector_bitcast<_UInt>(__v3) << 24; |
| return __intrin_bitcast<_To>( |
| _mm_shuffle_epi8(__to_intrin(__x0 | __x1 | __x2 | __x3), |
| _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, |
| 2, 6, 10, 14, 3, 7, 11, |
| 15))); |
| } |
| else |
| { |
| auto __a |
| = _mm_unpacklo_epi8(__i0, __i2); // 08.. .... 19.. .... |
| auto __b |
| = _mm_unpackhi_epi8(__i0, __i2); // 2A.. .... 3B.. .... |
| auto __c |
| = _mm_unpacklo_epi8(__i1, __i3); // 4C.. .... 5D.. .... |
| auto __d |
| = _mm_unpackhi_epi8(__i1, __i3); // 6E.. .... 7F.. .... |
| auto __e |
| = _mm_unpacklo_epi8(__a, __c); // 048C .... .... .... |
| auto __f |
| = _mm_unpackhi_epi8(__a, __c); // 159D .... .... .... |
| auto __g |
| = _mm_unpacklo_epi8(__b, __d); // 26AE .... .... .... |
| auto __h |
| = _mm_unpackhi_epi8(__b, __d); // 37BF .... .... .... |
| return __intrin_bitcast<_To>(_mm_unpacklo_epi8( |
| _mm_unpacklo_epi8(__e, __g), // 0246 8ACE .... .... |
| _mm_unpacklo_epi8(__f, __h) // 1357 9BDF .... .... |
| )); // 0123 4567 89AB CDEF |
| } |
| } |
| else if constexpr (__y_to_y) |
| { |
| const auto __a = _mm256_shuffle_epi8( |
| __to_intrin((__vector_bitcast<_UShort>(_mm256_blend_epi16( |
| __i0, _mm256_slli_epi32(__i1, 16), 0xAA)) |
| & 0xff) |
| | (__vector_bitcast<_UShort>(_mm256_blend_epi16( |
| __i2, _mm256_slli_epi32(__i3, 16), 0xAA)) |
| << 8)), |
| _mm256_setr_epi8(0, 4, 8, 12, 2, 6, 10, 14, 1, 5, 9, 13, 3, 7, |
| 11, 15, 0, 4, 8, 12, 2, 6, 10, 14, 1, 5, 9, |
| 13, 3, 7, 11, 15)); |
| return __intrin_bitcast<_To>(_mm256_permutevar8x32_epi32( |
| __a, _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7))); |
| } |
| } |
| else if constexpr (__i64_to_f32) //{{{2 |
| { |
	  // this branch is only relevant with AVX but without AVX2 (i.e. no
	  // ymm integer operations)
| if constexpr (__x_to_y) |
| { |
| return __make_wrapper<float>(__v0[0], __v0[1], __v1[0], __v1[1], |
| __v2[0], __v2[1], __v3[0], |
| __v3[1]); |
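
	    // NOTE: everything below this return statement is unreachable
	    // dead code; it is kept as a sketch of a vectorized alternative.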
| |
| const auto __a = _mm_unpacklo_epi32(__i0, __i1); // acAC |
| const auto __b = _mm_unpackhi_epi32(__i0, __i1); // bdBD |
| const auto __c = _mm_unpacklo_epi32(__i2, __i3); // egEG |
| const auto __d = _mm_unpackhi_epi32(__i2, __i3); // fhFH |
| const auto __lo32a = _mm_unpacklo_epi32(__a, __b); // abcd |
| const auto __lo32b = _mm_unpacklo_epi32(__c, __d); // efgh |
| const auto __hi32 = __vector_bitcast< |
| conditional_t<is_signed_v<_Tp>, int, _UInt>>( |
| __concat(_mm_unpackhi_epi32(__a, __b), |
| _mm_unpackhi_epi32(__c, __d))); // ABCD EFGH |
| const auto __hi |
| = 0x100000000LL |
| * __convert_x86<__vector_type_t<float, 8>>(__hi32); |
| const auto __mid |
| = 0x10000 |
| * _mm256_cvtepi32_ps(__concat(_mm_srli_epi32(__lo32a, 16), |
| _mm_srli_epi32(__lo32b, 16))); |
| const auto __lo = _mm256_cvtepi32_ps( |
| __concat(_mm_set1_epi32(0x0000ffffu) & __lo32a, |
| _mm_set1_epi32(0x0000ffffu) & __lo32b)); |
| return (__hi + __mid) + __lo; |
| } |
| } |
| else if constexpr (__f64_to_ibw) //{{{2 |
| { |
| return __convert_x86<_To>( |
| __convert_x86<__vector_type_t<int, _Np * 2>>(__v0, __v1), |
| __convert_x86<__vector_type_t<int, _Np * 2>>(__v2, __v3)); |
| } |
| else if constexpr (__f32_to_ibw) //{{{2 |
| { |
| return __convert_x86<_To>( |
| __convert_x86<__vector_type_t<int, _Np>>(__v0), |
| __convert_x86<__vector_type_t<int, _Np>>(__v1), |
| __convert_x86<__vector_type_t<int, _Np>>(__v2), |
| __convert_x86<__vector_type_t<int, _Np>>(__v3)); |
| } //}}} |
| |
| // fallback: {{{2 |
| if constexpr (sizeof(_To) >= 32) |
| // if _To is ymm or zmm, then _SimdWrapper<_Up, _M / 2> is xmm or ymm |
| return __concat(__convert_x86<__vector_type_t<_Up, _M / 2>>(__v0, |
| __v1), |
| __convert_x86<__vector_type_t<_Up, _M / 2>>(__v2, |
| __v3)); |
| else if constexpr (sizeof(_To) == 16) |
| { |
| const auto __lo = __to_intrin(__convert_x86<_To>(__v0, __v1)); |
| const auto __hi = __to_intrin(__convert_x86<_To>(__v2, __v3)); |
| if constexpr (sizeof(_Up) * _Np * 2 == 8) |
| { |
	      if constexpr (is_floating_point_v<_Up>)
		return __auto_bitcast(
		  _mm_unpacklo_pd(__vector_bitcast<double>(__lo),
				  __vector_bitcast<double>(__hi)));
| else |
| return __intrin_bitcast<_To>(_mm_unpacklo_epi64(__lo, __hi)); |
| } |
| else if constexpr (sizeof(_Up) * _Np * 2 == 4) |
| { |
	      if constexpr (is_floating_point_v<_Up>)
		return __auto_bitcast(
		  _mm_unpacklo_ps(__vector_bitcast<float>(__lo),
				  __vector_bitcast<float>(__hi)));
| else |
| return __intrin_bitcast<_To>(_mm_unpacklo_epi32(__lo, __hi)); |
| } |
| else |
| __assert_unreachable<_Tp>(); |
| } |
| else |
| return __vector_convert<_To>(__v0, __v1, __v2, __v3, |
| make_index_sequence<_Np>()); |
| //}}}2 |
| } |
| } |
| |
| //}}} |
| // 8-arg __convert_x86 {{{1 |
| template <typename _To, typename _V, typename _Traits> |
| _GLIBCXX_SIMD_INTRINSIC _To |
| __convert_x86(_V __v0, _V __v1, _V __v2, _V __v3, _V __v4, _V __v5, _V __v6, |
| _V __v7) |
| { |
| static_assert(__is_vector_type_v<_V>); |
| using _Tp = typename _Traits::value_type; |
| constexpr size_t _Np = _Traits::_S_full_size; |
| [[maybe_unused]] const auto __i0 = __to_intrin(__v0); |
| [[maybe_unused]] const auto __i1 = __to_intrin(__v1); |
| [[maybe_unused]] const auto __i2 = __to_intrin(__v2); |
| [[maybe_unused]] const auto __i3 = __to_intrin(__v3); |
| [[maybe_unused]] const auto __i4 = __to_intrin(__v4); |
| [[maybe_unused]] const auto __i5 = __to_intrin(__v5); |
| [[maybe_unused]] const auto __i6 = __to_intrin(__v6); |
| [[maybe_unused]] const auto __i7 = __to_intrin(__v7); |
| using _Up = typename _VectorTraits<_To>::value_type; |
| constexpr size_t _M = _VectorTraits<_To>::_S_full_size; |
| |
| static_assert(8 * _Np <= _M, |
| "__v4-__v7 would be discarded; use the four/two/one-argument " |
| "__convert_x86 overload instead"); |
| |
| // [xyz]_to_[xyz] {{{2 |
| [[maybe_unused]] constexpr bool __x_to_x |
| = sizeof(__v0) <= 16 && sizeof(_To) <= 16; |
| [[maybe_unused]] constexpr bool __x_to_y |
| = sizeof(__v0) <= 16 && sizeof(_To) == 32; |
| [[maybe_unused]] constexpr bool __x_to_z |
| = sizeof(__v0) <= 16 && sizeof(_To) == 64; |
| [[maybe_unused]] constexpr bool __y_to_x |
| = sizeof(__v0) == 32 && sizeof(_To) <= 16; |
| [[maybe_unused]] constexpr bool __y_to_y |
| = sizeof(__v0) == 32 && sizeof(_To) == 32; |
| [[maybe_unused]] constexpr bool __y_to_z |
| = sizeof(__v0) == 32 && sizeof(_To) == 64; |
| [[maybe_unused]] constexpr bool __z_to_x |
| = sizeof(__v0) == 64 && sizeof(_To) <= 16; |
| [[maybe_unused]] constexpr bool __z_to_y |
| = sizeof(__v0) == 64 && sizeof(_To) == 32; |
| [[maybe_unused]] constexpr bool __z_to_z |
| = sizeof(__v0) == 64 && sizeof(_To) == 64; |
| |
| // [if]X_to_i8 {{{2 |
| [[maybe_unused]] constexpr bool __i_to_i |
| = is_integral_v<_Up> && is_integral_v<_Tp>; |
| [[maybe_unused]] constexpr bool __i64_to_i8 |
| = __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 1; |
| [[maybe_unused]] constexpr bool __f64_to_i8 |
| = is_integral_v<_Up> && sizeof(_Up) == 1 |
| && is_floating_point_v<_Tp> && sizeof(_Tp) == 8; |
| |
| if constexpr (__i_to_i) // assert ISA {{{2 |
| { |
| static_assert(__x_to_x || __have_avx2, |
| "integral conversions with ymm registers require AVX2"); |
| static_assert(__have_avx512bw |
| || ((sizeof(_Tp) >= 4 || sizeof(__v0) < 64) |
| && (sizeof(_Up) >= 4 || sizeof(_To) < 64)), |
| "8/16-bit integers in zmm registers require AVX512BW"); |
	static_assert((sizeof(__v0) < 64 && sizeof(_To) < 64) || __have_avx512f,
		      "integral conversions with zmm registers require AVX512F");
| } |
| // concat => use 4-arg __convert_x86 {{{2 |
| if constexpr (sizeof(__v0) < 16 || (sizeof(__v0) == 16 && __have_avx2) |
| || (sizeof(__v0) == 16 && __have_avx |
| && is_floating_point_v<_Tp>) |
| || (sizeof(__v0) == 32 && __have_avx512f)) |
| { |
	  // The ISA can handle wider input registers, so concat and use the
	  // four-arg implementation. This reduces code duplication
	  // considerably.
| return __convert_x86<_To>(__concat(__v0, __v1), __concat(__v2, __v3), |
| __concat(__v4, __v5), __concat(__v6, __v7)); |
| } |
| else //{{{2 |
| { |
| // conversion using bit reinterpretation (or no conversion at all) |
| // should all go through the concat branch above: |
	static_assert(!(is_floating_point_v<_Tp> == is_floating_point_v<_Up>
			  && sizeof(_Tp) == sizeof(_Up)));
| static_assert(!(8 * _Np < _M && sizeof(_To) > 16), |
| "zero extension should be impossible"); |
| if constexpr (__i64_to_i8) //{{{2 |
| { |
| if constexpr (__x_to_x && __have_ssse3) |
| { |
| // unsure whether this is better than the variant below |
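	      // The shift/or tree packs byte 0 of every input into one
	      // register: bytes 0-7 hold {__v0[0], ..., __v7[0]}, bytes 8-15
	      // hold {__v0[1], ..., __v7[1]}; the shuffle then interleaves
	      // the two 64-bit lanes into ascending element order.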
| return __intrin_bitcast<_To>(_mm_shuffle_epi8( |
| __to_intrin( |
| (((__v0 & 0xff) | ((__v1 & 0xff) << 8)) |
| | (((__v2 & 0xff) << 16) | ((__v3 & 0xff) << 24))) |
| | ((((__v4 & 0xff) << 32) | ((__v5 & 0xff) << 40)) |
| | (((__v6 & 0xff) << 48) | (__v7 << 56)))), |
| _mm_setr_epi8(0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, |
| 7, 15))); |
| } |
| else if constexpr (__x_to_x) |
| { |
| const auto __a = _mm_unpacklo_epi8(__i0, __i1); // ac |
| const auto __b = _mm_unpackhi_epi8(__i0, __i1); // bd |
| const auto __c = _mm_unpacklo_epi8(__i2, __i3); // eg |
| const auto __d = _mm_unpackhi_epi8(__i2, __i3); // fh |
| const auto __e = _mm_unpacklo_epi8(__i4, __i5); // ik |
| const auto __f = _mm_unpackhi_epi8(__i4, __i5); // jl |
| const auto __g = _mm_unpacklo_epi8(__i6, __i7); // mo |
| const auto __h = _mm_unpackhi_epi8(__i6, __i7); // np |
| return __intrin_bitcast<_To>(_mm_unpacklo_epi64( |
| _mm_unpacklo_epi32(_mm_unpacklo_epi8(__a, __b), // abcd |
| _mm_unpacklo_epi8(__c, __d)), // efgh |
| _mm_unpacklo_epi32(_mm_unpacklo_epi8(__e, __f), // ijkl |
| _mm_unpacklo_epi8(__g, __h)) // mnop |
| )); |
| } |
| else if constexpr (__y_to_y) |
| { |
| auto __a = // 048C GKOS 159D HLPT 26AE IMQU 37BF JNRV |
| __to_intrin( |
| (((__v0 & 0xff) | ((__v1 & 0xff) << 8)) |
| | (((__v2 & 0xff) << 16) | ((__v3 & 0xff) << 24))) |
| | ((((__v4 & 0xff) << 32) | ((__v5 & 0xff) << 40)) |
| | (((__v6 & 0xff) << 48) | ((__v7 << 56))))); |
| /* |
| auto __b = _mm256_unpackhi_epi64(__a, __a); // 159D HLPT 159D |
| HLPT 37BF JNRV 37BF JNRV auto __c = _mm256_unpacklo_epi8(__a, |
| __b); // 0145 89CD GHKL OPST 2367 ABEF IJMN QRUV auto __d = |
| __xzyw(__c); // 0145 89CD 2367 ABEF GHKL OPST IJMN QRUV return |
| _mm256_shuffle_epi8( |
| __d, _mm256_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, |
| 13, 6, 7, 14, 15, 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, |
| 14, 15)); |
| */ |
| auto __b = _mm256_shuffle_epi8( // 0145 89CD GHKL OPST 2367 ABEF |
| // IJMN QRUV |
| __a, _mm256_setr_epi8(0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, |
| 6, 14, 7, 15, 0, 8, 1, 9, 2, 10, 3, 11, |
| 4, 12, 5, 13, 6, 14, 7, 15)); |
| auto __c |
| = __xzyw(__b); // 0145 89CD 2367 ABEF GHKL OPST IJMN QRUV |
| return __intrin_bitcast<_To>(_mm256_shuffle_epi8( |
| __c, _mm256_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, |
| 6, 7, 14, 15, 0, 1, 8, 9, 2, 3, 10, 11, |
| 4, 5, 12, 13, 6, 7, 14, 15))); |
| } |
| else if constexpr (__z_to_z) |
| { |
| return __concat( |
| __convert_x86<__vector_type_t<_Up, _M / 2>>(__v0, __v1, __v2, |
| __v3), |
| __convert_x86<__vector_type_t<_Up, _M / 2>>(__v4, __v5, __v6, |
| __v7)); |
| } |
| } |
| else if constexpr (__f64_to_i8) //{{{2 |
| { |
| return __convert_x86<_To>( |
| __convert_x86<__vector_type_t<int, _Np * 2>>(__v0, __v1), |
| __convert_x86<__vector_type_t<int, _Np * 2>>(__v2, __v3), |
| __convert_x86<__vector_type_t<int, _Np * 2>>(__v4, __v5), |
| __convert_x86<__vector_type_t<int, _Np * 2>>(__v6, __v7)); |
| } |
| else // unreachable {{{2 |
| __assert_unreachable<_Tp>(); |
| //}}} |
| |
| // fallback: {{{2 |
| if constexpr (sizeof(_To) >= 32) |
| // if _To is ymm or zmm, then _SimdWrapper<_Up, _M / 2> is xmm or ymm |
| return __concat( |
| __convert_x86<__vector_type_t<_Up, _M / 2>>(__v0, __v1, __v2, __v3), |
| __convert_x86<__vector_type_t<_Up, _M / 2>>(__v4, __v5, __v6, |
| __v7)); |
| else if constexpr (sizeof(_To) == 16) |
| { |
| const auto __lo |
| = __to_intrin(__convert_x86<_To>(__v0, __v1, __v2, __v3)); |
| const auto __hi |
| = __to_intrin(__convert_x86<_To>(__v4, __v5, __v6, __v7)); |
| static_assert(sizeof(_Up) == 1 && _Np == 2); |
| return __intrin_bitcast<_To>(_mm_unpacklo_epi64(__lo, __hi)); |
| } |
| else |
| { |
| __assert_unreachable<_Tp>(); |
| // return __vector_convert<_To>(__v0, __v1, __v2, __v3, __v4, __v5, |
| // __v6, __v7, |
| // make_index_sequence<_Np>()); |
| } //}}}2 |
| } |
| } |
| |
| //}}} |
| // 16-arg __convert_x86 {{{1 |
| template <typename _To, typename _V, typename _Traits> |
| _GLIBCXX_SIMD_INTRINSIC _To |
| __convert_x86(_V __v0, _V __v1, _V __v2, _V __v3, _V __v4, _V __v5, _V __v6, |
| _V __v7, _V __v8, _V __v9, _V __v10, _V __v11, _V __v12, |
| _V __v13, _V __v14, _V __v15) |
| { |
| // concat => use 8-arg __convert_x86 |
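    // e.g. (hypothetical instantiation): sixteen <llong, 2> inputs are
    // pairwise concatenated into eight <llong, 4> inputs for a
    // <char, 32> result.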
| return __convert_x86<_To>(__concat(__v0, __v1), __concat(__v2, __v3), |
| __concat(__v4, __v5), __concat(__v6, __v7), |
| __concat(__v8, __v9), __concat(__v10, __v11), |
| __concat(__v12, __v13), __concat(__v14, __v15)); |
| } |
| |
| //}}} |
| |
| #endif // __cplusplus >= 201703L |
| #endif // _GLIBCXX_EXPERIMENTAL_SIMD_X86_CONVERSIONS_H |
| |
| // vim: foldmethod=marker |