| // Implementation of <simd> -*- C++ -*- |
| |
| // Copyright The GNU Toolchain Authors. |
| // |
| // This file is part of the GNU ISO C++ Library. This library is free |
| // software; you can redistribute it and/or modify it under the |
| // terms of the GNU General Public License as published by the |
| // Free Software Foundation; either version 3, or (at your option) |
| // any later version. |
| |
| // This library is distributed in the hope that it will be useful, |
| // but WITHOUT ANY WARRANTY; without even the implied warranty of |
| // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| // GNU General Public License for more details. |
| |
| // Under Section 7 of GPL version 3, you are granted additional |
| // permissions described in the GCC Runtime Library Exception, version |
| // 3.1, as published by the Free Software Foundation. |
| |
| // You should have received a copy of the GNU General Public License and |
| // a copy of the GCC Runtime Library Exception along with this program; |
| // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see |
| // <http://www.gnu.org/licenses/>. |
| |
| #ifndef _GLIBCXX_SIMD_DETAILS_H |
| #define _GLIBCXX_SIMD_DETAILS_H 1 |
| |
| #ifdef _GLIBCXX_SYSHDR |
| #pragma GCC system_header |
| #endif |
| |
| #if __cplusplus >= 202400L |
| |
| #include <bit> |
| #include <bits/c++config.h> // _GLIBCXX_FLOAT_IS_IEEE_BINARY32 |
| #include <bits/stl_function.h> // plus, minus, multiplies, ... |
| #include <bits/utility.h> // integer_sequence, etc. |
| #include <cmath> // for math_errhandling :( |
| #include <concepts> |
| #include <cstdint> |
| #include <limits> |
| #include <span> // for dynamic_extent |
| |
| #if __CHAR_BIT__ != 8 |
| // There are simply too many constants and bit operators that currently depend on CHAR_BIT == 8. |
| // Generalization to CHAR_BIT != 8 does not make sense without testability (i.e. a test target). |
| #error "<simd> is not supported for CHAR_BIT != 8" |
| #endif |
| |
| // psabi warnings are bogus because the ABI of the internal types never leaks into user code |
| #pragma GCC diagnostic push |
| #pragma GCC diagnostic ignored "-Wpsabi" |
| |
// _GLIBCXX_X86 is always defined: 1 when compiling for 32- or 64-bit x86, 0 otherwise. This
// allows plain `#if _GLIBCXX_X86` tests further down.
#if defined(__x86_64__) || defined(__i386__)
#  define _GLIBCXX_X86 1
#else
#  define _GLIBCXX_X86 0
#endif
| |
#if !defined(_GLIBCXX_SIMD_NOEXCEPT)
/** @internal
 * Expands to `noexcept` unless the macro was already defined before this point. Unit tests of
 * preconditions can pre-define it to remove the noexcept specification.
 */
#  define _GLIBCXX_SIMD_NOEXCEPT noexcept
#endif
| |
// Two-level stringification. _GLIBCXX_SIMD_TOSTRING_IMPL turns its argument into a string
// literal as written; _GLIBCXX_SIMD_TOSTRING lets the argument be macro-expanded first.
#define _GLIBCXX_SIMD_TOSTRING_IMPL(__tokens) #__tokens
#define _GLIBCXX_SIMD_TOSTRING(__tokens) _GLIBCXX_SIMD_TOSTRING_IMPL(__tokens)
| |
// This is used for unit-testing precondition checking.
// Only `expr` is evaluated: it is forwarded to __glibcxx_assert. `msg` and any further arguments
// are accepted but discarded by this definition.
#define __glibcxx_simd_precondition(expr, msg, ...) \
  __glibcxx_assert(expr)
| |
| namespace std _GLIBCXX_VISIBILITY(default) |
| { |
| _GLIBCXX_BEGIN_NAMESPACE_VERSION |
| |
| namespace simd |
| { |
// Primary variable template of __iota. Its initializer contains a static_assert(false), so the
// primary template itself can never be used successfully; usable definitions are presumably
// provided as specializations elsewhere (not visible in this part of the file).
template <typename _Tp>
  inline constexpr _Tp
  __iota = [] { static_assert(false, "invalid __iota specialization"); }();
| |
| // [simd.general] vectorizable types |
| |
| template <typename _Tp> |
| concept __vectorizable_scalar |
| = same_as<remove_cv_t<_Tp>, _Tp> |
| #ifdef __STDCPP_BFLOAT16_T__ |
| && !same_as<_Tp, __gnu_cxx::__bfloat16_t> |
| #endif |
| && ((integral<_Tp> && sizeof(_Tp) <= sizeof(0ULL) && !same_as<_Tp, bool>) |
| || (floating_point<_Tp> && sizeof(_Tp) <= sizeof(double))); |
| |
| // [simd.general] p2 |
| template <typename _Tp> |
| concept __vectorizable = __vectorizable_scalar<_Tp>; |
| |
/** @internal
 * Bit flags describing variants of _Abi.
 */
enum class _AbiVariant : unsigned long long
{
  // bit 0: masks are AVX512 bit-masks
  _BitMask = 1ull << 0,

  // bits [0:3]: mask representation; vector masks are used if all of these bits are 0
  _MaskVariants = 0xfull,
};
| |
| /** @internal |
| * Return @p __in with only bits set that are set in any of @p __to_keep. |
| */ |
| consteval _AbiVariant |
| __filter_abi_variant(_AbiVariant __in, same_as<_AbiVariant> auto... __to_keep) |
| { |
| using _Up = underlying_type_t<_AbiVariant>; |
| return static_cast<_AbiVariant>(static_cast<_Up>(__in) & (static_cast<_Up>(__to_keep) | ...)); |
| } |
| |
/** @internal
 * Empty placeholder type, used wherever no valid integer/value type exists.
 */
struct _InvalidInteger {};
| |
| /** @internal |
| * Alias for a signed integer type T such that sizeof(T) equals _Bytes. |
| * |
| * C++26 [simd.expos.defn] |
| */ |
| template <size_t _Bytes> |
| using __integer_from |
| = decltype([] consteval { |
| if constexpr (sizeof(signed char) == _Bytes) |
| return static_cast<signed char>(0); |
| else if constexpr (sizeof(signed short) == _Bytes) |
| return static_cast<signed short>(0); |
| else if constexpr (sizeof(signed int) == _Bytes) |
| return static_cast<signed int>(0); |
| else if constexpr (sizeof(signed long long) == _Bytes) |
| return static_cast<signed long long>(0); |
| else |
| return _InvalidInteger(); |
| }()); |
| |
| /** @internal |
| * Alias for an unsigned integer type T such that sizeof(T) equals _Bytes. |
| */ |
| template <size_t _Bytes> |
| using _UInt = make_unsigned_t<__integer_from<_Bytes>>; |
| |
| /** @internal |
| * Divide @p __x by @p __y while rounding up instead of down. |
| * |
| * Preconditions: __x >= 0 && __y > 0. |
| */ |
| template <typename _Tp> |
| consteval _Tp |
| __div_ceil(_Tp __x, _Tp __y) |
| { return (__x + __y - 1) / __y; } |
| |
| /** @internal |
| * Alias for an unsigned integer type that can store at least @p _NBits bits. |
| */ |
| template <int _NBits> |
| requires (_NBits > 0 && _NBits <= numeric_limits<unsigned long long>::digits) |
| using _Bitmask = _UInt<__div_ceil(__bit_ceil(unsigned(_NBits)), unsigned(__CHAR_BIT__))>; |
| |
/** @internal
 * Map a given type @p _Tp to an equivalent type.
 *
 * This helps with reducing the necessary branches and casts in the implementation as well as
 * reducing the number of template instantiations.
 *
 * The primary template is the identity mapping.
 */
template <typename _Tp>
  struct __canonical_vec_type
  {
    using type = _Tp;
  };

// Shorthand for the nested type of __canonical_vec_type.
template <typename _Tp>
  using __canonical_vec_type_t = typename __canonical_vec_type<_Tp>::type;
| |
| #if __SIZEOF_INT__ == __SIZEOF_LONG__ |
| template <> |
| struct __canonical_vec_type<long> |
| { using type = int; }; |
| |
| template <> |
| struct __canonical_vec_type<unsigned long> |
| { using type = unsigned int; }; |
| #elif __SIZEOF_LONG_LONG__ == __SIZEOF_LONG__ |
| template <> |
| struct __canonical_vec_type<long> |
| { using type = long long; }; |
| |
| template <> |
| struct __canonical_vec_type<unsigned long> |
| { using type = unsigned long long; }; |
| #endif |
| |
| template <typename _Tp> |
| requires std::is_enum_v<_Tp> |
| struct __canonical_vec_type<_Tp> |
| { using type = __canonical_vec_type<std::underlying_type_t<_Tp>>::type; }; |
| |
| template <> |
| struct __canonical_vec_type<char> |
| #if __CHAR_UNSIGNED__ |
| { using type = unsigned char; }; |
| #else |
| { using type = signed char; }; |
| #endif |
| |
| template <> |
| struct __canonical_vec_type<char8_t> |
| { using type = unsigned char; }; |
| |
| template <> |
| struct __canonical_vec_type<char16_t> |
| { using type = uint_least16_t; }; |
| |
| template <> |
| struct __canonical_vec_type<char32_t> |
| { using type = uint_least32_t; }; |
| |
| template <> |
| struct __canonical_vec_type<wchar_t> |
| { |
| using type = std::__conditional_t<std::is_signed_v<wchar_t>, |
| simd::__integer_from<sizeof(wchar_t)>, |
| simd::_UInt<sizeof(wchar_t)>>; |
| }; |
| |
// The extended floating-point types _Float32 and _Float64 map to float and double. Each mapping
// is only enabled if the compiler provides the _FloatN type (__FLTN_DIG__ is defined) and the
// library configuration says the standard type uses the matching IEEE binary format.
#if defined(__FLT32_DIG__) && defined(_GLIBCXX_FLOAT_IS_IEEE_BINARY32)
template <>
  struct __canonical_vec_type<_Float32>
  {
    using type = float;
  };
#endif

#if defined(__FLT64_DIG__) && defined(_GLIBCXX_DOUBLE_IS_IEEE_BINARY64)
template <>
  struct __canonical_vec_type<_Float64>
  {
    using type = double;
  };
#endif
| |
| /** @internal |
| * This ABI tag describes basic_vec objects that store one element per data member and basic_mask |
| * objects that store one bool data members. |
| * |
| * @tparam _Np The number of elements, which also matches the number of data members in |
| * basic_vec and basic_mask. |
| */ |
| template <int _Np = 1> |
| struct _ScalarAbi |
| { |
| static constexpr int _S_size = _Np; |
| |
| static constexpr int _S_nreg = _Np; |
| |
| static constexpr _AbiVariant _S_variant = {}; |
| |
| template <typename _Tp> |
| using _DataType = __canonical_vec_type_t<_Tp>; |
| |
| static constexpr bool _S_is_vecmask = false; |
| |
| // in principle a bool is a 1-bit bitmask, but this is asking for an AVX512 bitmask |
| static constexpr bool _S_is_bitmask = false; |
| |
| template <size_t> |
| using _MaskDataType = bool; |
| |
| template <int _N2, int _Nreg2 = _N2> |
| static consteval _ScalarAbi<_N2> |
| _S_resize() |
| { |
| static_assert(_N2 == _Nreg2); |
| return {}; |
| } |
| }; |
| |
| /** @internal |
| * This ABI tag describes basic_vec objects that store one or more objects declared with the |
| * [[gnu::vector_size(N)]] attribute. |
| * Applied to basic_mask objects, this ABI tag either describes corresponding vector-mask objects |
| * or bit-mask objects. Which one is used is determined via @p _Var. |
| * |
| * @tparam _Np The number of elements. |
| * @tparam _Nreg The number of registers needed to store @p _Np elements. |
| * @tparam _Var Determines how complex value-types are layed out and whether mask types use |
| * bit-masks or vector-masks. |
| */ |
| template <int _Np, int _Nreg, underlying_type_t<_AbiVariant> _Var> |
| struct _Abi |
| { |
| static constexpr int _S_size = _Np; |
| |
| /** @internal |
| * The number of registers needed to represent one basic_vec for the element type that was |
| * used on ABI deduction. |
| * |
| * Examples: |
| * - '_Abi< 8, 2>' for 'int' is 2x 128-bit |
| * - '_Abi< 9, 3>' for 'int' is 2x 128-bit and 1x 32-bit |
| * - '_Abi<10, 3>' for 'int' is 2x 128-bit and 1x 64-bit |
| * - '_Abi<10, 1>' for 'int' is 1x 512-bit |
| * - '_Abi<10, 2>' for 'int' is 1x 256-bit and 1x 64-bit |
| */ |
| static constexpr int _S_nreg = _Nreg; |
| |
| static_assert(_S_size > 0); |
| static_assert(_S_nreg > 0); |
| |
| static constexpr _AbiVariant _S_variant = static_cast<_AbiVariant>(_Var); |
| |
| static constexpr bool _S_is_bitmask |
| = __filter_abi_variant(_S_variant, _AbiVariant::_BitMask) == _AbiVariant::_BitMask; |
| |
| static constexpr bool _S_is_vecmask = !_S_is_bitmask; |
| |
| template <typename _Tp> |
| using _DataType = decltype([] { |
| static_assert(_S_nreg == 1); |
| if constexpr (_S_size == 1) |
| return __canonical_vec_type_t<_Tp>(); |
| else |
| { |
| constexpr int __n = __bit_ceil(unsigned(_S_size)); |
| using _Vp [[__gnu__::__vector_size__(sizeof(_Tp) * __n)]] |
| = __canonical_vec_type_t<_Tp>; |
| return _Vp(); |
| } |
| }()); |
| |
| template <size_t _Bytes> |
| using _MaskDataType |
| = decltype([] { |
| static_assert(_S_nreg == 1); |
| if constexpr (_S_size == 1) |
| return bool(); |
| else if constexpr (_S_is_vecmask) |
| { |
| constexpr unsigned __vbytes = _Bytes * __bit_ceil(unsigned(_S_size)); |
| using _Vp [[__gnu__::__vector_size__(__vbytes)]] = __integer_from<_Bytes>; |
| return _Vp(); |
| } |
| else if constexpr (_Nreg > 1) |
| return _InvalidInteger(); |
| else |
| return _Bitmask<_S_size>(); |
| }()); |
| |
| template <int _N2, int _Nreg2 = __div_ceil(_N2, _S_size)> |
| static consteval auto |
| _S_resize() |
| { |
| if constexpr (_N2 == 1) |
| return _Abi<1, 1, _Var>(); |
| else |
| return _Abi<_N2, _Nreg2, _Var>(); |
| } |
| }; |
| |
| /** @internal |
| * Alias for an _Abi specialization where the _AbiVariant bits are combined into a single integer |
| * value. |
| * |
| * Rationale: Consider diagnostic output and mangling of e.g. vec<int, 4> with AVX512. That's an |
| * alias for std::simd::basic_vec<int, std::simd::_Abi<4, 1, 1ull>>. If _AbiVariant were the |
| * template argument type of _Abi, the diagnostic output would be 'std::simd::basic_vec<int, |
| * std::simd::_Abi<4, 1, (std::simd::_AbiVariant)std::simd::_AbiVariant::_BitMask>>'. That's a lot |
| * longer, requires longer mangled names, and bakes the names of the enumerators into the ABI. As |
| * soon as bits of multiple _AbiVariants are combined, this becomes hard to parse for humans |
| * anyway. |
| */ |
| template <int _Np, int _Nreg, _AbiVariant... _Vs> |
| using _Abi_t = _Abi<_Np, _Nreg, (static_cast<underlying_type_t<_AbiVariant>>(_Vs) | ... | 0)>; |
| |
/** @internal
 * This type is used whenever ABI tag deduction can't give a useful answer. It reports a size
 * of zero elements.
 */
struct _InvalidAbi
{
  static constexpr int _S_size = 0;
};
| |
/** @internal
 * Satisfied if @p _Tp is a valid simd ABI tag. This is a necessary but not sufficient condition
 * for an enabled basic_vec/basic_mask specialization.
 *
 * Requirements checked here:
 *  - `_Tp::_S_variant` is declared as `const _AbiVariant`,
 *  - `_Tp::_S_size >= _Tp::_S_nreg >= 1`,
 *  - resizing to the tag's own size and register count yields @p _Tp again.
 */
template <typename _Tp>
  concept __abi_tag
    = same_as<decltype(_Tp::_S_variant), const _AbiVariant>
        && (_Tp::_S_size >= _Tp::_S_nreg) && (_Tp::_S_nreg >= 1)
        && requires(_Tp __x) {
          { __x.template _S_resize<_Tp::_S_size, _Tp::_S_nreg>() } -> same_as<_Tp>;
        };

// Satisfied if _Tp is an ABI tag that is a specialization of _ScalarAbi.
template <typename _Tp>
  concept __scalar_abi_tag
    = same_as<_Tp, _ScalarAbi<_Tp::_S_size>> && __abi_tag<_Tp>;
| |
| // Determine if math functions must *raise* floating-point exceptions. |
| // math_errhandling may expand to an extern symbol, in which case we must assume fp exceptions |
| // need to be considered. A conforming C library must define math_errhandling, but in case it |
| // isn't defined we simply use the fallback. |
| #ifdef math_errhandling |
| template <int = 0> |
| requires requires { typename bool_constant<0 != (math_errhandling & MATH_ERREXCEPT)>; } |
| consteval bool |
| __handle_fpexcept_impl(int) |
| { return 0 != (math_errhandling & MATH_ERREXCEPT); } |
| #endif |
| |
| // Fallback if math_errhandling doesn't work: implement correct exception behavior. |
| consteval bool |
| __handle_fpexcept_impl(float) |
| { return true; } |
| |
| /** @internal |
| * This type can be used as a template parameter for avoiding ODR violations, where code needs to |
| * differ depending on optimization flags (mostly fp-math related). |
| */ |
| struct _OptTraits |
| { |
| consteval bool |
| _M_test(int __bit) const |
| { return ((_M_build_flags >> __bit) & 1) == 1; } |
| |
| // true iff floating-point operations can signal an exception (allow non-default handler) |
| consteval bool |
| _M_fp_may_signal() const |
| { return _M_test(0); } |
| |
| // true iff floating-point operations can raise an exception flag |
| consteval bool |
| _M_fp_may_raise() const |
| { return _M_test(12); } |
| |
| consteval bool |
| _M_fast_math() const |
| { return _M_test(1); } |
| |
| consteval bool |
| _M_finite_math_only() const |
| { return _M_test(2); } |
| |
| consteval bool |
| _M_no_signed_zeros() const |
| { return _M_test(3); } |
| |
| consteval bool |
| _M_signed_zeros() const |
| { return !_M_test(3); } |
| |
| consteval bool |
| _M_reciprocal_math() const |
| { return _M_test(4); } |
| |
| consteval bool |
| _M_no_math_errno() const |
| { return _M_test(5); } |
| |
| consteval bool |
| _M_math_errno() const |
| { return !_M_test(5); } |
| |
| consteval bool |
| _M_associative_math() const |
| { return _M_test(6); } |
| |
| consteval bool |
| _M_conforming_to_STDC_annex_G() const |
| { return _M_test(10) && !_M_finite_math_only(); } |
| |
| consteval bool |
| _M_support_snan() const |
| { return _M_test(11); } |
| |
| __UINT64_TYPE__ _M_build_flags |
| = 0 |
| #if !__NO_TRAPPING_MATH__ |
| + (1 << 0) |
| #endif |
| + (__handle_fpexcept_impl(0) << 12) |
| #if __FAST_MATH__ |
| + (1 << 1) |
| #endif |
| #if __FINITE_MATH_ONLY__ |
| + (1 << 2) |
| #endif |
| #if __NO_SIGNED_ZEROS__ |
| + (1 << 3) |
| #endif |
| #if __RECIPROCAL_MATH__ |
| + (1 << 4) |
| #endif |
| #if __NO_MATH_ERRNO__ |
| + (1 << 5) |
| #endif |
| #if __ASSOCIATIVE_MATH__ |
| + (1 << 6) |
| #endif |
| // bits 7, 8, and 9 reserved for __FLT_EVAL_METHOD__ |
| #if __FLT_EVAL_METHOD__ == 1 |
| + (1 << 7) |
| #elif __FLT_EVAL_METHOD__ == 2 |
| + (2 << 7) |
| #elif __FLT_EVAL_METHOD__ != 0 |
| + (3 << 7) |
| #endif |
| |
| // C Annex G defines the behavior of complex<T> where T is IEC60559 floating-point. If |
| // __STDC_IEC_60559_COMPLEX__ is defined then Annex G is implemented - and simd<complex> |
| // will do so as well. However, Clang never defines the macro. |
| #if defined __STDC_IEC_60559_COMPLEX__ || defined __STDC_IEC_559_COMPLEX__ || defined _GLIBCXX_CLANG |
| + (1 << 10) |
| #endif |
| #if __SUPPORT_SNAN__ |
| + (1 << 11) |
| #endif |
| ; |
| }; |
| |
| /** @internal |
| * Return true iff @p __s equals "1". |
| */ |
| consteval bool |
| __streq_to_1(const char* __s) |
| { return __s != nullptr && __s[0] == '1' && __s[1] == '\0'; } |
| |
/** @internal
 * If the macro given as @p feat is defined to 1, expands to a bit set at position @p off.
 * Otherwise, expand to zero.
 *
 * @p feat is macro-expanded before it is stringified: a feature macro defined to 1 becomes the
 * string "1", an undefined one becomes its own name, which does not compare equal to "1".
 * @p off is parenthesized so that any expression can be passed as the bit position.
 */
#define _GLIBCXX_SIMD_ARCH_FLAG(off, feat) \
  (static_cast<__UINT64_TYPE__>(std::simd::__streq_to_1(_GLIBCXX_SIMD_TOSTRING_IMPL(feat))) \
     << (off))
| |
| #if _GLIBCXX_X86 |
| |
// Initializer for _ArchTraits::_M_flags: one bit per x86 ISA extension macro, set iff the macro
// is defined to 1 for the current translation unit.
#define _GLIBCXX_SIMD_ARCH_TRAITS_INIT {                    \
    _GLIBCXX_SIMD_ARCH_FLAG( 0, __MMX__) |                  \
    _GLIBCXX_SIMD_ARCH_FLAG( 1, __SSE__) |                  \
    _GLIBCXX_SIMD_ARCH_FLAG( 2, __SSE2__) |                 \
    _GLIBCXX_SIMD_ARCH_FLAG( 3, __SSE3__) |                 \
    _GLIBCXX_SIMD_ARCH_FLAG( 4, __SSSE3__) |                \
    _GLIBCXX_SIMD_ARCH_FLAG( 5, __SSE4_1__) |               \
    _GLIBCXX_SIMD_ARCH_FLAG( 6, __SSE4_2__) |               \
    _GLIBCXX_SIMD_ARCH_FLAG( 7, __POPCNT__) |               \
    _GLIBCXX_SIMD_ARCH_FLAG( 8, __AVX__) |                  \
    _GLIBCXX_SIMD_ARCH_FLAG( 9, __F16C__) |                 \
    _GLIBCXX_SIMD_ARCH_FLAG(10, __BMI__) |                  \
    _GLIBCXX_SIMD_ARCH_FLAG(11, __BMI2__) |                 \
    _GLIBCXX_SIMD_ARCH_FLAG(12, __LZCNT__) |                \
    _GLIBCXX_SIMD_ARCH_FLAG(13, __AVX2__) |                 \
    _GLIBCXX_SIMD_ARCH_FLAG(14, __FMA__) |                  \
    _GLIBCXX_SIMD_ARCH_FLAG(15, __AVX512F__) |              \
    _GLIBCXX_SIMD_ARCH_FLAG(16, __AVX512CD__) |             \
    _GLIBCXX_SIMD_ARCH_FLAG(17, __AVX512DQ__) |             \
    _GLIBCXX_SIMD_ARCH_FLAG(18, __AVX512BW__) |             \
    _GLIBCXX_SIMD_ARCH_FLAG(19, __AVX512VL__) |             \
    _GLIBCXX_SIMD_ARCH_FLAG(20, __AVX512BITALG__) |         \
    _GLIBCXX_SIMD_ARCH_FLAG(21, __AVX512VBMI__) |           \
    _GLIBCXX_SIMD_ARCH_FLAG(22, __AVX512VBMI2__) |          \
    _GLIBCXX_SIMD_ARCH_FLAG(23, __AVX512IFMA__) |           \
    _GLIBCXX_SIMD_ARCH_FLAG(24, __AVX512VNNI__) |           \
    _GLIBCXX_SIMD_ARCH_FLAG(25, __AVX512VPOPCNTDQ__) |      \
    _GLIBCXX_SIMD_ARCH_FLAG(26, __AVX512FP16__) |           \
    _GLIBCXX_SIMD_ARCH_FLAG(27, __AVX512BF16__) |           \
    _GLIBCXX_SIMD_ARCH_FLAG(28, __AVXIFMA__) |              \
    _GLIBCXX_SIMD_ARCH_FLAG(29, __AVXNECONVERT__) |         \
    _GLIBCXX_SIMD_ARCH_FLAG(30, __AVXVNNI__) |              \
    _GLIBCXX_SIMD_ARCH_FLAG(31, __AVXVNNIINT8__) |          \
    _GLIBCXX_SIMD_ARCH_FLAG(32, __AVXVNNIINT16__) |         \
    _GLIBCXX_SIMD_ARCH_FLAG(33, __AVX10_1__) |              \
    _GLIBCXX_SIMD_ARCH_FLAG(34, __AVX10_2__) |              \
    _GLIBCXX_SIMD_ARCH_FLAG(35, __AVX512VP2INTERSECT__) |   \
    _GLIBCXX_SIMD_ARCH_FLAG(36, __SSE4A__) |                \
    _GLIBCXX_SIMD_ARCH_FLAG(37, __FMA4__) |                 \
    _GLIBCXX_SIMD_ARCH_FLAG(38, __XOP__)                    \
  }
// Should this include __APX_F__? I don't think it's relevant for use in constexpr-if branches =>
// no ODR issue? The same could be said about several other flags above that are not checked
// anywhere.
| |
| struct _ArchTraits |
| { |
| __UINT64_TYPE__ _M_flags = _GLIBCXX_SIMD_ARCH_TRAITS_INIT; |
| |
| consteval bool |
| _M_test(int __bit) const |
| { return ((_M_flags >> __bit) & 1) == 1; } |
| |
| consteval bool |
| _M_have_mmx() const |
| { return _M_test(0); } |
| |
| consteval bool |
| _M_have_sse() const |
| { return _M_test(1); } |
| |
| consteval bool |
| _M_have_sse2() const |
| { return _M_test(2); } |
| |
| consteval bool |
| _M_have_sse3() const |
| { return _M_test(3); } |
| |
| consteval bool |
| _M_have_ssse3() const |
| { return _M_test(4); } |
| |
| consteval bool |
| _M_have_sse4_1() const |
| { return _M_test(5); } |
| |
| consteval bool |
| _M_have_sse4_2() const |
| { return _M_test(6); } |
| |
| consteval bool |
| _M_have_popcnt() const |
| { return _M_test(7); } |
| |
| consteval bool |
| _M_have_avx() const |
| { return _M_test(8); } |
| |
| consteval bool |
| _M_have_f16c() const |
| { return _M_test(9); } |
| |
| consteval bool |
| _M_have_bmi() const |
| { return _M_test(10); } |
| |
| consteval bool |
| _M_have_bmi2() const |
| { return _M_test(11); } |
| |
| consteval bool |
| _M_have_lzcnt() const |
| { return _M_test(12); } |
| |
| consteval bool |
| _M_have_avx2() const |
| { return _M_test(13); } |
| |
| consteval bool |
| _M_have_fma() const |
| { return _M_test(14); } |
| |
| consteval bool |
| _M_have_avx512f() const |
| { return _M_test(15); } |
| |
| consteval bool |
| _M_have_avx512cd() const |
| { return _M_test(16); } |
| |
| consteval bool |
| _M_have_avx512dq() const |
| { return _M_test(17); } |
| |
| consteval bool |
| _M_have_avx512bw() const |
| { return _M_test(18); } |
| |
| consteval bool |
| _M_have_avx512vl() const |
| { return _M_test(19); } |
| |
| consteval bool |
| _M_have_avx512bitalg() const |
| { return _M_test(20); } |
| |
| consteval bool |
| _M_have_avx512vbmi() const |
| { return _M_test(21); } |
| |
| consteval bool |
| _M_have_avx512vbmi2() const |
| { return _M_test(22); } |
| |
| consteval bool |
| _M_have_avx512ifma() const |
| { return _M_test(23); } |
| |
| consteval bool |
| _M_have_avx512vnni() const |
| { return _M_test(24); } |
| |
| consteval bool |
| _M_have_avx512vpopcntdq() const |
| { return _M_test(25); } |
| |
| consteval bool |
| _M_have_avx512fp16() const |
| { return _M_test(26); } |
| |
| consteval bool |
| _M_have_avx512bf16() const |
| { return _M_test(27); } |
| |
| consteval bool |
| _M_have_avxifma() const |
| { return _M_test(28); } |
| |
| consteval bool |
| _M_have_avxneconvert() const |
| { return _M_test(29); } |
| |
| consteval bool |
| _M_have_avxvnni() const |
| { return _M_test(30); } |
| |
| consteval bool |
| _M_have_avxvnniint8() const |
| { return _M_test(31); } |
| |
| consteval bool |
| _M_have_avxvnniint16() const |
| { return _M_test(32); } |
| |
| consteval bool |
| _M_have_avx10_1() const |
| { return _M_test(33); } |
| |
| consteval bool |
| _M_have_avx10_2() const |
| { return _M_test(34); } |
| |
| consteval bool |
| _M_have_avx512vp2intersect() const |
| { return _M_test(35); } |
| |
| consteval bool |
| _M_have_sse4a() const |
| { return _M_test(36); } |
| |
| consteval bool |
| _M_have_fma4() const |
| { return _M_test(37); } |
| |
| consteval bool |
| _M_have_xop() const |
| { return _M_test(38); } |
| |
| template <typename _Tp> |
| consteval bool |
| _M_eval_as_f32() const |
| { return is_same_v<_Tp, _Float16> && !_M_have_avx512fp16(); } |
| }; |
| |
| template <typename _Tp, _ArchTraits _Traits = {}> |
| consteval auto |
| __native_abi() |
| { |
| constexpr int __adj_sizeof = sizeof(_Tp) * (1 + is_same_v<_Tp, _Float16>); |
| if constexpr (!__vectorizable<_Tp>) |
| return _InvalidAbi(); |
| else if constexpr (_Traits._M_have_avx512fp16()) |
| return _Abi_t<64 / sizeof(_Tp), 1, _AbiVariant::_BitMask>(); |
| else if constexpr (_Traits._M_have_avx512f()) |
| return _Abi_t<64 / __adj_sizeof, 1, _AbiVariant::_BitMask>(); |
| else if constexpr (is_same_v<_Tp, _Float16> && !_Traits._M_have_f16c()) |
| return _ScalarAbi<1>(); |
| else if constexpr (_Traits._M_have_avx2()) |
| return _Abi_t<32 / __adj_sizeof, 1>(); |
| else if constexpr (_Traits._M_have_avx() && is_floating_point_v<_Tp>) |
| return _Abi_t<32 / __adj_sizeof, 1>(); |
| else if constexpr (_Traits._M_have_sse2()) |
| return _Abi_t<16 / __adj_sizeof, 1>(); |
| else if constexpr (_Traits._M_have_sse() && is_floating_point_v<_Tp> |
| && sizeof(_Tp) == sizeof(float)) |
| return _Abi_t<16 / __adj_sizeof, 1>(); |
| // no MMX: we can't emit EMMS where it would be necessary |
| else |
| return _ScalarAbi<1>(); |
| } |
| |
| #else |
| |
| // scalar fallback |
| struct _ArchTraits |
| { |
| __UINT64_TYPE__ _M_flags = 0; |
| |
| constexpr bool |
| _M_test(int __bit) const |
| { return ((_M_flags >> __bit) & 1) == 1; } |
| }; |
| |
| template <typename _Tp> |
| consteval auto |
| __native_abi() |
| { |
| if constexpr (!__vectorizable<_Tp>) |
| return _InvalidAbi(); |
| else |
| return _ScalarAbi<1>(); |
| } |
| |
| #endif |
| |
/** @internal
 * You must use this type as template argument to function templates that are not declared
 * always_inline (to avoid issues when linking code compiled with different compiler flags).
 *
 * It combines the architecture flags (_ArchTraits) with the optimization flags (_OptTraits).
 */
struct _TargetTraits
: _ArchTraits, _OptTraits
{};
| |
/** @internal
 * Alias for an ABI tag such that basic_vec<_Tp, __native_abi_t<_Tp>> stores one SIMD register of
 * optimal width.
 *
 * @tparam _Tp A vectorizable type.
 *
 * C++26 [simd.expos.abi]
 */
template <typename _Tp>
  using __native_abi_t = decltype(std::simd::__native_abi<_Tp>());
| |
| template <typename _Tp, int _Np, _TargetTraits _Target = {}> |
| consteval auto |
| __deduce_abi() |
| { |
| constexpr auto __native = std::simd::__native_abi<_Tp>(); |
| if constexpr (0 == __native._S_size || _Np <= 0) |
| return _InvalidAbi(); |
| else if constexpr (_Np == __native._S_size) |
| return __native; |
| else |
| return __native.template _S_resize<_Np>(); |
| } |
| |
/** @internal
 * Alias for an ABI tag @c A such that `basic_vec<_Tp, A>` stores @p _Np elements.
 *
 * The tag is the return type of __deduce_abi<_Tp, _Np>(), which is _InvalidAbi if no such tag
 * can be deduced.
 *
 * C++26 [simd.expos.abi]
 */
template <typename _Tp, int _Np>
  using __deduce_abi_t = decltype(std::simd::__deduce_abi<_Tp, _Np>());
| |
| /** @internal |
| * \c rebind implementation detail for basic_vec, and basic_mask where we know the destination |
| * value-type |
| */ |
| template <typename _Tp, int _Np, __abi_tag _A0, _ArchTraits = {}> |
| consteval auto |
| __abi_rebind() |
| { |
| if constexpr (_Np <= 0 || !__vectorizable<_Tp>) |
| return _InvalidAbi(); |
| |
| else if constexpr (__scalar_abi_tag<_A0>) |
| return _A0::template _S_resize<_Np>(); |
| |
| else |
| { |
| using _Native = remove_const_t<decltype(std::simd::__native_abi<_Tp>())>; |
| static_assert(0 != _Native::_S_size); |
| constexpr int __nreg = __div_ceil(_Np, _Native::_S_size); |
| |
| if constexpr (__scalar_abi_tag<_Native>) |
| return _Native::template _S_resize<_Np>(); |
| else |
| return _Abi_t<_Native::_S_size, 1, __filter_abi_variant(_A0::_S_variant, |
| _AbiVariant::_MaskVariants) |
| >::template _S_resize<_Np, __nreg>(); |
| } |
| } |
| |
| /** @internal |
| * @c rebind implementation detail for basic_mask. |
| * |
| * The important difference here is that we have no information about the actual value-type other |
| * than its @c sizeof. So `_Bytes == 8` could mean `complex<float>`, @c double, or @c int64_t. |
| * E.g. `_Np == 4` with AVX w/o AVX2 that's `vector(4) int`, `vector(4) long long`, or `2x |
| * vector(2) long long`. |
| * That's why this overload has the additional @p _IsOnlyResize parameter, which tells us that the |
| * value-type doesn't change. |
| */ |
| template <size_t _Bytes, int _Np, __abi_tag _A0, bool _IsOnlyResize, _ArchTraits _Traits = {}> |
| consteval auto |
| __abi_rebind() |
| { |
| if constexpr (_Bytes == 0 || _Np <= 0) |
| return _InvalidAbi(); |
| |
| else if constexpr (__scalar_abi_tag<_A0>) |
| return _A0::template _S_resize<_Np>(); |
| |
| #if _GLIBCXX_X86 |
| // AVX w/o AVX2: |
| // e.g. resize_t<8, mask<float, Whatever>> needs to be _Abi<8, 1> not _Abi<8, 2> |
| // We determine whether _A0 identifies an AVX vector by looking at the size of a native |
| // register. If it's 32, it's a YMM register, otherwise it's 16 or less. |
| else if constexpr (_IsOnlyResize |
| && _Traits._M_have_avx() && !_Traits._M_have_avx2() |
| && __bit_ceil(__div_ceil<unsigned>( |
| _A0::_S_size, _A0::_S_nreg)) * _Bytes == 32) |
| { |
| if constexpr (_Bytes == sizeof(double)) |
| return __abi_rebind<double, _Np, _A0>(); |
| else if constexpr (_Bytes == sizeof(float)) |
| return __abi_rebind<float, _Np, _A0>(); |
| else if constexpr (_Traits._M_have_f16c() && _Bytes == sizeof(_Float16)) |
| return __abi_rebind<_Float16, _Np, _A0>(); |
| else // impossible |
| static_assert(false); |
| } |
| #endif |
| |
| else |
| return __abi_rebind<__integer_from<_Bytes>, _Np, _A0>(); |
| } |
| |
| /** @internal |
| * Returns true unless _GLIBCXX_SIMD_COND_EXPLICIT_MASK_CONVERSION is defined. |
| * |
| * On IvyBridge, (vec<float> == 0.f) == (rebind_t<int, vec<float>> == 0) does not compile. It does |
| * compile on basically every other target, though. This is due to the difference in ABI tag: |
| * _Abi<8, 1, [...]> vs. _Abi<8, 2, [...]> (8 elements, 1 vs. 2 registers). |
| * I know how to define this funtion for libstdc++ to avoid interconvertible masks. The question |
| * is whether we can specify this in general for C++29. |
| * |
| * Idea: Is rebind_t<integer-from<...>, mask>::abi_type the same type as |
| * deduce-t<integer-from<...>, mask::size()>? If yes, it's the "better" ABI tag. However, this |
| * makes the conversion behavior dependent on compiler flags. Probably not what we want. |
| */ |
| template <typename _To, typename _From> |
| consteval bool |
| __is_mask_conversion_explicit([[maybe_unused]] size_t __b0, [[maybe_unused]] size_t __b1) |
| { |
| constexpr int __n = _To::_S_size; |
| static_assert(__n == _From::_S_size); |
| #ifndef _GLIBCXX_SIMD_COND_EXPLICIT_MASK_CONVERSION |
| /// C++26 [simd.mask.ctor] uses unconditional explicit |
| return true; |
| #else |
| if (__b0 != __b1) |
| return true; |
| |
| // everything is better than _ScalarAbi, except when converting to a single bool |
| if constexpr (__scalar_abi_tag<_To>) |
| return __n > 1; |
| else if constexpr (__scalar_abi_tag<_From>) |
| return true; |
| |
| // converting to a bit-mask is better |
| else if constexpr (_To::_S_is_vecmask != _From::_S_is_vecmask) |
| return _To::_S_is_vecmask; // to vector-mask is explicit |
| |
| // with vec-masks, fewer registers is better |
| else if constexpr (_From::_S_nreg != _To::_S_nreg) |
| return _From::_S_nreg < _To::_S_nreg; |
| |
| else |
| __builtin_unreachable(); |
| #endif |
| } |
| |
| /** @internal |
| * An alias for a signed integer type. |
| * |
| * libstdc++ unconditionally uses @c int here, since it matches the return type of |
| * 'Bit Operation Builtins' in GCC. |
| * |
| * C++26 [simd.expos.defn] |
| */ |
| using __simd_size_type = int; |
| |
| // integral_constant shortcut |
| template <__simd_size_type _Xp> |
| inline constexpr integral_constant<__simd_size_type, _Xp> __simd_size_c = {}; |
| |
// [simd.syn]
// Forward declarations of the two class templates and their size-based aliases.

// Data-parallel vector of _Tp; the ABI tag defaults to the native one for _Tp.
template <typename _Tp, typename _Ap = __native_abi_t<_Tp>>
  class basic_vec;

// basic_vec selected by element count; the count defaults to the native ABI's size.
template <typename _Tp, __simd_size_type _Np = __native_abi_t<_Tp>::_S_size>
  using vec = basic_vec<_Tp, __deduce_abi_t<_Tp, _Np>>;

// Mask type, parameterized on the element size in bytes rather than on the element type. The
// ABI tag defaults to the native one for the signed integer of that size.
template <size_t _Bytes, typename _Ap = __native_abi_t<__integer_from<_Bytes>>>
  class basic_mask;

// basic_mask selected by element type and element count.
template <typename _Tp, __simd_size_type _Np = __native_abi_t<_Tp>::_S_size>
  using mask = basic_mask<sizeof(_Tp), __deduce_abi_t<_Tp, _Np>>;
| |
| // [simd.ctor] load constructor constraints |
| template <typename _Tp, size_t _Np = -1uz> |
| concept __static_sized_range |
| = ranges::sized_range<_Tp> && requires(_Tp&& __r) { |
| typename integral_constant<size_t, ranges::size(__r)>; |
| requires (_Np == -1uz || ranges::size(__r) == _Np); |
| }; |
| |
| template <typename _Rg> |
| consteval size_t |
| __static_range_size(_Rg& __r) |
| { |
| if constexpr (requires { typename integral_constant<size_t, ranges::size(__r)>; }) |
| return ranges::size(__r); |
| else |
| return dynamic_extent; |
| } |
| |
// [simd.general] value-preserving
/** @internal
 * Satisfied if @p _From and @p _To are arithmetic types, @p _From converts implicitly to
 * @p _To, and the conversion cannot change any value:
 *  - no conversion from a signed type to an unsigned type,
 *  - @p _To has at least as many digits as @p _From,
 *  - the value range of @p _From is contained in the value range of @p _To.
 */
template <typename _From, typename _To>
  concept __arithmetic_only_value_preserving_convertible_to
    = convertible_to<_From, _To> && is_arithmetic_v<_From> && is_arithmetic_v<_To>
        && !(is_signed_v<_From> && is_unsigned_v<_To>)
        && numeric_limits<_From>::digits <= numeric_limits<_To>::digits
        && numeric_limits<_From>::max() <= numeric_limits<_To>::max()
        && numeric_limits<_From>::lowest() >= numeric_limits<_To>::lowest();

/** @internal
 * Satisfied if the conversion from @p _From to @p _To is a value-preserving conversion.
 *
 * Currently this only covers arithmetic types.
 *
 * C++26 [simd.general]
 */
template <typename _From, typename _To>
  concept __value_preserving_convertible_to
    = __arithmetic_only_value_preserving_convertible_to<_From, _To>;
| |
| // LWG4420 |
| template <typename _From, typename _To> |
| concept __explicitly_convertible_to = requires { |
| static_cast<_To>(declval<_From>()); |
| }; |
| |
| /** @internal |
| * C++26 [simd.expos] |
| */ |
| template<typename _Tp> |
| concept __constexpr_wrapper_like |
| = convertible_to<_Tp, decltype(_Tp::value)> |
| && equality_comparable_with<_Tp, decltype(_Tp::value)> |
| && bool_constant<_Tp() == _Tp::value>::value |
| && bool_constant<static_cast<decltype(_Tp::value)>(_Tp()) == _Tp::value>::value; |
| |
| // [simd.ctor] explicit(...) of broadcast ctor |
| template <auto _From, typename _To> |
| concept __non_narrowing_constexpr_conversion |
| = is_arithmetic_v<decltype(_From)> |
| && static_cast<decltype(_From)>(static_cast<_To>(_From)) == _From |
| && !(unsigned_integral<_To> && _From < decltype(_From)()) |
| && _From <= std::numeric_limits<_To>::max() |
| && _From >= std::numeric_limits<_To>::lowest(); |
| |
| // [simd.ctor] p4 |
| // This implements LWG4436 (submitted on 2025-10-28) |
| template <typename _From, typename _To> |
| concept __broadcast_constructible |
| = ((convertible_to<_From, _To> && !is_arithmetic_v<remove_cvref_t<_From>> |
| && !__constexpr_wrapper_like<remove_cvref_t<_From>>) // 4.1 |
| || __value_preserving_convertible_to<remove_cvref_t<_From>, _To> // 4.2 |
| || (__constexpr_wrapper_like<remove_cvref_t<_From>> // 4.3 |
| && __non_narrowing_constexpr_conversion<auto(remove_cvref_t<_From>::value), |
| _To>)); |
| |
| // __higher_floating_point_rank_than<_Tp, U> (_Tp has higher or equal floating point rank than U) |
| template <typename _From, typename _To> |
| consteval bool |
| __higher_floating_point_rank_than() |
| { |
| return floating_point<_From> && floating_point<_To> |
| && is_same_v<common_type_t<_From, _To>, _From> && !is_same_v<_From, _To>; |
| } |
| |
| // __higher_integer_rank_than<_Tp, U> (_Tp has higher or equal integer rank than U) |
| template <typename _From, typename _To> |
| consteval bool |
| __higher_integer_rank_than() |
| { |
| return integral<_From> && integral<_To> |
| && (sizeof(_From) > sizeof(_To) || is_same_v<common_type_t<_From, _To>, _From>) |
| && !is_same_v<_From, _To>; |
| } |
| |
| template <typename _From, typename _To> |
| concept __higher_rank_than |
| = __higher_floating_point_rank_than<_From, _To>() || __higher_integer_rank_than<_From, _To>(); |
| |
| struct __convert_flag; |
| |
| template <typename _From, typename _To, typename... _Flags> |
| concept __loadstore_convertible_to |
| = same_as<_From, _To> |
| || (__vectorizable<_From> && __vectorizable<_To> |
| && (__value_preserving_convertible_to<_From, _To> |
| || (__explicitly_convertible_to<_From, _To> |
| && (std::is_same_v<_Flags, __convert_flag> || ...)))); |
| |
| template <typename _From, typename _To> |
| concept __simd_generator_convertible_to |
| = std::convertible_to<_From, _To> |
| && (!is_arithmetic_v<_From> || __value_preserving_convertible_to<_From, _To>); |
| |
| template <typename _Fp, typename _Tp, __simd_size_type... _Is> |
| requires (__simd_generator_convertible_to< |
| decltype(declval<_Fp>()(__simd_size_c<_Is>)), _Tp> && ...) |
| constexpr void |
| __simd_generator_invokable_impl(integer_sequence<__simd_size_type, _Is...>); |
| |
| template <typename _Fp, typename _Tp, __simd_size_type _Np> |
| concept __simd_generator_invokable = requires { |
| __simd_generator_invokable_impl<_Fp, _Tp>(make_integer_sequence<__simd_size_type, _Np>()); |
| }; |
| |
| template <typename _Fp> |
| concept __index_permutation_function_sized = requires(_Fp const& __f) |
| { |
| { __f(0, 0) } -> std::integral; |
| }; |
| |
| template <typename _Fp, typename _Simd> |
| concept __index_permutation_function |
| = __index_permutation_function_sized<_Fp> || requires(_Fp const& __f) { |
| { __f(0) } -> std::integral; |
| }; |
| |
| /** @internal |
| * The value of the @c _Bytes template argument to a @c basic_mask specialization. |
| * |
| * C++26 [simd.expos.defn] |
| */ |
| template <typename _Tp> |
| constexpr size_t __mask_element_size = 0; |
| |
| template <size_t _Bytes, __abi_tag _Ap> |
| constexpr size_t __mask_element_size<basic_mask<_Bytes, _Ap>> = _Bytes; |
| |
| // [simd.expos] |
| template <typename _Vp> |
| concept __simd_vec_type |
| = same_as<_Vp, basic_vec<typename _Vp::value_type, typename _Vp::abi_type>> |
| && is_default_constructible_v<_Vp>; |
| |
| template <typename _Vp> |
| concept __simd_mask_type |
| = same_as<_Vp, basic_mask<__mask_element_size<_Vp>, typename _Vp::abi_type>> |
| && is_default_constructible_v<_Vp>; |
| |
| /** @internal |
| * Satisfied if @p _Tp is a data-parallel type. |
| */ |
| template <typename _Vp> |
| concept __simd_vec_or_mask_type = __simd_vec_type<_Vp> || __simd_mask_type<_Vp>; |
| |
| template <typename _Vp> |
| concept __simd_floating_point |
| = __simd_vec_type<_Vp> && floating_point<typename _Vp::value_type>; |
| |
| template <typename _Vp> |
| concept __simd_integral |
| = __simd_vec_type<_Vp> && integral<typename _Vp::value_type>; |
| |
| template <typename _Tp> |
| concept __converts_to_vec |
| = __simd_vec_type<decltype(declval<const _Tp&>() + declval<const _Tp&>())>; |
| |
| template <__converts_to_vec _Tp> |
| using __deduced_vec_t = decltype(declval<const _Tp&>() + declval<const _Tp&>()); |
| |
| template <typename _Vp, typename _Tp> |
| using __make_compatible_simd_t |
| = decltype([] { |
| using _Up = decltype(declval<const _Tp&>() + declval<const _Tp&>()); |
| if constexpr (__simd_vec_type<_Up>) |
| return _Up(); |
| else |
| return vec<_Up, _Vp::size()>(); |
| }()); |
| |
| template <typename _Tp> |
| concept __math_floating_point = __simd_floating_point<__deduced_vec_t<_Tp>>; |
| |
| template <typename _BinaryOperation, typename _Tp> |
| concept __reduction_binary_operation |
| = requires (const _BinaryOperation __binary_op, const vec<_Tp, 1> __v) { |
| { __binary_op(__v, __v) } -> same_as<vec<_Tp, 1>>; |
| }; |
| |
| /** @internal |
| * Returns the highest index @c i where `(__bits >> i) & 1` equals @c 1. |
| */ |
| [[__gnu__::__always_inline__]] |
| constexpr __simd_size_type |
| __highest_bit(std::unsigned_integral auto __bits) |
| { |
| using __gnu_cxx::__int_traits; |
| constexpr auto _Nd = __int_traits<decltype(__bits)>::__digits; |
| return _Nd - 1 - __countl_zero(__bits); |
| } |
| |
| template <__vectorizable _Tp, __simd_size_type _Np, __abi_tag _Ap> |
| using __similar_mask = basic_mask<sizeof(_Tp), decltype(__abi_rebind<_Tp, _Np, _Ap>())>; |
| |
| // Allow _Tp to be _InvalidInteger for __integer_from<16> |
| template <typename _Tp, __simd_size_type _Np, __abi_tag _Ap> |
| using __similar_vec = basic_vec<_Tp, decltype(__abi_rebind<_Tp, _Np, _Ap>())>; |
| |
| // LWG4470 [simd.expos] |
| template <size_t _Bytes, typename _Ap> |
| using __simd_vec_from_mask_t = __similar_vec<__integer_from<_Bytes>, _Ap::_S_size, _Ap>; |
| |
// __glibcxx_on_bad_value_preserving_cast is written as a call, i.e. followed by `()`.
#if !_GLIBCXX_SIMD_THROW_ON_BAD_VALUE
  // Default mode: the macro names a function that is declared but never defined.
  void __bad_value_preserving_cast(); // not defined

#define __glibcxx_on_bad_value_preserving_cast __bad_value_preserving_cast
#else // used for unit tests (also see P3844)
  // Test mode: the macro expands to a throw-expression of this empty class.
  class __bad_value_preserving_cast
  {};

#define __glibcxx_on_bad_value_preserving_cast throw __bad_value_preserving_cast
#endif
| |
| template <typename _To, typename _From> |
| #if _GLIBCXX_SIMD_THROW_ON_BAD_VALUE // see P3844 |
| [[__gnu__::__optimize__("exceptions")]] // work around potential -fno-exceptions |
| #endif |
| consteval _To |
| __value_preserving_cast(const _From& __x) |
| { |
| static_assert(is_arithmetic_v<_From>); |
| if constexpr (!__value_preserving_convertible_to<_From, _To>) |
| { |
| using _Up = typename __make_unsigned<_From>::__type; |
| if (static_cast<_Up>(static_cast<_To>(__x)) != static_cast<_Up>(__x)) |
| __glibcxx_on_bad_value_preserving_cast(); |
| else if constexpr (is_signed_v<_From> && is_unsigned_v<_To>) |
| { |
| if (__x < _From()) |
| __glibcxx_on_bad_value_preserving_cast(); |
| } |
| else if constexpr (unsigned_integral<_From> && signed_integral<_To>) |
| { |
| if (__x > numeric_limits<_To>::max()) |
| __glibcxx_on_bad_value_preserving_cast(); |
| } |
| } |
| return static_cast<_To>(__x); |
| } |
| |
| template <typename _From, typename _To> |
| concept __simd_vec_bcast_consteval |
| = __explicitly_convertible_to<_From, _To> |
| && is_arithmetic_v<remove_cvref_t<_From>> && convertible_to<_From, _To> |
| && !__value_preserving_convertible_to<remove_cvref_t<_From>, _To> |
| && (is_same_v<common_type_t<_From, _To>, _To> |
| || (is_same_v<remove_cvref_t<_From>, int> && is_integral_v<_To>) |
| || (is_same_v<remove_cvref_t<_From>, unsigned> && unsigned_integral<_To>)); |
| |
/** @internal
 * Minimal two-member aggregate. Unlike std::pair, which is not trivially copyable, this type
 * declares no special member functions.
 */
template <typename _Tp, typename _Up>
  struct __trivial_pair
  {
    _Tp _M_first;
    _Up _M_second;
  };
| |
| template <typename _From, typename _To> |
| concept __converts_trivially = convertible_to<_From, _To> |
| && sizeof(_From) == sizeof(_To) |
| && is_integral_v<_From> == is_integral_v<_To> |
| && is_floating_point_v<_From> == is_floating_point_v<_To>; |
| |
| [[__gnu__::__always_inline__]] |
| constexpr void |
| __bit_foreach(unsigned_integral auto __bits, auto&& __fun) |
| { |
| static_assert(sizeof(__bits) >= sizeof(int)); // avoid promotion to int |
| while (__bits) |
| { |
| __fun(__countr_zero(__bits)); |
| __bits &= (__bits - 1); |
| } |
| } |
| |
/** @internal
 * Optimized @c memcpy for use in partial loads and stores.
 *
 * The implementation uses at most two fixed-size power-of-2 @c memcpy calls and reduces the
 * number of branches to a minimum. The variable size is achieved by overlapping two @c memcpy
 * calls: the first copies the leading bytes, the second copies a block that ends exactly at
 * the last requested byte.
 *
 * The byte count `_Chunk * __n` is dispatched into buckets with an inclusive lower bound
 * (>= 32, >= 16, >= 8, >= 4, >= 2, == 1); a count of zero copies nothing. The size of each
 * tail copy is chosen for exactly these bucket bounds, which is why every threshold must be
 * tested with `>=`.
 *
 * NOTE(review): the bucket selection also depends on @p _Max, so not every combination of
 * @p _Chunk and @p _Max covers a full copy of `_Max` bytes. Presumably callers only request
 * strictly partial copies (`_Chunk * __n < _Max`) -- TODO confirm against the callers.
 *
 * @tparam _Chunk  Copies @p __n times @p _Chunk bytes.
 * @tparam _Max    Copy no more than @p _Max bytes.
 *
 * @param __dst    The destination pointer.
 * @param __src    The source pointer.
 * @param __n      The number of chunks that need to be copied.
 */
template <size_t _Chunk, size_t _Max>
  inline void
  __memcpy_chunks(byte* __restrict__ __dst, const byte* __restrict__ __src,
                  size_t __n)
  {
    static_assert(_Max <= 64);
    static_assert(__has_single_bit(_Chunk) && _Chunk <= 8);
    size_t __bytes = _Chunk * __n;
    if (__builtin_constant_p(__bytes))
      { // If __n is known via constant propagation use a single memcpy call. Since this is still
        // a fixed-size memcpy to the compiler, this leaves more room for optimization.
        __builtin_memcpy(__dst, __src, __bytes);
      }
    else if (__bytes >= 32 && _Max > 32)
      { // 32 leading bytes + 32 bytes ending at the last byte
        __builtin_memcpy(__dst, __src, 32);
        __bytes -= 32;
        __builtin_memcpy(__dst + __bytes, __src + __bytes, 32);
      }
    else if (__bytes >= 16 && _Max > 16)
      {
        __builtin_memcpy(__dst, __src, 16);
        if constexpr (_Chunk == 8)
          { // only 16 or 24 bytes are possible: an 8-byte tail is sufficient
            __bytes -= 8;
            __builtin_memcpy(__dst + __bytes, __src + __bytes, 8);
          }
        else
          {
            __bytes -= 16;
            __builtin_memcpy(__dst + __bytes, __src + __bytes, 16);
          }
      }
    else if (__bytes >= 8 && _Max > 8)
      {
        __builtin_memcpy(__dst, __src, 8);
        // _Chunk == 8: exactly 8 bytes were requested, nothing is left to copy
        if constexpr (_Chunk == 4)
          { // only 8 or 12 bytes are possible: a 4-byte tail is sufficient
            __bytes -= 4;
            __builtin_memcpy(__dst + __bytes, __src + __bytes, 4);
          }
        else if constexpr (_Chunk < 4)
          {
            __bytes -= 8;
            __builtin_memcpy(__dst + __bytes, __src + __bytes, 8);
          }
      }
    else if (__bytes >= 4 && _Max > 4)
      {
        __builtin_memcpy(__dst, __src, 4);
        // _Chunk == 4: exactly 4 bytes were requested, nothing is left to copy
        if constexpr (_Chunk == 2)
          { // only 4 or 6 bytes are possible: a 2-byte tail is sufficient
            __bytes -= 2;
            __builtin_memcpy(__dst + __bytes, __src + __bytes, 2);
          }
        else if constexpr (_Chunk == 1)
          {
            __bytes -= 4;
            __builtin_memcpy(__dst + __bytes, __src + __bytes, 4);
          }
      }
    else if (__bytes >= 2)
      {
        __builtin_memcpy(__dst, __src, 2);
        if constexpr (_Chunk == 2)
          {
            __bytes -= 2;
            __builtin_memcpy(__dst + __bytes, __src + __bytes, 2);
          }
        else if constexpr (_Chunk == 1)
          { // 2 or 3 bytes: a 1-byte tail is sufficient
            __bytes -= 1;
            __builtin_memcpy(__dst + __bytes, __src + __bytes, 1);
          }
      }
    else if (__bytes == 1)
      __builtin_memcpy(__dst, __src, 1);
  }
| |
| // [simd.reductions] identity_element = *see below* |
| template <typename _Tp, typename _BinaryOperation> |
| requires __is_one_of<_BinaryOperation, |
| plus<>, multiplies<>, bit_and<>, bit_or<>, bit_xor<>>::value |
| consteval _Tp |
| __default_identity_element() |
| { |
| if constexpr (same_as<_BinaryOperation, multiplies<>>) |
| return _Tp(1); |
| else if constexpr (same_as<_BinaryOperation, bit_and<>>) |
| return _Tp(~_Tp()); |
| else |
| return _Tp(0); |
| } |
| } // namespace simd |
| _GLIBCXX_END_NAMESPACE_VERSION |
| } // namespace std |
| |
| #pragma GCC diagnostic pop |
| #endif // C++26 |
| #endif // _GLIBCXX_SIMD_DETAILS_H |