 // libstdc++-v3/include/bits/simd_details.h (GCC source tree)

 // Implementation of <simd> -*- C++ -*-

 // Copyright The GNU Toolchain Authors.
 //
 // This file is part of the GNU ISO C++ Library.  This library is free
 // software; you can redistribute it and/or modify it under the
 // terms of the GNU General Public License as published by the
 // Free Software Foundation; either version 3, or (at your option)
 // any later version.

 // This library is distributed in the hope that it will be useful,
 // but WITHOUT ANY WARRANTY; without even the implied warranty of
 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 // GNU General Public License for more details.

 // Under Section 7 of GPL version 3, you are granted additional
 // permissions described in the GCC Runtime Library Exception, version
 // 3.1, as published by the Free Software Foundation.

 // You should have received a copy of the GNU General Public License and
 // a copy of the GCC Runtime Library Exception along with this program;
 // see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
 // <http://www.gnu.org/licenses/>.

 #ifndef _GLIBCXX_SIMD_DETAILS_H
 #define _GLIBCXX_SIMD_DETAILS_H 1

 #ifdef _GLIBCXX_SYSHDR
 #pragma GCC system_header
 #endif

 #if __cplusplus >= 202400L

 #include <bit>
 #include <bits/c++config.h> // _GLIBCXX_FLOAT_IS_IEEE_BINARY32
 #include <bits/stl_function.h> // plus, minus, multiplies, ...
 #include <bits/utility.h> // integer_sequence, etc.
 #include <cmath> // for math_errhandling :(
 #include <concepts>
 #include <cstdint>
 #include <limits>
 #include <span> // for dynamic_extent

 // Hard requirement of this implementation: bytes must have exactly 8 bits.
 #if __CHAR_BIT__ != 8
 // There are simply too many constants and bit operators that currently depend on CHAR_BIT == 8.
 // Generalization to CHAR_BIT != 8 does not make sense without testability (i.e. a test target).
 #error "<simd> is not supported for CHAR_BIT != 8"
 #endif

 // psabi warnings are bogus because the ABI of the internal types never leaks into user code
 // NOTE(review): the matching '#pragma GCC diagnostic pop' is not visible in this part of the
 // file; presumably it follows further down.
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wpsabi"

 // _GLIBCXX_X86 is always defined (to 1 on x86-64 and i386, otherwise to 0), so that it can be
 // tested with a plain '#if _GLIBCXX_X86'.
 #if defined __x86_64__ || defined __i386__
 #define _GLIBCXX_X86 1
 #else
 #define _GLIBCXX_X86 0
 #endif

 #ifndef _GLIBCXX_SIMD_NOEXCEPT
 /** @internal
  * For unit-testing preconditions, use this macro to remove noexcept.
  *
  * Unless it was already defined before this point, it expands to 'noexcept'.
  */
 #define _GLIBCXX_SIMD_NOEXCEPT noexcept
 #endif

 // Stringification helpers: _GLIBCXX_SIMD_TOSTRING_IMPL applies '#' to its argument;
 // _GLIBCXX_SIMD_TOSTRING forwards through one more macro level so that the argument is
 // macro-expanded before it is stringified.
 #define _GLIBCXX_SIMD_TOSTRING_IMPL(x) #x
 #define _GLIBCXX_SIMD_TOSTRING(x) _GLIBCXX_SIMD_TOSTRING_IMPL(x)

 // This is used for unit-testing precondition checking
 // In this definition only 'expr' is forwarded to __glibcxx_assert; 'msg' and any additional
 // arguments are accepted but not used.
 #define __glibcxx_simd_precondition(expr, msg, ...)                                                \
   __glibcxx_assert(expr)

 namespace std _GLIBCXX_VISIBILITY(default)
 {
 _GLIBCXX_BEGIN_NAMESPACE_VERSION

 namespace simd
 {
   /** @internal
    * Primary variable template. Its initializer is a lambda whose body is a static_assert(false),
    * so using the primary template produces the diagnostic "invalid __iota specialization".
    *
    * NOTE(review): usable specializations are presumably provided elsewhere; none is visible in
    * this part of the file.
    */
   template <typename _Tp>
     inline constexpr _Tp
     __iota = [] { static_assert(false, "invalid __iota specialization"); }();

   // [simd.general] vectorizable types

   /** @internal
    * Satisfied if @p _Tp is cv-unqualified and either
    * - an integral type other than bool that is no larger than unsigned long long, or
    * - a floating-point type that is no larger than double.
    *
    * If __STDCPP_BFLOAT16_T__ is defined, __gnu_cxx::__bfloat16_t is explicitly excluded.
    */
   template <typename _Tp>
     concept __vectorizable_scalar
       = same_as<remove_cv_t<_Tp>, _Tp>
 #ifdef __STDCPP_BFLOAT16_T__
 	  && !same_as<_Tp, __gnu_cxx::__bfloat16_t>
 #endif
 	  && ((integral<_Tp> && sizeof(_Tp) <= sizeof(0ULL) && !same_as<_Tp, bool>)
 		 || (floating_point<_Tp> && sizeof(_Tp) <= sizeof(double)));

   // [simd.general] p2
   // As defined here, the vectorizable types are exactly the vectorizable scalar types.
   template <typename _Tp>
     concept __vectorizable = __vectorizable_scalar<_Tp>;

   /** @internal
    * Describes variants of _Abi.
    *
    * The enumerators are bit patterns (the underlying type is unsigned long long). _BitMask is a
    * single flag, whereas _MaskVariants is a mask covering the low four bits; it is passed to
    * __filter_abi_variant to extract the mask-related bits of a variant value.
    */
   enum class _AbiVariant : unsigned long long
   {
     _BitMask      = 0x01, // AVX512 bit-masks
     _MaskVariants = 0x0f, // vector masks if bits [0:3] are 0
   };

   /** @internal
    * Mask @p __in with the union of all @p __to_keep values.
    *
    * The result has a bit set iff that bit is set in @p __in and in at least one of the
    * @p __to_keep arguments. At least one @p __to_keep argument is required.
    */
   consteval _AbiVariant
   __filter_abi_variant(_AbiVariant __in, same_as<_AbiVariant> auto... __to_keep)
   {
     using _Bits = underlying_type_t<_AbiVariant>;
     const _Bits __keep_mask = (static_cast<_Bits>(__to_keep) | ...);
     const _Bits __in_bits = static_cast<_Bits>(__in);
     return static_cast<_AbiVariant>(__in_bits & __keep_mask);
   }

   /** @internal
    * Type used whenever no valid integer/value type exists.
    *
    * It is an empty class type; e.g. __integer_from yields it when no signed integer type of the
    * requested size is found.
    */
   struct _InvalidInteger
   {};

   /** @internal
    * Alias for a signed integer type T such that sizeof(T) equals _Bytes.
    *
    * The candidates are tried in the order signed char, short, int, long long; the first one
    * whose size matches is used ('long' is never selected). If none matches, the alias denotes
    * _InvalidInteger.
    *
    * C++26 [simd.expos.defn]
    */
   template <size_t _Bytes>
     using __integer_from
       = decltype([] consteval {
 	  if constexpr (sizeof(signed char) == _Bytes)
 	    return static_cast<signed char>(0);
 	  else if constexpr (sizeof(signed short) == _Bytes)
 	    return static_cast<signed short>(0);
 	  else if constexpr (sizeof(signed int) == _Bytes)
 	    return static_cast<signed int>(0);
 	  else if constexpr (sizeof(signed long long) == _Bytes)
 	    return static_cast<signed long long>(0);
 	  else
 	    return _InvalidInteger();
 	}());

   /** @internal
    * Alias for an unsigned integer type T such that sizeof(T) equals _Bytes.
    *
    * Defined as the unsigned counterpart of __integer_from<_Bytes>; @p _Bytes therefore has to be
    * a size for which __integer_from yields an integer type.
    */
   template <size_t _Bytes>
     using _UInt = make_unsigned_t<__integer_from<_Bytes>>;

   /** @internal
    * Divide @p __x by @p __y while rounding up instead of down.
    *
    * Preconditions: __x >= 0 && __y > 0.
    *
    * The result is computed from quotient and remainder rather than as (__x + __y - 1) / __y, so
    * that no intermediate value can overflow (or wrap around, for unsigned types) when @p __x is
    * close to the maximum value of _Tp.
    *
    * @return The smallest integer not less than __x / __y.
    */
   template <typename _Tp>
     consteval _Tp
     __div_ceil(_Tp __x, _Tp __y)
     { return __x / __y + _Tp(__x % __y != 0); }

   /** @internal
    * Alias for an unsigned integer type that can store at least @p _NBits bits.
    *
    * The bit count is rounded up to a power of two and then converted to whole bytes, e.g.
    * 1 to 8 bits use 1 byte, 9 to 16 bits use 2 bytes, 17 to 32 bits use 4 bytes, and 33 to 64 bits
    * use 8 bytes. @p _NBits must be positive and must not exceed the number of bits in
    * unsigned long long.
    */
   template <int _NBits>
     requires (_NBits > 0 && _NBits <= numeric_limits<unsigned long long>::digits)
     using _Bitmask = _UInt<__div_ceil(__bit_ceil(unsigned(_NBits)), unsigned(__CHAR_BIT__))>;

   /** @internal
    * Map a given type @p _Tp to an equivalent type.
    *
    * This helps with reducing the necessary branches and casts in the implementation as well as
    * reducing the number of template instantiations.
    *
    * The primary template maps every type to itself; specializations map selected types to a
    * different canonical type.
    */
   template <typename _Tp>
     struct __canonical_vec_type
     { using type = _Tp; };

   // Shortcut for the nested 'type' of __canonical_vec_type.
   template <typename _Tp>
     using __canonical_vec_type_t = typename __canonical_vec_type<_Tp>::type;

 // (unsigned) long is mapped to the (unsigned) int or long long type of equal size. 'int' is
 // preferred if it matches. If long matches neither, it is left unchanged.
 #if __SIZEOF_INT__ == __SIZEOF_LONG__
   template <>
     struct __canonical_vec_type<long>
     { using type = int; };

   template <>
     struct __canonical_vec_type<unsigned long>
     { using type = unsigned int; };
 #elif __SIZEOF_LONG_LONG__ == __SIZEOF_LONG__
   template <>
     struct __canonical_vec_type<long>
     { using type = long long; };

   template <>
     struct __canonical_vec_type<unsigned long>
     { using type = unsigned long long; };
 #endif

   // Enumeration types map to the canonical type of their underlying type.
   template <typename _Tp>
     requires std::is_enum_v<_Tp>
     struct __canonical_vec_type<_Tp>
     { using type = __canonical_vec_type<std::underlying_type_t<_Tp>>::type; };

   // Plain char maps to unsigned char or signed char, depending on whether char is unsigned
   // (__CHAR_UNSIGNED__).
   template <>
     struct __canonical_vec_type<char>
 #if __CHAR_UNSIGNED__
     { using type = unsigned char; };
 #else
     { using type = signed char; };
 #endif

   // char8_t, char16_t, and char32_t map to unsigned integer types.
   template <>
     struct __canonical_vec_type<char8_t>
     { using type = unsigned char; };

   template <>
     struct __canonical_vec_type<char16_t>
     { using type = uint_least16_t; };

   template <>
     struct __canonical_vec_type<char32_t>
     { using type = uint_least32_t; };

   // wchar_t maps to the signed or unsigned integer type of equal size, matching the signedness
   // of wchar_t.
   template <>
     struct __canonical_vec_type<wchar_t>
     {
       using type = std::__conditional_t<std::is_signed_v<wchar_t>,
 					simd::__integer_from<sizeof(wchar_t)>,
 					simd::_UInt<sizeof(wchar_t)>>;
     };

 // _Float64 maps to double, if _Float64 is available (__FLT64_DIG__) and double is IEEE binary64.
 #if defined(__FLT64_DIG__) && defined(_GLIBCXX_DOUBLE_IS_IEEE_BINARY64)
   template <>
     struct __canonical_vec_type<_Float64>
     { using type = double; };
 #endif

 // _Float32 maps to float, if _Float32 is available (__FLT32_DIG__) and float is IEEE binary32.
 #if defined(__FLT32_DIG__) && defined(_GLIBCXX_FLOAT_IS_IEEE_BINARY32)
   template <>
     struct __canonical_vec_type<_Float32>
     { using type = float; };
 #endif

   /** @internal
    * This ABI tag describes basic_vec objects that store one element per data member and basic_mask
    * objects that store one bool per data member.
    *
    * @tparam _Np   The number of elements, which also matches the number of data members in
    *               basic_vec and basic_mask.
    */
   template <int _Np = 1>
     struct _ScalarAbi
     {
       static constexpr int _S_size = _Np;

       // one "register" per element
       static constexpr int _S_nreg = _Np;

       // no variant bits are set
       static constexpr _AbiVariant _S_variant = {};

       // storage type for one element: the canonical type of _Tp
       template <typename _Tp>
 	using _DataType = __canonical_vec_type_t<_Tp>;

       static constexpr bool _S_is_vecmask = false;

       // in principle a bool is a 1-bit bitmask, but this is asking for an AVX512 bitmask
       static constexpr bool _S_is_bitmask = false;

       // mask storage type for one element, independent of the element size
       template <size_t>
 	using _MaskDataType = bool;

       // Return the _ScalarAbi tag for _N2 elements. _Nreg2 exists for interface parity with
       // _Abi::_S_resize and must equal _N2.
       template <int _N2, int _Nreg2 = _N2>
 	static consteval _ScalarAbi<_N2>
 	_S_resize()
 	{
 	  static_assert(_N2 == _Nreg2);
 	  return {};
 	}
     };

   /** @internal
    * This ABI tag describes basic_vec objects that store one or more objects declared with the
    * [[gnu::vector_size(N)]] attribute.
    * Applied to basic_mask objects, this ABI tag either describes corresponding vector-mask objects
    * or bit-mask objects. Which one is used is determined via @p _Var.
    *
    * @tparam _Np    The number of elements.
    * @tparam _Nreg  The number of registers needed to store @p _Np elements.
    * @tparam _Var   Determines how complex value-types are laid out and whether mask types use
    *                bit-masks or vector-masks.
    */
   template <int _Np, int _Nreg, underlying_type_t<_AbiVariant> _Var>
     struct _Abi
     {
       static constexpr int _S_size = _Np;

       /** @internal
        * The number of registers needed to represent one basic_vec for the element type that was
        * used on ABI deduction.
        *
        * Examples:
        * - '_Abi< 8, 2>' for 'int' is 2x 128-bit
        * - '_Abi< 9, 3>' for 'int' is 2x 128-bit and 1x 32-bit
        * - '_Abi<10, 3>' for 'int' is 2x 128-bit and 1x 64-bit
        * - '_Abi<10, 1>' for 'int' is 1x 512-bit
        * - '_Abi<10, 2>' for 'int' is 1x 256-bit and 1x 64-bit
        */
       static constexpr int _S_nreg = _Nreg;

       static_assert(_S_size > 0);
       static_assert(_S_nreg > 0);

       static constexpr _AbiVariant _S_variant = static_cast<_AbiVariant>(_Var);

       // true iff the _BitMask bit is set in _Var
       static constexpr bool _S_is_bitmask
 	= __filter_abi_variant(_S_variant, _AbiVariant::_BitMask) == _AbiVariant::_BitMask;

       static constexpr bool _S_is_vecmask = !_S_is_bitmask;

       /** @internal
        * Storage type for the elements; only valid if _S_nreg == 1.
        *
        * For a single element this is the canonical type of _Tp. Otherwise it is a GNU vector of
        * the canonical type with the element count rounded up to a power of two.
        */
       template <typename _Tp>
 	using _DataType = decltype([] {
 			    static_assert(_S_nreg == 1);
 			    if constexpr (_S_size == 1)
 			      return __canonical_vec_type_t<_Tp>();
 			    else
 			      {
 				constexpr int __n = __bit_ceil(unsigned(_S_size));
 				using _Vp [[__gnu__::__vector_size__(sizeof(_Tp) * __n)]]
 				  = __canonical_vec_type_t<_Tp>;
 				return _Vp();
 			      }
 			  }());

       /** @internal
        * Storage type for masks of elements with size @p _Bytes; only valid if _S_nreg == 1.
        *
        * - one element: bool
        * - vector-mask: GNU vector of __integer_from<_Bytes> with the element count rounded up to a
        *   power of two
        * - bit-mask: _Bitmask<_S_size>
        *
        * NOTE(review): the '_Nreg > 1' branch cannot be taken while the static_assert above holds.
        */
       template <size_t _Bytes>
 	using _MaskDataType
 	  = decltype([] {
 	      static_assert(_S_nreg == 1);
 	      if constexpr (_S_size == 1)
 		return bool();
 	      else if constexpr (_S_is_vecmask)
 		{
 		  constexpr unsigned __vbytes = _Bytes * __bit_ceil(unsigned(_S_size));
 		  using _Vp [[__gnu__::__vector_size__(__vbytes)]] = __integer_from<_Bytes>;
 		  return _Vp();
 		}
 	      else if constexpr (_Nreg > 1)
 		return _InvalidInteger();
 	      else
 		return _Bitmask<_S_size>();
 	    }());

       /** @internal
        * Return the ABI tag for @p _N2 elements with the same variant bits.
        *
        * @p _Nreg2 defaults to _N2 / _S_size rounded up. A request for a single element always
        * yields _Abi<1, 1, _Var>, irrespective of @p _Nreg2.
        */
       template <int _N2, int _Nreg2 = __div_ceil(_N2, _S_size)>
 	static consteval auto
 	_S_resize()
 	{
 	  if constexpr (_N2 == 1)
 	    return _Abi<1, 1, _Var>();
 	  else
 	    return _Abi<_N2, _Nreg2, _Var>();
 	}
     };

   /** @internal
    * Alias for an _Abi specialization where the _AbiVariant bits are combined into a single integer
    * value.
    *
    * Rationale: Consider diagnostic output and mangling of e.g. vec<int, 4> with AVX512. That's an
    * alias for std::simd::basic_vec<int, std::simd::_Abi<4, 1, 1ull>>. If _AbiVariant were the
    * template argument type of _Abi, the diagnostic output would be 'std::simd::basic_vec<int,
    * std::simd::_Abi<4, 1, (std::simd::_AbiVariant)std::simd::_AbiVariant::_BitMask>>'. That's a lot
    * longer, requires longer mangled names, and bakes the names of the enumerators into the ABI. As
    * soon as bits of multiple _AbiVariants are combined, this becomes hard to parse for humans
    * anyway.
    *
    * With an empty @p _Vs pack the combined value is 0, i.e. no variant bit is set.
    */
   template <int _Np, int _Nreg, _AbiVariant... _Vs>
     using _Abi_t = _Abi<_Np, _Nreg, (static_cast<underlying_type_t<_AbiVariant>>(_Vs) | ... | 0)>;

   /** @internal
    * This type is used whenever ABI tag deduction can't give a useful answer.
    *
    * Its _S_size is 0, which callers can use to recognize it.
    */
   struct _InvalidAbi
   { static constexpr int _S_size = 0; };

   /** @internal
    * Satisfied if @p _Tp is a valid simd ABI tag. This is a necessary but not sufficient condition
    * for an enabled basic_vec/basic_mask specialization.
    *
    * Requirements checked here:
    * - _Tp::_S_variant is a const _AbiVariant,
    * - 1 <= _Tp::_S_nreg <= _Tp::_S_size,
    * - resizing to its own size and register count yields _Tp again.
    */
   template <typename _Tp>
     concept __abi_tag
       = same_as<decltype(_Tp::_S_variant), const _AbiVariant>
 	  && (_Tp::_S_size >= _Tp::_S_nreg) && (_Tp::_S_nreg >= 1)
 	  && requires(_Tp __x) {
 	    { __x.template _S_resize<_Tp::_S_size, _Tp::_S_nreg>() } -> same_as<_Tp>;
 	  };

   // Satisfied if _Tp is a specialization of _ScalarAbi (and a valid ABI tag).
   template <typename _Tp>
     concept __scalar_abi_tag
       = same_as<_Tp, _ScalarAbi<_Tp::_S_size>> && __abi_tag<_Tp>;

   // Determine if math functions must *raise* floating-point exceptions.
   // math_errhandling may expand to an extern symbol, in which case we must assume fp exceptions
   // need to be considered. A conforming C library must define math_errhandling, but in case it
   // isn't defined we simply use the fallback.
   //
   // The function is called with an int argument. The (int) overload is an exact match and is
   // therefore preferred whenever it is declared and its constraint is satisfied, i.e. when
   // 'math_errhandling & MATH_ERREXCEPT' is usable as a constant expression. Otherwise overload
   // resolution falls back to the (float) overload.
 #ifdef math_errhandling
   template <int = 0>
     requires requires { typename bool_constant<0 != (math_errhandling & MATH_ERREXCEPT)>; }
     consteval bool
     __handle_fpexcept_impl(int)
     { return 0 != (math_errhandling & MATH_ERREXCEPT); }
 #endif

   // Fallback if math_errhandling doesn't work: implement correct exception behavior.
   consteval bool
   __handle_fpexcept_impl(float)
   { return true; }

   /** @internal
    * This type can be used as a template parameter for avoiding ODR violations, where code needs to
    * differ depending on optimization flags (mostly fp-math related).
    *
    * Layout of _M_build_flags (as set by the default member initializer):
    * - bit 0:     set unless __NO_TRAPPING_MATH__
    * - bit 1:     __FAST_MATH__
    * - bit 2:     __FINITE_MATH_ONLY__
    * - bit 3:     __NO_SIGNED_ZEROS__
    * - bit 4:     __RECIPROCAL_MATH__
    * - bit 5:     __NO_MATH_ERRNO__
    * - bit 6:     __ASSOCIATIVE_MATH__
    * - bits 7-9:  __FLT_EVAL_METHOD__ (0, 1, 2; any other value is encoded as 3)
    * - bit 10:    C Annex G conformance of complex arithmetic
    * - bit 11:    __SUPPORT_SNAN__
    * - bit 12:    result of __handle_fpexcept_impl(0)
    */
   struct _OptTraits
   {
     // true iff bit number __bit of _M_build_flags is set
     consteval bool
     _M_test(int __bit) const
     { return ((_M_build_flags >> __bit) & 1) == 1; }

     // true iff floating-point operations can signal an exception (allow non-default handler)
     consteval bool
     _M_fp_may_signal() const
     { return _M_test(0); }

     // true iff floating-point operations can raise an exception flag
     consteval bool
     _M_fp_may_raise() const
     { return _M_test(12); }

     consteval bool
     _M_fast_math() const
     { return _M_test(1); }

     consteval bool
     _M_finite_math_only() const
     { return _M_test(2); }

     consteval bool
     _M_no_signed_zeros() const
     { return _M_test(3); }

     // negation of _M_no_signed_zeros()
     consteval bool
     _M_signed_zeros() const
     { return !_M_test(3); }

     consteval bool
     _M_reciprocal_math() const
     { return _M_test(4); }

     consteval bool
     _M_no_math_errno() const
     { return _M_test(5); }

     // negation of _M_no_math_errno()
     consteval bool
     _M_math_errno() const
     { return !_M_test(5); }

     consteval bool
     _M_associative_math() const
     { return _M_test(6); }

     // requires the Annex G bit and that finite-math-only is not in effect
     consteval bool
     _M_conforming_to_STDC_annex_G() const
     { return _M_test(10) && !_M_finite_math_only(); }

     consteval bool
     _M_support_snan() const
     { return _M_test(11); }

     __UINT64_TYPE__ _M_build_flags
       = 0
 #if !__NO_TRAPPING_MATH__
 	  + (1 << 0)
 #endif
 	  + (__handle_fpexcept_impl(0) << 12)
 #if __FAST_MATH__
 	  + (1 << 1)
 #endif
 #if __FINITE_MATH_ONLY__
 	  + (1 << 2)
 #endif
 #if __NO_SIGNED_ZEROS__
 	  + (1 << 3)
 #endif
 #if __RECIPROCAL_MATH__
 	  + (1 << 4)
 #endif
 #if __NO_MATH_ERRNO__
 	  + (1 << 5)
 #endif
 #if __ASSOCIATIVE_MATH__
 	  + (1 << 6)
 #endif
 	// bits 7, 8, and 9 reserved for __FLT_EVAL_METHOD__
 #if __FLT_EVAL_METHOD__ == 1
 	  + (1 << 7)
 #elif __FLT_EVAL_METHOD__ == 2
 	  + (2 << 7)
 #elif __FLT_EVAL_METHOD__ != 0
 	  + (3 << 7)
 #endif

 	// C Annex G defines the behavior of complex<T> where T is IEC60559 floating-point. If
 	// __STDC_IEC_60559_COMPLEX__ is defined then Annex G is implemented - and simd<complex>
 	// will do so as well. However, Clang never defines the macro.
 #if defined __STDC_IEC_60559_COMPLEX__ || defined __STDC_IEC_559_COMPLEX__ || defined _GLIBCXX_CLANG
 	  + (1 << 10)
 #endif
 #if __SUPPORT_SNAN__
 	  + (1 << 11)
 #endif
 	;
   };

   /** @internal
    * Compare the string @p __s against "1".
    *
    * @return true iff @p __s is non-null and consists of exactly the one character '1'.
    */
   consteval bool
   __streq_to_1(const char* __s)
   {
     if (__s == nullptr)
       return false;
     const bool __starts_with_1 = __s[0] == '1';
     return __starts_with_1 && __s[1] == '\0';
   }

   /** @internal
    * If the macro given as @p feat is defined to 1, expands to a bit set at position @p off.
    * Otherwise, expand to zero.
    *
    * @p feat is passed on as a macro argument (rather than being used with '#' directly), so it is
    * macro-expanded before it is turned into a string: a feature macro defined to 1 yields "1",
    * an undefined feature macro yields its own name.
    *
    * @p off is parenthesized so that any expression can be used as the bit position.
    */
 #define _GLIBCXX_SIMD_ARCH_FLAG(off, feat)                                                         \
   (static_cast<__UINT64_TYPE__>(std::simd::__streq_to_1(_GLIBCXX_SIMD_TOSTRING_IMPL(feat)))        \
      << (off))

 #if _GLIBCXX_X86

 // Braced initializer for _ArchTraits::_M_flags: one bit per x86 ISA extension, set iff the
 // corresponding predefined macro is defined to 1. The bit positions used here have to agree with
 // the positions tested by the _M_have_* member functions of _ArchTraits.
 #define _GLIBCXX_SIMD_ARCH_TRAITS_INIT {                      \
   _GLIBCXX_SIMD_ARCH_FLAG(0, __MMX__)                         \
     | _GLIBCXX_SIMD_ARCH_FLAG( 1, __SSE__)                    \
     | _GLIBCXX_SIMD_ARCH_FLAG( 2, __SSE2__)                   \
     | _GLIBCXX_SIMD_ARCH_FLAG( 3, __SSE3__)                   \
     | _GLIBCXX_SIMD_ARCH_FLAG( 4, __SSSE3__)                  \
     | _GLIBCXX_SIMD_ARCH_FLAG( 5, __SSE4_1__)                 \
     | _GLIBCXX_SIMD_ARCH_FLAG( 6, __SSE4_2__)                 \
     | _GLIBCXX_SIMD_ARCH_FLAG( 7, __POPCNT__)                 \
     | _GLIBCXX_SIMD_ARCH_FLAG( 8, __AVX__)                    \
     | _GLIBCXX_SIMD_ARCH_FLAG( 9, __F16C__)                   \
     | _GLIBCXX_SIMD_ARCH_FLAG(10, __BMI__)                    \
     | _GLIBCXX_SIMD_ARCH_FLAG(11, __BMI2__)                   \
     | _GLIBCXX_SIMD_ARCH_FLAG(12, __LZCNT__)                  \
     | _GLIBCXX_SIMD_ARCH_FLAG(13, __AVX2__)                   \
     | _GLIBCXX_SIMD_ARCH_FLAG(14, __FMA__)                    \
     | _GLIBCXX_SIMD_ARCH_FLAG(15, __AVX512F__)                \
     | _GLIBCXX_SIMD_ARCH_FLAG(16, __AVX512CD__)               \
     | _GLIBCXX_SIMD_ARCH_FLAG(17, __AVX512DQ__)               \
     | _GLIBCXX_SIMD_ARCH_FLAG(18, __AVX512BW__)               \
     | _GLIBCXX_SIMD_ARCH_FLAG(19, __AVX512VL__)               \
     | _GLIBCXX_SIMD_ARCH_FLAG(20, __AVX512BITALG__)           \
     | _GLIBCXX_SIMD_ARCH_FLAG(21, __AVX512VBMI__)             \
     | _GLIBCXX_SIMD_ARCH_FLAG(22, __AVX512VBMI2__)            \
     | _GLIBCXX_SIMD_ARCH_FLAG(23, __AVX512IFMA__)             \
     | _GLIBCXX_SIMD_ARCH_FLAG(24, __AVX512VNNI__)             \
     | _GLIBCXX_SIMD_ARCH_FLAG(25, __AVX512VPOPCNTDQ__)        \
     | _GLIBCXX_SIMD_ARCH_FLAG(26, __AVX512FP16__)             \
     | _GLIBCXX_SIMD_ARCH_FLAG(27, __AVX512BF16__)             \
     | _GLIBCXX_SIMD_ARCH_FLAG(28, __AVXIFMA__)                \
     | _GLIBCXX_SIMD_ARCH_FLAG(29, __AVXNECONVERT__)           \
     | _GLIBCXX_SIMD_ARCH_FLAG(30, __AVXVNNI__)                \
     | _GLIBCXX_SIMD_ARCH_FLAG(31, __AVXVNNIINT8__)            \
     | _GLIBCXX_SIMD_ARCH_FLAG(32, __AVXVNNIINT16__)           \
     | _GLIBCXX_SIMD_ARCH_FLAG(33, __AVX10_1__)                \
     | _GLIBCXX_SIMD_ARCH_FLAG(34, __AVX10_2__)                \
     | _GLIBCXX_SIMD_ARCH_FLAG(35, __AVX512VP2INTERSECT__)     \
     | _GLIBCXX_SIMD_ARCH_FLAG(36, __SSE4A__)                  \
     | _GLIBCXX_SIMD_ARCH_FLAG(37, __FMA4__)                   \
     | _GLIBCXX_SIMD_ARCH_FLAG(38, __XOP__)                    \
   }
   // Should this include __APX_F__? I don't think it's relevant for use in constexpr-if branches =>
   // no ODR issue? The same could be said about several other flags above that are not checked
   // anywhere.

   /** @internal
    * Compile-time description of the x86 ISA extensions enabled for the current translation unit.
    *
    * _M_flags holds one bit per extension (initialized from _GLIBCXX_SIMD_ARCH_TRAITS_INIT). Each
    * _M_have_* member function tests the bit of the extension it is named after.
    */
   struct _ArchTraits
   {
     __UINT64_TYPE__ _M_flags = _GLIBCXX_SIMD_ARCH_TRAITS_INIT;

     // true iff bit number __bit of _M_flags is set
     consteval bool
     _M_test(int __bit) const
     { return ((_M_flags >> __bit) & 1) == 1; }

     consteval bool
     _M_have_mmx() const
     { return _M_test(0); }

     consteval bool
     _M_have_sse() const
     { return _M_test(1); }

     consteval bool
     _M_have_sse2() const
     { return _M_test(2); }

     consteval bool
     _M_have_sse3() const
     { return _M_test(3); }

     consteval bool
     _M_have_ssse3() const
     { return _M_test(4); }

     consteval bool
     _M_have_sse4_1() const
     { return _M_test(5); }

     consteval bool
     _M_have_sse4_2() const
     { return _M_test(6); }

     consteval bool
     _M_have_popcnt() const
     { return _M_test(7); }

     consteval bool
     _M_have_avx() const
     { return _M_test(8); }

     consteval bool
     _M_have_f16c() const
     { return _M_test(9); }

     consteval bool
     _M_have_bmi() const
     { return _M_test(10); }

     consteval bool
     _M_have_bmi2() const
     { return _M_test(11); }

     consteval bool
     _M_have_lzcnt() const
     { return _M_test(12); }

     consteval bool
     _M_have_avx2() const
     { return _M_test(13); }

     consteval bool
     _M_have_fma() const
     { return _M_test(14); }

     consteval bool
     _M_have_avx512f() const
     { return _M_test(15); }

     consteval bool
     _M_have_avx512cd() const
     { return _M_test(16); }

     consteval bool
     _M_have_avx512dq() const
     { return _M_test(17); }

     consteval bool
     _M_have_avx512bw() const
     { return _M_test(18); }

     consteval bool
     _M_have_avx512vl() const
     { return _M_test(19); }

     consteval bool
     _M_have_avx512bitalg() const
     { return _M_test(20); }

     consteval bool
     _M_have_avx512vbmi() const
     { return _M_test(21); }

     consteval bool
     _M_have_avx512vbmi2() const
     { return _M_test(22); }

     consteval bool
     _M_have_avx512ifma() const
     { return _M_test(23); }

     consteval bool
     _M_have_avx512vnni() const
     { return _M_test(24); }

     consteval bool
     _M_have_avx512vpopcntdq() const
     { return _M_test(25); }

     consteval bool
     _M_have_avx512fp16() const
     { return _M_test(26); }

     consteval bool
     _M_have_avx512bf16() const
     { return _M_test(27); }

     consteval bool
     _M_have_avxifma() const
     { return _M_test(28); }

     consteval bool
     _M_have_avxneconvert() const
     { return _M_test(29); }

     consteval bool
     _M_have_avxvnni() const
     { return _M_test(30); }

     consteval bool
     _M_have_avxvnniint8() const
     { return _M_test(31); }

     consteval bool
     _M_have_avxvnniint16() const
     { return _M_test(32); }

     consteval bool
     _M_have_avx10_1() const
     { return _M_test(33); }

     consteval bool
     _M_have_avx10_2() const
     { return _M_test(34); }

     consteval bool
     _M_have_avx512vp2intersect() const
     { return _M_test(35); }

     consteval bool
     _M_have_sse4a() const
     { return _M_test(36); }

     consteval bool
     _M_have_fma4() const
     { return _M_test(37); }

     consteval bool
     _M_have_xop() const
     { return _M_test(38); }

     // true iff _Tp is _Float16 and AVX512-FP16 is not available
     template <typename _Tp>
       consteval bool
       _M_eval_as_f32() const
       { return is_same_v<_Tp, _Float16> && !_M_have_avx512fp16(); }
   };

   /** @internal
    * Return an object of the native ABI tag type for @p _Tp on x86, or _InvalidAbi if @p _Tp is
    * not vectorizable.
    *
    * The branches are tried in order, from the widest to the narrowest register:
    * - AVX512-FP16:  64 bytes divided by sizeof(_Tp) elements, bit-mask
    * - AVX512F:      64 bytes divided by the adjusted element size, bit-mask
    * - _Float16 without F16C: a single scalar element
    * - AVX2, or AVX for floating-point types: 32 bytes, vector-mask
    * - SSE2, or SSE for floating-point types of the size of float: 16 bytes, vector-mask
    * - otherwise: a single scalar element
    *
    * The adjusted element size (__adj_sizeof) is sizeof(_Tp), doubled for _Float16, which halves
    * the resulting element count.
    * NOTE(review): presumably because _Float16 is evaluated via float without AVX512-FP16 (see
    * _ArchTraits::_M_eval_as_f32) - confirm.
    */
   template <typename _Tp, _ArchTraits _Traits = {}>
     consteval auto
     __native_abi()
     {
       constexpr int __adj_sizeof = sizeof(_Tp) * (1 + is_same_v<_Tp, _Float16>);
       if constexpr (!__vectorizable<_Tp>)
 	return _InvalidAbi();
       else if constexpr (_Traits._M_have_avx512fp16())
 	return _Abi_t<64 / sizeof(_Tp), 1, _AbiVariant::_BitMask>();
       else if constexpr (_Traits._M_have_avx512f())
 	return _Abi_t<64 / __adj_sizeof, 1, _AbiVariant::_BitMask>();
       else if constexpr (is_same_v<_Tp, _Float16> && !_Traits._M_have_f16c())
 	return _ScalarAbi<1>();
       else if constexpr (_Traits._M_have_avx2())
 	return _Abi_t<32 / __adj_sizeof, 1>();
       else if constexpr (_Traits._M_have_avx() && is_floating_point_v<_Tp>)
 	return _Abi_t<32 / __adj_sizeof, 1>();
       else if constexpr (_Traits._M_have_sse2())
 	return _Abi_t<16 / __adj_sizeof, 1>();
       else if constexpr (_Traits._M_have_sse() && is_floating_point_v<_Tp>
 			   && sizeof(_Tp) == sizeof(float))
 	return _Abi_t<16 / __adj_sizeof, 1>();
       // no MMX: we can't emit EMMS where it would be necessary
       else
 	return _ScalarAbi<1>();
     }

 #else

   // scalar fallback
   // No architecture flags are set by default.
   // NOTE(review): _M_test is 'constexpr' here, whereas the x86 variant of this struct declares it
   // 'consteval'.
   struct _ArchTraits
   {
     __UINT64_TYPE__ _M_flags = 0;

     // true iff bit number __bit of _M_flags is set
     constexpr bool
     _M_test(int __bit) const
     { return ((_M_flags >> __bit) & 1) == 1; }
   };

   /** @internal
    * Return an object of the native ABI tag type for @p _Tp on targets without dedicated
    * support: a single scalar element for vectorizable types, _InvalidAbi for everything else.
    */
   template <typename _Tp>
     consteval auto
     __native_abi()
     {
       if constexpr (__vectorizable<_Tp>)
 	return _ScalarAbi<1>();
       else
 	return _InvalidAbi();
     }

 #endif

   /** @internal
    * You must use this type as template argument to function templates that are not declared
    * always_inline (to avoid issues when linking code compiled with different compiler flags).
    *
    * It combines the architecture flags (_ArchTraits) and the optimization flags (_OptTraits).
    * Both bases declare a member named _M_test.
    */
   struct _TargetTraits
   : _ArchTraits, _OptTraits
   {};

   /** @internal
    * Alias for an ABI tag such that basic_vec<_Tp, __native_abi_t<_Tp>> stores one SIMD register of
    * optimal width.
    *
    * @tparam _Tp  A vectorizable type.
    *
    * C++26 [simd.expos.abi]
    */
   template <typename _Tp>
     using __native_abi_t = decltype(std::simd::__native_abi<_Tp>());

   /** @internal
    * Return an object of the ABI tag type for @p _Np elements of type @p _Tp.
    *
    * - _InvalidAbi if there is no native ABI for @p _Tp or if @p _Np is not positive,
    * - the native ABI tag if @p _Np equals the native element count,
    * - otherwise the native ABI tag resized to @p _Np elements.
    */
   template <typename _Tp, int _Np, _TargetTraits _Target = {}>
     consteval auto
     __deduce_abi()
     {
       using _Native = decltype(std::simd::__native_abi<_Tp>());
       constexpr int __native_size = _Native::_S_size;
       if constexpr (__native_size == 0 || _Np <= 0)
 	return _InvalidAbi();
       else if constexpr (__native_size == _Np)
 	return _Native();
       else
 	return _Native::template _S_resize<_Np>();
     }

   /** @internal
    * Alias for an ABI tag @c A such that `basic_vec<_Tp, A>` stores @p _Np elements.
    *
    * This is _InvalidAbi if no such tag can be deduced (see __deduce_abi).
    *
    * C++26 [simd.expos.abi]
    */
   template <typename _Tp, int _Np>
     using __deduce_abi_t = decltype(std::simd::__deduce_abi<_Tp, _Np>());

   /** @internal
    * \c rebind implementation detail for basic_vec, and basic_mask where we know the destination
    * value-type
    *
    * @tparam _Tp  The destination value-type.
    * @tparam _Np  The destination number of elements.
    * @tparam _A0  The ABI tag to rebind from.
    *
    * Returns _InvalidAbi for a non-positive @p _Np or a non-vectorizable @p _Tp. A scalar @p _A0
    * stays scalar. Otherwise the result is derived from the native ABI of @p _Tp, keeping the
    * mask-variant bits of @p _A0.
    */
   template <typename _Tp, int _Np, __abi_tag _A0, _ArchTraits = {}>
     consteval auto
     __abi_rebind()
     {
       if constexpr (_Np <= 0 || !__vectorizable<_Tp>)
 	return _InvalidAbi();

       else if constexpr (__scalar_abi_tag<_A0>)
 	return _A0::template _S_resize<_Np>();

       else
 	{
 	  using _Native = remove_const_t<decltype(std::simd::__native_abi<_Tp>())>;
 	  static_assert(0 != _Native::_S_size);
 	  // number of native registers needed for _Np elements
 	  constexpr int __nreg = __div_ceil(_Np, _Native::_S_size);

 	  if constexpr (__scalar_abi_tag<_Native>)
 	    return _Native::template _S_resize<_Np>();
 	  else
 	    return _Abi_t<_Native::_S_size, 1, __filter_abi_variant(_A0::_S_variant,
 								    _AbiVariant::_MaskVariants)
 			 >::template _S_resize<_Np, __nreg>();
 	}
     }

   /** @internal
    * @c rebind implementation detail for basic_mask.
    *
    * The important difference here is that we have no information about the actual value-type other
    * than its @c sizeof. So `_Bytes == 8` could mean `complex<float>`, @c double, or @c int64_t.
    * E.g. `_Np == 4` with AVX w/o AVX2 that's `vector(4) int`, `vector(4) long long`, or `2x
    * vector(2) long long`.
    * That's why this overload has the additional @p _IsOnlyResize parameter, which tells us that the
    * value-type doesn't change.
    *
    * Returns _InvalidAbi if @p _Bytes is 0 or @p _Np is not positive. Unless the special case for
    * AVX without AVX2 applies, the result is that of the value-type overload called with
    * __integer_from<_Bytes>.
    */
   template <size_t _Bytes, int _Np, __abi_tag _A0, bool _IsOnlyResize, _ArchTraits _Traits = {}>
     consteval auto
     __abi_rebind()
     {
       if constexpr (_Bytes == 0 || _Np <= 0)
 	return _InvalidAbi();

       else if constexpr (__scalar_abi_tag<_A0>)
 	return _A0::template _S_resize<_Np>();

 #if _GLIBCXX_X86
       // AVX w/o AVX2:
       // e.g. resize_t<8, mask<float, Whatever>> needs to be _Abi<8, 1> not _Abi<8, 2>
       // We determine whether _A0 identifies an AVX vector by looking at the size of a native
       // register. If it's 32, it's a YMM register, otherwise it's 16 or less.
       else if constexpr (_IsOnlyResize
 			   && _Traits._M_have_avx() && !_Traits._M_have_avx2()
 			   && __bit_ceil(__div_ceil<unsigned>(
 					    _A0::_S_size, _A0::_S_nreg)) * _Bytes == 32)
 	{
 	  // rebind via the floating-point type of matching size
 	  if constexpr (_Bytes == sizeof(double))
 	    return __abi_rebind<double, _Np, _A0>();
 	  else if constexpr (_Bytes == sizeof(float))
 	    return __abi_rebind<float, _Np, _A0>();
 	  else if constexpr (_Traits._M_have_f16c() && _Bytes == sizeof(_Float16))
 	    return __abi_rebind<_Float16, _Np, _A0>();
 	  else // impossible
 	    static_assert(false);
 	}
 #endif

       else
 	return __abi_rebind<__integer_from<_Bytes>, _Np, _A0>();
     }

   /** @internal
    * Returns true unless _GLIBCXX_SIMD_COND_EXPLICIT_MASK_CONVERSION is defined.
    *
    * On IvyBridge, (vec<float> == 0.f) == (rebind_t<int, vec<float>> == 0) does not compile. It does
    * compile on basically every other target, though. This is due to the difference in ABI tag:
    * _Abi<8, 1, [...]> vs. _Abi<8, 2, [...]> (8 elements, 1 vs. 2 registers).
    * I know how to define this function for libstdc++ to avoid interconvertible masks. The question
    * is whether we can specify this in general for C++29.
    *
    * Idea: Is rebind_t<integer-from<...>, mask>::abi_type the same type as
    *   deduce-t<integer-from<...>, mask::size()>? If yes, it's the "better" ABI tag. However, this
    *   makes the conversion behavior dependent on compiler flags. Probably not what we want.
    *
    * @tparam _To    ABI tag of the destination mask.
    * @tparam _From  ABI tag of the source mask; must have the same _S_size as @p _To.
    * @param __b0, __b1  Two sizes that are compared for equality; only used if
    *                    _GLIBCXX_SIMD_COND_EXPLICIT_MASK_CONVERSION is defined.
    *                    NOTE(review): presumably the element sizes in bytes of the two mask types.
    */
   template <typename _To, typename _From>
   consteval bool
     __is_mask_conversion_explicit([[maybe_unused]] size_t __b0, [[maybe_unused]] size_t __b1)
     {
       constexpr int __n = _To::_S_size;
       static_assert(__n == _From::_S_size);
 #ifndef _GLIBCXX_SIMD_COND_EXPLICIT_MASK_CONVERSION
       /// C++26 [simd.mask.ctor] uses unconditional explicit
       return true;
 #else
       if (__b0 != __b1)
 	return true;

       // everything is better than _ScalarAbi, except when converting to a single bool
       if constexpr (__scalar_abi_tag<_To>)
 	return __n > 1;
       else if constexpr (__scalar_abi_tag<_From>)
 	return true;

       // converting to a bit-mask is better
       else if constexpr (_To::_S_is_vecmask != _From::_S_is_vecmask)
 	return _To::_S_is_vecmask; // to vector-mask is explicit

       // with vec-masks, fewer registers is better
       else if constexpr (_From::_S_nreg != _To::_S_nreg)
 	return _From::_S_nreg < _To::_S_nreg;

       else
 	__builtin_unreachable();
 #endif
     }

   /** @internal
    * An alias for a signed integer type.
    *
    * libstdc++ unconditionally uses @c int here, since it matches the return type of
    * 'Bit Operation Builtins' in GCC.
    *
    * C++26 [simd.expos.defn]
    */
   using __simd_size_type = int;

   // integral_constant shortcut
   // __simd_size_c<_Xp> is an object of type integral_constant<__simd_size_type, _Xp>.
   template <__simd_size_type _Xp>
     inline constexpr integral_constant<__simd_size_type, _Xp> __simd_size_c = {};

   // [simd.syn]
   // Forward declaration; the ABI tag defaults to the native ABI of _Tp.
   template <typename _Tp, typename _Ap = __native_abi_t<_Tp>>
     class basic_vec;

   // basic_vec with the ABI tag deduced from _Tp and _Np; _Np defaults to the native element count.
   template <typename _Tp, __simd_size_type _Np = __native_abi_t<_Tp>::_S_size>
     using vec = basic_vec<_Tp, __deduce_abi_t<_Tp, _Np>>;

   // Forward declaration; the ABI tag defaults to the native ABI of the signed integer type with
   // _Bytes bytes.
   template <size_t _Bytes, typename _Ap = __native_abi_t<__integer_from<_Bytes>>>
     class basic_mask;

   // basic_mask for elements of type _Tp: identified by sizeof(_Tp) and the ABI tag deduced from
   // _Tp and _Np.
   template <typename _Tp, __simd_size_type _Np = __native_abi_t<_Tp>::_S_size>
     using mask = basic_mask<sizeof(_Tp), __deduce_abi_t<_Tp, _Np>>;

   // [simd.ctor] load constructor constraints
   // Satisfied if _Tp is a sized range whose size is usable as a constant expression (it has to be
   // valid as a template argument). Unless _Np is -1uz, the size additionally has to equal _Np.
   template <typename _Tp, size_t _Np = -1uz>
     concept __static_sized_range
       = ranges::sized_range<_Tp> && requires(_Tp&& __r) {
 	typename integral_constant<size_t, ranges::size(__r)>;
 	requires (_Np == -1uz || ranges::size(__r) == _Np);
       };

   /** @internal
    * Return the size of the range @p __r if it is usable as a constant expression, otherwise
    * return dynamic_extent.
    */
   template <typename _Rg>
     consteval size_t
     __static_range_size(_Rg& __r)
     {
       constexpr bool __has_static_size
 	= requires { typename integral_constant<size_t, ranges::size(__r)>; };
       if constexpr (!__has_static_size)
 	return dynamic_extent;
       else
 	return ranges::size(__r);
     }

   // [simd.general] value-preserving
   // Satisfied if both types are arithmetic, _From converts implicitly to _To, the conversion is
   // not from a signed to an unsigned type, and the precision (digits) and the range
   // (lowest() to max()) of _From are covered by _To.
   template <typename _From, typename _To>
     concept __arithmetic_only_value_preserving_convertible_to
       = convertible_to<_From, _To> && is_arithmetic_v<_From> && is_arithmetic_v<_To>
 	  && !(is_signed_v<_From> && is_unsigned_v<_To>)
 	  && numeric_limits<_From>::digits <= numeric_limits<_To>::digits
 	  && numeric_limits<_From>::max() <= numeric_limits<_To>::max()
 	  && numeric_limits<_From>::lowest() >= numeric_limits<_To>::lowest();

   /** @internal
    * Satisfied if the conversion from @p _From to @p _To is a value-preserving conversion.
    *
    * The definition forwards to @c __arithmetic_only_value_preserving_convertible_to, i.e. only
    * conversions between arithmetic types can satisfy it.
    *
    * C++26 [simd.general]
    */
   template <typename _From, typename _To>
     concept __value_preserving_convertible_to
       = __arithmetic_only_value_preserving_convertible_to<_From, _To>;

   // LWG4420
   // Satisfied if an expression of type _From (as produced by declval) can be converted to _To
   // via static_cast.
   template <typename _From, typename _To>
     concept __explicitly_convertible_to = requires {
       static_cast<_To>(declval<_From>());
     };

   /** @internal
    * Satisfied by types that wrap a compile-time constant in a static @c value member (such as
    * @c integral_constant): @p _Tp must be convertible to and equality comparable with the type
    * of @c _Tp::value, and both a default-constructed @p _Tp and its explicit conversion to that
    * type must compare equal to @c _Tp::value in a constant expression.
    *
    * C++26 [simd.expos]
    */
   template<typename _Tp>
     concept __constexpr_wrapper_like
       = convertible_to<_Tp, decltype(_Tp::value)>
 	  && equality_comparable_with<_Tp, decltype(_Tp::value)>
 	  && bool_constant<_Tp() == _Tp::value>::value
 	  && bool_constant<static_cast<decltype(_Tp::value)>(_Tp()) == _Tp::value>::value;

   // [simd.ctor] explicit(...) of broadcast ctor
   //
   // Satisfied if the constant _From is of arithmetic type and its value can be represented by
   // _To:
   //  - converting to _To and back yields the same value,
   //  - a negative value is never accepted for an unsigned integral _To,
   //  - the value lies within [lowest(), max()] of _To.
   template <auto _From, typename _To>
     concept __non_narrowing_constexpr_conversion
       = is_arithmetic_v<decltype(_From)>
 	  && static_cast<decltype(_From)>(static_cast<_To>(_From)) == _From
 	  && !(unsigned_integral<_To> && _From < decltype(_From)())
 	  && _From <= std::numeric_limits<_To>::max()
 	  && _From >= std::numeric_limits<_To>::lowest();

   // [simd.ctor] p4
   // This implements LWG4436 (submitted on 2025-10-28)
   //
   // Satisfied if a value of type _From may be broadcast to elements of type _To. One of the
   // following has to hold (after removing cv-qualifiers and references from _From):
   //  4.1 _From is neither arithmetic nor constexpr-wrapper-like and converts implicitly to _To,
   //  4.2 the conversion from _From to _To is value-preserving,
   //  4.3 _From is constexpr-wrapper-like and converting its constant `value` to _To does not
   //      narrow.
   template <typename _From, typename _To>
     concept __broadcast_constructible
       = ((convertible_to<_From, _To> && !is_arithmetic_v<remove_cvref_t<_From>>
 	    && !__constexpr_wrapper_like<remove_cvref_t<_From>>) // 4.1
 	   || __value_preserving_convertible_to<remove_cvref_t<_From>, _To> // 4.2
 	   || (__constexpr_wrapper_like<remove_cvref_t<_From>> // 4.3
 		 && __non_narrowing_constexpr_conversion<auto(remove_cvref_t<_From>::value),
 							 _To>));

   /** @internal
    * Returns @c true if @p _From and @p _To are two different floating-point types and their
    * common type is @p _From. Identical types always yield @c false.
    */
   template <typename _From, typename _To>
     consteval bool
     __higher_floating_point_rank_than()
     {
       constexpr bool __both_floating = floating_point<_From> && floating_point<_To>;
       constexpr bool __from_is_common = is_same_v<common_type_t<_From, _To>, _From>;
       constexpr bool __distinct = !is_same_v<_From, _To>;
       return __both_floating && __from_is_common && __distinct;
     }

   /** @internal
    * Returns @c true if @p _From and @p _To are two different integral types and @p _From is
    * either larger (by @c sizeof) than @p _To or is the common type of both. Identical types
    * always yield @c false.
    */
   template <typename _From, typename _To>
     consteval bool
     __higher_integer_rank_than()
     {
       constexpr bool __both_integral = integral<_From> && integral<_To>;
       constexpr bool __from_is_larger = sizeof(_From) > sizeof(_To);
       constexpr bool __from_is_common = is_same_v<common_type_t<_From, _To>, _From>;
       constexpr bool __distinct = !is_same_v<_From, _To>;
       return __both_integral && (__from_is_larger || __from_is_common) && __distinct;
     }

   // Satisfied if _From ranks higher than _To according to either
   // __higher_floating_point_rank_than or __higher_integer_rank_than. Both helpers require the
   // two types to be of the same kind (both floating-point or both integral) and to differ.
   template <typename _From, typename _To>
     concept __higher_rank_than
       = __higher_floating_point_rank_than<_From, _To>() || __higher_integer_rank_than<_From, _To>();

   // Forward declaration. Only the identity of the type is needed here: it is looked for in the
   // _Flags pack of __loadstore_convertible_to.
   struct __convert_flag;

   // Satisfied if elements of type _From may be converted to _To by a load or store:
   //  - the types are the same, or
   //  - both are vectorizable and the conversion is value-preserving, or
   //  - both are vectorizable, an explicit conversion exists, and __convert_flag is one of
   //    _Flags.
   template <typename _From, typename _To, typename... _Flags>
     concept __loadstore_convertible_to
       = same_as<_From, _To>
 	  || (__vectorizable<_From> && __vectorizable<_To>
 		&& (__value_preserving_convertible_to<_From, _To>
 		       || (__explicitly_convertible_to<_From, _To>
 			     && (std::is_same_v<_Flags, __convert_flag> || ...))));

   // Satisfied if _From converts implicitly to _To and, in case _From is an arithmetic type, the
   // conversion is value-preserving. Non-arithmetic types only need the implicit conversion.
   template <typename _From, typename _To>
     concept __simd_generator_convertible_to
       = std::convertible_to<_From, _To>
 	  && (!is_arithmetic_v<_From> || __value_preserving_convertible_to<_From, _To>);

   // Declaration only (never defined); it exists to be named in the requires-expression of
   // __simd_generator_invokable. It is viable only if, for every index in _Is, calling an _Fp
   // with __simd_size_c<index> yields a type that satisfies
   // __simd_generator_convertible_to<..., _Tp>.
   template <typename _Fp, typename _Tp, __simd_size_type... _Is>
     requires (__simd_generator_convertible_to<
 		decltype(declval<_Fp>()(__simd_size_c<_Is>)), _Tp> && ...)
     constexpr void
     __simd_generator_invokable_impl(integer_sequence<__simd_size_type, _Is...>);

   // Satisfied if _Fp can be called with __simd_size_c<i> for every i in [0, _Np) and each
   // result is acceptable as an element of type _Tp (see __simd_generator_convertible_to).
   template <typename _Fp, typename _Tp, __simd_size_type _Np>
     concept __simd_generator_invokable = requires {
       __simd_generator_invokable_impl<_Fp, _Tp>(make_integer_sequence<__simd_size_type, _Np>());
     };

   // Satisfied if a const _Fp can be called with two int arguments and returns an integral
   // type.
   // NOTE(review): presumably the arguments are (index, size) - confirm at the call sites.
   template <typename _Fp>
     concept __index_permutation_function_sized = requires(_Fp const& __f)
       {
 	{ __f(0, 0) } -> std::integral;
       };

   // Satisfied if a const _Fp returns an integral type when called with either two int
   // arguments (see __index_permutation_function_sized) or a single int argument.
   // NOTE(review): the _Simd parameter is not used by the constraint.
   template <typename _Fp, typename _Simd>
     concept __index_permutation_function
       = __index_permutation_function_sized<_Fp> || requires(_Fp const& __f) {
 	{ __f(0) } -> std::integral;
       };

   /** @internal
    * The value of the @c _Bytes template argument to a @c basic_mask specialization.
    *
    * The primary template yields 0 for every type that is not a @c basic_mask specialization.
    *
    * C++26 [simd.expos.defn]
    */
   template <typename _Tp>
     constexpr size_t __mask_element_size = 0;

   // Partial specialization for basic_mask: extracts _Bytes.
   template <size_t _Bytes, __abi_tag _Ap>
     constexpr size_t __mask_element_size<basic_mask<_Bytes, _Ap>> = _Bytes;

   // [simd.expos]
   // Satisfied if _Vp is exactly the basic_vec specialization named by its own value_type and
   // abi_type members, and is default constructible.
   template <typename _Vp>
     concept __simd_vec_type
       = same_as<_Vp, basic_vec<typename _Vp::value_type, typename _Vp::abi_type>>
 	  && is_default_constructible_v<_Vp>;

   // Satisfied if _Vp is exactly the basic_mask specialization named by its element size (see
   // __mask_element_size) and its own abi_type member, and is default constructible.
   template <typename _Vp>
     concept __simd_mask_type
       = same_as<_Vp, basic_mask<__mask_element_size<_Vp>, typename _Vp::abi_type>>
 	&& is_default_constructible_v<_Vp>;

   /** @internal
    * Satisfied if @p _Vp is a data-parallel type, i.e. it satisfies either @c __simd_vec_type or
    * @c __simd_mask_type.
    */
   template <typename _Vp>
     concept __simd_vec_or_mask_type = __simd_vec_type<_Vp> || __simd_mask_type<_Vp>;

   // Satisfied if _Vp satisfies __simd_vec_type and its value_type is a floating-point type.
   template <typename _Vp>
     concept __simd_floating_point
       = __simd_vec_type<_Vp> && floating_point<typename _Vp::value_type>;

   // Satisfied if _Vp satisfies __simd_vec_type and its value_type is an integral type.
   template <typename _Vp>
     concept __simd_integral
       = __simd_vec_type<_Vp> && integral<typename _Vp::value_type>;

   // Satisfied if adding two const lvalues of type _Tp yields a type that satisfies
   // __simd_vec_type. The addition is used as a probe for the vec type that _Tp turns into when
   // it is used as an operand.
   template <typename _Tp>
     concept __converts_to_vec
       = __simd_vec_type<decltype(declval<const _Tp&>() + declval<const _Tp&>())>;

   // The vec type that results from adding two const lvalues of type _Tp. Only available for
   // types that satisfy __converts_to_vec.
   template <__converts_to_vec _Tp>
     using __deduced_vec_t = decltype(declval<const _Tp&>() + declval<const _Tp&>());

   // Determines the vec type to use for an operand of type _Tp next to the data-parallel type
   // _Vp. With _Up being the type of the sum of two const _Tp lvalues:
   //  - if _Up already satisfies __simd_vec_type, the result is _Up,
   //  - otherwise the result is a vec of _Up with as many elements as _Vp (_Vp::size()).
   // The lambda is never called; it only appears inside decltype to compute the type.
   template <typename _Vp, typename _Tp>
     using __make_compatible_simd_t
       = decltype([] {
 	  using _Up = decltype(declval<const _Tp&>() + declval<const _Tp&>());
 	  if constexpr (__simd_vec_type<_Up>)
 	    return _Up();
 	  else
 	    return vec<_Up, _Vp::size()>();
       }());

   // Satisfied if _Tp deduces (via __deduced_vec_t) to a vec type with a floating-point
   // value_type.
   template <typename _Tp>
     concept __math_floating_point = __simd_floating_point<__deduced_vec_t<_Tp>>;

   // Satisfied if a const _BinaryOperation can be called with two const vec<_Tp, 1> arguments
   // and the result type is exactly vec<_Tp, 1>.
   template <typename _BinaryOperation, typename _Tp>
     concept __reduction_binary_operation
       = requires (const _BinaryOperation __binary_op, const vec<_Tp, 1> __v) {
 	{ __binary_op(__v, __v) } -> same_as<vec<_Tp, 1>>;
       };

   /** @internal
    * Returns the index of the most significant set bit of @p __bits, i.e. the highest @c i for
    * which `(__bits >> i) & 1` equals @c 1.
    *
    * The result is computed as (number of digits - 1) minus the number of leading zero bits.
    */
   [[__gnu__::__always_inline__]]
   constexpr __simd_size_type
   __highest_bit(std::unsigned_integral auto __bits)
   {
     using _Bits = decltype(__bits);
     constexpr auto __top_index = __gnu_cxx::__int_traits<_Bits>::__digits - 1;
     return __top_index - __countl_zero(__bits);
   }

   // The basic_mask for _Np elements of size sizeof(_Tp). Its ABI tag is the type returned by
   // __abi_rebind<_Tp, _Np, _Ap>().
   template <__vectorizable _Tp, __simd_size_type _Np, __abi_tag _Ap>
     using __similar_mask = basic_mask<sizeof(_Tp), decltype(__abi_rebind<_Tp, _Np, _Ap>())>;

   // The basic_vec for _Np elements of type _Tp. Its ABI tag is the type returned by
   // __abi_rebind<_Tp, _Np, _Ap>().
   // Allow _Tp to be _InvalidInteger for __integer_from<16> (therefore _Tp is not constrained
   // with __vectorizable here)
   template <typename _Tp, __simd_size_type _Np, __abi_tag _Ap>
     using __similar_vec = basic_vec<_Tp, decltype(__abi_rebind<_Tp, _Np, _Ap>())>;

   // LWG4470 [simd.expos]
   // The vec type that corresponds to basic_mask<_Bytes, _Ap>: element type
   // __integer_from<_Bytes> and the same number of elements (_Ap::_S_size).
   template <size_t _Bytes, typename _Ap>
     using __simd_vec_from_mask_t = __similar_vec<__integer_from<_Bytes>, _Ap::_S_size, _Ap>;

 // Defines what happens if __value_preserving_cast detects that a value changes:
 // `__glibcxx_on_bad_value_preserving_cast();` is written at the point of failure.
 #if _GLIBCXX_SIMD_THROW_ON_BAD_VALUE // used for unit tests (also see P3844)
   // Exception type; the macro below expands the failure statement into
   // `throw __bad_value_preserving_cast();`
   class __bad_value_preserving_cast
   {};

 #define __glibcxx_on_bad_value_preserving_cast throw __bad_value_preserving_cast
 #else
   // Neither constexpr nor defined: the macro below expands the failure statement into a call to
   // this function, which is not permitted in a constant expression.
   void __bad_value_preserving_cast(); // not defined

 #define __glibcxx_on_bad_value_preserving_cast __bad_value_preserving_cast
 #endif

   /** @internal
    * Converts the arithmetic value @p __x to @p _To at compile time and rejects values that
    * would be changed by the conversion.
    *
    * If every value of @p _From is representable by @p _To, this is a plain @c static_cast.
    * Otherwise the actual value is inspected and `__glibcxx_on_bad_value_preserving_cast()` is
    * invoked if the value does not survive the conversion.
    *
    * @tparam _To    The destination type.
    * @tparam _From  The (arithmetic) source type.
    * @param  __x    The value to convert.
    * @return @p __x converted to @p _To.
    */
   template <typename _To, typename _From>
 #if _GLIBCXX_SIMD_THROW_ON_BAD_VALUE // see P3844
     [[__gnu__::__optimize__("exceptions")]] // work around potential -fno-exceptions
 #endif
     consteval _To
     __value_preserving_cast(const _From& __x)
     {
       static_assert(is_arithmetic_v<_From>);
       // only look at the value if the types alone cannot guarantee that it is preserved
       if constexpr (!__value_preserving_convertible_to<_From, _To>)
 	{
 	  using _Up = typename __make_unsigned<_From>::__type;
 	  // round trip: converting to _To must not change the value (both sides are compared after
 	  // conversion to _Up)
 	  if (static_cast<_Up>(static_cast<_To>(__x)) != static_cast<_Up>(__x))
 	    __glibcxx_on_bad_value_preserving_cast();
 	  else if constexpr (is_signed_v<_From> && is_unsigned_v<_To>)
 	    {
 	      // a negative value cannot be represented by an unsigned type
 	      if (__x < _From())
 		__glibcxx_on_bad_value_preserving_cast();
 	    }
 	  else if constexpr (unsigned_integral<_From> && signed_integral<_To>)
 	    {
 	      // the value must not exceed the largest value of the signed type
 	      if (__x > numeric_limits<_To>::max())
 		__glibcxx_on_bad_value_preserving_cast();
 	    }
 	}
       return static_cast<_To>(__x);
     }

   // Satisfied for an arithmetic _From (after removing cv-qualifiers and references) that
   // converts to _To implicitly and explicitly, where the conversion is not value-preserving for
   // all values, and additionally one of the following holds:
   //  - the common type of _From and _To is _To,
   //  - _From is int and _To is an integral type,
   //  - _From is unsigned and _To is an unsigned integral type.
   // NOTE(review): presumably this selects a consteval broadcast that checks the actual value
   // (see __value_preserving_cast) - confirm at the use site.
   template <typename _From, typename _To>
     concept __simd_vec_bcast_consteval
       = __explicitly_convertible_to<_From, _To>
 	  && is_arithmetic_v<remove_cvref_t<_From>> && convertible_to<_From, _To>
 	  && !__value_preserving_convertible_to<remove_cvref_t<_From>, _To>
 	  && (is_same_v<common_type_t<_From, _To>, _To>
 		|| (is_same_v<remove_cvref_t<_From>, int> && is_integral_v<_To>)
 		|| (is_same_v<remove_cvref_t<_From>, unsigned> && unsigned_integral<_To>));

   /** @internal
    * std::pair is not trivially copyable, this one is
    *
    * A plain aggregate of two values without any user-declared special member functions.
    */
   template <typename _T0, typename _T1>
     struct __trivial_pair
     {
       _T0 _M_first;  // first value
       _T1 _M_second; // second value
     };

   // Satisfied if _From converts implicitly to _To, both types have the same size, and both
   // belong to the same category: either both or neither are integral, and either both or neither
   // are floating-point.
   template <typename _From, typename _To>
     concept __converts_trivially = convertible_to<_From, _To>
 				     && sizeof(_From) == sizeof(_To)
 				     && is_integral_v<_From> == is_integral_v<_To>
 				     && is_floating_point_v<_From> == is_floating_point_v<_To>;

   /** @internal
    * Calls @p __fun once for every set bit in @p __bits, passing the result of
    * @c __countr_zero, i.e. the index of the bit. Bits are visited from the least significant to
    * the most significant one.
    */
   [[__gnu__::__always_inline__]]
   constexpr void
   __bit_foreach(unsigned_integral auto __bits, auto&& __fun)
   {
     static_assert(sizeof(__bits) >= sizeof(int)); // avoid promotion to int
     // `__bits & (__bits - 1)` clears the lowest set bit after it has been reported
     for (; __bits != 0; __bits &= __bits - 1)
       __fun(__countr_zero(__bits));
   }

   /** @internal
    * Optimized @c memcpy for use in partial loads and stores.
    *
    * The implementation uses at most two fixed-size power-of-2 @c memcpy calls and reduces the
    * number of branches to a minimum. The variable size is achieved by overlapping two @c memcpy
    * calls: the first call copies the leading 2^k bytes, the second call copies a fixed-size
    * block that ends exactly at the last byte and may overlap the first block.
    *
    * Every branch therefore handles the byte counts in the half-open range [2^k, 2^(k+1)); the
    * lower bound is inclusive. Since the byte count is a multiple of @p _Chunk, the size of the
    * second block can often be reduced to @p _Chunk.
    *
    * NOTE(review): the branch selection is only complete for `_Chunk * __n < _Max`; presumably
    * callers never request a copy of the full @p _Max bytes through this function - confirm.
    *
    * @tparam _Chunk   Copies @p __n times @p _Chunk bytes.
    * @tparam _Max     Copy no more than @p _Max bytes.
    *
    * @param  __dst    The destination pointer.
    * @param  __src    The source pointer.
    * @param  __n      The number of chunks that need to be copied.
    */
   template <size_t _Chunk, size_t _Max>
     inline void
     __memcpy_chunks(byte* __restrict__ __dst, const byte* __restrict__ __src,
 		    size_t __n)
     {
       static_assert(_Max <= 64);
       static_assert(__has_single_bit(_Chunk) && _Chunk <= 8);
       size_t __bytes = _Chunk * __n;
       if (__builtin_constant_p(__bytes))
 	{ // If __n is known via constant propagation use a single memcpy call. Since this is still
 	  // a fixed-size memcpy to the compiler, this leaves more room for optimization.
 	  __builtin_memcpy(__dst, __src, __bytes);
 	}
       else if (__bytes >= 32 && _Max > 32)
 	{ // 32..63 bytes: [0, 32) + [__bytes - 32, __bytes)
 	  __builtin_memcpy(__dst, __src, 32);
 	  __bytes -= 32;
 	  __builtin_memcpy(__dst + __bytes, __src + __bytes, 32);
 	}
       else if (__bytes >= 16 && _Max > 16)
 	{ // 16..31 bytes
 	  __builtin_memcpy(__dst, __src, 16);
 	  if constexpr (_Chunk == 8)
 	    { // only 16 or 24 bytes are possible: the last chunk completes the copy
 	      __bytes -= 8;
 	      __builtin_memcpy(__dst + __bytes, __src + __bytes, 8);
 	    }
 	  else
 	    {
 	      __bytes -= 16;
 	      __builtin_memcpy(__dst + __bytes, __src + __bytes, 16);
 	    }
 	}
       else if (__bytes >= 8 && _Max > 8)
 	{ // 8..15 bytes; with _Chunk == 8 this is exactly 8 bytes and the first copy suffices
 	  __builtin_memcpy(__dst, __src, 8);
 	  if constexpr (_Chunk == 4)
 	    { // only 8 or 12 bytes are possible: the last chunk completes the copy
 	      __bytes -= 4;
 	      __builtin_memcpy(__dst + __bytes, __src + __bytes, 4);
 	    }
 	  else if constexpr (_Chunk < 4)
 	    {
 	      __bytes -= 8;
 	      __builtin_memcpy(__dst + __bytes, __src + __bytes, 8);
 	    }
 	}
       else if (__bytes >= 4 && _Max > 4)
 	{ // 4..7 bytes; with _Chunk == 4 this is exactly 4 bytes and the first copy suffices
 	  __builtin_memcpy(__dst, __src, 4);
 	  if constexpr (_Chunk == 2)
 	    { // only 4 or 6 bytes are possible: the last chunk completes the copy
 	      __bytes -= 2;
 	      __builtin_memcpy(__dst + __bytes, __src + __bytes, 2);
 	    }
 	  else if constexpr (_Chunk == 1)
 	    {
 	      __bytes -= 4;
 	      __builtin_memcpy(__dst + __bytes, __src + __bytes, 4);
 	    }
 	}
       else if (__bytes >= 2)
 	{ // 2..3 bytes
 	  __builtin_memcpy(__dst, __src, 2);
 	  if constexpr (_Chunk == 2)
 	    {
 	      __bytes -= 2;
 	      __builtin_memcpy(__dst + __bytes, __src + __bytes, 2);
 	    }
 	  else if constexpr (_Chunk == 1)
 	    {
 	      __bytes -= 1;
 	      __builtin_memcpy(__dst + __bytes, __src + __bytes, 1);
 	    }
 	}
       else if (__bytes == 1)
 	__builtin_memcpy(__dst, __src, 1);
     }

   // [simd.reductions] identity_element = *see below*
   //
   // Returns the identity element of type _Tp for the given binary operation:
   //  - bit_and<>:                     all bits set (~_Tp()),
   //  - multiplies<>:                  1,
   //  - plus<>, bit_or<>, bit_xor<>:   0.
   template <typename _Tp, typename _BinaryOperation>
     requires __is_one_of<_BinaryOperation,
 			 plus<>, multiplies<>, bit_and<>, bit_or<>, bit_xor<>>::value
     consteval _Tp
     __default_identity_element()
     {
       if constexpr (same_as<_BinaryOperation, bit_and<>>)
 	return _Tp(~_Tp());
       else if constexpr (same_as<_BinaryOperation, multiplies<>>)
 	return _Tp(1);
       else // plus<>, bit_or<>, bit_xor<>
 	return _Tp(0);
     }
 } // namespace simd
 _GLIBCXX_END_NAMESPACE_VERSION
 } // namespace std

 #pragma GCC diagnostic pop
 #endif // C++26
 #endif // _GLIBCXX_SIMD_DETAILS_H