blob: 31bd6ac45ab2d6aa8d08ed06992465f37ae93b71 [file] [log] [blame]
// Implementation of <simd> -*- C++ -*-
// Copyright The GNU Toolchain Authors.
//
// This file is part of the GNU ISO C++ Library. This library is free
// software; you can redistribute it and/or modify it under the
// terms of the GNU General Public License as published by the
// Free Software Foundation; either version 3, or (at your option)
// any later version.
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// Under Section 7 of GPL version 3, you are granted additional
// permissions described in the GCC Runtime Library Exception, version
// 3.1, as published by the Free Software Foundation.
// You should have received a copy of the GNU General Public License and
// a copy of the GCC Runtime Library Exception along with this program;
// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
// <http://www.gnu.org/licenses/>.
#ifndef _GLIBCXX_SIMD_DETAILS_H
#define _GLIBCXX_SIMD_DETAILS_H 1
#ifdef _GLIBCXX_SYSHDR
#pragma GCC system_header
#endif
#if __cplusplus >= 202400L
#include <bit>
#include <bits/c++config.h> // _GLIBCXX_FLOAT_IS_IEEE_BINARY32
#include <bits/stl_function.h> // plus, minus, multiplies, ...
#include <bits/utility.h> // integer_sequence, etc.
#include <cmath> // for math_errhandling :(
#include <concepts>
#include <cstdint>
#include <limits>
#include <span> // for dynamic_extent
#if __CHAR_BIT__ != 8
// There are simply too many constants and bit operators that currently depend on CHAR_BIT == 8.
// Generalization to CHAR_BIT != 8 does not make sense without testability (i.e. a test target).
#error "<simd> is not supported for CHAR_BIT != 8"
#endif
// psabi warnings are bogus because the ABI of the internal types never leaks into user code
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wpsabi"
// _GLIBCXX_X86 is defined to 1 when either __x86_64__ or __i386__ is defined and to 0 otherwise.
// It is therefore always defined and can be tested with a plain '#if'.
#if defined __x86_64__ || defined __i386__
#define _GLIBCXX_X86 1
#else
#define _GLIBCXX_X86 0
#endif
#ifndef _GLIBCXX_SIMD_NOEXCEPT
/** @internal
* For unit-testing preconditions, use this macro to remove noexcept.
*
* Unless the macro is already defined when this header is processed, it expands to 'noexcept'.
*/
#define _GLIBCXX_SIMD_NOEXCEPT noexcept
#endif
// Two-step stringification: _GLIBCXX_SIMD_TOSTRING macro-expands its argument first and then
// passes it on to _GLIBCXX_SIMD_TOSTRING_IMPL, which turns the result into a string literal.
#define _GLIBCXX_SIMD_TOSTRING_IMPL(x) #x
#define _GLIBCXX_SIMD_TOSTRING(x) _GLIBCXX_SIMD_TOSTRING_IMPL(x)
// This is used for unit-testing precondition checking
// This definition only forwards 'expr' to __glibcxx_assert; 'msg' and any additional arguments
// are accepted but not used.
#define __glibcxx_simd_precondition(expr, msg, ...) \
__glibcxx_assert(expr)
namespace std _GLIBCXX_VISIBILITY(default)
{
_GLIBCXX_BEGIN_NAMESPACE_VERSION
namespace simd
{
/** @internal
* Primary template of the @c __iota variable template.
*
* The initializer of the primary template contains a static_assert(false) with the message
* "invalid __iota specialization", i.e. the primary template itself is not meant to be used.
* NOTE(review): usable specializations are presumably defined elsewhere; none are visible here.
*/
template <typename _Tp>
inline constexpr _Tp
__iota = [] { static_assert(false, "invalid __iota specialization"); }();
// [simd.general] vectorizable types
// Satisfied if _Tp is cv-unqualified and is either
// - an integral type other than bool with sizeof(_Tp) <= sizeof(unsigned long long), or
// - a floating-point type with sizeof(_Tp) <= sizeof(double).
// If __STDCPP_BFLOAT16_T__ is defined, __gnu_cxx::__bfloat16_t is explicitly excluded.
template <typename _Tp>
concept __vectorizable_scalar
= same_as<remove_cv_t<_Tp>, _Tp>
#ifdef __STDCPP_BFLOAT16_T__
&& !same_as<_Tp, __gnu_cxx::__bfloat16_t>
#endif
&& ((integral<_Tp> && sizeof(_Tp) <= sizeof(0ULL) && !same_as<_Tp, bool>)
|| (floating_point<_Tp> && sizeof(_Tp) <= sizeof(double)));
// [simd.general] p2
// As defined here, this concept is equivalent to __vectorizable_scalar.
template <typename _Tp>
concept __vectorizable = __vectorizable_scalar<_Tp>;
/** @internal
* Describes variants of _Abi.
*
* The enumerators are bit values of the underlying type (unsigned long long). _Abi takes the
* combined bits as a template argument of that underlying type.
*/
enum class _AbiVariant : unsigned long long
{
_BitMask = 0x01, // AVX512 bit-masks
_MaskVariants = 0x0f, // vector masks if bits [0:3] are 0
};
/** @internal
 * Compute the subset of the bits of @p __in that are also set in at least one of @p __to_keep.
 *
 * @param __in       The variant bits to filter.
 * @param __to_keep  One or more masks; the union of their bits selects what is kept.
 * @return           @p __in with all bits cleared that are not set in any of @p __to_keep.
 */
consteval _AbiVariant
__filter_abi_variant(_AbiVariant __in, same_as<_AbiVariant> auto... __to_keep)
{
  using _Bits = underlying_type_t<_AbiVariant>;
  // union of all masks that were passed in
  const _Bits __keep_mask = (static_cast<_Bits>(__to_keep) | ...);
  const _Bits __filtered = static_cast<_Bits>(__in) & __keep_mask;
  return static_cast<_AbiVariant>(__filtered);
}
/** @internal
 * Empty placeholder type that is used whenever no valid integer/value type exists.
 */
struct _InvalidInteger {};
/** @internal
* Alias for a signed integer type T such that sizeof(T) equals _Bytes.
*
* The candidates are tried in the order signed char, signed short, signed int, signed long long;
* the first candidate with matching size is used. If no candidate matches, the alias names
* _InvalidInteger.
*
* C++26 [simd.expos.defn]
*/
template <size_t _Bytes>
using __integer_from
= decltype([] consteval {
// only the return type of this lambda matters; the returned values are irrelevant
if constexpr (sizeof(signed char) == _Bytes)
return static_cast<signed char>(0);
else if constexpr (sizeof(signed short) == _Bytes)
return static_cast<signed short>(0);
else if constexpr (sizeof(signed int) == _Bytes)
return static_cast<signed int>(0);
else if constexpr (sizeof(signed long long) == _Bytes)
return static_cast<signed long long>(0);
else
return _InvalidInteger();
}());
/** @internal
* Alias for an unsigned integer type T such that sizeof(T) equals _Bytes.
*
* Defined as the unsigned counterpart (make_unsigned_t) of __integer_from<_Bytes>.
*/
template <size_t _Bytes>
using _UInt = make_unsigned_t<__integer_from<_Bytes>>;
/** @internal
 * Integer division of @p __x by @p __y that rounds the quotient up instead of down.
 *
 * Preconditions: __x >= 0 && __y > 0.
 *
 * @param __x  The dividend.
 * @param __y  The divisor.
 * @return     The smallest integer that is not less than the exact quotient.
 */
template <typename _Tp>
  consteval _Tp
  __div_ceil(_Tp __x, _Tp __y)
  {
    // 'auto' keeps the (possibly promoted) type of the sum, so nothing is truncated before the
    // division
    const auto __biased = __x + __y - 1;
    return __biased / __y;
  }
/** @internal
* Alias for an unsigned integer type that can store at least @p _NBits bits.
*
* @p _NBits is rounded up to a power of two (__bit_ceil) and then converted to a byte count by
* dividing by __CHAR_BIT__ while rounding up; the result selects the _UInt specialization.
*
* @tparam _NBits Number of bits; must be greater than 0 and no larger than the number of digits
* of unsigned long long.
*/
template <int _NBits>
requires (_NBits > 0 && _NBits <= numeric_limits<unsigned long long>::digits)
using _Bitmask = _UInt<__div_ceil(__bit_ceil(unsigned(_NBits)), unsigned(__CHAR_BIT__))>;
/** @internal
* Map a given type @p _Tp to an equivalent type.
*
* This helps with reducing the necessary branches and casts in the implementation as well as
* reducing the number of template instantiations.
*
* The primary template maps every type to itself; the specializations below replace selected
* types.
*/
template <typename _Tp>
struct __canonical_vec_type
{ using type = _Tp; };
// Shorthand for the nested 'type' member.
template <typename _Tp>
using __canonical_vec_type_t = typename __canonical_vec_type<_Tp>::type;
// (unsigned) long maps to (unsigned) int if both have the same size, otherwise to
// (unsigned) long long if those have the same size. If neither is the case, no specialization
// is defined and long is left unchanged.
#if __SIZEOF_INT__ == __SIZEOF_LONG__
template <>
struct __canonical_vec_type<long>
{ using type = int; };
template <>
struct __canonical_vec_type<unsigned long>
{ using type = unsigned int; };
#elif __SIZEOF_LONG_LONG__ == __SIZEOF_LONG__
template <>
struct __canonical_vec_type<long>
{ using type = long long; };
template <>
struct __canonical_vec_type<unsigned long>
{ using type = unsigned long long; };
#endif
// Enumeration types map to the canonical type of their underlying type.
template <typename _Tp>
requires std::is_enum_v<_Tp>
struct __canonical_vec_type<_Tp>
{ using type = __canonical_vec_type<std::underlying_type_t<_Tp>>::type; };
// Plain char maps to unsigned char or signed char, depending on __CHAR_UNSIGNED__.
template <>
struct __canonical_vec_type<char>
#if __CHAR_UNSIGNED__
{ using type = unsigned char; };
#else
{ using type = signed char; };
#endif
// char8_t, char16_t, and char32_t map to unsigned integer types.
template <>
struct __canonical_vec_type<char8_t>
{ using type = unsigned char; };
template <>
struct __canonical_vec_type<char16_t>
{ using type = uint_least16_t; };
template <>
struct __canonical_vec_type<char32_t>
{ using type = uint_least32_t; };
// wchar_t maps to the integer type of equal size with the same signedness.
template <>
struct __canonical_vec_type<wchar_t>
{
using type = std::__conditional_t<std::is_signed_v<wchar_t>,
simd::__integer_from<sizeof(wchar_t)>,
simd::_UInt<sizeof(wchar_t)>>;
};
// _Float64 maps to double if __FLT64_DIG__ and _GLIBCXX_DOUBLE_IS_IEEE_BINARY64 are defined.
#if defined(__FLT64_DIG__) && defined(_GLIBCXX_DOUBLE_IS_IEEE_BINARY64)
template <>
struct __canonical_vec_type<_Float64>
{ using type = double; };
#endif
// _Float32 maps to float if __FLT32_DIG__ and _GLIBCXX_FLOAT_IS_IEEE_BINARY32 are defined.
#if defined(__FLT32_DIG__) && defined(_GLIBCXX_FLOAT_IS_IEEE_BINARY32)
template <>
struct __canonical_vec_type<_Float32>
{ using type = float; };
#endif
/** @internal
 * ABI tag for basic_vec objects that store one data member per element and for basic_mask
 * objects that store one bool data member per element.
 *
 * @tparam _Np  The number of elements, which also matches the number of data members in
 *              basic_vec and basic_mask.
 */
template <int _Np = 1>
  struct _ScalarAbi
  {
    // number of elements
    static constexpr int _S_size = _Np;

    // one "register" (data member) per element
    static constexpr int _S_nreg = _Np;

    // no variant bits are set
    static constexpr _AbiVariant _S_variant = _AbiVariant();

    // in principle a bool is a 1-bit bitmask, but this is asking for an AVX512 bitmask
    static constexpr bool _S_is_bitmask = false;

    static constexpr bool _S_is_vecmask = false;

    // storage type of a single element
    template <typename _Tp>
      using _DataType = __canonical_vec_type_t<_Tp>;

    // storage type of a single mask element, independent of the element size
    template <size_t>
      using _MaskDataType = bool;

    /** @internal
     * Return the scalar ABI tag for @p _N2 elements. The register count must equal the element
     * count.
     */
    template <int _N2, int _Nreg2 = _N2>
      static consteval _ScalarAbi<_N2>
      _S_resize()
      {
        static_assert(_N2 == _Nreg2);
        return _ScalarAbi<_N2>();
      }
  };
/** @internal
* This ABI tag describes basic_vec objects that store one or more objects declared with the
* [[gnu::vector_size(N)]] attribute.
* Applied to basic_mask objects, this ABI tag either describes corresponding vector-mask objects
* or bit-mask objects. Which one is used is determined via @p _Var.
*
* @tparam _Np The number of elements.
* @tparam _Nreg The number of registers needed to store @p _Np elements.
* @tparam _Var Determines how complex value-types are laid out and whether mask types use
* bit-masks or vector-masks.
*/
template <int _Np, int _Nreg, underlying_type_t<_AbiVariant> _Var>
struct _Abi
{
static constexpr int _S_size = _Np;
/** @internal
* The number of registers needed to represent one basic_vec for the element type that was
* used on ABI deduction.
*
* Examples:
* - '_Abi< 8, 2>' for 'int' is 2x 128-bit
* - '_Abi< 9, 3>' for 'int' is 2x 128-bit and 1x 32-bit
* - '_Abi<10, 3>' for 'int' is 2x 128-bit and 1x 64-bit
* - '_Abi<10, 1>' for 'int' is 1x 512-bit
* - '_Abi<10, 2>' for 'int' is 1x 256-bit and 1x 64-bit
*/
static constexpr int _S_nreg = _Nreg;
static_assert(_S_size > 0);
static_assert(_S_nreg > 0);
// the variant bits, converted back to the enumeration type
static constexpr _AbiVariant _S_variant = static_cast<_AbiVariant>(_Var);
// true iff the _BitMask bit is set in _S_variant
static constexpr bool _S_is_bitmask
= __filter_abi_variant(_S_variant, _AbiVariant::_BitMask) == _AbiVariant::_BitMask;
static constexpr bool _S_is_vecmask = !_S_is_bitmask;
/** @internal
* Storage type for @p _Tp elements; only usable if this tag describes a single register.
*
* For a single element this is the canonical scalar type. Otherwise it is a vector of the
* canonical type with the element count rounded up to a power of two.
*/
template <typename _Tp>
using _DataType = decltype([] {
static_assert(_S_nreg == 1);
if constexpr (_S_size == 1)
return __canonical_vec_type_t<_Tp>();
else
{
constexpr int __n = __bit_ceil(unsigned(_S_size));
using _Vp [[__gnu__::__vector_size__(sizeof(_Tp) * __n)]]
= __canonical_vec_type_t<_Tp>;
return _Vp();
}
}());
/** @internal
* Storage type for a mask over elements of size @p _Bytes; only usable if this tag describes
* a single register.
*
* - one element: bool
* - vector-mask: a vector of __integer_from<_Bytes> with the element count rounded up to a
* power of two
* - bit-mask: _Bitmask<_S_size>
*/
template <size_t _Bytes>
using _MaskDataType
= decltype([] {
static_assert(_S_nreg == 1);
if constexpr (_S_size == 1)
return bool();
else if constexpr (_S_is_vecmask)
{
constexpr unsigned __vbytes = _Bytes * __bit_ceil(unsigned(_S_size));
using _Vp [[__gnu__::__vector_size__(__vbytes)]] = __integer_from<_Bytes>;
return _Vp();
}
else if constexpr (_Nreg > 1)
return _InvalidInteger();
else
return _Bitmask<_S_size>();
}());
/** @internal
* Return the ABI tag with the same variant bits for @p _N2 elements in @p _Nreg2 registers.
*
* @p _Nreg2 defaults to @p _N2 divided by the current number of elements, rounded up. A
* request for a single element always yields _Abi<1, 1, _Var>, irrespective of @p _Nreg2.
*/
template <int _N2, int _Nreg2 = __div_ceil(_N2, _S_size)>
static consteval auto
_S_resize()
{
if constexpr (_N2 == 1)
return _Abi<1, 1, _Var>();
else
return _Abi<_N2, _Nreg2, _Var>();
}
};
/** @internal
* Alias for an _Abi specialization where the _AbiVariant bits are combined into a single integer
* value.
*
* Rationale: Consider diagnostic output and mangling of e.g. vec<int, 4> with AVX512. That's an
* alias for std::simd::basic_vec<int, std::simd::_Abi<4, 1, 1ull>>. If _AbiVariant were the
* template argument type of _Abi, the diagnostic output would be 'std::simd::basic_vec<int,
* std::simd::_Abi<4, 1, (std::simd::_AbiVariant)std::simd::_AbiVariant::_BitMask>>'. That's a lot
* longer, requires longer mangled names, and bakes the names of the enumerators into the ABI. As
* soon as bits of multiple _AbiVariants are combined, this becomes hard to parse for humans
* anyway.
*
* @tparam _Np The number of elements.
* @tparam _Nreg The number of registers.
* @tparam _Vs Zero or more variant flags; their values are combined with bitwise OR (an empty
* list yields 0).
*/
template <int _Np, int _Nreg, _AbiVariant... _Vs>
using _Abi_t = _Abi<_Np, _Nreg, (static_cast<underlying_type_t<_AbiVariant>>(_Vs) | ... | 0)>;
/** @internal
 * Result type of ABI tag deduction whenever no useful answer can be given. Its element count
 * is zero.
 */
struct _InvalidAbi
{
  static constexpr int _S_size = 0;
};
/** @internal
* Satisfied if @p _Tp is a valid simd ABI tag. This is a necessary but not sufficient condition
* for an enabled basic_vec/basic_mask specialization.
*
* The concept requires that
* - @c _Tp::_S_variant is declared with type `const _AbiVariant`,
* - `_Tp::_S_size >= _Tp::_S_nreg` and `_Tp::_S_nreg >= 1`, and
* - calling @c _S_resize with the tag's own size and register count yields @p _Tp again.
*/
template <typename _Tp>
concept __abi_tag
= same_as<decltype(_Tp::_S_variant), const _AbiVariant>
&& (_Tp::_S_size >= _Tp::_S_nreg) && (_Tp::_S_nreg >= 1)
&& requires(_Tp __x) {
{ __x.template _S_resize<_Tp::_S_size, _Tp::_S_nreg>() } -> same_as<_Tp>;
};
/** @internal
* Satisfied if @p _Tp is the _ScalarAbi specialization for its own size and also a valid ABI
* tag.
*/
template <typename _Tp>
concept __scalar_abi_tag
= same_as<_Tp, _ScalarAbi<_Tp::_S_size>> && __abi_tag<_Tp>;
// Determine if math functions must *raise* floating-point exceptions.
// math_errhandling may expand to an extern symbol, in which case we must assume fp exceptions
// need to be considered. A conforming C library must define math_errhandling, but in case it
// isn't defined we simply use the fallback.
//
// The two overloads are meant to be called with an int argument: the (int) overload is the
// better match and is used whenever it is declared and its constraint is satisfied, i.e. when
// the math_errhandling expression is usable as a template argument. Otherwise the call resolves
// to the (float) overload.
#ifdef math_errhandling
template <int = 0>
requires requires { typename bool_constant<0 != (math_errhandling & MATH_ERREXCEPT)>; }
consteval bool
__handle_fpexcept_impl(int)
{ return 0 != (math_errhandling & MATH_ERREXCEPT); }
#endif
// Fallback if math_errhandling doesn't work: implement correct exception behavior.
consteval bool
__handle_fpexcept_impl(float)
{ return true; }
/** @internal
* This type can be used as a template parameter for avoiding ODR violations, where code needs to
* differ depending on optimization flags (mostly fp-math related).
*
* Bit assignment of _M_build_flags (as set by the default member initializer):
* - bit 0: set unless __NO_TRAPPING_MATH__ is non-zero
* - bit 1: __FAST_MATH__
* - bit 2: __FINITE_MATH_ONLY__
* - bit 3: __NO_SIGNED_ZEROS__
* - bit 4: __RECIPROCAL_MATH__
* - bit 5: __NO_MATH_ERRNO__
* - bit 6: __ASSOCIATIVE_MATH__
* - bits 7-9: __FLT_EVAL_METHOD__ (0, 1, 2, or 3 for any other value)
* - bit 10: Annex G conformance for complex
* - bit 11: __SUPPORT_SNAN__
* - bit 12: result of __handle_fpexcept_impl(0)
*/
struct _OptTraits
{
// true iff bit number __bit of _M_build_flags is set
consteval bool
_M_test(int __bit) const
{ return ((_M_build_flags >> __bit) & 1) == 1; }
// true iff floating-point operations can signal an exception (allow non-default handler)
consteval bool
_M_fp_may_signal() const
{ return _M_test(0); }
// true iff floating-point operations can raise an exception flag
consteval bool
_M_fp_may_raise() const
{ return _M_test(12); }
consteval bool
_M_fast_math() const
{ return _M_test(1); }
consteval bool
_M_finite_math_only() const
{ return _M_test(2); }
consteval bool
_M_no_signed_zeros() const
{ return _M_test(3); }
// negation of _M_no_signed_zeros()
consteval bool
_M_signed_zeros() const
{ return !_M_test(3); }
consteval bool
_M_reciprocal_math() const
{ return _M_test(4); }
consteval bool
_M_no_math_errno() const
{ return _M_test(5); }
// negation of _M_no_math_errno()
consteval bool
_M_math_errno() const
{ return !_M_test(5); }
consteval bool
_M_associative_math() const
{ return _M_test(6); }
// requires bit 10 and is always false with finite-math-only
consteval bool
_M_conforming_to_STDC_annex_G() const
{ return _M_test(10) && !_M_finite_math_only(); }
consteval bool
_M_support_snan() const
{ return _M_test(11); }
// The initializer sums up distinct bits, depending on the predefined macros that are set for the
// current translation unit.
__UINT64_TYPE__ _M_build_flags
= 0
#if !__NO_TRAPPING_MATH__
+ (1 << 0)
#endif
+ (__handle_fpexcept_impl(0) << 12)
#if __FAST_MATH__
+ (1 << 1)
#endif
#if __FINITE_MATH_ONLY__
+ (1 << 2)
#endif
#if __NO_SIGNED_ZEROS__
+ (1 << 3)
#endif
#if __RECIPROCAL_MATH__
+ (1 << 4)
#endif
#if __NO_MATH_ERRNO__
+ (1 << 5)
#endif
#if __ASSOCIATIVE_MATH__
+ (1 << 6)
#endif
// bits 7, 8, and 9 reserved for __FLT_EVAL_METHOD__
#if __FLT_EVAL_METHOD__ == 1
+ (1 << 7)
#elif __FLT_EVAL_METHOD__ == 2
+ (2 << 7)
#elif __FLT_EVAL_METHOD__ != 0
+ (3 << 7)
#endif
// C Annex G defines the behavior of complex<T> where T is IEC60559 floating-point. If
// __STDC_IEC_60559_COMPLEX__ is defined then Annex G is implemented - and simd<complex>
// will do so as well. However, Clang never defines the macro.
#if defined __STDC_IEC_60559_COMPLEX__ || defined __STDC_IEC_559_COMPLEX__ || defined _GLIBCXX_CLANG
+ (1 << 10)
#endif
#if __SUPPORT_SNAN__
+ (1 << 11)
#endif
;
};
/** @internal
 * Test whether the C string @p __s is exactly "1".
 *
 * @param __s  A null-terminated string or a null pointer.
 * @return     true iff @p __s is not null and consists of the single character '1'.
 */
consteval bool
__streq_to_1(const char* __s)
{
  if (__s == nullptr)
    return false;
  if (__s[0] != '1')
    return false;
  // the string must end right after the '1'
  return __s[1] == '\0';
}
/** @internal
* If the macro given as @p feat is defined to 1, expands to a bit set at position @p off.
* Otherwise, expand to zero.
*
* @p feat is macro-expanded before it is stringified. Thus a macro defined to 1 yields the
* string "1", whereas an undefined macro yields its own name. The resulting string is compared
* against "1" and the boolean result is shifted to bit position @p off of a 64-bit value.
*/
#define _GLIBCXX_SIMD_ARCH_FLAG(off, feat) \
(static_cast<__UINT64_TYPE__>(std::simd::__streq_to_1(_GLIBCXX_SIMD_TOSTRING_IMPL(feat))) << off)
#if _GLIBCXX_X86
// Braced initializer for the flags of _ArchTraits on x86: one bit per ISA extension macro,
// set iff the corresponding macro is defined to 1. The bit positions used here (0 to 38) must
// match the bit numbers tested by the _M_have_* member functions of _ArchTraits.
#define _GLIBCXX_SIMD_ARCH_TRAITS_INIT { \
_GLIBCXX_SIMD_ARCH_FLAG(0, __MMX__) \
| _GLIBCXX_SIMD_ARCH_FLAG( 1, __SSE__) \
| _GLIBCXX_SIMD_ARCH_FLAG( 2, __SSE2__) \
| _GLIBCXX_SIMD_ARCH_FLAG( 3, __SSE3__) \
| _GLIBCXX_SIMD_ARCH_FLAG( 4, __SSSE3__) \
| _GLIBCXX_SIMD_ARCH_FLAG( 5, __SSE4_1__) \
| _GLIBCXX_SIMD_ARCH_FLAG( 6, __SSE4_2__) \
| _GLIBCXX_SIMD_ARCH_FLAG( 7, __POPCNT__) \
| _GLIBCXX_SIMD_ARCH_FLAG( 8, __AVX__) \
| _GLIBCXX_SIMD_ARCH_FLAG( 9, __F16C__) \
| _GLIBCXX_SIMD_ARCH_FLAG(10, __BMI__) \
| _GLIBCXX_SIMD_ARCH_FLAG(11, __BMI2__) \
| _GLIBCXX_SIMD_ARCH_FLAG(12, __LZCNT__) \
| _GLIBCXX_SIMD_ARCH_FLAG(13, __AVX2__) \
| _GLIBCXX_SIMD_ARCH_FLAG(14, __FMA__) \
| _GLIBCXX_SIMD_ARCH_FLAG(15, __AVX512F__) \
| _GLIBCXX_SIMD_ARCH_FLAG(16, __AVX512CD__) \
| _GLIBCXX_SIMD_ARCH_FLAG(17, __AVX512DQ__) \
| _GLIBCXX_SIMD_ARCH_FLAG(18, __AVX512BW__) \
| _GLIBCXX_SIMD_ARCH_FLAG(19, __AVX512VL__) \
| _GLIBCXX_SIMD_ARCH_FLAG(20, __AVX512BITALG__) \
| _GLIBCXX_SIMD_ARCH_FLAG(21, __AVX512VBMI__) \
| _GLIBCXX_SIMD_ARCH_FLAG(22, __AVX512VBMI2__) \
| _GLIBCXX_SIMD_ARCH_FLAG(23, __AVX512IFMA__) \
| _GLIBCXX_SIMD_ARCH_FLAG(24, __AVX512VNNI__) \
| _GLIBCXX_SIMD_ARCH_FLAG(25, __AVX512VPOPCNTDQ__) \
| _GLIBCXX_SIMD_ARCH_FLAG(26, __AVX512FP16__) \
| _GLIBCXX_SIMD_ARCH_FLAG(27, __AVX512BF16__) \
| _GLIBCXX_SIMD_ARCH_FLAG(28, __AVXIFMA__) \
| _GLIBCXX_SIMD_ARCH_FLAG(29, __AVXNECONVERT__) \
| _GLIBCXX_SIMD_ARCH_FLAG(30, __AVXVNNI__) \
| _GLIBCXX_SIMD_ARCH_FLAG(31, __AVXVNNIINT8__) \
| _GLIBCXX_SIMD_ARCH_FLAG(32, __AVXVNNIINT16__) \
| _GLIBCXX_SIMD_ARCH_FLAG(33, __AVX10_1__) \
| _GLIBCXX_SIMD_ARCH_FLAG(34, __AVX10_2__) \
| _GLIBCXX_SIMD_ARCH_FLAG(35, __AVX512VP2INTERSECT__) \
| _GLIBCXX_SIMD_ARCH_FLAG(36, __SSE4A__) \
| _GLIBCXX_SIMD_ARCH_FLAG(37, __FMA4__) \
| _GLIBCXX_SIMD_ARCH_FLAG(38, __XOP__) \
}
// Should this include __APX_F__? I don't think it's relevant for use in constexpr-if branches =>
// no ODR issue? The same could be said about several other flags above that are not checked
// anywhere.
/** @internal
* Captures the x86 ISA extensions that are enabled for the current translation unit.
*
* _M_flags holds one bit per extension (initialized via _GLIBCXX_SIMD_ARCH_TRAITS_INIT). Each
* _M_have_* member function tests the bit of the extension it is named after.
*/
struct _ArchTraits
{
__UINT64_TYPE__ _M_flags = _GLIBCXX_SIMD_ARCH_TRAITS_INIT;
// true iff bit number __bit of _M_flags is set
consteval bool
_M_test(int __bit) const
{ return ((_M_flags >> __bit) & 1) == 1; }
consteval bool
_M_have_mmx() const
{ return _M_test(0); }
consteval bool
_M_have_sse() const
{ return _M_test(1); }
consteval bool
_M_have_sse2() const
{ return _M_test(2); }
consteval bool
_M_have_sse3() const
{ return _M_test(3); }
consteval bool
_M_have_ssse3() const
{ return _M_test(4); }
consteval bool
_M_have_sse4_1() const
{ return _M_test(5); }
consteval bool
_M_have_sse4_2() const
{ return _M_test(6); }
consteval bool
_M_have_popcnt() const
{ return _M_test(7); }
consteval bool
_M_have_avx() const
{ return _M_test(8); }
consteval bool
_M_have_f16c() const
{ return _M_test(9); }
consteval bool
_M_have_bmi() const
{ return _M_test(10); }
consteval bool
_M_have_bmi2() const
{ return _M_test(11); }
consteval bool
_M_have_lzcnt() const
{ return _M_test(12); }
consteval bool
_M_have_avx2() const
{ return _M_test(13); }
consteval bool
_M_have_fma() const
{ return _M_test(14); }
consteval bool
_M_have_avx512f() const
{ return _M_test(15); }
consteval bool
_M_have_avx512cd() const
{ return _M_test(16); }
consteval bool
_M_have_avx512dq() const
{ return _M_test(17); }
consteval bool
_M_have_avx512bw() const
{ return _M_test(18); }
consteval bool
_M_have_avx512vl() const
{ return _M_test(19); }
consteval bool
_M_have_avx512bitalg() const
{ return _M_test(20); }
consteval bool
_M_have_avx512vbmi() const
{ return _M_test(21); }
consteval bool
_M_have_avx512vbmi2() const
{ return _M_test(22); }
consteval bool
_M_have_avx512ifma() const
{ return _M_test(23); }
consteval bool
_M_have_avx512vnni() const
{ return _M_test(24); }
consteval bool
_M_have_avx512vpopcntdq() const
{ return _M_test(25); }
consteval bool
_M_have_avx512fp16() const
{ return _M_test(26); }
consteval bool
_M_have_avx512bf16() const
{ return _M_test(27); }
consteval bool
_M_have_avxifma() const
{ return _M_test(28); }
consteval bool
_M_have_avxneconvert() const
{ return _M_test(29); }
consteval bool
_M_have_avxvnni() const
{ return _M_test(30); }
consteval bool
_M_have_avxvnniint8() const
{ return _M_test(31); }
consteval bool
_M_have_avxvnniint16() const
{ return _M_test(32); }
consteval bool
_M_have_avx10_1() const
{ return _M_test(33); }
consteval bool
_M_have_avx10_2() const
{ return _M_test(34); }
consteval bool
_M_have_avx512vp2intersect() const
{ return _M_test(35); }
consteval bool
_M_have_sse4a() const
{ return _M_test(36); }
consteval bool
_M_have_fma4() const
{ return _M_test(37); }
consteval bool
_M_have_xop() const
{ return _M_test(38); }
// true iff _Tp is _Float16 and AVX512-FP16 is not available
template <typename _Tp>
consteval bool
_M_eval_as_f32() const
{ return is_same_v<_Tp, _Float16> && !_M_have_avx512fp16(); }
};
/** @internal
* Determine the native ABI tag for element type @p _Tp from the ISA extensions in @p _Traits.
*
* Returns _InvalidAbi if @p _Tp is not vectorizable. Otherwise the first matching case applies:
* - AVX512-FP16: 64 bytes worth of @p _Tp, bit-mask variant
* - AVX512F: 64 bytes, bit-mask variant
* - _Float16 without F16C: _ScalarAbi<1>
* - AVX2, or AVX with a floating-point @p _Tp: 32 bytes
* - SSE2, or SSE with a floating-point @p _Tp of sizeof(float): 16 bytes
* - anything else: _ScalarAbi<1>
*
* Except for the AVX512-FP16 case, the element count of _Float16 is computed as if its elements
* were twice as large (__adj_sizeof).
*/
template <typename _Tp, _ArchTraits _Traits = {}>
consteval auto
__native_abi()
{
// sizeof(_Tp), doubled for _Float16
constexpr int __adj_sizeof = sizeof(_Tp) * (1 + is_same_v<_Tp, _Float16>);
if constexpr (!__vectorizable<_Tp>)
return _InvalidAbi();
else if constexpr (_Traits._M_have_avx512fp16())
return _Abi_t<64 / sizeof(_Tp), 1, _AbiVariant::_BitMask>();
else if constexpr (_Traits._M_have_avx512f())
return _Abi_t<64 / __adj_sizeof, 1, _AbiVariant::_BitMask>();
else if constexpr (is_same_v<_Tp, _Float16> && !_Traits._M_have_f16c())
return _ScalarAbi<1>();
else if constexpr (_Traits._M_have_avx2())
return _Abi_t<32 / __adj_sizeof, 1>();
else if constexpr (_Traits._M_have_avx() && is_floating_point_v<_Tp>)
return _Abi_t<32 / __adj_sizeof, 1>();
else if constexpr (_Traits._M_have_sse2())
return _Abi_t<16 / __adj_sizeof, 1>();
else if constexpr (_Traits._M_have_sse() && is_floating_point_v<_Tp>
&& sizeof(_Tp) == sizeof(float))
return _Abi_t<16 / __adj_sizeof, 1>();
// no MMX: we can't emit EMMS where it would be necessary
else
return _ScalarAbi<1>();
}
#else
// scalar fallback
/** @internal
 * Architecture traits for targets without dedicated support: no flag is ever set.
 */
struct _ArchTraits
{
  __UINT64_TYPE__ _M_flags = 0;

  // true iff bit number __bit of _M_flags is set
  constexpr bool
  _M_test(int __bit) const
  {
    const __UINT64_TYPE__ __shifted = _M_flags >> __bit;
    return (__shifted & 1) == 1;
  }
};
/** @internal
 * Scalar fallback for native ABI tag deduction: every vectorizable @p _Tp gets _ScalarAbi<1>,
 * everything else gets _InvalidAbi.
 */
template <typename _Tp>
  consteval auto
  __native_abi()
  {
    if constexpr (__vectorizable<_Tp>)
      return _ScalarAbi<1>();
    else
      return _InvalidAbi();
  }
#endif
/** @internal
 * Combination of _ArchTraits and _OptTraits.
 *
 * You must use this type as template argument to function templates that are not declared
 * always_inline (to avoid issues when linking code compiled with different compiler flags).
 */
struct _TargetTraits : _ArchTraits, _OptTraits {};
/** @internal
* Alias for an ABI tag such that basic_vec<_Tp, __native_abi_t<_Tp>> stores one SIMD register of
* optimal width.
*
* This is the return type of __native_abi<_Tp>().
*
* @tparam _Tp A vectorizable type.
*
* C++26 [simd.expos.abi]
*/
template <typename _Tp>
using __native_abi_t = decltype(std::simd::__native_abi<_Tp>());
/** @internal
 * Deduce an ABI tag for @p _Np elements of type @p _Tp.
 *
 * The result is _InvalidAbi if there is no native ABI tag for @p _Tp or if @p _Np is not
 * positive. If @p _Np equals the native size, the native ABI tag is returned unchanged.
 * Otherwise the native ABI tag is resized to @p _Np elements.
 */
template <typename _Tp, int _Np, _TargetTraits _Target = {}>
  consteval auto
  __deduce_abi()
  {
    constexpr auto __native_tag = std::simd::__native_abi<_Tp>();
    constexpr bool __usable = __native_tag._S_size != 0 && _Np > 0;
    if constexpr (!__usable)
      return _InvalidAbi();
    else if constexpr (__native_tag._S_size == _Np)
      return __native_tag;
    else
      return __native_tag.template _S_resize<_Np>();
  }
/** @internal
* Alias for an ABI tag @c A such that `basic_vec<_Tp, A>` stores @p _Np elements.
*
* This is the return type of __deduce_abi<_Tp, _Np>(), which is _InvalidAbi if no such tag can
* be deduced.
*
* C++26 [simd.expos.abi]
*/
template <typename _Tp, int _Np>
using __deduce_abi_t = decltype(std::simd::__deduce_abi<_Tp, _Np>());
/** @internal
* \c rebind implementation detail for basic_vec, and basic_mask where we know the destination
* value-type
*
* @tparam _Tp The destination value-type.
* @tparam _Np The destination number of elements.
* @tparam _A0 The ABI tag to rebind.
*
* Returns _InvalidAbi if @p _Np is not positive or @p _Tp is not vectorizable. A scalar @p _A0
* is simply resized. Otherwise the result is based on the native ABI tag for @p _Tp: if that is
* a scalar tag it is resized; if not, a single-register tag of native size, which inherits the
* mask-variant bits of @p _A0, is resized to @p _Np elements.
*/
template <typename _Tp, int _Np, __abi_tag _A0, _ArchTraits = {}>
consteval auto
__abi_rebind()
{
if constexpr (_Np <= 0 || !__vectorizable<_Tp>)
return _InvalidAbi();
else if constexpr (__scalar_abi_tag<_A0>)
return _A0::template _S_resize<_Np>();
else
{
using _Native = remove_const_t<decltype(std::simd::__native_abi<_Tp>())>;
static_assert(0 != _Native::_S_size);
// number of native registers needed for _Np elements
constexpr int __nreg = __div_ceil(_Np, _Native::_S_size);
if constexpr (__scalar_abi_tag<_Native>)
return _Native::template _S_resize<_Np>();
else
return _Abi_t<_Native::_S_size, 1, __filter_abi_variant(_A0::_S_variant,
_AbiVariant::_MaskVariants)
>::template _S_resize<_Np, __nreg>();
}
}
/** @internal
* @c rebind implementation detail for basic_mask.
*
* The important difference here is that we have no information about the actual value-type other
* than its @c sizeof. So `_Bytes == 8` could mean `complex<float>`, @c double, or @c int64_t.
* E.g. `_Np == 4` with AVX w/o AVX2 that's `vector(4) int`, `vector(4) long long`, or `2x
* vector(2) long long`.
* That's why this overload has the additional @p _IsOnlyResize parameter, which tells us that the
* value-type doesn't change.
*
* Returns _InvalidAbi if @p _Bytes is zero or @p _Np is not positive. A scalar @p _A0 is simply
* resized. In all remaining cases, except for the x86 special case below, the work is delegated
* to the value-type overload using the signed integer type of size @p _Bytes.
*/
template <size_t _Bytes, int _Np, __abi_tag _A0, bool _IsOnlyResize, _ArchTraits _Traits = {}>
consteval auto
__abi_rebind()
{
if constexpr (_Bytes == 0 || _Np <= 0)
return _InvalidAbi();
else if constexpr (__scalar_abi_tag<_A0>)
return _A0::template _S_resize<_Np>();
#if _GLIBCXX_X86
// AVX w/o AVX2:
// e.g. resize_t<8, mask<float, Whatever>> needs to be _Abi<8, 1> not _Abi<8, 2>
// We determine whether _A0 identifies an AVX vector by looking at the size of a native
// register. If it's 32, it's a YMM register, otherwise it's 16 or less.
else if constexpr (_IsOnlyResize
&& _Traits._M_have_avx() && !_Traits._M_have_avx2()
&& __bit_ceil(__div_ceil<unsigned>(
_A0::_S_size, _A0::_S_nreg)) * _Bytes == 32)
{
// rebind via the floating-point type of matching size
if constexpr (_Bytes == sizeof(double))
return __abi_rebind<double, _Np, _A0>();
else if constexpr (_Bytes == sizeof(float))
return __abi_rebind<float, _Np, _A0>();
else if constexpr (_Traits._M_have_f16c() && _Bytes == sizeof(_Float16))
return __abi_rebind<_Float16, _Np, _A0>();
else // impossible
static_assert(false);
}
#endif
else
return __abi_rebind<__integer_from<_Bytes>, _Np, _A0>();
}
/** @internal
* Returns true unless _GLIBCXX_SIMD_COND_EXPLICIT_MASK_CONVERSION is defined.
*
* On IvyBridge, (vec<float> == 0.f) == (rebind_t<int, vec<float>> == 0) does not compile. It does
* compile on basically every other target, though. This is due to the difference in ABI tag:
* _Abi<8, 1, [...]> vs. _Abi<8, 2, [...]> (8 elements, 1 vs. 2 registers).
* I know how to define this function for libstdc++ to avoid interconvertible masks. The question
* is whether we can specify this in general for C++29.
*
* Idea: Is rebind_t<integer-from<...>, mask>::abi_type the same type as
* deduce-t<integer-from<...>, mask::size()>? If yes, it's the "better" ABI tag. However, this
* makes the conversion behavior dependent on compiler flags. Probably not what we want.
*
* @tparam _To ABI tag of the destination mask.
* @tparam _From ABI tag of the source mask; must have the same size as @p _To.
* @param __b0 NOTE(review): presumably the element size in bytes of one of the two masks;
* confirm against the callers.
* @param __b1 NOTE(review): presumably the element size in bytes of the other mask.
*
* If _GLIBCXX_SIMD_COND_EXPLICIT_MASK_CONVERSION is defined, the conversion is explicit if
* @p __b0 and @p __b1 differ; otherwise the decision depends on the two ABI tags as commented
* below.
*/
template <typename _To, typename _From>
consteval bool
__is_mask_conversion_explicit([[maybe_unused]] size_t __b0, [[maybe_unused]] size_t __b1)
{
constexpr int __n = _To::_S_size;
static_assert(__n == _From::_S_size);
#ifndef _GLIBCXX_SIMD_COND_EXPLICIT_MASK_CONVERSION
/// C++26 [simd.mask.ctor] uses unconditional explicit
return true;
#else
if (__b0 != __b1)
return true;
// everything is better than _ScalarAbi, except when converting to a single bool
if constexpr (__scalar_abi_tag<_To>)
return __n > 1;
else if constexpr (__scalar_abi_tag<_From>)
return true;
// converting to a bit-mask is better
else if constexpr (_To::_S_is_vecmask != _From::_S_is_vecmask)
return _To::_S_is_vecmask; // to vector-mask is explicit
// with vec-masks, fewer registers is better
else if constexpr (_From::_S_nreg != _To::_S_nreg)
return _From::_S_nreg < _To::_S_nreg;
else
__builtin_unreachable();
#endif
}
/** @internal
* An alias for a signed integer type.
*
* libstdc++ unconditionally uses @c int here, since it matches the return type of
* 'Bit Operation Builtins' in GCC.
*
* C++26 [simd.expos.defn]
*/
using __simd_size_type = int;
// integral_constant shortcut
// __simd_size_c<N> is an object of type integral_constant<__simd_size_type, N>.
template <__simd_size_type _Xp>
inline constexpr integral_constant<__simd_size_type, _Xp> __simd_size_c = {};
// [simd.syn]
// Forward declarations of the data-parallel types and their size-based aliases.
// The ABI tag of basic_vec defaults to the native ABI tag for _Tp.
template <typename _Tp, typename _Ap = __native_abi_t<_Tp>>
class basic_vec;
// basic_vec with _Np elements; _Np defaults to the size of the native ABI tag for _Tp.
template <typename _Tp, __simd_size_type _Np = __native_abi_t<_Tp>::_S_size>
using vec = basic_vec<_Tp, __deduce_abi_t<_Tp, _Np>>;
// The ABI tag of basic_mask defaults to the native ABI tag of the signed integer type with
// sizeof equal to _Bytes.
template <size_t _Bytes, typename _Ap = __native_abi_t<__integer_from<_Bytes>>>
class basic_mask;
// basic_mask for _Np elements of type _Tp; only sizeof(_Tp) enters the mask type itself.
template <typename _Tp, __simd_size_type _Np = __native_abi_t<_Tp>::_S_size>
using mask = basic_mask<sizeof(_Tp), __deduce_abi_t<_Tp, _Np>>;
// [simd.ctor] load constructor constraints
// Satisfied if _Tp is a sized range whose size is usable as a constant expression. If _Np is
// given (i.e. differs from the default -1uz), the size must additionally be equal to _Np.
template <typename _Tp, size_t _Np = -1uz>
concept __static_sized_range
= ranges::sized_range<_Tp> && requires(_Tp&& __r) {
typename integral_constant<size_t, ranges::size(__r)>;
requires (_Np == -1uz || ranges::size(__r) == _Np);
};
/** @internal
 * Determine the size of the range @p __r if it is known at compile time.
 *
 * @param __r  The range to inspect.
 * @return     `ranges::size(__r)` if that is usable as a constant expression, otherwise
 *             dynamic_extent.
 */
template <typename _Rg>
  consteval size_t
  __static_range_size(_Rg& __r)
  {
    if constexpr (!requires { typename integral_constant<size_t, ranges::size(__r)>; })
      return dynamic_extent;
    else
      return ranges::size(__r);
  }
// [simd.general] value-preserving
// Satisfied if _From and _To are arithmetic types, _From is convertible to _To, and
// - the conversion does not go from a signed to an unsigned type,
// - _To has at least as many digits as _From, and
// - the range [lowest(), max()] of _From lies within the range of _To.
template <typename _From, typename _To>
concept __arithmetic_only_value_preserving_convertible_to
= convertible_to<_From, _To> && is_arithmetic_v<_From> && is_arithmetic_v<_To>
&& !(is_signed_v<_From> && is_unsigned_v<_To>)
&& numeric_limits<_From>::digits <= numeric_limits<_To>::digits
&& numeric_limits<_From>::max() <= numeric_limits<_To>::max()
&& numeric_limits<_From>::lowest() >= numeric_limits<_To>::lowest();
/** @internal
* Satisfied if the conversion from @p _From to @p _To is a value-preserving conversion.
*
* As defined here, this only covers conversions between arithmetic types.
*
* C++26 [simd.general]
*/
template <typename _From, typename _To>
concept __value_preserving_convertible_to
= __arithmetic_only_value_preserving_convertible_to<_From, _To>;
// LWG4420
// Satisfied if an expression of type _From can be converted to _To via static_cast.
template <typename _From, typename _To>
concept __explicitly_convertible_to = requires {
static_cast<_To>(declval<_From>());
};
/** @internal
* C++26 [simd.expos]
*
* Satisfied if @p _Tp has a static member @c value, is convertible to and equality comparable
* with the type of @c value, and if a value-initialized @p _Tp compares equal to @c value in a
* constant expression, both directly and after an explicit conversion to the type of @c value.
*/
template<typename _Tp>
concept __constexpr_wrapper_like
= convertible_to<_Tp, decltype(_Tp::value)>
&& equality_comparable_with<_Tp, decltype(_Tp::value)>
&& bool_constant<_Tp() == _Tp::value>::value
&& bool_constant<static_cast<decltype(_Tp::value)>(_Tp()) == _Tp::value>::value;
// [simd.ctor] explicit(...) of broadcast ctor
// Satisfied if the constant _From has arithmetic type and can be converted to _To without
// changing its value:
// - converting to _To and back yields the original value,
// - a negative value is not converted to an unsigned integral type, and
// - the value lies within [lowest(), max()] of _To.
template <auto _From, typename _To>
concept __non_narrowing_constexpr_conversion
= is_arithmetic_v<decltype(_From)>
&& static_cast<decltype(_From)>(static_cast<_To>(_From)) == _From
&& !(unsigned_integral<_To> && _From < decltype(_From)())
&& _From <= std::numeric_limits<_To>::max()
&& _From >= std::numeric_limits<_To>::lowest();
// [simd.ctor] p4
// This implements LWG4436 (submitted on 2025-10-28)
// Satisfied if one of the following holds for _From (after removing cv and reference):
// 4.1: it is neither an arithmetic type nor constexpr-wrapper-like, and _From is convertible
//      to _To,
// 4.2: its conversion to _To is value-preserving, or
// 4.3: it is constexpr-wrapper-like and the conversion of its value to _To is non-narrowing.
template <typename _From, typename _To>
concept __broadcast_constructible
= ((convertible_to<_From, _To> && !is_arithmetic_v<remove_cvref_t<_From>>
&& !__constexpr_wrapper_like<remove_cvref_t<_From>>) // 4.1
|| __value_preserving_convertible_to<remove_cvref_t<_From>, _To> // 4.2
|| (__constexpr_wrapper_like<remove_cvref_t<_From>> // 4.3
&& __non_narrowing_constexpr_conversion<auto(remove_cvref_t<_From>::value),
_To>));
/** @internal
 * Returns true iff @p _From and @p _To are different floating-point types and their common
 * type is @p _From.
 */
template <typename _From, typename _To>
  consteval bool
  __higher_floating_point_rank_than()
  {
    constexpr bool __both_fp = floating_point<_From> && floating_point<_To>;
    constexpr bool __from_is_common = is_same_v<common_type_t<_From, _To>, _From>;
    constexpr bool __distinct = !is_same_v<_From, _To>;
    return __both_fp && __from_is_common && __distinct;
  }
/** @internal
 * Returns true iff @p _From and @p _To are different integral types and @p _From either is
 * larger than @p _To or is the common type of both.
 */
template <typename _From, typename _To>
  consteval bool
  __higher_integer_rank_than()
  {
    constexpr bool __both_integral = integral<_From> && integral<_To>;
    constexpr bool __from_is_larger = sizeof(_From) > sizeof(_To);
    constexpr bool __from_is_common = is_same_v<common_type_t<_From, _To>, _From>;
    constexpr bool __distinct = !is_same_v<_From, _To>;
    return __both_integral && (__from_is_larger || __from_is_common) && __distinct;
  }
// Satisfied if _From has higher floating-point rank or higher integer rank than _To, as
// determined by __higher_floating_point_rank_than and __higher_integer_rank_than.
template <typename _From, typename _To>
concept __higher_rank_than
= __higher_floating_point_rank_than<_From, _To>() || __higher_integer_rank_than<_From, _To>();
// Flag type that permits non-value-preserving conversions in __loadstore_convertible_to. Only
// declared here.
struct __convert_flag;
// Satisfied if _From and _To are the same type, or if both are vectorizable and the conversion
// is either value-preserving or is an explicit conversion that was enabled by passing
// __convert_flag in _Flags.
template <typename _From, typename _To, typename... _Flags>
concept __loadstore_convertible_to
= same_as<_From, _To>
|| (__vectorizable<_From> && __vectorizable<_To>
&& (__value_preserving_convertible_to<_From, _To>
|| (__explicitly_convertible_to<_From, _To>
&& (std::is_same_v<_Flags, __convert_flag> || ...))));
// Satisfied if _From is convertible to _To and, in case _From is an arithmetic type, the
// conversion is value-preserving.
template <typename _From, typename _To>
concept __simd_generator_convertible_to
= std::convertible_to<_From, _To>
&& (!is_arithmetic_v<_From> || __value_preserving_convertible_to<_From, _To>);
// Helper for __simd_generator_invokable (declaration only): viable only if, for every index in
// _Is, the result of calling _Fp with __simd_size_c<index> satisfies
// __simd_generator_convertible_to<..., _Tp>.
template <typename _Fp, typename _Tp, __simd_size_type... _Is>
requires (__simd_generator_convertible_to<
decltype(declval<_Fp>()(__simd_size_c<_Is>)), _Tp> && ...)
constexpr void
__simd_generator_invokable_impl(integer_sequence<__simd_size_type, _Is...>);
// Satisfied if _Fp can be called with __simd_size_c<i> for all i in [0, _Np) and every result
// is acceptable as an element of type _Tp (see __simd_generator_convertible_to).
template <typename _Fp, typename _Tp, __simd_size_type _Np>
concept __simd_generator_invokable = requires {
__simd_generator_invokable_impl<_Fp, _Tp>(make_integer_sequence<__simd_size_type, _Np>());
};
// Satisfied if a const _Fp can be called with two integer arguments and returns an integral
// type.
template <typename _Fp>
concept __index_permutation_function_sized = requires(_Fp const& __f)
{
{ __f(0, 0) } -> std::integral;
};
// Satisfied if a const _Fp can be called with either two or one integer argument(s) and
// returns an integral type. The _Simd parameter is not used by the constraint itself.
template <typename _Fp, typename _Simd>
concept __index_permutation_function
= __index_permutation_function_sized<_Fp> || requires(_Fp const& __f) {
{ __f(0) } -> std::integral;
};
/** @internal
* The value of the @c _Bytes template argument to a @c basic_mask specialization.
*
* The value is 0 for every type that is not a basic_mask specialization with a valid ABI tag.
*
* C++26 [simd.expos.defn]
*/
template <typename _Tp>
constexpr size_t __mask_element_size = 0;
template <size_t _Bytes, __abi_tag _Ap>
constexpr size_t __mask_element_size<basic_mask<_Bytes, _Ap>> = _Bytes;
// [simd.expos]
/** @internal
 * Satisfied if @p _Vp is exactly the basic_vec specialization named by its own @c value_type
 * and @c abi_type members and is default-constructible.
 */
template <typename _Vp>
concept __simd_vec_type
= same_as<_Vp, basic_vec<typename _Vp::value_type, typename _Vp::abi_type>>
&& is_default_constructible_v<_Vp>;
/** @internal
 * Satisfied if @p _Vp is exactly the basic_mask specialization named by
 * __mask_element_size<_Vp> and its own @c abi_type member and is default-constructible.
 */
template <typename _Vp>
concept __simd_mask_type
= same_as<_Vp, basic_mask<__mask_element_size<_Vp>, typename _Vp::abi_type>>
&& is_default_constructible_v<_Vp>;
/** @internal
 * Satisfied if @p _Vp is a data-parallel type, i.e. it satisfies __simd_vec_type or
 * __simd_mask_type.
 */
template <typename _Vp>
concept __simd_vec_or_mask_type = __simd_vec_type<_Vp> || __simd_mask_type<_Vp>;
/** @internal
 * Satisfied if @p _Vp is a vec type (__simd_vec_type) whose @c value_type is a floating-point
 * type.
 */
template <typename _Vp>
concept __simd_floating_point
= __simd_vec_type<_Vp> && floating_point<typename _Vp::value_type>;
/** @internal
 * Satisfied if @p _Vp is a vec type (__simd_vec_type) whose @c value_type is an integral type.
 */
template <typename _Vp>
concept __simd_integral
= __simd_vec_type<_Vp> && integral<typename _Vp::value_type>;
/** @internal
 * Satisfied if adding two `const _Tp&` operands yields a type that satisfies __simd_vec_type.
 */
template <typename _Tp>
concept __converts_to_vec
= __simd_vec_type<decltype(declval<const _Tp&>() + declval<const _Tp&>())>;
/** @internal
 * The vec type that results from adding two `const _Tp&` operands.
 */
template <__converts_to_vec _Tp>
using __deduced_vec_t = decltype(declval<const _Tp&>() + declval<const _Tp&>());
/** @internal
 * With @c _Up the type of adding two `const _Tp&` operands: @c _Up itself if it already is a
 * vec type (__simd_vec_type), otherwise `vec<_Up, _Vp::size()>`.
 *
 * The lambda is never called; only its return type is used. The `if constexpr` ensures that
 * `vec<_Up, _Vp::size()>` is named only in the branch that is actually selected.
 */
template <typename _Vp, typename _Tp>
using __make_compatible_simd_t
= decltype([] {
using _Up = decltype(declval<const _Tp&>() + declval<const _Tp&>());
if constexpr (__simd_vec_type<_Up>)
return _Up();
else
return vec<_Up, _Vp::size()>();
}());
/** @internal
 * Satisfied if __deduced_vec_t<_Tp> is a vec type with floating-point @c value_type.
 */
template <typename _Tp>
concept __math_floating_point = __simd_floating_point<__deduced_vec_t<_Tp>>;
/** @internal
 * Satisfied if a const @p _BinaryOperation object can be called with two const `vec<_Tp, 1>`
 * arguments and the result type is exactly `vec<_Tp, 1>`.
 */
template <typename _BinaryOperation, typename _Tp>
concept __reduction_binary_operation
= requires (const _BinaryOperation __binary_op, const vec<_Tp, 1> __v) {
{ __binary_op(__v, __v) } -> same_as<vec<_Tp, 1>>;
};
/** @internal
 * Returns the highest index @c i where `(__bits >> i) & 1` equals @c 1.
 *
 * The result is the index of the most significant digit of the argument type minus the number
 * of leading zero bits of @p __bits.
 */
[[__gnu__::__always_inline__]]
constexpr __simd_size_type
__highest_bit(std::unsigned_integral auto __bits)
{
  using _Bits = decltype(__bits);
  // index of the most significant bit position of _Bits
  constexpr auto __top_index = __gnu_cxx::__int_traits<_Bits>::__digits - 1;
  return __top_index - __countl_zero(__bits);
}
/** @internal
 * The basic_mask specialization with element size `sizeof(_Tp)` and the ABI tag type returned
 * by `__abi_rebind<_Tp, _Np, _Ap>()`.
 */
template <__vectorizable _Tp, __simd_size_type _Np, __abi_tag _Ap>
using __similar_mask = basic_mask<sizeof(_Tp), decltype(__abi_rebind<_Tp, _Np, _Ap>())>;
/** @internal
 * The basic_vec specialization with value type @p _Tp and the ABI tag type returned by
 * `__abi_rebind<_Tp, _Np, _Ap>()`.
 */
// Allow _Tp to be _InvalidInteger for __integer_from<16>
// (which is why _Tp is an unconstrained `typename` here)
template <typename _Tp, __simd_size_type _Np, __abi_tag _Ap>
using __similar_vec = basic_vec<_Tp, decltype(__abi_rebind<_Tp, _Np, _Ap>())>;
// LWG4470 [simd.expos]
/** @internal
 * The vec type associated with a mask of element size @p _Bytes and ABI tag @p _Ap:
 * __similar_vec with value type `__integer_from<_Bytes>` and `_Ap::_S_size` elements.
 */
template <size_t _Bytes, typename _Ap>
using __simd_vec_from_mask_t = __similar_vec<__integer_from<_Bytes>, _Ap::_S_size, _Ap>;
// What `__glibcxx_on_bad_value_preserving_cast()` does when __value_preserving_cast detects a
// value that does not survive the conversion:
// - With _GLIBCXX_SIMD_THROW_ON_BAD_VALUE, __bad_value_preserving_cast is an empty class and the
//   macro expands to `throw __bad_value_preserving_cast`; together with the trailing `()` this
//   throws a value-initialized object of that class.
// - Otherwise __bad_value_preserving_cast is a function that is declared but not defined and not
//   constexpr; the macro expands to its name, so the trailing `()` forms a call that cannot
//   appear in a constant expression.
#if _GLIBCXX_SIMD_THROW_ON_BAD_VALUE // used for unit tests (also see P3844)
class __bad_value_preserving_cast
{};
#define __glibcxx_on_bad_value_preserving_cast throw __bad_value_preserving_cast
#else
void __bad_value_preserving_cast(); // not defined
#define __glibcxx_on_bad_value_preserving_cast __bad_value_preserving_cast
#endif
/** @internal
 * Converts @p __x to @p _To at compile time and rejects values that change in the process.
 *
 * If the conversion from @p _From to @p _To is value-preserving by type, this is a plain
 * @c static_cast. Otherwise the value is checked and
 * `__glibcxx_on_bad_value_preserving_cast()` is invoked if
 * - converting to @p _To and back (compared as the unsigned counterpart of @p _From) does not
 *   reproduce @p __x, or
 * - @p _From is signed, @p _To is unsigned, and @p __x is negative, or
 * - @p _From is an unsigned integer, @p _To is a signed integer, and @p __x exceeds the maximum
 *   of @p _To.
 *
 * @tparam _To    The destination type.
 * @tparam _From  The source type; must be arithmetic.
 * @param __x     The value to convert.
 * @return `static_cast<_To>(__x)`
 */
template <typename _To, typename _From>
#if _GLIBCXX_SIMD_THROW_ON_BAD_VALUE // see P3844
  [[__gnu__::__optimize__("exceptions")]] // work around potential -fno-exceptions
#endif
  consteval _To
  __value_preserving_cast(const _From& __x)
  {
    static_assert(is_arithmetic_v<_From>);
    if constexpr (!__value_preserving_convertible_to<_From, _To>)
      {
	using _Up = typename __make_unsigned<_From>::__type;
	// round trip through _To must reproduce the value
	bool __value_changed
	  = static_cast<_Up>(static_cast<_To>(__x)) != static_cast<_Up>(__x);
	// the round trip cannot detect a sign flip between equally sized types
	if constexpr (is_signed_v<_From> && is_unsigned_v<_To>)
	  __value_changed = __value_changed || (__x < _From());
	else if constexpr (unsigned_integral<_From> && signed_integral<_To>)
	  __value_changed = __value_changed || (__x > numeric_limits<_To>::max());
	if (__value_changed)
	  __glibcxx_on_bad_value_preserving_cast();
      }
    return static_cast<_To>(__x);
  }
/** @internal
 * Satisfied if all of the following hold:
 * - @p _From is explicitly and implicitly convertible to @p _To,
 * - `remove_cvref_t<_From>` is an arithmetic type,
 * - the conversion from `remove_cvref_t<_From>` to @p _To is not value-preserving by type, and
 * - `common_type_t<_From, _To>` is @p _To, or @p _From is (cv/ref-qualified) @c int and @p _To
 *   is integral, or @p _From is (cv/ref-qualified) @c unsigned and @p _To is an unsigned integer.
 *
 * NOTE(review): presumably selects the broadcast overload that validates the value at compile
 * time (see __value_preserving_cast); confirm against the basic_vec constructors.
 */
template <typename _From, typename _To>
concept __simd_vec_bcast_consteval
= __explicitly_convertible_to<_From, _To>
&& is_arithmetic_v<remove_cvref_t<_From>> && convertible_to<_From, _To>
&& !__value_preserving_convertible_to<remove_cvref_t<_From>, _To>
&& (is_same_v<common_type_t<_From, _To>, _To>
|| (is_same_v<remove_cvref_t<_From>, int> && is_integral_v<_To>)
|| (is_same_v<remove_cvref_t<_From>, unsigned> && unsigned_integral<_To>));
/** @internal
 * std::pair is not trivially copyable, this one is
 *
 * A plain aggregate of two members without any user-declared special member functions.
 */
template <typename _T0, typename _T1>
struct __trivial_pair
{
_T0 _M_first; // first element
_T1 _M_second; // second element
};
/** @internal
 * Satisfied if @p _From is implicitly convertible to @p _To, both types have the same size,
 * and both agree on being integral as well as on being floating-point.
 */
template <typename _From, typename _To>
concept __converts_trivially = convertible_to<_From, _To>
&& sizeof(_From) == sizeof(_To)
&& is_integral_v<_From> == is_integral_v<_To>
&& is_floating_point_v<_From> == is_floating_point_v<_To>;
/** @internal
 * Calls @p __fun once for every set bit in @p __bits, passing the bit's index, from the least
 * significant set bit to the most significant one.
 *
 * @param __bits  The bit set to iterate over; its type must be at least as large as @c int.
 * @param __fun   Callable invoked with the result of `__countr_zero` for each set bit.
 */
[[__gnu__::__always_inline__]]
constexpr void
__bit_foreach(unsigned_integral auto __bits, auto&& __fun)
{
  static_assert(sizeof(__bits) >= sizeof(int)); // avoid promotion to int
  // `__bits & (__bits - 1)` clears the lowest set bit after it has been reported
  for (; __bits != 0; __bits &= (__bits - 1))
    __fun(__countr_zero(__bits));
}
/** @internal
 * Optimized @c memcpy for use in partial loads and stores.
 *
 * The implementation uses at most two fixed-size power-of-2 @c memcpy calls and reduces the
 * number of branches to a minimum. The variable size is achieved by overlapping two @c memcpy
 * calls: the first one copies the leading @c R bytes, the second one copies a block that ends
 * exactly at the last requested byte.
 *
 * Each size class handles the byte counts in the half-open range [R, 2R) (R = 32 also handles
 * 64). Since the byte count is a multiple of @p _Chunk, the second copy can be shortened (or
 * dropped) for large chunk sizes: with `_Chunk == R` the only possible count is R itself, with
 * `_Chunk == R / 2` the possible counts are R and R + R/2.
 *
 * NOTE(review): size classes are pruned at compile time via `_Max > R`. For `_Max <= 32` this
 * appears to rely on the copy being partial, i.e. `_Chunk * __n < _Max`; confirm against the
 * callers.
 *
 * @tparam _Chunk Copies @p __n times @p _Chunk bytes.
 * @tparam _Max Copy no more than @p _Max bytes.
 *
 * @param __dst The destination pointer.
 * @param __src The source pointer.
 * @param __n The number of chunks that need to be copied.
 */
template <size_t _Chunk, size_t _Max>
  inline void
  __memcpy_chunks(byte* __restrict__ __dst, const byte* __restrict__ __src,
		  size_t __n)
  {
    static_assert(_Max <= 64);
    static_assert(__has_single_bit(_Chunk) && _Chunk <= 8);
    size_t __bytes = _Chunk * __n;
    if (__builtin_constant_p(__bytes))
      { // If __n is known via constant propagation use a single memcpy call. Since this is still
	// a fixed-size memcpy to the compiler, this leaves more room for optimization.
	__builtin_memcpy(__dst, __src, __bytes);
      }
    else if (__bytes >= 32 && _Max > 32)
      { // 32 <= __bytes <= 64: leading 32 bytes + trailing 32 bytes
	__builtin_memcpy(__dst, __src, 32);
	__bytes -= 32;
	__builtin_memcpy(__dst + __bytes, __src + __bytes, 32);
      }
    else if (__bytes >= 16 && _Max > 16)
      { // 16 <= __bytes < 32
	__builtin_memcpy(__dst, __src, 16);
	if constexpr (_Chunk == 8)
	  { // __bytes is 16 or 24: the trailing 8 bytes complete the copy
	    __bytes -= 8;
	    __builtin_memcpy(__dst + __bytes, __src + __bytes, 8);
	  }
	else
	  {
	    __bytes -= 16;
	    __builtin_memcpy(__dst + __bytes, __src + __bytes, 16);
	  }
      }
    else if (__bytes >= 8 && _Max > 8)
      { // 8 <= __bytes < 16; with _Chunk == 8 the leading copy already is the complete copy
	__builtin_memcpy(__dst, __src, 8);
	if constexpr (_Chunk == 4)
	  { // __bytes is 8 or 12
	    __bytes -= 4;
	    __builtin_memcpy(__dst + __bytes, __src + __bytes, 4);
	  }
	else if constexpr (_Chunk < 4)
	  {
	    __bytes -= 8;
	    __builtin_memcpy(__dst + __bytes, __src + __bytes, 8);
	  }
      }
    else if (__bytes >= 4 && _Max > 4)
      { // 4 <= __bytes < 8; with _Chunk == 4 the leading copy already is the complete copy
	__builtin_memcpy(__dst, __src, 4);
	if constexpr (_Chunk == 2)
	  { // __bytes is 4 or 6
	    __bytes -= 2;
	    __builtin_memcpy(__dst + __bytes, __src + __bytes, 2);
	  }
	else if constexpr (_Chunk == 1)
	  {
	    __bytes -= 4;
	    __builtin_memcpy(__dst + __bytes, __src + __bytes, 4);
	  }
      }
    else if (__bytes >= 2)
      {
	__builtin_memcpy(__dst, __src, 2);
	if constexpr (_Chunk == 2)
	  {
	    __bytes -= 2;
	    __builtin_memcpy(__dst + __bytes, __src + __bytes, 2);
	  }
	else if constexpr (_Chunk == 1)
	  { // __bytes is 2 or 3
	    __bytes -= 1;
	    __builtin_memcpy(__dst + __bytes, __src + __bytes, 1);
	  }
      }
    else if (__bytes == 1)
      __builtin_memcpy(__dst, __src, 1);
  }
// [simd.reductions] identity_element = *see below*
/** @internal
 * The default identity element of type @p _Tp for the reduction @p _BinaryOperation.
 *
 * - `multiplies<>`: `_Tp(1)`
 * - `bit_and<>`: `_Tp(~_Tp())`
 * - `plus<>`, `bit_or<>`, `bit_xor<>`: `_Tp(0)`
 */
template <typename _Tp, typename _BinaryOperation>
  requires __is_one_of<_BinaryOperation,
		       plus<>, multiplies<>, bit_and<>, bit_or<>, bit_xor<>>::value
  consteval _Tp
  __default_identity_element()
  {
    constexpr bool __is_bit_and = same_as<_BinaryOperation, bit_and<>>;
    constexpr bool __is_product = same_as<_BinaryOperation, multiplies<>>;
    // `if constexpr` keeps `~_Tp()` from being instantiated for the other operations
    if constexpr (__is_bit_and)
      return _Tp(~_Tp());
    else if constexpr (__is_product)
      return _Tp(1);
    else
      return _Tp(0);
  }
} // namespace simd
_GLIBCXX_END_NAMESPACE_VERSION
} // namespace std
#pragma GCC diagnostic pop
#endif // C++26
#endif // _GLIBCXX_SIMD_DETAILS_H