// blob: 9ca0db02c0cdc25565a186ebe175c4313a65326c [file] [log] [blame]
// Implementation of <simd> -*- C++ -*-
// Copyright The GNU Toolchain Authors.
//
// This file is part of the GNU ISO C++ Library. This library is free
// software; you can redistribute it and/or modify it under the
// terms of the GNU General Public License as published by the
// Free Software Foundation; either version 3, or (at your option)
// any later version.
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// Under Section 7 of GPL version 3, you are granted additional
// permissions described in the GCC Runtime Library Exception, version
// 3.1, as published by the Free Software Foundation.
// You should have received a copy of the GNU General Public License and
// a copy of the GCC Runtime Library Exception along with this program;
// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
// <http://www.gnu.org/licenses/>.
#ifndef _GLIBCXX_VEC_OPS_H
#define _GLIBCXX_VEC_OPS_H 1
#ifdef _GLIBCXX_SYSHDR
#pragma GCC system_header
#endif
#if __cplusplus >= 202400L
#include "simd_details.h"
#include <bit>
#include <bits/utility.h>
// psabi warnings are bogus because the ABI of the internal types never leaks into user code
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wpsabi"
namespace std _GLIBCXX_VISIBILITY(default)
{
_GLIBCXX_BEGIN_NAMESPACE_VERSION
namespace simd
{
/**
 * Tell whether the signed integer @p __x has exactly one bit set once it is converted to the
 * corresponding unsigned type.
 */
template <std::signed_integral _Tp>
  constexpr bool
  __signed_has_single_bit(_Tp __x)
  {
    using _Up = make_unsigned_t<_Tp>;
    return __has_single_bit(static_cast<_Up>(__x));
  }
/**
 * Alias for a vector builtin with given value type and total sizeof.
 *
 * @tparam _Tp     Element type of the vector; must satisfy __vectorizable.
 * @tparam _Bytes  Total size of the vector type in bytes; the requires-clause only admits
 *                 values with a single bit set (i.e. powers of two).
 *
 * The vector type is produced by applying the GNU vector_size attribute to @p _Tp.
 */
template <__vectorizable _Tp, size_t _Bytes>
requires (__has_single_bit(_Bytes))
using __vec_builtin_type_bytes [[__gnu__::__vector_size__(_Bytes)]] = _Tp;
/**
 * Alias for a vector builtin with given value type @p _Tp and @p _Width.
 *
 * @tparam _Tp     Element type of the vector; must satisfy __vectorizable.
 * @tparam _Width  Number of elements; the requires-clause only admits values whose unsigned
 *                 representation has a single bit set.
 *
 * Implemented in terms of __vec_builtin_type_bytes with a total size of sizeof(_Tp) * _Width.
 */
template <__vectorizable _Tp, __simd_size_type _Width>
requires (__signed_has_single_bit(_Width))
using __vec_builtin_type = __vec_builtin_type_bytes<_Tp, sizeof(_Tp) * _Width>;
/**
 * Constrain to any vector builtin with given value type and optional width.
 *
 * @tparam _Tp         The type to test.
 * @tparam _ValueType  The required element type.
 * @tparam _Width      The required number of elements. Defaults to
 *                     sizeof(_Tp) / sizeof(_ValueType), i.e. "any width".
 *
 * The conjunction is evaluated left to right; the cheap type-category checks come first.
 */
template <typename _Tp, typename _ValueType,
__simd_size_type _Width = sizeof(_Tp) / sizeof(_ValueType)>
concept __vec_builtin_of
// class types, pointers, and arithmetic types are never vector builtins
= !is_class_v<_Tp> && !is_pointer_v<_Tp> && !is_arithmetic_v<_Tp>
&& __vectorizable<_ValueType>
// the size ratio must match the requested width
&& _Width >= 1 && sizeof(_Tp) / sizeof(_ValueType) == _Width
// _Tp must be exactly the vector builtin of _ValueType with the same sizeof
&& same_as<__vec_builtin_type_bytes<_ValueType, sizeof(_Tp)>, _Tp>
// element assignment via subscript must be well-formed
&& requires(_Tp& __v, _ValueType __x) { __v[0] = __x; };
/**
 * Constrain to any vector builtin.
 *
 * The candidate value type is taken from the (cv/ref-stripped) type of subscripting a
 * const @p _Tp; the check is then delegated to __vec_builtin_of with its default width.
 */
template <typename _Tp>
concept __vec_builtin
= __vec_builtin_of<_Tp, remove_cvref_t<decltype(declval<const _Tp>()[0])>>;
/**
 * Alias for the value type of the given __vec_builtin type @p _Tp.
 *
 * This is the cv/ref-stripped type of the expression that subscripts a const @p _Tp with 0.
 */
template <__vec_builtin _Tp>
using __vec_value_type = remove_cvref_t<decltype(declval<const _Tp>()[0])>;
/**
 * The width (number of value_type elements) of the given vector builtin or arithmetic type.
 *
 * The primary template yields 1 for every type. The constrained partial specialization below
 * is selected for vector builtins and yields sizeof(_Tp) / sizeof(value type).
 */
template <typename _Tp>
inline constexpr __simd_size_type __width_of = 1;
// Partial specialization for vector builtins: number of elements from the size ratio.
template <typename _Tp>
requires __vec_builtin<_Tp>
inline constexpr __simd_size_type __width_of<_Tp> = sizeof(_Tp) / sizeof(__vec_value_type<_Tp>);
/**
 * Alias for a vector builtin with equal value type and new width @p _Np.
 *
 * @tparam _Np  The new number of elements (subject to the constraints of __vec_builtin_type).
 * @tparam _TV  The vector builtin that supplies the value type.
 */
template <__simd_size_type _Np, __vec_builtin _TV>
using __resize_vec_builtin_t = __vec_builtin_type<__vec_value_type<_TV>, _Np>;
/**
 * Alias for a vector builtin with the value type of @p _TV and half its width.
 *
 * Only available if @p _TV has more than one element.
 */
template <__vec_builtin _TV>
requires (__width_of<_TV> > 1)
using __half_vec_builtin_t = __resize_vec_builtin_t<__width_of<_TV> / 2, _TV>;
/**
 * Alias for a vector builtin with the value type of @p _TV and twice its width.
 */
template <__vec_builtin _TV>
using __double_vec_builtin_t = __resize_vec_builtin_t<__width_of<_TV> * 2, _TV>;
/**
 * Reinterpret the vector builtin @p __v as a vector builtin with value type @p _Up and the same
 * sizeof as @p _TV.
 */
template <typename _Up, __vec_builtin _TV>
  [[__gnu__::__always_inline__]]
  constexpr __vec_builtin_type_bytes<_Up, sizeof(_TV)>
  __vec_bit_cast(_TV __v)
  {
    using _UV = __vec_builtin_type_bytes<_Up, sizeof(_TV)>;
    return reinterpret_cast<_UV>(__v);
  }
/**
 * Mask vector of type @p _TV whose first @p _Np elements are -1 and whose remaining elements
 * are 0.
 *
 * @tparam _Np  Number of leading elements that are set to -1.
 * @tparam _TV  Vector builtin with a signed integral value type.
 *
 * Declared `inline` (not `static`): a namespace-scope `static` variable template in a header
 * has internal linkage, so every translation unit would get a distinct entity while inline
 * functions from this header refer to it. `inline constexpr` gives one entity program-wide.
 */
template <int _Np, __vec_builtin _TV>
  requires signed_integral<__vec_value_type<_TV>>
  inline constexpr _TV _S_vec_implicit_mask
    = []<int... _Is> (integer_sequence<int, _Is...>) {
        // element i is -1 if i < _Np, otherwise 0
        return _TV{ (_Is < _Np ? -1 : 0)... };
      } (make_integer_sequence<int, __width_of<_TV>>());
/**
 * Read element @p __i of vector builtin @p __v.
 *
 * Exists because Clang does not allow v[i] in constant expressions: on Clang, during constant
 * evaluation, the vector is bit-cast to an array and the array is subscripted instead.
 */
template <__vec_builtin _TV>
  [[__gnu__::__always_inline__]]
  constexpr __vec_value_type<_TV>
  __vec_get(_TV __v, int __i)
  {
#ifdef _GLIBCXX_CLANG
    if consteval
      {
        using _Ap = array<__vec_value_type<_TV>, __width_of<_TV>>;
        const _Ap __elems = __builtin_bit_cast(_Ap, __v);
        return __elems[__i];
      }
#endif
    // plain subscript: always at run time, and also at compile time unless on Clang
    return __v[__i];
  }
/**
 * Assign @p __x to element @p __i of vector builtin @p __v.
 *
 * Exists because neither Clang nor GCC allow assignment to v[i] in constant expressions.
 * During constant evaluation the whole vector is rebuilt instead of assigning one element.
 */
template <__vec_builtin _TV>
  [[__gnu__::__always_inline__]]
  constexpr void
  __vec_set(_TV& __v, int __i, __vec_value_type<_TV> __x)
  {
    if !consteval
      {
        // run time: plain element assignment
        __v[__i] = __x;
      }
    else
      {
#ifdef _GLIBCXX_CLANG
        // round-trip through an array, which can be modified element-wise
        using _Ap = array<__vec_value_type<_TV>, __width_of<_TV>>;
        _Ap __elems = __builtin_bit_cast(_Ap, __v);
        __elems[__i] = __x;
        __v = __builtin_bit_cast(_TV, __elems);
#else
        // build a new vector: take __x at position __i and the old value everywhere else
        constexpr auto [...__k] = _IotaArray<__width_of<_TV>>;
        __v = _TV{(__k == __i ? __x : __v[__k])...};
#endif
      }
  }
/** @internal
 * Concatenate two equally-typed vector builtins.
 *
 * @return A vector builtin of twice the width holding all elements of @p __a followed by all
 *         elements of @p __b.
 */
template <__vec_builtin _TV>
  [[__gnu__::__always_inline__]]
  constexpr __vec_builtin_type<__vec_value_type<_TV>, __width_of<_TV> * 2>
  __vec_concat(_TV __a, _TV __b)
  {
    // identity permutation over the elements of (__a, __b)
    constexpr auto [...__idx] = _IotaArray<__width_of<_TV> * 2>;
    return __builtin_shufflevector(__a, __b, __idx...);
  }
/** @internal
 * Concatenate the first @p _N0 elements from @p __a with the first @p _N1 elements from @p __b
 * with the elements from applying this function recursively to @p __rest.
 *
 * @pre _N0 <= __width_of<_TV0> && _N1 <= __width_of<_TV1> && _Ns <= __width_of<_TVs> && ...
 *
 * Strategy: Aim for a power-of-2 tree concat. E.g.
 * - cat(2, 2, 2, 2) -> cat(4, 2, 2) -> cat(4, 4)
 * - cat(2, 2, 2, 2, 8) -> cat(4, 2, 2, 8) -> cat(4, 4, 8) -> cat(8, 8)
 *
 * @return A vector builtin whose width is the sum of all sizes rounded up to a power of 2.
 *
 * This is only the forward declaration of the generic overload (defined further down), so
 * that the more constrained overload that follows can call it.
 */
template <int _N0, int _N1, int... _Ns, __vec_builtin _TV0, __vec_builtin _TV1,
__vec_builtin... _TVs>
[[__gnu__::__always_inline__]]
constexpr __vec_builtin_type<__vec_value_type<_TV0>,
__bit_ceil(unsigned(_N0 + (_N1 + ... + _Ns)))>
__vec_concat_sized(const _TV0& __a, const _TV1& __b, const _TVs&... __rest);
/** @internal
 * Overload of __vec_concat_sized for three or more inputs, selected when @p _N0 is a power of 2
 * and at least as large as @p _N1 + @p _N2.
 *
 * Instead of appending @p __b to @p __a right away, @p __b and @p __c are joined first and the
 * result takes the place of both in the recursive call.
 */
template <int _N0, int _N1, int _N2, int... _Ns, __vec_builtin _TV0, __vec_builtin _TV1,
          __vec_builtin _TV2, __vec_builtin... _TVs>
  requires (__has_single_bit(unsigned(_N0))) && (_N0 >= (_N1 + _N2))
  [[__gnu__::__always_inline__]]
  constexpr __vec_builtin_type<__vec_value_type<_TV0>,
                               __bit_ceil(unsigned(_N0 + _N1 + (_N2 + ... + _Ns)))>
  __vec_concat_sized(const _TV0& __a, const _TV1& __b, const _TV2& __c, const _TVs&... __rest)
  {
    // join the two smaller pieces first ...
    const auto __bc = __vec_concat_sized<_N1, _N2>(__b, __c);
    // ... then continue with one input less
    return __vec_concat_sized<_N0, _N1 + _N2, _Ns...>(__a, __bc, __rest...);
  }
/** @internal
 * Generic overload of __vec_concat_sized: join the first @p _N0 elements of @p __a with the
 * first @p _N1 elements of @p __b via a single shuffle, then recurse with the joined vector
 * and @p __rest (if any).
 */
template <int _N0, int _N1, int... _Ns, __vec_builtin _TV0, __vec_builtin _TV1,
          __vec_builtin... _TVs>
  [[__gnu__::__always_inline__]]
  constexpr __vec_builtin_type<__vec_value_type<_TV0>,
                               __bit_ceil(unsigned(_N0 + (_N1 + ... + _Ns)))>
  __vec_concat_sized(const _TV0& __a, const _TV1& __b, const _TVs&... __rest)
  {
    // the index pack is rounded up because the shuffle must produce a power-of-2 vector
    constexpr auto [...__idx] = _IotaArray<__bit_ceil(unsigned(_N0 + _N1)), int>;
    const auto __joined = __builtin_shufflevector(__a, __b, [](int __pos) consteval {
      if (__pos < _N0)
        return __pos; // element of __a
      if (__pos < _N0 + _N1)
        return __pos - _N0 + __width_of<_TV0>; // element of __b; _N0 <= __width_of<_TV0>
      return -1; // padding: cannot index into __rest, value is "don't care"
    }(__idx)...);
    if constexpr (sizeof...(__rest) != 0)
      return __vec_concat_sized<_N0 + _N1, _Ns...>(__joined, __rest...);
    else
      return __joined;
  }
/**
 * Return the lower half (elements [0, width / 2)) of vector builtin @p __v.
 */
template <__vec_builtin _TV>
  [[__gnu__::__always_inline__]]
  constexpr __half_vec_builtin_t<_TV>
  __vec_split_lo(_TV __v)
  {
    constexpr int __half = __width_of<_TV> / 2;
    constexpr auto [...__idx] = _IotaArray<__half>;
    return __builtin_shufflevector(__v, __v, __idx...);
  }
/**
 * Return the upper half (elements [width / 2, width)) of vector builtin @p __v.
 */
template <__vec_builtin _TV>
  [[__gnu__::__always_inline__]]
  constexpr __half_vec_builtin_t<_TV>
  __vec_split_hi(_TV __v)
  {
    constexpr int __half = __width_of<_TV> / 2;
    constexpr auto [...__idx] = _IotaArray<__half>;
    // same as the low half, with every index shifted by half the width
    return __builtin_shufflevector(__v, __v, (__idx + __half)...);
  }
/** @internal
 * Return @p __x zero-padded to @p _Bytes bytes.
 *
 * Use this function when you need two objects of the same size (e.g. for __vec_concat).
 *
 * @tparam _Bytes  Requested sizeof of the result; must not be smaller than sizeof(_TV).
 * @param  __x     The vector builtin to pad; its bytes end up at the start of the result.
 * @return @p __x itself if it already has the requested size, otherwise a vector builtin with
 *         the same value type and @p _Bytes bytes.
 */
template <size_t _Bytes, __vec_builtin _TV>
  [[__gnu__::__always_inline__]]
  constexpr auto
  __vec_zero_pad_to(_TV __x)
  {
    if constexpr (sizeof(_TV) == _Bytes)
      return __x;
    else if constexpr (sizeof(_TV) > _Bytes)
      // Reject shrinking up front. Otherwise small arguments (<= 8 bytes) would enter the
      // integer branch below and fail with an unrelated error about an ill-formed vector type.
      static_assert(false, "__vec_zero_pad_to: argument is larger than the requested size");
    else if constexpr (sizeof(_TV) <= sizeof(0ull))
      {
        // The whole input fits into one unsigned integer: make it element 0 of an integer
        // vector of the target size (remaining elements are zero) and reinterpret the result.
        using _Up = _UInt<sizeof(_TV)>;
        __vec_builtin_type_bytes<_Up, _Bytes> __tmp = {__builtin_bit_cast(_Up, __x)};
        return __builtin_bit_cast(__vec_builtin_type_bytes<__vec_value_type<_TV>, _Bytes>, __tmp);
      }
    else
      // Larger inputs: double the size by appending zeros until the target size is reached.
      return __vec_zero_pad_to<_Bytes>(__vec_concat(__x, _TV()));
  }
/** @internal
 * Zero-pad @p __x to a vector builtin with sizeof 16. The input must be smaller than 16 bytes
 * (checked via static_assert).
 *
 * Prefer this over __vec_zero_pad_to when padding an argument for a SIMD builtin.
 */
template <__vec_builtin _TV>
  [[__gnu__::__always_inline__]]
  constexpr auto
  __vec_zero_pad_to_16(_TV __x)
  {
    constexpr size_t __target = 16;
    static_assert(sizeof(_TV) < __target);
    return __vec_zero_pad_to<__target>(__x);
  }
// work around __builtin_constant_p returning false unless passed a variable
// (__builtin_constant_p(x[0]) is false while __is_const_known(x[0]) is true)
//
// Binding the argument to the reference parameter __x turns an arbitrary expression into a
// named object, on which __builtin_constant_p is then evaluated. The function is forced inline
// via the always_inline attribute.
template <typename _Tp>
[[__gnu__::__always_inline__]]
constexpr bool
__is_const_known(const _Tp& __x)
{
return __builtin_constant_p(__x);
}
[[__gnu__::__always_inline__]]
constexpr bool
__is_const_known(const auto&... __xs) requires(sizeof...(__xs) >= 2)
{
if consteval
{
return true;
}
else
{
return (__is_const_known(__xs) && ...);
}
}
// True only if the comparison __x == __expect is a known constant and that constant is true.
[[__gnu__::__always_inline__]]
constexpr bool
__is_const_known_equal_to(const auto& __x, const auto& __expect)
{
  if (__is_const_known(__x == __expect))
    return static_cast<bool>(__x == __expect);
  return false;
}
#if _GLIBCXX_X86
// Declaration only (x86 targets); no definition is visible in this header.
// Called by __vec_cast for conversions from/to _Float16 when F16C is available but
// AVX512-FP16 is not. Not constexpr, so it may only be reached at run time.
template <__vec_builtin _UV, __vec_builtin _TV>
inline _UV
__x86_cvt_f16c(_TV __v);
#endif
/** @internal
 * Simple wrapper around __builtin_convertvector to provide static_cast-like syntax.
 *
 * Works around GCC failing to use the F16C/AVX512F cvtps2ph/cvtph2ps instructions.
 *
 * @tparam _UV      Destination vector builtin; must have the same width as @p _TV.
 * @tparam _TV      Source vector builtin (deduced from the argument).
 * @tparam _Traits  Architecture traits, queried for F16C and AVX512-FP16 availability.
 * @param  __v      The vector to convert.
 * @return @p __v converted element-wise to the value type of @p _UV.
 */
template <__vec_builtin _UV, __vec_builtin _TV, _ArchTraits _Traits = {}>
[[__gnu__::__always_inline__]]
constexpr _UV
__vec_cast(_TV __v)
{
static_assert(__width_of<_UV> == __width_of<_TV>);
#if _GLIBCXX_X86
using _Up = __vec_value_type<_UV>;
using _Tp = __vec_value_type<_TV>;
constexpr bool __to_f16 = is_same_v<_Up, _Float16>;
constexpr bool __from_f16 = is_same_v<_Tp, _Float16>;
// the dedicated F16C path is wanted if F16C is available, AVX512-FP16 is not, and _Float16 is
// involved on either side of the conversion
constexpr bool __needs_f16c = _Traits._M_have_f16c() && !_Traits._M_have_avx512fp16()
&& (__to_f16 || __from_f16);
// Outer `if` is a run-time condition (known-constant inputs skip the non-constexpr helper);
// inner `if constexpr` discards the call entirely when the F16C path does not apply.
if (__needs_f16c && !__is_const_known(__v))
{ // Work around PR121688
if constexpr (__needs_f16c)
return __x86_cvt_f16c<_UV>(__v);
}
// Floating-point to an integer narrower than int, where the result vector is smaller than the
// source: convert in two steps via an integer of min(sizeof(int), sizeof(_Tp)) bytes.
// NOTE(review): the reason for the two-step conversion is not stated here -- confirm.
if constexpr (is_floating_point_v<_Tp> && is_integral_v<_Up>
&& sizeof(_UV) < sizeof(_TV) && sizeof(_Up) < sizeof(int))
{
using _Ip = __integer_from<std::min(sizeof(int), sizeof(_Tp))>;
using _IV = __vec_builtin_type<_Ip, __width_of<_TV>>;
return __vec_cast<_UV>(__vec_cast<_IV>(__v));
}
#endif
// default: let the compiler builtin do the element-wise conversion
return __builtin_convertvector(__v, _UV);
}
/** @internal
 * Convenience overload of __vec_cast: the destination is given as element type @p _Up only;
 * the destination vector type is the vector builtin of @p _Up with the `__width_of` the
 * argument type.
 *
 * Forwards to the overload taking the destination vector type.
 */
template <__vectorizable _Up, __vec_builtin _TV>
  [[__gnu__::__always_inline__]]
  constexpr __vec_builtin_type<_Up, __width_of<_TV>>
  __vec_cast(_TV __v)
  {
    using _UV = __vec_builtin_type<_Up, __width_of<_TV>>;
    return __vec_cast<_UV>(__v);
  }
/** @internal
 * Convert the mask vector @p __k to the vector builtin type @p _UV. Both value types must be
 * signed integers.
 *
 * Precondition: __k[i] is either 0 or -1 for all i.
 */
template <__vec_builtin _UV, __vec_builtin _TV>
  [[__gnu__::__always_inline__]]
  constexpr _UV
  __vec_mask_cast(_TV __k)
  {
    using _Up = __vec_value_type<_UV>;
    using _Tp = __vec_value_type<_TV>;
    static_assert(signed_integral<_Up>);
    static_assert(signed_integral<_Tp>);
    // TODO: __builtin_convertvector cannot be optimal because it doesn't consider input and
    // output can only be 0 or -1.
    return __builtin_convertvector(__k, _UV);
  }
/**
 * Bit-wise XOR of @p __a and @p __b. For floating-point value types the operation is applied
 * to the bit patterns (via an equally sized integer vector).
 */
template <__vec_builtin _TV>
  [[__gnu__::__always_inline__]]
  constexpr _TV
  __vec_xor(_TV __a, _TV __b)
  {
    using _Tp = __vec_value_type<_TV>;
    if constexpr (!is_floating_point_v<_Tp>)
      return __a ^ __b;
    else
      {
        using _IV = __vec_builtin_type<__integer_from<sizeof(_Tp)>, __width_of<_TV>>;
        const _IV __abits = __builtin_bit_cast(_IV, __a);
        const _IV __bbits = __builtin_bit_cast(_IV, __b);
        return __builtin_bit_cast(_TV, __abits ^ __bbits);
      }
  }
/**
 * Bit-wise OR of @p __a and @p __b. For floating-point value types the operation is applied
 * to the bit patterns (via an equally sized integer vector).
 */
template <__vec_builtin _TV>
  [[__gnu__::__always_inline__]]
  constexpr _TV
  __vec_or(_TV __a, _TV __b)
  {
    using _Tp = __vec_value_type<_TV>;
    if constexpr (!is_floating_point_v<_Tp>)
      return __a | __b;
    else
      {
        using _IV = __vec_builtin_type<__integer_from<sizeof(_Tp)>, __width_of<_TV>>;
        const _IV __abits = __builtin_bit_cast(_IV, __a);
        const _IV __bbits = __builtin_bit_cast(_IV, __b);
        return __builtin_bit_cast(_TV, __abits | __bbits);
      }
  }
/**
 * Bit-wise AND of @p __a and @p __b. For floating-point value types the operation is applied
 * to the bit patterns (via an equally sized integer vector).
 */
template <__vec_builtin _TV>
  [[__gnu__::__always_inline__]]
  constexpr _TV
  __vec_and(_TV __a, _TV __b)
  {
    using _Tp = __vec_value_type<_TV>;
    if constexpr (!is_floating_point_v<_Tp>)
      return __a & __b;
    else
      {
        using _IV = __vec_builtin_type<__integer_from<sizeof(_Tp)>, __width_of<_TV>>;
        const _IV __abits = __builtin_bit_cast(_IV, __a);
        const _IV __bbits = __builtin_bit_cast(_IV, __b);
        return __builtin_bit_cast(_TV, __abits & __bbits);
      }
  }
/** @internal
 * Compute (~__a) & __b bit-wise, i.e. the bit-wise and of not @p __a and @p __b.
 *
 * Both operands are always reinterpreted as integer vectors, regardless of the value type.
 *
 * Use __vec_and(__vec_not(__a), __b) unless an andnot instruction is necessary for optimization.
 *
 * @see __vec_andnot in simd_x86.h
 */
template <__vec_builtin _TV>
  [[__gnu__::__always_inline__]]
  constexpr _TV
  __vec_andnot(_TV __a, _TV __b)
  {
    using _Tp = __vec_value_type<_TV>;
    using _IV = __vec_builtin_type<__integer_from<sizeof(_Tp)>, __width_of<_TV>>;
    const _IV __abits = __builtin_bit_cast(_IV, __a);
    const _IV __bbits = __builtin_bit_cast(_IV, __b);
    return __builtin_bit_cast(_TV, ~__abits & __bbits);
  }
/**
 * Bit-wise complement of @p __a. For floating-point value types the complement is applied to
 * the bit pattern (via an equally sized integer vector).
 */
template <__vec_builtin _TV>
  [[__gnu__::__always_inline__]]
  constexpr _TV
  __vec_not(_TV __a)
  {
    using _Tp = __vec_value_type<_TV>;
    using _IV = __vec_builtin_type_bytes<__integer_from<sizeof(_Tp)>, sizeof(_TV)>;
    if constexpr (!is_floating_point_v<_Tp>)
      return ~__a;
    else
      return __builtin_bit_cast(_TV, ~__builtin_bit_cast(_IV, __a));
  }
/**
 * An object of given type where only the sign bits are 1.
 *
 * Computed as the bit-wise XOR of a vector of +1 values and a vector of -1 values; these two
 * bit patterns differ only in the sign bit of each element.
 */
template <__vec_builtin _V>
requires std::floating_point<__vec_value_type<_V>>
constexpr _V _S_signmask = __vec_xor(_V() + 1, _V() - 1);
/**
 * Primary template (declaration only) of the helper class for operations on the first @p _Np
 * elements of vector builtin @p _TV.
 *
 * @tparam _TV  The vector builtin type.
 * @tparam _Np  Number of elements to operate on; defaults to the full width of @p _TV.
 *
 * The unnamed third parameter defaults to the integer sequence 0, ..., _Np - 1; the partial
 * specialization below pattern-matches it to obtain an index pack.
 */
template <__vec_builtin _TV, int _Np = __width_of<_TV>,
typename = make_integer_sequence<int, _Np>>
struct _VecOps;
/**
 * Static helper operations on vector builtin @p _TV, applied to the indices in @p _Is.
 *
 * @tparam _TV  The vector builtin type.
 * @tparam _Np  Number of elements to operate on; must not exceed the width of @p _TV.
 * @tparam _Is  Index pack; 0, ..., _Np - 1 when instantiated via the primary template's
 *              default argument.
 */
template <__vec_builtin _TV, int _Np, int... _Is>
struct _VecOps<_TV, _Np, integer_sequence<int, _Is...>>
{
static_assert(_Np <= __width_of<_TV>);
// value type of _TV
using _Tp = __vec_value_type<_TV>;
// Vector builtin used for "half" arguments. For _Np >= 2 this has half the width of _TV;
// otherwise it is half of the doubled _TV, which has the same width as _TV.
using _HV = __half_vec_builtin_t<__conditional_t<_Np >= 2, _TV, __double_vec_builtin_t<_TV>>>;
// Return a vector with __init at every even index in _Is and _Tp() at every odd index.
[[__gnu__::__always_inline__]]
static constexpr _TV
_S_broadcast_to_even(_Tp __init)
{ return _TV {((_Is & 1) == 0 ? __init : _Tp())...}; }
// Return a vector with __init at every odd index in _Is and _Tp() at every even index.
[[__gnu__::__always_inline__]]
static constexpr _TV
_S_broadcast_to_odd(_Tp __init)
{ return _TV {((_Is & 1) == 1 ? __init : _Tp())...}; }
// true if __k[i] != 0 for all i in _Is
[[__gnu__::__always_inline__]]
static constexpr bool
_S_all_of(_TV __k) noexcept
{ return (... && (__k[_Is] != 0)); }
// true if __k[i] != 0 for at least one i in _Is
[[__gnu__::__always_inline__]]
static constexpr bool
_S_any_of(_TV __k) noexcept
{ return (... || (__k[_Is] != 0)); }
// true if __k[i] == 0 for all i in _Is
[[__gnu__::__always_inline__]]
static constexpr bool
_S_none_of(_TV __k) noexcept
{ return (... && (__k[_Is] == 0)); }
// Return the elements of __x at indices _Is + _Offset::value. __x may be any vector builtin
// with the same value type as _TV. The second shuffle operand is a value-initialized vector
// of the type of __x.
template <typename _Offset = integral_constant<int, 0>>
[[__gnu__::__always_inline__]]
static constexpr _TV
_S_extract(__vec_builtin auto __x, _Offset = {})
{
static_assert(is_same_v<__vec_value_type<_TV>, __vec_value_type<decltype(__x)>>);
return __builtin_shufflevector(__x, decltype(__x)(), (_Is + _Offset::value)...);
}
// swap neighboring elements
[[__gnu__::__always_inline__]]
static constexpr _TV
_S_swap_neighbors(_TV __x)
{ return __builtin_shufflevector(__x, __x, (_Is ^ 1)...); }
// duplicate even indexed elements, dropping the odd ones
[[__gnu__::__always_inline__]]
static constexpr _TV
_S_dup_even(_TV __x)
{ return __builtin_shufflevector(__x, __x, (_Is & ~1)...); }
// duplicate odd indexed elements, dropping the even ones
[[__gnu__::__always_inline__]]
static constexpr _TV
_S_dup_odd(_TV __x)
{ return __builtin_shufflevector(__x, __x, (_Is | 1)...); }
// Overwrite the even indexed elements of __x with consecutive elements of __y
// (__x[2 * i] = __y[i]); odd indexed elements are kept.
[[__gnu__::__always_inline__]]
static constexpr void
_S_overwrite_even_elements(_TV& __x, _HV __y) requires (_Np > 1)
{
constexpr __simd_size_type __n = __width_of<_TV>;
__x = __builtin_shufflevector(__x,
#ifdef _GLIBCXX_CLANG
// Clang gets __y concatenated with itself as second operand
// (presumably because it needs both shuffle operands to have equal width -- TODO confirm)
__vec_concat(__y, __y),
#else
__y,
#endif
// indices >= __n select from the second operand
((_Is & 1) == 0 ? __n + _Is / 2 : _Is)...);
}
// Overwrite the even indexed elements of the pair (__xl, __xh) with the elements of __y:
// __xl takes from the lower half of __y, __xh from the upper half (offset __nl / 2).
[[__gnu__::__always_inline__]]
static constexpr void
_S_overwrite_even_elements(_TV& __xl, _TV& __xh, _TV __y)
{
constexpr __simd_size_type __nl = __width_of<_TV>;
constexpr __simd_size_type __nh = __nl * 3 / 2;
__xl = __builtin_shufflevector(__xl, __y, ((_Is & 1) == 0 ? __nl + _Is / 2 : _Is)...);
__xh = __builtin_shufflevector(__xh, __y, ((_Is & 1) == 0 ? __nh + _Is / 2 : _Is)...);
}
// Overwrite the odd indexed elements of __x with consecutive elements of __y
// (__x[2 * i + 1] = __y[i]); even indexed elements are kept.
[[__gnu__::__always_inline__]]
static constexpr void
_S_overwrite_odd_elements(_TV& __x, _HV __y) requires (_Np > 1)
{
constexpr __simd_size_type __n = __width_of<_TV>;
__x = __builtin_shufflevector(__x,
#ifdef _GLIBCXX_CLANG
// Clang gets __y concatenated with itself as second operand
// (presumably because it needs both shuffle operands to have equal width -- TODO confirm)
__vec_concat(__y, __y),
#else
__y,
#endif
// indices >= __n select from the second operand
((_Is & 1) == 1 ? __n + _Is / 2 : _Is)...);
}
// Overwrite the odd indexed elements of the pair (__xl, __xh) with the elements of __y:
// __xl takes from the lower half of __y, __xh from the upper half (offset __nl / 2).
[[__gnu__::__always_inline__]]
static constexpr void
_S_overwrite_odd_elements(_TV& __xl, _TV& __xh, _TV __y)
{
constexpr __simd_size_type __nl = __width_of<_TV>;
constexpr __simd_size_type __nh = __nl * 3 / 2;
__xl = __builtin_shufflevector(__xl, __y, ((_Is & 1) == 1 ? __nl + _Is / 2 : _Is)...);
__xh = __builtin_shufflevector(__xh, __y, ((_Is & 1) == 1 ? __nh + _Is / 2 : _Is)...);
}
// true if all elements (at the indices in _Is) are known to be equal to __ref at compile time
[[__gnu__::__always_inline__]]
static constexpr bool
_S_is_const_known_equal_to(_TV __x, _Tp __ref)
{ return (__is_const_known_equal_to(__x[_Is], __ref) && ...); }
};
} // namespace simd
_GLIBCXX_END_NAMESPACE_VERSION
} // namespace std
#pragma GCC diagnostic pop
#endif // C++26
#endif // _GLIBCXX_VEC_OPS_H