// Implementation of <simd> -*- C++ -*-
// Copyright The GNU Toolchain Authors.
//
// This file is part of the GNU ISO C++ Library. This library is free
// software; you can redistribute it and/or modify it under the
// terms of the GNU General Public License as published by the
// Free Software Foundation; either version 3, or (at your option)
// any later version.
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// Under Section 7 of GPL version 3, you are granted additional
// permissions described in the GCC Runtime Library Exception, version
// 3.1, as published by the Free Software Foundation.
// You should have received a copy of the GNU General Public License and
// a copy of the GCC Runtime Library Exception along with this program;
// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
// <http://www.gnu.org/licenses/>.
#ifndef _GLIBCXX_SIMD_MASK_H
#define _GLIBCXX_SIMD_MASK_H 1
#ifdef _GLIBCXX_SYSHDR
#pragma GCC system_header
#endif
#if __cplusplus >= 202400L
#include "simd_iterator.h"
#include "vec_ops.h"
#if _GLIBCXX_X86
#include "simd_x86.h"
#endif
#include <bit>
#include <bitset>
// psabi warnings are bogus because the ABI of the internal types never leaks into user code
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wpsabi"
namespace std _GLIBCXX_VISIBILITY(default)
{
_GLIBCXX_BEGIN_NAMESPACE_VERSION
namespace simd
{
template <unsigned _Np>
struct _SwapNeighbors
{
consteval unsigned
operator()(unsigned __i, unsigned __size) const
{
if (__size % (2 * _Np) != 0)
__builtin_abort(); // swap_neighbors<N> permutation requires a multiple of 2N elements
else if (std::has_single_bit(_Np))
return __i ^ _Np;
else if (__i % (2 * _Np) >= _Np)
return __i - _Np;
else
return __i + _Np;
}
};
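// Illustrative (a sketch of the mapping, derived from the definition above): _SwapNeighbors<_Np>
// swaps each block of _Np adjacent elements with its neighboring block, e.g. _Np == 1 maps
// 0<->1, 2<->3, ... and _Np == 2 maps {0,1}<->{2,3}, {4,5}<->{6,7}, ...
static_assert(_SwapNeighbors<1>()(2, 8) == 3);
static_assert(_SwapNeighbors<2>()(1, 8) == 3);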
template <size_t _Np, size_t _Mp>
constexpr auto
__bitset_split(const bitset<_Mp>& __b)
{
constexpr auto __bits_per_word = __CHAR_BIT__ * __SIZEOF_LONG__;
if constexpr (_Np % __bits_per_word == 0)
{
struct _Tmp
{
bitset<_Np> _M_lo;
bitset<_Mp - _Np> _M_hi;
};
return __builtin_bit_cast(_Tmp, __b);
}
else
{
constexpr auto __bits_per_ullong = __CHAR_BIT__ * __SIZEOF_LONG_LONG__;
static_assert(_Mp <= __bits_per_ullong);
using _Lo = _Bitmask<_Np>;
using _Hi = _Bitmask<_Mp - _Np>;
struct _Tmp
{
_Lo _M_lo;
_Hi _M_hi;
};
return _Tmp {static_cast<_Lo>(__b.to_ullong()), static_cast<_Hi>(__b.to_ullong() >> _Np)};
}
}
static_assert(__bitset_split<64>(bitset<128>(1))._M_lo == bitset<64>(1));
static_assert(__bitset_split<64>(bitset<128>(1))._M_hi == bitset<64>(0));
// [simd.traits]
// --- rebind ---
template <typename _Tp, typename _Vp, _ArchTraits _Traits = {}>
struct rebind
{};
/**
* Computes a member @c type `basic_vec<_Tp, Abi>`, where @c Abi is chosen such that the
* number of elements is equal to `_Vp::size()` and features of the ABI tag (such as the
* internal representation of masks, or storage order of complex components) are preserved.
*/
template <__vectorizable _Tp, __simd_vec_type _Vp, _ArchTraits _Traits>
//requires requires { typename __deduce_abi_t<_Tp, _Vp::size()>; }
struct rebind<_Tp, _Vp, _Traits>
{ using type = __similar_vec<_Tp, _Vp::size(), typename _Vp::abi_type>; };
/**
* As above, except for @c basic_mask.
*/
template <__vectorizable _Tp, __simd_mask_type _Mp, _ArchTraits _Traits>
//requires requires { typename __deduce_abi_t<_Tp, _Mp::size()>; }
struct rebind<_Tp, _Mp, _Traits>
{ using type = __similar_mask<_Tp, _Mp::size(), typename _Mp::abi_type>; };
template <typename _Tp, typename _Vp>
using rebind_t = typename rebind<_Tp, _Vp>::type;
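// For illustration: rebind_t<float, vec<int, 8>> is an 8-element vec of float whose ABI tag
// preserves the features of vec<int, 8>'s ABI; rebind_t<float, mask<int, 8>> is the
// corresponding 8-element mask type.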
// --- resize ---
template <__simd_size_type _Np, typename _Vp, _ArchTraits _Traits = {}>
struct resize
{};
template <__simd_size_type _Np, __simd_vec_type _Vp, _ArchTraits _Traits>
requires (_Np >= 1)
//requires requires { typename __deduce_abi_t<typename _Vp::value_type, _Np>; }
struct resize<_Np, _Vp, _Traits>
{ using type = __similar_vec<typename _Vp::value_type, _Np, typename _Vp::abi_type>; };
template <__simd_size_type _Np, __simd_mask_type _Mp, _ArchTraits _Traits>
requires (_Np >= 1)
//requires requires { typename __deduce_abi_t<typename _Mp::value_type, _Np>; }
struct resize<_Np, _Mp, _Traits>
{
using _A1 = decltype(__abi_rebind<__mask_element_size<_Mp>, _Np, typename _Mp::abi_type,
true>());
static_assert(__abi_tag<_A1>);
static_assert(_Mp::abi_type::_S_variant == _A1::_S_variant || __scalar_abi_tag<_A1>
|| __scalar_abi_tag<typename _Mp::abi_type>);
using type = basic_mask<__mask_element_size<_Mp>, _A1>;
};
template <__simd_size_type _Np, typename _Vp>
using resize_t = typename resize<_Np, _Vp>::type;
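// For illustration: resize_t<8, vec<float, 4>> is an 8-element float vec and
// resize_t<2, mask<float, 4>> is a 2-element mask, each keeping the ABI flavor of the input.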
// [simd.syn]
inline constexpr __simd_size_type zero_element = numeric_limits<int>::min();
inline constexpr __simd_size_type uninit_element = zero_element + 1;
// [simd.permute.static]
template<__simd_size_type _Np = 0, __simd_vec_or_mask_type _Vp,
__index_permutation_function<_Vp> _IdxMap>
[[__gnu__::__always_inline__]]
constexpr resize_t<_Np == 0 ? _Vp::size() : _Np, _Vp>
permute(const _Vp& __v, _IdxMap&& __idxmap)
{ return resize_t<_Np == 0 ? _Vp::size() : _Np, _Vp>::_S_static_permute(__v, __idxmap); }
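// Illustrative use (a sketch; the accepted index-map signatures are those admitted by
// __index_permutation_function): broadcasting element 0 of __v via
//   auto __b = permute(__v, [](int) { return 0; });
// _SwapNeighbors above is an index map of the same kind, used with _S_static_permute below.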
// [simd.permute.dynamic]
template<__simd_vec_or_mask_type _Vp, __simd_integral _Ip>
[[__gnu__::__always_inline__]]
constexpr resize_t<_Ip::size(), _Vp>
permute(const _Vp& __v, const _Ip& __indices)
{ return __v[__indices]; }
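// Illustrative: the dynamic overload gathers __v[__indices[0]], __v[__indices[1]], ..., so the
// result has as many elements as the index vector.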
// [simd.creation] ----------------------------------------------------------
template<__simd_vec_type _Vp, typename _Ap>
[[__gnu__::__always_inline__]]
constexpr auto
chunk(const basic_vec<typename _Vp::value_type, _Ap>& __x) noexcept
{ return __x.template _M_chunk<_Vp>(); }
template<__simd_mask_type _Mp, typename _Ap>
[[__gnu__::__always_inline__]]
constexpr auto
chunk(const basic_mask<__mask_element_size<_Mp>, _Ap>& __x) noexcept
{ return __x.template _M_chunk<_Mp>(); }
template<__simd_size_type _Np, typename _Tp, typename _Ap>
[[__gnu__::__always_inline__]]
constexpr auto
chunk(const basic_vec<_Tp, _Ap>& __x) noexcept
-> decltype(chunk<resize_t<_Np, basic_vec<_Tp, _Ap>>>(__x))
{ return chunk<resize_t<_Np, basic_vec<_Tp, _Ap>>>(__x); }
template<__simd_size_type _Np, size_t _Bytes, typename _Ap>
[[__gnu__::__always_inline__]]
constexpr auto
chunk(const basic_mask<_Bytes, _Ap>& __x) noexcept
-> decltype(chunk<resize_t<_Np, basic_mask<_Bytes, _Ap>>>(__x))
{ return chunk<resize_t<_Np, basic_mask<_Bytes, _Ap>>>(__x); }
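// Illustrative: chunk<4>() of a vec<float, 12> yields array<vec<float, 4>, 3>; if the size is
// not a multiple of the chunk size, the remainder becomes the last element of a tuple instead
// (see _M_chunk below).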
// LWG???? (reported 2025-11-25)
template<typename _Tp, typename _A0, typename... _Abis>
constexpr resize_t<(_A0::_S_size + ... + _Abis::_S_size), basic_vec<_Tp, _A0>>
cat(const basic_vec<_Tp, _A0>& __x0, const basic_vec<_Tp, _Abis>&... __xs) noexcept
{
return resize_t<(_A0::_S_size + ... + _Abis::_S_size), basic_vec<_Tp, _A0>>
::_S_concat(__x0, __xs...);
}
// LWG???? (reported 2025-11-25)
template<size_t _Bytes, typename _A0, typename... _Abis>
constexpr resize_t<(_A0::_S_size + ... + _Abis::_S_size), basic_mask<_Bytes, _A0>>
cat(const basic_mask<_Bytes, _A0>& __x0, const basic_mask<_Bytes, _Abis>&... __xs) noexcept
{
return resize_t<(_A0::_S_size + ... + _Abis::_S_size), basic_mask<_Bytes, _A0>>
::_S_concat(__x0, __xs...);
}
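// Illustrative: cat() of a vec<float, 4> __a and a vec<float, 8> __b is a vec<float, 12>
// holding the elements of __a followed by the elements of __b; mask concatenation is analogous.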
// implementation helper for chunk and cat
consteval int
__packs_to_skip_at_front(int __offset, initializer_list<int> __sizes)
{
int __i = 0;
int __n = 0;
for (int __s : __sizes)
{
__n += __s;
if (__n > __offset)
return __i;
++__i;
}
__builtin_trap(); // called out of contract
}
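// e.g. __packs_to_skip_at_front(10, {4, 4, 4}) == 2: the first two packs (8 elements) lie
// entirely before offset 10 and can be dropped.
static_assert(__packs_to_skip_at_front(10, {4, 4, 4}) == 2);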
consteval int
__packs_to_skip_at_back(int __offset, int __max, initializer_list<int> __sizes)
{
int __i = 0;
int __n = -__offset;
for (int __s : __sizes)
{
++__i;
__n += __s;
if (__n >= __max)
return int(__sizes.size()) - __i;
}
return 0;
}
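// e.g. __packs_to_skip_at_back(0, 4, {4, 4, 4}) == 2: the first pack already covers 4
// destination elements starting at offset 0, so the last two packs can be dropped.
static_assert(__packs_to_skip_at_back(0, 4, {4, 4, 4}) == 2);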
// in principle, this overload allows conversions to _Dst - and it wouldn't be wrong - but the
// general overload below is still a better candidate in overload resolution
template <typename _Dst>
[[__gnu__::__always_inline__]]
constexpr _Dst
__extract_simd_at(auto _Offset, const _Dst& __r, const auto&...)
requires(_Offset.value == 0)
{ return __r; }
template <typename _Dst, typename _V0>
[[__gnu__::__always_inline__]]
constexpr _Dst
__extract_simd_at(auto _Offset, const _V0&, const _Dst& __r, const auto&...)
requires(_Offset.value == _V0::size.value)
{ return __r; }
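// The general overload below extracts a window of _Dst::size() elements starting at _Offset
// from the (conceptual) concatenation of __xs. Illustrative: with __a and __b of type
// vec<int, 4>, __extract_simd_at<vec<int, 4>>(cw<2>, __a, __b) holds __a[2], __a[3], __b[0],
// __b[1].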
template <typename _Dst, typename... _Vs>
[[__gnu__::__always_inline__]]
constexpr _Dst
__extract_simd_at(auto _Offset, const _Vs&... __xs)
{
using _Adst = typename _Dst::abi_type;
if constexpr (_Adst::_S_nreg >= 2)
{
using _Dst0 = remove_cvref_t<decltype(declval<_Dst>()._M_get_low())>;
using _Dst1 = remove_cvref_t<decltype(declval<_Dst>()._M_get_high())>;
return _Dst::_S_init(__extract_simd_at<_Dst0>(_Offset, __xs...),
__extract_simd_at<_Dst1>(_Offset + _Dst0::size, __xs...));
}
else
{
using _Ret = remove_cvref_t<decltype(declval<_Dst>()._M_get())>;
constexpr bool __use_bitmask = __simd_mask_type<_Dst> && _Adst::_S_is_bitmask;
constexpr int __dst_full_size = __bit_ceil(unsigned(_Adst::_S_size));
constexpr int __nargs = sizeof...(__xs);
using _Afirst = typename _Vs...[0]::abi_type;
using _Alast = typename _Vs...[__nargs - 1]::abi_type;
const auto& __x0 = __xs...[0];
const auto& __xlast = __xs...[__nargs - 1];
constexpr int __ninputs = (_Vs::size.value + ...);
if constexpr (_Offset.value >= _Afirst::_S_size
|| __ninputs - _Offset.value - _Alast::_S_size >= _Adst::_S_size)
{ // can drop inputs at the front and/or back of the pack
constexpr int __skip_front = __packs_to_skip_at_front(_Offset.value,
{_Vs::size.value...});
constexpr int __skip_back = __packs_to_skip_at_back(_Offset.value, _Adst::_S_size,
{_Vs::size.value...});
static_assert(__skip_front > 0 || __skip_back > 0);
constexpr auto [...__skip] = _IotaArray<__skip_front>;
constexpr auto [...__is] = _IotaArray<__nargs - __skip_front - __skip_back>;
constexpr int __new_offset = _Offset.value - (0 + ... + _Vs...[__skip]::size.value);
return __extract_simd_at<_Dst>(cw<__new_offset>, __xs...[__is + __skip_front]...);
}
else if constexpr (_Adst::_S_size == 1)
{ // trivial conversion to one value_type
return _Dst(__x0[_Offset.value]);
}
else if constexpr (_Afirst::_S_nreg >= 2 || _Alast::_S_nreg >= 2)
{ // flatten first and/or last multi-register argument
constexpr bool __flatten_first = _Afirst::_S_nreg >= 2;
constexpr bool __flatten_last = __nargs > 1 && _Alast::_S_nreg >= 2;
constexpr auto [...__is] = _IotaArray<__nargs - __flatten_first - __flatten_last>;
if constexpr (__flatten_first && __flatten_last)
return __extract_simd_at<_Dst>(
_Offset, __x0._M_get_low(), __x0._M_get_high(), __xs...[__is + 1]...,
__xlast._M_get_low(), __xlast._M_get_high());
else if constexpr (__flatten_first)
return __extract_simd_at<_Dst>(
_Offset, __x0._M_get_low(), __x0._M_get_high(), __xs...[__is + 1]...);
else
return __extract_simd_at<_Dst>(
_Offset, __xs...[__is]..., __xlast._M_get_low(), __xlast._M_get_high());
}
else if constexpr (__simd_mask_type<_Dst>
&& ((_Adst::_S_variant != _Vs::abi_type::_S_variant
&& !__scalar_abi_tag<typename _Vs::abi_type>) || ...))
{ // convert ABI tag if incompatible
return __extract_simd_at<_Dst>(
_Offset, static_cast<const resize_t<_Vs::size.value, _Dst>&>(__xs)...);
}
// at this point __xs should be as small as possible; there may be some corner cases left
else if constexpr (__nargs == 1)
{ // simple and optimal
if constexpr (__use_bitmask)
return _Dst(_Ret(__x0._M_to_uint() >> _Offset.value));
else
return _VecOps<_Ret>::_S_extract(__x0._M_concat_data(false), _Offset);
}
else if constexpr (__use_bitmask)
{ // fairly simple and optimal bit shifting solution
static_assert(_Afirst::_S_nreg == 1);
static_assert(_Offset.value < _Afirst::_S_size);
int __offset = -_Offset.value;
_Ret __r;
template for (const auto& __x : {__xs...})
{
if (__offset <= 0)
__r = _Ret(__x._M_to_uint() >> -__offset);
else if (__offset < _Adst::_S_size)
__r |= _Ret(_Ret(__x._M_to_uint()) << __offset);
__offset += __x.size.value;
}
return _Dst(__r);
}
else if constexpr (__nargs == 2 && _Offset == 0 && _Adst::_S_nreg == 1
&& _Afirst::_S_size >= _Alast::_S_size
&& __has_single_bit(unsigned(_Afirst::_S_size)))
{ // simple __vec_concat
if constexpr (_Afirst::_S_size == 1)
// even simpler init from two values
return _Ret{__x0._M_concat_data()[0], __xlast._M_concat_data()[0]};
else
{
const auto __v0 = __x0._M_concat_data();
const auto __v1 = __vec_zero_pad_to<sizeof(__v0)>(__xlast._M_concat_data());
return __vec_concat(__v0, __v1);
}
}
else if constexpr (__nargs == 2 && _Adst::_S_nreg == 1 && _Offset == 0
&& _Afirst::_S_nreg == 1 && _Alast::_S_size == 1)
{ // optimize insertion of one element at the end
_Ret __r = __vec_zero_pad_to<sizeof(_Ret)>(__x0._M_get());
__vec_set(__r, _Afirst::_S_size, __xlast._M_concat_data()[0]);
return __r;
}
else if constexpr (__nargs == 2 && _Adst::_S_nreg == 1 && _Offset == 0
&& _Afirst::_S_nreg == 1 && _Alast::_S_size == 2)
{ // optimize insertion of two elements at the end
_Ret __r = __vec_zero_pad_to<sizeof(_Ret)>(__x0._M_concat_data());
const auto __x1 = __xlast._M_concat_data();
if constexpr (sizeof(__x1) <= sizeof(double) && (_Afirst::_S_size & 1) == 0)
{ // can use a single insert instruction
using _Up = __conditional_t<
is_floating_point_v<__vec_value_type<_Ret>>,
__conditional_t<sizeof(__x1) == sizeof(double), double, float>,
__integer_from<sizeof(__x1)>>;
auto __r2 = __vec_bit_cast<_Up>(__r);
__vec_set(__r2, _Afirst::_S_size / 2, __vec_bit_cast<_Up>(__x1)[0]);
__r = reinterpret_cast<_Ret>(__r2);
}
else
{
__vec_set(__r, _Afirst::_S_size, __x1[0]);
__vec_set(__r, _Afirst::_S_size + 1, __x1[1]);
}
return __r;
}
else if constexpr (__nargs == 2 && _Afirst::_S_nreg == 1 && _Alast::_S_nreg == 1)
{ // optimize concat of two input vectors (e.g. using palignr)
constexpr auto [...__is] = _IotaArray<__dst_full_size>;
constexpr int __v2_offset = __width_of<decltype(__x0._M_concat_data())>;
return __builtin_shufflevector(
__x0._M_concat_data(), __xlast._M_concat_data(), [](int __i) consteval {
if (__i < _Afirst::_S_size)
return __i;
__i -= _Afirst::_S_size;
if (__i < _Alast::_S_size)
return __i + __v2_offset;
else
return -1;
}(__is + _Offset.value)...);
}
else if (__is_const_known(__xs...) || __ninputs == _Adst::_S_size)
{ // hard to optimize for the compiler, but necessary in constant expressions
return _VecOps<_Ret>::_S_extract(
__vec_concat_sized<__xs.size.value...>(__xs._M_concat_data(false)...),
_Offset);
}
else
{ // fallback to concatenation in memory => load the result
alignas(_Ret) __vec_value_type<_Ret>
__tmp[std::max(__ninputs, _Offset.value + __dst_full_size)] = {};
int __offset = 0;
template for (const auto& __x : {__xs...})
{
if constexpr (__simd_mask_type<_Dst>)
(-__x)._M_store(__tmp + __offset);
else
__x._M_store(__tmp + __offset);
__offset += __x.size.value;
}
_Ret __r;
__builtin_memcpy(&__r, __tmp + _Offset.value, sizeof(_Ret));
return __r;
}
}
}
// [simd.mask] --------------------------------------------------------------
template <size_t _Bytes, typename _Ap>
class basic_mask
{
public:
using value_type = bool;
using abi_type = _Ap;
#define _GLIBCXX_DELETE_SIMD "This specialization is disabled because of an invalid combination " \
"of template arguments to basic_mask."
basic_mask() = delete(_GLIBCXX_DELETE_SIMD);
~basic_mask() = delete(_GLIBCXX_DELETE_SIMD);
basic_mask(const basic_mask&) = delete(_GLIBCXX_DELETE_SIMD);
basic_mask& operator=(const basic_mask&) = delete(_GLIBCXX_DELETE_SIMD);
#undef _GLIBCXX_DELETE_SIMD
};
template <size_t _Bytes, typename _Ap>
class _MaskBase
{
using _Mp = basic_mask<_Bytes, _Ap>;
protected:
using _VecType = __simd_vec_from_mask_t<_Bytes, _Ap>;
static_assert(destructible<_VecType> || _Bytes > sizeof(0ull));
public:
using iterator = __iterator<_Mp>;
using const_iterator = __iterator<const _Mp>;
constexpr iterator
begin() noexcept
{ return {static_cast<_Mp&>(*this), 0}; }
constexpr const_iterator
begin() const noexcept
{ return cbegin(); }
constexpr const_iterator
cbegin() const noexcept
{ return {static_cast<const _Mp&>(*this), 0}; }
constexpr default_sentinel_t
end() const noexcept
{ return {}; }
constexpr default_sentinel_t
cend() const noexcept
{ return {}; }
static constexpr auto size = __simd_size_c<_Ap::_S_size>;
_MaskBase() = default;
// LWG issue from 2026-03-04 / P4042R0
template <size_t _UBytes, typename _UAbi>
requires (_Ap::_S_size != _UAbi::_S_size)
explicit
_MaskBase(const basic_mask<_UBytes, _UAbi>&) = delete("size mismatch");
template <typename _Up, typename _UAbi>
explicit
_MaskBase(const basic_vec<_Up, _UAbi>&)
= delete("use operator! or a comparison to convert a vec into a mask");
template <typename _Up, typename _UAbi>
requires (_Ap::_S_size != _UAbi::_S_size)
operator basic_vec<_Up, _UAbi>() const
= delete("size mismatch");
};
template <size_t _Bytes, __abi_tag _Ap>
requires (_Ap::_S_nreg == 1)
class basic_mask<_Bytes, _Ap>
: public _MaskBase<_Bytes, _Ap>
{
using _Base = _MaskBase<_Bytes, _Ap>;
using _VecType = _Base::_VecType;
template <size_t, typename>
friend class basic_mask;
template <typename, typename>
friend class basic_vec;
static constexpr int _S_size = _Ap::_S_size;
using _DataType = typename _Ap::template _MaskDataType<_Bytes>;
static constexpr bool _S_has_bool_member = is_same_v<_DataType, bool>;
static constexpr bool _S_is_scalar = _S_has_bool_member;
static constexpr bool _S_use_bitmask = _Ap::_S_is_bitmask;
static constexpr int _S_full_size = [] {
if constexpr (_S_is_scalar)
return _S_size;
else if constexpr (_S_use_bitmask && _S_size < __CHAR_BIT__)
return __CHAR_BIT__;
else
return __bit_ceil(unsigned(_S_size));
}();
static constexpr bool _S_is_partial = _S_size != _S_full_size;
static constexpr _DataType _S_implicit_mask = [] {
if constexpr (_S_is_scalar)
return true;
else if (!_S_is_partial)
return _DataType(~_DataType());
else if constexpr (_S_use_bitmask)
return _DataType((_DataType(1) << _S_size) - 1);
else
{
constexpr auto [...__is] = _IotaArray<_S_full_size>;
return _DataType{ (__is < _S_size ? -1 : 0)... };
}
}();
// Actual padding bytes, not padding elements.
// => _S_padding_bytes is 0 even if _S_is_partial is true.
static constexpr size_t _S_padding_bytes = 0;
_DataType _M_data;
public:
using value_type = bool;
using abi_type = _Ap;
using iterator = _Base::iterator;
using const_iterator = _Base::const_iterator;
// internal but public API ----------------------------------------------
[[__gnu__::__always_inline__]]
static constexpr basic_mask
_S_init(_DataType __x)
{
basic_mask __r;
__r._M_data = __x;
return __r;
}
[[__gnu__::__always_inline__]]
static constexpr basic_mask
_S_init(unsigned_integral auto __bits)
{ return basic_mask(__bits); }
[[__gnu__::__always_inline__]]
constexpr const _DataType&
_M_get() const
{ return _M_data; }
/** @internal
* Bit-cast the given object @p __x to basic_mask.
*
* This is necessary for _S_nreg > 1 where the last element can be bool or when the sizeof
* doesn't match because of different alignment requirements of the sub-masks.
*/
template <size_t _UBytes, typename _UAbi>
[[__gnu__::__always_inline__]]
static constexpr basic_mask
_S_recursive_bit_cast(const basic_mask<_UBytes, _UAbi>& __x)
{ return __builtin_bit_cast(basic_mask, __x._M_concat_data()); }
[[__gnu__::__always_inline__]]
constexpr auto
_M_concat_data(bool __do_sanitize = _S_is_partial) const
{
if constexpr (_S_is_scalar)
return __vec_builtin_type<__integer_from<_Bytes>, 1>{__integer_from<_Bytes>(-_M_data)};
else
{
if constexpr (_S_is_partial)
if (__do_sanitize)
return _DataType(_M_data & _S_implicit_mask);
return _M_data;
}
}
/** @internal
* Returns a mask where the first @p __n elements are true. All remaining elements are false.
*
* @pre @p __n > 0 && @p __n < _S_size
*/
template <_ArchTraits _Traits = {}>
[[__gnu__::__always_inline__]]
static constexpr basic_mask
_S_partial_mask_of_n(int __n)
{
static_assert(!_S_is_scalar);
if constexpr (!_S_use_bitmask)
{
using _Ip = __integer_from<_Bytes>;
__glibcxx_simd_precondition(__n >= 0 && __n <= numeric_limits<_Ip>::max(),
"_S_partial_mask_of_n without _S_use_bitmask requires "
"positive __n that does not overflow.");
constexpr _DataType __0123
= __builtin_bit_cast(_DataType, _IotaArray<_Ip(_S_full_size)>);
return basic_mask(__0123 < _Ip(__n));
}
else
{
__glibcxx_simd_precondition(__n >= 0 && __n <= 255,
"The x86 BZHI instruction requires __n to "
"only use bits 0:7");
#if __has_builtin(__builtin_ia32_bzhi_si)
if constexpr (_S_size <= 32 && _Traits._M_have_bmi2())
return _S_init(_Bitmask<_S_size>(
__builtin_ia32_bzhi_si(~0u >> (32 - _S_size), unsigned(__n))));
#endif
#if __has_builtin(__builtin_ia32_bzhi_di)
else if constexpr (_S_size <= 64 && _Traits._M_have_bmi2())
return _S_init(__builtin_ia32_bzhi_di(~0ull >> (64 - _S_size), unsigned(__n)));
#endif
if constexpr (_S_size <= 32)
{
__glibcxx_simd_precondition(__n < 32, "invalid shift");
return _S_init(_Bitmask<_S_size>((1u << unsigned(__n)) - 1));
}
else if constexpr (_S_size <= 64)
{
__glibcxx_simd_precondition(__n < 64, "invalid shift");
return _S_init((1ull << unsigned(__n)) - 1);
}
else
static_assert(false);
}
}
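// Illustrative: on an 8-element mask, _S_partial_mask_of_n(3) yields
// [T, T, T, F, F, F, F, F] (bit-mask representation 0b0000'0111).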
[[__gnu__::__always_inline__]]
constexpr basic_mask&
_M_and_neighbors()
{
if constexpr (_S_use_bitmask)
_M_data &= ((_M_data >> 1) & 0x5555'5555'5555'5555ull)
| ((_M_data << 1) & ~0x5555'5555'5555'5555ull);
else
_M_data &= _VecOps<_DataType>::_S_swap_neighbors(_M_data);
return *this;
}
[[__gnu__::__always_inline__]]
constexpr basic_mask&
_M_or_neighbors()
{
if constexpr (_S_use_bitmask)
_M_data |= ((_M_data >> 1) & 0x5555'5555'5555'5555ull)
| ((_M_data << 1) & ~0x5555'5555'5555'5555ull);
else
_M_data |= _VecOps<_DataType>::_S_swap_neighbors(_M_data);
return *this;
}
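// Illustrative: _M_and_neighbors() / _M_or_neighbors() combine each even/odd pair in place,
// e.g. [a, b, c, d] becomes [a&b, a&b, c&d, c&d] respectively [a|b, a|b, c|d, c|d].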
template <typename _Mp>
[[__gnu__::__always_inline__]]
constexpr auto _M_chunk() const noexcept
{
constexpr int __n = _S_size / _Mp::_S_size;
constexpr int __rem = _S_size % _Mp::_S_size;
constexpr auto [...__is] = _IotaArray<__n>;
if constexpr (__rem == 0)
return array<_Mp, __n>{__extract_simd_at<_Mp>(cw<_Mp::_S_size * __is>, *this)...};
else
{
using _Rest = resize_t<__rem, _Mp>;
return tuple(__extract_simd_at<_Mp>(cw<_Mp::_S_size * __is>, *this)...,
__extract_simd_at<_Rest>(cw<_Mp::_S_size * __n>, *this));
}
}
[[__gnu__::__always_inline__]]
static constexpr const basic_mask&
_S_concat(const basic_mask& __x0) noexcept
{ return __x0; }
template <typename... _As>
requires (sizeof...(_As) > 1)
[[__gnu__::__always_inline__]]
static constexpr basic_mask
_S_concat(const basic_mask<_Bytes, _As>&... __xs) noexcept
{
static_assert(_S_size == (_As::_S_size + ...));
return __extract_simd_at<basic_mask>(cw<0>, __xs...);
}
// [simd.mask.overview] default constructor -----------------------------
basic_mask() = default;
// [simd.mask.overview] conversion extensions ---------------------------
[[__gnu__::__always_inline__]]
constexpr
basic_mask(_DataType __x) requires(!_S_is_scalar && !_S_use_bitmask)
: _M_data(__x)
{}
[[__gnu__::__always_inline__]]
constexpr
operator _DataType() requires(!_S_is_scalar && !_S_use_bitmask)
{ return _M_data; }
// [simd.mask.ctor] broadcast constructor -------------------------------
[[__gnu__::__always_inline__]]
constexpr explicit
basic_mask(same_as<bool> auto __x) noexcept // LWG 4382.
: _M_data(__x ? _S_implicit_mask : _DataType())
{}
// [simd.mask.ctor] conversion constructor ------------------------------
template <size_t _UBytes, typename _UAbi>
requires (_S_size == _UAbi::_S_size)
[[__gnu__::__always_inline__]]
constexpr explicit(__is_mask_conversion_explicit<_Ap, _UAbi>(_Bytes, _UBytes))
basic_mask(const basic_mask<_UBytes, _UAbi>& __x) noexcept
: _M_data([&] [[__gnu__::__always_inline__]] {
using _UV = basic_mask<_UBytes, _UAbi>;
// bool to bool
if constexpr (_S_is_scalar)
return __x[0];
// converting from an "array of bool"
else if constexpr (_UV::_S_is_scalar)
{
constexpr auto [...__is] = _IotaArray<_S_size>;
if constexpr (_S_use_bitmask)
return ((_DataType(__x[__is]) << __is) | ...);
else
return _DataType{__vec_value_type<_DataType>(-__x[__is])...};
}
// vec-/bit-mask to bit-mask | bit-mask to vec-mask
else if constexpr (_S_use_bitmask || _UV::_S_use_bitmask)
return basic_mask(__x.to_bitset())._M_data;
// vec-mask to vec-mask
else if constexpr (_Bytes == _UBytes)
return _S_recursive_bit_cast(__x)._M_data;
else
{
#if _GLIBCXX_X86
// TODO: turn this into a __vec_mask_cast overload in simd_x86.h
if constexpr (_Bytes == 1 && _UBytes == 2)
if (!__is_const_known(__x))
{
if constexpr (_UAbi::_S_nreg == 1)
return __x86_cvt_vecmask<_DataType>(__x._M_data);
else if constexpr (_UAbi::_S_nreg == 2)
{
auto __lo = __x._M_data0._M_data;
auto __hi = __vec_zero_pad_to<sizeof(__lo)>(
__x._M_data1._M_concat_data());
return __x86_cvt_vecmask<_DataType>(__lo, __hi);
}
}
#endif
return __vec_mask_cast<_DataType>(__x._M_concat_data());
}
}())
{}
using _Base::_MaskBase;
// [simd.mask.ctor] generator constructor -------------------------------
template <__simd_generator_invokable<bool, _S_size> _Fp>
[[__gnu__::__always_inline__]]
constexpr explicit
basic_mask(_Fp&& __gen)
: _M_data([&] [[__gnu__::__always_inline__]] {
constexpr auto [...__is] = _IotaArray<_S_size>;
if constexpr (_S_is_scalar)
return __gen(__simd_size_c<0>);
else if constexpr (_S_use_bitmask)
return _DataType(((_DataType(__gen(__simd_size_c<__is>)) << __is)
| ...));
else
return _DataType{__vec_value_type<_DataType>(
__gen(__simd_size_c<__is>) ? -1 : 0)...};
}())
{}
// [simd.mask.ctor] bitset constructor ----------------------------------
[[__gnu__::__always_inline__]]
constexpr
basic_mask(const same_as<bitset<_S_size>> auto& __b) noexcept // LWG 4382.
: basic_mask(static_cast<_Bitmask<_S_size>>(__b.to_ullong()))
{
// more than 64 elements in one register? not yet.
static_assert(_S_size <= numeric_limits<unsigned long long>::digits);
}
// [simd.mask.ctor] uint constructor ------------------------------------
template <unsigned_integral _Tp>
requires (!same_as<_Tp, bool>) // LWG 4382.
[[__gnu__::__always_inline__]]
constexpr explicit
basic_mask(_Tp __val) noexcept
: _M_data([&] [[__gnu__::__always_inline__]] () {
if constexpr (_S_use_bitmask)
return __val;
else if constexpr (_S_is_scalar)
return bool(__val & 1);
else if (__is_const_known(__val))
{
constexpr auto [...__is] = _IotaArray<_S_size>;
return _DataType {__vec_value_type<_DataType>((__val & (1ull << __is)) == 0
? 0 : -1)...};
}
else
{
using _Ip = typename _VecType::value_type;
_VecType __v0 = _Ip(__val);
constexpr int __bits_per_element = sizeof(_Ip) * __CHAR_BIT__;
constexpr _VecType __pow2 = _VecType(1) << (__iota<_VecType> % __bits_per_element);
if constexpr (_S_size < __bits_per_element)
return ((__v0 & __pow2) > 0)._M_concat_data();
else if constexpr (_S_size == __bits_per_element)
return ((__v0 & __pow2) != 0)._M_concat_data();
else
{
static_assert(_Bytes == 1);
static_assert(sizeof(_Ip) == 1);
_Bitmask<_S_size> __bits = __val;
static_assert(sizeof(_VecType) % sizeof(__bits) == 0);
if constexpr (sizeof(_DataType) == 32)
{
__vec_builtin_type<_UInt<8>, 4> __v1 = {
0xffu & (__bits >> (0 * __CHAR_BIT__)),
0xffu & (__bits >> (1 * __CHAR_BIT__)),
0xffu & (__bits >> (2 * __CHAR_BIT__)),
0xffu & (__bits >> (3 * __CHAR_BIT__)),
};
__v1 *= 0x0101'0101'0101'0101ull;
__v0 = __builtin_bit_cast(_VecType, __v1);
return ((__v0 & __pow2) != 0)._M_data;
}
else
{
using _V1 = vec<_Ip, sizeof(__bits)>;
_V1 __v1 = __builtin_bit_cast(_V1, __bits);
__v0 = _VecType::_S_static_permute(__v1, [](int __i) {
return __i / __CHAR_BIT__;
});
return ((__v0 & __pow2) != 0)._M_data;
}
}
}
}())
{}
// Effects of the uint constructor above ([simd.mask.ctor]): Initializes the first M elements
// to the corresponding bit values in val, where M is the smaller of size() and the number of
// bits in the value representation ([basic.types.general]) of the type of val. If M is less
// than size(), the remaining elements are initialized to zero.
// [simd.mask.subscr] ---------------------------------------------------
[[__gnu__::__always_inline__]]
constexpr value_type
operator[](__simd_size_type __i) const
{
__glibcxx_simd_precondition(__i >= 0 && __i < _S_size, "subscript is out of bounds");
if constexpr (_S_is_scalar)
return _M_data;
else if constexpr (_S_use_bitmask)
return bool((_M_data >> __i) & 1);
else
return _M_data[__i] & 1;
}
// [simd.mask.unary] ----------------------------------------------------
[[__gnu__::__always_inline__]]
constexpr basic_mask
operator!() const noexcept
{
if constexpr (_S_is_scalar)
return _S_init(!_M_data);
else
return _S_init(~_M_data);
}
[[__gnu__::__always_inline__]]
constexpr _VecType
operator+() const noexcept requires destructible<_VecType>
{ return operator _VecType(); }
constexpr _VecType
operator+() const noexcept = delete;
[[__gnu__::__always_inline__]]
constexpr _VecType
operator-() const noexcept requires destructible<_VecType>
{
using _Ip = typename _VecType::value_type;
if constexpr (_S_is_scalar)
return _Ip(-int(_M_data));
else if constexpr (_S_use_bitmask)
return __select_impl(*this, _Ip(-1), _Ip());
else
{
static_assert(sizeof(_VecType) == sizeof(_M_data));
return __builtin_bit_cast(_VecType, _M_data);
}
}
constexpr _VecType
operator-() const noexcept = delete;
[[__gnu__::__always_inline__]]
constexpr _VecType
operator~() const noexcept requires destructible<_VecType>
{
using _Ip = typename _VecType::value_type;
if constexpr (_S_is_scalar)
return _Ip(~int(_M_data));
else if constexpr (_S_use_bitmask)
return __select_impl(*this, _Ip(-2), _Ip(-1));
else
{
static_assert(sizeof(_VecType) == sizeof(_M_data));
return __builtin_bit_cast(_VecType, _M_data) - _Ip(1);
}
}
constexpr _VecType
operator~() const noexcept = delete;
// [simd.mask.conv] -----------------------------------------------------
template <typename _Up, typename _UAbi>
requires (_UAbi::_S_size == _S_size)
[[__gnu__::__always_inline__]]
constexpr explicit(sizeof(_Up) != _Bytes)
operator basic_vec<_Up, _UAbi>() const noexcept
{
if constexpr (_S_is_scalar)
return _Up(_M_data);
else
{
using _UV = basic_vec<_Up, _UAbi>;
return __select_impl(static_cast<_UV::mask_type>(*this), _UV(1), _UV(0));
}
}
using _Base::operator basic_vec;
// [simd.mask.namedconv] ------------------------------------------------
[[__gnu__::__always_inline__]]
constexpr bitset<_S_size>
to_bitset() const noexcept
{
// more than 64 elements in one register? not yet.
static_assert(_S_size <= numeric_limits<unsigned long long>::digits);
return to_ullong();
}
/** @internal
* Return the mask as the smallest possible unsigned integer (up to 64 bits).
*
* @tparam _Offset Adjust the return type & value to start at bit @p _Offset.
*/
template <int _Offset = 0, _ArchTraits _Traits = {}>
[[__gnu__::__always_inline__]]
constexpr _Bitmask<_S_size + _Offset>
_M_to_uint() const
{
constexpr int __nbits = _S_size;
static_assert(__nbits + _Offset <= numeric_limits<unsigned long long>::digits);
// before shifting
using _U0 = _Bitmask<__nbits>;
// potentially wider type needed for shift by _Offset
using _Ur = _Bitmask<__nbits + _Offset>;
if constexpr (_S_is_scalar || _S_use_bitmask)
{
auto __bits = _M_data;
if constexpr (_S_is_partial)
__bits &= _S_implicit_mask;
return _Ur(__bits) << _Offset;
}
else
{
#if _GLIBCXX_X86
if (!__is_const_known(*this))
{
_U0 __uint;
if constexpr (_Bytes != 2) // for 2-byte elements, (p)movmskb would produce two (duplicated) bits per element
__uint = _U0(__x86_movmsk(_M_data));
else if constexpr (_Bytes == 2 && _Traits._M_have_bmi2())
__uint = __bit_extract_even<__nbits>(__x86_movmsk(_M_data));
else if constexpr (_Bytes == 2)
return __similar_mask<char, __nbits, _Ap>(*this).template _M_to_uint<_Offset>();
else
static_assert(false);
// TODO: with AVX512 use __builtin_ia32_cvt[bwdq]2mask(128|256|512)
// TODO: Ask for compiler builtin to do the best of the above. This should also
// combine with a preceding vector-mask compare to produce a bit-mask compare (on
// AVX512)
if constexpr (_S_is_partial)
__uint &= (_U0(1) << _S_size) - 1;
return _Ur(__uint) << _Offset;
}
#endif
using _IV = _VecType;
static_assert(destructible<_IV>);
const typename _IV::mask_type& __k = [&] [[__gnu__::__always_inline__]] () {
if constexpr (is_same_v<typename _IV::mask_type, basic_mask>)
return *this;
else
return typename _IV::mask_type(*this);
}();
constexpr int __n = _IV::size();
if constexpr (_Bytes * __CHAR_BIT__ >= __n) // '1 << __iota' cannot overflow
{ // reduce(select(k, powers_of_2, 0))
constexpr _IV __pow2 = _IV(1) << __iota<_IV>;
return _Ur(_U0(__select_impl(__k, __pow2, _IV())
._M_reduce(bit_or<>()))) << _Offset;
}
else if constexpr (__n % __CHAR_BIT__ != 0)
{ // recurse after splitting in two
constexpr int __n_lo = __n - __n % __CHAR_BIT__;
const auto [__lo, __hi] = chunk<__n_lo>(__k);
_Ur __bits = __hi.template _M_to_uint<_Offset + __n_lo>();
return __bits | __lo.template _M_to_uint<_Offset>();
}
else
{ // limit powers_of_2 to 1, 2, 4, ..., 128
constexpr _IV __pow2 = _IV(1) << (__iota<_IV> % _IV(__CHAR_BIT__));
_IV __x = __select_impl(__k, __pow2, _IV());
// partial reductions of 8 neighboring elements
__x |= _IV::_S_static_permute(__x, _SwapNeighbors<4>());
__x |= _IV::_S_static_permute(__x, _SwapNeighbors<2>());
__x |= _IV::_S_static_permute(__x, _SwapNeighbors<1>());
// permute partial reduction results to the front
__x = _IV::_S_static_permute(__x, [](int __i) {
return __i * 8 < __n ? __i * 8 : uninit_element;
});
// extract front as scalar unsigned
_U0 __bits = __builtin_bit_cast(
__similar_vec<_U0, __n * _Bytes / sizeof(_U0), _Ap>, __x)[0];
// mask off unused bits
if constexpr (!__has_single_bit(unsigned(__nbits)))
__bits &= (_U0(1) << __nbits) - 1;
return _Ur(__bits) << _Offset;
}
}
}
[[__gnu__::__always_inline__]]
constexpr unsigned long long
to_ullong() const
{ return _M_to_uint(); }
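// Illustrative bit order for _M_to_uint() / to_ullong(): element __i maps to bit __i, so a
// 4-element mask [T, F, T, F] yields 0b0101 == 5.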
// [simd.mask.binary] ---------------------------------------------------
[[__gnu__::__always_inline__]]
friend constexpr basic_mask
operator&&(const basic_mask& __x, const basic_mask& __y) noexcept
{ return _S_init(__x._M_data & __y._M_data); }
[[__gnu__::__always_inline__]]
friend constexpr basic_mask
operator||(const basic_mask& __x, const basic_mask& __y) noexcept
{ return _S_init(__x._M_data | __y._M_data); }
[[__gnu__::__always_inline__]]
friend constexpr basic_mask
operator&(const basic_mask& __x, const basic_mask& __y) noexcept
{ return _S_init(__x._M_data & __y._M_data); }
[[__gnu__::__always_inline__]]
friend constexpr basic_mask
operator|(const basic_mask& __x, const basic_mask& __y) noexcept
{ return _S_init(__x._M_data | __y._M_data); }
[[__gnu__::__always_inline__]]
friend constexpr basic_mask
operator^(const basic_mask& __x, const basic_mask& __y) noexcept
{ return _S_init(__x._M_data ^ __y._M_data); }
// [simd.mask.cassign] --------------------------------------------------
[[__gnu__::__always_inline__]]
friend constexpr basic_mask&
operator&=(basic_mask& __x, const basic_mask& __y) noexcept
{
__x._M_data &= __y._M_data;
return __x;
}
[[__gnu__::__always_inline__]]
friend constexpr basic_mask&
operator|=(basic_mask& __x, const basic_mask& __y) noexcept
{
__x._M_data |= __y._M_data;
return __x;
}
[[__gnu__::__always_inline__]]
friend constexpr basic_mask&
operator^=(basic_mask& __x, const basic_mask& __y) noexcept
{
__x._M_data ^= __y._M_data;
return __x;
}
// [simd.mask.comparison] -----------------------------------------------
[[__gnu__::__always_inline__]]
friend constexpr basic_mask
operator==(const basic_mask& __x, const basic_mask& __y) noexcept
{ return !(__x ^ __y); }
[[__gnu__::__always_inline__]]
friend constexpr basic_mask
operator!=(const basic_mask& __x, const basic_mask& __y) noexcept
{ return __x ^ __y; }
[[__gnu__::__always_inline__]]
friend constexpr basic_mask
operator>=(const basic_mask& __x, const basic_mask& __y) noexcept
{ return __x || !__y; }
[[__gnu__::__always_inline__]]
friend constexpr basic_mask
operator<=(const basic_mask& __x, const basic_mask& __y) noexcept
{ return !__x || __y; }
[[__gnu__::__always_inline__]]
friend constexpr basic_mask
operator>(const basic_mask& __x, const basic_mask& __y) noexcept
{ return __x && !__y; }
[[__gnu__::__always_inline__]]
friend constexpr basic_mask
operator<(const basic_mask& __x, const basic_mask& __y) noexcept
{ return !__x && __y; }
// [simd.mask.cond] -----------------------------------------------------
[[__gnu__::__always_inline__]]
friend constexpr basic_mask
__select_impl(const basic_mask& __k, const basic_mask& __t, const basic_mask& __f) noexcept
{
if constexpr (!_S_use_bitmask)
{
#if _GLIBCXX_X86
// this works around bad code-gen when the compiler can't see that __k is a vector-mask.
// This pattern is recognized to match the x86 blend instructions, which only consider
// the sign bit of the mask register. Also, without SSE4, if the compiler knows that __k
// is a vector-mask, then the '< 0' is elided.
return __k._M_data < 0 ? __t._M_data : __f._M_data;
#endif
return __k._M_data ? __t._M_data : __f._M_data;
}
else
return (__k._M_data & __t._M_data) | (~__k._M_data & __f._M_data);
}
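// Illustrative: __select_impl(__k, __t, __f) is an element-wise blend; element __i of the
// result equals __t[__i] where __k[__i] is true and __f[__i] otherwise.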
[[__gnu__::__always_inline__]]
friend constexpr basic_mask
__select_impl(const basic_mask& __k, same_as<bool> auto __t, same_as<bool> auto __f) noexcept
{
if (__t == __f)
return basic_mask(__t);
else
return __t ? __k : !__k;
}
template <__vectorizable _T0, same_as<_T0> _T1>
requires (sizeof(_T0) == _Bytes)
[[__gnu__::__always_inline__]]
friend constexpr vec<_T0, _S_size>
__select_impl(const basic_mask& __k, const _T0& __t, const _T1& __f) noexcept
{
if constexpr (_S_is_scalar)
return __k._M_data ? __t : __f;
else
{
using _Vp = vec<_T0, _S_size>;
using _Mp = typename _Vp::mask_type;
return __select_impl(_Mp(__k), _Vp(__t), _Vp(__f));
}
}
// [simd.mask.reductions] implementation --------------------------------
[[__gnu__::__always_inline__]]
constexpr bool
_M_all_of() const noexcept
{
if constexpr (_S_is_scalar)
return _M_data;
else if constexpr (_S_use_bitmask)
{
if constexpr (_S_is_partial)
// PR120925 (partial kortest pattern not recognized)
return (_M_data & _S_implicit_mask) == _S_implicit_mask;
else
return _M_data == _S_implicit_mask;
}
#if _GLIBCXX_X86
else if (!__is_const_known(_M_data))
return __x86_vecmask_all<_S_size>(_M_data);
#endif
else
return _VecOps<_DataType, _S_size>::_S_all_of(_M_data);
}
[[__gnu__::__always_inline__]]
constexpr bool
_M_any_of() const noexcept
{
if constexpr (_S_is_scalar)
return _M_data;
else if constexpr (_S_use_bitmask)
{
if constexpr (_S_is_partial)
// PR120925 (partial kortest pattern not recognized)
return (_M_data & _S_implicit_mask) != 0;
else
return _M_data != 0;
}
#if _GLIBCXX_X86
else if (!__is_const_known(_M_data))
return __x86_vecmask_any<_S_size>(_M_data);
#endif
else
return _VecOps<_DataType, _S_size>::_S_any_of(_M_data);
}
[[__gnu__::__always_inline__]]
constexpr bool
_M_none_of() const noexcept
{
if constexpr (_S_is_scalar)
return !_M_data;
else if constexpr (_S_use_bitmask)
{
if constexpr (_S_is_partial)
// PR120925 (partial kortest pattern not recognized)
return (_M_data & _S_implicit_mask) == 0;
else
return _M_data == 0;
}
#if _GLIBCXX_X86
else if (!__is_const_known(_M_data))
return __x86_vecmask_none<_S_size>(_M_data);
#endif
else
return _VecOps<_DataType, _S_size>::_S_none_of(_M_data);
}
[[__gnu__::__always_inline__]]
constexpr __simd_size_type
_M_reduce_count() const noexcept
{
if constexpr (_S_is_scalar)
return int(_M_data);
else if constexpr (_S_size <= numeric_limits<unsigned>::digits)
return __builtin_popcount(_M_to_uint());
else
return __builtin_popcountll(to_ullong());
}
[[__gnu__::__always_inline__]]
constexpr __simd_size_type
_M_reduce_min_index() const
{
const auto __bits = _M_to_uint();
__glibcxx_simd_precondition(__bits, "An empty mask does not have a min_index.");
if constexpr (_S_size == 1)
return 0;
else
return __countr_zero(__bits);
}
[[__gnu__::__always_inline__]]
constexpr __simd_size_type
_M_reduce_max_index() const
{
const auto __bits = _M_to_uint();
__glibcxx_simd_precondition(__bits, "An empty mask does not have a max_index.");
if constexpr (_S_size == 1)
return 0;
else
return __highest_bit(__bits);
}
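// Illustrative: for the mask [F, T, F, T], _M_reduce_count() == 2, _M_reduce_min_index() == 1,
// and _M_reduce_max_index() == 3.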
[[__gnu__::__always_inline__]]
friend constexpr bool
__is_const_known(const basic_mask& __x)
{ return __builtin_constant_p(__x._M_data); }
};
template <size_t _Bytes, __abi_tag _Ap>
requires (_Ap::_S_nreg > 1)
class basic_mask<_Bytes, _Ap>
: public _MaskBase<_Bytes, _Ap>
{
using _Base = _MaskBase<_Bytes, _Ap>;
using _VecType = _Base::_VecType;
template <size_t, typename>
friend class basic_mask;
template <typename, typename>
friend class basic_vec;
static constexpr int _S_size = _Ap::_S_size;
static constexpr int _N0 = __bit_ceil(unsigned(_S_size)) / 2;
static constexpr int _N1 = _S_size - _N0;
static constexpr int _Nreg0 = __bit_ceil(unsigned(_Ap::_S_nreg)) / 2;
static constexpr int _Nreg1 = _Ap::_S_nreg - _Nreg0;
// explicitly request _Nreg0 rather than use __abi_rebind. This way _Float16 can use half
// of native registers (since they convert to full float32 registers).
using _Abi0 = decltype(_Ap::template _S_resize<_N0, _Nreg0>());
using _Abi1 = decltype(_Ap::template _S_resize<_N1, _Nreg1>());
using _Mask0 = basic_mask<_Bytes, _Abi0>;
// the implementation (and users) depend on elements being contiguous in memory
static_assert(_Mask0::_S_padding_bytes == 0 && !_Mask0::_S_is_partial);
using _Mask1 = basic_mask<_Bytes, _Abi1>;
static constexpr bool _S_is_partial = _Mask1::_S_is_partial;
// _Ap::_S_nreg determines how deep the recursion goes. E.g. basic_mask<4, _Abi<8, 4>> cannot
// use basic_mask<4, _Abi<4, 1>> as _Mask0/1 types.
static_assert(_Mask0::abi_type::_S_nreg + _Mask1::abi_type::_S_nreg == _Ap::_S_nreg);
static constexpr bool _S_use_bitmask = _Mask0::_S_use_bitmask;
static constexpr bool _S_is_scalar = _Mask0::_S_is_scalar;
_Mask0 _M_data0;
_Mask1 _M_data1;
static constexpr bool _S_has_bool_member = _Mask1::_S_has_bool_member;
// by construction _N0 >= _N1
// => sizeof(_Mask0) >= sizeof(_Mask1)
// and __alignof__(_Mask0) >= __alignof__(_Mask1)
static constexpr size_t _S_padding_bytes
= (__alignof__(_Mask0) == __alignof__(_Mask1)
? 0 : __alignof__(_Mask0) - (sizeof(_Mask1) % __alignof__(_Mask0)))
+ _Mask1::_S_padding_bytes;
public:
using value_type = bool;
using abi_type = _Ap;
using iterator = _Base::iterator;
using const_iterator = _Base::const_iterator;
[[__gnu__::__always_inline__]]
static constexpr basic_mask
_S_init(const _Mask0& __x, const _Mask1& __y)
{
basic_mask __r;
__r._M_data0 = __x;
__r._M_data1 = __y;
return __r;
}
[[__gnu__::__always_inline__]]
static constexpr basic_mask
_S_init(unsigned_integral auto __bits)
{ return basic_mask(__bits); }
template <typename _U0, typename _U1>
[[__gnu__::__always_inline__]]
static constexpr basic_mask
_S_init(const __trivial_pair<_U0, _U1>& __bits)
{
if constexpr (is_unsigned_v<_U0>)
{
static_assert(is_unsigned_v<_U1>);
return _S_init(_Mask0(__bits._M_first), _Mask1(__bits._M_second));
}
else if constexpr (is_unsigned_v<_U1>)
return _S_init(_Mask0::_S_init(__bits._M_first), _Mask1(__bits._M_second));
else
return _S_init(_Mask0::_S_init(__bits._M_first), _Mask1::_S_init(__bits._M_second));
}
[[__gnu__::__always_inline__]]
constexpr const _Mask0&
_M_get_low() const
{ return _M_data0; }
[[__gnu__::__always_inline__]]
constexpr const _Mask1&
_M_get_high() const
{ return _M_data1; }
template <size_t _UBytes, typename _UAbi>
[[__gnu__::__always_inline__]]
static constexpr basic_mask
_S_recursive_bit_cast(const basic_mask<_UBytes, _UAbi>& __x)
{
using _Mp = basic_mask<_UBytes, _UAbi>;
if constexpr (_Mp::_S_has_bool_member || sizeof(basic_mask) > sizeof(__x)
|| _Mp::_S_padding_bytes != 0)
return _S_init(__builtin_bit_cast(_Mask0, __x._M_data0),
_Mask1::_S_recursive_bit_cast(__x._M_data1));
else if constexpr (sizeof(basic_mask) == sizeof(__x))
return __builtin_bit_cast(basic_mask, __x);
else
{ // e.g. on IvyBridge (different alignment => different sizeof)
struct _Tmp { alignas(_Mp) basic_mask _M_data; };
return __builtin_bit_cast(_Tmp, __x)._M_data;
}
}
[[__gnu__::__always_inline__]]
constexpr auto
_M_concat_data(bool __do_sanitize = _S_is_partial) const
{
if constexpr (_S_use_bitmask)
{
static_assert(_S_size <= numeric_limits<unsigned long long>::digits,
"cannot concat more than 64 bits");
using _Up = _Bitmask<_S_size>;
return _Up(_M_data0._M_concat_data() | (_Up(_M_data1._M_concat_data(__do_sanitize)) << _N0));
}
else
{
auto __lo = _M_data0._M_concat_data();
auto __hi = __vec_zero_pad_to<sizeof(__lo)>(_M_data1._M_concat_data(__do_sanitize));
return __vec_concat(__lo, __hi);
}
}
template <_ArchTraits _Traits = {}>
[[__gnu__::__always_inline__]]
static constexpr basic_mask
_S_partial_mask_of_n(int __n)
{
#if __has_builtin(__builtin_ia32_bzhi_di)
if constexpr (_S_use_bitmask && _S_size <= 64 && _Traits._M_have_bmi2())
return basic_mask(__builtin_ia32_bzhi_di(~0ull >> (64 - _S_size), unsigned(__n)));
#endif
if constexpr (_N0 == 1)
{
static_assert(_S_size == 2); // => __n == 1
return _S_init(_Mask0(true), _Mask1(false));
}
else if (__n < _N0)
return _S_init(_Mask0::_S_partial_mask_of_n(__n), _Mask1(false));
else if (__n == _N0 || _N1 == 1)
return _S_init(_Mask0(true), _Mask1(false));
else if constexpr (_N1 != 1)
return _S_init(_Mask0(true), _Mask1::_S_partial_mask_of_n(__n - _N0));
}
[[__gnu__::__always_inline__]]
constexpr basic_mask&
_M_and_neighbors()
{
_M_data0._M_and_neighbors();
_M_data1._M_and_neighbors();
return *this;
}
[[__gnu__::__always_inline__]]
constexpr basic_mask&
_M_or_neighbors()
{
_M_data0._M_or_neighbors();
_M_data1._M_or_neighbors();
return *this;
}
template <typename _Mp>
[[__gnu__::__always_inline__]]
constexpr auto
_M_chunk() const noexcept
{
constexpr int __n = _S_size / _Mp::_S_size;
constexpr int __rem = _S_size % _Mp::_S_size;
constexpr auto [...__is] = _IotaArray<__n>;
if constexpr (__rem == 0)
return array<_Mp, __n>{__extract_simd_at<_Mp>(cw<_Mp::_S_size * __is>,
_M_data0, _M_data1)...};
else
{
using _Rest = resize_t<__rem, _Mp>;
return tuple(__extract_simd_at<_Mp>(cw<_Mp::_S_size * __is>, _M_data0, _M_data1)...,
__extract_simd_at<_Rest>(cw<_Mp::_S_size * __n>, _M_data0, _M_data1));
}
}
[[__gnu__::__always_inline__]]
static constexpr basic_mask
_S_concat(const basic_mask& __x0) noexcept
{ return __x0; }
template <typename... _As>
requires (sizeof...(_As) >= 2)
[[__gnu__::__always_inline__]]
static constexpr basic_mask
_S_concat(const basic_mask<_Bytes, _As>&... __xs) noexcept
{
static_assert(_S_size == (_As::_S_size + ...));
return _S_init(__extract_simd_at<_Mask0>(cw<0>, __xs...),
__extract_simd_at<_Mask1>(cw<_N0>, __xs...));
}
// [simd.mask.overview] default constructor -----------------------------
basic_mask() = default;
// [simd.mask.overview] conversion extensions ---------------------------
// TODO: any?
// [simd.mask.ctor] broadcast constructor -------------------------------
[[__gnu__::__always_inline__]]
constexpr explicit
basic_mask(same_as<bool> auto __x) noexcept // LWG 4382.
: _M_data0(__x), _M_data1(__x)
{}
// [simd.mask.ctor] conversion constructor ------------------------------
template <size_t _UBytes, typename _UAbi>
requires (_S_size == _UAbi::_S_size)
[[__gnu__::__always_inline__]]
constexpr explicit(__is_mask_conversion_explicit<_Ap, _UAbi>(_Bytes, _UBytes))
basic_mask(const basic_mask<_UBytes, _UAbi>& __x) noexcept
: _M_data0([&] {
if constexpr (_UAbi::_S_nreg > 1)
{
return __x._M_data0;
}
else if constexpr (_N0 == 1)
return _Mask0(__x[0]);
else
return get<0>(chunk<_N0>(__x));
}()),
_M_data1([&] {
if constexpr (_UAbi::_S_nreg > 1)
{
return __x._M_data1;
}
else if constexpr (_N1 == 1)
return _Mask1(__x[_N0]);
else
return get<1>(chunk<_N0>(__x));
}())
{}
using _Base::_MaskBase;
// [simd.mask.ctor] generator constructor -------------------------------
template <__simd_generator_invokable<bool, _S_size> _Fp>
[[__gnu__::__always_inline__]]
constexpr explicit
basic_mask(_Fp&& __gen)
: _M_data0(__gen), _M_data1([&] [[__gnu__::__always_inline__]] (auto __i) {
return __gen(__simd_size_c<__i + _N0>);
})
{}
// [simd.mask.ctor] bitset constructor ----------------------------------
[[__gnu__::__always_inline__]]
constexpr
basic_mask(const same_as<bitset<_S_size>> auto& __b) noexcept // LWG 4382.
: _M_data0(__bitset_split<_N0>(__b)._M_lo), _M_data1(__bitset_split<_N0>(__b)._M_hi)
{}
// [simd.mask.ctor] uint constructor ------------------------------------------
template <unsigned_integral _Tp>
requires (!same_as<_Tp, bool>) // LWG 4382.
[[__gnu__::__always_inline__]]
constexpr explicit
basic_mask(_Tp __val) noexcept
: _M_data0(static_cast<_Bitmask<_N0>>(__val)),
_M_data1(sizeof(_Tp) * __CHAR_BIT__ > _N0
? static_cast<_Bitmask<_N1>>(__val >> _N0) : _Bitmask<_N1>())
{}
// [simd.mask.subscr] ---------------------------------------------------
[[__gnu__::__always_inline__]]
constexpr value_type
operator[](__simd_size_type __i) const
{
__glibcxx_simd_precondition(__i >= 0 && __i < _S_size, "subscript is out of bounds");
if (__is_const_known(__i))
return __i < _N0 ? _M_data0[__i] : _M_data1[__i - _N0];
else if constexpr (_M_data1._S_has_bool_member)
// in some cases the last element can be 'bool' instead of bit-/vector-mask;
// e.g. mask<short, 17> is {mask<short, 16>, mask<short, 1>}, where the latter uses
// _ScalarAbi<1>, which is stored as 'bool'
return __i < _N0 ? _M_data0[__i] : _M_data1[__i - _N0];
else if constexpr (abi_type::_S_is_bitmask)
{
using _AliasingByte [[__gnu__::__may_alias__]] = unsigned char;
return bool((reinterpret_cast<const _AliasingByte*>(this)
[__i / __CHAR_BIT__] >> (__i % __CHAR_BIT__)) & 1);
}
else
{
using _AliasingInt [[__gnu__::__may_alias__]] = __integer_from<_Bytes>;
return reinterpret_cast<const _AliasingInt*>(this)[__i] != 0;
}
}
// [simd.mask.unary] ----------------------------------------------------
[[__gnu__::__always_inline__]]
constexpr basic_mask
operator!() const noexcept
{ return _S_init(!_M_data0, !_M_data1); }
[[__gnu__::__always_inline__]]
constexpr _VecType
operator+() const noexcept requires destructible<_VecType>
{ return _VecType::_S_concat(+_M_data0, +_M_data1); }
constexpr _VecType
operator+() const noexcept = delete;
[[__gnu__::__always_inline__]]
constexpr _VecType
operator-() const noexcept requires destructible<_VecType>
{ return _VecType::_S_concat(-_M_data0, -_M_data1); }
constexpr _VecType
operator-() const noexcept = delete;
[[__gnu__::__always_inline__]]
constexpr _VecType
operator~() const noexcept requires destructible<_VecType>
{ return _VecType::_S_concat(~_M_data0, ~_M_data1); }
constexpr _VecType
operator~() const noexcept = delete;
// [simd.mask.conv] -----------------------------------------------------
template <typename _Up, typename _UAbi>
requires (_UAbi::_S_size == _S_size)
[[__gnu__::__always_inline__]]
constexpr explicit(sizeof(_Up) != _Bytes)
operator basic_vec<_Up, _UAbi>() const noexcept
{
using _Rp = basic_vec<_Up, _UAbi>;
return _Rp::_S_init(static_cast<_Rp::_DataType0>(_M_data0),
static_cast<_Rp::_DataType1>(_M_data1));
}
using _Base::operator basic_vec;
// [simd.mask.namedconv] ------------------------------------------------
[[__gnu__::__always_inline__]]
constexpr bitset<_S_size>
to_bitset() const noexcept
{
if constexpr (_S_size <= numeric_limits<unsigned long long>::digits)
return to_ullong();
else
{
static_assert(_N0 % numeric_limits<unsigned long long>::digits == 0);
struct _Tmp
{
bitset<_N0> _M_lo;
bitset<_N1> _M_hi;
} __tmp = {_M_data0.to_bitset(), _M_data1.to_bitset()};
return __builtin_bit_cast(bitset<_S_size>, __tmp);
}
}
template <int _Offset = 0, _ArchTraits _Traits = {}>
[[__gnu__::__always_inline__]]
constexpr auto
_M_to_uint() const
{
constexpr int _N0x = _N0;
if constexpr (_N0x >= numeric_limits<unsigned long long>::digits)
{
static_assert(_Offset == 0);
return __trivial_pair {
_M_data0.template _M_to_uint<0>(),
_M_data1.template _M_to_uint<0>()
};
}
else
{
#if _GLIBCXX_X86
if constexpr (_Bytes == 2 && !_Traits._M_have_bmi2() && _Ap::_S_nreg == 2
&& !_S_use_bitmask)
return __similar_mask<char, _S_size, _Ap>(*this).template _M_to_uint<_Offset>();
#endif
auto __uint = _M_data1.template _M_to_uint<_N0x + _Offset>();
__uint |= _M_data0.template _M_to_uint<_Offset>();
return __uint;
}
}
[[__gnu__::__always_inline__]]
constexpr unsigned long long
to_ullong() const
{
if constexpr (_S_size <= numeric_limits<unsigned long long>::digits)
return _M_to_uint();
else
{
__glibcxx_simd_precondition(_M_data1.to_ullong() == 0,
"to_ullong called on mask with 'true' elements at indices "
"higher than representable in a ullong");
return _M_data0.to_ullong();
}
}
// [simd.mask.binary]
[[__gnu__::__always_inline__]]
friend constexpr basic_mask
operator&&(const basic_mask& __x, const basic_mask& __y) noexcept
{ return _S_init(__x._M_data0 && __y._M_data0, __x._M_data1 && __y._M_data1); }
[[__gnu__::__always_inline__]]
friend constexpr basic_mask
operator||(const basic_mask& __x, const basic_mask& __y) noexcept
{ return _S_init(__x._M_data0 || __y._M_data0, __x._M_data1 || __y._M_data1); }
[[__gnu__::__always_inline__]]
friend constexpr basic_mask
operator&(const basic_mask& __x, const basic_mask& __y) noexcept
{ return _S_init(__x._M_data0 & __y._M_data0, __x._M_data1 & __y._M_data1); }
[[__gnu__::__always_inline__]]
friend constexpr basic_mask
operator|(const basic_mask& __x, const basic_mask& __y) noexcept
{ return _S_init(__x._M_data0 | __y._M_data0, __x._M_data1 | __y._M_data1); }
[[__gnu__::__always_inline__]]
friend constexpr basic_mask
operator^(const basic_mask& __x, const basic_mask& __y) noexcept
{ return _S_init(__x._M_data0 ^ __y._M_data0, __x._M_data1 ^ __y._M_data1); }
// [simd.mask.cassign]
[[__gnu__::__always_inline__]]
friend constexpr basic_mask&
operator&=(basic_mask& __x, const basic_mask& __y) noexcept
{
__x._M_data0 &= __y._M_data0;
__x._M_data1 &= __y._M_data1;
return __x;
}
[[__gnu__::__always_inline__]]
friend constexpr basic_mask&
operator|=(basic_mask& __x, const basic_mask& __y) noexcept
{
__x._M_data0 |= __y._M_data0;
__x._M_data1 |= __y._M_data1;
return __x;
}
[[__gnu__::__always_inline__]]
friend constexpr basic_mask&
operator^=(basic_mask& __x, const basic_mask& __y) noexcept
{
__x._M_data0 ^= __y._M_data0;
__x._M_data1 ^= __y._M_data1;
return __x;
}
// [simd.mask.comparison] -----------------------------------------------
[[__gnu__::__always_inline__]]
friend constexpr basic_mask
operator==(const basic_mask& __x, const basic_mask& __y) noexcept
{ return !(__x ^ __y); }
[[__gnu__::__always_inline__]]
friend constexpr basic_mask
operator!=(const basic_mask& __x, const basic_mask& __y) noexcept
{ return __x ^ __y; }
[[__gnu__::__always_inline__]]
friend constexpr basic_mask
operator>=(const basic_mask& __x, const basic_mask& __y) noexcept
{ return __x || !__y; }
[[__gnu__::__always_inline__]]
friend constexpr basic_mask
operator<=(const basic_mask& __x, const basic_mask& __y) noexcept
{ return !__x || __y; }
[[__gnu__::__always_inline__]]
friend constexpr basic_mask
operator>(const basic_mask& __x, const basic_mask& __y) noexcept
{ return __x && !__y; }
[[__gnu__::__always_inline__]]
friend constexpr basic_mask
operator<(const basic_mask& __x, const basic_mask& __y) noexcept
{ return !__x && __y; }
// [simd.mask.cond] -----------------------------------------------------
[[__gnu__::__always_inline__]]
friend constexpr basic_mask
__select_impl(const basic_mask& __k, const basic_mask& __t, const basic_mask& __f) noexcept
{
return _S_init(__select_impl(__k._M_data0, __t._M_data0, __f._M_data0),
__select_impl(__k._M_data1, __t._M_data1, __f._M_data1));
}
[[__gnu__::__always_inline__]]
friend constexpr basic_mask
__select_impl(const basic_mask& __k, same_as<bool> auto __t, same_as<bool> auto __f) noexcept
{
if (__t == __f)
return basic_mask(__t);
else
return __t ? __k : !__k;
}
template <__vectorizable _T0, same_as<_T0> _T1>
requires (sizeof(_T0) == _Bytes)
[[__gnu__::__always_inline__]]
friend constexpr vec<_T0, _S_size>
__select_impl(const basic_mask& __k, const _T0& __t, const _T1& __f) noexcept
{
using _Vp = vec<_T0, _S_size>;
if constexpr (!is_same_v<basic_mask, typename _Vp::mask_type>)
return __select_impl(static_cast<_Vp::mask_type>(__k), __t, __f);
else
return _Vp::_S_init(__select_impl(__k._M_data0, __t, __f),
__select_impl(__k._M_data1, __t, __f));
}
template <_ArchTraits _Traits = {}>
[[__gnu__::__always_inline__]]
constexpr bool
_M_all_of() const
{
if constexpr (_N0 == _N1)
return (_M_data0 && _M_data1)._M_all_of();
else
return _M_data0._M_all_of() && _M_data1._M_all_of();
}
template <_ArchTraits _Traits = {}>
[[__gnu__::__always_inline__]]
constexpr bool
_M_any_of() const
{
if constexpr (_N0 == _N1)
return (_M_data0 || _M_data1)._M_any_of();
else
return _M_data0._M_any_of() || _M_data1._M_any_of();
}
template <_ArchTraits _Traits = {}>
[[__gnu__::__always_inline__]]
constexpr bool
_M_none_of() const
{
if constexpr (_N0 == _N1)
return (_M_data0 || _M_data1)._M_none_of();
else
return _M_data0._M_none_of() && _M_data1._M_none_of();
}
[[__gnu__::__always_inline__]]
constexpr __simd_size_type
_M_reduce_min_index() const
{
if constexpr (_S_size <= numeric_limits<unsigned long long>::digits)
{
const auto __bits = _M_to_uint();
__glibcxx_simd_precondition(__bits, "An empty mask does not have a min_index.");
if constexpr (_S_size == 1)
return 0;
else
return __countr_zero(_M_to_uint());
}
else if (_M_data0._M_none_of())
return _M_data1._M_reduce_min_index() + _N0;
else
return _M_data0._M_reduce_min_index();
}
[[__gnu__::__always_inline__]]
constexpr __simd_size_type
_M_reduce_max_index() const
{
if constexpr (_S_size <= numeric_limits<unsigned long long>::digits)
{
const auto __bits = _M_to_uint();
__glibcxx_simd_precondition(__bits, "An empty mask does not have a max_index.");
if constexpr (_S_size == 1)
return 0;
else
return __highest_bit(_M_to_uint());
}
else if (_M_data1._M_none_of())
return _M_data0._M_reduce_max_index();
else
return _M_data1._M_reduce_max_index() + _N0;
}
[[__gnu__::__always_inline__]]
friend constexpr bool
__is_const_known(const basic_mask& __x)
{ return __is_const_known(__x._M_data0) && __is_const_known(__x._M_data1); }
};
} // namespace simd
_GLIBCXX_END_NAMESPACE_VERSION
} // namespace std
#pragma GCC diagnostic pop
#endif // C++26
#endif // _GLIBCXX_SIMD_MASK_H