// Simd NEON specific implementations -*- C++ -*-
// Copyright (C) 2020-2021 Free Software Foundation, Inc.
//
// This file is part of the GNU ISO C++ Library.  This library is free
// software; you can redistribute it and/or modify it under the
// terms of the GNU General Public License as published by the
// Free Software Foundation; either version 3, or (at your option)
// any later version.

// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.

// Under Section 7 of GPL version 3, you are granted additional
// permissions described in the GCC Runtime Library Exception, version
// 3.1, as published by the Free Software Foundation.

// You should have received a copy of the GNU General Public License and
// a copy of the GCC Runtime Library Exception along with this program;
// see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
// <http://www.gnu.org/licenses/>.
#ifndef _GLIBCXX_EXPERIMENTAL_SIMD_NEON_H_
#define _GLIBCXX_EXPERIMENTAL_SIMD_NEON_H_
#if __cplusplus >= 201703L
#if !_GLIBCXX_SIMD_HAVE_NEON
#error "simd_neon.h may only be included when NEON on ARM is available"
#endif
_GLIBCXX_SIMD_BEGIN_NAMESPACE
// _CommonImplNeon {{{
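// NEON needs no store logic beyond the generic builtin implementation, so
// this struct only re-exports _S_store.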
struct _CommonImplNeon : _CommonImplBuiltin
{
// _S_store {{{
using _CommonImplBuiltin::_S_store;
// }}}
};
// }}}
// _SimdImplNeon {{{
template <typename _Abi>
struct _SimdImplNeon : _SimdImplBuiltin<_Abi>
{
using _Base = _SimdImplBuiltin<_Abi>;
template <typename _Tp>
using _MaskMember = typename _Base::template _MaskMember<_Tp>;
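    // Stores are limited to 16 bytes, the width of one NEON Q register.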
template <typename _Tp>
static constexpr size_t _S_max_store_size = 16;
// _S_masked_load {{{
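    // There is no NEON masked-load instruction, so load element-wise:
    // copy (and convert) only the lanes whose mask element is nonzero.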
template <typename _Tp, size_t _Np, typename _Up>
static inline _SimdWrapper<_Tp, _Np>
_S_masked_load(_SimdWrapper<_Tp, _Np> __merge, _MaskMember<_Tp> __k,
const _Up* __mem) noexcept
{
__execute_n_times<_Np>([&](auto __i) {
if (__k[__i] != 0)
__merge._M_set(__i, static_cast<_Tp>(__mem[__i]));
});
return __merge;
}
// }}}
// _S_masked_store_nocvt {{{
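    // Masked store without value conversion ("nocvt"): store element-wise,
    // writing only the lanes whose mask element is nonzero.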
template <typename _Tp, size_t _Np>
_GLIBCXX_SIMD_INTRINSIC static void
_S_masked_store_nocvt(_SimdWrapper<_Tp, _Np> __v, _Tp* __mem,
_MaskMember<_Tp> __k)
{
__execute_n_times<_Np>([&](auto __i) {
if (__k[__i] != 0)
__mem[__i] = __v[__i];
});
}
// }}}
// _S_reduce {{{
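    // Horizontal reduction.  A full 16-byte vector is split into two
    // 8-byte halves, which are combined with __binary_op and reduced
    // recursively.  Within one register, each step combines the vector
    // with a permutation of itself (adjacent lanes, then pairs of pairs,
    // then opposite halves), so log2(_Np) steps leave the complete
    // reduction in every lane; lane 0 is returned.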
template <typename _Tp, typename _BinaryOperation>
_GLIBCXX_SIMD_INTRINSIC static _Tp
_S_reduce(simd<_Tp, _Abi> __x, _BinaryOperation&& __binary_op)
{
constexpr size_t _Np = __x.size();
if constexpr (sizeof(__x) == 16 && _Np >= 4
&& !_Abi::template _S_is_partial<_Tp>)
{
const auto __halves = split<simd<_Tp, simd_abi::_Neon<8>>>(__x);
const auto __y = __binary_op(__halves[0], __halves[1]);
return _SimdImplNeon<simd_abi::_Neon<8>>::_S_reduce(
__y, static_cast<_BinaryOperation&&>(__binary_op));
}
else if constexpr (_Np == 8)
{
__x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
__vector_permute<1, 0, 3, 2, 5, 4, 7, 6>(
__x._M_data)));
__x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
__vector_permute<3, 2, 1, 0, 7, 6, 5, 4>(
__x._M_data)));
__x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
__vector_permute<7, 6, 5, 4, 3, 2, 1, 0>(
__x._M_data)));
return __x[0];
}
else if constexpr (_Np == 4)
{
__x
= __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
__vector_permute<1, 0, 3, 2>(__x._M_data)));
__x
= __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
__vector_permute<3, 2, 1, 0>(__x._M_data)));
return __x[0];
}
else if constexpr (_Np == 2)
{
__x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
__vector_permute<1, 0>(__x._M_data)));
return __x[0];
}
else
return _Base::_S_reduce(__x,
static_cast<_BinaryOperation&&>(__binary_op));
}
// }}}
// math {{{
// _S_sqrt {{{
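    // The vsqrt intrinsics are only available with the A64 ISA; otherwise
    // defer to the generic fallback in _SimdImplBuiltin.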
template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
_GLIBCXX_SIMD_INTRINSIC static _Tp _S_sqrt(_Tp __x)
{
if constexpr (__have_neon_a64)
{
const auto __intrin = __to_intrin(__x);
if constexpr (_TVT::template _S_is<float, 2>)
return vsqrt_f32(__intrin);
else if constexpr (_TVT::template _S_is<float, 4>)
return vsqrtq_f32(__intrin);
else if constexpr (_TVT::template _S_is<double, 1>)
return vsqrt_f64(__intrin);
else if constexpr (_TVT::template _S_is<double, 2>)
return vsqrtq_f64(__intrin);
else
__assert_unreachable<_Tp>();
}
else
return _Base::_S_sqrt(__x);
}
// }}}
// _S_trunc {{{
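    // ARMv8 (A32 and A64) provides vrnd, which rounds toward zero.
    // Without it, float values are truncated via a round trip through
    // int32, which is exact for |x| < 2^23 (0x1p23f); lanes with larger
    // magnitude are already integral (and NaN/inf fail the compare), so
    // the element-wise ?: select passes them through unchanged.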
template <typename _TW, typename _TVT = _VectorTraits<_TW>>
_GLIBCXX_SIMD_INTRINSIC static _TW _S_trunc(_TW __x)
{
using _Tp = typename _TVT::value_type;
if constexpr (__have_neon_a32)
{
const auto __intrin = __to_intrin(__x);
if constexpr (_TVT::template _S_is<float, 2>)
return vrnd_f32(__intrin);
else if constexpr (_TVT::template _S_is<float, 4>)
return vrndq_f32(__intrin);
else if constexpr (_TVT::template _S_is<double, 1>)
return vrnd_f64(__intrin);
else if constexpr (_TVT::template _S_is<double, 2>)
return vrndq_f64(__intrin);
else
__assert_unreachable<_Tp>();
}
else if constexpr (is_same_v<_Tp, float>)
{
auto __intrin = __to_intrin(__x);
if constexpr (sizeof(__x) == 16)
__intrin = vcvtq_f32_s32(vcvtq_s32_f32(__intrin));
else
__intrin = vcvt_f32_s32(vcvt_s32_f32(__intrin));
return _Base::_S_abs(__x)._M_data < 0x1p23f
? __vector_bitcast<float>(__intrin)
: __x._M_data;
}
else
return _Base::_S_trunc(__x);
}
// }}}
// _S_round {{{
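    // vrnda rounds to nearest with ties away from zero, matching
    // std::round semantics.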
template <typename _Tp, size_t _Np>
_GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
_S_round(_SimdWrapper<_Tp, _Np> __x)
{
if constexpr (__have_neon_a32)
{
const auto __intrin = __to_intrin(__x);
if constexpr (sizeof(_Tp) == 4 && sizeof(__x) == 8)
return vrnda_f32(__intrin);
else if constexpr (sizeof(_Tp) == 4 && sizeof(__x) == 16)
return vrndaq_f32(__intrin);
else if constexpr (sizeof(_Tp) == 8 && sizeof(__x) == 8)
return vrnda_f64(__intrin);
else if constexpr (sizeof(_Tp) == 8 && sizeof(__x) == 16)
return vrndaq_f64(__intrin);
else
__assert_unreachable<_Tp>();
}
else
return _Base::_S_round(__x);
}
// }}}
// _S_floor {{{
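    // vrndm rounds toward negative infinity (floor).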
template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
_GLIBCXX_SIMD_INTRINSIC static _Tp _S_floor(_Tp __x)
{
if constexpr (__have_neon_a32)
{
const auto __intrin = __to_intrin(__x);
if constexpr (_TVT::template _S_is<float, 2>)
return vrndm_f32(__intrin);
else if constexpr (_TVT::template _S_is<float, 4>)
return vrndmq_f32(__intrin);
else if constexpr (_TVT::template _S_is<double, 1>)
return vrndm_f64(__intrin);
else if constexpr (_TVT::template _S_is<double, 2>)
return vrndmq_f64(__intrin);
else
__assert_unreachable<_Tp>();
}
else
return _Base::_S_floor(__x);
}
// }}}
// _S_ceil {{{
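    // vrndp rounds toward positive infinity (ceil).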
template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
_GLIBCXX_SIMD_INTRINSIC static _Tp _S_ceil(_Tp __x)
{
if constexpr (__have_neon_a32)
{
const auto __intrin = __to_intrin(__x);
if constexpr (_TVT::template _S_is<float, 2>)
return vrndp_f32(__intrin);
else if constexpr (_TVT::template _S_is<float, 4>)
return vrndpq_f32(__intrin);
else if constexpr (_TVT::template _S_is<double, 1>)
return vrndp_f64(__intrin);
else if constexpr (_TVT::template _S_is<double, 2>)
return vrndpq_f64(__intrin);
else
__assert_unreachable<_Tp>();
}
else
return _Base::_S_ceil(__x);
}
//}}} }}}
}; // }}}
// _MaskImplNeonMixin {{{
struct _MaskImplNeonMixin
{
using _Base = _MaskImplBuiltinMixin;
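  // Convert a vector mask (every lane all-ones or all-zeros) to a bitmask.
  // Each lane is ANDed with a constant holding bit __i in lane __i; since
  // the selected bits are disjoint, horizontal addition acts as bitwise OR
  // and accumulates the bitmask.  AArch64 can use across-vector sums
  // (vaddv/vaddvq); AArch32 chains pairwise adds (vpadd) instead.  For 16
  // 1-byte lanes the bit pattern repeats per 8-lane half, so the two half
  // sums form the low and high byte of the 16-bit result.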
template <typename _Tp, size_t _Np>
_GLIBCXX_SIMD_INTRINSIC static constexpr _SanitizedBitMask<_Np>
_S_to_bits(_SimdWrapper<_Tp, _Np> __x)
{
if (__builtin_is_constant_evaluated())
return _Base::_S_to_bits(__x);
using _I = __int_for_sizeof_t<_Tp>;
if constexpr (sizeof(__x) == 16)
{
auto __asint = __vector_bitcast<_I>(__x);
#ifdef __aarch64__
[[maybe_unused]] constexpr auto __zero = decltype(__asint)();
#else
[[maybe_unused]] constexpr auto __zero = decltype(__lo64(__asint))();
#endif
if constexpr (sizeof(_Tp) == 1)
{
constexpr auto __bitsel
= __generate_from_n_evaluations<16, __vector_type_t<_I, 16>>(
[&](auto __i) {
return static_cast<_I>(
__i < _Np ? (__i < 8 ? 1 << __i : 1 << (__i - 8)) : 0);
});
__asint &= __bitsel;
#ifdef __aarch64__
return __vector_bitcast<_UShort>(
vpaddq_s8(vpaddq_s8(vpaddq_s8(__asint, __zero), __zero),
__zero))[0];
#else
return __vector_bitcast<_UShort>(
vpadd_s8(vpadd_s8(vpadd_s8(__lo64(__asint), __hi64(__asint)),
__zero),
__zero))[0];
#endif
}
else if constexpr (sizeof(_Tp) == 2)
{
constexpr auto __bitsel
= __generate_from_n_evaluations<8, __vector_type_t<_I, 8>>(
[&](auto __i) {
return static_cast<_I>(__i < _Np ? 1 << __i : 0);
});
__asint &= __bitsel;
#ifdef __aarch64__
return vaddvq_s16(__asint);
#else
return vpadd_s16(
vpadd_s16(vpadd_s16(__lo64(__asint), __hi64(__asint)), __zero),
__zero)[0];
#endif
}
else if constexpr (sizeof(_Tp) == 4)
{
constexpr auto __bitsel
= __generate_from_n_evaluations<4, __vector_type_t<_I, 4>>(
[&](auto __i) {
return static_cast<_I>(__i < _Np ? 1 << __i : 0);
});
__asint &= __bitsel;
#ifdef __aarch64__
return vaddvq_s32(__asint);
#else
return vpadd_s32(vpadd_s32(__lo64(__asint), __hi64(__asint)),
__zero)[0];
#endif
}
else if constexpr (sizeof(_Tp) == 8)
return (__asint[0] & 1) | (__asint[1] & 2);
else
__assert_unreachable<_Tp>();
}
else if constexpr (sizeof(__x) == 8)
{
auto __asint = __vector_bitcast<_I>(__x);
[[maybe_unused]] constexpr auto __zero = decltype(__asint)();
if constexpr (sizeof(_Tp) == 1)
{
constexpr auto __bitsel
= __generate_from_n_evaluations<8, __vector_type_t<_I, 8>>(
[&](auto __i) {
return static_cast<_I>(__i < _Np ? 1 << __i : 0);
});
__asint &= __bitsel;
#ifdef __aarch64__
return vaddv_s8(__asint);
#else
return vpadd_s8(vpadd_s8(vpadd_s8(__asint, __zero), __zero),
__zero)[0];
#endif
}
else if constexpr (sizeof(_Tp) == 2)
{
constexpr auto __bitsel
= __generate_from_n_evaluations<4, __vector_type_t<_I, 4>>(
[&](auto __i) {
return static_cast<_I>(__i < _Np ? 1 << __i : 0);
});
__asint &= __bitsel;
#ifdef __aarch64__
return vaddv_s16(__asint);
#else
return vpadd_s16(vpadd_s16(__asint, __zero), __zero)[0];
#endif
}
else if constexpr (sizeof(_Tp) == 4)
{
__asint &= __make_vector<_I>(0x1, 0x2);
#ifdef __aarch64__
return vaddv_s32(__asint);
#else
return vpadd_s32(__asint, __zero)[0];
#endif
}
else
__assert_unreachable<_Tp>();
}
else
return _Base::_S_to_bits(__x);
}
};
// }}}
// _MaskImplNeon {{{
template <typename _Abi>
struct _MaskImplNeon : _MaskImplNeonMixin, _MaskImplBuiltin<_Abi>
{
using _MaskImplBuiltinMixin::_S_to_maskvector;
using _MaskImplNeonMixin::_S_to_bits;
using _Base = _MaskImplBuiltin<_Abi>;
using _Base::_S_convert;
// _S_all_of {{{
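  // Force the bytes of inactive (padding) lanes to all-ones, then check
  // that every byte is 0xFF: for 16-byte masks the two int64 lanes must
  // both be -1 (each lane's bytes are 0x00 or 0xFF, so they sum to -2 only
  // in that case); an 8-byte mask is compared against -1 as one integer.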
template <typename _Tp>
_GLIBCXX_SIMD_INTRINSIC static bool _S_all_of(simd_mask<_Tp, _Abi> __k)
{
const auto __kk
= __vector_bitcast<char>(__k._M_data)
| ~__vector_bitcast<char>(_Abi::template _S_implicit_mask<_Tp>());
if constexpr (sizeof(__k) == 16)
{
const auto __x = __vector_bitcast<long long>(__kk);
return __x[0] + __x[1] == -2;
}
else if constexpr (sizeof(__k) <= 8)
return __bit_cast<__int_for_sizeof_t<decltype(__kk)>>(__kk) == -1;
else
__assert_unreachable<_Tp>();
}
// }}}
// _S_any_of {{{
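  // Zero the padding lanes (mirroring _S_none_of below); any_of holds iff
  // any bit of the masked vector is set.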
template <typename _Tp>
_GLIBCXX_SIMD_INTRINSIC static bool _S_any_of(simd_mask<_Tp, _Abi> __k)
{
      const auto __kk = _Abi::_S_masked(__k._M_data);
if constexpr (sizeof(__k) == 16)
{
const auto __x = __vector_bitcast<long long>(__kk);
return (__x[0] | __x[1]) != 0;
}
else if constexpr (sizeof(__k) <= 8)
return __bit_cast<__int_for_sizeof_t<decltype(__kk)>>(__kk) != 0;
else
__assert_unreachable<_Tp>();
}
// }}}
// _S_none_of {{{
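  // Zero the padding lanes; none_of holds iff the masked vector is
  // entirely zero.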
template <typename _Tp>
_GLIBCXX_SIMD_INTRINSIC static bool _S_none_of(simd_mask<_Tp, _Abi> __k)
{
const auto __kk = _Abi::_S_masked(__k._M_data);
if constexpr (sizeof(__k) == 16)
{
const auto __x = __vector_bitcast<long long>(__kk);
return (__x[0] | __x[1]) == 0;
}
else if constexpr (sizeof(__k) <= 8)
return __bit_cast<__int_for_sizeof_t<decltype(__kk)>>(__kk) == 0;
else
__assert_unreachable<_Tp>();
}
// }}}
// _S_some_of {{{
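  // some_of: true iff the mask is mixed, i.e. at least one active lane is
  // set and at least one active lane is clear.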
template <typename _Tp>
_GLIBCXX_SIMD_INTRINSIC static bool _S_some_of(simd_mask<_Tp, _Abi> __k)
{
      if constexpr (sizeof(__k) <= 8)
        {
          const auto __kk = _Abi::_S_masked(__k._M_data);
          using _Up = make_unsigned_t<__int_for_sizeof_t<decltype(__kk)>>;
          const _Up __bits = __bit_cast<_Up>(__kk);
          // at least one bit set, but not the full implicit-mask pattern
          return __bits != 0
                   && __bits != __bit_cast<_Up>(
                        _Abi::template _S_implicit_mask<_Tp>());
        }
else
return _Base::_S_some_of(__k);
}
// }}}
// _S_popcount {{{
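  // A true mask lane is all-ones, i.e. -1 as a signed integer, so summing
  // all lanes yields -popcount: fold the high half onto the low half, sum
  // the remaining lanes with pairwise adds, and negate the result.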
template <typename _Tp>
_GLIBCXX_SIMD_INTRINSIC static int _S_popcount(simd_mask<_Tp, _Abi> __k)
{
if constexpr (sizeof(_Tp) == 1)
{
const auto __s8 = __vector_bitcast<_SChar>(__k._M_data);
int8x8_t __tmp = __lo64(__s8) + __hi64z(__s8);
return -vpadd_s8(vpadd_s8(vpadd_s8(__tmp, int8x8_t()), int8x8_t()),
int8x8_t())[0];
}
else if constexpr (sizeof(_Tp) == 2)
{
const auto __s16 = __vector_bitcast<short>(__k._M_data);
int16x4_t __tmp = __lo64(__s16) + __hi64z(__s16);
return -vpadd_s16(vpadd_s16(__tmp, int16x4_t()), int16x4_t())[0];
}
else if constexpr (sizeof(_Tp) == 4)
{
const auto __s32 = __vector_bitcast<int>(__k._M_data);
int32x2_t __tmp = __lo64(__s32) + __hi64z(__s32);
return -vpadd_s32(__tmp, int32x2_t())[0];
}
else if constexpr (sizeof(_Tp) == 8)
{
static_assert(sizeof(__k) == 16);
const auto __s64 = __vector_bitcast<long>(__k._M_data);
return -(__s64[0] + __s64[1]);
}
}
// }}}
// _S_find_first_set {{{
template <typename _Tp>
_GLIBCXX_SIMD_INTRINSIC static int
_S_find_first_set(simd_mask<_Tp, _Abi> __k)
{
// TODO: the _Base implementation is not optimal for NEON
return _Base::_S_find_first_set(__k);
}
// }}}
// _S_find_last_set {{{
template <typename _Tp>
_GLIBCXX_SIMD_INTRINSIC static int
_S_find_last_set(simd_mask<_Tp, _Abi> __k)
{
// TODO: the _Base implementation is not optimal for NEON
return _Base::_S_find_last_set(__k);
}
// }}}
}; // }}}
_GLIBCXX_SIMD_END_NAMESPACE
#endif // __cplusplus >= 201703L
#endif // _GLIBCXX_EXPERIMENTAL_SIMD_NEON_H_
// vim: foldmethod=marker sw=2 noet ts=8 sts=2 tw=80