/* Subroutines used for code generation for RISC-V 'V' Extension for
GNU compiler.
Copyright (C) 2022-2023 Free Software Foundation, Inc.
Contributed by Juzhe Zhong (juzhe.zhong@rivai.ai), RiVAI Technologies Ltd.
This file is part of GCC.
GCC is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.
GCC is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3. If not see
<http://www.gnu.org/licenses/>. */
#define IN_TARGET_CODE 1
/* We have a maximum of 11 operands for RVV instruction patterns according to
vector.md. */
#define RVV_INSN_OPERANDS_MAX 11
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "tm.h"
#include "backend.h"
#include "rtl.h"
#include "insn-config.h"
#include "insn-attr.h"
#include "recog.h"
#include "alias.h"
#include "tree.h"
#include "stringpool.h"
#include "attribs.h"
#include "explow.h"
#include "memmodel.h"
#include "emit-rtl.h"
#include "tm_p.h"
#include "target.h"
#include "targhooks.h"
#include "expr.h"
#include "optabs.h"
#include "tm-constrs.h"
#include "rtx-vector-builder.h"
#include "targhooks.h"
using namespace riscv_vector;
namespace riscv_vector {
/* Return true if VLMAX is a compile-time constant that can be used as the
AVL of vsetivli. */
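/* E.g. with a fixed VLEN of 128, a vector of 32-bit elements at LMUL = 1 has
a compile-time VLMAX of 4, which fits the 5-bit unsigned AVL immediate of
vsetivli (illustrative numbers; the check below only requires that VLMAX
be a constant in [0, 31]). */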
static bool
const_vlmax_p (machine_mode mode)
{
poly_uint64 nunits = GET_MODE_NUNITS (mode);
return nunits.is_constant ()
/* vsetivli can only encode an immediate AVL of 0 ~ 31. */
? (IN_RANGE (nunits.to_constant (), 0, 31))
/* Only allowed in VLS-VLMAX mode. */
: false;
}
template <int MAX_OPERANDS> class insn_expander
{
public:
insn_expander ()
: m_opno (0), m_op_num (0), m_has_dest_p (false),
m_fully_unmasked_p (false), m_use_real_merge_p (false),
m_needs_avl_p (false), m_vlmax_p (false), m_has_tail_policy_p (false),
m_has_mask_policy_p (false), m_has_fp_rounding_mode_p (false),
m_tail_policy (TAIL_ANY), m_mask_policy (MASK_ANY),
m_fp_rounding_mode (FRM_DYN),
m_dest_mode (VOIDmode), m_mask_mode (VOIDmode),
m_vl_op (NULL_RTX)
{}
/* Initializer for various configurations. */
insn_expander (int op_num, bool has_dest_p, bool use_all_trues_mask_p,
bool use_real_merge_p, bool needs_avl_p, bool vlmax_p,
machine_mode dest_mode, machine_mode mask_mode)
: m_opno (0), m_op_num (op_num), m_has_dest_p (has_dest_p),
m_fully_unmasked_p (use_all_trues_mask_p),
m_use_real_merge_p (use_real_merge_p), m_needs_avl_p (needs_avl_p),
m_vlmax_p (vlmax_p), m_has_tail_policy_p (false),
m_has_mask_policy_p (false), m_has_fp_rounding_mode_p (false),
m_tail_policy (TAIL_ANY), m_mask_policy (MASK_ANY),
m_fp_rounding_mode (FRM_DYN),
m_dest_mode (dest_mode),
m_mask_mode (mask_mode), m_vl_op (NULL_RTX)
{}
void set_policy (enum tail_policy ta)
{
m_has_tail_policy_p = true;
m_tail_policy = ta;
}
void set_policy (enum mask_policy ma)
{
m_has_mask_policy_p = true;
m_mask_policy = ma;
}
void set_vl (rtx vl) { m_vl_op = vl; }
void set_rounding_mode (enum floating_point_rounding_mode mode)
{
m_has_fp_rounding_mode_p = true;
m_fp_rounding_mode = mode;
}
void add_output_operand (rtx x, machine_mode mode)
{
create_output_operand (&m_ops[m_opno++], x, mode);
gcc_assert (m_opno <= MAX_OPERANDS);
}
void add_input_operand (rtx x, machine_mode mode)
{
create_input_operand (&m_ops[m_opno++], x, mode);
gcc_assert (m_opno <= MAX_OPERANDS);
}
void add_all_one_mask_operand ()
{
add_input_operand (CONSTM1_RTX (m_mask_mode), m_mask_mode);
}
void add_vundef_operand ()
{
add_input_operand (RVV_VUNDEF (m_dest_mode), m_dest_mode);
}
void add_policy_operand ()
{
if (m_has_tail_policy_p)
{
rtx tail_policy_rtx = gen_int_mode (m_tail_policy, Pmode);
add_input_operand (tail_policy_rtx, Pmode);
}
if (m_has_mask_policy_p)
{
rtx mask_policy_rtx = gen_int_mode (m_mask_policy, Pmode);
add_input_operand (mask_policy_rtx, Pmode);
}
}
void add_avl_type_operand (avl_type type)
{
add_input_operand (gen_int_mode (type, Pmode), Pmode);
}
void add_rounding_mode_operand ()
{
if (m_has_fp_rounding_mode_p)
{
rtx frm_rtx = gen_int_mode (m_fp_rounding_mode, Pmode);
add_input_operand (frm_rtx, Pmode);
}
}
void emit_insn (enum insn_code icode, rtx *ops)
{
int opno = 0;
/* True if any operand is a memory operand. */
bool any_mem_p = false;
/* True if all operands are mask operands. */
bool all_mask_p = true;
if (m_has_dest_p)
{
any_mem_p |= MEM_P (ops[opno]);
all_mask_p &= GET_MODE_CLASS (GET_MODE (ops[opno])) == MODE_VECTOR_BOOL;
add_output_operand (ops[opno++], m_dest_mode);
}
if (m_fully_unmasked_p)
add_all_one_mask_operand ();
if (!m_use_real_merge_p)
add_vundef_operand ();
for (; opno < m_op_num; opno++)
{
any_mem_p |= MEM_P (ops[opno]);
all_mask_p &= GET_MODE_CLASS (GET_MODE (ops[opno])) == MODE_VECTOR_BOOL;
machine_mode mode = insn_data[(int) icode].operand[m_opno].mode;
/* create_input_operand doesn't allow VOIDmode.
According to vector.md, we may have some patterns that do not have
an explicit machine mode specified for the operand. Such operands are
always Pmode. */
if (mode == VOIDmode)
mode = Pmode;
add_input_operand (ops[opno], mode);
}
if (m_needs_avl_p)
{
rtx len = m_vl_op;
if (m_vlmax_p)
{
if (const_vlmax_p (m_dest_mode))
{
/* Optimize VLS-VLMAX code gen: we can use vsetivli instead of
vsetvli to obtain the value of vlmax. */
poly_uint64 nunits = GET_MODE_NUNITS (m_dest_mode);
len = gen_int_mode (nunits, Pmode);
m_vlmax_p = false; /* It has become NONVLMAX now. */
}
else if (can_create_pseudo_p ())
{
len = gen_reg_rtx (Pmode);
emit_vlmax_vsetvl (m_dest_mode, len);
}
}
add_input_operand (len, Pmode);
}
if (!all_mask_p)
add_policy_operand ();
if (m_needs_avl_p)
add_avl_type_operand (m_vlmax_p ? avl_type::VLMAX : avl_type::NONVLMAX);
add_rounding_mode_operand ();
expand (icode, any_mem_p);
}
void expand (enum insn_code icode, bool temporary_volatile_p = false)
{
if (temporary_volatile_p)
{
temporary_volatile_ok v (true);
expand_insn (icode, m_opno, m_ops);
}
else
expand_insn (icode, m_opno, m_ops);
}
private:
int m_opno;
int m_op_num;
/* True if the pattern has a dest operand. Most patterns have one,
whereas some, such as stores, do not. For example, according to
vector.md, indexed stores have no dest operand. */
bool m_has_dest_p;
/* True if the pattern uses an all-ones mask operand. */
bool m_fully_unmasked_p;
/* True if the pattern uses a real merge operand. */
bool m_use_real_merge_p;
bool m_needs_avl_p;
bool m_vlmax_p;
bool m_has_tail_policy_p;
bool m_has_mask_policy_p;
bool m_has_fp_rounding_mode_p;
enum tail_policy m_tail_policy;
enum mask_policy m_mask_policy;
enum floating_point_rounding_mode m_fp_rounding_mode;
machine_mode m_dest_mode;
machine_mode m_mask_mode;
rtx m_vl_op;
expand_operand m_ops[MAX_OPERANDS];
};
class rvv_builder : public rtx_vector_builder
{
public:
rvv_builder () : rtx_vector_builder () {}
rvv_builder (machine_mode mode, unsigned int npatterns,
unsigned int nelts_per_pattern)
: rtx_vector_builder (mode, npatterns, nelts_per_pattern)
{
m_inner_mode = GET_MODE_INNER (mode);
m_inner_bits_size = GET_MODE_BITSIZE (m_inner_mode);
m_inner_bytes_size = GET_MODE_SIZE (m_inner_mode);
m_mask_mode = get_mask_mode (mode).require ();
gcc_assert (
int_mode_for_size (inner_bits_size (), 0).exists (&m_inner_int_mode));
m_int_mode
= get_vector_mode (m_inner_int_mode, GET_MODE_NUNITS (mode)).require ();
}
bool can_duplicate_repeating_sequence_p ();
rtx get_merged_repeating_sequence ();
bool repeating_sequence_use_merge_profitable_p ();
rtx get_merge_scalar_mask (unsigned int) const;
bool single_step_npatterns_p () const;
bool npatterns_all_equal_p () const;
machine_mode new_mode () const { return m_new_mode; }
scalar_mode inner_mode () const { return m_inner_mode; }
scalar_int_mode inner_int_mode () const { return m_inner_int_mode; }
machine_mode mask_mode () const { return m_mask_mode; }
machine_mode int_mode () const { return m_int_mode; }
unsigned int inner_bits_size () const { return m_inner_bits_size; }
unsigned int inner_bytes_size () const { return m_inner_bytes_size; }
private:
scalar_mode m_inner_mode;
scalar_int_mode m_inner_int_mode;
machine_mode m_new_mode;
scalar_int_mode m_new_inner_mode;
machine_mode m_mask_mode;
machine_mode m_int_mode;
unsigned int m_inner_bits_size;
unsigned int m_inner_bytes_size;
};
/* Return true if the vector can be duplicated via a super element that is
the fusion of consecutive elements, e.g.:
v = { a, b, a, b }; super element = ab; v = { ab, ab }. */
bool
rvv_builder::can_duplicate_repeating_sequence_p ()
{
poly_uint64 new_size = exact_div (full_nelts (), npatterns ());
unsigned int new_inner_size = m_inner_bits_size * npatterns ();
if (!int_mode_for_size (new_inner_size, 0).exists (&m_new_inner_mode)
|| GET_MODE_SIZE (m_new_inner_mode) > UNITS_PER_WORD
|| !get_vector_mode (m_new_inner_mode, new_size).exists (&m_new_mode))
return false;
if (full_nelts ().is_constant ())
return repeating_sequence_p (0, full_nelts ().to_constant (), npatterns ());
return nelts_per_pattern () == 1;
}
/* Return true for a repeating sequence where the merge approach gives
better codegen than the default approach (slide1down).
Sequence A:
{a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b}
nelts = 16
npatterns = 2
for merging a we need mask 101010....
for merging b we need mask 010101....
For each element in the pattern, we need to build a mask in a scalar
register. Mostly we need 3 instructions (COST = 3), consisting of 2 scalar
instructions and 1 scalar move to the v0 register. Finally we need a vector
merge to merge them.
lui a5, #imm
add a5, #imm
vmov.s.x v0, a5
vmerge.vxm v9, v9, a1, v0
So the overall (rough) COST of Sequence A = (3 + 1) * npatterns = 8.
If we use slide1down, the COST = nelts = 16 > 8 (COST of merge).
So return true in this case as it is profitable.
Sequence B:
{a, b, c, d, e, f, g, h, a, b, c, d, e, f, g, h}
nelts = 16
npatterns = 8
COST of merge approach = (3 + 1) * npatterns = 24
COST of slide1down approach = nelts = 16
Return false in this case as the merge approach is NOT profitable.
*/
bool
rvv_builder::repeating_sequence_use_merge_profitable_p ()
{
if (inner_bytes_size () > UNITS_PER_WORD)
return false;
unsigned int nelts = full_nelts ().to_constant ();
if (!repeating_sequence_p (0, nelts, npatterns ()))
return false;
unsigned int merge_cost = 1;
unsigned int build_merge_mask_cost = 3;
unsigned int slide1down_cost = nelts;
return (build_merge_mask_cost + merge_cost) * npatterns () < slide1down_cost;
}
/* Merge the repeating sequence into a single element and return the RTX. */
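/* E.g. for { 0x1, 0x2, 0x1, 0x2 } with 16-bit elements, the loop below
computes (0x2 << 16) | 0x1 = 0x00020001, which can then be broadcast as a
single 32-bit element (values are illustrative). */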
rtx
rvv_builder::get_merged_repeating_sequence ()
{
scalar_int_mode mode = Pmode;
rtx target = gen_reg_rtx (mode);
emit_move_insn (target, const0_rtx);
rtx imm = gen_int_mode ((1ULL << m_inner_bits_size) - 1, mode);
/* { a, b, a, b }: Generate duplicate element = b << bits | a. */
for (unsigned int i = 0; i < npatterns (); i++)
{
unsigned int loc = m_inner_bits_size * i;
rtx shift = gen_int_mode (loc, mode);
rtx ele = gen_lowpart (mode, elt (i));
rtx tmp = expand_simple_binop (mode, AND, ele, imm, NULL_RTX, false,
OPTAB_DIRECT);
rtx tmp2 = expand_simple_binop (mode, ASHIFT, tmp, shift, NULL_RTX, false,
OPTAB_DIRECT);
rtx tmp3 = expand_simple_binop (mode, IOR, tmp2, target, NULL_RTX, false,
OPTAB_DIRECT);
emit_move_insn (target, tmp3);
}
if (GET_MODE_SIZE (m_new_inner_mode) < UNITS_PER_WORD)
return gen_lowpart (m_new_inner_mode, target);
return target;
}
/* Get the scalar mask for the merge approach.
Consider the following case:
{a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b}
To merge "a", the mask should be 1010....
To merge "b", the mask should be 0101....
*/
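/* E.g. with npatterns () == 2 and ELEN == 64, INDEX_IN_PATTERN == 1 yields
the scalar mask 0b1010...10 (0xaaaaaaaaaaaaaaaa), which selects every "b"
element once it has been broadcast to a mask register (values are
illustrative). */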
rtx
rvv_builder::get_merge_scalar_mask (unsigned int index_in_pattern) const
{
unsigned HOST_WIDE_INT mask = 0;
unsigned HOST_WIDE_INT base_mask = (1ULL << index_in_pattern);
/* Here we construct a mask pattern that will later be broadcast
to a vector register. The maximum broadcast size for vmv.v.x/vmv.s.x
is determined by the length of a vector element (ELEN) and not by
XLEN so make sure we do not exceed it. One example is -march=zve32*
which mandates ELEN == 32 but can be combined with -march=rv64
with XLEN == 64. */
unsigned int elen = TARGET_VECTOR_ELEN_64 ? 64 : 32;
gcc_assert (elen % npatterns () == 0);
int limit = elen / npatterns ();
for (int i = 0; i < limit; i++)
mask |= base_mask << (i * npatterns ());
return gen_int_mode (mask, inner_int_mode ());
}
/* Return true if the variable-length vector is single-step.
Single-step means that the steps of all patterns in NPATTERNS are equal.
Consider the following cases:
CASE 1: NPATTERNS = 2, NELTS_PER_PATTERN = 3.
{ 0, 2, 2, 4, 4, 6, ... }
First pattern: step1 = 2 - 0 = 2
step2 = 4 - 2 = 2
Second pattern: step1 = 4 - 2 = 2
step2 = 6 - 4 = 2
Since the steps of all patterns are equal (step = 2),
return true in this case.
CASE 2: NPATTERNS = 2, NELTS_PER_PATTERN = 3.
{ 0, 1, 2, 4, 4, 7, ... }
First pattern: step1 = 2 - 0 = 2
step2 = 4 - 2 = 2
Second pattern: step1 = 4 - 1 = 3
step2 = 7 - 4 = 3
Since not all steps are equal, return false. */
bool
rvv_builder::single_step_npatterns_p () const
{
if (nelts_per_pattern () != 3)
return false;
poly_int64 step
= rtx_to_poly_int64 (elt (npatterns ())) - rtx_to_poly_int64 (elt (0));
for (unsigned int i = 0; i < npatterns (); i++)
{
poly_int64 ele0 = rtx_to_poly_int64 (elt (i));
poly_int64 ele1 = rtx_to_poly_int64 (elt (npatterns () + i));
poly_int64 ele2 = rtx_to_poly_int64 (elt (npatterns () * 2 + i));
poly_int64 diff1 = ele1 - ele0;
poly_int64 diff2 = ele2 - ele1;
if (maybe_ne (step, diff1) || maybe_ne (step, diff2))
return false;
}
return true;
}
/* Return true if all elements of NPATTERNS are equal.
E.g. NPATTERNS = 4:
{ 2, 2, 2, 2, 4, 4, 4, 4, 8, 8, 8, 8, 16, 16, 16, 16, ... }
E.g. NPATTERNS = 8:
{ 2, 2, 2, 2, 2, 2, 2, 2, 8, 8, 8, 8, 8, 8, 8, 8, ... }
We only check whether ele[0] ~ ele[NPATTERNS - 1] are the same.
We don't need to check elements[n] with n >= NPATTERNS since
they don't belong to the same pattern.
*/
bool
rvv_builder::npatterns_all_equal_p () const
{
poly_int64 ele0 = rtx_to_poly_int64 (elt (0));
for (unsigned int i = 1; i < npatterns (); i++)
{
poly_int64 ele = rtx_to_poly_int64 (elt (i));
if (!known_eq (ele, ele0))
return false;
}
return true;
}
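/* Return the SEW (selected element width) of MODE. Mask modes
(MODE_VECTOR_BOOL) are treated as SEW = 8. */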
static unsigned
get_sew (machine_mode mode)
{
unsigned int sew = GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
? 8
: GET_MODE_BITSIZE (GET_MODE_INNER (mode));
return sew;
}
/* Return true if X is a const_vector that duplicates a single element and
that element is in the range [MINVAL, MAXVAL]. */
bool
const_vec_all_same_in_range_p (rtx x, HOST_WIDE_INT minval,
HOST_WIDE_INT maxval)
{
rtx elt;
return (const_vec_duplicate_p (x, &elt) && CONST_INT_P (elt)
&& IN_RANGE (INTVAL (elt), minval, maxval));
}
/* Return true if VEC is a constant in which every element is in the range
[MINVAL, MAXVAL]. The elements do not need to have the same value.
This function also exists in the aarch64 backend; we may unify it in the
middle-end in the future. */
static bool
const_vec_all_in_range_p (rtx vec, poly_int64 minval, poly_int64 maxval)
{
if (!CONST_VECTOR_P (vec)
|| GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
return false;
int nunits;
if (!CONST_VECTOR_STEPPED_P (vec))
nunits = const_vector_encoded_nelts (vec);
else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
return false;
for (int i = 0; i < nunits; i++)
{
rtx vec_elem = CONST_VECTOR_ELT (vec, i);
poly_int64 value;
if (!poly_int_rtx_p (vec_elem, &value)
|| maybe_lt (value, minval)
|| maybe_gt (value, maxval))
return false;
}
return true;
}
/* Return a constant vector in which every element is VAL.
This function also exists in the aarch64 backend; we may unify it in the
middle-end in the future. */
static rtx
gen_const_vector_dup (machine_mode mode, poly_int64 val)
{
rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
return gen_const_vec_duplicate (mode, c);
}
/* Emit a vlmax vsetvl instruction. This should only be used when
optimization is disabled or after vsetvl insertion pass. */
void
emit_hard_vlmax_vsetvl (machine_mode vmode, rtx vl)
{
unsigned int sew = get_sew (vmode);
emit_insn (gen_vsetvl (Pmode, vl, RVV_VLMAX, gen_int_mode (sew, Pmode),
gen_int_mode (get_vlmul (vmode), Pmode), const0_rtx,
const0_rtx));
}
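/* Emit a vlmax vsetvl. When optimizing, emit the vlmax_avl pseudo keyed on
the SEW/LMUL ratio and let the vsetvl insertion pass materialize it;
otherwise emit a hard vsetvl directly. */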
void
emit_vlmax_vsetvl (machine_mode vmode, rtx vl)
{
unsigned int sew = get_sew (vmode);
enum vlmul_type vlmul = get_vlmul (vmode);
unsigned int ratio = calculate_ratio (sew, vlmul);
if (!optimize)
emit_hard_vlmax_vsetvl (vmode, vl);
else
emit_insn (gen_vlmax_avl (Pmode, vl, gen_int_mode (ratio, Pmode)));
}
/* Calculate SEW/LMUL ratio. */
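/* E.g. SEW = 32 with LMUL = 1 gives ratio = 32, while SEW = 32 with the
fractional LMUL = 1/2 (LMUL_F2) gives ratio = 32 * 2 = 64. Modes that
share the same SEW/LMUL ratio also share the same VLMAX. */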
unsigned int
calculate_ratio (unsigned int sew, enum vlmul_type vlmul)
{
unsigned int ratio;
switch (vlmul)
{
case LMUL_1:
ratio = sew;
break;
case LMUL_2:
ratio = sew / 2;
break;
case LMUL_4:
ratio = sew / 4;
break;
case LMUL_8:
ratio = sew / 8;
break;
case LMUL_F8:
ratio = sew * 8;
break;
case LMUL_F4:
ratio = sew * 4;
break;
case LMUL_F2:
ratio = sew * 2;
break;
default:
gcc_unreachable ();
}
return ratio;
}
/* SCALABLE means that the vector length is agnostic (run-time invariant and
compile-time unknown). FIXED means that the vector length is specific
(compile-time known). Both RVV_SCALABLE and RVV_FIXED_VLMAX do
auto-vectorization using the VLMAX vsetvl configuration. */
static bool
autovec_use_vlmax_p (void)
{
return (riscv_autovec_preference == RVV_SCALABLE
|| riscv_autovec_preference == RVV_FIXED_VLMAX);
}
/* The RISC-V vsetvli pass uses "known vlmax" operations for optimization.
Whether or not an instruction actually is a vlmax operation is not
recognizable from the length operand alone but the avl_type operand
is used instead. In general, there are two cases:
- Emit a vlmax operation by passing a NULL length. Here we emit
a vsetvli with vlmax configuration and set the avl_type to VLMAX.
- Emit an operation that uses the existing (last-set) length and
set the avl_type to NONVLMAX.
Sometimes we also need to set the VLMAX avl_type to an operation that
already uses a given length register. This can happen during or after
register allocation when we are not allowed to create a new register.
For that case we also allow setting the avl_type to VLMAX.
*/
/* This function emits a {VLMAX, TAIL_ANY, MASK_ANY} vsetvli followed by the
* actual operation. */
void
emit_vlmax_insn (unsigned icode, int op_num, rtx *ops, rtx vl)
{
machine_mode dest_mode = GET_MODE (ops[0]);
machine_mode mask_mode = get_mask_mode (dest_mode).require ();
insn_expander<RVV_INSN_OPERANDS_MAX> e (op_num,
/* HAS_DEST_P */ true,
/* FULLY_UNMASKED_P */ true,
/* USE_REAL_MERGE_P */ false,
/* HAS_AVL_P */ true,
/* VLMAX_P */ true,
dest_mode,
mask_mode);
e.set_policy (TAIL_ANY);
e.set_policy (MASK_ANY);
/* According to the LRA mov pattern in vector.md, we have a clobber operand
to be used as the VL operand. */
e.set_vl (vl);
e.emit_insn ((enum insn_code) icode, ops);
}
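/* Like emit_vlmax_insn, but additionally sets the dynamic rounding mode
(FRM_DYN) operand used by floating-point instructions. */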
void
emit_vlmax_fp_insn (unsigned icode, int op_num, rtx *ops, rtx vl)
{
machine_mode dest_mode = GET_MODE (ops[0]);
machine_mode mask_mode = get_mask_mode (dest_mode).require ();
insn_expander<RVV_INSN_OPERANDS_MAX> e (op_num,
/* HAS_DEST_P */ true,
/* FULLY_UNMASKED_P */ true,
/* USE_REAL_MERGE_P */ false,
/* HAS_AVL_P */ true,
/* VLMAX_P */ true,
dest_mode,
mask_mode);
e.set_policy (TAIL_ANY);
e.set_policy (MASK_ANY);
e.set_rounding_mode (FRM_DYN);
e.set_vl (vl);
e.emit_insn ((enum insn_code) icode, ops);
}
/* This function emits a {VLMAX, TAIL_ANY, MASK_ANY} vsetvli followed by the
* ternary operation which always has a real merge operand. */
void
emit_vlmax_ternary_insn (unsigned icode, int op_num, rtx *ops, rtx vl)
{
machine_mode dest_mode = GET_MODE (ops[0]);
machine_mode mask_mode = get_mask_mode (dest_mode).require ();
insn_expander<RVV_INSN_OPERANDS_MAX> e (/*OP_NUM*/ op_num,
/*HAS_DEST_P*/ true,
/*FULLY_UNMASKED_P*/ true,
/*USE_REAL_MERGE_P*/ true,
/*HAS_AVL_P*/ true,
/*VLMAX_P*/ true,
/*DEST_MODE*/ dest_mode,
/*MASK_MODE*/ mask_mode);
e.set_policy (TAIL_ANY);
e.set_policy (MASK_ANY);
e.set_vl (vl);
e.emit_insn ((enum insn_code) icode, ops);
}
/* This function emits a {VLMAX, TAIL_ANY, MASK_ANY} vsetvli followed by the
* ternary operation which always has a real merge operand. */
void
emit_vlmax_fp_ternary_insn (unsigned icode, int op_num, rtx *ops, rtx vl)
{
machine_mode dest_mode = GET_MODE (ops[0]);
machine_mode mask_mode = get_mask_mode (dest_mode).require ();
insn_expander<RVV_INSN_OPERANDS_MAX> e (/*OP_NUM*/ op_num,
/*HAS_DEST_P*/ true,
/*FULLY_UNMASKED_P*/ true,
/*USE_REAL_MERGE_P*/ true,
/*HAS_AVL_P*/ true,
/*VLMAX_P*/ true,
/*DEST_MODE*/ dest_mode,
/*MASK_MODE*/ mask_mode);
e.set_policy (TAIL_ANY);
e.set_policy (MASK_ANY);
e.set_rounding_mode (FRM_DYN);
e.set_vl (vl);
e.emit_insn ((enum insn_code) icode, ops);
}
/* This function emits a {NONVLMAX, TAIL_ANY, MASK_ANY} vsetvli followed by the
* actual operation. */
void
emit_nonvlmax_insn (unsigned icode, int op_num, rtx *ops, rtx avl)
{
machine_mode dest_mode = GET_MODE (ops[0]);
machine_mode mask_mode = get_mask_mode (dest_mode).require ();
insn_expander<RVV_INSN_OPERANDS_MAX> e (op_num,
/* HAS_DEST_P */ true,
/* FULLY_UNMASKED_P */ true,
/* USE_REAL_MERGE_P */ false,
/* HAS_AVL_P */ true,
/* VLMAX_P */ false,
dest_mode,
mask_mode);
e.set_policy (TAIL_ANY);
e.set_policy (MASK_ANY);
e.set_vl (avl);
e.emit_insn ((enum insn_code) icode, ops);
}
/* This function emits a {VLMAX, TAIL_ANY, MASK_ANY} vsetvli followed by a
vslide insn (with real merge operand). */
void
emit_vlmax_slide_insn (unsigned icode, rtx *ops)
{
machine_mode dest_mode = GET_MODE (ops[0]);
machine_mode mask_mode = get_mask_mode (dest_mode).require ();
insn_expander<RVV_INSN_OPERANDS_MAX> e (RVV_SLIDE_OP,
/* HAS_DEST_P */ true,
/* FULLY_UNMASKED_P */ true,
/* USE_REAL_MERGE_P */ true,
/* HAS_AVL_P */ true,
/* VLMAX_P */ true,
dest_mode,
mask_mode);
e.set_policy (TAIL_ANY);
e.set_policy (MASK_ANY);
e.emit_insn ((enum insn_code) icode, ops);
}
/* This function emits a {NONVLMAX, TAIL_UNDISTURBED, MASK_ANY} vsetvli
followed by a vslide insn (with real merge operand). */
void
emit_nonvlmax_slide_tu_insn (unsigned icode, rtx *ops, rtx avl)
{
machine_mode dest_mode = GET_MODE (ops[0]);
machine_mode mask_mode = get_mask_mode (dest_mode).require ();
insn_expander<RVV_INSN_OPERANDS_MAX> e (RVV_SLIDE_OP,
/* HAS_DEST_P */ true,
/* FULLY_UNMASKED_P */ true,
/* USE_REAL_MERGE_P */ true,
/* HAS_AVL_P */ true,
/* VLMAX_P */ false,
dest_mode,
mask_mode);
e.set_policy (TAIL_UNDISTURBED);
e.set_policy (MASK_ANY);
e.set_vl (avl);
e.emit_insn ((enum insn_code) icode, ops);
}
/* This function emits a merge instruction. */
void
emit_vlmax_merge_insn (unsigned icode, int op_num, rtx *ops)
{
machine_mode dest_mode = GET_MODE (ops[0]);
machine_mode mask_mode = get_mask_mode (dest_mode).require ();
insn_expander<RVV_INSN_OPERANDS_MAX> e (op_num,
/* HAS_DEST_P */ true,
/* FULLY_UNMASKED_P */ false,
/* USE_REAL_MERGE_P */ false,
/* HAS_AVL_P */ true,
/* VLMAX_P */ true,
dest_mode,
mask_mode);
e.set_policy (TAIL_ANY);
e.emit_insn ((enum insn_code) icode, ops);
}
/* This function emits a compare instruction. */
void
emit_vlmax_cmp_insn (unsigned icode, rtx *ops)
{
machine_mode mode = GET_MODE (ops[0]);
insn_expander<RVV_INSN_OPERANDS_MAX> e (RVV_CMP_OP,
/* HAS_DEST_P */ true,
/* FULLY_UNMASKED_P */ true,
/* USE_REAL_MERGE_P */ false,
/* HAS_AVL_P */ true,
/* VLMAX_P */ true,
mode,
mode);
e.set_policy (MASK_ANY);
e.emit_insn ((enum insn_code) icode, ops);
}
/* This function emits a compare instruction with the mask-undisturbed (MU)
policy. */
void
emit_vlmax_cmp_mu_insn (unsigned icode, rtx *ops)
{
machine_mode mode = GET_MODE (ops[0]);
insn_expander<RVV_INSN_OPERANDS_MAX> e (RVV_CMP_MU_OP,
/* HAS_DEST_P */ true,
/* FULLY_UNMASKED_P */ false,
/* USE_REAL_MERGE_P */ true,
/* HAS_AVL_P */ true,
/* VLMAX_P */ true,
mode,
mode);
e.set_policy (MASK_UNDISTURBED);
e.emit_insn ((enum insn_code) icode, ops);
}
/* This function emits a masked instruction. */
static void
emit_vlmax_masked_insn (unsigned icode, int op_num, rtx *ops)
{
machine_mode dest_mode = GET_MODE (ops[0]);
machine_mode mask_mode = get_mask_mode (dest_mode).require ();
insn_expander<RVV_INSN_OPERANDS_MAX> e (/*OP_NUM*/ op_num,
/*HAS_DEST_P*/ true,
/*FULLY_UNMASKED_P*/ false,
/*USE_REAL_MERGE_P*/ true,
/*HAS_AVL_P*/ true,
/*VLMAX_P*/ true, dest_mode,
mask_mode);
e.set_policy (TAIL_ANY);
e.set_policy (MASK_ANY);
e.emit_insn ((enum insn_code) icode, ops);
}
/* This function emits a masked instruction. */
static void
emit_nonvlmax_masked_insn (unsigned icode, int op_num, rtx *ops, rtx avl)
{
machine_mode dest_mode = GET_MODE (ops[0]);
machine_mode mask_mode = get_mask_mode (dest_mode).require ();
insn_expander<RVV_INSN_OPERANDS_MAX> e (/*OP_NUM*/ op_num,
/*HAS_DEST_P*/ true,
/*FULLY_UNMASKED_P*/ false,
/*USE_REAL_MERGE_P*/ true,
/*HAS_AVL_P*/ true,
/*VLMAX_P*/ false, dest_mode,
mask_mode);
e.set_policy (TAIL_ANY);
e.set_policy (MASK_ANY);
e.set_vl (avl);
e.emit_insn ((enum insn_code) icode, ops);
}
/* This function emits a masked instruction. */
void
emit_vlmax_masked_mu_insn (unsigned icode, int op_num, rtx *ops)
{
machine_mode dest_mode = GET_MODE (ops[0]);
machine_mode mask_mode = get_mask_mode (dest_mode).require ();
insn_expander<RVV_INSN_OPERANDS_MAX> e (/*OP_NUM*/ op_num,
/*HAS_DEST_P*/ true,
/*FULLY_UNMASKED_P*/ false,
/*USE_REAL_MERGE_P*/ true,
/*HAS_AVL_P*/ true,
/*VLMAX_P*/ true, dest_mode,
mask_mode);
e.set_policy (TAIL_ANY);
e.set_policy (MASK_UNDISTURBED);
e.emit_insn ((enum insn_code) icode, ops);
}
/* Emit vmv.s.x instruction. */
void
emit_scalar_move_insn (unsigned icode, rtx *ops)
{
machine_mode dest_mode = GET_MODE (ops[0]);
machine_mode mask_mode = get_mask_mode (dest_mode).require ();
insn_expander<RVV_INSN_OPERANDS_MAX> e (riscv_vector::RVV_SCALAR_MOV_OP,
/* HAS_DEST_P */ true,
/* FULLY_UNMASKED_P */ false,
/* USE_REAL_MERGE_P */ true,
/* HAS_AVL_P */ true,
/* VLMAX_P */ false,
dest_mode,
mask_mode);
e.set_policy (TAIL_ANY);
e.set_policy (MASK_ANY);
e.set_vl (CONST1_RTX (Pmode));
e.emit_insn ((enum insn_code) icode, ops);
}
/* Emit vmv.v.x instruction with vlmax. */
static void
emit_vlmax_integer_move_insn (unsigned icode, rtx *ops, rtx vl)
{
emit_vlmax_insn (icode, riscv_vector::RVV_UNOP, ops, vl);
}
/* Emit vmv.v.x instruction with nonvlmax. */
void
emit_nonvlmax_integer_move_insn (unsigned icode, rtx *ops, rtx avl)
{
emit_nonvlmax_insn (icode, riscv_vector::RVV_UNOP, ops, avl);
}
/* This function emits a VLMAX vrgather instruction: vrgatherei16.vv when SEL
has a different element size than the data, vrgather.vx/vi when SEL is a
const duplicate vector, and vrgather.vv otherwise. */
static void
emit_vlmax_gather_insn (rtx target, rtx op, rtx sel)
{
rtx elt;
insn_code icode;
machine_mode data_mode = GET_MODE (target);
machine_mode sel_mode = GET_MODE (sel);
if (maybe_ne (GET_MODE_SIZE (data_mode), GET_MODE_SIZE (sel_mode)))
icode = code_for_pred_gatherei16 (data_mode);
else if (const_vec_duplicate_p (sel, &elt))
{
icode = code_for_pred_gather_scalar (data_mode);
sel = elt;
}
else
icode = code_for_pred_gather (data_mode);
rtx ops[] = {target, op, sel};
emit_vlmax_insn (icode, RVV_BINOP, ops);
}
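/* Like emit_vlmax_gather_insn, but emit a masked gather with the
mask-undisturbed policy so that inactive elements of TARGET are
preserved. */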
static void
emit_vlmax_masked_gather_mu_insn (rtx target, rtx op, rtx sel, rtx mask)
{
rtx elt;
insn_code icode;
machine_mode data_mode = GET_MODE (target);
machine_mode sel_mode = GET_MODE (sel);
if (maybe_ne (GET_MODE_SIZE (data_mode), GET_MODE_SIZE (sel_mode)))
icode = code_for_pred_gatherei16 (data_mode);
else if (const_vec_duplicate_p (sel, &elt))
{
icode = code_for_pred_gather_scalar (data_mode);
sel = elt;
}
else
icode = code_for_pred_gather (data_mode);
rtx ops[] = {target, mask, target, op, sel};
emit_vlmax_masked_mu_insn (icode, RVV_BINOP_MU, ops);
}
/* According to RVV ISA spec (16.5.1. Synthesizing vdecompress):
https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc
There is no inverse vdecompress provided, as this operation can be readily
synthesized using iota and a masked vrgather:
Desired functionality of 'vdecompress'
7 6 5 4 3 2 1 0 # vid
e d c b a # packed vector of 5 elements
1 0 0 1 1 1 0 1 # mask vector of 8 elements
p q r s t u v w # destination register before vdecompress
e q r d c b v a # result of vdecompress
# v0 holds mask
# v1 holds packed data
# v11 holds input expanded vector and result
viota.m v10, v0 # Calc iota from mask in v0
vrgather.vv v11, v1, v10, v0.t # Expand into destination
p q r s t u v w # v11 destination register
e d c b a # v1 source vector
1 0 0 1 1 1 0 1 # v0 mask vector
4 4 4 3 2 1 1 0 # v10 result of viota.m
e q r d c b v a # v11 destination after vrgather using viota.m under mask
*/
static void
emit_vlmax_decompress_insn (rtx target, rtx op0, rtx op1, rtx mask)
{
machine_mode data_mode = GET_MODE (target);
machine_mode sel_mode = related_int_vector_mode (data_mode).require ();
if (GET_MODE_INNER (data_mode) == QImode)
sel_mode = get_vector_mode (HImode, GET_MODE_NUNITS (data_mode)).require ();
rtx sel = gen_reg_rtx (sel_mode);
rtx iota_ops[] = {sel, mask};
emit_vlmax_insn (code_for_pred_iota (sel_mode), RVV_UNOP, iota_ops);
emit_vlmax_gather_insn (target, op0, sel);
emit_vlmax_masked_gather_mu_insn (target, op1, sel, mask);
}
/* Return the machine mode to use when broadcasting the merged value of a
repeating sequence: an integer vector mode with the same element size,
capped at one vector register's worth of elements. */
static machine_mode
get_repeating_sequence_dup_machine_mode (const rvv_builder &builder)
{
poly_uint64 dup_nunits = GET_MODE_NUNITS (builder.mode ());
if (known_ge (GET_MODE_SIZE (builder.mode ()), BYTES_PER_RISCV_VECTOR))
{
dup_nunits = exact_div (BYTES_PER_RISCV_VECTOR,
builder.inner_bytes_size ());
}
return get_vector_mode (builder.inner_int_mode (), dup_nunits).require ();
}
/* Expand a constant series vector: VECT_IV = BASE + I * STEP. */
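/* E.g. BASE = 2 and STEP = 3 yield { 2, 5, 8, 11, ... }, emitted roughly as
(register names are illustrative):
vid.v v1 # v1 = { 0, 1, 2, 3, ... }
vmul.vx v1, v1, a0 # a0 = 3
vadd.vx vd, v1, a1 # a1 = 2 */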
void
expand_vec_series (rtx dest, rtx base, rtx step)
{
machine_mode mode = GET_MODE (dest);
machine_mode mask_mode;
gcc_assert (get_mask_mode (mode).exists (&mask_mode));
poly_int64 nunits_m1 = GET_MODE_NUNITS (mode) - 1;
poly_int64 value;
/* VECT_IV = BASE + I * STEP. */
/* Step 1: Generate I = { 0, 1, 2, ... } by vid.v. */
rtx vid = gen_reg_rtx (mode);
rtx op[] = {vid};
emit_vlmax_insn (code_for_pred_series (mode), RVV_MISC_OP, op);
/* Step 2: Generate I * STEP.
- STEP is 1, we don't emit any instructions.
- STEP is power of 2, we use vsll.vi/vsll.vx.
- STEP is non-power of 2, we use vmul.vx. */
rtx step_adj;
if (rtx_equal_p (step, const1_rtx))
step_adj = vid;
else if (rtx_equal_p (step, constm1_rtx) && poly_int_rtx_p (base, &value)
&& known_eq (nunits_m1, value))
{
/* Special case:
{nunits - 1, nunits - 2, ... , 0}.
nunits can be either const_int or const_poly_int.
Code sequence:
vid.v v
vrsub nunits - 1, v. */
rtx ops[] = {dest, vid, gen_int_mode (nunits_m1, GET_MODE_INNER (mode))};
insn_code icode = code_for_pred_sub_reverse_scalar (mode);
emit_vlmax_insn (icode, RVV_BINOP, ops);
return;
}
else
{
step_adj = gen_reg_rtx (mode);
if (CONST_INT_P (step) && pow2p_hwi (INTVAL (step)))
{
/* Emit logical left shift operation. */
int shift = exact_log2 (INTVAL (step));
rtx shift_amount = gen_int_mode (shift, Pmode);
insn_code icode = code_for_pred_scalar (ASHIFT, mode);
rtx ops[] = {step_adj, vid, shift_amount};
emit_vlmax_insn (icode, RVV_BINOP, ops);
}
else
{
insn_code icode = code_for_pred_scalar (MULT, mode);
rtx ops[] = {step_adj, vid, step};
emit_vlmax_insn (icode, RVV_BINOP, ops);
}
}
/* Step 3: Generate BASE + I * STEP.
- BASE is 0, use result of vid.
- BASE is not 0, we use vadd.vx/vadd.vi. */
if (rtx_equal_p (base, const0_rtx))
{
emit_move_insn (dest, step_adj);
}
else
{
rtx result = gen_reg_rtx (mode);
insn_code icode = code_for_pred_scalar (PLUS, mode);
rtx ops[] = {result, step_adj, base};
emit_vlmax_insn (icode, RVV_BINOP, ops);
emit_move_insn (dest, result);
}
}
static void
expand_const_vector (rtx target, rtx src)
{
machine_mode mode = GET_MODE (target);
scalar_mode elt_mode = GET_MODE_INNER (mode);
if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
{
rtx elt;
gcc_assert (
const_vec_duplicate_p (src, &elt)
&& (rtx_equal_p (elt, const0_rtx) || rtx_equal_p (elt, const1_rtx)));
rtx ops[] = {target, src};
emit_vlmax_insn (code_for_pred_mov (mode), RVV_UNOP, ops);
return;
}
rtx elt;
if (const_vec_duplicate_p (src, &elt))
{
rtx tmp = register_operand (target, mode) ? target : gen_reg_rtx (mode);
/* If the element is an integer in the range -16 ~ 15 or a floating-point
0.0, we use the vmv.v.i instruction. */
if (satisfies_constraint_vi (src) || satisfies_constraint_Wc0 (src))
{
rtx ops[] = {tmp, src};
emit_vlmax_insn (code_for_pred_mov (mode), RVV_UNOP, ops);
}
else
{
elt = force_reg (elt_mode, elt);
rtx ops[] = {tmp, elt};
emit_vlmax_insn (code_for_pred_broadcast (mode), RVV_UNOP, ops);
}
if (tmp != target)
emit_move_insn (target, tmp);
return;
}
/* Support scalable const series vector. */
rtx base, step;
if (const_vec_series_p (src, &base, &step))
{
emit_insn (gen_vec_series (mode, target, base, step));
return;
}
/* Handle variable-length vector. */
unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
rvv_builder builder (mode, npatterns, nelts_per_pattern);
for (unsigned int i = 0; i < nelts_per_pattern; i++)
{
for (unsigned int j = 0; j < npatterns; j++)
builder.quick_push (CONST_VECTOR_ELT (src, i * npatterns + j));
}
builder.finalize ();
if (CONST_VECTOR_DUPLICATE_P (src))
{
/* Handle the case of a repeating sequence with NELTS_PER_PATTERN = 1.
E.g. NPATTERNS = 4, v = { 0, 2, 6, 7, ... }
NPATTERNS = 8, v = { 0, 2, 6, 7, 19, 20, 8, 7 ... }
The elements within NPATTERNS are not necessarily regular. */
if (builder.can_duplicate_repeating_sequence_p ())
{
/* We handle the case where we can find a vector container to hold
elements of bitsize = NPATTERNS * ele_bitsize.
NPATTERNS = 8, element width = 8
v = { 0, 1, 2, 3, 4, 5, 6, 7, ... }
In this case, we can combine NPATTERNS elements into a larger
element. Use element width = 64 and broadcast a vector with
all elements equal to 0x0706050403020100. */
rtx ele = builder.get_merged_repeating_sequence ();
rtx dup = expand_vector_broadcast (builder.new_mode (), ele);
emit_move_insn (target, gen_lowpart (mode, dup));
}
else
{
/* We handle the case where we can't find a vector container to hold
elements of bitsize = NPATTERNS * ele_bitsize.
NPATTERNS = 8, element width = 16
v = { 0, 1, 2, 3, 4, 5, 6, 7, ... }
Since NPATTERNS * element width = 128, we can't find a container
to hold it.
In this case, we use NPATTERNS merge operations to generate such
a vector. */
unsigned int nbits = npatterns - 1;
/* Generate vid = { 0, 1, 2, 3, 4, 5, 6, 7, ... }. */
rtx vid = gen_reg_rtx (builder.int_mode ());
rtx op[] = {vid};
emit_vlmax_insn (code_for_pred_series (builder.int_mode ()),
RVV_MISC_OP, op);
/* Generate vid_repeat = { 0, 1, ... nbits, ... } */
rtx vid_repeat = gen_reg_rtx (builder.int_mode ());
rtx and_ops[] = {vid_repeat, vid,
gen_int_mode (nbits, builder.inner_int_mode ())};
emit_vlmax_insn (code_for_pred_scalar (AND, builder.int_mode ()),
RVV_BINOP, and_ops);
rtx tmp = gen_reg_rtx (builder.mode ());
rtx dup_ops[] = {tmp, builder.elt (0)};
emit_vlmax_insn (code_for_pred_broadcast (builder.mode ()), RVV_UNOP,
dup_ops);
for (unsigned int i = 1; i < builder.npatterns (); i++)
{
/* Generate mask according to i. */
rtx mask = gen_reg_rtx (builder.mask_mode ());
rtx const_vec = gen_const_vector_dup (builder.int_mode (), i);
expand_vec_cmp (mask, EQ, vid_repeat, const_vec);
/* Merge scalar to each i. */
rtx tmp2 = gen_reg_rtx (builder.mode ());
rtx merge_ops[] = {tmp2, tmp, builder.elt (i), mask};
insn_code icode = code_for_pred_merge_scalar (builder.mode ());
emit_vlmax_merge_insn (icode, RVV_MERGE_OP, merge_ops);
tmp = tmp2;
}
emit_move_insn (target, tmp);
}
return;
}
else if (CONST_VECTOR_STEPPED_P (src))
{
gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
if (builder.single_step_npatterns_p ())
{
/* Describe the case by choosing NPATTERNS = 4 as an example. */
insn_code icode;
/* Step 1: Generate vid = { 0, 1, 2, 3, 4, 5, 6, 7, ... }. */
rtx vid = gen_reg_rtx (builder.mode ());
rtx vid_ops[] = {vid};
icode = code_for_pred_series (builder.mode ());
emit_vlmax_insn (icode, RVV_MISC_OP, vid_ops);
if (builder.npatterns_all_equal_p ())
{
/* Generate the variable-length vector following this rule:
{ a, a, a + step, a + step, a + step * 2, a + step * 2, ...}
E.g. { 0, 0, 8, 8, 16, 16, ... } */
/* We want to create a pattern where value[ix] is ix rounded down
to a multiple of NPATTERNS. As NPATTERNS is always a power of
two, we can rewrite this as ix & -NPATTERNS. */
/* Step 2: VID AND -NPATTERNS:
{ 0&-4, 1&-4, 2&-4, 3 &-4, 4 &-4, 5 &-4, 6 &-4, 7 &-4, ... }
*/
rtx imm
= gen_int_mode (-builder.npatterns (), builder.inner_mode ());
rtx and_ops[] = {target, vid, imm};
icode = code_for_pred_scalar (AND, builder.mode ());
emit_vlmax_insn (icode, RVV_BINOP, and_ops);
}
else
{
/* Generate the variable-length vector following this rule:
{ a, b, a, b, a + step, b + step, a + step*2, b + step*2, ...}
E.g. { 3, 2, 1, 0, 7, 6, 5, 4, ... } */
/* Step 2: Generate diff = TARGET - VID:
{ 3-0, 2-1, 1-2, 0-3, 7-4, 6-5, 5-6, 4-7, ... }*/
rvv_builder v (builder.mode (), builder.npatterns (), 1);
for (unsigned int i = 0; i < v.npatterns (); ++i)
{
/* Calculate the diff between the target sequence and
vid sequence. */
HOST_WIDE_INT diff = INTVAL (builder.elt (i)) - i;
v.quick_push (gen_int_mode (diff, v.inner_mode ()));
}
/* Step 3: Generate result = VID + diff. */
rtx vec = v.build ();
rtx add_ops[] = {target, vid, vec};
emit_vlmax_insn (code_for_pred (PLUS, builder.mode ()), RVV_BINOP,
add_ops);
}
}
else
/* TODO: We will enable more variable-length vector in the future. */
gcc_unreachable ();
}
else
gcc_unreachable ();
}
/* Expand a pre-RA RVV data move from SRC to DEST.
It expands move for RVV fractional vector modes. */
bool
legitimize_move (rtx dest, rtx src)
{
machine_mode mode = GET_MODE (dest);
if (CONST_VECTOR_P (src))
{
expand_const_vector (dest, src);
return true;
}
/* In order to decrease the memory traffic, we don't use whole register
load/store for LMUL less than 1 or for mask modes, so those cases require
one extra general-purpose register. That isn't allowed during the LRA
process, so we have a special move pattern used for LRA, which defers the
expansion until after LRA. */
if ((known_lt (GET_MODE_SIZE (mode), BYTES_PER_RISCV_VECTOR)
|| GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
&& lra_in_progress)
{
emit_insn (gen_mov_lra (mode, Pmode, dest, src));
return true;
}
if (known_ge (GET_MODE_SIZE (mode), BYTES_PER_RISCV_VECTOR)
&& GET_MODE_CLASS (mode) != MODE_VECTOR_BOOL)
{
/* Need to force register if mem <- !reg. */
if (MEM_P (dest) && !REG_P (src))
src = force_reg (mode, src);
return false;
}
if (register_operand (src, mode) && register_operand (dest, mode))
{
emit_insn (gen_rtx_SET (dest, src));
return true;
}
if (!register_operand (src, mode) && !register_operand (dest, mode))
{
rtx tmp = gen_reg_rtx (mode);
if (MEM_P (src))
{
rtx ops[] = {tmp, src};
emit_vlmax_insn (code_for_pred_mov (mode), RVV_UNOP, ops);
}
else
emit_move_insn (tmp, src);
src = tmp;
}
if (satisfies_constraint_vu (src))
return false;
rtx ops[] = {dest, src};
emit_vlmax_insn (code_for_pred_mov (mode), RVV_UNOP, ops);
return true;
}
/* VTYPE information for machine_mode. */
struct mode_vtype_group
{
enum vlmul_type vlmul_for_min_vlen32[NUM_MACHINE_MODES];
uint8_t ratio_for_min_vlen32[NUM_MACHINE_MODES];
enum vlmul_type vlmul_for_min_vlen64[NUM_MACHINE_MODES];
uint8_t ratio_for_min_vlen64[NUM_MACHINE_MODES];
enum vlmul_type vlmul_for_for_vlen128[NUM_MACHINE_MODES];
uint8_t ratio_for_for_vlen128[NUM_MACHINE_MODES];
machine_mode subpart_mode[NUM_MACHINE_MODES];
uint8_t nf[NUM_MACHINE_MODES];
mode_vtype_group ()
{
#define ENTRY(MODE, REQUIREMENT, VLMUL_FOR_MIN_VLEN32, RATIO_FOR_MIN_VLEN32, \
VLMUL_FOR_MIN_VLEN64, RATIO_FOR_MIN_VLEN64, \
VLMUL_FOR_MIN_VLEN128, RATIO_FOR_MIN_VLEN128) \
vlmul_for_min_vlen32[MODE##mode] = VLMUL_FOR_MIN_VLEN32; \
ratio_for_min_vlen32[MODE##mode] = RATIO_FOR_MIN_VLEN32; \
vlmul_for_min_vlen64[MODE##mode] = VLMUL_FOR_MIN_VLEN64; \
ratio_for_min_vlen64[MODE##mode] = RATIO_FOR_MIN_VLEN64; \
vlmul_for_for_vlen128[MODE##mode] = VLMUL_FOR_MIN_VLEN128; \
ratio_for_for_vlen128[MODE##mode] = RATIO_FOR_MIN_VLEN128;
#define TUPLE_ENTRY(MODE, REQUIREMENT, SUBPART_MODE, NF, VLMUL_FOR_MIN_VLEN32, \
RATIO_FOR_MIN_VLEN32, VLMUL_FOR_MIN_VLEN64, \
RATIO_FOR_MIN_VLEN64, VLMUL_FOR_MIN_VLEN128, \
RATIO_FOR_MIN_VLEN128) \
subpart_mode[MODE##mode] = SUBPART_MODE##mode; \
nf[MODE##mode] = NF; \
vlmul_for_min_vlen32[MODE##mode] = VLMUL_FOR_MIN_VLEN32; \
ratio_for_min_vlen32[MODE##mode] = RATIO_FOR_MIN_VLEN32; \
vlmul_for_min_vlen64[MODE##mode] = VLMUL_FOR_MIN_VLEN64; \
ratio_for_min_vlen64[MODE##mode] = RATIO_FOR_MIN_VLEN64; \
vlmul_for_for_vlen128[MODE##mode] = VLMUL_FOR_MIN_VLEN128; \
ratio_for_for_vlen128[MODE##mode] = RATIO_FOR_MIN_VLEN128;
#include "riscv-vector-switch.def"
#undef ENTRY
#undef TUPLE_ENTRY
}
};
static mode_vtype_group mode_vtype_infos;
/* Return the vlmul field value of MODE from the mode_vtype tables. */
enum vlmul_type
get_vlmul (machine_mode mode)
{
if (TARGET_MIN_VLEN >= 128)
return mode_vtype_infos.vlmul_for_for_vlen128[mode];
else if (TARGET_MIN_VLEN == 32)
return mode_vtype_infos.vlmul_for_min_vlen32[mode];
else
return mode_vtype_infos.vlmul_for_min_vlen64[mode];
}
/* Return the NF value of the corresponding mode. */
unsigned int
get_nf (machine_mode mode)
{
/* We don't allow non-tuple modes to go through this function. */
gcc_assert (riscv_v_ext_tuple_mode_p (mode));
return mode_vtype_infos.nf[mode];
}
/* Return the subpart mode of the tuple mode. For VNx2x1SImode,
the subpart mode is VNx1SImode. This will help to build
array/struct type in builtins. */
machine_mode
get_subpart_mode (machine_mode mode)
{
/* We don't allow non-tuple modes to go through this function. */
gcc_assert (riscv_v_ext_tuple_mode_p (mode));
return mode_vtype_infos.subpart_mode[mode];
}
/* Get ratio according to machine mode. */
unsigned int
get_ratio (machine_mode mode)
{
if (TARGET_MIN_VLEN >= 128)
return mode_vtype_infos.ratio_for_for_vlen128[mode];
else if (TARGET_MIN_VLEN == 32)
return mode_vtype_infos.ratio_for_min_vlen32[mode];
else
return mode_vtype_infos.ratio_for_min_vlen64[mode];
}
/* Get ta according to operand[tail_op_idx]. */
int
get_ta (rtx ta)
{
if (INTVAL (ta) == TAIL_ANY)
return INVALID_ATTRIBUTE;
return INTVAL (ta);
}
/* Get ma according to operand[mask_op_idx]. */
int
get_ma (rtx ma)
{
if (INTVAL (ma) == MASK_ANY)
return INVALID_ATTRIBUTE;
return INTVAL (ma);
}
/* Get prefer tail policy. */
enum tail_policy
get_prefer_tail_policy ()
{
/* TODO: By default, we choose TAIL_ANY, which allows the compiler to
pick either agnostic or undisturbed. Maybe we will add a compile
option like -mprefer=agnostic to set this value in the future. */
return TAIL_ANY;
}
/* Get prefer mask policy. */
enum mask_policy
get_prefer_mask_policy ()
{
/* TODO: By default, we choose MASK_ANY, which allows the compiler to
pick either agnostic or undisturbed. Maybe we will add a compile
option like -mprefer=agnostic to set this value in the future. */
return MASK_ANY;
}
/* Get avl_type rtx. */
rtx
get_avl_type_rtx (enum avl_type type)
{
return gen_int_mode (type, Pmode);
}
/* Return the appropriate mask mode for MODE. */
opt_machine_mode
get_mask_mode (machine_mode mode)
{
return get_vector_mode (BImode, GET_MODE_NUNITS (mode));
}
/* Return the RVV vector mode that has NUNITS elements of mode INNER_MODE.
This function is not only used by builtins, but will also be used by
auto-vectorization in the future. */
opt_machine_mode
get_vector_mode (scalar_mode inner_mode, poly_uint64 nunits)
{
enum mode_class mclass;
if (inner_mode == E_BImode)
mclass = MODE_VECTOR_BOOL;
else if (FLOAT_MODE_P (inner_mode))
mclass = MODE_VECTOR_FLOAT;
else
mclass = MODE_VECTOR_INT;
machine_mode mode;
FOR_EACH_MODE_IN_CLASS (mode, mclass)
if (inner_mode == GET_MODE_INNER (mode)
&& known_eq (nunits, GET_MODE_NUNITS (mode))
&& riscv_v_ext_vector_mode_p (mode))
return mode;
return opt_machine_mode ();
}
/* Return the RVV tuple mode if we can find the legal tuple mode for the
corresponding subpart mode and NF. */
opt_machine_mode
get_tuple_mode (machine_mode subpart_mode, unsigned int nf)
{
poly_uint64 nunits = GET_MODE_NUNITS (subpart_mode) * nf;
scalar_mode inner_mode = GET_MODE_INNER (subpart_mode);
enum mode_class mclass = GET_MODE_CLASS (subpart_mode);
machine_mode mode;
FOR_EACH_MODE_IN_CLASS (mode, mclass)
if (inner_mode == GET_MODE_INNER (mode)
&& known_eq (nunits, GET_MODE_NUNITS (mode))
&& riscv_v_ext_tuple_mode_p (mode)
&& get_subpart_mode (mode) == subpart_mode)
return mode;
return opt_machine_mode ();
}
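/* Return true if X is a CONST_INT that fits in a signed 5-bit immediate,
i.e. the immediate range of the .vi instruction forms. */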
bool
simm5_p (rtx x)
{
if (!CONST_INT_P (x))
return false;
return IN_RANGE (INTVAL (x), -16, 15);
}
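/* Return true if -X fits in a signed 5-bit immediate, i.e. X is in
[-15, 16]. Used for codes whose .vi form takes the negated value. */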
bool
neg_simm5_p (rtx x)
{
if (!CONST_INT_P (x))
return false;
return IN_RANGE (INTVAL (x), -15, 16);
}
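/* Return true if immediate X allows a .vi variant of operator CODE.
Codes that are implemented by adjusting or negating the immediate
(e.g. MINUS expressed as vadd.vi with -X) use the negated range. */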
bool
has_vi_variant_p (rtx_code code, rtx x)
{
switch (code)
{
case PLUS:
case AND:
case IOR:
case XOR:
case SS_PLUS:
case US_PLUS:
case EQ:
case NE:
case LE:
case LEU:
case GT:
case GTU:
return simm5_p (x);
case LT:
case LTU:
case GE:
case GEU:
case MINUS:
case SS_MINUS:
return neg_simm5_p (x);
default:
return false;
}
}
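/* Helper for legitimizing a SEW = 64 scalar operand when it does not fit in
a GPR (!TARGET_64BIT). Return true if the expansion was fully handled
here (the scalar was broadcast to a vector and EMIT_VECTOR_FUNC emitted
the operation); return false if the caller should continue with the
legitimized *SCALAR_OP. */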
bool
sew64_scalar_helper (rtx *operands, rtx *scalar_op, rtx vl,
machine_mode vector_mode, bool has_vi_variant_p,
void (*emit_vector_func) (rtx *, rtx))
{
machine_mode scalar_mode = GET_MODE_INNER (vector_mode);
if (has_vi_variant_p)
{
*scalar_op = force_reg (scalar_mode, *scalar_op);
return false;
}
if (TARGET_64BIT)
{
if (!rtx_equal_p (*scalar_op, const0_rtx))
*scalar_op = force_reg (scalar_mode, *scalar_op);
return false;
}
if (immediate_operand (*scalar_op, Pmode))
{
if (!rtx_equal_p (*scalar_op, const0_rtx))
*scalar_op = force_reg (Pmode, *scalar_op);
*scalar_op = gen_rtx_SIGN_EXTEND (scalar_mode, *scalar_op);
return false;
}
if (CONST_INT_P (*scalar_op))
*scalar_op = force_reg (scalar_mode, *scalar_op);
rtx tmp = gen_reg_rtx (vector_mode);
rtx ops[] = {tmp, *scalar_op};
riscv_vector::emit_nonvlmax_insn (code_for_pred_broadcast (vector_mode),
riscv_vector::RVV_UNOP, ops, vl);
emit_vector_func (operands, tmp);
return true;
}
/* Get the { 1, 0, 0, 0, ... } mask in which only the first element is
active. */
rtx
gen_scalar_move_mask (machine_mode mode)
{
rtx_vector_builder builder (mode, 1, 2);
builder.quick_push (const1_rtx);
builder.quick_push (const0_rtx);
return builder.build ();
}
static unsigned
compute_vlmax (unsigned vector_bits, unsigned elt_size, unsigned min_size)
{
// Original equation:
// VLMAX = (VectorBits / EltSize) * LMUL
// where LMUL = MinSize / TARGET_MIN_VLEN
// The following equations have been reordered to prevent loss of precision
// when calculating fractional LMUL.
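// E.g. with TARGET_MIN_VLEN = 128: vector_bits = 256, elt_size = 32 and
// min_size = 64 (i.e. LMUL = 1/2) give VLMAX = ((256 / 32) * 64) / 128 = 4,
// matching (256 / 32) * 1/2 without truncating the fractional LMUL.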
return ((vector_bits / elt_size) * min_size) / TARGET_MIN_VLEN;
}
static unsigned
get_unknown_min_value (machine_mode mode)
{
enum vlmul_type vlmul = get_vlmul (mode);
switch (vlmul)
{
case LMUL_1:
return TARGET_MIN_VLEN;
case LMUL_2:
return TARGET_MIN_VLEN * 2;
case LMUL_4:
return TARGET_MIN_VLEN * 4;
case LMUL_8:
return TARGET_MIN_VLEN * 8;
default:
gcc_unreachable ();
}
}
static rtx
force_vector_length_operand (rtx vl)
{
if (CONST_INT_P (vl) && !satisfies_constraint_K (vl))
return force_reg (Pmode, vl);
return vl;
}
static rtx
gen_no_side_effects_vsetvl_rtx (machine_mode vmode, rtx vl, rtx avl)
{
unsigned int sew = get_sew (vmode);
return gen_vsetvl_no_side_effects (Pmode, vl, avl, gen_int_mode (sew, Pmode),
gen_int_mode (get_vlmul (vmode), Pmode),
const0_rtx, const0_rtx);
}
/* Get the VL * 2 rtx needed by the demoted (SEW = 32) operations. */
static rtx
get_vl_x2_rtx (rtx avl, machine_mode mode, machine_mode demote_mode)
{
rtx i32vl = NULL_RTX;
if (CONST_INT_P (avl))
{
unsigned elt_size = GET_MODE_BITSIZE (GET_MODE_INNER (mode));
unsigned min_size = get_unknown_min_value (mode);
unsigned vlen_max = RVV_65536;
unsigned vlmax_max = compute_vlmax (vlen_max, elt_size, min_size);
unsigned vlen_min = TARGET_MIN_VLEN;
unsigned vlmax_min = compute_vlmax (vlen_min, elt_size, min_size);
unsigned HOST_WIDE_INT avl_int = INTVAL (avl);
if (avl_int <= vlmax_min)
i32vl = gen_int_mode (2 * avl_int, Pmode);
else if (avl_int >= 2 * vlmax_max)
{
// Just set i32vl to VLMAX in this situation
i32vl = gen_reg_rtx (Pmode);
emit_insn (
gen_no_side_effects_vsetvl_rtx (demote_mode, i32vl, RVV_VLMAX));
}
else
{
// For AVL in (MinVLMAX, 2 * MaxVLMAX), the actual working vl
// depends on the hardware implementation, so let the following
// code handle it.
}
}
if (!i32vl)
{
// Use a vsetvli instruction to get the actually used length, which
// depends on the hardware implementation.
rtx i64vl = gen_reg_rtx (Pmode);
emit_insn (
gen_no_side_effects_vsetvl_rtx (mode, i64vl, force_reg (Pmode, avl)));
// Scale by 2 for the 32-bit length.
i32vl = gen_reg_rtx (Pmode);
emit_insn (
gen_rtx_SET (i32vl, gen_rtx_ASHIFT (Pmode, i64vl, const1_rtx)));
}
return force_vector_length_operand (i32vl);
}
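/* Helper for expanding vslide1up/vslide1down with a SEW = 64 scalar on
!TARGET_64BIT. The 64-bit scalar is split into two 32-bit halves that
are inserted by two demoted (SEW = 32) slide instructions operating on
VL * 2 elements. Return true if the expansion was fully handled here,
false if the caller should emit the ordinary pattern itself. */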
bool
slide1_sew64_helper (int unspec, machine_mode mode, machine_mode demote_mode,
machine_mode demote_mask_mode, rtx *ops)
{
rtx scalar_op = ops[4];
rtx avl = ops[5];
machine_mode scalar_mode = GET_MODE_INNER (mode);
if (rtx_equal_p (scalar_op, const0_rtx))
{
ops[5] = force_vector_length_operand (ops[5]);
return false;
}
if (TARGET_64BIT)
{
ops[4] = force_reg (scalar_mode, scalar_op);
ops[5] = force_vector_length_operand (ops[5]);
return false;
}
if (immediate_operand (scalar_op, Pmode))
{
ops[4] = gen_rtx_SIGN_EXTEND (scalar_mode, force_reg (Pmode, scalar_op));
ops[5] = force_vector_length_operand (ops[5]);
return false;
}
if (CONST_INT_P (scalar_op))
scalar_op = force_reg (scalar_mode, scalar_op);
rtx vl_x2 = get_vl_x2_rtx (avl, mode, demote_mode);
rtx demote_scalar_op1, demote_scalar_op2;
if (unspec == UNSPEC_VSLIDE1UP)
{
demote_scalar_op1 = gen_highpart (Pmode, scalar_op);
demote_scalar_op2 = gen_lowpart (Pmode, scalar_op);
}
else
{
demote_scalar_op1 = gen_lowpart (Pmode, scalar_op);
demote_scalar_op2 = gen_highpart (Pmode, scalar_op);
}
rtx temp = gen_reg_rtx (demote_mode);
rtx ta = gen_int_mode (get_prefer_tail_policy (), Pmode);
rtx ma = gen_int_mode (get_prefer_mask_policy (), Pmode);
rtx merge = RVV_VUNDEF (demote_mode);
/* Handle vslide1<ud>_tu. */
if (register_operand (ops[2], mode)
&& rtx_equal_p (ops[1], CONSTM1_RTX (GET_MODE (ops[1]))))
{
merge = gen_lowpart (demote_mode, ops[2]);
ta = ops[6];
ma = ops[7];
}
emit_insn (gen_pred_slide (unspec, demote_mode, temp,
CONSTM1_RTX (demote_mask_mode), merge,
gen_lowpart (demote_mode, ops[3]),
demote_scalar_op1, vl_x2, ta, ma, ops[8]));
emit_insn (gen_pred_slide (unspec, demote_mode,
gen_lowpart (demote_mode, ops[0]),
CONSTM1_RTX (demote_mask_mode), merge, temp,
demote_scalar_op2, vl_x2, ta, ma, ops[8]));
if (rtx_equal_p (ops[1], CONSTM1_RTX (GET_MODE (ops[1]))))
return true;
else
emit_insn (gen_pred_merge (mode, ops[0], ops[2], ops[2], ops[0], ops[1],
force_vector_length_operand (ops[5]), ops[6],
ops[8]));
return true;
}
rtx
gen_avl_for_scalar_move (rtx avl)
{
/* AVL for a scalar move behaves differently depending on whether it is 0 or
greater than 0. */
if (CONST_INT_P (avl))
{
/* So we could just set AVL to 1 for any constant other than 0. */
if (rtx_equal_p (avl, const0_rtx))
return const0_rtx;
else
return const1_rtx;
}
else
{
/* For a non-constant value, we set any nonzero value to 1 by
`sgtu new_avl,input_avl,zero` + `vsetvli`. */
rtx tmp = gen_reg_rtx (Pmode);
emit_insn (
gen_rtx_SET (tmp, gen_rtx_fmt_ee (GTU, Pmode, avl, const0_rtx)));
return tmp;
}
}
/* Expand data movement for tuple modes. */
void
expand_tuple_move (rtx *ops)
{
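/* OPS[2] and OPS[3] are scratch registers supplied by the caller's mov
expander: OPS[2] holds the per-subpart byte offset and OPS[3] the
running address, while OPS[4] is the VL operand used by the
fractional-LMUL loads/stores below. */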
unsigned int i;
machine_mode tuple_mode = GET_MODE (ops[0]);
machine_mode subpart_mode = get_subpart_mode (tuple_mode);
poly_int64 subpart_size = GET_MODE_SIZE (subpart_mode);
unsigned int nf = get_nf (tuple_mode);
bool fractional_p = known_lt (subpart_size, BYTES_PER_RISCV_VECTOR);
if (REG_P (ops[0]) && CONST_VECTOR_P (ops[1]))
{
rtx val;
gcc_assert (can_create_pseudo_p ()
&& const_vec_duplicate_p (ops[1], &val));
for (i = 0; i < nf; ++i)
{
poly_int64 offset = i * subpart_size;
rtx subreg
= simplify_gen_subreg (subpart_mode, ops[0], tuple_mode, offset);
rtx dup = gen_const_vec_duplicate (subpart_mode, val);
emit_move_insn (subreg, dup);
}
}
else if (REG_P (ops[0]) && REG_P (ops[1]))
{
for (i = 0; i < nf; ++i)
{
int index = i;
/* Take NF = 2 and LMUL = 1 for example:
- move v8 to v9:
vmv1r v10,v9
vmv1r v9,v8
- move v8 to v7:
vmv1r v7,v8
vmv1r v8,v9 */
if (REGNO (ops[0]) > REGNO (ops[1]))
index = nf - 1 - i;
poly_int64 offset = index * subpart_size;
rtx dst_subreg
= simplify_gen_subreg (subpart_mode, ops[0], tuple_mode, offset);
rtx src_subreg
= simplify_gen_subreg (subpart_mode, ops[1], tuple_mode, offset);
emit_insn (gen_rtx_SET (dst_subreg, src_subreg));
}
}
else
{
/* Expand tuple memory data movement. */
gcc_assert (MEM_P (ops[0]) || MEM_P (ops[1]));
rtx offset = gen_int_mode (subpart_size, Pmode);
if (!subpart_size.is_constant ())
{
emit_move_insn (ops[2], gen_int_mode (BYTES_PER_RISCV_VECTOR, Pmode));
if (fractional_p)
{
unsigned int factor
= exact_div (BYTES_PER_RISCV_VECTOR, subpart_size)
.to_constant ();
rtx pat
= gen_rtx_ASHIFTRT (Pmode, ops[2],
gen_int_mode (exact_log2 (factor), Pmode));
emit_insn (gen_rtx_SET (ops[2], pat));
}
if (known_gt (subpart_size, BYTES_PER_RISCV_VECTOR))
{
unsigned int factor
= exact_div (subpart_size, BYTES_PER_RISCV_VECTOR)
.to_constant ();
rtx pat
= gen_rtx_ASHIFT (Pmode, ops[2],
gen_int_mode (exact_log2 (factor), Pmode));
emit_insn (gen_rtx_SET (ops[2], pat));
}
offset = ops[2];
}
if (MEM_P (ops[1]))
{
/* Load operations. */
emit_move_insn (ops[3], XEXP (ops[1], 0));
for (i = 0; i < nf; i++)
{
rtx subreg = simplify_gen_subreg (subpart_mode, ops[0],
tuple_mode, i * subpart_size);
if (i != 0)
{
rtx new_addr = gen_rtx_PLUS (Pmode, ops[3], offset);
emit_insn (gen_rtx_SET (ops[3], new_addr));
}
rtx mem = gen_rtx_MEM (subpart_mode, ops[3]);
if (fractional_p)
{
rtx operands[] = {subreg, mem};
emit_vlmax_insn (code_for_pred_mov (subpart_mode), RVV_UNOP,
operands, ops[4]);
}
else
emit_move_insn (subreg, mem);
}
}
else
{
/* Store operations. */
emit_move_insn (ops[3], XEXP (ops[0], 0));
for (i = 0; i < nf; i++)
{
rtx subreg = simplify_gen_subreg (subpart_mode, ops[1],
tuple_mode, i * subpart_size);
if (i != 0)
{
rtx new_addr = gen_rtx_PLUS (Pmode, ops[3], offset);
emit_insn (gen_rtx_SET (ops[3], new_addr));
}
rtx mem = gen_rtx_MEM (subpart_mode, ops[3]);
if (fractional_p)
{
rtx operands[] = {mem, subreg};
emit_vlmax_insn (code_for_pred_mov (subpart_mode), RVV_UNOP,
operands, ops[4]);
}
else
emit_move_insn (mem, subreg);
}
}
}
}
/* Return the vectorization machine mode for RVV according to LMUL. */
machine_mode
preferred_simd_mode (scalar_mode mode)
{
/* We disable auto-vectorization when TARGET_MIN_VLEN < 128 &&
riscv_autovec_lmul < RVV_M2, since the GCC loop vectorizer reports an
ICE in 'can_duplicate_and_interleave_p' of tree-vect-slp.cc when
-march=rv64gc_zve32* or -march=rv32gc_zve64* is enabled. Since we have
VNx1SImode in -march=*zve32* and VNx1DImode in -march=*zve64*, they are
enabled in targetm.vector_mode_supported_p and the SLP vectorizer will
try to use them. Currently, we can support auto-vectorization in
-march=rv32_zve32x_zvl128b, whereas -march=rv32_zve32x_zvl32b and
-march=rv32_zve32x_zvl64b are disabled. */
if (autovec_use_vlmax_p ())
{
if (TARGET_MIN_VLEN < 128 && riscv_autovec_lmul < RVV_M2)
return word_mode;
/* We use LMUL = 1 as the base bytesize, which is BYTES_PER_RISCV_VECTOR,
and riscv_autovec_lmul as the multiply factor to calculate the NUNITS
of the auto-vectorization mode. */
poly_uint64 nunits;
poly_uint64 vector_size
= BYTES_PER_RISCV_VECTOR * ((int) riscv_autovec_lmul);
poly_uint64 scalar_size = GET_MODE_SIZE (mode);
gcc_assert (multiple_p (vector_size, scalar_size, &nunits));
machine_mode rvv_mode;
if (get_vector_mode (mode, nunits).exists (&rvv_mode))
return rvv_mode;
}
/* TODO: We will support minimum length VLS auto-vectorization in
the future. */
return word_mode;
}
/* Subroutine of expand_vec_init.
Works as follows:
(a) Initialize TARGET by broadcasting element 0 of BUILDER.
(b) Skip leading elements from BUILDER, which are the same as
element 0.
(c) Insert the remaining elements into TARGET using vslide1down. */
static void
expand_vector_init_insert_elems (rtx target, const rvv_builder &builder,
int nelts_reqd)
{
machine_mode mode = GET_MODE (target);
machine_mode mask_mode;
gcc_assert (get_mask_mode (mode).exists (&mask_mode));
rtx dup = expand_vector_broadcast (mode, builder.elt (0));
emit_move_insn (target, dup);
int ndups = builder.count_dups (0, nelts_reqd - 1, 1);
for (int i = ndups; i < nelts_reqd; i++)
{
unsigned int unspec
= FLOAT_MODE_P (mode) ? UNSPEC_VFSLIDE1DOWN : UNSPEC_VSLIDE1DOWN;
insn_code icode = code_for_pred_slide (unspec, mode);
rtx ops[] = {target, target, builder.elt (i)};
emit_vlmax_insn (icode, RVV_BINOP, ops);
}
}
/* Use merge approach to initialize the vector with repeating sequence.
v = {a, b, a, b, a, b, a, b}.
v = broadcast (a).
mask = 0b01010101....
v = merge (v, b, mask)
*/
static void
expand_vector_init_merge_repeating_sequence (rtx target,
const rvv_builder &builder)
{
machine_mode dup_mode = get_repeating_sequence_dup_machine_mode (builder);
machine_mode dup_mask_mode = get_mask_mode (dup_mode).require ();
machine_mode mask_mode = get_mask_mode (builder.mode ()).require ();
uint64_t full_nelts = builder.full_nelts ().to_constant ();
/* Step 1: Broadcast the first pattern. */
rtx ops[] = {target, force_reg (GET_MODE_INNER (dup_mode), builder.elt (0))};
emit_vlmax_integer_move_insn (code_for_pred_broadcast (builder.mode ()),
ops, NULL_RTX);
/* Step 2: Merge in the remaining patterns, one per iteration. */
for (unsigned int i = 1; i < builder.npatterns (); i++)
{
/* Step 2-1: Generate mask register v0 for each merge. */
rtx merge_mask = builder.get_merge_scalar_mask (i);
rtx mask = gen_reg_rtx (mask_mode);
rtx dup = gen_reg_rtx (dup_mode);
if (full_nelts <= builder.inner_bits_size ()) /* vmv.s.x. */
{
rtx ops[] = {dup, gen_scalar_move_mask (dup_mask_mode),
RVV_VUNDEF (dup_mode), merge_mask};
emit_scalar_move_insn (code_for_pred_broadcast (GET_MODE (dup)),
ops);
}
else /* vmv.v.x. */
{
rtx ops[] = {dup, force_reg (GET_MODE_INNER (dup_mode), merge_mask)};
rtx vl = gen_int_mode (CEIL (full_nelts, builder.inner_bits_size ()),
Pmode);
emit_nonvlmax_integer_move_insn (code_for_pred_broadcast (dup_mode),
ops, vl);
}
emit_move_insn (mask, gen_lowpart (mask_mode, dup));
/* Step 2-2: Merge pattern according to the mask. */
rtx ops[] = {target, target, builder.elt (i), mask};
emit_vlmax_merge_insn (code_for_pred_merge_scalar (GET_MODE (target)),
riscv_vector::RVV_MERGE_OP, ops);
}
}
/* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
void
expand_vec_init (rtx target, rtx vals)
{
machine_mode mode = GET_MODE (target);
int nelts = XVECLEN (vals, 0);
rvv_builder v (mode, nelts, 1);
for (int i = 0; i < nelts; i++)
v.quick_push (XVECEXP (vals, 0, i));
v.finalize ();
if (nelts > 3)
{
/* Case 1: Convert v = { a, b, a, b } into v = { ab, ab }. */
if (v.can_duplicate_repeating_sequence_p ())
{
rtx ele = v.get_merged_repeating_sequence ();
rtx dup = expand_vector_broadcast (v.new_mode (), ele);
emit_move_insn (target, gen_lowpart (mode, dup));
return;
}
/* Case 2: Optimize repeating-sequence cases that Case 1 cannot
handle but where merging is profitable. For example:
ELEMENT BITSIZE = 64.
v = {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b}.
Since each element is 64 bits, "ab" would have to be a 128-bit
element, and we cannot find a vector mode to duplicate it. */
if (v.repeating_sequence_use_merge_profitable_p ())
{
expand_vector_init_merge_repeating_sequence (target, v);
return;
}
/* TODO: We will support more vector initialization cases in the future. */
}
/* Handle the common situation with vslide1down. This can handle any
vec_init<mode>; only the cases that are not optimized above fall
through to here. */
expand_vector_init_insert_elems (target, v, nelts);
}
/* Get insn code for corresponding comparison. */
static insn_code
get_cmp_insn_code (rtx_code code, machine_mode mode)
{
insn_code icode;
switch (code)
{
case EQ:
case NE:
case LE:
case LEU:
case GT:
case GTU:
case LTGT:
icode = code_for_pred_cmp (mode);
break;
case LT:
case LTU:
case GE:
case GEU:
if (FLOAT_MODE_P (mode))
icode = code_for_pred_cmp (mode);
else
icode = code_for_pred_ltge (mode);
break;
default:
gcc_unreachable ();
}
return icode;
}
/* This hook gives the vectorizer more vector mode options. We want it to
not only try modes with the maximum number of units a full vector can
hold, but for example also half the number of units for a smaller
element size. Such vectors can be promoted to a full vector of widened
elements (still with the same number of elements, essentially
vectorizing at a fixed number of units rather than a fixed number of
bytes). */
unsigned int
autovectorize_vector_modes (vector_modes *modes, bool)
{
if (autovec_use_vlmax_p ())
{
/* TODO: We will support RVV VLS auto-vectorization mode in the future. */
poly_uint64 full_size
= BYTES_PER_RISCV_VECTOR * ((int) riscv_autovec_lmul);
/* Start with a VNxYYQImode where YY is the number of units that
fit a whole vector.
Then try YY = nunits / 2, nunits / 4 and nunits / 8 which
is guided by the extensions we have available (vf2, vf4 and vf8).
- full_size: Try using full vectors for all element types.
- full_size / 2:
Try using 16-bit containers for 8-bit elements and full vectors
for wider elements.
- full_size / 4:
Try using 32-bit containers for 8-bit and 16-bit elements and
full vectors for wider elements.
- full_size / 8:
Try using 64-bit containers for all element types. */
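/* E.g. (illustrative) with VLEN = 128 and LMUL = 1, full_size is
[16, 16] bytes and we push VNx16QImode, VNx8QImode, VNx4QImode and
VNx2QImode in turn. */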
static const int rvv_factors[] = {1, 2, 4, 8};
for (unsigned int i = 0; i < sizeof (rvv_factors) / sizeof (int); i++)
{
poly_uint64 units;
machine_mode mode;
if (can_div_trunc_p (full_size, rvv_factors[i], &units)
&& get_vector_mode (QImode, units).exists (&mode))
modes->safe_push (mode);
}
}
return 0;
}
/* If the given VECTOR_MODE is an RVV mode, first get the largest number
of units that fit into a full vector at the given ELEMENT_MODE.
We will have the vectorizer call us with a successively decreasing
number of units (as specified in autovectorize_vector_modes).
The starting mode is always the one specified by preferred_simd_mode. */
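/* E.g. (illustrative, VLEN = 128, LMUL = 1): given VECTOR_MODE =
VNx16QImode, ELEMENT_MODE = HImode and NUNITS = 8, min_units is
[8, 8] and we return VNx8HImode. */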
opt_machine_mode
vectorize_related_mode (machine_mode vector_mode, scalar_mode element_mode,
poly_uint64 nunits)
{
/* TODO: We will support RVV VLS auto-vectorization mode in the future. */
poly_uint64 min_units;
if (autovec_use_vlmax_p () && riscv_v_ext_vector_mode_p (vector_mode)
&& multiple_p (BYTES_PER_RISCV_VECTOR * ((int) riscv_autovec_lmul),
GET_MODE_SIZE (element_mode), &min_units))
{
machine_mode rvv_mode;
if (maybe_ne (nunits, 0U))
{
/* If we were given a number of units NUNITS, try to find an
RVV vector mode of inner mode ELEMENT_MODE with the same
number of units. */
if (multiple_p (min_units, nunits)
&& get_vector_mode (element_mode, nunits).exists (&rvv_mode))
return rvv_mode;
}
else
{
/* Look for a vector mode with the same number of units as the
VECTOR_MODE we were given. We keep track of the minimum number of
units so far; this determines the smallest necessary but largest
possible suitable mode for vectorization. */
min_units = ordered_min (min_units, GET_MODE_SIZE (vector_mode));
if (get_vector_mode (element_mode, min_units).exists (&rvv_mode))
return rvv_mode;
}
}
return default_vectorize_related_mode (vector_mode, element_mode, nunits);
}
/* Expand an RVV comparison. */
void
expand_vec_cmp (rtx target, rtx_code code, rtx op0, rtx op1)
{
machine_mode mask_mode = GET_MODE (target);
machine_mode data_mode = GET_MODE (op0);
insn_code icode = get_cmp_insn_code (code, data_mode);
if (code == LTGT)
{
rtx lt = gen_reg_rtx (mask_mode);
rtx gt = gen_reg_rtx (mask_mode);
expand_vec_cmp (lt, LT, op0, op1);
expand_vec_cmp (gt, GT, op0, op1);
icode = code_for_pred (IOR, mask_mode);
rtx ops[] = {target, lt, gt};
emit_vlmax_insn (icode, riscv_vector::RVV_BINOP, ops);
return;
}
rtx cmp = gen_rtx_fmt_ee (code, mask_mode, op0, op1);
rtx ops[] = {target, cmp, op0, op1};
emit_vlmax_cmp_insn (icode, ops);
}
void
expand_vec_cmp (rtx target, rtx_code code, rtx mask, rtx maskoff, rtx op0,
rtx op1)
{
machine_mode mask_mode = GET_MODE (target);
machine_mode data_mode = GET_MODE (op0);
insn_code icode = get_cmp_insn_code (code, data_mode);
if (code == LTGT)
{
rtx lt = gen_reg_rtx (mask_mode);
rtx gt = gen_reg_rtx (mask_mode);
expand_vec_cmp (lt, LT, mask, maskoff, op0, op1);
expand_vec_cmp (gt, GT, mask, maskoff, op0, op1);
icode = code_for_pred (IOR, mask_mode);
rtx ops[] = {target, lt, gt};
emit_vlmax_insn (icode, RVV_BINOP, ops);
return;
}
rtx cmp = gen_rtx_fmt_ee (code, mask_mode, op0, op1);
rtx ops[] = {target, mask, maskoff, cmp, op0, op1};
emit_vlmax_cmp_mu_insn (icode, ops);
}
/* Expand an RVV floating-point comparison:
If CAN_INVERT_P is true, the caller can also handle inverted results;
return true if the result is in fact inverted. */
bool
expand_vec_cmp_float (rtx target, rtx_code code, rtx op0, rtx op1,
bool can_invert_p)
{
machine_mode mask_mode = GET_MODE (target);
machine_mode data_mode = GET_MODE (op0);
/* If can_invert_p = true:
It suffices to implement a u>= b as !(a < b) but with the NaNs masked off:
vmfeq.vv v0, va, va
vmfeq.vv v1, vb, vb
vmand.mm v0, v0, v1
vmflt.vv v0, va, vb, v0.t
vmnot.m v0, v0
And, if !HONOR_SNANS, then you can remove the vmand.mm by masking the
second vmfeq.vv:
vmfeq.vv v0, va, va
vmfeq.vv v0, vb, vb, v0.t
vmflt.vv v0, va, vb, v0.t
vmnot.m v0, v0
If can_invert_p = false:
# Example of implementing isgreater()
vmfeq.vv v0, va, va # Only set where A is not NaN.
vmfeq.vv v1, vb, vb # Only set where B is not NaN.
vmand.mm v0, v0, v1 # Only set where A and B are ordered,
vmfgt.vv v0, va, vb, v0.t # so only set flags on ordered values.
*/
rtx eq0 = gen_reg_rtx (mask_mode);
rtx eq1 = gen_reg_rtx (mask_mode);
switch (code)
{
case EQ:
case NE:
case LT:
case LE:
case GT:
case GE:
case LTGT:
/* There is native support for the comparison. */
expand_vec_cmp (target, code, op0, op1);
return false;
case UNEQ:
case ORDERED:
case UNORDERED:
case UNLT:
case UNLE:
case UNGT:
case UNGE:
/* vmfeq.vv v0, va, va */
expand_vec_cmp (eq0, EQ, op0, op0);
if (HONOR_SNANS (data_mode))
{
/*
vmfeq.vv v1, vb, vb
vmand.mm v0, v0, v1
*/
expand_vec_cmp (eq1, EQ, op1, op1);
insn_code icode = code_for_pred (AND, mask_mode);
rtx ops[] = {eq0, eq0, eq1};
emit_vlmax_insn (icode, riscv_vector::RVV_BINOP, ops);
}
else
{
/* vmfeq.vv v0, vb, vb, v0.t */
expand_vec_cmp (eq0, EQ, eq0, eq0, op1, op1);
}
break;
default:
gcc_unreachable ();
}
if (code == ORDERED)
{
emit_move_insn (target, eq0);
return false;
}
/* There is native support for the inverse comparison. */
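/* E.g. reverse_condition_maybe_unordered maps UNLT -> GE, UNLE -> GT,
UNGT -> LE, UNGE -> LT, UNEQ -> LTGT and UNORDERED -> ORDERED. */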
code = reverse_condition_maybe_unordered (code);
if (code == ORDERED)
emit_move_insn (target, eq0);
else
expand_vec_cmp (eq0, code, eq0, eq0, op0, op1);
if (can_invert_p)
{
emit_move_insn (target, eq0);
return true;
}
/* We use one_cmpl<mode>2 to let the combine pass merge the mask
instructions into vmand.mm/vmnand.mm/vmnor.mm/vmxnor.mm. */
emit_insn (gen_rtx_SET (target, gen_rtx_NOT (mask_mode, eq0)));
return false;
}
/* Implement vec_perm<mode>. */
void
expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
{
machine_mode data_mode = GET_MODE (target);
machine_mode sel_mode = GET_MODE (sel);
poly_uint64 nunits = GET_MODE_NUNITS (sel_mode);
/* Check if SEL only references the first values vector, i.e. each
select index is in the range [0, nunits - 1]; then a single vrgather
instruction is enough. Since we will use vrgatherei16.vv for
variable-length vectors, the indices are never out of range and we do
not need to modulo them. */
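/* E.g. (illustrative) sel = {3, 0, 2, 1} only references op0, so a
single "vrgather.vv target, op0, sel" suffices. */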
if (!nunits.is_constant () || const_vec_all_in_range_p (sel, 0, nunits - 1))
{
emit_vlmax_gather_insn (target, op0, sel);
return;
}
/* Check if the two values vectors are the same. */
if (rtx_equal_p (op0, op1) || const_vec_duplicate_p (sel))
{
/* Note: vec_perm indices are supposed to wrap when they go beyond the
size of the two value vectors, i.e. the upper bits of the indices
are effectively ignored. RVV vrgather instead produces 0 for any
out-of-range indices, so we need to modulo all the vec_perm indices
to ensure they are all in range of [0, nunits - 1]. */
rtx max_sel = gen_const_vector_dup (sel_mode, nunits - 1);
rtx sel_mod = expand_simple_binop (sel_mode, AND, sel, max_sel, NULL, 0,
OPTAB_DIRECT);
emit_vlmax_gather_insn (target, op1, sel_mod);
return;
}
rtx sel_mod = sel;
rtx max_sel = gen_const_vector_dup (sel_mode, 2 * nunits - 1);
/* We don't need to modulo the indices for VLA vectors, since we
guarantee beforehand that they are not out of range. */
if (nunits.is_constant ())
{
/* Note: vec_perm indices are supposed to wrap when they go beyond the
size of the two value vectors, i.e. the upper bits of the indices
are effectively ignored. RVV vrgather instead produces 0 for any
out-of-range indices, so we need to modulo all the vec_perm indices
to ensure they are all in range of [0, 2 * nunits - 1]. */
sel_mod = expand_simple_binop (sel_mode, AND, sel, max_sel, NULL, 0,
OPTAB_DIRECT);
}
/* The following sequence handles the case where
__builtin_shufflevector (vec1, vec2, index...) has indices anywhere
in the range [0, 2 * nunits - 1]. */
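/* Worked example (illustrative), nunits = 4, sel_mod = {0, 5, 2, 7}:
mask = {0, 1, 0, 1} (sel_mod >= 4);
gather op0 -> {op0[0], 0, op0[2], 0} (the out-of-range lanes are
overwritten later);
tmp = sel_mod - 4, whose masked lanes are {_, 1, _, 3};
masked gather op1 -> target = {op0[0], op1[1], op0[2], op1[3]}. */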
machine_mode mask_mode;
mask_mode = get_mask_mode (data_mode).require ();
rtx mask = gen_reg_rtx (mask_mode);
max_sel = gen_const_vector_dup (sel_mode, nunits);
/* Step 1: Generate a mask that selects every element whose index
>= nunits. */
expand_vec_cmp (mask, GEU, sel_mod, max_sel);
/* Step 2: Gather the op0 values indexed by sel_mod into target; we do
not care about the result of elements whose index >= nunits. */
emit_vlmax_gather_insn (target, op0, sel_mod);
/* Step 3: Shift the indices in the range [nunits, 2 * nunits - 1]
down to [0, nunits - 1] by subtracting nunits. */
rtx tmp = gen_reg_rtx (sel_mode);
rtx ops[] = {tmp, sel_mod, max_sel};
emit_vlmax_insn (code_for_pred (MINUS, sel_mode), RVV_BINOP, ops);
/* Step 4: Gather op1 using the shifted indices into the previously
masked-out elements of target. */
}
/* Implement TARGET_VECTORIZE_VEC_PERM_CONST for RVV. */
/* vec_perm support. */
struct expand_vec_perm_d
{
rtx target, op0, op1;
vec_perm_indices perm;
machine_mode vmode;
machine_mode op_mode;
bool one_vector_p;
bool testing_p;
};
/* Recognize the patterns that we can use a merge operation to shuffle
the vectors. The value of each element (index i) in the selector can
only be either i or nunits + i. We also check that the pattern is
actually monotonic.
E.g.
v = VEC_PERM_EXPR (v0, v1, selector),
selector = { 0, nunits + 1, 2, nunits + 3, 4, nunits + 5, ... }
We can transform such a pattern into:
v = vcond_mask (v0, v1, mask),
mask = { 1, 0, 1, 0, 1, 0, ... }. */
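/* The emitted sequence is roughly (illustrative):
vmsltu.vx v0, vsel, nunits    # mask = selector < nunits.
vmerge.vvm vd, vop1, vop0, v0 # vd = mask ? op0 : op1. */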
static bool
shuffle_merge_patterns (struct expand_vec_perm_d *d)
{
machine_mode vmode = d->vmode;
machine_mode sel_mode = related_int_vector_mode (vmode).require ();
int n_patterns = d->perm.encoding ().npatterns ();
poly_int64 vec_len = d->perm.length ();
for (int i = 0; i < n_patterns; ++i)
if (!known_eq (d->perm[i], i) && !known_eq (d->perm[i], vec_len + i))
return false;
/* Check that the pattern is monotonic; otherwise return false. */
for (int i = n_patterns; i < n_patterns * 2; i++)
if (!d->perm.series_p (i, n_patterns, i, n_patterns)
&& !d->perm.series_p (i, n_patterns, vec_len + i, n_patterns))
return false;
if (d->testing_p)
return true;
machine_mode mask_mode = get_mask_mode (vmode).require ();
rtx mask = gen_reg_rtx (mask_mode);
rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
/* MASK = SELECTOR < NUNITS ? 1 : 0. */
rtx x = gen_int_mode (vec_len, GET_MODE_INNER (sel_mode));
insn_code icode = code_for_pred_cmp_scalar (sel_mode);
rtx cmp = gen_rtx_fmt_ee (LTU, mask_mode, sel, x);
rtx ops[] = {mask, cmp, sel, x};
emit_vlmax_cmp_insn (icode, ops);
/* TARGET = MASK ? OP0 : OP1. */
emit_insn (gen_vcond_mask (vmode, vmode, d->target, d->op0, d->op1, mask));
return true;
}
/* Recognize decompress patterns:
1. VEC_PERM_EXPR op0 and op1
with isel = { 0, nunits, 1, nunits + 1, ... }.
Decompress op0 and op1 with the mask = { 0, 1, 0, 1, ... }.
2. VEC_PERM_EXPR op0 and op1
with isel = { 1/2 nunits, 3/2 nunits, 1/2 nunits + 1,
3/2 nunits + 1, ... }.
Slide down op0 and op1 by OFFSET = 1/2 nunits, then decompress
them with the mask = { 0, 1, 0, 1, ... }. */
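/* E.g. (illustrative) pattern 1 with nunits = 4:
op0 = {a0, a1, a2, a3}, op1 = {b0, b1, b2, b3},
isel = {0, 4, 1, 5} -> target = {a0, b0, a1, b1}. */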
static bool
shuffle_decompress_patterns (struct expand_vec_perm_d *d)
{
poly_uint64 nelt = d->perm.length ();
machine_mode mask_mode = get_mask_mode (d->vmode).require ();
/* For constant-size indices, we don't need to handle them here;
just leave them to vec_perm<mode>. */
if (d->perm.length ().is_constant ())
return false;
poly_uint64 first = d->perm[0];
if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
|| !d->perm.series_p (0, 2, first, 1)
|| !d->perm.series_p (1, 2, first + nelt, 1))
return false;
/* Permuting two SEW8 variable-length vectors needs vrgatherei16.vv;
otherwise the indices could overflow the index range. */
machine_mode sel_mode = related_int_vector_mode (d->vmode).require ();
if (GET_MODE_INNER (d->vmode) == QImode
&& !get_vector_mode (HImode, nelt).exists (&sel_mode))
return false;
/* Success! */
if (d->testing_p)
return true;
rtx op0, op1;
if (known_eq (first, 0U))
{
op0 = d->op0;
op1 = d->op1;
}
else
{
op0 = gen_reg_rtx (d->vmode);
op1 = gen_reg_rtx (d->vmode);
insn_code icode = code_for_pred_slide (UNSPEC_VSLIDEDOWN, d->vmode);
rtx ops0[] = {op0, d->op0, gen_int_mode (first, Pmode)};
rtx ops1[] = {op1, d->op1, gen_int_mode (first, Pmode)};
emit_vlmax_insn (icode, RVV_BINOP, ops0);
emit_vlmax_insn (icode, RVV_BINOP, ops1);
}
/* Generate the { 0, 1, 0, 1, ... } mask. */
rtx vid = gen_reg_rtx (sel_mode);
rtx vid_repeat = gen_reg_rtx (sel_mode);
emit_insn (gen_vec_series (sel_mode, vid, const0_rtx, const1_rtx));
rtx and_ops[] = {vid_repeat, vid, const1_rtx};
emit_vlmax_insn (code_for_pred_scalar (AND, sel_mode), RVV_BINOP, and_ops);
rtx const_vec = gen_const_vector_dup (sel_mode, 1);
rtx mask = gen_reg_rtx (mask_mode);
expand_vec_cmp (mask, EQ, vid_repeat, const_vec);
emit_vlmax_decompress_insn (d->target, op0, op1, mask);
return true;
}
/* Recognize the pattern that can be shuffled by generic approach. */
static bool
shuffle_generic_patterns (struct expand_vec_perm_d *d)
{
machine_mode sel_mode = related_int_vector_mode (d->vmode).require ();
poly_uint64 nunits = GET_MODE_NUNITS (d->vmode);
/* We don't enable SLP for non-power-of-2 NPATTERNS. */
if (!pow2p_hwi (d->perm.encoding().npatterns ()))
return false;
/* For constant-size indices, we don't need to handle them here;
just leave them to vec_perm<mode>. */
if (d->perm.length ().is_constant ())
return false;
/* Permuting two SEW8 variable-length vectors needs vrgatherei16.vv;
otherwise the indices could overflow the index range. */
if (GET_MODE_INNER (d->vmode) == QImode
&& !get_vector_mode (HImode, nunits).exists (&sel_mode))
return false;
/* Success! */
if (d->testing_p)
return true;
rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
expand_vec_perm (d->target, d->op0, d->op1, force_reg (sel_mode, sel));
return true;
}
/* This function recognizes and supports different permutation patterns
and enables VLA SLP auto-vectorization. */
static bool
expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
{
gcc_assert (d->op_mode != E_VOIDmode);
/* The pattern matching functions above are written to look for a small
number to begin the sequence (0, 1, N/2). If we begin with an index
from the second operand, we can swap the operands. */
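/* E.g. with nelt = 4, perm = {4, 1, 6, 3} selecting from {op0, op1}
becomes perm = {0, 5, 2, 7} selecting from {op1, op0}. */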
poly_int64 nelt = d->perm.length ();
if (known_ge (d->perm[0], nelt))
{
d->perm.rotate_inputs (1);
std::swap (d->op0, d->op1);
}
if (known_gt (nelt, 1))
{
if (d->vmode == d->op_mode)
{
if (shuffle_merge_patterns (d))
return true;
if (shuffle_decompress_patterns (d))
return true;
if (shuffle_generic_patterns (d))
return true;
return false;
}
else
return false;
}
return false;
}
/* This function implements TARGET_VECTORIZE_VEC_PERM_CONST by using RVV
instructions. */
bool
expand_vec_perm_const (machine_mode vmode, machine_mode op_mode, rtx target,
rtx op0, rtx op1, const vec_perm_indices &sel)
{
/* RVV doesn't have mask-type pack/unpack instructions, and we don't use
a mask to control the iteration loop, so just disable it directly. */
if (GET_MODE_CLASS (vmode) == MODE_VECTOR_BOOL)
return false;
struct expand_vec_perm_d d;
/* Check whether the mask can be applied to a single vector. */
if (sel.ninputs () == 1 || (op0 && rtx_equal_p (op0, op1)))
d.one_vector_p = true;
else if (sel.all_from_input_p (0))
{
d.one_vector_p = true;
op1 = op0;
}
else if (sel.all_from_input_p (1))
{
d.one_vector_p = true;
op0 = op1;
}
else
d.one_vector_p = false;
d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
sel.nelts_per_input ());
d.vmode = vmode;
d.op_mode = op_mode;
d.target = target;
d.op0 = op0;
if (op0 == op1)
d.op1 = d.op0;
else
d.op1 = op1;
d.testing_p = !target;
if (!d.testing_p)
return expand_vec_perm_const_1 (&d);
rtx_insn *last = get_last_insn ();
bool ret = expand_vec_perm_const_1 (&d);
gcc_assert (last == get_last_insn ());
return ret;
}
/* Generate a vsetvl with no side effects to get the vector length. */
void
expand_select_vl (rtx *ops)
{
poly_int64 nunits = rtx_to_poly_int64 (ops[2]);
/* We arbitrarily pick QImode as the inner scalar mode to get a vector
mode, since vsetvl only demands the SEW/LMUL ratio. We let the VSETVL
PASS optimize it. */
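/* E.g. (illustrative) for nunits = [4, 4] this requires VNx4QImode and
emits something like "vsetvli ops[0], ops[1], e8, mf4, ta, ma"
(assuming TARGET_MIN_VLEN = 128). */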
scalar_int_mode mode = QImode;
machine_mode rvv_mode = get_vector_mode (mode, nunits).require ();
emit_insn (gen_no_side_effects_vsetvl_rtx (rvv_mode, ops[0], ops[1]));
}
/* Expand LEN_MASK_{LOAD,STORE}. */
void
expand_load_store (rtx *ops, bool is_load)
{
poly_int64 value;
rtx len = ops[2];
rtx mask = ops[3];
machine_mode mode = GET_MODE (ops[0]);
if (poly_int_rtx_p (len, &value) && known_eq (value, GET_MODE_NUNITS (mode)))
{
/* If the length operand is equal to VF, it is a VLMAX load/store. */
if (is_load)
{
rtx m_ops[] = {ops[0], mask, RVV_VUNDEF (mode), ops[1]};
emit_vlmax_masked_insn (code_for_pred_mov (mode), RVV_UNOP_M, m_ops);
}
else
{
len = gen_reg_rtx (Pmode);
emit_vlmax_vsetvl (mode, len);
emit_insn (gen_pred_store (mode, ops[0], mask, ops[1], len,
get_avl_type_rtx (VLMAX)));
}
}
else
{
if (!satisfies_constraint_K (len))
len = force_reg (Pmode, len);
if (is_load)
{
rtx m_ops[] = {ops[0], mask, RVV_VUNDEF (mode), ops[1]};
emit_nonvlmax_masked_insn (code_for_pred_mov (mode), RVV_UNOP_M,
m_ops, len);
}
else
emit_insn (gen_pred_store (mode, ops[0], mask, ops[1], len,
get_avl_type_rtx (NONVLMAX)));
}
}
} // namespace riscv_vector