| /* Subroutines used for code generation for RISC-V 'V' Extension for |
| GNU compiler. |
| Copyright (C) 2022-2023 Free Software Foundation, Inc. |
| Contributed by Juzhe Zhong (juzhe.zhong@rivai.ai), RiVAI Technologies Ltd. |
| |
| This file is part of GCC. |
| |
| GCC is free software; you can redistribute it and/or modify it |
| under the terms of the GNU General Public License as published by |
| the Free Software Foundation; either version 3, or (at your option) |
| any later version. |
| |
| GCC is distributed in the hope that it will be useful, but |
| WITHOUT ANY WARRANTY; without even the implied warranty of |
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| General Public License for more details. |
| |
| You should have received a copy of the GNU General Public License |
| along with GCC; see the file COPYING3. If not see |
| <http://www.gnu.org/licenses/>. */ |
| |
| #define IN_TARGET_CODE 1 |
| |
| /* We have a maximum of 11 operands for RVV instruction patterns according to |
| the vector.md. */ |
| #define RVV_INSN_OPERANDS_MAX 11 |
| |
| #include "config.h" |
| #include "system.h" |
| #include "coretypes.h" |
| #include "tm.h" |
| #include "backend.h" |
| #include "rtl.h" |
| #include "insn-config.h" |
| #include "insn-attr.h" |
| #include "recog.h" |
| #include "alias.h" |
| #include "tree.h" |
| #include "stringpool.h" |
| #include "attribs.h" |
| #include "explow.h" |
| #include "memmodel.h" |
| #include "emit-rtl.h" |
| #include "tm_p.h" |
| #include "target.h" |
| #include "targhooks.h" |
| #include "expr.h" |
| #include "optabs.h" |
| #include "tm-constrs.h" |
| #include "rtx-vector-builder.h" |
| #include "targhooks.h" |
| |
| using namespace riscv_vector; |
| |
| namespace riscv_vector { |
| |
| /* Return true if VLMAX is a constant value that can be used in vsetivli. */ |
| static bool |
| const_vlmax_p (machine_mode mode) |
| { |
| poly_uint64 nunits = GET_MODE_NUNITS (mode); |
| |
| return nunits.is_constant () |
| /* The vsetivli can only take an immediate AVL in the range 0~31. */ |
| ? (IN_RANGE (nunits.to_constant (), 0, 31)) |
| /* Only allowed in VLS-VLMAX mode. */ |
| : false; |
| } |
| |
| template <int MAX_OPERANDS> class insn_expander |
| { |
| public: |
| insn_expander () |
| : m_opno (0), m_op_num (0), m_has_dest_p (false), |
| m_fully_unmasked_p (false), m_use_real_merge_p (false), |
| m_needs_avl_p (false), m_vlmax_p (false), m_has_tail_policy_p (false), |
| m_has_mask_policy_p (false), m_has_fp_rounding_mode_p (false), |
| m_tail_policy (TAIL_ANY), m_mask_policy (MASK_ANY), |
| m_fp_rounding_mode (FRM_DYN), |
| m_dest_mode (VOIDmode), m_mask_mode (VOIDmode), |
| m_vl_op (NULL_RTX) |
| {} |
| |
| /* Initializer for various configurations. */ |
| insn_expander (int op_num, bool has_dest_p, bool use_all_trues_mask_p, |
| bool use_real_merge_p, bool needs_avl_p, bool vlmax_p, |
| machine_mode dest_mode, machine_mode mask_mode) |
| : m_opno (0), m_op_num (op_num), m_has_dest_p (has_dest_p), |
| m_fully_unmasked_p (use_all_trues_mask_p), |
| m_use_real_merge_p (use_real_merge_p), m_needs_avl_p (needs_avl_p), |
| m_vlmax_p (vlmax_p), m_has_tail_policy_p (false), |
| m_has_mask_policy_p (false), m_has_fp_rounding_mode_p (false), |
| m_tail_policy (TAIL_ANY), m_mask_policy (MASK_ANY), |
| m_fp_rounding_mode (FRM_DYN), |
| m_dest_mode (dest_mode), |
| m_mask_mode (mask_mode), m_vl_op (NULL_RTX) |
| {} |
| |
| void set_policy (enum tail_policy ta) |
| { |
| m_has_tail_policy_p = true; |
| m_tail_policy = ta; |
| } |
| void set_policy (enum mask_policy ma) |
| { |
| m_has_mask_policy_p = true; |
| m_mask_policy = ma; |
| } |
| void set_vl (rtx vl) { m_vl_op = vl; } |
| |
| void set_rounding_mode (enum floating_point_rounding_mode mode) |
| { |
| m_has_fp_rounding_mode_p = true; |
| m_fp_rounding_mode = mode; |
| } |
| |
| void add_output_operand (rtx x, machine_mode mode) |
| { |
| create_output_operand (&m_ops[m_opno++], x, mode); |
| gcc_assert (m_opno <= MAX_OPERANDS); |
| } |
| void add_input_operand (rtx x, machine_mode mode) |
| { |
| create_input_operand (&m_ops[m_opno++], x, mode); |
| gcc_assert (m_opno <= MAX_OPERANDS); |
| } |
| void add_all_one_mask_operand () |
| { |
| add_input_operand (CONSTM1_RTX (m_mask_mode), m_mask_mode); |
| } |
| void add_vundef_operand () |
| { |
| add_input_operand (RVV_VUNDEF (m_dest_mode), m_dest_mode); |
| } |
| void add_policy_operand () |
| { |
| if (m_has_tail_policy_p) |
| { |
| rtx tail_policy_rtx = gen_int_mode (m_tail_policy, Pmode); |
| add_input_operand (tail_policy_rtx, Pmode); |
| } |
| if (m_has_mask_policy_p) |
| { |
| rtx mask_policy_rtx = gen_int_mode (m_mask_policy, Pmode); |
| add_input_operand (mask_policy_rtx, Pmode); |
| } |
| } |
| void add_avl_type_operand (avl_type type) |
| { |
| add_input_operand (gen_int_mode (type, Pmode), Pmode); |
| } |
| |
| void add_rounding_mode_operand () |
| { |
| if (m_has_fp_rounding_mode_p) |
| { |
| rtx frm_rtx = gen_int_mode (m_fp_rounding_mode, Pmode); |
| add_input_operand (frm_rtx, Pmode); |
| } |
| } |
| |
| void emit_insn (enum insn_code icode, rtx *ops) |
| { |
| int opno = 0; |
| /* It's true if any operand is a memory operand. */ |
| bool any_mem_p = false; |
| /* It's true if all operands are mask operands. */ |
| bool all_mask_p = true; |
| if (m_has_dest_p) |
| { |
| any_mem_p |= MEM_P (ops[opno]); |
| all_mask_p &= GET_MODE_CLASS (GET_MODE (ops[opno])) == MODE_VECTOR_BOOL; |
| add_output_operand (ops[opno++], m_dest_mode); |
| } |
| |
| if (m_fully_unmasked_p) |
| add_all_one_mask_operand (); |
| |
| if (!m_use_real_merge_p) |
| add_vundef_operand (); |
| |
| for (; opno < m_op_num; opno++) |
| { |
| any_mem_p |= MEM_P (ops[opno]); |
| all_mask_p &= GET_MODE_CLASS (GET_MODE (ops[opno])) == MODE_VECTOR_BOOL; |
| machine_mode mode = insn_data[(int) icode].operand[m_opno].mode; |
| /* create_input_operand doesn't allow VOIDmode. |
| According to vector.md, we may have some patterns that do not have |
| an explicit machine mode specifying the operand. Such operands are |
| always Pmode. */ |
| if (mode == VOIDmode) |
| mode = Pmode; |
| add_input_operand (ops[opno], mode); |
| } |
| |
| if (m_needs_avl_p) |
| { |
| rtx len = m_vl_op; |
| if (m_vlmax_p) |
| { |
| if (const_vlmax_p (m_dest_mode)) |
| { |
| /* Optimize VLS-VLMAX code gen: we can use vsetivli instead of |
| vsetvli to obtain the value of vlmax. */ |
| poly_uint64 nunits = GET_MODE_NUNITS (m_dest_mode); |
| len = gen_int_mode (nunits, Pmode); |
| m_vlmax_p = false; /* It has become NONVLMAX now. */ |
| } |
| else if (can_create_pseudo_p ()) |
| { |
| len = gen_reg_rtx (Pmode); |
| emit_vlmax_vsetvl (m_dest_mode, len); |
| } |
| } |
| add_input_operand (len, Pmode); |
| } |
| |
| if (!all_mask_p) |
| add_policy_operand (); |
| if (m_needs_avl_p) |
| add_avl_type_operand (m_vlmax_p ? avl_type::VLMAX : avl_type::NONVLMAX); |
| |
| add_rounding_mode_operand (); |
| |
| expand (icode, any_mem_p); |
| } |
| |
| void expand (enum insn_code icode, bool temporary_volatile_p = false) |
| { |
| if (temporary_volatile_p) |
| { |
| temporary_volatile_ok v (true); |
| expand_insn (icode, m_opno, m_ops); |
| } |
| else |
| expand_insn (icode, m_opno, m_ops); |
| } |
| |
| private: |
| int m_opno; |
| int m_op_num; |
| /* It's true when the pattern has a dest operand. Most of the patterns have |
| a dest operand whereas some patterns like STOREs do not. |
| For example, according to vector.md, we can see indexed loads/stores do |
| not have a dest operand. |
| */ |
| bool m_has_dest_p; |
| /* It's true if the pattern uses an all-trues mask operand. */ |
| bool m_fully_unmasked_p; |
| /* It's true if the pattern uses real merge operand. */ |
| bool m_use_real_merge_p; |
| bool m_needs_avl_p; |
| bool m_vlmax_p; |
| bool m_has_tail_policy_p; |
| bool m_has_mask_policy_p; |
| bool m_has_fp_rounding_mode_p; |
| enum tail_policy m_tail_policy; |
| enum mask_policy m_mask_policy; |
| enum floating_point_rounding_mode m_fp_rounding_mode; |
| machine_mode m_dest_mode; |
| machine_mode m_mask_mode; |
| rtx m_vl_op; |
| expand_operand m_ops[MAX_OPERANDS]; |
| }; |
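| |
| /* A rough sketch of how this expander is typically driven (the |
| emit_vlmax_insn / emit_nonvlmax_insn wrappers below are the concrete |
| instances): construct an insn_expander with the configuration flags, |
| optionally call set_policy (), set_rounding_mode () and set_vl (), then |
| call emit_insn () with the insn_code and the explicit operands. |
| emit_insn () inserts the implicit mask, merge, AVL, policy and rounding |
| mode operands around the explicit ones. */ |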
| |
| |
| class rvv_builder : public rtx_vector_builder |
| { |
| public: |
| rvv_builder () : rtx_vector_builder () {} |
| rvv_builder (machine_mode mode, unsigned int npatterns, |
| unsigned int nelts_per_pattern) |
| : rtx_vector_builder (mode, npatterns, nelts_per_pattern) |
| { |
| m_inner_mode = GET_MODE_INNER (mode); |
| m_inner_bits_size = GET_MODE_BITSIZE (m_inner_mode); |
| m_inner_bytes_size = GET_MODE_SIZE (m_inner_mode); |
| m_mask_mode = get_mask_mode (mode).require (); |
| |
| gcc_assert ( |
| int_mode_for_size (inner_bits_size (), 0).exists (&m_inner_int_mode)); |
| m_int_mode |
| = get_vector_mode (m_inner_int_mode, GET_MODE_NUNITS (mode)).require (); |
| } |
| |
| bool can_duplicate_repeating_sequence_p (); |
| rtx get_merged_repeating_sequence (); |
| |
| bool repeating_sequence_use_merge_profitable_p (); |
| rtx get_merge_scalar_mask (unsigned int) const; |
| |
| bool single_step_npatterns_p () const; |
| bool npatterns_all_equal_p () const; |
| |
| machine_mode new_mode () const { return m_new_mode; } |
| scalar_mode inner_mode () const { return m_inner_mode; } |
| scalar_int_mode inner_int_mode () const { return m_inner_int_mode; } |
| machine_mode mask_mode () const { return m_mask_mode; } |
| machine_mode int_mode () const { return m_int_mode; } |
| unsigned int inner_bits_size () const { return m_inner_bits_size; } |
| unsigned int inner_bytes_size () const { return m_inner_bytes_size; } |
| |
| private: |
| scalar_mode m_inner_mode; |
| scalar_int_mode m_inner_int_mode; |
| machine_mode m_new_mode; |
| scalar_int_mode m_new_inner_mode; |
| machine_mode m_mask_mode; |
| machine_mode m_int_mode; |
| unsigned int m_inner_bits_size; |
| unsigned int m_inner_bytes_size; |
| }; |
| |
| /* Return true if the vector can be duplicated from a super element which is |
| the fusion of consecutive elements. |
| |
| v = { a, b, a, b } super element = ab, v = { ab, ab } */ |
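| /* For instance (illustrative numbers): with 8-bit elements and NPATTERNS = 2 |
| the super element is 16 bits wide; the check below requires that an integer |
| mode and a vector mode of such super elements exist and that the super |
| element is no wider than a machine word (UNITS_PER_WORD). */ |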
| bool |
| rvv_builder::can_duplicate_repeating_sequence_p () |
| { |
| poly_uint64 new_size = exact_div (full_nelts (), npatterns ()); |
| unsigned int new_inner_size = m_inner_bits_size * npatterns (); |
| if (!int_mode_for_size (new_inner_size, 0).exists (&m_new_inner_mode) |
| || GET_MODE_SIZE (m_new_inner_mode) > UNITS_PER_WORD |
| || !get_vector_mode (m_new_inner_mode, new_size).exists (&m_new_mode)) |
| return false; |
| if (full_nelts ().is_constant ()) |
| return repeating_sequence_p (0, full_nelts ().to_constant (), npatterns ()); |
| return nelts_per_pattern () == 1; |
| } |
| |
| /* Return true if it is a repeating sequence for which the |
| merge approach has better codegen than the default |
| approach (slide1down). |
| |
| Sequence A: |
| {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b} |
| |
| nelts = 16 |
| npatterns = 2 |
| |
| for merging a we need mask 101010.... |
| for merging b we need mask 010101.... |
| |
| For each element in the pattern, we need to build a mask in a scalar register. |
| Mostly we need 3 instructions (i.e. COST = 3), consisting of 2 scalar |
| instructions and 1 scalar move to the v0 register. Finally we need a vector |
| merge to merge them. |
| |
| lui a5, #imm |
| add a5, #imm |
| vmov.s.x v0, a5 |
| vmerge.vxm v9, v9, a1, v0 |
| |
| So the overall (roughly) COST of Sequence A = (3 + 1) * npatterns = 8. |
| If we use slide1down, the COST = nelts = 16 > 8 (COST of merge). |
| So return true in this case as it is profitable. |
| |
| Sequence B: |
| {a, b, c, d, e, f, g, h, a, b, c, d, e, f, g, h} |
| |
| nelts = 16 |
| npatterns = 8 |
| |
| COST of merge approach = (3 + 1) * npatterns = 24 |
| COST of slide1down approach = nelts = 16 |
| Return false in this case as it is NOT profitable in merge approach. |
| */ |
| bool |
| rvv_builder::repeating_sequence_use_merge_profitable_p () |
| { |
| if (inner_bytes_size () > UNITS_PER_WORD) |
| return false; |
| |
| unsigned int nelts = full_nelts ().to_constant (); |
| |
| if (!repeating_sequence_p (0, nelts, npatterns ())) |
| return false; |
| |
| unsigned int merge_cost = 1; |
| unsigned int build_merge_mask_cost = 3; |
| unsigned int slide1down_cost = nelts; |
| |
| return (build_merge_mask_cost + merge_cost) * npatterns () < slide1down_cost; |
| } |
| |
| /* Merge the repeating sequence into a single element and return the RTX. */ |
| rtx |
| rvv_builder::get_merged_repeating_sequence () |
| { |
| scalar_int_mode mode = Pmode; |
| rtx target = gen_reg_rtx (mode); |
| emit_move_insn (target, const0_rtx); |
| rtx imm = gen_int_mode ((1ULL << m_inner_bits_size) - 1, mode); |
| /* { a, b, a, b }: Generate duplicate element = b << bits | a. */ |
| for (unsigned int i = 0; i < npatterns (); i++) |
| { |
| unsigned int loc = m_inner_bits_size * i; |
| rtx shift = gen_int_mode (loc, mode); |
| rtx ele = gen_lowpart (mode, elt (i)); |
| rtx tmp = expand_simple_binop (mode, AND, ele, imm, NULL_RTX, false, |
| OPTAB_DIRECT); |
| rtx tmp2 = expand_simple_binop (mode, ASHIFT, tmp, shift, NULL_RTX, false, |
| OPTAB_DIRECT); |
| rtx tmp3 = expand_simple_binop (mode, IOR, tmp2, target, NULL_RTX, false, |
| OPTAB_DIRECT); |
| emit_move_insn (target, tmp3); |
| } |
| if (GET_MODE_SIZE (m_new_inner_mode) < UNITS_PER_WORD) |
| return gen_lowpart (m_new_inner_mode, target); |
| return target; |
| } |
| |
| /* Get the mask for merge approach. |
| |
| Consider such following case: |
| {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b} |
| To merge "a", the mask should be 1010.... |
| To merge "b", the mask should be 0101.... |
| */ |
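| /* For example (illustrative numbers, assuming ELEN = 64 and NPATTERNS = 2): |
| INDEX_IN_PATTERN = 0 yields the scalar 0x5555555555555555 (bits 0, 2, ...), |
| INDEX_IN_PATTERN = 1 yields 0xaaaaaaaaaaaaaaaa (bits 1, 3, ...). */ |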
| rtx |
| rvv_builder::get_merge_scalar_mask (unsigned int index_in_pattern) const |
| { |
| unsigned HOST_WIDE_INT mask = 0; |
| unsigned HOST_WIDE_INT base_mask = (1ULL << index_in_pattern); |
| /* Here we construct a mask pattern that will later be broadcast |
| to a vector register. The maximum broadcast size for vmv.v.x/vmv.s.x |
| is determined by the length of a vector element (ELEN) and not by |
| XLEN so make sure we do not exceed it. One example is -march=zve32* |
| which mandates ELEN == 32 but can be combined with -march=rv64 |
| with XLEN == 64. */ |
| unsigned int elen = TARGET_VECTOR_ELEN_64 ? 64 : 32; |
| |
| gcc_assert (elen % npatterns () == 0); |
| |
| int limit = elen / npatterns (); |
| |
| for (int i = 0; i < limit; i++) |
| mask |= base_mask << (i * npatterns ()); |
| |
| return gen_int_mode (mask, inner_int_mode ()); |
| } |
| |
| /* Return true if the variable-length vector is single-step. |
| Single-step means the step of all patterns in NPATTERNS is equal. |
| Consider this following case: |
| |
| CASE 1: NPATTERNS = 2, NELTS_PER_PATTERN = 3. |
| { 0, 2, 2, 4, 4, 6, ... } |
| First pattern: step1 = 2 - 0 = 2 |
| step2 = 4 - 2 = 2 |
| Second pattern: step1 = 4 - 2 = 2 |
| step2 = 6 - 4 = 2 |
| Since all steps of NPATTERNS are equal step = 2. |
| Return true in this case. |
| |
| CASE 2: NPATTERNS = 2, NELTS_PER_PATTERN = 3. |
| { 0, 1, 2, 4, 4, 7, ... } |
| First pattern: step1 = 2 - 0 = 2 |
| step2 = 4 - 2 = 2 |
| Second pattern: step1 = 4 - 1 = 3 |
| step2 = 7 - 4 = 3 |
| Since not all steps are equal, return false. */ |
| bool |
| rvv_builder::single_step_npatterns_p () const |
| { |
| if (nelts_per_pattern () != 3) |
| return false; |
| |
| poly_int64 step |
| = rtx_to_poly_int64 (elt (npatterns ())) - rtx_to_poly_int64 (elt (0)); |
| for (unsigned int i = 0; i < npatterns (); i++) |
| { |
| poly_int64 ele0 = rtx_to_poly_int64 (elt (i)); |
| poly_int64 ele1 = rtx_to_poly_int64 (elt (npatterns () + i)); |
| poly_int64 ele2 = rtx_to_poly_int64 (elt (npatterns () * 2 + i)); |
| poly_int64 diff1 = ele1 - ele0; |
| poly_int64 diff2 = ele2 - ele1; |
| if (maybe_ne (step, diff1) || maybe_ne (step, diff2)) |
| return false; |
| } |
| return true; |
| } |
| |
| /* Return true if all elements of NPATTERNS are equal. |
| |
| E.g. NPATTERNS = 4: |
| { 2, 2, 2, 2, 4, 4, 4, 4, 8, 8, 8, 8, 16, 16, 16, 16, ... } |
| E.g. NPATTERNS = 8: |
| { 2, 2, 2, 2, 2, 2, 2, 2, 8, 8, 8, 8, 8, 8, 8, 8, ... } |
| We only check whether ele[0] ~ ele[NPATTERNS - 1] are the same. |
| We don't need to check elements[n] with n >= NPATTERNS since |
| they don't belong to the same pattern. |
| */ |
| bool |
| rvv_builder::npatterns_all_equal_p () const |
| { |
| poly_int64 ele0 = rtx_to_poly_int64 (elt (0)); |
| for (unsigned int i = 1; i < npatterns (); i++) |
| { |
| poly_int64 ele = rtx_to_poly_int64 (elt (i)); |
| if (!known_eq (ele, ele0)) |
| return false; |
| } |
| return true; |
| } |
| |
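| /* Return the SEW (element width in bits) implied by MODE. Mask modes have no |
| element width of their own, so SEW = 8 (the smallest SEW vsetvl can encode) |
| is used for them. */ |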
| static unsigned |
| get_sew (machine_mode mode) |
| { |
| unsigned int sew = GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL |
| ? 8 |
| : GET_MODE_BITSIZE (GET_MODE_INNER (mode)); |
| return sew; |
| } |
| |
| /* Return true if X is a const_vector with all duplicate elements, each of |
| which is in the range [MINVAL, MAXVAL]. */ |
| bool |
| const_vec_all_same_in_range_p (rtx x, HOST_WIDE_INT minval, |
| HOST_WIDE_INT maxval) |
| { |
| rtx elt; |
| return (const_vec_duplicate_p (x, &elt) && CONST_INT_P (elt) |
| && IN_RANGE (INTVAL (elt), minval, maxval)); |
| } |
| |
| /* Return true if VEC is a constant in which every element is in the range |
| [MINVAL, MAXVAL]. The elements do not need to have the same value. |
| |
| This function also exists in aarch64, we may unify it in middle-end in the |
| future. */ |
| |
| static bool |
| const_vec_all_in_range_p (rtx vec, poly_int64 minval, poly_int64 maxval) |
| { |
| if (!CONST_VECTOR_P (vec) |
| || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT) |
| return false; |
| |
| int nunits; |
| if (!CONST_VECTOR_STEPPED_P (vec)) |
| nunits = const_vector_encoded_nelts (vec); |
| else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits)) |
| return false; |
| |
| for (int i = 0; i < nunits; i++) |
| { |
| rtx vec_elem = CONST_VECTOR_ELT (vec, i); |
| poly_int64 value; |
| if (!poly_int_rtx_p (vec_elem, &value) |
| || maybe_lt (value, minval) |
| || maybe_gt (value, maxval)) |
| return false; |
| } |
| return true; |
| } |
| |
| /* Return a const_int vector of VAL. |
| |
| This function also exists in aarch64, we may unify it in middle-end in the |
| future. */ |
| |
| static rtx |
| gen_const_vector_dup (machine_mode mode, poly_int64 val) |
| { |
| rtx c = gen_int_mode (val, GET_MODE_INNER (mode)); |
| return gen_const_vec_duplicate (mode, c); |
| } |
| |
| /* Emit a vlmax vsetvl instruction. This should only be used when |
| optimization is disabled or after vsetvl insertion pass. */ |
| void |
| emit_hard_vlmax_vsetvl (machine_mode vmode, rtx vl) |
| { |
| unsigned int sew = get_sew (vmode); |
| emit_insn (gen_vsetvl (Pmode, vl, RVV_VLMAX, gen_int_mode (sew, Pmode), |
| gen_int_mode (get_vlmul (vmode), Pmode), const0_rtx, |
| const0_rtx)); |
| } |
| |
| void |
| emit_vlmax_vsetvl (machine_mode vmode, rtx vl) |
| { |
| unsigned int sew = get_sew (vmode); |
| enum vlmul_type vlmul = get_vlmul (vmode); |
| unsigned int ratio = calculate_ratio (sew, vlmul); |
| |
| if (!optimize) |
| emit_hard_vlmax_vsetvl (vmode, vl); |
| else |
| emit_insn (gen_vlmax_avl (Pmode, vl, gen_int_mode (ratio, Pmode))); |
| } |
| |
| /* Calculate SEW/LMUL ratio. */ |
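| /* For example (illustrative values): SEW = 64 with LMUL_8 gives ratio |
| 64 / 8 = 8, while SEW = 32 with LMUL_F2 (LMUL = 1/2) gives ratio |
| 32 * 2 = 64. For a given VLEN the ratio determines VLMAX |
| (VLMAX = VLEN / ratio), which is why vsetvl handling can key on it. */ |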
| unsigned int |
| calculate_ratio (unsigned int sew, enum vlmul_type vlmul) |
| { |
| unsigned int ratio; |
| switch (vlmul) |
| { |
| case LMUL_1: |
| ratio = sew; |
| break; |
| case LMUL_2: |
| ratio = sew / 2; |
| break; |
| case LMUL_4: |
| ratio = sew / 4; |
| break; |
| case LMUL_8: |
| ratio = sew / 8; |
| break; |
| case LMUL_F8: |
| ratio = sew * 8; |
| break; |
| case LMUL_F4: |
| ratio = sew * 4; |
| break; |
| case LMUL_F2: |
| ratio = sew * 2; |
| break; |
| default: |
| gcc_unreachable (); |
| } |
| return ratio; |
| } |
| |
| /* SCALABLE means that the vector length is agnostic (run-time invariant and |
| compile-time unknown). FIXED means that the vector length is specific |
| (compile-time known). Both RVV_SCALABLE and RVV_FIXED_VLMAX do |
| auto-vectorization using the VLMAX vsetvl configuration. */ |
| static bool |
| autovec_use_vlmax_p (void) |
| { |
| return (riscv_autovec_preference == RVV_SCALABLE |
| || riscv_autovec_preference == RVV_FIXED_VLMAX); |
| } |
| |
| /* The RISC-V vsetvli pass uses "known vlmax" operations for optimization. |
| Whether or not an instruction actually is a vlmax operation is not |
| recognizable from the length operand alone but the avl_type operand |
| is used instead. In general, there are two cases: |
| |
| - Emit a vlmax operation by passing a NULL length. Here we emit |
| a vsetvli with vlmax configuration and set the avl_type to VLMAX. |
| - Emit an operation that uses the existing (last-set) length and |
| set the avl_type to NONVLMAX. |
| |
| Sometimes we also need to set the VLMAX avl_type to an operation that |
| already uses a given length register. This can happen during or after |
| register allocation when we are not allowed to create a new register. |
| For that case we also allow setting the avl_type to VLMAX. |
| */ |
| |
| /* This function emits a {VLMAX, TAIL_ANY, MASK_ANY} vsetvli followed by the |
| * actual operation. */ |
| void |
| emit_vlmax_insn (unsigned icode, int op_num, rtx *ops, rtx vl) |
| { |
| machine_mode dest_mode = GET_MODE (ops[0]); |
| machine_mode mask_mode = get_mask_mode (dest_mode).require (); |
| insn_expander<RVV_INSN_OPERANDS_MAX> e (op_num, |
| /* HAS_DEST_P */ true, |
| /* FULLY_UNMASKED_P */ true, |
| /* USE_REAL_MERGE_P */ false, |
| /* HAS_AVL_P */ true, |
| /* VLMAX_P */ true, |
| dest_mode, |
| mask_mode); |
| |
| e.set_policy (TAIL_ANY); |
| e.set_policy (MASK_ANY); |
| /* According to the LRA mov pattern in vector.md, we have a clobber operand |
| to be used as the VL operand. */ |
| e.set_vl (vl); |
| e.emit_insn ((enum insn_code) icode, ops); |
| } |
| |
| void |
| emit_vlmax_fp_insn (unsigned icode, int op_num, rtx *ops, rtx vl) |
| { |
| machine_mode dest_mode = GET_MODE (ops[0]); |
| machine_mode mask_mode = get_mask_mode (dest_mode).require (); |
| insn_expander<RVV_INSN_OPERANDS_MAX> e (op_num, |
| /* HAS_DEST_P */ true, |
| /* FULLY_UNMASKED_P */ true, |
| /* USE_REAL_MERGE_P */ false, |
| /* HAS_AVL_P */ true, |
| /* VLMAX_P */ true, |
| dest_mode, |
| mask_mode); |
| |
| e.set_policy (TAIL_ANY); |
| e.set_policy (MASK_ANY); |
| e.set_rounding_mode (FRM_DYN); |
| e.set_vl (vl); |
| e.emit_insn ((enum insn_code) icode, ops); |
| } |
| |
| /* This function emits a {VLMAX, TAIL_ANY, MASK_ANY} vsetvli followed by the |
| * ternary operation which always has a real merge operand. */ |
| void |
| emit_vlmax_ternary_insn (unsigned icode, int op_num, rtx *ops, rtx vl) |
| { |
| machine_mode dest_mode = GET_MODE (ops[0]); |
| machine_mode mask_mode = get_mask_mode (dest_mode).require (); |
| insn_expander<RVV_INSN_OPERANDS_MAX> e (/*OP_NUM*/ op_num, |
| /*HAS_DEST_P*/ true, |
| /*FULLY_UNMASKED_P*/ true, |
| /*USE_REAL_MERGE_P*/ true, |
| /*HAS_AVL_P*/ true, |
| /*VLMAX_P*/ true, |
| /*DEST_MODE*/ dest_mode, |
| /*MASK_MODE*/ mask_mode); |
| e.set_policy (TAIL_ANY); |
| e.set_policy (MASK_ANY); |
| e.set_vl (vl); |
| e.emit_insn ((enum insn_code) icode, ops); |
| } |
| |
| /* This function emits a {VLMAX, TAIL_ANY, MASK_ANY} vsetvli followed by the |
| * ternary operation which always has a real merge operand. */ |
| void |
| emit_vlmax_fp_ternary_insn (unsigned icode, int op_num, rtx *ops, rtx vl) |
| { |
| machine_mode dest_mode = GET_MODE (ops[0]); |
| machine_mode mask_mode = get_mask_mode (dest_mode).require (); |
| insn_expander<RVV_INSN_OPERANDS_MAX> e (/*OP_NUM*/ op_num, |
| /*HAS_DEST_P*/ true, |
| /*FULLY_UNMASKED_P*/ true, |
| /*USE_REAL_MERGE_P*/ true, |
| /*HAS_AVL_P*/ true, |
| /*VLMAX_P*/ true, |
| /*DEST_MODE*/ dest_mode, |
| /*MASK_MODE*/ mask_mode); |
| e.set_policy (TAIL_ANY); |
| e.set_policy (MASK_ANY); |
| e.set_rounding_mode (FRM_DYN); |
| e.set_vl (vl); |
| e.emit_insn ((enum insn_code) icode, ops); |
| } |
| |
| /* This function emits a {NONVLMAX, TAIL_ANY, MASK_ANY} vsetvli followed by the |
| * actual operation. */ |
| void |
| emit_nonvlmax_insn (unsigned icode, int op_num, rtx *ops, rtx avl) |
| { |
| machine_mode dest_mode = GET_MODE (ops[0]); |
| machine_mode mask_mode = get_mask_mode (dest_mode).require (); |
| insn_expander<RVV_INSN_OPERANDS_MAX> e (op_num, |
| /* HAS_DEST_P */ true, |
| /* FULLY_UNMASKED_P */ true, |
| /* USE_REAL_MERGE_P */ false, |
| /* HAS_AVL_P */ true, |
| /* VLMAX_P */ false, |
| dest_mode, |
| mask_mode); |
| |
| e.set_policy (TAIL_ANY); |
| e.set_policy (MASK_ANY); |
| e.set_vl (avl); |
| e.emit_insn ((enum insn_code) icode, ops); |
| } |
| |
| /* This function emits a {VLMAX, TAIL_ANY, MASK_ANY} vsetvli |
| followed by a vslide insn (with real merge operand). */ |
| void |
| emit_vlmax_slide_insn (unsigned icode, rtx *ops) |
| { |
| machine_mode dest_mode = GET_MODE (ops[0]); |
| machine_mode mask_mode = get_mask_mode (dest_mode).require (); |
| insn_expander<RVV_INSN_OPERANDS_MAX> e (RVV_SLIDE_OP, |
| /* HAS_DEST_P */ true, |
| /* FULLY_UNMASKED_P */ true, |
| /* USE_REAL_MERGE_P */ true, |
| /* HAS_AVL_P */ true, |
| /* VLMAX_P */ true, |
| dest_mode, |
| mask_mode); |
| |
| e.set_policy (TAIL_ANY); |
| e.set_policy (MASK_ANY); |
| |
| e.emit_insn ((enum insn_code) icode, ops); |
| } |
| |
| /* This function emits a {NONVLMAX, TAIL_UNDISTURBED, MASK_ANY} vsetvli |
| followed by a vslide insn (with real merge operand). */ |
| void |
| emit_nonvlmax_slide_tu_insn (unsigned icode, rtx *ops, rtx avl) |
| { |
| machine_mode dest_mode = GET_MODE (ops[0]); |
| machine_mode mask_mode = get_mask_mode (dest_mode).require (); |
| insn_expander<RVV_INSN_OPERANDS_MAX> e (RVV_SLIDE_OP, |
| /* HAS_DEST_P */ true, |
| /* FULLY_UNMASKED_P */ true, |
| /* USE_REAL_MERGE_P */ true, |
| /* HAS_AVL_P */ true, |
| /* VLMAX_P */ false, |
| dest_mode, |
| mask_mode); |
| |
| e.set_policy (TAIL_UNDISTURBED); |
| e.set_policy (MASK_ANY); |
| e.set_vl (avl); |
| |
| e.emit_insn ((enum insn_code) icode, ops); |
| } |
| |
| |
| /* This function emits a merge instruction. */ |
| void |
| emit_vlmax_merge_insn (unsigned icode, int op_num, rtx *ops) |
| { |
| machine_mode dest_mode = GET_MODE (ops[0]); |
| machine_mode mask_mode = get_mask_mode (dest_mode).require (); |
| insn_expander<RVV_INSN_OPERANDS_MAX> e (op_num, |
| /* HAS_DEST_P */ true, |
| /* FULLY_UNMASKED_P */ false, |
| /* USE_REAL_MERGE_P */ false, |
| /* HAS_AVL_P */ true, |
| /* VLMAX_P */ true, |
| dest_mode, |
| mask_mode); |
| |
| e.set_policy (TAIL_ANY); |
| e.emit_insn ((enum insn_code) icode, ops); |
| } |
| |
| /* This function emits a cmp instruction. */ |
| void |
| emit_vlmax_cmp_insn (unsigned icode, rtx *ops) |
| { |
| machine_mode mode = GET_MODE (ops[0]); |
| insn_expander<RVV_INSN_OPERANDS_MAX> e (RVV_CMP_OP, |
| /* HAS_DEST_P */ true, |
| /* FULLY_UNMASKED_P */ true, |
| /* USE_REAL_MERGE_P */ false, |
| /* HAS_AVL_P */ true, |
| /* VLMAX_P */ true, |
| mode, |
| mode); |
| |
| e.set_policy (MASK_ANY); |
| e.emit_insn ((enum insn_code) icode, ops); |
| } |
| |
| /* This function emits a cmp instruction with the mask-undisturbed (MU) policy. */ |
| void |
| emit_vlmax_cmp_mu_insn (unsigned icode, rtx *ops) |
| { |
| machine_mode mode = GET_MODE (ops[0]); |
| insn_expander<RVV_INSN_OPERANDS_MAX> e (RVV_CMP_MU_OP, |
| /* HAS_DEST_P */ true, |
| /* FULLY_UNMASKED_P */ false, |
| /* USE_REAL_MERGE_P */ true, |
| /* HAS_AVL_P */ true, |
| /* VLMAX_P */ true, |
| mode, |
| mode); |
| |
| e.set_policy (MASK_UNDISTURBED); |
| e.emit_insn ((enum insn_code) icode, ops); |
| } |
| |
| /* This function emits a VLMAX masked instruction. */ |
| static void |
| emit_vlmax_masked_insn (unsigned icode, int op_num, rtx *ops) |
| { |
| machine_mode dest_mode = GET_MODE (ops[0]); |
| machine_mode mask_mode = get_mask_mode (dest_mode).require (); |
| insn_expander<RVV_INSN_OPERANDS_MAX> e (/*OP_NUM*/ op_num, |
| /*HAS_DEST_P*/ true, |
| /*FULLY_UNMASKED_P*/ false, |
| /*USE_REAL_MERGE_P*/ true, |
| /*HAS_AVL_P*/ true, |
| /*VLMAX_P*/ true, dest_mode, |
| mask_mode); |
| e.set_policy (TAIL_ANY); |
| e.set_policy (MASK_ANY); |
| e.emit_insn ((enum insn_code) icode, ops); |
| } |
| |
| /* This function emits a non-VLMAX masked instruction with an explicit AVL. */ |
| static void |
| emit_nonvlmax_masked_insn (unsigned icode, int op_num, rtx *ops, rtx avl) |
| { |
| machine_mode dest_mode = GET_MODE (ops[0]); |
| machine_mode mask_mode = get_mask_mode (dest_mode).require (); |
| insn_expander<RVV_INSN_OPERANDS_MAX> e (/*OP_NUM*/ op_num, |
| /*HAS_DEST_P*/ true, |
| /*FULLY_UNMASKED_P*/ false, |
| /*USE_REAL_MERGE_P*/ true, |
| /*HAS_AVL_P*/ true, |
| /*VLMAX_P*/ false, dest_mode, |
| mask_mode); |
| e.set_policy (TAIL_ANY); |
| e.set_policy (MASK_ANY); |
| e.set_vl (avl); |
| e.emit_insn ((enum insn_code) icode, ops); |
| } |
| |
| /* This function emits a VLMAX masked instruction with the mask-undisturbed policy. */ |
| void |
| emit_vlmax_masked_mu_insn (unsigned icode, int op_num, rtx *ops) |
| { |
| machine_mode dest_mode = GET_MODE (ops[0]); |
| machine_mode mask_mode = get_mask_mode (dest_mode).require (); |
| insn_expander<RVV_INSN_OPERANDS_MAX> e (/*OP_NUM*/ op_num, |
| /*HAS_DEST_P*/ true, |
| /*FULLY_UNMASKED_P*/ false, |
| /*USE_REAL_MERGE_P*/ true, |
| /*HAS_AVL_P*/ true, |
| /*VLMAX_P*/ true, dest_mode, |
| mask_mode); |
| e.set_policy (TAIL_ANY); |
| e.set_policy (MASK_UNDISTURBED); |
| e.emit_insn ((enum insn_code) icode, ops); |
| } |
| |
| /* Emit vmv.s.x instruction. */ |
| |
| void |
| emit_scalar_move_insn (unsigned icode, rtx *ops) |
| { |
| machine_mode dest_mode = GET_MODE (ops[0]); |
| machine_mode mask_mode = get_mask_mode (dest_mode).require (); |
| insn_expander<RVV_INSN_OPERANDS_MAX> e (riscv_vector::RVV_SCALAR_MOV_OP, |
| /* HAS_DEST_P */ true, |
| /* FULLY_UNMASKED_P */ false, |
| /* USE_REAL_MERGE_P */ true, |
| /* HAS_AVL_P */ true, |
| /* VLMAX_P */ false, |
| dest_mode, |
| mask_mode); |
| |
| e.set_policy (TAIL_ANY); |
| e.set_policy (MASK_ANY); |
| e.set_vl (CONST1_RTX (Pmode)); |
| e.emit_insn ((enum insn_code) icode, ops); |
| } |
| |
| /* Emit vmv.v.x instruction with vlmax. */ |
| |
| static void |
| emit_vlmax_integer_move_insn (unsigned icode, rtx *ops, rtx vl) |
| { |
| emit_vlmax_insn (icode, riscv_vector::RVV_UNOP, ops, vl); |
| } |
| |
| /* Emit vmv.v.x instruction with nonvlmax. */ |
| |
| void |
| emit_nonvlmax_integer_move_insn (unsigned icode, rtx *ops, rtx avl) |
| { |
| emit_nonvlmax_insn (icode, riscv_vector::RVV_UNOP, ops, avl); |
| } |
| |
| /* This function emits a VLMAX vrgather instruction. Emit vrgather.vx/vi when |
| SEL is a const duplicate vector. Otherwise, emit vrgather.vv. */ |
| static void |
| emit_vlmax_gather_insn (rtx target, rtx op, rtx sel) |
| { |
| rtx elt; |
| insn_code icode; |
| machine_mode data_mode = GET_MODE (target); |
| machine_mode sel_mode = GET_MODE (sel); |
| if (maybe_ne (GET_MODE_SIZE (data_mode), GET_MODE_SIZE (sel_mode))) |
| icode = code_for_pred_gatherei16 (data_mode); |
| else if (const_vec_duplicate_p (sel, &elt)) |
| { |
| icode = code_for_pred_gather_scalar (data_mode); |
| sel = elt; |
| } |
| else |
| icode = code_for_pred_gather (data_mode); |
| rtx ops[] = {target, op, sel}; |
| emit_vlmax_insn (icode, RVV_BINOP, ops); |
| } |
| |
| static void |
| emit_vlmax_masked_gather_mu_insn (rtx target, rtx op, rtx sel, rtx mask) |
| { |
| rtx elt; |
| insn_code icode; |
| machine_mode data_mode = GET_MODE (target); |
| machine_mode sel_mode = GET_MODE (sel); |
| if (maybe_ne (GET_MODE_SIZE (data_mode), GET_MODE_SIZE (sel_mode))) |
| icode = code_for_pred_gatherei16 (data_mode); |
| else if (const_vec_duplicate_p (sel, &elt)) |
| { |
| icode = code_for_pred_gather_scalar (data_mode); |
| sel = elt; |
| } |
| else |
| icode = code_for_pred_gather (data_mode); |
| rtx ops[] = {target, mask, target, op, sel}; |
| emit_vlmax_masked_mu_insn (icode, RVV_BINOP_MU, ops); |
| } |
| |
| /* According to RVV ISA spec (16.5.1. Synthesizing vdecompress): |
| https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc |
| |
| There is no inverse vdecompress provided, as this operation can be readily |
| synthesized using iota and a masked vrgather: |
| |
| Desired functionality of 'vdecompress' |
| 7 6 5 4 3 2 1 0 # vid |
| |
| e d c b a # packed vector of 5 elements |
| 1 0 0 1 1 1 0 1 # mask vector of 8 elements |
| p q r s t u v w # destination register before vdecompress |
| |
| e q r d c b v a # result of vdecompress |
| # v0 holds mask |
| # v1 holds packed data |
| # v11 holds input expanded vector and result |
| viota.m v10, v0 # Calc iota from mask in v0 |
| vrgather.vv v11, v1, v10, v0.t # Expand into destination |
| p q r s t u v w # v11 destination register |
| e d c b a # v1 source vector |
| 1 0 0 1 1 1 0 1 # v0 mask vector |
| |
| 4 4 4 3 2 1 1 0 # v10 result of viota.m |
| e q r d c b v a # v11 destination after vrgather using viota.m under mask |
| */ |
| static void |
| emit_vlmax_decompress_insn (rtx target, rtx op0, rtx op1, rtx mask) |
| { |
| machine_mode data_mode = GET_MODE (target); |
| machine_mode sel_mode = related_int_vector_mode (data_mode).require (); |
| if (GET_MODE_INNER (data_mode) == QImode) |
| sel_mode = get_vector_mode (HImode, GET_MODE_NUNITS (data_mode)).require (); |
| |
| rtx sel = gen_reg_rtx (sel_mode); |
| rtx iota_ops[] = {sel, mask}; |
| emit_vlmax_insn (code_for_pred_iota (sel_mode), RVV_UNOP, iota_ops); |
| emit_vlmax_gather_insn (target, op0, sel); |
| emit_vlmax_masked_gather_mu_insn (target, op1, sel, mask); |
| } |
| |
| /* Return the vector mode to use when duplicating the scalar of a repeating sequence. */ |
| |
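| /* For example (illustrative numbers): with 16-bit inner elements and a |
| builder mode at least one vector register wide, the dup mode gets |
| BYTES_PER_RISCV_VECTOR / 2 elements, i.e. the broadcast is clamped to a |
| single (LMUL = 1) vector register. */ |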
| static machine_mode |
| get_repeating_sequence_dup_machine_mode (const rvv_builder &builder) |
| { |
| poly_uint64 dup_nunits = GET_MODE_NUNITS (builder.mode ()); |
| |
| if (known_ge (GET_MODE_SIZE (builder.mode ()), BYTES_PER_RISCV_VECTOR)) |
| { |
| dup_nunits = exact_div (BYTES_PER_RISCV_VECTOR, |
| builder.inner_bytes_size ()); |
| } |
| |
| return get_vector_mode (builder.inner_int_mode (), dup_nunits).require (); |
| } |
| |
| /* Expand series const vector. */ |
| |
| void |
| expand_vec_series (rtx dest, rtx base, rtx step) |
| { |
| machine_mode mode = GET_MODE (dest); |
| machine_mode mask_mode; |
| gcc_assert (get_mask_mode (mode).exists (&mask_mode)); |
| poly_int64 nunits_m1 = GET_MODE_NUNITS (mode) - 1; |
| poly_int64 value; |
| |
| /* VECT_IV = BASE + I * STEP. */ |
| |
| /* Step 1: Generate I = { 0, 1, 2, ... } by vid.v. */ |
| rtx vid = gen_reg_rtx (mode); |
| rtx op[] = {vid}; |
| emit_vlmax_insn (code_for_pred_series (mode), RVV_MISC_OP, op); |
| |
| /* Step 2: Generate I * STEP. |
| - STEP is 1, we don't emit any instructions. |
| - STEP is power of 2, we use vsll.vi/vsll.vx. |
| - STEP is non-power of 2, we use vmul.vx. */ |
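| /* E.g. (illustrative): STEP = 4 becomes a shift by exact_log2 (4) = 2 |
| (vsll.vi/vsll.vx), whereas STEP = 3 falls back to vmul.vx. */ |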
| rtx step_adj; |
| if (rtx_equal_p (step, const1_rtx)) |
| step_adj = vid; |
| else if (rtx_equal_p (step, constm1_rtx) && poly_int_rtx_p (base, &value) |
| && known_eq (nunits_m1, value)) |
| { |
| /* Special case: |
| {nunits - 1, nunits - 2, ... , 0}. |
| nunits can be either const_int or const_poly_int. |
| |
| Code sequence: |
| vid.v v |
| vrsub nunits - 1, v. */ |
| rtx ops[] = {dest, vid, gen_int_mode (nunits_m1, GET_MODE_INNER (mode))}; |
| insn_code icode = code_for_pred_sub_reverse_scalar (mode); |
| emit_vlmax_insn (icode, RVV_BINOP, ops); |
| return; |
| } |
| else |
| { |
| step_adj = gen_reg_rtx (mode); |
| if (CONST_INT_P (step) && pow2p_hwi (INTVAL (step))) |
| { |
| /* Emit logical left shift operation. */ |
| int shift = exact_log2 (INTVAL (step)); |
| rtx shift_amount = gen_int_mode (shift, Pmode); |
| insn_code icode = code_for_pred_scalar (ASHIFT, mode); |
| rtx ops[] = {step_adj, vid, shift_amount}; |
| emit_vlmax_insn (icode, RVV_BINOP, ops); |
| } |
| else |
| { |
| insn_code icode = code_for_pred_scalar (MULT, mode); |
| rtx ops[] = {step_adj, vid, step}; |
| emit_vlmax_insn (icode, RVV_BINOP, ops); |
| } |
| } |
| |
| /* Step 3: Generate BASE + I * STEP. |
| - BASE is 0, use result of vid. |
| - BASE is not 0, we use vadd.vx/vadd.vi. */ |
| if (rtx_equal_p (base, const0_rtx)) |
| { |
| emit_move_insn (dest, step_adj); |
| } |
| else |
| { |
| rtx result = gen_reg_rtx (mode); |
| insn_code icode = code_for_pred_scalar (PLUS, mode); |
| rtx ops[] = {result, step_adj, base}; |
| emit_vlmax_insn (icode, RVV_BINOP, ops); |
| emit_move_insn (dest, result); |
| } |
| } |
| |
| static void |
| expand_const_vector (rtx target, rtx src) |
| { |
| machine_mode mode = GET_MODE (target); |
| scalar_mode elt_mode = GET_MODE_INNER (mode); |
| if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL) |
| { |
| rtx elt; |
| gcc_assert ( |
| const_vec_duplicate_p (src, &elt) |
| && (rtx_equal_p (elt, const0_rtx) || rtx_equal_p (elt, const1_rtx))); |
| rtx ops[] = {target, src}; |
| emit_vlmax_insn (code_for_pred_mov (mode), RVV_UNOP, ops); |
| return; |
| } |
| |
| rtx elt; |
| if (const_vec_duplicate_p (src, &elt)) |
| { |
| rtx tmp = register_operand (target, mode) ? target : gen_reg_rtx (mode); |
| /* If the element is an integer in the range -16 ~ 15 or the |
| floating-point value 0.0, we use the vmv.v.i instruction. */ |
| if (satisfies_constraint_vi (src) || satisfies_constraint_Wc0 (src)) |
| { |
| rtx ops[] = {tmp, src}; |
| emit_vlmax_insn (code_for_pred_mov (mode), RVV_UNOP, ops); |
| } |
| else |
| { |
| elt = force_reg (elt_mode, elt); |
| rtx ops[] = {tmp, elt}; |
| emit_vlmax_insn (code_for_pred_broadcast (mode), RVV_UNOP, ops); |
| } |
| |
| if (tmp != target) |
| emit_move_insn (target, tmp); |
| return; |
| } |
| |
| /* Support scalable const series vector. */ |
| rtx base, step; |
| if (const_vec_series_p (src, &base, &step)) |
| { |
| emit_insn (gen_vec_series (mode, target, base, step)); |
| return; |
| } |
| |
| /* Handle variable-length vector. */ |
| unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src); |
| unsigned int npatterns = CONST_VECTOR_NPATTERNS (src); |
| rvv_builder builder (mode, npatterns, nelts_per_pattern); |
| for (unsigned int i = 0; i < nelts_per_pattern; i++) |
| { |
| for (unsigned int j = 0; j < npatterns; j++) |
| builder.quick_push (CONST_VECTOR_ELT (src, i * npatterns + j)); |
| } |
| builder.finalize (); |
| |
| if (CONST_VECTOR_DUPLICATE_P (src)) |
| { |
| /* Handle the case of a repeating sequence with NELTS_PER_PATTERN = 1. |
| E.g. NPATTERNS = 4, v = { 0, 2, 6, 7, ... } |
| NPATTERNS = 8, v = { 0, 2, 6, 7, 19, 20, 8, 7 ... } |
| The elements within NPATTERNS are not necessarily regular. */ |
| if (builder.can_duplicate_repeating_sequence_p ()) |
| { |
| /* We handle the case where we can find a vector container to hold |
| element bitsize = NPATTERNS * ele_bitsize. |
| |
| NPATTERNS = 8, element width = 8 |
| v = { 0, 1, 2, 3, 4, 5, 6, 7, ... } |
| In this case, we can combine NPATTERNS elements into a larger |
| element. Use element width = 64 and broadcast a vector with |
| all element equal to 0x0706050403020100. */ |
| rtx ele = builder.get_merged_repeating_sequence (); |
| rtx dup = expand_vector_broadcast (builder.new_mode (), ele); |
| emit_move_insn (target, gen_lowpart (mode, dup)); |
| } |
| else |
| { |
| /* We handle the case where we can't find a vector container to hold |
| element bitsize = NPATTERNS * ele_bitsize. |
| |
| NPATTERNS = 8, element width = 16 |
| v = { 0, 1, 2, 3, 4, 5, 6, 7, ... } |
| Since NPATTERNS * element width = 128, we can't find a container |
| to hold it. |
| |
| In this case, we use NPATTERNS merge operations to generate such |
| vector. */ |
| unsigned int nbits = npatterns - 1; |
| |
| /* Generate vid = { 0, 1, 2, 3, 4, 5, 6, 7, ... }. */ |
| rtx vid = gen_reg_rtx (builder.int_mode ()); |
| rtx op[] = {vid}; |
| emit_vlmax_insn (code_for_pred_series (builder.int_mode ()), |
| RVV_MISC_OP, op); |
| |
| /* Generate vid_repeat = { 0, 1, ... nbits, ... } */ |
| rtx vid_repeat = gen_reg_rtx (builder.int_mode ()); |
| rtx and_ops[] = {vid_repeat, vid, |
| gen_int_mode (nbits, builder.inner_int_mode ())}; |
| emit_vlmax_insn (code_for_pred_scalar (AND, builder.int_mode ()), |
| RVV_BINOP, and_ops); |
| |
| rtx tmp = gen_reg_rtx (builder.mode ()); |
| rtx dup_ops[] = {tmp, builder.elt (0)}; |
| emit_vlmax_insn (code_for_pred_broadcast (builder.mode ()), RVV_UNOP, |
| dup_ops); |
| for (unsigned int i = 1; i < builder.npatterns (); i++) |
| { |
| /* Generate mask according to i. */ |
| rtx mask = gen_reg_rtx (builder.mask_mode ()); |
| rtx const_vec = gen_const_vector_dup (builder.int_mode (), i); |
| expand_vec_cmp (mask, EQ, vid_repeat, const_vec); |
| |
| /* Merge scalar to each i. */ |
| rtx tmp2 = gen_reg_rtx (builder.mode ()); |
| rtx merge_ops[] = {tmp2, tmp, builder.elt (i), mask}; |
| insn_code icode = code_for_pred_merge_scalar (builder.mode ()); |
| emit_vlmax_merge_insn (icode, RVV_MERGE_OP, merge_ops); |
| tmp = tmp2; |
| } |
| emit_move_insn (target, tmp); |
| } |
| return; |
| } |
| else if (CONST_VECTOR_STEPPED_P (src)) |
| { |
| gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT); |
| if (builder.single_step_npatterns_p ()) |
| { |
| /* Describe the case by choosing NPATTERNS = 4 as an example. */ |
| insn_code icode; |
| |
| /* Step 1: Generate vid = { 0, 1, 2, 3, 4, 5, 6, 7, ... }. */ |
| rtx vid = gen_reg_rtx (builder.mode ()); |
| rtx vid_ops[] = {vid}; |
| icode = code_for_pred_series (builder.mode ()); |
| emit_vlmax_insn (icode, RVV_MISC_OP, vid_ops); |
| |
| if (builder.npatterns_all_equal_p ()) |
| { |
| /* Generate the variable-length vector following this rule: |
| { a, a, a + step, a + step, a + step * 2, a + step * 2, ...} |
| E.g. { 0, 0, 8, 8, 16, 16, ... } */ |
| /* We want to create a pattern where value[ix] = floor (ix / |
| NPATTERNS). As NPATTERNS is always a power of two we can |
| rewrite this as = ix & -NPATTERNS. */ |
| /* Step 2: VID AND -NPATTERNS: |
| { 0&-4, 1&-4, 2&-4, 3 &-4, 4 &-4, 5 &-4, 6 &-4, 7 &-4, ... } |
| */ |
| rtx imm |
| = gen_int_mode (-builder.npatterns (), builder.inner_mode ()); |
| rtx and_ops[] = {target, vid, imm}; |
| icode = code_for_pred_scalar (AND, builder.mode ()); |
| emit_vlmax_insn (icode, RVV_BINOP, and_ops); |
| } |
| else |
| { |
| /* Generate the variable-length vector following this rule: |
| { a, b, a, b, a + step, b + step, a + step*2, b + step*2, ...} |
| E.g. { 3, 2, 1, 0, 7, 6, 5, 4, ... } */ |
| /* Step 2: Generate diff = TARGET - VID: |
| { 3-0, 2-1, 1-2, 0-3, 7-4, 6-5, 5-6, 4-7, ... }*/ |
| rvv_builder v (builder.mode (), builder.npatterns (), 1); |
| for (unsigned int i = 0; i < v.npatterns (); ++i) |
| { |
| /* Calculate the diff between the target sequence and |
| vid sequence. */ |
| HOST_WIDE_INT diff = INTVAL (builder.elt (i)) - i; |
| v.quick_push (gen_int_mode (diff, v.inner_mode ())); |
| } |
| /* Step 3: Generate result = VID + diff. */ |
| rtx vec = v.build (); |
| rtx add_ops[] = {target, vid, vec}; |
| emit_vlmax_insn (code_for_pred (PLUS, builder.mode ()), RVV_BINOP, |
| add_ops); |
| } |
| } |
| else |
| /* TODO: We will enable more variable-length vector in the future. */ |
| gcc_unreachable (); |
| } |
| else |
| gcc_unreachable (); |
| } |
| |
| /* Expand a pre-RA RVV data move from SRC to DEST. |
| It expands move for RVV fractional vector modes. */ |
| bool |
| legitimize_move (rtx dest, rtx src) |
| { |
| machine_mode mode = GET_MODE (dest); |
| if (CONST_VECTOR_P (src)) |
| { |
| expand_const_vector (dest, src); |
| return true; |
| } |
| |
| /* In order to decrease the memory traffic, we don't use whole register |
| * load/store for LMUL less than 1 or for mask modes, so those cases |
| * require one extra general purpose register. That is not allowed during the |
| * LRA process, so we have a special move pattern used for LRA, which defers |
| * the expansion until after LRA. */ |
| if ((known_lt (GET_MODE_SIZE (mode), BYTES_PER_RISCV_VECTOR) |
| || GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL) |
| && lra_in_progress) |
| { |
| emit_insn (gen_mov_lra (mode, Pmode, dest, src)); |
| return true; |
| } |
| |
| if (known_ge (GET_MODE_SIZE (mode), BYTES_PER_RISCV_VECTOR) |
| && GET_MODE_CLASS (mode) != MODE_VECTOR_BOOL) |
| { |
| /* Need to force register if mem <- !reg. */ |
| if (MEM_P (dest) && !REG_P (src)) |
| src = force_reg (mode, src); |
| |
| return false; |
| } |
| |
| if (register_operand (src, mode) && register_operand (dest, mode)) |
| { |
| emit_insn (gen_rtx_SET (dest, src)); |
| return true; |
| } |
| |
| if (!register_operand (src, mode) && !register_operand (dest, mode)) |
| { |
| rtx tmp = gen_reg_rtx (mode); |
| if (MEM_P (src)) |
| { |
| rtx ops[] = {tmp, src}; |
| emit_vlmax_insn (code_for_pred_mov (mode), RVV_UNOP, ops); |
| } |
| else |
| emit_move_insn (tmp, src); |
| src = tmp; |
| } |
| |
| if (satisfies_constraint_vu (src)) |
| return false; |
| |
| rtx ops[] = {dest, src}; |
| emit_vlmax_insn (code_for_pred_mov (mode), RVV_UNOP, ops); |
| return true; |
| } |
| |
| /* VTYPE information for machine_mode. */ |
| struct mode_vtype_group |
| { |
| enum vlmul_type vlmul_for_min_vlen32[NUM_MACHINE_MODES]; |
| uint8_t ratio_for_min_vlen32[NUM_MACHINE_MODES]; |
| enum vlmul_type vlmul_for_min_vlen64[NUM_MACHINE_MODES]; |
| uint8_t ratio_for_min_vlen64[NUM_MACHINE_MODES]; |
| enum vlmul_type vlmul_for_for_vlen128[NUM_MACHINE_MODES]; |
| uint8_t ratio_for_for_vlen128[NUM_MACHINE_MODES]; |
| machine_mode subpart_mode[NUM_MACHINE_MODES]; |
| uint8_t nf[NUM_MACHINE_MODES]; |
| mode_vtype_group () |
| { |
| #define ENTRY(MODE, REQUIREMENT, VLMUL_FOR_MIN_VLEN32, RATIO_FOR_MIN_VLEN32, \ |
| VLMUL_FOR_MIN_VLEN64, RATIO_FOR_MIN_VLEN64, \ |
| VLMUL_FOR_MIN_VLEN128, RATIO_FOR_MIN_VLEN128) \ |
| vlmul_for_min_vlen32[MODE##mode] = VLMUL_FOR_MIN_VLEN32; \ |
| ratio_for_min_vlen32[MODE##mode] = RATIO_FOR_MIN_VLEN32; \ |
| vlmul_for_min_vlen64[MODE##mode] = VLMUL_FOR_MIN_VLEN64; \ |
| ratio_for_min_vlen64[MODE##mode] = RATIO_FOR_MIN_VLEN64; \ |
| vlmul_for_for_vlen128[MODE##mode] = VLMUL_FOR_MIN_VLEN128; \ |
| ratio_for_for_vlen128[MODE##mode] = RATIO_FOR_MIN_VLEN128; |
| #define TUPLE_ENTRY(MODE, REQUIREMENT, SUBPART_MODE, NF, VLMUL_FOR_MIN_VLEN32, \ |
| RATIO_FOR_MIN_VLEN32, VLMUL_FOR_MIN_VLEN64, \ |
| RATIO_FOR_MIN_VLEN64, VLMUL_FOR_MIN_VLEN128, \ |
| RATIO_FOR_MIN_VLEN128) \ |
| subpart_mode[MODE##mode] = SUBPART_MODE##mode; \ |
| nf[MODE##mode] = NF; \ |
| vlmul_for_min_vlen32[MODE##mode] = VLMUL_FOR_MIN_VLEN32; \ |
| ratio_for_min_vlen32[MODE##mode] = RATIO_FOR_MIN_VLEN32; \ |
| vlmul_for_min_vlen64[MODE##mode] = VLMUL_FOR_MIN_VLEN64; \ |
| ratio_for_min_vlen64[MODE##mode] = RATIO_FOR_MIN_VLEN64; \ |
| vlmul_for_for_vlen128[MODE##mode] = VLMUL_FOR_MIN_VLEN128; \ |
| ratio_for_for_vlen128[MODE##mode] = RATIO_FOR_MIN_VLEN128; |
| #include "riscv-vector-switch.def" |
| #undef ENTRY |
| #undef TUPLE_ENTRY |
| } |
| }; |
| |
| static mode_vtype_group mode_vtype_infos; |
| |
| /* Get vlmul field value by comparing LMUL with BYTES_PER_RISCV_VECTOR. */ |
| enum vlmul_type |
| get_vlmul (machine_mode mode) |
| { |
| if (TARGET_MIN_VLEN >= 128) |
| return mode_vtype_infos.vlmul_for_for_vlen128[mode]; |
| else if (TARGET_MIN_VLEN == 32) |
| return mode_vtype_infos.vlmul_for_min_vlen32[mode]; |
| else |
| return mode_vtype_infos.vlmul_for_min_vlen64[mode]; |
| } |
| |
| /* Return the NF value of the corresponding mode. */ |
| unsigned int |
| get_nf (machine_mode mode) |
| { |
| /* We don't allow non-tuple modes to go through this function. */ |
| gcc_assert (riscv_v_ext_tuple_mode_p (mode)); |
| return mode_vtype_infos.nf[mode]; |
| } |
| |
| /* Return the subpart mode of the tuple mode. For VNx2x1SImode, |
| the subpart mode is VNx1SImode. This will help to build |
| array/struct type in builtins. */ |
| machine_mode |
| get_subpart_mode (machine_mode mode) |
| { |
| /* We don't allow non-tuple modes to go through this function. */ |
| gcc_assert (riscv_v_ext_tuple_mode_p (mode)); |
| return mode_vtype_infos.subpart_mode[mode]; |
| } |
| |
| /* Get ratio according to machine mode. */ |
| unsigned int |
| get_ratio (machine_mode mode) |
| { |
| if (TARGET_MIN_VLEN >= 128) |
| return mode_vtype_infos.ratio_for_for_vlen128[mode]; |
| else if (TARGET_MIN_VLEN == 32) |
| return mode_vtype_infos.ratio_for_min_vlen32[mode]; |
| else |
| return mode_vtype_infos.ratio_for_min_vlen64[mode]; |
| } |
| |
| /* Get ta according to operand[tail_op_idx]. */ |
| int |
| get_ta (rtx ta) |
| { |
| if (INTVAL (ta) == TAIL_ANY) |
| return INVALID_ATTRIBUTE; |
| return INTVAL (ta); |
| } |
| |
| /* Get ma according to operand[mask_op_idx]. */ |
| int |
| get_ma (rtx ma) |
| { |
| if (INTVAL (ma) == MASK_ANY) |
| return INVALID_ATTRIBUTE; |
| return INTVAL (ma); |
| } |
| |
| /* Get prefer tail policy. */ |
| enum tail_policy |
| get_prefer_tail_policy () |
| { |
| /* TODO: By default, we choose to use TAIL_ANY which allows the |
| compiler to pick either agnostic or undisturbed. Maybe we |
| will have a compile option like -mprefer=agnostic to set |
| this value? */ |
| return TAIL_ANY; |
| } |
| |
| /* Get prefer mask policy. */ |
| enum mask_policy |
| get_prefer_mask_policy () |
| { |
| /* TODO: By default, we choose to use MASK_ANY which allows the |
| compiler to pick either agnostic or undisturbed. Maybe we |
| will have a compile option like -mprefer=agnostic to set |
| this value? */ |
| return MASK_ANY; |
| } |
| |
| /* Get avl_type rtx. */ |
| rtx |
| get_avl_type_rtx (enum avl_type type) |
| { |
| return gen_int_mode (type, Pmode); |
| } |
| |
| /* Return the appropriate mask mode for MODE. */ |
| |
| opt_machine_mode |
| get_mask_mode (machine_mode mode) |
| { |
| return get_vector_mode (BImode, GET_MODE_NUNITS (mode)); |
| } |
| |
| /* Return the RVV vector mode that has NUNITS elements of mode INNER_MODE. |
| This function is not only used by builtins, but also will be used by |
| auto-vectorization in the future. */ |
| opt_machine_mode |
| get_vector_mode (scalar_mode inner_mode, poly_uint64 nunits) |
| { |
| enum mode_class mclass; |
| if (inner_mode == E_BImode) |
| mclass = MODE_VECTOR_BOOL; |
| else if (FLOAT_MODE_P (inner_mode)) |
| mclass = MODE_VECTOR_FLOAT; |
| else |
| mclass = MODE_VECTOR_INT; |
| machine_mode mode; |
| FOR_EACH_MODE_IN_CLASS (mode, mclass) |
| if (inner_mode == GET_MODE_INNER (mode) |
| && known_eq (nunits, GET_MODE_NUNITS (mode)) |
| && riscv_v_ext_vector_mode_p (mode)) |
| return mode; |
| return opt_machine_mode (); |
| } |
| |
| /* Return the RVV tuple mode if we can find the legal tuple mode for the |
| corresponding subpart mode and NF. */ |
| opt_machine_mode |
| get_tuple_mode (machine_mode subpart_mode, unsigned int nf) |
| { |
| poly_uint64 nunits = GET_MODE_NUNITS (subpart_mode) * nf; |
| scalar_mode inner_mode = GET_MODE_INNER (subpart_mode); |
| enum mode_class mclass = GET_MODE_CLASS (subpart_mode); |
| machine_mode mode; |
| FOR_EACH_MODE_IN_CLASS (mode, mclass) |
| if (inner_mode == GET_MODE_INNER (mode) |
| && known_eq (nunits, GET_MODE_NUNITS (mode)) |
| && riscv_v_ext_tuple_mode_p (mode) |
| && get_subpart_mode (mode) == subpart_mode) |
| return mode; |
| return opt_machine_mode (); |
| } |
| |
| bool |
| simm5_p (rtx x) |
| { |
| if (!CONST_INT_P (x)) |
| return false; |
| return IN_RANGE (INTVAL (x), -16, 15); |
| } |
| |
| bool |
| neg_simm5_p (rtx x) |
| { |
| if (!CONST_INT_P (x)) |
| return false; |
| return IN_RANGE (INTVAL (x), -15, 16); |
| } |
| |
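| /* Return true if CODE applied to immediate X has a .vi variant. A hedged |
| note on the split above: the codes routed to neg_simm5_p are those whose |
| immediate is effectively negated or reversed when the pattern is emitted |
| (e.g. a MINUS of a constant is lowered to an add of the negated constant), |
| so the valid range is mirrored. */ |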
| bool |
| has_vi_variant_p (rtx_code code, rtx x) |
| { |
| switch (code) |
| { |
| case PLUS: |
| case AND: |
| case IOR: |
| case XOR: |
| case SS_PLUS: |
| case US_PLUS: |
| case EQ: |
| case NE: |
| case LE: |
| case LEU: |
| case GT: |
| case GTU: |
| return simm5_p (x); |
| |
| case LT: |
| case LTU: |
| case GE: |
| case GEU: |
| case MINUS: |
| case SS_MINUS: |
| return neg_simm5_p (x); |
| |
| default: |
| return false; |
| } |
| } |
| |
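| /* A hedged summary of the helper below: it prepares a SEW = 64 scalar |
| operand. If a .vi variant applies, or on TARGET_64BIT, or if the value is |
| a 32-bit immediate on RV32 (which gets sign-extended), the operand is |
| legitimized in place and false is returned so the caller keeps using its |
| own pattern. Otherwise, on RV32 the 64-bit scalar cannot be passed in a |
| single GPR, so it is broadcast into a vector register first and |
| EMIT_VECTOR_FUNC is called with that vector; true is returned. */ |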
| bool |
| sew64_scalar_helper (rtx *operands, rtx *scalar_op, rtx vl, |
| machine_mode vector_mode, bool has_vi_variant_p, |
| void (*emit_vector_func) (rtx *, rtx)) |
| { |
| machine_mode scalar_mode = GET_MODE_INNER (vector_mode); |
| if (has_vi_variant_p) |
| { |
| *scalar_op = force_reg (scalar_mode, *scalar_op); |
| return false; |
| } |
| |
| if (TARGET_64BIT) |
| { |
| if (!rtx_equal_p (*scalar_op, const0_rtx)) |
| *scalar_op = force_reg (scalar_mode, *scalar_op); |
| return false; |
| } |
| |
| if (immediate_operand (*scalar_op, Pmode)) |
| { |
| if (!rtx_equal_p (*scalar_op, const0_rtx)) |
| *scalar_op = force_reg (Pmode, *scalar_op); |
| |
| *scalar_op = gen_rtx_SIGN_EXTEND (scalar_mode, *scalar_op); |
| return false; |
| } |
| |
| if (CONST_INT_P (*scalar_op)) |
| *scalar_op = force_reg (scalar_mode, *scalar_op); |
| |
| rtx tmp = gen_reg_rtx (vector_mode); |
| rtx ops[] = {tmp, *scalar_op}; |
| riscv_vector::emit_nonvlmax_insn (code_for_pred_broadcast (vector_mode), |
| riscv_vector::RVV_UNOP, ops, vl); |
| emit_vector_func (operands, tmp); |
| |
| return true; |
| } |
| |
| /* Get the mask { 1, 0, 0, ..., 0 }, i.e. a mask with only the first element set. */ |
| rtx |
| gen_scalar_move_mask (machine_mode mode) |
| { |
| rtx_vector_builder builder (mode, 1, 2); |
| builder.quick_push (const1_rtx); |
| builder.quick_push (const0_rtx); |
| return builder.build (); |
| } |
| |
| static unsigned |
| compute_vlmax (unsigned vector_bits, unsigned elt_size, unsigned min_size) |
| { |
| // Original equation: |
| // VLMAX = (VectorBits / EltSize) * LMUL |
| // where LMUL = MinSize / TARGET_MIN_VLEN |
| // The following equations have been reordered to prevent loss of precision |
| // when calculating fractional LMUL. |
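| // For example (illustrative numbers): vector_bits = 256, elt_size = 32 and |
| // min_size = TARGET_MIN_VLEN (i.e. LMUL = 1) give |
| // (256 / 32) * min_size / TARGET_MIN_VLEN = 8. |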
| return ((vector_bits / elt_size) * min_size) / TARGET_MIN_VLEN; |
| } |
| |
| static unsigned |
| get_unknown_min_value (machine_mode mode) |
| { |
| enum vlmul_type vlmul = get_vlmul (mode); |
| switch (vlmul) |
| { |
| case LMUL_1: |
| return TARGET_MIN_VLEN; |
| case LMUL_2: |
| return TARGET_MIN_VLEN * 2; |
| case LMUL_4: |
| return TARGET_MIN_VLEN * 4; |
| case LMUL_8: |
| return TARGET_MIN_VLEN * 8; |
| default: |
| gcc_unreachable (); |
| } |
| } |
| |
| static rtx |
| force_vector_length_operand (rtx vl) |
| { |
| if (CONST_INT_P (vl) && !satisfies_constraint_K (vl)) |
| return force_reg (Pmode, vl); |
| return vl; |
| } |
| |
| static rtx |
| gen_no_side_effects_vsetvl_rtx (machine_mode vmode, rtx vl, rtx avl) |
| { |
| unsigned int sew = get_sew (vmode); |
| return gen_vsetvl_no_side_effects (Pmode, vl, avl, gen_int_mode (sew, Pmode), |
| gen_int_mode (get_vlmul (vmode), Pmode), |
| const0_rtx, const0_rtx); |
| } |
| |
| /* Get the VL * 2 rtx. */ |
| static rtx |
| get_vl_x2_rtx (rtx avl, machine_mode mode, machine_mode demote_mode) |
| { |
| rtx i32vl = NULL_RTX; |
| if (CONST_INT_P (avl)) |
| { |
| unsigned elt_size = GET_MODE_BITSIZE (GET_MODE_INNER (mode)); |
| unsigned min_size = get_unknown_min_value (mode); |
| unsigned vlen_max = RVV_65536; |
| unsigned vlmax_max = compute_vlmax (vlen_max, elt_size, min_size); |
| unsigned vlen_min = TARGET_MIN_VLEN; |
| unsigned vlmax_min = compute_vlmax (vlen_min, elt_size, min_size); |
| |
| unsigned HOST_WIDE_INT avl_int = INTVAL (avl); |
| if (avl_int <= vlmax_min) |
| i32vl = gen_int_mode (2 * avl_int, Pmode); |
| else if (avl_int >= 2 * vlmax_max) |
| { |
| // Just set i32vl to VLMAX in this situation |
| i32vl = gen_reg_rtx (Pmode); |
| emit_insn ( |
| gen_no_side_effects_vsetvl_rtx (demote_mode, i32vl, RVV_VLMAX)); |
| } |
| else |
| { |
| // For AVL between (MinVLMAX, 2 * MaxVLMAX), the actual working vl |
| // is related to the hardware implementation. |
| // So let the code below handle it. |
| } |
| } |
| if (!i32vl) |
| { |
| // Use the vsetvli instruction to get the actually used length, which is |
| // related to the hardware implementation. |
| rtx i64vl = gen_reg_rtx (Pmode); |
| emit_insn ( |
| gen_no_side_effects_vsetvl_rtx (mode, i64vl, force_reg (Pmode, avl))); |
| // scale 2 for 32-bit length |
| i32vl = gen_reg_rtx (Pmode); |
| emit_insn ( |
| gen_rtx_SET (i32vl, gen_rtx_ASHIFT (Pmode, i64vl, const1_rtx))); |
| } |
| |
| return force_vector_length_operand (i32vl); |
| } |
| |
| bool |
| slide1_sew64_helper (int unspec, machine_mode mode, machine_mode demote_mode, |
| machine_mode demote_mask_mode, rtx *ops) |
| { |
| rtx scalar_op = ops[4]; |
| rtx avl = ops[5]; |
| machine_mode scalar_mode = GET_MODE_INNER (mode); |
| if (rtx_equal_p (scalar_op, const0_rtx)) |
| { |
| ops[5] = force_vector_length_operand (ops[5]); |
| return false; |
| } |
| |
| if (TARGET_64BIT) |
| { |
| ops[4] = force_reg (scalar_mode, scalar_op); |
| ops[5] = force_vector_length_operand (ops[5]); |
| return false; |
| } |
| |
| if (immediate_operand (scalar_op, Pmode)) |
| { |
| ops[4] = gen_rtx_SIGN_EXTEND (scalar_mode, force_reg (Pmode, scalar_op)); |
| ops[5] = force_vector_length_operand (ops[5]); |
| return false; |
| } |
| |
| if (CONST_INT_P (scalar_op)) |
| scalar_op = force_reg (scalar_mode, scalar_op); |
| |
| rtx vl_x2 = get_vl_x2_rtx (avl, mode, demote_mode); |
| |
| rtx demote_scalar_op1, demote_scalar_op2; |
| if (unspec == UNSPEC_VSLIDE1UP) |
| { |
| demote_scalar_op1 = gen_highpart (Pmode, scalar_op); |
| demote_scalar_op2 = gen_lowpart (Pmode, scalar_op); |
| } |
| else |
| { |
| demote_scalar_op1 = gen_lowpart (Pmode, scalar_op); |
| demote_scalar_op2 = gen_highpart (Pmode, scalar_op); |
| } |
| |
| rtx temp = gen_reg_rtx (demote_mode); |
| rtx ta = gen_int_mode (get_prefer_tail_policy (), Pmode); |
| rtx ma = gen_int_mode (get_prefer_mask_policy (), Pmode); |
| rtx merge = RVV_VUNDEF (demote_mode); |
| /* Handle vslide1<ud>_tu. */ |
| if (register_operand (ops[2], mode) |
| && rtx_equal_p (ops[1], CONSTM1_RTX (GET_MODE (ops[1])))) |
| { |
| merge = gen_lowpart (demote_mode, ops[2]); |
| ta = ops[6]; |
| ma = ops[7]; |
| } |
| |
| emit_insn (gen_pred_slide (unspec, demote_mode, temp, |
| CONSTM1_RTX (demote_mask_mode), merge, |
| gen_lowpart (demote_mode, ops[3]), |
| demote_scalar_op1, vl_x2, ta, ma, ops[8])); |
| emit_insn (gen_pred_slide (unspec, demote_mode, |
| gen_lowpart (demote_mode, ops[0]), |
| CONSTM1_RTX (demote_mask_mode), merge, temp, |
| demote_scalar_op2, vl_x2, ta, ma, ops[8])); |
| |
| if (rtx_equal_p (ops[1], CONSTM1_RTX (GET_MODE (ops[1])))) |
| return true; |
| else |
| emit_insn (gen_pred_merge (mode, ops[0], ops[2], ops[2], ops[0], ops[1], |
| force_vector_length_operand (ops[5]), ops[6], |
| ops[8])); |
| return true; |
| } |
| |
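| /* For a scalar move (e.g. vmv.s.x) only the zero/non-zero distinction of |
| the AVL matters: with AVL = 0 nothing is written, while any non-zero AVL |
| writes element 0.  So, for example, a constant AVL of 7 can be |
| canonicalized to 1, and a run-time AVL is reduced to 0/1 with an unsigned |
| "greater than zero" comparison.  */ |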
| rtx |
| gen_avl_for_scalar_move (rtx avl) |
| { |
| /* The AVL for a scalar move only behaves differently between 0 and values greater than 0. */ |
| if (CONST_INT_P (avl)) |
| { |
| /* So we can simply use an AVL of 1 for any constant other than 0. */ |
| if (rtx_equal_p (avl, const0_rtx)) |
| return const0_rtx; |
| else |
| return const1_rtx; |
| } |
| else |
| { |
| /* For a non-constant value, we reduce any nonzero value to 1 with |
| `sgtu new_avl,input_avl,zero` followed by `vsetvli`. */ |
| rtx tmp = gen_reg_rtx (Pmode); |
| emit_insn ( |
| gen_rtx_SET (tmp, gen_rtx_fmt_ee (GTU, Pmode, avl, const0_rtx))); |
| return tmp; |
| } |
| } |
| |
| /* Expand data movement for tuple modes. */ |
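| /* For example, loading a two-field tuple (NF = 2) from memory expands to |
| roughly the following (illustrative only; the exact load instruction |
| depends on the subpart mode): |
| |
| mv          a5, <base address> |
| vle<eew>.v  v8, (a5) |
| add         a5, a5, <subpart size> |
| vle<eew>.v  v9, (a5) |
| |
| where <subpart size> is a constant for fixed-size subparts and is |
| otherwise derived from vlenb, shifted for fractional or grouped LMUL.  */ |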
| void |
| expand_tuple_move (rtx *ops) |
| { |
| unsigned int i; |
| machine_mode tuple_mode = GET_MODE (ops[0]); |
| machine_mode subpart_mode = get_subpart_mode (tuple_mode); |
| poly_int64 subpart_size = GET_MODE_SIZE (subpart_mode); |
| unsigned int nf = get_nf (tuple_mode); |
| bool fractional_p = known_lt (subpart_size, BYTES_PER_RISCV_VECTOR); |
| |
| if (REG_P (ops[0]) && CONST_VECTOR_P (ops[1])) |
| { |
| rtx val; |
| gcc_assert (can_create_pseudo_p () |
| && const_vec_duplicate_p (ops[1], &val)); |
| for (i = 0; i < nf; ++i) |
| { |
| poly_int64 offset = i * subpart_size; |
| rtx subreg |
| = simplify_gen_subreg (subpart_mode, ops[0], tuple_mode, offset); |
| rtx dup = gen_const_vec_duplicate (subpart_mode, val); |
| emit_move_insn (subreg, dup); |
| } |
| } |
| else if (REG_P (ops[0]) && REG_P (ops[1])) |
| { |
| for (i = 0; i < nf; ++i) |
| { |
| int index = i; |
| |
| /* Take NF = 2 and LMUL = 1 for example: |
| |
| - move v8 to v9: |
| vmv1r v10,v9 |
| vmv1r v9,v8 |
| |
| - move v8 to v7: |
| vmv1r v7,v8 |
| vmv1r v8,v9 */ |
| if (REGNO (ops[0]) > REGNO (ops[1])) |
| index = nf - 1 - i; |
| poly_int64 offset = index * subpart_size; |
| rtx dst_subreg |
| = simplify_gen_subreg (subpart_mode, ops[0], tuple_mode, offset); |
| rtx src_subreg |
| = simplify_gen_subreg (subpart_mode, ops[1], tuple_mode, offset); |
| emit_insn (gen_rtx_SET (dst_subreg, src_subreg)); |
| } |
| } |
| else |
| { |
| /* Expand tuple memory data movement. */ |
| gcc_assert (MEM_P (ops[0]) || MEM_P (ops[1])); |
| rtx offset = gen_int_mode (subpart_size, Pmode); |
| if (!subpart_size.is_constant ()) |
| { |
| emit_move_insn (ops[2], gen_int_mode (BYTES_PER_RISCV_VECTOR, Pmode)); |
| if (fractional_p) |
| { |
| unsigned int factor |
| = exact_div (BYTES_PER_RISCV_VECTOR, subpart_size) |
| .to_constant (); |
| rtx pat |
| = gen_rtx_ASHIFTRT (Pmode, ops[2], |
| gen_int_mode (exact_log2 (factor), Pmode)); |
| emit_insn (gen_rtx_SET (ops[2], pat)); |
| } |
| |
| if (known_gt (subpart_size, BYTES_PER_RISCV_VECTOR)) |
| { |
| unsigned int factor |
| = exact_div (subpart_size, BYTES_PER_RISCV_VECTOR) |
| .to_constant (); |
| rtx pat |
| = gen_rtx_ASHIFT (Pmode, ops[2], |
| gen_int_mode (exact_log2 (factor), Pmode)); |
| emit_insn (gen_rtx_SET (ops[2], pat)); |
| } |
| offset = ops[2]; |
| } |
| |
| if (MEM_P (ops[1])) |
| { |
| /* Load operations. */ |
| emit_move_insn (ops[3], XEXP (ops[1], 0)); |
| for (i = 0; i < nf; i++) |
| { |
| rtx subreg = simplify_gen_subreg (subpart_mode, ops[0], |
| tuple_mode, i * subpart_size); |
| if (i != 0) |
| { |
| rtx new_addr = gen_rtx_PLUS (Pmode, ops[3], offset); |
| emit_insn (gen_rtx_SET (ops[3], new_addr)); |
| } |
| rtx mem = gen_rtx_MEM (subpart_mode, ops[3]); |
| |
| if (fractional_p) |
| { |
| rtx operands[] = {subreg, mem}; |
| emit_vlmax_insn (code_for_pred_mov (subpart_mode), RVV_UNOP, |
| operands, ops[4]); |
| } |
| else |
| emit_move_insn (subreg, mem); |
| } |
| } |
| else |
| { |
| /* Store operations. */ |
| emit_move_insn (ops[3], XEXP (ops[0], 0)); |
| for (i = 0; i < nf; i++) |
| { |
| rtx subreg = simplify_gen_subreg (subpart_mode, ops[1], |
| tuple_mode, i * subpart_size); |
| if (i != 0) |
| { |
| rtx new_addr = gen_rtx_PLUS (Pmode, ops[3], offset); |
| emit_insn (gen_rtx_SET (ops[3], new_addr)); |
| } |
| rtx mem = gen_rtx_MEM (subpart_mode, ops[3]); |
| |
| if (fractional_p) |
| { |
| rtx operands[] = {mem, subreg}; |
| emit_vlmax_insn (code_for_pred_mov (subpart_mode), RVV_UNOP, |
| operands, ops[4]); |
| } |
| else |
| emit_move_insn (mem, subreg); |
| } |
| } |
| } |
| } |
| |
| /* Return the vectorization machine mode for RVV according to LMUL. */ |
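| /* For example, assuming TARGET_MIN_VLEN >= 128 and riscv_autovec_lmul == |
| RVV_M2, the base vector size is 2 * BYTES_PER_RISCV_VECTOR bytes, so |
| SImode elements give a scalable mode with (vector size / 4) units, i.e. |
| an LMUL = 2 SImode vector mode, provided such a mode exists.  */ |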
| machine_mode |
| preferred_simd_mode (scalar_mode mode) |
| { |
| /* We disable auto-vectorization when TARGET_MIN_VLEN < 128 && |
| riscv_autovec_lmul < RVV_M2, because the GCC loop vectorizer reports an |
| ICE in 'can_duplicate_and_interleave_p' of tree-vect-slp.cc when |
| -march=rv64gc_zve32* or -march=rv32gc_zve64* is enabled: VNx1SImode |
| exists for -march=*zve32* and VNx1DImode for -march=*zve64*, they are |
| enabled in targetm.vector_mode_supported_p, and the SLP vectorizer will |
| try to use them. Currently, auto-vectorization is supported for |
| -march=rv32_zve32x_zvl128b, whereas -march=rv32_zve32x_zvl32b and |
| -march=rv32_zve32x_zvl64b are disabled. */ |
| if (autovec_use_vlmax_p ()) |
| { |
| if (TARGET_MIN_VLEN < 128 && riscv_autovec_lmul < RVV_M2) |
| return word_mode; |
| /* We use LMUL = 1 (BYTES_PER_RISCV_VECTOR) as the base byte size and |
| riscv_autovec_lmul as the multiplying factor to calculate the NUNITS |
| used to pick the auto-vectorization mode. */ |
| poly_uint64 nunits; |
| poly_uint64 vector_size |
| = BYTES_PER_RISCV_VECTOR * ((int) riscv_autovec_lmul); |
| poly_uint64 scalar_size = GET_MODE_SIZE (mode); |
| gcc_assert (multiple_p (vector_size, scalar_size, &nunits)); |
| machine_mode rvv_mode; |
| if (get_vector_mode (mode, nunits).exists (&rvv_mode)) |
| return rvv_mode; |
| } |
| /* TODO: We will support minimum length VLS auto-vectorization in |
| the future. */ |
| return word_mode; |
| } |
| |
| /* Subroutine of expand_vec_init. |
| Works as follows: |
| (a) Initialize TARGET by broadcasting element 0 of BUILDER. |
| (b) Skip any leading elements of BUILDER that are duplicates of |
| element 0. |
| (c) Insert the remaining elements into TARGET, in order, using |
| vslide1down. */ |
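| /* For example, for v = { a, a, b, c } this emits roughly |
| |
| vmv.v.x        v, a   # broadcast; covers the two leading 'a's |
| vslide1down.vx v, v, b |
| vslide1down.vx v, v, c |
| |
| (vfslide1down and a floating-point broadcast for FP modes). |
| Illustrative only.  */ |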
| |
| static void |
| expand_vector_init_insert_elems (rtx target, const rvv_builder &builder, |
| int nelts_reqd) |
| { |
| machine_mode mode = GET_MODE (target); |
| machine_mode mask_mode; |
| gcc_assert (get_mask_mode (mode).exists (&mask_mode)); |
| rtx dup = expand_vector_broadcast (mode, builder.elt (0)); |
| emit_move_insn (target, dup); |
| int ndups = builder.count_dups (0, nelts_reqd - 1, 1); |
| for (int i = ndups; i < nelts_reqd; i++) |
| { |
| unsigned int unspec |
| = FLOAT_MODE_P (mode) ? UNSPEC_VFSLIDE1DOWN : UNSPEC_VSLIDE1DOWN; |
| insn_code icode = code_for_pred_slide (unspec, mode); |
| rtx ops[] = {target, target, builder.elt (i)}; |
| emit_vlmax_insn (icode, RVV_BINOP, ops); |
| } |
| } |
| |
| /* Use the merge approach to initialize a vector with a repeating sequence: |
| v = {a, b, a, b, a, b, a, b}. |
| |
| v = broadcast (a). |
| mask = 0b01010101.... |
| v = merge (v, b, mask).  */ |
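| /* An illustrative sketch for v = { a, b, a, b, ... } with two patterns |
| (how the mask bits are materialized depends on how many fit in one |
| scalar): |
| |
| vmv.v.x    v, a          # step 1: broadcast the first pattern |
| li         t0, 0b...1010 # scalar holding the repeating mask bits |
| vmv.s.x    vtmp, t0      # materialize the mask register |
| vmerge.vxm v, v, b, v0   # step 2: merge in the second pattern  */ |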
| static void |
| expand_vector_init_merge_repeating_sequence (rtx target, |
| const rvv_builder &builder) |
| { |
| machine_mode dup_mode = get_repeating_sequence_dup_machine_mode (builder); |
| machine_mode dup_mask_mode = get_mask_mode (dup_mode).require (); |
| machine_mode mask_mode = get_mask_mode (builder.mode ()).require (); |
| uint64_t full_nelts = builder.full_nelts ().to_constant (); |
| |
| /* Step 1: Broadcast the first pattern. */ |
| rtx ops[] = {target, force_reg (GET_MODE_INNER (dup_mode), builder.elt (0))}; |
| emit_vlmax_integer_move_insn (code_for_pred_broadcast (builder.mode ()), |
| ops, NULL_RTX); |
| |
| /* Step 2: Merge in the remaining patterns. */ |
| for (unsigned int i = 1; i < builder.npatterns (); i++) |
| { |
| /* Step 2-1: Generate mask register v0 for each merge. */ |
| rtx merge_mask = builder.get_merge_scalar_mask (i); |
| rtx mask = gen_reg_rtx (mask_mode); |
| rtx dup = gen_reg_rtx (dup_mode); |
| |
| if (full_nelts <= builder.inner_bits_size ()) /* vmv.s.x. */ |
| { |
| rtx ops[] = {dup, gen_scalar_move_mask (dup_mask_mode), |
| RVV_VUNDEF (dup_mode), merge_mask}; |
| emit_scalar_move_insn (code_for_pred_broadcast (GET_MODE (dup)), |
| ops); |
| } |
| else /* vmv.v.x. */ |
| { |
| rtx ops[] = {dup, force_reg (GET_MODE_INNER (dup_mode), merge_mask)}; |
| rtx vl = gen_int_mode (CEIL (full_nelts, builder.inner_bits_size ()), |
| Pmode); |
| emit_nonvlmax_integer_move_insn (code_for_pred_broadcast (dup_mode), |
| ops, vl); |
| } |
| |
| emit_move_insn (mask, gen_lowpart (mask_mode, dup)); |
| |
| /* Step 2-2: Merge pattern according to the mask. */ |
| rtx ops[] = {target, target, builder.elt (i), mask}; |
| emit_vlmax_merge_insn (code_for_pred_merge_scalar (GET_MODE (target)), |
| riscv_vector::RVV_MERGE_OP, ops); |
| } |
| } |
| |
| /* Initialize register TARGET from the elements in PARALLEL rtx VALS. */ |
| |
| void |
| expand_vec_init (rtx target, rtx vals) |
| { |
| machine_mode mode = GET_MODE (target); |
| int nelts = XVECLEN (vals, 0); |
| |
| rvv_builder v (mode, nelts, 1); |
| for (int i = 0; i < nelts; i++) |
| v.quick_push (XVECEXP (vals, 0, i)); |
| v.finalize (); |
| |
| if (nelts > 3) |
| { |
| /* Case 1: Convert v = { a, b, a, b } into v = { ab, ab }. */ |
| if (v.can_duplicate_repeating_sequence_p ()) |
| { |
| rtx ele = v.get_merged_repeating_sequence (); |
| rtx dup = expand_vector_broadcast (v.new_mode (), ele); |
| emit_move_insn (target, gen_lowpart (mode, dup)); |
| return; |
| } |
| |
| /* Case 2: Optimize repeating-sequence cases that Case 1 cannot |
| handle, when doing so is profitable. For example, with |
| ELEMENT BITSIZE = 64 and |
| v = {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b}, |
| we cannot find a vector mode whose 128-bit element could hold the |
| combined "ab" pair to duplicate. */ |
| if (v.repeating_sequence_use_merge_profitable_p ()) |
| { |
| expand_vector_init_merge_repeating_sequence (target, v); |
| return; |
| } |
| |
| /* TODO: We will support more vector initialization patterns in the future. */ |
| } |
| |
| /* Handle the common case with vslide1down. This can handle any |
| vec_init<mode> situation; only the cases that are not optimized above |
| fall through to here. */ |
| expand_vector_init_insert_elems (target, v, nelts); |
| } |
| |
| /* Get insn code for corresponding comparison. */ |
| |
| static insn_code |
| get_cmp_insn_code (rtx_code code, machine_mode mode) |
| { |
| insn_code icode; |
| switch (code) |
| { |
| case EQ: |
| case NE: |
| case LE: |
| case LEU: |
| case GT: |
| case GTU: |
| case LTGT: |
| icode = code_for_pred_cmp (mode); |
| break; |
| case LT: |
| case LTU: |
| case GE: |
| case GEU: |
| if (FLOAT_MODE_P (mode)) |
| icode = code_for_pred_cmp (mode); |
| else |
| icode = code_for_pred_ltge (mode); |
| break; |
| default: |
| gcc_unreachable (); |
| } |
| return icode; |
| } |
| |
| /* This hook gives the vectorizer more vector mode options. We want it not |
| only to try modes with the maximum number of units a full vector can hold |
| but, for example, also half that number of units for smaller element |
| sizes. Such vectors can be promoted to a full vector of widened elements |
| (still with the same number of elements, essentially vectorizing at a |
| fixed number of units rather than a fixed number of bytes). */ |
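| /* For example, with a 128-bit minimum vector length and LMUL = 1 this |
| pushes the scalable QImode vector modes with 16, 8, 4 and 2 units per |
| 128-bit block, letting the vectorizer choose the most profitable |
| container size per element type.  */ |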
| unsigned int |
| autovectorize_vector_modes (vector_modes *modes, bool) |
| { |
| if (autovec_use_vlmax_p ()) |
| { |
| /* TODO: We will support RVV VLS auto-vectorization mode in the future. */ |
| poly_uint64 full_size |
| = BYTES_PER_RISCV_VECTOR * ((int) riscv_autovec_lmul); |
| |
| /* Start with a VNxYYQImode where YY is the number of units that |
| fit a whole vector. |
| Then try YY = nunits / 2, nunits / 4 and nunits / 8 which |
| is guided by the extensions we have available (vf2, vf4 and vf8). |
| |
| - full_size: Try using full vectors for all element types. |
| - full_size / 2: |
| Try using 16-bit containers for 8-bit elements and full vectors |
| for wider elements. |
| - full_size / 4: |
| Try using 32-bit containers for 8-bit and 16-bit elements and |
| full vectors for wider elements. |
| - full_size / 8: |
| Try using 64-bit containers for all element types. */ |
| static const int rvv_factors[] = {1, 2, 4, 8}; |
| for (unsigned int i = 0; i < sizeof (rvv_factors) / sizeof (int); i++) |
| { |
| poly_uint64 units; |
| machine_mode mode; |
| if (can_div_trunc_p (full_size, rvv_factors[i], &units) |
| && get_vector_mode (QImode, units).exists (&mode)) |
| modes->safe_push (mode); |
| } |
| } |
| return 0; |
| } |
| |
| /* If the given VECTOR_MODE is an RVV mode, first get the largest number |
| of units that fit into a full vector at the given ELEMENT_MODE. |
| We will have the vectorizer call us with a successively decreasing |
| number of units (as specified in autovectorize_vector_modes). |
| The starting mode is always the one specified by preferred_simd_mode. */ |
| opt_machine_mode |
| vectorize_related_mode (machine_mode vector_mode, scalar_mode element_mode, |
| poly_uint64 nunits) |
| { |
| /* TODO: We will support RVV VLS auto-vectorization mode in the future. */ |
| poly_uint64 min_units; |
| if (autovec_use_vlmax_p () && riscv_v_ext_vector_mode_p (vector_mode) |
| && multiple_p (BYTES_PER_RISCV_VECTOR * ((int) riscv_autovec_lmul), |
| GET_MODE_SIZE (element_mode), &min_units)) |
| { |
| machine_mode rvv_mode; |
| if (maybe_ne (nunits, 0U)) |
| { |
| /* If we were given a number of units NUNITS, try to find an |
| RVV vector mode of inner mode ELEMENT_MODE with the same |
| number of units. */ |
| if (multiple_p (min_units, nunits) |
| && get_vector_mode (element_mode, nunits).exists (&rvv_mode)) |
| return rvv_mode; |
| } |
| else |
| { |
| /* Look for a vector mode with the same number of units as the |
| VECTOR_MODE we were given. We keep track of the minimum number of |
| units so far, which determines the smallest necessary, yet largest |
| possible, mode suitable for vectorization. */ |
| min_units = ordered_min (min_units, GET_MODE_SIZE (vector_mode)); |
| if (get_vector_mode (element_mode, min_units).exists (&rvv_mode)) |
| return rvv_mode; |
| } |
| } |
| |
| return default_vectorize_related_mode (vector_mode, element_mode, nunits); |
| } |
| |
| /* Expand an RVV comparison. */ |
| |
| void |
| expand_vec_cmp (rtx target, rtx_code code, rtx op0, rtx op1) |
| { |
| machine_mode mask_mode = GET_MODE (target); |
| machine_mode data_mode = GET_MODE (op0); |
| insn_code icode = get_cmp_insn_code (code, data_mode); |
| |
| if (code == LTGT) |
| { |
| rtx lt = gen_reg_rtx (mask_mode); |
| rtx gt = gen_reg_rtx (mask_mode); |
| expand_vec_cmp (lt, LT, op0, op1); |
| expand_vec_cmp (gt, GT, op0, op1); |
| icode = code_for_pred (IOR, mask_mode); |
| rtx ops[] = {target, lt, gt}; |
| emit_vlmax_insn (icode, riscv_vector::RVV_BINOP, ops); |
| return; |
| } |
| |
| rtx cmp = gen_rtx_fmt_ee (code, mask_mode, op0, op1); |
| rtx ops[] = {target, cmp, op0, op1}; |
| emit_vlmax_cmp_insn (icode, ops); |
| } |
| |
| void |
| expand_vec_cmp (rtx target, rtx_code code, rtx mask, rtx maskoff, rtx op0, |
| rtx op1) |
| { |
| machine_mode mask_mode = GET_MODE (target); |
| machine_mode data_mode = GET_MODE (op0); |
| insn_code icode = get_cmp_insn_code (code, data_mode); |
| |
| if (code == LTGT) |
| { |
| rtx lt = gen_reg_rtx (mask_mode); |
| rtx gt = gen_reg_rtx (mask_mode); |
| expand_vec_cmp (lt, LT, mask, maskoff, op0, op1); |
| expand_vec_cmp (gt, GT, mask, maskoff, op0, op1); |
| icode = code_for_pred (IOR, mask_mode); |
| rtx ops[] = {target, lt, gt}; |
| emit_vlmax_insn (icode, RVV_BINOP, ops); |
| return; |
| } |
| |
| rtx cmp = gen_rtx_fmt_ee (code, mask_mode, op0, op1); |
| rtx ops[] = {target, mask, maskoff, cmp, op0, op1}; |
| emit_vlmax_cmp_mu_insn (icode, ops); |
| } |
| |
| /* Expand an RVV floating-point comparison: |
| |
| If CAN_INVERT_P is true, the caller can also handle inverted results; |
| return true if the result is in fact inverted. */ |
| |
| bool |
| expand_vec_cmp_float (rtx target, rtx_code code, rtx op0, rtx op1, |
| bool can_invert_p) |
| { |
| machine_mode mask_mode = GET_MODE (target); |
| machine_mode data_mode = GET_MODE (op0); |
| |
| /* If can_invert_p = true: |
| It suffices to implement a u>= b as !(a < b) but with the NaNs masked off: |
| |
| vmfeq.vv v0, va, va |
| vmfeq.vv v1, vb, vb |
| vmand.mm v0, v0, v1 |
| vmflt.vv v0, va, vb, v0.t |
| vmnot.m v0, v0 |
| |
| And, if !HONOR_SNANS, then you can remove the vmand.mm by masking the |
| second vmfeq.vv: |
| |
| vmfeq.vv v0, va, va |
| vmfeq.vv v0, vb, vb, v0.t |
| vmflt.vv v0, va, vb, v0.t |
| vmnot.m v0, v0 |
| |
| If can_invert_p = false: |
| |
| # Example of implementing isgreater() |
| vmfeq.vv v0, va, va # Only set where A is not NaN. |
| vmfeq.vv v1, vb, vb # Only set where B is not NaN. |
| vmand.mm v0, v0, v1 # Only set where A and B are ordered, |
| vmfgt.vv v0, va, vb, v0.t # so only set flags on ordered values. |
| */ |
| |
| rtx eq0 = gen_reg_rtx (mask_mode); |
| rtx eq1 = gen_reg_rtx (mask_mode); |
| switch (code) |
| { |
| case EQ: |
| case NE: |
| case LT: |
| case LE: |
| case GT: |
| case GE: |
| case LTGT: |
| /* There is native support for the comparison. */ |
| expand_vec_cmp (target, code, op0, op1); |
| return false; |
| case UNEQ: |
| case ORDERED: |
| case UNORDERED: |
| case UNLT: |
| case UNLE: |
| case UNGT: |
| case UNGE: |
| /* vmfeq.vv v0, va, va */ |
| expand_vec_cmp (eq0, EQ, op0, op0); |
| if (HONOR_SNANS (data_mode)) |
| { |
| /* |
| vmfeq.vv v1, vb, vb |
| vmand.mm v0, v0, v1 |
| */ |
| expand_vec_cmp (eq1, EQ, op1, op1); |
| insn_code icode = code_for_pred (AND, mask_mode); |
| rtx ops[] = {eq0, eq0, eq1}; |
| emit_vlmax_insn (icode, riscv_vector::RVV_BINOP, ops); |
| } |
| else |
| { |
| /* vmfeq.vv v0, vb, vb, v0.t */ |
| expand_vec_cmp (eq0, EQ, eq0, eq0, op1, op1); |
| } |
| break; |
| default: |
| gcc_unreachable (); |
| } |
| |
| if (code == ORDERED) |
| { |
| emit_move_insn (target, eq0); |
| return false; |
| } |
| |
| /* There is native support for the inverse comparison. */ |
| code = reverse_condition_maybe_unordered (code); |
| if (code == ORDERED) |
| emit_move_insn (target, eq0); |
| else |
| expand_vec_cmp (eq0, code, eq0, eq0, op0, op1); |
| |
| if (can_invert_p) |
| { |
| emit_move_insn (target, eq0); |
| return true; |
| } |
| |
| /* We use one_cmpl<mode>2 so that the combine pass can combine the mask |
| instructions into vmand.mm/vmnand.mm/vmnor.mm/vmxnor.mm. */ |
| emit_insn (gen_rtx_SET (target, gen_rtx_NOT (mask_mode, eq0))); |
| return false; |
| } |
| |
| /* Implement vec_perm<mode>. */ |
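| /* A worked example for the general two-source case (illustrative, with a |
| constant length of nunits = 4 and sel = { 0, 5, 2, 7 }): |
| 1. sel &= 7, then mask = (sel >= 4)     -> { 0, 1, 0, 1 } |
| 2. gather op0 by sel into target        -> lanes 1 and 3 are don't-care |
| 3. sel -= 4 (only masked lanes matter)  -> { _, 1, _, 3 } |
| 4. masked gather from op1 by the adjusted sel into the lanes set in |
| mask.  */ |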
| |
| void |
| expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel) |
| { |
| machine_mode data_mode = GET_MODE (target); |
| machine_mode sel_mode = GET_MODE (sel); |
| poly_uint64 nunits = GET_MODE_NUNITS (sel_mode); |
| |
| /* Check if SEL only references the first value vector, i.e. whether each |
| select index is in the range [0, nunits - 1]; a single vrgather |
| instruction is then enough. Since we use vrgatherei16.vv for |
| variable-length vectors, the index is never out of range and we don't |
| need to modulo it. */ |
| if (!nunits.is_constant () || const_vec_all_in_range_p (sel, 0, nunits - 1)) |
| { |
| emit_vlmax_gather_insn (target, op0, sel); |
| return; |
| } |
| |
| /* Check if the two value vectors are the same, or SEL duplicates a single index. */ |
| if (rtx_equal_p (op0, op1) || const_vec_duplicate_p (sel)) |
| { |
| /* Note: vec_perm indices are supposed to wrap when they go beyond the |
| size of the two value vectors, i.e. the upper bits of the indices |
| are effectively ignored. RVV vrgather instead produces 0 for any |
| out-of-range indices, so we need to modulo all the vec_perm indices |
| to ensure they are all in range of [0, nunits - 1]. */ |
| rtx max_sel = gen_const_vector_dup (sel_mode, nunits - 1); |
| rtx sel_mod = expand_simple_binop (sel_mode, AND, sel, max_sel, NULL, 0, |
| OPTAB_DIRECT); |
| emit_vlmax_gather_insn (target, op1, sel_mod); |
| return; |
| } |
| |
| rtx sel_mod = sel; |
| rtx max_sel = gen_const_vector_dup (sel_mode, 2 * nunits - 1); |
| /* We don't need to modulo the indices for a VLA vector, since we |
| should have guaranteed earlier that they are not out of range. */ |
| if (nunits.is_constant ()) |
| { |
| /* Note: vec_perm indices are supposed to wrap when they go beyond the |
| size of the two value vectors, i.e. the upper bits of the indices |
| are effectively ignored. RVV vrgather instead produces 0 for any |
| out-of-range indices, so we need to modulo all the vec_perm indices |
| to ensure they are all in range of [0, 2 * nunits - 1]. */ |
| sel_mod = expand_simple_binop (sel_mode, AND, sel, max_sel, NULL, 0, |
| OPTAB_DIRECT); |
| } |
| |
| /* The following sequence handles the case of |
| __builtin_shufflevector (vec1, vec2, index...), where each index can be |
| any value in the range [0, 2 * nunits - 1]. */ |
| machine_mode mask_mode; |
| mask_mode = get_mask_mode (data_mode).require (); |
| rtx mask = gen_reg_rtx (mask_mode); |
| max_sel = gen_const_vector_dup (sel_mode, nunits); |
| |
| /* Step 1: Generate a mask that selects every element whose index value |
| is >= nunits. */ |
| expand_vec_cmp (mask, GEU, sel_mod, max_sel); |
| |
| /* Step 2: Gather the op0 values indexed by sel into target; we don't |
| care about the result in elements whose index is >= nunits. */ |
| emit_vlmax_gather_insn (target, op0, sel_mod); |
| |
| /* Step 3: Shift the relevant index range down by nunits, i.e. from |
| [nunits, 2 * nunits - 1] to [0, nunits - 1]. */ |
| rtx tmp = gen_reg_rtx (sel_mode); |
| rtx ops[] = {tmp, sel_mod, max_sel}; |
| emit_vlmax_insn (code_for_pred (MINUS, sel_mode), RVV_BINOP, ops); |
| |
| /* Step 4: Gather from op1 into the elements of target selected by the |
| mask, leaving the elements already gathered from op0 untouched. */ |
| emit_vlmax_masked_gather_mu_insn (target, op1, tmp, mask); |
| } |
| |
| /* Implement TARGET_VECTORIZE_VEC_PERM_CONST for RVV. */ |
| |
| /* vec_perm support. */ |
| |
| struct expand_vec_perm_d |
| { |
| rtx target, op0, op1; |
| vec_perm_indices perm; |
| machine_mode vmode; |
| machine_mode op_mode; |
| bool one_vector_p; |
| bool testing_p; |
| }; |
| |
| /* Recognize the patterns that we can use a merge operation to shuffle the |
| vectors. The value of each element (index i) in the selector can only be |
| either i or nunits + i. We also check that the pattern is monotonic. |
| |
| E.g. |
| v = VEC_PERM_EXPR (v0, v1, selector), |
| selector = { 0, nunits + 1, 2, nunits + 3, 4, nunits + 5, ... } |
| |
| We can transform such pattern into: |
| |
| v = vcond_mask (v0, v1, mask), |
| mask = { 0, 1, 0, 1, 0, 1, ... }. */ |
| |
| static bool |
| shuffle_merge_patterns (struct expand_vec_perm_d *d) |
| { |
| machine_mode vmode = d->vmode; |
| machine_mode sel_mode = related_int_vector_mode (vmode).require (); |
| int n_patterns = d->perm.encoding ().npatterns (); |
| poly_int64 vec_len = d->perm.length (); |
| |
| for (int i = 0; i < n_patterns; ++i) |
| if (!known_eq (d->perm[i], i) && !known_eq (d->perm[i], vec_len + i)) |
| return false; |
| |
| /* Check that the pattern is monotonic here; otherwise, return false. */ |
| for (int i = n_patterns; i < n_patterns * 2; i++) |
| if (!d->perm.series_p (i, n_patterns, i, n_patterns) |
| && !d->perm.series_p (i, n_patterns, vec_len + i, n_patterns)) |
| return false; |
| |
| if (d->testing_p) |
| return true; |
| |
| machine_mode mask_mode = get_mask_mode (vmode).require (); |
| rtx mask = gen_reg_rtx (mask_mode); |
| |
| rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm); |
| |
| /* MASK = SELECTOR < NUNITS ? 1 : 0. */ |
| rtx x = gen_int_mode (vec_len, GET_MODE_INNER (sel_mode)); |
| insn_code icode = code_for_pred_cmp_scalar (sel_mode); |
| rtx cmp = gen_rtx_fmt_ee (LTU, mask_mode, sel, x); |
| rtx ops[] = {mask, cmp, sel, x}; |
| emit_vlmax_cmp_insn (icode, ops); |
| |
| /* TARGET = MASK ? OP0 : OP1. */ |
| emit_insn (gen_vcond_mask (vmode, vmode, d->target, d->op0, d->op1, mask)); |
| return true; |
| } |
| |
| /* Recognize decompress patterns: |
| |
| 1. VEC_PERM_EXPR op0 and op1 |
| with isel = { 0, nunits, 1, nunits + 1, ... }. |
| Decompress op0 and op1 vectors with the mask = { 0, 1, 0, 1, ... }. |
| |
| 2. VEC_PERM_EXPR op0 and op1 |
| with isel = { 1/2 nunits, 3/2 nunits, 1/2 nunits+1, 3/2 nunits+1,... }. |
| Slide down op0 and op1 with OFFSET = 1/2 nunits. |
| Decompress op0 and op1 vectors with the mask = { 0, 1, 0, 1, ... }. |
| */ |
| static bool |
| shuffle_decompress_patterns (struct expand_vec_perm_d *d) |
| { |
| poly_uint64 nelt = d->perm.length (); |
| machine_mode mask_mode = get_mask_mode (d->vmode).require (); |
| |
| /* For constant-size indices, we don't need to handle them here; |
| just leave it to vec_perm<mode>. */ |
| if (d->perm.length ().is_constant ()) |
| return false; |
| |
| poly_uint64 first = d->perm[0]; |
| if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt)) |
| || !d->perm.series_p (0, 2, first, 1) |
| || !d->perm.series_p (1, 2, first + nelt, 1)) |
| return false; |
| |
| /* Permuting two SEW8 variable-length vectors needs vrgatherei16.vv; |
| otherwise, the indices could overflow the index range. */ |
| machine_mode sel_mode = related_int_vector_mode (d->vmode).require (); |
| if (GET_MODE_INNER (d->vmode) == QImode |
| && !get_vector_mode (HImode, nelt).exists (&sel_mode)) |
| return false; |
| |
| /* Success! */ |
| if (d->testing_p) |
| return true; |
| |
| rtx op0, op1; |
| if (known_eq (first, 0U)) |
| { |
| op0 = d->op0; |
| op1 = d->op1; |
| } |
| else |
| { |
| op0 = gen_reg_rtx (d->vmode); |
| op1 = gen_reg_rtx (d->vmode); |
| insn_code icode = code_for_pred_slide (UNSPEC_VSLIDEDOWN, d->vmode); |
| rtx ops0[] = {op0, d->op0, gen_int_mode (first, Pmode)}; |
| rtx ops1[] = {op1, d->op1, gen_int_mode (first, Pmode)}; |
| emit_vlmax_insn (icode, RVV_BINOP, ops0); |
| emit_vlmax_insn (icode, RVV_BINOP, ops1); |
| } |
| /* Generate the repeating { 0, 1, 0, 1, ... } mask. */ |
| rtx vid = gen_reg_rtx (sel_mode); |
| rtx vid_repeat = gen_reg_rtx (sel_mode); |
| emit_insn (gen_vec_series (sel_mode, vid, const0_rtx, const1_rtx)); |
| rtx and_ops[] = {vid_repeat, vid, const1_rtx}; |
| emit_vlmax_insn (code_for_pred_scalar (AND, sel_mode), RVV_BINOP, and_ops); |
| rtx const_vec = gen_const_vector_dup (sel_mode, 1); |
| rtx mask = gen_reg_rtx (mask_mode); |
| expand_vec_cmp (mask, EQ, vid_repeat, const_vec); |
| emit_vlmax_decompress_insn (d->target, op0, op1, mask); |
| return true; |
| } |
| |
| /* Recognize the pattern that can be shuffled by generic approach. */ |
| |
| static bool |
| shuffle_generic_patterns (struct expand_vec_perm_d *d) |
| { |
| machine_mode sel_mode = related_int_vector_mode (d->vmode).require (); |
| poly_uint64 nunits = GET_MODE_NUNITS (d->vmode); |
| |
| /* We don't enable SLP for non-power of 2 NPATTERNS. */ |
| if (!pow2p_hwi (d->perm.encoding ().npatterns ())) |
| return false; |
| |
| /* For constant-size indices, we don't need to handle them here; |
| just leave it to vec_perm<mode>. */ |
| if (d->perm.length ().is_constant ()) |
| return false; |
| |
| /* Permuting two SEW8 variable-length vectors needs vrgatherei16.vv; |
| otherwise, the indices could overflow the index range. */ |
| if (GET_MODE_INNER (d->vmode) == QImode |
| && !get_vector_mode (HImode, nunits).exists (&sel_mode)) |
| return false; |
| |
| /* Success! */ |
| if (d->testing_p) |
| return true; |
| |
| rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm); |
| expand_vec_perm (d->target, d->op0, d->op1, force_reg (sel_mode, sel)); |
| return true; |
| } |
| |
| /* This function recognizes and supports different permutation patterns |
| and enables VLA SLP auto-vectorization. */ |
| static bool |
| expand_vec_perm_const_1 (struct expand_vec_perm_d *d) |
| { |
| gcc_assert (d->op_mode != E_VOIDmode); |
| |
| /* The pattern matching functions above are written to look for a small |
| number to begin the sequence (0, 1, N/2). If we begin with an index |
| from the second operand, we can swap the operands. */ |
| poly_int64 nelt = d->perm.length (); |
| if (known_ge (d->perm[0], nelt)) |
| { |
| d->perm.rotate_inputs (1); |
| std::swap (d->op0, d->op1); |
| } |
| |
| if (known_gt (nelt, 1)) |
| { |
| if (d->vmode == d->op_mode) |
| { |
| if (shuffle_merge_patterns (d)) |
| return true; |
| if (shuffle_decompress_patterns (d)) |
| return true; |
| if (shuffle_generic_patterns (d)) |
| return true; |
| return false; |
| } |
| else |
| return false; |
| } |
| return false; |
| } |
| |
| /* This function implements TARGET_VECTORIZE_VEC_PERM_CONST by using RVV |
| instructions. */ |
| bool |
| expand_vec_perm_const (machine_mode vmode, machine_mode op_mode, rtx target, |
| rtx op0, rtx op1, const vec_perm_indices &sel) |
| { |
| /* RVV doesn't have mask-type pack/unpack instructions and we don't use a |
| mask for iteration loop control, so just disable it directly. */ |
| if (GET_MODE_CLASS (vmode) == MODE_VECTOR_BOOL) |
| return false; |
| |
| struct expand_vec_perm_d d; |
| |
| /* Check whether the mask can be applied to a single vector. */ |
| if (sel.ninputs () == 1 || (op0 && rtx_equal_p (op0, op1))) |
| d.one_vector_p = true; |
| else if (sel.all_from_input_p (0)) |
| { |
| d.one_vector_p = true; |
| op1 = op0; |
| } |
| else if (sel.all_from_input_p (1)) |
| { |
| d.one_vector_p = true; |
| op0 = op1; |
| } |
| else |
| d.one_vector_p = false; |
| |
| d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2, |
| sel.nelts_per_input ()); |
| d.vmode = vmode; |
| d.op_mode = op_mode; |
| d.target = target; |
| d.op0 = op0; |
| if (op0 == op1) |
| d.op1 = d.op0; |
| else |
| d.op1 = op1; |
| d.testing_p = !target; |
| |
| if (!d.testing_p) |
| return expand_vec_perm_const_1 (&d); |
| |
| rtx_insn *last = get_last_insn (); |
| bool ret = expand_vec_perm_const_1 (&d); |
| gcc_assert (last == get_last_insn ()); |
| |
| return ret; |
| } |
| |
| /* Generate a vsetvl with no side effects to get the vector length. */ |
| void |
| expand_select_vl (rtx *ops) |
| { |
| poly_int64 nunits = rtx_to_poly_int64 (ops[2]); |
| /* We arbitrarily pick QImode as the inner scalar mode to get a vector mode, |
| since vsetvl only demands the ratio. We let the VSETVL PASS optimize it. */ |
| scalar_int_mode mode = QImode; |
| machine_mode rvv_mode = get_vector_mode (mode, nunits).require (); |
| emit_insn (gen_no_side_effects_vsetvl_rtx (rvv_mode, ops[0], ops[1])); |
| } |
| |
| /* Expand LEN_MASK_{LOAD,STORE}. */ |
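| /* Roughly speaking (illustrative only): if the length operand equals the |
| number of units of the mode, the access is emitted as a VLMAX load/store |
| (a VLMAX vsetvli followed by a masked vle/vse); otherwise the given |
| length is used directly as a non-VLMAX AVL.  */ |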
| void |
| expand_load_store (rtx *ops, bool is_load) |
| { |
| poly_int64 value; |
| rtx len = ops[2]; |
| rtx mask = ops[3]; |
| machine_mode mode = GET_MODE (ops[0]); |
| |
| if (poly_int_rtx_p (len, &value) && known_eq (value, GET_MODE_NUNITS (mode))) |
| { |
| /* If the length operand is equal to VF, it is VLMAX load/store. */ |
| if (is_load) |
| { |
| rtx m_ops[] = {ops[0], mask, RVV_VUNDEF (mode), ops[1]}; |
| emit_vlmax_masked_insn (code_for_pred_mov (mode), RVV_UNOP_M, m_ops); |
| } |
| else |
| { |
| len = gen_reg_rtx (Pmode); |
| emit_vlmax_vsetvl (mode, len); |
| emit_insn (gen_pred_store (mode, ops[0], mask, ops[1], len, |
| get_avl_type_rtx (VLMAX))); |
| } |
| } |
| else |
| { |
| if (!satisfies_constraint_K (len)) |
| len = force_reg (Pmode, len); |
| if (is_load) |
| { |
| rtx m_ops[] = {ops[0], mask, RVV_VUNDEF (mode), ops[1]}; |
| emit_nonvlmax_masked_insn (code_for_pred_mov (mode), RVV_UNOP_M, |
| m_ops, len); |
| } |
| else |
| emit_insn (gen_pred_store (mode, ops[0], mask, ops[1], len, |
| get_avl_type_rtx (NONVLMAX))); |
| } |
| } |
| |
| } // namespace riscv_vector |