| /* Cost model implementation for RISC-V 'V' Extension for GNU compiler. |
| Copyright (C) 2023-2025 Free Software Foundation, Inc. |
| Contributed by Juzhe Zhong (juzhe.zhong@rivai.ai), RiVAI Technologies Ltd. |
| |
| This file is part of GCC. |
| |
| GCC is free software; you can redistribute it and/or modify |
| it under the terms of the GNU General Public License as published by |
| the Free Software Foundation; either version 3, or (at your option) |
| any later version. |
| |
| GCC is distributed in the hope that it will be useful, |
| but WITHOUT ANY WARRANTY; without even the implied warranty of |
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| GNU General Public License for more details. |
| |
| You should have received a copy of the GNU General Public License |
| along with GCC; see the file COPYING3. If not see |
| <http://www.gnu.org/licenses/>. */ |
| |
| #define IN_TARGET_CODE 1 |
| |
| #define INCLUDE_STRING |
| #include "config.h" |
| #include "system.h" |
| #include "coretypes.h" |
| #include "tm.h" |
| #include "target.h" |
| #include "function.h" |
| #include "tree.h" |
| #include "basic-block.h" |
| #include "rtl.h" |
| #include "gimple.h" |
| #include "targhooks.h" |
| #include "cfgloop.h" |
| #include "fold-const.h" |
| #include "tm_p.h" |
| #include "tree-vectorizer.h" |
| #include "gimple-iterator.h" |
| #include "bitmap.h" |
| #include "ssa.h" |
| #include "backend.h" |
| #include "tree-data-ref.h" |
| #include "tree-ssa-loop-niter.h" |
| #include "tree-hash-traits.h" |
| |
| /* This file should be included last. */ |
| #include "riscv-vector-costs.h" |
| |
| namespace riscv_vector { |
| |
| /* Dynamic LMUL philosophy - Local linear-scan SSA live range based analysis
| determines LMUL.
|
| - Collect all vectorized STMTs locally for each loop block.
| - Build a program point based graph, ignoring non-vectorized STMTs:
|
| vectorized STMT 0 - point 0
| scalar STMT 0 - ignore.
| vectorized STMT 1 - point 1
| ...
| - Compute the number of V_REGs live at each program point.
| - Determine LMUL in the VECTOR COST model according to the program point
| which has the maximum number of live V_REGs.
|
| Note:
|
| - BIGGEST_MODE is the biggest LMUL auto-vectorization element mode.
| It's important for mixed-size auto-vectorization (conversions, ... etc).
| E.g. for a loop vectorizing a conversion of INT32 -> INT64,
| the biggest mode is DImode with LMUL = 8, while SImode uses LMUL = 4.
| We compute the number of live V_REGs at each program point according to
| this information.
| - We only compute program points and live ranges locally (within a block)
| since we just need to compute the number of live V_REGs at each program
| point and we are not really allocating the registers for each SSA.
| We can give a variable another local live range in another block
| if it is live out of/live into another block. Such an approach doesn't
| affect the accuracy of our live range analysis.
| - The current analysis doesn't consider instruction scheduling, which
| might reduce the register pressure, so the analysis is conservative and
| may end up with a smaller LMUL.
| TODO: Maybe we could support a reasonable live range shrinking algorithm
| which takes advantage of instruction scheduling.
| - We may have the following possible autovec mode analysis orders:
| |
| 1. M8 -> M4 -> M2 -> M1 (stop analysis here) -> MF2 -> MF4 -> MF8 |
| 2. M8 -> M1(M4) -> MF2(M2) -> MF4(M1) (stop analysis here) -> MF8(MF2) |
| 3. M1(M8) -> MF2(M4) -> MF4(M2) -> MF8(M1) |
| */ |
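|
| /* For example (the numbers here are only an illustration of the
| computation below): with the 32 RVV vector registers as the budget,
| suppose BIGGEST_MODE is DImode and the LMUL = 8 analysis finds a program
| point where 5 DImode values are live at once (with a single rgroup
| control). Each DImode value then occupies 8 V_REGs, so 5 * 8 = 40 > 32
| and spills are expected; a smaller LMUL candidate for which the same
| point fits, e.g. LMUL = 4 with 5 * 4 = 20 <= 32, is preferred instead. */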
| |
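| /* Return true if STMT is a gimple assignment or a gimple call. */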
| static bool |
| is_gimple_assign_or_call (gimple *stmt) |
| { |
| return is_gimple_assign (stmt) || is_gimple_call (stmt); |
| } |
| |
| /* Return the program point of the first vectorized lanes statement. */
| static unsigned int |
| get_first_lane_point (const vec<stmt_point> program_points, |
| stmt_vec_info stmt_info) |
| { |
| for (const auto program_point : program_points) |
| if (program_point.stmt_info == DR_GROUP_FIRST_ELEMENT (stmt_info)) |
| return program_point.point; |
| return 0; |
| } |
| |
| /* Return the program point of the last vectorized lanes statement. */
| static unsigned int |
| get_last_lane_point (const vec<stmt_point> program_points, |
| stmt_vec_info stmt_info) |
| { |
| unsigned int max_point = 0; |
| for (auto s = DR_GROUP_FIRST_ELEMENT (stmt_info); s != NULL; |
| s = DR_GROUP_NEXT_ELEMENT (s)) |
| { |
| for (const auto program_point : program_points) |
| if (program_point.stmt_info == s && program_point.point > max_point) |
| max_point = program_point.point; |
| } |
| return max_point; |
| } |
| |
| /* Return the live range of ARG, looking through conversion statements
| if ARG itself has no recorded live range. If no live range is found,
| insert an empty range for ARG and return it. */
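| /* For example, given a conversion statement such as _2 = (long) _1,
| a query for _2 that finds no recorded live range walks through the
| conversion and returns the live range recorded for _1 instead. */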
| static pair * |
| get_live_range (hash_map<tree, pair> *live_ranges, tree arg) |
| { |
| auto *r = live_ranges->get (arg); |
| if (r) |
| return r; |
| else |
| { |
| tree t = arg; |
| gimple *def_stmt = NULL; |
| while (t && TREE_CODE (t) == SSA_NAME && !r |
| && (def_stmt = SSA_NAME_DEF_STMT (t))) |
| { |
| if (gimple_assign_cast_p (def_stmt)) |
| { |
| t = gimple_assign_rhs1 (def_stmt); |
| r = live_ranges->get (t); |
| def_stmt = NULL; |
| } |
| else |
| /* FIXME: Currently we don't see any fold for |
| non-conversion statements. */ |
| t = NULL_TREE; |
| } |
| if (r) |
| return r; |
| else |
| { |
| bool insert_p = live_ranges->put (arg, pair (0, 0)); |
| gcc_assert (!insert_p); |
| return live_ranges->get (arg); |
| } |
| } |
| } |
| |
| /* Collect all STMTs that are vectorized and compute their program points. |
| Note that we don't care about the STMTs that are not vectorized and |
| we only build the local graph (within a block) of program points. |
| |
| Loop: |
| bb 2: |
| STMT 1 (vectorized) -- point 0
| STMT 2 (not vectorized) -- ignored
| STMT 3 (vectorized) -- point 1
| STMT 4 (vectorized) -- point 2
| STMT 5 (vectorized) -- point 3
| ...
| bb 3:
| STMT 1 (vectorized) -- point 0
| STMT 2 (vectorized) -- point 1
| STMT 3 (not vectorized) -- ignored
| STMT 4 (not vectorized) -- ignored
| STMT 5 (vectorized) -- point 2
| ... |
| */ |
| void |
| costs::compute_local_program_points ( |
| vec_info *vinfo, |
| hash_map<basic_block, vec<stmt_point>> &program_points_per_bb) |
| { |
| if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo)) |
| { |
| class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
| basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo); |
| unsigned int nbbs = loop->num_nodes; |
| gimple_stmt_iterator si; |
| unsigned int i; |
| /* Collect the stmts that are vectorized and mark their program points. */
| for (i = 0; i < nbbs; i++) |
| { |
| unsigned int point = 1; |
| basic_block bb = bbs[i]; |
| vec<stmt_point> program_points = vNULL; |
| if (dump_enabled_p ()) |
| dump_printf_loc (MSG_NOTE, vect_location, |
| "Compute local program points for bb %d:\n", |
| bb->index); |
| for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si)) |
| { |
| if (!is_gimple_assign_or_call (gsi_stmt (si))) |
| continue; |
| stmt_vec_info stmt_info = vinfo->lookup_stmt (gsi_stmt (si)); |
| if (STMT_VINFO_RELEVANT_P (stmt_info)) |
| { |
| stmt_point info = {point, gsi_stmt (si), stmt_info}; |
| program_points.safe_push (info); |
| point++; |
| if (dump_enabled_p ()) |
| dump_printf_loc (MSG_NOTE, vect_location, |
| "program point %d: %G", info.point, |
| gsi_stmt (si)); |
| } |
| |
| /* If the statement is part of a pattern, also add the other |
| pattern statements. */ |
| gimple_seq pattern_def_seq; |
| if (STMT_VINFO_IN_PATTERN_P (stmt_info) |
| && (pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info))) |
| { |
| gimple_stmt_iterator si2; |
| |
| for (si2 = gsi_start (pattern_def_seq); |
| !gsi_end_p (si2); |
| gsi_next (&si2)) |
| { |
| stmt_vec_info pattern_def_stmt_info |
| = vinfo->lookup_stmt (gsi_stmt (si2)); |
| if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info) |
| || STMT_VINFO_LIVE_P (pattern_def_stmt_info)) |
| { |
| stmt_point info = {point, gsi_stmt (si2), |
| pattern_def_stmt_info}; |
| program_points.safe_push (info); |
| point++; |
| if (dump_enabled_p ()) |
| dump_printf_loc (MSG_NOTE, vect_location, |
| "program point %d: %G", |
| info.point, gsi_stmt (si2)); |
| } |
| } |
| } |
| } |
| program_points_per_bb.put (bb, program_points); |
| } |
| } |
| } |
| |
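| /* Return whichever of MODE1 and MODE2 has the larger bit size. */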
| static machine_mode |
| get_biggest_mode (machine_mode mode1, machine_mode mode2) |
| { |
| unsigned int mode1_size = GET_MODE_BITSIZE (mode1).to_constant (); |
| unsigned int mode2_size = GET_MODE_BITSIZE (mode2).to_constant (); |
| return mode1_size >= mode2_size ? mode1 : mode2; |
| } |
| |
| /* Return true if OP is invariant in LOOP. */
| |
| static bool |
| loop_invariant_op_p (class loop *loop, |
| tree op) |
| { |
| if (is_gimple_constant (op)) |
| return true; |
| if (SSA_NAME_IS_DEFAULT_DEF (op) |
| || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (op)))) |
| return true; |
| return false; |
| } |
| |
| /* Return true if the variable VAR should be counted into the liveness
| analysis, i.e. it will occupy a vector register. */
| static bool |
| variable_vectorized_p (class loop *loop, stmt_vec_info stmt_info, |
| slp_tree node, tree var, bool lhs_p) |
| { |
| if (!var) |
| return false; |
| gimple *stmt = STMT_VINFO_STMT (stmt_info); |
| stmt_info = vect_stmt_to_vectorize (stmt_info); |
| enum stmt_vec_info_type type = SLP_TREE_TYPE (node); |
| if (is_gimple_call (stmt) && gimple_call_internal_p (stmt)) |
| { |
| if (gimple_call_internal_fn (stmt) == IFN_MASK_STORE |
| || gimple_call_internal_fn (stmt) == IFN_MASK_LOAD) |
| { |
| /* .MASK_LOAD (_5, 32B, _33) |
| ^ ^ ^ |
| Only the 3rd argument will be vectorized and consume |
| a vector register. */ |
| if (TREE_CODE (TREE_TYPE (var)) == BOOLEAN_TYPE |
| || (is_gimple_reg (var) && !POINTER_TYPE_P (TREE_TYPE (var)))) |
| return true; |
| else |
| return false; |
| } |
| } |
| else if (is_gimple_assign (stmt)) |
| { |
| tree_code tcode = gimple_assign_rhs_code (stmt); |
| /* The vi variant doesn't need a vector register for such an operand.
| E.g. tmp_15 = _4 + 1; will be transformed into vadd.vi,
| so the INTEGER_CST '1' doesn't need a vector register. */
| switch (tcode) |
| { |
| case PLUS_EXPR: |
| case BIT_IOR_EXPR: |
| case BIT_XOR_EXPR: |
| case BIT_AND_EXPR: |
| return TREE_CODE (var) != INTEGER_CST |
| || !tree_fits_shwi_p (var) |
| || !IN_RANGE (tree_to_shwi (var), -16, 15); |
| case MINUS_EXPR: |
| return TREE_CODE (var) != INTEGER_CST |
| || !tree_fits_shwi_p (var) |
| || !IN_RANGE (tree_to_shwi (var), -16, 15) |
| || gimple_assign_rhs1 (stmt) != var; |
| case LSHIFT_EXPR: |
| case RSHIFT_EXPR: |
| return gimple_assign_rhs2 (stmt) != var |
| || !loop_invariant_op_p (loop, var); |
| default: |
| break; |
| } |
| } |
| |
| if (lhs_p) |
| return is_gimple_reg (var) |
| && (!POINTER_TYPE_P (TREE_TYPE (var)) |
| || type != store_vec_info_type); |
| else |
| return poly_int_tree_p (var) |
| || (is_gimple_val (var) |
| && (!POINTER_TYPE_P (TREE_TYPE (var)) |
| || type != load_vec_info_type)); |
| } |
| |
| /* Compute local live ranges of each vectorized variable. |
| Note that we only compute local live ranges (within a block) since |
| local live range information is accurate enough for us to determine
| the LMUL/vectorization factor of the loop. |
| |
| Loop: |
| bb 2: |
| STMT 1 -- point 0 |
| STMT 2 (def SSA 1) -- point 1 |
| STMT 3 (use SSA 1) -- point 2 |
| STMT 4 -- point 3 |
| bb 3: |
| STMT 1 -- point 0 |
| STMT 2 -- point 1 |
| STMT 3 -- point 2 |
| STMT 4 (use SSA 2) -- point 3 |
| |
| The live range of SSA 1 is [1, 3] in bb 2. |
| The live range of SSA 2 is [0, 4] in bb 3. */ |
| machine_mode |
| costs::compute_local_live_ranges ( |
| loop_vec_info loop_vinfo, |
| const hash_map<basic_block, vec<stmt_point>> &program_points_per_bb, |
| hash_map<basic_block, hash_map<tree, pair>> &live_ranges_per_bb) |
| { |
| machine_mode biggest_mode = QImode; |
| class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
| if (!program_points_per_bb.is_empty ()) |
| { |
| auto_vec<tree> visited_vars; |
| unsigned int i; |
| for (hash_map<basic_block, vec<stmt_point>>::iterator iter |
| = program_points_per_bb.begin (); |
| iter != program_points_per_bb.end (); ++iter) |
| { |
| basic_block bb = (*iter).first; |
| vec<stmt_point> program_points = (*iter).second; |
| bool existed_p = false; |
| hash_map<tree, pair> *live_ranges |
| = &live_ranges_per_bb.get_or_insert (bb, &existed_p); |
| gcc_assert (!existed_p); |
| if (dump_enabled_p ()) |
| dump_printf_loc (MSG_NOTE, vect_location, |
| "Compute local live ranges for bb %d:\n", |
| bb->index); |
| for (const auto program_point : program_points) |
| { |
| unsigned int point = program_point.point; |
| gimple *stmt = program_point.stmt; |
| tree lhs = gimple_get_lhs (stmt); |
| slp_tree *node = vinfo_slp_map.get (program_point.stmt_info); |
| if (!node) |
| continue; |
| if (variable_vectorized_p (loop, program_point.stmt_info, |
| *node, lhs, true)) |
| { |
| biggest_mode = get_biggest_mode (biggest_mode, |
| TYPE_MODE (TREE_TYPE (lhs))); |
| bool existed_p = false; |
| pair &live_range |
| = live_ranges->get_or_insert (lhs, &existed_p); |
| gcc_assert (!existed_p); |
| if (SLP_TREE_MEMORY_ACCESS_TYPE (*node) |
| == VMAT_LOAD_STORE_LANES) |
| point = get_first_lane_point (program_points, |
| program_point.stmt_info); |
| live_range = pair (point, point); |
| } |
| for (i = 0; i < gimple_num_args (stmt); i++) |
| { |
| tree var = gimple_arg (stmt, i); |
| if (variable_vectorized_p (loop, program_point.stmt_info, |
| *node, var, false)) |
| { |
| biggest_mode |
| = get_biggest_mode (biggest_mode, |
| TYPE_MODE (TREE_TYPE (var))); |
| bool existed_p = false; |
| pair &live_range |
| = live_ranges->get_or_insert (var, &existed_p); |
| if (SLP_TREE_MEMORY_ACCESS_TYPE (*node) |
| == VMAT_LOAD_STORE_LANES) |
| point = get_last_lane_point (program_points, |
| program_point.stmt_info); |
| else if (existed_p) |
| point = MAX (live_range.second, point); |
| if (existed_p) |
| /* We will grow the live range for each use. */ |
| live_range = pair (live_range.first, point); |
| else |
| { |
| gimple *def_stmt; |
| if (TREE_CODE (var) == SSA_NAME |
| && (def_stmt = SSA_NAME_DEF_STMT (var)) |
| && gimple_bb (def_stmt) == bb |
| && is_gimple_assign_or_call (def_stmt)) |
| { |
| live_ranges->remove (var); |
| for (unsigned int j = 0; |
| j < gimple_num_args (def_stmt); j++) |
| { |
| tree arg = gimple_arg (def_stmt, j); |
| auto *r = get_live_range (live_ranges, arg); |
| gcc_assert (r); |
| (*r).second = MAX (point, (*r).second); |
| biggest_mode = get_biggest_mode ( |
| biggest_mode, TYPE_MODE (TREE_TYPE (arg))); |
| } |
| } |
| else |
| /* The splat vector lives the whole block. */ |
| live_range = pair (0, program_points.length ()); |
| } |
| } |
| } |
| } |
| if (dump_enabled_p ()) |
| for (hash_map<tree, pair>::iterator iter = live_ranges->begin (); |
| iter != live_ranges->end (); ++iter) |
| dump_printf_loc (MSG_NOTE, vect_location, |
| "%T: type = %T, start = %d, end = %d\n", |
| (*iter).first, TREE_TYPE ((*iter).first), |
| (*iter).second.first, (*iter).second.second); |
| } |
| } |
| if (dump_enabled_p ()) |
| dump_printf_loc (MSG_NOTE, vect_location, "Biggest mode = %s\n", |
| GET_MODE_NAME (biggest_mode)); |
| return biggest_mode; |
| } |
| |
| /* Compute the number of V_REGs occupied by a value of MODE, where
| BIGGEST_MODE is the widest element mode in the loop and LMUL is the
| LMUL chosen for BIGGEST_MODE.
|
| E.g. if mode = SImode, biggest_mode = DImode and LMUL = 4, a value of
| SImode only needs half of the register group, i.e. 2 V_REGs (further
| multiplied by the number of rgroup controls). */
| static unsigned int |
| compute_nregs_for_mode (loop_vec_info loop_vinfo, machine_mode mode, |
| machine_mode biggest_mode, int lmul) |
| { |
| unsigned int rgroup_size = LOOP_VINFO_LENS (loop_vinfo).is_empty () |
| ? 1 |
| : LOOP_VINFO_LENS (loop_vinfo).length (); |
| unsigned int mode_size = GET_MODE_SIZE (mode).to_constant (); |
| unsigned int biggest_size = GET_MODE_SIZE (biggest_mode).to_constant (); |
| gcc_assert (biggest_size >= mode_size); |
| unsigned int ratio = biggest_size / mode_size; |
| /* RVV mask bool modes always consume 1 vector register regardless of LMUL. */
| unsigned int nregs = mode == BImode ? 1 : lmul / ratio; |
| return MAX (nregs, 1) * rgroup_size; |
| } |
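|
| /* As an illustration of the computation above: with biggest_mode = DImode
| and lmul = 8, a DImode value needs 8 V_REGs, SImode 4, HImode 2 and
| QImode 1, while a BImode mask always needs 1; each count is further
| multiplied by the number of rgroup controls when the loop uses
| length-based partial vectors. */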
| |
| /* This function helps to determine whether the current LMUL will cause
| potential vector register (V_REG) spills according to live range
| information.
|
| - First, compute how many variables are live at each program point
| in each bb of the loop.
| - Second, compute how many V_REGs are live at each program point
| in each bb of the loop according to BIGGEST_MODE and the variable
| modes.
| - Third, return the maximum number of V_REGs live in the loop. */
| static unsigned int |
| max_number_of_live_regs (loop_vec_info loop_vinfo, const basic_block bb, |
| const hash_map<tree, pair> &live_ranges, |
| unsigned int max_point, machine_mode biggest_mode, |
| int lmul) |
| { |
| unsigned int max_nregs = 0; |
| unsigned int i; |
| unsigned int live_point = 0; |
| auto_vec<unsigned int> live_vars_vec; |
| live_vars_vec.safe_grow_cleared (max_point, true); |
| for (hash_map<tree, pair>::iterator iter = live_ranges.begin (); |
| iter != live_ranges.end (); ++iter) |
| { |
| tree var = (*iter).first; |
| pair live_range = (*iter).second; |
| for (i = live_range.first + 1; i <= live_range.second; i++) |
| { |
| machine_mode mode; |
| if (TREE_CODE (TREE_TYPE (var)) == BOOLEAN_TYPE) |
| mode = BImode; |
| /* Constants do not have a mode, just use the biggest so |
| compute_nregs will return 1. */ |
| else if (TREE_CODE (var) == INTEGER_CST) |
| mode = biggest_mode; |
| else |
| mode = TYPE_MODE (TREE_TYPE (var)); |
| unsigned int nregs |
| = compute_nregs_for_mode (loop_vinfo, mode, biggest_mode, lmul); |
| live_vars_vec[i] += nregs; |
| if (live_vars_vec[i] > max_nregs) |
| { |
| max_nregs = live_vars_vec[i]; |
| live_point = i; |
| } |
| } |
| } |
| |
| /* Collect the registers used by explicit user RVV types that are live in BB. */
| auto_vec<basic_block> all_preds |
| = get_all_dominated_blocks (CDI_POST_DOMINATORS, bb); |
| tree t; |
| FOR_EACH_SSA_NAME (i, t, cfun) |
| { |
| machine_mode mode = TYPE_MODE (TREE_TYPE (t)); |
| if (!lookup_vector_type_attribute (TREE_TYPE (t)) |
| && !riscv_v_ext_vls_mode_p (mode)) |
| continue; |
| |
| gimple *def = SSA_NAME_DEF_STMT (t); |
| if (gimple_bb (def) && !all_preds.contains (gimple_bb (def))) |
| continue; |
| use_operand_p use_p; |
| imm_use_iterator iterator; |
| |
| FOR_EACH_IMM_USE_FAST (use_p, iterator, t) |
| { |
| if (!USE_STMT (use_p) || is_gimple_debug (USE_STMT (use_p)) |
| || !dominated_by_p (CDI_POST_DOMINATORS, bb, |
| gimple_bb (USE_STMT (use_p)))) |
| continue; |
| |
| int regno_alignment = riscv_get_v_regno_alignment (mode); |
| max_nregs += regno_alignment; |
| if (dump_enabled_p ()) |
| dump_printf_loc ( |
| MSG_NOTE, vect_location, |
| "Explicit used SSA %T, vectype = %T, mode = %s, cause %d " |
| "V_REG live in bb %d at program point %d\n", |
| t, TREE_TYPE (t), GET_MODE_NAME (mode), regno_alignment, |
| bb->index, live_point); |
| break; |
| } |
| } |
| |
| if (dump_enabled_p ()) |
| dump_printf_loc ( |
| MSG_NOTE, vect_location, |
| "Maximum lmul = %d, At most %d number of live V_REG at program " |
| "point %d for bb %d\n", |
| lmul, max_nregs, live_point, bb->index); |
| return max_nregs; |
| } |
| |
| /* Return the value stored by STMT. */
| static tree |
| get_store_value (gimple *stmt) |
| { |
| if (is_gimple_call (stmt) && gimple_call_internal_p (stmt)) |
| { |
| if (gimple_call_internal_fn (stmt) == IFN_MASK_STORE) |
| return gimple_call_arg (stmt, 3); |
| else |
| gcc_unreachable (); |
| } |
| else |
| return gimple_assign_rhs1 (stmt); |
| } |
| |
| /* Return true if the vectorization of STMT_INFO requires additional
| vector variables. */
| bool |
| costs::need_additional_vector_vars_p (stmt_vec_info stmt_info, |
| slp_tree node) |
| { |
| enum stmt_vec_info_type type = SLP_TREE_TYPE (node); |
| if (type == load_vec_info_type || type == store_vec_info_type) |
| { |
| if (STMT_VINFO_GATHER_SCATTER_P (stmt_info) |
| && mat_gather_scatter_p (SLP_TREE_MEMORY_ACCESS_TYPE (node))) |
| return true; |
| |
| machine_mode mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info)); |
| int lmul = riscv_get_v_regno_alignment (mode); |
| if (DR_GROUP_SIZE (stmt_info) * lmul > RVV_M8) |
| return true; |
| } |
| return false; |
| } |
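|
| /* For example, a gather/scatter access needs an extra vector of offsets
| or permute indices, and a load/store-lanes group whose DR_GROUP_SIZE
| times the register-group size of its vectype exceeds 8 cannot be lowered
| to a single RVV segment access (which requires NFIELDS * LMUL <= 8), so
| additional shuffling variables are expected. */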
| |
| /* Return the estimated LMUL for MODE used by the current analysis. */
| static int |
| compute_estimated_lmul (loop_vec_info loop_vinfo, machine_mode mode) |
| { |
| gcc_assert (GET_MODE_BITSIZE (mode).is_constant ()); |
| int regno_alignment = riscv_get_v_regno_alignment (loop_vinfo->vector_mode); |
| if (riscv_v_ext_vls_mode_p (loop_vinfo->vector_mode)) |
| return regno_alignment; |
| else |
| { |
| int estimated_vf = vect_vf_for_cost (loop_vinfo); |
| int estimated_lmul = estimated_vf * GET_MODE_BITSIZE (mode).to_constant () |
| / TARGET_MIN_VLEN; |
| if (estimated_lmul > RVV_M8) |
| return RVV_M8; |
| else |
| return estimated_lmul; |
| } |
| return 0; |
| } |
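|
| /* For instance, assuming TARGET_MIN_VLEN = 128 (an assumption purely for
| illustration), compute_estimated_lmul above gives LMUL 4 for an estimated
| VF of 16 with a 32-bit biggest element mode, since 16 * 32 / 128 = 4;
| estimates above 8 are clamped to RVV_M8. */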
| |
| /* Update the live ranges according to PHIs.
| |
| Loop: |
| bb 2: |
| STMT 1 -- point 0 |
| STMT 2 (def SSA 1) -- point 1 |
| STMT 3 (use SSA 1) -- point 2 |
| STMT 4 -- point 3 |
| bb 3: |
| SSA 2 = PHI<SSA 1> |
| STMT 1 -- point 0 |
| STMT 2 -- point 1 |
| STMT 3 (use SSA 2) -- point 2 |
| STMT 4 -- point 3 |
| |
| Before this function, the SSA 1 live range is [2, 3] in bb 2 |
| and SSA 2 is [0, 3] in bb 3. |
| |
| Then, after this function, we update the live range of SSA 1 in bb 2
| to [2, 4] since SSA 1 is live out into bb 3. */
| void |
| costs::update_local_live_ranges ( |
| vec_info *vinfo, |
| hash_map<basic_block, vec<stmt_point>> &program_points_per_bb, |
| hash_map<basic_block, hash_map<tree, pair>> &live_ranges_per_bb, |
| machine_mode *biggest_mode) |
| { |
| loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo); |
| if (!loop_vinfo) |
| return; |
| |
| class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
| basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo); |
| unsigned int nbbs = loop->num_nodes; |
| unsigned int i, j; |
| gphi_iterator psi; |
| gimple_stmt_iterator si; |
| for (i = 0; i < nbbs; i++) |
| { |
| basic_block bb = bbs[i]; |
| if (dump_enabled_p ()) |
| dump_printf_loc (MSG_NOTE, vect_location, |
| "Update local program points for bb %d:\n", |
| bbs[i]->index); |
| for (psi = gsi_start_phis (bb); !gsi_end_p (psi); gsi_next (&psi)) |
| { |
| gphi *phi = psi.phi (); |
| stmt_vec_info stmt_info = vinfo->lookup_stmt (phi); |
| stmt_info = vect_stmt_to_vectorize (stmt_info); |
| slp_tree *node = vinfo_slp_map.get (stmt_info); |
| |
| if (!node) |
| continue; |
| |
| if (SLP_TREE_TYPE (*node) == undef_vec_info_type) |
| continue; |
| |
| for (j = 0; j < gimple_phi_num_args (phi); j++) |
| { |
| edge e = gimple_phi_arg_edge (phi, j); |
| tree def = gimple_phi_arg_def (phi, j); |
| auto *live_ranges = live_ranges_per_bb.get (bb); |
| auto *live_range = live_ranges->get (def); |
| if (poly_int_tree_p (def)) |
| { |
| /* Insert live range of INTEGER_CST or POLY_CST since we will |
| need to allocate a vector register for it. |
| |
| E.g. # j_17 = PHI <j_11(9), 0(5)> will be transformed |
| into # vect_vec_iv_.8_24 = PHI <_25(9), { 0, ... }(5)> |
| |
| The live range of such a value is short, spanning only
| program points 0 to 1. */
| if (live_range) |
| { |
| unsigned int start = (*live_range).first; |
| (*live_range).first = 0; |
| if (dump_enabled_p ()) |
| dump_printf_loc ( |
| MSG_NOTE, vect_location, |
| "Update %T start point from %d to 0:\n", def, start); |
| } |
| else |
| { |
| live_ranges->put (def, pair (0, 1)); |
| auto &program_points = (*program_points_per_bb.get (bb)); |
| if (program_points.is_empty ()) |
| { |
| stmt_point info = {1, phi, stmt_info}; |
| program_points.safe_push (info); |
| } |
| if (dump_enabled_p ()) |
| dump_printf_loc (MSG_NOTE, vect_location, |
| "Add %T start point from 0 to 1:\n", |
| def); |
| } |
| continue; |
| } |
| if (live_range && flow_bb_inside_loop_p (loop, e->src)) |
| { |
| unsigned int start = (*live_range).first; |
| (*live_range).first = 0; |
| if (dump_enabled_p ()) |
| dump_printf_loc (MSG_NOTE, vect_location, |
| "Update %T start point from %d to %d:\n", |
| def, start, (*live_range).first); |
| } |
| live_ranges = live_ranges_per_bb.get (e->src); |
| if (!program_points_per_bb.get (e->src)) |
| continue; |
| unsigned int max_point |
| = (*program_points_per_bb.get (e->src)).length (); |
| live_range = live_ranges->get (def); |
| if (!live_range) |
| continue; |
| |
| unsigned int end = (*live_range).second; |
| (*live_range).second = max_point; |
| if (dump_enabled_p ()) |
| dump_printf_loc (MSG_NOTE, vect_location, |
| "Update %T end point from %d to %d:\n", def, |
| end, (*live_range).second); |
| } |
| } |
| for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si)) |
| { |
| if (!is_gimple_assign_or_call (gsi_stmt (si))) |
| continue; |
| stmt_vec_info stmt_info = vinfo->lookup_stmt (gsi_stmt (si)); |
| stmt_info = vect_stmt_to_vectorize (stmt_info); |
| slp_tree *node = vinfo_slp_map.get (stmt_info); |
| if (!node) |
| continue; |
| enum stmt_vec_info_type type = SLP_TREE_TYPE (*node); |
| if (need_additional_vector_vars_p (stmt_info, *node)) |
| { |
| /* For non-adjacent load/store STMT, we will potentially |
| convert it into: |
| |
| 1. MASK_LEN_GATHER_LOAD (..., perm indices).
| 2. Contiguous load/store + VEC_PERM (..., perm indices).
|
| We will likely need one more vector variable. */
| unsigned int max_point |
| = (*program_points_per_bb.get (bb)).length (); |
| auto *live_ranges = live_ranges_per_bb.get (bb); |
| bool existed_p = false; |
| tree var = type == load_vec_info_type |
| ? gimple_get_lhs (gsi_stmt (si)) |
| : get_store_value (gsi_stmt (si)); |
| tree sel_type = build_nonstandard_integer_type ( |
| TYPE_PRECISION (TREE_TYPE (var)), 1); |
| *biggest_mode |
| = get_biggest_mode (*biggest_mode, TYPE_MODE (sel_type)); |
| tree sel = build_decl (UNKNOWN_LOCATION, VAR_DECL, |
| get_identifier ("vect_perm"), sel_type); |
| pair &live_range = live_ranges->get_or_insert (sel, &existed_p); |
| gcc_assert (!existed_p); |
| live_range = pair (0, max_point); |
| if (dump_enabled_p ()) |
| dump_printf_loc (MSG_NOTE, vect_location, |
| "Add perm indice %T, start = 0, end = %d\n", |
| sel, max_point); |
| if (!LOOP_VINFO_LENS (loop_vinfo).is_empty () |
| && LOOP_VINFO_LENS (loop_vinfo).length () > 1) |
| { |
| /* If we are vectorizing a permutation and the rgroup number
| is > 1, we will need an additional mask to shuffle the second
| vector. */
| tree mask = build_decl (UNKNOWN_LOCATION, VAR_DECL, |
| get_identifier ("vect_perm_mask"), |
| boolean_type_node); |
| pair &live_range |
| = live_ranges->get_or_insert (mask, &existed_p); |
| gcc_assert (!existed_p); |
| live_range = pair (0, max_point); |
| if (dump_enabled_p ()) |
| dump_printf_loc (MSG_NOTE, vect_location, |
| "Add perm mask %T, start = 0, end = %d\n", |
| mask, max_point); |
| } |
| } |
| } |
| } |
| } |
| |
| /* Return true if the maximum number of live V_REGs under the estimated
| LMUL is expected to exceed the available vector registers, i.e. spills
| are likely. */
| bool |
| costs::has_unexpected_spills_p (loop_vec_info loop_vinfo) |
| { |
| /* Compute local program points. |
| It's a fast and effective computation. */ |
| hash_map<basic_block, vec<stmt_point>> program_points_per_bb; |
| compute_local_program_points (loop_vinfo, program_points_per_bb); |
| |
| /* Compute local live ranges. */ |
| hash_map<basic_block, hash_map<tree, pair>> live_ranges_per_bb; |
| machine_mode biggest_mode |
| = compute_local_live_ranges (loop_vinfo, program_points_per_bb, |
| live_ranges_per_bb); |
| |
| /* Update live ranges according to PHI. */ |
| update_local_live_ranges (loop_vinfo, program_points_per_bb, |
| live_ranges_per_bb, &biggest_mode); |
| |
| int lmul = compute_estimated_lmul (loop_vinfo, biggest_mode); |
| gcc_assert (lmul <= RVV_M8); |
| /* TODO: We calculate the maximum number of live vars based on the current
| STMT sequence. We could support live range shrinking in the future if
| it gives a big improvement. */
| if (lmul > RVV_M1) |
| { |
| if (!live_ranges_per_bb.is_empty ()) |
| { |
| unsigned int max_nregs = 0; |
| for (hash_map<basic_block, hash_map<tree, pair>>::iterator iter |
| = live_ranges_per_bb.begin (); |
| iter != live_ranges_per_bb.end (); ++iter) |
| { |
| basic_block bb = (*iter).first; |
| unsigned int max_point |
| = (*program_points_per_bb.get (bb)).length () + 1; |
| if ((*iter).second.is_empty ()) |
| continue; |
| /* We prefer larger LMUL unless it causes register spillings. */ |
| unsigned int nregs |
| = max_number_of_live_regs (loop_vinfo, bb, (*iter).second, |
| max_point, biggest_mode, lmul); |
| if (nregs > max_nregs) |
| max_nregs = nregs; |
| } |
| live_ranges_per_bb.empty (); |
| if (max_nregs > V_REG_NUM) |
| return true; |
| } |
| } |
| if (!program_points_per_bb.is_empty ()) |
| { |
| for (hash_map<basic_block, vec<stmt_point>>::iterator iter |
| = program_points_per_bb.begin (); |
| iter != program_points_per_bb.end (); ++iter) |
| { |
| vec<stmt_point> program_points = (*iter).second; |
| if (!program_points.is_empty ()) |
| program_points.release (); |
| } |
| program_points_per_bb.empty (); |
| } |
| return false; |
| } |
| |
| costs::costs (vec_info *vinfo, bool costing_for_scalar) |
| : vector_costs (vinfo, costing_for_scalar) |
| { |
| if (costing_for_scalar) |
| m_cost_type = SCALAR_COST; |
| else if (riscv_v_ext_vector_mode_p (vinfo->vector_mode)) |
| m_cost_type = VLA_VECTOR_COST; |
| else |
| m_cost_type = VLS_VECTOR_COST; |
| } |
| |
| /* Do one-time initialization of the costs given that we're |
| costing the loop vectorization described by LOOP_VINFO. */ |
| void |
| costs::analyze_loop_vinfo (loop_vec_info loop_vinfo) |
| { |
| /* Detect whether we're vectorizing for VLA and should apply the unrolling |
| heuristic described above m_unrolled_vls_niters. */ |
| record_potential_vls_unrolling (loop_vinfo); |
| } |
| |
| void |
| costs::record_lmul_spills (loop_vec_info loop_vinfo) |
| { |
| /* Detect whether the LOOP has unexpected spills. */ |
| record_potential_unexpected_spills (loop_vinfo); |
| } |
| |
| /* Analyze the vectorized program statements and use dynamic LMUL |
| heuristic to detect whether the loop has unexpected spills. */ |
| void |
| costs::record_potential_unexpected_spills (loop_vec_info loop_vinfo) |
| { |
| /* We only want to apply the heuristic if LOOP_VINFO is being
| vectorized for VLA, or for VLS with a known NITERS. */
| if (rvv_max_lmul == RVV_DYNAMIC |
| && (m_cost_type == VLA_VECTOR_COST |
| || (m_cost_type == VLS_VECTOR_COST |
| && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)))) |
| { |
| bool post_dom_available_p = dom_info_available_p (CDI_POST_DOMINATORS); |
| if (!post_dom_available_p) |
| calculate_dominance_info (CDI_POST_DOMINATORS); |
| m_has_unexpected_spills_p = has_unexpected_spills_p (loop_vinfo); |
| if (!post_dom_available_p) |
| free_dominance_info (CDI_POST_DOMINATORS); |
| } |
| } |
| |
| /* Decide whether to use the unrolling heuristic described above |
| m_unrolled_vls_niters, updating that field if so. LOOP_VINFO |
| describes the loop that we're vectorizing. */ |
| void |
| costs::record_potential_vls_unrolling (loop_vec_info loop_vinfo) |
| { |
| /* We only want to apply the heuristic if LOOP_VINFO is being |
| vectorized for VLA. */ |
| if (m_cost_type != VLA_VECTOR_COST) |
| return; |
| |
| /* We don't want to apply the heuristic to outer loops, since it's |
| harder to track two levels of unrolling. */ |
| if (LOOP_VINFO_LOOP (loop_vinfo)->inner) |
| return; |
| |
| /* Only handle cases in which the number of VLS iterations
| would be known at compile time but the number of VLA iterations
| would not. */
| if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) |
| || BYTES_PER_RISCV_VECTOR.is_constant ()) |
| return; |
| |
| /* Guess how many times the VLS loop would iterate and make |
| sure that it is within the complete unrolling limit. Even if the |
| number of iterations is small enough, the number of statements might |
| not be, which is why we need to estimate the number of statements too. */ |
| unsigned int vls_vf = vect_vf_for_cost (loop_vinfo); |
| unsigned HOST_WIDE_INT unrolled_vls_niters |
| = LOOP_VINFO_INT_NITERS (loop_vinfo) / vls_vf; |
| if (unrolled_vls_niters > (unsigned int) param_max_completely_peel_times) |
| return; |
| |
| /* Record that we're applying the heuristic and should try to estimate |
| the number of statements in the VLS loop. */ |
| m_unrolled_vls_niters = unrolled_vls_niters; |
| } |
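|
| /* For instance, a loop with NITERS = 17 known at compile time and an
| estimated VLS VF of 8 gives unrolled_vls_niters = 2, which is within the
| complete-unrolling limit; m_unrolled_vls_stmts is then accumulated in
| add_stmt_cost and compared against param_max_completely_peeled_insns in
| prefer_unrolled_loop below. (The numbers are only an illustration.) */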
| |
| /* Return true if (a) we're applying the VLS vs. VLA unrolling |
| heuristic described above m_unrolled_vls_niters and (b) the heuristic |
| says that we should prefer the VLS loop. */ |
| bool |
| costs::prefer_unrolled_loop () const |
| { |
| if (!m_unrolled_vls_stmts) |
| return false; |
| |
| if (dump_enabled_p ()) |
| dump_printf_loc (MSG_NOTE, vect_location, |
| "Number of insns in" |
| " unrolled VLS loop = " HOST_WIDE_INT_PRINT_UNSIGNED "\n", |
| m_unrolled_vls_stmts); |
| |
| /* The balance here is tricky. On the one hand, we can't be sure whether |
| the code is vectorizable with VLS or not. However, even if |
| it isn't vectorizable with VLS, there's a possibility that |
| the scalar code could also be unrolled. Some of the code might then |
| benefit from SLP or from other scalar optimizations. We therefore apply
| the heuristic regardless of can_use_vls_p. */ |
| return (m_unrolled_vls_stmts |
| && (m_unrolled_vls_stmts |
| <= (unsigned int) param_max_completely_peeled_insns)); |
| } |
| |
| bool |
| costs::better_main_loop_than_p (const vector_costs *uncast_other) const |
| { |
| auto other = static_cast<const costs *> (uncast_other); |
| auto this_loop_vinfo = as_a<loop_vec_info> (this->m_vinfo); |
| auto other_loop_vinfo = as_a<loop_vec_info> (other->m_vinfo); |
| |
| if (dump_enabled_p ()) |
| dump_printf_loc (MSG_NOTE, vect_location, |
| "Comparing two main loops (%s at VF %d vs %s at VF %d)\n", |
| GET_MODE_NAME (this_loop_vinfo->vector_mode), |
| vect_vf_for_cost (this_loop_vinfo), |
| GET_MODE_NAME (other_loop_vinfo->vector_mode), |
| vect_vf_for_cost (other_loop_vinfo)); |
| |
| /* Apply the unrolling heuristic described above m_unrolled_vls_niters. */ |
| if (bool (m_unrolled_vls_stmts) != bool (other->m_unrolled_vls_stmts) |
| && m_cost_type != other->m_cost_type) |
| { |
| bool this_prefer_unrolled = this->prefer_unrolled_loop (); |
| bool other_prefer_unrolled = other->prefer_unrolled_loop (); |
| if (this_prefer_unrolled != other_prefer_unrolled) |
| { |
| if (dump_enabled_p ()) |
| dump_printf_loc (MSG_NOTE, vect_location, |
| "Preferring VLS loop because" |
| " it can be unrolled\n"); |
| return other_prefer_unrolled; |
| } |
| } |
| else if (rvv_max_lmul == RVV_DYNAMIC) |
| { |
| if (other->m_has_unexpected_spills_p) |
| { |
| if (dump_enabled_p ()) |
| dump_printf_loc (MSG_NOTE, vect_location, |
| "Preferring smaller LMUL loop because" |
| " it has unexpected spills\n"); |
| return true; |
| } |
| else if (riscv_v_ext_vector_mode_p (other_loop_vinfo->vector_mode)) |
| { |
| if (LOOP_VINFO_NITERS_KNOWN_P (other_loop_vinfo)) |
| { |
| if (maybe_gt (LOOP_VINFO_INT_NITERS (this_loop_vinfo), |
| LOOP_VINFO_VECT_FACTOR (this_loop_vinfo))) |
| { |
| if (dump_enabled_p ()) |
| dump_printf_loc (MSG_NOTE, vect_location, |
| "Keep current LMUL loop because" |
| " known NITERS exceed the new VF\n"); |
| return false; |
| } |
| } |
| else |
| { |
| if (dump_enabled_p ()) |
| dump_printf_loc (MSG_NOTE, vect_location, |
| "Keep current LMUL loop because" |
| " it is unknown NITERS\n"); |
| return false; |
| } |
| } |
| } |
| /* If NITERS is unknown, we should not use VLS modes to vectorize
| the loop since we don't support partial vectors for VLS modes;
| that is, we would have full vectors (VLS modes) in the loop body
| and partial vectors (VLA modes) in the loop epilogue, which is very
| inefficient. Instead, we should apply partial vectors (VLA modes)
| in the loop body without an epilogue when NITERS is unknown. */
| else if (!LOOP_VINFO_NITERS_KNOWN_P (this_loop_vinfo) |
| && m_cost_type == VLS_VECTOR_COST) |
| return false; |
| |
| return vector_costs::better_main_loop_than_p (other); |
| } |
| |
| /* Return the group size, i.e. the number of vectors to be loaded or stored
| by a segment load/store instruction. Return 0 if it is not a segment
| load/store. */
| static int |
| segment_loadstore_group_size (enum vect_cost_for_stmt kind, |
| stmt_vec_info stmt_info, slp_tree node) |
| { |
| if (stmt_info |
| && (kind == vector_load || kind == vector_store) |
| && STMT_VINFO_DATA_REF (stmt_info)) |
| { |
| stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info); |
| if (stmt_info |
| && SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_LOAD_STORE_LANES) |
| return DR_GROUP_SIZE (stmt_info); |
| } |
| return 0; |
| } |
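|
| /* For example, a group of 4 contiguous 32-bit accesses vectorized with
| VMAT_LOAD_STORE_LANES maps to a segment access such as vlseg4e32.v or
| vsseg4e32.v; adjust_stmt_cost below then adds the segment_permute_4 cost
| on top of each vector load/store in the group. */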
| |
| /* Adjust vectorization cost after calling riscv_builtin_vectorization_cost.
| For some statements, we would like to further fine-tune the cost on
| top of the riscv_builtin_vectorization_cost handling, which doesn't have
| any information on statement operation codes etc. */
| |
| unsigned |
| costs::adjust_stmt_cost (enum vect_cost_for_stmt kind, loop_vec_info loop, |
| stmt_vec_info stmt_info, |
| slp_tree node, tree vectype, int stmt_cost) |
| { |
| const cpu_vector_cost *costs = get_vector_costs (); |
| switch (kind) |
| { |
| case scalar_to_vec: |
| stmt_cost |
| += (FLOAT_TYPE_P (vectype) ? get_fr2vr_cost () : get_gr2vr_cost ()); |
| break; |
| case vec_to_scalar: |
| stmt_cost += (FLOAT_TYPE_P (vectype) ? costs->regmove->VR2FR |
| : costs->regmove->VR2GR); |
| break; |
| case vector_load: |
| case vector_store: |
| { |
| if (stmt_info && stmt_info->stmt && STMT_VINFO_DATA_REF (stmt_info)) |
| { |
| /* Segment loads and stores. When the group size is > 1 |
| the vectorizer will add a vector load/store statement for |
| each vector in the group. Here we additionally add permute |
| costs for each. */ |
| /* TODO: Indexed and ordered/unordered cost. */ |
| int group_size = segment_loadstore_group_size (kind, stmt_info, |
| node); |
| if (group_size > 1) |
| { |
| switch (group_size) |
| { |
| case 2: |
| if (riscv_v_ext_vector_mode_p (loop->vector_mode)) |
| stmt_cost += costs->vla->segment_permute_2; |
| else |
| stmt_cost += costs->vls->segment_permute_2; |
| break; |
| case 3: |
| if (riscv_v_ext_vector_mode_p (loop->vector_mode)) |
| stmt_cost += costs->vla->segment_permute_3; |
| else |
| stmt_cost += costs->vls->segment_permute_3; |
| break; |
| case 4: |
| if (riscv_v_ext_vector_mode_p (loop->vector_mode)) |
| stmt_cost += costs->vla->segment_permute_4; |
| else |
| stmt_cost += costs->vls->segment_permute_4; |
| break; |
| case 5: |
| if (riscv_v_ext_vector_mode_p (loop->vector_mode)) |
| stmt_cost += costs->vla->segment_permute_5; |
| else |
| stmt_cost += costs->vls->segment_permute_5; |
| break; |
| case 6: |
| if (riscv_v_ext_vector_mode_p (loop->vector_mode)) |
| stmt_cost += costs->vla->segment_permute_6; |
| else |
| stmt_cost += costs->vls->segment_permute_6; |
| break; |
| case 7: |
| if (riscv_v_ext_vector_mode_p (loop->vector_mode)) |
| stmt_cost += costs->vla->segment_permute_7; |
| else |
| stmt_cost += costs->vls->segment_permute_7; |
| break; |
| case 8: |
| if (riscv_v_ext_vector_mode_p (loop->vector_mode)) |
| stmt_cost += costs->vla->segment_permute_8; |
| else |
| stmt_cost += costs->vls->segment_permute_8; |
| break; |
| default: |
| break; |
| } |
| } |
| else |
| { |
| /* Unit-stride vector loads and stores do not have offset
| addressing, unlike scalar loads and stores.
| If the address depends on a variable we need an additional
| add/sub for each load/store in the worst case. */
| data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); |
| class loop *father = stmt_info->stmt->bb->loop_father; |
| if (!loop && father && !father->inner && father->superloops) |
| { |
| tree ref; |
| if (TREE_CODE (dr->ref) != MEM_REF |
| || !(ref = TREE_OPERAND (dr->ref, 0)) |
| || TREE_CODE (ref) != SSA_NAME) |
| break; |
| |
| if (SSA_NAME_IS_DEFAULT_DEF (ref)) |
| break; |
| |
| if (memrefs.contains ({ref, cst0})) |
| break; |
| |
| memrefs.add ({ref, cst0}); |
| |
| /* In case we have not seen REF before and the base
| address is a pointer operation, try a bit harder. */
| tree base = DR_BASE_ADDRESS (dr); |
| if (TREE_CODE (base) == POINTER_PLUS_EXPR |
| || TREE_CODE (base) == POINTER_DIFF_EXPR) |
| { |
| /* Deconstruct BASE's offset operand. If it is a
| binary operation, i.e. a base and an "offset",
| store this pair. Only increase the stmt_cost if
| we haven't seen it before. */
| tree argp = TREE_OPERAND (base, 1); |
| typedef std::pair<tree, tree> addr_pair; |
| addr_pair pair; |
| if (TREE_CODE_CLASS (TREE_CODE (argp)) == tcc_binary) |
| { |
| tree argp0 = tree_strip_nop_conversions |
| (TREE_OPERAND (argp, 0)); |
| tree argp1 = TREE_OPERAND (argp, 1); |
| pair = addr_pair (argp0, argp1); |
| if (memrefs.contains (pair)) |
| break; |
| |
| memrefs.add (pair); |
| stmt_cost |
| += builtin_vectorization_cost (scalar_stmt, |
| NULL_TREE, 0); |
| } |
| } |
| } |
| } |
| } |
| break; |
| } |
| |
| default: |
| break; |
| } |
| return stmt_cost; |
| } |
| |
| unsigned |
| costs::add_stmt_cost (int count, vect_cost_for_stmt kind, |
| stmt_vec_info stmt_info, slp_tree node, tree vectype, |
| int misalign, vect_cost_model_location where) |
| { |
| int stmt_cost |
| = targetm.vectorize.builtin_vectorization_cost (kind, vectype, misalign); |
| |
| if (stmt_info && node) |
| vinfo_slp_map.put (stmt_info, node); |
| |
| /* Do one-time initialization based on the vinfo. */ |
| loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo); |
| |
| if (!m_analyzed_vinfo) |
| { |
| if (loop_vinfo) |
| analyze_loop_vinfo (loop_vinfo); |
| |
| memrefs.empty (); |
| m_analyzed_vinfo = true; |
| } |
| |
| if (stmt_info) |
| { |
| /* If we're applying the VLA vs. VLS unrolling heuristic, |
| estimate the number of statements in the unrolled VLS |
| loop. For simplicity, we assume that one iteration of the |
| VLS loop would need the same number of statements |
| as one iteration of the VLA loop. */ |
| if (where == vect_body && m_unrolled_vls_niters) |
| m_unrolled_vls_stmts += count * m_unrolled_vls_niters; |
| } |
| |
| if (vectype) |
| stmt_cost = adjust_stmt_cost (kind, loop_vinfo, stmt_info, node, vectype, |
| stmt_cost); |
| |
| return record_stmt_cost (stmt_info, where, count * stmt_cost); |
| } |
| |
| /* For some target-specific vectorization costs which can't be handled per
| stmt, we check the requisite conditions and adjust the vectorization cost
| accordingly if satisfied. One typical example is to model and adjust the
| loop_len cost for known_lt (NITERS, VF). */
| |
| void |
| costs::adjust_vect_cost_per_loop (loop_vec_info loop_vinfo) |
| { |
| if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo) |
| && !LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)) |
| { |
| /* In the middle-end loop vectorizer, we don't count the loop_len cost in
| vect_estimate_min_profitable_iters when NITERS < VF, that is, we only
| count the cost of the length computation when the loop iterates more
| than once with VF. This is correct for most of the cases:
| |
| E.g. VF = [4, 4] |
| for (int i = 0; i < 3; i ++) |
| a[i] += b[i]; |
| |
| We don't need to cost MIN_EXPR or SELECT_VL for the case above. |
| |
| However, for some inefficient vectorized cases, it does use MIN_EXPR |
| to generate len. |
| |
| E.g. VF = [256, 256] |
| |
| Loop body: |
| # loop_len_110 = PHI <18(2), _119(11)> |
| ... |
| _117 = MIN_EXPR <ivtmp_114, 18>; |
| _118 = 18 - _117; |
| _119 = MIN_EXPR <_118, POLY_INT_CST [256, 256]>; |
| ... |
| |
| Epilogue: |
| ... |
| _112 = .VEC_EXTRACT (vect_patt_27.14_109, _111); |
| |
| We cost this unconditionally for this situation, like other targets
| which apply a mask as the loop control. */
| rgroup_controls *rgc; |
| unsigned int num_vectors_m1; |
| unsigned int body_stmts = 0; |
| FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc) |
| if (rgc->type) |
| body_stmts += num_vectors_m1 + 1; |
| |
| add_stmt_cost (body_stmts, scalar_stmt, NULL, NULL, NULL_TREE, 0, |
| vect_body); |
| } |
| } |
| |
| void |
| costs::finish_cost (const vector_costs *scalar_costs) |
| { |
| if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo)) |
| { |
| record_lmul_spills (loop_vinfo); |
| |
| adjust_vect_cost_per_loop (loop_vinfo); |
| } |
| vector_costs::finish_cost (scalar_costs); |
| } |
| |
| } // namespace riscv_vector |