| /* Loop versioning pass. |
| Copyright (C) 2018-2020 Free Software Foundation, Inc. |
| |
| This file is part of GCC. |
| |
| GCC is free software; you can redistribute it and/or modify it |
| under the terms of the GNU General Public License as published by the |
| Free Software Foundation; either version 3, or (at your option) any |
| later version. |
| |
| GCC is distributed in the hope that it will be useful, but WITHOUT |
| ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
| FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
| for more details. |
| |
| You should have received a copy of the GNU General Public License |
| along with GCC; see the file COPYING3. If not see |
| <http://www.gnu.org/licenses/>. */ |
| |
| #include "config.h" |
| #include "system.h" |
| #include "coretypes.h" |
| #include "backend.h" |
| #include "tree.h" |
| #include "gimple.h" |
| #include "gimple-iterator.h" |
| #include "tree-pass.h" |
| #include "gimplify-me.h" |
| #include "cfgloop.h" |
| #include "tree-ssa-loop.h" |
| #include "ssa.h" |
| #include "tree-scalar-evolution.h" |
| #include "tree-chrec.h" |
| #include "tree-ssa-loop-ivopts.h" |
| #include "fold-const.h" |
| #include "tree-ssa-propagate.h" |
| #include "tree-inline.h" |
| #include "domwalk.h" |
| #include "alloc-pool.h" |
| #include "vr-values.h" |
| #include "gimple-ssa-evrp-analyze.h" |
| #include "tree-vectorizer.h" |
| #include "omp-general.h" |
| #include "predict.h" |
| #include "tree-into-ssa.h" |
| |
| namespace { |
| |
| /* This pass looks for loops that could be simplified if certain loop |
| invariant conditions were true. It is effectively a form of loop |
| splitting in which the pass produces the split conditions itself, |
| instead of using ones that are already present in the IL. |
| |
| Versioning for when strides are 1 |
| --------------------------------- |
| |
| At the moment the only thing the pass looks for are memory references |
| like: |
| |
| for (auto i : ...) |
| ...x[i * stride]... |
| |
| It considers changing such loops to: |
| |
| if (stride == 1) |
| for (auto i : ...) [A] |
| ...x[i]... |
| else |
| for (auto i : ...) [B] |
| ...x[i * stride]... |
| |
| This can have several benefits: |
| |
| (1) [A] is often easier or cheaper to vectorize than [B]. |
| |
| (2) The scalar code in [A] is simpler than the scalar code in [B] |
| (if the loops cannot be vectorized or need an epilogue loop). |
| |
| (3) We might recognize [A] as a pattern, such as a memcpy or memset. |
| |
| (4) [A] has simpler address evolutions, which can help other passes |
| like loop interchange. |
| |
| The optimization is particularly useful for assumed-shape arrays in |
| Fortran, where the stride of the innermost dimension depends on the |
| array descriptor but is often equal to 1 in practice. For example: |
| |
| subroutine f1(x) |
| real :: x(:) |
| x(:) = 100 |
| end subroutine f1 |
| |
| generates the equivalent of: |
| |
| raw_stride = *x.dim[0].stride; |
| stride = raw_stride != 0 ? raw_stride : 1; |
| x_base = *x.data; |
| ... |
| tmp1 = stride * S; |
| tmp2 = tmp1 - stride; |
| *x_base[tmp2] = 1.0e+2; |
| |
| but in the common case that stride == 1, the last three statements |
| simplify to: |
| |
| tmp3 = S + -1; |
| *x_base[tmp3] = 1.0e+2; |
| |
| The optimization is in principle very simple. The difficult parts are: |
| |
| (a) deciding which parts of a general address calculation correspond |
| to the inner dimension of an array, since this usually isn't explicit |
| in the IL, and for C often isn't even explicit in the source code |
| |
| (b) estimating when the transformation is worthwhile |
| |
| Structure |
| --------- |
| |
| The pass has four phases: |
| |
| (1) Walk through the statements looking for and recording potential |
| versioning opportunities. Stop if there are none. |
| |
| (2) Use context-sensitive range information to see whether any versioning |
| conditions are impossible in practice. Remove them if so, and stop |
| if no opportunities remain. |
| |
| (We do this only after (1) to keep compile time down when no |
| versioning opportunities exist.) |
| |
| (3) Apply the cost model. Decide which versioning opportunities are |
| worthwhile and at which nesting level they should be applied. |
| |
| (4) Attempt to version all the loops selected by (3), so that: |
| |
| for (...) |
| ... |
| |
| becomes: |
| |
| if (!cond) |
| for (...) // Original loop |
| ... |
| else |
| for (...) // New loop |
| ... |
| |
| Use the version condition COND to simplify the new loop. */ |
| |
| /* Enumerates the likelihood that a particular value indexes the inner |
| dimension of an array. */ |
| enum inner_likelihood { |
| INNER_UNLIKELY, |
| INNER_DONT_KNOW, |
| INNER_LIKELY |
| }; |
| |
| /* Information about one term of an address_info. */ |
| struct address_term_info |
| { |
| /* The value of the term is EXPR * MULTIPLIER. */ |
| tree expr; |
| unsigned HOST_WIDE_INT multiplier; |
| |
| /* The stride applied by EXPR in each iteration of some unrecorded loop, |
| or null if no stride has been identified. */ |
| tree stride; |
| |
| /* Enumerates the likelihood that EXPR indexes the inner dimension |
| of an array. */ |
| enum inner_likelihood inner_likelihood; |
| |
| /* True if STRIDE == 1 is a versioning opportunity when considered |
| in isolation. */ |
| bool versioning_opportunity_p; |
| }; |
| |
| /* Information about an address calculation, and the range of constant |
| offsets applied to it. */ |
| class address_info |
| { |
| public: |
| static const unsigned int MAX_TERMS = 8; |
| |
| /* One statement that calculates the address. If multiple statements |
| share the same address, we only record the first. */ |
| gimple *stmt; |
| |
| /* The loop containing STMT (cached for convenience). If multiple |
| statements share the same address, they all belong to this loop. */ |
| class loop *loop; |
| |
| /* A decomposition of the calculation into a sum of terms plus an |
| optional base. When BASE is provided, it is never an SSA name. |
| Once initialization is complete, all members of TERMs are SSA names. */ |
| tree base; |
| auto_vec<address_term_info, MAX_TERMS> terms; |
| |
| /* All bytes accessed from the address fall in the offset range |
| [MIN_OFFSET, MAX_OFFSET). */ |
| HOST_WIDE_INT min_offset, max_offset; |
| }; |
| |
| /* Stores addresses based on their base and terms (ignoring the offsets). */ |
| struct address_info_hasher : nofree_ptr_hash <address_info> |
| { |
| static hashval_t hash (const address_info *); |
| static bool equal (const address_info *, const address_info *); |
| }; |
| |
| /* Information about the versioning we'd like to apply to a loop. */ |
| class loop_info |
| { |
| public: |
| bool worth_versioning_p () const; |
| |
| /* True if we've decided not to version this loop. The remaining |
| fields are meaningless if so. */ |
| bool rejected_p; |
| |
| /* True if at least one subloop of this loop benefits from versioning. */ |
| bool subloops_benefit_p; |
| |
| /* An estimate of the total number of instructions in the loop, |
| excluding those in subloops that benefit from versioning. */ |
| unsigned int num_insns; |
| |
| /* The outermost loop that can handle all the version checks |
| described below. */ |
| class loop *outermost; |
| |
| /* The first entry in the list of blocks that belong to this loop |
| (and not to subloops). m_next_block_in_loop provides the chain |
| pointers for the list. */ |
| basic_block block_list; |
| |
| /* We'd like to version the loop for the case in which these SSA names |
| (keyed off their SSA_NAME_VERSION) are all equal to 1 at runtime. */ |
| bitmap_head unity_names; |
| |
| /* If versioning succeeds, this points the version of the loop that |
| assumes the version conditions holds. */ |
| class loop *optimized_loop; |
| }; |
| |
| /* The main pass structure. */ |
| class loop_versioning |
| { |
| public: |
| loop_versioning (function *); |
| ~loop_versioning (); |
| unsigned int run (); |
| |
| private: |
| /* Used to walk the dominator tree to find loop versioning conditions |
| that are always false. */ |
| class lv_dom_walker : public dom_walker |
| { |
| public: |
| lv_dom_walker (loop_versioning &); |
| |
| edge before_dom_children (basic_block) FINAL OVERRIDE; |
| void after_dom_children (basic_block) FINAL OVERRIDE; |
| |
| private: |
| /* The parent pass. */ |
| loop_versioning &m_lv; |
| |
| /* Used to build context-dependent range information. */ |
| evrp_range_analyzer m_range_analyzer; |
| }; |
| |
| /* Used to simplify statements based on conditions that are established |
| by the version checks. */ |
| class name_prop : public substitute_and_fold_engine |
| { |
| public: |
| name_prop (loop_info &li) : m_li (li) {} |
| tree get_value (tree) FINAL OVERRIDE; |
| |
| private: |
| /* Information about the versioning we've performed on the loop. */ |
| loop_info &m_li; |
| }; |
| |
| loop_info &get_loop_info (class loop *loop) { return m_loops[loop->num]; } |
| |
| unsigned int max_insns_for_loop (class loop *); |
| bool expensive_stmt_p (gimple *); |
| |
| void version_for_unity (gimple *, tree); |
| bool acceptable_multiplier_p (tree, unsigned HOST_WIDE_INT, |
| unsigned HOST_WIDE_INT * = 0); |
| bool acceptable_type_p (tree, unsigned HOST_WIDE_INT *); |
| bool multiply_term_by (address_term_info &, tree); |
| inner_likelihood get_inner_likelihood (tree, unsigned HOST_WIDE_INT); |
| void dump_inner_likelihood (address_info &, address_term_info &); |
| void analyze_stride (address_info &, address_term_info &, |
| tree, class loop *); |
| bool find_per_loop_multiplication (address_info &, address_term_info &); |
| bool analyze_term_using_scevs (address_info &, address_term_info &); |
| void analyze_arbitrary_term (address_info &, address_term_info &); |
| void analyze_address_fragment (address_info &); |
| void record_address_fragment (gimple *, unsigned HOST_WIDE_INT, |
| tree, unsigned HOST_WIDE_INT, HOST_WIDE_INT); |
| void analyze_expr (gimple *, tree); |
| bool analyze_block (basic_block); |
| bool analyze_blocks (); |
| |
| void prune_loop_conditions (class loop *, vr_values *); |
| bool prune_conditions (); |
| |
| void merge_loop_info (class loop *, class loop *); |
| void add_loop_to_queue (class loop *); |
| bool decide_whether_loop_is_versionable (class loop *); |
| bool make_versioning_decisions (); |
| |
| bool version_loop (class loop *); |
| void implement_versioning_decisions (); |
| |
| /* The function we're optimizing. */ |
| function *m_fn; |
| |
| /* The obstack to use for all pass-specific bitmaps. */ |
| bitmap_obstack m_bitmap_obstack; |
| |
| /* An obstack to use for general allocation. */ |
| obstack m_obstack; |
| |
| /* The number of loops in the function. */ |
| unsigned int m_nloops; |
| |
| /* The total number of loop version conditions we've found. */ |
| unsigned int m_num_conditions; |
| |
| /* Assume that an address fragment of the form i * stride * scale |
| (for variable stride and constant scale) will not benefit from |
| versioning for stride == 1 when scale is greater than this value. */ |
| unsigned HOST_WIDE_INT m_maximum_scale; |
| |
| /* Information about each loop. */ |
| auto_vec<loop_info> m_loops; |
| |
| /* Used to form a linked list of blocks that belong to a loop, |
| started by loop_info::block_list. */ |
| auto_vec<basic_block> m_next_block_in_loop; |
| |
| /* The list of loops that we've decided to version. */ |
| auto_vec<class loop *> m_loops_to_version; |
| |
| /* A table of addresses in the current loop, keyed off their values |
| but not their offsets. */ |
| hash_table <address_info_hasher> m_address_table; |
| |
| /* A list of all addresses in M_ADDRESS_TABLE, in a predictable order. */ |
| auto_vec <address_info *, 32> m_address_list; |
| }; |
| |
| /* If EXPR is an SSA name and not a default definition, return the |
| defining statement, otherwise return null. */ |
| |
| static gimple * |
| maybe_get_stmt (tree expr) |
| { |
| if (TREE_CODE (expr) == SSA_NAME && !SSA_NAME_IS_DEFAULT_DEF (expr)) |
| return SSA_NAME_DEF_STMT (expr); |
| return NULL; |
| } |
| |
| /* Like maybe_get_stmt, but also return null if the defining |
| statement isn't an assignment. */ |
| |
| static gassign * |
| maybe_get_assign (tree expr) |
| { |
| return safe_dyn_cast <gassign *> (maybe_get_stmt (expr)); |
| } |
| |
| /* Return true if this pass should look through a cast of expression FROM |
| to type TYPE when analyzing pieces of an address. */ |
| |
| static bool |
| look_through_cast_p (tree type, tree from) |
| { |
| return (INTEGRAL_TYPE_P (TREE_TYPE (from)) == INTEGRAL_TYPE_P (type) |
| && POINTER_TYPE_P (TREE_TYPE (from)) == POINTER_TYPE_P (type)); |
| } |
| |
| /* Strip all conversions of integers or pointers from EXPR, regardless |
| of whether the conversions are nops. This is useful in the context |
| of this pass because we're not trying to fold or simulate the |
| expression; we just want to see how it's structured. */ |
| |
| static tree |
| strip_casts (tree expr) |
| { |
| const unsigned int MAX_NITERS = 4; |
| |
| tree type = TREE_TYPE (expr); |
| while (CONVERT_EXPR_P (expr) |
| && look_through_cast_p (type, TREE_OPERAND (expr, 0))) |
| expr = TREE_OPERAND (expr, 0); |
| |
| for (unsigned int niters = 0; niters < MAX_NITERS; ++niters) |
| { |
| gassign *assign = maybe_get_assign (expr); |
| if (assign |
| && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)) |
| && look_through_cast_p (type, gimple_assign_rhs1 (assign))) |
| expr = gimple_assign_rhs1 (assign); |
| else |
| break; |
| } |
| return expr; |
| } |
| |
| /* Compare two address_term_infos in the same address_info. */ |
| |
| static int |
| compare_address_terms (const void *a_uncast, const void *b_uncast) |
| { |
| const address_term_info *a = (const address_term_info *) a_uncast; |
| const address_term_info *b = (const address_term_info *) b_uncast; |
| |
| if (a->expr != b->expr) |
| return SSA_NAME_VERSION (a->expr) < SSA_NAME_VERSION (b->expr) ? -1 : 1; |
| |
| if (a->multiplier != b->multiplier) |
| return a->multiplier < b->multiplier ? -1 : 1; |
| |
| return 0; |
| } |
| |
| /* Dump ADDRESS using flags FLAGS. */ |
| |
| static void |
| dump_address_info (dump_flags_t flags, address_info &address) |
| { |
| if (address.base) |
| dump_printf (flags, "%T + ", address.base); |
| for (unsigned int i = 0; i < address.terms.length (); ++i) |
| { |
| if (i != 0) |
| dump_printf (flags, " + "); |
| dump_printf (flags, "%T", address.terms[i].expr); |
| if (address.terms[i].multiplier != 1) |
| dump_printf (flags, " * %wd", address.terms[i].multiplier); |
| } |
| dump_printf (flags, " + [%wd, %wd]", |
| address.min_offset, address.max_offset - 1); |
| } |
| |
| /* Hash an address_info based on its base and terms. */ |
| |
| hashval_t |
| address_info_hasher::hash (const address_info *info) |
| { |
| inchash::hash hash; |
| hash.add_int (info->base ? TREE_CODE (info->base) : 0); |
| hash.add_int (info->terms.length ()); |
| for (unsigned int i = 0; i < info->terms.length (); ++i) |
| { |
| hash.add_int (SSA_NAME_VERSION (info->terms[i].expr)); |
| hash.add_hwi (info->terms[i].multiplier); |
| } |
| return hash.end (); |
| } |
| |
| /* Return true if two address_infos have equal bases and terms. Other |
| properties might be different (such as the statement or constant |
| offset range). */ |
| |
| bool |
| address_info_hasher::equal (const address_info *a, const address_info *b) |
| { |
| if (a->base != b->base |
| && (!a->base || !b->base || !operand_equal_p (a->base, b->base, 0))) |
| return false; |
| |
| if (a->terms.length () != b->terms.length ()) |
| return false; |
| |
| for (unsigned int i = 0; i < a->terms.length (); ++i) |
| if (a->terms[i].expr != b->terms[i].expr |
| || a->terms[i].multiplier != b->terms[i].multiplier) |
| return false; |
| |
| return true; |
| } |
| |
| /* Return true if we want to version the loop, i.e. if we have a |
| specific reason for doing so and no specific reason not to. */ |
| |
| bool |
| loop_info::worth_versioning_p () const |
| { |
| return (!rejected_p |
| && (!bitmap_empty_p (&unity_names) || subloops_benefit_p)); |
| } |
| |
| loop_versioning::lv_dom_walker::lv_dom_walker (loop_versioning &lv) |
| : dom_walker (CDI_DOMINATORS), m_lv (lv), m_range_analyzer (false) |
| { |
| } |
| |
| /* Process BB before processing the blocks it dominates. */ |
| |
| edge |
| loop_versioning::lv_dom_walker::before_dom_children (basic_block bb) |
| { |
| m_range_analyzer.enter (bb); |
| |
| if (bb == bb->loop_father->header) |
| m_lv.prune_loop_conditions (bb->loop_father, |
| m_range_analyzer.get_vr_values ()); |
| |
| for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si); |
| gsi_next (&si)) |
| m_range_analyzer.record_ranges_from_stmt (gsi_stmt (si), false); |
| |
| return NULL; |
| } |
| |
| /* Process BB after processing the blocks it dominates. */ |
| |
| void |
| loop_versioning::lv_dom_walker::after_dom_children (basic_block bb) |
| { |
| m_range_analyzer.leave (bb); |
| } |
| |
| /* Decide whether to replace VAL with a new value in a versioned loop. |
| Return the new value if so, otherwise return null. */ |
| |
| tree |
| loop_versioning::name_prop::get_value (tree val) |
| { |
| if (TREE_CODE (val) == SSA_NAME |
| && bitmap_bit_p (&m_li.unity_names, SSA_NAME_VERSION (val))) |
| return build_one_cst (TREE_TYPE (val)); |
| return NULL_TREE; |
| } |
| |
| /* Initialize the structure to optimize FN. */ |
| |
| loop_versioning::loop_versioning (function *fn) |
| : m_fn (fn), |
| m_nloops (number_of_loops (fn)), |
| m_num_conditions (0), |
| m_address_table (31) |
| { |
| bitmap_obstack_initialize (&m_bitmap_obstack); |
| gcc_obstack_init (&m_obstack); |
| |
| /* Initialize the loop information. */ |
| m_loops.safe_grow_cleared (m_nloops); |
| for (unsigned int i = 0; i < m_nloops; ++i) |
| { |
| m_loops[i].outermost = get_loop (m_fn, 0); |
| bitmap_initialize (&m_loops[i].unity_names, &m_bitmap_obstack); |
| } |
| |
| /* Initialize the list of blocks that belong to each loop. */ |
| unsigned int nbbs = last_basic_block_for_fn (fn); |
| m_next_block_in_loop.safe_grow (nbbs); |
| basic_block bb; |
| FOR_EACH_BB_FN (bb, fn) |
| { |
| loop_info &li = get_loop_info (bb->loop_father); |
| m_next_block_in_loop[bb->index] = li.block_list; |
| li.block_list = bb; |
| } |
| |
| /* MAX_FIXED_MODE_SIZE should be a reasonable maximum scale for |
| unvectorizable code, since it is the largest size that can be |
| handled efficiently by scalar code. omp_max_vf calculates the |
| maximum number of bytes in a vector, when such a value is relevant |
| to loop optimization. */ |
| m_maximum_scale = estimated_poly_value (omp_max_vf ()); |
| m_maximum_scale = MAX (m_maximum_scale, MAX_FIXED_MODE_SIZE); |
| } |
| |
| loop_versioning::~loop_versioning () |
| { |
| bitmap_obstack_release (&m_bitmap_obstack); |
| obstack_free (&m_obstack, NULL); |
| } |
| |
| /* Return the maximum number of instructions allowed in LOOP before |
| it becomes too big for versioning. |
| |
| There are separate limits for inner and outer loops. The limit for |
| inner loops applies only to loops that benefit directly from versioning. |
| The limit for outer loops applies to all code in the outer loop and |
| its subloops that *doesn't* benefit directly from versioning; such code |
| would be "taken along for the ride". The idea is that if the cost of |
| the latter is small, it is better to version outer loops rather than |
| inner loops, both to reduce the number of repeated checks and to enable |
| more of the loop nest to be optimized as a natural nest (e.g. by loop |
| interchange or outer-loop vectorization). */ |
| |
| unsigned int |
| loop_versioning::max_insns_for_loop (class loop *loop) |
| { |
| return (loop->inner |
| ? param_loop_versioning_max_outer_insns |
| : param_loop_versioning_max_inner_insns); |
| } |
| |
| /* Return true if for cost reasons we should avoid versioning any loop |
| that contains STMT. |
| |
| Note that we don't need to check whether versioning is invalid for |
| correctness reasons, since the versioning process does that for us. |
| The conditions involved are too rare to be worth duplicating here. */ |
| |
| bool |
| loop_versioning::expensive_stmt_p (gimple *stmt) |
| { |
| if (gcall *call = dyn_cast <gcall *> (stmt)) |
| /* Assume for now that the time spent in an "expensive" call would |
| overwhelm any saving from versioning. */ |
| return !gimple_inexpensive_call_p (call); |
| return false; |
| } |
| |
| /* Record that we want to version the loop that contains STMT for the |
| case in which SSA name NAME is equal to 1. We already know that NAME |
| is invariant in the loop. */ |
| |
| void |
| loop_versioning::version_for_unity (gimple *stmt, tree name) |
| { |
| class loop *loop = loop_containing_stmt (stmt); |
| loop_info &li = get_loop_info (loop); |
| |
| if (bitmap_set_bit (&li.unity_names, SSA_NAME_VERSION (name))) |
| { |
| /* This is the first time we've wanted to version LOOP for NAME. |
| Keep track of the outermost loop that can handle all versioning |
| checks in LI. */ |
| class loop *outermost |
| = outermost_invariant_loop_for_expr (loop, name); |
| if (loop_depth (li.outermost) < loop_depth (outermost)) |
| li.outermost = outermost; |
| |
| if (dump_enabled_p ()) |
| { |
| dump_printf_loc (MSG_NOTE, stmt, "want to version containing loop" |
| " for when %T == 1", name); |
| if (outermost == loop) |
| dump_printf (MSG_NOTE, "; cannot hoist check further"); |
| else |
| { |
| dump_printf (MSG_NOTE, "; could implement the check at loop" |
| " depth %d", loop_depth (outermost)); |
| if (loop_depth (li.outermost) > loop_depth (outermost)) |
| dump_printf (MSG_NOTE, ", but other checks only allow" |
| " a depth of %d", loop_depth (li.outermost)); |
| } |
| dump_printf (MSG_NOTE, "\n"); |
| } |
| |
| m_num_conditions += 1; |
| } |
| else |
| { |
| /* This is a duplicate request. */ |
| if (dump_enabled_p ()) |
| dump_printf_loc (MSG_NOTE, stmt, "already asked to version containing" |
| " loop for when %T == 1\n", name); |
| } |
| } |
| |
| /* Return true if OP1_TREE is constant and if in principle it is worth |
| versioning an address fragment of the form: |
| |
| i * OP1_TREE * OP2 * stride |
| |
| for the case in which stride == 1. This in practice means testing |
| whether: |
| |
| OP1_TREE * OP2 <= M_MAXIMUM_SCALE. |
| |
| If RESULT is nonnull, store OP1_TREE * OP2 there when returning true. */ |
| |
| bool |
| loop_versioning::acceptable_multiplier_p (tree op1_tree, |
| unsigned HOST_WIDE_INT op2, |
| unsigned HOST_WIDE_INT *result) |
| { |
| if (tree_fits_uhwi_p (op1_tree)) |
| { |
| unsigned HOST_WIDE_INT op1 = tree_to_uhwi (op1_tree); |
| /* The first part checks for overflow. */ |
| if (op1 * op2 >= op2 && op1 * op2 <= m_maximum_scale) |
| { |
| if (result) |
| *result = op1 * op2; |
| return true; |
| } |
| } |
| return false; |
| } |
| |
| /* Return true if it is worth using loop versioning on a memory access |
| of type TYPE. Store the size of the access in *SIZE if so. */ |
| |
| bool |
| loop_versioning::acceptable_type_p (tree type, unsigned HOST_WIDE_INT *size) |
| { |
| return (TYPE_SIZE_UNIT (type) |
| && acceptable_multiplier_p (TYPE_SIZE_UNIT (type), 1, size)); |
| } |
| |
| /* See whether OP is constant and whether we can multiply TERM by that |
| constant without exceeding M_MAXIMUM_SCALE. Return true and update |
| TERM if so. */ |
| |
| bool |
| loop_versioning::multiply_term_by (address_term_info &term, tree op) |
| { |
| return acceptable_multiplier_p (op, term.multiplier, &term.multiplier); |
| } |
| |
| /* Decide whether an address fragment of the form STRIDE * MULTIPLIER |
| is likely to be indexing an innermost dimension, returning the result |
| as an INNER_* probability. */ |
| |
| inner_likelihood |
| loop_versioning::get_inner_likelihood (tree stride, |
| unsigned HOST_WIDE_INT multiplier) |
| { |
| const unsigned int MAX_NITERS = 8; |
| |
| /* Iterate over possible values of STRIDE. Return INNER_LIKELY if at |
| least one of those values is likely to be for the innermost dimension. |
| Record in UNLIKELY_P if at least one of those values is unlikely to be |
| for the innermost dimension. |
| |
| E.g. for: |
| |
| stride = cond ? a * b : 1 |
| |
| we should treat STRIDE as being a likely inner dimension, since |
| we know that it is 1 under at least some circumstances. (See the |
| Fortran example below.) However: |
| |
| stride = a * b |
| |
| on its own is unlikely to be for the innermost dimension, since |
| that would require both a and b to be 1 at runtime. */ |
| bool unlikely_p = false; |
| tree worklist[MAX_NITERS]; |
| unsigned int length = 0; |
| worklist[length++] = stride; |
| for (unsigned int i = 0; i < length; ++i) |
| { |
| tree expr = worklist[i]; |
| |
| if (CONSTANT_CLASS_P (expr)) |
| { |
| /* See if EXPR * MULTIPLIER would be consistent with an individual |
| access or a small grouped access. */ |
| if (acceptable_multiplier_p (expr, multiplier)) |
| return INNER_LIKELY; |
| else |
| unlikely_p = true; |
| } |
| else if (gimple *stmt = maybe_get_stmt (expr)) |
| { |
| /* If EXPR is set by a PHI node, queue its arguments in case |
| we find one that is consistent with an inner dimension. |
| |
| An important instance of this is the Fortran handling of array |
| descriptors, which calculates the stride of the inner dimension |
| using a PHI equivalent of: |
| |
| raw_stride = a.dim[0].stride; |
| stride = raw_stride != 0 ? raw_stride : 1; |
| |
| (Strides for outer dimensions do not treat 0 specially.) */ |
| if (gphi *phi = dyn_cast <gphi *> (stmt)) |
| { |
| unsigned int nargs = gimple_phi_num_args (phi); |
| for (unsigned int j = 0; j < nargs && length < MAX_NITERS; ++j) |
| worklist[length++] = strip_casts (gimple_phi_arg_def (phi, j)); |
| } |
| /* If the value is set by an assignment, expect it to be read |
| from memory (such as an array descriptor) rather than be |
| calculated. */ |
| else if (gassign *assign = dyn_cast <gassign *> (stmt)) |
| { |
| if (!gimple_assign_load_p (assign)) |
| unlikely_p = true; |
| } |
| /* Things like calls don't really tell us anything. */ |
| } |
| } |
| |
| /* We didn't find any possible values of STRIDE that were likely to be |
| for the innermost dimension. If we found one that was actively |
| unlikely to be for the innermost dimension, assume that that applies |
| to STRIDE too. */ |
| return unlikely_p ? INNER_UNLIKELY : INNER_DONT_KNOW; |
| } |
| |
| /* Dump the likelihood that TERM's stride is for the innermost dimension. |
| ADDRESS is the address that contains TERM. */ |
| |
| void |
| loop_versioning::dump_inner_likelihood (address_info &address, |
| address_term_info &term) |
| { |
| if (term.inner_likelihood == INNER_LIKELY) |
| dump_printf_loc (MSG_NOTE, address.stmt, "%T is likely to be the" |
| " innermost dimension\n", term.stride); |
| else if (term.inner_likelihood == INNER_UNLIKELY) |
| dump_printf_loc (MSG_NOTE, address.stmt, "%T is probably not the" |
| " innermost dimension\n", term.stride); |
| else |
| dump_printf_loc (MSG_NOTE, address.stmt, "cannot tell whether %T" |
| " is the innermost dimension\n", term.stride); |
| } |
| |
| /* The caller has identified that STRIDE is the stride of interest |
| in TERM, and that the stride is applied in OP_LOOP. Record this |
| information in TERM, deciding whether STRIDE is likely to be for |
| the innermost dimension of an array and whether it represents a |
| versioning opportunity. ADDRESS is the address that contains TERM. */ |
| |
| void |
| loop_versioning::analyze_stride (address_info &address, |
| address_term_info &term, |
| tree stride, class loop *op_loop) |
| { |
| term.stride = stride; |
| |
| term.inner_likelihood = get_inner_likelihood (stride, term.multiplier); |
| if (dump_enabled_p ()) |
| dump_inner_likelihood (address, term); |
| |
| /* To be a versioning opportunity we require: |
| |
| - The multiplier applied by TERM is equal to the access size, |
| so that when STRIDE is 1, the accesses in successive loop |
| iterations are consecutive. |
| |
| This is deliberately conservative. We could relax it to handle |
| other cases (such as those with gaps between iterations) if we |
| find any real testcases for which it's useful. |
| |
| - the stride is applied in the same loop as STMT rather than |
| in an outer loop. Although versioning for strides applied in |
| outer loops could help in some cases -- such as enabling |
| more loop interchange -- the savings are much lower than for |
| inner loops. |
| |
| - the stride is an SSA name that is invariant in STMT's loop, |
| since otherwise versioning isn't possible. */ |
| unsigned HOST_WIDE_INT access_size = address.max_offset - address.min_offset; |
| if (term.multiplier == access_size |
| && address.loop == op_loop |
| && TREE_CODE (stride) == SSA_NAME |
| && expr_invariant_in_loop_p (address.loop, stride)) |
| { |
| term.versioning_opportunity_p = true; |
| if (dump_enabled_p ()) |
| dump_printf_loc (MSG_NOTE, address.stmt, "%T == 1 is a versioning" |
| " opportunity\n", stride); |
| } |
| } |
| |
| /* See whether address term TERM (which belongs to ADDRESS) is the result |
| of multiplying a varying SSA name by a loop-invariant SSA name. |
| Return true and update TERM if so. |
| |
| This handles both cases that SCEV might handle, such as: |
| |
| for (int i = 0; i < n; ++i) |
| res += a[i * stride]; |
| |
| and ones in which the term varies arbitrarily between iterations, such as: |
| |
| for (int i = 0; i < n; ++i) |
| res += a[index[i] * stride]; */ |
| |
| bool |
| loop_versioning::find_per_loop_multiplication (address_info &address, |
| address_term_info &term) |
| { |
| gassign *mult = maybe_get_assign (term.expr); |
| if (!mult || gimple_assign_rhs_code (mult) != MULT_EXPR) |
| return false; |
| |
| class loop *mult_loop = loop_containing_stmt (mult); |
| if (!loop_outer (mult_loop)) |
| return false; |
| |
| tree op1 = strip_casts (gimple_assign_rhs1 (mult)); |
| tree op2 = strip_casts (gimple_assign_rhs2 (mult)); |
| if (TREE_CODE (op1) != SSA_NAME || TREE_CODE (op2) != SSA_NAME) |
| return false; |
| |
| bool invariant1_p = expr_invariant_in_loop_p (mult_loop, op1); |
| bool invariant2_p = expr_invariant_in_loop_p (mult_loop, op2); |
| if (invariant1_p == invariant2_p) |
| return false; |
| |
| /* Make sure that the loop invariant is OP2 rather than OP1. */ |
| if (invariant1_p) |
| std::swap (op1, op2); |
| |
| if (dump_enabled_p ()) |
| dump_printf_loc (MSG_NOTE, address.stmt, "address term %T = varying %T" |
| " * loop-invariant %T\n", term.expr, op1, op2); |
| analyze_stride (address, term, op2, mult_loop); |
| return true; |
| } |
| |
| /* Try to use scalar evolutions to find an address stride for TERM, |
| which belongs to ADDRESS. Return true and update TERM if so. |
| |
| Here we are interested in any evolution information we can find, |
| not just evolutions wrt ADDRESS->LOOP. For example, if we find that |
| an outer loop obviously iterates over the inner dimension of an array, |
| that information can help us eliminate worthless versioning opportunities |
| in inner loops. */ |
| |
| bool |
| loop_versioning::analyze_term_using_scevs (address_info &address, |
| address_term_info &term) |
| { |
| gimple *setter = maybe_get_stmt (term.expr); |
| if (!setter) |
| return false; |
| |
| class loop *wrt_loop = loop_containing_stmt (setter); |
| if (!loop_outer (wrt_loop)) |
| return false; |
| |
| tree chrec = strip_casts (analyze_scalar_evolution (wrt_loop, term.expr)); |
| if (TREE_CODE (chrec) == POLYNOMIAL_CHREC) |
| { |
| if (dump_enabled_p ()) |
| dump_printf_loc (MSG_NOTE, address.stmt, |
| "address term %T = %T\n", term.expr, chrec); |
| |
| /* Peel casts and accumulate constant multiplications, up to the |
| limit allowed by M_MAXIMUM_SCALE. */ |
| tree stride = strip_casts (CHREC_RIGHT (chrec)); |
| while (TREE_CODE (stride) == MULT_EXPR |
| && multiply_term_by (term, TREE_OPERAND (stride, 1))) |
| stride = strip_casts (TREE_OPERAND (stride, 0)); |
| |
| gassign *assign; |
| while ((assign = maybe_get_assign (stride)) |
| && gimple_assign_rhs_code (assign) == MULT_EXPR |
| && multiply_term_by (term, gimple_assign_rhs2 (assign))) |
| { |
| if (dump_enabled_p ()) |
| dump_printf_loc (MSG_NOTE, address.stmt, |
| "looking through %G", assign); |
| stride = strip_casts (gimple_assign_rhs1 (assign)); |
| } |
| |
| analyze_stride (address, term, stride, get_chrec_loop (chrec)); |
| return true; |
| } |
| |
| return false; |
| } |
| |
| /* Address term TERM is an arbitrary term that provides no versioning |
| opportunities. Analyze it to see whether it contains any likely |
| inner strides, so that we don't mistakenly version for other |
| (less likely) candidates. |
| |
| This copes with invariant innermost indices such as: |
| |
| x(i, :) = 100 |
| |
| where the "i" component of the address is invariant in the loop |
| but provides the real inner stride. |
| |
| ADDRESS is the address that contains TERM. */ |
| |
| void |
| loop_versioning::analyze_arbitrary_term (address_info &address, |
| address_term_info &term) |
| |
| { |
| /* A multiplication offers two potential strides. Pick the one that |
| is most likely to be an innermost stride. */ |
| tree expr = term.expr, alt = NULL_TREE; |
| gassign *mult = maybe_get_assign (expr); |
| if (mult && gimple_assign_rhs_code (mult) == MULT_EXPR) |
| { |
| expr = strip_casts (gimple_assign_rhs1 (mult)); |
| alt = strip_casts (gimple_assign_rhs2 (mult)); |
| } |
| term.stride = expr; |
| term.inner_likelihood = get_inner_likelihood (expr, term.multiplier); |
| if (alt) |
| { |
| inner_likelihood alt_l = get_inner_likelihood (alt, term.multiplier); |
| if (alt_l > term.inner_likelihood) |
| { |
| term.stride = alt; |
| term.inner_likelihood = alt_l; |
| } |
| } |
| if (dump_enabled_p ()) |
| dump_inner_likelihood (address, term); |
| } |
| |
| /* Try to identify loop strides in ADDRESS and try to choose realistic |
| versioning opportunities based on these strides. |
| |
| The main difficulty here isn't finding strides that could be used |
| in a version check (that's pretty easy). The problem instead is to |
| avoid versioning for some stride S that is unlikely ever to be 1 at |
| runtime. Versioning for S == 1 on its own would lead to unnecessary |
| code bloat, while adding S == 1 to more realistic version conditions |
| would lose the optimisation opportunity offered by those other conditions. |
| |
| For example, versioning for a stride of 1 in the Fortran code: |
| |
| integer :: a(:,:) |
| a(1,:) = 1 |
| |
| is not usually a good idea, since the assignment is iterating over |
| an outer dimension and is relatively unlikely to have a stride of 1. |
| (It isn't impossible, since the inner dimension might be 1, or the |
| array might be transposed.) Similarly, in: |
| |
| integer :: a(:,:), b(:,:) |
| b(:,1) = a(1,:) |
| |
| b(:,1) is relatively likely to have a stride of 1 while a(1,:) isn't. |
| Versioning for when both strides are 1 would lose most of the benefit |
| of versioning for b's access. |
| |
| The approach we take is as follows: |
| |
| - Analyze each term to see whether it has an identifiable stride, |
| regardless of which loop applies the stride. |
| |
| - Evaluate the likelihood that each such stride is for the innermost |
| dimension of an array, on the scale "likely", "don't know" or "unlikely". |
| |
| - If there is a single "likely" innermost stride, and that stride is |
| applied in the loop that contains STMT, version the loop for when the |
| stride is 1. This deals with the cases in which we're fairly |
| confident of doing the right thing, such as the b(:,1) reference above. |
| |
| - If there are no "likely" innermost strides, and the loop that contains |
| STMT uses a stride that we rated as "don't know", version for when |
| that stride is 1. This is principally used for C code such as: |
| |
| for (int i = 0; i < n; ++i) |
| a[i * x] = ...; |
| |
| and: |
| |
| for (int j = 0; j < n; ++j) |
| for (int i = 0; i < n; ++i) |
| a[i * x + j * y] = ...; |
| |
| where nothing in the way "x" and "y" are set gives a hint as to |
| whether "i" iterates over the innermost dimension of the array. |
| In these situations it seems reasonable to assume the |
| programmer has nested the loops appropriately (although of course |
| there are examples like GEMM in which this assumption doesn't hold |
| for all accesses in the loop). |
| |
| This case is also useful for the Fortran equivalent of the |
| above C code. */ |
| |
| void |
| loop_versioning::analyze_address_fragment (address_info &address) |
| { |
| if (dump_enabled_p ()) |
| { |
| dump_printf_loc (MSG_NOTE, address.stmt, "analyzing address fragment "); |
| dump_address_info (MSG_NOTE, address); |
| dump_printf (MSG_NOTE, "\n"); |
| } |
| |
| /* Analyze each component of the sum to see whether it involves an |
| apparent stride. |
| |
| There is an overlap between the addresses that |
| find_per_loop_multiplication and analyze_term_using_scevs can handle, |
| but the former is much cheaper than SCEV analysis, so try it first. */ |
| for (unsigned int i = 0; i < address.terms.length (); ++i) |
| if (!find_per_loop_multiplication (address, address.terms[i]) |
| && !analyze_term_using_scevs (address, address.terms[i]) |
| && !POINTER_TYPE_P (TREE_TYPE (address.terms[i].expr))) |
| analyze_arbitrary_term (address, address.terms[i]); |
| |
| /* Check for strides that are likely to be for the innermost dimension. |
| |
| 1. If there is a single likely inner stride, if it is an SSA name, |
| and if it is worth versioning the loop for when the SSA name |
| equals 1, record that we want to do so. |
| |
| 2. Otherwise, if there any likely inner strides, bail out. This means |
| one of: |
| |
| (a) There are multiple likely inner strides. This suggests we're |
| confused and be can't be confident of doing the right thing. |
| |
| (b) There is a single likely inner stride and it is a constant |
| rather than an SSA name. This can mean either that the access |
| is a natural one without any variable strides, such as: |
| |
| for (int i = 0; i < n; ++i) |
| a[i] += 1; |
| |
| or that a variable stride is applied to an outer dimension, |
| such as: |
| |
| for (int i = 0; i < n; ++i) |
| for (int j = 0; j < n; ++j) |
| a[j * stride][i] += 1; |
| |
| (c) There is a single likely inner stride, and it is an SSA name, |
| but it isn't a worthwhile versioning opportunity. This usually |
| means that the variable stride is applied by an outer loop, |
| such as: |
| |
| for (int i = 0; i < n; ++i) |
| for (int j = 0; j < n; ++j) |
| a[j][i * stride] += 1; |
| |
| or (using an example with a more natural loop nesting): |
| |
| for (int i = 0; i < n; ++i) |
| for (int j = 0; j < n; ++j) |
| a[i][j] += b[i * stride]; |
| |
| in cases where b[i * stride] cannot (yet) be hoisted for |
| aliasing reasons. |
| |
| 3. If there are no likely inner strides, fall through to the next |
| set of checks. |
| |
| Pointer equality is enough to check for uniqueness in (1), since we |
| only care about SSA names. */ |
| tree chosen_stride = NULL_TREE; |
| tree version_stride = NULL_TREE; |
| for (unsigned int i = 0; i < address.terms.length (); ++i) |
| if (chosen_stride != address.terms[i].stride |
| && address.terms[i].inner_likelihood == INNER_LIKELY) |
| { |
| if (chosen_stride) |
| return; |
| chosen_stride = address.terms[i].stride; |
| if (address.terms[i].versioning_opportunity_p) |
| version_stride = chosen_stride; |
| } |
| |
| /* If there are no likely inner strides, see if there is a single |
| versioning opportunity for a stride that was rated as INNER_DONT_KNOW. |
| See the comment above the function for the cases that this code |
| handles. */ |
| if (!chosen_stride) |
| for (unsigned int i = 0; i < address.terms.length (); ++i) |
| if (version_stride != address.terms[i].stride |
| && address.terms[i].inner_likelihood == INNER_DONT_KNOW |
| && address.terms[i].versioning_opportunity_p) |
| { |
| if (version_stride) |
| return; |
| version_stride = address.terms[i].stride; |
| } |
| |
| if (version_stride) |
| version_for_unity (address.stmt, version_stride); |
| } |
| |
| /* Treat EXPR * MULTIPLIER + OFFSET as a fragment of an address that addresses |
| TYPE_SIZE bytes and record this address fragment for later processing. |
| STMT is the statement that contains the address. */ |
| |
| void |
| loop_versioning::record_address_fragment (gimple *stmt, |
| unsigned HOST_WIDE_INT type_size, |
| tree expr, |
| unsigned HOST_WIDE_INT multiplier, |
| HOST_WIDE_INT offset) |
| { |
| /* We're only interested in computed values. */ |
| if (TREE_CODE (expr) != SSA_NAME) |
| return; |
| |
| /* Quick exit if no part of the address is calculated in STMT's loop, |
| since such addresses have no versioning opportunities. */ |
| class loop *loop = loop_containing_stmt (stmt); |
| if (expr_invariant_in_loop_p (loop, expr)) |
| return; |
| |
| /* Set up an address_info for EXPR * MULTIPLIER. */ |
| address_info *address = XOBNEW (&m_obstack, address_info); |
| new (address) address_info; |
| address->stmt = stmt; |
| address->loop = loop; |
| address->base = NULL_TREE; |
| address->terms.quick_grow (1); |
| address->terms[0].expr = expr; |
| address->terms[0].multiplier = multiplier; |
| address->terms[0].stride = NULL_TREE; |
| address->terms[0].inner_likelihood = INNER_UNLIKELY; |
| address->terms[0].versioning_opportunity_p = false; |
| address->min_offset = offset; |
| |
| /* Peel apart the expression into a sum of address_terms, where each |
| term is multiplied by a constant. Treat a + b and a - b the same, |
| since it doesn't matter for our purposes whether an address is |
| increasing or decreasing. Distribute (a + b) * constant into |
| a * constant + b * constant. |
| |
| We don't care which loop each term belongs to, since we want to |
| examine as many candidate strides as possible when determining |
| which is likely to be for the innermost dimension. We therefore |
| don't limit the search to statements in STMT's loop. */ |
| for (unsigned int i = 0; i < address->terms.length (); ) |
| { |
| if (gassign *assign = maybe_get_assign (address->terms[i].expr)) |
| { |
| tree_code code = gimple_assign_rhs_code (assign); |
| if (code == PLUS_EXPR |
| || code == POINTER_PLUS_EXPR |
| || code == MINUS_EXPR) |
| { |
| tree op1 = gimple_assign_rhs1 (assign); |
| tree op2 = gimple_assign_rhs2 (assign); |
| if (TREE_CODE (op2) == INTEGER_CST) |
| { |
| address->terms[i].expr = strip_casts (op1); |
| /* This is heuristic only, so don't worry about truncation |
| or overflow. */ |
| address->min_offset += (TREE_INT_CST_LOW (op2) |
| * address->terms[i].multiplier); |
| continue; |
| } |
| else if (address->terms.length () < address_info::MAX_TERMS) |
| { |
| unsigned int j = address->terms.length (); |
| address->terms.quick_push (address->terms[i]); |
| address->terms[i].expr = strip_casts (op1); |
| address->terms[j].expr = strip_casts (op2); |
| continue; |
| } |
| } |
| if (code == MULT_EXPR) |
| { |
| tree op1 = gimple_assign_rhs1 (assign); |
| tree op2 = gimple_assign_rhs2 (assign); |
| if (multiply_term_by (address->terms[i], op2)) |
| { |
| address->terms[i].expr = strip_casts (op1); |
| continue; |
| } |
| } |
| if (CONVERT_EXPR_CODE_P (code)) |
| { |
| tree op1 = gimple_assign_rhs1 (assign); |
| address->terms[i].expr = strip_casts (op1); |
| continue; |
| } |
| } |
| i += 1; |
| } |
| |
| /* Peel off any symbolic pointer. */ |
| if (TREE_CODE (address->terms[0].expr) != SSA_NAME |
| && address->terms[0].multiplier == 1) |
| { |
| if (address->terms.length () == 1) |
| { |
| obstack_free (&m_obstack, address); |
| return; |
| } |
| address->base = address->terms[0].expr; |
| address->terms.ordered_remove (0); |
| } |
| |
| /* Require all remaining terms to be SSA names. (This could be false |
| for unfolded statements, but they aren't worth dealing with.) */ |
| for (unsigned int i = 0; i < address->terms.length (); ++i) |
| if (TREE_CODE (address->terms[i].expr) != SSA_NAME) |
| { |
| obstack_free (&m_obstack, address); |
| return; |
| } |
| |
| /* The loop above set MIN_OFFSET based on the first byte of the |
| referenced data. Calculate the end + 1. */ |
| address->max_offset = address->min_offset + type_size; |
| |
| /* Put the terms into a canonical order for the hash table lookup below. */ |
| address->terms.qsort (compare_address_terms); |
| |
| if (dump_enabled_p ()) |
| { |
| dump_printf_loc (MSG_NOTE, stmt, "recording address fragment %T", expr); |
| if (multiplier != 1) |
| dump_printf (MSG_NOTE, " * %wd", multiplier); |
| dump_printf (MSG_NOTE, " = "); |
| dump_address_info (MSG_NOTE, *address); |
| dump_printf (MSG_NOTE, "\n"); |
| } |
| |
| /* Pool address information with the same terms (but potentially |
| different offsets). */ |
| address_info **slot = m_address_table.find_slot (address, INSERT); |
| if (address_info *old_address = *slot) |
| { |
| /* We've already seen an address with the same terms. Extend the |
| offset range to account for this access. Doing this can paper |
| over gaps, such as in: |
| |
| a[i * stride * 4] + a[i * stride * 4 + 3]; |
| |
| where nothing references "+ 1" or "+ 2". However, the vectorizer |
| handles such gapped accesses without problems, so it's not worth |
| trying to exclude them. */ |
| if (old_address->min_offset > address->min_offset) |
| old_address->min_offset = address->min_offset; |
| if (old_address->max_offset < address->max_offset) |
| old_address->max_offset = address->max_offset; |
| obstack_free (&m_obstack, address); |
| } |
| else |
| { |
| /* This is the first time we've seen an address with these terms. */ |
| *slot = address; |
| m_address_list.safe_push (address); |
| } |
| } |
| |
| /* Analyze expression EXPR, which occurs in STMT. */ |
| |
| void |
| loop_versioning::analyze_expr (gimple *stmt, tree expr) |
| { |
| unsigned HOST_WIDE_INT type_size; |
| |
| while (handled_component_p (expr)) |
| { |
| /* See whether we can use versioning to avoid a multiplication |
| in an array index. */ |
| if (TREE_CODE (expr) == ARRAY_REF |
| && acceptable_type_p (TREE_TYPE (expr), &type_size)) |
| record_address_fragment (stmt, type_size, |
| TREE_OPERAND (expr, 1), type_size, 0); |
| expr = TREE_OPERAND (expr, 0); |
| } |
| |
| /* See whether we can use versioning to avoid a multiplication |
| in the pointer calculation of a MEM_REF. */ |
| if (TREE_CODE (expr) == MEM_REF |
| && acceptable_type_p (TREE_TYPE (expr), &type_size)) |
| record_address_fragment (stmt, type_size, TREE_OPERAND (expr, 0), 1, |
| /* This is heuristic only, so don't worry |
| about truncation or overflow. */ |
| TREE_INT_CST_LOW (TREE_OPERAND (expr, 1))); |
| |
| /* These would be easy to handle if they existed at this stage. */ |
| gcc_checking_assert (TREE_CODE (expr) != TARGET_MEM_REF); |
| } |
| |
| /* Analyze all the statements in BB looking for useful version checks. |
| Return true on success, false if something prevents the block from |
| being versioned. */ |
| |
| bool |
| loop_versioning::analyze_block (basic_block bb) |
| { |
| class loop *loop = bb->loop_father; |
| loop_info &li = get_loop_info (loop); |
| for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi); |
| gsi_next (&gsi)) |
| { |
| gimple *stmt = gsi_stmt (gsi); |
| if (is_gimple_debug (stmt)) |
| continue; |
| |
| if (expensive_stmt_p (stmt)) |
| { |
| if (dump_enabled_p ()) |
| dump_printf_loc (MSG_NOTE, stmt, "expensive statement" |
| " prevents versioning: %G", stmt); |
| return false; |
| } |
| |
| /* Only look for direct versioning opportunities in inner loops |
| since the benefit tends to be much smaller for outer loops. */ |
| if (!loop->inner) |
| { |
| unsigned int nops = gimple_num_ops (stmt); |
| for (unsigned int i = 0; i < nops; ++i) |
| if (tree op = gimple_op (stmt, i)) |
| analyze_expr (stmt, op); |
| } |
| |
| /* The point of the instruction limit is to prevent excessive |
| code growth, so this is a size-based estimate even though |
| the optimization is aimed at speed. */ |
| li.num_insns += estimate_num_insns (stmt, &eni_size_weights); |
| } |
| |
| return true; |
| } |
| |
| /* Analyze all the blocks in the function, looking for useful version checks. |
| Return true if we found one. */ |
| |
| bool |
| loop_versioning::analyze_blocks () |
| { |
| AUTO_DUMP_SCOPE ("analyze_blocks", |
| dump_user_location_t::from_function_decl (m_fn->decl)); |
| |
| /* For now we don't try to version the whole function, although |
| versioning at that level could be useful in some cases. */ |
| get_loop_info (get_loop (m_fn, 0)).rejected_p = true; |
| |
| class loop *loop; |
| FOR_EACH_LOOP (loop, LI_FROM_INNERMOST) |
| { |
| loop_info &linfo = get_loop_info (loop); |
| |
| /* Ignore cold loops. */ |
| if (!optimize_loop_for_speed_p (loop)) |
| linfo.rejected_p = true; |
| |
| /* See whether an inner loop prevents versioning of this loop. */ |
| if (!linfo.rejected_p) |
| for (class loop *inner = loop->inner; inner; inner = inner->next) |
| if (get_loop_info (inner).rejected_p) |
| { |
| linfo.rejected_p = true; |
| break; |
| } |
| |
| /* If versioning the loop is still a possibility, examine the |
| statements in the loop to look for versioning opportunities. */ |
| if (!linfo.rejected_p) |
| { |
| void *start_point = obstack_alloc (&m_obstack, 0); |
| |
| for (basic_block bb = linfo.block_list; bb; |
| bb = m_next_block_in_loop[bb->index]) |
| if (!analyze_block (bb)) |
| { |
| linfo.rejected_p = true; |
| break; |
| } |
| |
| if (!linfo.rejected_p) |
| { |
| /* Process any queued address fragments, now that we have |
| complete grouping information. */ |
| address_info *address; |
| unsigned int i; |
| FOR_EACH_VEC_ELT (m_address_list, i, address) |
| analyze_address_fragment (*address); |
| } |
| |
| m_address_table.empty (); |
| m_address_list.truncate (0); |
| obstack_free (&m_obstack, start_point); |
| } |
| } |
| |
| return m_num_conditions != 0; |
| } |
| |
| /* Use the ranges in VRS to remove impossible versioning conditions from |
| LOOP. */ |
| |
| void |
| loop_versioning::prune_loop_conditions (class loop *loop, vr_values *vrs) |
| { |
| loop_info &li = get_loop_info (loop); |
| |
| int to_remove = -1; |
| bitmap_iterator bi; |
| unsigned int i; |
| EXECUTE_IF_SET_IN_BITMAP (&li.unity_names, 0, i, bi) |
| { |
| tree name = ssa_name (i); |
| const value_range_equiv *vr = vrs->get_value_range (name); |
| if (vr && !vr->may_contain_p (build_one_cst (TREE_TYPE (name)))) |
| { |
| if (dump_enabled_p ()) |
| dump_printf_loc (MSG_NOTE, find_loop_location (loop), |
| "%T can never be 1 in this loop\n", name); |
| |
| if (to_remove >= 0) |
| bitmap_clear_bit (&li.unity_names, to_remove); |
| to_remove = i; |
| m_num_conditions -= 1; |
| } |
| } |
| if (to_remove >= 0) |
| bitmap_clear_bit (&li.unity_names, to_remove); |
| } |
| |
| /* Remove any scheduled loop version conditions that will never be true. |
| Return true if any remain. */ |
| |
| bool |
| loop_versioning::prune_conditions () |
| { |
| AUTO_DUMP_SCOPE ("prune_loop_conditions", |
| dump_user_location_t::from_function_decl (m_fn->decl)); |
| |
| calculate_dominance_info (CDI_DOMINATORS); |
| lv_dom_walker dom_walker (*this); |
| dom_walker.walk (ENTRY_BLOCK_PTR_FOR_FN (m_fn)); |
| return m_num_conditions != 0; |
| } |
| |
| /* Merge the version checks for INNER into immediately-enclosing loop |
| OUTER. */ |
| |
| void |
| loop_versioning::merge_loop_info (class loop *outer, class loop *inner) |
| { |
| loop_info &inner_li = get_loop_info (inner); |
| loop_info &outer_li = get_loop_info (outer); |
| |
| if (dump_enabled_p ()) |
| { |
| bitmap_iterator bi; |
| unsigned int i; |
| EXECUTE_IF_SET_IN_BITMAP (&inner_li.unity_names, 0, i, bi) |
| if (!bitmap_bit_p (&outer_li.unity_names, i)) |
| dump_printf_loc (MSG_NOTE, find_loop_location (inner), |
| "hoisting check that %T == 1 to outer loop\n", |
| ssa_name (i)); |
| } |
| |
| bitmap_ior_into (&outer_li.unity_names, &inner_li.unity_names); |
| if (loop_depth (outer_li.outermost) < loop_depth (inner_li.outermost)) |
| outer_li.outermost = inner_li.outermost; |
| } |
| |
| /* Add LOOP to the queue of loops to version. */ |
| |
| void |
| loop_versioning::add_loop_to_queue (class loop *loop) |
| { |
| loop_info &li = get_loop_info (loop); |
| |
| if (dump_enabled_p ()) |
| dump_printf_loc (MSG_NOTE, find_loop_location (loop), |
| "queuing this loop for versioning\n"); |
| m_loops_to_version.safe_push (loop); |
| |
| /* Don't try to version superloops. */ |
| li.rejected_p = true; |
| } |
| |
| /* Decide whether the cost model would allow us to version LOOP, |
| either directly or as part of a parent loop, and return true if so. |
| This does not imply that the loop is actually worth versioning in its |
| own right, just that it would be valid to version it if something |
| benefited. |
| |
| We have already made this decision for all inner loops of LOOP. */ |
| |
| bool |
| loop_versioning::decide_whether_loop_is_versionable (class loop *loop) |
| { |
| loop_info &li = get_loop_info (loop); |
| |
| if (li.rejected_p) |
| return false; |
| |
| /* Examine the decisions made for inner loops. */ |
| for (class loop *inner = loop->inner; inner; inner = inner->next) |
| { |
| loop_info &inner_li = get_loop_info (inner); |
| if (inner_li.rejected_p) |
| { |
| if (dump_enabled_p ()) |
| dump_printf_loc (MSG_NOTE, find_loop_location (loop), |
| "not versioning this loop because one of its" |
| " inner loops should not be versioned\n"); |
| return false; |
| } |
| |
| if (inner_li.worth_versioning_p ()) |
| li.subloops_benefit_p = true; |
| |
| /* Accumulate the number of instructions from subloops that are not |
| the innermost, or that don't benefit from versioning. Only the |
| instructions from innermost loops that benefit from versioning |
| should be weighed against loop-versioning-max-inner-insns; |
| everything else should be weighed against |
| loop-versioning-max-outer-insns. */ |
| if (!inner_li.worth_versioning_p () || inner->inner) |
| { |
| if (dump_enabled_p ()) |
| dump_printf_loc (MSG_NOTE, find_loop_location (loop), |
| "counting %d instructions from this loop" |
| " against its parent loop\n", inner_li.num_insns); |
| li.num_insns += inner_li.num_insns; |
| } |
| } |
| |
| /* Enforce the size limits. */ |
| if (li.worth_versioning_p ()) |
| { |
| unsigned int max_num_insns = max_insns_for_loop (loop); |
| if (dump_enabled_p ()) |
| dump_printf_loc (MSG_NOTE, find_loop_location (loop), |
| "this loop has %d instructions, against" |
| " a versioning limit of %d\n", |
| li.num_insns, max_num_insns); |
| if (li.num_insns > max_num_insns) |
| { |
| if (dump_enabled_p ()) |
| dump_printf_loc (MSG_MISSED_OPTIMIZATION |
| | MSG_PRIORITY_USER_FACING, |
| find_loop_location (loop), |
| "this loop is too big to version"); |
| return false; |
| } |
| } |
| |
| /* Hoist all version checks from subloops to this loop. */ |
| for (class loop *subloop = loop->inner; subloop; subloop = subloop->next) |
| merge_loop_info (loop, subloop); |
| |
| return true; |
| } |
| |
| /* Decide which loops to version and add them to the versioning queue. |
| Return true if there are any loops to version. */ |
| |
| bool |
| loop_versioning::make_versioning_decisions () |
| { |
| AUTO_DUMP_SCOPE ("make_versioning_decisions", |
| dump_user_location_t::from_function_decl (m_fn->decl)); |
| |
| class loop *loop; |
| FOR_EACH_LOOP (loop, LI_FROM_INNERMOST) |
| { |
| loop_info &linfo = get_loop_info (loop); |
| if (decide_whether_loop_is_versionable (loop)) |
| { |
| /* Commit to versioning LOOP directly if we can't hoist the |
| version checks any further. */ |
| if (linfo.worth_versioning_p () |
| && (loop_depth (loop) == 1 || linfo.outermost == loop)) |
| add_loop_to_queue (loop); |
| } |
| else |
| { |
| /* We can't version this loop, so individually version any |
| subloops that would benefit and haven't been versioned yet. */ |
| linfo.rejected_p = true; |
| for (class loop *subloop = loop->inner; subloop; |
| subloop = subloop->next) |
| if (get_loop_info (subloop).worth_versioning_p ()) |
| add_loop_to_queue (subloop); |
| } |
| } |
| |
| return !m_loops_to_version.is_empty (); |
| } |
| |
| /* Attempt to implement loop versioning for LOOP, using the information |
| cached in the associated loop_info. Return true on success. */ |
| |
| bool |
| loop_versioning::version_loop (class loop *loop) |
| { |
| loop_info &li = get_loop_info (loop); |
| |
| /* Build up a condition that selects the original loop instead of |
| the simplified loop. */ |
| tree cond = boolean_false_node; |
| bitmap_iterator bi; |
| unsigned int i; |
| EXECUTE_IF_SET_IN_BITMAP (&li.unity_names, 0, i, bi) |
| { |
| tree name = ssa_name (i); |
| tree ne_one = fold_build2 (NE_EXPR, boolean_type_node, name, |
| build_one_cst (TREE_TYPE (name))); |
| cond = fold_build2 (TRUTH_OR_EXPR, boolean_type_node, cond, ne_one); |
| } |
| |
| /* Convert the condition into a suitable gcond. */ |
| gimple_seq stmts = NULL; |
| cond = force_gimple_operand_1 (cond, &stmts, is_gimple_condexpr, NULL_TREE); |
| |
| /* Version the loop. */ |
| initialize_original_copy_tables (); |
| basic_block cond_bb; |
| li.optimized_loop = loop_version (loop, cond, &cond_bb, |
| profile_probability::unlikely (), |
| profile_probability::likely (), |
| profile_probability::unlikely (), |
| profile_probability::likely (), true); |
| free_original_copy_tables (); |
| if (!li.optimized_loop) |
| { |
| if (dump_enabled_p ()) |
| dump_printf_loc (MSG_MISSED_OPTIMIZATION, find_loop_location (loop), |
| "tried but failed to version this loop for when" |
| " certain strides are 1\n"); |
| return false; |
| } |
| |
| if (dump_enabled_p ()) |
| dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, find_loop_location (loop), |
| "versioned this loop for when certain strides are 1\n"); |
| |
| /* Insert the statements that feed COND. */ |
| if (stmts) |
| { |
| gimple_stmt_iterator gsi = gsi_last_bb (cond_bb); |
| gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT); |
| } |
| |
| return true; |
| } |
| |
| /* Attempt to version all loops in the versioning queue. */ |
| |
| void |
| loop_versioning::implement_versioning_decisions () |
| { |
| /* No AUTO_DUMP_SCOPE here since all messages are top-level and |
| user-facing at this point. */ |
| |
| bool any_succeeded_p = false; |
| class loop *loop; |
| unsigned int i; |
| FOR_EACH_VEC_ELT (m_loops_to_version, i, loop) |
| if (version_loop (loop)) |
| any_succeeded_p = true; |
| if (!any_succeeded_p) |
| return; |
| |
| update_ssa (TODO_update_ssa); |
| |
| /* Simplify the new loop, which is used when COND is false. */ |
| FOR_EACH_VEC_ELT (m_loops_to_version, i, loop) |
| { |
| loop_info &linfo = get_loop_info (loop); |
| if (linfo.optimized_loop) |
| name_prop (linfo).substitute_and_fold (linfo.optimized_loop->header); |
| } |
| } |
| |
| /* Run the pass and return a set of TODO_* flags. */ |
| |
| unsigned int |
| loop_versioning::run () |
| { |
| gcc_assert (scev_initialized_p ()); |
| |
| if (analyze_blocks () |
| && prune_conditions () |
| && make_versioning_decisions ()) |
| implement_versioning_decisions (); |
| |
| return 0; |
| } |
| |
| /* Loop versioning pass. */ |
| |
| const pass_data pass_data_loop_versioning = |
| { |
| GIMPLE_PASS, /* type */ |
| "lversion", /* name */ |
| OPTGROUP_LOOP, /* optinfo_flags */ |
| TV_LOOP_VERSIONING, /* tv_id */ |
| PROP_cfg, /* properties_required */ |
| 0, /* properties_provided */ |
| 0, /* properties_destroyed */ |
| 0, /* todo_flags_start */ |
| 0, /* todo_flags_finish */ |
| }; |
| |
| class pass_loop_versioning : public gimple_opt_pass |
| { |
| public: |
| pass_loop_versioning (gcc::context *ctxt) |
| : gimple_opt_pass (pass_data_loop_versioning, ctxt) |
| {} |
| |
| /* opt_pass methods: */ |
| virtual bool gate (function *) { return flag_version_loops_for_strides; } |
| virtual unsigned int execute (function *); |
| }; |
| |
| unsigned int |
| pass_loop_versioning::execute (function *fn) |
| { |
| if (number_of_loops (fn) <= 1) |
| return 0; |
| |
| return loop_versioning (fn).run (); |
| } |
| |
| } // anon namespace |
| |
| gimple_opt_pass * |
| make_pass_loop_versioning (gcc::context *ctxt) |
| { |
| return new pass_loop_versioning (ctxt); |
| } |