| /* SLP - Basic Block Vectorization |
| Copyright (C) 2007-2019 Free Software Foundation, Inc. |
| Contributed by Dorit Naishlos <dorit@il.ibm.com> |
| and Ira Rosen <irar@il.ibm.com> |
| |
| This file is part of GCC. |
| |
| GCC is free software; you can redistribute it and/or modify it under |
| the terms of the GNU General Public License as published by the Free |
| Software Foundation; either version 3, or (at your option) any later |
| version. |
| |
| GCC is distributed in the hope that it will be useful, but WITHOUT ANY |
| WARRANTY; without even the implied warranty of MERCHANTABILITY or |
| FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
| for more details. |
| |
| You should have received a copy of the GNU General Public License |
| along with GCC; see the file COPYING3. If not see |
| <http://www.gnu.org/licenses/>. */ |
| |
| #include "config.h" |
| #include "system.h" |
| #include "coretypes.h" |
| #include "backend.h" |
| #include "target.h" |
| #include "rtl.h" |
| #include "tree.h" |
| #include "gimple.h" |
| #include "tree-pass.h" |
| #include "ssa.h" |
| #include "optabs-tree.h" |
| #include "insn-config.h" |
| #include "recog.h" /* FIXME: for insn_data */ |
| #include "params.h" |
| #include "fold-const.h" |
| #include "stor-layout.h" |
| #include "gimple-iterator.h" |
| #include "cfgloop.h" |
| #include "tree-vectorizer.h" |
| #include "langhooks.h" |
| #include "gimple-walk.h" |
| #include "dbgcnt.h" |
| #include "tree-vector-builder.h" |
| #include "vec-perm-indices.h" |
| #include "gimple-fold.h" |
| #include "internal-fn.h" |
| |
| |
| /* Recursively free the memory allocated for the SLP tree rooted at NODE. |
| FINAL_P is true if we have vectorized the instance or if we have |
| made a final decision not to vectorize the statements in any way. */ |
| |
| static void |
| vect_free_slp_tree (slp_tree node, bool final_p) |
| { |
| int i; |
| slp_tree child; |
| |
| if (--node->refcnt != 0) |
| return; |
| |
| FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) |
| vect_free_slp_tree (child, final_p); |
| |
| /* Don't update STMT_VINFO_NUM_SLP_USES if it isn't relevant. |
| Some statements might no longer exist, after having been |
| removed by vect_transform_stmt. Updating the remaining |
| statements would be redundant. */ |
| if (!final_p) |
| { |
| stmt_vec_info stmt_info; |
| FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info) |
| { |
| gcc_assert (STMT_VINFO_NUM_SLP_USES (stmt_info) > 0); |
| STMT_VINFO_NUM_SLP_USES (stmt_info)--; |
| } |
| } |
| |
| SLP_TREE_CHILDREN (node).release (); |
| SLP_TREE_SCALAR_STMTS (node).release (); |
| SLP_TREE_VEC_STMTS (node).release (); |
| SLP_TREE_LOAD_PERMUTATION (node).release (); |
| |
| free (node); |
| } |
| |
| /* Free the memory allocated for the SLP instance. FINAL_P is true if we |
| have vectorized the instance or if we have made a final decision not |
| to vectorize the statements in any way. */ |
| |
| void |
| vect_free_slp_instance (slp_instance instance, bool final_p) |
| { |
| vect_free_slp_tree (SLP_INSTANCE_TREE (instance), final_p); |
| SLP_INSTANCE_LOADS (instance).release (); |
| free (instance); |
| } |
| |
| |
| /* Create an SLP node for SCALAR_STMTS. */ |
| |
| static slp_tree |
| vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts) |
| { |
| slp_tree node; |
| stmt_vec_info stmt_info = scalar_stmts[0]; |
| unsigned int nops; |
| |
| if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt)) |
| nops = gimple_call_num_args (stmt); |
| else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt)) |
| { |
| nops = gimple_num_ops (stmt) - 1; |
| if (gimple_assign_rhs_code (stmt) == COND_EXPR) |
| nops++; |
| } |
| else if (is_a <gphi *> (stmt_info->stmt)) |
| nops = 0; |
| else |
| return NULL; |
| |
| node = XNEW (struct _slp_tree); |
| SLP_TREE_SCALAR_STMTS (node) = scalar_stmts; |
| SLP_TREE_VEC_STMTS (node).create (0); |
| SLP_TREE_NUMBER_OF_VEC_STMTS (node) = 0; |
| SLP_TREE_CHILDREN (node).create (nops); |
| SLP_TREE_LOAD_PERMUTATION (node) = vNULL; |
| SLP_TREE_TWO_OPERATORS (node) = false; |
| SLP_TREE_DEF_TYPE (node) = vect_internal_def; |
| node->refcnt = 1; |
| node->max_nunits = 1; |
| |
| unsigned i; |
| FOR_EACH_VEC_ELT (scalar_stmts, i, stmt_info) |
| STMT_VINFO_NUM_SLP_USES (stmt_info)++; |
| |
| return node; |
| } |
| |
| |
| /* This structure is used in creation of an SLP tree. Each instance |
| corresponds to the same operand in a group of scalar stmts in an SLP |
| node. */ |
| typedef struct _slp_oprnd_info |
| { |
| /* Def-stmts for the operands. */ |
| vec<stmt_vec_info> def_stmts; |
| /* Information about the first statement: its vector def-type, its |
| operand type, and whether any of its defs came from a pattern stmt. */ |
| tree first_op_type; |
| enum vect_def_type first_dt; |
| bool any_pattern; |
| } *slp_oprnd_info; |
| |
| |
| /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each |
| operand. */ |
| static vec<slp_oprnd_info> |
| vect_create_oprnd_info (int nops, int group_size) |
| { |
| int i; |
| slp_oprnd_info oprnd_info; |
| vec<slp_oprnd_info> oprnds_info; |
| |
| oprnds_info.create (nops); |
| for (i = 0; i < nops; i++) |
| { |
| oprnd_info = XNEW (struct _slp_oprnd_info); |
| oprnd_info->def_stmts.create (group_size); |
| oprnd_info->first_dt = vect_uninitialized_def; |
| oprnd_info->first_op_type = NULL_TREE; |
| oprnd_info->any_pattern = false; |
| oprnds_info.quick_push (oprnd_info); |
| } |
| |
| return oprnds_info; |
| } |
| |
| |
| /* Free operands info. */ |
| |
| static void |
| vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info) |
| { |
| int i; |
| slp_oprnd_info oprnd_info; |
| |
| FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info) |
| { |
| oprnd_info->def_stmts.release (); |
| XDELETE (oprnd_info); |
| } |
| |
| oprnds_info.release (); |
| } |
| |
| |
| /* Find the place of the data-ref in STMT_INFO in the interleaving chain |
| that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part |
| of the chain. */ |
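| /* For instance, for an interleaving chain a[0], a[1], a[3] (where |
| DR_GROUP_GAP of a[3] is 2 because a[2] is skipped), the places |
| returned are 0, 1 and 3 respectively. */ |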
| |
| int |
| vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info, |
| stmt_vec_info first_stmt_info) |
| { |
| stmt_vec_info next_stmt_info = first_stmt_info; |
| int result = 0; |
| |
| if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info)) |
| return -1; |
| |
| do |
| { |
| if (next_stmt_info == stmt_info) |
| return result; |
| next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info); |
| if (next_stmt_info) |
| result += DR_GROUP_GAP (next_stmt_info); |
| } |
| while (next_stmt_info); |
| |
| return -1; |
| } |
| |
| /* Check whether it is possible to load COUNT elements of mode ELT_MODE |
| using the method implemented by duplicate_and_interleave. Return true |
| if so, returning the number of intermediate vectors in *NVECTORS_OUT |
| (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT |
| (if nonnull). */ |
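| /* For example, with nelts == 4 the two permutations tested below are |
| the low-half and high-half interleaves of two input vectors: |
| indices1 = { 0, 4, 1, 5 } and indices2 = { 2, 6, 3, 7 }. */ |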
| |
| bool |
| can_duplicate_and_interleave_p (unsigned int count, machine_mode elt_mode, |
| unsigned int *nvectors_out, |
| tree *vector_type_out, |
| tree *permutes) |
| { |
| poly_int64 elt_bytes = count * GET_MODE_SIZE (elt_mode); |
| poly_int64 nelts; |
| unsigned int nvectors = 1; |
| for (;;) |
| { |
| scalar_int_mode int_mode; |
| poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT; |
| if (multiple_p (current_vector_size, elt_bytes, &nelts) |
| && int_mode_for_size (elt_bits, 0).exists (&int_mode)) |
| { |
| tree int_type = build_nonstandard_integer_type |
| (GET_MODE_BITSIZE (int_mode), 1); |
| tree vector_type = build_vector_type (int_type, nelts); |
| if (VECTOR_MODE_P (TYPE_MODE (vector_type))) |
| { |
| vec_perm_builder sel1 (nelts, 2, 3); |
| vec_perm_builder sel2 (nelts, 2, 3); |
| poly_int64 half_nelts = exact_div (nelts, 2); |
| for (unsigned int i = 0; i < 3; ++i) |
| { |
| sel1.quick_push (i); |
| sel1.quick_push (i + nelts); |
| sel2.quick_push (half_nelts + i); |
| sel2.quick_push (half_nelts + i + nelts); |
| } |
| vec_perm_indices indices1 (sel1, 2, nelts); |
| vec_perm_indices indices2 (sel2, 2, nelts); |
| if (can_vec_perm_const_p (TYPE_MODE (vector_type), indices1) |
| && can_vec_perm_const_p (TYPE_MODE (vector_type), indices2)) |
| { |
| if (nvectors_out) |
| *nvectors_out = nvectors; |
| if (vector_type_out) |
| *vector_type_out = vector_type; |
| if (permutes) |
| { |
| permutes[0] = vect_gen_perm_mask_checked (vector_type, |
| indices1); |
| permutes[1] = vect_gen_perm_mask_checked (vector_type, |
| indices2); |
| } |
| return true; |
| } |
| } |
| } |
| if (!multiple_p (elt_bytes, 2, &elt_bytes)) |
| return false; |
| nvectors *= 2; |
| } |
| } |
| |
| /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that |
| they are of a valid type and that they match the defs of the first stmt of |
| the SLP group (stored in OPRNDS_INFO). This function tries to match stmts |
| by swapping operands of STMTS[STMT_NUM] when possible. A non-zero *SWAP |
| on entry indicates that a swap is required for cond_expr stmts: *SWAP is 1 |
| if STMT is a cond and the operands of its comparison need to be swapped; |
| *SWAP is 2 if STMT is a cond and the code of its comparison needs to be |
| inverted. If any operands were swapped in this function, *SWAP is set to |
| a non-zero value on return. |
| If there was a fatal error, return -1; if the error could be corrected by |
| swapping operands of this node's parent, return 1; if everything is ok, |
| return 0. */ |
| static int |
| vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char *swap, |
| vec<stmt_vec_info> stmts, unsigned stmt_num, |
| vec<slp_oprnd_info> *oprnds_info) |
| { |
| stmt_vec_info stmt_info = stmts[stmt_num]; |
| tree oprnd; |
| unsigned int i, number_of_oprnds; |
| enum vect_def_type dt = vect_uninitialized_def; |
| slp_oprnd_info oprnd_info; |
| int first_op_idx = 1; |
| unsigned int commutative_op = -1U; |
| bool first_op_cond = false; |
| bool first = stmt_num == 0; |
| |
| if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt)) |
| { |
| number_of_oprnds = gimple_call_num_args (stmt); |
| first_op_idx = 3; |
| if (gimple_call_internal_p (stmt)) |
| { |
| internal_fn ifn = gimple_call_internal_fn (stmt); |
| commutative_op = first_commutative_argument (ifn); |
| } |
| } |
| else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt)) |
| { |
| enum tree_code code = gimple_assign_rhs_code (stmt); |
| number_of_oprnds = gimple_num_ops (stmt) - 1; |
| /* Swapping can only be done for cond_expr if asked to, otherwise we |
| could end up with a comparison code different from the first stmt's. */ |
| if (gimple_assign_rhs_code (stmt) == COND_EXPR |
| && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt))) |
| { |
| first_op_cond = true; |
| number_of_oprnds++; |
| } |
| else |
| commutative_op = commutative_tree_code (code) ? 0U : -1U; |
| } |
| else |
| return -1; |
| |
| bool swapped = (*swap != 0); |
| gcc_assert (!swapped || first_op_cond); |
| for (i = 0; i < number_of_oprnds; i++) |
| { |
| again: |
| if (first_op_cond) |
| { |
| /* Map indicating how operands of cond_expr should be swapped. */ |
| int maps[3][4] = {{0, 1, 2, 3}, {1, 0, 2, 3}, {0, 1, 3, 2}}; |
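| /* maps[0] leaves the operands in place, maps[1] exchanges the two |
| comparison operands, and maps[2] exchanges the then/else arms. */ |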
| int *map = maps[*swap]; |
| |
| if (i < 2) |
| oprnd = TREE_OPERAND (gimple_op (stmt_info->stmt, |
| first_op_idx), map[i]); |
| else |
| oprnd = gimple_op (stmt_info->stmt, map[i]); |
| } |
| else |
| oprnd = gimple_op (stmt_info->stmt, first_op_idx + (swapped ? !i : i)); |
| |
| oprnd_info = (*oprnds_info)[i]; |
| |
| stmt_vec_info def_stmt_info; |
| if (!vect_is_simple_use (oprnd, vinfo, &dt, &def_stmt_info)) |
| { |
| if (dump_enabled_p ()) |
| dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
| "Build SLP failed: can't analyze def for %T\n", |
| oprnd); |
| |
| return -1; |
| } |
| |
| if (def_stmt_info && is_pattern_stmt_p (def_stmt_info)) |
| oprnd_info->any_pattern = true; |
| |
| if (first) |
| { |
| oprnd_info->first_dt = dt; |
| oprnd_info->first_op_type = TREE_TYPE (oprnd); |
| } |
| else |
| { |
| /* Not first stmt of the group, check that the def-stmt/s match |
| the def-stmt/s of the first stmt. Allow different definition |
| types for reduction chains: the first stmt must be a |
| vect_reduction_def (a phi node), and the rest |
| vect_internal_def. */ |
| tree type = TREE_TYPE (oprnd); |
| if ((oprnd_info->first_dt != dt |
| && !(oprnd_info->first_dt == vect_reduction_def |
| && dt == vect_internal_def) |
| && !((oprnd_info->first_dt == vect_external_def |
| || oprnd_info->first_dt == vect_constant_def) |
| && (dt == vect_external_def |
| || dt == vect_constant_def))) |
| || !types_compatible_p (oprnd_info->first_op_type, type)) |
| { |
| /* Try swapping operands if we got a mismatch. */ |
| if (i == commutative_op && !swapped) |
| { |
| swapped = true; |
| goto again; |
| } |
| |
| if (dump_enabled_p ()) |
| dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
| "Build SLP failed: different types\n"); |
| |
| return 1; |
| } |
| if ((dt == vect_constant_def |
| || dt == vect_external_def) |
| && !current_vector_size.is_constant () |
| && (TREE_CODE (type) == BOOLEAN_TYPE |
| || !can_duplicate_and_interleave_p (stmts.length (), |
| TYPE_MODE (type)))) |
| { |
| if (dump_enabled_p ()) |
| dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
| "Build SLP failed: invalid type of def " |
| "for variable-length SLP %T\n", oprnd); |
| return -1; |
| } |
| } |
| |
| /* Check the types of the definitions. */ |
| switch (dt) |
| { |
| case vect_constant_def: |
| case vect_external_def: |
| break; |
| |
| case vect_reduction_def: |
| case vect_induction_def: |
| case vect_internal_def: |
| oprnd_info->def_stmts.quick_push (def_stmt_info); |
| break; |
| |
| default: |
| /* FORNOW: Not supported. */ |
| if (dump_enabled_p ()) |
| dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
| "Build SLP failed: illegal type of def %T\n", |
| oprnd); |
| |
| return -1; |
| } |
| } |
| |
| /* Swap operands. */ |
| if (swapped) |
| { |
| /* If there are already uses of this stmt in an SLP instance then |
| we've committed to the operand order and can't swap it. */ |
| if (STMT_VINFO_NUM_SLP_USES (stmt_info) != 0) |
| { |
| if (dump_enabled_p ()) |
| dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
| "Build SLP failed: cannot swap operands of " |
| "shared stmt %G", stmt_info->stmt); |
| return -1; |
| } |
| |
| if (first_op_cond) |
| { |
| gassign *stmt = as_a <gassign *> (stmt_info->stmt); |
| tree cond = gimple_assign_rhs1 (stmt); |
| enum tree_code code = TREE_CODE (cond); |
| |
| /* Swap. */ |
| if (*swap == 1) |
| { |
| swap_ssa_operands (stmt, &TREE_OPERAND (cond, 0), |
| &TREE_OPERAND (cond, 1)); |
| TREE_SET_CODE (cond, swap_tree_comparison (code)); |
| } |
| /* Invert. */ |
| else |
| { |
| swap_ssa_operands (stmt, gimple_assign_rhs2_ptr (stmt), |
| gimple_assign_rhs3_ptr (stmt)); |
| bool honor_nans = HONOR_NANS (TREE_OPERAND (cond, 0)); |
| code = invert_tree_comparison (TREE_CODE (cond), honor_nans); |
| gcc_assert (code != ERROR_MARK); |
| TREE_SET_CODE (cond, code); |
| } |
| } |
| else |
| { |
| unsigned int op = commutative_op + first_op_idx; |
| swap_ssa_operands (stmt_info->stmt, |
| gimple_op_ptr (stmt_info->stmt, op), |
| gimple_op_ptr (stmt_info->stmt, op + 1)); |
| } |
| if (dump_enabled_p ()) |
| dump_printf_loc (MSG_NOTE, vect_location, |
| "swapped operands to match def types in %G", |
| stmt_info->stmt); |
| } |
| |
| *swap = swapped; |
| return 0; |
| } |
| |
| /* Return true if call statements CALL1 and CALL2 are similar enough |
| to be combined into the same SLP group. */ |
| |
| static bool |
| compatible_calls_p (gcall *call1, gcall *call2) |
| { |
| unsigned int nargs = gimple_call_num_args (call1); |
| if (nargs != gimple_call_num_args (call2)) |
| return false; |
| |
| if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2)) |
| return false; |
| |
| if (gimple_call_internal_p (call1)) |
| { |
| if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)), |
| TREE_TYPE (gimple_call_lhs (call2)))) |
| return false; |
| for (unsigned int i = 0; i < nargs; ++i) |
| if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)), |
| TREE_TYPE (gimple_call_arg (call2, i)))) |
| return false; |
| } |
| else |
| { |
| if (!operand_equal_p (gimple_call_fn (call1), |
| gimple_call_fn (call2), 0)) |
| return false; |
| |
| if (gimple_call_fntype (call1) != gimple_call_fntype (call2)) |
| return false; |
| } |
| return true; |
| } |
| |
| /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the |
| caller's attempt to find the vector type in STMT_INFO with the narrowest |
| element type. Return true if VECTYPE is nonnull and if it is valid |
| for STMT_INFO. When returning true, update MAX_NUNITS to reflect the |
| number of units in VECTYPE. GROUP_SIZE and MAX_NUNITS are as for |
| vect_build_slp_tree. */ |
| |
| static bool |
| vect_record_max_nunits (stmt_vec_info stmt_info, unsigned int group_size, |
| tree vectype, poly_uint64 *max_nunits) |
| { |
| if (!vectype) |
| { |
| if (dump_enabled_p ()) |
| dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
| "Build SLP failed: unsupported data-type in %G\n", |
| stmt_info->stmt); |
| /* Fatal mismatch. */ |
| return false; |
| } |
| |
| /* If populating the vector type requires unrolling then fail |
| before adjusting *max_nunits for basic-block vectorization. */ |
| poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype); |
| unsigned HOST_WIDE_INT const_nunits; |
| if (STMT_VINFO_BB_VINFO (stmt_info) |
| && (!nunits.is_constant (&const_nunits) |
| || const_nunits > group_size)) |
| { |
| if (dump_enabled_p ()) |
| dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
| "Build SLP failed: unrolling required " |
| "in basic block SLP\n"); |
| /* Fatal mismatch. */ |
| return false; |
| } |
| |
| /* In case of multiple types we need to detect the smallest type. */ |
| vect_update_max_nunits (max_nunits, vectype); |
| return true; |
| } |
| |
| /* STMTS is a group of GROUP_SIZE SLP statements in which some |
| statements do the same operation as the first statement and in which |
| the others do ALT_STMT_CODE. Return true if we can take one vector |
| of the first operation and one vector of the second and permute them |
| to get the required result. VECTYPE is the type of the vector that |
| would be permuted. */ |
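| /* For example, for the group { a + b, a - b } and a four-element |
| vector type, the permutation below selects { 0, 5, 2, 7 }: even |
| elements from the vector of sums and odd elements from the vector |
| of differences. */ |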
| |
| static bool |
| vect_two_operations_perm_ok_p (vec<stmt_vec_info> stmts, |
| unsigned int group_size, tree vectype, |
| tree_code alt_stmt_code) |
| { |
| unsigned HOST_WIDE_INT count; |
| if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&count)) |
| return false; |
| |
| vec_perm_builder sel (count, count, 1); |
| for (unsigned int i = 0; i < count; ++i) |
| { |
| unsigned int elt = i; |
| gassign *stmt = as_a <gassign *> (stmts[i % group_size]->stmt); |
| if (gimple_assign_rhs_code (stmt) == alt_stmt_code) |
| elt += count; |
| sel.quick_push (elt); |
| } |
| vec_perm_indices indices (sel, 2, count); |
| return can_vec_perm_const_p (TYPE_MODE (vectype), indices); |
| } |
| |
| /* Verify that the scalar stmts STMTS are isomorphic, do not require |
| data permutation, and are of supported types of operation. Return |
| true if they are, otherwise return false and indicate in *MATCHES |
| which stmts are not isomorphic to the first one. If MATCHES[0] |
| is false then this indicates the comparison could not be |
| carried out or the stmts will never be vectorized by SLP. |
| |
| Note COND_EXPR is possibly isomorphic to another one after swapping its |
| operands. Set SWAP[i] to 1 if stmt I is a COND_EXPR and isomorphic to |
| the first stmt by swapping the two operands of its comparison; set |
| SWAP[i] to 2 if stmt I is isomorphic to the first stmt by inverting the |
| code of its comparison. Take A1 >= B1 ? X1 : Y1 as an example: it can |
| be swapped to (B1 <= A1 ? X1 : Y1) or inverted to (A1 < B1) ? Y1 : X1. */ |
| |
| static bool |
| vect_build_slp_tree_1 (unsigned char *swap, |
| vec<stmt_vec_info> stmts, unsigned int group_size, |
| poly_uint64 *max_nunits, bool *matches, |
| bool *two_operators) |
| { |
| unsigned int i; |
| stmt_vec_info first_stmt_info = stmts[0]; |
| enum tree_code first_stmt_code = ERROR_MARK; |
| enum tree_code alt_stmt_code = ERROR_MARK; |
| enum tree_code rhs_code = ERROR_MARK; |
| enum tree_code first_cond_code = ERROR_MARK; |
| tree lhs; |
| bool need_same_oprnds = false; |
| tree vectype = NULL_TREE, first_op1 = NULL_TREE; |
| optab optab; |
| int icode; |
| machine_mode optab_op2_mode; |
| machine_mode vec_mode; |
| stmt_vec_info first_load = NULL, prev_first_load = NULL; |
| |
| /* For every stmt in NODE find its def stmt/s. */ |
| stmt_vec_info stmt_info; |
| FOR_EACH_VEC_ELT (stmts, i, stmt_info) |
| { |
| gimple *stmt = stmt_info->stmt; |
| swap[i] = 0; |
| matches[i] = false; |
| |
| if (dump_enabled_p ()) |
| dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt); |
| |
| /* Fail to vectorize statements marked as unvectorizable. */ |
| if (!STMT_VINFO_VECTORIZABLE (stmt_info)) |
| { |
| if (dump_enabled_p ()) |
| dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
| "Build SLP failed: unvectorizable statement %G", |
| stmt); |
| /* Fatal mismatch. */ |
| matches[0] = false; |
| return false; |
| } |
| |
| lhs = gimple_get_lhs (stmt); |
| if (lhs == NULL_TREE) |
| { |
| if (dump_enabled_p ()) |
| dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
| "Build SLP failed: not GIMPLE_ASSIGN nor " |
| "GIMPLE_CALL %G", stmt); |
| /* Fatal mismatch. */ |
| matches[0] = false; |
| return false; |
| } |
| |
| tree nunits_vectype; |
| if (!vect_get_vector_types_for_stmt (stmt_info, &vectype, |
| &nunits_vectype) |
| || (nunits_vectype |
| && !vect_record_max_nunits (stmt_info, group_size, |
| nunits_vectype, max_nunits))) |
| { |
| /* Fatal mismatch. */ |
| matches[0] = false; |
| return false; |
| } |
| |
| gcc_assert (vectype); |
| |
| if (gcall *call_stmt = dyn_cast <gcall *> (stmt)) |
| { |
| rhs_code = CALL_EXPR; |
| if ((gimple_call_internal_p (call_stmt) |
| && (!vectorizable_internal_fn_p |
| (gimple_call_internal_fn (call_stmt)))) |
| || gimple_call_tail_p (call_stmt) |
| || gimple_call_noreturn_p (call_stmt) |
| || !gimple_call_nothrow_p (call_stmt) |
| || gimple_call_chain (call_stmt)) |
| { |
| if (dump_enabled_p ()) |
| dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
| "Build SLP failed: unsupported call type %G", |
| call_stmt); |
| /* Fatal mismatch. */ |
| matches[0] = false; |
| return false; |
| } |
| } |
| else |
| rhs_code = gimple_assign_rhs_code (stmt); |
| |
| /* Check the operation. */ |
| if (i == 0) |
| { |
| first_stmt_code = rhs_code; |
| |
| /* Shift arguments should be equal in all the packed stmts for a |
| vector shift with scalar shift operand. */ |
| if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR |
| || rhs_code == LROTATE_EXPR |
| || rhs_code == RROTATE_EXPR) |
| { |
| if (vectype == boolean_type_node) |
| { |
| if (dump_enabled_p ()) |
| dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
| "Build SLP failed: shift of a" |
| " boolean.\n"); |
| /* Fatal mismatch. */ |
| matches[0] = false; |
| return false; |
| } |
| |
| vec_mode = TYPE_MODE (vectype); |
| |
| /* First see if we have a vector/vector shift. */ |
| optab = optab_for_tree_code (rhs_code, vectype, |
| optab_vector); |
| |
| if (!optab |
| || optab_handler (optab, vec_mode) == CODE_FOR_nothing) |
| { |
| /* No vector/vector shift, try for a vector/scalar shift. */ |
| optab = optab_for_tree_code (rhs_code, vectype, |
| optab_scalar); |
| |
| if (!optab) |
| { |
| if (dump_enabled_p ()) |
| dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
| "Build SLP failed: no optab.\n"); |
| /* Fatal mismatch. */ |
| matches[0] = false; |
| return false; |
| } |
| icode = (int) optab_handler (optab, vec_mode); |
| if (icode == CODE_FOR_nothing) |
| { |
| if (dump_enabled_p ()) |
| dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
| "Build SLP failed: " |
| "op not supported by target.\n"); |
| /* Fatal mismatch. */ |
| matches[0] = false; |
| return false; |
| } |
| optab_op2_mode = insn_data[icode].operand[2].mode; |
| if (!VECTOR_MODE_P (optab_op2_mode)) |
| { |
| need_same_oprnds = true; |
| first_op1 = gimple_assign_rhs2 (stmt); |
| } |
| } |
| } |
| else if (rhs_code == WIDEN_LSHIFT_EXPR) |
| { |
| need_same_oprnds = true; |
| first_op1 = gimple_assign_rhs2 (stmt); |
| } |
| } |
| else |
| { |
| if (first_stmt_code != rhs_code |
| && alt_stmt_code == ERROR_MARK) |
| alt_stmt_code = rhs_code; |
| if (first_stmt_code != rhs_code |
| && (first_stmt_code != IMAGPART_EXPR |
| || rhs_code != REALPART_EXPR) |
| && (first_stmt_code != REALPART_EXPR |
| || rhs_code != IMAGPART_EXPR) |
| /* Handle mismatches in plus/minus by computing both |
| and merging the results. */ |
| && !((first_stmt_code == PLUS_EXPR |
| || first_stmt_code == MINUS_EXPR) |
| && (alt_stmt_code == PLUS_EXPR |
| || alt_stmt_code == MINUS_EXPR) |
| && rhs_code == alt_stmt_code) |
| && !(STMT_VINFO_GROUPED_ACCESS (stmt_info) |
| && (first_stmt_code == ARRAY_REF |
| || first_stmt_code == BIT_FIELD_REF |
| || first_stmt_code == INDIRECT_REF |
| || first_stmt_code == COMPONENT_REF |
| || first_stmt_code == MEM_REF))) |
| { |
| if (dump_enabled_p ()) |
| { |
| dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
| "Build SLP failed: different operation " |
| "in stmt %G", stmt); |
| dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
| "original stmt %G", first_stmt_info->stmt); |
| } |
| /* Mismatch. */ |
| continue; |
| } |
| |
| if (need_same_oprnds |
| && !operand_equal_p (first_op1, gimple_assign_rhs2 (stmt), 0)) |
| { |
| if (dump_enabled_p ()) |
| dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
| "Build SLP failed: different shift " |
| "arguments in %G", stmt); |
| /* Mismatch. */ |
| continue; |
| } |
| |
| if (rhs_code == CALL_EXPR) |
| { |
| if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt), |
| as_a <gcall *> (stmt))) |
| { |
| if (dump_enabled_p ()) |
| dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
| "Build SLP failed: different calls in %G", |
| stmt); |
| /* Mismatch. */ |
| continue; |
| } |
| } |
| } |
| |
| /* Grouped store or load. */ |
| if (STMT_VINFO_GROUPED_ACCESS (stmt_info)) |
| { |
| if (REFERENCE_CLASS_P (lhs)) |
| { |
| /* Store. */ |
| ; |
| } |
| else |
| { |
| /* Load. */ |
| first_load = DR_GROUP_FIRST_ELEMENT (stmt_info); |
| if (prev_first_load) |
| { |
| /* Check that there are no loads from different interleaving |
| chains in the same node. */ |
| if (prev_first_load != first_load) |
| { |
| if (dump_enabled_p ()) |
| dump_printf_loc (MSG_MISSED_OPTIMIZATION, |
| vect_location, |
| "Build SLP failed: different " |
| "interleaving chains in one node %G", |
| stmt); |
| /* Mismatch. */ |
| continue; |
| } |
| } |
| else |
| prev_first_load = first_load; |
| } |
| } /* Grouped access. */ |
| else |
| { |
| if (TREE_CODE_CLASS (rhs_code) == tcc_reference) |
| { |
| /* Not grouped load. */ |
| if (dump_enabled_p ()) |
| dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
| "Build SLP failed: not grouped load %G", stmt); |
| |
| /* FORNOW: Non-grouped loads are not supported. */ |
| /* Fatal mismatch. */ |
| matches[0] = false; |
| return false; |
| } |
| |
| /* Not a memory operation. */ |
| if (TREE_CODE_CLASS (rhs_code) != tcc_binary |
| && TREE_CODE_CLASS (rhs_code) != tcc_unary |
| && TREE_CODE_CLASS (rhs_code) != tcc_expression |
| && TREE_CODE_CLASS (rhs_code) != tcc_comparison |
| && rhs_code != CALL_EXPR) |
| { |
| if (dump_enabled_p ()) |
| dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
| "Build SLP failed: operation unsupported %G", |
| stmt); |
| /* Fatal mismatch. */ |
| matches[0] = false; |
| return false; |
| } |
| |
| if (rhs_code == COND_EXPR) |
| { |
| tree cond_expr = gimple_assign_rhs1 (stmt); |
| enum tree_code cond_code = TREE_CODE (cond_expr); |
| enum tree_code swap_code = ERROR_MARK; |
| enum tree_code invert_code = ERROR_MARK; |
| |
| if (i == 0) |
| first_cond_code = TREE_CODE (cond_expr); |
| else if (TREE_CODE_CLASS (cond_code) == tcc_comparison) |
| { |
| bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0)); |
| swap_code = swap_tree_comparison (cond_code); |
| invert_code = invert_tree_comparison (cond_code, honor_nans); |
| } |
| |
| if (first_cond_code == cond_code) |
| ; |
| /* Isomorphism can be achieved by swapping. */ |
| else if (first_cond_code == swap_code) |
| swap[i] = 1; |
| /* Isomorphism can be achieved by inverting. */ |
| else if (first_cond_code == invert_code) |
| swap[i] = 2; |
| else |
| { |
| if (dump_enabled_p ()) |
| dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
| "Build SLP failed: different" |
| " operation %G", stmt); |
| /* Mismatch. */ |
| continue; |
| } |
| } |
| } |
| |
| matches[i] = true; |
| } |
| |
| for (i = 0; i < group_size; ++i) |
| if (!matches[i]) |
| return false; |
| |
| /* If we allowed a two-operation SLP node verify the target can cope |
| with the permute we are going to use. */ |
| if (alt_stmt_code != ERROR_MARK |
| && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference) |
| { |
| if (vectype == boolean_type_node |
| || !vect_two_operations_perm_ok_p (stmts, group_size, |
| vectype, alt_stmt_code)) |
| { |
| for (i = 0; i < group_size; ++i) |
| if (gimple_assign_rhs_code (stmts[i]->stmt) == alt_stmt_code) |
| { |
| matches[i] = false; |
| if (dump_enabled_p ()) |
| { |
| dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
| "Build SLP failed: different operation " |
| "in stmt %G", stmts[i]->stmt); |
| dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
| "original stmt %G", first_stmt_info->stmt); |
| } |
| } |
| return false; |
| } |
| *two_operators = true; |
| } |
| |
| return true; |
| } |
| |
| /* Traits for the hash_set to record failed SLP builds for a stmt set. |
| Note that entries are never removed except at destruction time, so we |
| do not need a special value for deleted that differs from empty. */ |
| struct bst_traits |
| { |
| typedef vec <stmt_vec_info> value_type; |
| typedef vec <stmt_vec_info> compare_type; |
| static inline hashval_t hash (value_type); |
| static inline bool equal (value_type existing, value_type candidate); |
| static inline bool is_empty (value_type x) { return !x.exists (); } |
| static inline bool is_deleted (value_type x) { return !x.exists (); } |
| static inline void mark_empty (value_type &x) { x.release (); } |
| static inline void mark_deleted (value_type &x) { x.release (); } |
| static inline void remove (value_type &x) { x.release (); } |
| }; |
| inline hashval_t |
| bst_traits::hash (value_type x) |
| { |
| inchash::hash h; |
| for (unsigned i = 0; i < x.length (); ++i) |
| h.add_int (gimple_uid (x[i]->stmt)); |
| return h.end (); |
| } |
| inline bool |
| bst_traits::equal (value_type existing, value_type candidate) |
| { |
| if (existing.length () != candidate.length ()) |
| return false; |
| for (unsigned i = 0; i < existing.length (); ++i) |
| if (existing[i] != candidate[i]) |
| return false; |
| return true; |
| } |
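| /* The key hashing and equality for the map below come from bst_traits, |
| whose value_type is vec<stmt_vec_info>; the vec<gimple *> template |
| argument appears to serve only to select the default traits. */ |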
| |
| typedef hash_map <vec <gimple *>, slp_tree, |
| simple_hashmap_traits <bst_traits, slp_tree> > |
| scalar_stmts_to_slp_tree_map_t; |
| |
| static slp_tree |
| vect_build_slp_tree_2 (vec_info *vinfo, |
| vec<stmt_vec_info> stmts, unsigned int group_size, |
| poly_uint64 *max_nunits, |
| bool *matches, unsigned *npermutes, unsigned *tree_size, |
| unsigned max_tree_size, |
| scalar_stmts_to_slp_tree_map_t *bst_map); |
| |
| static slp_tree |
| vect_build_slp_tree (vec_info *vinfo, |
| vec<stmt_vec_info> stmts, unsigned int group_size, |
| poly_uint64 *max_nunits, |
| bool *matches, unsigned *npermutes, unsigned *tree_size, |
| unsigned max_tree_size, |
| scalar_stmts_to_slp_tree_map_t *bst_map) |
| { |
| if (slp_tree *leader = bst_map->get (stmts)) |
| { |
| if (dump_enabled_p ()) |
| dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n", |
| *leader ? "" : "failed ", *leader); |
| if (*leader) |
| { |
| (*leader)->refcnt++; |
| vect_update_max_nunits (max_nunits, (*leader)->max_nunits); |
| } |
| return *leader; |
| } |
| poly_uint64 this_max_nunits = 1; |
| slp_tree res = vect_build_slp_tree_2 (vinfo, stmts, group_size, |
| &this_max_nunits, |
| matches, npermutes, tree_size, |
| max_tree_size, bst_map); |
| if (res) |
| { |
| res->max_nunits = this_max_nunits; |
| vect_update_max_nunits (max_nunits, this_max_nunits); |
| /* Keep a reference for the bst_map use. */ |
| res->refcnt++; |
| } |
| bst_map->put (stmts.copy (), res); |
| return res; |
| } |
| |
| /* Recursively build an SLP tree for the group of stmts STMTS. |
| Fail (and return NULL) if the stmts or their def-stmts are not |
| isomorphic, require data permutation, or are of unsupported types |
| of operation. Otherwise return the built SLP tree, recording in |
| MATCHES which stmts failed to match the first one. */ |
| |
| static slp_tree |
| vect_build_slp_tree_2 (vec_info *vinfo, |
| vec<stmt_vec_info> stmts, unsigned int group_size, |
| poly_uint64 *max_nunits, |
| bool *matches, unsigned *npermutes, unsigned *tree_size, |
| unsigned max_tree_size, |
| scalar_stmts_to_slp_tree_map_t *bst_map) |
| { |
| unsigned nops, i, this_tree_size = 0; |
| poly_uint64 this_max_nunits = *max_nunits; |
| slp_tree node; |
| |
| matches[0] = false; |
| |
| stmt_vec_info stmt_info = stmts[0]; |
| if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt)) |
| nops = gimple_call_num_args (stmt); |
| else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt)) |
| { |
| nops = gimple_num_ops (stmt) - 1; |
| if (gimple_assign_rhs_code (stmt) == COND_EXPR) |
| nops++; |
| } |
| else if (is_a <gphi *> (stmt_info->stmt)) |
| nops = 0; |
| else |
| return NULL; |
| |
| /* If the SLP node is a PHI (induction or reduction), terminate |
| the recursion. */ |
| if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt)) |
| { |
| tree scalar_type = TREE_TYPE (PHI_RESULT (stmt)); |
| tree vectype = get_vectype_for_scalar_type (scalar_type); |
| if (!vect_record_max_nunits (stmt_info, group_size, vectype, max_nunits)) |
| return NULL; |
| |
| vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info); |
| /* Induction from different IVs is not supported. */ |
| if (def_type == vect_induction_def) |
| { |
| stmt_vec_info other_info; |
| FOR_EACH_VEC_ELT (stmts, i, other_info) |
| if (stmt_info != other_info) |
| return NULL; |
| } |
| else if (def_type == vect_reduction_def |
| || def_type == vect_double_reduction_def |
| || def_type == vect_nested_cycle) |
| { |
| /* Else def types have to match. */ |
| stmt_vec_info other_info; |
| FOR_EACH_VEC_ELT (stmts, i, other_info) |
| { |
| /* But for reduction chains only check on the first stmt. */ |
| if (!STMT_VINFO_DATA_REF (other_info) |
| && REDUC_GROUP_FIRST_ELEMENT (other_info) |
| && REDUC_GROUP_FIRST_ELEMENT (other_info) != stmt_info) |
| continue; |
| if (STMT_VINFO_DEF_TYPE (other_info) != def_type) |
| return NULL; |
| } |
| } |
| else |
| return NULL; |
| node = vect_create_new_slp_node (stmts); |
| return node; |
| } |
| |
| |
| bool two_operators = false; |
| unsigned char *swap = XALLOCAVEC (unsigned char, group_size); |
| if (!vect_build_slp_tree_1 (swap, stmts, group_size, |
| &this_max_nunits, matches, &two_operators)) |
| return NULL; |
| |
| /* If the SLP node is a load, terminate the recursion. */ |
| if (STMT_VINFO_GROUPED_ACCESS (stmt_info) |
| && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))) |
| { |
| *max_nunits = this_max_nunits; |
| node = vect_create_new_slp_node (stmts); |
| return node; |
| } |
| |
| /* Get at the operands, verifying they are compatible. */ |
| vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size); |
| slp_oprnd_info oprnd_info; |
| FOR_EACH_VEC_ELT (stmts, i, stmt_info) |
| { |
| int res = vect_get_and_check_slp_defs (vinfo, &swap[i], |
| stmts, i, &oprnds_info); |
| if (res != 0) |
| matches[(res == -1) ? 0 : i] = false; |
| if (!matches[0]) |
| break; |
| } |
| for (i = 0; i < group_size; ++i) |
| if (!matches[i]) |
| { |
| vect_free_oprnd_info (oprnds_info); |
| return NULL; |
| } |
| |
| auto_vec<slp_tree, 4> children; |
| |
| stmt_info = stmts[0]; |
| |
| if (tree_size) |
| max_tree_size -= *tree_size; |
| |
| /* Create SLP_TREE nodes for the definition node/s. */ |
| FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info) |
| { |
| slp_tree child; |
| unsigned old_tree_size = this_tree_size; |
| unsigned int j; |
| |
| if (oprnd_info->first_dt != vect_internal_def |
| && oprnd_info->first_dt != vect_reduction_def |
| && oprnd_info->first_dt != vect_induction_def) |
| continue; |
| |
| if (++this_tree_size > max_tree_size) |
| { |
| if (dump_enabled_p ()) |
| dump_printf_loc (MSG_MISSED_OPTIMIZATION, |
| vect_location, |
| "Build SLP failed: SLP tree too large\n"); |
| FOR_EACH_VEC_ELT (children, j, child) |
| vect_free_slp_tree (child, false); |
| vect_free_oprnd_info (oprnds_info); |
| return NULL; |
| } |
| |
| if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts, |
| group_size, &this_max_nunits, |
| matches, npermutes, |
| &this_tree_size, |
| max_tree_size, bst_map)) != NULL) |
| { |
| /* If we have all children of child built up from scalars then just |
| throw that away and build this node up from scalars instead. */ |
| if (!SLP_TREE_CHILDREN (child).is_empty () |
| /* ??? Rejecting patterns this way doesn't work. We'd have to |
| do extra work to cancel the pattern so the uses see the |
| scalar version. */ |
| && !oprnd_info->any_pattern) |
| { |
| slp_tree grandchild; |
| |
| FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild) |
| if (SLP_TREE_DEF_TYPE (grandchild) == vect_internal_def) |
| break; |
| if (!grandchild) |
| { |
| /* Roll back. */ |
| this_tree_size = old_tree_size; |
| FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild) |
| vect_free_slp_tree (grandchild, false); |
| SLP_TREE_CHILDREN (child).truncate (0); |
| |
| if (dump_enabled_p ()) |
| dump_printf_loc (MSG_NOTE, vect_location, |
| "Building parent vector operands from " |
| "scalars instead\n"); |
| oprnd_info->def_stmts = vNULL; |
| SLP_TREE_DEF_TYPE (child) = vect_external_def; |
| children.safe_push (child); |
| continue; |
| } |
| } |
| |
| oprnd_info->def_stmts = vNULL; |
| children.safe_push (child); |
| continue; |
| } |
| |
| /* If the SLP build failed fatally and we analyze a basic-block |
| simply treat nodes we fail to build as externally defined |
| (and thus build vectors from the scalar defs). |
| The cost model will reject outright expensive cases. |
| ??? This doesn't treat cases where permutation ultimately |
| fails (or we don't try permutation below). Ideally we'd |
| even compute a permutation that will end up with the maximum |
| SLP tree size... */ |
| if (is_a <bb_vec_info> (vinfo) |
| && !matches[0] |
| /* ??? Rejecting patterns this way doesn't work. We'd have to |
| do extra work to cancel the pattern so the uses see the |
| scalar version. */ |
| && !is_pattern_stmt_p (stmt_info) |
| && !oprnd_info->any_pattern) |
| { |
| if (dump_enabled_p ()) |
| dump_printf_loc (MSG_NOTE, vect_location, |
| "Building vector operands from scalars\n"); |
| child = vect_create_new_slp_node (oprnd_info->def_stmts); |
| SLP_TREE_DEF_TYPE (child) = vect_external_def; |
| children.safe_push (child); |
| oprnd_info->def_stmts = vNULL; |
| continue; |
| } |
| |
| /* If the SLP build for operand zero failed and operands zero |
| and one can be commuted, try that for the scalar stmts |
| that failed the match. */ |
| if (i == 0 |
| /* A first scalar stmt mismatch signals a fatal mismatch. */ |
| && matches[0] |
| /* ??? For COND_EXPRs we can swap the comparison operands |
| as well as the arms under some constraints. */ |
| && nops == 2 |
| && oprnds_info[1]->first_dt == vect_internal_def |
| && is_gimple_assign (stmt_info->stmt) |
| /* Swapping operands for reductions breaks assumptions later on. */ |
| && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def |
| && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def |
| /* Do so only if the number of unsuccessful permutes was no more |
| than a cut-off, as re-trying the recursive match on |
| possibly each level of the tree would expose exponential |
| behavior. */ |
| && *npermutes < 4) |
| { |
| /* See whether we can swap the matching or the non-matching |
| stmt operands. */ |
| bool swap_not_matching = true; |
| do |
| { |
| for (j = 0; j < group_size; ++j) |
| { |
| if (matches[j] != !swap_not_matching) |
| continue; |
| stmt_vec_info stmt_info = stmts[j]; |
| /* Verify if we can swap operands of this stmt. */ |
| gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt); |
| if (!stmt |
| || !commutative_tree_code (gimple_assign_rhs_code (stmt))) |
| { |
| if (!swap_not_matching) |
| goto fail; |
| swap_not_matching = false; |
| break; |
| } |
| /* Verify if we can safely swap or if we committed to a |
| specific operand order already. |
| ??? Instead of modifying GIMPLE stmts here we could |
| record whether we want to swap operands in the SLP |
| node and temporarily do that when processing it |
| (or wrap operand accessors in a helper). */ |
| else if (swap[j] != 0 |
| || STMT_VINFO_NUM_SLP_USES (stmt_info)) |
| { |
| if (!swap_not_matching) |
| { |
| if (dump_enabled_p ()) |
| dump_printf_loc (MSG_MISSED_OPTIMIZATION, |
| vect_location, |
| "Build SLP failed: cannot swap " |
| "operands of shared stmt %G", |
| stmts[j]->stmt); |
| goto fail; |
| } |
| swap_not_matching = false; |
| break; |
| } |
| } |
| } |
| while (j != group_size); |
| |
| /* Swap mismatched definition stmts. */ |
| if (dump_enabled_p ()) |
| dump_printf_loc (MSG_NOTE, vect_location, |
| "Re-trying with swapped operands of stmts "); |
| for (j = 0; j < group_size; ++j) |
| if (matches[j] == !swap_not_matching) |
| { |
| std::swap (oprnds_info[0]->def_stmts[j], |
| oprnds_info[1]->def_stmts[j]); |
| if (dump_enabled_p ()) |
| dump_printf (MSG_NOTE, "%d ", j); |
| } |
| if (dump_enabled_p ()) |
| dump_printf (MSG_NOTE, "\n"); |
| /* And try again with scratch 'matches' ... */ |
| bool *tem = XALLOCAVEC (bool, group_size); |
| if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts, |
| group_size, &this_max_nunits, |
| tem, npermutes, |
| &this_tree_size, |
| max_tree_size, bst_map)) != NULL) |
| { |
| /* ... so if successful we can apply the operand swapping |
| to the GIMPLE IL. This is necessary because for example |
| vect_get_slp_defs uses operand indexes and thus expects |
| canonical operand order. This is also necessary even |
| if we end up building the operand from scalars as |
| we'll continue to process swapped operand two. */ |
| for (j = 0; j < group_size; ++j) |
| gimple_set_plf (stmts[j]->stmt, GF_PLF_1, false); |
| for (j = 0; j < group_size; ++j) |
| if (matches[j] == !swap_not_matching) |
| { |
| gassign *stmt = as_a <gassign *> (stmts[j]->stmt); |
| /* Avoid swapping operands twice. */ |
| if (gimple_plf (stmt, GF_PLF_1)) |
| continue; |
| swap_ssa_operands (stmt, gimple_assign_rhs1_ptr (stmt), |
| gimple_assign_rhs2_ptr (stmt)); |
| gimple_set_plf (stmt, GF_PLF_1, true); |
| } |
| /* Verify we swap all duplicates or none. */ |
| if (flag_checking) |
| for (j = 0; j < group_size; ++j) |
| gcc_assert (gimple_plf (stmts[j]->stmt, GF_PLF_1) |
| == (matches[j] == !swap_not_matching)); |
| |
| /* If we have all children of child built up from scalars then |
| just throw that away and build this node up from scalars instead. */ |
| if (!SLP_TREE_CHILDREN (child).is_empty () |
| /* ??? Rejecting patterns this way doesn't work. We'd have |
| to do extra work to cancel the pattern so the uses see the |
| scalar version. */ |
| && !oprnd_info->any_pattern) |
| { |
| unsigned int j; |
| slp_tree grandchild; |
| |
| FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild) |
| if (SLP_TREE_DEF_TYPE (grandchild) == vect_internal_def) |
| break; |
| if (!grandchild) |
| { |
| /* Roll back. */ |
| this_tree_size = old_tree_size; |
| FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild) |
| vect_free_slp_tree (grandchild, false); |
| SLP_TREE_CHILDREN (child).truncate (0); |
| |
| if (dump_enabled_p ()) |
| dump_printf_loc (MSG_NOTE, vect_location, |
| "Building parent vector operands from " |
| "scalars instead\n"); |
| oprnd_info->def_stmts = vNULL; |
| SLP_TREE_DEF_TYPE (child) = vect_external_def; |
| children.safe_push (child); |
| continue; |
| } |
| } |
| |
| oprnd_info->def_stmts = vNULL; |
| children.safe_push (child); |
| continue; |
| } |
| |
| ++*npermutes; |
| } |
| |
| fail: |
| gcc_assert (child == NULL); |
| FOR_EACH_VEC_ELT (children, j, child) |
| vect_free_slp_tree (child, false); |
| vect_free_oprnd_info (oprnds_info); |
| return NULL; |
| } |
| |
| vect_free_oprnd_info (oprnds_info); |
| |
| if (tree_size) |
| *tree_size += this_tree_size; |
| *max_nunits = this_max_nunits; |
| |
| node = vect_create_new_slp_node (stmts); |
| SLP_TREE_TWO_OPERATORS (node) = two_operators; |
| SLP_TREE_CHILDREN (node).splice (children); |
| return node; |
| } |
| |
| /* Dump an SLP tree NODE using flags specified in DUMP_KIND. */ |
| |
| static void |
| vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc, |
| slp_tree node, hash_set<slp_tree> &visited) |
| { |
| int i; |
| stmt_vec_info stmt_info; |
| slp_tree child; |
| |
| if (visited.add (node)) |
| return; |
| |
| dump_metadata_t metadata (dump_kind, loc.get_impl_location ()); |
| dump_user_location_t user_loc = loc.get_user_location (); |
| dump_printf_loc (metadata, user_loc, "node%s %p (max_nunits=%u)\n", |
| SLP_TREE_DEF_TYPE (node) != vect_internal_def |
| ? " (external)" : "", node, |
| estimated_poly_value (node->max_nunits)); |
| FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info) |
| dump_printf_loc (metadata, user_loc, "\tstmt %d %G", i, stmt_info->stmt); |
| if (SLP_TREE_CHILDREN (node).is_empty ()) |
| return; |
| dump_printf_loc (metadata, user_loc, "\tchildren"); |
| FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) |
| dump_printf (dump_kind, " %p", (void *)child); |
| dump_printf (dump_kind, "\n"); |
| FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) |
| vect_print_slp_tree (dump_kind, loc, child, visited); |
| } |
| |
| static void |
| vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc, |
| slp_tree node) |
| { |
| hash_set<slp_tree> visited; |
| vect_print_slp_tree (dump_kind, loc, node, visited); |
| } |
| |
| /* Mark the tree rooted at NODE with PURE_SLP. */ |
| |
| static void |
| vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited) |
| { |
| int i; |
| stmt_vec_info stmt_info; |
| slp_tree child; |
| |
| if (SLP_TREE_DEF_TYPE (node) != vect_internal_def) |
| return; |
| |
| if (visited.add (node)) |
| return; |
| |
| FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info) |
| STMT_SLP_TYPE (stmt_info) = pure_slp; |
| |
| FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) |
| vect_mark_slp_stmts (child, visited); |
| } |
| |
| static void |
| vect_mark_slp_stmts (slp_tree node) |
| { |
| hash_set<slp_tree> visited; |
| vect_mark_slp_stmts (node, visited); |
| } |
| |
| /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */ |
| |
| static void |
| vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited) |
| { |
| int i; |
| stmt_vec_info stmt_info; |
| slp_tree child; |
| |
| if (SLP_TREE_DEF_TYPE (node) != vect_internal_def) |
| return; |
| |
| if (visited.add (node)) |
| return; |
| |
| FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info) |
| { |
| gcc_assert (!STMT_VINFO_RELEVANT (stmt_info) |
| || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope); |
| STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope; |
| } |
| |
| FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) |
| vect_mark_slp_stmts_relevant (child, visited); |
| } |
| |
| static void |
| vect_mark_slp_stmts_relevant (slp_tree node) |
| { |
| hash_set<slp_tree> visited; |
| vect_mark_slp_stmts_relevant (node, visited); |
| } |
| |
| |
| /* Rearrange the statements of NODE according to PERMUTATION. */ |
| |
| static void |
| vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size, |
| vec<unsigned> permutation, |
| hash_set<slp_tree> &visited) |
| { |
| stmt_vec_info stmt_info; |
| vec<stmt_vec_info> tmp_stmts; |
| unsigned int i; |
| slp_tree child; |
| |
| if (visited.add (node)) |
| return; |
| |
| FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) |
| vect_slp_rearrange_stmts (child, group_size, permutation, visited); |
| |
| gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ()); |
| tmp_stmts.create (group_size); |
| tmp_stmts.quick_grow_cleared (group_size); |
| |
| FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info) |
| tmp_stmts[permutation[i]] = stmt_info; |
| |
| SLP_TREE_SCALAR_STMTS (node).release (); |
| SLP_TREE_SCALAR_STMTS (node) = tmp_stmts; |
| } |
| |
| |
| /* Attempt to reorder stmts in a reduction chain so that we don't |
| require any load permutation. Return true if that was possible, |
| otherwise return false. */ |
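| /* For example, if every load node in the instance uses the same |
| permutation { 2, 0, 3, 1 }, the scalar stmts of all nodes are |
| reordered accordingly and the load permutations can be dropped. */ |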
| |
| static bool |
| vect_attempt_slp_rearrange_stmts (slp_instance slp_instn) |
| { |
| unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn); |
| unsigned int i, j; |
| unsigned int lidx; |
| slp_tree node, load; |
| |
| /* Compare all the permutation sequences to the first one. We know |
| that at least one load is permuted. */ |
| node = SLP_INSTANCE_LOADS (slp_instn)[0]; |
| if (!node->load_permutation.exists ()) |
| return false; |
| for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i) |
| { |
| if (!load->load_permutation.exists ()) |
| return false; |
| FOR_EACH_VEC_ELT (load->load_permutation, j, lidx) |
| if (lidx != node->load_permutation[j]) |
| return false; |
| } |
| |
| /* Check that the loads in the first sequence are different and there |
| are no gaps between them. */ |
| auto_sbitmap load_index (group_size); |
| bitmap_clear (load_index); |
| FOR_EACH_VEC_ELT (node->load_permutation, i, lidx) |
| { |
| if (lidx >= group_size) |
| return false; |
| if (bitmap_bit_p (load_index, lidx)) |
| return false; |
| |
| bitmap_set_bit (load_index, lidx); |
| } |
| for (i = 0; i < group_size; i++) |
| if (!bitmap_bit_p (load_index, i)) |
| return false; |
| |
| /* This permutation is valid for reduction. Since the order of the |
| statements in the nodes is not important unless they are memory |
| accesses, we can rearrange the statements in all the nodes |
| according to the order of the loads. */ |
| hash_set<slp_tree> visited; |
| vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size, |
| node->load_permutation, visited); |
| |
| /* We are done, no actual permutations need to be generated. */ |
| poly_uint64 unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_instn); |
| FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node) |
| { |
| stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0]; |
| first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info); |
| /* But we have to keep those permutations that are required because |
| of handling of gaps. */ |
| if (known_eq (unrolling_factor, 1U) |
| || (group_size == DR_GROUP_SIZE (first_stmt_info) |
| && DR_GROUP_GAP (first_stmt_info) == 0)) |
| SLP_TREE_LOAD_PERMUTATION (node).release (); |
| else |
| for (j = 0; j < SLP_TREE_LOAD_PERMUTATION (node).length (); ++j) |
| SLP_TREE_LOAD_PERMUTATION (node)[j] = j; |
| } |
| |
| return true; |
| } |
| |
| /* Gather the loads in the SLP graph rooted at NODE into INST's loads array. */ |
| |
| static void |
| vect_gather_slp_loads (slp_instance inst, slp_tree node, |
| hash_set<slp_tree> &visited) |
| { |
| if (visited.add (node)) |
| return; |
| |
| if (SLP_TREE_CHILDREN (node).length () == 0) |
| { |
| stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0]; |
| if (SLP_TREE_DEF_TYPE (node) == vect_internal_def |
| && STMT_VINFO_GROUPED_ACCESS (stmt_info) |
| && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))) |
| SLP_INSTANCE_LOADS (inst).safe_push (node); |
| } |
| else |
| { |
| unsigned i; |
| slp_tree child; |
| FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) |
| vect_gather_slp_loads (inst, child, visited); |
| } |
| } |
| |
| static void |
| vect_gather_slp_loads (slp_instance inst, slp_tree node) |
| { |
| hash_set<slp_tree> visited; |
| vect_gather_slp_loads (inst, node, visited); |
| } |
| |
| /* Check if the required load permutations in the SLP instance |
| SLP_INSTN are supported. */ |
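| /* For instance, in basic-block SLP a node loading a[2], a[3] from an |
| interleaving chain a[0]..a[3] forms a subchain and needs no |
| permutation, whereas loading a[3], a[2] requires one. */ |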
| |
| static bool |
| vect_supported_load_permutation_p (slp_instance slp_instn) |
| { |
| unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn); |
| unsigned int i, j, k, next; |
| slp_tree node; |
| |
| if (dump_enabled_p ()) |
| { |
| dump_printf_loc (MSG_NOTE, vect_location, "Load permutation "); |
| FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node) |
| if (node->load_permutation.exists ()) |
| FOR_EACH_VEC_ELT (node->load_permutation, j, next) |
| dump_printf (MSG_NOTE, "%d ", next); |
| else |
| for (k = 0; k < group_size; ++k) |
| dump_printf (MSG_NOTE, "%d ", k); |
| dump_printf (MSG_NOTE, "\n"); |
| } |
| |
| /* In case of reduction every load permutation is allowed, since the order |
| of the reduction statements is not important (as opposed to the case of |
| grouped stores). The only condition we need to check is that all the |
| load nodes are of the same size and have the same permutation (and then |
| rearrange all the nodes of the SLP instance according to this |
| permutation). */ |
| |
| /* Check that all the load nodes are of the same size. */ |
| /* ??? Can't we assert this? */ |
| FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node) |
| if (SLP_TREE_SCALAR_STMTS (node).length () != (unsigned) group_size) |
| return false; |
| |
| node = SLP_INSTANCE_TREE (slp_instn); |
| stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0]; |
| |
| /* Reduction (there are no data-refs in the root). |
| In a reduction chain the order of the loads is not important. */ |
| if (!STMT_VINFO_DATA_REF (stmt_info) |
| && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)) |
| vect_attempt_slp_rearrange_stmts (slp_instn); |
| |
| /* In basic block vectorization we allow any subchain of an interleaving |
| chain. |
| FORNOW: not supported in loop SLP because of realignment complications. */ |
| if (STMT_VINFO_BB_VINFO (stmt_info)) |
| { |
| /* Check whether the loads in an instance form a subchain and thus |
| no permutation is necessary. */ |
| FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node) |
| { |
| if (!SLP_TREE_LOAD_PERMUTATION (node).exists ()) |
| continue; |
| bool subchain_p = true; |
| stmt_vec_info next_load_info = NULL; |
| stmt_vec_info load_info; |
| FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info) |
| { |
| if (j != 0 |
| && (next_load_info != load_info |
| || DR_GROUP_GAP (load_info) != 1)) |
| { |
| subchain_p = false; |
| break; |
| } |
| next_load_info = DR_GROUP_NEXT_ELEMENT (load_info); |
| } |
| if (subchain_p) |
| SLP_TREE_LOAD_PERMUTATION (node).release (); |
| else |
| { |
| stmt_vec_info group_info = SLP_TREE_SCALAR_STMTS (node)[0]; |
| group_info = DR_GROUP_FIRST_ELEMENT (group_info); |
| unsigned HOST_WIDE_INT nunits; |
| unsigned k, maxk = 0; |
| FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), j, k) |
| if (k > maxk) |
| maxk = k; |
| /* In BB vectorization we may not actually use a loaded vector |
| accessing elements in excess of DR_GROUP_SIZE. */ |
| tree vectype = STMT_VINFO_VECTYPE (group_info); |
| if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits) |
| || maxk >= (DR_GROUP_SIZE (group_info) & ~(nunits - 1))) |
| { |
| if (dump_enabled_p ()) |
| dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
| "BB vectorization with gaps at the end of " |
| "a load is not supported\n"); |
| return false; |
| } |
| |
| /* Verify the permutation can be generated. */ |
| vec<tree> tem; |
| unsigned n_perms; |
| if (!vect_transform_slp_perm_load (node, tem, NULL, |
| 1, slp_instn, true, &n_perms)) |
| { |
| if (dump_enabled_p ()) |
| dump_printf_loc (MSG_MISSED_OPTIMIZATION, |
| vect_location, |
| "unsupported load permutation\n"); |
| return false; |
| } |
| } |
| } |
| return true; |
| } |
| |
| /* For loop vectorization verify we can generate the permutation. Be |
| conservative about the vectorization factor, there are permutations |
| that will use three vector inputs only starting from a specific factor |
| and the vectorization factor is not yet final. |
| ??? The SLP instance unrolling factor might not be the maximum one. */ |
| unsigned n_perms; |
| poly_uint64 test_vf |
| = force_common_multiple (SLP_INSTANCE_UNROLLING_FACTOR (slp_instn), |
| LOOP_VINFO_VECT_FACTOR |
| (STMT_VINFO_LOOP_VINFO (stmt_info))); |
| FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node) |
| if (node->load_permutation.exists () |
| && !vect_transform_slp_perm_load (node, vNULL, NULL, test_vf, |
| slp_instn, true, &n_perms)) |
| return false; |
| |
| return true; |
| } |
| |
| |
| /* Find the last scalar stmt of SLP tree NODE in program order; for the |
| root of an SLP instance this is its last store. */ |
| |
| stmt_vec_info |
| vect_find_last_scalar_stmt_in_slp (slp_tree node) |
| { |
| stmt_vec_info last = NULL; |
| stmt_vec_info stmt_vinfo; |
| |
| for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++) |
| { |
| stmt_vinfo = vect_orig_stmt (stmt_vinfo); |
| last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo; |
| } |
| |
| return last; |
| } |
| |
| /* Splits a group of stores, currently beginning at FIRST_VINFO, into |
| two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE |
| (also containing the first GROUP1_SIZE stmts, since stores are |
| consecutive), the second containing the remainder. |
| Return the first stmt in the second group. */ |
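| |
| /* A hypothetical example: splitting a gap-free group of 8 stores with |
| GROUP1_SIZE == 3 yields a first group of 3 stmts whose DR_GROUP_GAP |
| becomes 5 (it must skip over the second group) and a second group |
| of 5 stmts whose DR_GROUP_GAP becomes 3 (it must skip over the |
| first), so both groups still advance 8 elements per scalar |
| iteration. */ |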
| |
| static stmt_vec_info |
| vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size) |
| { |
| gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo); |
| gcc_assert (group1_size > 0); |
| int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size; |
| gcc_assert (group2_size > 0); |
| DR_GROUP_SIZE (first_vinfo) = group1_size; |
| |
| stmt_vec_info stmt_info = first_vinfo; |
| for (unsigned i = group1_size; i > 1; i--) |
| { |
| stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info); |
| gcc_assert (DR_GROUP_GAP (stmt_info) == 1); |
| } |
| /* STMT_INFO is now the last element of the first group. */ |
| stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info); |
| DR_GROUP_NEXT_ELEMENT (stmt_info) = 0; |
| |
| DR_GROUP_SIZE (group2) = group2_size; |
| for (stmt_info = group2; stmt_info; |
| stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info)) |
| { |
| DR_GROUP_FIRST_ELEMENT (stmt_info) = group2; |
| gcc_assert (DR_GROUP_GAP (stmt_info) == 1); |
| } |
| |
| /* For the second group, the DR_GROUP_GAP is that before the original |
| group, plus skipping over the first group. */ |
| DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size; |
| |
| /* DR_GROUP_GAP of the first group now has to skip over the second group too. */ |
| DR_GROUP_GAP (first_vinfo) += group2_size; |
| |
| if (dump_enabled_p ()) |
| dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n", |
| group1_size, group2_size); |
| |
| return group2; |
| } |
| |
| /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE |
| statements and a vector of NUNITS elements. */ |
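| |
| /* For example (illustrative numbers): with NUNITS == 4 and |
| GROUP_SIZE == 6 the least common multiple is 12, so the unrolling |
| factor is 12 / 6 == 2: two copies of the six stmts fill exactly |
| three four-element vectors. */ |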
| |
| static poly_uint64 |
| calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size) |
| { |
| return exact_div (common_multiple (nunits, group_size), group_size); |
| } |
| |
| /* Analyze an SLP instance starting from a group of grouped stores. Call |
| vect_build_slp_tree to build a tree of packed stmts if possible. |
| Return FALSE if it's impossible to SLP any stmt in the group. */ |
| |
| static bool |
| vect_analyze_slp_instance (vec_info *vinfo, |
| stmt_vec_info stmt_info, unsigned max_tree_size) |
| { |
| slp_instance new_instance; |
| slp_tree node; |
| unsigned int group_size; |
| tree vectype, scalar_type = NULL_TREE; |
| unsigned int i; |
| struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); |
| vec<stmt_vec_info> scalar_stmts; |
| |
| if (STMT_VINFO_GROUPED_ACCESS (stmt_info)) |
| { |
| scalar_type = TREE_TYPE (DR_REF (dr)); |
| vectype = get_vectype_for_scalar_type (scalar_type); |
| group_size = DR_GROUP_SIZE (stmt_info); |
| } |
| else if (!dr && REDUC_GROUP_FIRST_ELEMENT (stmt_info)) |
| { |
| gcc_assert (is_a <loop_vec_info> (vinfo)); |
| vectype = STMT_VINFO_VECTYPE (stmt_info); |
| group_size = REDUC_GROUP_SIZE (stmt_info); |
| } |
| else |
| { |
| gcc_assert (is_a <loop_vec_info> (vinfo)); |
| vectype = STMT_VINFO_VECTYPE (stmt_info); |
| group_size = as_a <loop_vec_info> (vinfo)->reductions.length (); |
| } |
| |
| if (!vectype) |
| { |
| if (dump_enabled_p ()) |
| dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
| "Build SLP failed: unsupported data-type %T\n", |
| scalar_type); |
| |
| return false; |
| } |
| poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype); |
| |
| /* Create a node (a root of the SLP tree) for the packed grouped stores. */ |
| scalar_stmts.create (group_size); |
| stmt_vec_info next_info = stmt_info; |
| if (STMT_VINFO_GROUPED_ACCESS (stmt_info)) |
| { |
| /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS. */ |
| while (next_info) |
| { |
| scalar_stmts.safe_push (vect_stmt_to_vectorize (next_info)); |
| next_info = DR_GROUP_NEXT_ELEMENT (next_info); |
| } |
| } |
| else if (!dr && REDUC_GROUP_FIRST_ELEMENT (stmt_info)) |
| { |
| /* Collect the reduction stmts and store them in |
| SLP_TREE_SCALAR_STMTS. */ |
| while (next_info) |
| { |
| scalar_stmts.safe_push (vect_stmt_to_vectorize (next_info)); |
| next_info = REDUC_GROUP_NEXT_ELEMENT (next_info); |
| } |
| /* Mark the first element of the reduction chain as reduction to properly |
| transform the node. In the reduction analysis phase only the last |
| element of the chain is marked as reduction. */ |
| STMT_VINFO_DEF_TYPE (stmt_info) = vect_reduction_def; |
| STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)) |
| = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ())); |
| } |
| else |
| { |
| /* Collect reduction statements. */ |
| vec<stmt_vec_info> reductions = as_a <loop_vec_info> (vinfo)->reductions; |
| for (i = 0; reductions.iterate (i, &next_info); i++) |
| scalar_stmts.safe_push (next_info); |
| } |
| |
| /* Build the tree for the SLP instance. */ |
| bool *matches = XALLOCAVEC (bool, group_size); |
| unsigned npermutes = 0; |
| scalar_stmts_to_slp_tree_map_t *bst_map |
| = new scalar_stmts_to_slp_tree_map_t (); |
| poly_uint64 max_nunits = nunits; |
| node = vect_build_slp_tree (vinfo, scalar_stmts, group_size, |
| &max_nunits, matches, &npermutes, |
| NULL, max_tree_size, bst_map); |
| /* The map keeps a reference on SLP nodes built, release that. */ |
| for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin (); |
| it != bst_map->end (); ++it) |
| if ((*it).second) |
| vect_free_slp_tree ((*it).second, false); |
| delete bst_map; |
| if (node != NULL) |
| { |
| /* Calculate the unrolling factor based on the smallest type. */ |
| poly_uint64 unrolling_factor |
| = calculate_unrolling_factor (max_nunits, group_size); |
| |
| if (maybe_ne (unrolling_factor, 1U) |
| && is_a <bb_vec_info> (vinfo)) |
| { |
| unsigned HOST_WIDE_INT const_max_nunits; |
| if (!max_nunits.is_constant (&const_max_nunits) |
| || const_max_nunits > group_size) |
| { |
| if (dump_enabled_p ()) |
| dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
| "Build SLP failed: store group " |
| "size not a multiple of the vector size " |
| "in basic block SLP\n"); |
| vect_free_slp_tree (node, false); |
| return false; |
| } |
| /* Fatal mismatch. */ |
| matches[group_size / const_max_nunits * const_max_nunits] = false; |
| vect_free_slp_tree (node, false); |
| } |
| else |
| { |
| /* Create a new SLP instance. */ |
| new_instance = XNEW (struct _slp_instance); |
| SLP_INSTANCE_TREE (new_instance) = node; |
| SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size; |
| SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor; |
| SLP_INSTANCE_LOADS (new_instance) = vNULL; |
| vect_gather_slp_loads (new_instance, node); |
| |
| /* Compute the load permutation. */ |
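| /* A hypothetical illustration: if this node's loads access b[1], |
| b[0], b[3], b[2] and the interleaving chain starts at b[0], the |
| computed permutation is {1, 0, 3, 2} and LOADS_PERMUTED is set; |
| in-order loads of a gap-free group of exactly GROUP_SIZE elements |
| keep no permutation at all. */ |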
| slp_tree load_node; |
| bool loads_permuted = false; |
| FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance), i, load_node) |
| { |
| vec<unsigned> load_permutation; |
| int j; |
| stmt_vec_info load_info; |
| bool this_load_permuted = false; |
| load_permutation.create (group_size); |
| stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT |
| (SLP_TREE_SCALAR_STMTS (load_node)[0]); |
| FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info) |
| { |
| int load_place = vect_get_place_in_interleaving_chain |
| (load_info, first_stmt_info); |
| gcc_assert (load_place != -1); |
| if (load_place != j) |
| this_load_permuted = true; |
| load_permutation.safe_push (load_place); |
| } |
| if (!this_load_permuted |
| /* The load requires permutation when unrolling exposes |
| a gap either because the group is larger than the SLP |
| group-size or because there is a gap between the groups. */ |
| && group_size == DR_GROUP_SIZE (first_stmt_info) |
| && DR_GROUP_GAP (first_stmt_info) == 0) |
| { |
| load_permutation.release (); |
| continue; |
| } |
| SLP_TREE_LOAD_PERMUTATION (load_node) = load_permutation; |
| loads_permuted = true; |
| } |
| |
| if (loads_permuted) |
| { |
| if (!vect_supported_load_permutation_p (new_instance)) |
| { |
| if (dump_enabled_p ()) |
| dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
| "Build SLP failed: unsupported load " |
| "permutation %G", stmt_info->stmt); |
| vect_free_slp_instance (new_instance, false); |
| return false; |
| } |
| } |
| |
| /* If the loads and stores can be handled with load/store-lanes |
| instructions, do not generate this SLP instance. */ |
| if (is_a <loop_vec_info> (vinfo) |
| && loads_permuted |
| && dr && vect_store_lanes_supported (vectype, group_size, false)) |
| { |
| slp_tree load_node; |
| FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance), i, load_node) |
| { |
| stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT |
| (SLP_TREE_SCALAR_STMTS (load_node)[0]); |
| /* Use SLP for strided accesses (or if we can't load-lanes). */ |
| if (STMT_VINFO_STRIDED_P (stmt_vinfo) |
| || ! vect_load_lanes_supported |
| (STMT_VINFO_VECTYPE (stmt_vinfo), |
| DR_GROUP_SIZE (stmt_vinfo), false)) |
| break; |
| } |
| if (i == SLP_INSTANCE_LOADS (new_instance).length ()) |
| { |
| if (dump_enabled_p ()) |
| dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
| "Built SLP cancelled: can use " |
| "load/store-lanes\n"); |
| vect_free_slp_instance (new_instance, false); |
| return false; |
| } |
| } |
| |
| vinfo->slp_instances.safe_push (new_instance); |
| |
| if (dump_enabled_p ()) |
| { |
| dump_printf_loc (MSG_NOTE, vect_location, |
| "Final SLP tree for instance:\n"); |
| vect_print_slp_tree (MSG_NOTE, vect_location, node); |
| } |
| |
| return true; |
| } |
| } |
| else |
| { |
| /* Failed to SLP. */ |
| /* Free the allocated memory. */ |
| scalar_stmts.release (); |
| } |
| |
| /* For basic block SLP, try to break the group up into multiples of the |
| vector size. */ |
| unsigned HOST_WIDE_INT const_nunits; |
| if (is_a <bb_vec_info> (vinfo) |
| && STMT_VINFO_GROUPED_ACCESS (stmt_info) |
| && DR_GROUP_FIRST_ELEMENT (stmt_info) |
| && nunits.is_constant (&const_nunits)) |
| { |
| /* We consider breaking the group only on VF boundaries from the existing |
| start. */ |
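| /* A hypothetical run: with GROUP_SIZE == 12, CONST_NUNITS == 4 and |
| the first mismatch at I == 6, we first split off and re-analyze |
| stmts 0..3 (I & ~3 == 4), then split off and skip the vector |
| holding the mismatch (stmts 4..7), and finally re-analyze the |
| remaining stmts 8..11. */ |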
| for (i = 0; i < group_size; i++) |
| if (!matches[i]) break; |
| |
| if (i >= const_nunits && i < group_size) |
| { |
| /* Split into two groups at the first vector boundary before i. */ |
| gcc_assert ((const_nunits & (const_nunits - 1)) == 0); |
| unsigned group1_size = i & ~(const_nunits - 1); |
| |
| stmt_vec_info rest = vect_split_slp_store_group (stmt_info, |
| group1_size); |
| bool res = vect_analyze_slp_instance (vinfo, stmt_info, |
| max_tree_size); |
| /* If the first non-match was in the middle of a vector, |
| skip the rest of that vector. */ |
| if (group1_size < i) |
| { |
| i = group1_size + const_nunits; |
| if (i < group_size) |
| rest = vect_split_slp_store_group (rest, const_nunits); |
| } |
| if (i < group_size) |
| res |= vect_analyze_slp_instance (vinfo, rest, max_tree_size); |
| return res; |
| } |
| /* Even though the first vector did not all match, we might be able to SLP |
| (some) of the remainder. FORNOW ignore this possibility. */ |
| } |
| |
| return false; |
| } |
| |
| |
| /* Check if there are stmts in the loop that can be vectorized using SLP. |
| Build SLP trees of packed scalar stmts if SLP is possible. */ |
| |
| opt_result |
| vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size) |
| { |
| unsigned int i; |
| stmt_vec_info first_element; |
| |
| DUMP_VECT_SCOPE ("vect_analyze_slp"); |
| |
| /* Find SLP sequences starting from groups of grouped stores. */ |
| FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element) |
| vect_analyze_slp_instance (vinfo, first_element, max_tree_size); |
| |
| if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo)) |
| { |
| if (loop_vinfo->reduction_chains.length () > 0) |
| { |
| /* Find SLP sequences starting from reduction chains. */ |
| FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element) |
| if (! vect_analyze_slp_instance (vinfo, first_element, |
| max_tree_size)) |
| { |
| /* Dissolve reduction chain group. */ |
| stmt_vec_info vinfo = first_element; |
| while (vinfo) |
| { |
| stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo); |
| REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL; |
| REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL; |
| vinfo = next; |
| } |
| STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def; |
| } |
| } |
| |
| /* Find SLP sequences starting from groups of reductions. */ |
| if (loop_vinfo->reductions.length () > 1) |
| vect_analyze_slp_instance (vinfo, loop_vinfo->reductions[0], |
| max_tree_size); |
| } |
| |
| return opt_result::success (); |
| } |
| |
| |
| /* For each possible SLP instance decide whether to SLP it and calculate |
| the overall unrolling factor needed to SLP the loop. Return TRUE if we |
| decided to SLP at least one instance. */ |
| |
| bool |
| vect_make_slp_decision (loop_vec_info loop_vinfo) |
| { |
| unsigned int i; |
| poly_uint64 unrolling_factor = 1; |
| vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo); |
| slp_instance instance; |
| int decided_to_slp = 0; |
| |
| DUMP_VECT_SCOPE ("vect_make_slp_decision"); |
| |
| FOR_EACH_VEC_ELT (slp_instances, i, instance) |
| { |
| /* FORNOW: SLP if you can. */ |
| /* All unroll factors have the form current_vector_size * X for some |
| rational X, so they must have a common multiple. */ |
| unrolling_factor |
| = force_common_multiple (unrolling_factor, |
| SLP_INSTANCE_UNROLLING_FACTOR (instance)); |
| |
| /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we |
| call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and |
| loop-based vectorization. Such stmts will be marked as HYBRID. */ |
| vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance)); |
| decided_to_slp++; |
| } |
| |
| LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor; |
| |
| if (decided_to_slp && dump_enabled_p ()) |
| { |
| dump_printf_loc (MSG_NOTE, vect_location, |
| "Decided to SLP %d instances. Unrolling factor ", |
| decided_to_slp); |
| dump_dec (MSG_NOTE, unrolling_factor); |
| dump_printf (MSG_NOTE, "\n"); |
| } |
| |
| return (decided_to_slp > 0); |
| } |
| |
| |
| /* Private data for vect_detect_hybrid_slp. */ |
| struct vdhs_data |
| { |
| loop_vec_info loop_vinfo; |
| vec<stmt_vec_info> *worklist; |
| }; |
| |
| /* Walker for walk_gimple_op. */ |
| |
| static tree |
| vect_detect_hybrid_slp (tree *tp, int *, void *data) |
| { |
| walk_stmt_info *wi = (walk_stmt_info *)data; |
| vdhs_data *dat = (vdhs_data *)wi->info; |
| |
| if (wi->is_lhs) |
| return NULL_TREE; |
| |
| stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp); |
| if (!def_stmt_info) |
| return NULL_TREE; |
| def_stmt_info = vect_stmt_to_vectorize (def_stmt_info); |
| if (PURE_SLP_STMT (def_stmt_info)) |
| { |
| if (dump_enabled_p ()) |
| dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G", |
| def_stmt_info->stmt); |
| STMT_SLP_TYPE (def_stmt_info) = hybrid; |
| dat->worklist->safe_push (def_stmt_info); |
| } |
| |
| return NULL_TREE; |
| } |
| |
| /* Find stmts that must be both vectorized and SLPed. */ |
| |
| void |
| vect_detect_hybrid_slp (loop_vec_info loop_vinfo) |
| { |
| DUMP_VECT_SCOPE ("vect_detect_hybrid_slp"); |
| |
| /* All stmts participating in SLP are marked pure_slp, all other |
| stmts are loop_vect. |
| First collect all loop_vect stmts into a worklist. */ |
| auto_vec<stmt_vec_info> worklist; |
| for (unsigned i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i) |
| { |
| basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i]; |
| for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi); |
| gsi_next (&gsi)) |
| { |
| gphi *phi = gsi.phi (); |
| stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi); |
| if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info)) |
| worklist.safe_push (stmt_info); |
| } |
| for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi); |
| gsi_next (&gsi)) |
| { |
| gimple *stmt = gsi_stmt (gsi); |
| stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt); |
| if (STMT_VINFO_IN_PATTERN_P (stmt_info)) |
| { |
| for (gimple_stmt_iterator gsi2 |
| = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info)); |
| !gsi_end_p (gsi2); gsi_next (&gsi2)) |
| { |
| stmt_vec_info patt_info |
| = loop_vinfo->lookup_stmt (gsi_stmt (gsi2)); |
| if (!STMT_SLP_TYPE (patt_info) |
| && STMT_VINFO_RELEVANT (patt_info)) |
| worklist.safe_push (patt_info); |
| } |
| stmt_info = STMT_VINFO_RELATED_STMT (stmt_info); |
| } |
| if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info)) |
| worklist.safe_push (stmt_info); |
| } |
| } |
| |
| /* Now we have a worklist of non-SLP stmts, follow use->def chains and |
| mark any SLP vectorized stmt as hybrid. |
| ??? We're visiting def stmts N times (once for each non-SLP and |
| once for each hybrid-SLP use). */ |
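| /* A hypothetical scenario: if a PURE_SLP load also feeds a stmt that |
| is only loop-vectorized, the walk below reaches the load from that |
| loop_vect stmt, marks it hybrid and re-queues it so that its own |
| operands are processed the same way. */ |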
| walk_stmt_info wi; |
| vdhs_data dat; |
| dat.worklist = &worklist; |
| dat.loop_vinfo = loop_vinfo; |
| memset (&wi, 0, sizeof (wi)); |
| wi.info = (void *)&dat; |
| while (!worklist.is_empty ()) |
| { |
| stmt_vec_info stmt_info = worklist.pop (); |
| /* Since SSA operands are not set up for pattern stmts we need |
| to use walk_gimple_op. */ |
| wi.is_lhs = 0; |
| walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi); |
| } |
| } |
| |
| |
| /* Initialize a bb_vec_info struct for the statements between |
| REGION_BEGIN_IN (inclusive) and REGION_END_IN (exclusive). */ |
| |
| _bb_vec_info::_bb_vec_info (gimple_stmt_iterator region_begin_in, |
| gimple_stmt_iterator region_end_in, |
| vec_info_shared *shared) |
| : vec_info (vec_info::bb, init_cost (NULL), shared), |
| bb (gsi_bb (region_begin_in)), |
| region_begin (region_begin_in), |
| region_end (region_end_in) |
| { |
| gimple_stmt_iterator gsi; |
| |
| for (gsi = region_begin; gsi_stmt (gsi) != gsi_stmt (region_end); |
| gsi_next (&gsi)) |
| { |
| gimple *stmt = gsi_stmt (gsi); |
| gimple_set_uid (stmt, 0); |
| add_stmt (stmt); |
| } |
| |
| bb->aux = this; |
| } |
| |
| |
| /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the |
| stmts in the basic block. */ |
| |
| _bb_vec_info::~_bb_vec_info () |
| { |
| for (gimple_stmt_iterator si = region_begin; |
| gsi_stmt (si) != gsi_stmt (region_end); gsi_next (&si)) |
| /* Reset region marker. */ |
| gimple_set_uid (gsi_stmt (si), -1); |
| |
| bb->aux = NULL; |
| } |
| |
| /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE, |
| given that its child nodes have already been processed and that |
| their def types currently match their SLP node's def type. */ |
| |
| static bool |
| vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node, |
| slp_instance node_instance, |
| stmt_vector_for_cost *cost_vec) |
| { |
| stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0]; |
| gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect); |
| |
| /* For BB vectorization vector types are assigned here. |
| Memory accesses already got their vector type assigned |
| in vect_analyze_data_refs. */ |
| bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info); |
| if (bb_vinfo |
| && ! STMT_VINFO_DATA_REF (stmt_info)) |
| { |
| tree vectype, nunits_vectype; |
| if (!vect_get_vector_types_for_stmt (stmt_info, &vectype, |
| &nunits_vectype)) |
| /* We checked this when building the node. */ |
| gcc_unreachable (); |
| if (vectype == boolean_type_node) |
| { |
| vectype = vect_get_mask_type_for_stmt (stmt_info); |
| if (!vectype) |
| /* vect_get_mask_type_for_stmt has already explained the |
| failure. */ |
| return false; |
| } |
| |
| stmt_vec_info sstmt_info; |
| unsigned int i; |
| FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, sstmt_info) |
| STMT_VINFO_VECTYPE (sstmt_info) = vectype; |
| } |
| |
| /* Calculate the number of vector statements to be created for the |
| scalar stmts in this node. For SLP reductions it is equal to the |
| number of vector statements in the children (which has already been |
| calculated by the recursive call). Otherwise it is the number of |
| scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by |
| VF divided by the number of elements in a vector. */ |
| if (!STMT_VINFO_GROUPED_ACCESS (stmt_info) |
| && REDUC_GROUP_FIRST_ELEMENT (stmt_info)) |
| SLP_TREE_NUMBER_OF_VEC_STMTS (node) |
| = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[0]); |
| else |
| { |
| poly_uint64 vf; |
| if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo)) |
| vf = loop_vinfo->vectorization_factor; |
| else |
| vf = 1; |
| unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (node_instance); |
| tree vectype = STMT_VINFO_VECTYPE (stmt_info); |
| SLP_TREE_NUMBER_OF_VEC_STMTS (node) |
| = vect_get_num_vectors (vf * group_size, vectype); |
| } |
| |
| bool dummy; |
| return vect_analyze_stmt (stmt_info, &dummy, node, node_instance, cost_vec); |
| } |
| |
| /* Analyze statements contained in SLP tree NODE after recursively analyzing |
| the subtree. NODE_INSTANCE contains NODE and VINFO contains NODE_INSTANCE. |
| |
| Return true if the operations are supported. */ |
| |
| static bool |
| vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node, |
| slp_instance node_instance, |
| scalar_stmts_to_slp_tree_map_t *visited, |
| scalar_stmts_to_slp_tree_map_t *lvisited, |
| stmt_vector_for_cost *cost_vec) |
| { |
| int i, j; |
| slp_tree child; |
| |
| if (SLP_TREE_DEF_TYPE (node) != vect_internal_def) |
| return true; |
| |
| /* If we already analyzed the exact same set of scalar stmts we're done. |
| We share the generated vector stmts for those. */ |
| slp_tree *leader; |
| if ((leader = visited->get (SLP_TREE_SCALAR_STMTS (node))) |
| || (leader = lvisited->get (SLP_TREE_SCALAR_STMTS (node)))) |
| { |
| SLP_TREE_NUMBER_OF_VEC_STMTS (node) |
| = SLP_TREE_NUMBER_OF_VEC_STMTS (*leader); |
| return true; |
| } |
| |
| /* The SLP graph is acyclic, so not caching whether we failed or |
| succeeded doesn't cause any issues: we throw away the lvisited set |
| when we fail. */ |
| lvisited->put (SLP_TREE_SCALAR_STMTS (node).copy (), node); |
| |
| FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) |
| if (!vect_slp_analyze_node_operations (vinfo, child, node_instance, |
| visited, lvisited, cost_vec)) |
| return false; |
| |
| /* ??? We have to catch late the case where the same first scalar stmt |
| appears in multiple SLP children with a different def type, and fail |
| then. Remember the original def types first since SLP_TREE_DEF_TYPE |
| doesn't necessarily match it when that is vect_internal_def. */ |
| auto_vec<vect_def_type, 4> dt; |
| dt.safe_grow (SLP_TREE_CHILDREN (node).length ()); |
| FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child) |
| dt[j] = STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0]); |
| |
| /* Push SLP node def-type to stmt operands. */ |
| FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child) |
| if (SLP_TREE_DEF_TYPE (child) != vect_internal_def) |
| STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0]) |
| = SLP_TREE_DEF_TYPE (child); |
| |
| /* Check everything worked out. */ |
| bool res = true; |
| FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child) |
| if (SLP_TREE_DEF_TYPE (child) != vect_internal_def) |
| { |
| if (STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0]) |
| != SLP_TREE_DEF_TYPE (child)) |
| res = false; |
| } |
| else if (STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0]) != dt[j]) |
| res = false; |
| if (!res && dump_enabled_p ()) |
| dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
| "not vectorized: same operand with different " |
| "def type in stmt.\n"); |
| |
| if (res) |
| res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance, |
| cost_vec); |
| |
| /* Restore def-types. */ |
| FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child) |
| STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0]) = dt[j]; |
| |
| return res; |
| } |
| |
| |
| /* Analyze statements in SLP instances of VINFO. Return true if the |
| operations are supported. */ |
| |
| bool |
| vect_slp_analyze_operations (vec_info *vinfo) |
| { |
| slp_instance instance; |
| int i; |
| |
| DUMP_VECT_SCOPE ("vect_slp_analyze_operations"); |
| |
| scalar_stmts_to_slp_tree_map_t *visited |
| = new scalar_stmts_to_slp_tree_map_t (); |
| for (i = 0; vinfo->slp_instances.iterate (i, &instance); ) |
| { |
| scalar_stmts_to_slp_tree_map_t lvisited; |
| stmt_vector_for_cost cost_vec; |
| cost_vec.create (2); |
| if (!vect_slp_analyze_node_operations (vinfo, |
| SLP_INSTANCE_TREE (instance), |
| instance, visited, &lvisited, |
| &cost_vec)) |
| { |
| slp_tree node = SLP_INSTANCE_TREE (instance); |
| stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0]; |
| if (dump_enabled_p ()) |
| dump_printf_loc (MSG_NOTE, vect_location, |
| "removing SLP instance operations starting from: %G", |
| stmt_info->stmt); |
| vect_free_slp_instance (instance, false); |
| vinfo->slp_instances.ordered_remove (i); |
| cost_vec.release (); |
| } |
| else |
| { |
| for (scalar_stmts_to_slp_tree_map_t::iterator x = lvisited.begin(); |
| x != lvisited.end(); ++x) |
| visited->put ((*x).first.copy (), (*x).second); |
| i++; |
| |
| add_stmt_costs (vinfo->target_cost_data, &cost_vec); |
| cost_vec.release (); |
| } |
| } |
| delete visited; |
| |
| return !vinfo->slp_instances.is_empty (); |
| } |
| |
| |
| /* Compute the scalar cost of the SLP node NODE and its children |
| and record it in COST_VEC. Do not account defs that are marked |
| in LIFE and update LIFE according to uses of NODE. */ |
| |
| static void |
| vect_bb_slp_scalar_cost (basic_block bb, |
| slp_tree node, vec<bool, va_heap> *life, |
| stmt_vector_for_cost *cost_vec, |
| hash_set<slp_tree> &visited) |
| { |
| unsigned i; |
| stmt_vec_info stmt_info; |
| slp_tree child; |
| |
| if (visited.add (node)) |
| return; |
| |
| FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info) |
| { |
| gimple *stmt = stmt_info->stmt; |
| vec_info *vinfo = stmt_info->vinfo; |
| ssa_op_iter op_iter; |
| def_operand_p def_p; |
| |
| if ((*life)[i]) |
| continue; |
| |
| /* If there is a non-vectorized use of the defs then the scalar |
| stmt is kept live in which case we do not account it or any |
| required defs in the SLP children in the scalar cost. This |
| way we make the vectorization more costly when compared to |
| the scalar cost. */ |
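| /* For instance (hypothetical): if the value defined by stmt I is |
| also consumed by a statement that is not vectorized as part of |
| the SLP instance, (*LIFE)[I] is set below and neither stmt I nor |
| the defs it requires from the SLP children are charged to the |
| scalar cost. */ |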
| FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF) |
| { |
| imm_use_iterator use_iter; |
| gimple *use_stmt; |
| FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p)) |
| if (!is_gimple_debug (use_stmt)) |
| { |
| stmt_vec_info use_stmt_info = vinfo->lookup_stmt (use_stmt); |
| if (!use_stmt_info || !PURE_SLP_STMT (use_stmt_info)) |
| { |
| (*life)[i] = true; |
| BREAK_FROM_IMM_USE_STMT (use_iter); |
| } |
| } |
| } |
| if ((*life)[i]) |
| continue; |
| |
| /* Count scalar stmts only once. */ |
| if (gimple_visited_p (stmt)) |
| continue; |
| gimple_set_visited (stmt, true); |
| |
| vect_cost_for_stmt kind; |
| if (STMT_VINFO_DATA_REF (stmt_info)) |
| { |
| if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))) |
| kind = scalar_load; |
| else |
| kind = scalar_store; |
| } |
| else |
| kind = scalar_stmt; |
| record_stmt_cost (cost_vec, 1, kind, stmt_info, 0, vect_body); |
| } |
| |
| auto_vec<bool, 20> subtree_life; |
| FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) |
| { |
| if (SLP_TREE_DEF_TYPE (child) == vect_internal_def) |
| { |
| /* Do not directly pass LIFE to the recursive call, copy it to |
| confine changes in the callee to the current child/subtree. */ |
| subtree_life.safe_splice (*life); |
| vect_bb_slp_scalar_cost (bb, child, &subtree_life, cost_vec, |
| visited); |
| subtree_life.truncate (0); |
| } |
| } |
| } |
| |
| static void |
| vect_bb_slp_scalar_cost (basic_block bb, |
| slp_tree node, vec<bool, va_heap> *life, |
| stmt_vector_for_cost *cost_vec) |
| { |
| hash_set<slp_tree> visited; |
| vect_bb_slp_scalar_cost (bb, node, life, cost_vec, visited); |
| } |
| |
| /* Check if vectorization of the basic block is profitable. */ |
| |
| static bool |
| vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo) |
| { |
| vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo); |
| slp_instance instance; |
| int i; |
| unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0; |
| unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0; |
| |
| /* Calculate scalar cost. */ |
| stmt_vector_for_cost scalar_costs; |
| scalar_costs.create (0); |
| FOR_EACH_VEC_ELT (slp_instances, i, instance) |
| { |
| auto_vec<bool, 20> life; |
| life.safe_grow_cleared (SLP_INSTANCE_GROUP_SIZE (instance)); |
| vect_bb_slp_scalar_cost (BB_VINFO_BB (bb_vinfo), |
| SLP_INSTANCE_TREE (instance), |
| &life, &scalar_costs); |
| } |
| void *target_cost_data = init_cost (NULL); |
| add_stmt_costs (target_cost_data, &scalar_costs); |
| scalar_costs.release (); |
| unsigned dummy; |
| finish_cost (target_cost_data, &dummy, &scalar_cost, &dummy); |
| destroy_cost_data (target_cost_data); |
| |
| /* Unset visited flag. */ |
| for (gimple_stmt_iterator gsi = bb_vinfo->region_begin; |
| gsi_stmt (gsi) != gsi_stmt (bb_vinfo->region_end); gsi_next (&gsi)) |
| gimple_set_visited (gsi_stmt (gsi), false); |
| |
| /* Complete the target-specific cost calculation. */ |
| finish_cost (BB_VINFO_TARGET_COST_DATA (bb_vinfo), &vec_prologue_cost, |
| &vec_inside_cost, &vec_epilogue_cost); |
| |
| vec_outside_cost = vec_prologue_cost + vec_epilogue_cost; |
| |
| if (dump_enabled_p ()) |
| { |
| dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n"); |
| dump_printf (MSG_NOTE, " Vector inside of basic block cost: %d\n", |
| vec_inside_cost); |
| dump_printf (MSG_NOTE, " Vector prologue cost: %d\n", vec_prologue_cost); |
| dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n", vec_epilogue_cost); |
| dump_printf (MSG_NOTE, " Scalar cost of basic block: %d\n", scalar_cost); |
| } |
| |
| /* Vectorization is profitable if its cost is no more than the cost |
| of the scalar version. Note that we err on the vector side for |
| equal cost because the cost estimate is otherwise quite pessimistic |
| (constant uses are free on the scalar side but cost a load on the |
| vector side for example). */ |
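| /* Illustrative numbers (not from a real target): with |
| VEC_INSIDE_COST == 4, VEC_PROLOGUE_COST == 1 and |
| VEC_EPILOGUE_COST == 1 the vector cost is 6, so the block is |
| vectorized as long as SCALAR_COST is at least 6; equal cost |
| still vectorizes per the note above. */ |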
| if (vec_outside_cost + vec_inside_cost > scalar_cost) |
| return false; |
| |
| return true; |
| } |
| |
| /* Check if the basic block can be vectorized. Returns a bb_vec_info |
| if so, and sets FATAL to true if failure is independent of |
| current_vector_size. */ |
| |
| static bb_vec_info |
| vect_slp_analyze_bb_1 (gimple_stmt_iterator region_begin, |
| gimple_stmt_iterator region_end, |
| vec<data_reference_p> datarefs, int n_stmts, |
| bool &fatal, vec_info_shared *shared) |
| { |
| DUMP_VECT_SCOPE ("vect_slp_analyze_bb"); |
| |
| bb_vec_info bb_vinfo; |
| slp_instance instance; |
| int i; |
| poly_uint64 min_vf = 2; |
| |
| /* The first group of checks is independent of the vector size. */ |
| fatal = true; |
| |
| if (n_stmts > PARAM_VALUE (PARAM_SLP_MAX_INSNS_IN_BB)) |
| { |
| if (dump_enabled_p ()) |
| dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
| "not vectorized: too many instructions in " |
| "basic block.\n"); |
| free_data_refs (datarefs); |
| return NULL; |
| } |
| |
| bb_vinfo = new _bb_vec_info (region_begin, region_end, shared); |
| if (!bb_vinfo) |
| return NULL; |
| |
| BB_VINFO_DATAREFS (bb_vinfo) = datarefs; |
| bb_vinfo->shared->save_datarefs (); |
| |
| /* Analyze the data references. */ |
| |
| if (!vect_analyze_data_refs (bb_vinfo, &min_vf)) |
| { |
| if (dump_enabled_p ()) |
| dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
| "not vectorized: unhandled data-ref in basic " |
| "block.\n"); |
| |
| delete bb_vinfo; |
| return NULL; |
| } |
| |
| if (BB_VINFO_DATAREFS (bb_vinfo).length () < 2) |
| { |
| if (dump_enabled_p ()) |
| dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
| "not vectorized: not enough data-refs in " |
| "basic block.\n"); |
| |
| delete bb_vinfo; |
| return NULL; |
| } |
| |
| if (!vect_analyze_data_ref_accesses (bb_vinfo)) |
| { |
| if (dump_enabled_p ()) |
| dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
| "not vectorized: unhandled data access in " |
| "basic block.\n"); |
| |
| delete bb_vinfo; |
| return NULL; |
| } |
| |
| /* If there are no grouped stores in the region there is no need |
| to continue with pattern recog as vect_analyze_slp will fail |
| anyway. */ |
| if (bb_vinfo->grouped_stores.is_empty ()) |
| { |
| if (dump_enabled_p ()) |
| dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
| "not vectorized: no grouped stores in " |
| "basic block.\n"); |
| |
| delete bb_vinfo; |
| return NULL; |
| } |
| |
| /* The rest of the analysis below depends on the vector size in some way. */ |
| fatal = false; |
| |
| vect_pattern_recog (bb_vinfo); |
| |
| /* Check the SLP opportunities in the basic block, analyze and build SLP |
| trees. */ |
| if (!vect_analyze_slp (bb_vinfo, n_stmts)) |
| { |
| if (dump_enabled_p ()) |
| { |
| dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
| "Failed to SLP the basic block.\n"); |
| dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
| "not vectorized: failed to find SLP opportunities " |
| "in basic block.\n"); |
| } |
| |
| delete bb_vinfo; |
| return NULL; |
| } |
| |
| vect_record_base_alignments (bb_vinfo); |
| |
| /* Analyze and verify the alignment of data references and the |
| dependence in the SLP instances. */ |
| for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); ) |
| { |
| if (! vect_slp_analyze_and_verify_instance_alignment (instance) |
| || ! vect_slp_analyze_instance_dependence (instance)) |
| { |
| slp_tree node = SLP_INSTANCE_TREE (instance); |
| stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0]; |
| if (dump_enabled_p ()) |
| dump_printf_loc (MSG_NOTE, vect_location, |
| "removing SLP instance operations starting from: %G", |
| stmt_info->stmt); |
| vect_free_slp_instance (instance, false); |
| BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i); |
| continue; |
| } |
| |
| /* Mark all the statements that we want to vectorize as pure SLP and |
| relevant. */ |
| vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance)); |
| vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance)); |
| |
| i++; |
| } |
| if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ()) |
| { |
| delete bb_vinfo; |
| return NULL; |
| } |
| |
| if (!vect_slp_analyze_operations (bb_vinfo)) |
| { |
| if (dump_enabled_p ()) |
| dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
| "not vectorized: bad operation in basic block.\n"); |
| |
| delete bb_vinfo; |
| return NULL; |
| } |
| |
| /* Cost model: check if the vectorization is worthwhile. */ |
| if (!unlimited_cost_model (NULL) |
| && !vect_bb_vectorization_profitable_p (bb_vinfo)) |
| { |
| if (dump_enabled_p ()) |
| dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
| "not vectorized: vectorization is not " |
| "profitable.\n"); |
| |
| delete bb_vinfo; |
| return NULL; |
| } |
| |
| if (dump_enabled_p ()) |
| dump_printf_loc (MSG_NOTE, vect_location, |
| "Basic block will be vectorized using SLP\n"); |
| |
| return bb_vinfo; |
| } |
| |
| |
| /* Main entry for the BB vectorizer. Analyze and transform BB; return |
| true if anything in the basic block was vectorized. */ |
| |
| bool |
| vect_slp_bb (basic_block bb) |
| { |
| bb_vec_info bb_vinfo; |
| gimple_stmt_iterator gsi; |
| bool any_vectorized = false; |
| auto_vector_sizes vector_sizes; |
| |
| /* Autodetect first vector size we try. */ |
| current_vector_size = 0; |
| targetm.vectorize.autovectorize_vector_sizes (&vector_sizes); |
| unsigned int next_size = 0; |
| |
| gsi = gsi_start_bb (bb); |
| |
| poly_uint64 autodetected_vector_size = 0; |
| while (1) |
| { |
| if (gsi_end_p (gsi)) |
| break; |
| |
| gimple_stmt_iterator region_begin = gsi; |
| vec<data_reference_p> datarefs = vNULL; |
| int insns = 0; |
| |
| for (; !gsi_end_p (gsi); gsi_next (&gsi)) |
| { |
| gimple *stmt = gsi_stmt (gsi); |
| if (is_gimple_debug (stmt)) |
| continue; |
| insns++; |
| |
| if (gimple_location (stmt) != UNKNOWN_LOCATION) |
| vect_location = stmt; |
| |
| if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs)) |
| break; |
| } |
| |
| /* Skip leading unhandled stmts. */ |
| if (gsi_stmt (region_begin) == gsi_stmt (gsi)) |
| { |
| gsi_next (&gsi); |
| continue; |
| } |
| |
| gimple_stmt_iterator region_end = gsi; |
| |
| bool vectorized = false; |
| bool fatal = false; |
| vec_info_shared shared; |
| bb_vinfo = vect_slp_analyze_bb_1 (region_begin, region_end, |
| datarefs, insns, fatal, &shared); |
| if (bb_vinfo |
| && dbg_cnt (vect_slp)) |
| { |
| if (dump_enabled_p ()) |
| dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n"); |
| |
| bb_vinfo->shared->check_datarefs (); |
| vect_schedule_slp (bb_vinfo); |
| |
| unsigned HOST_WIDE_INT bytes; |
| if (dump_enabled_p ()) |
| { |
| if (current_vector_size.is_constant (&bytes)) |
| dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location, |
| "basic block part vectorized using %wu byte " |
| "vectors\n", bytes); |
| else |
| dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location, |
| "basic block part vectorized using variable " |
| "length vectors\n"); |
| } |
| |
| vectorized = true; |
| } |
| delete bb_vinfo; |
| |
| any_vectorized |= vectorized; |
| |
| if (next_size == 0) |
| autodetected_vector_size = current_vector_size; |
| |
| if (next_size < vector_sizes.length () |
| && known_eq (vector_sizes[next_size], autodetected_vector_size)) |
| next_size += 1; |
| |
| if (vectorized |
| || next_size == vector_sizes.length () |
| || known_eq (current_vector_size, 0U) |
| /* If vect_slp_analyze_bb_1 signaled that analysis for all |
| vector sizes will fail do not bother iterating. */ |
| || fatal) |
| { |
| if (gsi_end_p (region_end)) |
| break; |
| |
| /* Skip the unhandled stmt. */ |
| gsi_next (&gsi); |
| |
| /* And reset vector sizes. */ |
| current_vector_size = 0; |
| next_size = 0; |
| } |
| else |
| { |
| /* Try the next biggest vector size. */ |
| current_vector_size = vector_sizes[next_size++]; |
| if (dump_enabled_p ()) |
| { |
| dump_printf_loc (MSG_NOTE, vect_location, |
| "***** Re-trying analysis with " |
| "vector size "); |
| dump_dec (MSG_NOTE, current_vector_size); |
| dump_printf (MSG_NOTE, "\n"); |
| } |
| |
| /* Start over. */ |
| gsi = region_begin; |
| } |
| } |
| |
| return any_vectorized; |
| } |
| |
| |
| /* Return true if the vector type of the boolean constant to be used |
| in STMT_VINFO is a boolean vector. */ |
| |
| static bool |
| vect_mask_constant_operand_p (stmt_vec_info stmt_vinfo) |
| { |
| enum tree_code code = gimple_expr_code (stmt_vinfo->stmt); |
| tree op, vectype; |
| enum vect_def_type dt; |
| |
| /* For comparisons and COND_EXPRs the type is chosen depending |
| on the other, non-constant comparison operand. */ |
| if (TREE_CODE_CLASS (code) == tcc_comparison) |
| { |
| gassign *stmt = as_a <gassign *> (stmt_vinfo->stmt); |
| op = gimple_assign_rhs1 (stmt); |
| |
| if (!vect_is_simple_use (op, stmt_vinfo->vinfo, &dt, &vectype)) |
| gcc_unreachable (); |
| |
| return !vectype || VECTOR_BOOLEAN_TYPE_P (vectype); |
| } |
| |
| if (code == COND_EXPR) |
| { |
| gassign *stmt = as_a <gassign *> (stmt_vinfo->stmt); |
| tree cond = gimple_assign_rhs1 (stmt); |
| |
| if (TREE_CODE (cond) == SSA_NAME) |
| op = cond; |
| else |
| op = TREE_OPERAND (cond, 0); |
| |
| if (!vect_is_simple_use (op, stmt_vinfo->vinfo, &dt, &vectype)) |
| gcc_unreachable (); |
| |
| return !vectype || VECTOR_BOOLEAN_TYPE_P (vectype); |
| } |
| |
| return VECTOR_BOOLEAN_TYPE_P (STMT_VINFO_VECTYPE (stmt_vinfo)); |
| } |
| |
| /* Build a variable-length vector in which the elements in ELTS are repeated |
| to fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in |
| RESULTS and add any new instructions to SEQ. |
| |
| The approach we use is: |
| |
| (1) Find a vector mode VM with integer elements of mode IM. |
| |
| (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of |
| ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs |
| from small vectors to IM. |
| |
| (3) Duplicate each ELTS'[I] into a vector of mode VM. |
| |
| (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the |
| correct byte contents. |
| |
| (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type. |
| |
| We try to find the largest IM for which this sequence works, in order |
| to cut down on the number of interleaves. */ |
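| |
| /* A hypothetical walk-through: for ELTS == {a, b, c, d} with 32-bit |
| elements and NVECTORS == 2, step (2) view-converts {a, b} and |
| {c, d} into two DImode scalars AB and CD, step (3) duplicates them |
| into VM vectors {AB, AB, ...} and {CD, CD, ...}, and one |
| interleaving step in (4) produces {AB, CD, AB, CD, ...}, i.e. |
| {a, b, c, d, a, b, c, d, ...} after the cast in step (5). */ |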
| |
| void |
| duplicate_and_interleave (gimple_seq *seq, tree vector_type, vec<tree> elts, |
| unsigned int nresults, vec<tree> &results) |
| { |
| unsigned int nelts = elts.length (); |
| tree element_type = TREE_TYPE (vector_type); |
| |
| /* (1) Find a vector mode VM with integer elements of mode IM. */ |
| unsigned int nvectors = 1; |
| tree new_vector_type; |
| tree permutes[2]; |
| if (!can_duplicate_and_interleave_p (nelts, TYPE_MODE (element_type), |
| &nvectors, &new_vector_type, |
| permutes)) |
| gcc_unreachable (); |
| |
| /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */ |
| unsigned int partial_nelts = nelts / nvectors; |
| tree partial_vector_type = build_vector_type (element_type, partial_nelts); |
| |
| tree_vector_builder partial_elts; |
| auto_vec<tree, 32> pieces (nvectors * 2); |
| pieces.quick_grow (nvectors * 2); |
| for (unsigned int i = 0; i < nvectors; ++i) |
| { |
| /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of |
| ELTS' has mode IM. */ |
| partial_elts.new_vector (partial_vector_type, partial_nelts, 1); |
| for (unsigned int j = 0; j < partial_nelts; ++j) |
| partial_elts.quick_push (elts[i * partial_nelts + j]); |
| tree t = gimple_build_vector (seq, &partial_elts); |
| t = gimple_build (seq, VIEW_CONVERT_EXPR, |
| TREE_TYPE (new_vector_type), t); |
| |
| /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */ |
| pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t); |
| } |
| |
| /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the |
| correct byte contents. |
| |
| We need to repeat the following operation log2(nvectors) times: |
| |
| out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute); |
| out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute); |
| |
| However, if each input repeats every N elements and the VF is |