blob: e54414f6befadcea95419bf9b84904b9cb4245b9 [file] [log] [blame]
/* SLP - Basic Block Vectorization
Copyright (C) 2007-2022 Free Software Foundation, Inc.
Contributed by Dorit Naishlos <dorit@il.ibm.com>
and Ira Rosen <irar@il.ibm.com>
This file is part of GCC.
GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.
GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
for more details.
You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3. If not see
<http://www.gnu.org/licenses/>. */
#include "config.h"
#define INCLUDE_ALGORITHM
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "gimple.h"
#include "tree-pass.h"
#include "ssa.h"
#include "optabs-tree.h"
#include "insn-config.h"
#include "recog.h" /* FIXME: for insn_data */
#include "fold-const.h"
#include "stor-layout.h"
#include "gimple-iterator.h"
#include "cfgloop.h"
#include "tree-vectorizer.h"
#include "langhooks.h"
#include "gimple-walk.h"
#include "dbgcnt.h"
#include "tree-vector-builder.h"
#include "vec-perm-indices.h"
#include "gimple-fold.h"
#include "internal-fn.h"
#include "dump-context.h"
#include "cfganal.h"
#include "tree-eh.h"
#include "tree-cfg.h"
#include "alloc-pool.h"
#include "sreal.h"
#include "predict.h"
static bool vect_transform_slp_perm_load_1 (vec_info *, slp_tree,
load_permutation_t &,
const vec<tree> &,
gimple_stmt_iterator *,
poly_uint64, bool, bool,
unsigned *,
unsigned * = nullptr,
bool = false);
static int vectorizable_slp_permutation_1 (vec_info *, gimple_stmt_iterator *,
slp_tree, lane_permutation_t &,
vec<slp_tree> &, bool);
static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
slp_tree, stmt_vector_for_cost *);
static void vect_print_slp_tree (dump_flags_t, dump_location_t, slp_tree);
static object_allocator<_slp_tree> *slp_tree_pool;
static slp_tree slp_first_node;
void
vect_slp_init (void)
{
slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");
}
void
vect_slp_fini (void)
{
while (slp_first_node)
delete slp_first_node;
delete slp_tree_pool;
slp_tree_pool = NULL;
}
void *
_slp_tree::operator new (size_t n)
{
gcc_assert (n == sizeof (_slp_tree));
return slp_tree_pool->allocate_raw ();
}
void
_slp_tree::operator delete (void *node, size_t n)
{
gcc_assert (n == sizeof (_slp_tree));
slp_tree_pool->remove_raw (node);
}
/* Initialize a SLP node. */
_slp_tree::_slp_tree ()
{
this->prev_node = NULL;
if (slp_first_node)
slp_first_node->prev_node = this;
this->next_node = slp_first_node;
slp_first_node = this;
SLP_TREE_SCALAR_STMTS (this) = vNULL;
SLP_TREE_SCALAR_OPS (this) = vNULL;
SLP_TREE_VEC_STMTS (this) = vNULL;
SLP_TREE_VEC_DEFS (this) = vNULL;
SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
SLP_TREE_CHILDREN (this) = vNULL;
SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
SLP_TREE_LANE_PERMUTATION (this) = vNULL;
SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
SLP_TREE_CODE (this) = ERROR_MARK;
SLP_TREE_VECTYPE (this) = NULL_TREE;
SLP_TREE_REPRESENTATIVE (this) = NULL;
SLP_TREE_REF_COUNT (this) = 1;
this->failed = NULL;
this->max_nunits = 1;
this->lanes = 0;
}
/* Tear down a SLP node. */
_slp_tree::~_slp_tree ()
{
if (this->prev_node)
this->prev_node->next_node = this->next_node;
else
slp_first_node = this->next_node;
if (this->next_node)
this->next_node->prev_node = this->prev_node;
SLP_TREE_CHILDREN (this).release ();
SLP_TREE_SCALAR_STMTS (this).release ();
SLP_TREE_SCALAR_OPS (this).release ();
SLP_TREE_VEC_STMTS (this).release ();
SLP_TREE_VEC_DEFS (this).release ();
SLP_TREE_LOAD_PERMUTATION (this).release ();
SLP_TREE_LANE_PERMUTATION (this).release ();
if (this->failed)
free (failed);
}
/* Recursively free the memory allocated for the SLP tree rooted at NODE. */
void
vect_free_slp_tree (slp_tree node)
{
int i;
slp_tree child;
if (--SLP_TREE_REF_COUNT (node) != 0)
return;
FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
if (child)
vect_free_slp_tree (child);
/* If the node defines any SLP only patterns then those patterns are no
longer valid and should be removed. */
stmt_vec_info rep_stmt_info = SLP_TREE_REPRESENTATIVE (node);
if (rep_stmt_info && STMT_VINFO_SLP_VECT_ONLY_PATTERN (rep_stmt_info))
{
stmt_vec_info stmt_info = vect_orig_stmt (rep_stmt_info);
STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
STMT_SLP_TYPE (stmt_info) = STMT_SLP_TYPE (rep_stmt_info);
}
delete node;
}
/* Return a location suitable for dumpings related to the SLP instance. */
dump_user_location_t
_slp_instance::location () const
{
if (!root_stmts.is_empty ())
return root_stmts[0]->stmt;
else
return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
}
/* Free the memory allocated for the SLP instance. */
void
vect_free_slp_instance (slp_instance instance)
{
vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
SLP_INSTANCE_LOADS (instance).release ();
SLP_INSTANCE_ROOT_STMTS (instance).release ();
instance->subgraph_entries.release ();
instance->cost_vec.release ();
free (instance);
}
/* Create an SLP node for SCALAR_STMTS. */
slp_tree
vect_create_new_slp_node (unsigned nops, tree_code code)
{
slp_tree node = new _slp_tree;
SLP_TREE_SCALAR_STMTS (node) = vNULL;
SLP_TREE_CHILDREN (node).create (nops);
SLP_TREE_DEF_TYPE (node) = vect_internal_def;
SLP_TREE_CODE (node) = code;
return node;
}
/* Create an SLP node for SCALAR_STMTS. */
static slp_tree
vect_create_new_slp_node (slp_tree node,
vec<stmt_vec_info> scalar_stmts, unsigned nops)
{
SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
SLP_TREE_CHILDREN (node).create (nops);
SLP_TREE_DEF_TYPE (node) = vect_internal_def;
SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
SLP_TREE_LANES (node) = scalar_stmts.length ();
return node;
}
/* Create an SLP node for SCALAR_STMTS. */
static slp_tree
vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
{
return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
}
/* Create an SLP node for OPS. */
static slp_tree
vect_create_new_slp_node (slp_tree node, vec<tree> ops)
{
SLP_TREE_SCALAR_OPS (node) = ops;
SLP_TREE_DEF_TYPE (node) = vect_external_def;
SLP_TREE_LANES (node) = ops.length ();
return node;
}
/* Create an SLP node for OPS. */
static slp_tree
vect_create_new_slp_node (vec<tree> ops)
{
return vect_create_new_slp_node (new _slp_tree, ops);
}
/* This structure is used in creation of an SLP tree. Each instance
corresponds to the same operand in a group of scalar stmts in an SLP
node. */
typedef struct _slp_oprnd_info
{
/* Def-stmts for the operands. */
vec<stmt_vec_info> def_stmts;
/* Operands. */
vec<tree> ops;
/* Information about the first statement, its vector def-type, type, the
operand itself in case it's constant, and an indication if it's a pattern
stmt. */
tree first_op_type;
enum vect_def_type first_dt;
bool any_pattern;
} *slp_oprnd_info;
/* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
operand. */
static vec<slp_oprnd_info>
vect_create_oprnd_info (int nops, int group_size)
{
int i;
slp_oprnd_info oprnd_info;
vec<slp_oprnd_info> oprnds_info;
oprnds_info.create (nops);
for (i = 0; i < nops; i++)
{
oprnd_info = XNEW (struct _slp_oprnd_info);
oprnd_info->def_stmts.create (group_size);
oprnd_info->ops.create (group_size);
oprnd_info->first_dt = vect_uninitialized_def;
oprnd_info->first_op_type = NULL_TREE;
oprnd_info->any_pattern = false;
oprnds_info.quick_push (oprnd_info);
}
return oprnds_info;
}
/* Free operands info. */
static void
vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
{
int i;
slp_oprnd_info oprnd_info;
FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
{
oprnd_info->def_stmts.release ();
oprnd_info->ops.release ();
XDELETE (oprnd_info);
}
oprnds_info.release ();
}
/* Return the execution frequency of NODE (so that a higher value indicates
a "more important" node when optimizing for speed). */
static sreal
vect_slp_node_weight (slp_tree node)
{
stmt_vec_info stmt_info = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (node));
basic_block bb = gimple_bb (stmt_info->stmt);
return bb->count.to_sreal_scale (ENTRY_BLOCK_PTR_FOR_FN (cfun)->count);
}
/* Return true if STMTS contains a pattern statement. */
static bool
vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
{
stmt_vec_info stmt_info;
unsigned int i;
FOR_EACH_VEC_ELT (stmts, i, stmt_info)
if (is_pattern_stmt_p (stmt_info))
return true;
return false;
}
/* Return true when all lanes in the external or constant NODE have
the same value. */
static bool
vect_slp_tree_uniform_p (slp_tree node)
{
gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
|| SLP_TREE_DEF_TYPE (node) == vect_external_def);
/* Pre-exsting vectors. */
if (SLP_TREE_SCALAR_OPS (node).is_empty ())
return false;
unsigned i;
tree op, first = NULL_TREE;
FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
if (!first)
first = op;
else if (!operand_equal_p (first, op, 0))
return false;
return true;
}
/* Find the place of the data-ref in STMT_INFO in the interleaving chain
that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part
of the chain. */
int
vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
stmt_vec_info first_stmt_info)
{
stmt_vec_info next_stmt_info = first_stmt_info;
int result = 0;
if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
return -1;
do
{
if (next_stmt_info == stmt_info)
return result;
next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
if (next_stmt_info)
result += DR_GROUP_GAP (next_stmt_info);
}
while (next_stmt_info);
return -1;
}
/* Check whether it is possible to load COUNT elements of type ELT_TYPE
using the method implemented by duplicate_and_interleave. Return true
if so, returning the number of intermediate vectors in *NVECTORS_OUT
(if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
(if nonnull). */
bool
can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
tree elt_type, unsigned int *nvectors_out,
tree *vector_type_out,
tree *permutes)
{
tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
return false;
machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
unsigned int nvectors = 1;
for (;;)
{
scalar_int_mode int_mode;
poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
{
/* Get the natural vector type for this SLP group size. */
tree int_type = build_nonstandard_integer_type
(GET_MODE_BITSIZE (int_mode), 1);
tree vector_type
= get_vectype_for_scalar_type (vinfo, int_type, count);
if (vector_type
&& VECTOR_MODE_P (TYPE_MODE (vector_type))
&& known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
GET_MODE_SIZE (base_vector_mode)))
{
/* Try fusing consecutive sequences of COUNT / NVECTORS elements
together into elements of type INT_TYPE and using the result
to build NVECTORS vectors. */
poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
vec_perm_builder sel1 (nelts, 2, 3);
vec_perm_builder sel2 (nelts, 2, 3);
poly_int64 half_nelts = exact_div (nelts, 2);
for (unsigned int i = 0; i < 3; ++i)
{
sel1.quick_push (i);
sel1.quick_push (i + nelts);
sel2.quick_push (half_nelts + i);
sel2.quick_push (half_nelts + i + nelts);
}
vec_perm_indices indices1 (sel1, 2, nelts);
vec_perm_indices indices2 (sel2, 2, nelts);
machine_mode vmode = TYPE_MODE (vector_type);
if (can_vec_perm_const_p (vmode, vmode, indices1)
&& can_vec_perm_const_p (vmode, vmode, indices2))
{
if (nvectors_out)
*nvectors_out = nvectors;
if (vector_type_out)
*vector_type_out = vector_type;
if (permutes)
{
permutes[0] = vect_gen_perm_mask_checked (vector_type,
indices1);
permutes[1] = vect_gen_perm_mask_checked (vector_type,
indices2);
}
return true;
}
}
}
if (!multiple_p (elt_bytes, 2, &elt_bytes))
return false;
nvectors *= 2;
}
}
/* Return true if DTA and DTB match. */
static bool
vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
{
return (dta == dtb
|| ((dta == vect_external_def || dta == vect_constant_def)
&& (dtb == vect_external_def || dtb == vect_constant_def)));
}
static const int cond_expr_maps[3][5] = {
{ 4, -1, -2, 1, 2 },
{ 4, -2, -1, 1, 2 },
{ 4, -1, -2, 2, 1 }
};
static const int arg1_map[] = { 1, 1 };
static const int arg2_map[] = { 1, 2 };
static const int arg1_arg4_map[] = { 2, 1, 4 };
static const int op1_op0_map[] = { 2, 1, 0 };
/* For most SLP statements, there is a one-to-one mapping between
gimple arguments and child nodes. If that is not true for STMT,
return an array that contains:
- the number of child nodes, followed by
- for each child node, the index of the argument associated with that node.
The special index -1 is the first operand of an embedded comparison and
the special index -2 is the second operand of an embedded comparison.
SWAP is as for vect_get_and_check_slp_defs. */
static const int *
vect_get_operand_map (const gimple *stmt, unsigned char swap = 0)
{
if (auto assign = dyn_cast<const gassign *> (stmt))
{
if (gimple_assign_rhs_code (assign) == COND_EXPR
&& COMPARISON_CLASS_P (gimple_assign_rhs1 (assign)))
return cond_expr_maps[swap];
if (TREE_CODE_CLASS (gimple_assign_rhs_code (assign)) == tcc_comparison
&& swap)
return op1_op0_map;
}
gcc_assert (!swap);
if (auto call = dyn_cast<const gcall *> (stmt))
{
if (gimple_call_internal_p (call))
switch (gimple_call_internal_fn (call))
{
case IFN_MASK_LOAD:
return arg2_map;
case IFN_GATHER_LOAD:
return arg1_map;
case IFN_MASK_GATHER_LOAD:
return arg1_arg4_map;
default:
break;
}
}
return nullptr;
}
/* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
they are of a valid type and that they match the defs of the first stmt of
the SLP group (stored in OPRNDS_INFO). This function tries to match stmts
by swapping operands of STMTS[STMT_NUM] when possible. Non-zero SWAP
indicates swap is required for cond_expr stmts. Specifically, SWAP
is 1 if STMT is cond and operands of comparison need to be swapped;
SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
If there was a fatal error return -1; if the error could be corrected by
swapping operands of father node of this one, return 1; if everything is
ok return 0. */
static int
vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
bool *skip_args,
vec<stmt_vec_info> stmts, unsigned stmt_num,
vec<slp_oprnd_info> *oprnds_info)
{
stmt_vec_info stmt_info = stmts[stmt_num];
tree oprnd;
unsigned int i, number_of_oprnds;
enum vect_def_type dt = vect_uninitialized_def;
slp_oprnd_info oprnd_info;
unsigned int commutative_op = -1U;
bool first = stmt_num == 0;
if (!is_a<gcall *> (stmt_info->stmt)
&& !is_a<gassign *> (stmt_info->stmt)
&& !is_a<gphi *> (stmt_info->stmt))
return -1;
number_of_oprnds = gimple_num_args (stmt_info->stmt);
const int *map = vect_get_operand_map (stmt_info->stmt, swap);
if (map)
number_of_oprnds = *map++;
if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
{
if (gimple_call_internal_p (stmt))
{
internal_fn ifn = gimple_call_internal_fn (stmt);
commutative_op = first_commutative_argument (ifn);
}
}
else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
{
if (commutative_tree_code (gimple_assign_rhs_code (stmt)))
commutative_op = 0;
}
bool swapped = (swap != 0);
bool backedge = false;
enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
for (i = 0; i < number_of_oprnds; i++)
{
int opno = map ? map[i] : int (i);
if (opno < 0)
oprnd = TREE_OPERAND (gimple_arg (stmt_info->stmt, 0), -1 - opno);
else
{
oprnd = gimple_arg (stmt_info->stmt, opno);
if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
backedge = dominated_by_p (CDI_DOMINATORS,
gimple_phi_arg_edge (stmt, opno)->src,
gimple_bb (stmt_info->stmt));
}
if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
oprnd = TREE_OPERAND (oprnd, 0);
oprnd_info = (*oprnds_info)[i];
stmt_vec_info def_stmt_info;
if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
"Build SLP failed: can't analyze def for %T\n",
oprnd);
return -1;
}
if (skip_args[i])
{
oprnd_info->def_stmts.quick_push (NULL);
oprnd_info->ops.quick_push (NULL_TREE);
oprnd_info->first_dt = vect_uninitialized_def;
continue;
}
oprnd_info->def_stmts.quick_push (def_stmt_info);
oprnd_info->ops.quick_push (oprnd);
if (def_stmt_info
&& is_pattern_stmt_p (def_stmt_info))
{
if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info))
!= def_stmt_info)
oprnd_info->any_pattern = true;
else
/* If we promote this to external use the original stmt def. */
oprnd_info->ops.last ()
= gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
}
/* If there's a extern def on a backedge make sure we can
code-generate at the region start.
??? This is another case that could be fixed by adjusting
how we split the function but at the moment we'd have conflicting
goals there. */
if (backedge
&& dts[i] == vect_external_def
&& is_a <bb_vec_info> (vinfo)
&& TREE_CODE (oprnd) == SSA_NAME
&& !SSA_NAME_IS_DEFAULT_DEF (oprnd)
&& !dominated_by_p (CDI_DOMINATORS,
as_a <bb_vec_info> (vinfo)->bbs[0],
gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
"Build SLP failed: extern def %T only defined "
"on backedge\n", oprnd);
return -1;
}
if (first)
{
tree type = TREE_TYPE (oprnd);
dt = dts[i];
if ((dt == vect_constant_def
|| dt == vect_external_def)
&& !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
&& (TREE_CODE (type) == BOOLEAN_TYPE
|| !can_duplicate_and_interleave_p (vinfo, stmts.length (),
type)))
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
"Build SLP failed: invalid type of def "
"for variable-length SLP %T\n", oprnd);
return -1;
}
/* For the swapping logic below force vect_reduction_def
for the reduction op in a SLP reduction group. */
if (!STMT_VINFO_DATA_REF (stmt_info)
&& REDUC_GROUP_FIRST_ELEMENT (stmt_info)
&& (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
&& def_stmt_info)
dts[i] = dt = vect_reduction_def;
/* Check the types of the definition. */
switch (dt)
{
case vect_external_def:
case vect_constant_def:
case vect_internal_def:
case vect_reduction_def:
case vect_induction_def:
case vect_nested_cycle:
case vect_first_order_recurrence:
break;
default:
/* FORNOW: Not supported. */
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
"Build SLP failed: illegal type of def %T\n",
oprnd);
return -1;
}
oprnd_info->first_dt = dt;
oprnd_info->first_op_type = type;
}
}
if (first)
return 0;
/* Now match the operand definition types to that of the first stmt. */
for (i = 0; i < number_of_oprnds;)
{
if (skip_args[i])
{
++i;
continue;
}
oprnd_info = (*oprnds_info)[i];
dt = dts[i];
stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
oprnd = oprnd_info->ops[stmt_num];
tree type = TREE_TYPE (oprnd);
if (!types_compatible_p (oprnd_info->first_op_type, type))
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
"Build SLP failed: different operand types\n");
return 1;
}
/* Not first stmt of the group, check that the def-stmt/s match
the def-stmt/s of the first stmt. Allow different definition
types for reduction chains: the first stmt must be a
vect_reduction_def (a phi node), and the rest
end in the reduction chain. */
if ((!vect_def_types_match (oprnd_info->first_dt, dt)
&& !(oprnd_info->first_dt == vect_reduction_def
&& !STMT_VINFO_DATA_REF (stmt_info)
&& REDUC_GROUP_FIRST_ELEMENT (stmt_info)
&& def_stmt_info
&& !STMT_VINFO_DATA_REF (def_stmt_info)
&& (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
== REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
|| (!STMT_VINFO_DATA_REF (stmt_info)
&& REDUC_GROUP_FIRST_ELEMENT (stmt_info)
&& ((!def_stmt_info
|| STMT_VINFO_DATA_REF (def_stmt_info)
|| (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
!= REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
!= (oprnd_info->first_dt != vect_reduction_def))))
{
/* Try swapping operands if we got a mismatch. For BB
vectorization only in case it will clearly improve things. */
if (i == commutative_op && !swapped
&& (!is_a <bb_vec_info> (vinfo)
|| (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
dts[i+1])
&& (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
|| vect_def_types_match
((*oprnds_info)[i+1]->first_dt, dts[i])))))
{
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
"trying swapped operands\n");
std::swap (dts[i], dts[i+1]);
std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
(*oprnds_info)[i+1]->def_stmts[stmt_num]);
std::swap ((*oprnds_info)[i]->ops[stmt_num],
(*oprnds_info)[i+1]->ops[stmt_num]);
swapped = true;
continue;
}
if (is_a <bb_vec_info> (vinfo)
&& !oprnd_info->any_pattern)
{
/* Now for commutative ops we should see whether we can
make the other operand matching. */
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
"treating operand as external\n");
oprnd_info->first_dt = dt = vect_external_def;
}
else
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
"Build SLP failed: different types\n");
return 1;
}
}
/* Make sure to demote the overall operand to external. */
if (dt == vect_external_def)
oprnd_info->first_dt = vect_external_def;
/* For a SLP reduction chain we want to duplicate the reduction to
each of the chain members. That gets us a sane SLP graph (still
the stmts are not 100% correct wrt the initial values). */
else if ((dt == vect_internal_def
|| dt == vect_reduction_def)
&& oprnd_info->first_dt == vect_reduction_def
&& !STMT_VINFO_DATA_REF (stmt_info)
&& REDUC_GROUP_FIRST_ELEMENT (stmt_info)
&& !STMT_VINFO_DATA_REF (def_stmt_info)
&& (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
== REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
{
oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
}
++i;
}
/* Swap operands. */
if (swapped)
{
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
"swapped operands to match def types in %G",
stmt_info->stmt);
}
return 0;
}
/* Return true if call statements CALL1 and CALL2 are similar enough
to be combined into the same SLP group. */
bool
compatible_calls_p (gcall *call1, gcall *call2)
{
unsigned int nargs = gimple_call_num_args (call1);
if (nargs != gimple_call_num_args (call2))
return false;
if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
return false;
if (gimple_call_internal_p (call1))
{
if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
TREE_TYPE (gimple_call_lhs (call2))))
return false;
for (unsigned int i = 0; i < nargs; ++i)
if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
TREE_TYPE (gimple_call_arg (call2, i))))
return false;
}
else
{
if (!operand_equal_p (gimple_call_fn (call1),
gimple_call_fn (call2), 0))
return false;
if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
return false;
}
/* Check that any unvectorized arguments are equal. */
if (const int *map = vect_get_operand_map (call1))
{
unsigned int nkept = *map++;
unsigned int mapi = 0;
for (unsigned int i = 0; i < nargs; ++i)
if (mapi < nkept && map[mapi] == int (i))
mapi += 1;
else if (!operand_equal_p (gimple_call_arg (call1, i),
gimple_call_arg (call2, i)))
return false;
}
return true;
}
/* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
caller's attempt to find the vector type in STMT_INFO with the narrowest
element type. Return true if VECTYPE is nonnull and if it is valid
for STMT_INFO. When returning true, update MAX_NUNITS to reflect the
number of units in VECTYPE. GROUP_SIZE and MAX_NUNITS are as for
vect_build_slp_tree. */
static bool
vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
unsigned int group_size,
tree vectype, poly_uint64 *max_nunits)
{
if (!vectype)
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
"Build SLP failed: unsupported data-type in %G\n",
stmt_info->stmt);
/* Fatal mismatch. */
return false;
}
/* If populating the vector type requires unrolling then fail
before adjusting *max_nunits for basic-block vectorization. */
if (is_a <bb_vec_info> (vinfo)
&& !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
"Build SLP failed: unrolling required "
"in basic block SLP\n");
/* Fatal mismatch. */
return false;
}
/* In case of multiple types we need to detect the smallest type. */
vect_update_max_nunits (max_nunits, vectype);
return true;
}
/* Verify if the scalar stmts STMTS are isomorphic, require data
permutation or are of unsupported types of operation. Return
true if they are, otherwise return false and indicate in *MATCHES
which stmts are not isomorphic to the first one. If MATCHES[0]
is false then this indicates the comparison could not be
carried out or the stmts will never be vectorized by SLP.
Note COND_EXPR is possibly isomorphic to another one after swapping its
operands. Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
the first stmt by swapping the two operands of comparison; set SWAP[i]
to 2 if stmt I is isormorphic to the first stmt by inverting the code
of comparison. Take A1 >= B1 ? X1 : Y1 as an exmple, it can be swapped
to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1. */
static bool
vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
vec<stmt_vec_info> stmts, unsigned int group_size,
poly_uint64 *max_nunits, bool *matches,
bool *two_operators, tree *node_vectype)
{
unsigned int i;
stmt_vec_info first_stmt_info = stmts[0];
code_helper first_stmt_code = ERROR_MARK;
code_helper alt_stmt_code = ERROR_MARK;
code_helper rhs_code = ERROR_MARK;
code_helper first_cond_code = ERROR_MARK;
tree lhs;
bool need_same_oprnds = false;
tree vectype = NULL_TREE, first_op1 = NULL_TREE;
stmt_vec_info first_load = NULL, prev_first_load = NULL;
bool first_stmt_load_p = false, load_p = false;
bool first_stmt_phi_p = false, phi_p = false;
bool maybe_soft_fail = false;
tree soft_fail_nunits_vectype = NULL_TREE;
/* For every stmt in NODE find its def stmt/s. */
stmt_vec_info stmt_info;
FOR_EACH_VEC_ELT (stmts, i, stmt_info)
{
gimple *stmt = stmt_info->stmt;
swap[i] = 0;
matches[i] = false;
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);
/* Fail to vectorize statements marked as unvectorizable, throw
or are volatile. */
if (!STMT_VINFO_VECTORIZABLE (stmt_info)
|| stmt_can_throw_internal (cfun, stmt)
|| gimple_has_volatile_ops (stmt))
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
"Build SLP failed: unvectorizable statement %G",
stmt);
/* ??? For BB vectorization we want to commutate operands in a way
to shuffle all unvectorizable defs into one operand and have
the other still vectorized. The following doesn't reliably
work for this though but it's the easiest we can do here. */
if (is_a <bb_vec_info> (vinfo) && i != 0)
continue;
/* Fatal mismatch. */
matches[0] = false;
return false;
}
lhs = gimple_get_lhs (stmt);
if (lhs == NULL_TREE)
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
"Build SLP failed: not GIMPLE_ASSIGN nor "
"GIMPLE_CALL %G", stmt);
if (is_a <bb_vec_info> (vinfo) && i != 0)
continue;
/* Fatal mismatch. */
matches[0] = false;
return false;
}
tree nunits_vectype;
if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
&nunits_vectype, group_size))
{
if (is_a <bb_vec_info> (vinfo) && i != 0)
continue;
/* Fatal mismatch. */
matches[0] = false;
return false;
}
/* Record nunits required but continue analysis, producing matches[]
as if nunits was not an issue. This allows splitting of groups
to happen. */
if (nunits_vectype
&& !vect_record_max_nunits (vinfo, stmt_info, group_size,
nunits_vectype, max_nunits))
{
gcc_assert (is_a <bb_vec_info> (vinfo));
maybe_soft_fail = true;
soft_fail_nunits_vectype = nunits_vectype;
}
gcc_assert (vectype);
gcall *call_stmt = dyn_cast <gcall *> (stmt);
if (call_stmt)
{
combined_fn cfn = gimple_call_combined_fn (call_stmt);
if (cfn != CFN_LAST)
rhs_code = cfn;
else
rhs_code = CALL_EXPR;
if (cfn == CFN_MASK_LOAD
|| cfn == CFN_GATHER_LOAD
|| cfn == CFN_MASK_GATHER_LOAD)
load_p = true;
else if ((internal_fn_p (cfn)
&& !vectorizable_internal_fn_p (as_internal_fn (cfn)))
|| gimple_call_tail_p (call_stmt)
|| gimple_call_noreturn_p (call_stmt)
|| gimple_call_chain (call_stmt))
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
"Build SLP failed: unsupported call type %G",
(gimple *) call_stmt);
if (is_a <bb_vec_info> (vinfo) && i != 0)
continue;
/* Fatal mismatch. */
matches[0] = false;
return false;
}
}
else if (gimple_code (stmt) == GIMPLE_PHI)
{
rhs_code = ERROR_MARK;
phi_p = true;
}
else
{
rhs_code = gimple_assign_rhs_code (stmt);
load_p = gimple_vuse (stmt);
}
/* Check the operation. */
if (i == 0)
{
*node_vectype = vectype;
first_stmt_code = rhs_code;
first_stmt_load_p = load_p;
first_stmt_phi_p = phi_p;
/* Shift arguments should be equal in all the packed stmts for a
vector shift with scalar shift operand. */
if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
|| rhs_code == LROTATE_EXPR
|| rhs_code == RROTATE_EXPR)
{
/* First see if we have a vector/vector shift. */
if (!directly_supported_p (rhs_code, vectype, optab_vector))
{
/* No vector/vector shift, try for a vector/scalar shift. */
if (!directly_supported_p (rhs_code, vectype, optab_scalar))
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
"Build SLP failed: "
"op not supported by target.\n");
if (is_a <bb_vec_info> (vinfo) && i != 0)
continue;
/* Fatal mismatch. */
matches[0] = false;
return false;
}
need_same_oprnds = true;
first_op1 = gimple_assign_rhs2 (stmt);
}
}
else if (rhs_code == WIDEN_LSHIFT_EXPR)
{
need_same_oprnds = true;
first_op1 = gimple_assign_rhs2 (stmt);
}
else if (!load_p
&& rhs_code == BIT_FIELD_REF)
{
tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
if (!is_a <bb_vec_info> (vinfo)
|| TREE_CODE (vec) != SSA_NAME
/* When the element types are not compatible we pun the
source to the target vectype which requires equal size. */
|| ((!VECTOR_TYPE_P (TREE_TYPE (vec))
|| !types_compatible_p (TREE_TYPE (vectype),
TREE_TYPE (TREE_TYPE (vec))))
&& !operand_equal_p (TYPE_SIZE (vectype),
TYPE_SIZE (TREE_TYPE (vec)))))
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
"Build SLP failed: "
"BIT_FIELD_REF not supported\n");
/* Fatal mismatch. */
matches[0] = false;
return false;
}
}
else if (rhs_code == CFN_DIV_POW2)
{
need_same_oprnds = true;
first_op1 = gimple_call_arg (call_stmt, 1);
}
}
else
{
if (first_stmt_code != rhs_code
&& alt_stmt_code == ERROR_MARK)
alt_stmt_code = rhs_code;
if ((first_stmt_code != rhs_code
&& (first_stmt_code != IMAGPART_EXPR
|| rhs_code != REALPART_EXPR)
&& (first_stmt_code != REALPART_EXPR
|| rhs_code != IMAGPART_EXPR)
/* Handle mismatches in plus/minus by computing both
and merging the results. */
&& !((first_stmt_code == PLUS_EXPR
|| first_stmt_code == MINUS_EXPR)
&& (alt_stmt_code == PLUS_EXPR
|| alt_stmt_code == MINUS_EXPR)
&& rhs_code == alt_stmt_code)
&& !(first_stmt_code.is_tree_code ()
&& rhs_code.is_tree_code ()
&& (TREE_CODE_CLASS (tree_code (first_stmt_code))
== tcc_comparison)
&& (swap_tree_comparison (tree_code (first_stmt_code))
== tree_code (rhs_code)))
&& !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
&& (first_stmt_code == ARRAY_REF
|| first_stmt_code == BIT_FIELD_REF
|| first_stmt_code == INDIRECT_REF
|| first_stmt_code == COMPONENT_REF
|| first_stmt_code == MEM_REF)
&& (rhs_code == ARRAY_REF
|| rhs_code == BIT_FIELD_REF
|| rhs_code == INDIRECT_REF
|| rhs_code == COMPONENT_REF
|| rhs_code == MEM_REF)))
|| first_stmt_load_p != load_p
|| first_stmt_phi_p != phi_p)
{
if (dump_enabled_p ())
{
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
"Build SLP failed: different operation "
"in stmt %G", stmt);
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
"original stmt %G", first_stmt_info->stmt);
}
/* Mismatch. */
continue;
}
if (!load_p
&& first_stmt_code == BIT_FIELD_REF
&& (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
!= TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
"Build SLP failed: different BIT_FIELD_REF "
"arguments in %G", stmt);
/* Mismatch. */
continue;
}
if (call_stmt && first_stmt_code != CFN_MASK_LOAD)
{
if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
call_stmt))
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
"Build SLP failed: different calls in %G",
stmt);
/* Mismatch. */
continue;
}
}
if ((phi_p || gimple_could_trap_p (stmt_info->stmt))
&& (gimple_bb (first_stmt_info->stmt)
!= gimple_bb (stmt_info->stmt)))
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
"Build SLP failed: different BB for PHI "
"or possibly trapping operation in %G", stmt);
/* Mismatch. */
continue;
}
if (need_same_oprnds)
{
tree other_op1 = gimple_arg (stmt, 1);
if (!operand_equal_p (first_op1, other_op1, 0))
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
"Build SLP failed: different shift "
"arguments in %G", stmt);
/* Mismatch. */
continue;
}
}
if (!types_compatible_p (vectype, *node_vectype))
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
"Build SLP failed: different vector type "
"in %G", stmt);
/* Mismatch. */
continue;
}
}
/* Grouped store or load. */
if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
{
if (REFERENCE_CLASS_P (lhs))
{
/* Store. */
;
}
else
{
/* Load. */
first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
if (prev_first_load)
{
/* Check that there are no loads from different interleaving
chains in the same node. */
if (prev_first_load != first_load)
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION,
vect_location,
"Build SLP failed: different "
"interleaving chains in one node %G",
stmt);
/* Mismatch. */
continue;
}
}
else
prev_first_load = first_load;
}
} /* Grouped access. */
else
{
if (load_p
&& rhs_code != CFN_GATHER_LOAD
&& rhs_code != CFN_MASK_GATHER_LOAD)
{
/* Not grouped load. */
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
"Build SLP failed: not grouped load %G", stmt);
/* FORNOW: Not grouped loads are not supported. */
if (is_a <bb_vec_info> (vinfo) && i != 0)
continue;
/* Fatal mismatch. */
matches[0] = false;
return false;
}
/* Not memory operation. */
if (!phi_p
&& rhs_code.is_tree_code ()
&& TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_binary
&& TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_unary
&& TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_expression
&& TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_comparison
&& rhs_code != VIEW_CONVERT_EXPR
&& rhs_code != CALL_EXPR
&& rhs_code != BIT_FIELD_REF)
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
"Build SLP failed: operation unsupported %G",
stmt);
if (is_a <bb_vec_info> (vinfo) && i != 0)
continue;
/* Fatal mismatch. */
matches[0] = false;
return false;
}
if (rhs_code == COND_EXPR)
{
tree cond_expr = gimple_assign_rhs1 (stmt);
enum tree_code cond_code = TREE_CODE (cond_expr);
enum tree_code swap_code = ERROR_MARK;
enum tree_code invert_code = ERROR_MARK;
if (i == 0)
first_cond_code = TREE_CODE (cond_expr);
else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
{
bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
swap_code = swap_tree_comparison (cond_code);
invert_code = invert_tree_comparison (cond_code, honor_nans);
}
if (first_cond_code == cond_code)
;
/* Isomorphic can be achieved by swapping. */
else if (first_cond_code == swap_code)
swap[i] = 1;
/* Isomorphic can be achieved by inverting. */
else if (first_cond_code == invert_code)
swap[i] = 2;
else
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
"Build SLP failed: different"
" operation %G", stmt);
/* Mismatch. */
continue;
}
}
if (rhs_code.is_tree_code ()
&& TREE_CODE_CLASS ((tree_code)rhs_code) == tcc_comparison
&& (swap_tree_comparison ((tree_code)first_stmt_code)
== (tree_code)rhs_code))
swap[i] = 1;
}
matches[i] = true;
}
for (i = 0; i < group_size; ++i)
if (!matches[i])
return false;
/* If we allowed a two-operation SLP node verify the target can cope
with the permute we are going to use. */
if (alt_stmt_code != ERROR_MARK
&& (!alt_stmt_code.is_tree_code ()
|| (TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_reference
&& TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_comparison)))
{
*two_operators = true;
}
if (maybe_soft_fail)
{
unsigned HOST_WIDE_INT const_nunits;
if (!TYPE_VECTOR_SUBPARTS
(soft_fail_nunits_vectype).is_constant (&const_nunits)
|| const_nunits > group_size)
matches[0] = false;
else
{
/* With constant vector elements simulate a mismatch at the
point we need to split. */
unsigned tail = group_size & (const_nunits - 1);
memset (&matches[group_size - tail], 0, sizeof (bool) * tail);
}
return false;
}
return true;
}
/* Traits for the hash_set to record failed SLP builds for a stmt set.
Note we never remove apart from at destruction time so we do not
need a special value for deleted that differs from empty. */
struct bst_traits
{
typedef vec <stmt_vec_info> value_type;
typedef vec <stmt_vec_info> compare_type;
static inline hashval_t hash (value_type);
static inline bool equal (value_type existing, value_type candidate);
static inline bool is_empty (value_type x) { return !x.exists (); }
static inline bool is_deleted (value_type x) { return !x.exists (); }
static const bool empty_zero_p = true;
static inline void mark_empty (value_type &x) { x.release (); }
static inline void mark_deleted (value_type &x) { x.release (); }
static inline void remove (value_type &x) { x.release (); }
};
inline hashval_t
bst_traits::hash (value_type x)
{
inchash::hash h;
for (unsigned i = 0; i < x.length (); ++i)
h.add_int (gimple_uid (x[i]->stmt));
return h.end ();
}
inline bool
bst_traits::equal (value_type existing, value_type candidate)
{
if (existing.length () != candidate.length ())
return false;
for (unsigned i = 0; i < existing.length (); ++i)
if (existing[i] != candidate[i])
return false;
return true;
}
/* ??? This was std::pair<std::pair<tree_code, vect_def_type>, tree>
but then vec::insert does memmove and that's not compatible with
std::pair. */
struct chain_op_t
{
chain_op_t (tree_code code_, vect_def_type dt_, tree op_)
: code (code_), dt (dt_), op (op_) {}
tree_code code;
vect_def_type dt;
tree op;
};
/* Comparator for sorting associatable chains. */
static int
dt_sort_cmp (const void *op1_, const void *op2_, void *)
{
auto *op1 = (const chain_op_t *) op1_;
auto *op2 = (const chain_op_t *) op2_;
if (op1->dt != op2->dt)
return (int)op1->dt - (int)op2->dt;
return (int)op1->code - (int)op2->code;
}
/* Linearize the associatable expression chain at START with the
associatable operation CODE (where PLUS_EXPR also allows MINUS_EXPR),
filling CHAIN with the result and using WORKLIST as intermediate storage.
CODE_STMT and ALT_CODE_STMT are filled with the first stmt using CODE
or MINUS_EXPR. *CHAIN_STMTS if not NULL is filled with all computation
stmts, starting with START. */
static void
vect_slp_linearize_chain (vec_info *vinfo,
vec<std::pair<tree_code, gimple *> > &worklist,
vec<chain_op_t> &chain,
enum tree_code code, gimple *start,
gimple *&code_stmt, gimple *&alt_code_stmt,
vec<gimple *> *chain_stmts)
{
/* For each lane linearize the addition/subtraction (or other
uniform associatable operation) expression tree. */
worklist.safe_push (std::make_pair (code, start));
while (!worklist.is_empty ())
{
auto entry = worklist.pop ();
gassign *stmt = as_a <gassign *> (entry.second);
enum tree_code in_code = entry.first;
enum tree_code this_code = gimple_assign_rhs_code (stmt);
/* Pick some stmts suitable for SLP_TREE_REPRESENTATIVE. */
if (!code_stmt
&& gimple_assign_rhs_code (stmt) == code)
code_stmt = stmt;
else if (!alt_code_stmt
&& gimple_assign_rhs_code (stmt) == MINUS_EXPR)
alt_code_stmt = stmt;
if (chain_stmts)
chain_stmts->safe_push (stmt);
for (unsigned opnum = 1; opnum <= 2; ++opnum)
{
tree op = gimple_op (stmt, opnum);
vect_def_type dt;
stmt_vec_info def_stmt_info;
bool res = vect_is_simple_use (op, vinfo, &dt, &def_stmt_info);
gcc_assert (res);
if (dt == vect_internal_def
&& is_pattern_stmt_p (def_stmt_info))
op = gimple_get_lhs (def_stmt_info->stmt);
gimple *use_stmt;
use_operand_p use_p;
if (dt == vect_internal_def
&& single_imm_use (op, &use_p, &use_stmt)
&& is_gimple_assign (def_stmt_info->stmt)
&& (gimple_assign_rhs_code (def_stmt_info->stmt) == code
|| (code == PLUS_EXPR
&& (gimple_assign_rhs_code (def_stmt_info->stmt)
== MINUS_EXPR))))
{
tree_code op_def_code = this_code;
if (op_def_code == MINUS_EXPR && opnum == 1)
op_def_code = PLUS_EXPR;
if (in_code == MINUS_EXPR)
op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
worklist.safe_push (std::make_pair (op_def_code,
def_stmt_info->stmt));
}
else
{
tree_code op_def_code = this_code;
if (op_def_code == MINUS_EXPR && opnum == 1)
op_def_code = PLUS_EXPR;
if (in_code == MINUS_EXPR)
op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
chain.safe_push (chain_op_t (op_def_code, dt, op));
}
}
}
}
typedef hash_map <vec <stmt_vec_info>, slp_tree,
simple_hashmap_traits <bst_traits, slp_tree> >
scalar_stmts_to_slp_tree_map_t;
static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
vec<stmt_vec_info> stmts, unsigned int group_size,
poly_uint64 *max_nunits,
bool *matches, unsigned *limit, unsigned *tree_size,
scalar_stmts_to_slp_tree_map_t *bst_map);
static slp_tree
vect_build_slp_tree (vec_info *vinfo,
vec<stmt_vec_info> stmts, unsigned int group_size,
poly_uint64 *max_nunits,
bool *matches, unsigned *limit, unsigned *tree_size,
scalar_stmts_to_slp_tree_map_t *bst_map)
{
if (slp_tree *leader = bst_map->get (stmts))
{
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
!(*leader)->failed ? "" : "failed ",
(void *) *leader);
if (!(*leader)->failed)
{
SLP_TREE_REF_COUNT (*leader)++;
vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
stmts.release ();
return *leader;
}
memcpy (matches, (*leader)->failed, sizeof (bool) * group_size);
return NULL;
}
/* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
so we can pick up backedge destinations during discovery. */
slp_tree res = new _slp_tree;
SLP_TREE_DEF_TYPE (res) = vect_internal_def;
SLP_TREE_SCALAR_STMTS (res) = stmts;
bst_map->put (stmts.copy (), res);
if (*limit == 0)
{
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
"SLP discovery limit exceeded\n");
/* Mark the node invalid so we can detect those when still in use
as backedge destinations. */
SLP_TREE_SCALAR_STMTS (res) = vNULL;
SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
res->failed = XNEWVEC (bool, group_size);
memset (res->failed, 0, sizeof (bool) * group_size);
memset (matches, 0, sizeof (bool) * group_size);
return NULL;
}
--*limit;
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
"starting SLP discovery for node %p\n", (void *) res);
poly_uint64 this_max_nunits = 1;
slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
&this_max_nunits,
matches, limit, tree_size, bst_map);
if (!res_)
{
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
"SLP discovery for node %p failed\n", (void *) res);
/* Mark the node invalid so we can detect those when still in use
as backedge destinations. */
SLP_TREE_SCALAR_STMTS (res) = vNULL;
SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
res->failed = XNEWVEC (bool, group_size);
if (flag_checking)
{
unsigned i;
for (i = 0; i < group_size; ++i)
if (!matches[i])
break;
gcc_assert (i < group_size);
}
memcpy (res->failed, matches, sizeof (bool) * group_size);
}
else
{
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
"SLP discovery for node %p succeeded\n",
(void *) res);
gcc_assert (res_ == res);
res->max_nunits = this_max_nunits;
vect_update_max_nunits (max_nunits, this_max_nunits);
/* Keep a reference for the bst_map use. */
SLP_TREE_REF_COUNT (res)++;
}
return res_;
}
/* Helper for building an associated SLP node chain. */
static void
vect_slp_build_two_operator_nodes (slp_tree perm, tree vectype,
slp_tree op0, slp_tree op1,
stmt_vec_info oper1, stmt_vec_info oper2,
vec<std::pair<unsigned, unsigned> > lperm)
{
unsigned group_size = SLP_TREE_LANES (op1);
slp_tree child1 = new _slp_tree;
SLP_TREE_DEF_TYPE (child1) = vect_internal_def;
SLP_TREE_VECTYPE (child1) = vectype;
SLP_TREE_LANES (child1) = group_size;
SLP_TREE_CHILDREN (child1).create (2);
SLP_TREE_CHILDREN (child1).quick_push (op0);
SLP_TREE_CHILDREN (child1).quick_push (op1);
SLP_TREE_REPRESENTATIVE (child1) = oper1;
slp_tree child2 = new _slp_tree;
SLP_TREE_DEF_TYPE (child2) = vect_internal_def;
SLP_TREE_VECTYPE (child2) = vectype;
SLP_TREE_LANES (child2) = group_size;
SLP_TREE_CHILDREN (child2).create (2);
SLP_TREE_CHILDREN (child2).quick_push (op0);
SLP_TREE_REF_COUNT (op0)++;
SLP_TREE_CHILDREN (child2).quick_push (op1);
SLP_TREE_REF_COUNT (op1)++;
SLP_TREE_REPRESENTATIVE (child2) = oper2;
SLP_TREE_DEF_TYPE (perm) = vect_internal_def;
SLP_TREE_CODE (perm) = VEC_PERM_EXPR;
SLP_TREE_VECTYPE (perm) = vectype;
SLP_TREE_LANES (perm) = group_size;
/* ??? We should set this NULL but that's not expected. */
SLP_TREE_REPRESENTATIVE (perm) = oper1;
SLP_TREE_LANE_PERMUTATION (perm) = lperm;
SLP_TREE_CHILDREN (perm).quick_push (child1);
SLP_TREE_CHILDREN (perm).quick_push (child2);
}
/* Recursively build an SLP tree starting from NODE.
Fail (and return a value not equal to zero) if def-stmts are not
isomorphic, require data permutation or are of unsupported types of
operation. Otherwise, return 0.
The value returned is the depth in the SLP tree where a mismatch
was found. */
static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
vec<stmt_vec_info> stmts, unsigned int group_size,
poly_uint64 *max_nunits,
bool *matches, unsigned *limit, unsigned *tree_size,
scalar_stmts_to_slp_tree_map_t *bst_map)
{
unsigned nops, i, this_tree_size = 0;
poly_uint64 this_max_nunits = *max_nunits;
matches[0] = false;
stmt_vec_info stmt_info = stmts[0];
if (!is_a<gcall *> (stmt_info->stmt)
&& !is_a<gassign *> (stmt_info->stmt)
&& !is_a<gphi *> (stmt_info->stmt))
return NULL;
nops = gimple_num_args (stmt_info->stmt);
if (const int *map = vect_get_operand_map (stmt_info->stmt))
nops = map[0];
/* If the SLP node is a PHI (induction or reduction), terminate
the recursion. */
bool *skip_args = XALLOCAVEC (bool, nops);
memset (skip_args, 0, sizeof (bool) * nops);
if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
{
tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
group_size);
if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
max_nunits))
return NULL;
vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
if (def_type == vect_induction_def)
{
/* Induction PHIs are not cycles but walk the initial
value. Only for inner loops through, for outer loops
we need to pick up the value from the actual PHIs
to more easily support peeling and epilogue vectorization. */
class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
if (!nested_in_vect_loop_p (loop, stmt_info))
skip_args[loop_preheader_edge (loop)->dest_idx] = true;
else
loop = loop->inner;
skip_args[loop_latch_edge (loop)->dest_idx] = true;
}
else if (def_type == vect_reduction_def
|| def_type == vect_double_reduction_def
|| def_type == vect_nested_cycle
|| def_type == vect_first_order_recurrence)
{
/* Else def types have to match. */
stmt_vec_info other_info;
bool all_same = true;
FOR_EACH_VEC_ELT (stmts, i, other_info)
{
if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
return NULL;
if (other_info != stmt_info)
all_same = false;
}
class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
/* Reduction initial values are not explicitely represented. */
if (def_type != vect_first_order_recurrence
&& !nested_in_vect_loop_p (loop, stmt_info))
skip_args[loop_preheader_edge (loop)->dest_idx] = true;
/* Reduction chain backedge defs are filled manually.
??? Need a better way to identify a SLP reduction chain PHI.
Or a better overall way to SLP match those. */
if (all_same && def_type == vect_reduction_def)
skip_args[loop_latch_edge (loop)->dest_idx] = true;
}
else if (def_type != vect_internal_def)
return NULL;
}
bool two_operators = false;
unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
tree vectype = NULL_TREE;
if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
&this_max_nunits, matches, &two_operators,
&vectype))
return NULL;
/* If the SLP node is a load, terminate the recursion unless masked. */
if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
&& DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
{
if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD)
|| gimple_call_internal_p (stmt, IFN_GATHER_LOAD)
|| gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD));
else
{
*max_nunits = this_max_nunits;
(*tree_size)++;
node = vect_create_new_slp_node (node, stmts, 0);
SLP_TREE_VECTYPE (node) = vectype;
/* And compute the load permutation. Whether it is actually
a permutation depends on the unrolling factor which is
decided later. */
vec<unsigned> load_permutation;
int j;
stmt_vec_info load_info;
load_permutation.create (group_size);
stmt_vec_info first_stmt_info
= DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
{
int load_place = vect_get_place_in_interleaving_chain
(load_info, first_stmt_info);
gcc_assert (load_place != -1);
load_permutation.safe_push (load_place);
}
SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
return node;
}
}
else if (gimple_assign_single_p (stmt_info->stmt)
&& !gimple_vuse (stmt_info->stmt)
&& gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
{
/* vect_build_slp_tree_2 determined all BIT_FIELD_REFs reference
the same SSA name vector of a compatible type to vectype. */
vec<std::pair<unsigned, unsigned> > lperm = vNULL;
tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
stmt_vec_info estmt_info;
FOR_EACH_VEC_ELT (stmts, i, estmt_info)
{
gassign *estmt = as_a <gassign *> (estmt_info->stmt);
tree bfref = gimple_assign_rhs1 (estmt);
HOST_WIDE_INT lane;
if (!known_eq (bit_field_size (bfref),
tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
|| !constant_multiple_p (bit_field_offset (bfref),
bit_field_size (bfref), &lane))
{
lperm.release ();
matches[0] = false;
return NULL;
}
lperm.safe_push (std::make_pair (0, (unsigned)lane));
}
slp_tree vnode = vect_create_new_slp_node (vNULL);
if (operand_equal_p (TYPE_SIZE (vectype), TYPE_SIZE (TREE_TYPE (vec))))
/* ??? We record vectype here but we hide eventually necessary
punning and instead rely on code generation to materialize
VIEW_CONVERT_EXPRs as necessary. We instead should make
this explicit somehow. */
SLP_TREE_VECTYPE (vnode) = vectype;
else
{
/* For different size but compatible elements we can still
use VEC_PERM_EXPR without punning. */
gcc_assert (VECTOR_TYPE_P (TREE_TYPE (vec))
&& types_compatible_p (TREE_TYPE (vectype),
TREE_TYPE (TREE_TYPE (vec))));
SLP_TREE_VECTYPE (vnode) = TREE_TYPE (vec);
}
auto nunits = TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (vnode));
unsigned HOST_WIDE_INT const_nunits;
if (nunits.is_constant (&const_nunits))
SLP_TREE_LANES (vnode) = const_nunits;
SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
/* We are always building a permutation node even if it is an identity
permute to shield the rest of the vectorizer from the odd node
representing an actual vector without any scalar ops.
??? We could hide it completely with making the permute node
external? */
node = vect_create_new_slp_node (node, stmts, 1);
SLP_TREE_CODE (node) = VEC_PERM_EXPR;
SLP_TREE_LANE_PERMUTATION (node) = lperm;
SLP_TREE_VECTYPE (node) = vectype;
SLP_TREE_CHILDREN (node).quick_push (vnode);
return node;
}
/* When discovery reaches an associatable operation see whether we can
improve that to match up lanes in a way superior to the operand
swapping code which at most looks at two defs.
??? For BB vectorization we cannot do the brute-force search
for matching as we can succeed by means of builds from scalars
and have no good way to "cost" one build against another. */
else if (is_a <loop_vec_info> (vinfo)
/* ??? We don't handle !vect_internal_def defs below. */
&& STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
&& is_gimple_assign (stmt_info->stmt)
&& (associative_tree_code (gimple_assign_rhs_code (stmt_info->stmt))
|| gimple_assign_rhs_code (stmt_info->stmt) == MINUS_EXPR)
&& ((FLOAT_TYPE_P (vectype) && flag_associative_math)
|| (INTEGRAL_TYPE_P (TREE_TYPE (vectype))
&& TYPE_OVERFLOW_WRAPS (TREE_TYPE (vectype)))))
{
/* See if we have a chain of (mixed) adds or subtracts or other
associatable ops. */
enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
if (code == MINUS_EXPR)
code = PLUS_EXPR;
stmt_vec_info other_op_stmt_info = NULL;
stmt_vec_info op_stmt_info = NULL;
unsigned chain_len = 0;
auto_vec<chain_op_t> chain;
auto_vec<std::pair<tree_code, gimple *> > worklist;
auto_vec<vec<chain_op_t> > chains (group_size);
auto_vec<slp_tree, 4> children;
bool hard_fail = true;
for (unsigned lane = 0; lane < group_size; ++lane)
{
/* For each lane linearize the addition/subtraction (or other
uniform associatable operation) expression tree. */
gimple *op_stmt = NULL, *other_op_stmt = NULL;
vect_slp_linearize_chain (vinfo, worklist, chain, code,
stmts[lane]->stmt, op_stmt, other_op_stmt,
NULL);
if (!op_stmt_info && op_stmt)
op_stmt_info = vinfo->lookup_stmt (op_stmt);
if (!other_op_stmt_info && other_op_stmt)
other_op_stmt_info = vinfo->lookup_stmt (other_op_stmt);
if (chain.length () == 2)
{
/* In a chain of just two elements resort to the regular
operand swapping scheme. If we run into a length
mismatch still hard-FAIL. */
if (chain_len == 0)
hard_fail = false;
else
{
matches[lane] = false;
/* ??? We might want to process the other lanes, but
make sure to not give false matching hints to the
caller for lanes we did not process. */
if (lane != group_size - 1)
matches[0] = false;
}
break;
}
else if (chain_len == 0)
chain_len = chain.length ();
else if (chain.length () != chain_len)
{
/* ??? Here we could slip in magic to compensate with
neutral operands. */
matches[lane] = false;
if (lane != group_size - 1)
matches[0] = false;
break;
}
chains.quick_push (chain.copy ());
chain.truncate (0);
}
if (chains.length () == group_size)
{
/* We cannot yet use SLP_TREE_CODE to communicate the operation. */
if (!op_stmt_info)
{
hard_fail = false;
goto out;
}
/* Now we have a set of chains with the same length. */
/* 1. pre-sort according to def_type and operation. */
for (unsigned lane = 0; lane < group_size; ++lane)
chains[lane].stablesort (dt_sort_cmp, vinfo);
if (dump_enabled_p ())
{
dump_printf_loc (MSG_NOTE, vect_location,
"pre-sorted chains of %s\n",
get_tree_code_name (code));
for (unsigned lane = 0; lane < group_size; ++lane)
{
for (unsigned opnum = 0; opnum < chain_len; ++opnum)
dump_printf (MSG_NOTE, "%s %T ",
get_tree_code_name (chains[lane][opnum].code),
chains[lane][opnum].op);
dump_printf (MSG_NOTE, "\n");
}
}
/* 2. try to build children nodes, associating as necessary. */
for (unsigned n = 0; n < chain_len; ++n)
{
vect_def_type dt = chains[0][n].dt;
unsigned lane;
for (lane = 0; lane < group_size; ++lane)
if (chains[lane][n].dt != dt)
{
if (dt == vect_constant_def
&& chains[lane][n].dt == vect_external_def)
dt = vect_external_def;
else if (dt == vect_external_def
&& chains[lane][n].dt == vect_constant_def)
;
else
break;
}
if (lane != group_size)
{
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
"giving up on chain due to mismatched "
"def types\n");
matches[lane] = false;
if (lane != group_size - 1)
matches[0] = false;
goto out;
}
if (dt == vect_constant_def
|| dt == vect_external_def)
{
/* Check whether we can build the invariant. If we can't
we never will be able to. */
tree type = TREE_TYPE (chains[0][n].op);
if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
&& (TREE_CODE (type) == BOOLEAN_TYPE
|| !can_duplicate_and_interleave_p (vinfo, group_size,
type)))
{
matches[0] = false;
goto out;
}
vec<tree> ops;
ops.create (group_size);
for (lane = 0; lane < group_size; ++lane)
ops.quick_push (chains[lane][n].op);
slp_tree child = vect_create_new_slp_node (ops);
SLP_TREE_DEF_TYPE (child) = dt;
children.safe_push (child);
}
else if (dt != vect_internal_def)
{
/* Not sure, we might need sth special.
gcc.dg/vect/pr96854.c,
gfortran.dg/vect/fast-math-pr37021.f90
and gfortran.dg/vect/pr61171.f trigger. */
/* Soft-fail for now. */
hard_fail = false;
goto out;
}
else
{
vec<stmt_vec_info> op_stmts;
op_stmts.create (group_size);
slp_tree child = NULL;
/* Brute-force our way. We have to consider a lane
failing after fixing an earlier fail up in the
SLP discovery recursion. So track the current
permute per lane. */
unsigned *perms = XALLOCAVEC (unsigned, group_size);
memset (perms, 0, sizeof (unsigned) * group_size);
do
{
op_stmts.truncate (0);
for (lane = 0; lane < group_size; ++lane)
op_stmts.quick_push
(vinfo->lookup_def (chains[lane][n].op));
child = vect_build_slp_tree (vinfo, op_stmts,
group_size, &this_max_nunits,
matches, limit,
&this_tree_size, bst_map);
/* ??? We're likely getting too many fatal mismatches
here so maybe we want to ignore them (but then we
have no idea which lanes fatally mismatched). */
if (child || !matches[0])
break;
/* Swap another lane we have not yet matched up into
lanes that did not match. If we run out of
permute possibilities for a lane terminate the
search. */
bool term = false;
for (lane = 1; lane < group_size; ++lane)
if (!matches[lane])
{
if (n + perms[lane] + 1 == chain_len)
{
term = true;
break;
}
std::swap (chains[lane][n],
chains[lane][n + perms[lane] + 1]);
perms[lane]++;
}
if (term)
break;
}
while (1);
if (!child)
{
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
"failed to match up op %d\n", n);
op_stmts.release ();
if (lane != group_size - 1)
matches[0] = false;
else
matches[lane] = false;
goto out;
}
if (dump_enabled_p ())
{
dump_printf_loc (MSG_NOTE, vect_location,
"matched up op %d to\n", n);
vect_print_slp_tree (MSG_NOTE, vect_location, child);
}
children.safe_push (child);
}
}
/* 3. build SLP nodes to combine the chain. */
for (unsigned lane = 0; lane < group_size; ++lane)
if (chains[lane][0].code != code)
{
/* See if there's any alternate all-PLUS entry. */
unsigned n;
for (n = 1; n < chain_len; ++n)
{
for (lane = 0; lane < group_size; ++lane)
if (chains[lane][n].code != code)
break;
if (lane == group_size)
break;
}
if (n != chain_len)
{
/* Swap that in at first position. */
std::swap (children[0], children[n]);
for (lane = 0; lane < group_size; ++lane)
std::swap (chains[lane][0], chains[lane][n]);
}
else
{
/* ??? When this triggers and we end up with two
vect_constant/external_def up-front things break (ICE)
spectacularly finding an insertion place for the
all-constant op. We should have a fully
vect_internal_def operand though(?) so we can swap
that into first place and then prepend the all-zero
constant. */
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
"inserting constant zero to compensate "
"for (partially) negated first "
"operand\n");
chain_len++;
for (lane = 0; lane < group_size; ++lane)
chains[lane].safe_insert
(0, chain_op_t (code, vect_constant_def, NULL_TREE));
vec<tree> zero_ops;
zero_ops.create (group_size);
zero_ops.quick_push (build_zero_cst (TREE_TYPE (vectype)));
for (lane = 1; lane < group_size; ++lane)
zero_ops.quick_push (zero_ops[0]);
slp_tree zero = vect_create_new_slp_node (zero_ops);
SLP_TREE_DEF_TYPE (zero) = vect_constant_def;
children.safe_insert (0, zero);
}
break;
}
for (unsigned i = 1; i < children.length (); ++i)
{
slp_tree op0 = children[i - 1];
slp_tree op1 = children[i];
bool this_two_op = false;
for (unsigned lane = 0; lane < group_size; ++lane)
if (chains[lane][i].code != chains[0][i].code)
{
this_two_op = true;
break;
}
slp_tree child;
if (i == children.length () - 1)
child = vect_create_new_slp_node (node, stmts, 2);
else
child = vect_create_new_slp_node (2, ERROR_MARK);
if (this_two_op)
{
vec<std::pair<unsigned, unsigned> > lperm;
lperm.create (group_size);
for (unsigned lane = 0; lane < group_size; ++lane)
lperm.quick_push (std::make_pair
(chains[lane][i].code != chains[0][i].code, lane));
vect_slp_build_two_operator_nodes (child, vectype, op0, op1,
(chains[0][i].code == code
? op_stmt_info
: other_op_stmt_info),
(chains[0][i].code == code
? other_op_stmt_info
: op_stmt_info),
lperm);
}
else
{
SLP_TREE_DEF_TYPE (child) = vect_internal_def;
SLP_TREE_VECTYPE (child) = vectype;
SLP_TREE_LANES (child) = group_size;
SLP_TREE_CHILDREN (child).quick_push (op0);
SLP_TREE_CHILDREN (child).quick_push (op1);
SLP_TREE_REPRESENTATIVE (child)
= (chains[0][i].code == code
? op_stmt_info : other_op_stmt_info);
}
children[i] = child;
}
*tree_size += this_tree_size + 1;
*max_nunits = this_max_nunits;
while (!chains.is_empty ())
chains.pop ().release ();
return node;
}
out:
while (!children.is_empty ())
vect_free_slp_tree (children.pop ());
while (!chains.is_empty ())
chains.pop ().release ();
/* Hard-fail, otherwise we might run into quadratic processing of the
chains starting one stmt into the chain again. */
if (hard_fail)
return NULL;
/* Fall thru to normal processing. */
}
/* Get at the operands, verifying they are compatible. */
vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
slp_oprnd_info oprnd_info;
FOR_EACH_VEC_ELT (stmts, i, stmt_info)
{
int res = vect_get_and_check_slp_defs (vinfo, swap[i], skip_args,
stmts, i, &oprnds_info);
if (res != 0)
matches[(res == -1) ? 0 : i] = false;
if (!matches[0])
break;
}
for (i = 0; i < group_size; ++i)
if (!matches[i])
{
vect_free_oprnd_info (oprnds_info);
return NULL;
}
swap = NULL;
auto_vec<slp_tree, 4> children;
stmt_info = stmts[0];
/* Create SLP_TREE nodes for the definition node/s. */
FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
{
slp_tree child;
unsigned int j;
/* We're skipping certain operands from processing, for example
outer loop reduction initial defs. */
if (skip_args[i])
{
children.safe_push (NULL);
continue;
}
if (oprnd_info->first_dt == vect_uninitialized_def)
{
/* COND_EXPR have one too many eventually if the condition
is a SSA name. */
gcc_assert (i == 3 && nops == 4);
continue;
}
if (is_a <bb_vec_info> (vinfo)
&& oprnd_info->first_dt == vect_internal_def
&& !oprnd_info->any_pattern)
{
/* For BB vectorization, if all defs are the same do not
bother to continue the build along the single-lane
graph but use a splat of the scalar value. */
stmt_vec_info first_def = oprnd_info->def_stmts[0];
for (j = 1; j < group_size; ++j)
if (oprnd_info->def_stmts[j] != first_def)
break;
if (j == group_size
/* But avoid doing this for loads where we may be
able to CSE things, unless the stmt is not
vectorizable. */
&& (!STMT_VINFO_VECTORIZABLE (first_def)
|| !gimple_vuse (first_def->stmt)))
{
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
"Using a splat of the uniform operand %G",
first_def->stmt);
oprnd_info->first_dt = vect_external_def;
}
}
if (oprnd_info->first_dt == vect_external_def
|| oprnd_info->first_dt == vect_constant_def)
{
slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
oprnd_info->ops = vNULL;
children.safe_push (invnode);
continue;
}
if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
group_size, &this_max_nunits,
matches, limit,
&this_tree_size, bst_map)) != NULL)
{
oprnd_info->def_stmts = vNULL;
children.safe_push (child);
continue;
}
/* If the SLP build for operand zero failed and operand zero
and one can be commutated try that for the scalar stmts
that failed the match. */
if (i == 0
/* A first scalar stmt mismatch signals a fatal mismatch. */
&& matches[0]
/* ??? For COND_EXPRs we can swap the comparison operands
as well as the arms under some constraints. */
&& nops == 2
&& oprnds_info[1]->first_dt == vect_internal_def
&& is_gimple_assign (stmt_info->stmt)
/* Swapping operands for reductions breaks assumptions later on. */
&& STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
&& STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
{
/* See whether we can swap the matching or the non-matching
stmt operands. */
bool swap_not_matching = true;
do
{
for (j = 0; j < group_size; ++j)
{
if (matches[j] != !swap_not_matching)
continue;
stmt_vec_info stmt_info = stmts[j];
/* Verify if we can swap operands of this stmt. */
gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
if (!stmt
|| !commutative_tree_code (gimple_assign_rhs_code (stmt)))
{
if (!swap_not_matching)
goto fail;
swap_not_matching = false;
break;
}
}
}
while (j != group_size);
/* Swap mismatched definition stmts. */
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
"Re-trying with swapped operands of stmts ");
for (j = 0; j < group_size; ++j)
if (matches[j] == !swap_not_matching)
{
std::swap (oprnds_info[0]->def_stmts[j],
oprnds_info[1]->def_stmts[j]);
std::swap (oprnds_info[0]->ops[j],
oprnds_info[1]->ops[j]);
if (dump_enabled_p ())
dump_printf (MSG_NOTE, "%d ", j);
}
if (dump_enabled_p ())
dump_printf (MSG_NOTE, "\n");
/* After swapping some operands we lost track whether an
operand has any pattern defs so be conservative here. */
if (oprnds_info[0]->any_pattern || oprnds_info[1]->any_pattern)
oprnds_info[0]->any_pattern = oprnds_info[1]->any_pattern = true;
/* And try again with scratch 'matches' ... */
bool *tem = XALLOCAVEC (bool, group_size);
if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
group_size, &this_max_nunits,
tem, limit,
&this_tree_size, bst_map)) != NULL)
{
oprnd_info->def_stmts = vNULL;
children.safe_push (child);
continue;
}
}
fail:
/* If the SLP build failed and we analyze a basic-block
simply treat nodes we fail to build as externally defined
(and thus build vectors from the scalar defs).
The cost model will reject outright expensive cases.
??? This doesn't treat cases where permutation ultimatively
fails (or we don't try permutation below). Ideally we'd
even compute a permutation that will end up with the maximum
SLP tree size... */
if (is_a <bb_vec_info> (vinfo)
/* ??? Rejecting patterns this way doesn't work. We'd have to
do extra work to cancel the pattern so the uses see the
scalar version. */
&& !is_pattern_stmt_p (stmt_info)
&& !oprnd_info->any_pattern)
{
/* But if there's a leading vector sized set of matching stmts
fail here so we can split the group. This matches the condition
vect_analyze_slp_instance uses. */
/* ??? We might want to split here and combine the results to support
multiple vector sizes better. */
for (j = 0; j < group_size; ++j)
if (!matches[j])
break;
if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype)))
{
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
"Building vector operands from scalars\n");
this_tree_size++;
child = vect_create_new_slp_node (oprnd_info->ops);
children.safe_push (child);
oprnd_info->ops = vNULL;
continue;
}
}
gcc_assert (child == NULL);
FOR_EACH_VEC_ELT (children, j, child)
if (child)
vect_free_slp_tree (child);
vect_free_oprnd_info (oprnds_info);
return NULL;
}
vect_free_oprnd_info (oprnds_info);
/* If we have all children of a child built up from uniform scalars
or does more than one possibly expensive vector construction then
just throw that away, causing it built up from scalars.
The exception is the SLP node for the vector store. */
if (is_a <bb_vec_info> (vinfo)
&& !STMT_VINFO_GROUPED_ACCESS (stmt_info)
/* ??? Rejecting patterns this way doesn't work. We'd have to
do extra work to cancel the pattern so the uses see the
scalar version. */
&& !is_pattern_stmt_p (stmt_info))
{
slp_tree child;
unsigned j;
bool all_uniform_p = true;
unsigned n_vector_builds = 0;
FOR_EACH_VEC_ELT (children, j, child)
{
if (!child)
;
else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
all_uniform_p = false;
else if (!vect_slp_tree_uniform_p (child))
{
all_uniform_p = false;
if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
n_vector_builds++;
}
}
if (all_uniform_p
|| n_vector_builds > 1
|| (n_vector_builds == children.length ()
&& is_a <gphi *> (stmt_info->stmt)))
{
/* Roll back. */
matches[0] = false;
FOR_EACH_VEC_ELT (children, j, child)
if (child)
vect_free_slp_tree (child);
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
"Building parent vector operands from "
"scalars instead\n");
return NULL;
}
}
*tree_size += this_tree_size + 1;
*max_nunits = this_max_nunits;
if (two_operators)
{
/* ??? We'd likely want to either cache in bst_map sth like
{ a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
the true { a+b, a+b, a+b, a+b } ... but there we don't have
explicit stmts to put in so the keying on 'stmts' doesn't
work (but we have the same issue with nodes that use 'ops'). */
slp_tree one = new _slp_tree;
slp_tree two = new _slp_tree;
SLP_TREE_DEF_TYPE (one) = vect_internal_def;
SLP_TREE_DEF_TYPE (two) = vect_internal_def;
SLP_TREE_VECTYPE (one) = vectype;
SLP_TREE_VECTYPE (two) = vectype;
SLP_TREE_CHILDREN (one).safe_splice (children);
SLP_TREE_CHILDREN (two).safe_splice (children);
slp_tree child;
FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
SLP_TREE_REF_COUNT (child)++;
/* Here we record the original defs since this
node represents the final lane configuration. */
node = vect_create_new_slp_node (node, stmts, 2);
SLP_TREE_VECTYPE (node) = vectype;
SLP_TREE_CODE (node) = VEC_PERM_EXPR;
SLP_TREE_CHILDREN (node).quick_push (one);
SLP_TREE_CHILDREN (node).quick_push (two);
gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
enum tree_code code0 = gimple_assign_rhs_code (stmt);
enum tree_code ocode = ERROR_MARK;
stmt_vec_info ostmt_info;
unsigned j = 0;
FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
{
gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
if (gimple_assign_rhs_code (ostmt) != code0)
{
SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
ocode = gimple_assign_rhs_code (ostmt);
j = i;
}
else
SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
}
SLP_TREE_CODE (one) = code0;
SLP_TREE_CODE (two) = ocode;
SLP_TREE_LANES (one) = stmts.length ();
SLP_TREE_LANES (two) = stmts.length ();
SLP_TREE_REPRESENTATIVE (one) = stmts[0];
SLP_TREE_REPRESENTATIVE (two) = stmts[j];
return node;
}
node = vect_create_new_slp_node (node, stmts, nops);
SLP_TREE_VECTYPE (node) = vectype;
SLP_TREE_CHILDREN (node).splice (children);
return node;
}
/* Dump a single SLP tree NODE. */
static void
vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
slp_tree node)
{
unsigned i, j;
slp_tree child;
stmt_vec_info stmt_info;
tree op;
dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
dump_user_location_t user_loc = loc.get_user_location ();
dump_printf_loc (metadata, user_loc,
"node%s %p (max_nunits=" HOST_WIDE_INT_PRINT_UNSIGNED
", refcnt=%u)",
SLP_TREE_DEF_TYPE (node) == vect_external_def
? " (external)"
: (SLP_TREE_DEF_TYPE (node) == vect_constant_def
? " (constant)"
: ""), (void *) node,
estimated_poly_value (node->max_nunits),
SLP_TREE_REF_COUNT (node));
if (SLP_TREE_VECTYPE (node))
dump_printf (metadata, " %T", SLP_TREE_VECTYPE (node));
dump_printf (metadata, "\n");
if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
{
if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n");
else
dump_printf_loc (metadata, user_loc, "op template: %G",
SLP_TREE_REPRESENTATIVE (node)->stmt);
}
if (SLP_TREE_SCALAR_STMTS (node).exists ())
FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
else
{
dump_printf_loc (metadata, user_loc, "\t{ ");
FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
dump_printf (metadata, "%T%s ", op,
i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
dump_printf (metadata, "}\n");
}
if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
{
dump_printf_loc (metadata, user_loc, "\tload permutation {");
FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
dump_printf (dump_kind, " %u", j);
dump_printf (dump_kind, " }\n");
}
if (SLP_TREE_LANE_PERMUTATION (node).exists ())
{
dump_printf_loc (metadata, user_loc, "\tlane permutation {");
for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
dump_printf (dump_kind, " %u[%u]",
SLP_TREE_LANE_PERMUTATION (node)[i].first,
SLP_TREE_LANE_PERMUTATION (node)[i].second);
dump_printf (dump_kind, " }\n");
}
if (SLP_TREE_CHILDREN (node).is_empty ())
return;
dump_printf_loc (metadata, user_loc, "\tchildren");
FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
dump_printf (dump_kind, " %p", (void *)child);
dump_printf (dump_kind, "\n");
}
DEBUG_FUNCTION void
debug (slp_tree node)
{
debug_dump_context ctx;
vect_print_slp_tree (MSG_NOTE,
dump_location_t::from_location_t (UNKNOWN_LOCATION),
node);
}
/* Recursive helper for the dot producer below. */
static void
dot_slp_tree (FILE *f, slp_tree node, hash_set<slp_tree> &visited)
{
if (visited.add (node))
return;
fprintf (f, "\"%p\" [label=\"", (void *)node);
vect_print_slp_tree (MSG_NOTE,
dump_location_t::from_location_t (UNKNOWN_LOCATION),
node);
fprintf (f, "\"];\n");
for (slp_tree child : SLP_TREE_CHILDREN (node))
fprintf (f, "\"%p\" -> \"%p\";", (void *)node, (void *)child);
for (slp_tree child : SLP_TREE_CHILDREN (node))
if (child)
dot_slp_tree (f, child, visited);
}
DEBUG_FUNCTION void
dot_slp_tree (const char *fname, slp_tree node)
{
FILE *f = fopen (fname, "w");
fprintf (f, "digraph {\n");
fflush (f);
{
debug_dump_context ctx (f);
hash_set<slp_tree> visited;
dot_slp_tree (f, node, visited);
}
fflush (f);
fprintf (f, "}\n");
fclose (f);
}
/* Dump a slp tree NODE using flags specified in DUMP_KIND. */
static void
vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
slp_tree node, hash_set<slp_tree> &visited)
{
unsigned i;
slp_tree child;
if (visited.add (node))
return;
vect_print_slp_tree (dump_kind, loc, node);
FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
if (child)
vect_print_slp_graph (dump_kind, loc, child, visited);
}
static void
vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
slp_tree entry)
{
hash_set<slp_tree> visited;
vect_print_slp_graph (dump_kind, loc, entry, visited);
}
/* Mark the tree rooted at NODE with PURE_SLP. */
static void
vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
{
int i;
stmt_vec_info stmt_info;
slp_tree child;
if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
return;
if (visited.add (node))
return;
FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
STMT_SLP_TYPE (stmt_info) = pure_slp;
FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
if (child)
vect_mark_slp_stmts (child, visited);
}
static void
vect_mark_slp_stmts (slp_tree node)
{
hash_set<slp_tree> visited;
vect_mark_slp_stmts (node, visited);
}
/* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
static void
vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
{
int i;
stmt_vec_info stmt_info;
slp_tree child;
if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
return;
if (visited.add (node))
return;
FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
{
gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
|| STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
}
FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
if (child)
vect_mark_slp_stmts_relevant (child, visited);
}
static void
vect_mark_slp_stmts_relevant (slp_tree node)
{
hash_set<slp_tree> visited;
vect_mark_slp_stmts_relevant (node, visited);
}
/* Gather loads in the SLP graph NODE and populate the INST loads array. */
static void
vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
hash_set<slp_tree> &visited)
{
if (!node || visited.add (node))
return;
if (SLP_TREE_CHILDREN (node).length () == 0)
{
if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
return;
stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
&& DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
loads.safe_push (node);
}
else
{
unsigned i;
slp_tree child;
FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
vect_gather_slp_loads (loads, child, visited);
}
}
/* Find the last store in SLP INSTANCE. */
stmt_vec_info
vect_find_last_scalar_stmt_in_slp (slp_tree node)
{
stmt_vec_info last = NULL;
stmt_vec_info stmt_vinfo;
for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
{
stmt_vinfo = vect_orig_stmt (stmt_vinfo);
last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
}
return last;
}
/* Find the first stmt in NODE. */
stmt_vec_info
vect_find_first_scalar_stmt_in_slp (slp_tree node)
{
stmt_vec_info first = NULL;
stmt_vec_info stmt_vinfo;
for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
{
stmt_vinfo = vect_orig_stmt (stmt_vinfo);
if (!first
|| get_later_stmt (stmt_vinfo, first) == first)
first = stmt_vinfo;
}
return first;
}
/* Splits a group of stores, currently beginning at FIRST_VINFO, into
two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
(also containing the first GROUP1_SIZE stmts, since stores are
consecutive), the second containing the remainder.
Return the first stmt in the second group. */
static stmt_vec_info
vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
{
gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
gcc_assert (group1_size > 0);
int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
gcc_assert (group2_size > 0);
DR_GROUP_SIZE (first_vinfo) = group1_size;
stmt_vec_info stmt_info = first_vinfo;
for (unsigned i = group1_size; i > 1; i--)
{
stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
}
/* STMT is now the last element of the first group. */
stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
DR_GROUP_SIZE (group2) = group2_size;
for (stmt_info = group2; stmt_info;
stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
{
DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
}
/* For the second group, the DR_GROUP_GAP is that before the original group,
plus skipping over the first vector. */
DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
/* DR_GROUP_GAP of the first group now has to skip over the second group too. */
DR_GROUP_GAP (first_vinfo) += group2_size;
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
group1_size, group2_size);
return group2;
}
/* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
statements and a vector of NUNITS elements. */
static poly_uint64
calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
{
return exact_div (common_multiple (nunits, group_size), group_size);
}
/* Helper that checks to see if a node is a load node. */
static inline bool
vect_is_slp_load_node (slp_tree root)
{
return SLP_TREE_DEF_TYPE (root) == vect_internal_def
&& STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root))
&& DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root)));
}
/* Helper function of optimize_load_redistribution that performs the operation
recursively. */
static slp_tree
optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
vec_info *vinfo, unsigned int group_size,
hash_map<slp_tree, slp_tree> *load_map,
slp_tree root)
{
if (slp_tree *leader = load_map->get (root))
return *leader;
slp_tree node;
unsigned i;
/* For now, we don't know anything about externals so do not do anything. */
if (!root || SLP_TREE_DEF_TYPE (root) != vect_internal_def)
return NULL;
else if (SLP_TREE_CODE (root) == VEC_PERM_EXPR)
{
/* First convert this node into a load node and add it to the leaves
list and flatten the permute from a lane to a load one. If it's
unneeded it will be elided later. */
vec<stmt_vec_info> stmts;
stmts.create (SLP_TREE_LANES (root));
lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
for (unsigned j = 0; j < lane_perm.length (); j++)
{
std::pair<unsigned, unsigned> perm = lane_perm[j];
node = SLP_TREE_CHILDREN (root)[perm.first];
if (!vect_is_slp_load_node (node)
|| SLP_TREE_CHILDREN (node).exists ())
{
stmts.release ();
goto next;
}
stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
}
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
"converting stmts on permute node %p\n",
(void *) root);
bool *matches = XALLOCAVEC (bool, group_size);
poly_uint64 max_nunits = 1;
unsigned tree_size = 0, limit = 1;
node = vect_build_slp_tree (vinfo, stmts, group_size, &max_nunits,
matches, &limit, &tree_size, bst_map);
if (!node)
stmts.release ();
load_map->put (root, node);
return node;
}
next:
load_map->put (root, NULL);
FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
{
slp_tree value
= optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
node);
if (value)
{
SLP_TREE_REF_COUNT (value)++;
SLP_TREE_CHILDREN (root)[i] = value;
/* ??? We know the original leafs of the replaced nodes will
be referenced by bst_map, only the permutes created by
pattern matching are not. */
if (SLP_TREE_REF_COUNT (node) == 1)
load_map->remove (node);
vect_free_slp_tree (node);
}
}
return NULL;
}
/* Temporary workaround for loads not being CSEd during SLP build. This
function will traverse the SLP tree rooted in ROOT for INSTANCE and find
VEC_PERM nodes that blend vectors from multiple nodes that all read from the
same DR such that the final operation is equal to a permuted load. Such
NODES are then directly converted into LOADS themselves. The nodes are
CSEd using BST_MAP. */
static void
optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
vec_info *vinfo, unsigned int group_size,
hash_map<slp_tree, slp_tree> *load_map,
slp_tree root)
{
slp_tree node;
unsigned i;
FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
{
slp_tree value
= optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
node);
if (value)
{
SLP_TREE_REF_COUNT (value)++;
SLP_TREE_CHILDREN (root)[i] = value;
/* ??? We know the original leafs of the replaced nodes will
be referenced by bst_map, only the permutes created by
pattern matching are not. */
if (SLP_TREE_REF_COUNT (node) == 1)
load_map->remove (node);
vect_free_slp_tree (node);
}
}
}
/* Helper function of vect_match_slp_patterns.
Attempts to match patterns against the slp tree rooted in REF_NODE using
VINFO. Patterns are matched in post-order traversal.
If matching is successful the value in REF_NODE is updated and returned, if
not then it is returned unchanged. */
static bool
vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo,
slp_tree_to_load_perm_map_t *perm_cache,
slp_compat_nodes_map_t *compat_cache,
hash_set<slp_tree> *visited)
{
unsigned i;
slp_tree node = *ref_node;
bool found_p = false;
if (!node || visited->add (node))
return false;
slp_tree child;
FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
found_p |= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node)[i],
vinfo, perm_cache, compat_cache,
visited);
for (unsigned x = 0; x < num__slp_patterns; x++)
{
vect_pattern *pattern
= slp_patterns[x] (perm_cache, compat_cache, ref_node);
if (pattern)
{
pattern->build (vinfo);
delete pattern;
found_p = true;