| /* Find single-entry, single-exit regions for OpenACC. |
| Copyright (C) 2014-2017 Free Software Foundation, Inc. |
| |
| This file is part of GCC. |
| |
| GCC is free software; you can redistribute it and/or modify it |
| under the terms of the GNU General Public License as published |
| by the Free Software Foundation; either version 3, or (at your |
| option) any later version. |
| |
| GCC is distributed in the hope that it will be useful, but WITHOUT |
| ANY WARRANTY; without even the implied warranty of MERCHANTABILITY |
| or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public |
| License for more details. |
| |
| You should have received a copy of the GNU General Public License |
| along with GCC; see the file COPYING3. If not see |
| <http://www.gnu.org/licenses/>. */ |
| |
| #include "config.h" |
| #include "system.h" |
| #include "coretypes.h" |
| #include "backend.h" |
| #include "rtl.h" |
| #include "tree.h" |
| #include "gimple.h" |
| #include "tree-pass.h" |
| #include "ssa.h" |
| #include "cgraph.h" |
| #include "pretty-print.h" |
| #include "fold-const.h" |
| #include "gimplify.h" |
| #include "gimple-iterator.h" |
| #include "gimple-walk.h" |
| #include "tree-inline.h" |
| #include "langhooks.h" |
| #include "omp-general.h" |
| #include "omp-low.h" |
| #include "omp-grid.h" |
| #include "gimple-pretty-print.h" |
| #include "cfghooks.h" |
| #include "insn-config.h" |
| #include "recog.h" |
| #include "internal-fn.h" |
| #include "bitmap.h" |
| #include "tree-nested.h" |
| #include "stor-layout.h" |
| #include "tree-ssa-threadupdate.h" |
| #include "tree-into-ssa.h" |
| #include "splay-tree.h" |
| #include "target.h" |
| #include "cfgloop.h" |
| #include "tree-cfg.h" |
| #include "omp-offload.h" |
| #include "attribs.h" |
| #include "omp-sese.h" |
| |
| /* Loop structure of the function. The entire function is described as |
| a NULL loop. */ |
| |
| struct parallel_g |
| { |
| /* Parent parallel. */ |
| parallel_g *parent; |
| |
| /* Next sibling parallel. */ |
| parallel_g *next; |
| |
| /* First child parallel. */ |
| parallel_g *inner; |
| |
| /* Partitioning mask of the parallel. */ |
| unsigned mask; |
| |
| /* Partitioning used within inner parallels. */ |
| unsigned inner_mask; |
| |
| /* Location of parallel forked and join. The forked is the first |
| block in the parallel and the join is the first block after of |
| the partition. */ |
| basic_block forked_block; |
| basic_block join_block; |
| |
| gimple *forked_stmt; |
| gimple *join_stmt; |
| |
| gimple *fork_stmt; |
| gimple *joining_stmt; |
| |
| /* Basic blocks in this parallel, but not in child parallels. The |
| FORKED and JOINING blocks are in the partition. The FORK and JOIN |
| blocks are not. */ |
| auto_vec<basic_block> blocks; |
| |
| tree record_type; |
| tree sender_decl; |
| tree receiver_decl; |
| |
| public: |
| parallel_g (parallel_g *parent, unsigned mode); |
| ~parallel_g (); |
| }; |
| |
| /* Constructor links the new parallel into it's parent's chain of |
| children. */ |
| |
| parallel_g::parallel_g (parallel_g *parent_, unsigned mask_) |
| :parent (parent_), next (0), inner (0), mask (mask_), inner_mask (0) |
| { |
| forked_block = join_block = 0; |
| forked_stmt = join_stmt = NULL; |
| fork_stmt = joining_stmt = NULL; |
| |
| record_type = NULL_TREE; |
| sender_decl = NULL_TREE; |
| receiver_decl = NULL_TREE; |
| |
| if (parent) |
| { |
| next = parent->inner; |
| parent->inner = this; |
| } |
| } |
| |
| parallel_g::~parallel_g () |
| { |
| delete inner; |
| delete next; |
| } |
| |
| static bool |
| local_var_based_p (tree decl) |
| { |
| switch (TREE_CODE (decl)) |
| { |
| case VAR_DECL: |
| return !is_global_var (decl); |
| |
| case COMPONENT_REF: |
| case BIT_FIELD_REF: |
| case ARRAY_REF: |
| return local_var_based_p (TREE_OPERAND (decl, 0)); |
| |
| default: |
| return false; |
| } |
| } |
| |
| /* Map of basic blocks to gimple stmts. */ |
| typedef hash_map<basic_block, gimple *> bb_stmt_map_t; |
| |
| /* Calls to OpenACC routines are made by all workers/wavefronts/warps, since |
| the routine likely contains partitioned loops (else will do its own |
| neutering and variable propagation). Return TRUE if a function call CALL |
| should be made in (worker) single mode instead, rather than redundant |
| mode. */ |
| |
| static bool |
| omp_sese_active_worker_call (gcall *call) |
| { |
| #define GOMP_DIM_SEQ GOMP_DIM_MAX |
| tree fndecl = gimple_call_fndecl (call); |
| |
| if (!fndecl) |
| return true; |
| |
| tree attrs = oacc_get_fn_attrib (fndecl); |
| |
| if (!attrs) |
| return true; |
| |
| int level = oacc_fn_attrib_level (attrs); |
| |
| /* Neither regular functions nor "seq" routines should be run by all threads |
| in worker-single mode. */ |
| return level == -1 || level == GOMP_DIM_SEQ; |
| #undef GOMP_DIM_SEQ |
| } |
| |
| /* Split basic blocks such that each forked and join unspecs are at |
| the start of their basic blocks. Thus afterwards each block will |
| have a single partitioning mode. We also do the same for return |
| insns, as they are executed by every thread. Return the |
| partitioning mode of the function as a whole. Populate MAP with |
| head and tail blocks. We also clear the BB visited flag, which is |
| used when finding partitions. */ |
| |
| static void |
| omp_sese_split_blocks (bb_stmt_map_t *map) |
| { |
| auto_vec<gimple *> worklist; |
| basic_block block; |
| |
| /* Locate all the reorg instructions of interest. */ |
| FOR_ALL_BB_FN (block, cfun) |
| { |
| /* Clear visited flag, for use by parallel locator */ |
| block->flags &= ~BB_VISITED; |
| |
| for (gimple_stmt_iterator gsi = gsi_start_bb (block); |
| !gsi_end_p (gsi); |
| gsi_next (&gsi)) |
| { |
| gimple *stmt = gsi_stmt (gsi); |
| |
| if (gimple_call_internal_p (stmt, IFN_UNIQUE)) |
| { |
| enum ifn_unique_kind k = ((enum ifn_unique_kind) |
| TREE_INT_CST_LOW (gimple_call_arg (stmt, 0))); |
| |
| if (k == IFN_UNIQUE_OACC_JOIN) |
| worklist.safe_push (stmt); |
| else if (k == IFN_UNIQUE_OACC_FORK) |
| { |
| gcc_assert (gsi_one_before_end_p (gsi)); |
| basic_block forked_block = single_succ (block); |
| gimple_stmt_iterator gsi2 = gsi_start_bb (forked_block); |
| |
| /* We push a NOP as a placeholder for the "forked" stmt. |
| This is then recognized in omp_sese_find_par. */ |
| gimple *nop = gimple_build_nop (); |
| gsi_insert_before (&gsi2, nop, GSI_SAME_STMT); |
| |
| worklist.safe_push (nop); |
| } |
| } |
| else if (gimple_code (stmt) == GIMPLE_RETURN |
| || gimple_code (stmt) == GIMPLE_COND |
| || gimple_code (stmt) == GIMPLE_SWITCH |
| || (gimple_code (stmt) == GIMPLE_CALL |
| && !gimple_call_internal_p (stmt) |
| && !omp_sese_active_worker_call (as_a <gcall *> (stmt)))) |
| worklist.safe_push (stmt); |
| else if (is_gimple_assign (stmt)) |
| { |
| tree lhs = gimple_assign_lhs (stmt); |
| |
| /* Force assignments to components/fields/elements of local |
| aggregates into fully-partitioned (redundant) mode. This |
| avoids having to broadcast the whole aggregate. The RHS of |
| the assignment will be propagated using the normal |
| mechanism. */ |
| |
| switch (TREE_CODE (lhs)) |
| { |
| case COMPONENT_REF: |
| case BIT_FIELD_REF: |
| case ARRAY_REF: |
| { |
| tree aggr = TREE_OPERAND (lhs, 0); |
| |
| if (local_var_based_p (aggr)) |
| worklist.safe_push (stmt); |
| } |
| break; |
| |
| default: |
| ; |
| } |
| } |
| } |
| } |
| |
| /* Split blocks on the worklist. */ |
| unsigned ix; |
| gimple *stmt; |
| |
| for (ix = 0; worklist.iterate (ix, &stmt); ix++) |
| { |
| basic_block block = gimple_bb (stmt); |
| |
| if (gimple_code (stmt) == GIMPLE_COND) |
| { |
| gcond *orig_cond = as_a <gcond *> (stmt); |
| tree_code code = gimple_expr_code (orig_cond); |
| tree pred = make_ssa_name (boolean_type_node); |
| gimple *asgn = gimple_build_assign (pred, code, |
| gimple_cond_lhs (orig_cond), |
| gimple_cond_rhs (orig_cond)); |
| gcond *new_cond |
| = gimple_build_cond (NE_EXPR, pred, boolean_false_node, |
| gimple_cond_true_label (orig_cond), |
| gimple_cond_false_label (orig_cond)); |
| |
| gimple_stmt_iterator gsi = gsi_for_stmt (stmt); |
| gsi_insert_before (&gsi, asgn, GSI_SAME_STMT); |
| gsi_replace (&gsi, new_cond, true); |
| |
| edge e = split_block (block, asgn); |
| block = e->dest; |
| map->get_or_insert (block) = new_cond; |
| } |
| else if ((gimple_code (stmt) == GIMPLE_CALL |
| && !gimple_call_internal_p (stmt)) |
| || is_gimple_assign (stmt)) |
| { |
| gimple_stmt_iterator gsi = gsi_for_stmt (stmt); |
| gsi_prev (&gsi); |
| |
| edge call = split_block (block, gsi_stmt (gsi)); |
| |
| gimple *call_stmt = gsi_stmt (gsi_start_bb (call->dest)); |
| |
| edge call_to_ret = split_block (call->dest, call_stmt); |
| |
| map->get_or_insert (call_to_ret->src) = call_stmt; |
| } |
| else |
| { |
| gimple_stmt_iterator gsi = gsi_for_stmt (stmt); |
| gsi_prev (&gsi); |
| |
| if (gsi_end_p (gsi)) |
| map->get_or_insert (block) = stmt; |
| else |
| { |
| /* Split block before insn. The insn is in the new block. */ |
| edge e = split_block (block, gsi_stmt (gsi)); |
| |
| block = e->dest; |
| map->get_or_insert (block) = stmt; |
| } |
| } |
| } |
| } |
| |
| static const char * |
| mask_name (unsigned mask) |
| { |
| switch (mask) |
| { |
| case 0: return "gang redundant"; |
| case 1: return "gang partitioned"; |
| case 2: return "worker partitioned"; |
| case 3: return "gang+worker partitioned"; |
| case 4: return "vector partitioned"; |
| case 5: return "gang+vector partitioned"; |
| case 6: return "worker+vector partitioned"; |
| case 7: return "fully partitioned"; |
| default: return "<illegal>"; |
| } |
| } |
| |
| /* Dump this parallel and all its inner parallels. */ |
| |
| static void |
| omp_sese_dump_pars (parallel_g *par, unsigned depth) |
| { |
| fprintf (dump_file, "%u: mask %d (%s) head=%d, tail=%d\n", |
| depth, par->mask, mask_name (par->mask), |
| par->forked_block ? par->forked_block->index : -1, |
| par->join_block ? par->join_block->index : -1); |
| |
| fprintf (dump_file, " blocks:"); |
| |
| basic_block block; |
| for (unsigned ix = 0; par->blocks.iterate (ix, &block); ix++) |
| fprintf (dump_file, " %d", block->index); |
| fprintf (dump_file, "\n"); |
| if (par->inner) |
| omp_sese_dump_pars (par->inner, depth + 1); |
| |
| if (par->next) |
| omp_sese_dump_pars (par->next, depth); |
| } |
| |
| /* If BLOCK contains a fork/join marker, process it to create or |
| terminate a loop structure. Add this block to the current loop, |
| and then walk successor blocks. */ |
| |
| static parallel_g * |
| omp_sese_find_par (bb_stmt_map_t *map, parallel_g *par, basic_block block) |
| { |
| if (block->flags & BB_VISITED) |
| return par; |
| block->flags |= BB_VISITED; |
| |
| if (gimple **stmtp = map->get (block)) |
| { |
| gimple *stmt = *stmtp; |
| |
| if (gimple_code (stmt) == GIMPLE_COND |
| || gimple_code (stmt) == GIMPLE_SWITCH |
| || gimple_code (stmt) == GIMPLE_RETURN |
| || (gimple_code (stmt) == GIMPLE_CALL |
| && !gimple_call_internal_p (stmt)) |
| || is_gimple_assign (stmt)) |
| { |
| /* A single block that is forced to be at the maximum partition |
| level. Make a singleton par for it. */ |
| par = new parallel_g (par, GOMP_DIM_MASK (GOMP_DIM_GANG) |
| | GOMP_DIM_MASK (GOMP_DIM_WORKER) |
| | GOMP_DIM_MASK (GOMP_DIM_VECTOR)); |
| par->forked_block = block; |
| par->forked_stmt = stmt; |
| par->blocks.safe_push (block); |
| par = par->parent; |
| goto walk_successors; |
| } |
| else if (gimple_nop_p (stmt)) |
| { |
| basic_block pred = single_pred (block); |
| gcc_assert (pred); |
| gimple_stmt_iterator gsi = gsi_last_bb (pred); |
| gimple *final_stmt = gsi_stmt (gsi); |
| |
| if (gimple_call_internal_p (final_stmt, IFN_UNIQUE)) |
| { |
| gcall *call = as_a <gcall *> (final_stmt); |
| enum ifn_unique_kind k = ((enum ifn_unique_kind) |
| TREE_INT_CST_LOW (gimple_call_arg (call, 0))); |
| |
| if (k == IFN_UNIQUE_OACC_FORK) |
| { |
| HOST_WIDE_INT dim |
| = TREE_INT_CST_LOW (gimple_call_arg (call, 2)); |
| unsigned mask = (dim >= 0) ? GOMP_DIM_MASK (dim) : 0; |
| |
| par = new parallel_g (par, mask); |
| par->forked_block = block; |
| par->forked_stmt = final_stmt; |
| par->fork_stmt = stmt; |
| } |
| else |
| gcc_unreachable (); |
| } |
| else |
| gcc_unreachable (); |
| } |
| else if (gimple_call_internal_p (stmt, IFN_UNIQUE)) |
| { |
| gcall *call = as_a <gcall *> (stmt); |
| enum ifn_unique_kind k = ((enum ifn_unique_kind) |
| TREE_INT_CST_LOW (gimple_call_arg (call, 0))); |
| if (k == IFN_UNIQUE_OACC_JOIN) |
| { |
| HOST_WIDE_INT dim = TREE_INT_CST_LOW (gimple_call_arg (stmt, 2)); |
| unsigned mask = (dim >= 0) ? GOMP_DIM_MASK (dim) : 0; |
| |
| gcc_assert (par->mask == mask); |
| par->join_block = block; |
| par->join_stmt = stmt; |
| par = par->parent; |
| } |
| else |
| gcc_unreachable (); |
| } |
| else |
| gcc_unreachable (); |
| } |
| |
| if (par) |
| /* Add this block onto the current loop's list of blocks. */ |
| par->blocks.safe_push (block); |
| else |
| /* This must be the entry block. Create a NULL parallel. */ |
| par = new parallel_g (0, 0); |
| |
| walk_successors: |
| /* Walk successor blocks. */ |
| edge e; |
| edge_iterator ei; |
| |
| FOR_EACH_EDGE (e, ei, block->succs) |
| omp_sese_find_par (map, par, e->dest); |
| |
| return par; |
| } |
| |
| /* DFS walk the CFG looking for fork & join markers. Construct |
| loop structures as we go. MAP is a mapping of basic blocks |
| to head & tail markers, discovered when splitting blocks. This |
| speeds up the discovery. We rely on the BB visited flag having |
| been cleared when splitting blocks. */ |
| |
| static parallel_g * |
| omp_sese_discover_pars (bb_stmt_map_t *map) |
| { |
| basic_block block; |
| |
| /* Mark exit blocks as visited. */ |
| block = EXIT_BLOCK_PTR_FOR_FN (cfun); |
| block->flags |= BB_VISITED; |
| |
| /* And entry block as not. */ |
| block = ENTRY_BLOCK_PTR_FOR_FN (cfun); |
| block->flags &= ~BB_VISITED; |
| |
| parallel_g *par = omp_sese_find_par (map, 0, block); |
| |
| if (dump_file) |
| { |
| fprintf (dump_file, "\nLoops\n"); |
| omp_sese_dump_pars (par, 0); |
| fprintf (dump_file, "\n"); |
| } |
| |
| return par; |
| } |
| |
| static void |
| populate_single_mode_bitmaps (parallel_g *par, bitmap worker_single, |
| bitmap vector_single, unsigned outer_mask, |
| int depth) |
| { |
| unsigned mask = outer_mask | par->mask; |
| |
| basic_block block; |
| |
| for (unsigned i = 0; par->blocks.iterate (i, &block); i++) |
| { |
| if ((mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) == 0) |
| bitmap_set_bit (worker_single, block->index); |
| |
| if ((mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)) == 0) |
| bitmap_set_bit (vector_single, block->index); |
| } |
| |
| if (par->inner) |
| populate_single_mode_bitmaps (par->inner, worker_single, vector_single, |
| mask, depth + 1); |
| if (par->next) |
| populate_single_mode_bitmaps (par->next, worker_single, vector_single, |
| outer_mask, depth); |
| } |
| |
| /* A map from SSA names or var decls to record fields. */ |
| |
| typedef hash_map<tree, tree> field_map_t; |
| |
| /* For each propagation record type, this is a map from SSA names or var decls |
| to propagate, to the field in the record type that should be used for |
| transmission and reception. */ |
| |
| typedef hash_map<tree, field_map_t *> record_field_map_t; |
| |
| static GTY(()) record_field_map_t *field_map; |
| |
| static void |
| install_var_field (tree var, tree record_type) |
| { |
| field_map_t *fields = *field_map->get (record_type); |
| tree name; |
| char tmp[20]; |
| |
| if (TREE_CODE (var) == SSA_NAME) |
| name = SSA_NAME_IDENTIFIER (var); |
| else if (TREE_CODE (var) == VAR_DECL) |
| name = DECL_NAME (var); |
| else |
| gcc_unreachable (); |
| |
| gcc_assert (!fields->get (var)); |
| |
| if (!name) |
| { |
| sprintf (tmp, "_%u", (unsigned) SSA_NAME_VERSION (var)); |
| name = get_identifier (tmp); |
| } |
| |
| tree type = TREE_TYPE (var); |
| |
| if (POINTER_TYPE_P (type) |
| && TYPE_RESTRICT (type)) |
| type = build_qualified_type (type, TYPE_QUALS (type) & ~TYPE_QUAL_RESTRICT); |
| |
| tree field = build_decl (BUILTINS_LOCATION, FIELD_DECL, name, type); |
| |
| if (TREE_CODE (var) == VAR_DECL && type == TREE_TYPE (var)) |
| { |
| SET_DECL_ALIGN (field, DECL_ALIGN (var)); |
| DECL_USER_ALIGN (field) = DECL_USER_ALIGN (var); |
| TREE_THIS_VOLATILE (field) = TREE_THIS_VOLATILE (var); |
| } |
| else |
| SET_DECL_ALIGN (field, TYPE_ALIGN (type)); |
| |
| fields->put (var, field); |
| |
| insert_field_into_struct (record_type, field); |
| } |
| |
| /* Sets of SSA_NAMES or VAR_DECLs to propagate. */ |
| typedef hash_set<tree> propagation_set; |
| |
| static void |
| find_ssa_names_to_propagate (parallel_g *par, unsigned outer_mask, |
| bitmap worker_single, bitmap vector_single, |
| vec<propagation_set *> *prop_set) |
| { |
| unsigned mask = outer_mask | par->mask; |
| |
| if (par->inner) |
| find_ssa_names_to_propagate (par->inner, mask, worker_single, |
| vector_single, prop_set); |
| if (par->next) |
| find_ssa_names_to_propagate (par->next, outer_mask, worker_single, |
| vector_single, prop_set); |
| |
| if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) |
| { |
| basic_block block; |
| int ix; |
| |
| for (ix = 0; par->blocks.iterate (ix, &block); ix++) |
| { |
| for (gphi_iterator psi = gsi_start_phis (block); |
| !gsi_end_p (psi); gsi_next (&psi)) |
| { |
| gphi *phi = psi.phi (); |
| use_operand_p use; |
| ssa_op_iter iter; |
| |
| FOR_EACH_PHI_ARG (use, phi, iter, SSA_OP_USE) |
| { |
| tree var = USE_FROM_PTR (use); |
| |
| if (TREE_CODE (var) != SSA_NAME) |
| continue; |
| |
| gimple *def_stmt = SSA_NAME_DEF_STMT (var); |
| |
| if (gimple_nop_p (def_stmt)) |
| continue; |
| |
| basic_block def_bb = gimple_bb (def_stmt); |
| |
| if (bitmap_bit_p (worker_single, def_bb->index)) |
| { |
| if (!(*prop_set)[def_bb->index]) |
| (*prop_set)[def_bb->index] = new propagation_set; |
| |
| propagation_set *ws_prop = (*prop_set)[def_bb->index]; |
| |
| ws_prop->add (var); |
| } |
| } |
| } |
| |
| for (gimple_stmt_iterator gsi = gsi_start_bb (block); |
| !gsi_end_p (gsi); gsi_next (&gsi)) |
| { |
| use_operand_p use; |
| ssa_op_iter iter; |
| gimple *stmt = gsi_stmt (gsi); |
| |
| FOR_EACH_SSA_USE_OPERAND (use, stmt, iter, SSA_OP_USE) |
| { |
| tree var = USE_FROM_PTR (use); |
| |
| gimple *def_stmt = SSA_NAME_DEF_STMT (var); |
| |
| if (gimple_nop_p (def_stmt)) |
| continue; |
| |
| basic_block def_bb = gimple_bb (def_stmt); |
| |
| if (bitmap_bit_p (worker_single, def_bb->index)) |
| { |
| if (!(*prop_set)[def_bb->index]) |
| (*prop_set)[def_bb->index] = new propagation_set; |
| |
| propagation_set *ws_prop = (*prop_set)[def_bb->index]; |
| |
| ws_prop->add (var); |
| } |
| } |
| } |
| } |
| } |
| } |
| |
| /* Callback for walk_gimple_stmt to find RHS VAR_DECLs (uses) in a |
| statement. */ |
| |
| static tree |
| find_partitioned_var_uses_1 (tree *node, int *, void *data) |
| { |
| walk_stmt_info *wi = (walk_stmt_info *) data; |
| hash_set<tree> *partitioned_var_uses = (hash_set<tree> *) wi->info; |
| |
| if (!wi->is_lhs && VAR_P (*node)) |
| partitioned_var_uses->add (*node); |
| |
| return NULL_TREE; |
| } |
| |
| static void |
| find_partitioned_var_uses (parallel_g *par, unsigned outer_mask, |
| hash_set<tree> *partitioned_var_uses) |
| { |
| unsigned mask = outer_mask | par->mask; |
| |
| if (par->inner) |
| find_partitioned_var_uses (par->inner, mask, partitioned_var_uses); |
| if (par->next) |
| find_partitioned_var_uses (par->next, outer_mask, partitioned_var_uses); |
| |
| if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) |
| { |
| basic_block block; |
| int ix; |
| |
| for (ix = 0; par->blocks.iterate (ix, &block); ix++) |
| for (gimple_stmt_iterator gsi = gsi_start_bb (block); |
| !gsi_end_p (gsi); gsi_next (&gsi)) |
| { |
| walk_stmt_info wi; |
| memset (&wi, 0, sizeof (wi)); |
| wi.info = (void *) partitioned_var_uses; |
| walk_gimple_stmt (&gsi, NULL, find_partitioned_var_uses_1, &wi); |
| } |
| } |
| } |
| |
| /* Gang-private variables (typically placed in a GPU's shared memory) do not |
| need to be processed by the worker-propagation mechanism. Populate the |
| GANGPRIVATE_VARS set with any such variables found in the current |
| function. */ |
| |
| static void |
| find_gangprivate_vars (hash_set<tree> *gangprivate_vars) |
| { |
| basic_block block; |
| |
| FOR_EACH_BB_FN (block, cfun) |
| { |
| for (gimple_stmt_iterator gsi = gsi_start_bb (block); |
| !gsi_end_p (gsi); |
| gsi_next (&gsi)) |
| { |
| gimple *stmt = gsi_stmt (gsi); |
| |
| if (gimple_call_internal_p (stmt, IFN_UNIQUE)) |
| { |
| enum ifn_unique_kind k = ((enum ifn_unique_kind) |
| TREE_INT_CST_LOW (gimple_call_arg (stmt, 0))); |
| if (k == IFN_UNIQUE_OACC_PRIVATE) |
| { |
| HOST_WIDE_INT level |
| = TREE_INT_CST_LOW (gimple_call_arg (stmt, 2)); |
| if (level != GOMP_DIM_GANG) |
| continue; |
| for (unsigned i = 3; i < gimple_call_num_args (stmt); i++) |
| { |
| tree arg = gimple_call_arg (stmt, i); |
| gcc_assert (TREE_CODE (arg) == ADDR_EXPR); |
| tree decl = TREE_OPERAND (arg, 0); |
| gangprivate_vars->add (decl); |
| } |
| } |
| } |
| } |
| } |
| } |
| |
| static void |
| find_local_vars_to_propagate (parallel_g *par, unsigned outer_mask, |
| hash_set<tree> *partitioned_var_uses, |
| hash_set<tree> *gangprivate_vars, |
| vec<propagation_set *> *prop_set) |
| { |
| unsigned mask = outer_mask | par->mask; |
| |
| if (par->inner) |
| find_local_vars_to_propagate (par->inner, mask, partitioned_var_uses, |
| gangprivate_vars, prop_set); |
| if (par->next) |
| find_local_vars_to_propagate (par->next, outer_mask, partitioned_var_uses, |
| gangprivate_vars, prop_set); |
| |
| if (!(mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))) |
| { |
| basic_block block; |
| int ix; |
| |
| for (ix = 0; par->blocks.iterate (ix, &block); ix++) |
| { |
| for (gimple_stmt_iterator gsi = gsi_start_bb (block); |
| !gsi_end_p (gsi); gsi_next (&gsi)) |
| { |
| gimple *stmt = gsi_stmt (gsi); |
| tree var; |
| unsigned i; |
| |
| FOR_EACH_LOCAL_DECL (cfun, i, var) |
| { |
| if (!VAR_P (var) |
| || is_global_var (var) |
| || AGGREGATE_TYPE_P (TREE_TYPE (var)) |
| || !partitioned_var_uses->contains (var) |
| || gangprivate_vars->contains (var)) |
| continue; |
| |
| if (stmt_may_clobber_ref_p (stmt, var)) |
| { |
| if (dump_file) |
| { |
| fprintf (dump_file, "bb %u: local variable may be " |
| "clobbered in %s mode: ", block->index, |
| mask_name (mask)); |
| print_generic_expr (dump_file, var, TDF_SLIM); |
| fprintf (dump_file, "\n"); |
| } |
| |
| if (!(*prop_set)[block->index]) |
| (*prop_set)[block->index] = new propagation_set; |
| |
| propagation_set *ws_prop |
| = (*prop_set)[block->index]; |
| |
| ws_prop->add (var); |
| } |
| } |
| } |
| } |
| } |
| } |
| |
| /* Transform basic blocks FROM, TO (which may be the same block) into: |
| if (GOACC_single_start ()) |
| BLOCK; |
| GOACC_barrier (); |
| \ | / |
| +----+ |
| | | (new) predicate block |
| +----+-- |
| \ | / \ | / |t \ |
| +----+ +----+ +----+ | |
| | | | | ===> | | | f (old) from block |
| +----+ +----+ +----+ | |
| | t/ \f | / |
| +----+/ |
| (split (split before | | skip block |
| at end) condition) +----+ |
| t/ \f |
| */ |
| |
| static void |
| worker_single_simple (basic_block from, basic_block to, |
| hash_set<tree> *def_escapes_block) |
| { |
| gimple *call, *cond; |
| tree lhs, decl; |
| basic_block skip_block; |
| |
| gimple_stmt_iterator gsi = gsi_last_bb (to); |
| if (EDGE_COUNT (to->succs) > 1) |
| { |
| gcc_assert (gimple_code (gsi_stmt (gsi)) == GIMPLE_COND); |
| gsi_prev (&gsi); |
| } |
| edge e = split_block (to, gsi_stmt (gsi)); |
| skip_block = e->dest; |
| |
| gimple_stmt_iterator start = gsi_after_labels (from); |
| |
| decl = builtin_decl_explicit (BUILT_IN_GOACC_SINGLE_START); |
| lhs = create_tmp_var (TREE_TYPE (TREE_TYPE (decl))); |
| call = gimple_build_call (decl, 0); |
| gimple_call_set_lhs (call, lhs); |
| gsi_insert_before (&start, call, GSI_NEW_STMT); |
| update_stmt (call); |
| |
| cond = gimple_build_cond (EQ_EXPR, lhs, |
| fold_convert_loc (UNKNOWN_LOCATION, |
| TREE_TYPE (lhs), |
| boolean_true_node), |
| NULL_TREE, NULL_TREE); |
| gsi_insert_after (&start, cond, GSI_NEW_STMT); |
| update_stmt (cond); |
| |
| edge et = split_block (from, cond); |
| et->flags &= ~EDGE_FALLTHRU; |
| et->flags |= EDGE_TRUE_VALUE; |
| /* Make the active worker the more probable path so we prefer fallthrough |
| (letting the idle workers jump around more). */ |
| et->probability = profile_probability::likely (); |
| |
| edge ef = make_edge (from, skip_block, EDGE_FALSE_VALUE); |
| ef->probability = et->probability.invert (); |
| |
| basic_block neutered = split_edge (ef); |
| gimple_stmt_iterator neut_gsi = gsi_last_bb (neutered); |
| |
| for (gsi = gsi_start_bb (et->dest); !gsi_end_p (gsi); gsi_next (&gsi)) |
| { |
| gimple *stmt = gsi_stmt (gsi); |
| ssa_op_iter iter; |
| tree var; |
| |
| FOR_EACH_SSA_TREE_OPERAND (var, stmt, iter, SSA_OP_DEF) |
| { |
| if (def_escapes_block->contains (var)) |
| { |
| gphi *join_phi = create_phi_node (NULL_TREE, skip_block); |
| create_new_def_for (var, join_phi, |
| gimple_phi_result_ptr (join_phi)); |
| add_phi_arg (join_phi, var, e, UNKNOWN_LOCATION); |
| |
| tree neutered_def = copy_ssa_name (var, NULL); |
| /* We really want "don't care" or some value representing |
| undefined here, but optimizers will probably get rid of the |
| zero-assignments anyway. */ |
| gassign *zero = gimple_build_assign (neutered_def, |
| build_zero_cst (TREE_TYPE (neutered_def))); |
| |
| gsi_insert_after (&neut_gsi, zero, GSI_CONTINUE_LINKING); |
| update_stmt (zero); |
| |
| add_phi_arg (join_phi, neutered_def, single_succ_edge (neutered), |
| UNKNOWN_LOCATION); |
| update_stmt (join_phi); |
| } |
| } |
| } |
| |
| gsi = gsi_start_bb (skip_block); |
| |
| decl = builtin_decl_explicit (BUILT_IN_GOACC_BARRIER); |
| gimple *acc_bar = gimple_build_call (decl, 0); |
| |
| gsi_insert_before (&gsi, acc_bar, GSI_SAME_STMT); |
| update_stmt (acc_bar); |
| } |
| |
| /* This is a copied and renamed omp-low.c:omp_build_component_ref. */ |
| |
| static tree |
| oacc_build_component_ref (tree obj, tree field) |
| { |
| tree ret = build3 (COMPONENT_REF, TREE_TYPE (field), obj, field, NULL); |
| if (TREE_THIS_VOLATILE (field)) |
| TREE_THIS_VOLATILE (ret) |= 1; |
| if (TREE_READONLY (field)) |
| TREE_READONLY (ret) |= 1; |
| return ret; |
| } |
| |
| static tree |
| build_receiver_ref (tree record_type, tree var, tree receiver_decl) |
| { |
| field_map_t *fields = *field_map->get (record_type); |
| tree x = build_simple_mem_ref (receiver_decl); |
| tree field = *fields->get (var); |
| TREE_THIS_NOTRAP (x) = 1; |
| x = oacc_build_component_ref (x, field); |
| return x; |
| } |
| |
| static tree |
| build_sender_ref (tree record_type, tree var, tree sender_decl) |
| { |
| field_map_t *fields = *field_map->get (record_type); |
| tree field = *fields->get (var); |
| return oacc_build_component_ref (sender_decl, field); |
| } |
| |
| static int |
| sort_by_ssa_version_or_uid (const void *p1, const void *p2) |
| { |
| const tree t1 = *(const tree *)p1; |
| const tree t2 = *(const tree *)p2; |
| |
| if (TREE_CODE (t1) == SSA_NAME && TREE_CODE (t2) == SSA_NAME) |
| return SSA_NAME_VERSION (t1) - SSA_NAME_VERSION (t2); |
| else if (TREE_CODE (t1) == SSA_NAME && TREE_CODE (t2) != SSA_NAME) |
| return -1; |
| else if (TREE_CODE (t1) != SSA_NAME && TREE_CODE (t2) == SSA_NAME) |
| return 1; |
| else |
| return DECL_UID (t1) - DECL_UID (t2); |
| } |
| |
| static int |
| sort_by_size_then_ssa_version_or_uid (const void *p1, const void *p2) |
| { |
| const tree t1 = *(const tree *)p1; |
| const tree t2 = *(const tree *)p2; |
| unsigned HOST_WIDE_INT s1 = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (t1))); |
| unsigned HOST_WIDE_INT s2 = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (t2))); |
| if (s1 != s2) |
| return s2 - s1; |
| else |
| return sort_by_ssa_version_or_uid (p1, p2); |
| } |
| |
| static void |
| worker_single_copy (basic_block from, basic_block to, |
| hash_set<tree> *def_escapes_block, |
| hash_set<tree> *worker_partitioned_uses, |
| tree record_type) |
| { |
| /* If we only have virtual defs, we'll have no record type, but we still want |
| to emit single_copy_start and (particularly) single_copy_end to act as |
| a vdef source on the neutered edge representing memory writes on the |
| non-neutered edge. */ |
| if (!record_type) |
| record_type = char_type_node; |
| |
| tree sender_decl |
| = targetm.goacc.create_propagation_record (record_type, true, |
| ".oacc_worker_o"); |
| tree receiver_decl |
| = targetm.goacc.create_propagation_record (record_type, false, |
| ".oacc_worker_i"); |
| |
| gimple_stmt_iterator gsi = gsi_last_bb (to); |
| if (EDGE_COUNT (to->succs) > 1) |
| gsi_prev (&gsi); |
| edge e = split_block (to, gsi_stmt (gsi)); |
| basic_block barrier_block = e->dest; |
| |
| gimple_stmt_iterator start = gsi_after_labels (from); |
| |
| tree decl = builtin_decl_explicit (BUILT_IN_GOACC_SINGLE_COPY_START); |
| |
| tree lhs = create_tmp_var (TREE_TYPE (TREE_TYPE (decl))); |
| |
| gimple *call = gimple_build_call (decl, 1, |
| build_fold_addr_expr (sender_decl)); |
| gimple_call_set_lhs (call, lhs); |
| gsi_insert_before (&start, call, GSI_NEW_STMT); |
| update_stmt (call); |
| |
| tree conv_tmp = make_ssa_name (TREE_TYPE (receiver_decl)); |
| |
| gimple *conv = gimple_build_assign (conv_tmp, |
| fold_convert (TREE_TYPE (receiver_decl), |
| lhs)); |
| update_stmt (conv); |
| gsi_insert_after (&start, conv, GSI_NEW_STMT); |
| gimple *asgn = gimple_build_assign (receiver_decl, conv_tmp); |
| gsi_insert_after (&start, asgn, GSI_NEW_STMT); |
| update_stmt (asgn); |
| |
| tree zero_ptr = build_int_cst (TREE_TYPE (receiver_decl), 0); |
| |
| tree recv_tmp = make_ssa_name (TREE_TYPE (receiver_decl)); |
| asgn = gimple_build_assign (recv_tmp, receiver_decl); |
| gsi_insert_after (&start, asgn, GSI_NEW_STMT); |
| update_stmt (asgn); |
| |
| gimple *cond = gimple_build_cond (EQ_EXPR, recv_tmp, zero_ptr, NULL_TREE, |
| NULL_TREE); |
| update_stmt (cond); |
| |
| gsi_insert_after (&start, cond, GSI_NEW_STMT); |
| |
| edge et = split_block (from, cond); |
| et->flags &= ~EDGE_FALLTHRU; |
| et->flags |= EDGE_TRUE_VALUE; |
| /* Make the active worker the more probable path so we prefer fallthrough |
| (letting the idle workers jump around more). */ |
| et->probability = profile_probability::likely (); |
| |
| basic_block body = et->dest; |
| |
| edge ef = make_edge (from, barrier_block, EDGE_FALSE_VALUE); |
| ef->probability = et->probability.invert (); |
| |
| decl = builtin_decl_explicit (BUILT_IN_GOACC_BARRIER); |
| gimple *acc_bar = gimple_build_call (decl, 0); |
| |
| gimple_stmt_iterator bar_gsi = gsi_start_bb (barrier_block); |
| gsi_insert_before (&bar_gsi, acc_bar, GSI_NEW_STMT); |
| |
| cond = gimple_build_cond (NE_EXPR, recv_tmp, zero_ptr, NULL_TREE, NULL_TREE); |
| gsi_insert_after (&bar_gsi, cond, GSI_NEW_STMT); |
| |
| edge et2 = split_block (barrier_block, cond); |
| et2->flags &= ~EDGE_FALLTHRU; |
| et2->flags |= EDGE_TRUE_VALUE; |
| et2->probability = profile_probability::unlikely (); |
| |
| basic_block exit_block = et2->dest; |
| |
| basic_block copyout_block = split_edge (et2); |
| edge ef2 = make_edge (barrier_block, exit_block, EDGE_FALSE_VALUE); |
| ef2->probability = et2->probability.invert (); |
| |
| gimple_stmt_iterator copyout_gsi = gsi_start_bb (copyout_block); |
| |
| edge copyout_to_exit = single_succ_edge (copyout_block); |
| |
| gimple_seq sender_seq = NULL; |
| |
| /* Make sure we iterate over definitions in a stable order. */ |
| auto_vec<tree> escape_vec (def_escapes_block->elements ()); |
| for (hash_set<tree>::iterator it = def_escapes_block->begin (); |
| it != def_escapes_block->end (); ++it) |
| escape_vec.quick_push (*it); |
| escape_vec.qsort (sort_by_ssa_version_or_uid); |
| |
| for (unsigned i = 0; i < escape_vec.length (); i++) |
| { |
| tree var = escape_vec[i]; |
| |
| if (TREE_CODE (var) == SSA_NAME && SSA_NAME_IS_VIRTUAL_OPERAND (var)) |
| continue; |
| |
| tree barrier_def = 0; |
| |
| if (TREE_CODE (var) == SSA_NAME) |
| { |
| gimple *def_stmt = SSA_NAME_DEF_STMT (var); |
| |
| if (gimple_nop_p (def_stmt)) |
| continue; |
| |
| /* The barrier phi takes one result from the actual work of the |
| block we're neutering, and the other result is constant zero of |
| the same type. */ |
| |
| gphi *barrier_phi = create_phi_node (NULL_TREE, barrier_block); |
| barrier_def = create_new_def_for (var, barrier_phi, |
| gimple_phi_result_ptr (barrier_phi)); |
| |
| add_phi_arg (barrier_phi, var, e, UNKNOWN_LOCATION); |
| add_phi_arg (barrier_phi, build_zero_cst (TREE_TYPE (var)), ef, |
| UNKNOWN_LOCATION); |
| |
| update_stmt (barrier_phi); |
| } |
| else |
| gcc_assert (TREE_CODE (var) == VAR_DECL); |
| |
| /* If we had no record type, we will have no fields map. */ |
| field_map_t **fields_p = field_map->get (record_type); |
| field_map_t *fields = fields_p ? *fields_p : NULL; |
| |
| if (worker_partitioned_uses->contains (var) |
| && fields |
| && fields->get (var)) |
| { |
| tree neutered_def = make_ssa_name (TREE_TYPE (var)); |
| |
| /* Receive definition from shared memory block. */ |
| |
| tree receiver_ref = build_receiver_ref (record_type, var, |
| receiver_decl); |
| gassign *recv = gimple_build_assign (neutered_def, |
| receiver_ref); |
| gsi_insert_after (©out_gsi, recv, GSI_CONTINUE_LINKING); |
| update_stmt (recv); |
| |
| if (TREE_CODE (var) == VAR_DECL) |
| { |
| /* If it's a VAR_DECL, we only copied to an SSA temporary. Copy |
| to the final location now. */ |
| gassign *asgn = gimple_build_assign (var, neutered_def); |
| gsi_insert_after (©out_gsi, asgn, GSI_CONTINUE_LINKING); |
| update_stmt (asgn); |
| } |
| else |
| { |
| /* If it's an SSA name, create a new phi at the join node to |
| represent either the output from the active worker (the |
| barrier) or the inactive workers (the copyout block). */ |
| gphi *join_phi = create_phi_node (NULL_TREE, exit_block); |
| create_new_def_for (barrier_def, join_phi, |
| gimple_phi_result_ptr (join_phi)); |
| add_phi_arg (join_phi, barrier_def, ef2, UNKNOWN_LOCATION); |
| add_phi_arg (join_phi, neutered_def, copyout_to_exit, |
| UNKNOWN_LOCATION); |
| update_stmt (join_phi); |
| } |
| |
| /* Send definition to shared memory block. */ |
| |
| tree sender_ref = build_sender_ref (record_type, var, sender_decl); |
| |
| if (TREE_CODE (var) == SSA_NAME) |
| { |
| gassign *send = gimple_build_assign (sender_ref, var); |
| gimple_seq_add_stmt (&sender_seq, send); |
| update_stmt (send); |
| } |
| else if (TREE_CODE (var) == VAR_DECL) |
| { |
| tree tmp = make_ssa_name (TREE_TYPE (var)); |
| gassign *send = gimple_build_assign (tmp, var); |
| gimple_seq_add_stmt (&sender_seq, send); |
| update_stmt (send); |
| send = gimple_build_assign (sender_ref, tmp); |
| gimple_seq_add_stmt (&sender_seq, send); |
| update_stmt (send); |
| } |
| else |
| gcc_unreachable (); |
| } |
| } |
| |
| /* It's possible for the ET->DEST block (the work done by the active thread) |
| to finish with a control-flow insn, e.g. a UNIQUE function call. Split |
| the block and add SENDER_SEQ in the latter part to avoid having control |
| flow in the middle of a BB. */ |
| |
| decl = builtin_decl_explicit (BUILT_IN_GOACC_SINGLE_COPY_END); |
| call = gimple_build_call (decl, 1, build_fold_addr_expr (sender_decl)); |
| gimple_seq_add_stmt (&sender_seq, call); |
| |
| gsi = gsi_last_bb (body); |
| gimple *last = gsi_stmt (gsi); |
| basic_block sender_block = split_block (body, last)->dest; |
| gsi = gsi_last_bb (sender_block); |
| gsi_insert_seq_after (&gsi, sender_seq, GSI_CONTINUE_LINKING); |
| } |
| |
| static void |
| neuter_worker_single (parallel_g *par, unsigned outer_mask, |
| bitmap worker_single, bitmap vector_single, |
| vec<propagation_set *> *prop_set, |
| hash_set<tree> *partitioned_var_uses) |
| { |
| unsigned mask = outer_mask | par->mask; |
| |
| if ((mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) == 0) |
| { |
| basic_block block; |
| |
| for (unsigned i = 0; par->blocks.iterate (i, &block); i++) |
| { |
| bool has_defs = false; |
| hash_set<tree> def_escapes_block; |
| hash_set<tree> worker_partitioned_uses; |
| unsigned j; |
| tree var; |
| |
| FOR_EACH_SSA_NAME (j, var, cfun) |
| { |
| if (SSA_NAME_IS_VIRTUAL_OPERAND (var)) |
| { |
| has_defs = true; |
| continue; |
| } |
| |
| gimple *def_stmt = SSA_NAME_DEF_STMT (var); |
| |
| if (gimple_nop_p (def_stmt)) |
| continue; |
| |
| if (gimple_bb (def_stmt)->index != block->index) |
| continue; |
| |
| gimple *use_stmt; |
| imm_use_iterator use_iter; |
| bool uses_outside_block = false; |
| bool worker_partitioned_use = false; |
| |
| FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, var) |
| { |
| int blocknum = gimple_bb (use_stmt)->index; |
| |
| /* Don't propagate SSA names that are only used in the |
| current block, unless the usage is in a phi node: that |
| means the name left the block, then came back in at the |
| top. */ |
| if (blocknum != block->index |
| || gimple_code (use_stmt) == GIMPLE_PHI) |
| uses_outside_block = true; |
| if (!bitmap_bit_p (worker_single, blocknum)) |
| worker_partitioned_use = true; |
| } |
| |
| if (uses_outside_block) |
| def_escapes_block.add (var); |
| |
| if (worker_partitioned_use) |
| { |
| worker_partitioned_uses.add (var); |
| has_defs = true; |
| } |
| } |
| |
| propagation_set *ws_prop = (*prop_set)[block->index]; |
| |
| if (ws_prop) |
| { |
| for (propagation_set::iterator it = ws_prop->begin (); |
| it != ws_prop->end (); |
| ++it) |
| { |
| tree var = *it; |
| if (TREE_CODE (var) == VAR_DECL) |
| { |
| def_escapes_block.add (var); |
| if (partitioned_var_uses->contains (var)) |
| { |
| worker_partitioned_uses.add (var); |
| has_defs = true; |
| } |
| } |
| } |
| |
| delete ws_prop; |
| (*prop_set)[block->index] = 0; |
| } |
| |
| tree record_type = (tree) block->aux; |
| |
| if (has_defs) |
| worker_single_copy (block, block, &def_escapes_block, |
| &worker_partitioned_uses, record_type); |
| else |
| worker_single_simple (block, block, &def_escapes_block); |
| } |
| } |
| |
| if ((outer_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) == 0) |
| { |
| basic_block block; |
| |
| for (unsigned i = 0; par->blocks.iterate (i, &block); i++) |
| for (gimple_stmt_iterator gsi = gsi_start_bb (block); |
| !gsi_end_p (gsi); |
| gsi_next (&gsi)) |
| { |
| gimple *stmt = gsi_stmt (gsi); |
| |
| if (gimple_code (stmt) == GIMPLE_CALL |
| && !gimple_call_internal_p (stmt) |
| && !omp_sese_active_worker_call (as_a <gcall *> (stmt))) |
| { |
| /* If we have an OpenACC routine call in worker-single mode, |
| place barriers before and afterwards to prevent |
| clobbering re-used shared memory regions (as are used |
| for AMDGCN at present, for example). */ |
| tree decl = builtin_decl_explicit (BUILT_IN_GOACC_BARRIER); |
| gsi_insert_before (&gsi, gimple_build_call (decl, 0), |
| GSI_SAME_STMT); |
| gsi_insert_after (&gsi, gimple_build_call (decl, 0), |
| GSI_NEW_STMT); |
| } |
| } |
| } |
| |
| if (par->inner) |
| neuter_worker_single (par->inner, mask, worker_single, vector_single, |
| prop_set, partitioned_var_uses); |
| if (par->next) |
| neuter_worker_single (par->next, outer_mask, worker_single, vector_single, |
| prop_set, partitioned_var_uses); |
| } |
| |
| |
| void |
| oacc_do_neutering (void) |
| { |
| bb_stmt_map_t bb_stmt_map; |
| auto_bitmap worker_single, vector_single; |
| |
| omp_sese_split_blocks (&bb_stmt_map); |
| |
| if (dump_file) |
| { |
| fprintf (dump_file, "\n\nAfter splitting:\n\n"); |
| dump_function_to_file (current_function_decl, dump_file, dump_flags); |
| } |
| |
| unsigned mask = 0; |
| |
| /* If this is a routine, calculate MASK as if the outer levels are already |
| partitioned. */ |
| tree attr = oacc_get_fn_attrib (current_function_decl); |
| if (attr) |
| { |
| tree dims = TREE_VALUE (attr); |
| unsigned ix; |
| for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims)) |
| { |
| tree allowed = TREE_PURPOSE (dims); |
| if (allowed && integer_zerop (allowed)) |
| mask |= GOMP_DIM_MASK (ix); |
| } |
| } |
| |
| parallel_g *par = omp_sese_discover_pars (&bb_stmt_map); |
| populate_single_mode_bitmaps (par, worker_single, vector_single, mask, 0); |
| |
| basic_block bb; |
| FOR_ALL_BB_FN (bb, cfun) |
| bb->aux = NULL; |
| |
| field_map = record_field_map_t::create_ggc (40); |
| |
| vec<propagation_set *> prop_set; |
| prop_set.create (last_basic_block_for_fn (cfun)); |
| |
| for (int i = 0; i < last_basic_block_for_fn (cfun); i++) |
| prop_set.quick_push (0); |
| |
| find_ssa_names_to_propagate (par, mask, worker_single, vector_single, |
| &prop_set); |
| |
| hash_set<tree> partitioned_var_uses; |
| hash_set<tree> gangprivate_vars; |
| |
| find_gangprivate_vars (&gangprivate_vars); |
| find_partitioned_var_uses (par, mask, &partitioned_var_uses); |
| find_local_vars_to_propagate (par, mask, &partitioned_var_uses, |
| &gangprivate_vars, &prop_set); |
| |
| FOR_ALL_BB_FN (bb, cfun) |
| { |
| propagation_set *ws_prop = prop_set[bb->index]; |
| if (ws_prop) |
| { |
| tree record_type = lang_hooks.types.make_type (RECORD_TYPE); |
| tree name = create_tmp_var_name (".oacc_ws_data_s"); |
| name = build_decl (UNKNOWN_LOCATION, TYPE_DECL, name, record_type); |
| DECL_ARTIFICIAL (name) = 1; |
| DECL_NAMELESS (name) = 1; |
| TYPE_NAME (record_type) = name; |
| TYPE_ARTIFICIAL (record_type) = 1; |
| |
| auto_vec<tree> field_vec (ws_prop->elements ()); |
| for (hash_set<tree>::iterator it = ws_prop->begin (); |
| it != ws_prop->end (); ++it) |
| field_vec.quick_push (*it); |
| |
| field_vec.qsort (sort_by_size_then_ssa_version_or_uid); |
| |
| field_map->put (record_type, field_map_t::create_ggc (17)); |
| |
| /* Insert var fields in reverse order, so the last inserted element |
| is the first in the structure. */ |
| for (int i = field_vec.length () - 1; i >= 0; i--) |
| install_var_field (field_vec[i], record_type); |
| |
| layout_type (record_type); |
| |
| bb->aux = (tree) record_type; |
| } |
| } |
| |
| neuter_worker_single (par, mask, worker_single, vector_single, &prop_set, |
| &partitioned_var_uses); |
| |
| prop_set.release (); |
| |
| /* This doesn't seem to make a difference. */ |
| loops_state_clear (LOOP_CLOSED_SSA); |
| |
| /* Neutering worker-single neutered blocks will invalidate dominance info. |
| It may be possible to incrementally update just the affected blocks, but |
| obliterate everything for now. */ |
| free_dominance_info (CDI_DOMINATORS); |
| free_dominance_info (CDI_POST_DOMINATORS); |
| |
| if (dump_file) |
| { |
| fprintf (dump_file, "\n\nAfter neutering:\n\n"); |
| dump_function_to_file (current_function_decl, dump_file, dump_flags); |
| } |
| } |
| |
| /* Analyse a group of BBs within a partitioned region and create N |
| Single-Entry-Single-Exit regions. Some of those regions will be |
| trivial ones consisting of a single BB. The blocks of a |
| partitioned region might form a set of disjoint graphs -- because |
| the region encloses a differently partitoned sub region. |
| |
| We use the linear time algorithm described in 'Finding Regions Fast: |
| Single Entry Single Exit and control Regions in Linear Time' |
| Johnson, Pearson & Pingali. That algorithm deals with complete |
| CFGs, where a back edge is inserted from END to START, and thus the |
| problem becomes one of finding equivalent loops. |
| |
| In this case we have a partial CFG. We complete it by redirecting |
| any incoming edge to the graph to be from an arbitrary external BB, |
| and similarly redirecting any outgoing edge to be to that BB. |
| Thus we end up with a closed graph. |
| |
| The algorithm works by building a spanning tree of an undirected |
| graph and keeping track of back edges from nodes further from the |
| root in the tree to nodes nearer to the root in the tree. In the |
| description below, the root is up and the tree grows downwards. |
| |
| We avoid having to deal with degenerate back-edges to the same |
| block, by splitting each BB into 3 -- one for input edges, one for |
| the node itself and one for the output edges. Such back edges are |
| referred to as 'Brackets'. Cycle equivalent nodes will have the |
| same set of brackets. |
| |
| Determining bracket equivalency is done by maintaining a list of |
| brackets in such a manner that the list length and final bracket |
| uniquely identify the set. |
| |
| We use coloring to mark all BBs with cycle equivalency with the |
| same color. This is the output of the 'Finding Regions Fast' |
| algorithm. Notice it doesn't actually find the set of nodes within |
| a particular region, just unorderd sets of nodes that are the |
| entries and exits of SESE regions. |
| |
| After determining cycle equivalency, we need to find the minimal |
| set of SESE regions. Do this with a DFS coloring walk of the |
| complete graph. We're either 'looking' or 'coloring'. When |
| looking, and we're in the subgraph, we start coloring the color of |
| the current node, and remember that node as the start of the |
| current color's SESE region. Every time we go to a new node, we |
| decrement the count of nodes with thet color. If it reaches zero, |
| we remember that node as the end of the current color's SESE region |
| and return to 'looking'. Otherwise we color the node the current |
| color. |
| |
| This way we end up with coloring the inside of non-trivial SESE |
| regions with the color of that region. */ |
| |
| /* A node in the undirected CFG. The discriminator SECOND indicates just |
| above or just below the BB idicated by FIRST. */ |
| typedef std::pair<basic_block, int> pseudo_node_t; |
| |
| /* A bracket indicates an edge towards the root of the spanning tree of the |
| undirected graph. Each bracket has a color, determined |
| from the currrent set of brackets. */ |
| struct bracket |
| { |
| pseudo_node_t back; /* Back target */ |
| |
| /* Current color and size of set. */ |
| unsigned color; |
| unsigned size; |
| |
| bracket (pseudo_node_t back_) |
| : back (back_), color (~0u), size (~0u) |
| { |
| } |
| |
| unsigned get_color (auto_vec<unsigned> &color_counts, unsigned length) |
| { |
| if (length != size) |
| { |
| size = length; |
| color = color_counts.length (); |
| color_counts.quick_push (0); |
| } |
| color_counts[color]++; |
| return color; |
| } |
| }; |
| |
| typedef auto_vec<bracket> bracket_vec_t; |
| |
| /* Basic block info for finding SESE regions. */ |
| |
| struct bb_sese |
| { |
| int node; /* Node number in spanning tree. */ |
| int parent; /* Parent node number. */ |
| |
| /* The algorithm splits each node A into Ai, A', Ao. The incoming |
| edges arrive at pseudo-node Ai and the outgoing edges leave at |
| pseudo-node Ao. We have to remember which way we arrived at a |
| particular node when generating the spanning tree. dir > 0 means |
| we arrived at Ai, dir < 0 means we arrived at Ao. */ |
| int dir; |
| |
| /* Lowest numbered pseudo-node reached via a backedge from thsis |
| node, or any descendant. */ |
| pseudo_node_t high; |
| |
| int color; /* Cycle-equivalence color */ |
| |
| /* Stack of brackets for this node. */ |
| bracket_vec_t brackets; |
| |
| bb_sese (unsigned node_, unsigned p, int dir_) |
| :node (node_), parent (p), dir (dir_) |
| { |
| } |
| ~bb_sese (); |
| |
| /* Push a bracket ending at BACK. */ |
| void push (const pseudo_node_t &back) |
| { |
| if (dump_file) |
| fprintf (dump_file, "Pushing backedge %d:%+d\n", |
| back.first ? back.first->index : 0, back.second); |
| brackets.safe_push (bracket (back)); |
| } |
| |
| void append (bb_sese *child); |
| void remove (const pseudo_node_t &); |
| |
| /* Set node's color. */ |
| void set_color (auto_vec<unsigned> &color_counts) |
| { |
| color = brackets.last ().get_color (color_counts, brackets.length ()); |
| } |
| }; |
| |
| bb_sese::~bb_sese () |
| { |
| } |
| |
| /* Destructively append CHILD's brackets. */ |
| |
| void |
| bb_sese::append (bb_sese *child) |
| { |
| if (int len = child->brackets.length ()) |
| { |
| int ix; |
| |
| if (dump_file) |
| { |
| for (ix = 0; ix < len; ix++) |
| { |
| const pseudo_node_t &pseudo = child->brackets[ix].back; |
| fprintf (dump_file, "Appending (%d)'s backedge %d:%+d\n", |
| child->node, pseudo.first ? pseudo.first->index : 0, |
| pseudo.second); |
| } |
| } |
| if (!brackets.length ()) |
| std::swap (brackets, child->brackets); |
| else |
| { |
| brackets.reserve (len); |
| for (ix = 0; ix < len; ix++) |
| brackets.quick_push (child->brackets[ix]); |
| } |
| } |
| } |
| |
| /* Remove brackets that terminate at PSEUDO. */ |
| |
| void |
| bb_sese::remove (const pseudo_node_t &pseudo) |
| { |
| unsigned removed = 0; |
| int len = brackets.length (); |
| |
| for (int ix = 0; ix < len; ix++) |
| { |
| if (brackets[ix].back == pseudo) |
| { |
| if (dump_file) |
| fprintf (dump_file, "Removing backedge %d:%+d\n", |
| pseudo.first ? pseudo.first->index : 0, pseudo.second); |
| removed++; |
| } |
| else if (removed) |
| brackets[ix-removed] = brackets[ix]; |
| } |
| while (removed--) |
| brackets.pop (); |
| } |
| |
| /* Accessors for BB's aux pointer. */ |
| #define BB_SET_SESE(B, S) ((B)->aux = (S)) |
| #define BB_GET_SESE(B) ((bb_sese *)(B)->aux) |
| |
| /* DFS walk creating SESE data structures. Only cover nodes with |
| BB_VISITED set. Append discovered blocks to LIST. We number in |
| increments of 3 so that the above and below pseudo nodes can be |
| implicitly numbered too. */ |
| |
| static int |
| omp_sese_number (int n, int p, int dir, basic_block b, |
| auto_vec<basic_block> *list) |
| { |
| if (BB_GET_SESE (b)) |
| return n; |
| |
| if (dump_file) |
| fprintf (dump_file, "Block %d(%d), parent (%d), orientation %+d\n", |
| b->index, n, p, dir); |
| |
| BB_SET_SESE (b, new bb_sese (n, p, dir)); |
| p = n; |
| |
| n += 3; |
| list->quick_push (b); |
| |
| /* First walk the nodes on the 'other side' of this node, then walk |
| the nodes on the same side. */ |
| for (unsigned ix = 2; ix; ix--) |
| { |
| vec<edge, va_gc> *edges = dir > 0 ? b->succs : b->preds; |
| size_t offset = (dir > 0 ? offsetof (edge_def, dest) |
| : offsetof (edge_def, src)); |
| edge e; |
| edge_iterator ei; |
| |
| FOR_EACH_EDGE (e, ei, edges) |
| { |
| basic_block target = *(basic_block *)((char *)e + offset); |
| |
| if (target->flags & BB_VISITED) |
| n = omp_sese_number (n, p, dir, target, list); |
| } |
| dir = -dir; |
| } |
| return n; |
| } |
| |
| /* Process pseudo node above (DIR < 0) or below (DIR > 0) ME. |
| EDGES are the outgoing edges and OFFSET is the offset to the src |
| or dst block on the edges. */ |
| |
| static void |
| omp_sese_pseudo (basic_block me, bb_sese *sese, int depth, int dir, |
| vec<edge, va_gc> *edges, size_t offset) |
| { |
| edge e; |
| edge_iterator ei; |
| int hi_back = depth; |
| pseudo_node_t node_back (0, depth); |
| int hi_child = depth; |
| pseudo_node_t node_child (0, depth); |
| basic_block child = NULL; |
| unsigned num_children = 0; |
| int usd = -dir * sese->dir; |
| |
| if (dump_file) |
| fprintf (dump_file, "\nProcessing %d(%d) %+d\n", |
| me->index, sese->node, dir); |
| |
| if (dir < 0) |
| { |
| /* This is the above pseudo-child. It has the BB itself as an |
| additional child node. */ |
| node_child = sese->high; |
| hi_child = node_child.second; |
| if (node_child.first) |
| hi_child += BB_GET_SESE (node_child.first)->node; |
| num_children++; |
| } |
| |
| /* Examine each edge. |
| - if it is a child (a) append its bracket list and (b) record |
| whether it is the child with the highest reaching bracket. |
| - if it is an edge to ancestor, record whether it's the highest |
| reaching backlink. */ |
| FOR_EACH_EDGE (e, ei, edges) |
| { |
| basic_block target = *(basic_block *)((char *)e + offset); |
| |
| if (bb_sese *t_sese = BB_GET_SESE (target)) |
| { |
| if (t_sese->parent == sese->node && !(t_sese->dir + usd)) |
| { |
| /* Child node. Append its bracket list. */ |
| num_children++; |
| sese->append (t_sese); |
| |
| /* Compare it's hi value. */ |
| int t_hi = t_sese->high.second; |
| |
| if (basic_block child_hi_block = t_sese->high.first) |
| t_hi += BB_GET_SESE (child_hi_block)->node; |
| |
| if (hi_child > t_hi) |
| { |
| hi_child = t_hi; |
| node_child = t_sese->high; |
| child = target; |
| } |
| } |
| else if (t_sese->node < sese->node + dir |
| && !(dir < 0 && sese->parent == t_sese->node)) |
| { |
| /* Non-parental ancestor node -- a backlink. */ |
| int d = usd * t_sese->dir; |
| int back = t_sese->node + d; |
| |
| if (hi_back > back) |
| { |
| hi_back = back; |
| node_back = pseudo_node_t (target, d); |
| } |
| } |
| } |
| else |
| { /* Fallen off graph, backlink to entry node. */ |
| hi_back = 0; |
| node_back = pseudo_node_t (0, 0); |
| } |
| } |
| |
| /* Remove any brackets that terminate at this pseudo node. */ |
| sese->remove (pseudo_node_t (me, dir)); |
| |
| /* Now push any backlinks from this pseudo node. */ |
| FOR_EACH_EDGE (e, ei, edges) |
| { |
| basic_block target = *(basic_block *)((char *)e + offset); |
| if (bb_sese *t_sese = BB_GET_SESE (target)) |
| { |
| if (t_sese->node < sese->node + dir |
| && !(dir < 0 && sese->parent == t_sese->node)) |
| /* Non-parental ancestor node - backedge from me. */ |
| sese->push (pseudo_node_t (target, usd * t_sese->dir)); |
| } |
| else |
| { |
| /* back edge to entry node */ |
| sese->push (pseudo_node_t (0, 0)); |
| } |
| } |
| |
| /* If this node leads directly or indirectly to a no-return region of |
| the graph, then fake a backedge to entry node. */ |
| if (!sese->brackets.length () || !edges || !edges->length ()) |
| { |
| hi_back = 0; |
| node_back = pseudo_node_t (0, 0); |
| sese->push (node_back); |
| } |
| |
| /* Record the highest reaching backedge from us or a descendant. */ |
| sese->high = hi_back < hi_child ? node_back : node_child; |
| |
| if (num_children > 1) |
| { |
| /* There is more than one child -- this is a Y shaped piece of |
| spanning tree. We have to insert a fake backedge from this |
| node to the highest ancestor reached by not-the-highest |
| reaching child. Note that there may be multiple children |
| with backedges to the same highest node. That's ok and we |
| insert the edge to that highest node. */ |
| hi_child = depth; |
| if (dir < 0 && child) |
| { |
| node_child = sese->high; |
| hi_child = node_child.second; |
| if (node_child.first) |
| hi_child += BB_GET_SESE (node_child.first)->node; |
| } |
| |
| FOR_EACH_EDGE (e, ei, edges) |
| { |
| basic_block target = *(basic_block *)((char *)e + offset); |
| |
| if (target == child) |
| /* Ignore the highest child. */ |
| continue; |
| |
| bb_sese *t_sese = BB_GET_SESE (target); |
| if (!t_sese) |
| continue; |
| if (t_sese->parent != sese->node) |
| /* Not a child. */ |
| continue; |
| |
| /* Compare its hi value. */ |
| int t_hi = t_sese->high.second; |
| |
| if (basic_block child_hi_block = t_sese->high.first) |
| t_hi += BB_GET_SESE (child_hi_block)->node; |
| |
| if (hi_child > t_hi) |
| { |
| hi_child = t_hi; |
| node_child = t_sese->high; |
| } |
| } |
| |
| sese->push (node_child); |
| } |
| } |
| |
| |
| /* DFS walk of BB graph. Color node BLOCK according to COLORING then |
| proceed to successors. Set SESE entry and exit nodes of |
| REGIONS. */ |
| |
| static void |
| omp_sese_color (auto_vec<unsigned> &color_counts, bb_pair_vec_t ®ions, |
| basic_block block, int coloring) |
| { |
| bb_sese *sese = BB_GET_SESE (block); |
| |
| if (block->flags & BB_VISITED) |
| { |
| /* If we've already encountered this block, either we must not |
| be coloring, or it must have been colored the current color. */ |
| gcc_assert (coloring < 0 || (sese && coloring == sese->color)); |
| return; |
| } |
| |
| block->flags |= BB_VISITED; |
| |
| if (sese) |
| { |
| if (coloring < 0) |
| { |
| /* Start coloring a region. */ |
| regions[sese->color].first = block; |
| coloring = sese->color; |
| } |
| |
| if (!--color_counts[sese->color] && sese->color == coloring) |
| { |
| /* Found final block of SESE region. */ |
| regions[sese->color].second = block; |
| coloring = -1; |
| } |
| else |
| /* Color the node, so we can assert on revisiting the node |
| that the graph is indeed SESE. */ |
| sese->color = coloring; |
| } |
| else |
| /* Fallen off the subgraph, we cannot be coloring. */ |
| gcc_assert (coloring < 0); |
| |
| /* Walk each successor block. */ |
| if (block->succs && block->succs->length ()) |
| { |
| edge e; |
| edge_iterator ei; |
| |
| FOR_EACH_EDGE (e, ei, block->succs) |
| omp_sese_color (color_counts, regions, e->dest, coloring); |
| } |
| else |
| gcc_assert (coloring < 0); |
| } |
| |
| /* Find minimal set of SESE regions covering BLOCKS. REGIONS might |
| end up with NULL entries in it. */ |
| |
| void |
| omp_find_sese (auto_vec<basic_block> &blocks, bb_pair_vec_t ®ions) |
| { |
| basic_block block; |
| int ix; |
| |
| /* First clear each BB of the whole function. */ |
| FOR_EACH_BB_FN (block, cfun) |
| { |
| block->flags &= ~BB_VISITED; |
| BB_SET_SESE (block, 0); |
| } |
| block = EXIT_BLOCK_PTR_FOR_FN (cfun); |
| block->flags &= ~BB_VISITED; |
| BB_SET_SESE (block, 0); |
| block = ENTRY_BLOCK_PTR_FOR_FN (cfun); |
| block->flags &= ~BB_VISITED; |
| BB_SET_SESE (block, 0); |
| |
| /* Mark blocks in the function that are in this graph. */ |
| for (ix = 0; blocks.iterate (ix, &block); ix++) |
| block->flags |= BB_VISITED; |
| |
| /* Counts of nodes assigned to each color. There cannot be more |
| colors than blocks (and hopefully there will be fewer). */ |
| auto_vec<unsigned> color_counts; |
| color_counts.reserve (blocks.length ()); |
| |
| /* Worklist of nodes in the spanning tree. Again, there cannot be |
| more nodes in the tree than blocks (there will be fewer if the |
| CFG of blocks is disjoint). */ |
| auto_vec<basic_block> spanlist; |
| spanlist.reserve (blocks.length ()); |
| |
| /* Make sure every block has its cycle class determined. */ |
| for (ix = 0; blocks.iterate (ix, &block); ix++) |
| { |
| if (BB_GET_SESE (block)) |
| /* We already met this block in an earlier graph solve. */ |
| continue; |
| |
| if (dump_file) |
| fprintf (dump_file, "Searching graph starting at %d\n", block->index); |
| |
| /* Number the nodes reachable from block initial DFS order. */ |
| int depth = omp_sese_number (2, 0, +1, block, &spanlist); |
| |
| /* Now walk in reverse DFS order to find cycle equivalents. */ |
| while (spanlist.length ()) |
| { |
| block = spanlist.pop (); |
| bb_sese *sese = BB_GET_SESE (block); |
| |
| /* Do the pseudo node below. */ |
| omp_sese_pseudo (block, sese, depth, +1, |
| sese->dir > 0 ? block->succs : block->preds, |
| (sese->dir > 0 ? offsetof (edge_def, dest) |
| : offsetof (edge_def, src))); |
| sese->set_color (color_counts); |
| /* Do the pseudo node above. */ |
| omp_sese_pseudo (block, sese, depth, -1, |
| sese->dir < 0 ? block->succs : block->preds, |
| (sese->dir < 0 ? offsetof (edge_def, dest) |
| : offsetof (edge_def, src))); |
| } |
| if (dump_file) |
| fprintf (dump_file, "\n"); |
| } |
| |
| if (dump_file) |
| { |
| unsigned count; |
| const char *comma = ""; |
| |
| fprintf (dump_file, "Found %d cycle equivalents\n", |
| color_counts.length ()); |
| for (ix = 0; color_counts.iterate (ix, &count); ix++) |
| { |
| fprintf (dump_file, "%s%d[%d]={", comma, ix, count); |
| |
| comma = ""; |
| for (unsigned jx = 0; blocks.iterate (jx, &block); jx++) |
| if (BB_GET_SESE (block)->color == ix) |
| { |
| block->flags |= BB_VISITED; |
| fprintf (dump_file, "%s%d", comma, block->index); |
| comma=","; |
| } |
| fprintf (dump_file, "}"); |
| comma = ", "; |
| } |
| fprintf (dump_file, "\n"); |
| } |
| |
| /* Now we've colored every block in the subgraph. We now need to |
| determine the minimal set of SESE regions that cover that |
| subgraph. Do this with a DFS walk of the complete function. |
| During the walk we're either 'looking' or 'coloring'. When we |
| reach the last node of a particular color, we stop coloring and |
| return to looking. */ |
| |
| /* There cannot be more SESE regions than colors. */ |
| regions.reserve (color_counts.length ()); |
| for (ix = color_counts.length (); ix--;) |
| regions.quick_push (bb_pair_t (0, 0)); |
| |
| for (ix = 0; blocks.iterate (ix, &block); ix++) |
| block->flags &= ~BB_VISITED; |
| |
| omp_sese_color (color_counts, regions, ENTRY_BLOCK_PTR_FOR_FN (cfun), -1); |
| |
| if (dump_file) |
| { |
| const char *comma = ""; |
| int len = regions.length (); |
| |
| fprintf (dump_file, "SESE regions:"); |
| for (ix = 0; ix != len; ix++) |
| { |
| basic_block from = regions[ix].first; |
| basic_block to = regions[ix].second; |
| |
| if (from) |
| { |
| fprintf (dump_file, "%s %d{%d", comma, ix, from->index); |
| if (to != from) |
| fprintf (dump_file, "->%d", to->index); |
| |
| int color = BB_GET_SESE (from)->color; |
| |
| /* Print the blocks within the region (excluding ends). */ |
| FOR_EACH_BB_FN (block, cfun) |
| { |
| bb_sese *sese = BB_GET_SESE (block); |
| |
| if (sese && sese->color == color |
| && block != from && block != to) |
| fprintf (dump_file, ".%d", block->index); |
| } |
| fprintf (dump_file, "}"); |
| } |
| comma = ","; |
| } |
| fprintf (dump_file, "\n\n"); |
| } |
| |
| for (ix = 0; blocks.iterate (ix, &block); ix++) |
| delete BB_GET_SESE (block); |
| } |
| |
| #undef BB_SET_SESE |
| #undef BB_GET_SESE |