| /* OpenACC worker partitioning via middle end neutering/broadcasting scheme |
| |
| Copyright (C) 2015-2022 Free Software Foundation, Inc. |
| |
| This file is part of GCC. |
| |
| GCC is free software; you can redistribute it and/or modify it |
| under the terms of the GNU General Public License as published |
| by the Free Software Foundation; either version 3, or (at your |
| option) any later version. |
| |
| GCC is distributed in the hope that it will be useful, but WITHOUT |
| ANY WARRANTY; without even the implied warranty of MERCHANTABILITY |
| or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public |
| License for more details. |
| |
| You should have received a copy of the GNU General Public License |
| along with GCC; see the file COPYING3. If not see |
| <http://www.gnu.org/licenses/>. */ |
| |
| #include "config.h" |
| #include "system.h" |
| #include "coretypes.h" |
| #include "backend.h" |
| #include "rtl.h" |
| #include "tree.h" |
| #include "gimple.h" |
| #include "tree-pass.h" |
| #include "ssa.h" |
| #include "cgraph.h" |
| #include "pretty-print.h" |
| #include "fold-const.h" |
| #include "gimplify.h" |
| #include "gimple-iterator.h" |
| #include "gimple-walk.h" |
| #include "tree-inline.h" |
| #include "langhooks.h" |
| #include "omp-general.h" |
| #include "omp-low.h" |
| #include "gimple-pretty-print.h" |
| #include "cfghooks.h" |
| #include "insn-config.h" |
| #include "recog.h" |
| #include "internal-fn.h" |
| #include "bitmap.h" |
| #include "tree-nested.h" |
| #include "stor-layout.h" |
| #include "tree-ssa-threadupdate.h" |
| #include "tree-into-ssa.h" |
| #include "splay-tree.h" |
| #include "target.h" |
| #include "cfgloop.h" |
| #include "tree-cfg.h" |
| #include "omp-offload.h" |
| #include "attribs.h" |
| #include "targhooks.h" |
| #include "diagnostic-core.h" |
| |
| /* Loop structure of the function. The entire function is described as |
| a NULL loop. */ |
| /* Adapted from 'gcc/config/nvptx/nvptx.cc:struct parallel'. */ |
| |
| struct parallel_g |
| { |
| /* Parent parallel. */ |
| parallel_g *parent; |
| |
| /* Next sibling parallel. */ |
| parallel_g *next; |
| |
| /* First child parallel. */ |
| parallel_g *inner; |
| |
| /* Partitioning mask of the parallel. */ |
| unsigned mask; |
| |
| /* Partitioning used within inner parallels. */ |
| unsigned inner_mask; |
| |
| /* Location of the parallel's forked and join blocks.  The forked block |
| is the first block in the parallel and the join block is the first |
| block after the partition.  */ |
| basic_block forked_block; |
| basic_block join_block; |
| |
| gimple *forked_stmt; |
| gimple *join_stmt; |
| |
| gimple *fork_stmt; |
| gimple *joining_stmt; |
| |
| /* Basic blocks in this parallel, but not in child parallels. The |
| FORKED and JOINING blocks are in the partition. The FORK and JOIN |
| blocks are not. */ |
| auto_vec<basic_block> blocks; |
| |
| tree record_type; |
| tree sender_decl; |
| tree receiver_decl; |
| |
| public: |
| parallel_g (parallel_g *parent, unsigned mode); |
| ~parallel_g (); |
| }; |
| |
| /* Constructor links the new parallel into its parent's chain of |
| children. */ |
| |
| parallel_g::parallel_g (parallel_g *parent_, unsigned mask_) |
| :parent (parent_), next (0), inner (0), mask (mask_), inner_mask (0) |
| { |
| forked_block = join_block = 0; |
| forked_stmt = join_stmt = NULL; |
| fork_stmt = joining_stmt = NULL; |
| |
| record_type = NULL_TREE; |
| sender_decl = NULL_TREE; |
| receiver_decl = NULL_TREE; |
| |
| if (parent) |
| { |
| next = parent->inner; |
| parent->inner = this; |
| } |
| } |
| |
| parallel_g::~parallel_g () |
| { |
| delete inner; |
| delete next; |
| } |
| |
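| /* Return true if DECL is rooted at a function-local variable: either a |
| non-global VAR_DECL itself, or a component/bit-field/array reference |
| whose base is one.  */ |
| |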
| static bool |
| local_var_based_p (tree decl) |
| { |
| switch (TREE_CODE (decl)) |
| { |
| case VAR_DECL: |
| return !is_global_var (decl); |
| |
| case COMPONENT_REF: |
| case BIT_FIELD_REF: |
| case ARRAY_REF: |
| return local_var_based_p (TREE_OPERAND (decl, 0)); |
| |
| default: |
| return false; |
| } |
| } |
| |
| /* Map of basic blocks to gimple stmts. */ |
| typedef hash_map<basic_block, gimple *> bb_stmt_map_t; |
| |
| /* Calls to OpenACC routines are made by all workers/wavefronts/warps, |
| since the routine likely contains partitioned loops (otherwise it will |
| do its own neutering and variable propagation).  Return TRUE if a |
| function call CALL should instead be made in (worker) single mode, |
| rather than in redundant mode.  */ |
| |
| static bool |
| omp_sese_active_worker_call (gcall *call) |
| { |
| #define GOMP_DIM_SEQ GOMP_DIM_MAX |
| tree fndecl = gimple_call_fndecl (call); |
| |
| if (!fndecl) |
| return true; |
| |
| tree attrs = oacc_get_fn_attrib (fndecl); |
| |
| if (!attrs) |
| return true; |
| |
| int level = oacc_fn_attrib_level (attrs); |
| |
| /* Neither regular functions nor "seq" routines should be run by all threads |
| in worker-single mode. */ |
| return level == -1 || level == GOMP_DIM_SEQ; |
| #undef GOMP_DIM_SEQ |
| } |
| |
| /* Split basic blocks so that each forked or join marker is at the |
| start of its basic block.  Thus afterwards each block will have a |
| single partitioning mode.  We also do the same for return statements |
| and other statements executed by every thread (conditionals, switches |
| and non-internal calls).  Populate MAP with the statement that |
| determines each such block's partitioning.  We also clear the BB |
| visited flag, which is used when finding partitions.  */ |
| /* Adapted from 'gcc/config/nvptx/nvptx.cc:nvptx_split_blocks'. */ |
| |
| static void |
| omp_sese_split_blocks (bb_stmt_map_t *map) |
| { |
| auto_vec<gimple *> worklist; |
| basic_block block; |
| |
| /* Locate all the statements of interest.  */ |
| FOR_ALL_BB_FN (block, cfun) |
| { |
| /* Clear the visited flag, for use by the parallel locator.  */ |
| block->flags &= ~BB_VISITED; |
| |
| for (gimple_stmt_iterator gsi = gsi_start_bb (block); |
| !gsi_end_p (gsi); |
| gsi_next (&gsi)) |
| { |
| gimple *stmt = gsi_stmt (gsi); |
| |
| if (gimple_call_internal_p (stmt, IFN_UNIQUE)) |
| { |
| enum ifn_unique_kind k = ((enum ifn_unique_kind) |
| TREE_INT_CST_LOW (gimple_call_arg (stmt, 0))); |
| |
| if (k == IFN_UNIQUE_OACC_JOIN) |
| worklist.safe_push (stmt); |
| else if (k == IFN_UNIQUE_OACC_FORK) |
| { |
| gcc_assert (gsi_one_before_end_p (gsi)); |
| basic_block forked_block = single_succ (block); |
| gimple_stmt_iterator gsi2 = gsi_start_bb (forked_block); |
| |
| /* We push a NOP as a placeholder for the "forked" stmt. |
| This is then recognized in omp_sese_find_par. */ |
| gimple *nop = gimple_build_nop (); |
| gsi_insert_before (&gsi2, nop, GSI_SAME_STMT); |
| |
| worklist.safe_push (nop); |
| } |
| } |
| else if (gimple_code (stmt) == GIMPLE_RETURN |
| || gimple_code (stmt) == GIMPLE_COND |
| || gimple_code (stmt) == GIMPLE_SWITCH |
| || (gimple_code (stmt) == GIMPLE_CALL |
| && !gimple_call_internal_p (stmt) |
| && !omp_sese_active_worker_call (as_a <gcall *> (stmt)))) |
| worklist.safe_push (stmt); |
| else if (is_gimple_assign (stmt)) |
| { |
| tree lhs = gimple_assign_lhs (stmt); |
| |
| /* Force assignments to components/fields/elements of local |
| aggregates into fully-partitioned (redundant) mode. This |
| avoids having to broadcast the whole aggregate. The RHS of |
| the assignment will be propagated using the normal |
| mechanism. */ |
| |
| switch (TREE_CODE (lhs)) |
| { |
| case COMPONENT_REF: |
| case BIT_FIELD_REF: |
| case ARRAY_REF: |
| { |
| tree aggr = TREE_OPERAND (lhs, 0); |
| |
| if (local_var_based_p (aggr)) |
| worklist.safe_push (stmt); |
| } |
| break; |
| |
| default: |
| ; |
| } |
| } |
| } |
| } |
| |
| /* Split blocks on the worklist. */ |
| unsigned ix; |
| gimple *stmt; |
| |
| for (ix = 0; worklist.iterate (ix, &stmt); ix++) |
| { |
| basic_block block = gimple_bb (stmt); |
| |
| if (gimple_code (stmt) == GIMPLE_COND) |
| { |
| gcond *orig_cond = as_a <gcond *> (stmt); |
| tree_code code = gimple_expr_code (orig_cond); |
| tree pred = make_ssa_name (boolean_type_node); |
| gimple *asgn = gimple_build_assign (pred, code, |
| gimple_cond_lhs (orig_cond), |
| gimple_cond_rhs (orig_cond)); |
| gcond *new_cond |
| = gimple_build_cond (NE_EXPR, pred, boolean_false_node, |
| gimple_cond_true_label (orig_cond), |
| gimple_cond_false_label (orig_cond)); |
| |
| gimple_stmt_iterator gsi = gsi_for_stmt (stmt); |
| gsi_insert_before (&gsi, asgn, GSI_SAME_STMT); |
| gsi_replace (&gsi, new_cond, true); |
| |
| edge e = split_block (block, asgn); |
| block = e->dest; |
| map->get_or_insert (block) = new_cond; |
| } |
| else if ((gimple_code (stmt) == GIMPLE_CALL |
| && !gimple_call_internal_p (stmt)) |
| || is_gimple_assign (stmt)) |
| { |
| gimple_stmt_iterator gsi = gsi_for_stmt (stmt); |
| gsi_prev (&gsi); |
| |
| edge call = split_block (block, gsi_stmt (gsi)); |
| |
| gimple *call_stmt = gsi_stmt (gsi_start_bb (call->dest)); |
| |
| edge call_to_ret = split_block (call->dest, call_stmt); |
| |
| map->get_or_insert (call_to_ret->src) = call_stmt; |
| } |
| else |
| { |
| gimple_stmt_iterator gsi = gsi_for_stmt (stmt); |
| gsi_prev (&gsi); |
| |
| if (gsi_end_p (gsi)) |
| map->get_or_insert (block) = stmt; |
| else |
| { |
| /* Split the block before the statement.  The statement is in the new block. */ |
| edge e = split_block (block, gsi_stmt (gsi)); |
| |
| block = e->dest; |
| map->get_or_insert (block) = stmt; |
| } |
| } |
| } |
| } |
| |
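| /* Return a human-readable name for the partitioning MASK (a bitmask of |
| GOMP_DIM_MASK bits for gang, worker and vector), for dump output.  */ |
| |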
| static const char * |
| mask_name (unsigned mask) |
| { |
| switch (mask) |
| { |
| case 0: return "gang redundant"; |
| case 1: return "gang partitioned"; |
| case 2: return "worker partitioned"; |
| case 3: return "gang+worker partitioned"; |
| case 4: return "vector partitioned"; |
| case 5: return "gang+vector partitioned"; |
| case 6: return "worker+vector partitioned"; |
| case 7: return "fully partitioned"; |
| default: return "<illegal>"; |
| } |
| } |
| |
| /* Dump this parallel and all its inner parallels. */ |
| /* Adapted from 'gcc/config/nvptx/nvptx.cc:nvptx_dump_pars'. */ |
| |
| static void |
| omp_sese_dump_pars (parallel_g *par, unsigned depth) |
| { |
| fprintf (dump_file, "%u: mask %d (%s) head=%d, tail=%d\n", |
| depth, par->mask, mask_name (par->mask), |
| par->forked_block ? par->forked_block->index : -1, |
| par->join_block ? par->join_block->index : -1); |
| |
| fprintf (dump_file, " blocks:"); |
| |
| basic_block block; |
| for (unsigned ix = 0; par->blocks.iterate (ix, &block); ix++) |
| fprintf (dump_file, " %d", block->index); |
| fprintf (dump_file, "\n"); |
| if (par->inner) |
| omp_sese_dump_pars (par->inner, depth + 1); |
| |
| if (par->next) |
| omp_sese_dump_pars (par->next, depth); |
| } |
| |
| /* If BLOCK contains a fork/join marker, process it to create or |
| terminate a loop structure. Add this block to the current loop, |
| and then walk successor blocks. */ |
| /* Adapted from 'gcc/config/nvptx/nvptx.cc:nvptx_find_par'. */ |
| |
| static parallel_g * |
| omp_sese_find_par (bb_stmt_map_t *map, parallel_g *par, basic_block block) |
| { |
| if (block->flags & BB_VISITED) |
| return par; |
| block->flags |= BB_VISITED; |
| |
| if (gimple **stmtp = map->get (block)) |
| { |
| gimple *stmt = *stmtp; |
| |
| if (gimple_code (stmt) == GIMPLE_COND |
| || gimple_code (stmt) == GIMPLE_SWITCH |
| || gimple_code (stmt) == GIMPLE_RETURN |
| || (gimple_code (stmt) == GIMPLE_CALL |
| && !gimple_call_internal_p (stmt)) |
| || is_gimple_assign (stmt)) |
| { |
| /* A single block that is forced to be at the maximum partition |
| level. Make a singleton par for it. */ |
| par = new parallel_g (par, GOMP_DIM_MASK (GOMP_DIM_GANG) |
| | GOMP_DIM_MASK (GOMP_DIM_WORKER) |
| | GOMP_DIM_MASK (GOMP_DIM_VECTOR)); |
| par->forked_block = block; |
| par->forked_stmt = stmt; |
| par->blocks.safe_push (block); |
| par = par->parent; |
| goto walk_successors; |
| } |
| else if (gimple_nop_p (stmt)) |
| { |
| basic_block pred = single_pred (block); |
| gcc_assert (pred); |
| gimple_stmt_iterator gsi = gsi_last_bb (pred); |
| gimple *final_stmt = gsi_stmt (gsi); |
| |
| if (gimple_call_internal_p (final_stmt, IFN_UNIQUE)) |
| { |
| gcall *call = as_a <gcall *> (final_stmt); |
| enum ifn_unique_kind k = ((enum ifn_unique_kind) |
| TREE_INT_CST_LOW (gimple_call_arg (call, 0))); |
| |
| if (k == IFN_UNIQUE_OACC_FORK) |
| { |
| HOST_WIDE_INT dim |
| = TREE_INT_CST_LOW (gimple_call_arg (call, 2)); |
| unsigned mask = (dim >= 0) ? GOMP_DIM_MASK (dim) : 0; |
| |
| par = new parallel_g (par, mask); |
| par->forked_block = block; |
| par->forked_stmt = final_stmt; |
| par->fork_stmt = stmt; |
| } |
| else |
| gcc_unreachable (); |
| } |
| else |
| gcc_unreachable (); |
| } |
| else if (gimple_call_internal_p (stmt, IFN_UNIQUE)) |
| { |
| gcall *call = as_a <gcall *> (stmt); |
| enum ifn_unique_kind k = ((enum ifn_unique_kind) |
| TREE_INT_CST_LOW (gimple_call_arg (call, 0))); |
| if (k == IFN_UNIQUE_OACC_JOIN) |
| { |
| HOST_WIDE_INT dim = TREE_INT_CST_LOW (gimple_call_arg (stmt, 2)); |
| unsigned mask = (dim >= 0) ? GOMP_DIM_MASK (dim) : 0; |
| |
| gcc_assert (par->mask == mask); |
| par->join_block = block; |
| par->join_stmt = stmt; |
| par = par->parent; |
| } |
| else |
| gcc_unreachable (); |
| } |
| else |
| gcc_unreachable (); |
| } |
| |
| if (par) |
| /* Add this block onto the current loop's list of blocks. */ |
| par->blocks.safe_push (block); |
| else |
| /* This must be the entry block. Create a NULL parallel. */ |
| par = new parallel_g (0, 0); |
| |
| walk_successors: |
| /* Walk successor blocks. */ |
| edge e; |
| edge_iterator ei; |
| |
| FOR_EACH_EDGE (e, ei, block->succs) |
| omp_sese_find_par (map, par, e->dest); |
| |
| return par; |
| } |
| |
| /* DFS walk the CFG looking for fork & join markers. Construct |
| loop structures as we go. MAP is a mapping of basic blocks |
| to head & tail markers, discovered when splitting blocks. This |
| speeds up the discovery. We rely on the BB visited flag having |
| been cleared when splitting blocks. */ |
| /* Adapted from 'gcc/config/nvptx/nvptx.cc:nvptx_discover_pars'. */ |
| |
| static parallel_g * |
| omp_sese_discover_pars (bb_stmt_map_t *map) |
| { |
| basic_block block; |
| |
| /* Mark exit blocks as visited. */ |
| block = EXIT_BLOCK_PTR_FOR_FN (cfun); |
| block->flags |= BB_VISITED; |
| |
| /* And entry block as not. */ |
| block = ENTRY_BLOCK_PTR_FOR_FN (cfun); |
| block->flags &= ~BB_VISITED; |
| |
| parallel_g *par = omp_sese_find_par (map, 0, block); |
| |
| if (dump_file) |
| { |
| fprintf (dump_file, "\nLoops\n"); |
| omp_sese_dump_pars (par, 0); |
| fprintf (dump_file, "\n"); |
| } |
| |
| return par; |
| } |
| |
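| /* Walk the parallel structure PAR, whose enclosing parallels are |
| partitioned as described by OUTER_MASK, and set in WORKER_SINGLE and |
| VECTOR_SINGLE the indices of basic blocks executing in worker-single |
| resp. vector-single mode.  DEPTH is the recursion depth.  */ |
| |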
| static void |
| populate_single_mode_bitmaps (parallel_g *par, bitmap worker_single, |
| bitmap vector_single, unsigned outer_mask, |
| int depth) |
| { |
| unsigned mask = outer_mask | par->mask; |
| |
| basic_block block; |
| |
| for (unsigned i = 0; par->blocks.iterate (i, &block); i++) |
| { |
| if ((mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) == 0) |
| bitmap_set_bit (worker_single, block->index); |
| |
| if ((mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)) == 0) |
| bitmap_set_bit (vector_single, block->index); |
| } |
| |
| if (par->inner) |
| populate_single_mode_bitmaps (par->inner, worker_single, vector_single, |
| mask, depth + 1); |
| if (par->next) |
| populate_single_mode_bitmaps (par->next, worker_single, vector_single, |
| outer_mask, depth); |
| } |
| |
| /* A map from SSA names or var decls to record fields. */ |
| |
| typedef hash_map<tree, tree> field_map_t; |
| |
| /* For each propagation record type, this maps each SSA name or var decl |
| that is to be propagated onto the field of the record type used for |
| its transmission and reception.  */ |
| |
| typedef hash_map<tree, field_map_t> record_field_map_t; |
| |
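| /* Build a FIELD_DECL for VAR (an SSA name or VAR_DECL), add it to the |
| broadcast record RECORD_TYPE, and record the VAR-to-field mapping in |
| FIELDS.  */ |
| |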
| static void |
| install_var_field (tree var, tree record_type, field_map_t *fields) |
| { |
| tree name; |
| char tmp[20]; |
| |
| if (TREE_CODE (var) == SSA_NAME) |
| { |
| name = SSA_NAME_IDENTIFIER (var); |
| if (!name) |
| { |
| sprintf (tmp, "_%u", (unsigned) SSA_NAME_VERSION (var)); |
| name = get_identifier (tmp); |
| } |
| } |
| else if (TREE_CODE (var) == VAR_DECL) |
| { |
| name = DECL_NAME (var); |
| if (!name) |
| { |
| sprintf (tmp, "D_%u", (unsigned) DECL_UID (var)); |
| name = get_identifier (tmp); |
| } |
| } |
| else |
| gcc_unreachable (); |
| |
| gcc_assert (!fields->get (var)); |
| |
| tree type = TREE_TYPE (var); |
| |
| if (POINTER_TYPE_P (type) |
| && TYPE_RESTRICT (type)) |
| type = build_qualified_type (type, TYPE_QUALS (type) & ~TYPE_QUAL_RESTRICT); |
| |
| tree field = build_decl (BUILTINS_LOCATION, FIELD_DECL, name, type); |
| |
| if (TREE_CODE (var) == VAR_DECL && type == TREE_TYPE (var)) |
| { |
| SET_DECL_ALIGN (field, DECL_ALIGN (var)); |
| DECL_USER_ALIGN (field) = DECL_USER_ALIGN (var); |
| TREE_THIS_VOLATILE (field) = TREE_THIS_VOLATILE (var); |
| } |
| else |
| SET_DECL_ALIGN (field, TYPE_ALIGN (type)); |
| |
| fields->put (var, field); |
| |
| insert_field_into_struct (record_type, field); |
| } |
| |
| /* Sets of SSA_NAMEs or VAR_DECLs to propagate.  */ |
| typedef hash_set<tree> propagation_set; |
| |
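| /* Find SSA names that are defined in worker-single blocks (per |
| WORKER_SINGLE) but used, directly or as PHI arguments, inside the |
| worker-partitioned regions of PAR, and add each one to the propagation |
| set of its defining block in PROP_SET.  */ |
| |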
| static void |
| find_ssa_names_to_propagate (parallel_g *par, unsigned outer_mask, |
| bitmap worker_single, bitmap vector_single, |
| vec<propagation_set *> *prop_set) |
| { |
| unsigned mask = outer_mask | par->mask; |
| |
| if (par->inner) |
| find_ssa_names_to_propagate (par->inner, mask, worker_single, |
| vector_single, prop_set); |
| if (par->next) |
| find_ssa_names_to_propagate (par->next, outer_mask, worker_single, |
| vector_single, prop_set); |
| |
| if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) |
| { |
| basic_block block; |
| int ix; |
| |
| for (ix = 0; par->blocks.iterate (ix, &block); ix++) |
| { |
| for (gphi_iterator psi = gsi_start_phis (block); |
| !gsi_end_p (psi); gsi_next (&psi)) |
| { |
| gphi *phi = psi.phi (); |
| use_operand_p use; |
| ssa_op_iter iter; |
| |
| FOR_EACH_PHI_ARG (use, phi, iter, SSA_OP_USE) |
| { |
| tree var = USE_FROM_PTR (use); |
| |
| if (TREE_CODE (var) != SSA_NAME) |
| continue; |
| |
| gimple *def_stmt = SSA_NAME_DEF_STMT (var); |
| |
| if (gimple_nop_p (def_stmt)) |
| continue; |
| |
| basic_block def_bb = gimple_bb (def_stmt); |
| |
| if (bitmap_bit_p (worker_single, def_bb->index)) |
| { |
| if (!(*prop_set)[def_bb->index]) |
| (*prop_set)[def_bb->index] = new propagation_set; |
| |
| propagation_set *ws_prop = (*prop_set)[def_bb->index]; |
| |
| ws_prop->add (var); |
| } |
| } |
| } |
| |
| for (gimple_stmt_iterator gsi = gsi_start_bb (block); |
| !gsi_end_p (gsi); gsi_next (&gsi)) |
| { |
| use_operand_p use; |
| ssa_op_iter iter; |
| gimple *stmt = gsi_stmt (gsi); |
| |
| FOR_EACH_SSA_USE_OPERAND (use, stmt, iter, SSA_OP_USE) |
| { |
| tree var = USE_FROM_PTR (use); |
| |
| gimple *def_stmt = SSA_NAME_DEF_STMT (var); |
| |
| if (gimple_nop_p (def_stmt)) |
| continue; |
| |
| basic_block def_bb = gimple_bb (def_stmt); |
| |
| if (bitmap_bit_p (worker_single, def_bb->index)) |
| { |
| if (!(*prop_set)[def_bb->index]) |
| (*prop_set)[def_bb->index] = new propagation_set; |
| |
| propagation_set *ws_prop = (*prop_set)[def_bb->index]; |
| |
| ws_prop->add (var); |
| } |
| } |
| } |
| } |
| } |
| } |
| |
| /* Callback for walk_gimple_stmt to find RHS VAR_DECLs (uses) in a |
| statement. */ |
| |
| static tree |
| find_partitioned_var_uses_1 (tree *node, int *, void *data) |
| { |
| walk_stmt_info *wi = (walk_stmt_info *) data; |
| hash_set<tree> *partitioned_var_uses = (hash_set<tree> *) wi->info; |
| |
| if (!wi->is_lhs && VAR_P (*node)) |
| partitioned_var_uses->add (*node); |
| |
| return NULL_TREE; |
| } |
| |
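| /* Collect into PARTITIONED_VAR_USES every VAR_DECL that is read within |
| the worker-partitioned blocks of PAR and its inner and sibling |
| parallels.  */ |
| |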
| static void |
| find_partitioned_var_uses (parallel_g *par, unsigned outer_mask, |
| hash_set<tree> *partitioned_var_uses) |
| { |
| unsigned mask = outer_mask | par->mask; |
| |
| if (par->inner) |
| find_partitioned_var_uses (par->inner, mask, partitioned_var_uses); |
| if (par->next) |
| find_partitioned_var_uses (par->next, outer_mask, partitioned_var_uses); |
| |
| if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) |
| { |
| basic_block block; |
| int ix; |
| |
| for (ix = 0; par->blocks.iterate (ix, &block); ix++) |
| for (gimple_stmt_iterator gsi = gsi_start_bb (block); |
| !gsi_end_p (gsi); gsi_next (&gsi)) |
| { |
| walk_stmt_info wi; |
| memset (&wi, 0, sizeof (wi)); |
| wi.info = (void *) partitioned_var_uses; |
| walk_gimple_stmt (&gsi, NULL, find_partitioned_var_uses_1, &wi); |
| } |
| } |
| } |
| |
| /* Gang-private variables (typically placed in a GPU's shared memory) do not |
| need to be processed by the worker-propagation mechanism. Populate the |
| GANG_PRIVATE_VARS set with any such variables found in the current |
| function. */ |
| |
| static void |
| find_gang_private_vars (hash_set<tree> *gang_private_vars) |
| { |
| basic_block block; |
| |
| FOR_EACH_BB_FN (block, cfun) |
| { |
| for (gimple_stmt_iterator gsi = gsi_start_bb (block); |
| !gsi_end_p (gsi); |
| gsi_next (&gsi)) |
| { |
| gimple *stmt = gsi_stmt (gsi); |
| |
| if (gimple_call_internal_p (stmt, IFN_UNIQUE)) |
| { |
| enum ifn_unique_kind k = ((enum ifn_unique_kind) |
| TREE_INT_CST_LOW (gimple_call_arg (stmt, 0))); |
| if (k == IFN_UNIQUE_OACC_PRIVATE) |
| { |
| HOST_WIDE_INT level |
| = TREE_INT_CST_LOW (gimple_call_arg (stmt, 2)); |
| if (level != GOMP_DIM_GANG) |
| continue; |
| for (unsigned i = 3; i < gimple_call_num_args (stmt); i++) |
| { |
| tree arg = gimple_call_arg (stmt, i); |
| gcc_assert (TREE_CODE (arg) == ADDR_EXPR); |
| tree decl = TREE_OPERAND (arg, 0); |
| gang_private_vars->add (decl); |
| } |
| } |
| } |
| } |
| } |
| } |
| |
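| /* For each worker-single block of PAR, find local non-aggregate |
| variables that may be clobbered there and that are also used in |
| worker-partitioned code (per PARTITIONED_VAR_USES), and add them to |
| that block's propagation set in PROP_SET.  Writes to variables in |
| GANG_PRIVATE_VARS are not propagated; instead the block is recorded in |
| WRITES_GANG_PRIVATE so that a barrier can be emitted for it.  */ |
| |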
| static void |
| find_local_vars_to_propagate (parallel_g *par, unsigned outer_mask, |
| hash_set<tree> *partitioned_var_uses, |
| hash_set<tree> *gang_private_vars, |
| bitmap writes_gang_private, |
| vec<propagation_set *> *prop_set) |
| { |
| unsigned mask = outer_mask | par->mask; |
| |
| if (par->inner) |
| find_local_vars_to_propagate (par->inner, mask, partitioned_var_uses, |
| gang_private_vars, writes_gang_private, |
| prop_set); |
| if (par->next) |
| find_local_vars_to_propagate (par->next, outer_mask, partitioned_var_uses, |
| gang_private_vars, writes_gang_private, |
| prop_set); |
| |
| if (!(mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))) |
| { |
| basic_block block; |
| int ix; |
| |
| for (ix = 0; par->blocks.iterate (ix, &block); ix++) |
| { |
| for (gimple_stmt_iterator gsi = gsi_start_bb (block); |
| !gsi_end_p (gsi); gsi_next (&gsi)) |
| { |
| gimple *stmt = gsi_stmt (gsi); |
| tree var; |
| unsigned i; |
| |
| FOR_EACH_LOCAL_DECL (cfun, i, var) |
| { |
| if (!VAR_P (var) |
| || is_global_var (var) |
| || AGGREGATE_TYPE_P (TREE_TYPE (var)) |
| || !partitioned_var_uses->contains (var)) |
| continue; |
| |
| if (stmt_may_clobber_ref_p (stmt, var)) |
| { |
| if (dump_file) |
| { |
| fprintf (dump_file, "bb %u: local variable may be " |
| "clobbered in %s mode: ", block->index, |
| mask_name (mask)); |
| print_generic_expr (dump_file, var, TDF_SLIM); |
| fprintf (dump_file, "\n"); |
| } |
| |
| if (gang_private_vars->contains (var)) |
| { |
| /* If we write a gang-private variable, we want a |
| barrier at the end of the block. */ |
| bitmap_set_bit (writes_gang_private, block->index); |
| continue; |
| } |
| |
| if (!(*prop_set)[block->index]) |
| (*prop_set)[block->index] = new propagation_set; |
| |
| propagation_set *ws_prop |
| = (*prop_set)[block->index]; |
| |
| ws_prop->add (var); |
| } |
| } |
| } |
| } |
| } |
| } |
| |
| /* Transform basic blocks FROM, TO (which may be the same block) into: |
| if (GOACC_single_start ()) |
| BLOCK; |
| GOACC_barrier (); |
|                               \  |  / |
|                               +----+ |
|                               |    |        (new) predicate block |
|                               +----+-- |
|    \  |  /   \  |  /            |t   \ |
|    +----+    +----+           +----+  | |
|    |    |    |    |   ===>    |    |  | f   (old) from block |
|    +----+    +----+           +----+  | |
|      |       t/  \f             |    / |
|                               +----+/ |
|   (split  (split before       |    |       skip block |
|   at end)   condition)        +----+ |
|                               t/  \f |
| */ |
| |
| static void |
| worker_single_simple (basic_block from, basic_block to, |
| hash_set<tree> *def_escapes_block) |
| { |
| gimple *call, *cond; |
| tree lhs, decl; |
| basic_block skip_block; |
| |
| gimple_stmt_iterator gsi = gsi_last_bb (to); |
| if (EDGE_COUNT (to->succs) > 1) |
| { |
| gcc_assert (gimple_code (gsi_stmt (gsi)) == GIMPLE_COND); |
| gsi_prev (&gsi); |
| } |
| edge e = split_block (to, gsi_stmt (gsi)); |
| skip_block = e->dest; |
| |
| gimple_stmt_iterator start = gsi_after_labels (from); |
| |
| decl = builtin_decl_explicit (BUILT_IN_GOACC_SINGLE_START); |
| lhs = create_tmp_var (TREE_TYPE (TREE_TYPE (decl))); |
| call = gimple_build_call (decl, 0); |
| gimple_call_set_lhs (call, lhs); |
| gsi_insert_before (&start, call, GSI_NEW_STMT); |
| update_stmt (call); |
| |
| cond = gimple_build_cond (EQ_EXPR, lhs, |
| fold_convert_loc (UNKNOWN_LOCATION, |
| TREE_TYPE (lhs), |
| boolean_true_node), |
| NULL_TREE, NULL_TREE); |
| gsi_insert_after (&start, cond, GSI_NEW_STMT); |
| update_stmt (cond); |
| |
| edge et = split_block (from, cond); |
| et->flags &= ~EDGE_FALLTHRU; |
| et->flags |= EDGE_TRUE_VALUE; |
| /* Make the active worker the more probable path so we prefer fallthrough |
| (letting the idle workers jump around more). */ |
| et->probability = profile_probability::likely (); |
| |
| edge ef = make_edge (from, skip_block, EDGE_FALSE_VALUE); |
| ef->probability = et->probability.invert (); |
| |
| basic_block neutered = split_edge (ef); |
| gimple_stmt_iterator neut_gsi = gsi_last_bb (neutered); |
| |
| for (gsi = gsi_start_bb (et->dest); !gsi_end_p (gsi); gsi_next (&gsi)) |
| { |
| gimple *stmt = gsi_stmt (gsi); |
| ssa_op_iter iter; |
| tree var; |
| |
| FOR_EACH_SSA_TREE_OPERAND (var, stmt, iter, SSA_OP_DEF) |
| { |
| if (def_escapes_block->contains (var)) |
| { |
| gphi *join_phi = create_phi_node (NULL_TREE, skip_block); |
| create_new_def_for (var, join_phi, |
| gimple_phi_result_ptr (join_phi)); |
| add_phi_arg (join_phi, var, e, UNKNOWN_LOCATION); |
| |
| tree neutered_def = copy_ssa_name (var, NULL); |
| /* We really want "don't care" or some value representing |
| undefined here, but optimizers will probably get rid of the |
| zero-assignments anyway. */ |
| gassign *zero = gimple_build_assign (neutered_def, |
| build_zero_cst (TREE_TYPE (neutered_def))); |
| |
| gsi_insert_after (&neut_gsi, zero, GSI_CONTINUE_LINKING); |
| update_stmt (zero); |
| |
| add_phi_arg (join_phi, neutered_def, single_succ_edge (neutered), |
| UNKNOWN_LOCATION); |
| update_stmt (join_phi); |
| } |
| } |
| } |
| } |
| |
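| /* Build a reference to the broadcast-record field corresponding to VAR |
| (per FIELDS), accessed through the RECEIVER_DECL pointer used by the |
| neutered workers to read the broadcast values.  */ |
| |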
| static tree |
| build_receiver_ref (tree var, tree receiver_decl, field_map_t *fields) |
| { |
| tree x = build_simple_mem_ref (receiver_decl); |
| tree field = *fields->get (var); |
| TREE_THIS_NOTRAP (x) = 1; |
| x = omp_build_component_ref (x, field); |
| return x; |
| } |
| |
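| /* Likewise, but accessed through SENDER_DECL, used by the active worker |
| to write the values into the record.  */ |
| |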
| static tree |
| build_sender_ref (tree var, tree sender_decl, field_map_t *fields) |
| { |
| if (POINTER_TYPE_P (TREE_TYPE (sender_decl))) |
| sender_decl = build_simple_mem_ref (sender_decl); |
| tree field = *fields->get (var); |
| return omp_build_component_ref (sender_decl, field); |
| } |
| |
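| /* qsort comparator giving a stable order over SSA names and VAR_DECLs: |
| SSA names first, ordered by version number, then VAR_DECLs ordered by |
| UID.  */ |
| |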
| static int |
| sort_by_ssa_version_or_uid (const void *p1, const void *p2) |
| { |
| const tree t1 = *(const tree *)p1; |
| const tree t2 = *(const tree *)p2; |
| |
| if (TREE_CODE (t1) == SSA_NAME && TREE_CODE (t2) == SSA_NAME) |
| return SSA_NAME_VERSION (t1) - SSA_NAME_VERSION (t2); |
| else if (TREE_CODE (t1) == SSA_NAME && TREE_CODE (t2) != SSA_NAME) |
| return -1; |
| else if (TREE_CODE (t1) != SSA_NAME && TREE_CODE (t2) == SSA_NAME) |
| return 1; |
| else |
| return DECL_UID (t1) - DECL_UID (t2); |
| } |
| |
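| /* qsort comparator: order by decreasing type size, breaking ties with |
| sort_by_ssa_version_or_uid.  */ |
| |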
| static int |
| sort_by_size_then_ssa_version_or_uid (const void *p1, const void *p2) |
| { |
| const tree t1 = *(const tree *)p1; |
| const tree t2 = *(const tree *)p2; |
| unsigned HOST_WIDE_INT s1 = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (t1))); |
| unsigned HOST_WIDE_INT s2 = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (t2))); |
| if (s1 != s2) |
| return s2 - s1; |
| else |
| return sort_by_ssa_version_or_uid (p1, p2); |
| } |
| |
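| /* Transform the blocks FROM..TO (which may be the same block) so that |
| the original statements are executed by a single worker only, with the |
| definitions that escape the block broadcast to the other workers |
| through a shared-memory record of RECORD_TYPE (laid out per |
| RECORD_FIELD_MAP) placed at offset PLACEMENT.  ISOLATE_BROADCASTS adds |
| extra barriers around the broadcast when no conflict-free shared-memory |
| range could be allocated for the block; HAS_GANG_PRIVATE_WRITE forces a |
| barrier because the block writes gang-private storage. |
| |
| A rough sketch of the control flow produced, for illustration only (the |
| names are invented, not literal output): |
| |
| ptr = GOACC_single_copy_start (&.oacc_worker_o); |
| if (ptr == 0)                 // the active worker |
| { |
| ... original statements ... |
| .oacc_worker_o.x = x_1;       // send escaping definitions |
| GOACC_single_copy_end (&.oacc_worker_o); |
| } |
| GOACC_barrier (); |
| if (ptr != 0)                 // the neutered workers |
| x_2 = .oacc_worker_i.x;       // receive the definitions |
| x_3 = PHI <x_1, x_2>;  */ |
| |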
| static void |
| worker_single_copy (basic_block from, basic_block to, |
| hash_set<tree> *def_escapes_block, |
| hash_set<tree> *worker_partitioned_uses, |
| tree record_type, record_field_map_t *record_field_map, |
| unsigned HOST_WIDE_INT placement, |
| bool isolate_broadcasts, bool has_gang_private_write) |
| { |
| /* If we only have virtual defs, we'll have no record type, but we still want |
| to emit single_copy_start and (particularly) single_copy_end to act as |
| a vdef source on the neutered edge representing memory writes on the |
| non-neutered edge. */ |
| if (!record_type) |
| record_type = char_type_node; |
| |
| tree sender_decl |
| = targetm.goacc.create_worker_broadcast_record (record_type, true, |
| ".oacc_worker_o", |
| placement); |
| tree receiver_decl |
| = targetm.goacc.create_worker_broadcast_record (record_type, false, |
| ".oacc_worker_i", |
| placement); |
| |
| gimple_stmt_iterator gsi = gsi_last_bb (to); |
| if (EDGE_COUNT (to->succs) > 1) |
| gsi_prev (&gsi); |
| edge e = split_block (to, gsi_stmt (gsi)); |
| basic_block barrier_block = e->dest; |
| |
| gimple_stmt_iterator start = gsi_after_labels (from); |
| |
| tree decl = builtin_decl_explicit (BUILT_IN_GOACC_SINGLE_COPY_START); |
| |
| tree lhs = create_tmp_var (TREE_TYPE (TREE_TYPE (decl))); |
| |
| gimple *call |
| = gimple_build_call (decl, 1, |
| POINTER_TYPE_P (TREE_TYPE (sender_decl)) |
| ? sender_decl : build_fold_addr_expr (sender_decl)); |
| gimple_call_set_lhs (call, lhs); |
| gsi_insert_before (&start, call, GSI_NEW_STMT); |
| update_stmt (call); |
| |
| /* The shared-memory range for this block overflowed. Add a barrier before |
| the GOACC_single_copy_start call. */ |
| if (isolate_broadcasts) |
| { |
| decl = builtin_decl_explicit (BUILT_IN_GOACC_BARRIER); |
| gimple *acc_bar = gimple_build_call (decl, 0); |
| gsi_insert_before (&start, acc_bar, GSI_SAME_STMT); |
| } |
| |
| tree conv_tmp = make_ssa_name (TREE_TYPE (receiver_decl)); |
| |
| gimple *conv = gimple_build_assign (conv_tmp, |
| fold_convert (TREE_TYPE (receiver_decl), |
| lhs)); |
| update_stmt (conv); |
| gsi_insert_after (&start, conv, GSI_NEW_STMT); |
| gimple *asgn = gimple_build_assign (receiver_decl, conv_tmp); |
| gsi_insert_after (&start, asgn, GSI_NEW_STMT); |
| update_stmt (asgn); |
| |
| tree zero_ptr = build_int_cst (TREE_TYPE (receiver_decl), 0); |
| |
| tree recv_tmp = make_ssa_name (TREE_TYPE (receiver_decl)); |
| asgn = gimple_build_assign (recv_tmp, receiver_decl); |
| gsi_insert_after (&start, asgn, GSI_NEW_STMT); |
| update_stmt (asgn); |
| |
| gimple *cond = gimple_build_cond (EQ_EXPR, recv_tmp, zero_ptr, NULL_TREE, |
| NULL_TREE); |
| update_stmt (cond); |
| |
| gsi_insert_after (&start, cond, GSI_NEW_STMT); |
| |
| edge et = split_block (from, cond); |
| et->flags &= ~EDGE_FALLTHRU; |
| et->flags |= EDGE_TRUE_VALUE; |
| /* Make the active worker the more probable path so we prefer fallthrough |
| (letting the idle workers jump around more). */ |
| et->probability = profile_probability::likely (); |
| |
| basic_block body = et->dest; |
| |
| edge ef = make_edge (from, barrier_block, EDGE_FALSE_VALUE); |
| ef->probability = et->probability.invert (); |
| |
| gimple_stmt_iterator bar_gsi = gsi_start_bb (barrier_block); |
| cond = gimple_build_cond (NE_EXPR, recv_tmp, zero_ptr, NULL_TREE, NULL_TREE); |
| |
| if (record_type != char_type_node || has_gang_private_write) |
| { |
| decl = builtin_decl_explicit (BUILT_IN_GOACC_BARRIER); |
| gimple *acc_bar = gimple_build_call (decl, 0); |
| |
| gsi_insert_before (&bar_gsi, acc_bar, GSI_NEW_STMT); |
| gsi_insert_after (&bar_gsi, cond, GSI_NEW_STMT); |
| } |
| else |
| gsi_insert_before (&bar_gsi, cond, GSI_NEW_STMT); |
| |
| edge et2 = split_block (barrier_block, cond); |
| et2->flags &= ~EDGE_FALLTHRU; |
| et2->flags |= EDGE_TRUE_VALUE; |
| et2->probability = profile_probability::unlikely (); |
| |
| basic_block exit_block = et2->dest; |
| |
| basic_block copyout_block = split_edge (et2); |
| edge ef2 = make_edge (barrier_block, exit_block, EDGE_FALSE_VALUE); |
| ef2->probability = et2->probability.invert (); |
| |
| gimple_stmt_iterator copyout_gsi = gsi_start_bb (copyout_block); |
| |
| edge copyout_to_exit = single_succ_edge (copyout_block); |
| |
| gimple_seq sender_seq = NULL; |
| |
| /* Make sure we iterate over definitions in a stable order. */ |
| auto_vec<tree> escape_vec (def_escapes_block->elements ()); |
| for (hash_set<tree>::iterator it = def_escapes_block->begin (); |
| it != def_escapes_block->end (); ++it) |
| escape_vec.quick_push (*it); |
| escape_vec.qsort (sort_by_ssa_version_or_uid); |
| |
| for (unsigned i = 0; i < escape_vec.length (); i++) |
| { |
| tree var = escape_vec[i]; |
| |
| if (TREE_CODE (var) == SSA_NAME && SSA_NAME_IS_VIRTUAL_OPERAND (var)) |
| continue; |
| |
| tree barrier_def = 0; |
| |
| if (TREE_CODE (var) == SSA_NAME) |
| { |
| gimple *def_stmt = SSA_NAME_DEF_STMT (var); |
| |
| if (gimple_nop_p (def_stmt)) |
| continue; |
| |
| /* The barrier phi takes one result from the actual work of the |
| block we're neutering, and the other result is constant zero of |
| the same type. */ |
| |
| gphi *barrier_phi = create_phi_node (NULL_TREE, barrier_block); |
| barrier_def = create_new_def_for (var, barrier_phi, |
| gimple_phi_result_ptr (barrier_phi)); |
| |
| add_phi_arg (barrier_phi, var, e, UNKNOWN_LOCATION); |
| add_phi_arg (barrier_phi, build_zero_cst (TREE_TYPE (var)), ef, |
| UNKNOWN_LOCATION); |
| |
| update_stmt (barrier_phi); |
| } |
| else |
| gcc_assert (TREE_CODE (var) == VAR_DECL); |
| |
| /* If we had no record type, we will have no fields map. */ |
| field_map_t *fields = record_field_map->get (record_type); |
| |
| if (worker_partitioned_uses->contains (var) |
| && fields |
| && fields->get (var)) |
| { |
| tree neutered_def = make_ssa_name (TREE_TYPE (var)); |
| |
| /* Receive definition from shared memory block. */ |
| |
| tree receiver_ref = build_receiver_ref (var, receiver_decl, fields); |
| gassign *recv = gimple_build_assign (neutered_def, |
| receiver_ref); |
| gsi_insert_after (©out_gsi, recv, GSI_CONTINUE_LINKING); |
| update_stmt (recv); |
| |
| if (TREE_CODE (var) == VAR_DECL) |
| { |
| /* If it's a VAR_DECL, we only copied to an SSA temporary. Copy |
| to the final location now. */ |
| gassign *asgn = gimple_build_assign (var, neutered_def); |
| gsi_insert_after (©out_gsi, asgn, GSI_CONTINUE_LINKING); |
| update_stmt (asgn); |
| } |
| else |
| { |
| /* If it's an SSA name, create a new phi at the join node to |
| represent either the output from the active worker (the |
| barrier) or the inactive workers (the copyout block). */ |
| gphi *join_phi = create_phi_node (NULL_TREE, exit_block); |
| create_new_def_for (barrier_def, join_phi, |
| gimple_phi_result_ptr (join_phi)); |
| add_phi_arg (join_phi, barrier_def, ef2, UNKNOWN_LOCATION); |
| add_phi_arg (join_phi, neutered_def, copyout_to_exit, |
| UNKNOWN_LOCATION); |
| update_stmt (join_phi); |
| } |
| |
| /* Send definition to shared memory block. */ |
| |
| tree sender_ref = build_sender_ref (var, sender_decl, fields); |
| |
| if (TREE_CODE (var) == SSA_NAME) |
| { |
| gassign *send = gimple_build_assign (sender_ref, var); |
| gimple_seq_add_stmt (&sender_seq, send); |
| update_stmt (send); |
| } |
| else if (TREE_CODE (var) == VAR_DECL) |
| { |
| tree tmp = make_ssa_name (TREE_TYPE (var)); |
| gassign *send = gimple_build_assign (tmp, var); |
| gimple_seq_add_stmt (&sender_seq, send); |
| update_stmt (send); |
| send = gimple_build_assign (sender_ref, tmp); |
| gimple_seq_add_stmt (&sender_seq, send); |
| update_stmt (send); |
| } |
| else |
| gcc_unreachable (); |
| } |
| } |
| |
| /* The shared-memory range for this block overflowed. Add a barrier at the |
| end. */ |
| if (isolate_broadcasts) |
| { |
| gsi = gsi_start_bb (exit_block); |
| decl = builtin_decl_explicit (BUILT_IN_GOACC_BARRIER); |
| gimple *acc_bar = gimple_build_call (decl, 0); |
| gsi_insert_before (&gsi, acc_bar, GSI_SAME_STMT); |
| } |
| |
| /* It's possible for the ET->DEST block (the work done by the active thread) |
| to finish with a control-flow statement, e.g. a UNIQUE function call.  Split |
| the block and add SENDER_SEQ in the latter part to avoid having control |
| flow in the middle of a BB.  */ |
| |
| decl = builtin_decl_explicit (BUILT_IN_GOACC_SINGLE_COPY_END); |
| call = gimple_build_call (decl, 1, |
| POINTER_TYPE_P (TREE_TYPE (sender_decl)) |
| ? sender_decl |
| : build_fold_addr_expr (sender_decl)); |
| gimple_seq_add_stmt (&sender_seq, call); |
| |
| gsi = gsi_last_bb (body); |
| gimple *last = gsi_stmt (gsi); |
| basic_block sender_block = split_block (body, last)->dest; |
| gsi = gsi_last_bb (sender_block); |
| gsi_insert_seq_after (&gsi, sender_seq, GSI_CONTINUE_LINKING); |
| } |
| |
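| /* Map from a basic block to the shared-memory offset assigned to its |
| broadcast record, paired with a flag saying whether a conflict-free |
| range could actually be allocated for it.  */ |
| |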
| typedef hash_map<basic_block, std::pair<unsigned HOST_WIDE_INT, bool> > |
| blk_offset_map_t; |
| |
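| /* Neuter the worker-single blocks of PAR and, recursively, of its inner |
| and sibling parallels: predicate each such block so that only one |
| worker executes it, using worker_single_copy when definitions must be |
| broadcast to the other workers and worker_single_simple otherwise. |
| Also place barriers around OpenACC routine calls made in worker-single |
| mode.  */ |
| |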
| static void |
| neuter_worker_single (parallel_g *par, unsigned outer_mask, |
| bitmap worker_single, bitmap vector_single, |
| vec<propagation_set *> *prop_set, |
| hash_set<tree> *partitioned_var_uses, |
| record_field_map_t *record_field_map, |
| blk_offset_map_t *blk_offset_map, |
| bitmap writes_gang_private) |
| { |
| unsigned mask = outer_mask | par->mask; |
| |
| if ((mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) == 0) |
| { |
| basic_block block; |
| |
| for (unsigned i = 0; par->blocks.iterate (i, &block); i++) |
| { |
| bool has_defs = false; |
| hash_set<tree> def_escapes_block; |
| hash_set<tree> worker_partitioned_uses; |
| unsigned j; |
| tree var; |
| |
| FOR_EACH_SSA_NAME (j, var, cfun) |
| { |
| if (SSA_NAME_IS_VIRTUAL_OPERAND (var)) |
| { |
| has_defs = true; |
| continue; |
| } |
| |
| gimple *def_stmt = SSA_NAME_DEF_STMT (var); |
| |
| if (gimple_nop_p (def_stmt)) |
| continue; |
| |
| if (gimple_bb (def_stmt)->index != block->index) |
| continue; |
| |
| gimple *use_stmt; |
| imm_use_iterator use_iter; |
| bool uses_outside_block = false; |
| bool worker_partitioned_use = false; |
| |
| FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, var) |
| { |
| int blocknum = gimple_bb (use_stmt)->index; |
| |
| /* Don't propagate SSA names that are only used in the |
| current block, unless the usage is in a phi node: that |
| means the name left the block, then came back in at the |
| top. */ |
| if (blocknum != block->index |
| || gimple_code (use_stmt) == GIMPLE_PHI) |
| uses_outside_block = true; |
| if (!bitmap_bit_p (worker_single, blocknum)) |
| worker_partitioned_use = true; |
| } |
| |
| if (uses_outside_block) |
| def_escapes_block.add (var); |
| |
| if (worker_partitioned_use) |
| { |
| worker_partitioned_uses.add (var); |
| has_defs = true; |
| } |
| } |
| |
| propagation_set *ws_prop = (*prop_set)[block->index]; |
| |
| if (ws_prop) |
| { |
| for (propagation_set::iterator it = ws_prop->begin (); |
| it != ws_prop->end (); |
| ++it) |
| { |
| tree var = *it; |
| if (TREE_CODE (var) == VAR_DECL) |
| { |
| def_escapes_block.add (var); |
| if (partitioned_var_uses->contains (var)) |
| { |
| worker_partitioned_uses.add (var); |
| has_defs = true; |
| } |
| } |
| } |
| |
| delete ws_prop; |
| (*prop_set)[block->index] = 0; |
| } |
| |
| bool only_marker_fns = true; |
| bool join_block = false; |
| |
| for (gimple_stmt_iterator gsi = gsi_start_bb (block); |
| !gsi_end_p (gsi); |
| gsi_next (&gsi)) |
| { |
| gimple *stmt = gsi_stmt (gsi); |
| if (gimple_code (stmt) == GIMPLE_CALL |
| && gimple_call_internal_p (stmt, IFN_UNIQUE)) |
| { |
| enum ifn_unique_kind k = ((enum ifn_unique_kind) |
| TREE_INT_CST_LOW (gimple_call_arg (stmt, 0))); |
| if (k != IFN_UNIQUE_OACC_PRIVATE |
| && k != IFN_UNIQUE_OACC_JOIN |
| && k != IFN_UNIQUE_OACC_FORK |
| && k != IFN_UNIQUE_OACC_HEAD_MARK |
| && k != IFN_UNIQUE_OACC_TAIL_MARK) |
| only_marker_fns = false; |
| else if (k == IFN_UNIQUE_OACC_JOIN) |
| /* The JOIN marker is special in that it *cannot* be |
| predicated for worker zero, because it may be lowered |
| to a barrier instruction and all workers must typically |
| execute that barrier. We shouldn't be doing any |
| broadcasts from the join block anyway. */ |
| join_block = true; |
| } |
| else if (gimple_code (stmt) == GIMPLE_CALL |
| && gimple_call_internal_p (stmt, IFN_GOACC_LOOP)) |
| /* Empty. */; |
| else if (gimple_nop_p (stmt)) |
| /* Empty. */; |
| else |
| only_marker_fns = false; |
| } |
| |
| /* We can skip predicating this block for worker zero if the only |
| thing it contains is marker functions that will be removed in the |
| oaccdevlow pass anyway. |
| Don't do this if the block has (any) phi nodes, because those |
| might define SSA names that need broadcasting. |
| TODO: We might be able to skip transforming blocks that only |
| contain some other trivial statements too. */ |
| if (only_marker_fns && !phi_nodes (block)) |
| continue; |
| |
| gcc_assert (!join_block); |
| |
| if (has_defs) |
| { |
| tree record_type = (tree) block->aux; |
| std::pair<unsigned HOST_WIDE_INT, bool> *off_rngalloc |
| = blk_offset_map->get (block); |
| gcc_assert (!record_type || off_rngalloc); |
| unsigned HOST_WIDE_INT offset |
| = off_rngalloc ? off_rngalloc->first : 0; |
| bool range_allocated |
| = off_rngalloc ? off_rngalloc->second : true; |
| bool has_gang_private_write |
| = bitmap_bit_p (writes_gang_private, block->index); |
| worker_single_copy (block, block, &def_escapes_block, |
| &worker_partitioned_uses, record_type, |
| record_field_map, |
| offset, !range_allocated, |
| has_gang_private_write); |
| } |
| else |
| worker_single_simple (block, block, &def_escapes_block); |
| } |
| } |
| |
| if ((outer_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) == 0) |
| { |
| basic_block block; |
| |
| for (unsigned i = 0; par->blocks.iterate (i, &block); i++) |
| for (gimple_stmt_iterator gsi = gsi_start_bb (block); |
| !gsi_end_p (gsi); |
| gsi_next (&gsi)) |
| { |
| gimple *stmt = gsi_stmt (gsi); |
| |
| if (gimple_code (stmt) == GIMPLE_CALL |
| && !gimple_call_internal_p (stmt) |
| && !omp_sese_active_worker_call (as_a <gcall *> (stmt))) |
| { |
| /* If we have an OpenACC routine call in worker-single mode, |
| place barriers before and afterwards to prevent |
| clobbering re-used shared memory regions (as are used |
| for AMDGCN at present, for example). */ |
| tree decl = builtin_decl_explicit (BUILT_IN_GOACC_BARRIER); |
| gsi_insert_before (&gsi, gimple_build_call (decl, 0), |
| GSI_SAME_STMT); |
| gsi_insert_after (&gsi, gimple_build_call (decl, 0), |
| GSI_NEW_STMT); |
| } |
| } |
| } |
| |
| if (par->inner) |
| neuter_worker_single (par->inner, mask, worker_single, vector_single, |
| prop_set, partitioned_var_uses, record_field_map, |
| blk_offset_map, writes_gang_private); |
| if (par->next) |
| neuter_worker_single (par->next, outer_mask, worker_single, vector_single, |
| prop_set, partitioned_var_uses, record_field_map, |
| blk_offset_map, writes_gang_private); |
| } |
| |
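| /* Depth-first walk from BB, setting in REACHABLE the index of each |
| first-encountered block that has a broadcast record (BB->aux set) and |
| stopping the walk there.  */ |
| |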
| static void |
| dfs_broadcast_reachable_1 (basic_block bb, sbitmap reachable) |
| { |
| if (bb->flags & BB_VISITED) |
| return; |
| |
| bb->flags |= BB_VISITED; |
| |
| if (bb->succs) |
| { |
| edge e; |
| edge_iterator ei; |
| FOR_EACH_EDGE (e, ei, bb->succs) |
| { |
| basic_block dest = e->dest; |
| if (dest->aux) |
| bitmap_set_bit (reachable, dest->index); |
| else |
| dfs_broadcast_reachable_1 (dest, reachable); |
| } |
| } |
| } |
| |
| typedef std::pair<int, tree> idx_decl_pair_t; |
| |
| typedef auto_vec<splay_tree> used_range_vec_t; |
| |
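| /* qsort comparator ordering (block index, record type) pairs by |
| decreasing size of the record type.  */ |
| |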
| static int |
| sort_size_descending (const void *a, const void *b) |
| { |
| const idx_decl_pair_t *pa = (const idx_decl_pair_t *) a; |
| const idx_decl_pair_t *pb = (const idx_decl_pair_t *) b; |
| unsigned HOST_WIDE_INT asize = tree_to_uhwi (TYPE_SIZE_UNIT (pa->second)); |
| unsigned HOST_WIDE_INT bsize = tree_to_uhwi (TYPE_SIZE_UNIT (pb->second)); |
| return bsize - asize; |
| } |
| |
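| /* A half-open range [LO, HI) of shared-memory addresses.  The |
| default-constructed range (0, 0) serves as an "invalid" marker.  */ |
| |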
| class addr_range |
| { |
| public: |
| addr_range (unsigned HOST_WIDE_INT addr_lo, unsigned HOST_WIDE_INT addr_hi) |
| : lo (addr_lo), hi (addr_hi) |
| { } |
| addr_range (const addr_range &ar) : lo (ar.lo), hi (ar.hi) |
| { } |
| addr_range () : lo (0), hi (0) |
| { } |
| |
| bool invalid () { return lo == 0 && hi == 0; } |
| |
| unsigned HOST_WIDE_INT lo; |
| unsigned HOST_WIDE_INT hi; |
| }; |
| |
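| /* Splay-tree comparison function for addr_range keys: overlapping ranges |
| compare equal; otherwise ranges are ordered by their position.  */ |
| |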
| static int |
| splay_tree_compare_addr_range (splay_tree_key a, splay_tree_key b) |
| { |
| addr_range *ar = (addr_range *) a; |
| addr_range *br = (addr_range *) b; |
| if (ar->lo == br->lo && ar->hi == br->hi) |
| return 0; |
| if (ar->hi <= br->lo) |
| return -1; |
| else if (ar->lo >= br->hi) |
| return 1; |
| return 0; |
| } |
| |
| static void |
| splay_tree_free_key (splay_tree_key k) |
| { |
| addr_range *ar = (addr_range *) k; |
| delete ar; |
| } |
| |
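| /* Search splay tree S, whose keys are the address ranges already in use, |
| for a gap between (or after) the existing ranges that can hold SIZE |
| bytes at alignment ALIGN within *BOUNDS, and return it as a new range; |
| return the invalid (empty) range if nothing fits. |
| |
| As a worked illustration (not from the sources): with BOUNDS [0, 64), |
| occupied ranges [0, 8) and [32, 40), a request for 16 bytes aligned to |
| 16 is placed at [16, 32).  */ |
| |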
| static addr_range |
| first_fit_range (splay_tree s, unsigned HOST_WIDE_INT size, |
| unsigned HOST_WIDE_INT align, addr_range *bounds) |
| { |
| splay_tree_node min = splay_tree_min (s); |
| if (min) |
| { |
| splay_tree_node next; |
| while ((next = splay_tree_successor (s, min->key))) |
| { |
| unsigned HOST_WIDE_INT lo = ((addr_range *) min->key)->hi; |
| unsigned HOST_WIDE_INT hi = ((addr_range *) next->key)->lo; |
| unsigned HOST_WIDE_INT base = (lo + align - 1) & ~(align - 1); |
| if (base + size <= hi) |
| return addr_range (base, base + size); |
| min = next; |
| } |
| |
| unsigned HOST_WIDE_INT base = ((addr_range *)min->key)->hi; |
| base = (base + align - 1) & ~(align - 1); |
| if (base + size <= bounds->hi) |
| return addr_range (base, base + size); |
| else |
| return addr_range (); |
| } |
| else |
| { |
| unsigned HOST_WIDE_INT lo = bounds->lo; |
| lo = (lo + align - 1) & ~(align - 1); |
| if (lo + size <= bounds->hi) |
| return addr_range (lo, lo + size); |
| else |
| return addr_range (); |
| } |
| } |
| |
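| /* Splay-tree callback for merge_ranges: insert the range keyed at node N |
| into the accumulator tree passed in PTR, merging it with any |
| overlapping range already present.  */ |
| |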
| static int |
| merge_ranges_1 (splay_tree_node n, void *ptr) |
| { |
| splay_tree accum = (splay_tree) ptr; |
| addr_range ar = *(addr_range *) n->key; |
| |
| splay_tree_node old = splay_tree_lookup (accum, n->key); |
| |
| /* We might have an overlap. Create a new range covering the |
| overlapping parts. */ |
| if (old) |
| { |
| addr_range *old_ar = (addr_range *) old->key; |
| ar.lo = MIN (old_ar->lo, ar.lo); |
| ar.hi = MAX (old_ar->hi, ar.hi); |
| splay_tree_remove (accum, old->key); |
| } |
| |
| addr_range *new_ar = new addr_range (ar); |
| |
| splay_tree_insert (accum, (splay_tree_key) new_ar, n->value); |
| |
| return 0; |
| } |
| |
| static void |
| merge_ranges (splay_tree accum, splay_tree sp) |
| { |
| splay_tree_foreach (sp, merge_ranges_1, (void *) accum); |
| } |
| |
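| /* Perform worker neutering and broadcasting for the current function. |
| Split blocks so that each has a single partitioning mode, discover the |
| parallel structure, work out which SSA names and local variables must |
| be propagated out of worker-single blocks, build a broadcast record |
| type per block needing propagation, assign each record a first-fit |
| range within the shared-memory window [BOUNDS_LO, BOUNDS_HI) while |
| avoiding conflicts between blocks whose broadcasts may be live at the |
| same time, and finally neuter the worker-single blocks.  */ |
| |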
| static void |
| oacc_do_neutering (unsigned HOST_WIDE_INT bounds_lo, |
| unsigned HOST_WIDE_INT bounds_hi) |
| { |
| bb_stmt_map_t bb_stmt_map; |
| auto_bitmap worker_single, vector_single; |
| |
| omp_sese_split_blocks (&bb_stmt_map); |
| |
| if (dump_file) |
| { |
| fprintf (dump_file, "\n\nAfter splitting:\n\n"); |
| dump_function_to_file (current_function_decl, dump_file, dump_flags); |
| } |
| |
| unsigned mask = 0; |
| |
| /* If this is a routine, calculate MASK as if the outer levels are already |
| partitioned. */ |
| { |
| tree attr = oacc_get_fn_attrib (current_function_decl); |
| tree dims = TREE_VALUE (attr); |
| unsigned ix; |
| for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims)) |
| { |
| tree allowed = TREE_PURPOSE (dims); |
| if (allowed && integer_zerop (allowed)) |
| mask |= GOMP_DIM_MASK (ix); |
| } |
| } |
| |
| parallel_g *par = omp_sese_discover_pars (&bb_stmt_map); |
| populate_single_mode_bitmaps (par, worker_single, vector_single, mask, 0); |
| |
| basic_block bb; |
| FOR_ALL_BB_FN (bb, cfun) |
| bb->aux = NULL; |
| |
| vec<propagation_set *> prop_set (vNULL); |
| prop_set.safe_grow_cleared (last_basic_block_for_fn (cfun), true); |
| |
| find_ssa_names_to_propagate (par, mask, worker_single, vector_single, |
| &prop_set); |
| |
| hash_set<tree> partitioned_var_uses; |
| hash_set<tree> gang_private_vars; |
| auto_bitmap writes_gang_private; |
| |
| find_gang_private_vars (&gang_private_vars); |
| find_partitioned_var_uses (par, mask, &partitioned_var_uses); |
| find_local_vars_to_propagate (par, mask, &partitioned_var_uses, |
| &gang_private_vars, writes_gang_private, |
| &prop_set); |
| |
| record_field_map_t record_field_map; |
| |
| FOR_ALL_BB_FN (bb, cfun) |
| { |
| propagation_set *ws_prop = prop_set[bb->index]; |
| if (ws_prop) |
| { |
| tree record_type = lang_hooks.types.make_type (RECORD_TYPE); |
| tree name = create_tmp_var_name (".oacc_ws_data_s"); |
| name = build_decl (UNKNOWN_LOCATION, TYPE_DECL, name, record_type); |
| DECL_ARTIFICIAL (name) = 1; |
| DECL_NAMELESS (name) = 1; |
| TYPE_NAME (record_type) = name; |
| TYPE_ARTIFICIAL (record_type) = 1; |
| |
| auto_vec<tree> field_vec (ws_prop->elements ()); |
| for (hash_set<tree>::iterator it = ws_prop->begin (); |
| it != ws_prop->end (); ++it) |
| field_vec.quick_push (*it); |
| |
| field_vec.qsort (sort_by_size_then_ssa_version_or_uid); |
| |
| bool existed; |
| field_map_t *fields |
| = &record_field_map.get_or_insert (record_type, &existed); |
| gcc_checking_assert (!existed); |
| |
| /* Insert var fields in reverse order, so the last inserted element |
| is the first in the structure. */ |
| for (int i = field_vec.length () - 1; i >= 0; i--) |
| install_var_field (field_vec[i], record_type, fields); |
| |
| layout_type (record_type); |
| |
| bb->aux = (tree) record_type; |
| } |
| } |
| |
| sbitmap *reachable |
| = sbitmap_vector_alloc (last_basic_block_for_fn (cfun), |
| last_basic_block_for_fn (cfun)); |
| |
| bitmap_vector_clear (reachable, last_basic_block_for_fn (cfun)); |
| |
| auto_vec<std::pair<int, tree> > priority; |
| |
| FOR_ALL_BB_FN (bb, cfun) |
| { |
| if (bb->aux) |
| { |
| tree record_type = (tree) bb->aux; |
| |
| basic_block bb2; |
| FOR_ALL_BB_FN (bb2, cfun) |
| bb2->flags &= ~BB_VISITED; |
| |
| priority.safe_push (std::make_pair (bb->index, record_type)); |
| dfs_broadcast_reachable_1 (bb, reachable[bb->index]); |
| } |
| } |
| |
| sbitmap *inverted |
| = sbitmap_vector_alloc (last_basic_block_for_fn (cfun), |
| last_basic_block_for_fn (cfun)); |
| |
| bitmap_vector_clear (inverted, last_basic_block_for_fn (cfun)); |
| |
| for (int i = 0; i < last_basic_block_for_fn (cfun); i++) |
| { |
| sbitmap_iterator bi; |
| unsigned int j; |
| EXECUTE_IF_SET_IN_BITMAP (reachable[i], 0, j, bi) |
| bitmap_set_bit (inverted[j], i); |
| } |
| |
| for (int i = 0; i < last_basic_block_for_fn (cfun); i++) |
| bitmap_ior (reachable[i], reachable[i], inverted[i]); |
| |
| sbitmap_vector_free (inverted); |
| |
| used_range_vec_t used_ranges; |
| |
| used_ranges.safe_grow_cleared (last_basic_block_for_fn (cfun)); |
| |
| blk_offset_map_t blk_offset_map; |
| |
| addr_range worker_shm_bounds (bounds_lo, bounds_hi); |
| |
| priority.qsort (sort_size_descending); |
| for (unsigned int i = 0; i < priority.length (); i++) |
| { |
| idx_decl_pair_t p = priority[i]; |
| int blkno = p.first; |
| tree record_type = p.second; |
| HOST_WIDE_INT size = tree_to_uhwi (TYPE_SIZE_UNIT (record_type)); |
| HOST_WIDE_INT align = TYPE_ALIGN_UNIT (record_type); |
| |
| splay_tree conflicts = splay_tree_new (splay_tree_compare_addr_range, |
| splay_tree_free_key, NULL); |
| |
| if (!used_ranges[blkno]) |
| used_ranges[blkno] = splay_tree_new (splay_tree_compare_addr_range, |
| splay_tree_free_key, NULL); |
| else |
| merge_ranges (conflicts, used_ranges[blkno]); |
| |
| sbitmap_iterator bi; |
| unsigned int j; |
| EXECUTE_IF_SET_IN_BITMAP (reachable[blkno], 0, j, bi) |
| if (used_ranges[j]) |
| merge_ranges (conflicts, used_ranges[j]); |
| |
| addr_range ar |
| = first_fit_range (conflicts, size, align, &worker_shm_bounds); |
| |
| splay_tree_delete (conflicts); |
| |
| if (ar.invalid ()) |
| { |
| unsigned HOST_WIDE_INT base |
| = (bounds_lo + align - 1) & ~(align - 1); |
| if (base + size > bounds_hi) |
| error_at (UNKNOWN_LOCATION, "shared-memory region overflow"); |
| std::pair<unsigned HOST_WIDE_INT, bool> base_inrng |
| = std::make_pair (base, false); |
| blk_offset_map.put (BASIC_BLOCK_FOR_FN (cfun, blkno), base_inrng); |
| } |
| else |
| { |
| splay_tree_node old = splay_tree_lookup (used_ranges[blkno], |
| (splay_tree_key) &ar); |
| if (old) |
| { |
| fprintf (stderr, "trying to map [%d..%d] but [%d..%d] is " |
| "already mapped in block %d\n", (int) ar.lo, |
| (int) ar.hi, (int) ((addr_range *) old->key)->lo, |
| (int) ((addr_range *) old->key)->hi, blkno); |
| abort (); |
| } |
| |
| addr_range *arp = new addr_range (ar); |
| splay_tree_insert (used_ranges[blkno], (splay_tree_key) arp, |
| (splay_tree_value) blkno); |
| std::pair<unsigned HOST_WIDE_INT, bool> base_inrng |
| = std::make_pair (ar.lo, true); |
| blk_offset_map.put (BASIC_BLOCK_FOR_FN (cfun, blkno), base_inrng); |
| } |
| } |
| |
| sbitmap_vector_free (reachable); |
| |
| neuter_worker_single (par, mask, worker_single, vector_single, &prop_set, |
| &partitioned_var_uses, &record_field_map, |
| &blk_offset_map, writes_gang_private); |
| |
| record_field_map.empty (); |
| |
| /* These are supposed to have been 'delete'd by 'neuter_worker_single'. */ |
| for (auto it : prop_set) |
| gcc_checking_assert (!it); |
| prop_set.release (); |
| |
| delete par; |
| |
| /* This doesn't seem to make a difference. */ |
| loops_state_clear (LOOP_CLOSED_SSA); |
| |
| /* Neutering worker-single neutered blocks will invalidate dominance info. |
| It may be possible to incrementally update just the affected blocks, but |
| obliterate everything for now. */ |
| free_dominance_info (CDI_DOMINATORS); |
| free_dominance_info (CDI_POST_DOMINATORS); |
| |
| if (dump_file) |
| { |
| fprintf (dump_file, "\n\nAfter neutering:\n\n"); |
| dump_function_to_file (current_function_decl, dump_file, dump_flags); |
| } |
| } |
| |
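| /* Main entry point of the pass.  Compute the shared memory already |
| claimed for reduction variables and gang-private variables at each |
| partitioning level, ask the target for the remaining usable window, |
| and perform worker neutering/broadcasting unless the function is known |
| to use only a single worker.  */ |
| |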
| static int |
| execute_omp_oacc_neuter_broadcast () |
| { |
| unsigned HOST_WIDE_INT reduction_size[GOMP_DIM_MAX]; |
| unsigned HOST_WIDE_INT private_size[GOMP_DIM_MAX]; |
| |
| for (unsigned i = 0; i < GOMP_DIM_MAX; i++) |
| { |
| reduction_size[i] = 0; |
| private_size[i] = 0; |
| } |
| |
| /* Calculate shared memory size required for reduction variables and |
| gang-private memory for this offloaded function. */ |
| basic_block bb; |
| FOR_ALL_BB_FN (bb, cfun) |
| { |
| for (gimple_stmt_iterator gsi = gsi_start_bb (bb); |
| !gsi_end_p (gsi); |
| gsi_next (&gsi)) |
| { |
| gimple *stmt = gsi_stmt (gsi); |
| if (!is_gimple_call (stmt)) |
| continue; |
| gcall *call = as_a <gcall *> (stmt); |
| if (!gimple_call_internal_p (call)) |
| continue; |
| enum internal_fn ifn_code = gimple_call_internal_fn (call); |
| switch (ifn_code) |
| { |
| default: break; |
| case IFN_GOACC_REDUCTION: |
| if (integer_minus_onep (gimple_call_arg (call, 3))) |
| continue; |
| else |
| { |
| unsigned code = TREE_INT_CST_LOW (gimple_call_arg (call, 0)); |
| /* Only count reduction variables once: the choice to pick |
| the setup call is fairly arbitrary. */ |
| if (code == IFN_GOACC_REDUCTION_SETUP) |
| { |
| int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3)); |
| tree var = gimple_call_arg (call, 2); |
| tree offset = gimple_call_arg (call, 5); |
| tree var_type = TREE_TYPE (var); |
| unsigned HOST_WIDE_INT limit |
| = (tree_to_uhwi (offset) |
| + tree_to_uhwi (TYPE_SIZE_UNIT (var_type))); |
| reduction_size[level] |
| = MAX (reduction_size[level], limit); |
| } |
| } |
| break; |
| case IFN_UNIQUE: |
| { |
| enum ifn_unique_kind kind |
| = ((enum ifn_unique_kind) |
| TREE_INT_CST_LOW (gimple_call_arg (call, 0))); |
| |
| if (kind == IFN_UNIQUE_OACC_PRIVATE) |
| { |
| HOST_WIDE_INT level |
| = TREE_INT_CST_LOW (gimple_call_arg (call, 2)); |
| if (level == -1) |
| break; |
| for (unsigned i = 3; |
| i < gimple_call_num_args (call); |
| i++) |
| { |
| tree arg = gimple_call_arg (call, i); |
| gcc_assert (TREE_CODE (arg) == ADDR_EXPR); |
| tree decl = TREE_OPERAND (arg, 0); |
| unsigned HOST_WIDE_INT align = DECL_ALIGN_UNIT (decl); |
| private_size[level] = ((private_size[level] + align - 1) |
| & ~(align - 1)); |
| unsigned HOST_WIDE_INT decl_size |
| = tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (decl))); |
| private_size[level] += decl_size; |
| } |
| } |
| } |
| break; |
| } |
| } |
| } |
| |
| int dims[GOMP_DIM_MAX]; |
| for (unsigned i = 0; i < GOMP_DIM_MAX; i++) |
| dims[i] = oacc_get_fn_dim_size (current_function_decl, i); |
| |
| /* Find bounds of shared-memory buffer space we can use. */ |
| unsigned HOST_WIDE_INT bounds_lo = 0, bounds_hi = 0; |
| if (targetm.goacc.shared_mem_layout) |
| targetm.goacc.shared_mem_layout (&bounds_lo, &bounds_hi, dims, |
| private_size, reduction_size); |
| |
| /* Perform worker partitioning unless we know 'num_workers(1)'. */ |
| if (dims[GOMP_DIM_WORKER] != 1) |
| oacc_do_neutering (bounds_lo, bounds_hi); |
| |
| return 0; |
| } |
| |
| namespace { |
| |
| const pass_data pass_data_omp_oacc_neuter_broadcast = |
| { |
| GIMPLE_PASS, /* type */ |
| "omp_oacc_neuter_broadcast", /* name */ |
| OPTGROUP_OMP, /* optinfo_flags */ |
| TV_NONE, /* tv_id */ |
| PROP_cfg, /* properties_required */ |
| 0, /* properties_provided */ |
| 0, /* properties_destroyed */ |
| 0, /* todo_flags_start */ |
| TODO_update_ssa | TODO_cleanup_cfg, /* todo_flags_finish */ |
| }; |
| |
| class pass_omp_oacc_neuter_broadcast : public gimple_opt_pass |
| { |
| public: |
| pass_omp_oacc_neuter_broadcast (gcc::context *ctxt) |
| : gimple_opt_pass (pass_data_omp_oacc_neuter_broadcast, ctxt) |
| {} |
| |
| /* opt_pass methods: */ |
| bool gate (function *fun) final override |
| { |
| if (!flag_openacc) |
| return false; |
| |
| if (!targetm.goacc.create_worker_broadcast_record) |
| return false; |
| |
| /* Only relevant for OpenACC offloaded functions. */ |
| tree attr = oacc_get_fn_attrib (fun->decl); |
| if (!attr) |
| return false; |
| |
| return true; |
| } |
| |
| unsigned int execute (function *) final override |
| { |
| return execute_omp_oacc_neuter_broadcast (); |
| } |
| |
| }; // class pass_omp_oacc_neuter_broadcast |
| |
| } // anon namespace |
| |
| gimple_opt_pass * |
| make_pass_omp_oacc_neuter_broadcast (gcc::context *ctxt) |
| { |
| return new pass_omp_oacc_neuter_broadcast (ctxt); |
| } |