| /* Decompose OpenACC 'kernels' constructs into parts, a sequence of compute |
| constructs |
| |
| Copyright (C) 2020-2022 Free Software Foundation, Inc. |
| |
| This file is part of GCC. |
| |
| GCC is free software; you can redistribute it and/or modify it under |
| the terms of the GNU General Public License as published by the Free |
| Software Foundation; either version 3, or (at your option) any later |
| version. |
| |
| GCC is distributed in the hope that it will be useful, but WITHOUT ANY |
| WARRANTY; without even the implied warranty of MERCHANTABILITY or |
| FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
| for more details. |
| |
| You should have received a copy of the GNU General Public License |
| along with GCC; see the file COPYING3. If not see |
| <http://www.gnu.org/licenses/>. */ |
| |
| #include "config.h" |
| #include "system.h" |
| #include "coretypes.h" |
| #include "backend.h" |
| #include "target.h" |
| #include "tree.h" |
| #include "langhooks.h" |
| #include "gimple.h" |
| #include "tree-pass.h" |
| #include "cgraph.h" |
| #include "fold-const.h" |
| #include "gimplify.h" |
| #include "gimple-iterator.h" |
| #include "gimple-walk.h" |
| #include "gomp-constants.h" |
| #include "omp-general.h" |
| #include "diagnostic-core.h" |
| |
| |
| /* This preprocessing pass is run immediately before lower_omp. It decomposes |
| OpenACC 'kernels' constructs into parts, a sequence of compute constructs. |
| |
| The translation is as follows: |
| - The entire 'kernels' region is turned into a 'data' region with clauses |
| taken from the 'kernels' region. New 'create' clauses are added for all |
| variables declared at the top level in the kernels region. |
| - Any loop nests annotated with an OpenACC 'loop' directive are wrapped in |
| a new compute construct. |
| - 'loop' directives without an explicit 'independent' or 'seq' clause |
| get an 'auto' clause added; other clauses are preserved on the loop |
| or moved to the new surrounding compute construct, as applicable. |
| - Any sequences of other code (non-loops, non-OpenACC 'loop's) are wrapped |
| in new "gang-single" compute construct: 'worker'/'vector' parallelism is |
| preserved, but 'num_gangs (1)' is enforced. |
| - Both points above only apply at the topmost level in the region, that |
| is, the transformation does not introduce new compute constructs inside |
| nested statement bodies. In particular, this means that a |
| gang-parallelizable loop inside an 'if' statement is made "gang-single". |
| - In order to make the host wait only once for the whole region instead |
| of once per device kernel launch, the new compute constructs are |
| annotated 'async'. Unless the original 'kernels' construct already was |
| marked 'async', the entire region ends with a 'wait' directive. If the |
| original 'kernels' construct was marked 'async', the synthesized 'async' |
| clauses use the original 'kernels' construct's 'async' argument |
| (possibly implicit). |
| */ |
| |
| |
| /*TODO Things are conceptually wrong here: 'loop' clauses may be hidden behind |
| 'device_type', so we have to defer a lot of processing until we're in the |
| offloading compilation. "Fortunately", GCC doesn't support the OpenACC |
| 'device_type' clause yet, so we get away that. */ |
| |
| |
| /* Helper function for decompose_kernels_region_body. If STMT contains a |
| "top-level" OMP_FOR statement, returns a pointer to that statement; |
| returns NULL otherwise. |
| |
| A "top-level" OMP_FOR statement is one that is possibly accompanied by |
| small snippets of setup code. Specifically, this function accepts an |
| OMP_FOR possibly wrapped in a singleton bind and a singleton try |
| statement to allow for a local loop variable, but not an OMP_FOR |
| statement nested in any other constructs. Alternatively, it accepts a |
| non-singleton bind containing only assignments and then an OMP_FOR |
| statement at the very end. The former style can be generated by the C |
| frontend, the latter by the Fortran frontend. */ |
| |
| static gimple * |
| top_level_omp_for_in_stmt (gimple *stmt) |
| { |
| if (gimple_code (stmt) == GIMPLE_OMP_FOR) |
| return stmt; |
| |
| if (gimple_code (stmt) == GIMPLE_BIND) |
| { |
| gimple_seq body = gimple_bind_body (as_a <gbind *> (stmt)); |
| if (gimple_seq_singleton_p (body)) |
| { |
| /* Accept an OMP_FOR statement, or a try statement containing only |
| a single OMP_FOR. */ |
| gimple *maybe_for_or_try = gimple_seq_first_stmt (body); |
| if (gimple_code (maybe_for_or_try) == GIMPLE_OMP_FOR) |
| return maybe_for_or_try; |
| else if (gimple_code (maybe_for_or_try) == GIMPLE_TRY) |
| { |
| gimple_seq try_body = gimple_try_eval (maybe_for_or_try); |
| if (!gimple_seq_singleton_p (try_body)) |
| return NULL; |
| gimple *maybe_omp_for_stmt = gimple_seq_first_stmt (try_body); |
| if (gimple_code (maybe_omp_for_stmt) == GIMPLE_OMP_FOR) |
| return maybe_omp_for_stmt; |
| } |
| } |
| else |
| { |
| gimple_stmt_iterator gsi; |
| /* Accept only a block of optional assignments followed by an |
| OMP_FOR at the end. No other kinds of statements allowed. */ |
| for (gsi = gsi_start (body); !gsi_end_p (gsi); gsi_next (&gsi)) |
| { |
| gimple *body_stmt = gsi_stmt (gsi); |
| if (gimple_code (body_stmt) == GIMPLE_ASSIGN) |
| continue; |
| else if (gimple_code (body_stmt) == GIMPLE_OMP_FOR |
| && gsi_one_before_end_p (gsi)) |
| return body_stmt; |
| else |
| return NULL; |
| } |
| } |
| } |
| |
| return NULL; |
| } |
| |
| /* Helper for adjust_region_code: evaluate the statement at GSI_P. */ |
| |
| static tree |
| adjust_region_code_walk_stmt_fn (gimple_stmt_iterator *gsi_p, |
| bool *handled_ops_p, |
| struct walk_stmt_info *wi) |
| { |
| int *region_code = (int *) wi->info; |
| |
| gimple *stmt = gsi_stmt (*gsi_p); |
| switch (gimple_code (stmt)) |
| { |
| case GIMPLE_OMP_FOR: |
| { |
| tree clauses = gimple_omp_for_clauses (stmt); |
| if (omp_find_clause (clauses, OMP_CLAUSE_INDEPENDENT)) |
| { |
| /* Explicit 'independent' clause. */ |
| /* Keep going; recurse into loop body. */ |
| break; |
| } |
| else if (omp_find_clause (clauses, OMP_CLAUSE_SEQ)) |
| { |
| /* Explicit 'seq' clause. */ |
| /* We'll "parallelize" if at some level a loop construct has been |
| marked up by the user as unparallelizable ('seq' clause; we'll |
| respect that in the later processing). Given that the user has |
| explicitly marked it up, this loop construct cannot be |
| performance-critical, and in this case it's also fine to |
| "parallelize" instead of "gang-single", because any outer or |
| inner loops may still exploit the available parallelism. */ |
| /* Keep going; recurse into loop body. */ |
| break; |
| } |
| else |
| { |
| /* Explicit or implicit 'auto' clause. */ |
| /* The user would like this loop analyzed ('auto' clause) and |
| typically parallelized, but we don't have available yet the |
| compiler logic to analyze this, so can't parallelize it here, so |
| we'd very likely be running into a performance problem if we |
| were to execute this unparallelized, thus forward the whole loop |
| nest to 'parloops'. */ |
| *region_code = GF_OMP_TARGET_KIND_OACC_KERNELS; |
| /* Terminate: final decision for this region. */ |
| *handled_ops_p = true; |
| return integer_zero_node; |
| } |
| gcc_unreachable (); |
| } |
| |
| case GIMPLE_COND: |
| case GIMPLE_GOTO: |
| case GIMPLE_SWITCH: |
| case GIMPLE_ASM: |
| case GIMPLE_ASSUME: |
| case GIMPLE_TRANSACTION: |
| case GIMPLE_RETURN: |
| /* Statement that might constitute some looping/control flow pattern. */ |
| /* The user would like this code analyzed (implicit inside a 'kernels' |
| region) and typically parallelized, but we don't have available yet |
| the compiler logic to analyze this, so can't parallelize it here, so |
| we'd very likely be running into a performance problem if we were to |
| execute this unparallelized, thus forward the whole thing to |
| 'parloops'. */ |
| *region_code = GF_OMP_TARGET_KIND_OACC_KERNELS; |
| /* Terminate: final decision for this region. */ |
| *handled_ops_p = true; |
| return integer_zero_node; |
| |
| default: |
| /* Keep going. */ |
| break; |
| } |
| |
| return NULL; |
| } |
| |
| /* Adjust the REGION_CODE for the region in GS. */ |
| |
| static void |
| adjust_region_code (gimple_seq gs, int *region_code) |
| { |
| struct walk_stmt_info wi; |
| memset (&wi, 0, sizeof (wi)); |
| wi.info = region_code; |
| walk_gimple_seq (gs, adjust_region_code_walk_stmt_fn, NULL, &wi); |
| } |
| |
| /* Helper function for make_loops_gang_single for walking the tree. If the |
| statement indicated by GSI_P is an OpenACC for loop with a gang clause, |
| issue a warning and remove the clause. */ |
| |
| static tree |
| visit_loops_in_gang_single_region (gimple_stmt_iterator *gsi_p, |
| bool *handled_ops_p, |
| struct walk_stmt_info *) |
| { |
| *handled_ops_p = false; |
| |
| gimple *stmt = gsi_stmt (*gsi_p); |
| switch (gimple_code (stmt)) |
| { |
| case GIMPLE_OMP_FOR: |
| /*TODO Given the current 'adjust_region_code' algorithm, this is |
| actually... */ |
| #if 0 |
| gcc_unreachable (); |
| #else |
| /* ..., but due to bugs (PR100400), we may actually come here. |
| Reliably catch this, regardless of checking level. */ |
| internal_error ("PR100400"); |
| #endif |
| |
| { |
| tree clauses = gimple_omp_for_clauses (stmt); |
| tree prev_clause = NULL; |
| for (tree clause = clauses; clause; clause = OMP_CLAUSE_CHAIN (clause)) |
| { |
| if (OMP_CLAUSE_CODE (clause) == OMP_CLAUSE_GANG) |
| { |
| /* It makes no sense to have a 'gang' clause in a "gang-single" |
| region, so warn and remove it. */ |
| warning_at (gimple_location (stmt), 0, |
| "conditionally executed loop in %<kernels%> region" |
| " will be executed by a single gang;" |
| " ignoring %<gang%> clause"); |
| if (prev_clause != NULL) |
| OMP_CLAUSE_CHAIN (prev_clause) = OMP_CLAUSE_CHAIN (clause); |
| else |
| clauses = OMP_CLAUSE_CHAIN (clause); |
| |
| break; |
| } |
| prev_clause = clause; |
| } |
| gimple_omp_for_set_clauses (stmt, clauses); |
| } |
| /* No need to recurse into nested statements; no loop nested inside |
| this loop can be gang-partitioned. */ |
| sorry ("%<gang%> loop in %<gang-single%> region"); |
| *handled_ops_p = true; |
| break; |
| |
| default: |
| break; |
| } |
| |
| return NULL; |
| } |
| |
| /* Visit all nested OpenACC loops in the sequence indicated by GS. This |
| statement is expected to be inside a gang-single region. Issue a warning |
| for any loops inside it that have gang clauses and remove the clauses. */ |
| |
| static void |
| make_loops_gang_single (gimple_seq gs) |
| { |
| struct walk_stmt_info wi; |
| memset (&wi, 0, sizeof (wi)); |
| walk_gimple_seq (gs, visit_loops_in_gang_single_region, NULL, &wi); |
| } |
| |
| /* Construct a "gang-single" compute construct at LOC containing the STMTS. |
| Annotate with CLAUSES, which must not contain a 'num_gangs' clause, and an |
| additional 'num_gangs (1)' clause to force "gang-single" execution. */ |
| |
| static gimple * |
| make_region_seq (location_t loc, gimple_seq stmts, |
| tree num_gangs_clause, |
| tree num_workers_clause, |
| tree vector_length_clause, |
| tree clauses) |
| { |
| /* This correctly unshares the entire clause chain rooted here. */ |
| clauses = unshare_expr (clauses); |
| |
| dump_user_location_t loc_stmts_first = gimple_seq_first (stmts); |
| |
| /* Figure out the region code for this region. */ |
| /* Optimistic default: assume "setup code", no looping; thus not |
| performance-critical. */ |
| int region_code = GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE; |
| adjust_region_code (stmts, ®ion_code); |
| |
| if (region_code == GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE) |
| { |
| if (dump_enabled_p ()) |
| /*TODO MSG_MISSED_OPTIMIZATION? */ |
| dump_printf_loc (MSG_NOTE, loc_stmts_first, |
| "beginning %<gang-single%> part" |
| " in OpenACC %<kernels%> region\n"); |
| |
| /* Synthesize a 'num_gangs (1)' clause. */ |
| tree gang_single_clause = build_omp_clause (loc, OMP_CLAUSE_NUM_GANGS); |
| OMP_CLAUSE_OPERAND (gang_single_clause, 0) = integer_one_node; |
| OMP_CLAUSE_CHAIN (gang_single_clause) = clauses; |
| clauses = gang_single_clause; |
| |
| /* Remove and issue warnings about gang clauses on any OpenACC |
| loops nested inside this sequentially executed statement. */ |
| make_loops_gang_single (stmts); |
| } |
| else if (region_code == GF_OMP_TARGET_KIND_OACC_KERNELS) |
| { |
| if (dump_enabled_p ()) |
| dump_printf_loc (MSG_NOTE, loc_stmts_first, |
| "beginning %<parloops%> part" |
| " in OpenACC %<kernels%> region\n"); |
| |
| /* As we're transforming a 'GF_OMP_TARGET_KIND_OACC_KERNELS' into another |
| 'GF_OMP_TARGET_KIND_OACC_KERNELS', this isn't doing any of the clauses |
| mangling that 'make_region_loop_nest' is doing. */ |
| /* Re-assemble the clauses stripped off earlier. */ |
| if (num_gangs_clause != NULL) |
| { |
| tree c = unshare_expr (num_gangs_clause); |
| OMP_CLAUSE_CHAIN (c) = clauses; |
| clauses = c; |
| } |
| if (num_workers_clause != NULL) |
| { |
| tree c = unshare_expr (num_workers_clause); |
| OMP_CLAUSE_CHAIN (c) = clauses; |
| clauses = c; |
| } |
| if (vector_length_clause != NULL) |
| { |
| tree c = unshare_expr (vector_length_clause); |
| OMP_CLAUSE_CHAIN (c) = clauses; |
| clauses = c; |
| } |
| } |
| else |
| gcc_unreachable (); |
| |
| /* Build the gang-single region. */ |
| gimple *single_region = gimple_build_omp_target (NULL, region_code, clauses); |
| gimple_set_location (single_region, loc); |
| gbind *single_body = gimple_build_bind (NULL, stmts, make_node (BLOCK)); |
| gimple_omp_set_body (single_region, single_body); |
| |
| return single_region; |
| } |
| |
| /* Helper function for make_region_loop_nest. Adds a 'num_gangs' |
| ('num_workers', 'vector_length') clause to the given CLAUSES, either the one |
| from the parent compute construct (PARENT_CLAUSE) or a new one based on the |
| loop's own LOOP_CLAUSE ('gang (num: N)' or similar for 'worker' or 'vector' |
| clauses) with the given CLAUSE_CODE. Does nothing if neither PARENT_CLAUSE |
| nor LOOP_CLAUSE exist. Returns the new clauses. */ |
| |
| static tree |
| add_parent_or_loop_num_clause (tree parent_clause, tree loop_clause, |
| omp_clause_code clause_code, tree clauses) |
| { |
| if (parent_clause != NULL) |
| { |
| tree num_clause = unshare_expr (parent_clause); |
| OMP_CLAUSE_CHAIN (num_clause) = clauses; |
| clauses = num_clause; |
| } |
| else if (loop_clause != NULL) |
| { |
| /* The kernels region does not have a 'num_gangs' clause, but the loop |
| itself had a 'gang (num: N)' clause. Honor it by adding a |
| 'num_gangs (N)' clause on the compute construct. */ |
| tree num = OMP_CLAUSE_OPERAND (loop_clause, 0); |
| tree new_num_clause |
| = build_omp_clause (OMP_CLAUSE_LOCATION (loop_clause), clause_code); |
| OMP_CLAUSE_OPERAND (new_num_clause, 0) = num; |
| OMP_CLAUSE_CHAIN (new_num_clause) = clauses; |
| clauses = new_num_clause; |
| } |
| return clauses; |
| } |
| |
| /* Helper for make_region_loop_nest, looking for 'worker (num: N)' or 'vector |
| (length: N)' clauses in nested loops. Removes the argument, transferring it |
| to the enclosing compute construct (via WI->INFO). If arguments within the |
| same loop nest conflict, emits a warning. |
| |
| This function also decides whether to add an 'auto' clause on each of these |
| nested loops. */ |
| |
| struct adjust_nested_loop_clauses_wi_info |
| { |
| tree *loop_gang_clause_ptr; |
| tree *loop_worker_clause_ptr; |
| tree *loop_vector_clause_ptr; |
| }; |
| |
| static tree |
| adjust_nested_loop_clauses (gimple_stmt_iterator *gsi_p, bool *, |
| struct walk_stmt_info *wi) |
| { |
| struct adjust_nested_loop_clauses_wi_info *wi_info |
| = (struct adjust_nested_loop_clauses_wi_info *) wi->info; |
| gimple *stmt = gsi_stmt (*gsi_p); |
| |
| if (gimple_code (stmt) == GIMPLE_OMP_FOR) |
| { |
| bool add_auto_clause = true; |
| tree loop_clauses = gimple_omp_for_clauses (stmt); |
| tree loop_clause = loop_clauses; |
| for (; loop_clause; loop_clause = OMP_CLAUSE_CHAIN (loop_clause)) |
| { |
| tree *outer_clause_ptr = NULL; |
| switch (OMP_CLAUSE_CODE (loop_clause)) |
| { |
| case OMP_CLAUSE_GANG: |
| outer_clause_ptr = wi_info->loop_gang_clause_ptr; |
| break; |
| case OMP_CLAUSE_WORKER: |
| outer_clause_ptr = wi_info->loop_worker_clause_ptr; |
| break; |
| case OMP_CLAUSE_VECTOR: |
| outer_clause_ptr = wi_info->loop_vector_clause_ptr; |
| break; |
| case OMP_CLAUSE_SEQ: |
| case OMP_CLAUSE_INDEPENDENT: |
| case OMP_CLAUSE_AUTO: |
| add_auto_clause = false; |
| default: |
| break; |
| } |
| if (outer_clause_ptr != NULL) |
| { |
| if (OMP_CLAUSE_OPERAND (loop_clause, 0) != NULL |
| && *outer_clause_ptr == NULL) |
| { |
| /* Transfer the clause to the enclosing compute construct and |
| remove the numerical argument from the 'loop'. */ |
| *outer_clause_ptr = unshare_expr (loop_clause); |
| OMP_CLAUSE_OPERAND (loop_clause, 0) = NULL; |
| } |
| else if (OMP_CLAUSE_OPERAND (loop_clause, 0) != NULL && |
| OMP_CLAUSE_OPERAND (*outer_clause_ptr, 0) != NULL) |
| { |
| /* See if both of these are the same constant. If they |
| aren't, emit a warning. */ |
| tree old_op = OMP_CLAUSE_OPERAND (*outer_clause_ptr, 0); |
| tree new_op = OMP_CLAUSE_OPERAND (loop_clause, 0); |
| if (!(cst_and_fits_in_hwi (old_op) && |
| cst_and_fits_in_hwi (new_op) && |
| int_cst_value (old_op) == int_cst_value (new_op))) |
| { |
| const char *clause_name |
| = omp_clause_code_name[OMP_CLAUSE_CODE (loop_clause)]; |
| error_at (gimple_location (stmt), |
| "cannot honor conflicting %qs clause", |
| clause_name); |
| inform (OMP_CLAUSE_LOCATION (*outer_clause_ptr), |
| "location of the previous clause" |
| " in the same loop nest"); |
| } |
| OMP_CLAUSE_OPERAND (loop_clause, 0) = NULL; |
| } |
| } |
| } |
| if (add_auto_clause) |
| { |
| tree auto_clause |
| = build_omp_clause (gimple_location (stmt), OMP_CLAUSE_AUTO); |
| OMP_CLAUSE_CHAIN (auto_clause) = loop_clauses; |
| gimple_omp_for_set_clauses (stmt, auto_clause); |
| } |
| } |
| |
| return NULL; |
| } |
| |
| /* Helper for make_region_loop_nest. Transform OpenACC 'kernels'/'loop' |
| construct clauses into OpenACC 'parallel'/'loop' construct ones. */ |
| |
| static tree |
| transform_kernels_loop_clauses (gimple *omp_for, |
| tree num_gangs_clause, |
| tree num_workers_clause, |
| tree vector_length_clause, |
| tree clauses) |
| { |
| /* If this loop in a kernels region does not have an explicit 'seq', |
| 'independent', or 'auto' clause, we must give it an explicit 'auto' |
| clause. |
| We also check for 'gang (num: N)' clauses. These must not appear in |
| kernels regions that have their own 'num_gangs' clause. Otherwise, they |
| must be converted and put on the region; similarly for 'worker' and |
| 'vector' clauses. */ |
| bool add_auto_clause = true; |
| tree loop_gang_clause = NULL, loop_worker_clause = NULL, |
| loop_vector_clause = NULL; |
| tree loop_clauses = gimple_omp_for_clauses (omp_for); |
| for (tree loop_clause = loop_clauses; |
| loop_clause; |
| loop_clause = OMP_CLAUSE_CHAIN (loop_clause)) |
| { |
| bool found_num_clause = false; |
| tree *clause_ptr, clause_to_check; |
| switch (OMP_CLAUSE_CODE (loop_clause)) |
| { |
| case OMP_CLAUSE_GANG: |
| found_num_clause = true; |
| clause_ptr = &loop_gang_clause; |
| clause_to_check = num_gangs_clause; |
| break; |
| case OMP_CLAUSE_WORKER: |
| found_num_clause = true; |
| clause_ptr = &loop_worker_clause; |
| clause_to_check = num_workers_clause; |
| break; |
| case OMP_CLAUSE_VECTOR: |
| found_num_clause = true; |
| clause_ptr = &loop_vector_clause; |
| clause_to_check = vector_length_clause; |
| break; |
| case OMP_CLAUSE_INDEPENDENT: |
| case OMP_CLAUSE_SEQ: |
| case OMP_CLAUSE_AUTO: |
| add_auto_clause = false; |
| default: |
| break; |
| } |
| if (found_num_clause && OMP_CLAUSE_OPERAND (loop_clause, 0) != NULL) |
| { |
| if (clause_to_check) |
| { |
| const char *clause_name |
| = omp_clause_code_name[OMP_CLAUSE_CODE (loop_clause)]; |
| const char *parent_clause_name |
| = omp_clause_code_name[OMP_CLAUSE_CODE (clause_to_check)]; |
| error_at (OMP_CLAUSE_LOCATION (loop_clause), |
| "argument not permitted on %qs clause" |
| " in OpenACC %<kernels%> region with a %qs clause", |
| clause_name, parent_clause_name); |
| inform (OMP_CLAUSE_LOCATION (clause_to_check), |
| "location of OpenACC %<kernels%>"); |
| } |
| /* Copy the 'gang (N)'/'worker (N)'/'vector (N)' clause to the |
| enclosing compute construct. */ |
| *clause_ptr = unshare_expr (loop_clause); |
| OMP_CLAUSE_CHAIN (*clause_ptr) = NULL; |
| /* Leave a 'gang'/'worker'/'vector' clause on the 'loop', but without |
| argument. */ |
| OMP_CLAUSE_OPERAND (loop_clause, 0) = NULL; |
| } |
| } |
| if (add_auto_clause) |
| { |
| tree auto_clause = build_omp_clause (gimple_location (omp_for), |
| OMP_CLAUSE_AUTO); |
| OMP_CLAUSE_CHAIN (auto_clause) = loop_clauses; |
| loop_clauses = auto_clause; |
| } |
| gimple_omp_for_set_clauses (omp_for, loop_clauses); |
| /* We must also recurse into the loop; it might contain nested loops having |
| their own 'worker (num: W)' or 'vector (length: V)' clauses. Turn these |
| into 'worker'/'vector' clauses on the compute construct. */ |
| struct walk_stmt_info wi; |
| memset (&wi, 0, sizeof (wi)); |
| struct adjust_nested_loop_clauses_wi_info wi_info; |
| wi_info.loop_gang_clause_ptr = &loop_gang_clause; |
| wi_info.loop_worker_clause_ptr = &loop_worker_clause; |
| wi_info.loop_vector_clause_ptr = &loop_vector_clause; |
| wi.info = &wi_info; |
| gimple *body = gimple_omp_body (omp_for); |
| walk_gimple_seq (body, adjust_nested_loop_clauses, NULL, &wi); |
| /* Check if there were conflicting numbers of workers or vector length. */ |
| if (loop_gang_clause != NULL && |
| OMP_CLAUSE_OPERAND (loop_gang_clause, 0) == NULL) |
| loop_gang_clause = NULL; |
| if (loop_worker_clause != NULL && |
| OMP_CLAUSE_OPERAND (loop_worker_clause, 0) == NULL) |
| loop_worker_clause = NULL; |
| if (loop_vector_clause != NULL && |
| OMP_CLAUSE_OPERAND (loop_vector_clause, 0) == NULL) |
| vector_length_clause = NULL; |
| |
| /* If the kernels region had 'num_gangs', 'num_worker', 'vector_length' |
| clauses, add these to this new compute construct. */ |
| clauses |
| = add_parent_or_loop_num_clause (num_gangs_clause, loop_gang_clause, |
| OMP_CLAUSE_NUM_GANGS, clauses); |
| clauses |
| = add_parent_or_loop_num_clause (num_workers_clause, loop_worker_clause, |
| OMP_CLAUSE_NUM_WORKERS, clauses); |
| clauses |
| = add_parent_or_loop_num_clause (vector_length_clause, loop_vector_clause, |
| OMP_CLAUSE_VECTOR_LENGTH, clauses); |
| |
| return clauses; |
| } |
| |
| /* Construct a possibly gang-parallel compute construct containing the STMT, |
| which must be identical to, or a bind containing, the loop OMP_FOR. |
| |
| The NUM_GANGS_CLAUSE, NUM_WORKERS_CLAUSE, and VECTOR_LENGTH_CLAUSE are |
| optional clauses from the original kernels region and must not be contained |
| in the other CLAUSES. The newly created compute construct is annotated with |
| the optional NUM_GANGS_CLAUSE as well as the other CLAUSES. If there is no |
| NUM_GANGS_CLAUSE but the loop has a 'gang (num: N)' clause, that is |
| converted to a 'num_gangs (N)' clause on the new compute construct, and |
| similarly for 'worker' and 'vector' clauses. |
| |
| The outermost loop gets an 'auto' clause unless there already is an |
| 'seq'/'independent'/'auto' clause. Nested loops inside OMP_FOR are treated |
| similarly by the adjust_nested_loop_clauses function. */ |
| |
| static gimple * |
| make_region_loop_nest (gimple *omp_for, gimple_seq stmts, |
| tree num_gangs_clause, |
| tree num_workers_clause, |
| tree vector_length_clause, |
| tree clauses) |
| { |
| /* This correctly unshares the entire clause chain rooted here. */ |
| clauses = unshare_expr (clauses); |
| |
| /* Figure out the region code for this region. */ |
| /* Optimistic default: assume that the loop nest is parallelizable |
| (essentially, no GIMPLE_OMP_FOR with (explicit or implicit) 'auto' clause, |
| and no un-annotated loops). */ |
| int region_code = GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED; |
| adjust_region_code (stmts, ®ion_code); |
| |
| if (region_code == GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED) |
| { |
| if (dump_enabled_p ()) |
| /* This is not MSG_OPTIMIZED_LOCATIONS, as we're just doing what the |
| user asked us to. */ |
| dump_printf_loc (MSG_NOTE, omp_for, |
| "parallelized loop nest" |
| " in OpenACC %<kernels%> region\n"); |
| |
| clauses = transform_kernels_loop_clauses (omp_for, |
| num_gangs_clause, |
| num_workers_clause, |
| vector_length_clause, |
| clauses); |
| } |
| else if (region_code == GF_OMP_TARGET_KIND_OACC_KERNELS) |
| { |
| if (dump_enabled_p ()) |
| dump_printf_loc (MSG_NOTE, omp_for, |
| "forwarded loop nest" |
| " in OpenACC %<kernels%> region" |
| " to %<parloops%> for analysis\n"); |
| |
| /* We're transforming one 'GF_OMP_TARGET_KIND_OACC_KERNELS' into another |
| 'GF_OMP_TARGET_KIND_OACC_KERNELS', so don't have to |
| 'transform_kernels_loop_clauses'. */ |
| /* Re-assemble the clauses stripped off earlier. */ |
| clauses |
| = add_parent_or_loop_num_clause (num_gangs_clause, NULL, |
| OMP_CLAUSE_NUM_GANGS, clauses); |
| clauses |
| = add_parent_or_loop_num_clause (num_workers_clause, NULL, |
| OMP_CLAUSE_NUM_WORKERS, clauses); |
| clauses |
| = add_parent_or_loop_num_clause (vector_length_clause, NULL, |
| OMP_CLAUSE_VECTOR_LENGTH, clauses); |
| } |
| else |
| gcc_unreachable (); |
| |
| gimple *parallel_body_bind |
| = gimple_build_bind (NULL, stmts, make_node (BLOCK)); |
| gimple *parallel_region |
| = gimple_build_omp_target (parallel_body_bind, region_code, clauses); |
| gimple_set_location (parallel_region, gimple_location (omp_for)); |
| |
| return parallel_region; |
| } |
| |
| /* Eliminate any binds directly inside BIND by adding their statements to |
| BIND (i.e., modifying it in place), excluding binds that hold only an |
| OMP_FOR loop and associated setup/cleanup code. Recurse into binds but |
| not other statements. Return a chain of the local variables of eliminated |
| binds, i.e., the local variables found in nested binds. If |
| INCLUDE_TOPLEVEL_VARS is true, this also includes the variables belonging |
| to BIND itself. */ |
| |
| static tree |
| flatten_binds (gbind *bind, bool include_toplevel_vars = false) |
| { |
| tree vars = NULL, last_var = NULL; |
| |
| if (include_toplevel_vars) |
| { |
| vars = gimple_bind_vars (bind); |
| last_var = vars; |
| } |
| |
| gimple_seq new_body = NULL; |
| gimple_seq body_sequence = gimple_bind_body (bind); |
| gimple_stmt_iterator gsi, gsi_n; |
| for (gsi = gsi_start (body_sequence); !gsi_end_p (gsi); gsi = gsi_n) |
| { |
| /* Advance the iterator here because otherwise it would be invalidated |
| by moving statements below. */ |
| gsi_n = gsi; |
| gsi_next (&gsi_n); |
| |
| gimple *stmt = gsi_stmt (gsi); |
| /* Flatten bind statements, except the ones that contain only an |
| OpenACC for loop. */ |
| if (gimple_code (stmt) == GIMPLE_BIND |
| && !top_level_omp_for_in_stmt (stmt)) |
| { |
| gbind *inner_bind = as_a <gbind *> (stmt); |
| /* Flatten recursively, and collect all variables. */ |
| tree inner_vars = flatten_binds (inner_bind, true); |
| gimple_seq inner_sequence = gimple_bind_body (inner_bind); |
| if (flag_checking) |
| { |
| for (gimple_stmt_iterator inner_gsi = gsi_start (inner_sequence); |
| !gsi_end_p (inner_gsi); |
| gsi_next (&inner_gsi)) |
| { |
| gimple *inner_stmt = gsi_stmt (inner_gsi); |
| gcc_assert (gimple_code (inner_stmt) != GIMPLE_BIND |
| || top_level_omp_for_in_stmt (inner_stmt)); |
| } |
| } |
| gimple_seq_add_seq (&new_body, inner_sequence); |
| /* Find the last variable; we will append others to it. */ |
| while (last_var != NULL && TREE_CHAIN (last_var) != NULL) |
| last_var = TREE_CHAIN (last_var); |
| if (last_var != NULL) |
| { |
| TREE_CHAIN (last_var) = inner_vars; |
| last_var = inner_vars; |
| } |
| else |
| { |
| vars = inner_vars; |
| last_var = vars; |
| } |
| } |
| else |
| gimple_seq_add_stmt (&new_body, stmt); |
| } |
| |
| /* Put the possibly transformed body back into the bind. */ |
| gimple_bind_set_body (bind, new_body); |
| return vars; |
| } |
| |
| /* Helper function for places where we construct data regions. Wraps the BODY |
| inside a try-finally construct at LOC that calls __builtin_GOACC_data_end |
| in its cleanup block. Returns this try statement. */ |
| |
| static gimple * |
| make_data_region_try_statement (location_t loc, gimple *body) |
| { |
| tree data_end_fn = builtin_decl_explicit (BUILT_IN_GOACC_DATA_END); |
| gimple *call = gimple_build_call (data_end_fn, 0); |
| gimple_seq cleanup = NULL; |
| gimple_seq_add_stmt (&cleanup, call); |
| gimple *try_stmt = gimple_build_try (body, cleanup, GIMPLE_TRY_FINALLY); |
| gimple_set_location (body, loc); |
| return try_stmt; |
| } |
| |
| /* If INNER_BIND_VARS holds variables, build an OpenACC data region with |
| location LOC containing BODY and having 'create (var)' clauses for each |
| variable (as a side effect, such variables also get TREE_ADDRESSABLE set). |
| If INNER_CLEANUP is present, add a try-finally statement with |
| this cleanup code in the finally block. Return the new data region, or |
| the original BODY if no data region was needed. */ |
| |
| static gimple * |
| maybe_build_inner_data_region (location_t loc, gimple *body, |
| tree inner_bind_vars, gimple *inner_cleanup) |
| { |
| /* Is this an instantiation of a template? (In this case, we don't care what |
| the generic decl is - just whether the function decl has one.) */ |
| bool generic_inst_p |
| = (lang_hooks.decls.get_generic_function_decl (current_function_decl) |
| != NULL); |
| |
| /* Build data 'create (var)' clauses for these local variables. |
| Below we will add these to a data region enclosing the entire body |
| of the decomposed kernels region. */ |
| tree prev_mapped_var = NULL, next = NULL, artificial_vars = NULL, |
| inner_data_clauses = NULL; |
| for (tree v = inner_bind_vars; v; v = next) |
| { |
| next = TREE_CHAIN (v); |
| if (DECL_ARTIFICIAL (v) |
| || TREE_CODE (v) == CONST_DECL |
| || generic_inst_p) |
| { |
| /* If this is an artificial temporary, it need not be mapped. We |
| move its declaration into the bind inside the data region. |
| Also avoid mapping variables if we are inside a template |
| instantiation; the code does not contain all the copies to |
| temporaries that would make this legal. */ |
| TREE_CHAIN (v) = artificial_vars; |
| artificial_vars = v; |
| if (prev_mapped_var != NULL) |
| TREE_CHAIN (prev_mapped_var) = next; |
| else |
| inner_bind_vars = next; |
| } |
| else |
| { |
| /* Otherwise, build the map clause. */ |
| tree new_clause = build_omp_clause (loc, OMP_CLAUSE_MAP); |
| OMP_CLAUSE_SET_MAP_KIND (new_clause, GOMP_MAP_ALLOC); |
| OMP_CLAUSE_DECL (new_clause) = v; |
| OMP_CLAUSE_SIZE (new_clause) = DECL_SIZE_UNIT (v); |
| OMP_CLAUSE_CHAIN (new_clause) = inner_data_clauses; |
| inner_data_clauses = new_clause; |
| |
| prev_mapped_var = v; |
| |
| /* See <https://gcc.gnu.org/PR100280>. */ |
| if (!TREE_ADDRESSABLE (v)) |
| { |
| /* Request that OMP lowering make 'v' addressable. */ |
| OMP_CLAUSE_MAP_DECL_MAKE_ADDRESSABLE (new_clause) = 1; |
| |
| if (dump_enabled_p ()) |
| { |
| const dump_user_location_t d_u_loc |
| = dump_user_location_t::from_location_t (loc); |
| /* PR100695 "Format decoder, quoting in 'dump_printf' etc." */ |
| #if __GNUC__ >= 10 |
| # pragma GCC diagnostic push |
| # pragma GCC diagnostic ignored "-Wformat" |
| #endif |
| dump_printf_loc (MSG_NOTE, d_u_loc, |
| "OpenACC %<kernels%> decomposition:" |
| " variable %<%T%> declared in block" |
| " requested to be made addressable\n", |
| v); |
| #if __GNUC__ >= 10 |
| # pragma GCC diagnostic pop |
| #endif |
| } |
| } |
| } |
| } |
| |
| if (artificial_vars) |
| body = gimple_build_bind (artificial_vars, body, make_node (BLOCK)); |
| |
| /* If we determined above that there are variables that need to be created |
| on the device, construct a data region for them and wrap the body |
| inside that. */ |
| if (inner_data_clauses != NULL) |
| { |
| gcc_assert (inner_bind_vars != NULL); |
| gimple *inner_data_region |
| = gimple_build_omp_target (NULL, GF_OMP_TARGET_KIND_OACC_DATA_KERNELS, |
| inner_data_clauses); |
| gimple_set_location (inner_data_region, loc); |
| /* Make sure __builtin_GOACC_data_end is called at the end. */ |
| gimple *try_stmt = make_data_region_try_statement (loc, body); |
| gimple_omp_set_body (inner_data_region, try_stmt); |
| gimple *bind_body; |
| if (inner_cleanup != NULL) |
| /* Clobber all the inner variables that need to be clobbered. */ |
| bind_body = gimple_build_try (inner_data_region, inner_cleanup, |
| GIMPLE_TRY_FINALLY); |
| else |
| bind_body = inner_data_region; |
| body = gimple_build_bind (inner_bind_vars, bind_body, make_node (BLOCK)); |
| } |
| |
| return body; |
| } |
| |
| static void |
| add_wait (location_t loc, gimple_seq *region_body) |
| { |
| /* A "#pragma acc wait" is just a call GOACC_wait (acc_async_sync, 0). */ |
| tree wait_fn = builtin_decl_explicit (BUILT_IN_GOACC_WAIT); |
| tree sync_arg = build_int_cst (integer_type_node, GOMP_ASYNC_SYNC); |
| gimple *wait_call = gimple_build_call (wait_fn, 2, |
| sync_arg, integer_zero_node); |
| gimple_set_location (wait_call, loc); |
| gimple_seq_add_stmt (region_body, wait_call); |
| } |
| |
| /* Helper function of decompose_kernels_region_body. The statements in |
| REGION_BODY are expected to be decomposed parts; add an 'async' clause to |
| each. Also add a 'wait' directive at the end of the sequence. */ |
| |
| static void |
| add_async_clauses_and_wait (location_t loc, gimple_seq *region_body) |
| { |
| tree default_async_queue |
| = build_int_cst (integer_type_node, GOMP_ASYNC_NOVAL); |
| for (gimple_stmt_iterator gsi = gsi_start (*region_body); |
| !gsi_end_p (gsi); |
| gsi_next (&gsi)) |
| { |
| gimple *stmt = gsi_stmt (gsi); |
| tree target_clauses = gimple_omp_target_clauses (stmt); |
| tree new_async_clause = build_omp_clause (loc, OMP_CLAUSE_ASYNC); |
| OMP_CLAUSE_OPERAND (new_async_clause, 0) = default_async_queue; |
| OMP_CLAUSE_CHAIN (new_async_clause) = target_clauses; |
| target_clauses = new_async_clause; |
| gimple_omp_target_set_clauses (as_a <gomp_target *> (stmt), |
| target_clauses); |
| } |
| add_wait (loc, region_body); |
| } |
| |
| /* Auxiliary analysis of the body of a kernels region, to determine for each |
| OpenACC loop whether it is control-dependent (i.e., not necessarily |
| executed every time the kernels region is entered) or not. |
| We say that a loop is control-dependent if there is some cond, switch, or |
| goto statement that jumps over it, forwards or backwards. For example, |
| if the loop is controlled by an if statement, then a jump to the true |
| block, the false block, or from one of those blocks to the control flow |
| join point will necessarily jump over the loop. |
| This analysis implements an ad-hoc union-find data structure classifying |
| statements into "control-flow regions" as follows: Most statements are in |
| the same region as their predecessor, except that each OpenACC loop is in |
| a region of its own, and each OpenACC loop's successor starts a new |
| region. We then unite the regions of any statements linked by jumps, |
| placing any cond, switch, or goto statement in the same region as its |
| target label(s). |
| In the end, control dependence of OpenACC loops can be determined by |
| comparing their immediate predecessor and successor statements' regions. |
| A jump crosses the loop if and only if the predecessor and successor are |
| in the same region. (If there is no predecessor or successor, the loop |
| is executed unconditionally.) |
| The methods in this class identify statements by their index in the |
| kernels region's body. */ |
| |
| class control_flow_regions |
| { |
| public: |
| /* Initialize an instance and pre-compute the control-flow region |
| information for the statement sequence SEQ. */ |
| control_flow_regions (gimple_seq seq); |
| |
| /* Return true if the statement with the given index IDX in the analyzed |
| statement sequence is an unconditionally executed OpenACC loop. */ |
| bool is_unconditional_oacc_for_loop (size_t idx); |
| |
| private: |
| /* Find the region representative for the statement identified by index |
| STMT_IDX. */ |
| size_t find_rep (size_t stmt_idx); |
| |
| /* Union the regions containing the statements represented by |
| representatives A and B. */ |
| void union_reps (size_t a, size_t b); |
| |
| /* Helper for the constructor. Performs the actual computation of the |
| control-flow regions in the statement sequence SEQ. */ |
| void compute_regions (gimple_seq seq); |
| |
| /* The mapping from statement indices to region representatives. */ |
| vec <size_t> representatives; |
| |
| /* A cache mapping statement indices to a flag indicating whether the |
| statement is a top level OpenACC for loop. */ |
| vec <bool> omp_for_loops; |
| }; |
| |
| control_flow_regions::control_flow_regions (gimple_seq seq) |
| { |
| representatives.create (1); |
| omp_for_loops.create (1); |
| compute_regions (seq); |
| } |
| |
| bool |
| control_flow_regions::is_unconditional_oacc_for_loop (size_t idx) |
| { |
| if (idx == 0 || idx == representatives.length () - 1) |
| /* The first or last statement in the kernels region. This means that |
| there is no room before or after it for a jump or a label. Thus |
| there cannot be a jump across it, so it is unconditional. */ |
| return true; |
| /* Otherwise, the loop is unconditional if the statements before and after |
| it are in different control flow regions. Scan forward and backward, |
| skipping over neighboring OpenACC for loops, to find these preceding |
| statements. */ |
| size_t prev_index = idx - 1; |
| while (prev_index > 0 && omp_for_loops [prev_index] == true) |
| prev_index--; |
| /* If all preceding statements are also OpenACC loops, all of these are |
| unconditional. */ |
| if (prev_index == 0) |
| return true; |
| size_t succ_index = idx + 1; |
| while (succ_index < omp_for_loops.length () |
| && omp_for_loops [succ_index] == true) |
| succ_index++; |
| /* If all following statements are also OpenACC loops, all of these are |
| unconditional. */ |
| if (succ_index == omp_for_loops.length ()) |
| return true; |
| return (find_rep (prev_index) != find_rep (succ_index)); |
| } |
| |
| size_t |
| control_flow_regions::find_rep (size_t stmt_idx) |
| { |
| size_t rep = stmt_idx, aux = stmt_idx; |
| /* Find the root representative of this statement. */ |
| while (representatives[rep] != rep) |
| rep = representatives[rep]; |
| /* Compress the path from the original statement to the representative. */ |
| while (representatives[aux] != rep) |
| { |
| size_t tmp = representatives[aux]; |
| representatives[aux] = rep; |
| aux = tmp; |
| } |
| return rep; |
| } |
| |
| void |
| control_flow_regions::union_reps (size_t a, size_t b) |
| { |
| a = find_rep (a); |
| b = find_rep (b); |
| representatives[b] = a; |
| } |
| |
| void |
| control_flow_regions::compute_regions (gimple_seq seq) |
| { |
| hash_map <gimple *, size_t> control_flow_reps; |
| hash_map <tree, size_t> label_reps; |
| size_t current_region = 0, idx = 0; |
| |
| /* In a first pass, assign an initial region to each statement. Except in |
| the case of OpenACC loops, each statement simply gets the same region |
| representative as its predecessor. */ |
| for (gimple_stmt_iterator gsi = gsi_start (seq); |
| !gsi_end_p (gsi); |
| gsi_next (&gsi)) |
| { |
| gimple *stmt = gsi_stmt (gsi); |
| gimple *omp_for = top_level_omp_for_in_stmt (stmt); |
| omp_for_loops.safe_push (omp_for != NULL); |
| if (omp_for != NULL) |
| { |
| /* Assign a new region to this loop and to its successor. */ |
| current_region = idx; |
| representatives.safe_push (current_region); |
| current_region++; |
| } |
| else |
| { |
| representatives.safe_push (current_region); |
| /* Remember any jumps and labels for the second pass below. */ |
| if (gimple_code (stmt) == GIMPLE_COND |
| || gimple_code (stmt) == GIMPLE_SWITCH |
| || gimple_code (stmt) == GIMPLE_GOTO) |
| control_flow_reps.put (stmt, current_region); |
| else if (gimple_code (stmt) == GIMPLE_LABEL) |
| label_reps.put (gimple_label_label (as_a <glabel *> (stmt)), |
| current_region); |
| } |
| idx++; |
| } |
| gcc_assert (representatives.length () == omp_for_loops.length ()); |
| |
| /* Revisit all the control flow statements and union the region of each |
| cond, switch, or goto statement with the target labels' regions. */ |
| for (hash_map <gimple *, size_t>::iterator it = control_flow_reps.begin (); |
| it != control_flow_reps.end (); |
| ++it) |
| { |
| gimple *stmt = (*it).first; |
| size_t stmt_rep = (*it).second; |
| switch (gimple_code (stmt)) |
| { |
| tree label; |
| unsigned int n; |
| |
| case GIMPLE_COND: |
| label = gimple_cond_true_label (as_a <gcond *> (stmt)); |
| union_reps (stmt_rep, *label_reps.get (label)); |
| label = gimple_cond_false_label (as_a <gcond *> (stmt)); |
| union_reps (stmt_rep, *label_reps.get (label)); |
| break; |
| |
| case GIMPLE_SWITCH: |
| n = gimple_switch_num_labels (as_a <gswitch *> (stmt)); |
| for (unsigned int i = 0; i < n; i++) |
| { |
| tree switch_case |
| = gimple_switch_label (as_a <gswitch *> (stmt), i); |
| label = CASE_LABEL (switch_case); |
| union_reps (stmt_rep, *label_reps.get (label)); |
| } |
| break; |
| |
| case GIMPLE_GOTO: |
| label = gimple_goto_dest (stmt); |
| union_reps (stmt_rep, *label_reps.get (label)); |
| break; |
| |
| default: |
| gcc_unreachable (); |
| } |
| } |
| } |
| |
| /* Decompose the body of the KERNELS_REGION, which was originally annotated |
| with the KERNELS_CLAUSES, into a series of compute constructs. */ |
| |
| static gimple * |
| decompose_kernels_region_body (gimple *kernels_region, tree kernels_clauses) |
| { |
| location_t loc = gimple_location (kernels_region); |
| |
| /* The kernels clauses will be propagated to the child clauses unmodified, |
| except that the 'num_gangs', 'num_workers', and 'vector_length' clauses |
| will only be added to loop regions. The other regions are "gang-single" |
| and get an explicit 'num_gangs (1)' clause. So separate out the |
| 'num_gangs', 'num_workers', and 'vector_length' clauses here. |
| Also check for the presence of an 'async' clause but do not remove it from |
| the 'kernels' clauses. */ |
| tree num_gangs_clause = NULL, num_workers_clause = NULL, |
| vector_length_clause = NULL; |
| tree async_clause = NULL; |
| tree prev_clause = NULL, next_clause = NULL; |
| tree parallel_clauses = kernels_clauses; |
| for (tree c = parallel_clauses; c; c = next_clause) |
| { |
| /* Preserve this here, as we might NULL it later. */ |
| next_clause = OMP_CLAUSE_CHAIN (c); |
| |
| if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_NUM_GANGS |
| || OMP_CLAUSE_CODE (c) == OMP_CLAUSE_NUM_WORKERS |
| || OMP_CLAUSE_CODE (c) == OMP_CLAUSE_VECTOR_LENGTH) |
| { |
| /* Cut this clause out of the chain. */ |
| if (prev_clause != NULL) |
| OMP_CLAUSE_CHAIN (prev_clause) = OMP_CLAUSE_CHAIN (c); |
| else |
| kernels_clauses = OMP_CLAUSE_CHAIN (c); |
| OMP_CLAUSE_CHAIN (c) = NULL; |
| switch (OMP_CLAUSE_CODE (c)) |
| { |
| case OMP_CLAUSE_NUM_GANGS: |
| num_gangs_clause = c; |
| break; |
| case OMP_CLAUSE_NUM_WORKERS: |
| num_workers_clause = c; |
| break; |
| case OMP_CLAUSE_VECTOR_LENGTH: |
| vector_length_clause = c; |
| break; |
| default: |
| gcc_unreachable (); |
| } |
| } |
| else |
| prev_clause = c; |
| if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_ASYNC) |
| async_clause = c; |
| } |
| |
| gimple *kernels_body = gimple_omp_body (kernels_region); |
| gbind *kernels_bind = as_a <gbind *> (kernels_body); |
| |
| /* The body of the region may contain other nested binds declaring inner |
| local variables. Collapse all these binds into one to ensure that we |
| have a single sequence of statements to iterate over; also, collect all |
| inner variables. */ |
| tree inner_bind_vars = flatten_binds (kernels_bind); |
| gimple_seq body_sequence = gimple_bind_body (kernels_bind); |
| |
| /* All these inner variables will get allocated on the device (below, by |
| calling maybe_build_inner_data_region). Here we create 'present' |
| clauses for them and add these clauses to the list of clauses to be |
| attached to each inner compute construct. */ |
| tree present_clauses = kernels_clauses; |
| for (tree var = inner_bind_vars; var; var = TREE_CHAIN (var)) |
| { |
| if (!DECL_ARTIFICIAL (var) && TREE_CODE (var) != CONST_DECL) |
| { |
| tree present_clause = build_omp_clause (loc, OMP_CLAUSE_MAP); |
| OMP_CLAUSE_SET_MAP_KIND (present_clause, GOMP_MAP_FORCE_PRESENT); |
| OMP_CLAUSE_DECL (present_clause) = var; |
| OMP_CLAUSE_SIZE (present_clause) = DECL_SIZE_UNIT (var); |
| OMP_CLAUSE_CHAIN (present_clause) = present_clauses; |
| present_clauses = present_clause; |
| } |
| } |
| kernels_clauses = present_clauses; |
| |
| /* In addition to nested binds, the "real" body of the region may be |
| nested inside a try-finally block. Find its cleanup block, which |
| contains code to clobber the local variables that must be clobbered. */ |
| gimple *inner_cleanup = NULL; |
| if (body_sequence != NULL && gimple_code (body_sequence) == GIMPLE_TRY) |
| { |
| if (gimple_seq_singleton_p (body_sequence)) |
| { |
| /* The try statement is the only thing inside the bind. */ |
| inner_cleanup = gimple_try_cleanup (body_sequence); |
| body_sequence = gimple_try_eval (body_sequence); |
| } |
| else |
| { |
| /* The bind's body starts with a try statement, but it is followed |
| by other things. */ |
| gimple_stmt_iterator gsi = gsi_start (body_sequence); |
| gimple *try_stmt = gsi_stmt (gsi); |
| inner_cleanup = gimple_try_cleanup (try_stmt); |
| gimple *try_body = gimple_try_eval (try_stmt); |
| |
| gsi_remove (&gsi, false); |
| /* Now gsi indicates the sequence of statements after the try |
| statement in the bind. Append the statement in the try body and |
| the trailing statements from gsi. */ |
| gsi_insert_seq_before (&gsi, try_body, GSI_CONTINUE_LINKING); |
| body_sequence = gsi_stmt (gsi); |
| } |
| } |
| |
| /* This sequence will collect all the top-level statements in the body of |
| the data region we are about to construct. */ |
| gimple_seq region_body = NULL; |
| /* This sequence will collect consecutive statements to be put into a |
| gang-single region. */ |
| gimple_seq gang_single_seq = NULL; |
| /* Flag recording whether the gang_single_seq only contains copies to |
| local variables. These may be loop setup code that should not be |
| separated from the loop. */ |
| bool only_simple_assignments = true; |
| |
| /* Precompute the control flow region information to determine whether an |
| OpenACC loop is executed conditionally or unconditionally. */ |
| control_flow_regions cf_regions (body_sequence); |
| |
| /* Iterate over the statements in the kernels region's body. */ |
| size_t idx = 0; |
| gimple_stmt_iterator gsi, gsi_n; |
| for (gsi = gsi_start (body_sequence); !gsi_end_p (gsi); gsi = gsi_n, idx++) |
| { |
| /* Advance the iterator here because otherwise it would be invalidated |
| by moving statements below. */ |
| gsi_n = gsi; |
| gsi_next (&gsi_n); |
| |
| gimple *stmt = gsi_stmt (gsi); |
| if (gimple_code (stmt) == GIMPLE_DEBUG) |
| { |
| if (flag_compare_debug_opt || flag_compare_debug) |
| /* Let the usual '-fcompare-debug' analysis bail out, as |
| necessary. */ |
| ; |
| else |
| sorry_at (loc, "%qs not yet supported", |
| gimple_code_name[gimple_code (stmt)]); |
| } |
| gimple *omp_for = top_level_omp_for_in_stmt (stmt); |
| bool is_unconditional_oacc_for_loop = false; |
| if (omp_for != NULL) |
| is_unconditional_oacc_for_loop |
| = cf_regions.is_unconditional_oacc_for_loop (idx); |
| if (omp_for != NULL |
| && is_unconditional_oacc_for_loop) |
| { |
| /* This is an OMP for statement, put it into a separate region. |
| But first, construct a gang-single region containing any |
| complex sequential statements we may have seen. */ |
| if (gang_single_seq != NULL && !only_simple_assignments) |
| { |
| gimple *single_region |
| = make_region_seq (loc, gang_single_seq, |
| num_gangs_clause, |
| num_workers_clause, |
| vector_length_clause, |
| kernels_clauses); |
| gimple_seq_add_stmt (®ion_body, single_region); |
| } |
| else if (gang_single_seq != NULL && only_simple_assignments) |
| { |
| /* There is a sequence of sequential statements preceding this |
| loop, but they are all simple assignments. This is |
| probably setup code for the loop; in particular, Fortran DO |
| loops are preceded by code to copy the loop limit variable |
| to a temporary. Group this code together with the loop |
| itself. */ |
| gimple_seq_add_stmt (&gang_single_seq, stmt); |
| stmt = gimple_build_bind (NULL, gang_single_seq, |
| make_node (BLOCK)); |
| } |
| gang_single_seq = NULL; |
| only_simple_assignments = true; |
| |
| gimple_seq parallel_seq = NULL; |
| gimple_seq_add_stmt (¶llel_seq, stmt); |
| gimple *parallel_region |
| = make_region_loop_nest (omp_for, parallel_seq, |
| num_gangs_clause, |
| num_workers_clause, |
| vector_length_clause, |
| kernels_clauses); |
| gimple_seq_add_stmt (®ion_body, parallel_region); |
| } |
| else |
| { |
| if (omp_for != NULL) |
| { |
| gcc_checking_assert (!is_unconditional_oacc_for_loop); |
| if (dump_enabled_p ()) |
| dump_printf_loc (MSG_MISSED_OPTIMIZATION, omp_for, |
| "unparallelized loop nest" |
| " in OpenACC %<kernels%> region:" |
| " it's executed conditionally\n"); |
| } |
| |
| /* This is not an unconditional OMP for statement, so it will be |
| put into a gang-single region. */ |
| gimple_seq_add_stmt (&gang_single_seq, stmt); |
| /* Is this a simple assignment? We call it simple if it is an |
| assignment to an artificial local variable. This captures |
| Fortran loop setup code computing loop bounds and offsets. */ |
| bool is_simple_assignment |
| = (gimple_code (stmt) == GIMPLE_ASSIGN |
| && TREE_CODE (gimple_assign_lhs (stmt)) == VAR_DECL |
| && DECL_ARTIFICIAL (gimple_assign_lhs (stmt))); |
| if (!is_simple_assignment) |
| only_simple_assignments = false; |
| } |
| } |
| |
| /* If we did not emit a new region, and are not going to emit one now |
| (that is, the original region was empty), prepare to emit a dummy so as |
| to preserve the original construct, which other processing (at least |
| test cases) depend on. */ |
| if (region_body == NULL && gang_single_seq == NULL) |
| { |
| gimple *stmt = gimple_build_nop (); |
| gimple_set_location (stmt, loc); |
| gimple_seq_add_stmt (&gang_single_seq, stmt); |
| } |
| |
| /* Gather up any remaining gang-single statements. */ |
| if (gang_single_seq != NULL) |
| { |
| gimple *single_region |
| = make_region_seq (loc, gang_single_seq, |
| num_gangs_clause, |
| num_workers_clause, |
| vector_length_clause, |
| kernels_clauses); |
| gimple_seq_add_stmt (®ion_body, single_region); |
| } |
| |
| /* We want to launch these kernels asynchronously. If the original |
| kernels region had an async clause, this is done automatically because |
| that async clause was copied to the individual regions we created. |
| Otherwise, add an async clause to each newly created region, as well as |
| a wait directive at the end. */ |
| if (async_clause == NULL) |
| add_async_clauses_and_wait (loc, ®ion_body); |
| else |
| /* !!! If we have asynchronous parallel blocks inside a (synchronous) data |
| region, then target memory will get unmapped at the point the data |
| region ends, even if the inner asynchronous parallels have not yet |
| completed. For kernels marked "async", we might want to use "enter data |
| async(...)" and "exit data async(...)" instead, or asynchronous data |
| regions (see also <https://gcc.gnu.org/PR97390> |
| "[OpenACC] 'async' clause on 'data' construct", |
| which is to share the same implementation). |
| For now, insert a (synchronous) wait at the end of the block. */ |
| add_wait (loc, ®ion_body); |
| |
| tree kernels_locals = gimple_bind_vars (as_a <gbind *> (kernels_body)); |
| gimple *body = gimple_build_bind (kernels_locals, region_body, |
| make_node (BLOCK)); |
| |
| /* If we found variables declared in nested scopes, build a data region to |
| map them to the device. */ |
| body = maybe_build_inner_data_region (loc, body, inner_bind_vars, |
| inner_cleanup); |
| |
| return body; |
| } |
| |
| /* Decompose one OpenACC 'kernels' construct into an OpenACC 'data' construct |
| containing the original OpenACC 'kernels' construct's region cut up into a |
| sequence of compute constructs. */ |
| |
| static gimple * |
| omp_oacc_kernels_decompose_1 (gimple *kernels_stmt) |
| { |
| gcc_checking_assert (gimple_omp_target_kind (kernels_stmt) |
| == GF_OMP_TARGET_KIND_OACC_KERNELS); |
| location_t loc = gimple_location (kernels_stmt); |
| |
| /* Collect the data clauses of the OpenACC 'kernels' directive and create a |
| new OpenACC 'data' construct with those clauses. */ |
| tree kernels_clauses = gimple_omp_target_clauses (kernels_stmt); |
| tree data_clauses = NULL; |
| for (tree c = kernels_clauses; c; c = OMP_CLAUSE_CHAIN (c)) |
| { |
| /* Certain clauses are copied to the enclosing OpenACC 'data'. Other |
| clauses remain on the OpenACC 'kernels'. */ |
| if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_MAP) |
| { |
| tree decl = OMP_CLAUSE_DECL (c); |
| HOST_WIDE_INT map_kind = OMP_CLAUSE_MAP_KIND (c); |
| switch (map_kind) |
| { |
| default: |
| if (map_kind == GOMP_MAP_ALLOC |
| && integer_zerop (OMP_CLAUSE_SIZE (c))) |
| /* ??? This is an alloc clause for mapping a pointer whose |
| target is already mapped. We leave these on the inner |
| compute constructs because moving them to the outer data |
| region causes runtime errors. */ |
| break; |
| |
| /* For non-artificial variables, and for non-declaration |
| expressions like A[0:n], copy the clause to the data |
| region. */ |
| if ((DECL_P (decl) && !DECL_ARTIFICIAL (decl)) |
| || !DECL_P (decl)) |
| { |
| tree new_clause = build_omp_clause (OMP_CLAUSE_LOCATION (c), |
| OMP_CLAUSE_MAP); |
| OMP_CLAUSE_SET_MAP_KIND (new_clause, map_kind); |
| /* This must be unshared here to avoid "incorrect sharing |
| of tree nodes" errors from verify_gimple. */ |
| OMP_CLAUSE_DECL (new_clause) = unshare_expr (decl); |
| OMP_CLAUSE_SIZE (new_clause) = OMP_CLAUSE_SIZE (c); |
| OMP_CLAUSE_CHAIN (new_clause) = data_clauses; |
| data_clauses = new_clause; |
| |
| /* Now that this data is mapped, turn the data clause on the |
| inner OpenACC 'kernels' into a 'present' clause. */ |
| OMP_CLAUSE_SET_MAP_KIND (c, GOMP_MAP_FORCE_PRESENT); |
| |
| /* See <https://gcc.gnu.org/PR100280>, |
| <https://gcc.gnu.org/PR104086>. */ |
| if (DECL_P (decl) |
| && !TREE_ADDRESSABLE (decl)) |
| { |
| /* Request that OMP lowering make 'decl' addressable. */ |
| OMP_CLAUSE_MAP_DECL_MAKE_ADDRESSABLE (new_clause) = 1; |
| |
| if (dump_enabled_p ()) |
| { |
| location_t loc = OMP_CLAUSE_LOCATION (new_clause); |
| const dump_user_location_t d_u_loc |
| = dump_user_location_t::from_location_t (loc); |
| /* PR100695 "Format decoder, quoting in 'dump_printf' |
| etc." */ |
| #if __GNUC__ >= 10 |
| # pragma GCC diagnostic push |
| # pragma GCC diagnostic ignored "-Wformat" |
| #endif |
| dump_printf_loc |
| (MSG_NOTE, d_u_loc, |
| "OpenACC %<kernels%> decomposition:" |
| " variable %<%T%> in %qs clause" |
| " requested to be made addressable\n", |
| decl, |
| user_omp_clause_code_name (new_clause, true)); |
| #if __GNUC__ >= 10 |
| # pragma GCC diagnostic pop |
| #endif |
| } |
| } |
| } |
| break; |
| |
| case GOMP_MAP_POINTER: |
| case GOMP_MAP_TO_PSET: |
| case GOMP_MAP_FIRSTPRIVATE_POINTER: |
| case GOMP_MAP_FIRSTPRIVATE_REFERENCE: |
| /* ??? Copying these map kinds leads to internal compiler |
| errors in later passes. */ |
| break; |
| } |
| } |
| else if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_IF) |
| { |
| /* If there is an 'if' clause, it must be duplicated to the |
| enclosing data region. Temporarily remove the if clause's |
| chain to avoid copying it. */ |
| tree saved_chain = OMP_CLAUSE_CHAIN (c); |
| OMP_CLAUSE_CHAIN (c) = NULL; |
| tree new_if_clause = unshare_expr (c); |
| OMP_CLAUSE_CHAIN (c) = saved_chain; |
| OMP_CLAUSE_CHAIN (new_if_clause) = data_clauses; |
| data_clauses = new_if_clause; |
| } |
| } |
| /* Restore the original order of the clauses. */ |
| data_clauses = nreverse (data_clauses); |
| |
| gimple *data_region |
| = gimple_build_omp_target (NULL, GF_OMP_TARGET_KIND_OACC_DATA_KERNELS, |
| data_clauses); |
| gimple_set_location (data_region, loc); |
| |
| /* Transform the body of the kernels region into a sequence of compute |
| constructs. */ |
| gimple *body = decompose_kernels_region_body (kernels_stmt, |
| kernels_clauses); |
| |
| /* Put the transformed pieces together. The entire body of the region is |
| wrapped in a try-finally statement that calls __builtin_GOACC_data_end |
| for cleanup. */ |
| gimple *try_stmt = make_data_region_try_statement (loc, body); |
| gimple_omp_set_body (data_region, try_stmt); |
| |
| return data_region; |
| } |
| |
| |
| /* Decompose OpenACC 'kernels' constructs in the current function. */ |
| |
| static tree |
| omp_oacc_kernels_decompose_callback_stmt (gimple_stmt_iterator *gsi_p, |
| bool *handled_ops_p, |
| struct walk_stmt_info *) |
| { |
| gimple *stmt = gsi_stmt (*gsi_p); |
| |
| if ((gimple_code (stmt) == GIMPLE_OMP_TARGET) |
| && gimple_omp_target_kind (stmt) == GF_OMP_TARGET_KIND_OACC_KERNELS) |
| { |
| gimple *stmt_new = omp_oacc_kernels_decompose_1 (stmt); |
| gsi_replace (gsi_p, stmt_new, false); |
| *handled_ops_p = true; |
| } |
| else |
| *handled_ops_p = false; |
| |
| return NULL; |
| } |
| |
| static unsigned int |
| omp_oacc_kernels_decompose (void) |
| { |
| gimple_seq body = gimple_body (current_function_decl); |
| |
| struct walk_stmt_info wi; |
| memset (&wi, 0, sizeof (wi)); |
| walk_gimple_seq_mod (&body, omp_oacc_kernels_decompose_callback_stmt, NULL, |
| &wi); |
| |
| gimple_set_body (current_function_decl, body); |
| |
| return 0; |
| } |
| |
| |
| namespace { |
| |
| const pass_data pass_data_omp_oacc_kernels_decompose = |
| { |
| GIMPLE_PASS, /* type */ |
| "omp_oacc_kernels_decompose", /* name */ |
| OPTGROUP_OMP, /* optinfo_flags */ |
| TV_NONE, /* tv_id */ |
| PROP_gimple_any, /* properties_required */ |
| 0, /* properties_provided */ |
| 0, /* properties_destroyed */ |
| 0, /* todo_flags_start */ |
| 0, /* todo_flags_finish */ |
| }; |
| |
| class pass_omp_oacc_kernels_decompose : public gimple_opt_pass |
| { |
| public: |
| pass_omp_oacc_kernels_decompose (gcc::context *ctxt) |
| : gimple_opt_pass (pass_data_omp_oacc_kernels_decompose, ctxt) |
| {} |
| |
| /* opt_pass methods: */ |
| bool gate (function *) final override |
| { |
| return (flag_openacc |
| && param_openacc_kernels == OPENACC_KERNELS_DECOMPOSE); |
| } |
| unsigned int execute (function *) final override |
| { |
| return omp_oacc_kernels_decompose (); |
| } |
| |
| }; // class pass_omp_oacc_kernels_decompose |
| |
| } // anon namespace |
| |
| gimple_opt_pass * |
| make_pass_omp_oacc_kernels_decompose (gcc::context *ctxt) |
| { |
| return new pass_omp_oacc_kernels_decompose (ctxt); |
| } |