/* Loop unroll-and-jam. | |

Copyright (C) 2017-2020 Free Software Foundation, Inc. | |

This file is part of GCC. | |

GCC is free software; you can redistribute it and/or modify it | |

under the terms of the GNU General Public License as published by the | |

Free Software Foundation; either version 3, or (at your option) any | |

later version. | |

GCC is distributed in the hope that it will be useful, but WITHOUT | |

ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | |

FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License | |

for more details. | |

You should have received a copy of the GNU General Public License | |

along with GCC; see the file COPYING3. If not see | |

<http://www.gnu.org/licenses/>. */ | |

#include "config.h" | |

#include "system.h" | |

#include "coretypes.h" | |

#include "tree-pass.h" | |

#include "backend.h" | |

#include "tree.h" | |

#include "gimple.h" | |

#include "ssa.h" | |

#include "fold-const.h" | |

#include "tree-cfg.h" | |

#include "tree-ssa.h" | |

#include "tree-ssa-loop-niter.h" | |

#include "tree-ssa-loop.h" | |

#include "tree-ssa-loop-manip.h" | |

#include "cfgloop.h" | |

#include "tree-scalar-evolution.h" | |

#include "gimple-iterator.h" | |

#include "cfghooks.h" | |

#include "tree-data-ref.h" | |

#include "tree-ssa-loop-ivopts.h" | |

#include "tree-vectorizer.h" | |

/* Unroll and Jam transformation | |

This is a combination of two transformations, where the second | |

is not always valid. It's applicable if a loop nest has redundancies | |

over the iterations of an outer loop while not having that with | |

an inner loop. | |

Given this nest: | |

for (i) { | |

for (j) { | |

B(i,j) | |

} | |

} | |

first unroll: | |

for (i by 2) { | |

for (j) { | |

B(i,j) | |

} | |

for (j) { | |

B(i+1,j) | |

} | |

} | |

then fuse the two adjacent inner loops resulting from that: | |

for (i by 2) { | |

for (j) { | |

B(i,j) | |

B(i+1,j) | |

} | |

} | |

As the order of evaluations of the body B changes this is valid | |

only in certain situations: all distance vectors need to be forward. | |

Additionally if there are multiple induction variables than just | |

a counting control IV (j above) we can also deal with some situations. | |

The validity is checked by unroll_jam_possible_p, and the data-dep | |

testing below. | |

A trivial example where the fusion is wrong would be when | |

B(i,j) == x[j-1] = x[j]; | |

for (i by 2) { | |

for (j) { | |

x[j-1] = x[j]; | |

} | |

for (j) { | |

x[j-1] = x[j]; | |

} | |

} effect: move content to front by two elements | |

--> | |

for (i by 2) { | |

for (j) { | |

x[j-1] = x[j]; | |

x[j-1] = x[j]; | |

} | |

} effect: move content to front by one element | |

*/ | |

/* Modify the loop tree for the fact that all code once belonging | |

to the OLD loop or the outer loop of OLD now is inside LOOP. */ | |

static void | |

merge_loop_tree (class loop *loop, class loop *old) | |

{ | |

basic_block *bbs; | |

int i, n; | |

class loop *subloop; | |

edge e; | |

edge_iterator ei; | |

/* Find its nodes. */ | |

bbs = XNEWVEC (basic_block, n_basic_blocks_for_fn (cfun)); | |

n = get_loop_body_with_size (loop, bbs, n_basic_blocks_for_fn (cfun)); | |

for (i = 0; i < n; i++) | |

{ | |

/* If the block was direct child of OLD loop it's now part | |

of LOOP. If it was outside OLD, then it moved into LOOP | |

as well. This avoids changing the loop father for BBs | |

in inner loops of OLD. */ | |

if (bbs[i]->loop_father == old | |

|| loop_depth (bbs[i]->loop_father) < loop_depth (old)) | |

{ | |

remove_bb_from_loops (bbs[i]); | |

add_bb_to_loop (bbs[i], loop); | |

continue; | |

} | |

/* If we find a direct subloop of OLD, move it to LOOP. */ | |

subloop = bbs[i]->loop_father; | |

if (loop_outer (subloop) == old && subloop->header == bbs[i]) | |

{ | |

flow_loop_tree_node_remove (subloop); | |

flow_loop_tree_node_add (loop, subloop); | |

} | |

} | |

/* Update the information about loop exit edges. */ | |

for (i = 0; i < n; i++) | |

{ | |

FOR_EACH_EDGE (e, ei, bbs[i]->succs) | |

{ | |

rescan_loop_exit (e, false, false); | |

} | |

} | |

loop->num_nodes = n; | |

free (bbs); | |

} | |

/* BB is part of the outer loop of an unroll-and-jam situation. | |

Check if any statements therein would prevent the transformation. */ | |

static bool | |

bb_prevents_fusion_p (basic_block bb) | |

{ | |

gimple_stmt_iterator gsi; | |

/* BB is duplicated by outer unrolling and then all N-1 first copies | |

move into the body of the fused inner loop. If BB exits the outer loop | |

the last copy still does so, and the first N-1 copies are cancelled | |

by loop unrolling, so also after fusion it's the exit block. | |

But there might be other reasons that prevent fusion: | |

* stores or unknown side-effects prevent fusion | |

* loads don't | |

* computations into SSA names: these aren't problematic. Their | |

result will be unused on the exit edges of the first N-1 copies | |

(those aren't taken after unrolling). If they are used on the | |

other edge (the one leading to the outer latch block) they are | |

loop-carried (on the outer loop) and the Nth copy of BB will | |

compute them again (i.e. the first N-1 copies will be dead). */ | |

for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi)) | |

{ | |

gimple *g = gsi_stmt (gsi); | |

if (gimple_vdef (g) || gimple_has_side_effects (g)) | |

return true; | |

} | |

return false; | |

} | |

/* Given an inner loop LOOP (of some OUTER loop) determine if | |

we can safely fuse copies of it (generated by outer unrolling). | |

If so return true, otherwise return false. */ | |

static bool | |

unroll_jam_possible_p (class loop *outer, class loop *loop) | |

{ | |

basic_block *bbs; | |

int i, n; | |

class tree_niter_desc niter; | |

/* When fusing the loops we skip the latch block | |

of the first one, so it mustn't have any effects to | |

preserve. */ | |

if (!empty_block_p (loop->latch)) | |

return false; | |

if (!single_exit (loop)) | |

return false; | |

/* We need a perfect nest. Quick check for adjacent inner loops. */ | |

if (outer->inner != loop || loop->next) | |

return false; | |

/* Prevent head-controlled inner loops, that we usually have. | |

The guard block would need to be accepted | |

(invariant condition either entering or skipping the loop), | |

without also accepting arbitrary control flow. When unswitching | |

ran before us (as with -O3) this won't be a problem because its | |

outer loop unswitching will have moved out the invariant condition. | |

If we do that we need to extend fuse_loops() to cope with this | |

by threading through the (still invariant) copied condition | |

between the two loop copies. */ | |

if (!dominated_by_p (CDI_DOMINATORS, outer->latch, loop->header)) | |

return false; | |

/* The number of iterations of the inner loop must be loop invariant | |

with respect to the outer loop. */ | |

if (!number_of_iterations_exit (loop, single_exit (loop), &niter, | |

false, true) | |

|| niter.cmp == ERROR_MARK | |

|| !integer_zerop (niter.may_be_zero) | |

|| !expr_invariant_in_loop_p (outer, niter.niter)) | |

return false; | |

/* If the inner loop produces any values that are used inside the | |

outer loop (except the virtual op) then it can flow | |

back (perhaps indirectly) into the inner loop. This prevents | |

fusion: without fusion the value at the last iteration is used, | |

with fusion the value after the initial iteration is used. | |

If all uses are outside the outer loop this doesn't prevent fusion; | |

the value of the last iteration is still used (and the values from | |

all intermediate iterations are dead). */ | |

gphi_iterator psi; | |

for (psi = gsi_start_phis (single_exit (loop)->dest); | |

!gsi_end_p (psi); gsi_next (&psi)) | |

{ | |

imm_use_iterator imm_iter; | |

use_operand_p use_p; | |

tree op = gimple_phi_result (psi.phi ()); | |

if (virtual_operand_p (op)) | |

continue; | |

FOR_EACH_IMM_USE_FAST (use_p, imm_iter, op) | |

{ | |

gimple *use_stmt = USE_STMT (use_p); | |

if (!is_gimple_debug (use_stmt) | |

&& flow_bb_inside_loop_p (outer, gimple_bb (use_stmt))) | |

return false; | |

} | |

} | |

/* And check blocks belonging to just outer loop. */ | |

bbs = XNEWVEC (basic_block, n_basic_blocks_for_fn (cfun)); | |

n = get_loop_body_with_size (outer, bbs, n_basic_blocks_for_fn (cfun)); | |

for (i = 0; i < n; i++) | |

if (bbs[i]->loop_father == outer && bb_prevents_fusion_p (bbs[i])) | |

break; | |

free (bbs); | |

if (i != n) | |

return false; | |

/* For now we can safely fuse copies of LOOP only if all | |

loop carried variables are inductions (or the virtual op). | |

We could handle reductions as well (the initial value in the second | |

body would be the after-iter value of the first body) if it's over | |

an associative and commutative operation. We wouldn't | |

be able to handle unknown cycles. */ | |

for (psi = gsi_start_phis (loop->header); !gsi_end_p (psi); gsi_next (&psi)) | |

{ | |

affine_iv iv; | |

tree op = gimple_phi_result (psi.phi ()); | |

if (virtual_operand_p (op)) | |

continue; | |

if (!simple_iv (loop, loop, op, &iv, true)) | |

return false; | |

/* The inductions must be regular, loop invariant step and initial | |

value. */ | |

if (!expr_invariant_in_loop_p (outer, iv.step) | |

|| !expr_invariant_in_loop_p (outer, iv.base)) | |

return false; | |

/* XXX With more effort we could also be able to deal with inductions | |

where the initial value is loop variant but a simple IV in the | |

outer loop. The initial value for the second body would be | |

the original initial value plus iv.base.step. The next value | |

for the fused loop would be the original next value of the first | |

copy, _not_ the next value of the second body. */ | |

} | |

return true; | |

} | |

/* Fuse LOOP with all further neighbors. The loops are expected to | |

be in appropriate form. */ | |

static void | |

fuse_loops (class loop *loop) | |

{ | |

class loop *next = loop->next; | |

while (next) | |

{ | |

edge e; | |

remove_branch (single_pred_edge (loop->latch)); | |

/* Make delete_basic_block not fiddle with the loop structure. */ | |

basic_block oldlatch = loop->latch; | |

loop->latch = NULL; | |

delete_basic_block (oldlatch); | |

e = redirect_edge_and_branch (loop_latch_edge (next), | |

loop->header); | |

loop->latch = e->src; | |

flush_pending_stmts (e); | |

gcc_assert (EDGE_COUNT (next->header->preds) == 1); | |

/* The PHI nodes of the second body (single-argument now) | |

need adjustments to use the right values: either directly | |

the value of the corresponding PHI in the first copy or | |

the one leaving the first body which unrolling did for us. | |

See also unroll_jam_possible_p() for further possibilities. */ | |

gphi_iterator psi_first, psi_second; | |

e = single_pred_edge (next->header); | |

for (psi_first = gsi_start_phis (loop->header), | |

psi_second = gsi_start_phis (next->header); | |

!gsi_end_p (psi_first); | |

gsi_next (&psi_first), gsi_next (&psi_second)) | |

{ | |

gphi *phi_first = psi_first.phi (); | |

gphi *phi_second = psi_second.phi (); | |

tree firstop = gimple_phi_result (phi_first); | |

/* The virtual operand is correct already as it's | |

always live at exit, hence has a LCSSA node and outer | |

loop unrolling updated SSA form. */ | |

if (virtual_operand_p (firstop)) | |

continue; | |

/* Due to unroll_jam_possible_p() we know that this is | |

an induction. The second body goes over the same | |

iteration space. */ | |

add_phi_arg (phi_second, firstop, e, | |

gimple_location (phi_first)); | |

} | |

gcc_assert (gsi_end_p (psi_second)); | |

merge_loop_tree (loop, next); | |

gcc_assert (!next->num_nodes); | |

class loop *ln = next->next; | |

delete_loop (next); | |

next = ln; | |

} | |

rewrite_into_loop_closed_ssa_1 (NULL, 0, SSA_OP_USE, loop); | |

} | |

/* Return true if any of the access functions for dataref A | |

isn't invariant with respect to loop LOOP_NEST. */ | |

static bool | |

any_access_function_variant_p (const struct data_reference *a, | |

const class loop *loop_nest) | |

{ | |

unsigned int i; | |

vec<tree> fns = DR_ACCESS_FNS (a); | |

tree t; | |

FOR_EACH_VEC_ELT (fns, i, t) | |

if (!evolution_function_is_invariant_p (t, loop_nest->num)) | |

return true; | |

return false; | |

} | |

/* Returns true if the distance in DDR can be determined and adjusts | |

the unroll factor in *UNROLL to make unrolling valid for that distance. | |

Otherwise return false. DDR is with respect to the outer loop of INNER. | |

If this data dep can lead to a removed memory reference, increment | |

*REMOVED and adjust *PROFIT_UNROLL to be the necessary unroll factor | |

for this to happen. */ | |

static bool | |

adjust_unroll_factor (class loop *inner, struct data_dependence_relation *ddr, | |

unsigned *unroll, unsigned *profit_unroll, | |

unsigned *removed) | |

{ | |

bool ret = false; | |

if (DDR_ARE_DEPENDENT (ddr) != chrec_known) | |

{ | |

if (DDR_NUM_DIST_VECTS (ddr) == 0) | |

return false; | |

unsigned i; | |

lambda_vector dist_v; | |

FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v) | |

{ | |

/* A distance (a,b) is at worst transformed into (a/N,b) by the | |

unrolling (factor N), so the transformation is valid if | |

a >= N, or b > 0, or b is zero and a > 0. Otherwise the unroll | |

factor needs to be limited so that the first condition holds. | |

That may limit the factor down to zero in the worst case. */ | |

lambda_int dist = dist_v[0]; | |

if (dist < 0) | |

gcc_unreachable (); | |

else if (dist >= (lambda_int)*unroll) | |

; | |

else if (lambda_vector_zerop (dist_v + 1, DDR_NB_LOOPS (ddr) - 1)) | |

{ | |

/* We have (a,0) with a < N, so this will be transformed into | |

(0,0) after unrolling by N. This might potentially be a | |

problem, if it's not a read-read dependency. */ | |

if (DR_IS_READ (DDR_A (ddr)) && DR_IS_READ (DDR_B (ddr))) | |

; | |

else | |

{ | |

/* So, at least one is a write, and we might reduce the | |

distance vector to (0,0). This is still no problem | |

if both data-refs are affine with respect to the inner | |

loops. But if one of them is invariant with respect | |

to an inner loop our reordering implicit in loop fusion | |

corrupts the program, as our data dependences don't | |

capture this. E.g. for: | |

for (0 <= i < n) | |

for (0 <= j < m) | |

a[i][0] = a[i+1][0] + 2; // (1) | |

b[i][j] = b[i+1][j] + 2; // (2) | |

the distance vector for both statements is (-1,0), | |

but exchanging the order for (2) is okay, while | |

for (1) it is not. To see this, write out the original | |

accesses (assume m is 2): | |

a i j original | |

0 0 0 r a[1][0] b[1][0] | |

1 0 0 w a[0][0] b[0][0] | |

2 0 1 r a[1][0] b[1][1] | |

3 0 1 w a[0][0] b[0][1] | |

4 1 0 r a[2][0] b[2][0] | |

5 1 0 w a[1][0] b[1][0] | |

after unroll-by-2 and fusion the accesses are done in | |

this order (from column a): 0,1, 4,5, 2,3, i.e. this: | |

a i j transformed | |

0 0 0 r a[1][0] b[1][0] | |

1 0 0 w a[0][0] b[0][0] | |

4 1 0 r a[2][0] b[2][0] | |

5 1 0 w a[1][0] b[1][0] | |

2 0 1 r a[1][0] b[1][1] | |

3 0 1 w a[0][0] b[0][1] | |

Note how access 2 accesses the same element as access 5 | |

for array 'a' but not for array 'b'. */ | |

if (any_access_function_variant_p (DDR_A (ddr), inner) | |

&& any_access_function_variant_p (DDR_B (ddr), inner)) | |

; | |

else | |

/* And if any dataref of this pair is invariant with | |

respect to the inner loop, we have no chance than | |

to reduce the unroll factor. */ | |

*unroll = dist; | |

} | |

} | |

else if (lambda_vector_lexico_pos (dist_v + 1, DDR_NB_LOOPS (ddr) - 1)) | |

; | |

else | |

*unroll = dist; | |

/* With a distance (a,0) it's always profitable to unroll-and-jam | |

(by a+1), because one memory reference will go away. With | |

(a,b) and b != 0 that's less clear. We will increase the | |

number of streams without lowering the number of mem refs. | |

So for now only handle the first situation. */ | |

if (lambda_vector_zerop (dist_v + 1, DDR_NB_LOOPS (ddr) - 1)) | |

{ | |

*profit_unroll = MAX (*profit_unroll, (unsigned)dist + 1); | |

(*removed)++; | |

} | |

ret = true; | |

} | |

} | |

return ret; | |

} | |

/* Main entry point for the unroll-and-jam transformation | |

described above. */ | |

static unsigned int | |

tree_loop_unroll_and_jam (void) | |

{ | |

class loop *loop; | |

bool changed = false; | |

gcc_assert (scev_initialized_p ()); | |

/* Go through all innermost loops. */ | |

FOR_EACH_LOOP (loop, LI_ONLY_INNERMOST) | |

{ | |

class loop *outer = loop_outer (loop); | |

if (loop_depth (loop) < 2 | |

|| optimize_loop_nest_for_size_p (outer)) | |

continue; | |

if (!unroll_jam_possible_p (outer, loop)) | |

continue; | |

vec<data_reference_p> datarefs; | |

vec<ddr_p> dependences; | |

unsigned unroll_factor, profit_unroll, removed; | |

class tree_niter_desc desc; | |

bool unroll = false; | |

auto_vec<loop_p, 3> loop_nest; | |

dependences.create (10); | |

datarefs.create (10); | |

if (!compute_data_dependences_for_loop (outer, true, &loop_nest, | |

&datarefs, &dependences)) | |

{ | |

if (dump_file && (dump_flags & TDF_DETAILS)) | |

fprintf (dump_file, "Cannot analyze data dependencies\n"); | |

free_data_refs (datarefs); | |

free_dependence_relations (dependences); | |

continue; | |

} | |

if (!datarefs.length ()) | |

continue; | |

if (dump_file && (dump_flags & TDF_DETAILS)) | |

dump_data_dependence_relations (dump_file, dependences); | |

unroll_factor = (unsigned)-1; | |

profit_unroll = 1; | |

removed = 0; | |

/* Check all dependencies. */ | |

unsigned i; | |

struct data_dependence_relation *ddr; | |

FOR_EACH_VEC_ELT (dependences, i, ddr) | |

{ | |

struct data_reference *dra, *drb; | |

/* If the refs are independend there's nothing to do. */ | |

if (DDR_ARE_DEPENDENT (ddr) == chrec_known) | |

continue; | |

dra = DDR_A (ddr); | |

drb = DDR_B (ddr); | |

/* Nothing interesting for the self dependencies. */ | |

if (dra == drb) | |

continue; | |

/* Now check the distance vector, for determining a sensible | |

outer unroll factor, and for validity of merging the inner | |

loop copies. */ | |

if (!adjust_unroll_factor (loop, ddr, &unroll_factor, &profit_unroll, | |

&removed)) | |

{ | |

/* Couldn't get the distance vector. For two reads that's | |

harmless (we assume we should unroll). For at least | |

one write this means we can't check the dependence direction | |

and hence can't determine safety. */ | |

if (DR_IS_WRITE (dra) || DR_IS_WRITE (drb)) | |

{ | |

unroll_factor = 0; | |

break; | |

} | |

} | |

} | |

/* We regard a user-specified minimum percentage of zero as a request | |

to ignore all profitability concerns and apply the transformation | |

always. */ | |

if (!param_unroll_jam_min_percent) | |

profit_unroll = MAX(2, profit_unroll); | |

else if (removed * 100 / datarefs.length () | |

< (unsigned)param_unroll_jam_min_percent) | |

profit_unroll = 1; | |

if (unroll_factor > profit_unroll) | |

unroll_factor = profit_unroll; | |

if (unroll_factor > (unsigned)param_unroll_jam_max_unroll) | |

unroll_factor = param_unroll_jam_max_unroll; | |

unroll = (unroll_factor > 1 | |

&& can_unroll_loop_p (outer, unroll_factor, &desc)); | |

if (unroll) | |

{ | |

if (dump_enabled_p ()) | |

dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | TDF_DETAILS, | |

find_loop_location (outer), | |

"applying unroll and jam with factor %d\n", | |

unroll_factor); | |

initialize_original_copy_tables (); | |

tree_unroll_loop (outer, unroll_factor, single_dom_exit (outer), | |

&desc); | |

free_original_copy_tables (); | |

fuse_loops (outer->inner); | |

changed = true; | |

} | |

loop_nest.release (); | |

free_dependence_relations (dependences); | |

free_data_refs (datarefs); | |

} | |

if (changed) | |

{ | |

scev_reset (); | |

free_dominance_info (CDI_DOMINATORS); | |

return TODO_cleanup_cfg; | |

} | |

return 0; | |

} | |

/* Pass boilerplate */ | |

namespace { | |

const pass_data pass_data_loop_jam = | |

{ | |

GIMPLE_PASS, /* type */ | |

"unrolljam", /* name */ | |

OPTGROUP_LOOP, /* optinfo_flags */ | |

TV_LOOP_JAM, /* tv_id */ | |

PROP_cfg, /* properties_required */ | |

0, /* properties_provided */ | |

0, /* properties_destroyed */ | |

0, /* todo_flags_start */ | |

0, /* todo_flags_finish */ | |

}; | |

class pass_loop_jam : public gimple_opt_pass | |

{ | |

public: | |

pass_loop_jam (gcc::context *ctxt) | |

: gimple_opt_pass (pass_data_loop_jam, ctxt) | |

{} | |

/* opt_pass methods: */ | |

virtual bool gate (function *) { return flag_unroll_jam != 0; } | |

virtual unsigned int execute (function *); | |

}; | |

unsigned int | |

pass_loop_jam::execute (function *fun) | |

{ | |

if (number_of_loops (fun) <= 1) | |

return 0; | |

return tree_loop_unroll_and_jam (); | |

} | |

} | |

gimple_opt_pass * | |

make_pass_loop_jam (gcc::context *ctxt) | |

{ | |

return new pass_loop_jam (ctxt); | |

} |