blob: 09069a20de28f992c288f9dc87b7e935937e4baa [file] [log] [blame]
// Pass to fuse CC operations with other instructions.
// Copyright (C) 2021 Free Software Foundation, Inc.
//
// This file is part of GCC.
//
// GCC is free software; you can redistribute it and/or modify it under
// the terms of the GNU General Public License as published by the Free
// Software Foundation; either version 3, or (at your option) any later
// version.
//
// GCC is distributed in the hope that it will be useful, but WITHOUT ANY
// WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License
// along with GCC; see the file COPYING3. If not see
// <http://www.gnu.org/licenses/>.
// This pass looks for sequences of the form:
//
// A: (set (reg R1) X1)
// B: ...instructions that might change the value of X1...
// C: (set (reg CC) X2) // X2 uses R1
//
// and tries to change them to:
//
// C': [(set (reg CC) X2')
// (set (reg R1) X1)]
// B: ...instructions that might change the value of X1...
//
// where X2' is the result of replacing R1 with X1 in X2.
//
// This sequence occurs in SVE code in two important cases:
//
// (a) Sometimes, to deal correctly with overflow, we need to increment
// an IV after a WHILELO rather than before it. In this case:
// - A is a WHILELO,
// - B includes an IV increment and
// - C is a separate PTEST.
//
// (b) ACLE code of the form:
//
// svbool_t ok = svrdffr ();
// if (svptest_last (pg, ok))
// ...
//
// must, for performance reasons, be code-generated as:
//
// RDFFRS Pok.B, Pg/Z
// ...branch on flags result...
//
// without a separate PTEST of Pok. In this case:
// - A is an aarch64_rdffr
// - B includes an aarch64_update_ffrt
// - C is a separate PTEST
//
// Combine can handle this optimization if B doesn't exist and if A and
// C are in the same BB. This pass instead handles cases where B does
// exist and cases where A and C are in different BBs of the same EBB.
#define IN_TARGET_CODE 1
#define INCLUDE_ALGORITHM
#define INCLUDE_FUNCTIONAL
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "rtl.h"
#include "df.h"
#include "rtl-ssa.h"
#include "tree-pass.h"
using namespace rtl_ssa;
namespace {
const pass_data pass_data_cc_fusion =
{
RTL_PASS, // type
"cc_fusion", // name
OPTGROUP_NONE, // optinfo_flags
TV_NONE, // tv_id
0, // properties_required
0, // properties_provided
0, // properties_destroyed
0, // todo_flags_start
TODO_df_finish, // todo_flags_finish
};
// Class that represents one run of the pass.
class cc_fusion
{
public:
cc_fusion () : m_parallel () {}
void execute ();
private:
rtx optimizable_set (const insn_info *);
bool parallelize_insns (def_info *, rtx, def_info *, rtx);
void optimize_cc_setter (def_info *, rtx);
// A spare PARALLEL rtx, or null if none.
rtx m_parallel;
};
// See whether INSN is a single_set that we can optimize. Return the
// set if so, otherwise return null.
rtx
cc_fusion::optimizable_set (const insn_info *insn)
{
if (!insn->can_be_optimized ()
|| insn->is_asm ()
|| insn->has_volatile_refs ()
|| insn->has_pre_post_modify ())
return NULL_RTX;
return single_set (insn->rtl ());
}
// CC_SET is a single_set that sets (only) CC_DEF; OTHER_SET is likewise
// a single_set that sets (only) OTHER_DEF. CC_SET is known to set the
// CC register and the instruction that contains CC_SET is known to use
// OTHER_DEF. Try to do CC_SET and OTHER_SET in parallel.
bool
cc_fusion::parallelize_insns (def_info *cc_def, rtx cc_set,
def_info *other_def, rtx other_set)
{
auto attempt = crtl->ssa->new_change_attempt ();
insn_info *cc_insn = cc_def->insn ();
insn_info *other_insn = other_def->insn ();
if (dump_file && (dump_flags & TDF_DETAILS))
fprintf (dump_file, "trying to parallelize insn %d and insn %d\n",
other_insn->uid (), cc_insn->uid ());
// Try to substitute OTHER_SET into CC_INSN.
insn_change_watermark rtl_watermark;
rtx_insn *cc_rtl = cc_insn->rtl ();
insn_propagation prop (cc_rtl, SET_DEST (other_set),
SET_SRC (other_set));
if (!prop.apply_to_pattern (&PATTERN (cc_rtl))
|| prop.num_replacements == 0)
{
if (dump_file && (dump_flags & TDF_DETAILS))
fprintf (dump_file, "-- failed to substitute all uses of r%d\n",
other_def->regno ());
return false;
}
// Restrict the uses to those outside notes.
use_array cc_uses = remove_note_accesses (attempt, cc_insn->uses ());
use_array other_set_uses = remove_note_accesses (attempt,
other_insn->uses ());
// Remove the use of the substituted value.
access_array_builder uses_builder (attempt);
uses_builder.reserve (cc_uses.size ());
for (use_info *use : cc_uses)
if (use->def () != other_def)
uses_builder.quick_push (use);
cc_uses = use_array (uses_builder.finish ());
// Get the list of uses for the new instruction.
insn_change cc_change (cc_insn);
cc_change.new_uses = merge_access_arrays (attempt, other_set_uses, cc_uses);
if (!cc_change.new_uses.is_valid ())
{
if (dump_file && (dump_flags & TDF_DETAILS))
fprintf (dump_file, "-- cannot merge uses\n");
return false;
}
// The instruction initially defines just two registers. recog can add
// extra clobbers if necessary.
auto_vec<access_info *, 2> new_defs;
new_defs.quick_push (cc_def);
new_defs.quick_push (other_def);
sort_accesses (new_defs);
cc_change.new_defs = def_array (access_array (new_defs));
// Make sure there is somewhere that the new instruction could live.
auto other_change = insn_change::delete_insn (other_insn);
insn_change *changes[] = { &other_change, &cc_change };
cc_change.move_range = cc_insn->ebb ()->insn_range ();
if (!restrict_movement_ignoring (cc_change, insn_is_changing (changes)))
{
if (dump_file && (dump_flags & TDF_DETAILS))
fprintf (dump_file, "-- cannot satisfy all definitions and uses\n");
return false;
}
// Tentatively install the new pattern. By convention, the CC set
// must be first.
if (m_parallel)
{
XVECEXP (m_parallel, 0, 0) = cc_set;
XVECEXP (m_parallel, 0, 1) = other_set;
}
else
{
rtvec vec = gen_rtvec (2, cc_set, other_set);
m_parallel = gen_rtx_PARALLEL (VOIDmode, vec);
}
validate_change (cc_rtl, &PATTERN (cc_rtl), m_parallel, 1);
// These routines report failures themselves.
if (!recog_ignoring (attempt, cc_change, insn_is_changing (changes))
|| !changes_are_worthwhile (changes)
|| !crtl->ssa->verify_insn_changes (changes))
return false;
remove_reg_equal_equiv_notes (cc_rtl);
confirm_change_group ();
crtl->ssa->change_insns (changes);
m_parallel = NULL_RTX;
return true;
}
// Try to optimize the instruction that contains CC_DEF, where CC_DEF describes
// a definition of the CC register by CC_SET.
void
cc_fusion::optimize_cc_setter (def_info *cc_def, rtx cc_set)
{
// Search the registers used by the CC setter for an easily-substitutable
// def-use chain.
for (use_info *other_use : cc_def->insn ()->uses ())
if (def_info *other_def = other_use->def ())
if (other_use->regno () != CC_REGNUM
&& other_def->ebb () == cc_def->ebb ())
if (rtx other_set = optimizable_set (other_def->insn ()))
{
rtx dest = SET_DEST (other_set);
if (REG_P (dest)
&& REGNO (dest) == other_def->regno ()
&& REG_NREGS (dest) == 1
&& parallelize_insns (cc_def, cc_set, other_def, other_set))
return;
}
}
// Run the pass on the current function.
void
cc_fusion::execute ()
{
// Initialization.
calculate_dominance_info (CDI_DOMINATORS);
df_analyze ();
crtl->ssa = new rtl_ssa::function_info (cfun);
// Walk through all instructions that set CC. Look for a PTEST instruction
// that we can optimize.
//
// ??? The PTEST test isn't needed for correctness, but it ensures that the
// pass no effect on non-SVE code.
for (def_info *def : crtl->ssa->reg_defs (CC_REGNUM))
if (rtx cc_set = optimizable_set (def->insn ()))
if (REG_P (SET_DEST (cc_set))
&& REGNO (SET_DEST (cc_set)) == CC_REGNUM
&& GET_CODE (SET_SRC (cc_set)) == UNSPEC
&& XINT (SET_SRC (cc_set), 1) == UNSPEC_PTEST)
optimize_cc_setter (def, cc_set);
// Finalization.
crtl->ssa->perform_pending_updates ();
free_dominance_info (CDI_DOMINATORS);
}
class pass_cc_fusion : public rtl_opt_pass
{
public:
pass_cc_fusion (gcc::context *ctxt)
: rtl_opt_pass (pass_data_cc_fusion, ctxt)
{}
// opt_pass methods:
virtual bool gate (function *) { return TARGET_SVE && optimize >= 2; }
virtual unsigned int execute (function *);
};
unsigned int
pass_cc_fusion::execute (function *)
{
cc_fusion ().execute ();
return 0;
}
} // end namespace
// Create a new CC fusion pass instance.
rtl_opt_pass *
make_pass_cc_fusion (gcc::context *ctxt)
{
return new pass_cc_fusion (ctxt);
}