// gcc/config/aarch64/aarch64-cc-fusion.cc - gcc - Git at Google

 // Pass to fuse CC operations with other instructions.
 // Copyright (C) 2021 Free Software Foundation, Inc.
 //
 // This file is part of GCC.
 //
 // GCC is free software; you can redistribute it and/or modify it under
 // the terms of the GNU General Public License as published by the Free
 // Software Foundation; either version 3, or (at your option) any later
 // version.
 //
 // GCC is distributed in the hope that it will be useful, but WITHOUT ANY
 // WARRANTY; without even the implied warranty of MERCHANTABILITY or
 // FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 // for more details.
 //
 // You should have received a copy of the GNU General Public License
 // along with GCC; see the file COPYING3.  If not see
 // <http://www.gnu.org/licenses/>.

 // This pass looks for sequences of the form:
 //
 //    A: (set (reg R1) X1)
 //    B: ...instructions that might change the value of X1...
 //    C: (set (reg CC) X2) // X2 uses R1
 //
 // and tries to change them to:
 //
 //    C': [(set (reg CC) X2')
 //         (set (reg R1) X1)]
 //    B: ...instructions that might change the value of X1...
 //
 // where X2' is the result of replacing R1 with X1 in X2.
 //
 // This sequence occurs in SVE code in two important cases:
 //
 // (a) Sometimes, to deal correctly with overflow, we need to increment
 //     an IV after a WHILELO rather than before it.  In this case:
 //     - A is a WHILELO,
 //     - B includes an IV increment and
 //     - C is a separate PTEST.
 //
 // (b) ACLE code of the form:
 //
 //       svbool_t ok = svrdffr ();
 //       if (svptest_last (pg, ok))
 //         ...
 //
 //     must, for performance reasons, be code-generated as:
 //
 //       RDFFRS Pok.B, Pg/Z
 //       ...branch on flags result...
 //
 //     without a separate PTEST of Pok.  In this case:
 //     - A is an aarch64_rdffr
 //     - B includes an aarch64_update_ffrt
 //     - C is a separate PTEST
 //
 // Combine can handle this optimization if B doesn't exist and if A and
 // C are in the same BB.  This pass instead handles cases where B does
 // exist and cases where A and C are in different BBs of the same EBB.

 #define IN_TARGET_CODE 1

 #define INCLUDE_ALGORITHM
 #define INCLUDE_FUNCTIONAL
 #include "config.h"
 #include "system.h"
 #include "coretypes.h"
 #include "backend.h"
 #include "rtl.h"
 #include "df.h"
 #include "rtl-ssa.h"
 #include "tree-pass.h"

 using namespace rtl_ssa;

 namespace {
 // Static descriptor for the cc_fusion pass.  It is handed to the
 // rtl_opt_pass constructor by pass_cc_fusion.
 const pass_data pass_data_cc_fusion = {
   RTL_PASS,        // type
   "cc_fusion",     // name
   OPTGROUP_NONE,   // optinfo_flags
   TV_NONE,         // tv_id
   0,               // properties_required
   0,               // properties_provided
   0,               // properties_destroyed
   0,               // todo_flags_start
   TODO_df_finish,  // todo_flags_finish
 };

 // Class that represents one run of the pass.
 class cc_fusion
 {
 public:
   cc_fusion ()  : m_parallel () {}
   void execute ();

 private:
   rtx optimizable_set (const insn_info *);
   bool parallelize_insns (def_info *, rtx, def_info *, rtx);
   void optimize_cc_setter (def_info *, rtx);

   // A spare PARALLEL rtx, or null if none.
   rtx m_parallel;
 };

 // Decide whether INSN is something this pass is allowed to rewrite.
 // INSN qualifies only if RTL-SSA says it can be optimized and it is not
 // an asm, has no volatile references and has no pre/post-modify
 // addresses.  Return its single_set in that case (which may itself be
 // null if INSN is not a single set); return null otherwise.
 rtx
 cc_fusion::optimizable_set (const insn_info *insn)
 {
   const bool is_candidate = (insn->can_be_optimized ()
 			     && !insn->is_asm ()
 			     && !insn->has_volatile_refs ()
 			     && !insn->has_pre_post_modify ());
   return is_candidate ? single_set (insn->rtl ()) : NULL_RTX;
 }

 // CC_SET is a single_set that sets (only) CC_DEF; OTHER_SET is likewise
 // a single_set that sets (only) OTHER_DEF.  CC_SET is known to set the
 // CC register and the instruction that contains CC_SET is known to use
 // OTHER_DEF.  Try to do CC_SET and OTHER_SET in parallel.
 //
 // On success, the instruction that contained OTHER_SET is deleted, the
 // instruction that contained CC_SET becomes a PARALLEL of both sets, and
 // the function returns true.  Return false if any step fails.
 bool
 cc_fusion::parallelize_insns (def_info *cc_def, rtx cc_set,
 			      def_info *other_def, rtx other_set)
 {
   auto attempt = crtl->ssa->new_change_attempt ();

   insn_info *cc_insn = cc_def->insn ();
   insn_info *other_insn = other_def->insn ();
   if (dump_file && (dump_flags & TDF_DETAILS))
     fprintf (dump_file, "trying to parallelize insn %d and insn %d\n",
 	     other_insn->uid (), cc_insn->uid ());

   // Try to substitute OTHER_SET into CC_INSN.
   // NOTE(review): rtl_watermark is presumably what undoes the tentative
   // RTL changes on the early returns below -- confirm against
   // insn_change_watermark.
   insn_change_watermark rtl_watermark;
   rtx_insn *cc_rtl = cc_insn->rtl ();
   insn_propagation prop (cc_rtl, SET_DEST (other_set),
 			 SET_SRC (other_set));
   // Give up if the substitution failed or if nothing was replaced.
   if (!prop.apply_to_pattern (&PATTERN (cc_rtl))
       || prop.num_replacements == 0)
     {
       if (dump_file && (dump_flags & TDF_DETAILS))
 	fprintf (dump_file, "-- failed to substitute all uses of r%d\n",
 		 other_def->regno ());
       return false;
     }

   // Restrict the uses to those outside notes.
   use_array cc_uses = remove_note_accesses (attempt, cc_insn->uses ());
   use_array other_set_uses = remove_note_accesses (attempt,
 						   other_insn->uses ());

   // Remove the use of the substituted value.
   access_array_builder uses_builder (attempt);
   uses_builder.reserve (cc_uses.size ());
   for (use_info *use : cc_uses)
     if (use->def () != other_def)
       uses_builder.quick_push (use);
   cc_uses = use_array (uses_builder.finish ());

   // Get the list of uses for the new instruction.
   insn_change cc_change (cc_insn);
   cc_change.new_uses = merge_access_arrays (attempt, other_set_uses, cc_uses);
   if (!cc_change.new_uses.is_valid ())
     {
       if (dump_file && (dump_flags & TDF_DETAILS))
 	fprintf (dump_file, "-- cannot merge uses\n");
       return false;
     }

   // The instruction initially defines just two registers.  recog can add
   // extra clobbers if necessary.
   auto_vec<access_info *, 2> new_defs;
   new_defs.quick_push (cc_def);
   new_defs.quick_push (other_def);
   sort_accesses (new_defs);
   cc_change.new_defs = def_array (access_array (new_defs));

   // Make sure there is somewhere that the new instruction could live.
   // The candidate range starts out as the whole EBB of CC_INSN and is
   // then narrowed by restrict_movement_ignoring.
   auto other_change = insn_change::delete_insn (other_insn);
   insn_change *changes[] = { &other_change, &cc_change };
   cc_change.move_range = cc_insn->ebb ()->insn_range ();
   if (!restrict_movement_ignoring (cc_change, insn_is_changing (changes)))
     {
       if (dump_file && (dump_flags & TDF_DETAILS))
 	fprintf (dump_file, "-- cannot satisfy all definitions and uses\n");
       return false;
     }

   // Tentatively install the new pattern.  By convention, the CC set
   // must be first.
   // Reuse the spare PARALLEL if there is one, otherwise create a new one
   // and remember it in m_parallel.
   if (m_parallel)
     {
       XVECEXP (m_parallel, 0, 0) = cc_set;
       XVECEXP (m_parallel, 0, 1) = other_set;
     }
   else
     {
       rtvec vec = gen_rtvec (2, cc_set, other_set);
       m_parallel = gen_rtx_PARALLEL (VOIDmode, vec);
     }
   validate_change (cc_rtl, &PATTERN (cc_rtl), m_parallel, 1);

   // These routines report failures themselves.
   // On failure m_parallel is left set, so the next attempt can reuse it.
   if (!recog_ignoring (attempt, cc_change, insn_is_changing (changes))
       || !changes_are_worthwhile (changes)
       || !crtl->ssa->verify_insn_changes (changes))
     return false;

   // Commit the change.  m_parallel is now the pattern of CC_INSN, so it
   // is no longer spare and must not be reused.
   remove_reg_equal_equiv_notes (cc_rtl);
   confirm_change_group ();
   crtl->ssa->change_insns (changes);
   m_parallel = NULL_RTX;
   return true;
 }

 // CC_DEF is a definition of the CC register made by CC_SET.  Walk the
 // registers used by the instruction that contains CC_DEF and try to fuse
 // that instruction with the one that defines such a register.  Stop at
 // the first successful fusion.
 void
 cc_fusion::optimize_cc_setter (def_info *cc_def, rtx cc_set)
 {
   for (use_info *use : cc_def->insn ()->uses ())
     {
       // Skip uses that have no definition.
       def_info *producer = use->def ();
       if (!producer)
 	continue;

       // The register must not be CC itself, and its definition must be
       // in the same EBB as the CC setter.
       if (use->regno () == CC_REGNUM
 	  || producer->ebb () != cc_def->ebb ())
 	continue;

       // The defining instruction must be an optimizable single set.
       rtx producer_set = optimizable_set (producer->insn ());
       if (!producer_set)
 	continue;

       // The set must write exactly the single hard or pseudo register
       // that the definition describes.
       rtx dest = SET_DEST (producer_set);
       if (!REG_P (dest)
 	  || REGNO (dest) != producer->regno ()
 	  || REG_NREGS (dest) != 1)
 	continue;

       if (parallelize_insns (cc_def, cc_set, producer, producer_set))
 	return;
     }
 }

 // Run the pass on the current function.
 //
 // Set up dominance information, DF and the RTL-SSA view of the function,
 // try to fuse each optimizable PTEST with the instruction that defines
 // one of its inputs, then release the analysis state again.
 void
 cc_fusion::execute ()
 {
   // Initialization.
   calculate_dominance_info (CDI_DOMINATORS);
   df_analyze ();
   crtl->ssa = new rtl_ssa::function_info (cfun);

   // Walk through all instructions that set CC.  Look for a PTEST instruction
   // that we can optimize.
   //
   // ??? The PTEST test isn't needed for correctness, but it ensures that the
   // pass has no effect on non-SVE code.
   for (def_info *def : crtl->ssa->reg_defs (CC_REGNUM))
     if (rtx cc_set = optimizable_set (def->insn ()))
       if (REG_P (SET_DEST (cc_set))
 	  && REGNO (SET_DEST (cc_set)) == CC_REGNUM
 	  && GET_CODE (SET_SRC (cc_set)) == UNSPEC
 	  && XINT (SET_SRC (cc_set), 1) == UNSPEC_PTEST)
 	optimize_cc_setter (def, cc_set);

   // Finalization.  The function_info was allocated above with new, so
   // free it here and clear the pointer rather than leaking it and leaving
   // crtl->ssa pointing at per-pass state.
   crtl->ssa->perform_pending_updates ();
   free_dominance_info (CDI_DOMINATORS);
   delete crtl->ssa;
   crtl->ssa = nullptr;
 }

 // Pass-manager wrapper around cc_fusion.
 class pass_cc_fusion : public rtl_opt_pass
 {
 public:
   pass_cc_fusion (gcc::context *ctxt)
     : rtl_opt_pass (pass_data_cc_fusion, ctxt)
   {}

   // opt_pass methods:

   // Enable the pass only for SVE targets when the optimization level
   // is at least 2.
   virtual bool
   gate (function *)
   {
     if (!TARGET_SVE)
       return false;
     return optimize >= 2;
   }

   virtual unsigned int execute (function *);
 };

 // Run one instance of cc_fusion on the current function.  Always
 // returns 0.
 unsigned int
 pass_cc_fusion::execute (function *)
 {
   cc_fusion fusion;
   fusion.execute ();
   return 0;
 }

 } // end namespace

 // Allocate and return a new instance of the CC fusion pass for CTXT.

 rtl_opt_pass *
 make_pass_cc_fusion (gcc::context *ctxt)
 {
   rtl_opt_pass *pass = new pass_cc_fusion (ctxt);
   return pass;
 }
	// Pass to fuse CC operations with other instructions.
	// Copyright (C) 2021 Free Software Foundation, Inc.
	//
	// This file is part of GCC.
	//
	// GCC is free software; you can redistribute it and/or modify it under
	// the terms of the GNU General Public License as published by the Free
	// Software Foundation; either version 3, or (at your option) any later
	// version.
	//
	// GCC is distributed in the hope that it will be useful, but WITHOUT ANY
	// WARRANTY; without even the implied warranty of MERCHANTABILITY or
	// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
	// for more details.
	//
	// You should have received a copy of the GNU General Public License
	// along with GCC; see the file COPYING3. If not see
	// <http://www.gnu.org/licenses/>.

	// This pass looks for sequences of the form:
	//
	// A: (set (reg R1) X1)
	// B: ...instructions that might change the value of X1...
	// C: (set (reg CC) X2) // X2 uses R1
	//
	// and tries to change them to:
	//
	// C': [(set (reg CC) X2')
	// (set (reg R1) X1)]
	// B: ...instructions that might change the value of X1...
	//
	// where X2' is the result of replacing R1 with X1 in X2.
	//
	// This sequence occurs in SVE code in two important cases:
	//
	// (a) Sometimes, to deal correctly with overflow, we need to increment
	// an IV after a WHILELO rather than before it. In this case:
	// - A is a WHILELO,
	// - B includes an IV increment and
	// - C is a separate PTEST.
	//
	// (b) ACLE code of the form:
	//
	// svbool_t ok = svrdffr ();
	// if (svptest_last (pg, ok))
	// ...
	//
	// must, for performance reasons, be code-generated as:
	//
	// RDFFRS Pok.B, Pg/Z
	// ...branch on flags result...
	//
	// without a separate PTEST of Pok. In this case:
	// - A is an aarch64_rdffr
	// - B includes an aarch64_update_ffrt
	// - C is a separate PTEST
	//
	// Combine can handle this optimization if B doesn't exist and if A and
	// C are in the same BB. This pass instead handles cases where B does
	// exist and cases where A and C are in different BBs of the same EBB.

	#define IN_TARGET_CODE 1

	#define INCLUDE_ALGORITHM
	#define INCLUDE_FUNCTIONAL
	#include "config.h"
	#include "system.h"
	#include "coretypes.h"
	#include "backend.h"
	#include "rtl.h"
	#include "df.h"
	#include "rtl-ssa.h"
	#include "tree-pass.h"

	using namespace rtl_ssa;

	namespace {
	// Static descriptor for the cc_fusion pass.  It is handed to the
	// rtl_opt_pass constructor by pass_cc_fusion.
	const pass_data pass_data_cc_fusion = {
	  RTL_PASS,        // type
	  "cc_fusion",     // name
	  OPTGROUP_NONE,   // optinfo_flags
	  TV_NONE,         // tv_id
	  0,               // properties_required
	  0,               // properties_provided
	  0,               // properties_destroyed
	  0,               // todo_flags_start
	  TODO_df_finish,  // todo_flags_finish
	};

	// Class that represents one run of the pass.
	class cc_fusion
	{
	public:
	  cc_fusion () : m_parallel () {}

	  // Run the pass on the current function.
	  void execute ();

	private:
	  // Return the single_set of an instruction if it may be optimized,
	  // otherwise null.
	  rtx optimizable_set (const insn_info *);

	  // Try to perform a CC set and another set in one PARALLEL.  The
	  // definitions are passed by pointer, matching the out-of-line
	  // definition of this function.
	  bool parallelize_insns (def_info *, rtx, def_info *, rtx);

	  // Look for a definition feeding a CC setter and try to fuse the two.
	  void optimize_cc_setter (def_info *, rtx);

	  // A spare PARALLEL rtx, or null if none.
	  rtx m_parallel;
	};

	// See whether INSN is a single_set that we can optimize.  Return the
	// set if so, otherwise return null.
	//
	// INSN is rejected if RTL-SSA says it cannot be optimized, or if it is
	// an asm, has volatile references or has pre/post-modify addresses.
	rtx
	cc_fusion::optimizable_set (const insn_info *insn)
	{
	  if (!insn->can_be_optimized ()
	      || insn->is_asm ()
	      || insn->has_volatile_refs ()
	      || insn->has_pre_post_modify ())
	    return NULL_RTX;

	  return single_set (insn->rtl ());
	}

	// CC_SET is a single_set that sets (only) CC_DEF; OTHER_SET is likewise
	// a single_set that sets (only) OTHER_DEF.  CC_SET is known to set the
	// CC register and the instruction that contains CC_SET is known to use
	// OTHER_DEF.  Try to do CC_SET and OTHER_SET in parallel.
	//
	// On success, the instruction that contained OTHER_SET is deleted, the
	// instruction that contained CC_SET becomes a PARALLEL of both sets, and
	// the function returns true.  Return false if any step fails.
	bool
	cc_fusion::parallelize_insns (def_info *cc_def, rtx cc_set,
	                              def_info *other_def, rtx other_set)
	{
	  auto attempt = crtl->ssa->new_change_attempt ();

	  insn_info *cc_insn = cc_def->insn ();
	  insn_info *other_insn = other_def->insn ();
	  if (dump_file && (dump_flags & TDF_DETAILS))
	    fprintf (dump_file, "trying to parallelize insn %d and insn %d\n",
	             other_insn->uid (), cc_insn->uid ());

	  // Try to substitute OTHER_SET into CC_INSN.
	  // NOTE(review): rtl_watermark is presumably what undoes the tentative
	  // RTL changes on the early returns below -- confirm against
	  // insn_change_watermark.
	  insn_change_watermark rtl_watermark;
	  rtx_insn *cc_rtl = cc_insn->rtl ();
	  insn_propagation prop (cc_rtl, SET_DEST (other_set),
	                         SET_SRC (other_set));
	  // Give up if the substitution failed or if nothing was replaced.
	  if (!prop.apply_to_pattern (&PATTERN (cc_rtl))
	      || prop.num_replacements == 0)
	    {
	      if (dump_file && (dump_flags & TDF_DETAILS))
	        fprintf (dump_file, "-- failed to substitute all uses of r%d\n",
	                 other_def->regno ());
	      return false;
	    }

	  // Restrict the uses to those outside notes.
	  use_array cc_uses = remove_note_accesses (attempt, cc_insn->uses ());
	  use_array other_set_uses = remove_note_accesses (attempt,
	                                                   other_insn->uses ());

	  // Remove the use of the substituted value.
	  access_array_builder uses_builder (attempt);
	  uses_builder.reserve (cc_uses.size ());
	  for (use_info *use : cc_uses)
	    if (use->def () != other_def)
	      uses_builder.quick_push (use);
	  cc_uses = use_array (uses_builder.finish ());

	  // Get the list of uses for the new instruction.
	  insn_change cc_change (cc_insn);
	  cc_change.new_uses = merge_access_arrays (attempt, other_set_uses, cc_uses);
	  if (!cc_change.new_uses.is_valid ())
	    {
	      if (dump_file && (dump_flags & TDF_DETAILS))
	        fprintf (dump_file, "-- cannot merge uses\n");
	      return false;
	    }

	  // The instruction initially defines just two registers.  recog can add
	  // extra clobbers if necessary.
	  auto_vec<access_info *, 2> new_defs;
	  new_defs.quick_push (cc_def);
	  new_defs.quick_push (other_def);
	  sort_accesses (new_defs);
	  cc_change.new_defs = def_array (access_array (new_defs));

	  // Make sure there is somewhere that the new instruction could live.
	  // The candidate range starts out as the whole EBB of CC_INSN and is
	  // then narrowed by restrict_movement_ignoring.
	  auto other_change = insn_change::delete_insn (other_insn);
	  insn_change *changes[] = { &other_change, &cc_change };
	  cc_change.move_range = cc_insn->ebb ()->insn_range ();
	  if (!restrict_movement_ignoring (cc_change, insn_is_changing (changes)))
	    {
	      if (dump_file && (dump_flags & TDF_DETAILS))
	        fprintf (dump_file, "-- cannot satisfy all definitions and uses\n");
	      return false;
	    }

	  // Tentatively install the new pattern.  By convention, the CC set
	  // must be first.  Reuse the spare PARALLEL if there is one, otherwise
	  // create a new one and remember it in m_parallel.
	  if (m_parallel)
	    {
	      XVECEXP (m_parallel, 0, 0) = cc_set;
	      XVECEXP (m_parallel, 0, 1) = other_set;
	    }
	  else
	    {
	      rtvec vec = gen_rtvec (2, cc_set, other_set);
	      m_parallel = gen_rtx_PARALLEL (VOIDmode, vec);
	    }
	  validate_change (cc_rtl, &PATTERN (cc_rtl), m_parallel, 1);

	  // These routines report failures themselves.  On failure m_parallel
	  // is left set, so the next attempt can reuse it.
	  if (!recog_ignoring (attempt, cc_change, insn_is_changing (changes))
	      || !changes_are_worthwhile (changes)
	      || !crtl->ssa->verify_insn_changes (changes))
	    return false;

	  // Commit the change.  m_parallel is now the pattern of CC_INSN, so it
	  // is no longer spare and must not be reused.
	  remove_reg_equal_equiv_notes (cc_rtl);
	  confirm_change_group ();
	  crtl->ssa->change_insns (changes);
	  m_parallel = NULL_RTX;
	  return true;
	}

	// CC_DEF is a definition of the CC register made by CC_SET.  Walk the
	// registers used by the instruction that contains CC_DEF and try to fuse
	// that instruction with the one that defines such a register.  Stop at
	// the first successful fusion.
	void
	cc_fusion::optimize_cc_setter (def_info *cc_def, rtx cc_set)
	{
	  for (use_info *use : cc_def->insn ()->uses ())
	    {
	      // Skip uses that have no definition.
	      def_info *producer = use->def ();
	      if (!producer)
	        continue;

	      // The register must not be CC itself, and its definition must be
	      // in the same EBB as the CC setter.
	      if (use->regno () == CC_REGNUM
	          || producer->ebb () != cc_def->ebb ())
	        continue;

	      // The defining instruction must be an optimizable single set.
	      rtx producer_set = optimizable_set (producer->insn ());
	      if (!producer_set)
	        continue;

	      // The set must write exactly the single register that the
	      // definition describes.
	      rtx dest = SET_DEST (producer_set);
	      if (!REG_P (dest)
	          || REGNO (dest) != producer->regno ()
	          || REG_NREGS (dest) != 1)
	        continue;

	      if (parallelize_insns (cc_def, cc_set, producer, producer_set))
	        return;
	    }
	}

	// Run the pass on the current function.
	//
	// Set up dominance information, DF and the RTL-SSA view of the function,
	// try to fuse each optimizable PTEST with the instruction that defines
	// one of its inputs, then release the analysis state again.
	void
	cc_fusion::execute ()
	{
	  // Initialization.
	  calculate_dominance_info (CDI_DOMINATORS);
	  df_analyze ();
	  crtl->ssa = new rtl_ssa::function_info (cfun);

	  // Walk through all instructions that set CC.  Look for a PTEST
	  // instruction that we can optimize.
	  //
	  // ??? The PTEST test isn't needed for correctness, but it ensures that
	  // the pass has no effect on non-SVE code.
	  for (def_info *def : crtl->ssa->reg_defs (CC_REGNUM))
	    if (rtx cc_set = optimizable_set (def->insn ()))
	      if (REG_P (SET_DEST (cc_set))
	          && REGNO (SET_DEST (cc_set)) == CC_REGNUM
	          && GET_CODE (SET_SRC (cc_set)) == UNSPEC
	          && XINT (SET_SRC (cc_set), 1) == UNSPEC_PTEST)
	        optimize_cc_setter (def, cc_set);

	  // Finalization.  The function_info was allocated above with new, so
	  // free it here and clear the pointer rather than leaking it and leaving
	  // crtl->ssa pointing at per-pass state.
	  crtl->ssa->perform_pending_updates ();
	  free_dominance_info (CDI_DOMINATORS);
	  delete crtl->ssa;
	  crtl->ssa = nullptr;
	}

	// Pass-manager wrapper around cc_fusion.
	class pass_cc_fusion : public rtl_opt_pass
	{
	public:
	  pass_cc_fusion (gcc::context *ctxt)
	    : rtl_opt_pass (pass_data_cc_fusion, ctxt)
	  {}

	  // opt_pass methods:

	  // Enable the pass only for SVE targets when the optimization level
	  // is at least 2.
	  virtual bool
	  gate (function *)
	  {
	    if (!TARGET_SVE)
	      return false;
	    return optimize >= 2;
	  }

	  virtual unsigned int execute (function *);
	};

	// Run one instance of cc_fusion on the current function.  Always
	// returns 0.
	unsigned int
	pass_cc_fusion::execute (function *)
	{
	  cc_fusion fusion;
	  fusion.execute ();
	  return 0;
	}

	} // end namespace

	// Allocate and return a new instance of the CC fusion pass for CTXT.

	rtl_opt_pass *
	make_pass_cc_fusion (gcc::context *ctxt)
	{
	  rtl_opt_pass *pass = new pass_cc_fusion (ctxt);
	  return pass;
	}