Remove pass_cpb which is related to enable avx512 embedded broadcast from constant pool.
By optimizing vector movement to broadcast in ix86_expand_vector_move
during pass_expand, pass_reload/LRA can automatically generate an avx512
embedded broadcast, pass_cpb is not needed.
Considering that in the absence of avx512f, broadcast from memory is
still slightly faster than loading the entire memory, so always enable
broadcast.
benchmark:
https://gitlab.com/x86-benchmarks/microbenchmark/-/tree/vaddps/broadcast
The performance diff
strategy : cycles
memory : 1046611188
memory : 1255420817
memory : 1044720793
memory : 1253414145
average : 1097868397
broadcast : 1044430688
broadcast : 1044477630
broadcast : 1253554603
broadcast : 1044561934
average : 1096756213
But however broadcast has larger size.
the size diff
size broadcast.o
text data bss dec hex filename
137 0 0 137 89 broadcast.o
size memory.o
text data bss dec hex filename
115 0 0 115 73 memory.o
gcc/ChangeLog:
* config/i386/i386-expand.c
(ix86_broadcast_from_integer_constant): Rename to ..
(ix86_broadcast_from_constant): .. this, and extend it to
handle float mode.
(ix86_expand_vector_move): Extend to float mode.
* config/i386/i386-features.c
(replace_constant_pool_with_broadcast): Remove.
(remove_partial_avx_dependency_gate): Ditto.
(constant_pool_broadcast): Ditto.
(class pass_constant_pool_broadcast): Ditto.
(make_pass_constant_pool_broadcast): Ditto.
(remove_partial_avx_dependency): Adjust gate.
* config/i386/i386-passes.def: Remove pass_constant_pool_broadcast.
* config/i386/i386-protos.h
(make_pass_constant_pool_broadcast): Remove.
gcc/testsuite/ChangeLog:
* gcc.target/i386/fuse-caller-save-xmm.c: Adjust testcase.
diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
index 69ea79e..896bd68 100644
--- a/gcc/config/i386/i386-expand.c
+++ b/gcc/config/i386/i386-expand.c
@@ -453,8 +453,10 @@
emit_insn (gen_rtx_SET (op0, op1));
}
+/* OP is a memref of CONST_VECTOR, return scalar constant mem
+ if CONST_VECTOR is a vec_duplicate, else return NULL. */
static rtx
-ix86_broadcast_from_integer_constant (machine_mode mode, rtx op)
+ix86_broadcast_from_constant (machine_mode mode, rtx op)
{
int nunits = GET_MODE_NUNITS (mode);
if (nunits < 2)
@@ -462,7 +464,8 @@
/* Don't use integer vector broadcast if we can't move from GPR to SSE
register directly. */
- if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
+ if (!TARGET_INTER_UNIT_MOVES_TO_VEC
+ && INTEGRAL_MODE_P (mode))
return nullptr;
/* Convert CONST_VECTOR to a non-standard SSE constant integer
@@ -470,12 +473,17 @@
if (!(TARGET_AVX2
|| (TARGET_AVX
&& (GET_MODE_INNER (mode) == SImode
- || GET_MODE_INNER (mode) == DImode)))
+ || GET_MODE_INNER (mode) == DImode))
+ || FLOAT_MODE_P (mode))
|| standard_sse_constant_p (op, mode))
return nullptr;
- /* Don't broadcast from a 64-bit integer constant in 32-bit mode. */
- if (GET_MODE_INNER (mode) == DImode && !TARGET_64BIT)
+ /* Don't broadcast from a 64-bit integer constant in 32-bit mode.
+ We can still put 64-bit integer constant in memory when
+ avx512 embed broadcast is available. */
+ if (GET_MODE_INNER (mode) == DImode && !TARGET_64BIT
+ && (!TARGET_AVX512F
+ || (GET_MODE_SIZE (mode) < 64 && !TARGET_AVX512VL)))
return nullptr;
if (GET_MODE_INNER (mode) == TImode)
@@ -561,17 +569,29 @@
if (can_create_pseudo_p ()
&& GET_MODE_SIZE (mode) >= 16
- && GET_MODE_CLASS (mode) == MODE_VECTOR_INT
+ && VECTOR_MODE_P (mode)
&& (MEM_P (op1)
&& SYMBOL_REF_P (XEXP (op1, 0))
&& CONSTANT_POOL_ADDRESS_P (XEXP (op1, 0))))
{
- rtx first = ix86_broadcast_from_integer_constant (mode, op1);
+ rtx first = ix86_broadcast_from_constant (mode, op1);
if (first != nullptr)
{
/* Broadcast to XMM/YMM/ZMM register from an integer
- constant. */
+ constant or scalar mem. */
+ /* Hard registers are used for 2 purposes:
+ 1. Prevent stack realignment when the original code
+ doesn't use vector registers, which is the same for
+ memcpy and memset.
+ 2. Prevent combine to convert constant broadcast to
+ load from constant pool. */
op1 = ix86_gen_scratch_sse_rtx (mode);
+ if (FLOAT_MODE_P (mode)
+ || (!TARGET_64BIT && GET_MODE_INNER (mode) == DImode))
+ {
+ first = force_const_mem (GET_MODE_INNER (mode), first);
+ op1 = gen_reg_rtx (mode);
+ }
bool ok = ix86_expand_vector_init_duplicate (false, mode,
op1, first);
gcc_assert (ok);
diff --git a/gcc/config/i386/i386-features.c b/gcc/config/i386/i386-features.c
index cbd430a..d9c6652 100644
--- a/gcc/config/i386/i386-features.c
+++ b/gcc/config/i386/i386-features.c
@@ -2136,81 +2136,6 @@
return new pass_insert_endbr_and_patchable_area (ctxt);
}
-/* Replace all one-value const vector that are referenced by SYMBOL_REFs in x
- with embedded broadcast. i.e.transform
-
- vpaddq .LC0(%rip), %zmm0, %zmm0
- ret
- .LC0:
- .quad 3
- .quad 3
- .quad 3
- .quad 3
- .quad 3
- .quad 3
- .quad 3
- .quad 3
-
- to
-
- vpaddq .LC0(%rip){1to8}, %zmm0, %zmm0
- ret
- .LC0:
- .quad 3 */
-static void
-replace_constant_pool_with_broadcast (rtx_insn *insn)
-{
- subrtx_ptr_iterator::array_type array;
- FOR_EACH_SUBRTX_PTR (iter, array, &PATTERN (insn), ALL)
- {
- rtx *loc = *iter;
- rtx x = *loc;
- rtx broadcast_mem, vec_dup, constant, first;
- machine_mode mode;
-
- /* Constant pool. */
- if (!MEM_P (x)
- || !SYMBOL_REF_P (XEXP (x, 0))
- || !CONSTANT_POOL_ADDRESS_P (XEXP (x, 0)))
- continue;
-
- /* Const vector. */
- mode = GET_MODE (x);
- if (!VECTOR_MODE_P (mode))
- return;
- constant = get_pool_constant (XEXP (x, 0));
- if (GET_CODE (constant) != CONST_VECTOR)
- return;
-
- /* There could be some rtx like
- (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1")))
- but with "*.LC1" refer to V2DI constant vector. */
- if (GET_MODE (constant) != mode)
- {
- constant = simplify_subreg (mode, constant, GET_MODE (constant), 0);
- if (constant == NULL_RTX || GET_CODE (constant) != CONST_VECTOR)
- return;
- }
- first = XVECEXP (constant, 0, 0);
-
- for (int i = 1; i < GET_MODE_NUNITS (mode); ++i)
- {
- rtx tmp = XVECEXP (constant, 0, i);
- /* Vector duplicate value. */
- if (!rtx_equal_p (tmp, first))
- return;
- }
-
- /* Replace with embedded broadcast. */
- broadcast_mem = force_const_mem (GET_MODE_INNER (mode), first);
- vec_dup = gen_rtx_VEC_DUPLICATE (mode, broadcast_mem);
- validate_change (insn, loc, vec_dup, 0);
-
- /* At most 1 memory_operand in an insn. */
- return;
- }
-}
-
/* At entry of the nearest common dominator for basic blocks with
conversions, generate a single
vxorps %xmmN, %xmmN, %xmmN
@@ -2249,10 +2174,6 @@
if (!NONDEBUG_INSN_P (insn))
continue;
- /* Handle AVX512 embedded broadcast here to save compile time. */
- if (TARGET_AVX512F)
- replace_constant_pool_with_broadcast (insn);
-
set = single_set (insn);
if (!set)
continue;
@@ -2384,16 +2305,6 @@
return 0;
}
-static bool
-remove_partial_avx_dependency_gate ()
-{
- return (TARGET_AVX
- && TARGET_SSE_PARTIAL_REG_DEPENDENCY
- && TARGET_SSE_MATH
- && optimize
- && optimize_function_for_speed_p (cfun));
-}
-
namespace {
const pass_data pass_data_remove_partial_avx_dependency =
@@ -2419,7 +2330,11 @@
/* opt_pass methods: */
virtual bool gate (function *)
{
- return remove_partial_avx_dependency_gate ();
+ return (TARGET_AVX
+ && TARGET_SSE_PARTIAL_REG_DEPENDENCY
+ && TARGET_SSE_MATH
+ && optimize
+ && optimize_function_for_speed_p (cfun));
}
virtual unsigned int execute (function *)
@@ -2436,68 +2351,6 @@
return new pass_remove_partial_avx_dependency (ctxt);
}
-/* For const vector having one duplicated value, there's no need to put
- whole vector in the constant pool when target supports embedded broadcast. */
-static unsigned int
-constant_pool_broadcast (void)
-{
- timevar_push (TV_MACH_DEP);
- rtx_insn *insn;
-
- for (insn = get_insns (); insn; insn = NEXT_INSN (insn))
- {
- if (INSN_P (insn))
- replace_constant_pool_with_broadcast (insn);
- }
- timevar_pop (TV_MACH_DEP);
- return 0;
-}
-
-namespace {
-
-const pass_data pass_data_constant_pool_broadcast =
-{
- RTL_PASS, /* type */
- "cpb", /* name */
- OPTGROUP_NONE, /* optinfo_flags */
- TV_MACH_DEP, /* tv_id */
- 0, /* properties_required */
- 0, /* properties_provided */
- 0, /* properties_destroyed */
- 0, /* todo_flags_start */
- TODO_df_finish, /* todo_flags_finish */
-};
-
-class pass_constant_pool_broadcast : public rtl_opt_pass
-{
-public:
- pass_constant_pool_broadcast (gcc::context *ctxt)
- : rtl_opt_pass (pass_data_constant_pool_broadcast, ctxt)
- {}
-
- /* opt_pass methods: */
- virtual bool gate (function *)
- {
- /* Return false if rpad pass gate is true.
- replace_constant_pool_with_broadcast is called
- from both this pass and rpad pass. */
- return (TARGET_AVX512F && !remove_partial_avx_dependency_gate ());
- }
-
- virtual unsigned int execute (function *)
- {
- return constant_pool_broadcast ();
- }
-}; // class pass_cpb
-
-} // anon namespace
-
-rtl_opt_pass *
-make_pass_constant_pool_broadcast (gcc::context *ctxt)
-{
- return new pass_constant_pool_broadcast (ctxt);
-}
-
/* This compares the priority of target features in function DECL1
and DECL2. It returns positive value if DECL1 is higher priority,
negative value if DECL2 is higher priority and 0 if they are the
diff --git a/gcc/config/i386/i386-passes.def b/gcc/config/i386/i386-passes.def
index 44df00e..29baf8a 100644
--- a/gcc/config/i386/i386-passes.def
+++ b/gcc/config/i386/i386-passes.def
@@ -33,4 +33,3 @@
INSERT_PASS_BEFORE (pass_shorten_branches, 1, pass_insert_endbr_and_patchable_area);
INSERT_PASS_AFTER (pass_combine, 1, pass_remove_partial_avx_dependency);
- INSERT_PASS_AFTER (pass_combine, 1, pass_constant_pool_broadcast);
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index 51376fc..07ac02a 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -395,4 +395,3 @@
(gcc::context *);
extern rtl_opt_pass *make_pass_remove_partial_avx_dependency
(gcc::context *);
-extern rtl_opt_pass *make_pass_constant_pool_broadcast (gcc::context *);
diff --git a/gcc/testsuite/gcc.target/i386/fuse-caller-save-xmm.c b/gcc/testsuite/gcc.target/i386/fuse-caller-save-xmm.c
index 4deff93..b0d3dc3 100644
--- a/gcc/testsuite/gcc.target/i386/fuse-caller-save-xmm.c
+++ b/gcc/testsuite/gcc.target/i386/fuse-caller-save-xmm.c
@@ -6,7 +6,7 @@
static v2df __attribute__((noinline))
bar (v2df a)
{
- return a + (v2df){ 3.0, 3.0 };
+ return a + (v2df){ 3.0, 4.0 };
}
v2df __attribute__((noinline))