i386: Add variable vec_set for 64bit vectors [PR97194]

To generate sane code a SSE4.1 variable PBLENDV instruction is needed.

2021-06-17  Uroš Bizjak  <ubizjak@gmail.com>

gcc/
	PR target/97194
	* config/i386/i386-expand.c (expand_vector_set_var):
	Handle V2FS mode remapping.  Pass TARGET_MMX_WITH_SSE to
	ix86_expand_vector_init_duplicate.
	(ix86_expand_vector_init_duplicate): Emit insv_1 for
	QImode for !TARGET_PARTIAL_REG_STALL.
	* config/i386/predicates.md (vec_setm_mmx_operand): New predicate.
	* config/i386/mmx.md (vec_setv2sf): Use vec_setm_mmx_operand
	as operand 2 predicate.  Call ix86_expand_vector_set_var
	for non-constant index operand.
	(vec_setv2si): Ditto.
	(vec_setv4hi): Ditto.
	(vec_setv8qi): ditto.

gcc/testsuite/

	PR target/97194
	* gcc.target/i386/sse4_1-vec-set-1.c: New test.
	* gcc.target/i386/sse4_1-vec-set-2.c: ditto.
diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
index eb6f9b0..8f4e4e4 100644
--- a/gcc/config/i386/i386-expand.c
+++ b/gcc/config/i386/i386-expand.c
@@ -13811,10 +13811,17 @@
 	wsmode = GET_MODE_INNER (wvmode);
 
 	val = convert_modes (wsmode, smode, val, true);
-	x = expand_simple_binop (wsmode, ASHIFT, val,
-				 GEN_INT (GET_MODE_BITSIZE (smode)),
-				 NULL_RTX, 1, OPTAB_LIB_WIDEN);
-	val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
+
+	if (smode == QImode && !TARGET_PARTIAL_REG_STALL)
+	  emit_insn (gen_insv_1 (wsmode, val, val));
+	else
+	  {
+	    x = expand_simple_binop (wsmode, ASHIFT, val,
+				     GEN_INT (GET_MODE_BITSIZE (smode)),
+				     NULL_RTX, 1, OPTAB_LIB_WIDEN);
+	    val = expand_simple_binop (wsmode, IOR, val, x, x, 1,
+				       OPTAB_LIB_WIDEN);
+	  }
 
 	x = gen_reg_rtx (wvmode);
 	ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
@@ -14788,6 +14795,9 @@
 	case E_V8DFmode:
 	  cmp_mode = V8DImode;
 	  break;
+	case E_V2SFmode:
+	  cmp_mode = V2SImode;
+	  break;
 	case E_V4SFmode:
 	  cmp_mode = V4SImode;
 	  break;
@@ -14809,9 +14819,11 @@
   idxv = gen_reg_rtx (cmp_mode);
   idx_tmp = convert_to_mode (GET_MODE_INNER (cmp_mode), idx, 1);
 
-  ok = ix86_expand_vector_init_duplicate (false, mode, valv, val);
+  ok = ix86_expand_vector_init_duplicate (TARGET_MMX_WITH_SSE,
+					  mode, valv, val);
   gcc_assert (ok);
-  ok = ix86_expand_vector_init_duplicate (false, cmp_mode, idxv, idx_tmp);
+  ok = ix86_expand_vector_init_duplicate (TARGET_MMX_WITH_SSE,
+					  cmp_mode, idxv, idx_tmp);
   gcc_assert (ok);
   vec[0] = target;
   vec[1] = valv;
diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index 59a16f4..a107ac5 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -1279,11 +1279,14 @@
 (define_expand "vec_setv2sf"
   [(match_operand:V2SF 0 "register_operand")
    (match_operand:SF 1 "register_operand")
-   (match_operand 2 "const_int_operand")]
+   (match_operand 2 "vec_setm_mmx_operand")]
   "TARGET_MMX || TARGET_MMX_WITH_SSE"
 {
-  ix86_expand_vector_set (TARGET_MMX_WITH_SSE, operands[0], operands[1],
-			  INTVAL (operands[2]));
+  if (CONST_INT_P (operands[2]))
+    ix86_expand_vector_set (TARGET_MMX_WITH_SSE, operands[0], operands[1],
+			    INTVAL (operands[2]));
+  else
+    ix86_expand_vector_set_var (operands[0], operands[1], operands[2]);
   DONE;
 })
 
@@ -2989,11 +2992,14 @@
 (define_expand "vec_setv2si"
   [(match_operand:V2SI 0 "register_operand")
    (match_operand:SI 1 "register_operand")
-   (match_operand 2 "const_int_operand")]
+   (match_operand 2 "vec_setm_mmx_operand")]
   "TARGET_MMX || TARGET_MMX_WITH_SSE"
 {
-  ix86_expand_vector_set (TARGET_MMX_WITH_SSE, operands[0], operands[1],
-			  INTVAL (operands[2]));
+  if (CONST_INT_P (operands[2]))
+    ix86_expand_vector_set (TARGET_MMX_WITH_SSE, operands[0], operands[1],
+			    INTVAL (operands[2]));
+  else
+    ix86_expand_vector_set_var (operands[0], operands[1], operands[2]);
   DONE;
 })
 
@@ -3145,11 +3151,14 @@
 (define_expand "vec_setv4hi"
   [(match_operand:V4HI 0 "register_operand")
    (match_operand:HI 1 "register_operand")
-   (match_operand 2 "const_int_operand")]
+   (match_operand 2 "vec_setm_mmx_operand")]
   "TARGET_MMX || TARGET_MMX_WITH_SSE"
 {
-  ix86_expand_vector_set (TARGET_MMX_WITH_SSE, operands[0], operands[1],
-			  INTVAL (operands[2]));
+  if (CONST_INT_P (operands[2]))
+    ix86_expand_vector_set (TARGET_MMX_WITH_SSE, operands[0], operands[1],
+			    INTVAL (operands[2]));
+  else
+    ix86_expand_vector_set_var (operands[0], operands[1], operands[2]);
   DONE;
 })
 
@@ -3177,11 +3186,14 @@
 (define_expand "vec_setv8qi"
   [(match_operand:V8QI 0 "register_operand")
    (match_operand:QI 1 "register_operand")
-   (match_operand 2 "const_int_operand")]
+   (match_operand 2 "vec_setm_mmx_operand")]
   "TARGET_SSE4_1 && TARGET_MMX_WITH_SSE"
 {
-  ix86_expand_vector_set (TARGET_MMX_WITH_SSE, operands[0], operands[1],
-			  INTVAL (operands[2]));
+  if (CONST_INT_P (operands[2]))
+    ix86_expand_vector_set (TARGET_MMX_WITH_SSE, operands[0], operands[1],
+			    INTVAL (operands[2]));
+  else
+    ix86_expand_vector_set_var (operands[0], operands[1], operands[2]);
   DONE;
 })
 
diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index 3dd134e..e7a8968 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -1026,6 +1026,12 @@
 	    (match_test "TARGET_AVX2"))
        (match_code "const_int")))
 
+(define_predicate "vec_setm_mmx_operand"
+  (ior (and (match_operand 0 "register_operand")
+	    (match_test "TARGET_SSE4_1")
+	    (match_test "TARGET_MMX_WITH_SSE"))
+       (match_code "const_int")))
+
 ;; True for registers, or 1 or -1.  Used to optimize double-word shifts.
 (define_predicate "reg_or_pm1_operand"
   (ior (match_operand 0 "register_operand")
diff --git a/gcc/testsuite/gcc.target/i386/sse4_1-vec-set-1.c b/gcc/testsuite/gcc.target/i386/sse4_1-vec-set-1.c
new file mode 100644
index 0000000..7c7fd34
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse4_1-vec-set-1.c
@@ -0,0 +1,26 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-msse4.1 -O2" } */
+/* { dg-final { scan-assembler-times {(?n)v?pcmpeq[bwd]} 4 } } */
+/* { dg-final { scan-assembler-times {(?n)v?p?blendv} 4 } } */
+
+typedef char v8qi __attribute__ ((vector_size (8)));
+typedef short v4hi __attribute__ ((vector_size (8)));
+typedef int v2si __attribute__ ((vector_size (8)));
+typedef float v2sf __attribute__ ((vector_size (8)));
+
+#define FOO(VTYPE, TYPE)			\
+  VTYPE						\
+  __attribute__ ((noipa))			\
+  foo_##VTYPE (VTYPE a, TYPE b, unsigned int c)	\
+  {						\
+    a[c] = b;					\
+    return a;					\
+  }						\
+
+FOO (v8qi, char);
+
+FOO (v4hi, short);
+
+FOO (v2si, int);
+
+FOO (v2sf, float);
diff --git a/gcc/testsuite/gcc.target/i386/sse4_1-vec-set-2.c b/gcc/testsuite/gcc.target/i386/sse4_1-vec-set-2.c
new file mode 100644
index 0000000..24f8041
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse4_1-vec-set-2.c
@@ -0,0 +1,45 @@
+/* { dg-do run { target { ! ia32 } } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+
+#ifndef CHECK
+#define CHECK "sse4_1-check.h"
+#endif
+
+#ifndef TEST
+#define TEST sse4_1_test
+#endif
+
+#include CHECK
+
+#include "sse4_1-vec-set-1.c"
+
+#define CALC_TEST(vtype, type, N, idx)				\
+do								\
+  {								\
+    int i,val = idx * idx - idx * 3 + 16;			\
+    type res[N],exp[N];						\
+    vtype resv;							\
+    for (i = 0; i < N; i++)					\
+      {								\
+	res[i] = i * i - i * 3 + 15;				\
+	exp[i] = res[i];					\
+      }								\
+    exp[idx] = val;						\
+    resv = foo_##vtype (*(vtype *)&res[0], val, idx);		\
+    for (i = 0; i < N; i++)					\
+      {								\
+	if (resv[i] != exp[i])					\
+	  abort ();						\
+      }								\
+  }								\
+while (0)
+
+static void
+TEST (void)
+{
+  CALC_TEST (v8qi, char, 8, 5);
+  CALC_TEST (v4hi, short, 4, 2);
+  CALC_TEST (v2si, int, 2, 1);
+}