i386: Add V8QI and other 64bit vector permutations [PR89021]

In addition to V8QI permutations, several other missing permutations of
64bit vector modes are added for TARGET_SSSE3 and TARGET_SSE4_1 targets.

2021-06-10  Uroš Bizjak  <ubizjak@gmail.com>

gcc/
	PR target/89021
	* config/i386/i386-expand.c (ix86_split_mmx_punpck):
	Handle V2SF mode.  Emit SHUFPS to fix up unpack-high for V2SF mode.
	(expand_vec_perm_blend): Handle 64bit modes for TARGET_SSE4_1.
	(expand_vec_perm_pshufb): Handle 64bit modes for TARGET_SSSE3.
	(expand_vec_perm_pblendv): Handle 64bit modes for TARGET_SSE4_1.
	(expand_vec_perm_interleave2): Handle 64bit modes.
	(expand_vec_perm_even_odd_pack): Handle V8QI mode.
	(expand_vec_perm_even_odd_1): Ditto.
	(ix86_vectorize_vec_perm_const): Ditto.
	* config/i386/sse.md (UNSPEC_PSHUFB): Move from here ...
	* config/i386/i386.md: ... to here.
	* config/i386/mmx.md (*vec_interleave_lowv2sf):
	New insn_and_split pattern.
	(*vec_interleave_highv2sf): Ditto.
	(mmx_pshufbv8qi3): New insn pattern.
	(*mmx_pblendw): Ditto.
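
For illustration only (this example is mine, not part of the patch or its
testsuite): assuming x86_64 with -O2 -mssse3, where TARGET_MMX_WITH_SSE
holds, a single-operand V8QI permutation such as the byte reversal below
can now be expanded with one pshufb on an SSE register instead of falling
back to scalar code:

  typedef unsigned char v8qi __attribute__((vector_size (8)));

  /* Reverse the eight bytes of x; expand_vec_perm_pshufb pads the
     8-byte selector to 16 bytes with -1 and emits pshufb.  */
  v8qi
  reverse_v8qi (v8qi x)
  {
    return __builtin_shuffle (x, (v8qi) { 7, 6, 5, 4, 3, 2, 1, 0 });
  }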
diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
index c3ce21b..9ee5257 100644
--- a/gcc/config/i386/i386-expand.c
+++ b/gcc/config/i386/i386-expand.c
@@ -798,6 +798,15 @@
 					  GEN_INT (1), GEN_INT (5)));
       break;
 
+    case E_V2SFmode:
+      sse_mode = V4SFmode;
+      double_sse_mode = V8SFmode;
+      mask = gen_rtx_PARALLEL (VOIDmode,
+			       gen_rtvec (4,
+					  GEN_INT (0), GEN_INT (4),
+					  GEN_INT (1), GEN_INT (5)));
+      break;
+
     default:
       gcc_unreachable ();
     }
@@ -812,14 +821,26 @@
   rtx insn = gen_rtx_SET (dest, op2);
   emit_insn (insn);
 
+  /* Move bits 64:127 to bits 0:63.  */
   if (high_p)
     {
-      /* Move bits 64:127 to bits 0:63.  */
-      mask = gen_rtx_PARALLEL (VOIDmode,
-			       gen_rtvec (4, GEN_INT (2), GEN_INT (3),
-					  GEN_INT (0), GEN_INT (0)));
-      dest = lowpart_subreg (V4SImode, dest, GET_MODE (dest));
-      op1 = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
+      if (sse_mode == V4SFmode)
+	{
+	  mask = gen_rtx_PARALLEL (VOIDmode,
+				   gen_rtvec (4, GEN_INT (2), GEN_INT (3),
+					      GEN_INT (4), GEN_INT (5)));
+	  op2 = gen_rtx_VEC_CONCAT (V8SFmode, dest, dest);
+	  op1 = gen_rtx_VEC_SELECT (V4SFmode, op2, mask);
+	}
+      else
+	{
+	  mask = gen_rtx_PARALLEL (VOIDmode,
+				   gen_rtvec (4, GEN_INT (2), GEN_INT (3),
+					      GEN_INT (0), GEN_INT (1)));
+	  dest = lowpart_subreg (V4SImode, dest, GET_MODE (dest));
+	  op1 = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
+	}
+
       insn = gen_rtx_SET (dest, op1);
       emit_insn (insn);
     }
@@ -17062,7 +17083,8 @@
     ;
   else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
     ;
-  else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
+  else if (TARGET_SSE4_1 && (GET_MODE_SIZE (vmode) == 16
+			     || GET_MODE_SIZE (vmode) == 8))
     ;
   else
     return false;
@@ -17095,6 +17117,7 @@
     case E_V8SFmode:
     case E_V2DFmode:
     case E_V4SFmode:
+    case E_V4HImode:
     case E_V8HImode:
     case E_V8SImode:
     case E_V32HImode:
@@ -17111,6 +17134,12 @@
       vmode = V8HImode;
       goto do_subreg;
 
+    case E_V2SImode:
+      for (i = 0; i < 2; ++i)
+	mask |= (d->perm[i] >= 2 ? 3 : 0) << (i * 2);
+      vmode = V4HImode;
+      goto do_subreg;
+
     case E_V4SImode:
       for (i = 0; i < 4; ++i)
 	mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
@@ -17132,7 +17161,9 @@
 	    vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
 	    vperm = force_reg (vmode, vperm);
 
-	    if (GET_MODE_SIZE (vmode) == 16)
+	    if (GET_MODE_SIZE (vmode) == 8)
+	      emit_insn (gen_mmx_pblendvb64 (target, op0, op1, vperm));
+	    else if (GET_MODE_SIZE (vmode) == 16)
 	      emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
 	    else
 	      emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
@@ -17152,6 +17183,16 @@
       op1 = gen_lowpart (vmode, op1);
       break;
 
+    case E_V8QImode:
+      for (i = 0; i < 8; i += 2)
+	if (d->perm[i] + 1 != d->perm[i + 1])
+	  goto use_pblendvb;
+
+      for (i = 0; i < 4; ++i)
+	mask |= (d->perm[i * 2] >= 8) << i;
+      vmode = V4HImode;
+      goto do_subreg;
+
     case E_V32QImode:
       /* See if bytes move in pairs.  If not, vpblendvb must be used.  */
       for (i = 0; i < 32; i += 2)
@@ -17384,7 +17425,13 @@
     }
   else
     {
-      if (GET_MODE_SIZE (d->vmode) == 16)
+      if (GET_MODE_SIZE (d->vmode) == 8)
+	{
+	  if (!TARGET_SSSE3)
+	    return false;
+	  vmode = V8QImode;
+	}
+      else if (GET_MODE_SIZE (d->vmode) == 16)
 	{
 	  if (!TARGET_SSSE3)
 	    return false;
@@ -17506,12 +17553,12 @@
       eltsz = GET_MODE_UNIT_SIZE (d->vmode);
       if (!d->one_operand_p)
 	mask = 2 * nelt - 1;
-      else if (vmode == V16QImode)
-	mask = nelt - 1;
       else if (vmode == V64QImode)
 	mask = nelt / 4 - 1;
-      else
+      else if (vmode == V32QImode)
 	mask = nelt / 2 - 1;
+      else
+	mask = nelt - 1;
 
       for (i = 0; i < nelt; ++i)
 	{
@@ -17521,9 +17568,18 @@
 	}
     }
 
-  vperm = gen_rtx_CONST_VECTOR (vmode,
-				gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
-  vperm = force_reg (vmode, vperm);
+  machine_mode vpmode = vmode;
+
+  if (vmode == V8QImode)
+    {
+      for (i = nelt; i < 16; ++i)
+	rperm[i] = constm1_rtx;
+      vpmode = V16QImode;
+    }
+
+  vperm = gen_rtx_CONST_VECTOR (vpmode,
+				gen_rtvec_v (GET_MODE_NUNITS (vpmode), rperm));
+  vperm = force_reg (vpmode, vperm);
 
   target = d->target;
   if (d->vmode != vmode)
@@ -17531,7 +17587,9 @@
   op0 = gen_lowpart (vmode, d->op0);
   if (d->one_operand_p)
     {
-      if (vmode == V16QImode)
+      if (vmode == V8QImode)
+	emit_insn (gen_mmx_pshufbv8qi3 (target, op0, vperm));
+      else if (vmode == V16QImode)
 	emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
       else if (vmode == V32QImode)
 	emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
@@ -18041,7 +18099,8 @@
     ;
   else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
     ;
-  else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
+  else if (TARGET_SSE4_1 && (GET_MODE_SIZE (vmode) == 8
+			     || GET_MODE_SIZE (vmode) == 16))
     ;
   else
     return false;
@@ -18120,7 +18179,8 @@
   rtx_insn *seq;
   bool ok, same_halves = false;
 
-  if (GET_MODE_SIZE (d->vmode) == 16)
+  if (GET_MODE_SIZE (d->vmode) == 8
+      || GET_MODE_SIZE (d->vmode) == 16)
     {
       if (d->one_operand_p)
 	return false;
@@ -18155,7 +18215,44 @@
   memset (remap, 0xff, sizeof (remap));
   dremap = *d;
 
-  if (GET_MODE_SIZE (d->vmode) == 16)
+  if (GET_MODE_SIZE (d->vmode) == 8)
+    {
+      unsigned HOST_WIDE_INT h1, h2, h3, h4;
+
+      /* Split the two input vectors into 4 halves.  */
+      h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
+      h2 = h1 << nelt2;
+      h3 = h2 << nelt2;
+      h4 = h3 << nelt2;
+
+      /* If the elements from the low halves use interleave low,
+	 and similarly for interleave high.  */
+      if ((contents & (h1 | h3)) == contents)
+	{
+	  /* punpckl* */
+	  for (i = 0; i < nelt2; ++i)
+	    {
+	      remap[i] = i * 2;
+	      remap[i + nelt] = i * 2 + 1;
+	      dremap.perm[i * 2] = i;
+	      dremap.perm[i * 2 + 1] = i + nelt;
+	    }
+	}
+      else if ((contents & (h2 | h4)) == contents)
+	{
+	  /* punpckh* */
+	  for (i = 0; i < nelt2; ++i)
+	    {
+	      remap[i + nelt2] = i * 2;
+	      remap[i + nelt + nelt2] = i * 2 + 1;
+	      dremap.perm[i * 2] = i + nelt2;
+	      dremap.perm[i * 2 + 1] = i + nelt + nelt2;
+	    }
+	}
+      else
+	return false;
+    }
+  else if (GET_MODE_SIZE (d->vmode) == 16)
     {
       unsigned HOST_WIDE_INT h1, h2, h3, h4;
 
@@ -19328,9 +19425,9 @@
 }
 
 /* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
-   and extract-odd permutations of two V16QI, V8HI, V16HI or V32QI operands
-   with two "and" and "pack" or two "shift" and "pack" insns.  We should
-   have already failed all two instruction sequences.  */
+   and extract-odd permutations of two V8QI, V8HI, V16QI, V16HI or V32QI
+   operands with two "and" and "pack" or two "shift" and "pack" insns.
+   We should have already failed all two instruction sequences.  */
 
 static bool
 expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
@@ -19359,6 +19456,15 @@
       gen_pack = gen_sse4_1_packusdw;
       gen_shift = gen_lshrv4si3;
       break;
+    case E_V8QImode:
+      /* No check as all instructions are SSE2.  */
+      c = 0xff;
+      s = 8;
+      half_mode = V4HImode;
+      gen_and = gen_andv4hi3;
+      gen_pack = gen_mmx_packuswb;
+      gen_shift = gen_lshrv4hi3;
+      break;
     case E_V16QImode:
       /* No check as all instructions are SSE2.  */
       c = 0xff;
@@ -19391,8 +19497,8 @@
       end_perm = true;
       break;
     default:
-      /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than
-	 general shuffles.  */
+      /* Only V8QI, V8HI, V16QI, V16HI and V32QI modes
+	 are more profitable than general shuffles.  */
       return false;
     }
 
@@ -19621,6 +19727,7 @@
 	}
       break;
 
+    case E_V8QImode:
     case E_V16QImode:
       return expand_vec_perm_even_odd_pack (d);
 
@@ -19786,6 +19893,41 @@
       /* These are always implementable using standard shuffle patterns.  */
       gcc_unreachable ();
 
+    case E_V8QImode:
+      /* These can be implemented via interleave.  We save one insn by
+	 stopping once we have promoted to V2SImode and then use pshufd.  */
+      if (d->testing_p)
+	return true;
+      do
+	{
+	  rtx dest;
+	  rtx (*gen) (rtx, rtx, rtx)
+	    = vmode == V8QImode ? gen_mmx_punpcklbw
+				: gen_mmx_punpcklwd;
+
+	  if (elt >= nelt2)
+	    {
+	      gen = vmode == V8QImode ? gen_mmx_punpckhbw
+				      : gen_mmx_punpckhwd;
+	      elt -= nelt2;
+	    }
+	  nelt2 /= 2;
+
+	  dest = gen_reg_rtx (vmode);
+	  emit_insn (gen (dest, op0, op0));
+	  vmode = get_mode_wider_vector (vmode);
+	  op0 = gen_lowpart (vmode, dest);
+	}
+      while (vmode != V2SImode);
+
+      memset (perm2, elt, 2);
+      dest = gen_reg_rtx (V2SImode);
+      ok = expand_vselect (dest, op0, perm2, 2, d->testing_p);
+      gcc_assert (ok);
+      if (!d->testing_p)
+	emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
+      return true;
+
     case E_V8HImode:
     case E_V16QImode:
       /* These can be implemented via interleave.  We save one insn by
@@ -20289,6 +20431,7 @@
     case E_V2SFmode:
     case E_V2SImode:
     case E_V4HImode:
+    case E_V8QImode:
       if (!TARGET_MMX_WITH_SSE)
 	return false;
       break;
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 5ff49ec..7743c61e 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -119,6 +119,7 @@
   UNSPEC_MASKMOV
   UNSPEC_MOVMSK
   UNSPEC_BLENDV
+  UNSPEC_PSHUFB
   UNSPEC_RCP
   UNSPEC_RSQRT
   UNSPEC_PSADBW
diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index 0a17a54..f9e7d27 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -1198,6 +1198,40 @@
    (set_attr "prefix" "maybe_vex,orig")
    (set_attr "mode" "V4SF")])
 
+(define_insn_and_split "*vec_interleave_lowv2sf"
+  [(set (match_operand:V2SF 0 "register_operand" "=x,v")
+	(vec_select:V2SF
+	  (vec_concat:V4SF
+	    (match_operand:V2SF 1 "register_operand" "0,v")
+	    (match_operand:V2SF 2 "register_operand" "x,v"))
+	  (parallel [(const_int 0) (const_int 2)])))]
+  "TARGET_MMX_WITH_SSE"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+  "ix86_split_mmx_punpck (operands, false); DONE;"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sselog")
+   (set_attr "prefix" "orig,maybe_evex")
+   (set_attr "mode" "V4SF")])
+
+(define_insn_and_split "*vec_interleave_highv2sf"
+  [(set (match_operand:V2SF 0 "register_operand" "=x,v")
+	(vec_select:V2SF
+	  (vec_concat:V4SF
+	    (match_operand:V2SF 1 "register_operand" "0,v")
+	    (match_operand:V2SF 2 "register_operand" "x,v"))
+	  (parallel [(const_int 1) (const_int 3)])))]
+  "TARGET_MMX_WITH_SSE"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+  "ix86_split_mmx_punpck (operands, true); DONE;"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sselog")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "mode" "V4SF")])
+
 (define_insn "*vec_dupv2sf"
   [(set (match_operand:V2SF 0 "register_operand" "=y,Yv,x")
 	(vec_duplicate:V2SF
@@ -2415,7 +2449,7 @@
    pack<s_trunsuffix>swb\t{%2, %0|%0, %2}
    #
    #"
-  "TARGET_SSE2 && reload_completed
+  "&& reload_completed
    && SSE_REGNO_P (REGNO (operands[0]))"
   [(const_int 0)]
   "ix86_split_mmx_pack (operands, <any_s_truncate:CODE>); DONE;"
@@ -2435,7 +2469,7 @@
    packssdw\t{%2, %0|%0, %2}
    #
    #"
-  "TARGET_SSE2 && reload_completed
+  "&& reload_completed
    && SSE_REGNO_P (REGNO (operands[0]))"
   [(const_int 0)]
   "ix86_split_mmx_pack (operands, SS_TRUNCATE); DONE;"
@@ -2458,7 +2492,7 @@
    punpckhbw\t{%2, %0|%0, %2}
    #
    #"
-  "TARGET_SSE2 && reload_completed
+  "&& reload_completed
    && SSE_REGNO_P (REGNO (operands[0]))"
   [(const_int 0)]
   "ix86_split_mmx_punpck (operands, true); DONE;"
@@ -2481,7 +2515,7 @@
    punpcklbw\t{%2, %0|%0, %k2}
    #
    #"
-  "TARGET_SSE2 && reload_completed
+  "&& reload_completed
    && SSE_REGNO_P (REGNO (operands[0]))"
   [(const_int 0)]
   "ix86_split_mmx_punpck (operands, false); DONE;"
@@ -2502,7 +2536,7 @@
    punpckhwd\t{%2, %0|%0, %2}
    #
    #"
-  "TARGET_SSE2 && reload_completed
+  "&& reload_completed
    && SSE_REGNO_P (REGNO (operands[0]))"
   [(const_int 0)]
   "ix86_split_mmx_punpck (operands, true); DONE;"
@@ -2523,7 +2557,7 @@
    punpcklwd\t{%2, %0|%0, %k2}
    #
    #"
-  "TARGET_SSE2 && reload_completed
+  "&& reload_completed
    && SSE_REGNO_P (REGNO (operands[0]))"
   [(const_int 0)]
   "ix86_split_mmx_punpck (operands, false); DONE;"
@@ -2544,7 +2578,7 @@
    punpckhdq\t{%2, %0|%0, %2}
    #
    #"
-  "TARGET_SSE2 && reload_completed
+  "&& reload_completed
    && SSE_REGNO_P (REGNO (operands[0]))"
   [(const_int 0)]
   "ix86_split_mmx_punpck (operands, true); DONE;"
@@ -2565,7 +2599,7 @@
    punpckldq\t{%2, %0|%0, %k2}
    #
    #"
-  "TARGET_SSE2 && reload_completed
+  "&& reload_completed
    && SSE_REGNO_P (REGNO (operands[0]))"
   [(const_int 0)]
   "ix86_split_mmx_punpck (operands, false); DONE;"
@@ -2756,6 +2790,24 @@
    (set_attr "prefix" "maybe_vex")
    (set_attr "mode" "TI")])
 
+(define_insn "mmx_pshufbv8qi3"
+  [(set (match_operand:V8QI 0 "register_operand" "=x,Yw")
+	(unspec:V8QI
+	  [(match_operand:V8QI 1 "register_operand" "0,Yw")
+	   (match_operand:V16QI 2 "vector_operand" "xBm,Ywm")]
+	  UNSPEC_PSHUFB))]
+  "TARGET_SSSE3 && TARGET_MMX_WITH_SSE"
+  "@
+   pshufb\t{%2, %0|%0, %2}
+   vpshufb\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sselog1")
+   (set_attr "prefix_data16" "1,*")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "orig,maybe_evex")
+   (set_attr "btver2_decode" "vector")
+   (set_attr "mode" "TI")])
+
 (define_expand "mmx_pshufw"
   [(match_operand:V4HI 0 "register_operand")
    (match_operand:V4HI 1 "register_mmxmem_operand")
@@ -2828,6 +2880,24 @@
    (set_attr "length_immediate" "1")
    (set_attr "mode" "TI")])
 
+(define_insn "*mmx_pblendw"
+  [(set (match_operand:V4HI 0 "register_operand" "=Yr,*x,x")
+	(vec_merge:V4HI
+	  (match_operand:V4HI 2 "register_operand" "Yr,*x,x")
+	  (match_operand:V4HI 1 "register_operand" "0,0,x")
+	  (match_operand:SI 3 "const_0_to_15_operand" "n,n,n")))]
+  "TARGET_SSE4_1 && TARGET_MMX_WITH_SSE"
+  "@
+   pblendw\t{%3, %2, %0|%0, %2, %3}
+   pblendw\t{%3, %2, %0|%0, %2, %3}
+   vpblendw\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "isa" "noavx,noavx,avx")
+   (set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "orig,orig,vex")
+   (set_attr "mode" "TI")])
+
 ;; Optimize V2SImode load from memory, swapping the elements and
 ;; storing back into the memory into DImode rotate of the memory by 32.
 (define_split
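
A possible consumer of the new *mmx_pblendw pattern (an illustrative
sketch of mine, assuming -O2 -msse4.1 on x86_64): a constant blend of two
V4HI vectors, which expand_vec_perm_blend can now emit as a single pblendw
with an immediate mask:

  typedef short v4hi __attribute__((vector_size (8)));

  /* Elements 0 and 2 from a, elements 1 and 3 from b.  */
  v4hi
  blend_v4hi (v4hi a, v4hi b)
  {
    return __builtin_shuffle (a, b, (v4hi) { 0, 5, 2, 7 });
  }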
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 2a34756..8403a07 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -28,7 +28,6 @@
   UNSPEC_LDDQU
 
   ;; SSSE3
-  UNSPEC_PSHUFB
   UNSPEC_PSIGN
   UNSPEC_PALIGNR
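
Finally, a hedged sketch of what the new V2SF interleave patterns enable
(hypothetical example, assuming x86_64 where TARGET_MMX_WITH_SSE holds):
interleave-low splits to unpcklps, and interleave-high to unpcklps followed
by the SHUFPS fixup added to ix86_split_mmx_punpck above:

  typedef float v2sf __attribute__((vector_size (8)));
  typedef int v2si __attribute__((vector_size (8)));

  /* { a[0], b[0] }: matches *vec_interleave_lowv2sf.  */
  v2sf
  ilo (v2sf a, v2sf b)
  {
    return __builtin_shuffle (a, b, (v2si) { 0, 2 });
  }

  /* { a[1], b[1] }: matches *vec_interleave_highv2sf.  */
  v2sf
  ihi (v2sf a, v2sf b)
  {
    return __builtin_shuffle (a, b, (v2si) { 1, 3 });
  }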