/* gcc/testsuite/gcc.target/i386/pr92645-4.c - gcc - Git at Google */

 /* { dg-do compile } */
 /* { dg-options "-O2 -mavx2 -fdump-tree-optimized -Wno-psabi -mno-avx512f" } */

 /* GCC vector-extension types: a 16-byte vector of unsigned int, a 32-byte
    vector of unsigned short and a 16-byte vector of unsigned char.  On the
    i386 targets this test is written for, that is 4 x 32-bit, 16 x 16-bit
    and 16 x 8-bit lanes respectively.  */
 typedef unsigned int u32v4 __attribute__((vector_size(16)));
 typedef unsigned short u16v16 __attribute__((vector_size(32)));
 typedef unsigned char u8v16 __attribute__((vector_size(16)));

 /* Overlays a byte-lane view and a 32-bit-lane view on the same 16 bytes.  */
 union vec128 {
   u8v16 u8;
   u32v4 u32;
 };

 /* No <string.h> is included in this file; map memcpy to the builtin.  */
 #define memcpy __builtin_memcpy

 /* Widen each of the 16 unsigned char lanes of X to an unsigned short lane.
    The lanes are unsigned, so the conversion is a zero extension.  Written
    as an element-by-element constructor on purpose: the dg-final scans in
    this file count how the optimizer represents it.  */
 static u16v16 zxt(u8v16 x)
 {
   return (u16v16) {
     x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7],
     x[8], x[9], x[10], x[11], x[12], x[13], x[14], x[15]
   };
 }

 /* Convert each of the 16 unsigned short lanes of X to an unsigned char
    lane; the conversion keeps only the low 8 bits of every lane.  Written
    as an element-by-element constructor on purpose: the dg-final scans in
    this file count how the optimizer represents it.  */
 static u8v16 narrow(u16v16 x)
 {
   return (u8v16) {
     x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7],
     x[8], x[9], x[10], x[11], x[12], x[13], x[14], x[15]
   };
 }

 /* Transform SRC into DST 16 bytes at a time: every source byte is
    multiplied by a factor derived from the top byte of C, offset by the
    matching byte of C scaled by 256 plus 128, and the result divided by
    256.  NOTE(review): this looks like an alpha-blend of a constant colour
    over 32-bit pixels -- confirm against the PR.

    DST and SRC are advanced by 16 bytes and N is decreased by 4 per
    iteration; the loop stops only when N reaches exactly 0, so N has to be
    a multiple of 4 (presumably a count of 4-byte pixels).  */
 void f(char *dst, char *src, unsigned long n, unsigned c)
 {
   /* 255 minus the top byte of C ...  */
   unsigned ia = 255 - (c >> 24);
   /* ... plus 1 when that value is 128 or more, giving a range of 0..256.  */
   ia += ia >> 7;

   union vec128 c4 = {0}, ia16 = {0};
   /* Adding a scalar to a zeroed vector broadcasts it: every 32-bit lane of
      c4 becomes C.  */
   c4.u32 += c;
   /* Likewise every byte lane of ia16 becomes the low 8 bits of ia.  */
   ia16.u8 += (unsigned char)ia;

   /* Loop-invariant addend: each byte of C widened to 16 bits, shifted left
      by 8, plus 128.  */
   u16v16 c16 = (zxt(c4.u8) << 8) + 128;

   for (; n; src += 16, dst += 16, n -= 4) {
     union vec128 s;
     /* memcpy is used for the 16-byte load and store, so SRC and DST need no
        particular alignment.  */
     memcpy(&s, src, sizeof s);
     /* Widen to 16-bit lanes, multiply-add, drop the low 8 bits, narrow back
        to bytes.  */
     s.u8 = narrow((zxt(s.u8)*zxt(ia16.u8) + c16) >> 8);
     memcpy(dst, &s, sizeof s);
   }
 }

 /* { dg-final { scan-tree-dump-times "\\(vector\\(16\\) short unsigned int\\)" 3 "optimized" } } */
 /* We're missing an opportunity to, after later optimizations, combine
    a uniform CTOR with a vector promotion to a CTOR on a promoted
    element.  */
 /* { dg-final { scan-tree-dump-times "\\(vector\\(16\\) short unsigned int\\)" 2 "optimized" { xfail *-*-* } } } */
 /* { dg-final { scan-tree-dump-times "VEC_PACK_TRUNC" 1 "optimized" } } */
 /* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 2 "optimized" } } */
	/* { dg-do compile } */
	/* { dg-options "-O2 -mavx2 -fdump-tree-optimized -Wno-psabi -mno-avx512f" } */

	/* GCC vector-extension types: a 16-byte vector of unsigned int, a 32-byte
	   vector of unsigned short and a 16-byte vector of unsigned char.  */
	typedef unsigned int u32v4 __attribute__((vector_size(16)));
	typedef unsigned short u16v16 __attribute__((vector_size(32)));
	typedef unsigned char u8v16 __attribute__((vector_size(16)));

	/* Overlays a byte-lane view and a 32-bit-lane view on the same 16 bytes.  */
	union vec128 {
	u8v16 u8;
	u32v4 u32;
	};

	/* No <string.h> is included in this file; map memcpy to the builtin.  */
	#define memcpy __builtin_memcpy

	/* Widen each of the 16 unsigned char lanes of X to an unsigned short lane.
	   The lanes are unsigned, so the conversion is a zero extension.  */
	static u16v16 zxt(u8v16 x)
	{
	return (u16v16) {
	x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7],
	x[8], x[9], x[10], x[11], x[12], x[13], x[14], x[15]
	};
	}

	/* Convert each of the 16 unsigned short lanes of X to an unsigned char
	   lane; the conversion keeps only the low 8 bits of every lane.  */
	static u8v16 narrow(u16v16 x)
	{
	return (u8v16) {
	x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7],
	x[8], x[9], x[10], x[11], x[12], x[13], x[14], x[15]
	};
	}

	/* Transform SRC into DST 16 bytes at a time: every source byte is
	   multiplied by a factor derived from the top byte of C, offset by the
	   matching byte of C scaled by 256 plus 128, and the result divided by
	   256.

	   DST and SRC are byte pointers (they are handed to memcpy and advanced
	   by 16 per iteration).  N is decreased by 4 per iteration and the loop
	   stops only when it reaches exactly 0, so N has to be a multiple of 4
	   (presumably a count of 4-byte pixels).  */
	void f(char *dst, char *src, unsigned long n, unsigned c)
	{
	  /* 255 minus the top byte of C, plus 1 when that value is 128 or more
	     (range 0..256).  */
	  unsigned ia = 255 - (c >> 24);
	  ia += ia >> 7;

	  /* Adding a scalar to a zeroed vector broadcasts it into every lane.  */
	  union vec128 c4 = {0}, ia16 = {0};
	  c4.u32 += c;
	  ia16.u8 += (unsigned char)ia;

	  /* Loop-invariant addend: each byte of C widened to 16 bits, shifted
	     left by 8, plus 128.  */
	  u16v16 c16 = (zxt(c4.u8) << 8) + 128;

	  for (; n; src += 16, dst += 16, n -= 4) {
	    union vec128 s;
	    /* memcpy is used for the 16-byte load and store, so SRC and DST need
	       no particular alignment.  */
	    memcpy(&s, src, sizeof s);
	    /* Widen to 16-bit lanes, multiply-add, drop the low 8 bits, narrow
	       back to bytes.  */
	    s.u8 = narrow((zxt(s.u8)*zxt(ia16.u8) + c16) >> 8);
	    memcpy(dst, &s, sizeof s);
	  }
	}

	/* { dg-final { scan-tree-dump-times "\\(vector\\(16\\) short unsigned int\\)" 3 "optimized" } } */
	/* We're missing an opportunity to, after later optimizations, combine
	a uniform CTOR with a vector promotion to a CTOR on a promoted
	element. */
	/* { dg-final { scan-tree-dump-times "\\(vector\\(16\\) short unsigned int\\)" 2 "optimized" { xfail *-*-* } } } */
	/* { dg-final { scan-tree-dump-times "VEC_PACK_TRUNC" 1 "optimized" } } */
	/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 2 "optimized" } } */