// gcc/testsuite/g++.dg/opt/pr99728.C - gcc - Git at Google

 // PR/99728
 // { dg-do compile }
 // { dg-options "-O2 -fdump-tree-lim2-details -w -Wno-psabi" }

 // Four-lane vector of double, declared with GCC's vector_size attribute
 // (sizeof (double) * 4 bytes wide).
 typedef double __m256d __attribute__((vector_size(sizeof (double) * 4)));
 // Returns a vector whose four lanes all hold __A.
 // NOTE(review): the name and attribute set look like a local copy of the
 // AVX intrinsic, presumably so the test needs no target header -- confirm.
 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_set1_pd (double __A)
 {
   return __extension__ (__m256d){ __A, __A, __A, __A };
 }

 // Simple OO wrapper around __m256d: a struct holding one vector member
 // plus the arithmetic operators the loop in foo() uses.  Every binary
 // operator builds a local result, assigns its .v and returns it by value.
 struct Tvsimple
   {
   // The wrapped four-lane double vector.
   __m256d v;
   // In-place add of other.v into v (vector-extension +=); returns *this.
   Tvsimple &operator+=(const Tvsimple &other) {v+=other.v; return *this;}
   // Product of v with val broadcast to all four lanes.
   Tvsimple operator*(double val) const { Tvsimple res; res.v = v*_mm256_set1_pd(val); return res;}
   // Product of v with val.v.
   Tvsimple operator*(Tvsimple val) const { Tvsimple res; res.v = v*val.v; return res; }
   // Sum of v and val.v.
   Tvsimple operator+(Tvsimple val) const { Tvsimple res; res.v = v+val.v; return res; }
   // Sum of v and val broadcast to all four lanes.
   Tvsimple operator+(double val) const { Tvsimple res; res.v = v+_mm256_set1_pd(val); return res;}
   };

 // State bundle passed to foo(): ten members of type vtype.  The loop in
 // foo() reads lam1, lam2 and csq and writes lam1, lam2, p1r, p1i, p2r and
 // p2i; sth, corfac and scale are not referenced by the code in this file.
 template<typename vtype> struct s0data_s
   { vtype sth, corfac, scale, lam1, lam2, csq, p1r, p1i, p2r, p2i; };

 // Runs the recurrence loop that this testcase is built around.
 //
 //   d     state updated in place (lam1, lam2, p1r, p1i, p2r, p2i; csq is
 //         only read).
 //   coef  read at indices 2*il and 2*il+1 on each iteration.
 //   alm   read at indices 2*l .. 2*l+3 on each iteration.
 //   l     starting index; advanced by 2 per iteration.
 //   il    starting index into coef pairs; advanced by 1 per iteration.
 //   lmax  inclusive upper bound for l; the body is skipped when l > lmax.
 //
 // All three pointer/reference parameters are __restrict__-qualified.
 template<typename vtype> void foo(s0data_s<vtype> & __restrict__ d,
   const double * __restrict__ coef, const double * __restrict__ alm,
   unsigned long l, unsigned long il, unsigned long lmax)
   {
 // critical loop
   while (l<=lmax)
     {
     // Accumulate lam2 scaled by four consecutive alm values into the
     // four accumulators.
     d.p1r += d.lam2*alm[2*l];
     d.p1i += d.lam2*alm[2*l+1];
     d.p2r += d.lam2*alm[2*l+2];
     d.p2i += d.lam2*alm[2*l+3];
     // Next recurrence term, computed from the current lam2 and lam1.
     // The local is declared as Tvsimple rather than vtype.
     Tvsimple tmp = d.lam2*(d.csq*coef[2*il] + coef[2*il+1]) + d.lam1;
     // Shift the recurrence state with whole-struct assignments.
     // NOTE(review): presumably these are the "aggregate copy" that the
     // dg-final comment at the end of the file refers to -- confirm.
     d.lam1 = d.lam2;
     d.lam2 = tmp;
     ++il; l+=2;
     }
   }

 // this version has dead stores at the end of the loop
 // Explicit instantiation of foo for vtype = Tvsimple (deduced from the
 // first parameter).  No call to foo is visible in this file, so this is
 // what gets the loop compiled.
 template void foo<>(s0data_s<Tvsimple> & __restrict__ d,
   const double * __restrict__ coef, const double * __restrict__ alm,
   unsigned long l, unsigned long il, unsigned long lmax);

 // The aggregate copy in the IL should not prevent all store-motion
 // { dg-final { scan-tree-dump-times "Executing store motion" 4 "lim2" } }
	// PR/99728
	// { dg-do compile }
	// { dg-options "-O2 -fdump-tree-lim2-details -w -Wno-psabi" }

	// Four-lane vector of double, declared with GCC's vector_size attribute
	// (sizeof (double) * 4 bytes wide).
	typedef double __m256d __attribute__((vector_size(sizeof (double) * 4)));
	// Returns a vector whose four lanes all hold __A.
	// NOTE(review): the name and attribute set look like a local copy of the
	// AVX intrinsic, presumably so the test needs no target header -- confirm.
	extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
	_mm256_set1_pd (double __A)
	{
	return __extension__ (__m256d){ __A, __A, __A, __A };
	}

	// Simple OO wrapper around __m256d: a struct holding one vector member
	// plus the arithmetic operators the loop in foo() uses.  Every binary
	// operator builds a local result, assigns its .v and returns it by value.
	struct Tvsimple
	  {
	  // The wrapped four-lane double vector.
	  __m256d v;
	  // In-place add of other.v into v (vector-extension +=); returns *this.
	  Tvsimple &operator+=(const Tvsimple &other) {v+=other.v; return *this;}
	  // Product of v with val broadcast to all four lanes.
	  Tvsimple operator*(double val) const { Tvsimple res; res.v = v*_mm256_set1_pd(val); return res;}
	  // Product of v with val.v.
	  Tvsimple operator*(Tvsimple val) const { Tvsimple res; res.v = v*val.v; return res; }
	  // Sum of v and val.v.
	  Tvsimple operator+(Tvsimple val) const { Tvsimple res; res.v = v+val.v; return res; }
	  // Sum of v and val broadcast to all four lanes.
	  Tvsimple operator+(double val) const { Tvsimple res; res.v = v+_mm256_set1_pd(val); return res;}
	  };

	// State bundle passed to foo(): ten members of type vtype.  sth, corfac
	// and scale are declared here but not referenced by the code in this file.
	template<typename vtype> struct s0data_s
	{ vtype sth, corfac, scale, lam1, lam2, csq, p1r, p1i, p2r, p2i; };

	// Runs the recurrence loop that this testcase is built around.
	//
	//   d     state updated in place (lam1, lam2, p1r, p1i, p2r, p2i; csq is
	//         only read).
	//   coef  read at indices 2*il and 2*il+1 on each iteration.
	//   alm   read at indices 2*l .. 2*l+3 on each iteration.
	//   l     starting index; advanced by 2 per iteration.
	//   il    starting index into coef pairs; advanced by 1 per iteration.
	//   lmax  inclusive upper bound for l; the body is skipped when l > lmax.
	//
	// All three pointer/reference parameters are __restrict__-qualified.
	template<typename vtype> void foo(s0data_s<vtype> & __restrict__ d,
	  const double * __restrict__ coef, const double * __restrict__ alm,
	  unsigned long l, unsigned long il, unsigned long lmax)
	  {
	// critical loop
	  while (l<=lmax)
	    {
	    // Accumulate lam2 scaled by four consecutive alm values into the
	    // four accumulators.
	    d.p1r += d.lam2*alm[2*l];
	    d.p1i += d.lam2*alm[2*l+1];
	    d.p2r += d.lam2*alm[2*l+2];
	    d.p2i += d.lam2*alm[2*l+3];
	    // Next recurrence term, computed from the current lam2 and lam1.
	    // The local is declared as Tvsimple rather than vtype.
	    Tvsimple tmp = d.lam2*(d.csq*coef[2*il] + coef[2*il+1]) + d.lam1;
	    // Shift the recurrence state with whole-struct assignments.
	    d.lam1 = d.lam2;
	    d.lam2 = tmp;
	    ++il; l+=2;
	    }
	  }

	// this version has dead stores at the end of the loop
	// Explicit instantiation of foo for vtype = Tvsimple (deduced from the
	// first parameter).  No call to foo is visible in this file, so this is
	// what gets the loop compiled.
	template void foo<>(s0data_s<Tvsimple> & __restrict__ d,
	const double * __restrict__ coef, const double * __restrict__ alm,
	unsigned long l, unsigned long il, unsigned long lmax);

	// The aggregate copy in the IL should not prevent all store-motion
	// { dg-final { scan-tree-dump-times "Executing store motion" 4 "lim2" } }