// blob: d4393231b4c0f06a15ebbad12e353cd37681a8a2 [file] [log] [blame]
// PR/99728
// { dg-do compile }
// { dg-options "-O2 -fdump-tree-lim2-details -w -Wno-psabi" }
// Vector of four doubles, declared with the GNU vector_size attribute
// (size is 4 * sizeof (double) bytes).
typedef double __m256d __attribute__((vector_size(sizeof (double) * 4)));
// Broadcast helper defined directly in this test: returns a __m256d whose
// four lanes all hold __A.  Declared extern __inline with the gnu_inline,
// always_inline and artificial attributes.
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_pd (double __A)
{
// __extension__ marks the vector compound literal as a deliberate GNU extension.
return __extension__ (__m256d){ __A, __A, __A, __A };
}
// Simple object-oriented wrapper around a single __m256d value.
// Operators taking a double first broadcast the scalar to all four lanes
// with _mm256_set1_pd; the arithmetic itself is done on the vector member.
struct Tvsimple
{
// The wrapped four-lane double vector.
__m256d v;
// Adds other.v into v in place and returns *this.
Tvsimple &operator+=(const Tvsimple &other) {v+=other.v; return *this;}
// Returns a new Tvsimple holding v multiplied by val broadcast to every lane.
Tvsimple operator*(double val) const { Tvsimple res; res.v = v*_mm256_set1_pd(val); return res;}
// Returns a new Tvsimple holding the product v * val.v.
Tvsimple operator*(Tvsimple val) const { Tvsimple res; res.v = v*val.v; return res; }
// Returns a new Tvsimple holding the sum v + val.v.
Tvsimple operator+(Tvsimple val) const { Tvsimple res; res.v = v+val.v; return res; }
// Returns a new Tvsimple holding v plus val broadcast to every lane.
Tvsimple operator+(double val) const { Tvsimple res; res.v = v+_mm256_set1_pd(val); return res;}
};
// Plain aggregate of ten vtype members.  Within this file, foo reads csq,
// lam1 and lam2 and writes lam1, lam2, p1r, p1i, p2r and p2i; sth, corfac
// and scale are not referenced by the code in this file.
template<typename vtype> struct s0data_s
{ vtype sth, corfac, scale, lam1, lam2, csq, p1r, p1i, p2r, p2i; };
// Loop kernel under test.
//
// While l <= lmax, each iteration:
//   - accumulates d.lam2 scaled by alm[2*l] .. alm[2*l+3] into d.p1r, d.p1i,
//     d.p2r and d.p2i respectively;
//   - computes tmp = d.lam2*(d.csq*coef[2*il] + coef[2*il+1]) + d.lam1;
//   - shifts the pair: d.lam1 takes the old d.lam2, d.lam2 takes tmp;
//   - advances il by 1 and l by 2.
//
// Parameters:
//   d     - state updated in place (restrict-qualified reference).
//   coef  - read at indices 2*il and 2*il+1 (restrict-qualified).
//   alm   - read at indices 2*l through 2*l+3 (restrict-qualified).
//   l     - starting loop counter; incremented by 2 per iteration.
//   il    - starting index into coef pairs; incremented by 1 per iteration.
//   lmax  - inclusive upper bound for l.
//
// NOTE(review): the statement order and form are left untouched on purpose;
// the store-motion count checked at the end of this file presumably depends
// on the exact IL generated from these statements.
template<typename vtype> void foo(s0data_s<vtype> & __restrict__ d,
const double * __restrict__ coef, const double * __restrict__ alm,
unsigned long l, unsigned long il, unsigned long lmax)
{
// critical loop
while (l<=lmax)
{
// Four accumulator updates, each a read-modify-write of a member of d.
d.p1r += d.lam2*alm[2*l];
d.p1i += d.lam2*alm[2*l+1];
d.p2r += d.lam2*alm[2*l+2];
d.p2i += d.lam2*alm[2*l+3];
// Next value for lam2, built from the current lam1, lam2 and csq.
Tvsimple tmp = d.lam2*(d.csq*coef[2*il] + coef[2*il+1]) + d.lam1;
// Whole-struct assignments; presumably the aggregate copies that the
// comment at the end of the file refers to.
d.lam1 = d.lam2;
d.lam2 = tmp;
++il; l+=2;
}
}
// Explicit instantiation of foo for s0data_s<Tvsimple>, so the template is
// actually compiled in this translation unit.
// This version has dead stores at the end of the loop.
template void foo<>(s0data_s<Tvsimple> & __restrict__ d,
const double * __restrict__ coef, const double * __restrict__ alm,
unsigned long l, unsigned long il, unsigned long lmax);
// The aggregate copy in the IL should not prevent all store-motion
// { dg-final { scan-tree-dump-times "Executing store motion" 4 "lim2" } }