blob: 94d6764fc56f5f3c161563257debb96e9916554d [file] [log] [blame]
/* { dg-do compile } */
/* { dg-options "-O2 -march=skylake-avx512 -DDTYPE32" } */
typedef unsigned long long u64;
typedef unsigned int u32;
typedef unsigned short u16;
typedef unsigned char u8;
#ifdef DTYPE32
typedef u32 DTYPE;
#define byteswap byteswapu32
#endif
#ifdef DTYPE16
typedef u16 DTYPE;
#define byteswap byteswapu16
#endif
#ifdef DTYPE8
typedef u16 DTYPE;
#define byteswap byteswapu8
#endif
#ifdef DTYPE64
typedef u16 DTYPE;
#define byteswap byteswapu64
#endif
#define R(x,n) ( (x >> n) | (x << (32 - n)))
#define S0(x) (R(x, 2) ^ R(x,13) ^ R(x,22))
#define S1(x) (R(x, 6) ^ R(x,11) ^ R(x,25))
#define TT(a,b,c,d,e,f,g,h,x,K) \
{ \
tmp1 = h + S1(e) + (g ^ (e & (f ^ g))) + K + x; \
tmp2 = S0(a) + ((a & b) | (c & (a | b))); \
h = tmp1 + tmp2; \
d += tmp1; \
}
static inline u32 byteswapu32(u32 x)
{
x = (x & 0x0000FFFF) << 16 | (x & 0xFFFF0000) >> 16;
x = (x & 0x00FF00FF) << 8 | (x & 0xFF00FF00) >> 8;
return x;
}
static inline u16 byteswapu16(u16 x)
{
x = (x & 0x00FF) << 8 | (x & 0xFF00) >> 8;
return x;
}
static inline u8 byteswapu8(u8 x)
{
return x;
}
static inline u64 byteswapu64(u64 x)
{
x = ((u64)(byteswapu32 (x & 0x00000000FFFFFFFF))) << 32 | byteswapu32((x & 0xFFFFFFFF00000000) >> 32);
return x;
}
void foo (DTYPE in[16], DTYPE out[8], const DTYPE C[16])
{
DTYPE tmp1 = 0, tmp2 = 0, a, b, c, d, e, f, g, h;
DTYPE w0, w1, w2, w3, w4, w5, w6, w7,
w8, w9, w10, w11, w12, w13, w14, w15;
w0 = byteswap(in[0]);
w1 = byteswap(in[1]);
w2 = byteswap(in[2]);
w3 = byteswap(in[3]);
w4 = byteswap(in[4]);
w5 = byteswap(in[5]);
w6 = byteswap(in[6]);
w7 = byteswap(in[7]);
w8 = byteswap(in[8]);
w9 = byteswap(in[9]);
w10 = byteswap(in[10]);
w11 = byteswap(in[11]);
w12 = byteswap(in[12]);
w13 = byteswap(in[13]);
w14 = byteswap(in[14]);
w15 = byteswap(in[15]);
a = out[0];
b = out[1];
c = out[2];
d = out[3];
e = out[4];
f = out[5];
g = out[6];
h = out[7];
TT(a, b, c, d, e, f, g, h, w0, C[0]);
TT(h, a, b, c, d, e, f, g, w1, C[1]);
TT(g, h, a, b, c, d, e, f, w2, C[2]);
TT(f, g, h, a, b, c, d, e, w3, C[3]);
TT(e, f, g, h, a, b, c, d, w4, C[4]);
TT(d, e, f, g, h, a, b, c, w5, C[5]);
TT(c, d, e, f, g, h, a, b, w6, C[6]);
TT(b, c, d, e, f, g, h, a, w7, C[7]);
TT(a, b, c, d, e, f, g, h, w8, C[8]);
TT(h, a, b, c, d, e, f, g, w9, C[9]);
TT(g, h, a, b, c, d, e, f, w10, C[10]);
TT(f, g, h, a, b, c, d, e, w11, C[11]);
TT(e, f, g, h, a, b, c, d, w12, C[12]);
TT(d, e, f, g, h, a, b, c, w13, C[13]);
TT(c, d, e, f, g, h, a, b, w14, C[14]);
TT(b, c, d, e, f, g, h, a, w15, C[15]);
out[0] += a;
out[1] += b;
out[2] += c;
out[3] += d;
out[4] += e;
out[5] += f;
out[6] += g;
out[7] += h;
}
/* { dg-final { scan-assembler "kmovd" } } */
/* { dg-final { scan-assembler-not "knot" } } */
/* { dg-final { scan-assembler-not "kxor" } } */
/* { dg-final { scan-assembler-not "kor" } } */
/* { dg-final { scan-assembler-not "kandn" } } */