blob: d831e9c114251df41364e75bfb58a8d135958700 [file] [log] [blame]
/* { dg-do compile } */
/* { dg-options "-O2 -ftree-vectorize --param aarch64-sve-compare-costs=0" } */
#include <stdint.h>
/* Define a function test_<TYPE>_<NAME> that is never inlined or cloned.
   For each index i in [0, n) it stores y[i] OP z[i] to x[i] when
   pred[i] != 1, and stores y[i] unchanged otherwise.

   TYPE is the element type of all four arrays, including PRED.
   NAME is only used to form the function name.
   OP is the binary operator applied to the two inputs.

   All pointer parameters are __restrict-qualified.  The dg-final
   directives at the end of this file scan the generated assembly for the
   predicated (/m) forms of the corresponding instructions, so the exact
   shape of the loop body matters; do not rewrite it.  */
#define TEST(TYPE, NAME, OP) \
void __attribute__ ((noinline, noclone)) \
test_##TYPE##_##NAME (TYPE *__restrict x, \
TYPE *__restrict y, \
TYPE *__restrict z, \
TYPE *__restrict pred, int n) \
{ \
for (int i = 0; i < n; ++i) \
x[i] = pred[i] != 1 ? y[i] OP z[i] : y[i]; \
}
/* Instantiate the tests for an integer element type TYPE.  Only division
   is instantiated here, giving a single function test_<TYPE>_div.  */
#define TEST_INT_TYPE(TYPE) \
TEST (TYPE, div, /)
/* Instantiate the tests for a floating-point element type TYPE: one
   function each for addition, subtraction, multiplication and division
   (test_<TYPE>_add, test_<TYPE>_sub, test_<TYPE>_mul, test_<TYPE>_div).  */
#define TEST_FP_TYPE(TYPE) \
TEST (TYPE, add, +) \
TEST (TYPE, sub, -) \
TEST (TYPE, mul, *) \
TEST (TYPE, div, /)
/* The full set of instantiations: the division test for each signed and
   unsigned 8-, 16-, 32- and 64-bit integer type, and all four arithmetic
   tests for float and double.  That is 8 + 2 * 4 = 16 functions in total.
   The expected instruction counts in the dg-final directives below depend
   on this list, so keep the two in sync.  */
#define TEST_ALL \
TEST_INT_TYPE (int8_t) \
TEST_INT_TYPE (uint8_t) \
TEST_INT_TYPE (int16_t) \
TEST_INT_TYPE (uint16_t) \
TEST_INT_TYPE (int32_t) \
TEST_INT_TYPE (uint32_t) \
TEST_INT_TYPE (int64_t) \
TEST_INT_TYPE (uint64_t) \
TEST_FP_TYPE (float) \
TEST_FP_TYPE (double)
/* Expand the list above into the actual function definitions.  */
TEST_ALL
/* { dg-final { scan-assembler-not {\t.div\tz[0-9]+\.b} } } */
/* { dg-final { scan-assembler-not {\t.div\tz[0-9]+\.h} } } */
/* { dg-final { scan-assembler-times {\tsdiv\tz[0-9]+\.s, p[0-7]/m,} 7 } } */
/* At present we don't vectorize the uint8_t or uint16_t loops because the
division is done directly in the narrow type, rather than being widened
to int first. */
/* { dg-final { scan-assembler-times {\tudiv\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tsdiv\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tudiv\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tfdiv\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tfdiv\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
/* We fail to optimize away the SEL for the int8_t and int16_t loops,
because the 32-bit result is converted before selection. */
/* { dg-final { scan-assembler-times {\tsel\t} 2 } } */