gcc/testsuite/gcc.target/aarch64/sve/cond_arith_1.c - gcc - Git at Google

 /* { dg-do compile } */
 /* { dg-options "-O2 -ftree-vectorize --param aarch64-sve-compare-costs=0" } */

 #include <stdint.h>

 /* TEST (TYPE, NAME, OP) defines the function test_<TYPE>_<NAME>.
    For each i in [0, n) the function stores y[i] OP z[i] in x[i] when
    pred[i] != 1 and copies y[i] unchanged otherwise.  All four pointer
    parameters are __restrict-qualified, and the function carries the
    noinline and noclone attributes so that GCC neither inlines nor
    clones it.  */
 #define TEST(TYPE, NAME, OP)				\
   void __attribute__ ((noinline, noclone))		\
   test_##TYPE##_##NAME (TYPE *__restrict x,		\
 			TYPE *__restrict y,		\
 			TYPE *__restrict z,		\
 			TYPE *__restrict pred, int n)	\
   {							\
     for (int i = 0; i < n; ++i)				\
       x[i] = pred[i] != 1 ? y[i] OP z[i] : y[i];	\
   }

 /* Instantiate the integer form of the test for TYPE: only the
    conditional division loop, test_<TYPE>_div, is generated.  */
 #define TEST_INT_TYPE(TYPE) \
   TEST (TYPE, div, /)

 /* Instantiate the floating-point form of the test for TYPE: one
    conditional loop each for addition, subtraction, multiplication and
    division (test_<TYPE>_add, _sub, _mul and _div).  */
 #define TEST_FP_TYPE(TYPE) \
   TEST (TYPE, add, +) \
   TEST (TYPE, sub, -) \
   TEST (TYPE, mul, *) \
   TEST (TYPE, div, /)

 /* Expand to every instantiation under test: the division loop for each
    signed and unsigned 8-, 16-, 32- and 64-bit integer type, plus the
    add, sub, mul and div loops for float and double (16 functions in
    total).  */
 #define TEST_ALL \
   TEST_INT_TYPE (int8_t) \
   TEST_INT_TYPE (uint8_t) \
   TEST_INT_TYPE (int16_t) \
   TEST_INT_TYPE (uint16_t) \
   TEST_INT_TYPE (int32_t) \
   TEST_INT_TYPE (uint32_t) \
   TEST_INT_TYPE (int64_t) \
   TEST_INT_TYPE (uint64_t) \
   TEST_FP_TYPE (float) \
   TEST_FP_TYPE (double)

 /* Emit the definitions of all test functions at file scope.  */
 TEST_ALL

 /* { dg-final { scan-assembler-not {\t.div\tz[0-9]+\.b} } } */
 /* { dg-final { scan-assembler-not {\t.div\tz[0-9]+\.h} } } */
 /* { dg-final { scan-assembler-times {\tsdiv\tz[0-9]+\.s, p[0-7]/m,} 7 } } */
 /* At present we don't vectorize the uint8_t or uint16_t loops because the
    division is done directly in the narrow type, rather than being widened
    to int first.  */
 /* { dg-final { scan-assembler-times {\tudiv\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
 /* { dg-final { scan-assembler-times {\tsdiv\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
 /* { dg-final { scan-assembler-times {\tudiv\tz[0-9]+\.d, p[0-7]/m,} 1 } } */

 /* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
 /* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.d, p[0-7]/m,} 1 } } */

 /* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
 /* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.d, p[0-7]/m,} 1 } } */

 /* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
 /* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.d, p[0-7]/m,} 1 } } */

 /* { dg-final { scan-assembler-times {\tfdiv\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
 /* { dg-final { scan-assembler-times {\tfdiv\tz[0-9]+\.d, p[0-7]/m,} 1 } } */

 /* We fail to optimize away the SEL for the int8_t and int16_t loops,
    because the 32-bit result is converted before selection.  */
 /* { dg-final { scan-assembler-times {\tsel\t} 2 } } */
	/* { dg-do compile } */
	/* { dg-options "-O2 -ftree-vectorize --param aarch64-sve-compare-costs=0" } */

	#include <stdint.h>

	/* TEST (TYPE, NAME, OP) defines the function test_<TYPE>_<NAME>.
	For each i in [0, n) the function stores y[i] OP z[i] in x[i] when
	pred[i] != 1 and copies y[i] unchanged otherwise. All four pointer
	parameters are __restrict-qualified, and the function carries the
	noinline and noclone attributes so that GCC neither inlines nor
	clones it. */
	#define TEST(TYPE, NAME, OP) \
	void __attribute__ ((noinline, noclone)) \
	test_##TYPE##_##NAME (TYPE *__restrict x, \
	TYPE *__restrict y, \
	TYPE *__restrict z, \
	TYPE *__restrict pred, int n) \
	{ \
	for (int i = 0; i < n; ++i) \
	x[i] = pred[i] != 1 ? y[i] OP z[i] : y[i]; \
	}

	/* Instantiate the integer form of the test for TYPE: only the
	conditional division loop, test_<TYPE>_div, is generated. */
	#define TEST_INT_TYPE(TYPE) \
	TEST (TYPE, div, /)

	/* Instantiate the floating-point form of the test for TYPE: one
	conditional loop each for addition, subtraction, multiplication and
	division (test_<TYPE>_add, _sub, _mul and _div). */
	#define TEST_FP_TYPE(TYPE) \
	TEST (TYPE, add, +) \
	TEST (TYPE, sub, -) \
	TEST (TYPE, mul, *) \
	TEST (TYPE, div, /)

	/* Expand to every instantiation under test: the division loop for each
	signed and unsigned 8-, 16-, 32- and 64-bit integer type, plus the
	add, sub, mul and div loops for float and double (16 functions in
	total). */
	#define TEST_ALL \
	TEST_INT_TYPE (int8_t) \
	TEST_INT_TYPE (uint8_t) \
	TEST_INT_TYPE (int16_t) \
	TEST_INT_TYPE (uint16_t) \
	TEST_INT_TYPE (int32_t) \
	TEST_INT_TYPE (uint32_t) \
	TEST_INT_TYPE (int64_t) \
	TEST_INT_TYPE (uint64_t) \
	TEST_FP_TYPE (float) \
	TEST_FP_TYPE (double)

	/* Emit the definitions of all test functions at file scope. */
	TEST_ALL

	/* { dg-final { scan-assembler-not {\t.div\tz[0-9]+\.b} } } */
	/* { dg-final { scan-assembler-not {\t.div\tz[0-9]+\.h} } } */
	/* { dg-final { scan-assembler-times {\tsdiv\tz[0-9]+\.s, p[0-7]/m,} 7 } } */
	/* At present we don't vectorize the uint8_t or uint16_t loops because the
	division is done directly in the narrow type, rather than being widened
	to int first. */
	/* { dg-final { scan-assembler-times {\tudiv\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
	/* { dg-final { scan-assembler-times {\tsdiv\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
	/* { dg-final { scan-assembler-times {\tudiv\tz[0-9]+\.d, p[0-7]/m,} 1 } } */

	/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
	/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.d, p[0-7]/m,} 1 } } */

	/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
	/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.d, p[0-7]/m,} 1 } } */

	/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
	/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.d, p[0-7]/m,} 1 } } */

	/* { dg-final { scan-assembler-times {\tfdiv\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
	/* { dg-final { scan-assembler-times {\tfdiv\tz[0-9]+\.d, p[0-7]/m,} 1 } } */

	/* We fail to optimize away the SEL for the int8_t and int16_t loops,
	because the 32-bit result is converted before selection. */
	/* { dg-final { scan-assembler-times {\tsel\t} 2 } } */