/* gcc/testsuite/gcc.target/arm/simd/mve-vld2.c - gcc - Git at Google  */

 /* { dg-do assemble } */
 /* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
 /* { dg-add-options arm_v8_1m_mve_fp } */
 /* { dg-additional-options "-O3" } */

 #include <stdint.h>

 /* Coefficients applied to each (a, b) input pair by the integer and float
    test loops:
      first output  = M00 * a + M01 * b
      second output = M10 * a + M11 * b  */
 #define M00 100
 #define M10 216
 #define M01 1322
 #define M11 13

 /* Every test loop runs N / BITS iterations, BITS being the third argument
    of the FUNC* macro that generates it.  */
 #define N 128


 /* Integer tests.

    FUNC (SIGN, TYPE, BITS) defines foo_<SIGN><BITS>x.  TYPE and BITS are
    pasted together into the <stdint.h> element type TYPE##BITS##_t; SIGN is
    only used to build the function name.

    The generated loop runs N / BITS iterations.  Each iteration reads two
    consecutive elements (a, b) from pInput and writes two consecutive
    elements to pOutput:
      M00 * a + M01 * b
      M10 * a + M11 * b
    Both pointers are __restrict__-qualified.  */
 #define FUNC(SIGN, TYPE, BITS)						\
   void foo_##SIGN##BITS##x (TYPE##BITS##_t *__restrict__ pInput,	\
 			    TYPE##BITS##_t *__restrict__ pOutput)	\
   {									\
     unsigned int i;							\
     TYPE##BITS##_t  a, b;						\
     									\
     for (i = 0; i < N / BITS; i++)					\
       {									\
 	a = *pInput++;							\
 	b = *pInput++;							\
 									\
 	*pOutput++ = M00 * a + M01 * b;					\
 	*pOutput++ = M10 * a + M11 * b;					\
       }									\
   }

 /* One function per intN_t / uintN_t element type, N = 8, 16, 32.  */
 FUNC(s, int, 8)
 FUNC(u, uint, 8)
 FUNC(s, int, 16)
 FUNC(u, uint, 16)
 FUNC(s, int, 32)
 FUNC(u, uint, 32)

 /* float test, keep the macro because it's similar to the above, but does not
    need the ##BITS##_t: TYPE is used directly as the element type, and BITS
    only appears in the function name and in the N / BITS trip count.

    Each iteration reads two consecutive elements (a, b) from pInput and
    writes M00 * a + M01 * b followed by M10 * a + M11 * b to pOutput.  */
 #define FUNC_FLOAT(SIGN, TYPE, BITS)					\
   void foo_##SIGN##BITS##x (TYPE *__restrict__ pInput,			\
 			    TYPE *__restrict__ pOutput)			\
   {									\
     unsigned int i;							\
     TYPE a, b;								\
     									\
     for (i = 0; i < N / BITS; i++)					\
       {									\
 	a = *pInput++;							\
 	b = *pInput++;							\
 									\
 	*pOutput++ = M00 * a + M01 * b;					\
 	*pOutput++ = M10 * a + M11 * b;					\
       }									\
   }

 /* Defines foo_f32x operating on float.  */
 FUNC_FLOAT(f, float, 32)

 /* __fp16 test, needs explicit casts to avoid conversions to floating-point and
    failure to vectorize.  The coefficients are held in __fp16 variables
    instead of reusing the integer M00..M11 macros.  */
 __fp16 M00_fp16 = 100.0f16;
 __fp16 M10_fp16 = 216.0f16;
 __fp16 M01_fp16 = 1322.0f16;
 __fp16 M11_fp16 = 13.0f16;

 /* Same loop shape as the macros above: N / BITS iterations, two elements
    read and two elements written per iteration.  Each product is cast back
    to __fp16 before the two products are added.  */
 #define FUNC_FLOAT_FP16(SIGN, TYPE, BITS)				\
   void foo_##SIGN##BITS##x (TYPE *__restrict__ pInput,			\
 			    TYPE *__restrict__ pOutput)			\
   {									\
     unsigned int i;							\
     TYPE a, b;								\
     									\
     for (i = 0; i < N / BITS; i++)					\
       {									\
 	a = *pInput++;							\
 	b = *pInput++;							\
 									\
 	*pOutput++ = (__fp16)(M00_fp16 * a) + (__fp16)(M01_fp16 * b);	\
 	*pOutput++ = (__fp16)(M10_fp16 * a) + (__fp16)(M11_fp16 * b);	\
       }									\
   }

 /* Defines foo_f16x operating on __fp16.  */
 FUNC_FLOAT_FP16(f, __fp16, 16)

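 /* "Pair" below means one vld20 plus one vld21 (and likewise one vst20 plus
    one vst21), so each expected count is twice the number of pairs.  */
 /* vld2X.8 is used for signed and unsigned chars: 2 pairs, 4 matches.  */
 /* vld2X.16 is used for signed and unsigned shorts and __fp16: 3 pairs, 6 matches.  */
 /* vld2X.32 is used for signed and unsigned ints and float: 3 pairs, 6 matches.  */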
 /* { dg-final { scan-assembler-times {vld2[01].8\t.q[0-9]+, q[0-9]+., } 4 } } */
 /* { dg-final { scan-assembler-times {vld2[01].16\t.q[0-9]+, q[0-9]+., } 6 } } */
 /* { dg-final { scan-assembler-times {vld2[01].32\t.q[0-9]+, q[0-9]+., } 6 } } */
 /* { dg-final { scan-assembler-times {vst2[01].8\t.q[0-9]+, q[0-9]+., } 4 } } */
 /* { dg-final { scan-assembler-times {vst2[01].16\t.q[0-9]+, q[0-9]+., } 6 } } */
 /* { dg-final { scan-assembler-times {vst2[01].32\t.q[0-9]+, q[0-9]+., } 6 } } */
	/* { dg-do assemble } */
	/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
	/* { dg-add-options arm_v8_1m_mve_fp } */
	/* { dg-additional-options "-O3" } */

	#include <stdint.h>

	/* Coefficients applied to each (a, b) input pair by the integer and float
	   test loops: the first output uses M00 and M01, the second M10 and M11.  */
	#define M00 100
	#define M10 216
	#define M01 1322
	#define M11 13

	/* Every test loop runs N / BITS iterations, BITS being the third argument
	   of the FUNC* macro that generates it.  */
	#define N 128


	/* Integer tests.

	   FUNC (SIGN, TYPE, BITS) defines foo_<SIGN><BITS>x.  TYPE and BITS are
	   pasted together into the <stdint.h> element type TYPE##BITS##_t; SIGN is
	   only used to build the function name.

	   The generated loop runs N / BITS iterations.  Each iteration reads two
	   consecutive elements (a, b) from pInput and writes two consecutive
	   elements to pOutput:
	     M00 * a + M01 * b
	     M10 * a + M11 * b
	   The stores must dereference pOutput and multiply by the coefficients
	   explicitly; without the '*' operators the expansion does not parse.  */
	#define FUNC(SIGN, TYPE, BITS) \
	  void foo_##SIGN##BITS##x (TYPE##BITS##_t *__restrict__ pInput, \
				    TYPE##BITS##_t *__restrict__ pOutput) \
	  { \
	    unsigned int i; \
	    TYPE##BITS##_t a, b; \
	    \
	    for (i = 0; i < N / BITS; i++) \
	      { \
		a = *pInput++; \
		b = *pInput++; \
		\
		*pOutput++ = M00 * a + M01 * b; \
		*pOutput++ = M10 * a + M11 * b; \
	      } \
	  }

	/* One function per intN_t / uintN_t element type, N = 8, 16, 32.  */
	FUNC(s, int, 8)
	FUNC(u, uint, 8)
	FUNC(s, int, 16)
	FUNC(u, uint, 16)
	FUNC(s, int, 32)
	FUNC(u, uint, 32)

	/* float test, keep the macro because it's similar to the above, but does not
	   need the ##BITS##_t: TYPE is used directly as the element type, and BITS
	   only appears in the function name and in the N / BITS trip count.

	   Each iteration reads two consecutive elements (a, b) from pInput and
	   writes M00 * a + M01 * b followed by M10 * a + M11 * b to pOutput.  The
	   stores must dereference pOutput and multiply by the coefficients
	   explicitly; without the '*' operators the expansion does not parse.  */
	#define FUNC_FLOAT(SIGN, TYPE, BITS) \
	  void foo_##SIGN##BITS##x (TYPE *__restrict__ pInput, \
				    TYPE *__restrict__ pOutput) \
	  { \
	    unsigned int i; \
	    TYPE a, b; \
	    \
	    for (i = 0; i < N / BITS; i++) \
	      { \
		a = *pInput++; \
		b = *pInput++; \
		\
		*pOutput++ = M00 * a + M01 * b; \
		*pOutput++ = M10 * a + M11 * b; \
	      } \
	  }

	/* Defines foo_f32x operating on float.  */
	FUNC_FLOAT(f, float, 32)

	/* __fp16 test, needs explicit casts to avoid conversions to floating-point and
	   failure to vectorize.  The coefficients are held in __fp16 variables
	   instead of reusing the integer M00..M11 macros.  */
	__fp16 M00_fp16 = 100.0f16;
	__fp16 M10_fp16 = 216.0f16;
	__fp16 M01_fp16 = 1322.0f16;
	__fp16 M11_fp16 = 13.0f16;

	/* Defines foo_<SIGN><BITS>x operating on TYPE elements.  The loop runs
	   N / BITS iterations; each iteration reads two consecutive elements
	   (a, b) from pInput and writes two consecutive elements to pOutput.
	   Each product with an M*_fp16 coefficient is cast back to __fp16 before
	   the two products are added.  The stores must dereference pOutput and
	   multiply explicitly; without the '*' operators the expansion does not
	   parse.  */
	#define FUNC_FLOAT_FP16(SIGN, TYPE, BITS) \
	  void foo_##SIGN##BITS##x (TYPE *__restrict__ pInput, \
				    TYPE *__restrict__ pOutput) \
	  { \
	    unsigned int i; \
	    TYPE a, b; \
	    \
	    for (i = 0; i < N / BITS; i++) \
	      { \
		a = *pInput++; \
		b = *pInput++; \
		\
		*pOutput++ = (__fp16)(M00_fp16 * a) + (__fp16)(M01_fp16 * b); \
		*pOutput++ = (__fp16)(M10_fp16 * a) + (__fp16)(M11_fp16 * b); \
	      } \
	  }

	/* Defines foo_f16x operating on __fp16.  */
	FUNC_FLOAT_FP16(f, __fp16, 16)

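	/* "Pair" below means one vld20 plus one vld21 (and likewise one vst20 plus
	   one vst21), so each expected count is twice the number of pairs.  */
	/* vld2X.8 is used for signed and unsigned chars: 2 pairs, 4 matches. */
	/* vld2X.16 is used for signed and unsigned shorts and __fp16: 3 pairs, 6 matches. */
	/* vld2X.32 is used for signed and unsigned ints and float: 3 pairs, 6 matches. */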
	/* { dg-final { scan-assembler-times {vld2[01].8\t.q[0-9]+, q[0-9]+., } 4 } } */
	/* { dg-final { scan-assembler-times {vld2[01].16\t.q[0-9]+, q[0-9]+., } 6 } } */
	/* { dg-final { scan-assembler-times {vld2[01].32\t.q[0-9]+, q[0-9]+., } 6 } } */
	/* { dg-final { scan-assembler-times {vst2[01].8\t.q[0-9]+, q[0-9]+., } 4 } } */
	/* { dg-final { scan-assembler-times {vst2[01].16\t.q[0-9]+, q[0-9]+., } 6 } } */
	/* { dg-final { scan-assembler-times {vst2[01].32\t.q[0-9]+, q[0-9]+., } 6 } } */