gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c - gcc - Git at Google

 /* { dg-do compile  { target { lp64 } } } */
 /* { dg-additional-options "-O3 -march=armv8.2-a+crypto -fno-schedule-insns -fno-schedule-insns2 -mcmodel=small" } */
 /* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */

 #include <arm_neon.h>

 /*
 **test1:
 **	adrp	x[0-9]+, .LC[0-9]+
 **	ldr	q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
 **	add	v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
 **	str	q[0-9]+, \[x[0-9]+\]
 **	fmov	x[0-9]+, d[0-9]+
 **	orr	x[0-9]+, x[0-9]+, x[0-9]+
 **	ret
 */

 /* Use one 64-bit constant both as a scalar (ORed into A) and as both
    elements of a vector (added to B and stored through RT).  The expected
    body above contains a single adrp/ldr of a .LC constant, with the
    scalar operand obtained from the vector register by fmov.
    Returns A | constant.  */
 uint64_t
 test1 (uint64_t a, uint64x2_t b, uint64x2_t* rt)
 {
   uint64_t arr[2] = { 0x0942430810234076UL, 0x0942430810234076UL};
   /* Scalar use of the constant.  */
   uint64_t res = a | arr[0];
   /* Vector use of the same constant.  */
   uint64x2_t val = vld1q_u64 (arr);
   *rt = vaddq_u64 (val, b);
   return res;
 }

 /*
 **test2:
 **	adrp	x[0-9]+, .LC[0-9]+
 **	ldr	q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
 **	add	v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
 **	str	q[0-9]+, \[x[0-9]+\]
 **	fmov	x[0-9]+, d[0-9]+
 **	orr	x[0-9]+, x[0-9]+, x[0-9]+
 **	ret
 */

 /* Like test1, but the constant vector is created with vdupq_n_u64 and the
    scalar operand is read back from lane 0 of that vector.  Stores
    VAL + B through RT and returns A | constant.  */
 uint64_t
 test2 (uint64_t a, uint64x2_t b, uint64x2_t* rt)
 {
   uint64x2_t val = vdupq_n_u64 (0x0424303242234076UL);
   /* Scalar copy of the duplicated constant.  */
   uint64_t arr = vgetq_lane_u64 (val, 0);
   uint64_t res = a | arr;
   *rt = vaddq_u64 (val, b);
   return res;
 }

 /*
 **test3:
 **	adrp	x[0-9]+, .LC[0-9]+
 **	ldr	q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
 **	add	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
 **	str	q[0-9]+, \[x1\]
 **	fmov	w[0-9]+, s[0-9]+
 **	orr	w[0-9]+, w[0-9]+, w[0-9]+
 **	ret
 */

 /* 32-bit variant of test1: four equal 32-bit elements are used both as a
    scalar (ORed into A) and as a vector (added to B and stored through
    RT).  The expected body above uses a .4s add and w-register fmov/orr.
    Returns A | constant.  */
 uint32_t
 test3 (uint32_t a, uint32x4_t b, uint32x4_t* rt)
 {
   uint32_t arr[4] = { 0x094243, 0x094243, 0x094243, 0x094243 };
   /* Scalar use of the constant.  */
   uint32_t res = a | arr[0];
   /* Vector use of the same constant.  */
   uint32x4_t val = vld1q_u32 (arr);
   *rt = vaddq_u32 (val, b);
   return res;
 }

 /*
 **test4:
 **	ushr	v[0-9]+.16b, v[0-9]+.16b, 7
 **	mov	x[0-9]+, 16512
 **	movk	x[0-9]+, 0x1020, lsl 16
 **	movk	x[0-9]+, 0x408, lsl 32
 **	movk	x[0-9]+, 0x102, lsl 48
 **	fmov	d[0-9]+, x[0-9]+
 **	pmull	v[0-9]+.1q, v[0-9]+.1d, v[0-9]+.1d
 **	dup	v[0-9]+.2d, v[0-9]+.d\[0\]
 **	pmull2	v[0-9]+.1q, v[0-9]+.2d, v[0-9]+.2d
 **	trn2	v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
 **	umov	w[0-9]+, v[0-9]+.h\[3\]
 **	ret
 */

 /* Shift every byte of INPUT right by 7, multiply lane 0 and the high half
    of the result by the constant 0x0102040810204080 with vmull_p64 and
    vmull_high_p64, combine the two products with vtrn2_u8 and return
    16-bit lane 3 of that.  The expected body above builds the constant
    with one mov/movk sequence.  */
 uint64_t
 test4 (uint8x16_t input)
 {
     uint8x16_t bool_input = vshrq_n_u8(input, 7);
     poly64x2_t mask = vdupq_n_p64(0x0102040810204080UL);
     /* Product of lane 0 of the shifted input and lane 0 of the mask.  */
     poly64_t prodL = vmull_p64((poly64_t)vgetq_lane_p64((poly64x2_t)bool_input, 0),
                                vgetq_lane_p64(mask, 0));
     /* Product taken from the high halves of the same two vectors.  */
     poly64_t prodH = vmull_high_p64((poly64x2_t)bool_input, mask);
     uint8x8_t res = vtrn2_u8((uint8x8_t)prodL, (uint8x8_t)prodH);
     return vget_lane_u16((uint16x4_t)res, 3);
 }
	/* { dg-do compile { target { lp64 } } } */
	/* { dg-additional-options "-O3 -march=armv8.2-a+crypto -fno-schedule-insns -fno-schedule-insns2 -mcmodel=small" } */
	/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */

	#include <arm_neon.h>

	/*
	**test1:
	**	adrp	x[0-9]+, .LC[0-9]+
	**	ldr	q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
	**	add	v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
	**	str	q[0-9]+, \[x[0-9]+\]
	**	fmov	x[0-9]+, d[0-9]+
	**	orr	x[0-9]+, x[0-9]+, x[0-9]+
	**	ret
	*/

	/* Use one 64-bit constant both as a scalar (ORed into A) and as both
	   elements of a vector (added to B and stored through RT).
	   Returns A | constant.
	   NOTE(review): a function named test1 is also defined earlier in this
	   file; presumably this copy is a duplicated paste -- confirm and keep
	   only one.  */
	uint64_t
	test1 (uint64_t a, uint64x2_t b, uint64x2_t* rt)
	{
	  uint64_t arr[2] = { 0x0942430810234076UL, 0x0942430810234076UL};
	  /* Scalar use of the constant.  */
	  uint64_t res = a | arr[0];
	  /* Vector use of the same constant.  */
	  uint64x2_t val = vld1q_u64 (arr);
	  *rt = vaddq_u64 (val, b);
	  return res;
	}

	/*
	**test2:
	**	adrp	x[0-9]+, .LC[0-9]+
	**	ldr	q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
	**	add	v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
	**	str	q[0-9]+, \[x[0-9]+\]
	**	fmov	x[0-9]+, d[0-9]+
	**	orr	x[0-9]+, x[0-9]+, x[0-9]+
	**	ret
	*/

	/* The constant vector is created with vdupq_n_u64 and the scalar operand
	   is read back from lane 0 of that vector.  Stores VAL + B through RT
	   and returns A | constant.
	   NOTE(review): a function named test2 is also defined earlier in this
	   file; presumably this copy is a duplicated paste -- confirm and keep
	   only one.  */
	uint64_t
	test2 (uint64_t a, uint64x2_t b, uint64x2_t* rt)
	{
	  uint64x2_t val = vdupq_n_u64 (0x0424303242234076UL);
	  /* Scalar copy of the duplicated constant.  */
	  uint64_t arr = vgetq_lane_u64 (val, 0);
	  uint64_t res = a | arr;
	  *rt = vaddq_u64 (val, b);
	  return res;
	}

	/*
	**test3:
	**	adrp	x[0-9]+, .LC[0-9]+
	**	ldr	q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
	**	add	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
	**	str	q[0-9]+, \[x1\]
	**	fmov	w[0-9]+, s[0-9]+
	**	orr	w[0-9]+, w[0-9]+, w[0-9]+
	**	ret
	*/

	/* Four equal 32-bit elements are used both as a scalar (ORed into A) and
	   as a vector (added to B and stored through RT).
	   Returns A | constant.
	   NOTE(review): a function named test3 is also defined earlier in this
	   file; presumably this copy is a duplicated paste -- confirm and keep
	   only one.  */
	uint32_t
	test3 (uint32_t a, uint32x4_t b, uint32x4_t* rt)
	{
	  uint32_t arr[4] = { 0x094243, 0x094243, 0x094243, 0x094243 };
	  /* Scalar use of the constant.  */
	  uint32_t res = a | arr[0];
	  /* Vector use of the same constant.  */
	  uint32x4_t val = vld1q_u32 (arr);
	  *rt = vaddq_u32 (val, b);
	  return res;
	}

	/*
	**test4:
	**	ushr	v[0-9]+.16b, v[0-9]+.16b, 7
	**	mov	x[0-9]+, 16512
	**	movk	x[0-9]+, 0x1020, lsl 16
	**	movk	x[0-9]+, 0x408, lsl 32
	**	movk	x[0-9]+, 0x102, lsl 48
	**	fmov	d[0-9]+, x[0-9]+
	**	pmull	v[0-9]+.1q, v[0-9]+.1d, v[0-9]+.1d
	**	dup	v[0-9]+.2d, v[0-9]+.d\[0\]
	**	pmull2	v[0-9]+.1q, v[0-9]+.2d, v[0-9]+.2d
	**	trn2	v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
	**	umov	w[0-9]+, v[0-9]+.h\[3\]
	**	ret
	*/

	/* Shift every byte of INPUT right by 7, multiply lane 0 and the high half
	   of the result by the constant 0x0102040810204080 with vmull_p64 and
	   vmull_high_p64, combine the two products with vtrn2_u8 and return
	   16-bit lane 3 of that.
	   NOTE(review): a function named test4 is also defined earlier in this
	   file; presumably this copy is a duplicated paste -- confirm and keep
	   only one.  */
	uint64_t
	test4 (uint8x16_t input)
	{
	uint8x16_t bool_input = vshrq_n_u8(input, 7);
	poly64x2_t mask = vdupq_n_p64(0x0102040810204080UL);
	/* Product of lane 0 of the shifted input and lane 0 of the mask.  */
	poly64_t prodL = vmull_p64((poly64_t)vgetq_lane_p64((poly64x2_t)bool_input, 0),
	vgetq_lane_p64(mask, 0));
	/* Product taken from the high halves of the same two vectors.  */
	poly64_t prodH = vmull_high_p64((poly64x2_t)bool_input, mask);
	uint8x8_t res = vtrn2_u8((uint8x8_t)prodL, (uint8x8_t)prodH);
	return vget_lane_u16((uint16x4_t)res, 3);
	}