/* gcc/testsuite/gcc.target/powerpc/pr37191.c - gcc - Git at Google */

 /* { dg-do compile } */
 /* { dg-skip-if "" { powerpc*-*-darwin* } { "*" } { "" } } */
 /* { dg-options "-O3 -mpower8-vector" } */
 /* { dg-require-effective-target p8vector_hw } */

 #define NO_WARN_X86_INTRINSICS 1

 #include <mmintrin.h>
 #include <stddef.h>
 #include <stdint.h>

 #if 0
 extern const uint64_t ff_bone;
 #endif

 /* Transpose one 4x4 tile of bytes using MMX intrinsics.

    SRC and DST each address four rows of four bytes; consecutive rows are
    SRC_STRIDE (resp. DST_STRIDE) bytes apart.  Every row is read and
    written as a single `unsigned' through a pointer cast.  All four source
    rows are loaded before the first destination row is stored.  */
 static inline void transpose4x4(uint8_t *dst, uint8_t *src, ptrdiff_t dst_stride, ptrdiff_t src_stride) {
   /* Fetch the four input rows as 32-bit words.  */
   unsigned in0 = *(unsigned*)(src + (0 * src_stride));
   unsigned in1 = *(unsigned*)(src + (1 * src_stride));
   unsigned in2 = *(unsigned*)(src + (2 * src_stride));
   unsigned in3 = *(unsigned*)(src + (3 * src_stride));

   /* Byte-interleave rows 0/1 and rows 2/3.  */
   __m64 pair01 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(in0), _mm_cvtsi32_si64(in1));
   __m64 pair23 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(in2), _mm_cvtsi32_si64(in3));

   /* Interleave the 16-bit pairs; each 32-bit half of the results is one
      output row (rows 0 and 1 in OUT01, rows 2 and 3 in OUT23).  */
   __m64 out01 = _mm_unpacklo_pi16(pair01, pair23);
   __m64 out23 = _mm_unpackhi_pi16(pair01, pair23);

   /* Rows 1 and 3 sit in the upper halves; bring them down to the low
      32 bits so they can be extracted.  */
   __m64 out1 = _mm_unpackhi_pi32(out01, out01);
   __m64 out3 = _mm_unpackhi_pi32(out23, out23);

   /* Store the four output rows, top to bottom.  */
   unsigned *d0 = (unsigned*)(dst + (0 * dst_stride));
   unsigned *d1 = (unsigned*)(dst + (1 * dst_stride));
   unsigned *d2 = (unsigned*)(dst + (2 * dst_stride));
   unsigned *d3 = (unsigned*)(dst + (3 * dst_stride));
   *d0 = _mm_cvtsi64_si32(out01);
   *d1 = _mm_cvtsi64_si32(out1);
   *d2 = _mm_cvtsi64_si32(out23);
   *d3 = _mm_cvtsi64_si32(out3);
 }

 #if 0
 /* Disabled helper: the enclosing #if 0 keeps it out of the compiled test.
    Its whole body is one extended asm statement with an empty template,
    no output operands and no clobber list; it only lists six inputs
    derived from PIX, STRIDE, ALPHA1, BETA1 and the extern ff_bone.  */
 static inline void h264_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha1, int beta1)
 {
     asm volatile(
         ""
         /* Outputs are empty.  Register inputs: pix-2*stride, pix, and
            stride converted to long.  */
         :: "r"(pix-2*stride), "r"(pix), "r"((long)stride),
            /* Memory inputs.  ff_bone is declared earlier in the file,
               also under #if 0.  */
            "m"(alpha1), "m"(beta1), "m"(ff_bone)
     );
 }
 #endif

 /* Move the region of 8 rows by 4 bytes that starts at PIX - 2 (row pitch
    STRIDE) through an 8-byte-aligned scratch buffer and back again, one
    4x4 tile at a time via transpose4x4.  The filter step that would sit
    between the two pairs of transposes is commented out, so ALPHA and BETA
    are not used.  */
 void h264_h_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta)
 {
   uint8_t trans[8*4] __attribute__ ((aligned (8)));
   uint8_t *upper = pix-2;            /* rows 0..3 of the region */
   uint8_t *lower = pix-2+4*stride;   /* rows 4..7 of the region */

   /* Gather both tiles into the scratch buffer, which has a pitch of 8
      bytes; the lower tile lands four bytes to the right of the upper.  */
   transpose4x4(trans, upper, 8, stride);
   transpose4x4(trans+4, lower, 8, stride);

   /* Disabled in this testcase:
      h264_loop_filter_chroma_intra_mmx2(trans+2*8, 8, alpha-1, beta-1);  */

   /* Scatter the tiles back to where they came from.  */
   transpose4x4(upper, trans, stride, 8);
   transpose4x4(lower, trans+4, stride, 8);
 }
	/* { dg-do compile } */
	/* { dg-skip-if "" { powerpc*-*-darwin* } { "*" } { "" } } */
	/* { dg-options "-O3 -mpower8-vector" } */
	/* { dg-require-effective-target p8vector_hw } */

	#define NO_WARN_X86_INTRINSICS 1

	#include <mmintrin.h>
	#include <stddef.h>
	#include <stdint.h>

	#if 0
	extern const uint64_t ff_bone;
	#endif

	/* Transpose a 4x4 block of bytes with MMX intrinsics.

	   SRC and DST each point at four rows of four bytes, SRC_STRIDE and
	   DST_STRIDE bytes apart respectively.  Each row is loaded and stored
	   as one `unsigned' through a pointer cast.

	   The pointer declarators and dereferences had been stripped from this
	   copy (DST and SRC were plain uint8_t values and the stores assigned
	   to cast expressions), which does not compile; they are restored here.

	   NOTE(review): an identical transpose4x4 is also defined earlier in
	   this file; one of the two copies presumably needs to go.  */
	static inline void transpose4x4(uint8_t *dst, uint8_t *src, ptrdiff_t dst_stride, ptrdiff_t src_stride) {
	  /* Load the four source rows into the low halves of MMX values.  */
	  __m64 row0 = _mm_cvtsi32_si64(*(unsigned*)(src + (0 * src_stride)));
	  __m64 row1 = _mm_cvtsi32_si64(*(unsigned*)(src + (1 * src_stride)));
	  __m64 row2 = _mm_cvtsi32_si64(*(unsigned*)(src + (2 * src_stride)));
	  __m64 row3 = _mm_cvtsi32_si64(*(unsigned*)(src + (3 * src_stride)));
	  /* Interleave bytes of rows 0/1 and 2/3, then 16-bit pairs, so each
	     32-bit half of row01/row23 holds one byte from every source row.  */
	  __m64 tmp0 = _mm_unpacklo_pi8(row0, row1);
	  __m64 tmp1 = _mm_unpacklo_pi8(row2, row3);
	  __m64 row01 = _mm_unpacklo_pi16(tmp0, tmp1);
	  __m64 row23 = _mm_unpackhi_pi16(tmp0, tmp1);
	  /* Low halves are output rows 0 and 2; the unpackhi_pi32 calls move
	     the high halves (rows 1 and 3) down for extraction.  */
	  *((unsigned*)(dst + (0 * dst_stride))) = _mm_cvtsi64_si32(row01);
	  *((unsigned*)(dst + (1 * dst_stride))) = _mm_cvtsi64_si32(_mm_unpackhi_pi32(row01, row01));
	  *((unsigned*)(dst + (2 * dst_stride))) = _mm_cvtsi64_si32(row23);
	  *((unsigned*)(dst + (3 * dst_stride))) = _mm_cvtsi64_si32(_mm_unpackhi_pi32(row23, row23));
	}

	#if 0
	/* Disabled helper: the enclosing #if 0 keeps it out of the compiled
	   test.  Its whole body is one extended asm statement with an empty
	   template, no output operands and no clobber list; it only lists six
	   inputs derived from PIX, STRIDE, ALPHA1, BETA1 and the extern
	   ff_bone.  */
	static inline void h264_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha1, int beta1)
	{
	asm volatile(
	""
	/* Outputs are empty.  Register inputs: pix-2*stride, pix, and stride
	   converted to long.  */
	:: "r"(pix-2*stride), "r"(pix), "r"((long)stride),
	/* Memory inputs.  ff_bone is declared earlier in the file, also
	   under #if 0.  */
	"m"(alpha1), "m"(beta1), "m"(ff_bone)
	);
	}
	#endif

	/* Copy the region of 8 rows by 4 bytes beginning at PIX - 2 (row pitch
	   STRIDE) into an 8-byte-aligned scratch buffer and back, as four
	   transpose4x4 calls.  The filter call between the two halves is
	   commented out, leaving ALPHA and BETA unused.  */
	void h264_h_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta)
	{
	  uint8_t scratch[8*4] __attribute__ ((aligned (8)));
	  uint8_t *const left_half = scratch;       /* receives rows 0..3 */
	  uint8_t *const right_half = scratch+4;    /* receives rows 4..7 */

	  /* Into the scratch buffer (pitch 8).  */
	  transpose4x4(left_half, pix-2, 8, stride);
	  transpose4x4(right_half, pix-2+4*stride, 8, stride);

	  /* Disabled in this testcase:
	     h264_loop_filter_chroma_intra_mmx2(trans+2*8, 8, alpha-1, beta-1);  */

	  /* And back out to the picture.  */
	  transpose4x4(pix-2, left_half, stride, 8);
	  transpose4x4(pix-2+4*stride, right_half, stride, 8);
	}