libgomp/testsuite/libgomp.oacc-c-c++-common/routine-wv-2.c - gcc - Git at Google

 /* This code uses nvptx inline assembly guarded with acc_on_device, which is
    not optimized away at -O0, and then confuses the target assembler.
    { dg-skip-if "" { *-*-* } { "-O0" } { "" } } */

 #include <stdio.h>
 #include <openacc.h>

 /* Largest worker count exercised: main doubles the count from 1 up to
    this value.  */
 #define NUM_WORKERS 16
 /* Vector length requested for the parallel region in DoWorkVec, and the
    modulus used there when computing the expected vector id.  */
 #define NUM_VECTORS 32
 /* Column and row counts of the array that DoWorkVec hands to WorkVec.  */
 #define WIDTH 64
 #define HEIGHT 32

 /* Value recorded as the worker id for loop index I.  When acc_on_device
    reports an NVIDIA device, the value is read from the PTX %tid.y
    register; otherwise it is computed as I % N.  Both arguments are
    parenthesized so that compound expressions such as WORK_ID (a + b, n)
    expand with the intended precedence.  */
 #define WORK_ID(I,N)						\
   (acc_on_device (acc_device_nvidia)				\
    ? ({unsigned __r;						\
        __asm__ volatile ("mov.u32 %0,%%tid.y;" : "=r" (__r));	\
        __r; }) : ((I) % (N)))
 /* Value recorded as the vector id for loop index I.  When acc_on_device
    reports an NVIDIA device, the value is read from the PTX %tid.x
    register; otherwise it is computed as I % N.  Both arguments are
    parenthesized so that compound expressions such as VEC_ID (a + b, n)
    expand with the intended precedence.  */
 #define VEC_ID(I,N)						\
   (acc_on_device (acc_device_nvidia)				\
    ? ({unsigned __r;						\
        __asm__ volatile ("mov.u32 %0,%%tid.x;" : "=r" (__r));	\
        __r; }) : ((I) % (N)))

 /* Worker-level OpenACC routine.  Treats PTR as an H-row by W-column array
    and stores into each element the WORK_ID of its row, shifted left by 8,
    ORed with the VEC_ID of its column.  NW and NV are the moduli those
    macros use when not running on an NVIDIA device.  */
 #pragma acc routine worker
 void __attribute__ ((noinline))
   WorkVec (int *ptr, int w, int h, int nw, int nv)
 {
 #pragma acc loop worker
   for (int row = 0; row < h; row++)
     {
 #pragma acc loop vector
       for (int col = 0; col < w; col++)
         {
           unsigned worker_bits = WORK_ID (row, nw) << 8;
           unsigned vector_bits = VEC_ID (col, nv);

           ptr[row * w + col] = worker_bits | vector_bits;
         }
     }
 }

 /* Run WorkVec inside a parallel region with NW workers and a vector length
    of NUM_VECTORS, then compare every array element against the value
    computed on the host.  Progress and each mismatch are printed to stdout.
    Returns 1 if any element was wrong, 0 otherwise.  */
 int DoWorkVec (int nw)
 {
   int ary[HEIGHT][WIDTH];
   int err = 0;

   /* Pre-fill with a marker that no expected value can equal, so elements
      left unwritten show up as mismatches.  */
   for (int row = 0; row < HEIGHT; row++)
     {
       for (int col = 0; col < WIDTH; col++)
         ary[row][col] = 0xdeadbeef;
     }

   printf ("spawning %d ...", nw);
   fflush (stdout);

 #pragma acc parallel num_workers(nw) vector_length (NUM_VECTORS) copy (ary)
   {
     WorkVec ((int *)ary, WIDTH, HEIGHT, nw, NUM_VECTORS);
   }

   for (int row = 0; row < HEIGHT; row++)
     {
       for (int col = 0; col < WIDTH; col++)
         {
           int want = ((row % nw) << 8) | (col % NUM_VECTORS);

           if (ary[row][col] == want)
             continue;

           printf ("\nary[%d][%d] = %#x expected %#x", row, col,
                   ary[row][col], want);
           err = 1;
         }
     }

   if (err)
     printf (" failed\n");
   else
     printf (" ok\n");

   return err;
 }

 /* Run DoWorkVec once for each power-of-two worker count from 1 through
    NUM_WORKERS.  The exit status is the OR of the individual results, so
    it is nonzero if any run reported an error.  */
 int main ()
 {
   int status = 0;
   int workers = 1;

   while (workers <= NUM_WORKERS)
     {
       status |= DoWorkVec (workers);
       workers <<= 1;
     }

   return status;
 }
	/* This code uses nvptx inline assembly guarded with acc_on_device, which is
	not optimized away at -O0, and then confuses the target assembler.
	{ dg-skip-if "" { *-*-* } { "-O0" } { "" } } */

	#include <stdio.h>
	#include <openacc.h>

	/* Largest worker count exercised: main doubles the count from 1 up to
	   this value.  */
	#define NUM_WORKERS 16
	/* Vector length requested for the parallel region in DoWorkVec, and the
	   modulus used there when computing the expected vector id.  */
	#define NUM_VECTORS 32
	/* Column and row counts of the array that DoWorkVec hands to WorkVec.  */
	#define WIDTH 64
	#define HEIGHT 32

	/* Value recorded as the worker id for loop index I.  When acc_on_device
	   reports an NVIDIA device, the value is read from the PTX %tid.y
	   register; otherwise it is computed as I % N.  Both arguments are
	   parenthesized so that compound expressions such as
	   WORK_ID (a + b, n) expand with the intended precedence.  */
	#define WORK_ID(I,N) \
	  (acc_on_device (acc_device_nvidia) \
	   ? ({unsigned __r; \
	       __asm__ volatile ("mov.u32 %0,%%tid.y;" : "=r" (__r)); \
	       __r; }) : ((I) % (N)))
	/* Value recorded as the vector id for loop index I.  When acc_on_device
	   reports an NVIDIA device, the value is read from the PTX %tid.x
	   register; otherwise it is computed as I % N.  Both arguments are
	   parenthesized so that compound expressions such as
	   VEC_ID (a + b, n) expand with the intended precedence.  */
	#define VEC_ID(I,N) \
	  (acc_on_device (acc_device_nvidia) \
	   ? ({unsigned __r; \
	       __asm__ volatile ("mov.u32 %0,%%tid.x;" : "=r" (__r)); \
	       __r; }) : ((I) % (N)))

	/* Worker-level OpenACC routine.  Treats PTR as an H-row by W-column
	   array and stores into each element the WORK_ID of its row, shifted
	   left by 8, ORed with the VEC_ID of its column.  NW and NV are the
	   moduli those macros use when not running on an NVIDIA device.  */
	#pragma acc routine worker
	void __attribute__ ((noinline))
	WorkVec (int *ptr, int w, int h, int nw, int nv)
	{
	#pragma acc loop worker
	  for (int i = 0; i < h; i++)
	#pragma acc loop vector
	    for (int j = 0; j < w; j++)
	      ptr[i*w + j] = (WORK_ID (i, nw) << 8) | VEC_ID(j, nv);
	}

	/* Run WorkVec inside a parallel region with NW workers and a vector
	   length of NUM_VECTORS, then compare every array element against the
	   value computed on the host.  Progress and each mismatch are printed
	   to stdout.  Returns 1 if any element was wrong, 0 otherwise.  */
	int DoWorkVec (int nw)
	{
	  int ary[HEIGHT][WIDTH];
	  int err = 0;

	  /* Pre-fill with a marker that no expected value can equal, so
	     elements left unwritten show up as mismatches.  */
	  for (int ix = 0; ix != HEIGHT; ix++)
	    for (int jx = 0; jx != WIDTH; jx++)
	      ary[ix][jx] = 0xdeadbeef;

	  printf ("spawning %d ...", nw); fflush (stdout);

	#pragma acc parallel num_workers(nw) vector_length (NUM_VECTORS) copy (ary)
	  {
	    WorkVec ((int *)ary, WIDTH, HEIGHT, nw, NUM_VECTORS);
	  }

	  for (int ix = 0; ix != HEIGHT; ix++)
	    for (int jx = 0; jx != WIDTH; jx++)
	      {
	        /* Host-side value: row modulo the worker count shifted left
	           by 8, ORed with column modulo the vector length.  */
	        int exp = ((ix % nw) << 8) | (jx % NUM_VECTORS);

	        if (ary[ix][jx] != exp)
	          {
	            printf ("\nary[%d][%d] = %#x expected %#x", ix, jx,
	                    ary[ix][jx], exp);
	            err = 1;
	          }
	      }
	  printf (err ? " failed\n" : " ok\n");

	  return err;
	}

	/* Run DoWorkVec once for each power-of-two worker count from 1 through
	   NUM_WORKERS.  The exit status is the OR of the individual results,
	   so it is nonzero if any run reported an error.  */
	int main ()
	{
	  int err = 0;

	  for (int W = 1; W <= NUM_WORKERS; W <<= 1)
	    err |= DoWorkVec (W);

	  return err;
	}