libgomp/testsuite/libgomp.oacc-c-c++-common/static-variable-1.c - gcc - Git at Google

 /* "Function scope" (top-level block scope) 'static' variables

    ... inside OpenACC compute construct regions as well as OpenACC 'routine'.

    This is to document/verify aspects of GCC's observed behavior, not
    necessarily as it's (intended to be?) restricted by the OpenACC
    specification.  See also PR84991, PR84992, PR90779 etc., and
    <https://github.com/OpenACC/openacc-spec/issues/372> "C/C++ 'static'
    variables" (only visible to members of the GitHub OpenACC organization).
 */

 /* { dg-additional-options "-fopt-info-note-omp" }
    { dg-additional-options "--param=openacc-privatization=noisy" }
    { dg-additional-options "-foffload=-fopt-info-note-omp" }
    { dg-additional-options "-foffload=--param=openacc-privatization=noisy" }
    for testing/documenting aspects of that functionality.  */

 /* { dg-additional-options "-Wopenacc-parallelism" } for testing/documenting
    aspects of that functionality.  */


 #undef NDEBUG
 #include <assert.h>
 #include <string.h>
 #include <openacc.h>
 #include <gomp-constants.h>


 #define IF_DEBUG if (0)


 /* Without explicit 'num_gangs'.  */

 static void t0_c(void)
 {
   IF_DEBUG
     __builtin_printf ("%s\n", __FUNCTION__);

   const int i_limit = 11;
   const int var_init = 16;

   for (int i = 0; i < i_limit; ++i)
     {
       int result = 0;
       int num_gangs_actual = -1;
 #pragma acc parallel \
   reduction(max:num_gangs_actual) \
   reduction(max:result)
       /* { dg-note {variable 'var' declared in block isn't candidate for adjusting OpenACC privatization level: static} "" { target *-*-* } .-3 } */
       {
 	num_gangs_actual = 1 + __builtin_goacc_parlevel_id(GOMP_DIM_GANG);

 	static int var = var_init;

 #pragma acc atomic capture
 	result = ++var;

 	/* Irrespective of the order in which the gang-redundant threads
 	   execute, 'var' has now been incremented 'num_gangs_actual' times, and
 	   the final value captured as 'result'.  */
       }
       /* Without an explicit 'num_gangs' clause GCC assigns 'num_gangs(1)'
 	 because it doesn't see any use of gang-level parallelism inside the
 	 region.  */
       assert(num_gangs_actual == 1);
       assert(result == var_init + num_gangs_actual * (1 + i));
     }
 }


 /* Call a gang-level routine.  */

 static const int t0_r_var_init = 61;

 #pragma acc routine gang
 /* { dg-bogus "warning: region is gang partitioned but does not contain gang partitioned code" "TODO 'atomic'" { xfail *-*-* } .+4 } */
 /* { dg-warning "region is worker partitioned but does not contain worker partitioned code" "" { target *-*-* } .+3 } */
 /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "" { target *-*-* } .+2 } */
 __attribute__((noinline))
 static int t0_r_r(void)
 {
   static int var = t0_r_var_init;

   int tmp;
 #pragma acc atomic capture
   tmp = ++var;

   return tmp;
 }

 static void t0_r(void)
 {
   IF_DEBUG
     __builtin_printf ("%s\n", __FUNCTION__);

   const int i_limit = 11;

   for (int i = 0; i < i_limit; ++i)
     {
       int result = 0;
       int num_gangs_actual = -1;
 #pragma acc parallel \
   reduction(max:num_gangs_actual) \
   reduction(max:result)
       {
 	num_gangs_actual = 1 + __builtin_goacc_parlevel_id(GOMP_DIM_GANG);

 	result = t0_r_r();

 	/* Irrespective of the order in which the gang-redundant threads
 	   execute, 'var' has now been incremented 'num_gangs_actual' times, and
 	   the final value captured as 'result'.  */
       }
       /* The number of gangs selected by the implemention ought to but must not
 	 be bigger than one.  */
       IF_DEBUG
 	__builtin_printf ("%d: num_gangs_actual: %d\n", i, num_gangs_actual);
       assert(num_gangs_actual >= 1);
       assert(result == t0_r_var_init + num_gangs_actual * (1 + i));
     }
 }


 /* Explicit 'num_gangs'.  */

 static void t1_c(void)
 {
   IF_DEBUG
     __builtin_printf ("%s\n", __FUNCTION__);

   const int i_limit = 22;
   const int num_gangs_request = 444;
   const int var_init = 5;

   for (int i = 0; i < i_limit; ++i)
     {
       int result = 0;
       int num_gangs_actual = -1;
       /* { dg-bogus "warning: region is gang partitioned but does not contain gang partitioned code" "TODO 'atomic'" { xfail *-*-* } .+1 } */
 #pragma acc parallel \
   num_gangs(num_gangs_request) \
   reduction(max:num_gangs_actual) \
   reduction(max:result)
       /* { dg-note {variable 'var' declared in block isn't candidate for adjusting OpenACC privatization level: static} "" { target *-*-* } .-4 } */
       {
 	num_gangs_actual = 1 + __builtin_goacc_parlevel_id(GOMP_DIM_GANG);

 	static int var = var_init;

 #pragma acc atomic capture
 	result = ++var;

 	/* Irrespective of the order in which the gang-redundant threads
 	   execute, 'var' has now been incremented 'num_gangs_actual' times, and
 	   the final value captured as 'result'.  */
       }
       if (acc_get_device_type() == acc_device_host)
 	assert(num_gangs_actual == 1);
       else
 	assert(num_gangs_actual == num_gangs_request);
       assert(result == var_init + num_gangs_actual * (1 + i));
     }
 }


 /* Check the same routine called from two compute constructs.  */

 static const int t1_r2_var_init = 166;

 #pragma acc routine gang
 /* { dg-bogus "warning: region is gang partitioned but does not contain gang partitioned code" "TODO 'atomic'" { xfail *-*-* } .+4 } */
 /* { dg-warning "region is worker partitioned but does not contain worker partitioned code" "" { target *-*-* } .+3 } */
 /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "" { target *-*-* } .+2 } */
 __attribute__((noinline))
 static int t1_r2_r(void)
 {
   static int var = t1_r2_var_init;

   int tmp;
 #pragma acc atomic capture
   tmp = ++var;

   return tmp;
 }

 static void t1_r2(void)
 {
   IF_DEBUG
     __builtin_printf ("%s\n", __FUNCTION__);

   const int i_limit = 71;
   /* The checking assumes the same 'num_gangs' for all compute constructs.  */
   const int num_gangs_request = 333;
   int num_gangs_actual = -1;
   if (acc_get_device_type() == acc_device_host)
     num_gangs_actual = 1;
   else
     {
       /* We're assuming that the implementation is able to accomodate the
 	 'num_gangs' requested (which really ought to be true for
 	 'num_gangs').  */
       num_gangs_actual = num_gangs_request;
     }

   for (int i = 0; i < i_limit; ++i)
     {
       int result_1 = 0;
 #pragma acc parallel \
   num_gangs(num_gangs_request) \
   reduction(max:result_1)
       {
 	result_1 = t1_r2_r();

 	/* Irrespective of the order in which the gang-redundant threads
 	   execute, 'var' has now been incremented 'num_gangs_actual' times, and
 	   the final value captured as 'result_1'.  */
       }
       IF_DEBUG
 	__builtin_printf ("%d: result_1: %d\n", i, result_1);
       assert(result_1 == t1_r2_var_init + num_gangs_actual * (1 + (i * 3 + 0)));

       int result_2 = 0;
 #pragma acc parallel \
   num_gangs(num_gangs_request) \
   reduction(max:result_2)
       {
 	result_2 = t1_r2_r() + t1_r2_r();

 	/* Irrespective of the order in which the gang-redundant threads
 	   execute, 'var' has now been incremented '2 * num_gangs_actual' times.
 	   However, the order of the two 't1_r2_r' function calls is not
 	   synchronized (between different gang-redundant threads).  We thus
 	   cannot verify the actual 'result_2' values in this case.  */
       }
       IF_DEBUG
 	__builtin_printf ("%d: result_2: %d\n", i, result_2);
       if (num_gangs_actual == 1)
 	/* Per the rationale above, only in this case we can check the actual
 	   result.  */
 	assert(result_2 == (t1_r2_var_init + num_gangs_actual * (1 + (i * 3 + 1))
 			    + t1_r2_var_init + num_gangs_actual * (1 + (i * 3 + 2))));
       /* But we can generally check low and high limits.  */
       {
 	/* Must be bigger than '2 * result_1'.  */
 	int c = 2 * result_1;
 	IF_DEBUG
 	  __builtin_printf ("  > %d\n", c);
 	assert(result_2 > c);
       }
       {
 	/* ..., but limited by the base value for next 'i'.  */
 	int c = 2 * (t1_r2_var_init + num_gangs_actual * (0 + ((i + 1) * 3 + 0)));
 	IF_DEBUG
 	  __builtin_printf ("  < %d\n", c);
 	assert(result_2 < c);
       }
     }
 }


 /* Asynchronous execution.  */

 static const int t2_var_init_2 = -55;

 #pragma acc routine gang
 /* { dg-bogus "warning: region is gang partitioned but does not contain gang partitioned code" "TODO 'atomic'" { xfail *-*-* } .+4 } */
 /* { dg-warning "region is worker partitioned but does not contain worker partitioned code" "" { target *-*-* } .+3 } */
 /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "" { target *-*-* } .+2 } */
 __attribute__((noinline))
 static int t2_r(void)
 {
   static int var = t2_var_init_2;

   int tmp;
 #pragma acc atomic capture
   tmp = ++var;

   return tmp;
 }

 static void t2(void)
 {
   IF_DEBUG
     __builtin_printf ("%s\n", __FUNCTION__);

   const int i_limit = 12;
   const int num_gangs_request_1 = 14;
   const int var_init_1 = 5;
   int results_1[i_limit][num_gangs_request_1];
   memset (results_1, 0, sizeof results_1);
   const int num_gangs_request_2 = 5;
   int results_2[i_limit][num_gangs_request_2];
   memset (results_2, 0, sizeof results_2);
   const int num_gangs_request_3 = 34;
   const int var_init_3 = 1250;
   int results_3[i_limit][num_gangs_request_3];
   memset (results_3, 0, sizeof results_3);

 #pragma acc data \
   copy(results_1, results_2, results_3)
   /* { dg-note {variable 'num_gangs_request_1\.[0-9]+' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} "" { target { c && { ! __OPTIMIZE__ } } } .-2 } */
   /* { dg-note {variable 'num_gangs_request_2\.[0-9]+' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} "" { target { c && { ! __OPTIMIZE__ } } } .-3 } */
   /* { dg-note {variable 'num_gangs_request_3\.[0-9]+' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} "" { target { c && { ! __OPTIMIZE__ } } } .-4 } */
   /* { dg-note {variable 'i' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} "" { target *-*-* } .-5 } */
   {
     for (int i = 0; i < i_limit; ++i)
       {
 	/* The following 'async' clauses effect asynchronous execution, but
 	   using the same async-argument for each compute construct implies that
 	   the respective compute constructs' execution is synchronized with
 	   itself, meaning that all 'i = 0' execution has finished (on the
 	   device) before 'i = 1' is started (on the device), etc.  */

 	/* { dg-bogus "warning: region is gang partitioned but does not contain gang partitioned code" "TODO 'atomic'" { xfail *-*-* } .+1 } */
 #pragma acc parallel \
   present(results_1) \
   num_gangs(num_gangs_request_1) \
   async(1)
 	/* { dg-note {variable 'var' declared in block isn't candidate for adjusting OpenACC privatization level: static} "" { target *-*-* } .-4 } */
 	/* { dg-note {variable 'tmp' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} "" { target *-*-* } .-5 } */
 	{
 	  static int var = var_init_1;

 	  int tmp;
 #pragma acc atomic capture
 	  tmp = ++var;

 	  results_1[i][__builtin_goacc_parlevel_id(GOMP_DIM_GANG)] += tmp;
 	}

 #pragma acc parallel \
   present(results_2) \
   num_gangs(num_gangs_request_2) \
   async(2)
 	{
 	  results_2[i][__builtin_goacc_parlevel_id(GOMP_DIM_GANG)] += t2_r();
 	}

 	/* { dg-bogus "warning: region is gang partitioned but does not contain gang partitioned code" "TODO 'atomic'" { xfail *-*-* } .+1 } */
 #pragma acc parallel \
   present(results_3) \
   num_gangs(num_gangs_request_3) \
   async(3)
 	/* { dg-note {variable 'var' declared in block isn't candidate for adjusting OpenACC privatization level: static} "" { target *-*-* } .-4 } */
 	/* { dg-note {variable 'tmp' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} "" { target *-*-* } .-5 } */
 	{
 	  static int var = var_init_3;

 	  int tmp;
 #pragma acc atomic capture
 	  tmp = ++var;

 	  results_3[i][__builtin_goacc_parlevel_id(GOMP_DIM_GANG)] += tmp;
 	}
       }
 #pragma acc wait
   }
   int num_gangs_actual_1;
   int num_gangs_actual_2;
   int num_gangs_actual_3;
   if (acc_get_device_type() == acc_device_host)
     {
       num_gangs_actual_1 = 1;
       num_gangs_actual_2 = 1;
       num_gangs_actual_3 = 1;
     }
   else
     {
       /* We're assuming that the implementation is able to accomodate the
 	 'num_gangs' requested (which really ought to be true for
 	 'num_gangs').  */
       num_gangs_actual_1 = num_gangs_request_1;
       num_gangs_actual_2 = num_gangs_request_2;
       num_gangs_actual_3 = num_gangs_request_3;
     }

   /* For 'i = 0', 'results_*[i][0..num_gangs_actual_*]' are expected to each
      contain one value of '(1 + var_init_*)..(var_init_* + num_gangs_actual_*)',
      and so on for increasing 'i'.  Their order however is unspecified due to
      the gang-redundant execution.  (Thus checking that their sums match.)  */

   int result_1 = 0;
   int result_2 = 0;
   int result_3 = 0;
   for (int i = 0; i < i_limit; ++i)
     {
       int result_1_ = 0;
       for (int g = 0; g < num_gangs_actual_1; ++g)
 	{
 	  IF_DEBUG
 	    __builtin_printf ("results_1[%d][%d]: %d\n", i, g, results_1[i][g]);
 	  result_1_ += results_1[i][g];
 	}
       IF_DEBUG
 	__builtin_printf ("%d result_1_: %d\n", i, result_1_);
       assert (result_1_ == (((var_init_1 + num_gangs_actual_1 * (1 + i)) * (1 + var_init_1 + num_gangs_actual_1 * (1 + i)) / 2)
 			    - ((var_init_1 + num_gangs_actual_1 * (0 + i)) * (1 + var_init_1 + num_gangs_actual_1 * (0 + i)) / 2)));
       result_1 += result_1_;

       int result_2_ = 0;
       for (int g = 0; g < num_gangs_actual_2; ++g)
 	{
 	  IF_DEBUG
 	    __builtin_printf ("results_2[%d][%d]: %d\n", i, g, results_2[i][g]);
 	  result_2_ += results_2[i][g];
 	}
       IF_DEBUG
 	__builtin_printf ("%d result_2_: %d\n", i, result_2_);
       assert (result_2_ == (((t2_var_init_2 + num_gangs_actual_2 * (1 + i)) * (1 + t2_var_init_2 + num_gangs_actual_2 * (1 + i)) / 2)
 			    - ((t2_var_init_2 + num_gangs_actual_2 * (0 + i)) * (1 + t2_var_init_2 + num_gangs_actual_2 * (0 + i)) / 2)));
       result_2 += result_2_;

       int result_3_ = 0;
       for (int g = 0; g < num_gangs_actual_3; ++g)
 	{
 	  IF_DEBUG
 	    __builtin_printf ("results_3[%d][%d]: %d\n", i, g, results_3[i][g]);
 	  result_3_ += results_3[i][g];
 	}
       IF_DEBUG
 	__builtin_printf ("%d result_3_: %d\n", i, result_3_);
       assert (result_3_ == (((var_init_3 + num_gangs_actual_3 * (1 + i)) * (1 + var_init_3 + num_gangs_actual_3 * (1 + i)) / 2)
 			    - ((var_init_3 + num_gangs_actual_3 * (0 + i)) * (1 + var_init_3 + num_gangs_actual_3 * (0 + i)) / 2)));
       result_3 += result_3_;
     }
   IF_DEBUG
     __builtin_printf ("result_1: %d\n", result_1);
   assert (result_1 == (((var_init_1 + num_gangs_actual_1 * i_limit) * (1 + var_init_1 + num_gangs_actual_1 * i_limit) / 2)
 		       - (var_init_1 * (var_init_1 + 1) / 2)));
   IF_DEBUG
     __builtin_printf ("result_2: %d\n", result_2);
   assert (result_2 == (((t2_var_init_2 + num_gangs_actual_2 * i_limit) * (1 + t2_var_init_2 + num_gangs_actual_2 * i_limit) / 2)
 		       - (t2_var_init_2 * (t2_var_init_2 + 1) / 2)));
   IF_DEBUG
     __builtin_printf ("result_3: %d\n", result_3);
   assert (result_3 == (((var_init_3 + num_gangs_actual_3 * i_limit) * (1 + var_init_3 + num_gangs_actual_3 * i_limit) / 2)
 		       - (var_init_3 * (var_init_3 + 1) / 2)));
 }


 #pragma acc routine seq
 __attribute__((noinline))
 static int pr84991_1_r_s(int n)
 {
   static const int test[] = {1,2,3,4};
   return test[n];
 }

 static void pr84991_1(void)
 {
   int n[1];
   n[0] = 3;
 #pragma acc parallel copy(n)
   {
     n[0] = pr84991_1_r_s(n[0]);
   }
   assert(n[0] == 4);
 }


 static void pr84992_1(void)
 {
   int n[1];
   n[0] = 3;
 #pragma acc parallel copy(n)
   /* { dg-note {variable 'test' declared in block isn't candidate for adjusting OpenACC privatization level: static} "" { target *-*-* } .-1 } */
   {
     static const int test[] = {1,2,3,4};
     n[0] = test[n[0]];
   }
   assert(n[0] == 4);
 }


 int main(void)
 {
   t0_c();

   t0_r();

   t1_c();

   t1_r2();

   t2();

   pr84991_1();

   pr84992_1();

   return 0;
 }
	/* "Function scope" (top-level block scope) 'static' variables

	... inside OpenACC compute construct regions as well as OpenACC 'routine'.

	This is to document/verify aspects of GCC's observed behavior, not
	necessarily as it's (intended to be?) restricted by the OpenACC
	specification. See also PR84991, PR84992, PR90779 etc., and
	<https://github.com/OpenACC/openacc-spec/issues/372> "C/C++ 'static'
	variables" (only visible to members of the GitHub OpenACC organization).
	*/

	/* { dg-additional-options "-fopt-info-note-omp" }
	{ dg-additional-options "--param=openacc-privatization=noisy" }
	{ dg-additional-options "-foffload=-fopt-info-note-omp" }
	{ dg-additional-options "-foffload=--param=openacc-privatization=noisy" }
	for testing/documenting aspects of that functionality. */

	/* { dg-additional-options "-Wopenacc-parallelism" } for testing/documenting
	aspects of that functionality. */


	#undef NDEBUG
	#include <assert.h>
	#include <string.h>
	#include <openacc.h>
	#include <gomp-constants.h>


	#define IF_DEBUG if (0)


	/* Without explicit 'num_gangs'. */

	static void t0_c(void)
	{
	IF_DEBUG
	__builtin_printf ("%s\n", __FUNCTION__);

	const int i_limit = 11;
	const int var_init = 16;

	for (int i = 0; i < i_limit; ++i)
	{
	int result = 0;
	int num_gangs_actual = -1;
	#pragma acc parallel \
	reduction(max:num_gangs_actual) \
	reduction(max:result)
	/* { dg-note {variable 'var' declared in block isn't candidate for adjusting OpenACC privatization level: static} "" { target --* } .-3 } */
	{
	num_gangs_actual = 1 + __builtin_goacc_parlevel_id(GOMP_DIM_GANG);

	static int var = var_init;

	#pragma acc atomic capture
	result = ++var;

	/* Irrespective of the order in which the gang-redundant threads
	execute, 'var' has now been incremented 'num_gangs_actual' times, and
	the final value captured as 'result'. */
	}
	/* Without an explicit 'num_gangs' clause GCC assigns 'num_gangs(1)'
	because it doesn't see any use of gang-level parallelism inside the
	region. */
	assert(num_gangs_actual == 1);
	assert(result == var_init + num_gangs_actual * (1 + i));
	}
	}


	/* Call a gang-level routine. */

	static const int t0_r_var_init = 61;

	#pragma acc routine gang
	/* { dg-bogus "warning: region is gang partitioned but does not contain gang partitioned code" "TODO 'atomic'" { xfail --* } .+4 } */
	/* { dg-warning "region is worker partitioned but does not contain worker partitioned code" "" { target --* } .+3 } */
	/* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "" { target --* } .+2 } */
	__attribute__((noinline))
	static int t0_r_r(void)
	{
	static int var = t0_r_var_init;

	int tmp;
	#pragma acc atomic capture
	tmp = ++var;

	return tmp;
	}

	static void t0_r(void)
	{
	IF_DEBUG
	__builtin_printf ("%s\n", __FUNCTION__);

	const int i_limit = 11;

	for (int i = 0; i < i_limit; ++i)
	{
	int result = 0;
	int num_gangs_actual = -1;
	#pragma acc parallel \
	reduction(max:num_gangs_actual) \
	reduction(max:result)
	{
	num_gangs_actual = 1 + __builtin_goacc_parlevel_id(GOMP_DIM_GANG);

	result = t0_r_r();

	/* Irrespective of the order in which the gang-redundant threads
	execute, 'var' has now been incremented 'num_gangs_actual' times, and
	the final value captured as 'result'. */
	}
	/* The number of gangs selected by the implemention ought to but must not
	be bigger than one. */
	IF_DEBUG
	__builtin_printf ("%d: num_gangs_actual: %d\n", i, num_gangs_actual);
	assert(num_gangs_actual >= 1);
	assert(result == t0_r_var_init + num_gangs_actual * (1 + i));
	}
	}


	/* Explicit 'num_gangs'. */

	static void t1_c(void)
	{
	IF_DEBUG
	__builtin_printf ("%s\n", __FUNCTION__);

	const int i_limit = 22;
	const int num_gangs_request = 444;
	const int var_init = 5;

	for (int i = 0; i < i_limit; ++i)
	{
	int result = 0;
	int num_gangs_actual = -1;
	/* { dg-bogus "warning: region is gang partitioned but does not contain gang partitioned code" "TODO 'atomic'" { xfail --* } .+1 } */
	#pragma acc parallel \
	num_gangs(num_gangs_request) \
	reduction(max:num_gangs_actual) \
	reduction(max:result)
	/* { dg-note {variable 'var' declared in block isn't candidate for adjusting OpenACC privatization level: static} "" { target --* } .-4 } */
	{
	num_gangs_actual = 1 + __builtin_goacc_parlevel_id(GOMP_DIM_GANG);

	static int var = var_init;

	#pragma acc atomic capture
	result = ++var;

	/* Irrespective of the order in which the gang-redundant threads
	execute, 'var' has now been incremented 'num_gangs_actual' times, and
	the final value captured as 'result'. */
	}
	if (acc_get_device_type() == acc_device_host)
	assert(num_gangs_actual == 1);
	else
	assert(num_gangs_actual == num_gangs_request);
	assert(result == var_init + num_gangs_actual * (1 + i));
	}
	}


	/* Check the same routine called from two compute constructs. */

	static const int t1_r2_var_init = 166;

	#pragma acc routine gang
	/* { dg-bogus "warning: region is gang partitioned but does not contain gang partitioned code" "TODO 'atomic'" { xfail --* } .+4 } */
	/* { dg-warning "region is worker partitioned but does not contain worker partitioned code" "" { target --* } .+3 } */
	/* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "" { target --* } .+2 } */
	__attribute__((noinline))
	static int t1_r2_r(void)
	{
	static int var = t1_r2_var_init;

	int tmp;
	#pragma acc atomic capture
	tmp = ++var;

	return tmp;
	}

	static void t1_r2(void)
	{
	IF_DEBUG
	__builtin_printf ("%s\n", __FUNCTION__);

	const int i_limit = 71;
	/* The checking assumes the same 'num_gangs' for all compute constructs. */
	const int num_gangs_request = 333;
	int num_gangs_actual = -1;
	if (acc_get_device_type() == acc_device_host)
	num_gangs_actual = 1;
	else
	{
	/* We're assuming that the implementation is able to accomodate the
	'num_gangs' requested (which really ought to be true for
	'num_gangs'). */
	num_gangs_actual = num_gangs_request;
	}

	for (int i = 0; i < i_limit; ++i)
	{
	int result_1 = 0;
	#pragma acc parallel \
	num_gangs(num_gangs_request) \
	reduction(max:result_1)
	{
	result_1 = t1_r2_r();

	/* Irrespective of the order in which the gang-redundant threads
	execute, 'var' has now been incremented 'num_gangs_actual' times, and
	the final value captured as 'result_1'. */
	}
	IF_DEBUG
	__builtin_printf ("%d: result_1: %d\n", i, result_1);
	assert(result_1 == t1_r2_var_init + num_gangs_actual * (1 + (i * 3 + 0)));

	int result_2 = 0;
	#pragma acc parallel \
	num_gangs(num_gangs_request) \
	reduction(max:result_2)
	{
	result_2 = t1_r2_r() + t1_r2_r();

	/* Irrespective of the order in which the gang-redundant threads
	execute, 'var' has now been incremented '2 * num_gangs_actual' times.
	However, the order of the two 't1_r2_r' function calls is not
	synchronized (between different gang-redundant threads). We thus
	cannot verify the actual 'result_2' values in this case. */
	}
	IF_DEBUG
	__builtin_printf ("%d: result_2: %d\n", i, result_2);
	if (num_gangs_actual == 1)
	/* Per the rationale above, only in this case we can check the actual
	result. */
	assert(result_2 == (t1_r2_var_init + num_gangs_actual * (1 + (i * 3 + 1))
	+ t1_r2_var_init + num_gangs_actual * (1 + (i * 3 + 2))));
	/* But we can generally check low and high limits. */
	{
	/* Must be bigger than '2 * result_1'. */
	int c = 2 * result_1;
	IF_DEBUG
	__builtin_printf (" > %d\n", c);
	assert(result_2 > c);
	}
	{
	/* ..., but limited by the base value for next 'i'. */
	int c = 2 * (t1_r2_var_init + num_gangs_actual * (0 + ((i + 1) * 3 + 0)));
	IF_DEBUG
	__builtin_printf (" < %d\n", c);
	assert(result_2 < c);
	}
	}
	}


	/* Asynchronous execution. */

	static const int t2_var_init_2 = -55;

	#pragma acc routine gang
	/* { dg-bogus "warning: region is gang partitioned but does not contain gang partitioned code" "TODO 'atomic'" { xfail --* } .+4 } */
	/* { dg-warning "region is worker partitioned but does not contain worker partitioned code" "" { target --* } .+3 } */
	/* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "" { target --* } .+2 } */
	__attribute__((noinline))
	static int t2_r(void)
	{
	static int var = t2_var_init_2;

	int tmp;
	#pragma acc atomic capture
	tmp = ++var;

	return tmp;
	}

	static void t2(void)
	{
	IF_DEBUG
	__builtin_printf ("%s\n", __FUNCTION__);

	const int i_limit = 12;
	const int num_gangs_request_1 = 14;
	const int var_init_1 = 5;
	int results_1[i_limit][num_gangs_request_1];
	memset (results_1, 0, sizeof results_1);
	const int num_gangs_request_2 = 5;
	int results_2[i_limit][num_gangs_request_2];
	memset (results_2, 0, sizeof results_2);
	const int num_gangs_request_3 = 34;
	const int var_init_3 = 1250;
	int results_3[i_limit][num_gangs_request_3];
	memset (results_3, 0, sizeof results_3);

	#pragma acc data \
	copy(results_1, results_2, results_3)
	/* { dg-note {variable 'num_gangs_request_1\.[0-9]+' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} "" { target { c && { ! __OPTIMIZE__ } } } .-2 } */
	/* { dg-note {variable 'num_gangs_request_2\.[0-9]+' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} "" { target { c && { ! __OPTIMIZE__ } } } .-3 } */
	/* { dg-note {variable 'num_gangs_request_3\.[0-9]+' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} "" { target { c && { ! __OPTIMIZE__ } } } .-4 } */
	/* { dg-note {variable 'i' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} "" { target --* } .-5 } */
	{
	for (int i = 0; i < i_limit; ++i)
	{
	/* The following 'async' clauses effect asynchronous execution, but
	using the same async-argument for each compute construct implies that
	the respective compute constructs' execution is synchronized with
	itself, meaning that all 'i = 0' execution has finished (on the
	device) before 'i = 1' is started (on the device), etc. */

	/* { dg-bogus "warning: region is gang partitioned but does not contain gang partitioned code" "TODO 'atomic'" { xfail --* } .+1 } */
	#pragma acc parallel \
	present(results_1) \
	num_gangs(num_gangs_request_1) \
	async(1)
	/* { dg-note {variable 'var' declared in block isn't candidate for adjusting OpenACC privatization level: static} "" { target --* } .-4 } */
	/* { dg-note {variable 'tmp' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} "" { target --* } .-5 } */
	{
	static int var = var_init_1;

	int tmp;
	#pragma acc atomic capture
	tmp = ++var;

	results_1[i][__builtin_goacc_parlevel_id(GOMP_DIM_GANG)] += tmp;
	}

	#pragma acc parallel \
	present(results_2) \
	num_gangs(num_gangs_request_2) \
	async(2)
	{
	results_2[i][__builtin_goacc_parlevel_id(GOMP_DIM_GANG)] += t2_r();
	}

	/* { dg-bogus "warning: region is gang partitioned but does not contain gang partitioned code" "TODO 'atomic'" { xfail --* } .+1 } */
	#pragma acc parallel \
	present(results_3) \
	num_gangs(num_gangs_request_3) \
	async(3)
	/* { dg-note {variable 'var' declared in block isn't candidate for adjusting OpenACC privatization level: static} "" { target --* } .-4 } */
	/* { dg-note {variable 'tmp' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} "" { target --* } .-5 } */
	{
	static int var = var_init_3;

	int tmp;
	#pragma acc atomic capture
	tmp = ++var;

	results_3[i][__builtin_goacc_parlevel_id(GOMP_DIM_GANG)] += tmp;
	}
	}
	#pragma acc wait
	}
	int num_gangs_actual_1;
	int num_gangs_actual_2;
	int num_gangs_actual_3;
	if (acc_get_device_type() == acc_device_host)
	{
	num_gangs_actual_1 = 1;
	num_gangs_actual_2 = 1;
	num_gangs_actual_3 = 1;
	}
	else
	{
	/* We're assuming that the implementation is able to accomodate the
	'num_gangs' requested (which really ought to be true for
	'num_gangs'). */
	num_gangs_actual_1 = num_gangs_request_1;
	num_gangs_actual_2 = num_gangs_request_2;
	num_gangs_actual_3 = num_gangs_request_3;
	}

	/* For 'i = 0', 'results_[i][0..num_gangs_actual_]' are expected to each
	contain one value of '(1 + var_init_)..(var_init_ + num_gangs_actual_*)',
	and so on for increasing 'i'. Their order however is unspecified due to
	the gang-redundant execution. (Thus checking that their sums match.) */

	int result_1 = 0;
	int result_2 = 0;
	int result_3 = 0;
	for (int i = 0; i < i_limit; ++i)
	{
	int result_1_ = 0;
	for (int g = 0; g < num_gangs_actual_1; ++g)
	{
	IF_DEBUG
	__builtin_printf ("results_1[%d][%d]: %d\n", i, g, results_1[i][g]);
	result_1_ += results_1[i][g];
	}
	IF_DEBUG
	__builtin_printf ("%d result_1_: %d\n", i, result_1_);
	assert (result_1_ == (((var_init_1 + num_gangs_actual_1 * (1 + i)) * (1 + var_init_1 + num_gangs_actual_1 * (1 + i)) / 2)
	- ((var_init_1 + num_gangs_actual_1 * (0 + i)) * (1 + var_init_1 + num_gangs_actual_1 * (0 + i)) / 2)));
	result_1 += result_1_;

	int result_2_ = 0;
	for (int g = 0; g < num_gangs_actual_2; ++g)
	{
	IF_DEBUG
	__builtin_printf ("results_2[%d][%d]: %d\n", i, g, results_2[i][g]);
	result_2_ += results_2[i][g];
	}
	IF_DEBUG
	__builtin_printf ("%d result_2_: %d\n", i, result_2_);
	assert (result_2_ == (((t2_var_init_2 + num_gangs_actual_2 * (1 + i)) * (1 + t2_var_init_2 + num_gangs_actual_2 * (1 + i)) / 2)
	- ((t2_var_init_2 + num_gangs_actual_2 * (0 + i)) * (1 + t2_var_init_2 + num_gangs_actual_2 * (0 + i)) / 2)));
	result_2 += result_2_;

	int result_3_ = 0;
	for (int g = 0; g < num_gangs_actual_3; ++g)
	{
	IF_DEBUG
	__builtin_printf ("results_3[%d][%d]: %d\n", i, g, results_3[i][g]);
	result_3_ += results_3[i][g];
	}
	IF_DEBUG
	__builtin_printf ("%d result_3_: %d\n", i, result_3_);
	assert (result_3_ == (((var_init_3 + num_gangs_actual_3 * (1 + i)) * (1 + var_init_3 + num_gangs_actual_3 * (1 + i)) / 2)
	- ((var_init_3 + num_gangs_actual_3 * (0 + i)) * (1 + var_init_3 + num_gangs_actual_3 * (0 + i)) / 2)));
	result_3 += result_3_;
	}
	IF_DEBUG
	__builtin_printf ("result_1: %d\n", result_1);
	assert (result_1 == (((var_init_1 + num_gangs_actual_1 * i_limit) * (1 + var_init_1 + num_gangs_actual_1 * i_limit) / 2)
	- (var_init_1 * (var_init_1 + 1) / 2)));
	IF_DEBUG
	__builtin_printf ("result_2: %d\n", result_2);
	assert (result_2 == (((t2_var_init_2 + num_gangs_actual_2 * i_limit) * (1 + t2_var_init_2 + num_gangs_actual_2 * i_limit) / 2)
	- (t2_var_init_2 * (t2_var_init_2 + 1) / 2)));
	IF_DEBUG
	__builtin_printf ("result_3: %d\n", result_3);
	assert (result_3 == (((var_init_3 + num_gangs_actual_3 * i_limit) * (1 + var_init_3 + num_gangs_actual_3 * i_limit) / 2)
	- (var_init_3 * (var_init_3 + 1) / 2)));
	}


	#pragma acc routine seq
	__attribute__((noinline))
	static int pr84991_1_r_s(int n)
	{
	static const int test[] = {1,2,3,4};
	return test[n];
	}

	static void pr84991_1(void)
	{
	int n[1];
	n[0] = 3;
	#pragma acc parallel copy(n)
	{
	n[0] = pr84991_1_r_s(n[0]);
	}
	assert(n[0] == 4);
	}


	static void pr84992_1(void)
	{
	int n[1];
	n[0] = 3;
	#pragma acc parallel copy(n)
	/* { dg-note {variable 'test' declared in block isn't candidate for adjusting OpenACC privatization level: static} "" { target --* } .-1 } */
	{
	static const int test[] = {1,2,3,4};
	n[0] = test[n[0]];
	}
	assert(n[0] == 4);
	}


	int main(void)
	{
	t0_c();

	t0_r();

	t1_c();

	t1_r2();

	t2();

	pr84991_1();

	pr84992_1();

	return 0;
	}