| /* "Function scope" (top-level block scope) 'static' variables |
| |
| ... inside OpenACC compute construct regions as well as OpenACC 'routine'. |
| |
| This is to document/verify aspects of GCC's observed behavior, not |
| necessarily as it's (intended to be?) restricted by the OpenACC |
| specification. See also PR84991, PR84992, PR90779 etc., and |
| <https://github.com/OpenACC/openacc-spec/issues/372> "C/C++ 'static' |
| variables" (only visible to members of the GitHub OpenACC organization). |
| */ |
| |
| /* { dg-additional-options "-fopt-info-note-omp" } |
| { dg-additional-options "--param=openacc-privatization=noisy" } |
| { dg-additional-options "-foffload=-fopt-info-note-omp" } |
| { dg-additional-options "-foffload=--param=openacc-privatization=noisy" } |
| for testing/documenting aspects of that functionality. */ |
| |
| /* { dg-additional-options "-Wopenacc-parallelism" } for testing/documenting |
| aspects of that functionality. */ |
| |
| |
| #undef NDEBUG |
| #include <assert.h> |
| #include <string.h> |
| #include <openacc.h> |
| #include <gomp-constants.h> |
| |
| |
| #define IF_DEBUG if (0) |
| |
| |
| /* Without explicit 'num_gangs'. */ |
| |
| static void t0_c(void) |
| { |
| IF_DEBUG |
| __builtin_printf ("%s\n", __FUNCTION__); |
| |
| const int i_limit = 11; |
| const int var_init = 16; |
| |
| for (int i = 0; i < i_limit; ++i) |
| { |
| int result = 0; |
| int num_gangs_actual = -1; |
| #pragma acc parallel \ |
| reduction(max:num_gangs_actual) \ |
| reduction(max:result) |
| /* { dg-note {variable 'var' declared in block isn't candidate for adjusting OpenACC privatization level: static} "" { target *-*-* } .-3 } */ |
| { |
| num_gangs_actual = 1 + __builtin_goacc_parlevel_id(GOMP_DIM_GANG); |
| |
| static int var = var_init; |
| |
| #pragma acc atomic capture |
| result = ++var; |
| |
| /* Irrespective of the order in which the gang-redundant threads |
| execute, 'var' has now been incremented 'num_gangs_actual' times, and |
| the final value captured as 'result'. */ |
| } |
| /* Without an explicit 'num_gangs' clause GCC assigns 'num_gangs(1)' |
| because it doesn't see any use of gang-level parallelism inside the |
| region. */ |
| assert(num_gangs_actual == 1); |
| assert(result == var_init + num_gangs_actual * (1 + i)); |
| } |
| } |
| |
| |
| /* Call a gang-level routine. */ |
| |
| static const int t0_r_var_init = 61; |
| |
| #pragma acc routine gang |
| /* { dg-bogus "warning: region is gang partitioned but does not contain gang partitioned code" "TODO 'atomic'" { xfail *-*-* } .+4 } */ |
| /* { dg-warning "region is worker partitioned but does not contain worker partitioned code" "" { target *-*-* } .+3 } */ |
| /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "" { target *-*-* } .+2 } */ |
| __attribute__((noinline)) |
| static int t0_r_r(void) |
| { |
| static int var = t0_r_var_init; |
| |
| int tmp; |
| #pragma acc atomic capture |
| tmp = ++var; |
| |
| return tmp; |
| } |
| |
| static void t0_r(void) |
| { |
| IF_DEBUG |
| __builtin_printf ("%s\n", __FUNCTION__); |
| |
| const int i_limit = 11; |
| |
| for (int i = 0; i < i_limit; ++i) |
| { |
| int result = 0; |
| int num_gangs_actual = -1; |
| #pragma acc parallel \ |
| reduction(max:num_gangs_actual) \ |
| reduction(max:result) |
| { |
| num_gangs_actual = 1 + __builtin_goacc_parlevel_id(GOMP_DIM_GANG); |
| |
| result = t0_r_r(); |
| |
| /* Irrespective of the order in which the gang-redundant threads |
| execute, 'var' has now been incremented 'num_gangs_actual' times, and |
| the final value captured as 'result'. */ |
| } |
| /* The number of gangs selected by the implemention ought to but must not |
| be bigger than one. */ |
| IF_DEBUG |
| __builtin_printf ("%d: num_gangs_actual: %d\n", i, num_gangs_actual); |
| assert(num_gangs_actual >= 1); |
| assert(result == t0_r_var_init + num_gangs_actual * (1 + i)); |
| } |
| } |
| |
| |
| /* Explicit 'num_gangs'. */ |
| |
| static void t1_c(void) |
| { |
| IF_DEBUG |
| __builtin_printf ("%s\n", __FUNCTION__); |
| |
| const int i_limit = 22; |
| const int num_gangs_request = 444; |
| const int var_init = 5; |
| |
| for (int i = 0; i < i_limit; ++i) |
| { |
| int result = 0; |
| int num_gangs_actual = -1; |
| /* { dg-bogus "warning: region is gang partitioned but does not contain gang partitioned code" "TODO 'atomic'" { xfail *-*-* } .+1 } */ |
| #pragma acc parallel \ |
| num_gangs(num_gangs_request) \ |
| reduction(max:num_gangs_actual) \ |
| reduction(max:result) |
| /* { dg-note {variable 'var' declared in block isn't candidate for adjusting OpenACC privatization level: static} "" { target *-*-* } .-4 } */ |
| { |
| num_gangs_actual = 1 + __builtin_goacc_parlevel_id(GOMP_DIM_GANG); |
| |
| static int var = var_init; |
| |
| #pragma acc atomic capture |
| result = ++var; |
| |
| /* Irrespective of the order in which the gang-redundant threads |
| execute, 'var' has now been incremented 'num_gangs_actual' times, and |
| the final value captured as 'result'. */ |
| } |
| if (acc_get_device_type() == acc_device_host) |
| assert(num_gangs_actual == 1); |
| else |
| assert(num_gangs_actual == num_gangs_request); |
| assert(result == var_init + num_gangs_actual * (1 + i)); |
| } |
| } |
| |
| |
| /* Check the same routine called from two compute constructs. */ |
| |
| static const int t1_r2_var_init = 166; |
| |
| #pragma acc routine gang |
| /* { dg-bogus "warning: region is gang partitioned but does not contain gang partitioned code" "TODO 'atomic'" { xfail *-*-* } .+4 } */ |
| /* { dg-warning "region is worker partitioned but does not contain worker partitioned code" "" { target *-*-* } .+3 } */ |
| /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "" { target *-*-* } .+2 } */ |
| __attribute__((noinline)) |
| static int t1_r2_r(void) |
| { |
| static int var = t1_r2_var_init; |
| |
| int tmp; |
| #pragma acc atomic capture |
| tmp = ++var; |
| |
| return tmp; |
| } |
| |
| static void t1_r2(void) |
| { |
| IF_DEBUG |
| __builtin_printf ("%s\n", __FUNCTION__); |
| |
| const int i_limit = 71; |
| /* The checking assumes the same 'num_gangs' for all compute constructs. */ |
| const int num_gangs_request = 333; |
| int num_gangs_actual = -1; |
| if (acc_get_device_type() == acc_device_host) |
| num_gangs_actual = 1; |
| else |
| { |
| /* We're assuming that the implementation is able to accomodate the |
| 'num_gangs' requested (which really ought to be true for |
| 'num_gangs'). */ |
| num_gangs_actual = num_gangs_request; |
| } |
| |
| for (int i = 0; i < i_limit; ++i) |
| { |
| int result_1 = 0; |
| #pragma acc parallel \ |
| num_gangs(num_gangs_request) \ |
| reduction(max:result_1) |
| { |
| result_1 = t1_r2_r(); |
| |
| /* Irrespective of the order in which the gang-redundant threads |
| execute, 'var' has now been incremented 'num_gangs_actual' times, and |
| the final value captured as 'result_1'. */ |
| } |
| IF_DEBUG |
| __builtin_printf ("%d: result_1: %d\n", i, result_1); |
| assert(result_1 == t1_r2_var_init + num_gangs_actual * (1 + (i * 3 + 0))); |
| |
| int result_2 = 0; |
| #pragma acc parallel \ |
| num_gangs(num_gangs_request) \ |
| reduction(max:result_2) |
| { |
| result_2 = t1_r2_r() + t1_r2_r(); |
| |
| /* Irrespective of the order in which the gang-redundant threads |
| execute, 'var' has now been incremented '2 * num_gangs_actual' times. |
| However, the order of the two 't1_r2_r' function calls is not |
| synchronized (between different gang-redundant threads). We thus |
| cannot verify the actual 'result_2' values in this case. */ |
| } |
| IF_DEBUG |
| __builtin_printf ("%d: result_2: %d\n", i, result_2); |
| if (num_gangs_actual == 1) |
| /* Per the rationale above, only in this case we can check the actual |
| result. */ |
| assert(result_2 == (t1_r2_var_init + num_gangs_actual * (1 + (i * 3 + 1)) |
| + t1_r2_var_init + num_gangs_actual * (1 + (i * 3 + 2)))); |
| /* But we can generally check low and high limits. */ |
| { |
| /* Must be bigger than '2 * result_1'. */ |
| int c = 2 * result_1; |
| IF_DEBUG |
| __builtin_printf (" > %d\n", c); |
| assert(result_2 > c); |
| } |
| { |
| /* ..., but limited by the base value for next 'i'. */ |
| int c = 2 * (t1_r2_var_init + num_gangs_actual * (0 + ((i + 1) * 3 + 0))); |
| IF_DEBUG |
| __builtin_printf (" < %d\n", c); |
| assert(result_2 < c); |
| } |
| } |
| } |
| |
| |
| /* Asynchronous execution. */ |
| |
| static const int t2_var_init_2 = -55; |
| |
| #pragma acc routine gang |
| /* { dg-bogus "warning: region is gang partitioned but does not contain gang partitioned code" "TODO 'atomic'" { xfail *-*-* } .+4 } */ |
| /* { dg-warning "region is worker partitioned but does not contain worker partitioned code" "" { target *-*-* } .+3 } */ |
| /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "" { target *-*-* } .+2 } */ |
| __attribute__((noinline)) |
| static int t2_r(void) |
| { |
| static int var = t2_var_init_2; |
| |
| int tmp; |
| #pragma acc atomic capture |
| tmp = ++var; |
| |
| return tmp; |
| } |
| |
| static void t2(void) |
| { |
| IF_DEBUG |
| __builtin_printf ("%s\n", __FUNCTION__); |
| |
| const int i_limit = 12; |
| const int num_gangs_request_1 = 14; |
| const int var_init_1 = 5; |
| int results_1[i_limit][num_gangs_request_1]; |
| memset (results_1, 0, sizeof results_1); |
| const int num_gangs_request_2 = 5; |
| int results_2[i_limit][num_gangs_request_2]; |
| memset (results_2, 0, sizeof results_2); |
| const int num_gangs_request_3 = 34; |
| const int var_init_3 = 1250; |
| int results_3[i_limit][num_gangs_request_3]; |
| memset (results_3, 0, sizeof results_3); |
| |
| #pragma acc data \ |
| copy(results_1, results_2, results_3) |
| /* { dg-note {variable 'num_gangs_request_1\.[0-9]+' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} "" { target { c && { ! __OPTIMIZE__ } } } .-2 } */ |
| /* { dg-note {variable 'num_gangs_request_2\.[0-9]+' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} "" { target { c && { ! __OPTIMIZE__ } } } .-3 } */ |
| /* { dg-note {variable 'num_gangs_request_3\.[0-9]+' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} "" { target { c && { ! __OPTIMIZE__ } } } .-4 } */ |
| /* { dg-note {variable 'i' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} "" { target *-*-* } .-5 } */ |
| { |
| for (int i = 0; i < i_limit; ++i) |
| { |
| /* The following 'async' clauses effect asynchronous execution, but |
| using the same async-argument for each compute construct implies that |
| the respective compute constructs' execution is synchronized with |
| itself, meaning that all 'i = 0' execution has finished (on the |
| device) before 'i = 1' is started (on the device), etc. */ |
| |
| /* { dg-bogus "warning: region is gang partitioned but does not contain gang partitioned code" "TODO 'atomic'" { xfail *-*-* } .+1 } */ |
| #pragma acc parallel \ |
| present(results_1) \ |
| num_gangs(num_gangs_request_1) \ |
| async(1) |
| /* { dg-note {variable 'var' declared in block isn't candidate for adjusting OpenACC privatization level: static} "" { target *-*-* } .-4 } */ |
| /* { dg-note {variable 'tmp' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} "" { target *-*-* } .-5 } */ |
| { |
| static int var = var_init_1; |
| |
| int tmp; |
| #pragma acc atomic capture |
| tmp = ++var; |
| |
| results_1[i][__builtin_goacc_parlevel_id(GOMP_DIM_GANG)] += tmp; |
| } |
| |
| #pragma acc parallel \ |
| present(results_2) \ |
| num_gangs(num_gangs_request_2) \ |
| async(2) |
| { |
| results_2[i][__builtin_goacc_parlevel_id(GOMP_DIM_GANG)] += t2_r(); |
| } |
| |
| /* { dg-bogus "warning: region is gang partitioned but does not contain gang partitioned code" "TODO 'atomic'" { xfail *-*-* } .+1 } */ |
| #pragma acc parallel \ |
| present(results_3) \ |
| num_gangs(num_gangs_request_3) \ |
| async(3) |
| /* { dg-note {variable 'var' declared in block isn't candidate for adjusting OpenACC privatization level: static} "" { target *-*-* } .-4 } */ |
| /* { dg-note {variable 'tmp' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} "" { target *-*-* } .-5 } */ |
| { |
| static int var = var_init_3; |
| |
| int tmp; |
| #pragma acc atomic capture |
| tmp = ++var; |
| |
| results_3[i][__builtin_goacc_parlevel_id(GOMP_DIM_GANG)] += tmp; |
| } |
| } |
| #pragma acc wait |
| } |
| int num_gangs_actual_1; |
| int num_gangs_actual_2; |
| int num_gangs_actual_3; |
| if (acc_get_device_type() == acc_device_host) |
| { |
| num_gangs_actual_1 = 1; |
| num_gangs_actual_2 = 1; |
| num_gangs_actual_3 = 1; |
| } |
| else |
| { |
| /* We're assuming that the implementation is able to accomodate the |
| 'num_gangs' requested (which really ought to be true for |
| 'num_gangs'). */ |
| num_gangs_actual_1 = num_gangs_request_1; |
| num_gangs_actual_2 = num_gangs_request_2; |
| num_gangs_actual_3 = num_gangs_request_3; |
| } |
| |
| /* For 'i = 0', 'results_*[i][0..num_gangs_actual_*]' are expected to each |
| contain one value of '(1 + var_init_*)..(var_init_* + num_gangs_actual_*)', |
| and so on for increasing 'i'. Their order however is unspecified due to |
| the gang-redundant execution. (Thus checking that their sums match.) */ |
| |
| int result_1 = 0; |
| int result_2 = 0; |
| int result_3 = 0; |
| for (int i = 0; i < i_limit; ++i) |
| { |
| int result_1_ = 0; |
| for (int g = 0; g < num_gangs_actual_1; ++g) |
| { |
| IF_DEBUG |
| __builtin_printf ("results_1[%d][%d]: %d\n", i, g, results_1[i][g]); |
| result_1_ += results_1[i][g]; |
| } |
| IF_DEBUG |
| __builtin_printf ("%d result_1_: %d\n", i, result_1_); |
| assert (result_1_ == (((var_init_1 + num_gangs_actual_1 * (1 + i)) * (1 + var_init_1 + num_gangs_actual_1 * (1 + i)) / 2) |
| - ((var_init_1 + num_gangs_actual_1 * (0 + i)) * (1 + var_init_1 + num_gangs_actual_1 * (0 + i)) / 2))); |
| result_1 += result_1_; |
| |
| int result_2_ = 0; |
| for (int g = 0; g < num_gangs_actual_2; ++g) |
| { |
| IF_DEBUG |
| __builtin_printf ("results_2[%d][%d]: %d\n", i, g, results_2[i][g]); |
| result_2_ += results_2[i][g]; |
| } |
| IF_DEBUG |
| __builtin_printf ("%d result_2_: %d\n", i, result_2_); |
| assert (result_2_ == (((t2_var_init_2 + num_gangs_actual_2 * (1 + i)) * (1 + t2_var_init_2 + num_gangs_actual_2 * (1 + i)) / 2) |
| - ((t2_var_init_2 + num_gangs_actual_2 * (0 + i)) * (1 + t2_var_init_2 + num_gangs_actual_2 * (0 + i)) / 2))); |
| result_2 += result_2_; |
| |
| int result_3_ = 0; |
| for (int g = 0; g < num_gangs_actual_3; ++g) |
| { |
| IF_DEBUG |
| __builtin_printf ("results_3[%d][%d]: %d\n", i, g, results_3[i][g]); |
| result_3_ += results_3[i][g]; |
| } |
| IF_DEBUG |
| __builtin_printf ("%d result_3_: %d\n", i, result_3_); |
| assert (result_3_ == (((var_init_3 + num_gangs_actual_3 * (1 + i)) * (1 + var_init_3 + num_gangs_actual_3 * (1 + i)) / 2) |
| - ((var_init_3 + num_gangs_actual_3 * (0 + i)) * (1 + var_init_3 + num_gangs_actual_3 * (0 + i)) / 2))); |
| result_3 += result_3_; |
| } |
| IF_DEBUG |
| __builtin_printf ("result_1: %d\n", result_1); |
| assert (result_1 == (((var_init_1 + num_gangs_actual_1 * i_limit) * (1 + var_init_1 + num_gangs_actual_1 * i_limit) / 2) |
| - (var_init_1 * (var_init_1 + 1) / 2))); |
| IF_DEBUG |
| __builtin_printf ("result_2: %d\n", result_2); |
| assert (result_2 == (((t2_var_init_2 + num_gangs_actual_2 * i_limit) * (1 + t2_var_init_2 + num_gangs_actual_2 * i_limit) / 2) |
| - (t2_var_init_2 * (t2_var_init_2 + 1) / 2))); |
| IF_DEBUG |
| __builtin_printf ("result_3: %d\n", result_3); |
| assert (result_3 == (((var_init_3 + num_gangs_actual_3 * i_limit) * (1 + var_init_3 + num_gangs_actual_3 * i_limit) / 2) |
| - (var_init_3 * (var_init_3 + 1) / 2))); |
| } |
| |
| |
| #pragma acc routine seq |
| __attribute__((noinline)) |
| static int pr84991_1_r_s(int n) |
| { |
| static const int test[] = {1,2,3,4}; |
| return test[n]; |
| } |
| |
| static void pr84991_1(void) |
| { |
| int n[1]; |
| n[0] = 3; |
| #pragma acc parallel copy(n) |
| { |
| n[0] = pr84991_1_r_s(n[0]); |
| } |
| assert(n[0] == 4); |
| } |
| |
| |
| static void pr84992_1(void) |
| { |
| int n[1]; |
| n[0] = 3; |
| #pragma acc parallel copy(n) |
| /* { dg-note {variable 'test' declared in block isn't candidate for adjusting OpenACC privatization level: static} "" { target *-*-* } .-1 } */ |
| { |
| static const int test[] = {1,2,3,4}; |
| n[0] = test[n[0]]; |
| } |
| assert(n[0] == 4); |
| } |
| |
| |
| int main(void) |
| { |
| t0_c(); |
| |
| t0_r(); |
| |
| t1_c(); |
| |
| t1_r2(); |
| |
| t2(); |
| |
| pr84991_1(); |
| |
| pr84992_1(); |
| |
| return 0; |
| } |