| /* Test dispatch of events to callbacks. */ |
| |
| /* { dg-additional-options "--param=openacc-kernels=decompose" } */ |
| |
| /* { dg-additional-options "-fopt-info-omp-all" } |
| { dg-additional-options "-foffload=-fopt-info-omp-all" } */ |
| |
| /* { dg-additional-options "--param=openacc-privatization=noisy" } |
| { dg-additional-options "-foffload=--param=openacc-privatization=noisy" } |
| Prune a few: uninteresting: |
| { dg-prune-output {note: variable 'D\.[0-9]+' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} } */ |
| |
| /* It's only with Tcl 8.5 (released in 2007) that "the variable 'varName' |
| passed to 'incr' may be unset, and in that case, it will be set to [...]", |
| so to maintain compatibility with earlier Tcl releases, we manually |
| initialize counter variables: |
| { dg-line l_dummy[variable c_compute 0] } |
| { dg-message dummy {} { target iN-VAl-Id } l_dummy } to avoid |
| "WARNING: dg-line var l_dummy defined, but not used". */ |
| |
| #undef NDEBUG |
| #include <assert.h> |
| #include <stdlib.h> |
| #include <string.h> |
| |
| #include <acc_prof.h> |
| |
| |
| /* Use explicit 'copyin' clauses, to work around "'firstprivate' |
| optimizations", which will cause the value at the point of call to be used |
| (*before* any potential modifications done in callbacks), as opposed to its |
| address being taken, which then later gets dereferenced (*after* any |
| modifications done in callbacks). */ |
| #define COPYIN(...) copyin(__VA_ARGS__) |
| |
| |
| /* See the 'DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT' reference in |
| 'libgomp.texi'. */ |
| #define DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT 0 |
| |
| |
| #define DEBUG_printf(...) //__builtin_printf (__VA_ARGS__) |
| |
| |
| volatile // TODO PR90488 |
| static int state = -1; |
| |
| #define STATE_OP(state, op) \ |
| do \ |
| { \ |
| typeof (state) state_o = (state); \ |
| (void) state_o; \ |
| (state)op; \ |
| DEBUG_printf("state: %d -> %d\n", state_o, (state)); \ |
| } \ |
| while (0) |
| |
| |
| static acc_device_t acc_device_type; |
| static int acc_device_num; |
| static int num_gangs, num_workers, vector_length; |
| |
| |
| static void cb_enqueue_launch_start (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info) |
| { |
| DEBUG_printf ("%s\n", __FUNCTION__); |
| |
| assert (acc_device_type != acc_device_host); |
| |
| assert (state == 0); |
| STATE_OP (state, = 1); |
| |
| assert (prof_info->event_type == acc_ev_enqueue_launch_start); |
| assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES); |
| assert (prof_info->version == _ACC_PROF_INFO_VERSION); |
| assert (prof_info->device_type == acc_device_type); |
| assert (prof_info->device_number == acc_device_num); |
| assert (prof_info->thread_id == -1); |
| assert (prof_info->async == acc_async_noval); |
| assert (prof_info->async_queue == prof_info->async); |
| assert (prof_info->src_file == NULL); |
| assert (prof_info->func_name == NULL); |
| assert (prof_info->line_no == -1); |
| assert (prof_info->end_line_no == -1); |
| assert (prof_info->func_line_no == -1); |
| assert (prof_info->func_end_line_no == -1); |
| |
| assert (event_info->launch_event.event_type == prof_info->event_type); |
| assert (event_info->launch_event.valid_bytes == _ACC_LAUNCH_EVENT_INFO_VALID_BYTES); |
| assert (event_info->launch_event.parent_construct == acc_construct_parallel); |
| assert (event_info->launch_event.implicit == 1); |
| assert (event_info->launch_event.tool_info == NULL); |
| assert (event_info->launch_event.kernel_name != NULL); |
| { |
| const char *s = strstr (event_info->launch_event.kernel_name, "main"); |
| assert (s != NULL); |
| s = strstr (s, "omp_fn"); |
| assert (s != NULL); |
| } |
| if (num_gangs < 1) |
| assert (event_info->launch_event.num_gangs >= 1); |
| else |
| { |
| #ifdef __OPTIMIZE__ |
| assert (event_info->launch_event.num_gangs == num_gangs); |
| #else |
| /* No parallelized OpenACC 'kernels' constructs. Unparallelized OpenACC |
| 'kernels' constructs must get launched as 1 x 1 x 1 GPU kernels. */ |
| assert (event_info->launch_event.num_gangs == 1); |
| #endif |
| } |
| if (num_workers < 1) |
| assert (event_info->launch_event.num_workers >= 1); |
| else |
| { |
| #ifdef __OPTIMIZE__ |
| assert (event_info->launch_event.num_workers == num_workers); |
| #else |
| /* See 'num_gangs' above. */ |
| assert (event_info->launch_event.num_workers == 1); |
| #endif |
| } |
| if (vector_length < 1) |
| assert (event_info->launch_event.vector_length >= 1); |
| else if (acc_device_type == acc_device_nvidia) /* ... is special. */ |
| assert (event_info->launch_event.vector_length == 32); |
| else if (acc_device_type == acc_device_radeon) /* ...and so is this. */ |
| assert (event_info->launch_event.vector_length == 64); |
| else |
| { |
| #ifdef __OPTIMIZE__ |
| assert (event_info->launch_event.vector_length == vector_length); |
| #else |
| /* See 'num_gangs' above. */ |
| assert (event_info->launch_event.vector_length == 1); |
| #endif |
| } |
| |
| if (acc_device_type == acc_device_host) |
| assert (api_info->device_api == acc_device_api_none); |
| else if (acc_device_type == acc_device_radeon) |
| assert (api_info->device_api == acc_device_api_other); |
| else |
| assert (api_info->device_api == acc_device_api_cuda); |
| assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES); |
| assert (api_info->device_type == prof_info->device_type); |
| assert (api_info->vendor == -1); |
| assert (api_info->device_handle == NULL); |
| assert (api_info->context_handle == NULL); |
| assert (api_info->async_handle == NULL); |
| } |
| |
| |
| static acc_prof_reg reg; |
| static acc_prof_reg unreg; |
| static acc_prof_lookup_func lookup; |
| void acc_register_library (acc_prof_reg reg_, acc_prof_reg unreg_, acc_prof_lookup_func lookup_) |
| { |
| DEBUG_printf ("%s\n", __FUNCTION__); |
| |
| reg = reg_; |
| unreg = unreg_; |
| lookup = lookup_; |
| } |
| |
| |
| int main() |
| { |
| acc_register_library (acc_prof_register, acc_prof_unregister, acc_prof_lookup); |
| |
| STATE_OP (state, = 0); |
| reg (acc_ev_enqueue_launch_start, cb_enqueue_launch_start, acc_reg); |
| assert (state == 0); |
| |
| acc_device_type = acc_get_device_type (); |
| acc_device_num = acc_get_device_num (acc_device_type); |
| assert (state == 0); |
| |
| /* Parallelism dimensions: compiler/runtime decides. */ |
| STATE_OP (state, = 0); |
| num_gangs = num_workers = vector_length = 0; |
| { |
| #define N 100 |
| int x[N]; |
| #pragma acc kernels /* { dg-line l_compute[incr c_compute] } */ |
| /* { dg-note {OpenACC 'kernels' decomposition: variable 'i' declared in block requested to be made addressable} {} { target *-*-* } l_compute$c_compute } |
| { dg-note {variable 'i' made addressable} {} { target *-*-* } l_compute$c_compute } */ |
| /* { dg-note {variable 'i' declared in block is candidate for adjusting OpenACC privatization level} {} { target *-*-* } l_compute$c_compute } */ |
| /* { dg-optimized {assigned OpenACC seq loop parallelism} {} { target { ! __OPTIMIZE__ } } l_compute$c_compute } |
| { dg-optimized {assigned OpenACC gang loop parallelism} {} { target __OPTIMIZE__ } l_compute$c_compute } */ |
| { |
| /* { dg-note {beginning 'parloops' part in OpenACC 'kernels' region} {} { target *-*-* } .+1 } */ |
| for (int i = 0; i < N; ++i) |
| x[i] = i * i; |
| } |
| if (acc_device_type == acc_device_host) |
| assert (state == 0); /* No 'acc_ev_enqueue_launch_start'. */ |
| else |
| assert (state == 1); |
| for (int i = 0; i < N; ++i) |
| if (x[i] != i * i) |
| __builtin_abort (); |
| #undef N |
| } |
| |
| /* Parallelism dimensions: literal. */ |
| STATE_OP (state, = 0); |
| num_gangs = 30; |
| num_workers = 3; |
| vector_length = 5; |
| { |
| #define N 100 |
| int x[N]; |
| #pragma acc kernels /* { dg-line l_compute[incr c_compute] } */ \ |
| num_gangs (30) num_workers (3) vector_length (5) |
| /* { dg-note {OpenACC 'kernels' decomposition: variable 'i' declared in block requested to be made addressable} {} { target *-*-* } l_compute$c_compute } |
| { dg-note {variable 'i' made addressable} {} { target *-*-* } l_compute$c_compute } */ |
| /* { dg-note {variable 'i' declared in block is candidate for adjusting OpenACC privatization level} {} { target *-*-* } l_compute$c_compute } */ |
| /* { dg-warning {using 'vector_length \(32\)', ignoring 5} {} { target { __OPTIMIZE__ && openacc_nvidia_accel_selected } } l_compute$c_compute } */ |
| /* { dg-optimized {assigned OpenACC seq loop parallelism} {} { target { ! __OPTIMIZE__ } } l_compute$c_compute } |
| { dg-optimized {assigned OpenACC gang loop parallelism} {} { target __OPTIMIZE__ } l_compute$c_compute } */ |
| { |
| /* { dg-note {beginning 'parloops' part in OpenACC 'kernels' region} {} { target *-*-* } .+1 } */ |
| for (int i = 0; i < N; ++i) |
| x[i] = i * i; |
| } |
| if (acc_device_type == acc_device_host) |
| assert (state == 0); /* No 'acc_ev_enqueue_launch_start'. */ |
| else |
| assert (state == 1); |
| for (int i = 0; i < N; ++i) |
| if (x[i] != i * i) |
| __builtin_abort (); |
| #undef N |
| } |
| |
| /* Parallelism dimensions: variable. */ |
| STATE_OP (state, = 0); |
| num_gangs = 22; |
| num_workers = 5; |
| vector_length = 7; |
| { |
| #define N 100 |
| int x[N]; |
| #pragma acc kernels /* { dg-line l_compute[incr c_compute] } */ \ |
| num_gangs (num_gangs) num_workers (num_workers) vector_length (vector_length) |
| /* { dg-note {OpenACC 'kernels' decomposition: variable 'i' declared in block requested to be made addressable} {} { target *-*-* } l_compute$c_compute } |
| { dg-note {variable 'i' made addressable} {} { target *-*-* } l_compute$c_compute } */ |
| /* { dg-note {variable 'i' declared in block is candidate for adjusting OpenACC privatization level} {} { target *-*-* } l_compute$c_compute } */ |
| /* { dg-warning {using 'vector_length \(32\)', ignoring runtime setting} {} { target { __OPTIMIZE__ && openacc_nvidia_accel_selected } } l_compute$c_compute } */ |
| /* { dg-optimized {assigned OpenACC seq loop parallelism} {} { target { ! __OPTIMIZE__ } } l_compute$c_compute } |
| { dg-optimized {assigned OpenACC gang loop parallelism} {} { target __OPTIMIZE__ } l_compute$c_compute } */ |
| { |
| /* { dg-note {beginning 'parloops' part in OpenACC 'kernels' region} {} { target *-*-* } .+1 } */ |
| for (int i = 0; i < N; ++i) |
| x[i] = i * i; |
| } |
| if (acc_device_type == acc_device_host) |
| assert (state == 0); /* No 'acc_ev_enqueue_launch_start'. */ |
| else |
| assert (state == 1); |
| for (int i = 0; i < N; ++i) |
| if (x[i] != i * i) |
| __builtin_abort (); |
| #undef N |
| } |
| |
| return 0; |
| } |