| /* { dg-do run { target openacc_nvidia_accel_selected } } */ |
| /* { dg-additional-options "-lcuda" } */ |
| /* { dg-require-effective-target openacc_cuda } */ |
| |
| #include <openacc.h> |
| #include <stdlib.h> |
| #include "cuda.h" |
| |
| #include <stdio.h> |
| |
| #define n 128 |
| |
| int |
| main (void) |
| { |
| CUresult r; |
| CUstream stream1; |
| int N = n; |
| int a[n]; |
| int c[n]; |
| |
| acc_init (acc_device_nvidia); |
| |
| r = cuStreamCreate (&stream1, CU_STREAM_NON_BLOCKING); |
| if (r != CUDA_SUCCESS) |
| { |
| fprintf (stderr, "cuStreamCreate failed: %d\n", r); |
| abort (); |
| } |
| |
| acc_set_cuda_stream (1, stream1); |
| |
| for (int i = 0; i < n; i++) |
| { |
| a[i] = 3; |
| c[i] = 0; |
| } |
| |
| #pragma acc data copy (a, c) copyin (N) |
| { |
| #pragma acc parallel async (1) |
| ; |
| |
| #pragma acc parallel async (1) num_gangs (320) |
| #pragma acc loop gang |
| for (int ii = 0; ii < N; ii++) |
| c[ii] = (a[ii] + a[N - ii - 1]); |
| |
| #pragma acc parallel async (1) |
| #pragma acc loop seq |
| for (int ii = 0; ii < n; ii++) |
| a[ii] = 6; |
| |
| #pragma acc wait (1) |
| } |
| |
| for (int i = 0; i < n; i++) |
| if (c[i] != 6) |
| abort (); |
| |
| return 0; |
| } |