blob: 53f03d17bb2f96b98b7ed385cb1d146d8b390762 [file] [log] [blame]
#include <assert.h>
#include <openacc.h>
typedef struct {
int x, y;
} vec2;
typedef struct {
int x, y, z;
int attr[13];
} vec3_attr;
/* Test of gang-private variables declared in local scope with parallel
directive. */
void local_g_1()
{
int i, arr[32];
for (i = 0; i < 32; i++)
arr[i] = 3;
#pragma acc parallel copy(arr) num_gangs(32) num_workers(8) vector_length(32)
{
int x;
#pragma acc loop gang(static:1)
for (i = 0; i < 32; i++)
x = i * 2;
#pragma acc loop gang(static:1)
for (i = 0; i < 32; i++)
{
if (acc_on_device (acc_device_host))
x = i * 2;
arr[i] += x;
}
}
for (i = 0; i < 32; i++)
assert (arr[i] == 3 + i * 2);
}
/* Test of worker-private variables declared in a local scope, broadcasting
to vector-partitioned mode. Back-to-back worker loops. */
void local_w_1()
{
int i, arr[32 * 32 * 32];
for (i = 0; i < 32 * 32 * 32; i++)
arr[i] = i;
#pragma acc parallel copy(arr) num_gangs(32) num_workers(32) vector_length(32)
{
int j;
#pragma acc loop gang
for (i = 0; i < 32; i++)
{
#pragma acc loop worker
for (j = 0; j < 32; j++)
{
int k;
int x = i ^ j * 3;
#pragma acc loop vector
for (k = 0; k < 32; k++)
arr[i * 1024 + j * 32 + k] += x * k;
}
#pragma acc loop worker
for (j = 0; j < 32; j++)
{
int k;
int x = i | j * 5;
#pragma acc loop vector
for (k = 0; k < 32; k++)
arr[i * 1024 + j * 32 + k] += x * k;
}
}
}
for (i = 0; i < 32; i++)
for (int j = 0; j < 32; j++)
for (int k = 0; k < 32; k++)
{
int idx = i * 1024 + j * 32 + k;
assert (arr[idx] == idx + (i ^ j * 3) * k + (i | j * 5) * k);
}
}
/* Test of worker-private variables declared in a local scope, broadcasting
to vector-partitioned mode. Successive vector loops. */
void local_w_2()
{
int i, arr[32 * 32 * 32];
for (i = 0; i < 32 * 32 * 32; i++)
arr[i] = i;
#pragma acc parallel copy(arr) num_gangs(32) num_workers(32) vector_length(32)
{
int j;
#pragma acc loop gang
for (i = 0; i < 32; i++)
{
#pragma acc loop worker
for (j = 0; j < 32; j++)
{
int k;
int x = i ^ j * 3;
#pragma acc loop vector
for (k = 0; k < 32; k++)
arr[i * 1024 + j * 32 + k] += x * k;
x = i | j * 5;
#pragma acc loop vector
for (k = 0; k < 32; k++)
arr[i * 1024 + j * 32 + k] += x * k;
}
}
}
for (i = 0; i < 32; i++)
for (int j = 0; j < 32; j++)
for (int k = 0; k < 32; k++)
{
int idx = i * 1024 + j * 32 + k;
assert (arr[idx] == idx + (i ^ j * 3) * k + (i | j * 5) * k);
}
}
/* Test of worker-private variables declared in a local scope, broadcasting
to vector-partitioned mode. Aggregate worker variable. */
void local_w_3()
{
int i, arr[32 * 32 * 32];
for (i = 0; i < 32 * 32 * 32; i++)
arr[i] = i;
#pragma acc parallel copy(arr) num_gangs(32) num_workers(32) vector_length(32)
{
int j;
#pragma acc loop gang
for (i = 0; i < 32; i++)
{
#pragma acc loop worker
for (j = 0; j < 32; j++)
{
int k;
vec2 pt;
pt.x = i ^ j * 3;
pt.y = i | j * 5;
#pragma acc loop vector
for (k = 0; k < 32; k++)
arr[i * 1024 + j * 32 + k] += pt.x * k;
#pragma acc loop vector
for (k = 0; k < 32; k++)
arr[i * 1024 + j * 32 + k] += pt.y * k;
}
}
}
for (i = 0; i < 32; i++)
for (int j = 0; j < 32; j++)
for (int k = 0; k < 32; k++)
{
int idx = i * 1024 + j * 32 + k;
assert (arr[idx] == idx + (i ^ j * 3) * k + (i | j * 5) * k);
}
}
/* Test of worker-private variables declared in a local scope, broadcasting
to vector-partitioned mode. Addressable worker variable. */
void local_w_4()
{
int i, arr[32 * 32 * 32];
for (i = 0; i < 32 * 32 * 32; i++)
arr[i] = i;
#pragma acc parallel copy(arr) num_gangs(32) num_workers(32) vector_length(32)
{
int j;
#pragma acc loop gang
for (i = 0; i < 32; i++)
{
#pragma acc loop worker
for (j = 0; j < 32; j++)
{
int k;
vec2 pt, *ptp;
ptp = &pt;
pt.x = i ^ j * 3;
#pragma acc loop vector
for (k = 0; k < 32; k++)
arr[i * 1024 + j * 32 + k] += ptp->x * k;
ptp->y = i | j * 5;
#pragma acc loop vector
for (k = 0; k < 32; k++)
arr[i * 1024 + j * 32 + k] += pt.y * k;
}
}
}
for (i = 0; i < 32; i++)
for (int j = 0; j < 32; j++)
for (int k = 0; k < 32; k++)
{
int idx = i * 1024 + j * 32 + k;
assert (arr[idx] == idx + (i ^ j * 3) * k + (i | j * 5) * k);
}
}
/* Test of worker-private variables declared in a local scope, broadcasting
to vector-partitioned mode. Array worker variable. */
void local_w_5()
{
int i, arr[32 * 32 * 32];
for (i = 0; i < 32 * 32 * 32; i++)
arr[i] = i;
#pragma acc parallel copy(arr) num_gangs(32) num_workers(32) vector_length(32)
{
int j;
#pragma acc loop gang
for (i = 0; i < 32; i++)
{
#pragma acc loop worker
for (j = 0; j < 32; j++)
{
int k;
int pt[2];
pt[0] = i ^ j * 3;
#pragma acc loop vector
for (k = 0; k < 32; k++)
arr[i * 1024 + j * 32 + k] += pt[0] * k;
pt[1] = i | j * 5;
#pragma acc loop vector
for (k = 0; k < 32; k++)
arr[i * 1024 + j * 32 + k] += pt[1] * k;
}
}
}
for (i = 0; i < 32; i++)
for (int j = 0; j < 32; j++)
for (int k = 0; k < 32; k++)
{
int idx = i * 1024 + j * 32 + k;
assert (arr[idx] == idx + (i ^ j * 3) * k + (i | j * 5) * k);
}
}
/* Test of gang-private variables declared on loop directive. */
void loop_g_1()
{
int x = 5, i, arr[32];
for (i = 0; i < 32; i++)
arr[i] = i;
#pragma acc parallel copy(arr) num_gangs(32) num_workers(8) vector_length(32)
{
#pragma acc loop gang private(x)
for (i = 0; i < 32; i++)
{
x = i * 2;
arr[i] += x;
}
}
for (i = 0; i < 32; i++)
assert (arr[i] == i * 3);
}
/* Test of gang-private variables declared on loop directive, with broadcasting
to partitioned workers. */
void loop_g_2()
{
int x = 5, i, arr[32 * 32];
for (i = 0; i < 32 * 32; i++)
arr[i] = i;
#pragma acc parallel copy(arr) num_gangs(32) num_workers(8) vector_length(32)
{
#pragma acc loop gang private(x)
for (i = 0; i < 32; i++)
{
x = i * 2;
#pragma acc loop worker
for (int j = 0; j < 32; j++)
arr[i * 32 + j] += x;
}
}
for (i = 0; i < 32 * 32; i++)
assert (arr[i] == i + (i / 32) * 2);
}
/* Test of gang-private variables declared on loop directive, with broadcasting
to partitioned vectors. */
void loop_g_3()
{
int x = 5, i, arr[32 * 32];
for (i = 0; i < 32 * 32; i++)
arr[i] = i;
#pragma acc parallel copy(arr) num_gangs(32) num_workers(8) vector_length(32)
{
#pragma acc loop gang private(x)
for (i = 0; i < 32; i++)
{
x = i * 2;
#pragma acc loop vector
for (int j = 0; j < 32; j++)
arr[i * 32 + j] += x;
}
}
for (i = 0; i < 32 * 32; i++)
assert (arr[i] == i + (i / 32) * 2);
}
/* Test of gang-private addressable variable declared on loop directive, with
broadcasting to partitioned workers. */
void loop_g_4()
{
int x = 5, i, arr[32 * 32];
for (i = 0; i < 32 * 32; i++)
arr[i] = i;
#pragma acc parallel copy(arr) num_gangs(32) num_workers(8) vector_length(32)
{
#pragma acc loop gang private(x)
for (i = 0; i < 32; i++)
{
int *p = &x;
x = i * 2;
#pragma acc loop worker
for (int j = 0; j < 32; j++)
arr[i * 32 + j] += x;
(*p)--;
}
}
for (i = 0; i < 32 * 32; i++)
assert (arr[i] == i + (i / 32) * 2);
}
/* Test of gang-private array variable declared on loop directive, with
broadcasting to partitioned workers. */
void loop_g_5()
{
int x[8], i, arr[32 * 32];
for (i = 0; i < 32 * 32; i++)
arr[i] = i;
#pragma acc parallel copy(arr) num_gangs(32) num_workers(8) vector_length(32)
{
#pragma acc loop gang private(x)
for (i = 0; i < 32; i++)
{
for (int j = 0; j < 8; j++)
x[j] = j * 2;
#pragma acc loop worker
for (int j = 0; j < 32; j++)
arr[i * 32 + j] += x[j % 8];
}
}
for (i = 0; i < 32 * 32; i++)
assert (arr[i] == i + (i % 8) * 2);
}
/* Test of gang-private aggregate variable declared on loop directive, with
broadcasting to partitioned workers. */
void loop_g_6()
{
int i, arr[32 * 32];
vec3_attr pt;
for (i = 0; i < 32 * 32; i++)
arr[i] = i;
#pragma acc parallel copy(arr) num_gangs(32) num_workers(8) vector_length(32)
{
#pragma acc loop gang private(pt)
for (i = 0; i < 32; i++)
{
pt.x = i;
pt.y = i * 2;
pt.z = i * 4;
pt.attr[5] = i * 6;
#pragma acc loop worker
for (int j = 0; j < 32; j++)
arr[i * 32 + j] += pt.x + pt.y + pt.z + pt.attr[5];
}
}
for (i = 0; i < 32 * 32; i++)
assert (arr[i] == i + (i / 32) * 13);
}
/* Test of vector-private variables declared on loop directive. */
void loop_v_1()
{
int x, i, arr[32 * 32 * 32];
for (i = 0; i < 32 * 32 * 32; i++)
arr[i] = i;
#pragma acc parallel copy(arr) num_gangs(32) num_workers(32) vector_length(32)
{
int j;
#pragma acc loop gang
for (i = 0; i < 32; i++)
{
#pragma acc loop worker
for (j = 0; j < 32; j++)
{
int k;
#pragma acc loop vector private(x)
for (k = 0; k < 32; k++)
{
x = i ^ j * 3;
arr[i * 1024 + j * 32 + k] += x * k;
}
#pragma acc loop vector private(x)
for (k = 0; k < 32; k++)
{
x = i | j * 5;
arr[i * 1024 + j * 32 + k] += x * k;
}
}
}
}
for (i = 0; i < 32; i++)
for (int j = 0; j < 32; j++)
for (int k = 0; k < 32; k++)
{
int idx = i * 1024 + j * 32 + k;
assert (arr[idx] == idx + (i ^ j * 3) * k + (i | j * 5) * k);
}
}
/* Test of vector-private variables declared on loop directive. Array type. */
void loop_v_2()
{
int pt[2], i, arr[32 * 32 * 32];
for (i = 0; i < 32 * 32 * 32; i++)
arr[i] = i;
#pragma acc parallel copy(arr) num_gangs(32) num_workers(32) vector_length(32)
{
int j;
#pragma acc loop gang
for (i = 0; i < 32; i++)
{
#pragma acc loop worker
for (j = 0; j < 32; j++)
{
int k;
#pragma acc loop vector private(pt)
for (k = 0; k < 32; k++)
{
pt[0] = i ^ j * 3;
pt[1] = i | j * 5;
arr[i * 1024 + j * 32 + k] += pt[0] * k;
arr[i * 1024 + j * 32 + k] += pt[1] * k;
}
}
}
}
for (i = 0; i < 32; i++)
for (int j = 0; j < 32; j++)
for (int k = 0; k < 32; k++)
{
int idx = i * 1024 + j * 32 + k;
assert (arr[idx] == idx + (i ^ j * 3) * k + (i | j * 5) * k);
}
}
/* Test of worker-private variables declared on a loop directive. */
void loop_w_1()
{
int x = 5, i, arr[32 * 32];
for (i = 0; i < 32 * 32; i++)
arr[i] = i;
#pragma acc parallel copy(arr) num_gangs(32) num_workers(8) vector_length(32)
{
int j;
#pragma acc loop gang
for (i = 0; i < 32; i++)
{
#pragma acc loop worker private(x)
for (j = 0; j < 32; j++)
{
x = i ^ j * 3;
/* Try to ensure 'x' accesses doesn't get optimized into a
temporary. */
__asm__ __volatile__ ("");
arr[i * 32 + j] += x;
}
}
}
for (i = 0; i < 32 * 32; i++)
assert (arr[i] == i + ((i / 32) ^ (i % 32) * 3));
}
/* Test of worker-private variables declared on a loop directive, broadcasting
to vector-partitioned mode. */
void loop_w_2()
{
int x = 5, i, arr[32 * 32 * 32];
for (i = 0; i < 32 * 32 * 32; i++)
arr[i] = i;
#pragma acc parallel copy(arr) num_gangs(32) num_workers(32) vector_length(32)
{
int j;
#pragma acc loop gang
for (i = 0; i < 32; i++)
{
#pragma acc loop worker private(x)
for (j = 0; j < 32; j++)
{
int k;
x = i ^ j * 3;
#pragma acc loop vector
for (k = 0; k < 32; k++)
arr[i * 1024 + j * 32 + k] += x * k;
}
}
}
for (i = 0; i < 32; i++)
for (int j = 0; j < 32; j++)
for (int k = 0; k < 32; k++)
{
int idx = i * 1024 + j * 32 + k;
assert (arr[idx] == idx + (i ^ j * 3) * k);
}
}
/* Test of worker-private variables declared on a loop directive, broadcasting
to vector-partitioned mode. Back-to-back worker loops. */
void loop_w_3()
{
int x = 5, i, arr[32 * 32 * 32];
for (i = 0; i < 32 * 32 * 32; i++)
arr[i] = i;
#pragma acc parallel copy(arr) num_gangs(32) num_workers(32) vector_length(32)
{
int j;
#pragma acc loop gang
for (i = 0; i < 32; i++)
{
#pragma acc loop worker private(x)
for (j = 0; j < 32; j++)
{
int k;
x = i ^ j * 3;
#pragma acc loop vector
for (k = 0; k < 32; k++)
arr[i * 1024 + j * 32 + k] += x * k;
}
#pragma acc loop worker private(x)
for (j = 0; j < 32; j++)
{
int k;
x = i | j * 5;
#pragma acc loop vector
for (k = 0; k < 32; k++)
arr[i * 1024 + j * 32 + k] += x * k;
}
}
}
for (i = 0; i < 32; i++)
for (int j = 0; j < 32; j++)
for (int k = 0; k < 32; k++)
{
int idx = i * 1024 + j * 32 + k;
assert (arr[idx] == idx + (i ^ j * 3) * k + (i | j * 5) * k);
}
}
/* Test of worker-private variables declared on a loop directive, broadcasting
to vector-partitioned mode. Successive vector loops. */
void loop_w_4()
{
int x = 5, i, arr[32 * 32 * 32];
for (i = 0; i < 32 * 32 * 32; i++)
arr[i] = i;
#pragma acc parallel copy(arr) num_gangs(32) num_workers(32) vector_length(32)
{
int j;
#pragma acc loop gang
for (i = 0; i < 32; i++)
{
#pragma acc loop worker private(x)
for (j = 0; j < 32; j++)
{
int k;
x = i ^ j * 3;
#pragma acc loop vector
for (k = 0; k < 32; k++)
arr[i * 1024 + j * 32 + k] += x * k;
x = i | j * 5;
#pragma acc loop vector
for (k = 0; k < 32; k++)
arr[i * 1024 + j * 32 + k] += x * k;
}
}
}
for (i = 0; i < 32; i++)
for (int j = 0; j < 32; j++)
for (int k = 0; k < 32; k++)
{
int idx = i * 1024 + j * 32 + k;
assert (arr[idx] == idx + (i ^ j * 3) * k + (i | j * 5) * k);
}
}
/* Test of worker-private variables declared on a loop directive, broadcasting
to vector-partitioned mode. Addressable worker variable. */
void loop_w_5()
{
int x = 5, i, arr[32 * 32 * 32];
for (i = 0; i < 32 * 32 * 32; i++)
arr[i] = i;
#pragma acc parallel copy(arr) num_gangs(32) num_workers(32) vector_length(32)
{
int j;
#pragma acc loop gang
for (i = 0; i < 32; i++)
{
#pragma acc loop worker private(x)
for (j = 0; j < 32; j++)
{
int k;
int *p = &x;
x = i ^ j * 3;
#pragma acc loop vector
for (k = 0; k < 32; k++)
arr[i * 1024 + j * 32 + k] += x * k;
*p = i | j * 5;
#pragma acc loop vector
for (k = 0; k < 32; k++)
arr[i * 1024 + j * 32 + k] += x * k;
}
}
}
for (i = 0; i < 32; i++)
for (int j = 0; j < 32; j++)
for (int k = 0; k < 32; k++)
{
int idx = i * 1024 + j * 32 + k;
assert (arr[idx] == idx + (i ^ j * 3) * k + (i | j * 5) * k);
}
}
/* Test of worker-private variables declared on a loop directive, broadcasting
to vector-partitioned mode. Aggregate worker variable. */
void loop_w_6()
{
int i, arr[32 * 32 * 32];
vec2 pt;
for (i = 0; i < 32 * 32 * 32; i++)
arr[i] = i;
#pragma acc parallel copy(arr) num_gangs(32) num_workers(32) vector_length(32)
{
int j;
#pragma acc loop gang
for (i = 0; i < 32; i++)
{
#pragma acc loop worker private(pt)
for (j = 0; j < 32; j++)
{
int k;
pt.x = i ^ j * 3;
pt.y = i | j * 5;
#pragma acc loop vector
for (k = 0; k < 32; k++)
arr[i * 1024 + j * 32 + k] += pt.x * k;
#pragma acc loop vector
for (k = 0; k < 32; k++)
arr[i * 1024 + j * 32 + k] += pt.y * k;
}
}
}
for (i = 0; i < 32; i++)
for (int j = 0; j < 32; j++)
for (int k = 0; k < 32; k++)
{
int idx = i * 1024 + j * 32 + k;
assert (arr[idx] == idx + (i ^ j * 3) * k + (i | j * 5) * k);
}
}
/* Test of worker-private variables declared on loop directive, broadcasting
to vector-partitioned mode. Array worker variable. */
void loop_w_7()
{
int i, arr[32 * 32 * 32];
int pt[2];
for (i = 0; i < 32 * 32 * 32; i++)
arr[i] = i;
/* "pt" is treated as "present_or_copy" on the parallel directive because it
is an array variable. */
#pragma acc parallel copy(arr) num_gangs(32) num_workers(32) vector_length(32)
{
int j;
#pragma acc loop gang
for (i = 0; i < 32; i++)
{
/* But here, it is made private per-worker. */
#pragma acc loop worker private(pt)
for (j = 0; j < 32; j++)
{
int k;
pt[0] = i ^ j * 3;
#pragma acc loop vector
for (k = 0; k < 32; k++)
arr[i * 1024 + j * 32 + k] += pt[0] * k;
pt[1] = i | j * 5;
#pragma acc loop vector
for (k = 0; k < 32; k++)
arr[i * 1024 + j * 32 + k] += pt[1] * k;
}
}
}
for (i = 0; i < 32; i++)
for (int j = 0; j < 32; j++)
for (int k = 0; k < 32; k++)
{
int idx = i * 1024 + j * 32 + k;
assert (arr[idx] == idx + (i ^ j * 3) * k + (i | j * 5) * k);
}
}
/* Test of gang-private variables declared on the parallel directive. */
void parallel_g_1()
{
int x = 5, i, arr[32];
for (i = 0; i < 32; i++)
arr[i] = 3;
#pragma acc parallel private(x) copy(arr) num_gangs(32) num_workers(8) vector_length(32)
{
#pragma acc loop gang(static:1)
for (i = 0; i < 32; i++)
x = i * 2;
#pragma acc loop gang(static:1)
for (i = 0; i < 32; i++)
{
if (acc_on_device (acc_device_host))
x = i * 2;
arr[i] += x;
}
}
for (i = 0; i < 32; i++)
assert (arr[i] == 3 + i * 2);
}
/* Test of gang-private array variable declared on the parallel directive. */
void parallel_g_2()
{
int x[32], i, arr[32 * 32];
for (i = 0; i < 32 * 32; i++)
arr[i] = i;
#pragma acc parallel private(x) copy(arr) num_gangs(32) num_workers(2) vector_length(32)
{
#pragma acc loop gang
for (i = 0; i < 32; i++)
{
int j;
for (j = 0; j < 32; j++)
x[j] = j * 2;
#pragma acc loop worker
for (j = 0; j < 32; j++)
arr[i * 32 + j] += x[31 - j];
}
}
for (i = 0; i < 32 * 32; i++)
assert (arr[i] == i + (31 - (i % 32)) * 2);
}
int main ()
{
local_g_1();
local_w_1();
local_w_2();
local_w_3();
local_w_4();
local_w_5();
loop_g_1();
loop_g_2();
loop_g_3();
loop_g_4();
loop_g_5();
loop_g_6();
loop_v_1();
loop_v_2();
loop_w_1();
loop_w_2();
loop_w_3();
loop_w_4();
loop_w_5();
loop_w_6();
loop_w_7();
parallel_g_1();
parallel_g_2();
return 0;
}