blob: 4c66dc7bfeadd601e69313eb9c31db5b8928a89b [file] [log] [blame]
#include <assert.h>
#if ACC_DEVICE_TYPE_nvidia
/* To avoid 'libgomp: The Nvidia accelerator has insufficient resources'. */
#define NUM_WORKERS 24
#else
#define NUM_WORKERS 32
#endif
/* Test of reduction on both parallel and loop directives (workers and vectors
together in gang-partitioned mode, float type, multiple reductions). */
int
main (int argc, char *argv[])
{
int i, j;
float arr[32768];
float res = 0, mres = 0, hres = 0, hmres = 0;
for (i = 0; i < 32768; i++)
arr[i] = i % (32768 / 64);
#pragma acc parallel \
num_gangs(32) num_workers(NUM_WORKERS) vector_length(32) \
reduction(+:res) reduction(max:mres) copy(res, mres)
{
#pragma acc loop gang /* { dg-warning "nested loop in reduction needs reduction clause for 'm\?res'" "TODO" } */
for (j = 0; j < 32; j++)
{
#pragma acc loop worker vector reduction(+:res) reduction(max:mres)
for (i = 0; i < 1024; i++)
{
res += arr[j * 1024 + i];
if (arr[j * 1024 + i] > mres)
mres = arr[j * 1024 + i];
}
#pragma acc loop worker vector reduction(+:res) reduction(max:mres)
for (i = 0; i < 1024; i++)
{
res += arr[j * 1024 + (1023 - i)];
if (arr[j * 1024 + (1023 - i)] > mres)
mres = arr[j * 1024 + (1023 - i)];
}
}
}
for (j = 0; j < 32; j++)
for (i = 0; i < 1024; i++)
{
hres += arr[j * 1024 + i];
hres += arr[j * 1024 + (1023 - i)];
if (arr[j * 1024 + i] > hmres)
hmres = arr[j * 1024 + i];
if (arr[j * 1024 + (1023 - i)] > hmres)
hmres = arr[j * 1024 + (1023 - i)];
}
assert (hres <= 16777216);
assert (res == hres);
assert (hmres <= 16777216);
assert (mres == hmres);
return 0;
}