blob: 21ac91c1b58cfbc084dda6e39e5b18cedd555b19 [file] [log] [blame]
/* { dg-additional-options "-ldl" } */
/* { dg-require-effective-target offload_device_gcn }
The 'asm' insert is valid for GCN only:
{ dg-additional-options -foffload=amdgcn-amdhsa } */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <omp.h>
#include <assert.h>
#include <dlfcn.h>
#include "../../../include/hsa.h"
#include "../../config/gcn/libgomp-gcn.h"
#define STACKSIZE (100 * 1024)
#define HEAPSIZE (10 * 1024 * 1024)
#define ARENASIZE HEAPSIZE
/* This code fragment must be optimized or else the host-fallback kernel has
* invalid ASM inserts. The rest of the file can be compiled safely at -O0. */
#pragma omp declare target
uintptr_t __attribute__((optimize("O1")))
get_kernel_ptr ()
{
uintptr_t val;
if (!omp_is_initial_device ())
/* "main._omp_fn.0" is the name GCC gives the first OpenMP target
* region in the "main" function.
* The ".kd" suffix is added by the LLVM assembler when it creates the
* kernel meta-data, and this is what we need to launch a kernel. */
asm ("s_getpc_b64 %0\n\t"
"s_add_u32 %L0, %L0, main._omp_fn.0.kd@rel32@lo+4\n\t"
"s_addc_u32 %H0, %H0, main._omp_fn.0.kd@rel32@hi+4"
: "=Sg"(val));
return val;
}
#pragma omp end declare target
int
main(int argc, char** argv)
{
/* Load the HSA runtime DLL. */
void *hsalib = dlopen ("libhsa-runtime64.so.1", RTLD_LAZY);
assert (hsalib);
hsa_status_t (*hsa_signal_create) (hsa_signal_value_t initial_value,
uint32_t num_consumers,
const hsa_agent_t *consumers,
hsa_signal_t *signal)
= dlsym (hsalib, "hsa_signal_create");
assert (hsa_signal_create);
uint64_t (*hsa_queue_load_write_index_relaxed) (const hsa_queue_t *queue)
= dlsym (hsalib, "hsa_queue_load_write_index_relaxed");
assert (hsa_queue_load_write_index_relaxed);
void (*hsa_signal_store_relaxed) (hsa_signal_t signal,
hsa_signal_value_t value)
= dlsym (hsalib, "hsa_signal_store_relaxed");
assert (hsa_signal_store_relaxed);
hsa_signal_value_t (*hsa_signal_wait_relaxed) (hsa_signal_t signal,
hsa_signal_condition_t condition,
hsa_signal_value_t compare_value,
uint64_t timeout_hint,
hsa_wait_state_t wait_state_hint)
= dlsym (hsalib, "hsa_signal_wait_relaxed");
assert (hsa_signal_wait_relaxed);
void (*hsa_queue_store_write_index_relaxed) (const hsa_queue_t *queue,
uint64_t value)
= dlsym (hsalib, "hsa_queue_store_write_index_relaxed");
assert (hsa_queue_store_write_index_relaxed);
hsa_status_t (*hsa_signal_destroy) (hsa_signal_t signal)
= dlsym (hsalib, "hsa_signal_destroy");
assert (hsa_signal_destroy);
/* Set up the device data environment. */
int test_data_value = 0;
#pragma omp target enter data map(test_data_value)
/* Get the interop details. */
int device_num = omp_get_default_device();
hsa_agent_t *gpu_agent;
hsa_queue_t *hsa_queue = NULL;
omp_interop_t interop = omp_interop_none;
#pragma omp interop init(target, targetsync, prefer_type("hsa"): interop) device(device_num)
assert (interop != omp_interop_none);
omp_interop_rc_t retcode;
omp_interop_fr_t fr = omp_get_interop_int (interop, omp_ipr_fr_id, &retcode);
assert (retcode == omp_irc_success);
assert (fr == omp_ifr_hsa);
gpu_agent = omp_get_interop_ptr(interop, omp_ipr_device, &retcode);
assert (retcode == omp_irc_success);
hsa_queue = omp_get_interop_ptr(interop, omp_ipr_targetsync, &retcode);
assert (retcode == omp_irc_success);
assert (hsa_queue);
/* Call an offload kernel via OpenMP/libgomp.
*
* This kernel serves two purposes:
* 1) Lookup the device-side load-address of itself (thus avoiding the
* need to access the libgomp internals).
* 2) Count how many times it is called.
* We then call it once using OpenMP, and once manually, and check
* the counter reads "2". */
uint64_t kernel_object = 0;
#pragma omp target map(from:kernel_object) map(present,alloc:test_data_value)
{
kernel_object = get_kernel_ptr ();
++test_data_value;
}
assert (kernel_object != 0);
/* Configure the same kernel to run again, using HSA manually this time. */
hsa_status_t status;
hsa_signal_t signal;
status = hsa_signal_create(1, 0, NULL, &signal);
assert (status == HSA_STATUS_SUCCESS);
/* The kernel is built by GCC for OpenMP, so we need to pass the same
* data pointers that libgomp would pass in. */
struct {
uintptr_t test_data_value;
uintptr_t kernel_object;
} tgtaddrs;
#pragma omp target data use_device_addr(test_data_value)
{
tgtaddrs.test_data_value = (uintptr_t)&test_data_value;
tgtaddrs.kernel_object = (uintptr_t)omp_target_alloc (8, device_num);
}
/* We also need to duplicate the launch ABI used by plugin-gcn.c. */
struct kernargs_abi args; /* From libgomp-gcn.h. */
args.dummy1 = (int64_t)&tgtaddrs;
args.out_ptr = (int64_t)malloc (sizeof (struct output)); /* Host side. */
args.heap_ptr = (int64_t)omp_target_alloc (HEAPSIZE, device_num);
args.arena_ptr = (int64_t)omp_target_alloc (ARENASIZE, device_num);
args.stack_ptr = (int64_t)omp_target_alloc (STACKSIZE, device_num);
args.arena_size_per_team = ARENASIZE;
args.stack_size_per_thread = STACKSIZE;
/* Build the HSA dispatch packet, and insert it into the queue. */
uint64_t packet_id = hsa_queue_load_write_index_relaxed (hsa_queue);
const uint32_t queueMask = hsa_queue->size - 1;
hsa_kernel_dispatch_packet_t *dispatch_packet =
&(((hsa_kernel_dispatch_packet_t *)
(hsa_queue->base_address))[packet_id & queueMask]);
dispatch_packet->setup = 3 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
dispatch_packet->workgroup_size_x = 1;
dispatch_packet->workgroup_size_y = 64;
dispatch_packet->workgroup_size_z = 1;
dispatch_packet->grid_size_x = 1;
dispatch_packet->grid_size_y = 64;
dispatch_packet->grid_size_z = 1;
dispatch_packet->completion_signal = signal;
dispatch_packet->kernel_object = kernel_object;
dispatch_packet->kernarg_address = &args;
dispatch_packet->private_segment_size = 0;
dispatch_packet->group_segment_size = 1536;
uint16_t header = 0;
header |= HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
/* Finish writing the packet header with an atomic release. */
__atomic_store_n((uint16_t*)dispatch_packet, header, __ATOMIC_RELEASE);
hsa_queue_store_write_index_relaxed (hsa_queue, packet_id + 1);
;/* Run the kernel and wait for it to complete. */
hsa_signal_store_relaxed(hsa_queue->doorbell_signal, packet_id);
while (hsa_signal_wait_relaxed(signal, HSA_SIGNAL_CONDITION_LT, 1,
UINT64_MAX, HSA_WAIT_STATE_ACTIVE) != 0)
;
/* Clean up HSA. */
hsa_signal_destroy(signal);
free ((void*)args.out_ptr);
omp_target_free ((void*)args.heap_ptr, device_num);
omp_target_free ((void*)args.arena_ptr, device_num);
omp_target_free ((void*)args.stack_ptr, device_num);
omp_target_free ((void*)tgtaddrs.kernel_object, device_num);
/* Clean up OpenMP. */
#pragma omp interop destroy(interop)
/* Bring the data back from the device. */
#pragma omp target exit data map(test_data_value)
/* Ensure the kernel was called twice. Once by OpenMP, once by HSA. */
assert (test_data_value == 2);
return 0;
}