blob: cab08c43550d7e7872f466c510f5b9aac91c7f20 [file] [log] [blame]
/*
Copyright (c) 2014-2016 Intel Corporation. All Rights Reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Forward declaration as the following 2 functions are declared as friend
// in offload_engine.h.
// Clang does not accept `static` appearing after the friend declaration.
static void __offload_init_library_once(void);
static void __offload_fini_library(void);
#include "offload_host.h"
#ifdef MYO_SUPPORT
#include "offload_myo_host.h"
#endif
#include <malloc.h>
#ifndef TARGET_WINNT
#include <alloca.h>
#include <elf.h>
#endif // TARGET_WINNT
#include <errno.h>
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <algorithm>
#include <bitset>
#include <iostream>
// PATH_SEPARATOR is the string ";" when HOST_WINNT is defined and ":"
// otherwise.
// NOTE(review): presumably the delimiter between entries of path-list
// strings; no use is visible in this part of the file - confirm.
#if defined(HOST_WINNT)
#define PATH_SEPARATOR ";"
#else
#define PATH_SEPARATOR ":"
#endif
// Yields the offload_number member of the object timer_data points to, or 0
// when timer_data is null.
// The argument and the whole expansion are parenthesized so the macro keeps
// its meaning when it is embedded in a larger expression.
// Note: timer_data is evaluated twice when it is non-null.
#define GET_OFFLOAD_NUMBER(timer_data) \
    ((timer_data) ? (timer_data)->offload_number : 0)
// File-local pointer to a callback that takes one void* argument and returns
// nothing. It is neither assigned nor called in the visible part of the file.
static void (*task_completion_callback)(void *);
extern "C" {
#ifdef TARGET_WINNT
// Windows does not support imports from libraries without actually
// including them as a dependency. We don't want to add that dependency
// since it is used only for Fortran when traceback is enabled, so the
// routine is resolved at run time with LoadLibrary/GetProcAddress.
#define FORTRAN_TRACE_BACK win_for__continue_traceback

// Forwards coi_offload_result to the for__continue_traceback routine
// exported by libifcoremd.dll and returns whatever that routine returns.
// The process is terminated with exit(1) when the DLL cannot be loaded or
// when it does not export the routine.
int win_for__continue_traceback( _Offload_result coi_offload_result )
{
    HINSTANCE hDLL;
    int (* TraceBackRoutine)(_Offload_result value);

    hDLL = LoadLibrary("libifcoremd.dll");
    if (hDLL == 0) {
        // Name the library that was actually requested above.
        OFFLOAD_TRACE(3, "Cannot load libifcoremd.dll\n");
        exit(1);
    }

    TraceBackRoutine = (int (*)(_Offload_result)) GetProcAddress(hDLL,
        "for__continue_traceback");
    if (TraceBackRoutine == 0) {
        OFFLOAD_TRACE(3,
            "Cannot find for__continue_traceback routine in libifcoremd.dll\n");
        exit(1);
    }

    return TraceBackRoutine(coi_offload_result);
}
#else // TARGET_WINNT
#define FORTRAN_TRACE_BACK for__continue_traceback
// for__continue_traceback is provided as a dummy to resolve link time symbols
// for C/C++ programs. For Fortran the actual fortran library function in
// libifcore.so is used.
#pragma weak for__continue_traceback
int for__continue_traceback( _Offload_result coi_offload_result )
{
    // This definition is only a link-time placeholder; reaching it is
    // treated as fatal: trace the condition and terminate the process.
    OFFLOAD_TRACE(3,
        "liboffload function for_continue_traceback should not be called.\n");

    exit(1);
}
#endif //TARGET_WINNT
} // extern "C"
#ifdef TARGET_WINNT
// Small subset of ELF declarations for Windows which is needed to compile
// this file. ELF header is used to understand what binary type is contained
// in the target image - shared library or executable.
// Fixed-width scalar types used by the 64-bit ELF header below.
typedef uint16_t Elf64_Half;
typedef uint32_t Elf64_Word;
typedef uint64_t Elf64_Addr;
typedef uint64_t Elf64_Off;
// Size in bytes of the e_ident array at the start of the header.
#define EI_NIDENT 16
// e_type values: ET_EXEC marks an executable, ET_DYN a shared object.
#define ET_EXEC 2
#define ET_DYN 3
// 64-bit ELF file header. Field names and order follow the ELF
// specification; only the layout is declared here.
typedef struct
{
unsigned char e_ident[EI_NIDENT]; // identification bytes (magic, class, ...)
Elf64_Half e_type;      // object file type (compared against ET_EXEC/ET_DYN)
Elf64_Half e_machine;   // target architecture
Elf64_Word e_version;   // object file version
Elf64_Addr e_entry;     // entry point virtual address
Elf64_Off e_phoff;      // file offset of the program header table
Elf64_Off e_shoff;      // file offset of the section header table
Elf64_Word e_flags;     // processor-specific flags
Elf64_Half e_ehsize;    // size of this header in bytes
Elf64_Half e_phentsize; // size of one program header table entry
Elf64_Half e_phnum;     // number of program header table entries
Elf64_Half e_shentsize; // size of one section header table entry
Elf64_Half e_shnum;     // number of section header table entries
Elf64_Half e_shstrndx;  // section header index of the section name table
} Elf64_Ehdr;
#endif // TARGET_WINNT
// Host console and file logging
// NOTE(review): prefix is only declared here; its use is not visible in this
// part of the file.
const char *prefix;
// Console trace level; the trace helpers in this file print when this value
// is >= 1 (and OFFLOAD_DO_TRACE is false).
int console_enabled = 0;
// NOTE(review): presumably a running count of offloads - confirm at the
// point where it is updated.
int offload_number = 0;
// String constants named *_envname; presumably the names of the environment
// variables that control tracing, reporting and timing - confirm at their
// point of use.
static const char *htrace_envname = "H_TRACE";
static const char *offload_report_envname = "OFFLOAD_REPORT";
static const char *timer_envname = "H_TIME";
// DMA channel count used by COI and set via
// OFFLOAD_DMA_CHANNEL_COUNT environment variable
uint32_t mic_dma_channel_count;
// Trace information
// Printable names for the four transfer directions.
// NOTE(review): presumably indexed by the numeric value of a variable
// descriptor's direction (0 = NOCOPY ... 3 = INOUT) - confirm against the
// code that reads this table.
static const char* vardesc_direction_as_string[] = {
"NOCOPY",
"IN",
"OUT",
"INOUT"
};
// Printable names for variable descriptor types, used for tracing.
// NOTE(review): presumably indexed by the numeric value of the type
// enumeration (the c_* constants), so the order here must match that
// enumeration - confirm against its declaration.
static const char* vardesc_type_as_string[] = {
"unknown",
"data",
"data_ptr",
"func_ptr",
"void_ptr",
"string_ptr",
"dv",
"dv_data",
"dv_data_slice",
"dv_ptr",
"dv_ptr_data",
"dv_ptr_data_slice",
"cean_var",
"cean_var_ptr",
"c_data_ptr_array",
"c_extended_type",
"c_func_ptr_array",
"c_void_ptr_array",
"c_string_ptr_array",
"c_data_ptr_ptr",
"c_func_ptr_ptr",
"c_void_ptr_ptr",
"c_string_ptr_ptr",
"c_cean_var_ptr_ptr",
};
// Global runtime state and configuration defaults.
// NOTE(review): the *_envname strings below are presumably the environment
// variable names that override the adjacent defaults; the parsing code is
// not visible in this part of the file.

// Engine array and its element count; both start out empty.
Engine* mic_engines = 0;
uint32_t mic_engines_total = 0;
pthread_key_t mic_thread_key;
MicEnvVar mic_env_vars;
uint64_t cpu_frequency = 0;
// MIC_STACKSIZE (default 12 MB)
uint32_t mic_stack_size = 12 * 1024 * 1024;
// MIC_BUFFERSIZE
uint64_t mic_buffer_size = 0;
// Preallocated 4K page memory size for buffers on MIC
uint64_t mic_4k_buffer_size = 0;
// Preallocated 2M page memory size for buffers on MIC
uint64_t mic_2m_buffer_size = 0;
// LD_LIBRARY_PATH for KNC
char* knc_library_path = 0;
// LD_LIBRARY_PATH for KNL
char* knl_library_path = 0;
// MIC_PROXY_IO
bool mic_proxy_io = true;
// MIC_PROXY_FS_ROOT
char* mic_proxy_fs_root = 0;
// Threshold for creating buffers with large pages. A buffer is created
// with the large pages hint if its size reaches the threshold value.
// By default large pages are disabled right now (by setting the default
// value for the threshold to MAX) due to HSD 4114629.
uint64_t __offload_use_2mb_buffers = 0xffffffffffffffffULL;
static const char *mic_use_2mb_buffers_envname =
"MIC_USE_2MB_BUFFERS";
// Size thresholds (2 MB by default) associated with asynchronous buffer
// writes and reads.
static uint64_t __offload_use_async_buffer_write = 2 * 1024 * 1024;
static const char *mic_use_async_buffer_write_envname =
"MIC_USE_ASYNC_BUFFER_WRITE";
static uint64_t __offload_use_async_buffer_read = 2 * 1024 * 1024;
static const char *mic_use_async_buffer_read_envname =
"MIC_USE_ASYNC_BUFFER_READ";
// device initialization type
OffloadInitType __offload_init_type = c_init_on_offload_all;
static const char *offload_init_envname = "OFFLOAD_INIT";
// active wait
static bool __offload_active_wait = true;
static const char *offload_active_wait_envname = "OFFLOAD_ACTIVE_WAIT";
// Wait even for asynchronous offloads.
// Kept true for now, until the performance issue with COI is fixed.
static bool __offload_always_wait = true;
static const char *offload_always_wait_envname = "OFFLOAD_ALWAYS_WAIT";
// OMP_DEFAULT_DEVICE
int __omp_device_num = 0;
static const char *omp_device_num_envname = "OMP_DEFAULT_DEVICE";
// OFFLOAD_PARALLEL_COPY
static bool __offload_parallel_copy = false;
static const char *parallel_copy_envname = "OFFLOAD_PARALLEL_COPY";
// Use the COI interface for noncontiguous transfers if it exists.
static bool __offload_use_coi_noncontiguous_transfer = false;
static const char *use_coi_noncontiguous_transfer_envname =
"MIC_USE_COI_MULTI_D";
// The list of pending target libraries
static bool __target_libs;
static TargetImageList __target_libs_list;
static mutex_t __target_libs_lock;
static mutex_t stack_alloc_lock;
static mutex_t lock_complete;
// Set of OffloadDescriptors of asynchronous offloads that are not destroyed
std::map<void *, bool> offload_descr_map;
// Target executable
TargetImage* __target_exe;
// is true if the last loaded image is a dll
bool __current_image_is_dll = false;
// is true if the myo library is loaded when a dll is loaded
bool __myo_init_in_so = false;
// Emits a one-line, human-readable list of the OffloadFlags bits that are
// set, in the form " OffloadFlags=(name,name)".
// Nothing is printed unless OFFLOAD_DO_TRACE is false and console_enabled
// is at least 1.
static void trace_offload_flags(
    OffloadHostTimerData* timer_data,
    OffloadFlags offload_flags
)
{
    const bool console_only = !OFFLOAD_DO_TRACE && (console_enabled >= 1);
    if (!console_only) {
        return;
    }

    // Large enough to hold the header plus every flag name.
    char text[256];
    bool have_entry = false;

    strcpy(text, " OffloadFlags=(");
    if (offload_flags.bits.fortran_traceback) {
        strcat(text, "fortran_traceback");
        have_entry = true;
    }
    if (offload_flags.bits.omp_async) {
        // A comma is needed only when another name is already listed.
        strcat(text, have_entry ? ",omp_async" : "omp_async");
    }

    OFFLOAD_DEBUG_TRACE_1(1,
        GET_OFFLOAD_NUMBER(timer_data), c_offload_init_func,
        "%s)\n", text);
}
// Appends name to the NUL-terminated list held in buf, preceded by a comma
// unless it is the first entry, and records that the list is no longer empty.
static void append_vardesc_flag_name(char *buf, bool &first, const char *name)
{
    if (!first) {
        strcat(buf, ",");
    }
    strcat(buf, name);
    first = false;
}

// Emits a one-line, human-readable list of the varDescFlags members that are
// set, in the form " varDescFlags=(name,name,...)".
// Nothing is printed unless OFFLOAD_DO_TRACE is false and console_enabled
// is at least 1.
static void trace_varDesc_flags(
    OffloadHostTimerData* timer_data,
    varDescFlags offload_flags
)
{
    const bool console_only = !OFFLOAD_DO_TRACE && (console_enabled >= 1);
    if (!console_only) {
        return;
    }

    // Large enough to hold the header plus every flag name.
    char text[256];
    bool first = true;

    strcpy(text, " varDescFlags=(");

    // Names are listed in this fixed order.
    if (offload_flags.is_static) {
        append_vardesc_flag_name(text, first, "is_static");
    }
    if (offload_flags.is_static_dstn) {
        append_vardesc_flag_name(text, first, "is_static_dstn");
    }
    if (offload_flags.has_length) {
        append_vardesc_flag_name(text, first, "has_length");
    }
    if (offload_flags.is_stack_buf) {
        append_vardesc_flag_name(text, first, "is_stack_buf");
    }
    if (offload_flags.targetptr) {
        append_vardesc_flag_name(text, first, "targetptr");
    }
    if (offload_flags.preallocated) {
        append_vardesc_flag_name(text, first, "preallocated");
    }
    if (offload_flags.is_pointer) {
        append_vardesc_flag_name(text, first, "is_pointer");
    }
    if (offload_flags.sink_addr) {
        append_vardesc_flag_name(text, first, "sink_addr");
    }
    if (offload_flags.alloc_disp) {
        append_vardesc_flag_name(text, first, "alloc_disp");
    }
    if (offload_flags.is_noncont_src) {
        append_vardesc_flag_name(text, first, "is_noncont_src");
    }
    if (offload_flags.is_noncont_dst) {
        append_vardesc_flag_name(text, first, "is_noncont_dst");
    }
    if (offload_flags.always_copy) {
        append_vardesc_flag_name(text, first, "always_copy");
    }
    if (offload_flags.always_delete) {
        append_vardesc_flag_name(text, first, "always_delete");
    }
    if (offload_flags.is_non_cont_struct) {
        append_vardesc_flag_name(text, first, "is_non_cont_struct");
    }
    if (offload_flags.pin) {
        append_vardesc_flag_name(text, first, "pin");
    }
    if (offload_flags.is_device_ptr) {
        append_vardesc_flag_name(text, first, "is_device_ptr");
    }
    if (offload_flags.use_device_ptr) {
        append_vardesc_flag_name(text, first, "use_device_ptr");
    }

    OFFLOAD_DEBUG_TRACE_1(1,
        GET_OFFLOAD_NUMBER(timer_data), c_offload_init_func,
        "%s)\n", text);
}
static char * offload_get_src_base(void * ptr, uint8_t type)
{
char *base;
if (VAR_TYPE_IS_PTR(type)) {
base = *static_cast<char**>(ptr);
}
else if (VAR_TYPE_IS_SCALAR(type)) {
base = static_cast<char*>(ptr);
}
else if (VAR_TYPE_IS_DV_DATA_SLICE(type) || VAR_TYPE_IS_DV_DATA(type)) {
ArrDesc *dvp;
if (VAR_TYPE_IS_DV_DATA_SLICE(type)) {
const Arr_Desc *ap = static_cast<const Arr_Desc*>(ptr);
dvp = (type == c_dv_data_slice) ?
reinterpret_cast<ArrDesc*>(ap->base) :
*reinterpret_cast<ArrDesc**>(ap->base);
}
else {
dvp = (type == c_dv_data) ?
static_cast<ArrDesc*>(ptr) :
*static_cast<ArrDesc**>(ptr);
}
base = reinterpret_cast<char*>(dvp->Base);
}
else {
base = NULL;
}
return base;
}
// Reports a failed COI call and terminates the process with exit(1).
//   msg - error tag identifying the failed operation
//   res - COI result code returned by that operation
// When res is COI_PROCESS_DIED no message is printed here; the device's
// fini_process(true) is called instead. Otherwise the message is printed
// with or without the device's logical index depending on the tag, and tags
// outside the two groups below print nothing.
void OffloadDescriptor::report_coi_error(error_types msg, COIRESULT res)
{
    if (res == COI_PROCESS_DIED) {
        // special case for the 'process died' error
        m_device.fini_process(true);
        exit(1);
    }

    // Tags whose message carries the device's logical index.
    const bool with_device_index =
        msg == c_buf_create ||
        msg == c_buf_create_from_mem ||
        msg == c_buf_get_address ||
        msg == c_pipeline_create ||
        msg == c_pipeline_run_func;

    // Tags whose message carries only the result code.
    const bool result_only =
        msg == c_buf_read ||
        msg == c_buf_write ||
        msg == c_buf_copy ||
        msg == c_buf_map ||
        msg == c_buf_unmap ||
        msg == c_buf_destroy ||
        msg == c_buf_set_state;

    if (with_device_index) {
        // Buffer creation that ran out of memory gets a dedicated message.
        if (msg == c_buf_create && res == COI_OUT_OF_MEMORY) {
            msg = c_buf_create_out_of_mem;
        }
        LIBOFFLOAD_ERROR(msg, m_device.get_logical_index(), res);
    }
    else if (result_only) {
        LIBOFFLOAD_ERROR(msg, res);
    }

    exit(1);
}
// Maps a COI result code onto the offload status reported to the user:
// success, process-died and out-of-memory are mapped one-to-one, every other
// code becomes OFFLOAD_ERROR.
_Offload_result OffloadDescriptor::translate_coi_error(COIRESULT res) const
{
    if (res == COI_SUCCESS) {
        return OFFLOAD_SUCCESS;
    }
    if (res == COI_PROCESS_DIED) {
        return OFFLOAD_PROCESS_DIED;
    }
    if (res == COI_OUT_OF_MEMORY) {
        return OFFLOAD_OUT_OF_MEMORY;
    }
    return OFFLOAD_ERROR;
}
// Finds or creates the pointer-table association for the data at
// base + disp and, for a newly created association of non-zero length,
// creates the COI buffers backing it.
//
// is_targetptr == 0 && is_prealloc == 0 - allocation of pointer data;
// is_targetptr == 1 && is_prealloc == 0 - allocation of target memory:
// allocate memory at target; use its value as base in target table.
// is_targetptr == 1 && is_prealloc == 1 - use preallocated target memory:
// base - is address at target of preallocated memory; use its value as
// base in target table.
//
// Parameters:
//   ptr_data    - [out] the association that was found or inserted
//   base, disp  - base address and displacement; base + disp is the address
//                 used for the lookup
//   size        - data length in bytes
//   alloc_disp  - when non-zero the association length is size, otherwise it
//                 is size + disp; stored in a new entry's alloc_disp
//   align       - requested alignment; the mic_offset optimization is applied
//                 only when it is a power of two
//   is_targptr, is_prealloc - see the table above
//   pin         - when true a CPU buffer is created but no sink buffer is
//                 created or state-initialized here
//
// Returns true on success. Returns false when a COI call fails (the
// translated error is stored in m_status->result when m_status is set,
// otherwise report_coi_error is called for mandatory offloads) or when
// init_static_ptr_data fails. Calls exit(1) when an existing association
// does not contain the requested range.
//
// NOTE(review): for a new entry this function unlocks
// ptr_data->alloc_ptr_data_lock on every exit path without locking it here;
// presumably insert_ptr_data/insert_targetptr_data return new entries with
// the lock held - confirm.
bool OffloadDescriptor::alloc_ptr_data(
PtrData* &ptr_data,
void *base,
int64_t disp,
int64_t size,
int64_t alloc_disp,
int align,
bool is_targptr,
bool is_prealloc,
bool pin
)
{
// total length of base
int64_t length = size;
bool is_new;
COIBUFFER targptr_buf;
COIRESULT res;
uint32_t buffer_flags = 0;
char * base_disp = reinterpret_cast<char *>(base) + disp;
// create buffer with large pages if data length exceeds
// large page threshold
if (length >= __offload_use_2mb_buffers) {
buffer_flags = COI_OPTIMIZE_HUGE_PAGE_SIZE;
}
// Allocate memory at target for targetptr without preallocated as we need
// its address as base argument in call to m_device.insert_ptr_data
if (is_targptr && !is_prealloc) {
length = alloc_disp ? length : size + disp;
res = COI::BufferCreate(
length,
COI_BUFFER_OPENCL,
buffer_flags,
0,
1,
&m_device.get_process(),
&targptr_buf);
if (res != COI_SUCCESS) {
if (m_status != 0) {
m_status->result = translate_coi_error(res);
}
else if (m_is_mandatory) {
report_coi_error(c_buf_create, res);
}
return false;
}
// From here on base is the sink (target) address of the new buffer.
res = COI::BufferGetSinkAddress(
targptr_buf, reinterpret_cast<uint64_t *>(&base));
if (res != COI_SUCCESS) {
if (m_status != 0) {
m_status->result = translate_coi_error(res);
}
else if (m_is_mandatory) {
report_coi_error(c_buf_get_address, res);
}
// NOTE(review): targptr_buf is not destroyed on this path - confirm
// whether that is intended.
return false;
}
}
OFFLOAD_TRACE(3, "Creating association for data: addr %p, length %lld\n",
alloc_disp ? base : base_disp,
alloc_disp ? length : size + disp);
// Look up an existing entry first. Note that base_disp was computed from
// the original base, before any reassignment above.
ptr_data = is_targptr ?
m_device.find_targetptr_data(base_disp) :
m_device.find_ptr_data(base_disp);
// if ptr_data is found just need to check it for overlapping
if (ptr_data) {
is_new = false;
base = base_disp;
}
else {
// If association is not found we must create it.
length = alloc_disp ? length : size + disp;
// is_new is set by the insert call.
ptr_data = is_targptr ?
m_device.insert_targetptr_data(base, length, is_new) :
m_device.insert_ptr_data(base, length, is_new);
}
if (is_new) {
OFFLOAD_TRACE(3, "Added new association\n");
if (length > 0) {
OffloadTimer timer(get_timer_data(), c_offload_host_alloc_buffers);
// align should be a power of 2
if (!pin && !is_targptr &&
align > 0 && (align & (align - 1)) == 0) {
// offset within mic_buffer. Can do offset optimization
// only when source address alignment satisfies requested
// alignment on the target (cq172736).
if ((reinterpret_cast<intptr_t>(base) & (align - 1)) == 0) {
// keep the source address' offset within a 4096-byte page
ptr_data->mic_offset =
reinterpret_cast<intptr_t>(base) & 4095;
}
}
// buffer size and flags
uint64_t buffer_size = length + ptr_data->mic_offset;
// For targetptr there is no CPU buffer
if (pin || !is_targptr) {
// create CPU buffer
OFFLOAD_DEBUG_TRACE_1(3,
GET_OFFLOAD_NUMBER(get_timer_data()),
c_offload_create_buf_host,
"Creating buffer from source memory %p, "
"length %lld\n", base, length);
// result is not checked because we can continue without cpu
// buffer. In this case we will use COIBufferRead/Write
// instead of COIBufferCopy.
COI::BufferCreateFromMemory(length,
COI_BUFFER_OPENCL,
0,
base,
1,
&m_device.get_process(),
&ptr_data->cpu_buf);
}
// create MIC buffer
if (is_prealloc) {
// wrap the preallocated sink memory at base in a COI buffer
OFFLOAD_DEBUG_TRACE_1(3,
GET_OFFLOAD_NUMBER(get_timer_data()),
c_offload_create_buf_mic,
"Creating buffer from sink memory: "
"addr %p, size %lld, offset %d, flags 0x%x\n",
base, buffer_size, ptr_data->mic_offset,
buffer_flags);
res = COI::BufferCreateFromMemory(ptr_data->cpu_addr.length(),
COI_BUFFER_NORMAL,
COI_SINK_MEMORY,
base,
1,
&m_device.get_process(),
&ptr_data->mic_buf);
if (res != COI_SUCCESS) {
if (m_status != 0) {
m_status->result = translate_coi_error(res);
}
else if (m_is_mandatory) {
report_coi_error(c_buf_create, res);
}
ptr_data->alloc_ptr_data_lock.unlock();
return false;
}
}
else if (is_targptr) {
// reuse the buffer created at the top of this function
ptr_data->mic_buf = targptr_buf;
}
else if (!pin) {
OFFLOAD_DEBUG_TRACE_1(3,
GET_OFFLOAD_NUMBER(get_timer_data()),
c_offload_create_buf_mic,
"Creating buffer for sink: size %lld, offset %d, "
"flags =0x%x\n", buffer_size,
ptr_data->mic_offset, buffer_flags);
res = COI::BufferCreate(buffer_size,
COI_BUFFER_NORMAL,
buffer_flags,
0,
1,
&m_device.get_process(),
&ptr_data->mic_buf);
if (res != COI_SUCCESS) {
if (m_status != 0) {
m_status->result = translate_coi_error(res);
}
else if (m_is_mandatory) {
report_coi_error(c_buf_create, res);
}
ptr_data->alloc_ptr_data_lock.unlock();
return false;
}
}
if (!pin) {
// make buffer valid on the device.
res = COI::BufferSetState(ptr_data->mic_buf,
m_device.get_process(),
COI_BUFFER_VALID,
COI_BUFFER_NO_MOVE,
0, 0, 0);
if (res != COI_SUCCESS) {
if (m_status != 0) {
m_status->result = translate_coi_error(res);
}
else if (m_is_mandatory) {
report_coi_error(c_buf_set_state, res);
}
ptr_data->alloc_ptr_data_lock.unlock();
return false;
}
// ... and invalid on the source (host) side.
res = COI::BufferSetState(ptr_data->mic_buf,
COI_PROCESS_SOURCE,
COI_BUFFER_INVALID,
COI_BUFFER_NO_MOVE,
0, 0, 0);
if (res != COI_SUCCESS) {
if (m_status != 0) {
m_status->result = translate_coi_error(res);
}
else if (m_is_mandatory) {
report_coi_error(c_buf_set_state, res);
}
ptr_data->alloc_ptr_data_lock.unlock();
return false;
}
}
}
ptr_data->alloc_disp = alloc_disp;
ptr_data->alloc_ptr_data_lock.unlock();
}
else {
// Existing entry: hold its lock (released when locker goes out of scope)
// while it is validated.
mutex_locker_t locker(ptr_data->alloc_ptr_data_lock);
OFFLOAD_TRACE(3, "Found existing association: addr %p, length %lld, "
"is_static %d\n",
ptr_data->cpu_addr.start(), ptr_data->cpu_addr.length(),
ptr_data->is_static);
// This is not a new entry. Make sure that provided address range fits
// into existing one.
MemRange addr_range(base, length);
if (!ptr_data->cpu_addr.contains(addr_range)) {
LIBOFFLOAD_ERROR(c_bad_ptr_mem_alloc, base, length,
const_cast<void *>(ptr_data->cpu_addr.start()),
ptr_data->cpu_addr.length());
exit(1);
}
// if the entry is associated with static data it may not have buffers
// created because they are created on demand.
if (ptr_data->is_static && !init_static_ptr_data(ptr_data)) {
return false;
}
}
return true;
}
// Looks up the existing association covering the data at in_base + disp.
//   ptr_data     - [out] the association, or 0 when none exists or when the
//                  existing one does not contain the requested range
//   in_base/disp - base address and displacement of the data
//   size         - length of the data in bytes
//   is_targetptr - search the target-pointer table instead of the host one
//   report_error - when true, a missing or non-containing association is
//                  reported and the process exits with status 1
// Returns false only when on-demand creation of buffers for a static entry
// fails; every other outcome returns true.
bool OffloadDescriptor::find_ptr_data(
    PtrData* &ptr_data,
    void *in_base,
    int64_t disp,
    int64_t size,
    bool is_targetptr,
    bool report_error
)
{
    char *data_addr = reinterpret_cast<char *>(in_base) + disp;

    OFFLOAD_TRACE(3, "Looking for association for data: addr %p, "
                  "length %lld\n", data_addr, size);

    // find existing association in pointer table
    if (is_targetptr) {
        ptr_data = m_device.find_targetptr_data(data_addr);
    }
    else {
        ptr_data = m_device.find_ptr_data(data_addr);
    }

    if (ptr_data == 0) {
        if (report_error) {
            LIBOFFLOAD_ERROR(c_no_ptr_data, data_addr);
            exit(1);
        }
        OFFLOAD_TRACE(3, "Association does not exist\n");
        return true;
    }

    OFFLOAD_TRACE(3, "Found association: base %p, length %lld, is_static %d\n",
                  ptr_data->cpu_addr.start(), ptr_data->cpu_addr.length(),
                  ptr_data->is_static);

    // The requested range has to lie completely inside the existing one.
    MemRange requested(data_addr, size);
    if (!ptr_data->cpu_addr.contains(requested)) {
        if (report_error) {
            LIBOFFLOAD_ERROR(c_bad_ptr_mem_range, data_addr, size,
                             const_cast<void *>(ptr_data->cpu_addr.start()),
                             ptr_data->cpu_addr.length());
            exit(1);
        }
        OFFLOAD_TRACE(3, "Existing association partially overlaps with "
                      "data address range\n");
        ptr_data = 0;
        return true;
    }

    // Entries for static data get their buffers on demand, so create them
    // now if needed.
    return !ptr_data->is_static || init_static_ptr_data(ptr_data);
}
void OffloadDescriptor::find_device_ptr(
int64_t* &device_ptr,
void *host_ptr
)
{
PtrData* ptr_data;
char *base = reinterpret_cast<char *>(host_ptr);
OFFLOAD_TRACE(3, "Looking for association for data: addr %p\n", base);
// find existing association in pointer table
ptr_data = m_device.find_ptr_data(base);
// MIC address should have been assigned.
// For now assume does not exist and get the addr
// if ((ptr_data == 0) || ptr_data->mic_addr) {
if (ptr_data == 0) {
OFFLOAD_TRACE(3, "Association does not exist\n");
LIBOFFLOAD_ERROR(c_no_ptr_data, base);
exit(1);
}
if (!ptr_data->mic_addr) {
COIRESULT res = COI::BufferGetSinkAddress(ptr_data->mic_buf,
&ptr_data->mic_addr);
if (res != COI_SUCCESS) {
if (m_status != 0)
m_status->result = translate_coi_error(res);
report_coi_error(c_buf_get_address, res);
}
}
device_ptr = (int64_t *) ptr_data->mic_addr;
OFFLOAD_TRACE(3, "Found association: host_ptr %p, device_ptr = %p\n",
ptr_data->cpu_addr.start(), device_ptr);
}
// Creates, on demand, the COI buffers of an association: a CPU buffer over
// the host range and a MIC buffer over the sink memory at mic_addr. Buffers
// that already exist are left untouched.
// Returns true on success. On a COI failure the translated error is stored
// in m_status->result and false is returned when m_status is set; otherwise
// the failure is passed to report_coi_error.
bool OffloadDescriptor::init_static_ptr_data(PtrData *ptr_data)
{
    OffloadTimer timer(get_timer_data(), c_offload_host_alloc_buffers);
    COIRESULT res;

    if (ptr_data->cpu_buf == 0) {
        OFFLOAD_TRACE(3, "Creating buffer from source memory %llx\n",
                      ptr_data->cpu_addr.start());

        res = COI::BufferCreateFromMemory(
            ptr_data->cpu_addr.length(),
            COI_BUFFER_OPENCL,
            0,
            const_cast<void*>(ptr_data->cpu_addr.start()),
            1, &m_device.get_process(),
            &ptr_data->cpu_buf);

        if (res != COI_SUCCESS) {
            if (m_status == 0) {
                report_coi_error(c_buf_create_from_mem, res);
            }
            else {
                m_status->result = translate_coi_error(res);
                return false;
            }
        }
    }

    if (ptr_data->mic_buf == 0) {
        OFFLOAD_TRACE(3, "Creating buffer from sink memory %llx\n",
                      ptr_data->mic_addr);

        res = COI::BufferCreateFromMemory(
            ptr_data->cpu_addr.length(),
            COI_BUFFER_NORMAL,
            COI_SINK_MEMORY,
            reinterpret_cast<void*>(ptr_data->mic_addr),
            1, &m_device.get_process(),
            &ptr_data->mic_buf);

        if (res != COI_SUCCESS) {
            if (m_status == 0) {
                report_coi_error(c_buf_create_from_mem, res);
            }
            else {
                m_status->result = translate_coi_error(res);
                return false;
            }
        }
    }

    return true;
}
// Fills in ptr_data->mic_addr with the sink address of its MIC buffer when
// the buffer exists and the address has not been obtained yet.
// Returns true when nothing had to be done or the query succeeded. On a COI
// failure returns false after storing the translated error in
// m_status->result, or after report_coi_error for mandatory offloads without
// a status.
bool OffloadDescriptor::init_mic_address(PtrData *ptr_data)
{
    // Nothing to do without a buffer or with an address already known.
    if (ptr_data->mic_buf == 0 || ptr_data->mic_addr != 0) {
        return true;
    }

    COIRESULT res = COI::BufferGetSinkAddress(ptr_data->mic_buf,
                                              &ptr_data->mic_addr);
    if (res == COI_SUCCESS) {
        return true;
    }

    if (m_status != 0) {
        m_status->result = translate_coi_error(res);
    }
    else if (m_is_mandatory) {
        report_coi_error(c_buf_get_address, res);
    }
    return false;
}
// Overwrites the first size bytes of the target buffer targ_buf with zeros.
//   targ_buf - COI buffer to clear
//   size     - number of bytes to clear
// Returns true on success. When the buffer write fails, the translated error
// is stored in m_status->result and false is returned if m_status is set;
// otherwise the failure is passed to report_coi_error.
// A failed host allocation is reported and terminates the process: the
// original code went on to use the NULL pointer after reporting the error.
bool OffloadDescriptor::nullify_target_stack(
    COIBUFFER targ_buf,
    uint64_t size
)
{
    // calloc hands back zero-filled memory, so no separate memset is needed.
    char * ptr = static_cast<char*>(calloc(1, size));
    // A NULL result for a zero-byte request is not an allocation failure.
    if (ptr == NULL && size != 0) {
        LIBOFFLOAD_ERROR(c_malloc);
        exit(1);
    }

    COIRESULT res = COI::BufferWrite(
        targ_buf,
        0,
        ptr,
        size,
        COI_COPY_UNSPECIFIED,
        0, 0, 0);
    free(ptr);

    if (res != COI_SUCCESS) {
        if (m_status != 0) {
            m_status->result = translate_coi_error(res);
            return false;
        }
        report_coi_error(c_buf_write, res);
    }
    return true;
}
static void print_persistList_item(
const char *msg,
PersistData *cur_el
)
{
OFFLOAD_TRACE(4, "%s\n", msg);
OFFLOAD_TRACE(4, " stack_cpu_addr = %p\n", cur_el->stack_cpu_addr);
OFFLOAD_TRACE(4, " routine_id = %d\n", cur_el->routine_id);
OFFLOAD_TRACE(4, " thread_id = %lld\n", cur_el->thread_id);
OFFLOAD_TRACE(4, " stack_ptr_data = %p\n", cur_el->stack_ptr_data);
OFFLOAD_TRACE(4, " MIC buffer = %p\n", cur_el->stack_ptr_data->mic_buf);
OFFLOAD_TRACE(4, " MIC addr = %p\n", cur_el->stack_ptr_data->mic_addr);
OFFLOAD_TRACE(4, " cpu_stack_addr = %p\n", cur_el->cpu_stack_addr);
}
// Serializes access to the device's persistent stack list in the functions
// below.
static mutex_t stack_memory_manager_lock;
// Selects the persistent target stack buffer for the current offload,
// reusing an existing one or creating a new one, and retires list entries
// that have become obsolete.
//   stack_begin - host stack address identifying the frame
//   routine_id  - identifier of the routine owning the frame
//   buf_size    - size in bytes of the buffer to create when none is reused
//   align       - only traced here
//   thread_specific_function_locals - when true an entry is reused only if
//                 it belongs to the calling thread
//   is_new      - [out] true when a new buffer was created, false when an
//                 existing one is reused
// On success m_stack_ptr_data refers to the selected buffer and true is
// returned. Returns false when a COI call or the zero-fill of the new buffer
// fails. stack_memory_manager_lock is held for the duration of the call and
// released on every return path.
bool OffloadDescriptor::offload_stack_memory_manager(
const void * stack_begin,
int routine_id,
int buf_size,
int align,
bool thread_specific_function_locals,
bool *is_new)
{
//mutex_locker_t locker(stack_alloc_lock);
stack_memory_manager_lock.lock();
PersistData * new_el;
PersistDataList::iterator it_begin = m_device.m_persist_list.begin();
// last list element marked for removal; valid only when erase > 0
PersistDataList::iterator it_end;
// number of elements marked for removal
int erase = 0;
uint64_t cur_thread_id = m_device.get_thread_id();
OFFLOAD_TRACE(3, "offload_stack_memory_manager("
"stack_begin=%p, routine_id=%d, buf_size=%d,"
"align=%d, thread_specific_function_locals=%d, bool=%p)\n",
stack_begin, routine_id, buf_size,
align, thread_specific_function_locals, is_new);
OFFLOAD_TRACE(3, "cur_thread_id=%lld\n", cur_thread_id);
*is_new = false;
for (PersistDataList::iterator it = m_device.m_persist_list.begin();
it != m_device.m_persist_list.end(); it++) {
PersistData cur_el = *it;
print_persistList_item("Current element in persist list:", &cur_el);
if (stack_begin > it->stack_cpu_addr) {
if (cur_thread_id == cur_el.thread_id) {
// this stack data must be destroyed
m_destroy_stack.push_front(cur_el.stack_ptr_data);
it_end = it;
erase++;
OFFLOAD_TRACE(3, "Current element below TOS: so delete\n");
}
}
else if (stack_begin == it->stack_cpu_addr) {
if (routine_id != it-> routine_id) {
// this stack data must be destroyed
// because the current function is a dynamic sibling
m_destroy_stack.push_front(cur_el.stack_ptr_data);
it_end = it;
erase++;
OFFLOAD_TRACE(3, "Current element is sibling: so delete\n");
break;
}
else if (!thread_specific_function_locals ||
cur_thread_id == cur_el.thread_id) {
// stack data is reused
m_stack_ptr_data = it->stack_ptr_data;
if (erase > 0) {
// all obsolete stack sections must be erased from the list
// NOTE(review): the erased range is [list begin, it_end], which also
// covers any elements in between that were not marked - confirm this is
// intended.
m_device.m_persist_list.erase(it_begin, ++it_end);
// sizeof does not evaluate its operand, so new_el being unset is harmless
m_in_datalen +=
erase * sizeof(new_el->stack_ptr_data->mic_addr);
}
OFFLOAD_TRACE(3, "Reuse of stack buffer with addr %p\n",
m_stack_ptr_data->mic_addr);
stack_memory_manager_lock.unlock();
return true;
}
}
else if (stack_begin < it->stack_cpu_addr &&
cur_thread_id == cur_el.thread_id) {
OFFLOAD_TRACE(3, "Current element is above TOS\n");
break;
}
}
if (erase > 0) {
// all obsolete stack sections must be erased from the list
m_device.m_persist_list.erase(it_begin, ++it_end);
m_in_datalen += erase * sizeof(new_el->stack_ptr_data->mic_addr);
}
// new stack table is created
// NOTE(review): new_el is never deleted in this function; the list stores a
// copy (push_front(*new_el) below) and the error returns leave it allocated
// - confirm whether this is an intended lifetime or a leak.
new_el = new PersistData(stack_begin, routine_id, buf_size, cur_thread_id);
// create MIC buffer
COIRESULT res;
uint32_t buffer_flags = 0;
// create buffer with large pages if data length exceeds
// large page threshold
if (buf_size >= __offload_use_2mb_buffers) {
buffer_flags = COI_OPTIMIZE_HUGE_PAGE_SIZE;
}
res = COI::BufferCreate(buf_size,
COI_BUFFER_NORMAL,
buffer_flags,
0,
1,
&m_device.get_process(),
&new_el->stack_ptr_data->mic_buf);
if (res != COI_SUCCESS) {
if (m_status != 0) {
m_status->result = translate_coi_error(res);
}
else if (m_is_mandatory) {
report_coi_error(c_buf_create, res);
}
stack_memory_manager_lock.unlock();
return false;
}
// make buffer valid on the device.
res = COI::BufferSetState(new_el->stack_ptr_data->mic_buf,
m_device.get_process(),
COI_BUFFER_VALID,
COI_BUFFER_NO_MOVE,
0, 0, 0);
if (res != COI_SUCCESS) {
if (m_status != 0) {
m_status->result = translate_coi_error(res);
}
else if (m_is_mandatory) {
report_coi_error(c_buf_set_state, res);
}
stack_memory_manager_lock.unlock();
return false;
}
// ... and invalid on the source (host) side.
res = COI::BufferSetState(new_el->stack_ptr_data->mic_buf,
COI_PROCESS_SOURCE,
COI_BUFFER_INVALID,
COI_BUFFER_NO_MOVE,
0, 0, 0);
if (res != COI_SUCCESS) {
if (m_status != 0) {
m_status->result = translate_coi_error(res);
}
else if (m_is_mandatory) {
report_coi_error(c_buf_set_state, res);
}
stack_memory_manager_lock.unlock();
return false;
}
// persistence algorithm requires target stack initially to be nullified
if (!nullify_target_stack(new_el->stack_ptr_data->mic_buf, buf_size)) {
stack_memory_manager_lock.unlock();
return false;
}
m_stack_ptr_data = new_el->stack_ptr_data;
// The result of init_mic_address is not checked here.
init_mic_address(m_stack_ptr_data);
OFFLOAD_TRACE(3, "Allocating stack buffer with addr %p\n",
m_stack_ptr_data->mic_addr);
m_device.m_persist_list.push_front(*new_el);
// Same PtrData as m_stack_ptr_data above; init_mic_address does nothing
// once mic_addr is non-zero.
init_mic_address(new_el->stack_ptr_data);
*is_new = true;
stack_memory_manager_lock.unlock();
return true;
}
// Search through persistent stack buffers
// for the top-of-stack buffer for this thread
char* OffloadDescriptor::get_this_threads_cpu_stack_addr(
const void * stack_begin,
int routine_id,
bool thread_specific_function_locals
)
{
uint64_t cur_thread_id = m_device.get_thread_id();
char* matched = 0;
OFFLOAD_TRACE(3, "get_this_threads_cpu_stack_addr("
"stack_begin=%p, routine_id=%d, thread_specific_function_locals=%d)\n",
stack_begin, routine_id, thread_specific_function_locals);
OFFLOAD_TRACE(3, "cur_thread_id=%lld\n", cur_thread_id);
stack_memory_manager_lock.lock();
for (PersistDataList::iterator it = m_device.m_persist_list.begin();
it != m_device.m_persist_list.end(); it++)
{
PersistData cur_el = *it;
print_persistList_item("Current element in persist list:", &cur_el);
if (stack_begin == cur_el.stack_cpu_addr)
{
// For OpenMP shared function locals matching is done without
// regard to thread id. But, we return the last match, which
// corresponds to the outer stack.
if (!thread_specific_function_locals)
{
matched = cur_el.cpu_stack_addr;
continue;
}
// For non-OpenMP shared function-local variables
// the thread-id must match
if (cur_thread_id == cur_el.thread_id)
{
matched = cur_el.cpu_stack_addr;
break;
}
}
}
stack_memory_manager_lock.unlock();
if (matched != 0)
{
OFFLOAD_TRACE(3, "get_this_threads_cpu_stack_addr() => %p\n", matched);
return matched;
}
OFFLOAD_TRACE(1,
"Could not find persistent data; expect Read/Write failure\n");
return 0;
}
// Scans the persistent stack list for the entry whose stack_cpu_addr equals
// stack_begin and returns that entry's stack_ptr_data (the MIC-side stack
// buffer description).
//   - With thread_specific_function_locals == false the thread id is
//     ignored and the last matching entry in list order wins.
//   - Otherwise the first matching entry owned by the calling thread wins.
// routine_id is only traced. Returns 0 when no entry matches.
PtrData* OffloadDescriptor::get_this_threads_mic_stack_addr(
    const void * stack_begin,
    int routine_id,
    bool thread_specific_function_locals
)
{
    uint64_t this_thread = m_device.get_thread_id();
    PtrData* found = 0;

    OFFLOAD_TRACE(3, "get_this_threads_mic_stack_addr("
        "stack_begin=%p, routine_id=%d, thread_specific_function_locals=%d)\n",
        stack_begin, routine_id, thread_specific_function_locals);
    OFFLOAD_TRACE(3, "cur_thread_id=%lld\n", this_thread);

    stack_memory_manager_lock.lock();
    for (PersistDataList::iterator pos = m_device.m_persist_list.begin();
         pos != m_device.m_persist_list.end(); ++pos) {
        PersistData entry = *pos;
        print_persistList_item("Current element in persist list:", &entry);

        if (stack_begin != entry.stack_cpu_addr) {
            continue;
        }
        if (!thread_specific_function_locals) {
            // Shared (OpenMP) locals: remember the match and keep scanning
            // so that the last one, the outer stack, is returned.
            found = entry.stack_ptr_data;
            continue;
        }
        if (this_thread == entry.thread_id) {
            // Thread-specific locals: the owning thread must match.
            found = entry.stack_ptr_data;
            break;
        }
    }
    stack_memory_manager_lock.unlock();

    if (found == 0) {
        OFFLOAD_TRACE(1,
            "Could not find persistent data; expect Read/Write failure\n");
        return 0;
    }

    OFFLOAD_TRACE(3, "get_this_threads_mic_stack_addr() => %p\n", found);
    return found;
}
void OffloadDescriptor::setup_use_device_ptr(int i)
{
PtrData *ptr_data;
ArrDesc *dvp;
void *base;
if (m_vars_extra[i].type_src == c_dv_ptr) {
dvp = *static_cast<ArrDesc**>(m_vars[i].ptr);
base = reinterpret_cast<void*>(dvp->Base);
}
else {
base = *static_cast<void**>(m_vars[i].ptr);
}
if (m_vars[i].direction.in) {
int64_t *device_ptr;
bool is_new = true;
find_device_ptr(device_ptr, base);
// Create a entry in targetptr table using device_ptr
// as lookup for later recover the host pointer
ptr_data = m_device.insert_targetptr_data(device_ptr,
0, is_new);
// Actually the base is a host pointer and cpu_addr is
// device pointer. This is special case where the 2
// address usage is reversed to enable using existing
// PtrData structure instead of adding new fields.
ptr_data->mic_addr = (uint64_t) base;
ptr_data->alloc_ptr_data_lock.unlock();
// Replace host pointer with device pointer
if (m_vars_extra[i].type_src == c_dv_ptr) {
dvp->Base = reinterpret_cast<dv_size>(device_ptr);
}
else {
*static_cast<void**>(m_vars[i].ptr) = device_ptr;
}
}
else if (m_vars[i].direction.out) {
// For use_device_ptr and out find associated host ptr
// and assign to host ptr
ptr_data = m_device.find_targetptr_data(base);
if (!ptr_data) {
LIBOFFLOAD_ERROR(c_no_ptr_data, base);
exit(1);
}
if (m_vars_extra[i].type_src == c_dv_ptr) {
dvp->Base = ptr_data->mic_addr;
}
else {
*static_cast<void**>(m_vars[i].ptr) =
reinterpret_cast<void*>(ptr_data->mic_addr);
}
m_device.remove_targetptr_data(
ptr_data->cpu_addr.start());
}
}
bool OffloadDescriptor::setup_descriptors(
VarDesc *vars,
VarDesc2 *vars2,
int vars_total,
int entry_id,
const void *stack_addr
)
{
COIRESULT res;
// To enable caching the CPU stack base address for stack variables
char* this_threads_cpu_stack_addr = 0;
// To properly deal with non-OpenMP threading and function-local variables
// For OpenMP threading we support all function-locals in shared mode only
bool thread_specific_function_locals = !omp_in_parallel();
OffloadTimer timer(get_timer_data(), c_offload_host_setup_buffers);
// make a copy of variable descriptors
m_vars_total = vars_total;
if (vars_total > 0) {
m_vars = (VarDesc*) malloc(m_vars_total * sizeof(VarDesc));
if (m_vars == NULL)
LIBOFFLOAD_ERROR(c_malloc);
memcpy(m_vars, vars, m_vars_total * sizeof(VarDesc));
m_vars_extra = (VarExtra*) malloc(m_vars_total * sizeof(VarExtra));
if (m_vars_extra == NULL)
LIBOFFLOAD_ERROR(c_malloc);
}
// dependencies
m_in_deps_allocated = m_vars_total + 1;
m_in_deps = (COIEVENT*) malloc(sizeof(COIEVENT) * m_in_deps_allocated);
if (m_in_deps == NULL)
LIBOFFLOAD_ERROR(c_malloc);
if (m_vars_total > 0) {
m_out_deps_allocated = m_vars_total;
m_out_deps = (COIEVENT*) malloc(sizeof(COIEVENT) * m_out_deps_allocated);
if (m_out_deps == NULL)
LIBOFFLOAD_ERROR(c_malloc);
}
// copyin/copyout data length
m_in_datalen = 0;
m_out_datalen = 0;
// First pass over variable descriptors
// - Calculate size of the input and output non-pointer data
// - Allocate buffers for input and output pointers
for (int i = 0; i < m_vars_total; i++) {
void* alloc_base = NULL;
int64_t alloc_disp = 0;
int64_t alloc_size = 0;
bool src_is_for_mic = (m_vars[i].direction.out ||
m_vars[i].into == NULL);
bool src_is_for_host = (m_vars[i].direction.in ||
m_vars[i].into == NULL);
const char *var_sname = "";
if (vars2 != NULL && i < vars_total) {
if (vars2[i].sname != NULL) {
var_sname = vars2[i].sname;
}
}
// instead of m_vars[i].type.src we will use m_vars_extra[i].type_src
if (m_vars[i].type.src == c_extended_type) {
VarDescExtendedType *etype =
reinterpret_cast<VarDescExtendedType*>(m_vars[i].ptr);
m_vars_extra[i].type_src = etype->extended_type;
m_vars[i].ptr = etype->ptr;
}
else {
m_vars_extra[i].type_src = m_vars[i].type.src;
}
// instead of m_vars[i].type.dst we will use m_vars_extra[i].type_dst
if (m_vars[i].type.dst == c_extended_type) {
VarDescExtendedType *etype =
reinterpret_cast<VarDescExtendedType*>(m_vars[i].into);
if (etype) {
m_vars_extra[i].type_dst = etype->extended_type;
m_vars[i].into = etype->ptr;
}
else {
m_vars_extra[i].type_dst = m_vars_extra[i].type_src;
}
}
else {
m_vars_extra[i].type_dst = m_vars[i].type.dst;
}
OFFLOAD_TRACE(2, " VarDesc %d, var=%s, %s, %s\n",
i, var_sname,
vardesc_direction_as_string[m_vars[i].direction.bits],
vardesc_type_as_string[m_vars_extra[i].type_src]);
if (vars2 != NULL && i < vars_total && vars2[i].dname != NULL) {
OFFLOAD_TRACE(2, " into=%s, %s\n", vars2[i].dname,
vardesc_type_as_string[m_vars_extra[i].type_dst]);
}
OFFLOAD_TRACE(2,
" type_src=%d, type_dstn=%d, direction=%d, "
"alloc_if=%d, free_if=%d, align=%d, mic_offset=%d, flags=0x%x, "
"offset=%lld, size=%lld, count/disp=%lld, ptr=%p, into=%p\n",
m_vars_extra[i].type_src,
m_vars_extra[i].type_dst,
m_vars[i].direction.bits,
m_vars[i].alloc_if,
m_vars[i].free_if,
m_vars[i].align,
m_vars[i].mic_offset,
m_vars[i].flags.bits,
m_vars[i].offset,
m_vars[i].size,
m_vars[i].count,
m_vars[i].ptr,
m_vars[i].into);
// If any varDesc flags bits set, show them
if (console_enabled >= 1 && m_vars[i].flags.bits != 0) {
trace_varDesc_flags(get_timer_data(), m_vars[i].flags);
}
// preallocated implies targetptr
if (m_vars[i].flags.preallocated) {
// targetptr preallocated alloc_if(1) may not be used with
// an in clause
if (m_vars[i].direction.in && m_vars[i].alloc_if) {
LIBOFFLOAD_ERROR(c_in_with_preallocated);
exit(1);
}
m_vars[i].flags.targetptr = 1;
}
if (m_vars[i].alloc != NULL) {
// array descriptor
const Arr_Desc *ap =
static_cast<const Arr_Desc*>(m_vars[i].alloc);
// debug dump
ARRAY_DESC_DUMP(" ", "ALLOC", ap, 0, 1);
__arr_data_offset_and_length(ap, alloc_disp, alloc_size);
alloc_base = reinterpret_cast<void*>(ap->base);
}
m_vars_extra[i].alloc = m_vars[i].alloc;
m_vars_extra[i].auto_data = 0;
m_vars_extra[i].cpu_disp = 0;
m_vars_extra[i].cpu_offset = 0;
m_vars_extra[i].src_data = 0;
m_vars_extra[i].read_rng_src = 0;
m_vars_extra[i].read_rng_dst = 0;
m_vars_extra[i].omp_last_event_type = c_last_not;
// flag is_arr_ptr_el is 1 only for var_descs generated
// for c_data_ptr_array type
if (i < vars_total) {
m_vars_extra[i].is_arr_ptr_el = 0;
}
if (TYPE_IS_PTR_TO_PTR(m_vars_extra[i].type_src) ||
TYPE_IS_PTR_TO_PTR(m_vars_extra[i].type_dst) ||
m_vars[i].flags.is_pointer) {
m_vars_extra[i].pointer_offset = m_vars[i].offset;
m_vars[i].offset = 0;
m_in_datalen += sizeof(m_vars[i].offset);
}
switch (m_vars_extra[i].type_src) {
case c_data_ptr_array:
{
const Arr_Desc *ap;
const VarDesc3 *vd3 =
static_cast<const VarDesc3*>(m_vars[i].ptr);
int flags = vd3->array_fields;
OFFLOAD_TRACE(2,
" pointer array flags = %04x\n", flags);
OFFLOAD_TRACE(2,
" pointer array type is %s\n",
vardesc_type_as_string[flags & 0x3f]);
ap = static_cast<const Arr_Desc*>(vd3->ptr_array);
ARRAY_DESC_DUMP(" ", "ptr array", ap,
m_vars[i].flags.is_pointer, 1);
if (m_vars[i].into) {
ap = static_cast<const Arr_Desc*>(m_vars[i].into);
ARRAY_DESC_DUMP(
" ", "into array", ap, 0, 1);
}
if ((flags & (1<<flag_align_is_array)) != 0) {
ap = static_cast<const Arr_Desc*>(vd3->align_array);
ARRAY_DESC_DUMP(
" ", "align array", ap, 0, 1);
}
if ((flags & (1<<flag_alloc_if_is_array)) != 0) {
ap = static_cast<const Arr_Desc*>(vd3->alloc_if_array);
ARRAY_DESC_DUMP(
" ", "alloc_if array", ap, 0, 1);
}
if ((flags & (1<<flag_free_if_is_array)) != 0) {
ap = static_cast<const Arr_Desc*>(vd3->free_if_array);
ARRAY_DESC_DUMP(
" ", "free_if array", ap, 0, 1);
}
if ((flags & (1<<flag_extent_start_is_array)) != 0) {
ap = static_cast<const Arr_Desc*>(vd3->extent_start);
ARRAY_DESC_DUMP(
" ", "extent_start array", ap, 0, 1);
} else if ((flags &
(1<<flag_extent_start_is_scalar)) != 0) {
OFFLOAD_TRACE(2,
" extent_start scalar = %d\n",
(int64_t)vd3->extent_start);
}
if ((flags & (1<<flag_extent_elements_is_array)) != 0) {
ap = static_cast<const Arr_Desc*>
(vd3->extent_elements);
ARRAY_DESC_DUMP(" ",
"extent_elements array", ap, 0, 1);
} else if ((flags &
(1<<flag_extent_elements_is_scalar)) != 0) {
OFFLOAD_TRACE(2,
" extent_elements scalar = %d\n",
(int64_t)vd3->extent_elements);
}
if ((flags & (1<<flag_into_start_is_array)) != 0) {
ap = static_cast<const Arr_Desc*>(vd3->into_start);
ARRAY_DESC_DUMP(
" ", "into_start array", ap, 0, 1);
} else if ((flags &
(1<<flag_into_start_is_scalar)) != 0) {
OFFLOAD_TRACE(2,
" into_start scalar = %d\n",
(int64_t)vd3->into_start);
}
if ((flags & (1<<flag_into_elements_is_array)) != 0) {
ap = static_cast<const Arr_Desc*>(vd3->into_elements);
ARRAY_DESC_DUMP(
" ", "into_elements array", ap, 0, 1);
} else if ((flags &
(1<<flag_into_elements_is_scalar)) != 0) {
OFFLOAD_TRACE(2,
" into_elements scalar = %d\n",
(int64_t)vd3->into_elements);
}
if ((flags & (1<<flag_alloc_start_is_array)) != 0) {
ap = static_cast<const Arr_Desc*>(vd3->alloc_start);
ARRAY_DESC_DUMP(
" ", "alloc_start array", ap, 0, 1);
} else if ((flags &
(1<<flag_alloc_start_is_scalar)) != 0) {
OFFLOAD_TRACE(2,
" alloc_start scalar = %d\n",
(int64_t)vd3->alloc_start);
}
if ((flags & (1<<flag_alloc_elements_is_array)) != 0) {
ap = static_cast<const Arr_Desc*>(vd3->alloc_elements);
ARRAY_DESC_DUMP(" ",
"alloc_elements array", ap, 0, 1);
} else if ((flags &
(1<<flag_alloc_elements_is_scalar)) != 0) {
OFFLOAD_TRACE(2,
" alloc_elements scalar = %d\n",
(int64_t)vd3->alloc_elements);
}
}
if (!gen_var_descs_for_pointer_array(i)) {
return false;
}
break;
case c_data:
case c_void_ptr:
case c_void_ptr_ptr:
case c_cean_var:
// In all uses later
// VarDesc.size will have the length of the data to be
// transferred
// VarDesc.disp will have an offset from base
if (m_vars[i].flags.is_non_cont_struct && src_is_for_host) {
NonContigDesc *desc =
static_cast<NonContigDesc*>(m_vars[i].ptr);
noncont_struct_dump(" ", "DATA", desc);
m_vars_extra[i].noncont_desc = desc;
m_vars[i].ptr = reinterpret_cast<void*>(desc->base);
m_vars[i].size = get_noncont_struct_size(desc);
m_vars[i].disp = 0;
}
else if (m_vars_extra[i].type_src == c_cean_var) {
// array descriptor
const Arr_Desc *ap =
static_cast<const Arr_Desc*>(m_vars[i].ptr);
// debug dump
ARRAY_DESC_DUMP("", "IN/OUT", ap, 0, !src_is_for_mic);
// offset and length are derived from the array descriptor
__arr_data_offset_and_length(ap, m_vars[i].disp,
m_vars[i].size);
if (!is_arr_desc_contiguous(ap)) {
m_vars[i].flags.is_noncont_src = 1;
m_vars_extra[i].read_rng_src =
init_read_ranges_arr_desc(ap);
}
// all necessary information about length and offset is
// transferred in var descriptor. There is no need to send
// array descriptor to the target side.
m_vars[i].ptr = reinterpret_cast<void*>(ap->base);
}
else {
m_vars[i].size *= m_vars[i].count;
m_vars[i].disp = 0;
}
if (m_vars[i].direction.bits) {
// make sure that transfer size > 0
if (m_vars[i].size <= 0) {
LIBOFFLOAD_ERROR(c_zero_or_neg_transfer_size);
exit(1);
}
if (m_vars[i].flags.is_static) {
PtrData *ptr_data;
// find data associated with variable
if (!find_ptr_data(ptr_data,
m_vars[i].ptr,
m_vars[i].disp,
m_vars[i].size,
false, false)) {
return false;
}
if (ptr_data != 0) {
// offset to base from the beginning of the buffer
// memory
m_vars[i].offset =
(char*) m_vars[i].ptr -
(char*) ptr_data->cpu_addr.start();
}
else {
m_vars[i].flags.is_static = false;
if (m_vars[i].into == NULL) {
m_vars[i].flags.is_static_dstn = false;
}
}
m_vars_extra[i].src_data = ptr_data;
}
if (m_vars[i].direction.in &&
!m_vars[i].flags.is_static &&
!m_vars[i].flags.is_stack_buf) {
m_in_datalen += m_vars[i].size;
// for non-static target destination defined as CEAN
// expression we pass to target its size and dist
if (m_vars[i].into == NULL &&
m_vars_extra[i].type_src == c_cean_var) {
m_in_datalen += 2 * sizeof(uint64_t);
}
m_need_runfunction = true;
}
if (m_vars[i].direction.out &&
!m_vars[i].flags.is_static &&
!m_vars[i].flags.is_stack_buf) {
m_out_datalen += m_vars[i].size;
m_need_runfunction = true;
}
}
if (m_is_openmp && src_is_for_host &&
!m_vars[i].flags.is_device_ptr) {
if (m_vars[i].flags.is_static) {
PtrData *ptr_data = m_vars_extra[i].src_data;
// Static data is transferred either by omp target
// update construct which passes zeros for
// alloc_if and free_if or by always modifier.
// Implicit openmp reference is also transferred
// if its reference count is equal to 1
if (ptr_data &&
IS_OPENMP_IMPLICIT_OR_LINK(ptr_data->var_alloc_type)) {
if (m_vars[i].alloc_if) {
ptr_data->add_reference();
}
if (!m_vars[i].flags.always_copy &&
(m_vars[i].alloc_if || m_vars[i].free_if) &&
ptr_data->get_reference() != 1) {
m_vars[i].direction.bits = c_parameter_nocopy;
}
}
else if (
!m_vars[i].flags.always_copy &&
(m_vars[i].alloc_if || m_vars[i].free_if)) {
m_vars[i].direction.bits = c_parameter_nocopy;
}
}
else {
AutoData *auto_data;
if (m_vars[i].alloc_if) {
auto_data = m_device.insert_auto_data(
m_vars[i].ptr, m_vars[i].size);
auto_data->add_reference();
}
else {
// TODO: what should be done if var is not in
// the table?
auto_data = m_device.find_auto_data(
m_vars[i].ptr);
}
// For automatic variables data is transferred:
// - if always modifier is used OR
// - if alloc_if == 0 && free_if == 0 OR
// - if reference count is 1
if (!m_vars[i].flags.always_copy &&
(m_vars[i].alloc_if || m_vars[i].free_if) &&
auto_data != 0 &&
auto_data->get_reference() != 1) {
m_vars[i].direction.bits = c_parameter_nocopy;
}
// save data for later use
m_vars_extra[i].auto_data = auto_data;
}
}
break;
case c_dv:
if (m_vars[i].flags.use_device_ptr) {
setup_use_device_ptr(i);
break;
}
else if (m_vars[i].direction.bits ||
m_vars[i].alloc_if ||
m_vars[i].free_if) {
ArrDesc *dvp = static_cast<ArrDesc*>(m_vars[i].ptr);
// debug dump
__dv_desc_dump("IN/OUT", dvp);
// send dope vector contents excluding base
m_in_datalen += m_vars[i].size - sizeof(uint64_t);
m_need_runfunction = true;
}
break;
case c_string_ptr:
case c_string_ptr_ptr:
if ((m_vars[i].direction.bits ||
m_vars[i].alloc_if ||
m_vars[i].free_if) &&
m_vars[i].size == 0) {
m_vars[i].size = 1;
m_vars[i].count =
strlen(*static_cast<char**>(m_vars[i].ptr)) + 1;
}
/* fallthru */
case c_data_ptr:
case c_data_ptr_ptr:
if (m_vars[i].flags.is_stack_buf &&
!m_vars[i].direction.bits &&
m_vars[i].alloc_if) {
// this var_desc is for stack buffer
bool is_new;
if (!offload_stack_memory_manager(
stack_addr, entry_id,
m_vars[i].count, m_vars[i].align,
thread_specific_function_locals, &is_new)) {
return false;
}
if (is_new) {
m_compute_buffers.push_back(
m_stack_ptr_data->mic_buf);
m_device.m_persist_list.front().cpu_stack_addr =
static_cast<char*>(m_vars[i].ptr);
PersistData *new_el = &m_device.m_persist_list.front();
print_persistList_item(
"New element in persist list:",
new_el);
}
else {
m_vars[i].flags.sink_addr = 1;
m_in_datalen += sizeof(m_stack_ptr_data->mic_addr);
if (thread_specific_function_locals) {
m_stack_ptr_data = get_this_threads_mic_stack_addr(
stack_addr, entry_id,
thread_specific_function_locals);
}
}
m_vars[i].size = m_destroy_stack.size();
m_vars_extra[i].src_data = m_stack_ptr_data;
// need to add or remove references for stack buffer at target
if (is_new || m_destroy_stack.size()) {
m_need_runfunction = true;
}
break;
}
/* fallthru */
case c_cean_var_ptr:
case c_cean_var_ptr_ptr:
case c_dv_ptr:
if (m_vars[i].flags.is_non_cont_struct && src_is_for_host) {
NonContigDesc *desc =
static_cast<NonContigDesc*>(m_vars[i].ptr);
noncont_struct_dump(" ", "PTR", desc);
m_vars_extra[i].noncont_desc = desc;
m_vars[i].ptr = reinterpret_cast<void*>(desc->base);
m_vars[i].disp = 0;
}
else if (m_vars_extra[i].type_src == c_cean_var_ptr ||
m_vars_extra[i].type_src == c_cean_var_ptr_ptr) {
// array descriptor
const Arr_Desc *ap =
static_cast<const Arr_Desc*>(m_vars[i].ptr);
// debug dump
ARRAY_DESC_DUMP("", "IN/OUT", ap, 1, !src_is_for_mic);
// offset and length are derived from the array descriptor
__arr_data_offset_and_length(ap, m_vars[i].disp,
m_vars[i].size);
if (!is_arr_desc_contiguous(ap)) {
m_vars[i].flags.is_noncont_src = 1;
m_vars_extra[i].read_rng_src =
init_read_ranges_arr_desc(ap);
}
// all necessary information about length and offset is
// transferred in var descriptor. There is no need to send
// array descriptor to the target side.
m_vars[i].ptr = reinterpret_cast<void*>(ap->base);
}
else if (m_vars_extra[i].type_src == c_dv_ptr) {
// need to send DV to the device unless it is 'nocopy'
if (m_vars[i].direction.bits ||
m_vars[i].alloc_if ||
m_vars[i].free_if) {
ArrDesc *dvp = *static_cast<ArrDesc**>(m_vars[i].ptr);
// debug dump
__dv_desc_dump("IN/OUT", dvp);
// for use_device_ptr don't need to change
// OUT direction to IN direction
if (!m_vars[i].flags.use_device_ptr) {
m_vars[i].direction.bits = c_parameter_in;
}
}
// no displacement
m_vars[i].disp = 0;
}
else {
// For "use_device_ptr" if direction is "in" then need to
// find the associated device pointer and replace the host
// pointer with device pointer. Also save the host pointer
// to restore when "out" is encountered.
// For "out" find the host pointer associated with the
// device pointer and restore the host pointer
if (m_vars[i].flags.use_device_ptr && src_is_for_host) {
setup_use_device_ptr(i);
break;
}
// c_data_ptr or c_string_ptr
m_vars[i].size *= m_vars[i].count;
m_vars[i].disp = 0;
}
if (m_vars[i].direction.bits ||
m_vars[i].alloc_if ||
m_vars[i].free_if) {
PtrData *ptr_data;
// check that buffer length > 0
if (m_vars[i].alloc_if &&
m_vars[i].disp + m_vars[i].size <
(m_is_openmp ? 0 : 1)) {
LIBOFFLOAD_ERROR(c_zero_or_neg_ptr_len);
exit(1);
}
// base address
void *base = *static_cast<void**>(m_vars[i].ptr);
// allocate buffer if we have no INTO and don't need
// allocation for the ptr at target
if (src_is_for_mic) {
if (m_vars[i].flags.is_stack_buf) {
// for stack persistent objects ptr data is created
// by var_desc with number 0.
// Its ptr_data is stored at m_stack_ptr_data
ptr_data = m_stack_ptr_data;
}
else if (m_vars[i].alloc_if) {
if (m_vars[i].flags.preallocated) {
m_out_datalen += sizeof(void*);
m_need_runfunction = true;
break;
}
// add new entry
if (!alloc_ptr_data(
ptr_data,
reinterpret_cast<char *>(base) + alloc_disp,
(alloc_base != NULL) ?
alloc_disp : m_vars[i].disp,
(alloc_base != NULL) ?
alloc_size : m_vars[i].size,
alloc_disp,
(alloc_base != NULL) ?
0 : m_vars[i].align,
m_vars[i].flags.targetptr,
0,
m_vars[i].flags.pin)) {
return false;
}
if (m_vars[i].flags.targetptr) {
if (!init_mic_address(ptr_data)) {
return false;
}
*static_cast<void**>(m_vars[i].ptr) = base =
reinterpret_cast<void*>(ptr_data->mic_addr);
}
if (ptr_data->add_reference() == 0 &&
ptr_data->mic_buf != 0) {
// add buffer to the list of buffers that
// are passed to dispatch call
m_compute_buffers.push_back(
ptr_data->mic_buf);
}
else if (!m_vars[i].flags.pin &&
!m_vars[i].flags.preallocated) {
// will send buffer address to device
m_vars[i].flags.sink_addr = 1;
m_in_datalen += sizeof(ptr_data->mic_addr);
}
if (!m_vars[i].flags.pin &&
!ptr_data->is_static) {
// need to add reference for buffer
m_need_runfunction = true;
}
}
else {
bool error_if_not_found = true;
if (m_is_openmp) {
// For omp target update variable is ignored
// if it does not exist.
if (m_vars[i].flags.always_copy ||
(!m_vars[i].alloc_if &&
!m_vars[i].free_if)) {
error_if_not_found = false;
}
}
// use existing association from pointer table
if (!find_ptr_data(ptr_data,
base,
m_vars[i].disp,
m_vars[i].size,
m_vars[i].flags.targetptr,
error_if_not_found)) {
return false;
}
if (m_is_openmp) {
// make var nocopy if it does not exist
if (ptr_data == 0) {
m_vars[i].direction.bits =
c_parameter_nocopy;
}
}
if (ptr_data != 0) {
m_vars[i].flags.sink_addr = 1;
m_in_datalen += sizeof(ptr_data->mic_addr);
}
}
if (ptr_data != 0) {
if (ptr_data->alloc_disp != 0) {
m_vars[i].flags.alloc_disp = 1;
m_in_datalen += sizeof(alloc_disp);
}
if (m_vars[i].flags.sink_addr) {
// get buffers's address on the sink
if (!init_mic_address(ptr_data)) {
return false;
}
m_in_datalen += sizeof(ptr_data->mic_addr);
}
if (!m_vars[i].flags.pin &&
!ptr_data->is_static && m_vars[i].free_if) {
// need to decrement buffer reference on target
m_need_runfunction = true;
}
// offset to base from the beginning of the buffer
// memory
m_vars[i].offset = (char*) base -
(char*) ptr_data->cpu_addr.start();
// copy other pointer properties to var descriptor
m_vars[i].mic_offset = ptr_data->mic_offset;
m_vars[i].flags.is_static = ptr_data->is_static;
}
}
else {
if (!find_ptr_data(ptr_data,
base,
m_vars[i].disp,
m_vars[i].size,
false, false)) {
return false;
}
if (ptr_data) {
m_vars[i].offset =
(char*) base -
(char*) ptr_data->cpu_addr.start();
}
}
if (m_is_openmp) {
if (m_vars[i].flags.use_device_ptr) {
setup_use_device_ptr(i);
}
// for TO transfer of stack buffer's variable
if (src_is_for_host && m_vars[i].flags.is_stack_buf) {
AutoData *auto_data;
char *base = *static_cast<char**>(m_vars[i].ptr);
if (m_vars[i].alloc_if) {
auto_data =m_device.insert_auto_data(
base + m_vars[i].disp,
m_vars[i].size);
auto_data->add_reference();
}
else {
auto_data = m_device.find_auto_data(
base + m_vars[i].disp);
}
// save data for later use
m_vars_extra[i].auto_data = auto_data;
// For automatic variables
// data is transferred:
// - if always modifier is used OR
// - if alloc_if == 0 && free_if == 0 OR
// - if reference count is 1
if (!m_vars[i].flags.always_copy &&
(m_vars[i].alloc_if ||
m_vars[i].free_if) &&
auto_data != 0 &&
auto_data->get_reference() != 1) {
m_vars[i].direction.bits =
c_parameter_nocopy;
}
}
// for FROM transfer of global pointer variable
// FROM transfer of stack buffer's variable
// is treated at INTO branch
else if (src_is_for_mic &&
!m_vars[i].flags.is_stack_buf) {
// data is transferred only if
// alloc_if == 0 && free_if == 0
// or reference count is 1
if (!m_vars[i].flags.always_copy &&
(m_vars[i].alloc_if ||
m_vars[i].free_if) &&
ptr_data &&
ptr_data->get_reference() != 1)
{
m_vars[i].direction.bits =
c_parameter_nocopy;
}
}
}
// save pointer data
m_vars_extra[i].src_data = ptr_data;
}
break;
case c_func_ptr:
case c_func_ptr_ptr:
if (m_vars[i].direction.in) {
m_in_datalen += __offload_funcs.max_name_length();
}
if (m_vars[i].direction.out) {
m_out_datalen += __offload_funcs.max_name_length();
}
m_need_runfunction = true;
break;
case c_dv_data:
case c_dv_ptr_data:
case c_dv_data_slice:
case c_dv_ptr_data_slice:
ArrDesc *dvp;
if (m_vars[i].flags.is_non_cont_struct) {
NonContigDesc *desc =
static_cast<NonContigDesc*>(m_vars[i].ptr);
noncont_struct_dump(" ", "DV-DATA", desc);
dvp = reinterpret_cast<ArrDesc*>(desc->base);
}
else if (VAR_TYPE_IS_DV_DATA_SLICE(m_vars_extra[i].type_src)) {
const Arr_Desc *ap;
ap = static_cast<const Arr_Desc*>(m_vars[i].ptr);
dvp = (m_vars_extra[i].type_src == c_dv_data_slice) ?
reinterpret_cast<ArrDesc*>(ap->base) :
*reinterpret_cast<ArrDesc**>(ap->base);
}
else {
dvp = (m_vars_extra[i].type_src == c_dv_data) ?
static_cast<ArrDesc*>(m_vars[i].ptr) :
*static_cast<ArrDesc**>(m_vars[i].ptr);
}
// if allocatable dope vector isn't allocated don't
// transfer its data
if (!__dv_is_allocated(dvp)) {
m_vars[i].direction.bits = c_parameter_nocopy;
m_vars[i].alloc_if = 0;
m_vars[i].free_if = 0;
}
if (m_vars[i].direction.bits ||
m_vars[i].alloc_if ||
m_vars[i].free_if) {
const Arr_Desc *ap;
if (VAR_TYPE_IS_DV_DATA_SLICE(m_vars_extra[i].type_src)) {
ap = static_cast<const Arr_Desc*>(m_vars[i].ptr);
// debug dump
ARRAY_DESC_DUMP("", "IN/OUT", ap, 0, !src_is_for_mic);
}
if (!__dv_is_contiguous(dvp)) {
m_vars[i].flags.is_noncont_src = 1;
m_vars_extra[i].read_rng_src =
init_read_ranges_dv(dvp);
}
// size and displacement
if (VAR_TYPE_IS_DV_DATA_SLICE(m_vars_extra[i].type_src)) {
// offset and length are derived from the
// array descriptor
__arr_data_offset_and_length(ap,
m_vars[i].disp,
m_vars[i].size);
if (m_vars[i].direction.bits) {
if (!is_arr_desc_contiguous(ap)) {
if (m_vars[i].flags.is_noncont_src) {
LIBOFFLOAD_ERROR(c_slice_of_noncont_array);
return false;
}
m_vars[i].flags.is_noncont_src = 1;
m_vars_extra[i].read_rng_src =
init_read_ranges_arr_desc(ap);
}
}
}
else {
if (m_vars[i].flags.has_length) {
m_vars[i].size =
__dv_data_length(dvp, m_vars[i].count);
}
else {
m_vars[i].size = __dv_data_length(dvp);
}
m_vars[i].disp = 0;
}
// check that length >= 0
if (m_vars[i].alloc_if &&
(m_vars[i].disp + m_vars[i].size < 0)) {
LIBOFFLOAD_ERROR(c_zero_or_neg_ptr_len);
exit(1);
}
// base address
void *base = reinterpret_cast<void*>(dvp->Base);
PtrData *ptr_data;
// allocate buffer if we have no INTO and don't need
// allocation for the ptr at target
if (src_is_for_mic) {
if (m_vars[i].alloc_if) {
// add new entry
if (!alloc_ptr_data(
ptr_data,
reinterpret_cast<char *>(base) + alloc_disp,
(alloc_base != NULL) ?
alloc_disp : m_vars[i].disp,
(alloc_base != NULL) ?
alloc_size : m_vars[i].size,
alloc_disp,
(alloc_base != NULL) ?
0 : m_vars[i].align,
m_vars[i].flags.targetptr,
m_vars[i].flags.preallocated,
m_vars[i].flags.pin)) {
return false;
}
if (ptr_data->add_reference() == 0 &&
ptr_data->mic_buf != 0) {
// add buffer to the list of buffers
// that are passed to dispatch call
m_compute_buffers.push_back(
ptr_data->mic_buf);
}
else {
// will send buffer address to device
m_vars[i].flags.sink_addr = 1;
}
if (!ptr_data->is_static) {
// need to add reference for buffer
m_need_runfunction = true;
}
}
else {
bool error_if_not_found = true;
if (m_is_openmp) {
// For omp target update variable is ignored
// if it does not exist.
if (m_vars[i].flags.always_copy ||
(!m_vars[i].alloc_if &&
!m_vars[i].free_if)) {
error_if_not_found = false;
}
}
// use existing association from pointer table
if (!find_ptr_data(ptr_data,
base,
m_vars[i].disp,
m_vars[i].size,
m_vars[i].flags.targetptr,
error_if_not_found)) {
return false;
}
if (m_is_openmp) {
// make var nocopy if it does not exist
if (ptr_data == 0) {
m_vars[i].direction.bits =
c_parameter_nocopy;
}
}
if (ptr_data != 0) {
// need to update base in dope vector on device
m_vars[i].flags.sink_addr = 1;
}
}
if (ptr_data != 0) {
if (m_is_openmp) {
// data is transferred if
// - if always modifier is used OR
// - if alloc_if == 0 && free_if == 0 OR
// - if reference count is 1
if (!m_vars[i].flags.always_copy &&
(m_vars[i].alloc_if ||
m_vars[i].free_if) &&
ptr_data->get_reference() != 1) {
m_vars[i].direction.bits =
c_parameter_nocopy;
}
}
if (ptr_data->alloc_disp != 0) {
m_vars[i].flags.alloc_disp = 1;
m_in_datalen += sizeof(alloc_disp);
}
if (m_vars[i].flags.sink_addr) {
// get buffers's address on the sink
if (!init_mic_address(ptr_data)) {
return false;
}
m_in_datalen += sizeof(ptr_data->mic_addr);
}
if (!ptr_data->is_static && m_vars[i].free_if) {
// need to decrement buffer reference on target
m_need_runfunction = true;
}
// offset to base from the beginning of the buffer
// memory
m_vars[i].offset =
(char*) base -
(char*) ptr_data->cpu_addr.start();
// copy other pointer properties to var descriptor
m_vars[i].mic_offset = ptr_data->mic_offset;
m_vars[i].flags.is_static = ptr_data->is_static;
}
}
else { // !src_is_for_mic
if (!find_ptr_data(ptr_data,
base,
m_vars[i].disp,
m_vars[i].size,
false, false)) {
return false;
}
m_vars[i].offset = !ptr_data ? 0 :
(char*) base -
(char*) ptr_data->cpu_addr.start();
}
// save pointer data
m_vars_extra[i].src_data = ptr_data;
}
break;
default:
LIBOFFLOAD_ERROR(c_unknown_var_type, m_vars_extra[i].type_src);
LIBOFFLOAD_ABORT;
}
if (m_vars_extra[i].type_src == c_data_ptr_array) {
continue;
}
if (src_is_for_mic && m_vars[i].flags.is_stack_buf) {
if (this_threads_cpu_stack_addr == 0) {
this_threads_cpu_stack_addr =
get_this_threads_cpu_stack_addr(
stack_addr, entry_id, thread_specific_function_locals);
}
m_vars[i].offset = static_cast<char*>
(m_vars[i].ptr) -
this_threads_cpu_stack_addr;
}
// if source is used at CPU save its offset and disp
if (m_vars[i].into == NULL || m_vars[i].direction.in) {
m_vars_extra[i].cpu_offset = m_vars[i].offset;
m_vars_extra[i].cpu_disp = m_vars[i].disp;
}
// If "into" is defined we need to do similar work for it
if (!m_vars[i].into) {
continue;
}
int64_t into_disp =0, into_offset = 0;
switch (m_vars_extra[i].type_dst) {
case c_data_ptr_array:
break;
case c_data:
case c_void_ptr:
case c_void_ptr_ptr:
case c_cean_var: {
int64_t size = m_vars[i].size;
if (m_vars[i].flags.is_non_cont_struct && src_is_for_mic) {
NonContigDesc *desc =
static_cast<NonContigDesc*>(m_vars[i].into);
noncont_struct_dump("", "INTO DATA", desc);
m_vars_extra[i].noncont_desc = desc;
m_vars[i].into = reinterpret_cast<void*>(desc->base);
size = get_noncont_struct_size(desc);
into_disp = 0;
}
else if (m_vars_extra[i].type_dst == c_cean_var) {
// array descriptor
const Arr_Desc *ap =
static_cast<const Arr_Desc*>(m_vars[i].into);
// debug dump
ARRAY_DESC_DUMP(" ", "INTO", ap, 0, src_is_for_mic);
// offset and length are derived from the array descriptor
__arr_data_offset_and_length(ap, into_disp, size);
if (!is_arr_desc_contiguous(ap)) {
m_vars[i].flags.is_noncont_dst = 1;
m_vars_extra[i].read_rng_dst =
init_read_ranges_arr_desc(ap);
if (!cean_ranges_match(
m_vars_extra[i].read_rng_src,
m_vars_extra[i].read_rng_dst)) {
LIBOFFLOAD_ERROR(c_ranges_dont_match);
exit(1);
}
}
m_vars[i].into = reinterpret_cast<void*>(ap->base);
}
int64_t size_src = m_vars_extra[i].read_rng_src &&
!m_vars[i].flags.is_non_cont_struct ?
cean_get_transf_size(m_vars_extra[i].read_rng_src) :
m_vars[i].size;
int64_t size_dst = m_vars_extra[i].read_rng_dst ?
cean_get_transf_size(m_vars_extra[i].read_rng_dst) :
size;
// It's supposed that "into" size must be not less
// than src size
if (size_src > size_dst) {
LIBOFFLOAD_ERROR(c_different_src_and_dstn_sizes,
size_src, size_dst);
exit(1);
}
if (m_vars[i].direction.bits) {
if (m_vars[i].flags.is_static_dstn) {
PtrData *ptr_data;
// find data associated with variable
if (!find_ptr_data(ptr_data, m_vars[i].into,
into_disp, size, false, false)) {
return false;
}
if (ptr_data != 0) {
// offset to base from the beginning of the buffer
// memory
into_offset =
(char*) m_vars[i].into -
(char*) ptr_data->cpu_addr.start();
}
else {
m_vars[i].flags.is_static_dstn = false;
}
m_vars_extra[i].dst_data = ptr_data;
}
}
if (m_vars[i].direction.in &&
!m_vars[i].flags.is_static_dstn) {
m_in_datalen += m_vars[i].size;
// for non-static target destination defined as CEAN
// expression we pass to target its size and dist
if (m_vars_extra[i].type_dst == c_cean_var) {
m_in_datalen += 2 * sizeof(uint64_t);
}
m_need_runfunction = true;
}
if (m_is_openmp && src_is_for_mic) {
if (m_vars[i].flags.is_static_dstn) {
// Static data is transferred either by omp target
// update construct which passes zeros for
// alloc_if and free_if or by always modifier.
if (!m_vars[i].flags.always_copy &&
(m_vars[i].alloc_if || m_vars[i].free_if)) {
m_vars[i].direction.bits = c_parameter_nocopy;
}
}
else {
AutoData *auto_data;
if (m_vars[i].alloc_if) {
auto_data = m_device.insert_auto_data(
m_vars[i].into, size_dst);
auto_data->add_reference();
}
else {
// TODO: what should be done if var is not in
// the table?
auto_data = m_device.find_auto_data(
m_vars[i].into);
}
// For automatic variables data is transferred:
// - if always modifier is used OR
// - if alloc_if == 0 && free_if == 0 OR
// - if reference count is 1
if (!m_vars[i].flags.always_copy &&
(m_vars[i].alloc_if || m_vars[i].free_if) &&
(auto_data == 0 ||
auto_data->get_reference() != 1)) {
m_vars[i].direction.bits = c_parameter_nocopy;
}
// save data for later use
m_vars_extra[i].auto_data = auto_data;
}
}
break;
}
case c_dv:
if (m_vars[i].direction.bits ||
m_vars[i].alloc_if ||
m_vars[i].free_if) {
ArrDesc *dvp = static_cast<ArrDesc*>(m_vars[i].into);
// debug dump
__dv_desc_dump("INTO", dvp);
// send dope vector contents excluding base
m_in_datalen += m_vars[i].size - sizeof(uint64_t);
m_need_runfunction = true;
}
break;
case c_string_ptr:
case c_data_ptr:
case c_string_ptr_ptr:
case c_data_ptr_ptr:
case c_cean_var_ptr:
case c_cean_var_ptr_ptr:
case c_dv_ptr: {
int64_t size = m_vars[i].size;
if (m_vars_extra[i].type_dst == c_cean_var_ptr ||
m_vars_extra[i].type_dst == c_cean_var_ptr_ptr) {
// array descriptor
const Arr_Desc *ap =
static_cast<const Arr_Desc*>(m_vars[i].into);
// debug dump
ARRAY_DESC_DUMP(" ", "INTO", ap, 1, src_is_for_mic);
// offset and length are derived from the array descriptor
__arr_data_offset_and_length(ap, into_disp, size);
if (!is_arr_desc_contiguous(ap)) {
m_vars[i].flags.is_noncont_src = 1;
m_vars_extra[i].read_rng_dst =
init_read_ranges_arr_desc(ap);
if (!cean_ranges_match(
m_vars_extra[i].read_rng_src,
m_vars_extra[i].read_rng_dst)) {
LIBOFFLOAD_ERROR(c_ranges_dont_match);
}
}
m_vars[i].into = reinterpret_cast<char**>(ap->base);
}
else if (m_vars_extra[i].type_dst == c_dv_ptr) {
// need to send DV to the device unless it is 'nocopy'
if (m_vars[i].direction.bits ||
m_vars[i].alloc_if ||
m_vars[i].free_if) {
ArrDesc *dvp = *static_cast<ArrDesc**>(m_vars[i].into);
// debug dump
__dv_desc_dump("INTO", dvp);
m_vars[i].direction.bits = c_parameter_in;
}
}
int64_t size_src = m_vars_extra[i].read_rng_src &&
!m_vars[i].flags.is_non_cont_struct ?
cean_get_transf_size(m_vars_extra[i].read_rng_src) :
m_vars[i].size;
int64_t size_dst = m_vars_extra[i].read_rng_dst ?
cean_get_transf_size(m_vars_extra[i].read_rng_dst) :
size;
// The "into" size is expected to be no smaller than the
// src size
if (size_src > size_dst) {
LIBOFFLOAD_ERROR(c_different_src_and_dstn_sizes,
size_src, size_dst);
exit(1);
}
if (m_vars[i].direction.bits) {
PtrData *ptr_data;
// base address
void *base = *static_cast<void**>(m_vars[i].into);
if (m_vars[i].direction.in) {
// allocate buffer
if (m_vars[i].flags.is_stack_buf) {
// for stack persistent objects ptr data is created
// by var_desc with number 0.
// Its ptr_data is stored at m_stack_ptr_data
ptr_data = m_stack_ptr_data;
}
else if (m_vars[i].alloc_if) {
if (m_vars[i].flags.preallocated) {
m_out_datalen += sizeof(void*);
m_need_runfunction = true;
break;
}
// add new entry
if (!alloc_ptr_data(
ptr_data,
reinterpret_cast<char *>(base) + alloc_disp,
(alloc_base != NULL) ?
alloc_disp : into_disp,
(alloc_base != NULL) ?
alloc_size : size,
alloc_disp,
(alloc_base != NULL) ?
0 : m_vars[i].align,
m_vars[i].flags.targetptr,
m_vars[i].flags.preallocated,
m_vars[i].flags.pin)) {
return false;
}
if (m_vars[i].flags.targetptr) {
if (!init_mic_address(ptr_data)) {
return false;
}
*static_cast<void**>(m_vars[i].into) = base =
reinterpret_cast<void*>(ptr_data->mic_addr);
}
if (ptr_data->add_reference() == 0 &&
ptr_data->mic_buf != 0) {
// add buffer to the list of buffers that
// are passed to dispatch call
m_compute_buffers.push_back(
ptr_data->mic_buf);
}
else {
// will send buffer address to device
m_vars[i].flags.sink_addr = 1;
}
if (!ptr_data->is_static) {
// need to add reference for buffer
m_need_runfunction = true;
}
}
else {
// use existing association from pointer table
if (!find_ptr_data(ptr_data, base, into_disp,
size, m_vars[i].flags.targetptr, true)) {
return false;
}
m_vars[i].flags.sink_addr = 1;
}
if (ptr_data->alloc_disp != 0) {
m_vars[i].flags.alloc_disp = 1;
m_in_datalen += sizeof(alloc_disp);
}
if (m_vars[i].flags.sink_addr) {
// get the buffer's address on the sink
if (!init_mic_address(ptr_data)) {
return false;
}
m_in_datalen += sizeof(ptr_data->mic_addr);
}
if (!ptr_data->is_static && m_vars[i].free_if) {
// need to decrement buffer reference on target
m_need_runfunction = true;
}
// copy other pointer properties to var descriptor
m_vars[i].mic_offset = ptr_data->mic_offset;
m_vars[i].flags.is_static_dstn = ptr_data->is_static;
}
else {
if (!find_ptr_data(ptr_data,
base,
into_disp,
m_vars[i].size,
false, false)) {
return false;
}
}
if (ptr_data) {
into_offset = ptr_data ?
(char*) base -
(char*) ptr_data->cpu_addr.start() :
0;
}
if (m_is_openmp) {
// for FROM transfer of stack buffer's variable
if (src_is_for_mic && m_vars[i].flags.is_stack_buf) {
AutoData *auto_data;
char *base = *static_cast<char**>(m_vars[i].into);
if (m_vars[i].alloc_if) {
auto_data =m_device.insert_auto_data(
base + into_disp,
size);
auto_data->add_reference();
}
else {
auto_data = m_device.find_auto_data(
base + into_disp);
}
// save data for later use
m_vars_extra[i].auto_data = auto_data;
// For automatic variables
// data is transferred:
// - if always modifier is used OR
// - if alloc_if == 0 && free_if == 0 OR
// - if reference count is 1
if (!m_vars[i].flags.always_copy &&
(m_vars[i].alloc_if ||
m_vars[i].free_if) &&
auto_data != 0 &&
auto_data->get_reference() != 1) {
m_vars[i].direction.bits =
c_parameter_nocopy;
}
}
}
// save pointer data
m_vars_extra[i].dst_data = ptr_data;
}
break;
}
case c_func_ptr:
case c_func_ptr_ptr:
break;
case c_dv_data:
case c_dv_ptr_data:
case c_dv_data_slice:
case c_dv_ptr_data_slice:
if (m_vars[i].direction.bits ||
m_vars[i].alloc_if ||
m_vars[i].free_if) {
const Arr_Desc *ap;
ArrDesc *dvp;
PtrData *ptr_data;
int64_t disp;
int64_t size;
if (VAR_TYPE_IS_DV_DATA_SLICE(m_vars_extra[i].type_dst)) {
ap = static_cast<const Arr_Desc*>(m_vars[i].into);
// debug dump
ARRAY_DESC_DUMP(" ", "INTO", ap, 0, src_is_for_mic);
dvp = (m_vars_extra[i].type_dst == c_dv_data_slice) ?
reinterpret_cast<ArrDesc*>(ap->base) :
*reinterpret_cast<ArrDesc**>(ap->base);
}
else {
dvp = (m_vars_extra[i].type_dst == c_dv_data) ?
static_cast<ArrDesc*>(m_vars[i].into) :
*static_cast<ArrDesc**>(m_vars[i].into);
}
if (!__dv_is_contiguous(dvp)) {
m_vars[i].flags.is_noncont_dst = 1;
m_vars_extra[i].read_rng_dst =
init_read_ranges_dv(dvp);
}
// size and displacement
if (VAR_TYPE_IS_DV_DATA_SLICE(m_vars_extra[i].type_dst)) {
// offset and length are derived from the array
// descriptor
__arr_data_offset_and_length(ap, into_disp, size);
if (m_vars[i].direction.bits) {
if (!is_arr_desc_contiguous(ap)) {
if (m_vars[i].flags.is_noncont_dst) {
LIBOFFLOAD_ERROR(c_slice_of_noncont_array);
return false;
}
m_vars[i].flags.is_noncont_dst = 1;
m_vars_extra[i].read_rng_dst =
init_read_ranges_arr_desc(ap);
if (!cean_ranges_match(
m_vars_extra[i].read_rng_src,
m_vars_extra[i].read_rng_dst)) {
LIBOFFLOAD_ERROR(c_ranges_dont_match);
}
}
}
}
else {
if (m_vars[i].flags.has_length) {
size = __dv_data_length(dvp, m_vars[i].count);
}
else {
size = __dv_data_length(dvp);
}
disp = 0;
}
int64_t size_src =
m_vars_extra[i].read_rng_src &&
(!m_vars[i].flags.is_non_cont_struct ||
src_is_for_mic) ?
cean_get_transf_size(m_vars_extra[i].read_rng_src) :
m_vars[i].size;
int64_t size_dst =
m_vars_extra[i].read_rng_dst ?
cean_get_transf_size(m_vars_extra[i].read_rng_dst) :
size;
// The "into" size is expected to be no smaller
// than the src size
if (size_src > size_dst) {
LIBOFFLOAD_ERROR(c_different_src_and_dstn_sizes,
size_src, size_dst);
exit(1);
}
// base address
void *base = reinterpret_cast<void*>(dvp->Base);
// allocate buffer
if (m_vars[i].direction.in) {
if (m_vars[i].alloc_if) {
// add new entry
if (!alloc_ptr_data(
ptr_data,
reinterpret_cast<char *>(base) + alloc_disp,
(alloc_base != NULL) ?
alloc_disp : into_disp,
(alloc_base != NULL) ?
alloc_size : size,
alloc_disp,
(alloc_base != NULL) ?
0 : m_vars[i].align,
m_vars[i].flags.targetptr,
m_vars[i].flags.preallocated,
m_vars[i].flags.pin)) {
return false;
}
if (ptr_data->add_reference() == 0 &&
ptr_data->mic_buf !=0) {
// add buffer to the list of buffers
// that are passed to dispatch call
m_compute_buffers.push_back(
ptr_data->mic_buf);
}
else {
// will send buffer address to device
m_vars[i].flags.sink_addr = 1;
}
if (!ptr_data->is_static) {
// need to add reference for buffer
m_need_runfunction = true;
}
}
else {
// use existing association from pointer table
if (!find_ptr_data(ptr_data, base, into_disp,
size, m_vars[i].flags.targetptr, true)) {
return false;
}
// need to update base in dope vector on device
m_vars[i].flags.sink_addr = 1;
}
if (ptr_data->alloc_disp != 0) {
m_vars[i].flags.alloc_disp = 1;
m_in_datalen += sizeof(alloc_disp);
}
if (m_vars[i].flags.sink_addr) {
// get the buffer's address on the sink
if (!init_mic_address(ptr_data)) {
return false;
}
m_in_datalen += sizeof(ptr_data->mic_addr);
}
if (!ptr_data->is_static && m_vars[i].free_if) {
// need to decrement buffer reference on target
m_need_runfunction = true;
}
// offset to base from the beginning of the buffer
// memory
into_offset =
(char*) base - (char*) ptr_data->cpu_addr.start();
// copy other pointer properties to var descriptor
m_vars[i].mic_offset = ptr_data->mic_offset;
m_vars[i].flags.is_static_dstn = ptr_data->is_static;
}
else { // src_is_for_mic
if (!find_ptr_data(ptr_data,
base,
into_disp,
size,
false, false)) {
return false;
}
into_offset = !ptr_data ?
0 :
(char*) base - (char*) ptr_data->cpu_addr.start();
}
// save pointer data
m_vars_extra[i].dst_data = ptr_data;
}
break