/* Copyright (C) 2016-2021 Free Software Foundation, Inc.
This file is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your option)
any later version.
This file is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
for more details.
You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3. If not see
<http://www.gnu.org/licenses/>. */
/* {{{ Includes. */
/* We want GET_MODE_SIZE et al to return integers, please. */
#define IN_TARGET_CODE 1
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "memmodel.h"
#include "rtl.h"
#include "tree.h"
#include "df.h"
#include "tm_p.h"
#include "stringpool.h"
#include "optabs.h"
#include "regs.h"
#include "emit-rtl.h"
#include "recog.h"
#include "diagnostic-core.h"
#include "insn-attr.h"
#include "fold-const.h"
#include "calls.h"
#include "explow.h"
#include "expr.h"
#include "output.h"
#include "cfgrtl.h"
#include "langhooks.h"
#include "builtins.h"
#include "omp-general.h"
#include "print-rtl.h"
#include "attribs.h"
#include "varasm.h"
#include "intl.h"
#include "rtl-iter.h"
#include "dwarf2.h"
#include "gimple.h"
/* This file should be included last. */
#include "target-def.h"
/* }}} */
/* {{{ Global variables. */
/* Constants used by FP instructions. */
static REAL_VALUE_TYPE dconst4, dconst1over2pi;
static bool ext_gcn_constants_init = 0;
/* Holds the ISA variant, derived from the command line parameters. */
int gcn_isa = 3; /* Default to GCN3. */
/* Reserve this much space for LDS (for propagating variables from
worker-single mode to worker-partitioned mode), per workgroup. Global
analysis could calculate an exact bound, but we don't do that yet.
We want to permit full occupancy, so size accordingly. */
/* Use this as a default, but allow it to grow if the user requests a large
amount of gang-private shared-memory space. */
static int acc_lds_size = 0x600;
#define OMP_LDS_SIZE 0x600 /* 0x600 is 1/40 total, rounded down. */
#define ACC_LDS_SIZE acc_lds_size
#define OTHER_LDS_SIZE 65536 /* If in doubt, reserve all of it. */
#define LDS_SIZE (flag_openacc ? ACC_LDS_SIZE \
: flag_openmp ? OMP_LDS_SIZE \
: OTHER_LDS_SIZE)
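/* For reference: OTHER_LDS_SIZE above is the full 64 KiB of LDS, and
64 KiB / 40 = 1638 bytes, which rounds down to 0x600 (1536 bytes)
for OMP_LDS_SIZE. */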
static int gang_private_hwm = 32;
static hash_map<tree, int> lds_allocs;
/* The number of registers usable by normal non-kernel functions.
The SGPR count includes any special extra registers such as VCC. */
#define MAX_NORMAL_SGPR_COUNT 62 // i.e. 64 with VCC
#define MAX_NORMAL_VGPR_COUNT 24
/* }}} */
/* {{{ Initialization and options. */
/* Initialize machine_function. */
static struct machine_function *
gcn_init_machine_status (void)
{
struct machine_function *f;
f = ggc_cleared_alloc<machine_function> ();
if (TARGET_GCN3)
f->use_flat_addressing = true;
return f;
}
/* Implement TARGET_OPTION_OVERRIDE.
Override option settings where defaults are variable, or we have specific
needs to consider. */
static void
gcn_option_override (void)
{
init_machine_status = gcn_init_machine_status;
/* The HSA runtime does not respect ELF load addresses, so force PIE. */
if (!flag_pie)
flag_pie = 2;
if (!flag_pic)
flag_pic = flag_pie;
gcn_isa = gcn_arch == PROCESSOR_FIJI ? 3 : 5;
/* The default stack size needs to be small for offload kernels because
there may be many, many threads. Also, a smaller stack gives a
measurable performance boost. But, a small stack is insufficient
for running the testsuite, so we use a larger default for the
stand-alone case. */
if (stack_size_opt == -1)
{
if (flag_openacc || flag_openmp)
/* 512 bytes per work item = 32kB total. */
stack_size_opt = 512 * 64;
else
/* 1MB total. */
stack_size_opt = 1048576;
}
/* Reserve 1Kb (somewhat arbitrarily) of LDS space for reduction results and
worker broadcasts. */
if (gang_private_size_opt == -1)
gang_private_size_opt = 512;
else if (gang_private_size_opt < gang_private_hwm)
gang_private_size_opt = gang_private_hwm;
else if (gang_private_size_opt >= acc_lds_size - 1024)
{
/* We need some space for reductions and worker broadcasting. If the
user requests a large amount of gang-private LDS space, we might not
have enough left for the former. Increase the LDS allocation in that
case, although this may reduce the maximum occupancy on the
hardware. */
acc_lds_size = gang_private_size_opt + 1024;
if (acc_lds_size > 32768)
acc_lds_size = 32768;
}
/* The xnack option is a placeholder, for now. */
if (flag_xnack)
sorry ("XNACK support");
}
/* }}} */
/* {{{ Attributes. */
/* This table defines the arguments that are permitted in
__attribute__ ((amdgpu_hsa_kernel (...))).
The names and values correspond to the HSA metadata that is encoded
into the assembler file and binary. */
static const struct gcn_kernel_arg_type
{
const char *name;
const char *header_pseudo;
machine_mode mode;
/* This should be set to -1 or -2 for a dynamically allocated register
number. Use -1 if this argument contributes to the user_sgpr_count,
-2 otherwise. */
int fixed_regno;
} gcn_kernel_arg_types[] = {
{"exec", NULL, DImode, EXEC_REG},
#define PRIVATE_SEGMENT_BUFFER_ARG 1
{"private_segment_buffer",
".amdhsa_user_sgpr_private_segment_buffer", TImode, -1},
#define DISPATCH_PTR_ARG 2
{"dispatch_ptr", ".amdhsa_user_sgpr_dispatch_ptr", DImode, -1},
#define QUEUE_PTR_ARG 3
{"queue_ptr", ".amdhsa_user_sgpr_queue_ptr", DImode, -1},
#define KERNARG_SEGMENT_PTR_ARG 4
{"kernarg_segment_ptr", ".amdhsa_user_sgpr_kernarg_segment_ptr", DImode, -1},
{"dispatch_id", ".amdhsa_user_sgpr_dispatch_id", DImode, -1},
#define FLAT_SCRATCH_INIT_ARG 6
{"flat_scratch_init", ".amdhsa_user_sgpr_flat_scratch_init", DImode, -1},
#define FLAT_SCRATCH_SEGMENT_SIZE_ARG 7
{"private_segment_size", ".amdhsa_user_sgpr_private_segment_size", SImode, -1},
#define WORKGROUP_ID_X_ARG 8
{"workgroup_id_X", ".amdhsa_system_sgpr_workgroup_id_x", SImode, -2},
{"workgroup_id_Y", ".amdhsa_system_sgpr_workgroup_id_y", SImode, -2},
{"workgroup_id_Z", ".amdhsa_system_sgpr_workgroup_id_z", SImode, -2},
{"workgroup_info", ".amdhsa_system_sgpr_workgroup_info", SImode, -1},
#define PRIVATE_SEGMENT_WAVE_OFFSET_ARG 12
{"private_segment_wave_offset",
".amdhsa_system_sgpr_private_segment_wavefront_offset", SImode, -2},
#define WORK_ITEM_ID_X_ARG 13
{"work_item_id_X", NULL, V64SImode, FIRST_VGPR_REG},
#define WORK_ITEM_ID_Y_ARG 14
{"work_item_id_Y", NULL, V64SImode, FIRST_VGPR_REG + 1},
#define WORK_ITEM_ID_Z_ARG 15
{"work_item_id_Z", NULL, V64SImode, FIRST_VGPR_REG + 2}
};
static const long default_requested_args
= (1 << PRIVATE_SEGMENT_BUFFER_ARG)
| (1 << DISPATCH_PTR_ARG)
| (1 << QUEUE_PTR_ARG)
| (1 << KERNARG_SEGMENT_PTR_ARG)
| (1 << PRIVATE_SEGMENT_WAVE_OFFSET_ARG)
| (1 << WORKGROUP_ID_X_ARG)
| (1 << WORK_ITEM_ID_X_ARG)
| (1 << WORK_ITEM_ID_Y_ARG)
| (1 << WORK_ITEM_ID_Z_ARG);
/* Extract parameter settings from __attribute__((amdgpu_hsa_kernel ())).
This function also sets the default values for some arguments.
Return true if an error was diagnosed while parsing; ARGS is populated
in either case. */
static bool
gcn_parse_amdgpu_hsa_kernel_attribute (struct gcn_kernel_args *args,
tree list)
{
bool err = false;
args->requested = default_requested_args;
args->nargs = 0;
for (int a = 0; a < GCN_KERNEL_ARG_TYPES; a++)
args->reg[a] = -1;
for (; list; list = TREE_CHAIN (list))
{
const char *str;
if (TREE_CODE (TREE_VALUE (list)) != STRING_CST)
{
error ("%<amdgpu_hsa_kernel%> attribute requires string constant "
"arguments");
break;
}
str = TREE_STRING_POINTER (TREE_VALUE (list));
int a;
for (a = 0; a < GCN_KERNEL_ARG_TYPES; a++)
{
if (!strcmp (str, gcn_kernel_arg_types[a].name))
break;
}
if (a == GCN_KERNEL_ARG_TYPES)
{
error ("unknown specifier %qs in %<amdgpu_hsa_kernel%> attribute",
str);
err = true;
break;
}
if (args->requested & (1 << a))
{
error ("duplicated parameter specifier %qs in %<amdgpu_hsa_kernel%> "
"attribute", str);
err = true;
break;
}
args->requested |= (1 << a);
args->order[args->nargs++] = a;
}
/* Requesting WORK_ITEM_ID_Z_ARG implies requesting WORK_ITEM_ID_X_ARG and
WORK_ITEM_ID_Y_ARG. Similarly, requesting WORK_ITEM_ID_Y_ARG implies
requesting WORK_ITEM_ID_X_ARG. */
if (args->requested & (1 << WORK_ITEM_ID_Z_ARG))
args->requested |= (1 << WORK_ITEM_ID_Y_ARG);
if (args->requested & (1 << WORK_ITEM_ID_Y_ARG))
args->requested |= (1 << WORK_ITEM_ID_X_ARG);
int sgpr_regno = FIRST_SGPR_REG;
args->nsgprs = 0;
for (int a = 0; a < GCN_KERNEL_ARG_TYPES; a++)
{
if (!(args->requested & (1 << a)))
continue;
if (gcn_kernel_arg_types[a].fixed_regno >= 0)
args->reg[a] = gcn_kernel_arg_types[a].fixed_regno;
else
{
int reg_count;
switch (gcn_kernel_arg_types[a].mode)
{
case E_SImode:
reg_count = 1;
break;
case E_DImode:
reg_count = 2;
break;
case E_TImode:
reg_count = 4;
break;
default:
gcc_unreachable ();
}
args->reg[a] = sgpr_regno;
sgpr_regno += reg_count;
if (gcn_kernel_arg_types[a].fixed_regno == -1)
args->nsgprs += reg_count;
}
}
if (sgpr_regno > FIRST_SGPR_REG + 16)
{
error ("too many arguments passed in sgpr registers");
}
return err;
}
/* Referenced by TARGET_ATTRIBUTE_TABLE.
Validates target specific attributes. */
static tree
gcn_handle_amdgpu_hsa_kernel_attribute (tree *node, tree name,
tree args, int, bool *no_add_attrs)
{
if (!FUNC_OR_METHOD_TYPE_P (*node))
{
warning (OPT_Wattributes, "%qE attribute only applies to functions",
name);
*no_add_attrs = true;
return NULL_TREE;
}
/* Validate the arguments of the amdgpu_hsa_kernel attribute. */
if (is_attribute_p ("amdgpu_hsa_kernel", name))
{
struct gcn_kernel_args kernelarg;
if (gcn_parse_amdgpu_hsa_kernel_attribute (&kernelarg, args))
*no_add_attrs = true;
return NULL_TREE;
}
return NULL_TREE;
}
/* Implement TARGET_ATTRIBUTE_TABLE.
Create target-specific __attribute__ types. */
static const struct attribute_spec gcn_attribute_table[] = {
/* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
affects_type_identity } */
{"amdgpu_hsa_kernel", 0, GCN_KERNEL_ARG_TYPES, false, true,
true, true, gcn_handle_amdgpu_hsa_kernel_attribute, NULL},
/* End element. */
{NULL, 0, 0, false, false, false, false, NULL, NULL}
};
/* }}} */
/* {{{ Registers and modes. */
/* Implement TARGET_SCALAR_MODE_SUPPORTED_P. */
bool
gcn_scalar_mode_supported_p (scalar_mode mode)
{
return (mode == BImode
|| mode == QImode
|| mode == HImode /* || mode == HFmode */
|| mode == SImode || mode == SFmode
|| mode == DImode || mode == DFmode
|| mode == TImode);
}
/* Implement TARGET_CLASS_MAX_NREGS.
Return the number of hard registers needed to hold a value of MODE in
a register of class RCLASS. */
static unsigned char
gcn_class_max_nregs (reg_class_t rclass, machine_mode mode)
{
/* Scalar registers are 32-bit; vector registers are in fact tuples of
64 lanes. */
if (rclass == VGPR_REGS)
{
if (vgpr_1reg_mode_p (mode))
return 1;
if (vgpr_2reg_mode_p (mode))
return 2;
/* TImode is used by DImode compare_and_swap. */
if (mode == TImode)
return 4;
}
else if (rclass == VCC_CONDITIONAL_REG && mode == BImode)
return 2;
return CEIL (GET_MODE_SIZE (mode), 4);
}
/* Implement TARGET_HARD_REGNO_NREGS.
Return the number of hard registers needed to hold a value of MODE in
REGNO. */
unsigned int
gcn_hard_regno_nregs (unsigned int regno, machine_mode mode)
{
return gcn_class_max_nregs (REGNO_REG_CLASS (regno), mode);
}
/* Implement TARGET_HARD_REGNO_MODE_OK.
Return true if REGNO can hold value in MODE. */
bool
gcn_hard_regno_mode_ok (unsigned int regno, machine_mode mode)
{
/* Treat a complex mode as if it were a scalar mode of the same overall
size for the purposes of allocating hard registers. */
if (COMPLEX_MODE_P (mode))
switch (mode)
{
case E_CQImode:
case E_CHImode:
mode = SImode;
break;
case E_CSImode:
mode = DImode;
break;
case E_CDImode:
mode = TImode;
break;
case E_HCmode:
mode = SFmode;
break;
case E_SCmode:
mode = DFmode;
break;
default:
/* Not supported. */
return false;
}
switch (regno)
{
case FLAT_SCRATCH_LO_REG:
case XNACK_MASK_LO_REG:
case TBA_LO_REG:
case TMA_LO_REG:
return (mode == SImode || mode == DImode);
case VCC_LO_REG:
case EXEC_LO_REG:
return (mode == BImode || mode == SImode || mode == DImode);
case M0_REG:
case FLAT_SCRATCH_HI_REG:
case XNACK_MASK_HI_REG:
case TBA_HI_REG:
case TMA_HI_REG:
return mode == SImode;
case VCC_HI_REG:
return false;
case EXEC_HI_REG:
return mode == SImode /*|| mode == V32BImode */ ;
case SCC_REG:
case VCCZ_REG:
case EXECZ_REG:
return mode == BImode;
}
if (regno == ARG_POINTER_REGNUM || regno == FRAME_POINTER_REGNUM)
return true;
if (SGPR_REGNO_P (regno))
/* We restrict double register values to aligned registers. */
return (sgpr_1reg_mode_p (mode)
|| (!((regno - FIRST_SGPR_REG) & 1) && sgpr_2reg_mode_p (mode))
|| (((regno - FIRST_SGPR_REG) & 3) == 0 && mode == TImode));
if (VGPR_REGNO_P (regno))
/* Vector instructions do not care about the alignment of register
pairs, but where there is no 64-bit instruction, many of the
define_splits do not work if the input and output registers partially
overlap. We tried to fix this with early clobber and match
constraints, but it was bug prone, added complexity, and conflicted
with the 'U0' constraints on vec_merge.
Therefore, we restrict ourselves to aligned registers. */
return (vgpr_1reg_mode_p (mode)
|| (!((regno - FIRST_VGPR_REG) & 1) && vgpr_2reg_mode_p (mode))
/* TImode is used by DImode compare_and_swap. */
|| (mode == TImode
&& !((regno - FIRST_VGPR_REG) & 3)));
return false;
}
/* Implement REGNO_REG_CLASS via gcn.h.
Return smallest class containing REGNO. */
enum reg_class
gcn_regno_reg_class (int regno)
{
switch (regno)
{
case SCC_REG:
return SCC_CONDITIONAL_REG;
case VCC_LO_REG:
case VCC_HI_REG:
return VCC_CONDITIONAL_REG;
case VCCZ_REG:
return VCCZ_CONDITIONAL_REG;
case EXECZ_REG:
return EXECZ_CONDITIONAL_REG;
case EXEC_LO_REG:
case EXEC_HI_REG:
return EXEC_MASK_REG;
}
if (VGPR_REGNO_P (regno))
return VGPR_REGS;
if (SGPR_REGNO_P (regno))
return SGPR_REGS;
if (regno < FIRST_VGPR_REG)
return GENERAL_REGS;
if (regno == ARG_POINTER_REGNUM || regno == FRAME_POINTER_REGNUM)
return AFP_REGS;
return ALL_REGS;
}
/* Implement TARGET_CAN_CHANGE_MODE_CLASS.
GCC assumes that the lowpart contains the first part of the value as
stored in memory.
This is not the case for vector registers. */
bool
gcn_can_change_mode_class (machine_mode from, machine_mode to,
reg_class_t regclass)
{
if (!vgpr_vector_mode_p (from) && !vgpr_vector_mode_p (to))
return true;
return (gcn_class_max_nregs (regclass, from)
== gcn_class_max_nregs (regclass, to));
}
/* Implement TARGET_SMALL_REGISTER_CLASSES_FOR_MODE_P.
When this hook returns true for MODE, the compiler allows
registers explicitly used in the rtl to be used as spill registers
but prevents the compiler from extending the lifetime of these
registers. */
bool
gcn_small_register_classes_for_mode_p (machine_mode mode)
{
/* We allocate into exec and vcc regs. Those make a small register class. */
return mode == DImode || mode == SImode;
}
/* Implement TARGET_CLASS_LIKELY_SPILLED_P.
Returns true if pseudos that have been assigned to registers of class RCLASS
would likely be spilled because registers of RCLASS are needed for spill
registers. */
static bool
gcn_class_likely_spilled_p (reg_class_t rclass)
{
return (rclass == EXEC_MASK_REG
|| reg_classes_intersect_p (ALL_CONDITIONAL_REGS, rclass));
}
/* Implement TARGET_MODES_TIEABLE_P.
Returns true if a value of MODE1 is accessible in MODE2 without
copying. */
bool
gcn_modes_tieable_p (machine_mode mode1, machine_mode mode2)
{
return (GET_MODE_BITSIZE (mode1) <= MAX_FIXED_MODE_SIZE
&& GET_MODE_BITSIZE (mode2) <= MAX_FIXED_MODE_SIZE);
}
/* Implement TARGET_TRULY_NOOP_TRUNCATION.
Returns true if it is safe to “convert” a value of INPREC bits to one of
OUTPREC bits (where OUTPREC is smaller than INPREC) by merely operating on
it as if it had only OUTPREC bits. */
bool
gcn_truly_noop_truncation (poly_uint64 outprec, poly_uint64 inprec)
{
return ((inprec <= 32) && (outprec <= inprec));
}
/* Return N-th part of value occupying multiple registers. */
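/* For illustration: part 0 of a DImode hard register R is (reg:SI R) and
part 1 is (reg:SI R+1), i.e. the low and high 32 bits respectively;
for vector values of 256 bytes or more, each part is a whole V64SImode
register. */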
rtx
gcn_operand_part (machine_mode mode, rtx op, int n)
{
if (GET_MODE_SIZE (mode) >= 256)
{
/*gcc_assert (GET_MODE_SIZE (mode) == 256 || n == 0); */
if (REG_P (op))
{
gcc_assert (REGNO (op) + n < FIRST_PSEUDO_REGISTER);
return gen_rtx_REG (V64SImode, REGNO (op) + n);
}
if (GET_CODE (op) == CONST_VECTOR)
{
int units = GET_MODE_NUNITS (mode);
rtvec v = rtvec_alloc (units);
for (int i = 0; i < units; ++i)
RTVEC_ELT (v, i) = gcn_operand_part (GET_MODE_INNER (mode),
CONST_VECTOR_ELT (op, i), n);
return gen_rtx_CONST_VECTOR (V64SImode, v);
}
if (GET_CODE (op) == UNSPEC && XINT (op, 1) == UNSPEC_VECTOR)
return gcn_gen_undef (V64SImode);
gcc_unreachable ();
}
else if (GET_MODE_SIZE (mode) == 8 && REG_P (op))
{
gcc_assert (REGNO (op) + n < FIRST_PSEUDO_REGISTER);
return gen_rtx_REG (SImode, REGNO (op) + n);
}
else
{
if (GET_CODE (op) == UNSPEC && XINT (op, 1) == UNSPEC_VECTOR)
return gcn_gen_undef (SImode);
/* If it's a constant then let's assume it is of the largest mode
available, otherwise simplify_gen_subreg will fail. */
if (mode == VOIDmode && CONST_INT_P (op))
mode = DImode;
return simplify_gen_subreg (SImode, op, mode, n * 4);
}
}
/* Return the N-th DImode part of a value occupying multiple registers. */
rtx
gcn_operand_doublepart (machine_mode mode, rtx op, int n)
{
return simplify_gen_subreg (DImode, op, mode, n * 8);
}
/* Return true if OP can be split into subregs or high/low parts.
This is always true for scalars, but not normally true for vectors.
However, for vectors in hardregs we can use the low and high registers. */
bool
gcn_can_split_p (machine_mode, rtx op)
{
if (vgpr_vector_mode_p (GET_MODE (op)))
{
if (GET_CODE (op) == SUBREG)
op = SUBREG_REG (op);
if (!REG_P (op))
return true;
return REGNO (op) <= FIRST_PSEUDO_REGISTER;
}
return true;
}
/* Implement TARGET_SPILL_CLASS.
Return class of registers which could be used for pseudo of MODE
and of class RCLASS for spilling instead of memory. Return NO_REGS
if it is not possible or non-profitable. */
static reg_class_t
gcn_spill_class (reg_class_t c, machine_mode /*mode */ )
{
if (reg_classes_intersect_p (ALL_CONDITIONAL_REGS, c)
|| c == VCC_CONDITIONAL_REG)
return SGPR_REGS;
else
return NO_REGS;
}
/* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
Change allocno class for given pseudo from allocno and best class
calculated by IRA. */
static reg_class_t
gcn_ira_change_pseudo_allocno_class (int regno, reg_class_t cl,
reg_class_t best_cl)
{
/* Avoid returning classes that contain both vgpr and sgpr registers. */
if (cl != ALL_REGS && cl != SRCDST_REGS && cl != ALL_GPR_REGS)
return cl;
if (best_cl != ALL_REGS && best_cl != SRCDST_REGS
&& best_cl != ALL_GPR_REGS)
return best_cl;
machine_mode mode = PSEUDO_REGNO_MODE (regno);
if (vgpr_vector_mode_p (mode))
return VGPR_REGS;
return GENERAL_REGS;
}
/* Create a new DImode pseudo reg and emit an instruction to initialize
it to VAL. */
static rtx
get_exec (int64_t val)
{
rtx reg = gen_reg_rtx (DImode);
emit_insn (gen_rtx_SET (reg, gen_int_mode (val, DImode)));
return reg;
}
/* Return value of scalar exec register. */
rtx
gcn_scalar_exec ()
{
return const1_rtx;
}
/* Return pseudo holding scalar exec register. */
rtx
gcn_scalar_exec_reg ()
{
return get_exec (1);
}
/* Return value of full exec register. */
rtx
gcn_full_exec ()
{
return constm1_rtx;
}
/* Return pseudo holding full exec register. */
rtx
gcn_full_exec_reg ()
{
return get_exec (-1);
}
/* }}} */
/* {{{ Immediate constants. */
/* Initialize shared numeric constants. */
static void
init_ext_gcn_constants (void)
{
real_from_integer (&dconst4, DFmode, 4, SIGNED);
/* FIXME: this constant probably does not match what hardware really loads.
Reality check it eventually. */
real_from_string (&dconst1over2pi,
"0.1591549430918953357663423455968866839");
real_convert (&dconst1over2pi, SFmode, &dconst1over2pi);
ext_gcn_constants_init = 1;
}
/* Return non-zero if X is a constant that can appear as an inline operand.
This is 0, 0.5, -0.5, 1, -1, 2, -2, 4, -4, 1/(2*pi)
Or a vector of those.
The value returned should be the encoding of this constant. */
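/* For reference, the encodings returned below match the hardware's inline
operand encoding: 0 -> 128, 0.5 -> 240, -0.5 -> 241, 1.0 -> 242,
-1.0 -> 243, 2.0 -> 244, -2.0 -> 245, 4.0 -> 246, -4.0 -> 247,
1/(2*pi) -> 248. */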
int
gcn_inline_fp_constant_p (rtx x, bool allow_vector)
{
machine_mode mode = GET_MODE (x);
if ((mode == V64HFmode || mode == V64SFmode || mode == V64DFmode)
&& allow_vector)
{
int n;
if (GET_CODE (x) != CONST_VECTOR)
return 0;
n = gcn_inline_fp_constant_p (CONST_VECTOR_ELT (x, 0), false);
if (!n)
return 0;
for (int i = 1; i < 64; i++)
if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
return 0;
return 1;
}
if (mode != HFmode && mode != SFmode && mode != DFmode)
return 0;
const REAL_VALUE_TYPE *r;
if (x == CONST0_RTX (mode))
return 128;
if (x == CONST1_RTX (mode))
return 242;
r = CONST_DOUBLE_REAL_VALUE (x);
/* Make sure the extra constants (4 and 1/(2*pi)) are initialized before
they are compared against. */
if (!ext_gcn_constants_init)
init_ext_gcn_constants ();
if (real_identical (r, &dconstm1))
return 243;
if (real_identical (r, &dconsthalf))
return 240;
if (real_identical (r, &dconst2))
return 244;
if (real_identical (r, &dconst4))
return 246;
if (real_identical (r, &dconst1over2pi))
return 248;
/* real_value_negate returns the negated value; it does not modify R. */
REAL_VALUE_TYPE negated = real_value_negate (r);
if (real_identical (&negated, &dconsthalf))
return 241;
if (real_identical (&negated, &dconst2))
return 245;
if (real_identical (&negated, &dconst4))
return 247;
/* FIXME: add 4, -4 and 1/(2*PI). */
return 0;
}
/* Return true if X is a floating-point constant that can appear as an
immediate operand (either an inline constant or a 32-bit literal),
or a vector of such a constant. */
bool
gcn_fp_constant_p (rtx x, bool allow_vector)
{
machine_mode mode = GET_MODE (x);
if ((mode == V64HFmode || mode == V64SFmode || mode == V64DFmode)
&& allow_vector)
{
int n;
if (GET_CODE (x) != CONST_VECTOR)
return false;
n = gcn_fp_constant_p (CONST_VECTOR_ELT (x, 0), false);
if (!n)
return false;
for (int i = 1; i < 64; i++)
if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
return false;
return true;
}
if (mode != HFmode && mode != SFmode && mode != DFmode)
return false;
if (gcn_inline_fp_constant_p (x, false))
return true;
/* FIXME: It is not clear how 32bit immediates are interpreted here. */
return (mode != DFmode);
}
/* Return true if X is a constant representable as an inline immediate
constant in a 32-bit instruction encoding. */
bool
gcn_inline_constant_p (rtx x)
{
if (GET_CODE (x) == CONST_INT)
return INTVAL (x) >= -16 && INTVAL (x) <= 64;
if (GET_CODE (x) == CONST_DOUBLE)
return gcn_inline_fp_constant_p (x, false);
if (GET_CODE (x) == CONST_VECTOR)
{
int n;
if (!vgpr_vector_mode_p (GET_MODE (x)))
return false;
n = gcn_inline_constant_p (CONST_VECTOR_ELT (x, 0));
if (!n)
return false;
for (int i = 1; i < 64; i++)
if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
return false;
return 1;
}
return false;
}
/* Return true if X is a constant representable as an immediate constant
in a 32 or 64-bit instruction encoding. */
bool
gcn_constant_p (rtx x)
{
switch (GET_CODE (x))
{
case CONST_INT:
return true;
case CONST_DOUBLE:
return gcn_fp_constant_p (x, false);
case CONST_VECTOR:
{
int n;
if (!vgpr_vector_mode_p (GET_MODE (x)))
return false;
n = gcn_constant_p (CONST_VECTOR_ELT (x, 0));
if (!n)
return false;
for (int i = 1; i < 64; i++)
if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
return false;
return true;
}
case SYMBOL_REF:
case LABEL_REF:
return true;
default:
;
}
return false;
}
/* Return true if X is a constant representable as two inline immediate
constants in a 64-bit instruction that is split into two 32-bit
instructions.
When MIXED is set, the low-part is permitted to use the full 32-bits. */
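/* For example, the DImode constant 0x0000002a0000000f splits into a high
part of 42 and a low part of 15; both fit the inline range -16..64, so
it is accepted. With MIXED set, only the high part must be inline. */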
bool
gcn_inline_constant64_p (rtx x, bool mixed)
{
if (GET_CODE (x) == CONST_VECTOR)
{
if (!vgpr_vector_mode_p (GET_MODE (x)))
return false;
if (!gcn_inline_constant64_p (CONST_VECTOR_ELT (x, 0), mixed))
return false;
for (int i = 1; i < 64; i++)
if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
return false;
return true;
}
if (GET_CODE (x) != CONST_INT)
return false;
rtx val_lo = gcn_operand_part (DImode, x, 0);
rtx val_hi = gcn_operand_part (DImode, x, 1);
return ((mixed || gcn_inline_constant_p (val_lo))
&& gcn_inline_constant_p (val_hi));
}
/* Return true if X is a constant representable as an immediate constant
in a 32 or 64-bit instruction encoding where the hardware will
extend the immediate to 64-bits. */
bool
gcn_constant64_p (rtx x)
{
if (!gcn_constant_p (x))
return false;
if (GET_CODE (x) != CONST_INT)
return true;
/* Negative numbers are only allowed if they can be encoded within src0,
because the 32-bit immediates do not get sign-extended.
Unsigned numbers must not be encodable as 32-bit -1..-16, because the
assembler will use a src0 inline immediate and that will get
sign-extended. */
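/* For example, the positive constant 0xfffffff5 is rejected here: the
assembler would emit it as the inline immediate -11, which the hardware
would then sign-extend to 64 bits. */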
HOST_WIDE_INT val = INTVAL (x);
return (((val & 0xffffffff) == val /* Positive 32-bit. */
&& (val & 0xfffffff0) != 0xfffffff0) /* Not -1..-16. */
|| gcn_inline_constant_p (x)); /* Src0. */
}
/* Implement TARGET_LEGITIMATE_CONSTANT_P.
Returns true if X is a legitimate constant for a MODE immediate operand. */
bool
gcn_legitimate_constant_p (machine_mode, rtx x)
{
return gcn_constant_p (x);
}
/* Return true if X is a CONST_VECTOR of a single (duplicated) constant. */
static bool
single_cst_vector_p (rtx x)
{
if (GET_CODE (x) != CONST_VECTOR)
return false;
for (int i = 1; i < 64; i++)
if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
return false;
return true;
}
/* Create a CONST_VECTOR of duplicated value A. */
rtx
gcn_vec_constant (machine_mode mode, int a)
{
/*if (!a)
return CONST0_RTX (mode);
if (a == -1)
return CONSTM1_RTX (mode);
if (a == 1)
return CONST1_RTX (mode);
if (a == 2)
return CONST2_RTX (mode);*/
int units = GET_MODE_NUNITS (mode);
machine_mode innermode = GET_MODE_INNER (mode);
rtx tem;
if (FLOAT_MODE_P (innermode))
{
REAL_VALUE_TYPE rv;
real_from_integer (&rv, NULL, a, SIGNED);
tem = const_double_from_real_value (rv, innermode);
}
else
tem = gen_int_mode (a, innermode);
rtvec v = rtvec_alloc (units);
for (int i = 0; i < units; ++i)
RTVEC_ELT (v, i) = tem;
return gen_rtx_CONST_VECTOR (mode, v);
}
/* Create a CONST_VECTOR of duplicated value A. */
rtx
gcn_vec_constant (machine_mode mode, rtx a)
{
int units = GET_MODE_NUNITS (mode);
rtvec v = rtvec_alloc (units);
for (int i = 0; i < units; ++i)
RTVEC_ELT (v, i) = a;
return gen_rtx_CONST_VECTOR (mode, v);
}
/* Create an undefined vector value, used where an insn operand is
optional. */
rtx
gcn_gen_undef (machine_mode mode)
{
return gen_rtx_UNSPEC (mode, gen_rtvec (1, const0_rtx), UNSPEC_VECTOR);
}
/* }}} */
/* {{{ Addresses, pointers and moves. */
/* Return true if REG is a valid place to store a pointer,
for instructions that require an SGPR.
FIXME rename. */
static bool
gcn_address_register_p (rtx reg, machine_mode mode, bool strict)
{
if (GET_CODE (reg) == SUBREG)
reg = SUBREG_REG (reg);
if (!REG_P (reg))
return false;
if (GET_MODE (reg) != mode)
return false;
int regno = REGNO (reg);
if (regno >= FIRST_PSEUDO_REGISTER)
{
if (!strict)
return true;
if (!reg_renumber)
return false;
regno = reg_renumber[regno];
}
return (SGPR_REGNO_P (regno) || regno == M0_REG
|| regno == ARG_POINTER_REGNUM || regno == FRAME_POINTER_REGNUM);
}
/* Return true if REG is a valid place to store a pointer,
for instructions that require a VGPR. */
static bool
gcn_vec_address_register_p (rtx reg, machine_mode mode, bool strict)
{
if (GET_CODE (reg) == SUBREG)
reg = SUBREG_REG (reg);
if (!REG_P (reg))
return false;
if (GET_MODE (reg) != mode)
return false;
int regno = REGNO (reg);
if (regno >= FIRST_PSEUDO_REGISTER)
{
if (!strict)
return true;
if (!reg_renumber)
return false;
regno = reg_renumber[regno];
}
return VGPR_REGNO_P (regno);
}
/* Return true if X would be valid inside a MEM using the Flat address
space. */
bool
gcn_flat_address_p (rtx x, machine_mode mode)
{
bool vec_mode = (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
|| GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT);
if (vec_mode && gcn_address_register_p (x, DImode, false))
return true;
if (!vec_mode && gcn_vec_address_register_p (x, DImode, false))
return true;
if (TARGET_GCN5_PLUS
&& GET_CODE (x) == PLUS
&& gcn_vec_address_register_p (XEXP (x, 0), DImode, false)
&& CONST_INT_P (XEXP (x, 1)))
return true;
return false;
}
/* Return true if X would be valid inside a MEM using the Scalar Flat
address space. */
bool
gcn_scalar_flat_address_p (rtx x)
{
if (gcn_address_register_p (x, DImode, false))
return true;
if (GET_CODE (x) == PLUS
&& gcn_address_register_p (XEXP (x, 0), DImode, false)
&& CONST_INT_P (XEXP (x, 1)))
return true;
return false;
}
/* Return true if MEM X would be valid for the Scalar Flat address space. */
bool
gcn_scalar_flat_mem_p (rtx x)
{
if (!MEM_P (x))
return false;
if (GET_MODE_SIZE (GET_MODE (x)) < 4)
return false;
return gcn_scalar_flat_address_p (XEXP (x, 0));
}
/* Return true if X would be valid inside a MEM using the LDS or GDS
address spaces. */
bool
gcn_ds_address_p (rtx x)
{
if (gcn_vec_address_register_p (x, SImode, false))
return true;
if (GET_CODE (x) == PLUS
&& gcn_vec_address_register_p (XEXP (x, 0), SImode, false)
&& CONST_INT_P (XEXP (x, 1)))
return true;
return false;
}
/* Return true if ADDR would be valid inside a MEM using the Global
address space. */
bool
gcn_global_address_p (rtx addr)
{
if (gcn_address_register_p (addr, DImode, false)
|| gcn_vec_address_register_p (addr, DImode, false))
return true;
if (GET_CODE (addr) == PLUS)
{
rtx base = XEXP (addr, 0);
rtx offset = XEXP (addr, 1);
bool immediate_p = (CONST_INT_P (offset)
&& INTVAL (offset) >= -(1 << 12)
&& INTVAL (offset) < (1 << 12));
if ((gcn_address_register_p (base, DImode, false)
|| gcn_vec_address_register_p (base, DImode, false))
&& immediate_p)
/* SGPR + CONST or VGPR + CONST */
return true;
if (gcn_address_register_p (base, DImode, false)
&& gcn_vgpr_register_operand (offset, SImode))
/* SGPR + VGPR */
return true;
if (GET_CODE (base) == PLUS
&& gcn_address_register_p (XEXP (base, 0), DImode, false)
&& gcn_vgpr_register_operand (XEXP (base, 1), SImode)
&& immediate_p)
/* (SGPR + VGPR) + CONST */
return true;
}
return false;
}
/* Implement TARGET_ADDR_SPACE_LEGITIMATE_ADDRESS_P.
Recognizes RTL expressions that are valid memory addresses for an
instruction. The MODE argument is the machine mode for the MEM
expression that wants to use this address.
It only recognizes addresses in canonical form. LEGITIMIZE_ADDRESS should
convert common non-canonical forms to canonical form so that they will
be recognized. */
static bool
gcn_addr_space_legitimate_address_p (machine_mode mode, rtx x, bool strict,
addr_space_t as)
{
/* All vector instructions need to work on addresses in registers. */
if (!TARGET_GCN5_PLUS && (vgpr_vector_mode_p (mode) && !REG_P (x)))
return false;
if (AS_SCALAR_FLAT_P (as))
{
if (mode == QImode || mode == HImode)
return 0;
switch (GET_CODE (x))
{
case REG:
return gcn_address_register_p (x, DImode, strict);
/* Addresses are in the form BASE+OFFSET, where OFFSET is either a 20-bit
unsigned immediate, an SGPR, or M0.
Writes and atomics do not accept SGPR. */
case PLUS:
{
rtx x0 = XEXP (x, 0);
rtx x1 = XEXP (x, 1);
if (!gcn_address_register_p (x0, DImode, strict))
return false;
/* FIXME: This is disabled because of the mode mismatch between
SImode (for the address or m0 register) and the DImode PLUS.
We'll need a zero_extend or similar.
if (gcn_m0_register_p (x1, SImode, strict)
|| gcn_address_register_p (x1, SImode, strict))
return true;
else*/
if (GET_CODE (x1) == CONST_INT)
{
if (INTVAL (x1) >= 0 && INTVAL (x1) < (1 << 20)
/* The low bits of the offset are ignored, even when
they're meant to realign the pointer. */
&& !(INTVAL (x1) & 0x3))
return true;
}
return false;
}
default:
break;
}
}
else if (AS_SCRATCH_P (as))
return gcn_address_register_p (x, SImode, strict);
else if (AS_FLAT_P (as) || AS_FLAT_SCRATCH_P (as))
{
if (TARGET_GCN3 || GET_CODE (x) == REG)
return ((GET_MODE_CLASS (mode) == MODE_VECTOR_INT
|| GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
? gcn_address_register_p (x, DImode, strict)
: gcn_vec_address_register_p (x, DImode, strict));
else
{
gcc_assert (TARGET_GCN5_PLUS);
if (GET_CODE (x) == PLUS)
{
rtx x1 = XEXP (x, 1);
if (VECTOR_MODE_P (mode)
? !gcn_address_register_p (x, DImode, strict)
: !gcn_vec_address_register_p (x, DImode, strict))
return false;
if (GET_CODE (x1) == CONST_INT)
{
if (INTVAL (x1) >= 0 && INTVAL (x1) < (1 << 12)
/* The low bits of the offset are ignored, even when
they're meant to realign the pointer. */
&& !(INTVAL (x1) & 0x3))
return true;
}
}
return false;
}
}
else if (AS_GLOBAL_P (as))
{
gcc_assert (TARGET_GCN5_PLUS);
if (GET_CODE (x) == REG)
return (gcn_address_register_p (x, DImode, strict)
|| (!VECTOR_MODE_P (mode)
&& gcn_vec_address_register_p (x, DImode, strict)));
else if (GET_CODE (x) == PLUS)
{
rtx base = XEXP (x, 0);
rtx offset = XEXP (x, 1);
bool immediate_p = (GET_CODE (offset) == CONST_INT
/* Signed 13-bit immediate. */
&& INTVAL (offset) >= -(1 << 12)
&& INTVAL (offset) < (1 << 12)
/* The low bits of the offset are ignored, even
when they're meant to realign the pointer. */
&& !(INTVAL (offset) & 0x3));
if (!VECTOR_MODE_P (mode))
{
if ((gcn_address_register_p (base, DImode, strict)
|| gcn_vec_address_register_p (base, DImode, strict))
&& immediate_p)
/* SGPR + CONST or VGPR + CONST */
return true;
if (gcn_address_register_p (base, DImode, strict)
&& gcn_vgpr_register_operand (offset, SImode))
/* SGPR + VGPR */
return true;
if (GET_CODE (base) == PLUS
&& gcn_address_register_p (XEXP (base, 0), DImode, strict)
&& gcn_vgpr_register_operand (XEXP (base, 1), SImode)
&& immediate_p)
/* (SGPR + VGPR) + CONST */
return true;
}
else
{
if (gcn_address_register_p (base, DImode, strict)
&& immediate_p)
/* SGPR + CONST */
return true;
}
}
else
return false;
}
else if (AS_ANY_DS_P (as))
switch (GET_CODE (x))
{
case REG:
return (VECTOR_MODE_P (mode)
? gcn_address_register_p (x, SImode, strict)
: gcn_vec_address_register_p (x, SImode, strict));
/* Addresses are in the form BASE+OFFSET, where OFFSET is either a 20-bit
unsigned immediate, an SGPR, or M0.
Writes and atomics do not accept SGPR. */
case PLUS:
{
rtx x0 = XEXP (x, 0);
rtx x1 = XEXP (x, 1);
if (!gcn_vec_address_register_p (x0, DImode, strict))
return false;
if (GET_CODE (x1) == REG)
{
if (REGNO (x1) <= FIRST_PSEUDO_REGISTER
&& !gcn_ssrc_register_operand (x1, DImode))
return false;
}
else if (GET_CODE (x1) == CONST_VECTOR
&& GET_CODE (CONST_VECTOR_ELT (x1, 0)) == CONST_INT
&& single_cst_vector_p (x1))
{
x1 = CONST_VECTOR_ELT (x1, 0);
if (INTVAL (x1) >= 0 && INTVAL (x1) < (1 << 20))
return true;
}
return false;
}
default:
break;
}
else
gcc_unreachable ();
return false;
}
/* Implement TARGET_ADDR_SPACE_POINTER_MODE.
Return the appropriate mode for a named address pointer. */
static scalar_int_mode
gcn_addr_space_pointer_mode (addr_space_t addrspace)
{
switch (addrspace)
{
case ADDR_SPACE_SCRATCH:
case ADDR_SPACE_LDS:
case ADDR_SPACE_GDS:
return SImode;
case ADDR_SPACE_DEFAULT:
case ADDR_SPACE_FLAT:
case ADDR_SPACE_FLAT_SCRATCH:
case ADDR_SPACE_SCALAR_FLAT:
return DImode;
default:
gcc_unreachable ();
}
}
/* Implement TARGET_ADDR_SPACE_ADDRESS_MODE.
Return the appropriate mode for a named address space address. */
static scalar_int_mode
gcn_addr_space_address_mode (addr_space_t addrspace)
{
return gcn_addr_space_pointer_mode (addrspace);
}
/* Implement TARGET_ADDR_SPACE_SUBSET_P.
Determine if one named address space is a subset of another. */
static bool
gcn_addr_space_subset_p (addr_space_t subset, addr_space_t superset)
{
if (subset == superset)
return true;
/* FIXME is this true? */
if (AS_FLAT_P (superset) || AS_SCALAR_FLAT_P (superset))
return true;
return false;
}
/* Convert from one address space to another. */
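/* The only non-trivial case below is LDS -> FLAT: the flat address is
formed by placing the 32-bit LDS offset in the low half of a DImode
value, and the group segment aperture (loaded from the queue object at
byte offset 64) in the high half. */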
static rtx
gcn_addr_space_convert (rtx op, tree from_type, tree to_type)
{
gcc_assert (POINTER_TYPE_P (from_type));
gcc_assert (POINTER_TYPE_P (to_type));
addr_space_t as_from = TYPE_ADDR_SPACE (TREE_TYPE (from_type));
addr_space_t as_to = TYPE_ADDR_SPACE (TREE_TYPE (to_type));
if (AS_LDS_P (as_from) && AS_FLAT_P (as_to))
{
rtx queue = gen_rtx_REG (DImode,
cfun->machine->args.reg[QUEUE_PTR_ARG]);
rtx group_seg_aperture_hi = gen_rtx_MEM (SImode,
gen_rtx_PLUS (DImode, queue,
gen_int_mode (64, SImode)));
rtx tmp = gen_reg_rtx (DImode);
emit_move_insn (gen_lowpart (SImode, tmp), op);
emit_move_insn (gen_highpart_mode (SImode, DImode, tmp),
group_seg_aperture_hi);
return tmp;
}
else if (as_from == as_to)
return op;
else
gcc_unreachable ();
}
/* Implement TARGET_ADDR_SPACE_DEBUG.
Return the dwarf address space class for each hardware address space. */
static int
gcn_addr_space_debug (addr_space_t as)
{
switch (as)
{
case ADDR_SPACE_DEFAULT:
case ADDR_SPACE_FLAT:
case ADDR_SPACE_SCALAR_FLAT:
case ADDR_SPACE_FLAT_SCRATCH:
return DW_ADDR_none;
case ADDR_SPACE_GLOBAL:
return 1; // DW_ADDR_LLVM_global
case ADDR_SPACE_LDS:
return 3; // DW_ADDR_LLVM_group
case ADDR_SPACE_SCRATCH:
return 4; // DW_ADDR_LLVM_private
case ADDR_SPACE_GDS:
return 0x8000; // DW_ADDR_AMDGPU_region
}
gcc_unreachable ();
}
/* Implement REGNO_MODE_CODE_OK_FOR_BASE_P via gcn.h
Return true if REGNO is OK for memory addressing. */
bool
gcn_regno_mode_code_ok_for_base_p (int regno,
machine_mode, addr_space_t as, int, int)
{
if (regno >= FIRST_PSEUDO_REGISTER)
{
if (reg_renumber)
regno = reg_renumber[regno];
else
return true;
}
if (AS_FLAT_P (as))
return (VGPR_REGNO_P (regno)
|| regno == ARG_POINTER_REGNUM || regno == FRAME_POINTER_REGNUM);
else if (AS_SCALAR_FLAT_P (as))
return (SGPR_REGNO_P (regno)
|| regno == ARG_POINTER_REGNUM || regno == FRAME_POINTER_REGNUM);
else if (AS_GLOBAL_P (as))
{
return (SGPR_REGNO_P (regno)
|| VGPR_REGNO_P (regno)
|| regno == ARG_POINTER_REGNUM
|| regno == FRAME_POINTER_REGNUM);
}
else
/* For now. */
return false;
}
/* Implement MODE_CODE_BASE_REG_CLASS via gcn.h.
Return a suitable register class for memory addressing. */
reg_class
gcn_mode_code_base_reg_class (machine_mode mode, addr_space_t as, int oc,
int ic)
{
switch (as)
{
case ADDR_SPACE_DEFAULT:
return gcn_mode_code_base_reg_class (mode, DEFAULT_ADDR_SPACE, oc, ic);
case ADDR_SPACE_SCALAR_FLAT:
case ADDR_SPACE_SCRATCH:
return SGPR_REGS;
break;
case ADDR_SPACE_FLAT:
case ADDR_SPACE_FLAT_SCRATCH:
case ADDR_SPACE_LDS:
case ADDR_SPACE_GDS:
return ((GET_MODE_CLASS (mode) == MODE_VECTOR_INT
|| GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
? SGPR_REGS : VGPR_REGS);
case ADDR_SPACE_GLOBAL:
return ((GET_MODE_CLASS (mode) == MODE_VECTOR_INT
|| GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
? SGPR_REGS : ALL_GPR_REGS);
}
gcc_unreachable ();
}
/* Implement REGNO_OK_FOR_INDEX_P via gcn.h.
Return true if REGNO is OK for index of memory addressing. */
bool
regno_ok_for_index_p (int regno)
{
if (regno >= FIRST_PSEUDO_REGISTER)
{
if (reg_renumber)
regno = reg_renumber[regno];
else
return true;
}
return regno == M0_REG || VGPR_REGNO_P (regno);
}
/* Generate move which uses the exec flags. If EXEC is NULL, then it is
assumed that all lanes normally relevant to the mode of the move are
affected. If PREV is NULL, then a sensible default is supplied for
the inactive lanes. */
static rtx
gen_mov_with_exec (rtx op0, rtx op1, rtx exec = NULL, rtx prev = NULL)
{
machine_mode mode = GET_MODE (op0);
if (vgpr_vector_mode_p (mode))
{
if (exec && exec != CONSTM1_RTX (DImode))
{
if (!prev)
prev = op0;
}
else
{
if (!prev)
prev = gcn_gen_undef (mode);
exec = gcn_full_exec_reg ();
}
rtx set = gen_rtx_SET (op0, gen_rtx_VEC_MERGE (mode, op1, prev, exec));
return gen_rtx_PARALLEL (VOIDmode,
gen_rtvec (2, set,
gen_rtx_CLOBBER (VOIDmode,
gen_rtx_SCRATCH (V64DImode))));
}
return (gen_rtx_PARALLEL
(VOIDmode,
gen_rtvec (2, gen_rtx_SET (op0, op1),
gen_rtx_USE (VOIDmode,
exec ? exec : gcn_scalar_exec ()))));
}
/* Generate masked move. */
static rtx
gen_duplicate_load (rtx op0, rtx op1, rtx op2 = NULL, rtx exec = NULL)
{
if (exec)
return (gen_rtx_SET (op0,
gen_rtx_VEC_MERGE (GET_MODE (op0),
gen_rtx_VEC_DUPLICATE (GET_MODE
(op0), op1),
op2, exec)));
else
return (gen_rtx_SET (op0, gen_rtx_VEC_DUPLICATE (GET_MODE (op0), op1)));
}
/* Expand vector init of OP0 by VEC.
Implements vec_init instruction pattern. */
void
gcn_expand_vector_init (rtx op0, rtx vec)
{
int64_t initialized_mask = 0;
int64_t curr_mask = 1;
machine_mode mode = GET_MODE (op0);
rtx val = XVECEXP (vec, 0, 0);
for (int i = 1; i < 64; i++)
if (rtx_equal_p (val, XVECEXP (vec, 0, i)))
curr_mask |= (int64_t) 1 << i;
if (gcn_constant_p (val))
emit_move_insn (op0, gcn_vec_constant (mode, val));
else
{
val = force_reg (GET_MODE_INNER (mode), val);
emit_insn (gen_duplicate_load (op0, val));
}
initialized_mask |= curr_mask;
for (int i = 1; i < 64; i++)
if (!(initialized_mask & ((int64_t) 1 << i)))
{
curr_mask = (int64_t) 1 << i;
rtx val = XVECEXP (vec, 0, i);
for (int j = i + 1; j < 64; j++)
if (rtx_equal_p (val, XVECEXP (vec, 0, j)))
curr_mask |= (int64_t) 1 << j;
if (gcn_constant_p (val))
emit_insn (gen_mov_with_exec (op0, gcn_vec_constant (mode, val),
get_exec (curr_mask)));
else
{
val = force_reg (GET_MODE_INNER (mode), val);
emit_insn (gen_duplicate_load (op0, val, op0,
get_exec (curr_mask)));
}
initialized_mask |= curr_mask;
}
}
/* Load vector constant where n-th lane contains BASE+n*VAL. */
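/* The expansion below builds the ramp in six masked steps: each step adds
VAL times a power of two to the lanes selected by the corresponding EXEC
mask, so, for example, BASE=0 and VAL=1 leaves lane N holding N after
all six additions. */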
static rtx
strided_constant (machine_mode mode, int base, int val)
{
rtx x = gen_reg_rtx (mode);
emit_move_insn (x, gcn_vec_constant (mode, base));
emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 32),
x, get_exec (0xffffffff00000000)));
emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 16),
x, get_exec (0xffff0000ffff0000)));
emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 8),
x, get_exec (0xff00ff00ff00ff00)));
emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 4),
x, get_exec (0xf0f0f0f0f0f0f0f0)));
emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 2),
x, get_exec (0xcccccccccccccccc)));
emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 1),
x, get_exec (0xaaaaaaaaaaaaaaaa)));
return x;
}
/* Implement TARGET_ADDR_SPACE_LEGITIMIZE_ADDRESS. */
static rtx
gcn_addr_space_legitimize_address (rtx x, rtx old, machine_mode mode,
addr_space_t as)
{
switch (as)
{
case ADDR_SPACE_DEFAULT:
return gcn_addr_space_legitimize_address (x, old, mode,
DEFAULT_ADDR_SPACE);
case ADDR_SPACE_SCALAR_FLAT:
case ADDR_SPACE_SCRATCH:
/* Instructions working on vectors need the address to be in
a register. */
if (vgpr_vector_mode_p (mode))
return force_reg (GET_MODE (x), x);
return x;
case ADDR_SPACE_FLAT:
case ADDR_SPACE_FLAT_SCRATCH:
case ADDR_SPACE_GLOBAL:
return TARGET_GCN3 ? force_reg (DImode, x) : x;
case ADDR_SPACE_LDS:
case ADDR_SPACE_GDS:
/* FIXME: LDS supports offsets; handle them! */
if (vgpr_vector_mode_p (mode) && GET_MODE (x) != V64SImode)
{
rtx addrs = gen_reg_rtx (V64SImode);
rtx base = force_reg (SImode, x);
rtx offsets = strided_constant (V64SImode, 0,
GET_MODE_UNIT_SIZE (mode));
emit_insn (gen_vec_duplicatev64si (addrs, base));
emit_insn (gen_addv64si3 (addrs, offsets, addrs));
return addrs;
}
return x;
}
gcc_unreachable ();
}
/* Convert a (mem:<MODE> (reg:DI)) to (mem:<MODE> (reg:V64DI)) with the
proper vector of stepped addresses.
MEM will be a DImode address of a vector in an SGPR.
TMP will be a V64DImode VGPR pair or (scratch:V64DI). */
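/* Roughly, the code below computes, for each lane N, the address
MEM_BASE + N * <element size> (+ MEM_INDEX), using the lane-index ramp
assumed to be held in VGPR 1 (the RAMP register below), shifted left by
log2 of the element size. */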
rtx
gcn_expand_scalar_to_vector_address (machine_mode mode, rtx exec, rtx mem,
rtx tmp)
{
gcc_assert (MEM_P (mem));
rtx mem_base = XEXP (mem, 0);
rtx mem_index = NULL_RTX;
if (!TARGET_GCN5_PLUS)
{
/* gcn_addr_space_legitimize_address should have put the address in a
register. If not, it is too late to do anything about it. */
gcc_assert (REG_P (mem_base));
}
if (GET_CODE (mem_base) == PLUS)
{
mem_index = XEXP (mem_base, 1);
mem_base = XEXP (mem_base, 0);
}
/* RF and RM base registers for vector modes should always be an SGPR. */
gcc_assert (SGPR_REGNO_P (REGNO (mem_base))
|| REGNO (mem_base) >= FIRST_PSEUDO_REGISTER);
machine_mode inner = GET_MODE_INNER (mode);
int shift = exact_log2 (GET_MODE_SIZE (inner));
rtx ramp = gen_rtx_REG (V64SImode, VGPR_REGNO (1));
rtx undef_v64si = gcn_gen_undef (V64SImode);
rtx new_base = NULL_RTX;
addr_space_t as = MEM_ADDR_SPACE (mem);
rtx tmplo = (REG_P (tmp)
? gcn_operand_part (V64DImode, tmp, 0)
: gen_reg_rtx (V64SImode));
/* tmplo[:] = ramp[:] << shift */
if (exec)
emit_insn (gen_ashlv64si3_exec (tmplo, ramp,
gen_int_mode (shift, SImode),
undef_v64si, exec));
else
emit_insn (gen_ashlv64si3 (tmplo, ramp, gen_int_mode (shift, SImode)));
if (AS_FLAT_P (as))
{
rtx vcc = gen_rtx_REG (DImode, CC_SAVE_REG);
if (REG_P (tmp))
{
rtx mem_base_lo = gcn_operand_part (DImode, mem_base, 0);
rtx mem_base_hi = gcn_operand_part (DImode, mem_base, 1);
rtx tmphi = gcn_operand_part (V64DImode, tmp, 1);
/* tmphi[:] = mem_base_hi */
if (exec)
emit_insn (gen_vec_duplicatev64si_exec (tmphi, mem_base_hi,
undef_v64si, exec));
else
emit_insn (gen_vec_duplicatev64si (tmphi, mem_base_hi));
/* tmp[:] += zext (mem_base) */
if (exec)
{
emit_insn (gen_addv64si3_vcc_dup_exec (tmplo, mem_base_lo, tmplo,
vcc, undef_v64si, exec));
emit_insn (gen_addcv64si3_exec (tmphi, tmphi, const0_rtx,
vcc, vcc, undef_v64si, exec));
}
else
emit_insn (gen_addv64di3_vcc_zext_dup (tmp, mem_base_lo, tmp, vcc));
}
else
{
tmp = gen_reg_rtx (V64DImode);
if (exec)
emit_insn (gen_addv64di3_vcc_zext_dup2_exec
(tmp, tmplo, mem_base, vcc, gcn_gen_undef (V64DImode),
exec));
else
emit_insn (gen_addv64di3_vcc_zext_dup2 (tmp, tmplo, mem_base, vcc));
}
new_base = tmp;
}
else if (AS_ANY_DS_P (as))
{
if (!exec)
emit_insn (gen_addv64si3_dup (tmplo, tmplo, mem_base));
else
emit_insn (gen_addv64si3_dup_exec (tmplo, tmplo, mem_base,
gcn_gen_undef (V64SImode), exec));
new_base = tmplo;
}
else
{
mem_base = gen_rtx_VEC_DUPLICATE (V64DImode, mem_base);
new_base = gen_rtx_PLUS (V64DImode, mem_base,
gen_rtx_SIGN_EXTEND (V64DImode, tmplo));
}
return gen_rtx_PLUS (GET_MODE (new_base), new_base,
gen_rtx_VEC_DUPLICATE (GET_MODE (new_base),
(mem_index ? mem_index
: const0_rtx)));
}
/* Convert a BASE address, a vector of OFFSETS, and a SCALE, to addresses
suitable for the given address space. This is intended for use in
gather/scatter patterns.
The offsets may be signed or unsigned, according to UNSIGNED_P.
If EXEC is set then _exec patterns will be used, otherwise plain.
Return values:
ADDR_SPACE_FLAT - return V64DImode vector of absolute addresses.
ADDR_SPACE_GLOBAL - return V64SImode vector of offsets. */
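/* A sketch of the computation below: if SCALE is a power of two, OFFSETS
is shifted left by log2(SCALE), otherwise it is multiplied by SCALE;
then, for ADDR_SPACE_FLAT or signed offsets, the 32-bit results are
sign- or zero-extended and added to BASE to form 64-bit addresses,
while for unsigned ADDR_SPACE_GLOBAL the 32-bit offsets are returned
as-is. */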
rtx
gcn_expand_scaled_offsets (addr_space_t as, rtx base, rtx offsets, rtx scale,
bool unsigned_p, rtx exec)
{
rtx tmpsi = gen_reg_rtx (V64SImode);
rtx tmpdi = gen_reg_rtx (V64DImode);
rtx undefsi = exec ? gcn_gen_undef (V64SImode) : NULL;
rtx undefdi = exec ? gcn_gen_undef (V64DImode) : NULL;
if (CONST_INT_P (scale)
&& INTVAL (scale) > 0
&& exact_log2 (INTVAL (scale)) >= 0)
emit_insn (gen_ashlv64si3 (tmpsi, offsets,
GEN_INT (exact_log2 (INTVAL (scale)))));
else
(exec
? emit_insn (gen_mulv64si3_dup_exec (tmpsi, offsets, scale, undefsi,
exec))
: emit_insn (gen_mulv64si3_dup (tmpsi, offsets, scale)));
/* "Global" instructions do not support negative register offsets. */
if (as == ADDR_SPACE_FLAT || !unsigned_p)
{
if (unsigned_p)
(exec
? emit_insn (gen_addv64di3_zext_dup2_exec (tmpdi, tmpsi, base,
undefdi, exec))
: emit_insn (gen_addv64di3_zext_dup2 (tmpdi, tmpsi, base)));
else
(exec
? emit_insn (gen_addv64di3_sext_dup2_exec (tmpdi, tmpsi, base,
undefdi, exec))
: emit_insn (gen_addv64di3_sext_dup2 (tmpdi, tmpsi, base)));
return tmpdi;
}
else if (as == ADDR_SPACE_GLOBAL)
return tmpsi;
gcc_unreachable ();
}
/* Return true if move from OP0 to OP1 is known to be executed in vector
unit. */
bool
gcn_vgpr_move_p (rtx op0, rtx op1)
{
if (MEM_P (op0) && AS_SCALAR_FLAT_P (MEM_ADDR_SPACE (op0)))
return true;
if (MEM_P (op1) && AS_SCALAR_FLAT_P (MEM_ADDR_SPACE (op1)))
return true;
return ((REG_P (op0) && VGPR_REGNO_P (REGNO (op0)))
|| (REG_P (op1) && VGPR_REGNO_P (REGNO (op1)))
|| vgpr_vector_mode_p (GET_MODE (op0)));
}
/* Return true if move from OP0 to OP1 is known to be executed in scalar
unit. Used in the machine description. */
bool
gcn_sgpr_move_p (rtx op0, rtx op1)
{
if (MEM_P (op0) && AS_SCALAR_FLAT_P (MEM_ADDR_SPACE (op0)))
return true;
if (MEM_P (op1) && AS_SCALAR_FLAT_P (MEM_ADDR_SPACE (op1)))
return true;
if (!REG_P (op0) || REGNO (op0) >= FIRST_PSEUDO_REGISTER
|| VGPR_REGNO_P (REGNO (op0)))
return false;
if (REG_P (op1)
&& REGNO (op1) < FIRST_PSEUDO_REGISTER
&& !VGPR_REGNO_P (REGNO (op1)))
return true;
return immediate_operand (op1, VOIDmode) || memory_operand (op1, VOIDmode);
}
/* Implement TARGET_SECONDARY_RELOAD.
The address space determines which registers can be used for loads and
stores. */
static reg_class_t
gcn_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
machine_mode reload_mode, secondary_reload_info *sri)
{
reg_class_t result = NO_REGS;
bool spilled_pseudo =
(REG_P (x) || GET_CODE (x) == SUBREG) && true_regnum (x) == -1;
if (dump_file && (dump_flags & TDF_DETAILS))
{
fprintf (dump_file, "gcn_secondary_reload: ");
dump_value_slim (dump_file, x, 1);
fprintf (dump_file, " %s %s:%s", (in_p ? "->" : "<-"),
reg_class_names[rclass], GET_MODE_NAME (reload_mode));
if (REG_P (x) || GET_CODE (x) == SUBREG)
fprintf (dump_file, " (true regnum: %d \"%s\")", true_regnum (x),
(true_regnum (x) >= 0
&& true_regnum (x) < FIRST_PSEUDO_REGISTER
? reg_names[true_regnum (x)]
: (spilled_pseudo ? "stack spill" : "??")));
fprintf (dump_file, "\n");
}
/* Some callers don't use or initialize icode. */
sri->icode = CODE_FOR_nothing;
if (MEM_P (x) || spilled_pseudo)
{
addr_space_t as = DEFAULT_ADDR_SPACE;
/* If we have a spilled pseudo, we can't find the address space
directly, but we know it's in ADDR_SPACE_FLAT space for GCN3 or
ADDR_SPACE_GLOBAL for GCN5. */
if (MEM_P (x))
as = MEM_ADDR_SPACE (x);
if (as == ADDR_SPACE_DEFAULT)
as = DEFAULT_ADDR_SPACE;
switch (as)
{
case ADDR_SPACE_SCALAR_FLAT:
result =
((!MEM_P (x) || rclass == SGPR_REGS) ? NO_REGS : SGPR_REGS);
break;
case ADDR_SPACE_FLAT:
case ADDR_SPACE_FLAT_SCRATCH:
case ADDR_SPACE_GLOBAL:
if (GET_MODE_CLASS (reload_mode) == MODE_VECTOR_INT
|| GET_MODE_CLASS (reload_mode) == MODE_VECTOR_FLOAT)
{
if (in_p)
switch (reload_mode)
{
case E_V64SImode:
sri->icode = CODE_FOR_reload_inv64si;
break;
case E_V64SFmode:
sri->icode = CODE_FOR_reload_inv64sf;
break;
case E_V64HImode:
sri->icode = CODE_FOR_reload_inv64hi;
break;
case E_V64HFmode:
sri->icode = CODE_FOR_reload_inv64hf;
break;
case E_V64QImode:
sri->icode = CODE_FOR_reload_inv64qi;
break;
case E_V64DImode:
sri->icode = CODE_FOR_reload_inv64di;
break;
case E_V64DFmode:
sri->icode = CODE_FOR_reload_inv64df;
break;
default:
gcc_unreachable ();
}
else
switch (reload_mode)
{
case E_V64SImode:
sri->icode = CODE_FOR_reload_outv64si;
break;
case E_V64SFmode:
sri->icode = CODE_FOR_reload_outv64sf;
break;
case E_V64HImode:
sri->icode = CODE_FOR_reload_outv64hi;
break;
case E_V64HFmode:
sri->icode = CODE_FOR_reload_outv64hf;
break;
case E_V64QImode:
sri->icode = CODE_FOR_reload_outv64qi;
break;
case E_V64DImode:
sri->icode = CODE_FOR_reload_outv64di;
break;
case E_V64DFmode:
sri->icode = CODE_FOR_reload_outv64df;
break;
default:
gcc_unreachable ();
}
break;
}
/* Fallthrough. */
case ADDR_SPACE_LDS:
case ADDR_SPACE_GDS:
case ADDR_SPACE_SCRATCH:
result = (rclass == VGPR_REGS ? NO_REGS : VGPR_REGS);
break;
}
}
if (dump_file && (dump_flags & TDF_DETAILS))
fprintf (dump_file, " <= %s (icode: %s)\n", reg_class_names[result],
get_insn_name (sri->icode));
return result;
}
/* Update register usage after having seen the compiler flags and kernel
attributes. We typically want to fix registers that contain values
set by the HSA runtime. */
static void
gcn_conditional_register_usage (void)
{
if (!cfun || !cfun->machine)
return;
if (cfun->machine->normal_function)
{
/* Restrict the set of SGPRs and VGPRs used by non-kernel functions. */
for (int i = SGPR_REGNO (MAX_NORMAL_SGPR_COUNT);
i <= LAST_SGPR_REG; i++)
fixed_regs[i] = 1, call_used_regs[i] = 1;
for (int i = VGPR_REGNO (MAX_NORMAL_VGPR_COUNT);
i <= LAST_VGPR_REG; i++)
fixed_regs[i] = 1, call_used_regs[i] = 1;
return;
}
/* If the set of requested args is the default set, nothing more needs to
be done. */
if (cfun->machine->args.requested == default_requested_args)
return;
/* Requesting a set of args different from the default violates the ABI. */
if (!leaf_function_p ())
warning (0, "A non-default set of initial values has been requested, "
"which violates the ABI");
for (int i = SGPR_REGNO (0); i < SGPR_REGNO (14); i++)
fixed_regs[i] = 0;
/* Fix the runtime argument register containing values that may be
needed later. DISPATCH_PTR_ARG and FLAT_SCRATCH_* should not be
needed after the prologue so there's no need to fix them. */
if (cfun->machine->args.reg[PRIVATE_SEGMENT_WAVE_OFFSET_ARG] >= 0)
fixed_regs[cfun->machine->args.reg[PRIVATE_SEGMENT_WAVE_OFFSET_ARG]] = 1;
if (cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG] >= 0)
{
/* The upper 32-bits of the 64-bit descriptor are not used, so allow
the containing registers to be used for other purposes. */
fixed_regs[cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG]] = 1;
fixed_regs[cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG] + 1] = 1;
}
if (cfun->machine->args.reg[KERNARG_SEGMENT_PTR_ARG] >= 0)
{
fixed_regs[cfun->machine->args.reg[KERNARG_SEGMENT_PTR_ARG]] = 1;
fixed_regs[cfun->machine->args.reg[KERNARG_SEGMENT_PTR_ARG] + 1] = 1;
}
if (cfun->machine->args.reg[DISPATCH_PTR_ARG] >= 0)
{
fixed_regs[cfun->machine->args.reg[DISPATCH_PTR_ARG]] = 1;
fixed_regs[cfun->machine->args.reg[DISPATCH_PTR_ARG] + 1] = 1;
}
if (cfun->machine->args.reg[WORKGROUP_ID_X_ARG] >= 0)
fixed_regs[cfun->machine->args.reg[WORKGROUP_ID_X_ARG]] = 1;
if (cfun->machine->args.reg[WORK_ITEM_ID_X_ARG] >= 0)
fixed_regs[cfun->machine->args.reg[WORK_ITEM_ID_X_ARG]] = 1;
if (cfun->machine->args.reg[WORK_ITEM_ID_Y_ARG] >= 0)
fixed_regs[cfun->machine->args.reg[WORK_ITEM_ID_Y_ARG]] = 1;
if (cfun->machine->args.reg[WORK_ITEM_ID_Z_ARG] >= 0)
fixed_regs[cfun->machine->args.reg[WORK_ITEM_ID_Z_ARG]] = 1;
}
/* Determine if a load or store is valid, according to the register classes
and address space. Used primarily by the machine description to decide
when to split a move into two steps. */
bool
gcn_valid_move_p (machine_mode mode, rtx dest, rtx src)
{
if (!MEM_P (dest) && !MEM_P (src))
return true;
if (MEM_P (dest)
&& AS_FLAT_P (MEM_ADDR_SPACE (dest))
&& (gcn_flat_address_p (XEXP (dest, 0), mode)
|| GET_CODE (XEXP (dest, 0)) == SYMBOL_REF
|| GET_CODE (XEXP (dest, 0)) == LABEL_REF)
&& gcn_vgpr_register_operand (src, mode))
return true;
else if (MEM_P (src)
&& AS_FLAT_P (MEM_ADDR_SPACE (src))
&& (gcn_flat_address_p (XEXP (src, 0), mode)
|| GET_CODE (XEXP (src, 0)) == SYMBOL_REF
|| GET_CODE (XEXP (src, 0)) == LABEL_REF)
&& gcn_vgpr_register_operand (dest, mode))
return true;
if (MEM_P (dest)
&& AS_GLOBAL_P (MEM_ADDR_SPACE (dest))
&& (gcn_global_address_p (XEXP (dest, 0))
|| GET_CODE (XEXP (dest, 0)) == SYMBOL_REF
|| GET_CODE (XEXP (dest, 0)) == LABEL_REF)
&& gcn_vgpr_register_operand (src, mode))
return true;
else if (MEM_P (src)
&& AS_GLOBAL_P (MEM_ADDR_SPACE (src))
&& (gcn_global_address_p (XEXP (src, 0))
|| GET_CODE (XEXP (src, 0)) == SYMBOL_REF
|| GET_CODE (XEXP (src, 0)) == LABEL_REF)
&& gcn_vgpr_register_operand (dest, mode))
return true;
if (MEM_P (dest)
&& MEM_ADDR_SPACE (dest) == ADDR_SPACE_SCALAR_FLAT
&& (gcn_scalar_flat_address_p (XEXP (dest, 0))
|| GET_CODE (XEXP (dest, 0)) == SYMBOL_REF
|| GET_CODE (XEXP (dest, 0)) == LABEL_REF)
&& gcn_ssrc_register_operand (src, mode))
return true;
else if (MEM_P (src)
&& MEM_ADDR_SPACE (src) == ADDR_SPACE_SCALAR_FLAT
&& (gcn_scalar_flat_address_p (XEXP (src, 0))
|| GET_CODE (XEXP (src, 0)) == SYMBOL_REF
|| GET_CODE (XEXP (src, 0)) == LABEL_REF)
&& gcn_sdst_register_operand (dest, mode))
return true;
if (MEM_P (dest)
&& AS_ANY_DS_P (MEM_ADDR_SPACE (dest))
&& gcn_ds_address_p (XEXP (dest, 0))
&& gcn_vgpr_register_operand (src, mode))
return true;
else if (MEM_P (src)
&& AS_ANY_DS_P (MEM_ADDR_SPACE (src))
&& gcn_ds_address_p (XEXP (src, 0))
&& gcn_vgpr_register_operand (dest, mode))
return true;
return false;
}
/* }}} */
/* {{{ Functions and ABI. */
/* Implement TARGET_FUNCTION_VALUE.
Define how to find the value returned by a function.
The register location is always the same, but the mode depends on
VALTYPE. */
static rtx
gcn_function_value (const_tree valtype, const_tree, bool)
{
machine_mode mode = TYPE_MODE (valtype);
if (INTEGRAL_TYPE_P (valtype)
&& GET_MODE_CLASS (mode) == MODE_INT
&& GET_MODE_SIZE (mode) < 4)
mode = SImode;
return gen_rtx_REG (mode, SGPR_REGNO (RETURN_VALUE_REG));
}
/* Implement TARGET_FUNCTION_VALUE_REGNO_P.
Return true if N is a possible register number for the function return
value. */
static bool
gcn_function_value_regno_p (const unsigned int n)
{
return n == RETURN_VALUE_REG;
}
/* Calculate the number of registers required to hold function argument
ARG. */
static int
num_arg_regs (const function_arg_info &arg)
{
if (targetm.calls.must_pass_in_stack (arg))
return 0;
int size = arg.promoted_size_in_bytes ();
return (size + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
}
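/* Worked example (assuming UNITS_PER_WORD is 4, i.e. one 32-bit register
   per word): an SImode argument needs (4 + 3) / 4 == 1 register, a DImode
   argument needs (8 + 3) / 4 == 2, and anything that must_pass_in_stack
   reports 0 because it consumes no registers at all.  */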
/* Implement TARGET_STRICT_ARGUMENT_NAMING.
Return true if the location where a function argument is passed
   depends on whether or not it is a named argument.
   For gcn, we know how to handle functions declared as stdarg: by
passing an extra pointer to the unnamed arguments. However, the
Fortran frontend can produce a different situation, where a
function pointer is declared with no arguments, but the actual
function and calls to it take more arguments. In that case, we
want to ensure the call matches the definition of the function. */
static bool
gcn_strict_argument_naming (cumulative_args_t cum_v)
{
CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
return cum->fntype == NULL_TREE || stdarg_p (cum->fntype);
}
/* Implement TARGET_PRETEND_OUTGOING_VARARGS_NAMED.
See comment on gcn_strict_argument_naming. */
static bool
gcn_pretend_outgoing_varargs_named (cumulative_args_t cum_v)
{
return !gcn_strict_argument_naming (cum_v);
}
/* Implement TARGET_FUNCTION_ARG.
Return an RTX indicating whether a function argument is passed in a register
and if so, which register. */
static rtx
gcn_function_arg (cumulative_args_t cum_v, const function_arg_info &arg)
{
CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
if (cum->normal_function)
{
if (!arg.named || arg.end_marker_p ())
return 0;
if (targetm.calls.must_pass_in_stack (arg))
return 0;
/* Vector parameters are not supported yet. */
if (VECTOR_MODE_P (arg.mode))
return 0;
int reg_num = FIRST_PARM_REG + cum->num;
int num_regs = num_arg_regs (arg);
if (num_regs > 0)
while (reg_num % num_regs != 0)
reg_num++;
if (reg_num + num_regs <= FIRST_PARM_REG + NUM_PARM_REGS)
return gen_rtx_REG (arg.mode, reg_num);
}
else
{
if (cum->num >= cum->args.nargs)
{
cum->offset = (cum->offset + TYPE_ALIGN (arg.type) / 8 - 1)
& -(TYPE_ALIGN (arg.type) / 8);
cfun->machine->kernarg_segment_alignment
= MAX ((unsigned) cfun->machine->kernarg_segment_alignment,
TYPE_ALIGN (arg.type) / 8);
rtx addr = gen_rtx_REG (DImode,
cum->args.reg[KERNARG_SEGMENT_PTR_ARG]);
if (cum->offset)
addr = gen_rtx_PLUS (DImode, addr,
gen_int_mode (cum->offset, DImode));
rtx mem = gen_rtx_MEM (arg.mode, addr);
set_mem_attributes (mem, arg.type, 1);
set_mem_addr_space (mem, ADDR_SPACE_SCALAR_FLAT);
MEM_READONLY_P (mem) = 1;
return mem;
}
int a = cum->args.order[cum->num];
if (arg.mode != gcn_kernel_arg_types[a].mode)
{
error ("wrong type of argument %s", gcn_kernel_arg_types[a].name);
return 0;
}
return gen_rtx_REG ((machine_mode) gcn_kernel_arg_types[a].mode,
cum->args.reg[a]);
}
return 0;
}
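/* Worked example for the alignment loop above (illustrative only): if an
   SImode argument has already consumed an odd-numbered hard register, a
   following DImode argument (num_regs == 2) bumps reg_num until it is a
   multiple of 2, leaving a one-register hole -- the usual even-register
   alignment of doubleword arguments seen on other targets.  */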
/* Implement TARGET_FUNCTION_ARG_ADVANCE.
Updates the summarizer variable pointed to by CUM_V to advance past an
argument in the argument list. */
static void
gcn_function_arg_advance (cumulative_args_t cum_v,
const function_arg_info &arg)
{
CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
if (cum->normal_function)
{
if (!arg.named)
return;
int num_regs = num_arg_regs (arg);
if (num_regs > 0)
while ((FIRST_PARM_REG + cum->num) % num_regs != 0)
cum->num++;
cum->num += num_regs;
}
else
{
if (cum->num < cum->args.nargs)
cum->num++;
else
{
cum->offset += tree_to_uhwi (TYPE_SIZE_UNIT (arg.type));
cfun->machine->kernarg_segment_byte_size = cum->offset;
}
}
}
/* Implement TARGET_ARG_PARTIAL_BYTES.
Returns the number of bytes at the beginning of an argument that must be put
in registers. The value must be zero for arguments that are passed entirely
in registers or that are entirely pushed on the stack. */
static int
gcn_arg_partial_bytes (cumulative_args_t cum_v, const function_arg_info &arg)
{
CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
if (!arg.named)
return 0;
if (targetm.calls.must_pass_in_stack (arg))
return 0;
if (cum->num >= NUM_PARM_REGS)
return 0;
/* If the argument fits entirely in registers, return 0. */
if (cum->num + num_arg_regs (arg) <= NUM_PARM_REGS)
return 0;
return (NUM_PARM_REGS - cum->num) * UNITS_PER_WORD;
}
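/* Worked example (assuming UNITS_PER_WORD is 4): with cum->num equal to
   NUM_PARM_REGS - 1, an 8-byte named argument no longer fits entirely, so
   1 * UNITS_PER_WORD == 4 bytes go in the last parameter register and the
   remainder is pushed on the stack.  */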
/* A normal function which takes a pointer argument (to a scalar) may be
passed a pointer to LDS space (via a high-bits-set aperture), and that only
works with FLAT addressing, not GLOBAL. Force FLAT addressing if the
function has an incoming pointer-to-scalar parameter. */
static void
gcn_detect_incoming_pointer_arg (tree fndecl)
{
gcc_assert (cfun && cfun->machine);
for (tree arg = TYPE_ARG_TYPES (TREE_TYPE (fndecl));
arg;
arg = TREE_CHAIN (arg))
if (POINTER_TYPE_P (TREE_VALUE (arg))
&& !AGGREGATE_TYPE_P (TREE_TYPE (TREE_VALUE (arg))))
cfun->machine->use_flat_addressing = true;
}
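/* For example (illustrative prototypes only): an incoming "int *"
   parameter makes this function set use_flat_addressing, whereas a
   "struct s *" parameter does not, matching the POINTER_TYPE_P /
   !AGGREGATE_TYPE_P test above.  */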
/* Implement INIT_CUMULATIVE_ARGS, via gcn.h.
Initialize a variable CUM of type CUMULATIVE_ARGS for a call to a function
whose data type is FNTYPE. For a library call, FNTYPE is 0. */
void
gcn_init_cumulative_args (CUMULATIVE_ARGS *cum /* Argument info to init */ ,
tree fntype /* tree ptr for function decl */ ,
rtx libname /* SYMBOL_REF of library name or 0 */ ,
tree fndecl, int caller)
{
memset (cum, 0, sizeof (*cum));
cum->fntype = fntype;
if (libname)
{
gcc_assert (cfun && cfun->machine);
cum->normal_function = true;
if (!caller)
{
cfun->machine->normal_function = true;
gcn_detect_incoming_pointer_arg (fndecl);
}
return;
}
tree attr = NULL;
if (fndecl)
attr = lookup_attribute ("amdgpu_hsa_kernel", DECL_ATTRIBUTES (fndecl));
if (fndecl && !attr)
attr = lookup_attribute ("amdgpu_hsa_kernel",
TYPE_ATTRIBUTES (TREE_TYPE (fndecl)));
if (!attr && fntype)
attr = lookup_attribute ("amdgpu_hsa_kernel", TYPE_ATTRIBUTES (fntype));
/* Handle main () as kernel, so we can run testsuite.
Handle OpenACC kernels similarly to main. */
if (!attr && !caller && fndecl
&& (MAIN_NAME_P (DECL_NAME (fndecl))
|| lookup_attribute ("omp target entrypoint",
DECL_ATTRIBUTES (fndecl)) != NULL_TREE))
gcn_parse_amdgpu_hsa_kernel_attribute (&cum->args, NULL_TREE);
else
{
if (!attr || caller)
{
gcc_assert (cfun && cfun->machine);
cum->normal_function = true;
if (!caller)
cfun->machine->normal_function = true;
}
gcn_parse_amdgpu_hsa_kernel_attribute
(&cum->args, attr ? TREE_VALUE (attr) : NULL_TREE);
}
cfun->machine->args = cum->args;
if (!caller && cfun->machine->normal_function)
gcn_detect_incoming_pointer_arg (fndecl);
reinit_regs ();
}
/* Implement TARGET_RETURN_IN_MEMORY.
   Return true if TYPE should be returned in memory rather than in a
   register.  */
static bool
gcn_return_in_memory (const_tree type, const_tree ARG_UNUSED (fntype))
{
machine_mode mode = TYPE_MODE (type);
HOST_WIDE_INT size = int_size_in_bytes (type);
if (AGGREGATE_TYPE_P (type))
return true;
/* Vector return values are not supported yet. */
if (VECTOR_TYPE_P (type))
return true;
if (mode == BLKmode)
return true;
if (size > 2 * UNITS_PER_WORD)
return true;
return false;
}
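/* Worked example (assuming UNITS_PER_WORD is 4): a DImode scalar (8 bytes,
   exactly 2 * UNITS_PER_WORD) is returned in registers, while any
   aggregate, any vector, or anything larger than 8 bytes is returned via
   the hidden memory return slot.  */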
/* Implement TARGET_PROMOTE_FUNCTION_MODE.
Return the mode to use for outgoing function arguments. */
machine_mode
gcn_promote_function_mode (const_tree ARG_UNUSED (type), machine_mode mode,
int *ARG_UNUSED (punsignedp),
const_tree ARG_UNUSED (funtype),
int ARG_UNUSED (for_return))
{
if (GET_MODE_CLASS (mode) == MODE_INT && GET_MODE_SIZE (mode) < 4)
return SImode;
return mode;
}
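/* For example, QImode and HImode arguments and return values are widened
   to SImode here, matching the widening done in gcn_function_value
   above.  */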
/* Implement TARGET_GIMPLIFY_VA_ARG_EXPR.
Derived from hppa_gimplify_va_arg_expr. The generic routine doesn't handle
ARGS_GROW_DOWNWARDS. */
static tree
gcn_gimplify_va_arg_expr (tree valist, tree type,
gimple_seq *ARG_UNUSED (pre_p),
gimple_seq *ARG_UNUSED (post_p))
{
tree ptr = build_pointer_type (type);
tree valist_type;
tree t, u;
bool indirect;
indirect = pass_va_arg_by_reference (type);
if (indirect)
{
type = ptr;
ptr = build_pointer_type (type);
}
valist_type = TREE_TYPE (valist);
/* Args grow down. Not handled by generic routines. */
u = fold_convert (sizetype, size_in_bytes (type));
u = fold_build1 (NEGATE_EXPR, sizetype, u);
t = fold_build_pointer_plus (valist, u);
/* Align to 8 byte boundary. */
u = build_int_cst (TREE_TYPE (t), -8);
t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t, u);
t = fold_convert (valist_type, t);
t = build2 (MODIFY_EXPR, valist_type, valist, t);
t = fold_convert (ptr, t);
t = build_va_arg_indirect_ref (t);
if (indirect)
t = build_va_arg_indirect_ref (t);
return t;
}
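/* Roughly, the trees built above correspond to this C sketch (illustrative
   only; "ap" stands for VALIST and T for TYPE):

     ap = (void *) (((intptr_t) ap - sizeof (T)) & -8);
     result = *(T *) ap;            // dereferenced twice if passed by ref

   i.e. the argument pointer moves downwards and is kept 8-byte aligned.  */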
/* Return 1 if TRAIT NAME is present in the OpenMP context's
device trait set, return 0 if not present in any OpenMP context in the
whole translation unit, or -1 if not present in the current OpenMP context
but might be present in another OpenMP context in the same TU. */
int
gcn_omp_device_kind_arch_isa (enum omp_device_kind_arch_isa trait,
const char *name)
{
switch (trait)
{
case omp_device_kind:
return strcmp (name, "gpu") == 0;
case omp_device_arch:
return strcmp (name, "gcn") == 0;
case omp_device_isa:
if (strcmp (name, "fiji") == 0)
return gcn_arch == PROCESSOR_FIJI;
if (strcmp (name, "gfx900") == 0)
return gcn_arch == PROCESSOR_VEGA10;
if (strcmp (name, "gfx906") == 0)
return gcn_arch == PROCESSOR_VEGA20;
if (strcmp (name, "gfx908") == 0)
return gcn_arch == PROCESSOR_GFX908;
return 0;
default:
gcc_unreachable ();
}
}
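/* For example, this hook is what resolves an OpenMP context selector such
   as (illustrative source only):

     #pragma omp declare variant (f_gfx906) match (device={isa("gfx906")})
     void f (void);

   to 1 when compiling with -march=gfx906 (PROCESSOR_VEGA20) and to 0
   otherwise.  */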
/* Calculate stack offsets needed to create prologues and epilogues. */
static struct machine_function *
gcn_compute_frame_offsets (void)
{
machine_function *offsets = cfun->machine;
if (reload_completed)
return offsets;
offsets->need_frame_pointer = frame_pointer_needed;
offsets->outgoing_args_size = crtl->outgoing_args_size;
offsets->pretend_size = crtl->args.pretend_args_size;
offsets->local_vars = get_frame_size ();
offsets->lr_needs_saving = (!leaf_function_p ()
|| df_regs_ever_live_p (LR_REGNUM)
|| df_regs_ever_live_p (LR_REGNUM + 1));
offsets->callee_saves = offsets->lr_needs_saving ? 8 : 0;
for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
if ((df_regs_ever_live_p (regno) && !call_used_or_fixed_reg_p (regno))
|| ((regno & ~1) == HARD_FRAME_POINTER_REGNUM
&& frame_pointer_needed))
offsets->callee_saves += (VGPR_REGNO_P (regno) ? 256 : 4);
/* Round up to 64-bit boundary to maintain stack alignment. */
offsets->callee_saves = (offsets->callee_saves + 7) & ~7;
return offsets;
}
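/* Worked example of the callee_saves accounting above (illustrative
   figures): saving the link register (8 bytes), three SGPRs (3 * 4 = 12)
   and one VGPR (64 lanes * 4 = 256) gives 276 bytes, which the final
   round-up turns into 280 to preserve 64-bit stack alignment.  */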
/* Insert code into the prologue or epilogue to store or load any
callee-save register to/from the stack.
Helper function for gcn_expand_prologue and gcn_expand_epilogue. */
static void
move_callee_saved_registers (rtx sp, machine_function *offsets,
bool prologue)
{
int regno, offset, saved_scalars;
rtx exec = gen_rtx_REG (DImode, EXEC_REG);
rtx vcc = gen_rtx_REG (DImode, VCC_LO_REG);
rtx offreg = gen_rtx_REG (SImode, SGPR_REGNO (22));
rtx as = gen_rtx_CONST_INT (VOIDmode, STACK_ADDR_SPACE);
HOST_WIDE_INT exec_set = 0;
int offreg_set = 0;
auto_vec<int> saved_sgprs;
start_sequence ();
/* Move scalars into two vector registers. */
for (regno = 0, saved_scalars = 0; regno < FIRST_VGPR_REG; regno++)
if ((df_regs_ever_live_p (regno) && !call_used_or_fixed_reg_p (regno))
|| ((regno & ~1) == LINK_REGNUM && offsets->lr_needs_saving)
|| ((regno & ~1) == HARD_FRAME_POINTER_REGNUM
&& offsets->need_frame_pointer))
{
rtx reg = gen_rtx_REG (SImode, regno);
rtx vreg = gen_rtx_REG (V64SImode,
VGPR_REGNO (6 + (saved_scalars / 64)));
int lane = saved_scalars % 64;
if (prologue)
{
emit_insn (gen_vec_setv64si (vreg, reg, GEN_INT (lane)));
saved_sgprs.safe_push (regno);
}
else
emit_insn (gen_vec_extractv64sisi (reg, vreg, GEN_INT (lane)));
saved_scalars++;
}
rtx move_scalars = get_insns ();
end_sequence ();
start_sequence ();
/* Ensure that all vector lanes are moved. */
exec_set = -1;
emit_move_insn (exec, GEN_INT (exec_set));
/* Set up a vector stack pointer. */
rtx _0_1_2_3 = gen_rtx_REG (V64SImode, VGPR_REGNO (1));
rtx _0_4_8_12 = gen_rtx_REG (V64SImode, VGPR_REGNO (3));
emit_insn (gen_ashlv64si3_exec (_0_4_8_12, _0_1_2_3, GEN_INT (2),
gcn_gen_undef (V64SImode), exec));
rtx vsp = gen_rtx_REG (V64DImode, VGPR_REGNO (4));
emit_insn (gen_vec_duplicatev64di_exec (vsp, sp, gcn_gen_undef (V64DImode),
exec));
emit_insn (gen_addv64si3_vcc_exec (gcn_operand_part (V64SImode, vsp, 0),
gcn_operand_part (V64SImode, vsp, 0),
_0_4_8_12, vcc, gcn_gen_undef (V64SImode),
exec));
emit_insn (gen_addcv64si3_exec (gcn_operand_part (V64SImode, vsp, 1),
gcn_operand_part (V64SImode, vsp, 1),
const0_rtx, vcc, vcc,
gcn_gen_undef (V64SImode), exec));
/* Move vectors. */
for (regno = FIRST_VGPR_REG, offset = 0;
regno < FIRST_PSEUDO_REGISTER; regno++)
if ((df_regs_ever_live_p (regno) && !call_used_or_fixed_reg_p (regno))
|| (regno == VGPR_REGNO (6) && saved_scalars > 0)
|| (regno == VGPR_REGNO (7) && saved_scalars > 63))
{
rtx reg = gen_rtx_REG (V64SImode, regno);
int size = 256;
if (regno == VGPR_REGNO (6) && saved_scalars < 64)
size = saved_scalars * 4;
else if (regno == VGPR_REGNO (7) && saved_scalars < 128)
size = (saved_scalars - 64) * 4;
if (size != 256 || exec_set != -1)
{
exec_set = ((unsigned HOST_WIDE_INT) 1 << (size / 4)) - 1;
emit_move_insn (exec, gen_int_mode (exec_set, DImode));
}
if (prologue)
{
rtx insn = emit_insn (gen_scatterv64si_insn_1offset_exec
(vsp, const0_rtx, reg, as, const0_rtx,
exec));
/* Add CFI metadata. */
rtx note;
if (regno == VGPR_REGNO (6) || regno == VGPR_REGNO (7))
{
int start = (regno == VGPR_REGNO (7) ? 64 : 0);
int count = MIN (saved_scalars - start, 64);
int add_lr = (regno == VGPR_REGNO (6)
&& df_regs_ever_live_p (LINK_REGNUM));
int lrdest = -1;
rtvec seq = rtvec_alloc (count + add_lr);
/* Add a REG_FRAME_RELATED_EXPR entry for each scalar
   register that was saved in this batch.  */
for (int idx = 0; idx < count; idx++)
{
int stackaddr = offset + idx * 4;
rtx dest = gen_rtx_MEM (SImode,
gen_rtx_PLUS
(DImode, sp,
GEN_INT (stackaddr)));
rtx src = gen_rtx_REG (SImode, saved_sgprs[start + idx]);
rtx set = gen_rtx_SET (dest, src);
RTX_FRAME_RELATED_P (set) = 1;
RTVEC_ELT (seq, idx) = set;
if (saved_sgprs[start + idx] == LINK_REGNUM)
lrdest = stackaddr;
}
/* Add an additional expression for DWARF_LINK_REGISTER if
LINK_REGNUM was saved. */
if (lrdest != -1)
{
rtx dest = gen_rtx_MEM (DImode,
gen_rtx_PLUS
(DImode, sp,
GEN_INT (lrdest)));
rtx src = gen_rtx_REG (DImode, DWARF_LINK_REGISTER);
rtx set = gen_rtx_SET (dest, src);
RTX_FRAME_RELATED_P (set) = 1;
RTVEC_ELT (seq, count) = set;
}
note = gen_rtx_SEQUENCE (VOIDmode, seq);
}
else
{
rtx dest = gen_rtx_MEM (V64SImode,
gen_rtx_PLUS (DImode, sp,
GEN_INT (offset)));
rtx src = gen_rtx_REG (V64SImode, regno);
note = gen_rtx_SET (dest, src);
}
RTX_FRAME_RELATED_P (insn) = 1;
add_reg_note (insn, REG_FRAME_RELATED_EXPR, note);
}
else
emit_insn (gen_gatherv64si_insn_1offset_exec
(reg, vsp, const0_rtx, as, const0_rtx,
gcn_gen_undef (V64SImode), exec));
/* Move our VSP to the next stack entry. */
if (offreg_set != size)
{
offreg_set = size;
emit_move_insn (offreg, GEN_INT (size));
}
if (exec_set != -1)
{
exec_set = -1;
emit_move_insn (exec, GEN_INT (exec_set));
}
emit_insn (gen_addv64si3_vcc_dup_exec
(gcn_operand_part (V64SImode, vsp, 0),
offreg, gcn_operand_part (V64SImode, vsp, 0),
vcc, gcn_gen_undef (V64SImode), exec));
emit_insn (gen_addcv64si3_exec
(gcn_operand_part (V64SImode, vsp, 1),
gcn_operand_part (V64SImode, vsp, 1),
const0_rtx, vcc, vcc, gcn_gen_undef (V64SImode), exec));
offset += size;
}
rtx move_vectors = get_insns ();
end_sequence ();
if (prologue)
{
emit_insn (move_scalars);
emit_insn (move_vectors);
}
else
{
emit_insn (move_vectors);
emit_insn (move_scalars);
}
}
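/* Illustrative example of the save scheme above: with 70 scalar registers
   to preserve, lanes 0-63 of v6 and lanes 0-5 of v7 receive the SGPR
   values; the v6 store runs with EXEC == -1 (all 64 lanes, 256 bytes) and
   the v7 store with EXEC == 0x3f (6 lanes, 24 bytes), the vector stack
   pointer advancing by the store size each time.  */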
/* Generate prologue. Called from gen_prologue during pro_and_epilogue pass.
For a non-kernel function, the stack layout looks like this (interim),
growing *upwards*:
hi | + ...
|__________________| <-- current SP
| outgoing args |
|__________________|
| (alloca space) |
|__________________|
| local vars |
|__________________| <-- FP/hard FP
| callee-save regs |
|__________________| <-- soft arg pointer
| pretend args |
|__________________| <-- incoming SP
| incoming args |
lo |..................|
This implies arguments (beyond the first N in registers) must grow
downwards (as, apparently, PA has them do).
For a kernel function we have the simpler:
hi | + ...
|__________________| <-- current SP
| outgoing args |
|__________________|
| (alloca space) |
|__________________|
| local vars |
lo |__________________| <-- FP/hard FP
*/
void
gcn_expand_prologue ()
{
machine_function *offsets = gcn_compute_frame_offsets ();
if (!cfun || !cfun->machine || cfun->machine->normal_function)
{
rtx sp = gen_rtx_REG (Pmode, STACK_POINTER_REGNUM);
rtx sp_hi = gcn_operand_part (Pmode, sp, 1);
rtx sp_lo = gcn_operand_part (Pmode, sp, 0);
rtx fp = gen_rtx_REG (Pmode, HARD_FRAME_POINTER_REGNUM);
rtx fp_hi = gcn_operand_part (Pmode, fp, 1);
rtx fp_lo = gcn_operand_part (Pmode, fp, 0);
start_sequence ();
if (offsets->pretend_size > 0)
{
/* FIXME: Do the actual saving of register pretend args to the stack.
Register order needs consideration. */
}
/* Save callee-save regs. */
move_callee_saved_registers (sp, offsets, true);
HOST_WIDE_INT sp_adjust = offsets->pretend_size
+ offsets->callee_saves
+ offsets->local_vars + offsets->outgoing_args_size;
if (sp_adjust > 0)
{
/* Adding RTX_FRAME_RELATED_P effectively disables splitting, so
   we use a split add explicitly, and specify the DImode add in
   the note.  */
rtx scc = gen_rtx_REG (BImode, SCC_REG);
rtx adjustment = gen_int_mode (sp_adjust, SImode);
rtx insn = emit_insn (gen_addsi3_scalar_carry (sp_lo, sp_lo,
adjustment, scc));
if (!offsets->need_frame_pointer)
{
RTX_FRAME_RELATED_P (insn) = 1;
add_reg_note (insn, REG_FRAME_RELATED_EXPR,
gen_rtx_SET (sp,
gen_rtx_PLUS (DImode, sp,
adjustment)));
}
emit_insn (gen_addcsi3_scalar_zero (sp_hi, sp_hi, scc));
}
if (offsets->need_frame_pointer)
{
/* Adding RTX_FRAME_RELATED_P effectively disables splitting, so
   we use a split add explicitly, and specify the DImode add in
   the note.  */
rtx scc = gen_rtx_REG (BImode, SCC_REG);
int fp_adjust = -(offsets->local_vars + offsets->outgoing_args_size);
rtx adjustment = gen_int_mode (fp_adjust, SImode);
rtx insn = emit_insn (gen_addsi3_scalar_carry (fp_lo, sp_lo,
adjustment, scc));
emit_insn (gen_addcsi3_scalar (fp_hi, sp_hi,
(fp_adjust < 0 ? GEN_INT (-1)
: const0_rtx),
scc, scc));
/* Set the CFA to the entry stack address, as an offset from the
frame pointer. This is preferred because the frame pointer is
saved in each frame, whereas the stack pointer is not. */
RTX_FRAME_RELATED_P (insn) = 1;
add_reg_note (insn, REG_CFA_DEF_CFA,
gen_rtx_PLUS (DImode, fp,
GEN_INT (-(offsets->pretend_size
+ offsets->callee_saves))));
}
rtx_insn *seq = get_insns ();
end_sequence ();
emit_insn (seq);
}
else
{
rtx wave_offset = gen_rtx_REG (SImode,
cfun->machine->args.
reg[PRIVATE_SEGMENT_WAVE_OFFSET_ARG]);
if (cfun->machine->args.requested & (1 << FLAT_SCRATCH_INIT_ARG))
{
rtx fs_init_lo =
gen_rtx_REG (SImode,
cfun->machine->args.reg[FLAT_SCRATCH_INIT_ARG]);
rtx fs_init_hi =
gen_rtx_REG (SImode,
cfun->machine->args.reg[FLAT_SCRATCH_INIT_ARG] + 1);
rtx fs_reg_lo = gen_rtx_REG (SImode, FLAT_SCRATCH_REG);
rtx fs_reg_hi = gen_rtx_REG (SImode, FLAT_SCRATCH_REG + 1);
/*rtx queue = gen_rtx_REG(DImode,
cfun->machine->args.reg[QUEUE_PTR_ARG]);
rtx aperture = gen_rtx_MEM (SImode,
gen_rtx_PLUS (DImode, queue,
gen_int_mode (68, SImode)));
set_mem_addr_space (aperture, ADDR_SPACE_SCALAR_FLAT);*/
/* Set up flat_scratch. */
emit_insn (gen_addsi3_scc (fs_reg_hi, fs_init_lo, wave_offset));
emit_insn (gen_lshrsi3_scc (fs_reg_hi, fs_reg_hi,
gen_int_mode (8, SImode)));
emit_move_insn (fs_reg_lo, fs_init_hi);
}
/* Set up frame pointer and stack pointer. */
rtx sp = gen_rtx_REG (DImode, STACK_POINTER_REGNUM);
rtx sp_hi = simplify_gen_subreg (SImode, sp, DImode, 4);
rtx sp_lo = simplify_gen_subreg (SImode, sp, DImode, 0);
rtx fp = gen_rtx_REG (DImode, HARD_FRAME_POINTER_REGNUM);
rtx fp_hi = simplify_gen_subreg (SImode, fp, DImode, 4);
rtx fp_lo = simplify_gen_subreg (SImode, fp, DImode, 0);
HOST_WIDE_INT sp_adjust = (offsets->local_vars
+ offsets->outgoing_args_size);
/* Initialise FP and SP from the buffer descriptor in s[0:3]. */
emit_move_insn (fp_lo, gen_rtx_REG (SImode, 0));
emit_insn (gen_andsi3_scc (fp_hi, gen_rtx_REG (SImode, 1),
gen_int_mode (0xffff, SImode)));
rtx scc = gen_rtx_REG (BImode, SCC_REG);
emit_insn (gen_addsi3_scalar_carry (fp_lo, fp_lo, wave_offset, scc));
emit_insn (gen_addcsi3_scalar_zero (fp_hi, fp_hi, scc));
/* Adding RTX_FRAME_RELATED_P effectively disables splitting, so we use
   a split add explicitly, and specify the DImode add in the note.
The DWARF info expects that the callee-save data is in the frame,
even though it isn't (because this is the entry point), so we
make a notional adjustment to the DWARF frame offset here. */
rtx dbg_adjustment = gen_int_mode (sp_adjust + offsets->callee_saves,
DImode);
rtx insn;
if (sp_adjust > 0)
{
rtx scc = gen_rtx_REG (BImode, SCC_REG);
rtx adjustment = gen_int_mode (sp_adjust, DImode);
insn = emit_insn (gen_addsi3_scalar_carry (sp_lo, fp_lo, adjustment,
scc));
emit_insn (gen_addcsi3_scalar_zero (sp_hi, fp_hi, scc));
}
else
insn = emit_move_insn (sp, fp);
RTX_FRAME_RELATED_P (insn) = 1;
add_reg_note (insn, REG_FRAME_RELATED_EXPR,
gen_rtx_SET (sp, gen_rtx_PLUS (DImode, sp,
dbg_adjustment)));
if (offsets->need_frame_pointer)
{
/* Set the CFA to the entry stack address, as an offset from the
frame pointer. This is necessary when alloca is used, and
harmless otherwise. */
rtx neg_adjust = gen_int_mode (-offsets->callee_saves, DImode);
add_reg_note (insn, REG_CFA_DEF_CFA,
gen_rtx_PLUS (DImode, fp, neg_adjust));
}
/* Make sure the flat scratch reg doesn't get optimised away. */
emit_insn (gen_prologue_use (gen_rtx_REG (DImode, FLAT_SCRATCH_REG)));
}
/* Ensure that the scheduler doesn't do anything unexpected. */
emit_insn (gen_blockage ());
/* m0 is initialized for the usual LDS DS and FLAT memory case.
The low-part is the address of the topmost addressable byte, which is
size-1. The high-part is an offset and should be zero. */
emit_move_insn (gen_rtx_REG (SImode, M0_REG),
gen_int_mode (LDS_SIZE, SImode));
emit_insn (gen_prologue_use (gen_rtx_REG (SImode, M0_REG)));
if (cfun && cfun->machine && !cfun->machine->normal_function && flag_openmp)
{
/* OpenMP kernels have an implicit call to gomp_gcn_enter_kernel. */
rtx fn_reg = gen_rtx_REG (Pmode, FIRST_PARM_REG);
emit_move_insn (fn_reg, gen_rtx_SYMBOL_REF (Pmode,
"gomp_gcn_enter_kernel"));
emit_call_insn (gen_gcn_indirect_call (fn_reg, const0_rtx));
}
}
/* Generate epilogue. Called from gen_epilogue during pro_and_epilogue pass.
See gcn_expand_prologue for stack details. */
void
gcn_expand_epilogue (void)
{
/* Ensure that the scheduler doesn't do anything unexpected. */
emit_insn (gen_blockage ());
if (!cfun || !cfun->machine || cfun->machine->normal_function)
{
machine_function *offsets = gcn_compute_frame_offsets ();
rtx sp = gen_rtx_REG (