| /* Copyright (C) 2016-2021 Free Software Foundation, Inc. |
| |
| This file is free software; you can redistribute it and/or modify it under |
| the terms of the GNU General Public License as published by the Free |
| Software Foundation; either version 3 of the License, or (at your option) |
| any later version. |
| |
| This file is distributed in the hope that it will be useful, but WITHOUT |
| ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
| FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
| for more details. |
| |
| You should have received a copy of the GNU General Public License |
| along with GCC; see the file COPYING3. If not see |
| <http://www.gnu.org/licenses/>. */ |
| |
| /* {{{ Includes. */ |
| |
| /* We want GET_MODE_SIZE et al to return integers, please. */ |
| #define IN_TARGET_CODE 1 |
| |
| #include "config.h" |
| #include "system.h" |
| #include "coretypes.h" |
| #include "backend.h" |
| #include "target.h" |
| #include "memmodel.h" |
| #include "rtl.h" |
| #include "tree.h" |
| #include "df.h" |
| #include "tm_p.h" |
| #include "stringpool.h" |
| #include "optabs.h" |
| #include "regs.h" |
| #include "emit-rtl.h" |
| #include "recog.h" |
| #include "diagnostic-core.h" |
| #include "insn-attr.h" |
| #include "fold-const.h" |
| #include "calls.h" |
| #include "explow.h" |
| #include "expr.h" |
| #include "output.h" |
| #include "cfgrtl.h" |
| #include "langhooks.h" |
| #include "builtins.h" |
| #include "omp-general.h" |
| #include "print-rtl.h" |
| #include "attribs.h" |
| #include "varasm.h" |
| #include "intl.h" |
| #include "rtl-iter.h" |
| #include "dwarf2.h" |
| #include "gimple.h" |
| |
| /* This file should be included last. */ |
| #include "target-def.h" |
| |
| /* }}} */ |
| /* {{{ Global variables. */ |
| |
| /* Constants used by FP instructions. */ |
| |
| static REAL_VALUE_TYPE dconst4, dconst1over2pi; |
| static bool ext_gcn_constants_init = 0; |
| |
| /* Holds the ISA variant, derived from the command line parameters. */ |
| |
| int gcn_isa = 3; /* Default to GCN3. */ |
| |
| /* Reserve this much space for LDS (for propagating variables from |
| worker-single mode to worker-partitioned mode), per workgroup. Global |
| analysis could calculate an exact bound, but we don't do that yet. |
| |
| We want to permit full occupancy, so size accordingly. */ |
| |
| /* Use this as a default, but allow it to grow if the user requests a large |
| amount of gang-private shared-memory space. */ |
| static int acc_lds_size = 0x600; |
| |
| #define OMP_LDS_SIZE 0x600 /* 0x600 is 1/40 total, rounded down. */ |
| #define ACC_LDS_SIZE acc_lds_size |
| #define OTHER_LDS_SIZE 65536 /* If in doubt, reserve all of it. */ |
| |
| #define LDS_SIZE (flag_openacc ? ACC_LDS_SIZE \ |
| : flag_openmp ? OMP_LDS_SIZE \ |
| : OTHER_LDS_SIZE) |
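| |
| /* Checking the arithmetic in the OMP_LDS_SIZE comment above: 65536 bytes |
| of LDS divided by 40 is 1638, which rounded down to a 0x100 boundary |
| gives 0x600 (1536 bytes). */ |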
| |
| static int gang_private_hwm = 32; |
| static hash_map<tree, int> lds_allocs; |
| |
| /* The number of registers usable by normal non-kernel functions. |
| The SGPR count includes any special extra registers such as VCC. */ |
| |
| #define MAX_NORMAL_SGPR_COUNT 62 // i.e. 64 with VCC |
| #define MAX_NORMAL_VGPR_COUNT 24 |
| |
| /* }}} */ |
| /* {{{ Initialization and options. */ |
| |
| /* Initialize machine_function. */ |
| |
| static struct machine_function * |
| gcn_init_machine_status (void) |
| { |
| struct machine_function *f; |
| |
| f = ggc_cleared_alloc<machine_function> (); |
| |
| if (TARGET_GCN3) |
| f->use_flat_addressing = true; |
| |
| return f; |
| } |
| |
| /* Implement TARGET_OPTION_OVERRIDE. |
| |
| Override option settings where defaults are variable, or we have specific |
| needs to consider. */ |
| |
| static void |
| gcn_option_override (void) |
| { |
| init_machine_status = gcn_init_machine_status; |
| |
| /* The HSA runtime does not respect ELF load addresses, so force PIE. */ |
| if (!flag_pie) |
| flag_pie = 2; |
| if (!flag_pic) |
| flag_pic = flag_pie; |
| |
| gcn_isa = gcn_arch == PROCESSOR_FIJI ? 3 : 5; |
| |
| /* The default stack size needs to be small for offload kernels because |
| there may be many, many threads. Also, a smaller stack gives a |
| measurable performance boost. But a small stack is insufficient |
| for running the testsuite, so we use a larger default for the |
| standalone case. */ |
| if (stack_size_opt == -1) |
| { |
| if (flag_openacc || flag_openmp) |
| /* 512 bytes per work item = 32kB total. */ |
| stack_size_opt = 512 * 64; |
| else |
| /* 1MB total. */ |
| stack_size_opt = 1048576; |
| } |
| |
| /* Reserve 1Kb (somewhat arbitrarily) of LDS space for reduction results and |
| worker broadcasts. */ |
| if (gang_private_size_opt == -1) |
| gang_private_size_opt = 512; |
| else if (gang_private_size_opt < gang_private_hwm) |
| gang_private_size_opt = gang_private_hwm; |
| else if (gang_private_size_opt >= acc_lds_size - 1024) |
| { |
| /* We need some space for reductions and worker broadcasting. If the |
| user requests a large amount of gang-private LDS space, we might not |
| have enough left for the former. Increase the LDS allocation in that |
| case, although this may reduce the maximum occupancy on the |
| hardware. */ |
| acc_lds_size = gang_private_size_opt + 1024; |
| if (acc_lds_size > 32768) |
| acc_lds_size = 32768; |
| } |
| |
| /* The xnack option is a placeholder, for now. */ |
| if (flag_xnack) |
| sorry ("XNACK support"); |
| } |
| |
| /* }}} */ |
| /* {{{ Attributes. */ |
| |
| /* This table defines the arguments that are permitted in |
| __attribute__ ((amdgpu_hsa_kernel (...))). |
| |
| The names and values correspond to the HSA metadata that is encoded |
| into the assembler file and binary. */ |
| |
| static const struct gcn_kernel_arg_type |
| { |
| const char *name; |
| const char *header_pseudo; |
| machine_mode mode; |
| |
| /* This should be set to -1 or -2 for a dynamically allocated register |
| number. Use -1 if this argument contributes to the user_sgpr_count, |
| -2 otherwise. */ |
| int fixed_regno; |
| } gcn_kernel_arg_types[] = { |
| {"exec", NULL, DImode, EXEC_REG}, |
| #define PRIVATE_SEGMENT_BUFFER_ARG 1 |
| {"private_segment_buffer", |
| ".amdhsa_user_sgpr_private_segment_buffer", TImode, -1}, |
| #define DISPATCH_PTR_ARG 2 |
| {"dispatch_ptr", ".amdhsa_user_sgpr_dispatch_ptr", DImode, -1}, |
| #define QUEUE_PTR_ARG 3 |
| {"queue_ptr", ".amdhsa_user_sgpr_queue_ptr", DImode, -1}, |
| #define KERNARG_SEGMENT_PTR_ARG 4 |
| {"kernarg_segment_ptr", ".amdhsa_user_sgpr_kernarg_segment_ptr", DImode, -1}, |
| {"dispatch_id", ".amdhsa_user_sgpr_dispatch_id", DImode, -1}, |
| #define FLAT_SCRATCH_INIT_ARG 6 |
| {"flat_scratch_init", ".amdhsa_user_sgpr_flat_scratch_init", DImode, -1}, |
| #define FLAT_SCRATCH_SEGMENT_SIZE_ARG 7 |
| {"private_segment_size", ".amdhsa_user_sgpr_private_segment_size", SImode, -1}, |
| #define WORKGROUP_ID_X_ARG 8 |
| {"workgroup_id_X", ".amdhsa_system_sgpr_workgroup_id_x", SImode, -2}, |
| {"workgroup_id_Y", ".amdhsa_system_sgpr_workgroup_id_y", SImode, -2}, |
| {"workgroup_id_Z", ".amdhsa_system_sgpr_workgroup_id_z", SImode, -2}, |
| {"workgroup_info", ".amdhsa_system_sgpr_workgroup_info", SImode, -1}, |
| #define PRIVATE_SEGMENT_WAVE_OFFSET_ARG 12 |
| {"private_segment_wave_offset", |
| ".amdhsa_system_sgpr_private_segment_wavefront_offset", SImode, -2}, |
| #define WORK_ITEM_ID_X_ARG 13 |
| {"work_item_id_X", NULL, V64SImode, FIRST_VGPR_REG}, |
| #define WORK_ITEM_ID_Y_ARG 14 |
| {"work_item_id_Y", NULL, V64SImode, FIRST_VGPR_REG + 1}, |
| #define WORK_ITEM_ID_Z_ARG 15 |
| {"work_item_id_Z", NULL, V64SImode, FIRST_VGPR_REG + 2} |
| }; |
| |
| static const long default_requested_args |
| = (1 << PRIVATE_SEGMENT_BUFFER_ARG) |
| | (1 << DISPATCH_PTR_ARG) |
| | (1 << QUEUE_PTR_ARG) |
| | (1 << KERNARG_SEGMENT_PTR_ARG) |
| | (1 << PRIVATE_SEGMENT_WAVE_OFFSET_ARG) |
| | (1 << WORKGROUP_ID_X_ARG) |
| | (1 << WORK_ITEM_ID_X_ARG) |
| | (1 << WORK_ITEM_ID_Y_ARG) |
| | (1 << WORK_ITEM_ID_Z_ARG); |
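| |
| /* By way of illustration, a kernel declared as |
| __attribute__ ((amdgpu_hsa_kernel ("exec", "workgroup_id_Y"))) |
| requests those two arguments explicitly, in that order, in addition to |
| the defaults requested above. */ |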
| |
| /* Extract parameter settings from __attribute__((amdgpu_hsa_kernel ())). |
| This function also sets the default values for some arguments. |
| |
| Return true on error; on success, return false with ARGS populated. */ |
| |
| static bool |
| gcn_parse_amdgpu_hsa_kernel_attribute (struct gcn_kernel_args *args, |
| tree list) |
| { |
| bool err = false; |
| args->requested = default_requested_args; |
| args->nargs = 0; |
| |
| for (int a = 0; a < GCN_KERNEL_ARG_TYPES; a++) |
| args->reg[a] = -1; |
| |
| for (; list; list = TREE_CHAIN (list)) |
| { |
| const char *str; |
| if (TREE_CODE (TREE_VALUE (list)) != STRING_CST) |
| { |
| error ("%<amdgpu_hsa_kernel%> attribute requires string constant " |
| "arguments"); |
| break; |
| } |
| str = TREE_STRING_POINTER (TREE_VALUE (list)); |
| int a; |
| for (a = 0; a < GCN_KERNEL_ARG_TYPES; a++) |
| { |
| if (!strcmp (str, gcn_kernel_arg_types[a].name)) |
| break; |
| } |
| if (a == GCN_KERNEL_ARG_TYPES) |
| { |
| error ("unknown specifier %qs in %<amdgpu_hsa_kernel%> attribute", |
| str); |
| err = true; |
| break; |
| } |
| if (args->requested & (1 << a)) |
| { |
| error ("duplicated parameter specifier %qs in %<amdgpu_hsa_kernel%> " |
| "attribute", str); |
| err = true; |
| break; |
| } |
| args->requested |= (1 << a); |
| args->order[args->nargs++] = a; |
| } |
| |
| /* Requesting WORK_ITEM_ID_Z_ARG implies requesting WORK_ITEM_ID_X_ARG and |
| WORK_ITEM_ID_Y_ARG. Similarly, requesting WORK_ITEM_ID_Y_ARG implies |
| requesting WORK_ITEM_ID_X_ARG. */ |
| if (args->requested & (1 << WORK_ITEM_ID_Z_ARG)) |
| args->requested |= (1 << WORK_ITEM_ID_Y_ARG); |
| if (args->requested & (1 << WORK_ITEM_ID_Y_ARG)) |
| args->requested |= (1 << WORK_ITEM_ID_X_ARG); |
| |
| int sgpr_regno = FIRST_SGPR_REG; |
| args->nsgprs = 0; |
| for (int a = 0; a < GCN_KERNEL_ARG_TYPES; a++) |
| { |
| if (!(args->requested & (1 << a))) |
| continue; |
| |
| if (gcn_kernel_arg_types[a].fixed_regno >= 0) |
| args->reg[a] = gcn_kernel_arg_types[a].fixed_regno; |
| else |
| { |
| int reg_count; |
| |
| switch (gcn_kernel_arg_types[a].mode) |
| { |
| case E_SImode: |
| reg_count = 1; |
| break; |
| case E_DImode: |
| reg_count = 2; |
| break; |
| case E_TImode: |
| reg_count = 4; |
| break; |
| default: |
| gcc_unreachable (); |
| } |
| args->reg[a] = sgpr_regno; |
| sgpr_regno += reg_count; |
| if (gcn_kernel_arg_types[a].fixed_regno == -1) |
| args->nsgprs += reg_count; |
| } |
| } |
| if (sgpr_regno > FIRST_SGPR_REG + 16) |
| { |
| error ("too many arguments passed in sgpr registers"); |
| } |
| return err; |
| } |
| |
| /* Referenced by TARGET_ATTRIBUTE_TABLE. |
| |
| Validates target specific attributes. */ |
| |
| static tree |
| gcn_handle_amdgpu_hsa_kernel_attribute (tree *node, tree name, |
| tree args, int, bool *no_add_attrs) |
| { |
| if (!FUNC_OR_METHOD_TYPE_P (*node)) |
| { |
| warning (OPT_Wattributes, "%qE attribute only applies to functions", |
| name); |
| *no_add_attrs = true; |
| return NULL_TREE; |
| } |
| |
| /* Validate the argument list of the "amdgpu_hsa_kernel" attribute. */ |
| if (is_attribute_p ("amdgpu_hsa_kernel", name)) |
| { |
| struct gcn_kernel_args kernelarg; |
| |
| if (gcn_parse_amdgpu_hsa_kernel_attribute (&kernelarg, args)) |
| *no_add_attrs = true; |
| |
| return NULL_TREE; |
| } |
| |
| return NULL_TREE; |
| } |
| |
| /* Implement TARGET_ATTRIBUTE_TABLE. |
| |
| Create target-specific __attribute__ types. */ |
| |
| static const struct attribute_spec gcn_attribute_table[] = { |
| /* { name, min_len, max_len, decl_req, type_req, fn_type_req, |
| affects_type_identity, handler, exclude } */ |
| {"amdgpu_hsa_kernel", 0, GCN_KERNEL_ARG_TYPES, false, true, |
| true, true, gcn_handle_amdgpu_hsa_kernel_attribute, NULL}, |
| /* End element. */ |
| {NULL, 0, 0, false, false, false, false, NULL, NULL} |
| }; |
| |
| /* }}} */ |
| /* {{{ Registers and modes. */ |
| |
| /* Implement TARGET_SCALAR_MODE_SUPPORTED_P. */ |
| |
| bool |
| gcn_scalar_mode_supported_p (scalar_mode mode) |
| { |
| return (mode == BImode |
| || mode == QImode |
| || mode == HImode /* || mode == HFmode */ |
| || mode == SImode || mode == SFmode |
| || mode == DImode || mode == DFmode |
| || mode == TImode); |
| } |
| |
| /* Implement TARGET_CLASS_MAX_NREGS. |
| |
| Return the number of hard registers needed to hold a value of MODE in |
| a register of class RCLASS. */ |
| |
| static unsigned char |
| gcn_class_max_nregs (reg_class_t rclass, machine_mode mode) |
| { |
| /* Scalar registers are 32-bit; vector registers are in fact tuples of |
| 64 lanes. */ |
| if (rclass == VGPR_REGS) |
| { |
| if (vgpr_1reg_mode_p (mode)) |
| return 1; |
| if (vgpr_2reg_mode_p (mode)) |
| return 2; |
| /* TImode is used by DImode compare_and_swap. */ |
| if (mode == TImode) |
| return 4; |
| } |
| else if (rclass == VCC_CONDITIONAL_REG && mode == BImode) |
| return 2; |
| return CEIL (GET_MODE_SIZE (mode), 4); |
| } |
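| |
| /* For instance, V64SImode occupies a single VGPR (one 32-bit value per |
| lane), V64DImode occupies a pair, and DImode in SGPRs takes |
| CEIL (8, 4) = 2 registers. */ |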
| |
| /* Implement TARGET_HARD_REGNO_NREGS. |
| |
| Return the number of hard registers needed to hold a value of MODE in |
| REGNO. */ |
| |
| unsigned int |
| gcn_hard_regno_nregs (unsigned int regno, machine_mode mode) |
| { |
| return gcn_class_max_nregs (REGNO_REG_CLASS (regno), mode); |
| } |
| |
| /* Implement TARGET_HARD_REGNO_MODE_OK. |
| |
| Return true if REGNO can hold value in MODE. */ |
| |
| bool |
| gcn_hard_regno_mode_ok (unsigned int regno, machine_mode mode) |
| { |
| /* Treat a complex mode as if it were a scalar mode of the same overall |
| size for the purposes of allocating hard registers. */ |
| if (COMPLEX_MODE_P (mode)) |
| switch (mode) |
| { |
| case E_CQImode: |
| case E_CHImode: |
| mode = SImode; |
| break; |
| case E_CSImode: |
| mode = DImode; |
| break; |
| case E_CDImode: |
| mode = TImode; |
| break; |
| case E_HCmode: |
| mode = SFmode; |
| break; |
| case E_SCmode: |
| mode = DFmode; |
| break; |
| default: |
| /* Not supported. */ |
| return false; |
| } |
| |
| switch (regno) |
| { |
| case FLAT_SCRATCH_LO_REG: |
| case XNACK_MASK_LO_REG: |
| case TBA_LO_REG: |
| case TMA_LO_REG: |
| return (mode == SImode || mode == DImode); |
| case VCC_LO_REG: |
| case EXEC_LO_REG: |
| return (mode == BImode || mode == SImode || mode == DImode); |
| case M0_REG: |
| case FLAT_SCRATCH_HI_REG: |
| case XNACK_MASK_HI_REG: |
| case TBA_HI_REG: |
| case TMA_HI_REG: |
| return mode == SImode; |
| case VCC_HI_REG: |
| return false; |
| case EXEC_HI_REG: |
| return mode == SImode /*|| mode == V32BImode */ ; |
| case SCC_REG: |
| case VCCZ_REG: |
| case EXECZ_REG: |
| return mode == BImode; |
| } |
| if (regno == ARG_POINTER_REGNUM || regno == FRAME_POINTER_REGNUM) |
| return true; |
| if (SGPR_REGNO_P (regno)) |
| /* We restrict double register values to aligned registers. */ |
| return (sgpr_1reg_mode_p (mode) |
| || (!((regno - FIRST_SGPR_REG) & 1) && sgpr_2reg_mode_p (mode)) |
| || (((regno - FIRST_SGPR_REG) & 3) == 0 && mode == TImode)); |
| if (VGPR_REGNO_P (regno)) |
| /* Vector instructions do not care about the alignment of register |
| pairs, but where there is no 64-bit instruction, many of the |
| define_split do not work if the input and output registers partially |
| overlap. We tried to fix this with early clobber and match |
| constraints, but it was bug prone, added complexity, and conflicts |
| with the 'U0' constraints on vec_merge. |
| Therefore, we restrict ourselves to aligned registers. */ |
| return (vgpr_1reg_mode_p (mode) |
| || (!((regno - FIRST_VGPR_REG) & 1) && vgpr_2reg_mode_p (mode)) |
| /* TImode is used by DImode compare_and_swap. */ |
| || (mode == TImode |
| && !((regno - FIRST_VGPR_REG) & 3))); |
| return false; |
| } |
| |
| /* Implement REGNO_REG_CLASS via gcn.h. |
| |
| Return smallest class containing REGNO. */ |
| |
| enum reg_class |
| gcn_regno_reg_class (int regno) |
| { |
| switch (regno) |
| { |
| case SCC_REG: |
| return SCC_CONDITIONAL_REG; |
| case VCC_LO_REG: |
| case VCC_HI_REG: |
| return VCC_CONDITIONAL_REG; |
| case VCCZ_REG: |
| return VCCZ_CONDITIONAL_REG; |
| case EXECZ_REG: |
| return EXECZ_CONDITIONAL_REG; |
| case EXEC_LO_REG: |
| case EXEC_HI_REG: |
| return EXEC_MASK_REG; |
| } |
| if (VGPR_REGNO_P (regno)) |
| return VGPR_REGS; |
| if (SGPR_REGNO_P (regno)) |
| return SGPR_REGS; |
| if (regno < FIRST_VGPR_REG) |
| return GENERAL_REGS; |
| if (regno == ARG_POINTER_REGNUM || regno == FRAME_POINTER_REGNUM) |
| return AFP_REGS; |
| return ALL_REGS; |
| } |
| |
| /* Implement TARGET_CAN_CHANGE_MODE_CLASS. |
| |
| GCC assumes that the lowpart contains the first part of the value as |
| stored in memory. This is not the case for vector registers. */ |
| |
| bool |
| gcn_can_change_mode_class (machine_mode from, machine_mode to, |
| reg_class_t regclass) |
| { |
| if (!vgpr_vector_mode_p (from) && !vgpr_vector_mode_p (to)) |
| return true; |
| return (gcn_class_max_nregs (regclass, from) |
| == gcn_class_max_nregs (regclass, to)); |
| } |
| |
| /* Implement TARGET_SMALL_REGISTER_CLASSES_FOR_MODE_P. |
| |
| When this hook returns true for MODE, the compiler allows |
| registers explicitly used in the rtl to be used as spill registers |
| but prevents the compiler from extending the lifetime of these |
| registers. */ |
| |
| bool |
| gcn_small_register_classes_for_mode_p (machine_mode mode) |
| { |
| /* We allocate into exec and vcc regs; those form small register classes. */ |
| return mode == DImode || mode == SImode; |
| } |
| |
| /* Implement TARGET_CLASS_LIKELY_SPILLED_P. |
| |
| Returns true if pseudos that have been assigned to registers of class RCLASS |
| would likely be spilled because registers of RCLASS are needed for spill |
| registers. */ |
| |
| static bool |
| gcn_class_likely_spilled_p (reg_class_t rclass) |
| { |
| return (rclass == EXEC_MASK_REG |
| || reg_classes_intersect_p (ALL_CONDITIONAL_REGS, rclass)); |
| } |
| |
| /* Implement TARGET_MODES_TIEABLE_P. |
| |
| Returns true if a value of MODE1 is accessible in MODE2 without |
| copying. */ |
| |
| bool |
| gcn_modes_tieable_p (machine_mode mode1, machine_mode mode2) |
| { |
| return (GET_MODE_BITSIZE (mode1) <= MAX_FIXED_MODE_SIZE |
| && GET_MODE_BITSIZE (mode2) <= MAX_FIXED_MODE_SIZE); |
| } |
| |
| /* Implement TARGET_TRULY_NOOP_TRUNCATION. |
| |
| Returns true if it is safe to "convert" a value of INPREC bits to one of |
| OUTPREC bits (where OUTPREC is smaller than INPREC) by merely operating on |
| it as if it had only OUTPREC bits. */ |
| |
| bool |
| gcn_truly_noop_truncation (poly_uint64 outprec, poly_uint64 inprec) |
| { |
| return ((inprec <= 32) && (outprec <= inprec)); |
| } |
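| |
| /* That is, truncations that stay within one 32-bit register (such as |
| SImode to HImode) are no-ops, but DImode to SImode is not, since the |
| source value spans two registers. */ |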
| |
| /* Return N-th part of value occupying multiple registers. */ |
| |
| rtx |
| gcn_operand_part (machine_mode mode, rtx op, int n) |
| { |
| if (GET_MODE_SIZE (mode) >= 256) |
| { |
| /*gcc_assert (GET_MODE_SIZE (mode) == 256 || n == 0); */ |
| |
| if (REG_P (op)) |
| { |
| gcc_assert (REGNO (op) + n < FIRST_PSEUDO_REGISTER); |
| return gen_rtx_REG (V64SImode, REGNO (op) + n); |
| } |
| if (GET_CODE (op) == CONST_VECTOR) |
| { |
| int units = GET_MODE_NUNITS (mode); |
| rtvec v = rtvec_alloc (units); |
| |
| for (int i = 0; i < units; ++i) |
| RTVEC_ELT (v, i) = gcn_operand_part (GET_MODE_INNER (mode), |
| CONST_VECTOR_ELT (op, i), n); |
| |
| return gen_rtx_CONST_VECTOR (V64SImode, v); |
| } |
| if (GET_CODE (op) == UNSPEC && XINT (op, 1) == UNSPEC_VECTOR) |
| return gcn_gen_undef (V64SImode); |
| gcc_unreachable (); |
| } |
| else if (GET_MODE_SIZE (mode) == 8 && REG_P (op)) |
| { |
| gcc_assert (REGNO (op) + n < FIRST_PSEUDO_REGISTER); |
| return gen_rtx_REG (SImode, REGNO (op) + n); |
| } |
| else |
| { |
| if (GET_CODE (op) == UNSPEC && XINT (op, 1) == UNSPEC_VECTOR) |
| return gcn_gen_undef (SImode); |
| |
| /* If it's a constant then let's assume it is of the largest mode |
| available, otherwise simplify_gen_subreg will fail. */ |
| if (mode == VOIDmode && CONST_INT_P (op)) |
| mode = DImode; |
| return simplify_gen_subreg (SImode, op, mode, n * 4); |
| } |
| } |
| |
| /* Return N-th part of value occupying multiple registers. */ |
| |
| rtx |
| gcn_operand_doublepart (machine_mode mode, rtx op, int n) |
| { |
| return simplify_gen_subreg (DImode, op, mode, n * 8); |
| } |
| |
| /* Return true if OP can be split into subregs or high/low parts. |
| This is always true for scalars, but not normally true for vectors. |
| However, for vectors in hardregs we can use the low and high registers. */ |
| |
| bool |
| gcn_can_split_p (machine_mode, rtx op) |
| { |
| if (vgpr_vector_mode_p (GET_MODE (op))) |
| { |
| if (GET_CODE (op) == SUBREG) |
| op = SUBREG_REG (op); |
| if (!REG_P (op)) |
| return true; |
| return REGNO (op) <= FIRST_PSEUDO_REGISTER; |
| } |
| return true; |
| } |
| |
| /* Implement TARGET_SPILL_CLASS. |
| |
| Return class of registers which could be used for pseudo of MODE |
| and of class RCLASS for spilling instead of memory. Return NO_REGS |
| if it is not possible or non-profitable. */ |
| |
| static reg_class_t |
| gcn_spill_class (reg_class_t c, machine_mode /*mode */ ) |
| { |
| if (reg_classes_intersect_p (ALL_CONDITIONAL_REGS, c) |
| || c == VCC_CONDITIONAL_REG) |
| return SGPR_REGS; |
| else |
| return NO_REGS; |
| } |
| |
| /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS. |
| |
| Change allocno class for given pseudo from allocno and best class |
| calculated by IRA. */ |
| |
| static reg_class_t |
| gcn_ira_change_pseudo_allocno_class (int regno, reg_class_t cl, |
| reg_class_t best_cl) |
| { |
| /* Avoid returning classes that contain both vgpr and sgpr registers. */ |
| if (cl != ALL_REGS && cl != SRCDST_REGS && cl != ALL_GPR_REGS) |
| return cl; |
| if (best_cl != ALL_REGS && best_cl != SRCDST_REGS |
| && best_cl != ALL_GPR_REGS) |
| return best_cl; |
| |
| machine_mode mode = PSEUDO_REGNO_MODE (regno); |
| if (vgpr_vector_mode_p (mode)) |
| return VGPR_REGS; |
| |
| return GENERAL_REGS; |
| } |
| |
| /* Create a new DImode pseudo reg and emit an instruction to initialize |
| it to VAL. */ |
| |
| static rtx |
| get_exec (int64_t val) |
| { |
| rtx reg = gen_reg_rtx (DImode); |
| emit_insn (gen_rtx_SET (reg, gen_int_mode (val, DImode))); |
| return reg; |
| } |
| |
| /* Return value of scalar exec register. */ |
| |
| rtx |
| gcn_scalar_exec () |
| { |
| return const1_rtx; |
| } |
| |
| /* Return pseudo holding scalar exec register. */ |
| |
| rtx |
| gcn_scalar_exec_reg () |
| { |
| return get_exec (1); |
| } |
| |
| /* Return value of full exec register. */ |
| |
| rtx |
| gcn_full_exec () |
| { |
| return constm1_rtx; |
| } |
| |
| /* Return pseudo holding full exec register. */ |
| |
| rtx |
| gcn_full_exec_reg () |
| { |
| return get_exec (-1); |
| } |
| |
| /* }}} */ |
| /* {{{ Immediate constants. */ |
| |
| /* Initialize shared numeric constants. */ |
| |
| static void |
| init_ext_gcn_constants (void) |
| { |
| real_from_integer (&dconst4, DFmode, 4, SIGNED); |
| |
| /* FIXME: this constant probably does not match what hardware really loads. |
| Reality check it eventually. */ |
| real_from_string (&dconst1over2pi, |
| "0.1591549430918953357663423455968866839"); |
| real_convert (&dconst1over2pi, SFmode, &dconst1over2pi); |
| |
| ext_gcn_constants_init = 1; |
| } |
| |
| /* Return non-zero if X is a constant that can appear as an inline operand. |
| This is 0, 0.5, -0.5, 1, -1, 2, -2, 4, -4, 1/(2*pi), |
| or a vector of those. |
| The value returned should be the encoding of this constant. */ |
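| |
| /* (These values follow the ISA encoding of the source operand field: 128 |
| is 0, 240/241 are +/-0.5, 242/243 are +/-1.0, 244/245 are +/-2.0, |
| 246/247 are +/-4.0, and 248 is 1/(2*pi).) */ |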
| |
| int |
| gcn_inline_fp_constant_p (rtx x, bool allow_vector) |
| { |
| machine_mode mode = GET_MODE (x); |
| |
| if ((mode == V64HFmode || mode == V64SFmode || mode == V64DFmode) |
| && allow_vector) |
| { |
| int n; |
| if (GET_CODE (x) != CONST_VECTOR) |
| return 0; |
| n = gcn_inline_fp_constant_p (CONST_VECTOR_ELT (x, 0), false); |
| if (!n) |
| return 0; |
| for (int i = 1; i < 64; i++) |
| if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0)) |
| return 0; |
| return 1; |
| } |
| |
| if (mode != HFmode && mode != SFmode && mode != DFmode) |
| return 0; |
| |
| const REAL_VALUE_TYPE *r; |
| |
| if (x == CONST0_RTX (mode)) |
| return 128; |
| if (x == CONST1_RTX (mode)) |
| return 242; |
| |
| r = CONST_DOUBLE_REAL_VALUE (x); |
| |
| /* Make sure dconst4 and dconst1over2pi have been initialized before |
| comparing against them. */ |
| if (!ext_gcn_constants_init) |
| init_ext_gcn_constants (); |
| |
| if (real_identical (r, &dconstm1)) |
| return 243; |
| if (real_identical (r, &dconsthalf)) |
| return 240; |
| if (real_identical (r, &dconst2)) |
| return 244; |
| if (real_identical (r, &dconst4)) |
| return 246; |
| if (real_identical (r, &dconst1over2pi)) |
| return 248; |
| |
| /* The negative variants are encoded one above their positive |
| counterparts; real_value_negate returns the negated value. */ |
| REAL_VALUE_TYPE neg = real_value_negate (r); |
| if (real_identical (&neg, &dconsthalf)) |
| return 241; |
| if (real_identical (&neg, &dconst2)) |
| return 245; |
| if (real_identical (&neg, &dconst4)) |
| return 247; |
| |
| return 0; |
| } |
| |
| /* Return true if X is a constant that can appear as an immediate operand: |
| either an inline constant as above, or (for HFmode and SFmode) any |
| constant, since those can be emitted as a 32-bit literal (but see the |
| FIXME below). Or a vector of those. */ |
| |
| bool |
| gcn_fp_constant_p (rtx x, bool allow_vector) |
| { |
| machine_mode mode = GET_MODE (x); |
| |
| if ((mode == V64HFmode || mode == V64SFmode || mode == V64DFmode) |
| && allow_vector) |
| { |
| int n; |
| if (GET_CODE (x) != CONST_VECTOR) |
| return false; |
| n = gcn_fp_constant_p (CONST_VECTOR_ELT (x, 0), false); |
| if (!n) |
| return false; |
| for (int i = 1; i < 64; i++) |
| if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0)) |
| return false; |
| return true; |
| } |
| if (mode != HFmode && mode != SFmode && mode != DFmode) |
| return false; |
| |
| if (gcn_inline_fp_constant_p (x, false)) |
| return true; |
| /* FIXME: It is not clear how 32-bit immediates are interpreted here. */ |
| return (mode != DFmode); |
| } |
| |
| /* Return true if X is a constant representable as an inline immediate |
| constant in a 32-bit instruction encoding. */ |
| |
| bool |
| gcn_inline_constant_p (rtx x) |
| { |
| if (GET_CODE (x) == CONST_INT) |
| return INTVAL (x) >= -16 && INTVAL (x) <= 64; |
| if (GET_CODE (x) == CONST_DOUBLE) |
| return gcn_inline_fp_constant_p (x, false); |
| if (GET_CODE (x) == CONST_VECTOR) |
| { |
| int n; |
| if (!vgpr_vector_mode_p (GET_MODE (x))) |
| return false; |
| n = gcn_inline_constant_p (CONST_VECTOR_ELT (x, 0)); |
| if (!n) |
| return false; |
| for (int i = 1; i < 64; i++) |
| if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0)) |
| return false; |
| return true; |
| } |
| return false; |
| } |
| |
| /* Return true if X is a constant representable as an immediate constant |
| in a 32 or 64-bit instruction encoding. */ |
| |
| bool |
| gcn_constant_p (rtx x) |
| { |
| switch (GET_CODE (x)) |
| { |
| case CONST_INT: |
| return true; |
| |
| case CONST_DOUBLE: |
| return gcn_fp_constant_p (x, false); |
| |
| case CONST_VECTOR: |
| { |
| int n; |
| if (!vgpr_vector_mode_p (GET_MODE (x))) |
| return false; |
| n = gcn_constant_p (CONST_VECTOR_ELT (x, 0)); |
| if (!n) |
| return false; |
| for (int i = 1; i < 64; i++) |
| if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0)) |
| return false; |
| return true; |
| } |
| |
| case SYMBOL_REF: |
| case LABEL_REF: |
| return true; |
| |
| default: |
| ; |
| } |
| |
| return false; |
| } |
| |
| /* Return true if X is a constant representable as two inline immediate |
| constants in a 64-bit instruction that is split into two 32-bit |
| instructions. |
| When MIXED is set, the low-part is permitted to use the full 32-bits. */ |
| |
| bool |
| gcn_inline_constant64_p (rtx x, bool mixed) |
| { |
| if (GET_CODE (x) == CONST_VECTOR) |
| { |
| if (!vgpr_vector_mode_p (GET_MODE (x))) |
| return false; |
| if (!gcn_inline_constant64_p (CONST_VECTOR_ELT (x, 0), mixed)) |
| return false; |
| for (int i = 1; i < 64; i++) |
| if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0)) |
| return false; |
| |
| return true; |
| } |
| |
| if (GET_CODE (x) != CONST_INT) |
| return false; |
| |
| rtx val_lo = gcn_operand_part (DImode, x, 0); |
| rtx val_hi = gcn_operand_part (DImode, x, 1); |
| return ((mixed || gcn_inline_constant_p (val_lo)) |
| && gcn_inline_constant_p (val_hi)); |
| } |
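| |
| /* For example, the CONST_INT 0x0000000400000002 has a low part of 2 and |
| a high part of 4, both inline immediates, so it qualifies even when |
| MIXED is false. */ |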
| |
| /* Return true if X is a constant representable as an immediate constant |
| in a 32 or 64-bit instruction encoding where the hardware will |
| extend the immediate to 64-bits. */ |
| |
| bool |
| gcn_constant64_p (rtx x) |
| { |
| if (!gcn_constant_p (x)) |
| return false; |
| |
| if (GET_CODE (x) != CONST_INT) |
| return true; |
| |
| /* Negative numbers are only allowed if they can be encoded within src0, |
| because the 32-bit immediates do not get sign-extended. |
| Unsigned numbers must not be encodable as 32-bit -1..-16, because the |
| assembler will use a src0 inline immediate and that will get |
| sign-extended. */ |
| HOST_WIDE_INT val = INTVAL (x); |
| return (((val & 0xffffffff) == val /* Positive 32-bit. */ |
| && (val & 0xfffffff0) != 0xfffffff0) /* Not -1..-16. */ |
| || gcn_inline_constant_p (x)); /* Src0. */ |
| } |
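| |
| /* For example, 0x12345678 can be emitted as a positive 32-bit literal, |
| but 0xfffffff5 cannot: the assembler would encode it as the src0 inline |
| constant -11, which the hardware then sign-extends to 64 bits. */ |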
| |
| /* Implement TARGET_LEGITIMATE_CONSTANT_P. |
| |
| Returns true if X is a legitimate constant for a MODE immediate operand. */ |
| |
| bool |
| gcn_legitimate_constant_p (machine_mode, rtx x) |
| { |
| return gcn_constant_p (x); |
| } |
| |
| /* Return true if X is a CONST_VECTOR that duplicates a single constant. */ |
| |
| static bool |
| single_cst_vector_p (rtx x) |
| { |
| if (GET_CODE (x) != CONST_VECTOR) |
| return false; |
| for (int i = 1; i < 64; i++) |
| if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0)) |
| return false; |
| return true; |
| } |
| |
| /* Create a CONST_VECTOR of duplicated value A. */ |
| |
| rtx |
| gcn_vec_constant (machine_mode mode, int a) |
| { |
| /*if (!a) |
| return CONST0_RTX (mode); |
| if (a == -1) |
| return CONSTM1_RTX (mode); |
| if (a == 1) |
| return CONST1_RTX (mode); |
| if (a == 2) |
| return CONST2_RTX (mode);*/ |
| |
| int units = GET_MODE_NUNITS (mode); |
| machine_mode innermode = GET_MODE_INNER (mode); |
| |
| rtx tem; |
| if (FLOAT_MODE_P (innermode)) |
| { |
| REAL_VALUE_TYPE rv; |
| real_from_integer (&rv, NULL, a, SIGNED); |
| tem = const_double_from_real_value (rv, innermode); |
| } |
| else |
| tem = gen_int_mode (a, innermode); |
| |
| rtvec v = rtvec_alloc (units); |
| for (int i = 0; i < units; ++i) |
| RTVEC_ELT (v, i) = tem; |
| |
| return gen_rtx_CONST_VECTOR (mode, v); |
| } |
| |
| /* Create a CONST_VECTOR of duplicated value A. */ |
| |
| rtx |
| gcn_vec_constant (machine_mode mode, rtx a) |
| { |
| int units = GET_MODE_NUNITS (mode); |
| rtvec v = rtvec_alloc (units); |
| |
| for (int i = 0; i < units; ++i) |
| RTVEC_ELT (v, i) = a; |
| |
| return gen_rtx_CONST_VECTOR (mode, v); |
| } |
| |
| /* Create an undefined vector value, used where an insn operand is |
| optional. */ |
| |
| rtx |
| gcn_gen_undef (machine_mode mode) |
| { |
| return gen_rtx_UNSPEC (mode, gen_rtvec (1, const0_rtx), UNSPEC_VECTOR); |
| } |
| |
| /* }}} */ |
| /* {{{ Addresses, pointers and moves. */ |
| |
| /* Return true if REG is a valid place to store a pointer, |
| for instructions that require an SGPR. |
| FIXME rename. */ |
| |
| static bool |
| gcn_address_register_p (rtx reg, machine_mode mode, bool strict) |
| { |
| if (GET_CODE (reg) == SUBREG) |
| reg = SUBREG_REG (reg); |
| |
| if (!REG_P (reg)) |
| return false; |
| |
| if (GET_MODE (reg) != mode) |
| return false; |
| |
| int regno = REGNO (reg); |
| |
| if (regno >= FIRST_PSEUDO_REGISTER) |
| { |
| if (!strict) |
| return true; |
| |
| if (!reg_renumber) |
| return false; |
| |
| regno = reg_renumber[regno]; |
| } |
| |
| return (SGPR_REGNO_P (regno) || regno == M0_REG |
| || regno == ARG_POINTER_REGNUM || regno == FRAME_POINTER_REGNUM); |
| } |
| |
| /* Return true if REG is a valid place to store a pointer, |
| for instructions that require a VGPR. */ |
| |
| static bool |
| gcn_vec_address_register_p (rtx reg, machine_mode mode, bool strict) |
| { |
| if (GET_CODE (reg) == SUBREG) |
| reg = SUBREG_REG (reg); |
| |
| if (!REG_P (reg)) |
| return false; |
| |
| if (GET_MODE (reg) != mode) |
| return false; |
| |
| int regno = REGNO (reg); |
| |
| if (regno >= FIRST_PSEUDO_REGISTER) |
| { |
| if (!strict) |
| return true; |
| |
| if (!reg_renumber) |
| return false; |
| |
| regno = reg_renumber[regno]; |
| } |
| |
| return VGPR_REGNO_P (regno); |
| } |
| |
| /* Return true if X would be valid inside a MEM using the Flat address |
| space. */ |
| |
| bool |
| gcn_flat_address_p (rtx x, machine_mode mode) |
| { |
| bool vec_mode = (GET_MODE_CLASS (mode) == MODE_VECTOR_INT |
| || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT); |
| |
| if (vec_mode && gcn_address_register_p (x, DImode, false)) |
| return true; |
| |
| if (!vec_mode && gcn_vec_address_register_p (x, DImode, false)) |
| return true; |
| |
| if (TARGET_GCN5_PLUS |
| && GET_CODE (x) == PLUS |
| && gcn_vec_address_register_p (XEXP (x, 0), DImode, false) |
| && CONST_INT_P (XEXP (x, 1))) |
| return true; |
| |
| return false; |
| } |
| |
| /* Return true if X would be valid inside a MEM using the Scalar Flat |
| address space. */ |
| |
| bool |
| gcn_scalar_flat_address_p (rtx x) |
| { |
| if (gcn_address_register_p (x, DImode, false)) |
| return true; |
| |
| if (GET_CODE (x) == PLUS |
| && gcn_address_register_p (XEXP (x, 0), DImode, false) |
| && CONST_INT_P (XEXP (x, 1))) |
| return true; |
| |
| return false; |
| } |
| |
| /* Return true if MEM X would be valid for the Scalar Flat address space. */ |
| |
| bool |
| gcn_scalar_flat_mem_p (rtx x) |
| { |
| if (!MEM_P (x)) |
| return false; |
| |
| if (GET_MODE_SIZE (GET_MODE (x)) < 4) |
| return false; |
| |
| return gcn_scalar_flat_address_p (XEXP (x, 0)); |
| } |
| |
| /* Return true if X would be valid inside a MEM using the LDS or GDS |
| address spaces. */ |
| |
| bool |
| gcn_ds_address_p (rtx x) |
| { |
| if (gcn_vec_address_register_p (x, SImode, false)) |
| return true; |
| |
| if (GET_CODE (x) == PLUS |
| && gcn_vec_address_register_p (XEXP (x, 0), SImode, false) |
| && CONST_INT_P (XEXP (x, 1))) |
| return true; |
| |
| return false; |
| } |
| |
| /* Return true if ADDR would be valid inside a MEM using the Global |
| address space. */ |
| |
| bool |
| gcn_global_address_p (rtx addr) |
| { |
| if (gcn_address_register_p (addr, DImode, false) |
| || gcn_vec_address_register_p (addr, DImode, false)) |
| return true; |
| |
| if (GET_CODE (addr) == PLUS) |
| { |
| rtx base = XEXP (addr, 0); |
| rtx offset = XEXP (addr, 1); |
| bool immediate_p = (CONST_INT_P (offset) |
| && INTVAL (offset) >= -(1 << 12) |
| && INTVAL (offset) < (1 << 12)); |
| |
| if ((gcn_address_register_p (base, DImode, false) |
| || gcn_vec_address_register_p (base, DImode, false)) |
| && immediate_p) |
| /* SGPR + CONST or VGPR + CONST */ |
| return true; |
| |
| if (gcn_address_register_p (base, DImode, false) |
| && gcn_vgpr_register_operand (offset, SImode)) |
| /* SGPR + VGPR */ |
| return true; |
| |
| if (GET_CODE (base) == PLUS |
| && gcn_address_register_p (XEXP (base, 0), DImode, false) |
| && gcn_vgpr_register_operand (XEXP (base, 1), SImode) |
| && immediate_p) |
| /* (SGPR + VGPR) + CONST */ |
| return true; |
| } |
| |
| return false; |
| } |
| |
| /* Implement TARGET_ADDR_SPACE_LEGITIMATE_ADDRESS_P. |
| |
| Recognizes RTL expressions that are valid memory addresses for an |
| instruction. The MODE argument is the machine mode for the MEM |
| expression that wants to use this address. |
| |
| It only recognizes address in canonical form. LEGITIMIZE_ADDRESS should |
| convert common non-canonical forms to canonical form so that they will |
| be recognized. */ |
| |
| static bool |
| gcn_addr_space_legitimate_address_p (machine_mode mode, rtx x, bool strict, |
| addr_space_t as) |
| { |
| /* All vector instructions need to work on addresses in registers. */ |
| if (!TARGET_GCN5_PLUS && (vgpr_vector_mode_p (mode) && !REG_P (x))) |
| return false; |
| |
| if (AS_SCALAR_FLAT_P (as)) |
| { |
| if (mode == QImode || mode == HImode) |
| return 0; |
| |
| switch (GET_CODE (x)) |
| { |
| case REG: |
| return gcn_address_register_p (x, DImode, strict); |
| /* Addresses are in the form BASE+OFFSET. |
| OFFSET is either a 20-bit unsigned immediate, an SGPR, or M0. |
| Writes and atomics do not accept SGPR. */ |
| case PLUS: |
| { |
| rtx x0 = XEXP (x, 0); |
| rtx x1 = XEXP (x, 1); |
| if (!gcn_address_register_p (x0, DImode, strict)) |
| return false; |
| /* FIXME: This is disabled because of the mode mismatch between |
| SImode (for the address or m0 register) and the DImode PLUS. |
| We'll need a zero_extend or similar. |
| |
| if (gcn_m0_register_p (x1, SImode, strict) |
| || gcn_address_register_p (x1, SImode, strict)) |
| return true; |
| else*/ |
| if (GET_CODE (x1) == CONST_INT) |
| { |
| if (INTVAL (x1) >= 0 && INTVAL (x1) < (1 << 20) |
| /* The low bits of the offset are ignored, even when |
| they're meant to realign the pointer. */ |
| && !(INTVAL (x1) & 0x3)) |
| return true; |
| } |
| return false; |
| } |
| |
| default: |
| break; |
| } |
| } |
| else if (AS_SCRATCH_P (as)) |
| return gcn_address_register_p (x, SImode, strict); |
| else if (AS_FLAT_P (as) || AS_FLAT_SCRATCH_P (as)) |
| { |
| if (TARGET_GCN3 || GET_CODE (x) == REG) |
| return ((GET_MODE_CLASS (mode) == MODE_VECTOR_INT |
| || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT) |
| ? gcn_address_register_p (x, DImode, strict) |
| : gcn_vec_address_register_p (x, DImode, strict)); |
| else |
| { |
| gcc_assert (TARGET_GCN5_PLUS); |
| |
| if (GET_CODE (x) == PLUS) |
| { |
| rtx x1 = XEXP (x, 1); |
| |
| if (VECTOR_MODE_P (mode) |
| ? !gcn_address_register_p (XEXP (x, 0), DImode, strict) |
| : !gcn_vec_address_register_p (XEXP (x, 0), DImode, strict)) |
| return false; |
| |
| if (GET_CODE (x1) == CONST_INT) |
| { |
| if (INTVAL (x1) >= 0 && INTVAL (x1) < (1 << 12) |
| /* The low bits of the offset are ignored, even when |
| they're meant to realign the pointer. */ |
| && !(INTVAL (x1) & 0x3)) |
| return true; |
| } |
| } |
| return false; |
| } |
| } |
| else if (AS_GLOBAL_P (as)) |
| { |
| gcc_assert (TARGET_GCN5_PLUS); |
| |
| if (GET_CODE (x) == REG) |
| return (gcn_address_register_p (x, DImode, strict) |
| || (!VECTOR_MODE_P (mode) |
| && gcn_vec_address_register_p (x, DImode, strict))); |
| else if (GET_CODE (x) == PLUS) |
| { |
| rtx base = XEXP (x, 0); |
| rtx offset = XEXP (x, 1); |
| |
| bool immediate_p = (GET_CODE (offset) == CONST_INT |
| /* Signed 13-bit immediate. */ |
| && INTVAL (offset) >= -(1 << 12) |
| && INTVAL (offset) < (1 << 12) |
| /* The low bits of the offset are ignored, even |
| when they're meant to realign the pointer. */ |
| && !(INTVAL (offset) & 0x3)); |
| |
| if (!VECTOR_MODE_P (mode)) |
| { |
| if ((gcn_address_register_p (base, DImode, strict) |
| || gcn_vec_address_register_p (base, DImode, strict)) |
| && immediate_p) |
| /* SGPR + CONST or VGPR + CONST */ |
| return true; |
| |
| if (gcn_address_register_p (base, DImode, strict) |
| && gcn_vgpr_register_operand (offset, SImode)) |
| /* SGPR + VGPR */ |
| return true; |
| |
| if (GET_CODE (base) == PLUS |
| && gcn_address_register_p (XEXP (base, 0), DImode, strict) |
| && gcn_vgpr_register_operand (XEXP (base, 1), SImode) |
| && immediate_p) |
| /* (SGPR + VGPR) + CONST */ |
| return true; |
| } |
| else |
| { |
| if (gcn_address_register_p (base, DImode, strict) |
| && immediate_p) |
| /* SGPR + CONST */ |
| return true; |
| } |
| } |
| else |
| return false; |
| } |
| else if (AS_ANY_DS_P (as)) |
| switch (GET_CODE (x)) |
| { |
| case REG: |
| return (VECTOR_MODE_P (mode) |
| ? gcn_address_register_p (x, SImode, strict) |
| : gcn_vec_address_register_p (x, SImode, strict)); |
| /* Addresses are in the form BASE+OFFSET. |
| OFFSET is either a 20-bit unsigned immediate, an SGPR, or M0. |
| Writes and atomics do not accept SGPR. */ |
| case PLUS: |
| { |
| rtx x0 = XEXP (x, 0); |
| rtx x1 = XEXP (x, 1); |
| if (!gcn_vec_address_register_p (x0, DImode, strict)) |
| return false; |
| if (GET_CODE (x1) == REG) |
| { |
| if (GET_CODE (x1) != REG |
| || (REGNO (x1) <= FIRST_PSEUDO_REGISTER |
| && !gcn_ssrc_register_operand (x1, DImode))) |
| return false; |
| } |
| else if (GET_CODE (x1) == CONST_VECTOR |
| && GET_CODE (CONST_VECTOR_ELT (x1, 0)) == CONST_INT |
| && single_cst_vector_p (x1)) |
| { |
| x1 = CONST_VECTOR_ELT (x1, 0); |
| if (INTVAL (x1) >= 0 && INTVAL (x1) < (1 << 20)) |
| return true; |
| } |
| return false; |
| } |
| |
| default: |
| break; |
| } |
| else |
| gcc_unreachable (); |
| return false; |
| } |
| |
| /* Implement TARGET_ADDR_SPACE_POINTER_MODE. |
| |
| Return the appropriate mode for a named address pointer. */ |
| |
| static scalar_int_mode |
| gcn_addr_space_pointer_mode (addr_space_t addrspace) |
| { |
| switch (addrspace) |
| { |
| case ADDR_SPACE_SCRATCH: |
| case ADDR_SPACE_LDS: |
| case ADDR_SPACE_GDS: |
| return SImode; |
| case ADDR_SPACE_DEFAULT: |
| case ADDR_SPACE_FLAT: |
| case ADDR_SPACE_FLAT_SCRATCH: |
| case ADDR_SPACE_SCALAR_FLAT: |
| return DImode; |
| default: |
| gcc_unreachable (); |
| } |
| } |
| |
| /* Implement TARGET_ADDR_SPACE_ADDRESS_MODE. |
| |
| Return the appropriate mode for a named address space address. */ |
| |
| static scalar_int_mode |
| gcn_addr_space_address_mode (addr_space_t addrspace) |
| { |
| return gcn_addr_space_pointer_mode (addrspace); |
| } |
| |
| /* Implement TARGET_ADDR_SPACE_SUBSET_P. |
| |
| Determine if one named address space is a subset of another. */ |
| |
| static bool |
| gcn_addr_space_subset_p (addr_space_t subset, addr_space_t superset) |
| { |
| if (subset == superset) |
| return true; |
| /* FIXME is this true? */ |
| if (AS_FLAT_P (superset) || AS_SCALAR_FLAT_P (superset)) |
| return true; |
| return false; |
| } |
| |
| /* Convert from one address space to another. */ |
| |
| static rtx |
| gcn_addr_space_convert (rtx op, tree from_type, tree to_type) |
| { |
| gcc_assert (POINTER_TYPE_P (from_type)); |
| gcc_assert (POINTER_TYPE_P (to_type)); |
| |
| addr_space_t as_from = TYPE_ADDR_SPACE (TREE_TYPE (from_type)); |
| addr_space_t as_to = TYPE_ADDR_SPACE (TREE_TYPE (to_type)); |
| |
| if (AS_LDS_P (as_from) && AS_FLAT_P (as_to)) |
| { |
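| /* An LDS pointer is a 32-bit offset within the group segment; the |
| equivalent flat pointer has that offset in its low half and the |
| group-segment aperture (read from offset 64 of the HSA queue object |
| below) in its high half. */ |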
| rtx queue = gen_rtx_REG (DImode, |
| cfun->machine->args.reg[QUEUE_PTR_ARG]); |
| rtx group_seg_aperture_hi = gen_rtx_MEM (SImode, |
| gen_rtx_PLUS (DImode, queue, |
| gen_int_mode (64, SImode))); |
| rtx tmp = gen_reg_rtx (DImode); |
| |
| emit_move_insn (gen_lowpart (SImode, tmp), op); |
| emit_move_insn (gen_highpart_mode (SImode, DImode, tmp), |
| group_seg_aperture_hi); |
| |
| return tmp; |
| } |
| else if (as_from == as_to) |
| return op; |
| else |
| gcc_unreachable (); |
| } |
| |
| /* Implement TARGET_ADDR_SPACE_DEBUG. |
| |
| Return the dwarf address space class for each hardware address space. */ |
| |
| static int |
| gcn_addr_space_debug (addr_space_t as) |
| { |
| switch (as) |
| { |
| case ADDR_SPACE_DEFAULT: |
| case ADDR_SPACE_FLAT: |
| case ADDR_SPACE_SCALAR_FLAT: |
| case ADDR_SPACE_FLAT_SCRATCH: |
| return DW_ADDR_none; |
| case ADDR_SPACE_GLOBAL: |
| return 1; // DW_ADDR_LLVM_global |
| case ADDR_SPACE_LDS: |
| return 3; // DW_ADDR_LLVM_group |
| case ADDR_SPACE_SCRATCH: |
| return 4; // DW_ADDR_LLVM_private |
| case ADDR_SPACE_GDS: |
| return 0x8000; // DW_ADDR_AMDGPU_region |
| } |
| gcc_unreachable (); |
| } |
| |
| |
| /* Implement REGNO_MODE_CODE_OK_FOR_BASE_P via gcn.h |
| |
| Return true if REGNO is OK for memory addressing. */ |
| |
| bool |
| gcn_regno_mode_code_ok_for_base_p (int regno, |
| machine_mode, addr_space_t as, int, int) |
| { |
| if (regno >= FIRST_PSEUDO_REGISTER) |
| { |
| if (reg_renumber) |
| regno = reg_renumber[regno]; |
| else |
| return true; |
| } |
| if (AS_FLAT_P (as)) |
| return (VGPR_REGNO_P (regno) |
| || regno == ARG_POINTER_REGNUM || regno == FRAME_POINTER_REGNUM); |
| else if (AS_SCALAR_FLAT_P (as)) |
| return (SGPR_REGNO_P (regno) |
| || regno == ARG_POINTER_REGNUM || regno == FRAME_POINTER_REGNUM); |
| else if (AS_GLOBAL_P (as)) |
| { |
| return (SGPR_REGNO_P (regno) |
| || VGPR_REGNO_P (regno) |
| || regno == ARG_POINTER_REGNUM |
| || regno == FRAME_POINTER_REGNUM); |
| } |
| else |
| /* For now. */ |
| return false; |
| } |
| |
| /* Implement MODE_CODE_BASE_REG_CLASS via gcn.h. |
| |
| Return a suitable register class for memory addressing. */ |
| |
| reg_class |
| gcn_mode_code_base_reg_class (machine_mode mode, addr_space_t as, int oc, |
| int ic) |
| { |
| switch (as) |
| { |
| case ADDR_SPACE_DEFAULT: |
| return gcn_mode_code_base_reg_class (mode, DEFAULT_ADDR_SPACE, oc, ic); |
| case ADDR_SPACE_SCALAR_FLAT: |
| case ADDR_SPACE_SCRATCH: |
| return SGPR_REGS; |
| case ADDR_SPACE_FLAT: |
| case ADDR_SPACE_FLAT_SCRATCH: |
| case ADDR_SPACE_LDS: |
| case ADDR_SPACE_GDS: |
| return ((GET_MODE_CLASS (mode) == MODE_VECTOR_INT |
| || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT) |
| ? SGPR_REGS : VGPR_REGS); |
| case ADDR_SPACE_GLOBAL: |
| return ((GET_MODE_CLASS (mode) == MODE_VECTOR_INT |
| || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT) |
| ? SGPR_REGS : ALL_GPR_REGS); |
| } |
| gcc_unreachable (); |
| } |
| |
| /* Implement REGNO_OK_FOR_INDEX_P via gcn.h. |
| |
| Return true if REGNO is OK for index of memory addressing. */ |
| |
| bool |
| regno_ok_for_index_p (int regno) |
| { |
| if (regno >= FIRST_PSEUDO_REGISTER) |
| { |
| if (reg_renumber) |
| regno = reg_renumber[regno]; |
| else |
| return true; |
| } |
| return regno == M0_REG || VGPR_REGNO_P (regno); |
| } |
| |
| /* Generate move which uses the exec flags. If EXEC is NULL, then it is |
| assumed that all lanes normally relevant to the mode of the move are |
| affected. If PREV is NULL, then a sensible default is supplied for |
| the inactive lanes. */ |
| |
| static rtx |
| gen_mov_with_exec (rtx op0, rtx op1, rtx exec = NULL, rtx prev = NULL) |
| { |
| machine_mode mode = GET_MODE (op0); |
| |
| if (vgpr_vector_mode_p (mode)) |
| { |
| if (exec && exec != CONSTM1_RTX (DImode)) |
| { |
| if (!prev) |
| prev = op0; |
| } |
| else |
| { |
| if (!prev) |
| prev = gcn_gen_undef (mode); |
| exec = gcn_full_exec_reg (); |
| } |
| |
| rtx set = gen_rtx_SET (op0, gen_rtx_VEC_MERGE (mode, op1, prev, exec)); |
| |
| return gen_rtx_PARALLEL (VOIDmode, |
| gen_rtvec (2, set, |
| gen_rtx_CLOBBER (VOIDmode, |
| gen_rtx_SCRATCH (V64DImode)))); |
| } |
| |
| return (gen_rtx_PARALLEL |
| (VOIDmode, |
| gen_rtvec (2, gen_rtx_SET (op0, op1), |
| gen_rtx_USE (VOIDmode, |
| exec ? exec : gcn_scalar_exec ())))); |
| } |
| |
| /* Generate masked move. */ |
| |
| static rtx |
| gen_duplicate_load (rtx op0, rtx op1, rtx op2 = NULL, rtx exec = NULL) |
| { |
| if (exec) |
| return (gen_rtx_SET (op0, |
| gen_rtx_VEC_MERGE (GET_MODE (op0), |
| gen_rtx_VEC_DUPLICATE (GET_MODE |
| (op0), op1), |
| op2, exec))); |
| else |
| return (gen_rtx_SET (op0, gen_rtx_VEC_DUPLICATE (GET_MODE (op0), op1))); |
| } |
| |
| /* Expand vector init of OP0 by VEC. |
| Implements vec_init instruction pattern. */ |
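| |
| /* The expansion emits one move per distinct value: for example, for |
| { 3, x, 3, x, ... } the constant 3 is first written to all lanes, then |
| a masked duplicate overwrites just the lanes holding x, using an EXEC |
| mask built from the set of equal lanes. */ |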
| |
| void |
| gcn_expand_vector_init (rtx op0, rtx vec) |
| { |
| int64_t initialized_mask = 0; |
| int64_t curr_mask = 1; |
| machine_mode mode = GET_MODE (op0); |
| |
| rtx val = XVECEXP (vec, 0, 0); |
| |
| for (int i = 1; i < 64; i++) |
| if (rtx_equal_p (val, XVECEXP (vec, 0, i))) |
| curr_mask |= (int64_t) 1 << i; |
| |
| if (gcn_constant_p (val)) |
| emit_move_insn (op0, gcn_vec_constant (mode, val)); |
| else |
| { |
| val = force_reg (GET_MODE_INNER (mode), val); |
| emit_insn (gen_duplicate_load (op0, val)); |
| } |
| initialized_mask |= curr_mask; |
| for (int i = 1; i < 64; i++) |
| if (!(initialized_mask & ((int64_t) 1 << i))) |
| { |
| curr_mask = (int64_t) 1 << i; |
| rtx val = XVECEXP (vec, 0, i); |
| |
| for (int j = i + 1; j < 64; j++) |
| if (rtx_equal_p (val, XVECEXP (vec, 0, j))) |
| curr_mask |= (int64_t) 1 << j; |
| if (gcn_constant_p (val)) |
| emit_insn (gen_mov_with_exec (op0, gcn_vec_constant (mode, val), |
| get_exec (curr_mask))); |
| else |
| { |
| val = force_reg (GET_MODE_INNER (mode), val); |
| emit_insn (gen_duplicate_load (op0, val, op0, |
| get_exec (curr_mask))); |
| } |
| initialized_mask |= curr_mask; |
| } |
| } |
| |
| /* Load vector constant where n-th lane contains BASE+n*VAL. */ |
| |
| static rtx |
| strided_constant (machine_mode mode, int base, int val) |
| { |
| rtx x = gen_reg_rtx (mode); |
| emit_move_insn (x, gcn_vec_constant (mode, base)); |
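| /* Build N*VAL in lane N by binary decomposition of the lane number: |
| the masked add for bit K adds VAL << K to exactly those lanes whose |
| number has bit K set, so lane N accumulates BASE + N*VAL. */ |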
| emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 32), |
| x, get_exec (0xffffffff00000000))); |
| emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 16), |
| x, get_exec (0xffff0000ffff0000))); |
| emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 8), |
| x, get_exec (0xff00ff00ff00ff00))); |
| emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 4), |
| x, get_exec (0xf0f0f0f0f0f0f0f0))); |
| emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 2), |
| x, get_exec (0xcccccccccccccccc))); |
| emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 1), |
| x, get_exec (0xaaaaaaaaaaaaaaaa))); |
| return x; |
| } |
| |
| /* Implement TARGET_ADDR_SPACE_LEGITIMIZE_ADDRESS. */ |
| |
| static rtx |
| gcn_addr_space_legitimize_address (rtx x, rtx old, machine_mode mode, |
| addr_space_t as) |
| { |
| switch (as) |
| { |
| case ADDR_SPACE_DEFAULT: |
| return gcn_addr_space_legitimize_address (x, old, mode, |
| DEFAULT_ADDR_SPACE); |
| case ADDR_SPACE_SCALAR_FLAT: |
| case ADDR_SPACE_SCRATCH: |
| /* Instructions working on vectors need the address to be in |
| a register. */ |
| if (vgpr_vector_mode_p (mode)) |
| return force_reg (GET_MODE (x), x); |
| |
| return x; |
| case ADDR_SPACE_FLAT: |
| case ADDR_SPACE_FLAT_SCRATCH: |
| case ADDR_SPACE_GLOBAL: |
| return TARGET_GCN3 ? force_reg (DImode, x) : x; |
| case ADDR_SPACE_LDS: |
| case ADDR_SPACE_GDS: |
| /* FIXME: LDS supports offsets; handle them! */ |
| if (vgpr_vector_mode_p (mode) && GET_MODE (x) != V64SImode) |
| { |
| rtx addrs = gen_reg_rtx (V64SImode); |
| rtx base = force_reg (SImode, x); |
| rtx offsets = strided_constant (V64SImode, 0, |
| GET_MODE_UNIT_SIZE (mode)); |
| |
| emit_insn (gen_vec_duplicatev64si (addrs, base)); |
| emit_insn (gen_addv64si3 (addrs, offsets, addrs)); |
| return addrs; |
| } |
| return x; |
| } |
| gcc_unreachable (); |
| } |
| |
| /* Convert a (mem:<MODE> (reg:DI)) to (mem:<MODE> (reg:V64DI)) with the |
| proper vector of stepped addresses. |
| |
| MEM will be a DImode address of a vector in an SGPR. |
| TMP will be a V64DImode VGPR pair or (scratch:V64DI). */ |
| |
| rtx |
| gcn_expand_scalar_to_vector_address (machine_mode mode, rtx exec, rtx mem, |
| rtx tmp) |
| { |
| gcc_assert (MEM_P (mem)); |
| rtx mem_base = XEXP (mem, 0); |
| rtx mem_index = NULL_RTX; |
| |
| if (!TARGET_GCN5_PLUS) |
| { |
| /* gcn_addr_space_legitimize_address should have put the address in a |
| register. If not, it is too late to do anything about it. */ |
| gcc_assert (REG_P (mem_base)); |
| } |
| |
| if (GET_CODE (mem_base) == PLUS) |
| { |
| mem_index = XEXP (mem_base, 1); |
| mem_base = XEXP (mem_base, 0); |
| } |
| |
| /* RF and RM base registers for vector modes should always be an SGPR. */ |
| gcc_assert (SGPR_REGNO_P (REGNO (mem_base)) |
| || REGNO (mem_base) >= FIRST_PSEUDO_REGISTER); |
| |
| machine_mode inner = GET_MODE_INNER (mode); |
| int shift = exact_log2 (GET_MODE_SIZE (inner)); |
| rtx ramp = gen_rtx_REG (V64SImode, VGPR_REGNO (1)); |
| rtx undef_v64si = gcn_gen_undef (V64SImode); |
| rtx new_base = NULL_RTX; |
| addr_space_t as = MEM_ADDR_SPACE (mem); |
| |
| rtx tmplo = (REG_P (tmp) |
| ? gcn_operand_part (V64DImode, tmp, 0) |
| : gen_reg_rtx (V64SImode)); |
| |
| /* tmplo[:] = ramp[:] << shift */ |
| if (exec) |
| emit_insn (gen_ashlv64si3_exec (tmplo, ramp, |
| gen_int_mode (shift, SImode), |
| undef_v64si, exec)); |
| else |
| emit_insn (gen_ashlv64si3 (tmplo, ramp, gen_int_mode (shift, SImode))); |
| |
| if (AS_FLAT_P (as)) |
| { |
| rtx vcc = gen_rtx_REG (DImode, CC_SAVE_REG); |
| |
| if (REG_P (tmp)) |
| { |
| rtx mem_base_lo = gcn_operand_part (DImode, mem_base, 0); |
| rtx mem_base_hi = gcn_operand_part (DImode, mem_base, 1); |
| rtx tmphi = gcn_operand_part (V64DImode, tmp, 1); |
| |
| /* tmphi[:] = mem_base_hi */ |
| if (exec) |
| emit_insn (gen_vec_duplicatev64si_exec (tmphi, mem_base_hi, |
| undef_v64si, exec)); |
| else |
| emit_insn (gen_vec_duplicatev64si (tmphi, mem_base_hi)); |
| |
| /* tmp[:] += zext (mem_base) */ |
| if (exec) |
| { |
| emit_insn (gen_addv64si3_vcc_dup_exec (tmplo, mem_base_lo, tmplo, |
| vcc, undef_v64si, exec)); |
| emit_insn (gen_addcv64si3_exec (tmphi, tmphi, const0_rtx, |
| vcc, vcc, undef_v64si, exec)); |
| } |
| else |
| emit_insn (gen_addv64di3_vcc_zext_dup (tmp, mem_base_lo, tmp, vcc)); |
| } |
| else |
| { |
| tmp = gen_reg_rtx (V64DImode); |
| if (exec) |
| emit_insn (gen_addv64di3_vcc_zext_dup2_exec |
| (tmp, tmplo, mem_base, vcc, gcn_gen_undef (V64DImode), |
| exec)); |
| else |
| emit_insn (gen_addv64di3_vcc_zext_dup2 (tmp, tmplo, mem_base, vcc)); |
| } |
| |
| new_base = tmp; |
| } |
| else if (AS_ANY_DS_P (as)) |
| { |
| if (!exec) |
| emit_insn (gen_addv64si3_dup (tmplo, tmplo, mem_base)); |
| else |
| emit_insn (gen_addv64si3_dup_exec (tmplo, tmplo, mem_base, |
| gcn_gen_undef (V64SImode), exec)); |
| new_base = tmplo; |
| } |
| else |
| { |
| mem_base = gen_rtx_VEC_DUPLICATE (V64DImode, mem_base); |
| new_base = gen_rtx_PLUS (V64DImode, mem_base, |
| gen_rtx_SIGN_EXTEND (V64DImode, tmplo)); |
| } |
| |
| return gen_rtx_PLUS (GET_MODE (new_base), new_base, |
| gen_rtx_VEC_DUPLICATE (GET_MODE (new_base), |
| (mem_index ? mem_index |
| : const0_rtx))); |
| } |
| |
| /* Convert a BASE address, a vector of OFFSETS, and a SCALE, to addresses |
| suitable for the given address space. This is intended for use in |
| gather/scatter patterns. |
| |
| The offsets may be signed or unsigned, according to UNSIGNED_P. |
| If EXEC is set then _exec patterns will be used, otherwise plain. |
| |
| Return values: |
| ADDR_SPACE_FLAT - return V64DImode vector of absolute addresses. |
| ADDR_SPACE_GLOBAL - return V64SImode vector of offsets. */ |
| |
| rtx |
| gcn_expand_scaled_offsets (addr_space_t as, rtx base, rtx offsets, rtx scale, |
| bool unsigned_p, rtx exec) |
| { |
| rtx tmpsi = gen_reg_rtx (V64SImode); |
| rtx tmpdi = gen_reg_rtx (V64DImode); |
| rtx undefsi = exec ? gcn_gen_undef (V64SImode) : NULL; |
| rtx undefdi = exec ? gcn_gen_undef (V64DImode) : NULL; |
| |
| if (CONST_INT_P (scale) |
| && INTVAL (scale) > 0 |
| && exact_log2 (INTVAL (scale)) >= 0) |
| emit_insn (gen_ashlv64si3 (tmpsi, offsets, |
| GEN_INT (exact_log2 (INTVAL (scale))))); |
| else |
| (exec |
| ? emit_insn (gen_mulv64si3_dup_exec (tmpsi, offsets, scale, undefsi, |
| exec)) |
| : emit_insn (gen_mulv64si3_dup (tmpsi, offsets, scale))); |
| |
| /* "Global" instructions do not support negative register offsets. */ |
| if (as == ADDR_SPACE_FLAT || !unsigned_p) |
| { |
| if (unsigned_p) |
| (exec |
| ? emit_insn (gen_addv64di3_zext_dup2_exec (tmpdi, tmpsi, base, |
| undefdi, exec)) |
| : emit_insn (gen_addv64di3_zext_dup2 (tmpdi, tmpsi, base))); |
| else |
| (exec |
| ? emit_insn (gen_addv64di3_sext_dup2_exec (tmpdi, tmpsi, base, |
| undefdi, exec)) |
| : emit_insn (gen_addv64di3_sext_dup2 (tmpdi, tmpsi, base))); |
| return tmpdi; |
| } |
| else if (as == ADDR_SPACE_GLOBAL) |
| return tmpsi; |
| |
| gcc_unreachable (); |
| } |
| |
| /* Return true if a move from OP1 to OP0 is known to be executed in the |
| vector unit. */ |
| |
| bool |
| gcn_vgpr_move_p (rtx op0, rtx op1) |
| { |
| if (MEM_P (op0) && AS_SCALAR_FLAT_P (MEM_ADDR_SPACE (op0))) |
| return true; |
| if (MEM_P (op1) && AS_SCALAR_FLAT_P (MEM_ADDR_SPACE (op1))) |
| return true; |
| return ((REG_P (op0) && VGPR_REGNO_P (REGNO (op0))) |
| || (REG_P (op1) && VGPR_REGNO_P (REGNO (op1))) |
| || vgpr_vector_mode_p (GET_MODE (op0))); |
| } |
| |
| /* Return true if a move from OP1 to OP0 is known to be executed in the |
| scalar unit. Used in the machine description. */ |
| |
| bool |
| gcn_sgpr_move_p (rtx op0, rtx op1) |
| { |
| if (MEM_P (op0) && AS_SCALAR_FLAT_P (MEM_ADDR_SPACE (op0))) |
| return true; |
| if (MEM_P (op1) && AS_SCALAR_FLAT_P (MEM_ADDR_SPACE (op1))) |
| return true; |
| if (!REG_P (op0) || REGNO (op0) >= FIRST_PSEUDO_REGISTER |
| || VGPR_REGNO_P (REGNO (op0))) |
| return false; |
| if (REG_P (op1) |
| && REGNO (op1) < FIRST_PSEUDO_REGISTER |
| && !VGPR_REGNO_P (REGNO (op1))) |
| return true; |
| return immediate_operand (op1, VOIDmode) || memory_operand (op1, VOIDmode); |
| } |
| |
| /* Implement TARGET_SECONDARY_RELOAD. |
| |
| The address space determines which registers can be used for loads and |
| stores. */ |
| |
| static reg_class_t |
| gcn_secondary_reload (bool in_p, rtx x, reg_class_t rclass, |
| machine_mode reload_mode, secondary_reload_info *sri) |
| { |
| reg_class_t result = NO_REGS; |
| bool spilled_pseudo = |
| (REG_P (x) || GET_CODE (x) == SUBREG) && true_regnum (x) == -1; |
| |
| if (dump_file && (dump_flags & TDF_DETAILS)) |
| { |
| fprintf (dump_file, "gcn_secondary_reload: "); |
| dump_value_slim (dump_file, x, 1); |
| fprintf (dump_file, " %s %s:%s", (in_p ? "->" : "<-"), |
| reg_class_names[rclass], GET_MODE_NAME (reload_mode)); |
| if (REG_P (x) || GET_CODE (x) == SUBREG) |
| fprintf (dump_file, " (true regnum: %d \"%s\")", true_regnum (x), |
| (true_regnum (x) >= 0 |
| && true_regnum (x) < FIRST_PSEUDO_REGISTER |
| ? reg_names[true_regnum (x)] |
| : (spilled_pseudo ? "stack spill" : "??"))); |
| fprintf (dump_file, "\n"); |
| } |
| |
| /* Some callers don't use or initialize icode. */ |
| sri->icode = CODE_FOR_nothing; |
| |
| if (MEM_P (x) || spilled_pseudo) |
| { |
| addr_space_t as = DEFAULT_ADDR_SPACE; |
| |
| /* If we have a spilled pseudo, we can't find the address space |
| directly, but we know it's ADDR_SPACE_FLAT for GCN3 or |
| ADDR_SPACE_GLOBAL for GCN5. */ |
| if (MEM_P (x)) |
| as = MEM_ADDR_SPACE (x); |
| |
| if (as == ADDR_SPACE_DEFAULT) |
| as = DEFAULT_ADDR_SPACE; |
| |
| switch (as) |
| { |
| case ADDR_SPACE_SCALAR_FLAT: |
| result = |
| ((!MEM_P (x) || rclass == SGPR_REGS) ? NO_REGS : SGPR_REGS); |
| break; |
| case ADDR_SPACE_FLAT: |
| case ADDR_SPACE_FLAT_SCRATCH: |
| case ADDR_SPACE_GLOBAL: |
| if (GET_MODE_CLASS (reload_mode) == MODE_VECTOR_INT |
| || GET_MODE_CLASS (reload_mode) == MODE_VECTOR_FLOAT) |
| { |
| if (in_p) |
| switch (reload_mode) |
| { |
| case E_V64SImode: |
| sri->icode = CODE_FOR_reload_inv64si; |
| break; |
| case E_V64SFmode: |
| sri->icode = CODE_FOR_reload_inv64sf; |
| break; |
| case E_V64HImode: |
| sri->icode = CODE_FOR_reload_inv64hi; |
| break; |
| case E_V64HFmode: |
| sri->icode = CODE_FOR_reload_inv64hf; |
| break; |
| case E_V64QImode: |
| sri->icode = CODE_FOR_reload_inv64qi; |
| break; |
| case E_V64DImode: |
| sri->icode = CODE_FOR_reload_inv64di; |
| break; |
| case E_V64DFmode: |
| sri->icode = CODE_FOR_reload_inv64df; |
| break; |
| default: |
| gcc_unreachable (); |
| } |
| else |
| switch (reload_mode) |
| { |
| case E_V64SImode: |
| sri->icode = CODE_FOR_reload_outv64si; |
| break; |
| case E_V64SFmode: |
| sri->icode = CODE_FOR_reload_outv64sf; |
| break; |
| case E_V64HImode: |
| sri->icode = CODE_FOR_reload_outv64hi; |
| break; |
| case E_V64HFmode: |
| sri->icode = CODE_FOR_reload_outv64hf; |
| break; |
| case E_V64QImode: |
| sri->icode = CODE_FOR_reload_outv64qi; |
| break; |
| case E_V64DImode: |
| sri->icode = CODE_FOR_reload_outv64di; |
| break; |
| case E_V64DFmode: |
| sri->icode = CODE_FOR_reload_outv64df; |
| break; |
| default: |
| gcc_unreachable (); |
| } |
| break; |
| } |
| /* Fallthrough. */ |
| case ADDR_SPACE_LDS: |
| case ADDR_SPACE_GDS: |
| case ADDR_SPACE_SCRATCH: |
| result = (rclass == VGPR_REGS ? NO_REGS : VGPR_REGS); |
| break; |
| } |
| } |
| |
| if (dump_file && (dump_flags & TDF_DETAILS)) |
| fprintf (dump_file, " <= %s (icode: %s)\n", reg_class_names[result], |
| get_insn_name (sri->icode)); |
| |
| return result; |
| } |
| |
| /* Update register usage after having seen the compiler flags and kernel |
| attributes. We typically want to fix registers that contain values |
| set by the HSA runtime. */ |
| |
| static void |
| gcn_conditional_register_usage (void) |
| { |
| if (!cfun || !cfun->machine) |
| return; |
| |
| if (cfun->machine->normal_function) |
| { |
| /* Restrict the set of SGPRs and VGPRs used by non-kernel functions. */ |
| for (int i = SGPR_REGNO (MAX_NORMAL_SGPR_COUNT); |
| i <= LAST_SGPR_REG; i++) |
| fixed_regs[i] = 1, call_used_regs[i] = 1; |
| |
| for (int i = VGPR_REGNO (MAX_NORMAL_VGPR_COUNT); |
| i <= LAST_VGPR_REG; i++) |
| fixed_regs[i] = 1, call_used_regs[i] = 1; |
| |
| return; |
| } |
| |
| /* If the set of requested args is the default set, nothing more needs to |
| be done. */ |
| if (cfun->machine->args.requested == default_requested_args) |
| return; |
| |
| /* Requesting a set of args different from the default violates the ABI. */ |
| if (!leaf_function_p ()) |
| warning (0, "A non-default set of initial values has been requested, " |
| "which violates the ABI"); |
| |
| for (int i = SGPR_REGNO (0); i < SGPR_REGNO (14); i++) |
| fixed_regs[i] = 0; |
| |
| /* Fix the runtime argument registers that contain values that may be |
| needed later. FLAT_SCRATCH_* should not be needed after the |
| prologue, so there's no need to fix it. */ |
| if (cfun->machine->args.reg[PRIVATE_SEGMENT_WAVE_OFFSET_ARG] >= 0) |
| fixed_regs[cfun->machine->args.reg[PRIVATE_SEGMENT_WAVE_OFFSET_ARG]] = 1; |
| if (cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG] >= 0) |
| { |
| /* The upper 64 bits of the 128-bit buffer descriptor are not used, |
| so allow the registers that would contain them to be used for |
| other purposes. */ |
| fixed_regs[cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG]] = 1; |
| fixed_regs[cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG] + 1] = 1; |
| } |
| if (cfun->machine->args.reg[KERNARG_SEGMENT_PTR_ARG] >= 0) |
| { |
| fixed_regs[cfun->machine->args.reg[KERNARG_SEGMENT_PTR_ARG]] = 1; |
| fixed_regs[cfun->machine->args.reg[KERNARG_SEGMENT_PTR_ARG] + 1] = 1; |
| } |
| if (cfun->machine->args.reg[DISPATCH_PTR_ARG] >= 0) |
| { |
| fixed_regs[cfun->machine->args.reg[DISPATCH_PTR_ARG]] = 1; |
| fixed_regs[cfun->machine->args.reg[DISPATCH_PTR_ARG] + 1] = 1; |
| } |
| if (cfun->machine->args.reg[WORKGROUP_ID_X_ARG] >= 0) |
| fixed_regs[cfun->machine->args.reg[WORKGROUP_ID_X_ARG]] = 1; |
| if (cfun->machine->args.reg[WORK_ITEM_ID_X_ARG] >= 0) |
| fixed_regs[cfun->machine->args.reg[WORK_ITEM_ID_X_ARG]] = 1; |
| if (cfun->machine->args.reg[WORK_ITEM_ID_Y_ARG] >= 0) |
| fixed_regs[cfun->machine->args.reg[WORK_ITEM_ID_Y_ARG]] = 1; |
| if (cfun->machine->args.reg[WORK_ITEM_ID_Z_ARG] >= 0) |
| fixed_regs[cfun->machine->args.reg[WORK_ITEM_ID_Z_ARG]] = 1; |
| } |
| |
| /* Determine if a load or store is valid, according to the register classes |
| and address space. Used primarily by the machine description to decide |
| when to split a move into two steps. */ |
| |
| bool |
| gcn_valid_move_p (machine_mode mode, rtx dest, rtx src) |
| { |
| if (!MEM_P (dest) && !MEM_P (src)) |
| return true; |
| |
| if (MEM_P (dest) |
| && AS_FLAT_P (MEM_ADDR_SPACE (dest)) |
| && (gcn_flat_address_p (XEXP (dest, 0), mode) |
| || GET_CODE (XEXP (dest, 0)) == SYMBOL_REF |
| || GET_CODE (XEXP (dest, 0)) == LABEL_REF) |
| && gcn_vgpr_register_operand (src, mode)) |
| return true; |
| else if (MEM_P (src) |
| && AS_FLAT_P (MEM_ADDR_SPACE (src)) |
| && (gcn_flat_address_p (XEXP (src, 0), mode) |
| || GET_CODE (XEXP (src, 0)) == SYMBOL_REF |
| || GET_CODE (XEXP (src, 0)) == LABEL_REF) |
| && gcn_vgpr_register_operand (dest, mode)) |
| return true; |
| |
| if (MEM_P (dest) |
| && AS_GLOBAL_P (MEM_ADDR_SPACE (dest)) |
| && (gcn_global_address_p (XEXP (dest, 0)) |
| || GET_CODE (XEXP (dest, 0)) == SYMBOL_REF |
| || GET_CODE (XEXP (dest, 0)) == LABEL_REF) |
| && gcn_vgpr_register_operand (src, mode)) |
| return true; |
| else if (MEM_P (src) |
| && AS_GLOBAL_P (MEM_ADDR_SPACE (src)) |
| && (gcn_global_address_p (XEXP (src, 0)) |
| || GET_CODE (XEXP (src, 0)) == SYMBOL_REF |
| || GET_CODE (XEXP (src, 0)) == LABEL_REF) |
| && gcn_vgpr_register_operand (dest, mode)) |
| return true; |
| |
| if (MEM_P (dest) |
| && MEM_ADDR_SPACE (dest) == ADDR_SPACE_SCALAR_FLAT |
| && (gcn_scalar_flat_address_p (XEXP (dest, 0)) |
| || GET_CODE (XEXP (dest, 0)) == SYMBOL_REF |
| || GET_CODE (XEXP (dest, 0)) == LABEL_REF) |
| && gcn_ssrc_register_operand (src, mode)) |
| return true; |
| else if (MEM_P (src) |
| && MEM_ADDR_SPACE (src) == ADDR_SPACE_SCALAR_FLAT |
| && (gcn_scalar_flat_address_p (XEXP (src, 0)) |
| || GET_CODE (XEXP (src, 0)) == SYMBOL_REF |
| || GET_CODE (XEXP (src, 0)) == LABEL_REF) |
| && gcn_sdst_register_operand (dest, mode)) |
| return true; |
| |
| if (MEM_P (dest) |
| && AS_ANY_DS_P (MEM_ADDR_SPACE (dest)) |
| && gcn_ds_address_p (XEXP (dest, 0)) |
| && gcn_vgpr_register_operand (src, mode)) |
| return true; |
| else if (MEM_P (src) |
| && AS_ANY_DS_P (MEM_ADDR_SPACE (src)) |
| && gcn_ds_address_p (XEXP (src, 0)) |
| && gcn_vgpr_register_operand (dest, mode)) |
| return true; |
| |
| return false; |
| } |
| |
| /* }}} */ |
| /* {{{ Functions and ABI. */ |
| |
| /* Implement TARGET_FUNCTION_VALUE. |
| |
| Define how to find the value returned by a function. |
| The register location is always the same, but the mode depends on |
| VALTYPE. */ |
| |
| static rtx |
| gcn_function_value (const_tree valtype, const_tree, bool) |
| { |
| machine_mode mode = TYPE_MODE (valtype); |
| |
| if (INTEGRAL_TYPE_P (valtype) |
| && GET_MODE_CLASS (mode) == MODE_INT |
| && GET_MODE_SIZE (mode) < 4) |
| mode = SImode; |
| |
| return gen_rtx_REG (mode, SGPR_REGNO (RETURN_VALUE_REG)); |
| } |
| |
| /* Implement TARGET_FUNCTION_VALUE_REGNO_P. |
| |
| Return true if N is a possible register number for the function return |
| value. */ |
| |
| static bool |
| gcn_function_value_regno_p (const unsigned int n) |
| { |
| return n == RETURN_VALUE_REG; |
| } |
| |
| /* Calculate the number of registers required to hold function argument |
| ARG. */ |
| |
| static int |
| num_arg_regs (const function_arg_info &arg) |
| { |
| if (targetm.calls.must_pass_in_stack (arg)) |
| return 0; |
| |
| int size = arg.promoted_size_in_bytes (); |
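| /* Round up to whole registers: e.g. a 12-byte argument with |
| UNITS_PER_WORD == 4 occupies 3 registers. */ |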
| return (size + UNITS_PER_WORD - 1) / UNITS_PER_WORD; |
| } |
| |
| /* Implement TARGET_STRICT_ARGUMENT_NAMING. |
| |
| Return true if the location where a function argument is passed |
| depends on whether or not it is a named argument. |
| |
| For gcn, we know how to handle functions declared as stdarg: by |
| passing an extra pointer to the unnamed arguments. However, the |
| Fortran frontend can produce a different situation, where a |
| function pointer is declared with no arguments, but the actual |
| function and calls to it take more arguments. In that case, we |
| want to ensure the call matches the definition of the function. */ |
| |
| static bool |
| gcn_strict_argument_naming (cumulative_args_t cum_v) |
| { |
| CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v); |
| |
| return cum->fntype == NULL_TREE || stdarg_p (cum->fntype); |
| } |
| |
| /* Implement TARGET_PRETEND_OUTGOING_VARARGS_NAMED. |
| |
| See comment on gcn_strict_argument_naming. */ |
| |
| static bool |
| gcn_pretend_outgoing_varargs_named (cumulative_args_t cum_v) |
| { |
| return !gcn_strict_argument_naming (cum_v); |
| } |
| |
| /* Implement TARGET_FUNCTION_ARG. |
| |
| Return an RTX indicating whether a function argument is passed in a register |
| and if so, which register. */ |
| |
| static rtx |
| gcn_function_arg (cumulative_args_t cum_v, const function_arg_info &arg) |
| { |
| CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v); |
| if (cum->normal_function) |
| { |
| if (!arg.named || arg.end_marker_p ()) |
| return 0; |
| |
| if (targetm.calls.must_pass_in_stack (arg)) |
| return 0; |
| |
| /* Vector parameters are not supported yet. */ |
| if (VECTOR_MODE_P (arg.mode)) |
| return 0; |
| |
| int reg_num = FIRST_PARM_REG + cum->num; |
| int num_regs = num_arg_regs (arg); |
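| /* Align the register number to a multiple of the number of registers |
| needed, so that multi-register arguments start on a naturally |
| aligned register. */ |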
| if (num_regs > 0) |
| while (reg_num % num_regs != 0) |
| reg_num++; |
| if (reg_num + num_regs <= FIRST_PARM_REG + NUM_PARM_REGS) |
| return gen_rtx_REG (arg.mode, reg_num); |
| } |
| else |
| { |
| if (cum->num >= cum->args.nargs) |
| { |
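| /* Round the kernarg offset up to the type's byte alignment; e.g. an |
| offset of 6 with 4-byte alignment becomes 8. */ |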
| cum->offset = (cum->offset + TYPE_ALIGN (arg.type) / 8 - 1) |
| & -(TYPE_ALIGN (arg.type) / 8); |
| cfun->machine->kernarg_segment_alignment |
| = MAX ((unsigned) cfun->machine->kernarg_segment_alignment, |
| TYPE_ALIGN (arg.type) / 8); |
| rtx addr = gen_rtx_REG (DImode, |
| cum->args.reg[KERNARG_SEGMENT_PTR_ARG]); |
| if (cum->offset) |
| addr = gen_rtx_PLUS (DImode, addr, |
| gen_int_mode (cum->offset, DImode)); |
| rtx mem = gen_rtx_MEM (arg.mode, addr); |
| set_mem_attributes (mem, arg.type, 1); |
| set_mem_addr_space (mem, ADDR_SPACE_SCALAR_FLAT); |
| MEM_READONLY_P (mem) = 1; |
| return mem; |
| } |
| |
| int a = cum->args.order[cum->num]; |
| if (arg.mode != gcn_kernel_arg_types[a].mode) |
| { |
| error ("wrong type of argument %s", gcn_kernel_arg_types[a].name); |
| return 0; |
| } |
| return gen_rtx_REG ((machine_mode) gcn_kernel_arg_types[a].mode, |
| cum->args.reg[a]); |
| } |
| return 0; |
| } |
| |
| /* Implement TARGET_FUNCTION_ARG_ADVANCE. |
| |
| Updates the summarizer variable pointed to by CUM_V to advance past an |
| argument in the argument list. */ |
| |
| static void |
| gcn_function_arg_advance (cumulative_args_t cum_v, |
| const function_arg_info &arg) |
| { |
| CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v); |
| |
| if (cum->normal_function) |
| { |
| if (!arg.named) |
| return; |
| |
| int num_regs = num_arg_regs (arg); |
| if (num_regs > 0) |
| while ((FIRST_PARM_REG + cum->num) % num_regs != 0) |
| cum->num++; |
| cum->num += num_regs; |
| } |
| else |
| { |
| if (cum->num < cum->args.nargs) |
| cum->num++; |
| else |
| { |
| cum->offset += tree_to_uhwi (TYPE_SIZE_UNIT (arg.type)); |
| cfun->machine->kernarg_segment_byte_size = cum->offset; |
| } |
| } |
| } |
| |
| /* Implement TARGET_ARG_PARTIAL_BYTES. |
| |
| Returns the number of bytes at the beginning of an argument that must be put |
| in registers. The value must be zero for arguments that are passed entirely |
| in registers or that are entirely pushed on the stack. */ |
| |
| static int |
| gcn_arg_partial_bytes (cumulative_args_t cum_v, const function_arg_info &arg) |
| { |
| CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v); |
| |
| if (!arg.named) |
| return 0; |
| |
| if (targetm.calls.must_pass_in_stack (arg)) |
| return 0; |
| |
| if (cum->num >= NUM_PARM_REGS) |
| return 0; |
| |
| /* If the argument fits entirely in registers, return 0. */ |
| if (cum->num + num_arg_regs (arg) <= NUM_PARM_REGS) |
| return 0; |
| |
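| /* Otherwise the argument is split: the remaining registers take the |
| first (NUM_PARM_REGS - cum->num) words, and the rest of the argument |
| goes on the stack; e.g. with two registers left and a four-word |
| argument, 2 * UNITS_PER_WORD bytes are passed in registers. */ |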
| return (NUM_PARM_REGS - cum->num) * UNITS_PER_WORD; |
| } |
| |
| /* A normal function which takes a pointer argument (to a scalar) may be |
| passed a pointer to LDS space (via a high-bits-set aperture), and that only |
| works with FLAT addressing, not GLOBAL. Force FLAT addressing if the |
| function has an incoming pointer-to-scalar parameter. */ |
| |
| static void |
| gcn_detect_incoming_pointer_arg (tree fndecl) |
| { |
| gcc_assert (cfun && cfun->machine); |
| |
| for (tree arg = TYPE_ARG_TYPES (TREE_TYPE (fndecl)); |
| arg; |
| arg = TREE_CHAIN (arg)) |
| if (POINTER_TYPE_P (TREE_VALUE (arg)) |
| && !AGGREGATE_TYPE_P (TREE_TYPE (TREE_VALUE (arg)))) |
| cfun->machine->use_flat_addressing = true; |
| } |
| |
| /* Implement INIT_CUMULATIVE_ARGS, via gcn.h. |
| |
| Initialize a variable CUM of type CUMULATIVE_ARGS for a call to a function |
| whose data type is FNTYPE. For a library call, FNTYPE is 0. */ |
| |
| void |
| gcn_init_cumulative_args (CUMULATIVE_ARGS *cum /* Argument info to init */ , |
| tree fntype /* tree ptr for function decl */ , |
| rtx libname /* SYMBOL_REF of library name or 0 */ , |
| tree fndecl, int caller) |
| { |
| memset (cum, 0, sizeof (*cum)); |
| cum->fntype = fntype; |
| if (libname) |
| { |
| gcc_assert (cfun && cfun->machine); |
| cum->normal_function = true; |
| if (!caller) |
| { |
| cfun->machine->normal_function = true; |
| gcn_detect_incoming_pointer_arg (fndecl); |
| } |
| return; |
| } |
| tree attr = NULL; |
| if (fndecl) |
| attr = lookup_attribute ("amdgpu_hsa_kernel", DECL_ATTRIBUTES (fndecl)); |
| if (fndecl && !attr) |
| attr = lookup_attribute ("amdgpu_hsa_kernel", |
| TYPE_ATTRIBUTES (TREE_TYPE (fndecl))); |
| if (!attr && fntype) |
| attr = lookup_attribute ("amdgpu_hsa_kernel", TYPE_ATTRIBUTES (fntype)); |
| /* Handle main () as a kernel, so we can run the testsuite. |
| Handle OpenACC kernels similarly to main. */ |
| if (!attr && !caller && fndecl |
| && (MAIN_NAME_P (DECL_NAME (fndecl)) |
| || lookup_attribute ("omp target entrypoint", |
| DECL_ATTRIBUTES (fndecl)) != NULL_TREE)) |
| gcn_parse_amdgpu_hsa_kernel_attribute (&cum->args, NULL_TREE); |
| else |
| { |
| if (!attr || caller) |
| { |
| gcc_assert (cfun && cfun->machine); |
| cum->normal_function = true; |
| if (!caller) |
| cfun->machine->normal_function = true; |
| } |
| gcn_parse_amdgpu_hsa_kernel_attribute |
| (&cum->args, attr ? TREE_VALUE (attr) : NULL_TREE); |
| } |
| cfun->machine->args = cum->args; |
| if (!caller && cfun->machine->normal_function) |
| gcn_detect_incoming_pointer_arg (fndecl); |
| |
| reinit_regs (); |
| } |
| |
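| /* Implement TARGET_RETURN_IN_MEMORY. |
| |
| Return true if TYPE should be returned in memory rather than in a |
| register. */ |
| |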
| static bool |
| gcn_return_in_memory (const_tree type, const_tree ARG_UNUSED (fntype)) |
| { |
| machine_mode mode = TYPE_MODE (type); |
| HOST_WIDE_INT size = int_size_in_bytes (type); |
| |
| if (AGGREGATE_TYPE_P (type)) |
| return true; |
| |
| /* Vector return values are not supported yet. */ |
| if (VECTOR_TYPE_P (type)) |
| return true; |
| |
| if (mode == BLKmode) |
| return true; |
| |
| if (size > 2 * UNITS_PER_WORD) |
| return true; |
| |
| return false; |
| } |
| |
| /* Implement TARGET_PROMOTE_FUNCTION_MODE. |
| |
| Return the mode to use for outgoing function arguments. */ |
| |
| machine_mode |
| gcn_promote_function_mode (const_tree ARG_UNUSED (type), machine_mode mode, |
| int *ARG_UNUSED (punsignedp), |
| const_tree ARG_UNUSED (funtype), |
| int ARG_UNUSED (for_return)) |
| { |
| if (GET_MODE_CLASS (mode) == MODE_INT && GET_MODE_SIZE (mode) < 4) |
| return SImode; |
| |
| return mode; |
| } |
| |
| /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. |
| |
| Derived from hppa_gimplify_va_arg_expr. The generic routine doesn't handle |
| ARGS_GROW_DOWNWARDS. */ |
| |
| static tree |
| gcn_gimplify_va_arg_expr (tree valist, tree type, |
| gimple_seq *ARG_UNUSED (pre_p), |
| gimple_seq *ARG_UNUSED (post_p)) |
| { |
| tree ptr = build_pointer_type (type); |
| tree valist_type; |
| tree t, u; |
| bool indirect; |
| |
| indirect = pass_va_arg_by_reference (type); |
| if (indirect) |
| { |
| type = ptr; |
| ptr = build_pointer_type (type); |
| } |
| valist_type = TREE_TYPE (valist); |
| |
| /* Args grow down. Not handled by generic routines. */ |
| |
| u = fold_convert (sizetype, size_in_bytes (type)); |
| u = fold_build1 (NEGATE_EXPR, sizetype, u); |
| t = fold_build_pointer_plus (valist, u); |
| |
| /* Align to 8 byte boundary. */ |
| |
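| /* ANDing with -8 rounds the address down, which both aligns it and, |
| because arguments grow downwards, avoids overlapping the previously |
| assigned (higher) addresses. */ |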
| u = build_int_cst (TREE_TYPE (t), -8); |
| t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t, u); |
| t = fold_convert (valist_type, t); |
| |
| t = build2 (MODIFY_EXPR, valist_type, valist, t); |
| |
| t = fold_convert (ptr, t); |
| t = build_va_arg_indirect_ref (t); |
| |
| if (indirect) |
| t = build_va_arg_indirect_ref (t); |
| |
| return t; |
| } |
| |
| /* Return 1 if TRAIT NAME is present in the OpenMP context's |
| device trait set, return 0 if not present in any OpenMP context in the |
| whole translation unit, or -1 if not present in the current OpenMP context |
| but might be present in another OpenMP context in the same TU. */ |
| |
| int |
| gcn_omp_device_kind_arch_isa (enum omp_device_kind_arch_isa trait, |
| const char *name) |
| { |
| switch (trait) |
| { |
| case omp_device_kind: |
| return strcmp (name, "gpu") == 0; |
| case omp_device_arch: |
| return strcmp (name, "gcn") == 0; |
| case omp_device_isa: |
| if (strcmp (name, "fiji") == 0) |
| return gcn_arch == PROCESSOR_FIJI; |
| if (strcmp (name, "gfx900") == 0) |
| return gcn_arch == PROCESSOR_VEGA10; |
| if (strcmp (name, "gfx906") == 0) |
| return gcn_arch == PROCESSOR_VEGA20; |
| if (strcmp (name, "gfx908") == 0) |
| return gcn_arch == PROCESSOR_GFX908; |
| return 0; |
| default: |
| gcc_unreachable (); |
| } |
| } |
| |
| /* Calculate stack offsets needed to create prologues and epilogues. */ |
| |
| static struct machine_function * |
| gcn_compute_frame_offsets (void) |
| { |
| machine_function *offsets = cfun->machine; |
| |
| if (reload_completed) |
| return offsets; |
| |
| offsets->need_frame_pointer = frame_pointer_needed; |
| |
| offsets->outgoing_args_size = crtl->outgoing_args_size; |
| offsets->pretend_size = crtl->args.pretend_args_size; |
| |
| offsets->local_vars = get_frame_size (); |
| |
| offsets->lr_needs_saving = (!leaf_function_p () |
| || df_regs_ever_live_p (LR_REGNUM) |
| || df_regs_ever_live_p (LR_REGNUM + 1)); |
| |
| offsets->callee_saves = offsets->lr_needs_saving ? 8 : 0; |
| |
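| /* Each callee-saved VGPR costs 256 bytes (64 lanes x 4 bytes); each |
| callee-saved SGPR costs 4 bytes. */ |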
| for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) |
| if ((df_regs_ever_live_p (regno) && !call_used_or_fixed_reg_p (regno)) |
| || ((regno & ~1) == HARD_FRAME_POINTER_REGNUM |
| && frame_pointer_needed)) |
| offsets->callee_saves += (VGPR_REGNO_P (regno) ? 256 : 4); |
| |
| /* Round up to 64-bit boundary to maintain stack alignment. */ |
| offsets->callee_saves = (offsets->callee_saves + 7) & ~7; |
| |
| return offsets; |
| } |
| |
| /* Insert code into the prologue or epilogue to store or load any |
| callee-save register to/from the stack. |
| |
| Helper function for gcn_expand_prologue and gcn_expand_epilogue. */ |
| |
| static void |
| move_callee_saved_registers (rtx sp, machine_function *offsets, |
| bool prologue) |
| { |
| int regno, offset, saved_scalars; |
| rtx exec = gen_rtx_REG (DImode, EXEC_REG); |
| rtx vcc = gen_rtx_REG (DImode, VCC_LO_REG); |
| rtx offreg = gen_rtx_REG (SImode, SGPR_REGNO (22)); |
| rtx as = gen_rtx_CONST_INT (VOIDmode, STACK_ADDR_SPACE); |
| HOST_WIDE_INT exec_set = 0; |
| int offreg_set = 0; |
| auto_vec<int> saved_sgprs; |
| |
| start_sequence (); |
| |
| /* Move scalars into two vector registers. */ |
| for (regno = 0, saved_scalars = 0; regno < FIRST_VGPR_REG; regno++) |
| if ((df_regs_ever_live_p (regno) && !call_used_or_fixed_reg_p (regno)) |
| || ((regno & ~1) == LINK_REGNUM && offsets->lr_needs_saving) |
| || ((regno & ~1) == HARD_FRAME_POINTER_REGNUM |
| && offsets->need_frame_pointer)) |
| { |
| rtx reg = gen_rtx_REG (SImode, regno); |
| rtx vreg = gen_rtx_REG (V64SImode, |
| VGPR_REGNO (6 + (saved_scalars / 64))); |
| int lane = saved_scalars % 64; |
| |
| if (prologue) |
| { |
| emit_insn (gen_vec_setv64si (vreg, reg, GEN_INT (lane))); |
| saved_sgprs.safe_push (regno); |
| } |
| else |
| emit_insn (gen_vec_extractv64sisi (reg, vreg, GEN_INT (lane))); |
| |
| saved_scalars++; |
| } |
| |
| rtx move_scalars = get_insns (); |
| end_sequence (); |
| start_sequence (); |
| |
| /* Ensure that all vector lanes are moved. */ |
| exec_set = -1; |
| emit_move_insn (exec, GEN_INT (exec_set)); |
| |
| /* Set up a vector stack pointer. */ |
| rtx _0_1_2_3 = gen_rtx_REG (V64SImode, VGPR_REGNO (1)); |
| rtx _0_4_8_12 = gen_rtx_REG (V64SImode, VGPR_REGNO (3)); |
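| /* _0_1_2_3 is expected to hold each lane's index, as its name suggests; |
| shifting it left by 2 gives per-lane byte offsets 0, 4, 8, ..., so |
| lane N of VSP will address SP + 4*N. */ |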
| emit_insn (gen_ashlv64si3_exec (_0_4_8_12, _0_1_2_3, GEN_INT (2), |
| gcn_gen_undef (V64SImode), exec)); |
| rtx vsp = gen_rtx_REG (V64DImode, VGPR_REGNO (4)); |
| emit_insn (gen_vec_duplicatev64di_exec (vsp, sp, gcn_gen_undef (V64DImode), |
| exec)); |
| emit_insn (gen_addv64si3_vcc_exec (gcn_operand_part (V64SImode, vsp, 0), |
| gcn_operand_part (V64SImode, vsp, 0), |
| _0_4_8_12, vcc, gcn_gen_undef (V64SImode), |
| exec)); |
| emit_insn (gen_addcv64si3_exec (gcn_operand_part (V64SImode, vsp, 1), |
| gcn_operand_part (V64SImode, vsp, 1), |
| const0_rtx, vcc, vcc, |
| gcn_gen_undef (V64SImode), exec)); |
| |
| /* Move vectors. */ |
| for (regno = FIRST_VGPR_REG, offset = 0; |
| regno < FIRST_PSEUDO_REGISTER; regno++) |
| if ((df_regs_ever_live_p (regno) && !call_used_or_fixed_reg_p (regno)) |
| || (regno == VGPR_REGNO (6) && saved_scalars > 0) |
| || (regno == VGPR_REGNO (7) && saved_scalars > 63)) |
| { |
| rtx reg = gen_rtx_REG (V64SImode, regno); |
| int size = 256; |
| |
| if (regno == VGPR_REGNO (6) && saved_scalars < 64) |
| size = saved_scalars * 4; |
| else if (regno == VGPR_REGNO (7) && saved_scalars < 128) |
| size = (saved_scalars - 64) * 4; |
| |
| if (size != 256 || exec_set != -1) |
| { |
| exec_set = ((unsigned HOST_WIDE_INT) 1 << (size / 4)) - 1; |
| emit_move_insn (exec, gen_int_mode (exec_set, DImode)); |
| } |
| |
| if (prologue) |
| { |
| rtx insn = emit_insn (gen_scatterv64si_insn_1offset_exec |
| (vsp, const0_rtx, reg, as, const0_rtx, |
| exec)); |
| |
| /* Add CFI metadata. */ |
| rtx note; |
| if (regno == VGPR_REGNO (6) || regno == VGPR_REGNO (7)) |
| { |
| int start = (regno == VGPR_REGNO (7) ? 64 : 0); |
| int count = MIN (saved_scalars - start, 64); |
| int add_lr = (regno == VGPR_REGNO (6) |
| && df_regs_ever_live_p (LINK_REGNUM)); |
| int lrdest = -1; |
| rtvec seq = rtvec_alloc (count + add_lr); |
| |
| /* Add a REG_FRAME_RELATED_EXPR entry for each scalar |
| register that was saved in this batch. */ |
| for (int idx = 0; idx < count; idx++) |
| { |
| int stackaddr = offset + idx * 4; |
| rtx dest = gen_rtx_MEM (SImode, |
| gen_rtx_PLUS |
| (DImode, sp, |
| GEN_INT (stackaddr))); |
| rtx src = gen_rtx_REG (SImode, saved_sgprs[start + idx]); |
| rtx set = gen_rtx_SET (dest, src); |
| RTX_FRAME_RELATED_P (set) = 1; |
| RTVEC_ELT (seq, idx) = set; |
| |
| if (saved_sgprs[start + idx] == LINK_REGNUM) |
| lrdest = stackaddr; |
| } |
| |
| /* Add an additional expression for DWARF_LINK_REGISTER if |
| LINK_REGNUM was saved. */ |
| if (lrdest != -1) |
| { |
| rtx dest = gen_rtx_MEM (DImode, |
| gen_rtx_PLUS |
| (DImode, sp, |
| GEN_INT (lrdest))); |
| rtx src = gen_rtx_REG (DImode, DWARF_LINK_REGISTER); |
| rtx set = gen_rtx_SET (dest, src); |
| RTX_FRAME_RELATED_P (set) = 1; |
| RTVEC_ELT (seq, count) = set; |
| } |
| |
| note = gen_rtx_SEQUENCE (VOIDmode, seq); |
| } |
| else |
| { |
| rtx dest = gen_rtx_MEM (V64SImode, |
| gen_rtx_PLUS (DImode, sp, |
| GEN_INT (offset))); |
| rtx src = gen_rtx_REG (V64SImode, regno); |
| note = gen_rtx_SET (dest, src); |
| } |
| RTX_FRAME_RELATED_P (insn) = 1; |
| add_reg_note (insn, REG_FRAME_RELATED_EXPR, note); |
| } |
| else |
| emit_insn (gen_gatherv64si_insn_1offset_exec |
| (reg, vsp, const0_rtx, as, const0_rtx, |
| gcn_gen_undef (V64SImode), exec)); |
| |
| /* Move our VSP to the next stack entry. */ |
| if (offreg_set != size) |
| { |
| offreg_set = size; |
| emit_move_insn (offreg, GEN_INT (size)); |
| } |
| if (exec_set != -1) |
| { |
| exec_set = -1; |
| emit_move_insn (exec, GEN_INT (exec_set)); |
| } |
| emit_insn (gen_addv64si3_vcc_dup_exec |
| (gcn_operand_part (V64SImode, vsp, 0), |
| offreg, gcn_operand_part (V64SImode, vsp, 0), |
| vcc, gcn_gen_undef (V64SImode), exec)); |
| emit_insn (gen_addcv64si3_exec |
| (gcn_operand_part (V64SImode, vsp, 1), |
| gcn_operand_part (V64SImode, vsp, 1), |
| const0_rtx, vcc, vcc, gcn_gen_undef (V64SImode), exec)); |
| |
| offset += size; |
| } |
| |
| rtx move_vectors = get_insns (); |
| end_sequence (); |
| |
| if (prologue) |
| { |
| emit_insn (move_scalars); |
| emit_insn (move_vectors); |
| } |
| else |
| { |
| emit_insn (move_vectors); |
| emit_insn (move_scalars); |
| } |
| } |
| |
| /* Generate prologue. Called from gen_prologue during pro_and_epilogue pass. |
| |
| For a non-kernel function, the stack layout looks like this (interim), |
| growing *upwards*: |
| |
| hi | + ... |
| |__________________| <-- current SP |
| | outgoing args | |
| |__________________| |
| | (alloca space) | |
| |__________________| |
| | local vars | |
| |__________________| <-- FP/hard FP |
| | callee-save regs | |
| |__________________| <-- soft arg pointer |
| | pretend args | |
| |__________________| <-- incoming SP |
| | incoming args | |
| lo |..................| |
| |
| This implies arguments (beyond the first N in registers) must grow |
| downwards (as, apparently, PA has them do). |
| |
| For a kernel function we have the simpler: |
| |
| hi | + ... |
| |__________________| <-- current SP |
| | outgoing args | |
| |__________________| |
| | (alloca space) | |
| |__________________| |
| | local vars | |
| lo |__________________| <-- FP/hard FP |
| |
| */ |
| |
| void |
| gcn_expand_prologue () |
| { |
| machine_function *offsets = gcn_compute_frame_offsets (); |
| |
| if (!cfun || !cfun->machine || cfun->machine->normal_function) |
| { |
| rtx sp = gen_rtx_REG (Pmode, STACK_POINTER_REGNUM); |
| rtx sp_hi = gcn_operand_part (Pmode, sp, 1); |
| rtx sp_lo = gcn_operand_part (Pmode, sp, 0); |
| rtx fp = gen_rtx_REG (Pmode, HARD_FRAME_POINTER_REGNUM); |
| rtx fp_hi = gcn_operand_part (Pmode, fp, 1); |
| rtx fp_lo = gcn_operand_part (Pmode, fp, 0); |
| |
| start_sequence (); |
| |
| if (offsets->pretend_size > 0) |
| { |
| /* FIXME: Do the actual saving of register pretend args to the stack. |
| Register order needs consideration. */ |
| } |
| |
| /* Save callee-save regs. */ |
| move_callee_saved_registers (sp, offsets, true); |
| |
| HOST_WIDE_INT sp_adjust = offsets->pretend_size |
| + offsets->callee_saves |
| + offsets->local_vars + offsets->outgoing_args_size; |
| if (sp_adjust > 0) |
| { |
| /* Adding RTX_FRAME_RELATED_P effectively disables splitting, so |
| we emit the split add explicitly, and specify the DImode add in |
| the note. */ |
| rtx scc = gen_rtx_REG (BImode, SCC_REG); |
| rtx adjustment = gen_int_mode (sp_adjust, SImode); |
| rtx insn = emit_insn (gen_addsi3_scalar_carry (sp_lo, sp_lo, |
| adjustment, scc)); |
| if (!offsets->need_frame_pointer) |
| { |
| RTX_FRAME_RELATED_P (insn) = 1; |
| add_reg_note (insn, REG_FRAME_RELATED_EXPR, |
| gen_rtx_SET (sp, |
| gen_rtx_PLUS (DImode, sp, |
| adjustment))); |
| } |
| emit_insn (gen_addcsi3_scalar_zero (sp_hi, sp_hi, scc)); |
| } |
| |
| if (offsets->need_frame_pointer) |
| { |
| /* Adding RTX_FRAME_RELATED_P effectively disables splitting, so |
| we emit the split add explicitly, and specify the DImode add in |
| the note. */ |
| rtx scc = gen_rtx_REG (BImode, SCC_REG); |
| int fp_adjust = -(offsets->local_vars + offsets->outgoing_args_size); |
| rtx adjustment = gen_int_mode (fp_adjust, SImode); |
| rtx insn = emit_insn (gen_addsi3_scalar_carry (fp_lo, sp_lo, |
| adjustment, scc)); |
| emit_insn (gen_addcsi3_scalar (fp_hi, sp_hi, |
| (fp_adjust < 0 ? GEN_INT (-1) |
| : const0_rtx), |
| scc, scc)); |
| |
| /* Set the CFA to the entry stack address, as an offset from the |
| frame pointer. This is preferred because the frame pointer is |
| saved in each frame, whereas the stack pointer is not. */ |
| RTX_FRAME_RELATED_P (insn) = 1; |
| add_reg_note (insn, REG_CFA_DEF_CFA, |
| gen_rtx_PLUS (DImode, fp, |
| GEN_INT (-(offsets->pretend_size |
| + offsets->callee_saves)))); |
| } |
| |
| rtx_insn *seq = get_insns (); |
| end_sequence (); |
| |
| emit_insn (seq); |
| } |
| else |
| { |
| rtx wave_offset = gen_rtx_REG (SImode, |
| cfun->machine->args. |
| reg[PRIVATE_SEGMENT_WAVE_OFFSET_ARG]); |
| |
| if (cfun->machine->args.requested & (1 << FLAT_SCRATCH_INIT_ARG)) |
| { |
| rtx fs_init_lo = |
| gen_rtx_REG (SImode, |
| cfun->machine->args.reg[FLAT_SCRATCH_INIT_ARG]); |
| rtx fs_init_hi = |
| gen_rtx_REG (SImode, |
| cfun->machine->args.reg[FLAT_SCRATCH_INIT_ARG] + 1); |
| rtx fs_reg_lo = gen_rtx_REG (SImode, FLAT_SCRATCH_REG); |
| rtx fs_reg_hi = gen_rtx_REG (SImode, FLAT_SCRATCH_REG + 1); |
| |
| /*rtx queue = gen_rtx_REG(DImode, |
| cfun->machine->args.reg[QUEUE_PTR_ARG]); |
| rtx aperture = gen_rtx_MEM (SImode, |
| gen_rtx_PLUS (DImode, queue, |
| gen_int_mode (68, SImode))); |
| set_mem_addr_space (aperture, ADDR_SPACE_SCALAR_FLAT);*/ |
| |
| /* Set up flat_scratch. */ |
| emit_insn (gen_addsi3_scc (fs_reg_hi, fs_init_lo, wave_offset)); |
| emit_insn (gen_lshrsi3_scc (fs_reg_hi, fs_reg_hi, |
| gen_int_mode (8, SImode))); |
| emit_move_insn (fs_reg_lo, fs_init_hi); |
| } |
| |
| /* Set up frame pointer and stack pointer. */ |
| rtx sp = gen_rtx_REG (DImode, STACK_POINTER_REGNUM); |
| rtx sp_hi = simplify_gen_subreg (SImode, sp, DImode, 4); |
| rtx sp_lo = simplify_gen_subreg (SImode, sp, DImode, 0); |
| rtx fp = gen_rtx_REG (DImode, HARD_FRAME_POINTER_REGNUM); |
| rtx fp_hi = simplify_gen_subreg (SImode, fp, DImode, 4); |
| rtx fp_lo = simplify_gen_subreg (SImode, fp, DImode, 0); |
| |
| HOST_WIDE_INT sp_adjust = (offsets->local_vars |
| + offsets->outgoing_args_size); |
| |
| /* Initialise FP and SP from the buffer descriptor in s[0:3]. */ |
| emit_move_insn (fp_lo, gen_rtx_REG (SImode, 0)); |
| emit_insn (gen_andsi3_scc (fp_hi, gen_rtx_REG (SImode, 1), |
| gen_int_mode (0xffff, SImode))); |
| rtx scc = gen_rtx_REG (BImode, SCC_REG); |
| emit_insn (gen_addsi3_scalar_carry (fp_lo, fp_lo, wave_offset, scc)); |
| emit_insn (gen_addcsi3_scalar_zero (fp_hi, fp_hi, scc)); |
| |
| /* Adding RTX_FRAME_RELATED_P effectively disables splitting, so we |
| emit the split add explicitly, and specify the DImode add in the note. |
| The DWARF info expects that the callee-save data is in the frame, |
| even though it isn't (because this is the entry point), so we |
| make a notional adjustment to the DWARF frame offset here. */ |
| rtx dbg_adjustment = gen_int_mode (sp_adjust + offsets->callee_saves, |
| DImode); |
| rtx insn; |
| if (sp_adjust > 0) |
| { |
| rtx scc = gen_rtx_REG (BImode, SCC_REG); |
| rtx adjustment = gen_int_mode (sp_adjust, DImode); |
| insn = emit_insn (gen_addsi3_scalar_carry (sp_lo, fp_lo, |
| adjustment, scc)); |
| emit_insn (gen_addcsi3_scalar_zero (sp_hi, fp_hi, scc)); |
| } |
| else |
| insn = emit_move_insn (sp, fp); |
| RTX_FRAME_RELATED_P (insn) = 1; |
| add_reg_note (insn, REG_FRAME_RELATED_EXPR, |
| gen_rtx_SET (sp, gen_rtx_PLUS (DImode, sp, |
| dbg_adjustment))); |
| |
| if (offsets->need_frame_pointer) |
| { |
| /* Set the CFA to the entry stack address, as an offset from the |
| frame pointer. This is necessary when alloca is used, and |
| harmless otherwise. */ |
| rtx neg_adjust = gen_int_mode (-offsets->callee_saves, DImode); |
| add_reg_note (insn, REG_CFA_DEF_CFA, |
| gen_rtx_PLUS (DImode, fp, neg_adjust)); |
| } |
| |
| /* Make sure the flat scratch reg doesn't get optimised away. */ |
| emit_insn (gen_prologue_use (gen_rtx_REG (DImode, FLAT_SCRATCH_REG))); |
| } |
| |
| /* Ensure that the scheduler doesn't do anything unexpected. */ |
| emit_insn (gen_blockage ()); |
| |
| /* m0 is initialized for the usual LDS DS and FLAT memory case. |
| The low-part is the LDS size limit: setting it to LDS_SIZE makes |
| the topmost addressable byte LDS_SIZE-1. The high-part is an |
| offset and should be zero. */ |
| emit_move_insn (gen_rtx_REG (SImode, M0_REG), |
| gen_int_mode (LDS_SIZE, SImode)); |
| |
| emit_insn (gen_prologue_use (gen_rtx_REG (SImode, M0_REG))); |
| |
| if (cfun && cfun->machine && !cfun->machine->normal_function && flag_openmp) |
| { |
| /* OpenMP kernels have an implicit call to gomp_gcn_enter_kernel. */ |
| rtx fn_reg = gen_rtx_REG (Pmode, FIRST_PARM_REG); |
| emit_move_insn (fn_reg, gen_rtx_SYMBOL_REF (Pmode, |
| "gomp_gcn_enter_kernel")); |
| emit_call_insn (gen_gcn_indirect_call (fn_reg, const0_rtx)); |
| } |
| } |
| |
| /* Generate epilogue. Called from gen_epilogue during pro_and_epilogue pass. |
| |
| See gcn_expand_prologue for stack details. */ |
| |
| void |
| gcn_expand_epilogue (void) |
| { |
| /* Ensure that the scheduler doesn't do anything unexpected. */ |
| emit_insn (gen_blockage ()); |
| |
| if (!cfun || !cfun->machine || cfun->machine->normal_function) |
| { |
| machine_function *offsets = gcn_compute_frame_offsets (); |
| rtx sp = gen_rtx_REG ( |