| /* Target code for NVPTX. |
| Copyright (C) 2014-2022 Free Software Foundation, Inc. |
| Contributed by Bernd Schmidt <bernds@codesourcery.com> |
| |
| This file is part of GCC. |
| |
| GCC is free software; you can redistribute it and/or modify it |
| under the terms of the GNU General Public License as published |
| by the Free Software Foundation; either version 3, or (at your |
| option) any later version. |
| |
| GCC is distributed in the hope that it will be useful, but WITHOUT |
| ANY WARRANTY; without even the implied warranty of MERCHANTABILITY |
| or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public |
| License for more details. |
| |
| You should have received a copy of the GNU General Public License |
| along with GCC; see the file COPYING3. If not see |
| <http://www.gnu.org/licenses/>. */ |
| |
| #define IN_TARGET_CODE 1 |
| |
| #include "config.h" |
| #include <sstream> |
| #include "system.h" |
| #include "coretypes.h" |
| #include "backend.h" |
| #include "target.h" |
| #include "rtl.h" |
| #include "tree.h" |
| #include "cfghooks.h" |
| #include "df.h" |
| #include "memmodel.h" |
| #include "tm_p.h" |
| #include "expmed.h" |
| #include "optabs.h" |
| #include "regs.h" |
| #include "emit-rtl.h" |
| #include "recog.h" |
| #include "diagnostic.h" |
| #include "alias.h" |
| #include "insn-flags.h" |
| #include "output.h" |
| #include "insn-attr.h" |
| #include "flags.h" |
| #include "dojump.h" |
| #include "explow.h" |
| #include "calls.h" |
| #include "varasm.h" |
| #include "stmt.h" |
| #include "expr.h" |
| #include "tm-preds.h" |
| #include "tm-constrs.h" |
| #include "langhooks.h" |
| #include "dbxout.h" |
| #include "cfgrtl.h" |
| #include "gimple.h" |
| #include "stor-layout.h" |
| #include "builtins.h" |
| #include "omp-general.h" |
| #include "omp-low.h" |
| #include "omp-offload.h" |
| #include "gomp-constants.h" |
| #include "dumpfile.h" |
| #include "internal-fn.h" |
| #include "gimple-iterator.h" |
| #include "stringpool.h" |
| #include "attribs.h" |
| #include "tree-vrp.h" |
| #include "tree-ssa-operands.h" |
| #include "tree-ssanames.h" |
| #include "gimplify.h" |
| #include "tree-phinodes.h" |
| #include "cfgloop.h" |
| #include "fold-const.h" |
| #include "intl.h" |
| #include "opts.h" |
| #include "tree-pretty-print.h" |
| #include "rtl-iter.h" |
| #include "cgraph.h" |
| |
| /* This file should be included last. */ |
| #include "target-def.h" |
| |
| #define WORKAROUND_PTXJIT_BUG 1 |
| #define WORKAROUND_PTXJIT_BUG_2 1 |
| #define WORKAROUND_PTXJIT_BUG_3 1 |
| |
| /* The PTX concept CTA (Cooperative Thread Array) maps onto the CUDA concept of a |
| thread block, which has had a maximum number of threads of 1024 since CUDA |
| version 2.x. */ |
| #define PTX_CTA_SIZE 1024 |
| |
| #define PTX_CTA_NUM_BARRIERS 16 |
| #define PTX_WARP_SIZE 32 |
| |
| #define PTX_PER_CTA_BARRIER 0 |
| #define PTX_NUM_PER_CTA_BARRIERS 1 |
| #define PTX_FIRST_PER_WORKER_BARRIER (PTX_NUM_PER_CTA_BARRIERS) |
| #define PTX_NUM_PER_WORKER_BARRIERS (PTX_CTA_NUM_BARRIERS - PTX_NUM_PER_CTA_BARRIERS) |
| |
| #define PTX_DEFAULT_VECTOR_LENGTH PTX_WARP_SIZE |
| #define PTX_MAX_VECTOR_LENGTH PTX_CTA_SIZE |
| #define PTX_WORKER_LENGTH 32 |
| #define PTX_DEFAULT_RUNTIME_DIM 0 /* Defer to runtime. */ |
| |
| /* The various PTX memory areas an object might reside in. */ |
| enum nvptx_data_area |
| { |
| DATA_AREA_GENERIC, |
| DATA_AREA_GLOBAL, |
| DATA_AREA_SHARED, |
| DATA_AREA_LOCAL, |
| DATA_AREA_CONST, |
| DATA_AREA_PARAM, |
| DATA_AREA_MAX |
| }; |
| |
| /* We record the data area in the target symbol flags. */ |
| #define SYMBOL_DATA_AREA(SYM) \ |
| (nvptx_data_area)((SYMBOL_REF_FLAGS (SYM) >> SYMBOL_FLAG_MACH_DEP_SHIFT) \ |
| & 7) |
| #define SET_SYMBOL_DATA_AREA(SYM,AREA) \ |
| (SYMBOL_REF_FLAGS (SYM) |= (AREA) << SYMBOL_FLAG_MACH_DEP_SHIFT) |
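| |
| /* Illustrative use, mirroring how this file itself uses the macros: |
| nvptx_option_override tags the shared-memory buffers, e.g. |
| SET_SYMBOL_DATA_AREA (worker_red_sym, DATA_AREA_SHARED); |
| and section_for_sym later reads the area back to choose the ".shared" |
| prefix when the declaration is written out. */ |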
| |
| /* Record the function decls we've written, and the libfuncs and function |
| decls corresponding to them. */ |
| static std::stringstream func_decls; |
| |
| struct declared_libfunc_hasher : ggc_cache_ptr_hash<rtx_def> |
| { |
| static hashval_t hash (rtx x) { return htab_hash_pointer (x); } |
| static bool equal (rtx a, rtx b) { return a == b; } |
| }; |
| |
| static GTY((cache)) |
| hash_table<declared_libfunc_hasher> *declared_libfuncs_htab; |
| |
| struct tree_hasher : ggc_cache_ptr_hash<tree_node> |
| { |
| static hashval_t hash (tree t) { return htab_hash_pointer (t); } |
| static bool equal (tree a, tree b) { return a == b; } |
| }; |
| |
| static GTY((cache)) hash_table<tree_hasher> *declared_fndecls_htab; |
| static GTY((cache)) hash_table<tree_hasher> *needed_fndecls_htab; |
| |
| /* Buffer needed to broadcast across workers and vectors. This is |
| used for both worker-neutering and worker broadcasting, and |
| vector-neutering and broadcasting when vector_length > 32. It is |
| shared by all functions emitted. The buffer is placed in shared |
| memory. It'd be nice if PTX supported common blocks, because then |
| this could be shared across TUs (taking the largest size). */ |
| static unsigned oacc_bcast_size; |
| static unsigned oacc_bcast_partition; |
| static unsigned oacc_bcast_align; |
| static GTY(()) rtx oacc_bcast_sym; |
| |
| /* Buffer needed for worker reductions. This has to be distinct from |
| the worker broadcast array, as both may be live concurrently. */ |
| static unsigned worker_red_size; |
| static unsigned worker_red_align; |
| static GTY(()) rtx worker_red_sym; |
| |
| /* Buffer needed for vector reductions, when vector_length > |
| PTX_WARP_SIZE. This has to be distinct from the worker broadcast |
| array, as both may be live concurrently. */ |
| static unsigned vector_red_size; |
| static unsigned vector_red_align; |
| static unsigned vector_red_partition; |
| static GTY(()) rtx vector_red_sym; |
| |
| /* Shared memory block for gang-private variables. */ |
| static unsigned gang_private_shared_size; |
| static unsigned gang_private_shared_align; |
| static GTY(()) rtx gang_private_shared_sym; |
| static hash_map<tree_decl_hash, unsigned int> gang_private_shared_hmap; |
| |
| /* Global lock variable, needed for 128bit worker & gang reductions. */ |
| static GTY(()) tree global_lock_var; |
| |
| /* True if any function references __nvptx_stacks. */ |
| static bool need_softstack_decl; |
| |
| /* True if any function references __nvptx_uni. */ |
| static bool need_unisimt_decl; |
| |
| static int nvptx_mach_max_workers (); |
| |
| /* Allocate a new, cleared machine_function structure. */ |
| |
| static struct machine_function * |
| nvptx_init_machine_status (void) |
| { |
| struct machine_function *p = ggc_cleared_alloc<machine_function> (); |
| p->return_mode = VOIDmode; |
| return p; |
| } |
| |
| /* Issue a diagnostic when option OPTNAME is enabled (as indicated by OPTVAL) |
| and -fopenacc is also enabled. */ |
| |
| static void |
| diagnose_openacc_conflict (bool optval, const char *optname) |
| { |
| if (flag_openacc && optval) |
| error ("option %s is not supported together with %<-fopenacc%>", optname); |
| } |
| |
| static enum ptx_version |
| first_ptx_version_supporting_sm (enum ptx_isa sm) |
| { |
| switch (sm) |
| { |
| case PTX_ISA_SM30: |
| return PTX_VERSION_3_0; |
| case PTX_ISA_SM35: |
| return PTX_VERSION_3_1; |
| case PTX_ISA_SM53: |
| return PTX_VERSION_4_2; |
| case PTX_ISA_SM70: |
| return PTX_VERSION_6_0; |
| case PTX_ISA_SM75: |
| return PTX_VERSION_6_3; |
| case PTX_ISA_SM80: |
| return PTX_VERSION_7_0; |
| default: |
| gcc_unreachable (); |
| } |
| } |
| |
| static enum ptx_version |
| default_ptx_version_option (void) |
| { |
| enum ptx_version first |
| = first_ptx_version_supporting_sm ((enum ptx_isa) ptx_isa_option); |
| |
| /* Pick a version that supports the sm. */ |
| enum ptx_version res = first; |
| |
| /* Pick at least 3.1. This has been the smallest version historically. */ |
| res = MAX (res, PTX_VERSION_3_1); |
| |
| /* Pick at least 6.0, so that bar.warp.sync is available as a way to force |
| warp convergence. */ |
| res = MAX (res, PTX_VERSION_6_0); |
| |
| /* Verify that we pick a version that supports the sm. */ |
| gcc_assert (first <= res); |
| return res; |
| } |
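| |
| /* For example, with -misa=sm_30 the supporting version is 3.0, but the two |
| MAX clauses above raise the default to 6.0; with -misa=sm_80 the default |
| is the supporting version itself, 7.0. */ |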
| |
| static const char * |
| ptx_version_to_string (enum ptx_version v) |
| { |
| switch (v) |
| { |
| case PTX_VERSION_3_0: |
| return "3.0"; |
| case PTX_VERSION_3_1: |
| return "3.1"; |
| case PTX_VERSION_4_2: |
| return "4.2"; |
| case PTX_VERSION_6_0: |
| return "6.0"; |
| case PTX_VERSION_6_3: |
| return "6.3"; |
| case PTX_VERSION_7_0: |
| return "7.0"; |
| default: |
| gcc_unreachable (); |
| } |
| } |
| |
| unsigned int |
| ptx_version_to_number (enum ptx_version v, bool major_p) |
| { |
| switch (v) |
| { |
| case PTX_VERSION_3_0: |
| return major_p ? 3 : 0; |
| case PTX_VERSION_3_1: |
| return major_p ? 3 : 1; |
| case PTX_VERSION_4_2: |
| return major_p ? 4 : 2; |
| case PTX_VERSION_6_0: |
| return major_p ? 6 : 0; |
| case PTX_VERSION_6_3: |
| return major_p ? 6 : 3; |
| case PTX_VERSION_7_0: |
| return major_p ? 7 : 0; |
| default: |
| gcc_unreachable (); |
| } |
| } |
| |
| static const char * |
| sm_version_to_string (enum ptx_isa sm) |
| { |
| switch (sm) |
| { |
| #define NVPTX_SM(XX, SEP) \ |
| case PTX_ISA_SM ## XX: \ |
| return #XX; |
| #include "nvptx-sm.def" |
| #undef NVPTX_SM |
| default: |
| gcc_unreachable (); |
| } |
| } |
| |
| static void |
| handle_ptx_version_option (void) |
| { |
| if (!OPTION_SET_P (ptx_version_option) |
| || ptx_version_option == PTX_VERSION_default) |
| { |
| ptx_version_option = default_ptx_version_option (); |
| return; |
| } |
| |
| enum ptx_version first |
| = first_ptx_version_supporting_sm ((enum ptx_isa) ptx_isa_option); |
| |
| if (ptx_version_option < first) |
| error ("PTX version (%<-mptx%>) needs to be at least %s to support selected" |
| " %<-misa%> (sm_%s)", ptx_version_to_string (first), |
| sm_version_to_string ((enum ptx_isa)ptx_isa_option)); |
| } |
| |
| /* Implement TARGET_OPTION_OVERRIDE. */ |
| |
| static void |
| nvptx_option_override (void) |
| { |
| init_machine_status = nvptx_init_machine_status; |
| |
| handle_ptx_version_option (); |
| |
| /* Set toplevel_reorder, unless explicitly disabled. We need |
| reordering so that we emit necessary assembler decls of |
| undeclared variables. */ |
| if (!OPTION_SET_P (flag_toplevel_reorder)) |
| flag_toplevel_reorder = 1; |
| |
| debug_nonbind_markers_p = 0; |
| |
| /* Set flag_no_common, unless explicitly disabled. We fake common |
| using .weak, and that's not entirely accurate, so avoid it |
| unless forced. */ |
| if (!OPTION_SET_P (flag_no_common)) |
| flag_no_common = 1; |
| |
| /* The patch area requires nops, which we don't have. */ |
| HOST_WIDE_INT patch_area_size, patch_area_entry; |
| parse_and_check_patch_area (flag_patchable_function_entry, false, |
| &patch_area_size, &patch_area_entry); |
| if (patch_area_size > 0) |
| sorry ("not generating patch area, nops not supported"); |
| |
| /* Assumes that it will see only hard registers. */ |
| flag_var_tracking = 0; |
| |
| if (nvptx_optimize < 0) |
| nvptx_optimize = optimize > 0; |
| |
| declared_fndecls_htab = hash_table<tree_hasher>::create_ggc (17); |
| needed_fndecls_htab = hash_table<tree_hasher>::create_ggc (17); |
| declared_libfuncs_htab |
| = hash_table<declared_libfunc_hasher>::create_ggc (17); |
| |
| oacc_bcast_sym = gen_rtx_SYMBOL_REF (Pmode, "__oacc_bcast"); |
| SET_SYMBOL_DATA_AREA (oacc_bcast_sym, DATA_AREA_SHARED); |
| oacc_bcast_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT; |
| oacc_bcast_partition = 0; |
| |
| worker_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_red"); |
| SET_SYMBOL_DATA_AREA (worker_red_sym, DATA_AREA_SHARED); |
| worker_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT; |
| |
| vector_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__vector_red"); |
| SET_SYMBOL_DATA_AREA (vector_red_sym, DATA_AREA_SHARED); |
| vector_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT; |
| vector_red_partition = 0; |
| |
| gang_private_shared_sym = gen_rtx_SYMBOL_REF (Pmode, "__gang_private_shared"); |
| SET_SYMBOL_DATA_AREA (gang_private_shared_sym, DATA_AREA_SHARED); |
| gang_private_shared_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT; |
| |
| diagnose_openacc_conflict (TARGET_GOMP, "-mgomp"); |
| diagnose_openacc_conflict (TARGET_SOFT_STACK, "-msoft-stack"); |
| diagnose_openacc_conflict (TARGET_UNIFORM_SIMT, "-muniform-simt"); |
| |
| if (TARGET_GOMP) |
| target_flags |= MASK_SOFT_STACK | MASK_UNIFORM_SIMT; |
| } |
| |
| /* Return a ptx type for MODE. If PROMOTE, then use .u32 for QImode to |
| deal with ptx idiosyncrasies. */ |
| |
| const char * |
| nvptx_ptx_type_from_mode (machine_mode mode, bool promote) |
| { |
| switch (mode) |
| { |
| case E_BLKmode: |
| return ".b8"; |
| case E_BImode: |
| return ".pred"; |
| case E_QImode: |
| if (promote) |
| return ".u32"; |
| else |
| return ".u8"; |
| case E_HImode: |
| return ".u16"; |
| case E_SImode: |
| return ".u32"; |
| case E_DImode: |
| return ".u64"; |
| |
| case E_HFmode: |
| return ".f16"; |
| case E_SFmode: |
| return ".f32"; |
| case E_DFmode: |
| return ".f64"; |
| |
| case E_V2SImode: |
| return ".v2.u32"; |
| case E_V2DImode: |
| return ".v2.u64"; |
| |
| default: |
| gcc_unreachable (); |
| } |
| } |
| |
| /* Encode the PTX data area that DECL (which might not actually be a |
| _DECL) should reside in. */ |
| |
| static void |
| nvptx_encode_section_info (tree decl, rtx rtl, int first) |
| { |
| default_encode_section_info (decl, rtl, first); |
| if (first && MEM_P (rtl)) |
| { |
| nvptx_data_area area = DATA_AREA_GENERIC; |
| |
| if (TREE_CONSTANT (decl)) |
| area = DATA_AREA_CONST; |
| else if (TREE_CODE (decl) == VAR_DECL) |
| { |
| if (lookup_attribute ("shared", DECL_ATTRIBUTES (decl))) |
| { |
| area = DATA_AREA_SHARED; |
| if (DECL_INITIAL (decl)) |
| error ("static initialization of variable %q+D in %<.shared%>" |
| " memory is not supported", decl); |
| } |
| else |
| area = TREE_READONLY (decl) ? DATA_AREA_CONST : DATA_AREA_GLOBAL; |
| } |
| |
| SET_SYMBOL_DATA_AREA (XEXP (rtl, 0), area); |
| } |
| } |
| |
| /* Return the PTX name of the data area in which SYM should be |
| placed. The symbol must have already been processed by |
| nvptx_encode_section_info, or equivalent. */ |
| |
| static const char * |
| section_for_sym (rtx sym) |
| { |
| nvptx_data_area area = SYMBOL_DATA_AREA (sym); |
| /* Same order as nvptx_data_area enum. */ |
| static char const *const areas[] = |
| {"", ".global", ".shared", ".local", ".const", ".param"}; |
| |
| return areas[area]; |
| } |
| |
| /* Similarly for a decl. */ |
| |
| static const char * |
| section_for_decl (const_tree decl) |
| { |
| return section_for_sym (XEXP (DECL_RTL (CONST_CAST (tree, decl)), 0)); |
| } |
| |
| /* Check NAME for special function names and redirect them by returning a |
| replacement. This applies to malloc, free and realloc, for which we |
| want to use libgcc wrappers, and call, which triggers a bug in |
| ptxas. We can't use TARGET_MANGLE_DECL_ASSEMBLER_NAME, as that's |
| not active in an offload compiler -- the names are all set by the |
| host-side compiler. */ |
| |
| static const char * |
| nvptx_name_replacement (const char *name) |
| { |
| if (strcmp (name, "call") == 0) |
| return "__nvptx_call"; |
| if (strcmp (name, "malloc") == 0) |
| return "__nvptx_malloc"; |
| if (strcmp (name, "free") == 0) |
| return "__nvptx_free"; |
| if (strcmp (name, "realloc") == 0) |
| return "__nvptx_realloc"; |
| return name; |
| } |
| |
| /* Return NULL if NAME contains no dot. Otherwise return a copy of NAME |
| with the dots replaced with dollar signs. */ |
| |
| static char * |
| nvptx_replace_dot (const char *name) |
| { |
| if (strchr (name, '.') == NULL) |
| return NULL; |
| |
| char *p = xstrdup (name); |
| for (size_t i = 0; i < strlen (p); ++i) |
| if (p[i] == '.') |
| p[i] = '$'; |
| return p; |
| } |
| |
| /* If MODE should be treated as two registers of an inner mode, return |
| that inner mode. Otherwise return VOIDmode. */ |
| |
| static machine_mode |
| maybe_split_mode (machine_mode mode) |
| { |
| if (COMPLEX_MODE_P (mode)) |
| return GET_MODE_INNER (mode); |
| |
| if (mode == TImode) |
| return DImode; |
| |
| return VOIDmode; |
| } |
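| |
| /* For example, maybe_split_mode (TImode) returns DImode and |
| maybe_split_mode (DCmode) returns DFmode, so such values occupy two |
| registers; maybe_split_mode (SImode) returns VOIDmode and the value |
| stays in a single register. */ |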
| |
| /* Return true if mode should be treated as two registers. */ |
| |
| static bool |
| split_mode_p (machine_mode mode) |
| { |
| return maybe_split_mode (mode) != VOIDmode; |
| } |
| |
| /* Output a register, subreg, or register pair (with optional |
| enclosing braces). */ |
| |
| static void |
| output_reg (FILE *file, unsigned regno, machine_mode inner_mode, |
| int subreg_offset = -1) |
| { |
| if (inner_mode == VOIDmode) |
| { |
| if (HARD_REGISTER_NUM_P (regno)) |
| fprintf (file, "%s", reg_names[regno]); |
| else |
| fprintf (file, "%%r%d", regno); |
| } |
| else if (subreg_offset >= 0) |
| { |
| output_reg (file, regno, VOIDmode); |
| fprintf (file, "$%d", subreg_offset); |
| } |
| else |
| { |
| if (subreg_offset == -1) |
| fprintf (file, "{"); |
| output_reg (file, regno, inner_mode, GET_MODE_SIZE (inner_mode)); |
| fprintf (file, ","); |
| output_reg (file, regno, inner_mode, 0); |
| if (subreg_offset == -1) |
| fprintf (file, "}"); |
| } |
| } |
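| |
| /* As an illustration of the recursion above: a TImode pseudo numbered 22, |
| split into DImode halves, is printed as the register pair |
| "{%r22$8,%r22$0}", while a plain (unsplit) pseudo 22 is printed as |
| "%r22". */ |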
| |
| /* Emit forking instructions for MASK. */ |
| |
| static void |
| nvptx_emit_forking (unsigned mask, bool is_call) |
| { |
| mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER) |
| | GOMP_DIM_MASK (GOMP_DIM_VECTOR)); |
| if (mask) |
| { |
| rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX)); |
| |
| /* Emit fork at all levels. This helps form SESE regions, as |
| it creates a block with a single successor before entering a |
| partitioned region. That is a good candidate for the end of |
| an SESE region. */ |
| emit_insn (gen_nvptx_fork (op)); |
| emit_insn (gen_nvptx_forked (op)); |
| } |
| } |
| |
| /* Emit joining instructions for MASK. */ |
| |
| static void |
| nvptx_emit_joining (unsigned mask, bool is_call) |
| { |
| mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER) |
| | GOMP_DIM_MASK (GOMP_DIM_VECTOR)); |
| if (mask) |
| { |
| rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX)); |
| |
| /* Emit joining for all non-call pars to ensure there's a single |
| predecessor for the block the join insn ends up in. This is |
| needed for skipping entire loops. */ |
| emit_insn (gen_nvptx_joining (op)); |
| emit_insn (gen_nvptx_join (op)); |
| } |
| } |
| |
| |
| /* Determine whether MODE and TYPE (possibly NULL) should be passed or |
| returned in memory. Integer and floating types supported by the |
| machine are passed in registers; everything else is passed in |
| memory. Complex types are split. */ |
| |
| static bool |
| pass_in_memory (machine_mode mode, const_tree type, bool for_return) |
| { |
| if (type) |
| { |
| if (AGGREGATE_TYPE_P (type)) |
| return true; |
| if (TREE_CODE (type) == VECTOR_TYPE) |
| return true; |
| } |
| |
| if (!for_return && COMPLEX_MODE_P (mode)) |
| /* Complex types are passed as two underlying args. */ |
| mode = GET_MODE_INNER (mode); |
| |
| if (GET_MODE_CLASS (mode) != MODE_INT |
| && GET_MODE_CLASS (mode) != MODE_FLOAT) |
| return true; |
| |
| if (GET_MODE_SIZE (mode) > UNITS_PER_WORD) |
| return true; |
| |
| return false; |
| } |
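| |
| /* So, for example, an SImode scalar is passed (and returned) in a |
| register, whereas any aggregate or vector type, or a scalar wider than |
| UNITS_PER_WORD, goes through memory. */ |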
| |
| /* A non-memory argument of mode MODE is being passed; determine the mode it |
| should be promoted to. This is also used for determining return |
| type promotion. */ |
| |
| static machine_mode |
| promote_arg (machine_mode mode, bool prototyped) |
| { |
| if (!prototyped && mode == SFmode) |
| /* K&R float promotion for unprototyped functions. */ |
| mode = DFmode; |
| else if (GET_MODE_SIZE (mode) < GET_MODE_SIZE (SImode)) |
| mode = SImode; |
| |
| return mode; |
| } |
| |
| /* A non-memory return type of MODE is being returned. Determine the |
| mode it should be promoted to. */ |
| |
| static machine_mode |
| promote_return (machine_mode mode) |
| { |
| return promote_arg (mode, true); |
| } |
| |
| /* Implement TARGET_FUNCTION_ARG. */ |
| |
| static rtx |
| nvptx_function_arg (cumulative_args_t, const function_arg_info &arg) |
| { |
| if (arg.end_marker_p () || !arg.named) |
| return NULL_RTX; |
| |
| return gen_reg_rtx (arg.mode); |
| } |
| |
| /* Implement TARGET_FUNCTION_INCOMING_ARG. */ |
| |
| static rtx |
| nvptx_function_incoming_arg (cumulative_args_t cum_v, |
| const function_arg_info &arg) |
| { |
| CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v); |
| |
| if (arg.end_marker_p () || !arg.named) |
| return NULL_RTX; |
| |
| /* No need to deal with split modes here, the only case that can |
| happen is complex modes and those are dealt with by |
| TARGET_SPLIT_COMPLEX_ARG. */ |
| return gen_rtx_UNSPEC (arg.mode, |
| gen_rtvec (1, GEN_INT (cum->count)), |
| UNSPEC_ARG_REG); |
| } |
| |
| /* Implement TARGET_FUNCTION_ARG_ADVANCE. */ |
| |
| static void |
| nvptx_function_arg_advance (cumulative_args_t cum_v, const function_arg_info &) |
| { |
| CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v); |
| |
| cum->count++; |
| } |
| |
| /* Implement TARGET_FUNCTION_ARG_BOUNDARY. |
| |
| For nvptx this is only used for variadic args. The type has already |
| been promoted and/or converted to an invisible reference. */ |
| |
| static unsigned |
| nvptx_function_arg_boundary (machine_mode mode, const_tree ARG_UNUSED (type)) |
| { |
| return GET_MODE_ALIGNMENT (mode); |
| } |
| |
| /* Handle the TARGET_STRICT_ARGUMENT_NAMING target hook. |
| |
| For nvptx, we know how to handle functions declared as stdarg: by |
| passing an extra pointer to the unnamed arguments. However, the |
| Fortran frontend can produce a different situation, where a |
| function pointer is declared with no arguments, but the actual |
| function and calls to it take more arguments. In that case, we |
| want to ensure the call matches the definition of the function. */ |
| |
| static bool |
| nvptx_strict_argument_naming (cumulative_args_t cum_v) |
| { |
| CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v); |
| |
| return cum->fntype == NULL_TREE || stdarg_p (cum->fntype); |
| } |
| |
| /* Implement TARGET_LIBCALL_VALUE. */ |
| |
| static rtx |
| nvptx_libcall_value (machine_mode mode, const_rtx) |
| { |
| if (!cfun || !cfun->machine->doing_call) |
| /* Pretend to return in a hard reg for early uses before pseudos can be |
| generated. */ |
| return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM); |
| |
| return gen_reg_rtx (mode); |
| } |
| |
| /* TARGET_FUNCTION_VALUE implementation. Returns an RTX representing the place |
| where function FUNC returns or receives a value of data type TYPE. */ |
| |
| static rtx |
| nvptx_function_value (const_tree type, const_tree ARG_UNUSED (func), |
| bool outgoing) |
| { |
| machine_mode mode = promote_return (TYPE_MODE (type)); |
| |
| if (outgoing) |
| { |
| gcc_assert (cfun); |
| cfun->machine->return_mode = mode; |
| return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM); |
| } |
| |
| return nvptx_libcall_value (mode, NULL_RTX); |
| } |
| |
| /* Implement TARGET_FUNCTION_VALUE_REGNO_P. */ |
| |
| static bool |
| nvptx_function_value_regno_p (const unsigned int regno) |
| { |
| return regno == NVPTX_RETURN_REGNUM; |
| } |
| |
| /* Types with a mode other than those supported by the machine are passed by |
| reference in memory. */ |
| |
| static bool |
| nvptx_pass_by_reference (cumulative_args_t, const function_arg_info &arg) |
| { |
| return pass_in_memory (arg.mode, arg.type, false); |
| } |
| |
| /* Implement TARGET_RETURN_IN_MEMORY. */ |
| |
| static bool |
| nvptx_return_in_memory (const_tree type, const_tree) |
| { |
| return pass_in_memory (TYPE_MODE (type), type, true); |
| } |
| |
| /* Implement TARGET_PROMOTE_FUNCTION_MODE. */ |
| |
| static machine_mode |
| nvptx_promote_function_mode (const_tree type, machine_mode mode, |
| int *ARG_UNUSED (punsignedp), |
| const_tree funtype, int for_return) |
| { |
| return promote_arg (mode, for_return || !type || TYPE_ARG_TYPES (funtype)); |
| } |
| |
| /* Helper for write_arg. Emit a single PTX argument of MODE, either |
| in a prototype, or as a copy in a function prologue. ARGNO is the |
| index of this argument in the PTX function. FOR_REG is negative if |
| we're emitting the PTX prototype, zero if we're copying to an |
| argument register, and greater than zero if we're copying to a |
| specific hard register. */ |
| |
| static int |
| write_arg_mode (std::stringstream &s, int for_reg, int argno, |
| machine_mode mode) |
| { |
| const char *ptx_type = nvptx_ptx_type_from_mode (mode, false); |
| |
| if (for_reg < 0) |
| { |
| /* Writing PTX prototype. */ |
| s << (argno ? ", " : " ("); |
| s << ".param" << ptx_type << " %in_ar" << argno; |
| } |
| else |
| { |
| s << "\t.reg" << ptx_type << " "; |
| if (for_reg) |
| s << reg_names[for_reg]; |
| else |
| s << "%ar" << argno; |
| s << ";\n"; |
| if (argno >= 0) |
| { |
| s << "\tld.param" << ptx_type << " "; |
| if (for_reg) |
| s << reg_names[for_reg]; |
| else |
| s << "%ar" << argno; |
| s << ", [%in_ar" << argno << "];\n"; |
| } |
| } |
| return argno + 1; |
| } |
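| |
| /* A sketch of the emitted text (not copied from a real compilation): for |
| an SImode argument 0, the prototype branch writes " (.param.u32 %in_ar0" |
| while the prologue branch (FOR_REG == 0) writes |
| .reg.u32 %ar0; |
| ld.param.u32 %ar0, [%in_ar0]; */ |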
| |
| /* Process function parameter TYPE to emit one or more PTX |
| arguments. S, FOR_REG and ARGNO as for write_arg_mode. PROTOTYPED |
| is true if this is a prototyped function rather than an old-style |
| C declaration. Returns the next argument number to use. |
| |
| The promotion behavior here must match the regular GCC function |
| parameter marshalling machinery. */ |
| |
| static int |
| write_arg_type (std::stringstream &s, int for_reg, int argno, |
| tree type, bool prototyped) |
| { |
| machine_mode mode = TYPE_MODE (type); |
| |
| if (mode == VOIDmode) |
| return argno; |
| |
| if (pass_in_memory (mode, type, false)) |
| mode = Pmode; |
| else |
| { |
| bool split = TREE_CODE (type) == COMPLEX_TYPE; |
| |
| if (split) |
| { |
| /* Complex types are sent as two separate args. */ |
| type = TREE_TYPE (type); |
| mode = TYPE_MODE (type); |
| prototyped = true; |
| } |
| |
| mode = promote_arg (mode, prototyped); |
| if (split) |
| argno = write_arg_mode (s, for_reg, argno, mode); |
| } |
| |
| return write_arg_mode (s, for_reg, argno, mode); |
| } |
| |
| /* Emit a PTX return as a prototype or function prologue declaration |
| for MODE. */ |
| |
| static void |
| write_return_mode (std::stringstream &s, bool for_proto, machine_mode mode) |
| { |
| const char *ptx_type = nvptx_ptx_type_from_mode (mode, false); |
| const char *pfx = "\t.reg"; |
| const char *sfx = ";\n"; |
| |
| if (for_proto) |
| pfx = "(.param", sfx = "_out) "; |
| |
| s << pfx << ptx_type << " " << reg_names[NVPTX_RETURN_REGNUM] << sfx; |
| } |
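| |
| /* For instance, assuming the return value register is named "%value" (see |
| the target's REGISTER_NAMES), an SImode return yields |
| "(.param.u32 %value_out) " in a prototype and ".reg.u32 %value;" in a |
| function prologue. */ |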
| |
| /* Process a function return TYPE to emit a PTX return as a prototype |
| or function prologue declaration. Returns true if return is via an |
| additional pointer parameter. The promotion behavior here must |
| match the regular GCC function return marshalling. */ |
| |
| static bool |
| write_return_type (std::stringstream &s, bool for_proto, tree type) |
| { |
| machine_mode mode = TYPE_MODE (type); |
| |
| if (mode == VOIDmode) |
| return false; |
| |
| bool return_in_mem = pass_in_memory (mode, type, true); |
| |
| if (return_in_mem) |
| { |
| if (for_proto) |
| return return_in_mem; |
| |
| /* Named return values can cause us to return a pointer as well |
| as expect an argument for the return location. This is |
| optimization-level specific, so no caller can make use of |
| this data, but more importantly for us, we must ensure it |
| doesn't change the PTX prototype. */ |
| mode = (machine_mode) cfun->machine->return_mode; |
| |
| if (mode == VOIDmode) |
| return return_in_mem; |
| |
| /* Clear return_mode to inhibit copy of retval to non-existent |
| retval parameter. */ |
| cfun->machine->return_mode = VOIDmode; |
| } |
| else |
| mode = promote_return (mode); |
| |
| write_return_mode (s, for_proto, mode); |
| |
| return return_in_mem; |
| } |
| |
| /* Look for attributes in ATTRS that would indicate we must write a function |
| as a .entry kernel rather than a .func. Return true if one is found. */ |
| |
| static bool |
| write_as_kernel (tree attrs) |
| { |
| return (lookup_attribute ("kernel", attrs) != NULL_TREE |
| || (lookup_attribute ("omp target entrypoint", attrs) != NULL_TREE |
| && lookup_attribute ("oacc function", attrs) != NULL_TREE)); |
| /* For OpenMP target regions, the corresponding kernel entry is emitted from |
| write_omp_entry as a separate function. */ |
| } |
| |
| /* Emit a linker marker for a function decl or defn. */ |
| |
| static void |
| write_fn_marker (std::stringstream &s, bool is_defn, bool globalize, |
| const char *name) |
| { |
| s << "\n// BEGIN"; |
| if (globalize) |
| s << " GLOBAL"; |
| s << " FUNCTION " << (is_defn ? "DEF: " : "DECL: "); |
| s << name << "\n"; |
| } |
| |
| /* Emit a linker marker for a variable decl or defn. */ |
| |
| static void |
| write_var_marker (FILE *file, bool is_defn, bool globalize, const char *name) |
| { |
| fprintf (file, "\n// BEGIN%s VAR %s: ", |
| globalize ? " GLOBAL" : "", |
| is_defn ? "DEF" : "DECL"); |
| assemble_name_raw (file, name); |
| fputs ("\n", file); |
| } |
| |
| /* Helper function for write_fn_proto. */ |
| |
| static void |
| write_fn_proto_1 (std::stringstream &s, bool is_defn, |
| const char *name, const_tree decl) |
| { |
| if (lookup_attribute ("alias", DECL_ATTRIBUTES (decl)) == NULL) |
| write_fn_marker (s, is_defn, TREE_PUBLIC (decl), name); |
| |
| /* PTX declaration. */ |
| if (DECL_EXTERNAL (decl)) |
| s << ".extern "; |
| else if (TREE_PUBLIC (decl)) |
| s << (DECL_WEAK (decl) ? ".weak " : ".visible "); |
| s << (write_as_kernel (DECL_ATTRIBUTES (decl)) ? ".entry " : ".func "); |
| |
| tree fntype = TREE_TYPE (decl); |
| tree result_type = TREE_TYPE (fntype); |
| |
| /* atomic_compare_exchange_$n builtins have an exceptional calling |
| convention. */ |
| int not_atomic_weak_arg = -1; |
| if (DECL_BUILT_IN_CLASS (decl) == BUILT_IN_NORMAL) |
| switch (DECL_FUNCTION_CODE (decl)) |
| { |
| case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_1: |
| case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_2: |
| case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_4: |
| case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_8: |
| case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_16: |
| /* These atomics skip the 'weak' parm in an actual library |
| call. We must skip it in the prototype too. */ |
| not_atomic_weak_arg = 3; |
| break; |
| |
| default: |
| break; |
| } |
| |
| /* Declare the result. */ |
| bool return_in_mem = write_return_type (s, true, result_type); |
| |
| s << name; |
| |
| int argno = 0; |
| |
| /* Emit argument list. */ |
| if (return_in_mem) |
| argno = write_arg_type (s, -1, argno, ptr_type_node, true); |
| |
| /* We get: |
| NULL in TYPE_ARG_TYPES, for old-style functions |
| NULL in DECL_ARGUMENTS, for builtin functions without another |
| declaration. |
| So we have to pick the best one we have. */ |
| tree args = TYPE_ARG_TYPES (fntype); |
| bool prototyped = true; |
| if (!args) |
| { |
| args = DECL_ARGUMENTS (decl); |
| prototyped = false; |
| } |
| |
| for (; args; args = TREE_CHAIN (args), not_atomic_weak_arg--) |
| { |
| tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args); |
| |
| if (not_atomic_weak_arg) |
| argno = write_arg_type (s, -1, argno, type, prototyped); |
| else |
| gcc_assert (TREE_CODE (type) == BOOLEAN_TYPE); |
| } |
| |
| if (stdarg_p (fntype)) |
| argno = write_arg_type (s, -1, argno, ptr_type_node, true); |
| |
| if (DECL_STATIC_CHAIN (decl)) |
| argno = write_arg_type (s, -1, argno, ptr_type_node, true); |
| |
| if (argno < 2 && strcmp (name, "main") == 0) |
| { |
| if (argno == 0) |
| argno = write_arg_type (s, -1, argno, integer_type_node, true); |
| |
| if (argno == 1) |
| argno = write_arg_type (s, -1, argno, ptr_type_node, true); |
| } |
| |
| if (argno) |
| s << ")"; |
| |
| s << (is_defn ? "\n" : ";\n"); |
| } |
| |
| /* Write a .func or .kernel declaration or definition along with |
| a helper comment for use by ld. S is the stream to write to, DECL |
| the decl for the function with name NAME. For definitions, emit |
| a declaration too. */ |
| |
| static void |
| write_fn_proto (std::stringstream &s, bool is_defn, |
| const char *name, const_tree decl) |
| { |
| const char *replacement = nvptx_name_replacement (name); |
| char *replaced_dots = NULL; |
| if (replacement != name) |
| name = replacement; |
| else |
| { |
| replaced_dots = nvptx_replace_dot (name); |
| if (replaced_dots) |
| name = replaced_dots; |
| } |
| if (name[0] == '*') |
| name++; |
| |
| if (is_defn) |
| /* Emit a declaration. The PTX assembler gets upset without it. */ |
| write_fn_proto_1 (s, false, name, decl); |
| |
| write_fn_proto_1 (s, is_defn, name, decl); |
| |
| if (replaced_dots) |
| XDELETE (replaced_dots); |
| } |
| |
| /* Construct a function declaration from a call insn. This can be |
| necessary for two reasons - either we have an indirect call which |
| requires a .callprototype declaration, or we have a libcall |
| generated by emit_library_call for which no decl exists. */ |
| |
| static void |
| write_fn_proto_from_insn (std::stringstream &s, const char *name, |
| rtx result, rtx pat) |
| { |
| char *replaced_dots = NULL; |
| |
| if (!name) |
| { |
| s << "\t.callprototype "; |
| name = "_"; |
| } |
| else |
| { |
| const char *replacement = nvptx_name_replacement (name); |
| if (replacement != name) |
| name = replacement; |
| else |
| { |
| replaced_dots = nvptx_replace_dot (name); |
| if (replaced_dots) |
| name = replaced_dots; |
| } |
| write_fn_marker (s, false, true, name); |
| s << "\t.extern .func "; |
| } |
| |
| if (result != NULL_RTX) |
| write_return_mode (s, true, GET_MODE (result)); |
| |
| s << name; |
| if (replaced_dots) |
| XDELETE (replaced_dots); |
| |
| int arg_end = XVECLEN (pat, 0); |
| for (int i = 1; i < arg_end; i++) |
| { |
| /* We don't have to deal with mode splitting & promotion here, |
| as that was already done when generating the call |
| sequence. */ |
| machine_mode mode = GET_MODE (XEXP (XVECEXP (pat, 0, i), 0)); |
| |
| write_arg_mode (s, -1, i - 1, mode); |
| } |
| if (arg_end != 1) |
| s << ")"; |
| s << ";\n"; |
| } |
| |
| /* DECL is an external FUNCTION_DECL; make sure it's in the fndecl hash |
| table and write a ptx prototype. These are emitted at end of |
| compilation. */ |
| |
| static void |
| nvptx_record_fndecl (tree decl) |
| { |
| tree *slot = declared_fndecls_htab->find_slot (decl, INSERT); |
| if (*slot == NULL) |
| { |
| *slot = decl; |
| const char *name = get_fnname_from_decl (decl); |
| write_fn_proto (func_decls, false, name, decl); |
| } |
| } |
| |
| /* Record a libcall or unprototyped external function. CALLEE is the |
| SYMBOL_REF. Insert into the libfunc hash table and emit a ptx |
| declaration for it. */ |
| |
| static void |
| nvptx_record_libfunc (rtx callee, rtx retval, rtx pat) |
| { |
| rtx *slot = declared_libfuncs_htab->find_slot (callee, INSERT); |
| if (*slot == NULL) |
| { |
| *slot = callee; |
| |
| const char *name = XSTR (callee, 0); |
| write_fn_proto_from_insn (func_decls, name, retval, pat); |
| } |
| } |
| |
| /* DECL is an external FUNCTION_DECL, that we're referencing. If it |
| is prototyped, record it now. Otherwise record it as needed at end |
| of compilation, when we might have more information about it. */ |
| |
| void |
| nvptx_record_needed_fndecl (tree decl) |
| { |
| if (TYPE_ARG_TYPES (TREE_TYPE (decl)) == NULL_TREE) |
| { |
| tree *slot = needed_fndecls_htab->find_slot (decl, INSERT); |
| if (*slot == NULL) |
| *slot = decl; |
| } |
| else |
| nvptx_record_fndecl (decl); |
| } |
| |
| /* SYM is a SYMBOL_REF. If it refers to an external function, record |
| it as needed. */ |
| |
| static void |
| nvptx_maybe_record_fnsym (rtx sym) |
| { |
| tree decl = SYMBOL_REF_DECL (sym); |
| |
| if (decl && TREE_CODE (decl) == FUNCTION_DECL && DECL_EXTERNAL (decl)) |
| nvptx_record_needed_fndecl (decl); |
| } |
| |
| /* Emit a local array to hold some part of a conventional stack frame |
| and initialize REGNO to point to it. If the size is zero, it'll |
| never be valid to dereference, so we can simply initialize to |
| zero. */ |
| |
| static void |
| init_frame (FILE *file, int regno, unsigned align, unsigned size) |
| { |
| if (size) |
| fprintf (file, "\t.local .align %d .b8 %s_ar[%u];\n", |
| align, reg_names[regno], size); |
| fprintf (file, "\t.reg.u%d %s;\n", |
| POINTER_SIZE, reg_names[regno]); |
| fprintf (file, (size ? "\tcvta.local.u%d %s, %s_ar;\n" |
| : "\tmov.u%d %s, 0;\n"), |
| POINTER_SIZE, reg_names[regno], reg_names[regno]); |
| } |
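| |
| /* For example, with a 16-byte frame aligned to 8 bytes on a 64-bit target, |
| and assuming the frame pointer register is named "%frame", this emits |
| roughly: |
| .local .align 8 .b8 %frame_ar[16]; |
| .reg.u64 %frame; |
| cvta.local.u64 %frame, %frame_ar; */ |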
| |
| /* Emit soft stack frame setup sequence. */ |
| |
| static void |
| init_softstack_frame (FILE *file, unsigned alignment, HOST_WIDE_INT size) |
| { |
| /* Maintain 64-bit stack alignment. */ |
| unsigned keep_align = BIGGEST_ALIGNMENT / BITS_PER_UNIT; |
| size = ROUND_UP (size, keep_align); |
| int bits = POINTER_SIZE; |
| const char *reg_stack = reg_names[STACK_POINTER_REGNUM]; |
| const char *reg_frame = reg_names[FRAME_POINTER_REGNUM]; |
| const char *reg_sspslot = reg_names[SOFTSTACK_SLOT_REGNUM]; |
| const char *reg_sspprev = reg_names[SOFTSTACK_PREV_REGNUM]; |
| fprintf (file, "\t.reg.u%d %s;\n", bits, reg_stack); |
| fprintf (file, "\t.reg.u%d %s;\n", bits, reg_frame); |
| fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspslot); |
| fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspprev); |
| fprintf (file, "\t{\n"); |
| fprintf (file, "\t\t.reg.u32 %%fstmp0;\n"); |
| fprintf (file, "\t\t.reg.u%d %%fstmp1;\n", bits); |
| fprintf (file, "\t\t.reg.u%d %%fstmp2;\n", bits); |
| fprintf (file, "\t\tmov.u32 %%fstmp0, %%tid.y;\n"); |
| fprintf (file, "\t\tmul%s.u32 %%fstmp1, %%fstmp0, %d;\n", |
| bits == 64 ? ".wide" : ".lo", bits / 8); |
| fprintf (file, "\t\tmov.u%d %%fstmp2, __nvptx_stacks;\n", bits); |
| |
| /* Initialize %sspslot = &__nvptx_stacks[tid.y]. */ |
| fprintf (file, "\t\tadd.u%d %s, %%fstmp2, %%fstmp1;\n", bits, reg_sspslot); |
| |
| /* Initialize %sspprev = __nvptx_stacks[tid.y]. */ |
| fprintf (file, "\t\tld.shared.u%d %s, [%s];\n", |
| bits, reg_sspprev, reg_sspslot); |
| |
| /* Initialize %frame = %sspprev - size. */ |
| fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n", |
| bits, reg_frame, reg_sspprev, size); |
| |
| /* Apply the requested alignment, if larger than the default maintained above. */ |
| if (alignment > keep_align) |
| fprintf (file, "\t\tand.b%d %s, %s, %d;\n", |
| bits, reg_frame, reg_frame, -alignment); |
| |
| size = crtl->outgoing_args_size; |
| gcc_assert (size % keep_align == 0); |
| |
| /* Initialize %stack. */ |
| fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n", |
| bits, reg_stack, reg_frame, size); |
| |
| if (!crtl->is_leaf) |
| fprintf (file, "\t\tst.shared.u%d [%s], %s;\n", |
| bits, reg_sspslot, reg_stack); |
| fprintf (file, "\t}\n"); |
| cfun->machine->has_softstack = true; |
| need_softstack_decl = true; |
| } |
| |
| /* Emit code to initialize the REGNO predicate register to indicate |
| whether we are not lane zero on the NAME axis. */ |
| |
| static void |
| nvptx_init_axis_predicate (FILE *file, int regno, const char *name) |
| { |
| fprintf (file, "\t{\n"); |
| fprintf (file, "\t\t.reg.u32\t%%%s;\n", name); |
| if (strcmp (name, "x") == 0 && cfun->machine->red_partition) |
| { |
| fprintf (file, "\t\t.reg.u64\t%%t_red;\n"); |
| fprintf (file, "\t\t.reg.u64\t%%y64;\n"); |
| } |
| fprintf (file, "\t\tmov.u32\t%%%s, %%tid.%s;\n", name, name); |
| fprintf (file, "\t\tsetp.ne.u32\t%%r%d, %%%s, 0;\n", regno, name); |
| if (strcmp (name, "x") == 0 && cfun->machine->red_partition) |
| { |
| fprintf (file, "\t\tcvt.u64.u32\t%%y64, %%tid.y;\n"); |
| fprintf (file, "\t\tcvta.shared.u64\t%%t_red, __vector_red;\n"); |
| fprintf (file, "\t\tmad.lo.u64\t%%r%d, %%y64, %d, %%t_red; " |
| "// vector reduction buffer\n", |
| REGNO (cfun->machine->red_partition), |
| vector_red_partition); |
| } |
| /* Verify vector_red_size. */ |
| gcc_assert (vector_red_partition * nvptx_mach_max_workers () |
| <= vector_red_size); |
| fprintf (file, "\t}\n"); |
| } |
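| |
| /* Without a vector reduction buffer, the emitted block reduces to something |
| like (for the "y" axis and, say, predicate register 50): |
| { |
| .reg.u32 %y; |
| mov.u32 %y, %tid.y; |
| setp.ne.u32 %r50, %y, 0; |
| } */ |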
| |
| /* Emit code to initialize OpenACC worker broadcast and synchronization |
| registers. */ |
| |
| static void |
| nvptx_init_oacc_workers (FILE *file) |
| { |
| fprintf (file, "\t{\n"); |
| fprintf (file, "\t\t.reg.u32\t%%tidy;\n"); |
| if (cfun->machine->bcast_partition) |
| { |
| fprintf (file, "\t\t.reg.u64\t%%t_bcast;\n"); |
| fprintf (file, "\t\t.reg.u64\t%%y64;\n"); |
| } |
| fprintf (file, "\t\tmov.u32\t\t%%tidy, %%tid.y;\n"); |
| if (cfun->machine->bcast_partition) |
| { |
| fprintf (file, "\t\tcvt.u64.u32\t%%y64, %%tidy;\n"); |
| fprintf (file, "\t\tadd.u64\t\t%%y64, %%y64, 1; // vector ID\n"); |
| fprintf (file, "\t\tcvta.shared.u64\t%%t_bcast, __oacc_bcast;\n"); |
| fprintf (file, "\t\tmad.lo.u64\t%%r%d, %%y64, %d, %%t_bcast; " |
| "// vector broadcast offset\n", |
| REGNO (cfun->machine->bcast_partition), |
| oacc_bcast_partition); |
| } |
| /* Verify oacc_bcast_size. */ |
| gcc_assert (oacc_bcast_partition * (nvptx_mach_max_workers () + 1) |
| <= oacc_bcast_size); |
| if (cfun->machine->sync_bar) |
| fprintf (file, "\t\tadd.u32\t\t%%r%d, %%tidy, 1; " |
| "// vector synchronization barrier\n", |
| REGNO (cfun->machine->sync_bar)); |
| fprintf (file, "\t}\n"); |
| } |
| |
| /* Emit code to initialize predicate and master lane index registers for |
| -muniform-simt code generation variant. */ |
| |
| static void |
| nvptx_init_unisimt_predicate (FILE *file) |
| { |
| cfun->machine->unisimt_location = gen_reg_rtx (Pmode); |
| int loc = REGNO (cfun->machine->unisimt_location); |
| int bits = POINTER_SIZE; |
| fprintf (file, "\t.reg.u%d %%r%d;\n", bits, loc); |
| fprintf (file, "\t{\n"); |
| fprintf (file, "\t\t.reg.u32 %%ustmp0;\n"); |
| fprintf (file, "\t\t.reg.u%d %%ustmp1;\n", bits); |
| fprintf (file, "\t\tmov.u32 %%ustmp0, %%tid.y;\n"); |
| fprintf (file, "\t\tmul%s.u32 %%ustmp1, %%ustmp0, 4;\n", |
| bits == 64 ? ".wide" : ".lo"); |
| fprintf (file, "\t\tmov.u%d %%r%d, __nvptx_uni;\n", bits, loc); |
| fprintf (file, "\t\tadd.u%d %%r%d, %%r%d, %%ustmp1;\n", bits, loc, loc); |
| if (cfun->machine->unisimt_predicate) |
| { |
| int master = REGNO (cfun->machine->unisimt_master); |
| int pred = REGNO (cfun->machine->unisimt_predicate); |
| fprintf (file, "\t\tld.shared.u32 %%r%d, [%%r%d];\n", master, loc); |
| if (cfun->machine->unisimt_outside_simt_predicate) |
| { |
| int pred_outside_simt |
| = REGNO (cfun->machine->unisimt_outside_simt_predicate); |
| fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, 0;\n", |
| pred_outside_simt, master); |
| } |
| fprintf (file, "\t\tmov.u32 %%ustmp0, %%laneid;\n"); |
| /* Compute 'master lane index' as 'laneid & __nvptx_uni[tid.y]'. */ |
| fprintf (file, "\t\tand.b32 %%r%d, %%r%d, %%ustmp0;\n", master, master); |
| /* Compute predicate as 'tid.x == master'. */ |
| fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp0;\n", pred, master); |
| } |
| fprintf (file, "\t}\n"); |
| need_unisimt_decl = true; |
| } |
| |
| /* Emit kernel NAME for function ORIG outlined for an OpenMP 'target' region: |
| |
| extern void gomp_nvptx_main (void (*fn)(void*), void *fnarg); |
| void __attribute__((kernel)) NAME (void *arg, char *stack, size_t stacksize) |
| { |
| __nvptx_stacks[tid.y] = stack + stacksize * (ctaid.x * ntid.y + tid.y + 1); |
| __nvptx_uni[tid.y] = 0; |
| gomp_nvptx_main (ORIG, arg); |
| } |
| ORIG itself should not be emitted as a PTX .entry function. */ |
| |
| static void |
| write_omp_entry (FILE *file, const char *name, const char *orig) |
| { |
| static bool gomp_nvptx_main_declared; |
| if (!gomp_nvptx_main_declared) |
| { |
| gomp_nvptx_main_declared = true; |
| write_fn_marker (func_decls, false, true, "gomp_nvptx_main"); |
| func_decls << ".extern .func gomp_nvptx_main (.param.u" << POINTER_SIZE |
| << " %in_ar1, .param.u" << POINTER_SIZE << " %in_ar2);\n"; |
| } |
| /* PR79332. Single out this string; it confuses gcc.pot generation. */ |
| #define NTID_Y "%ntid.y" |
| #define ENTRY_TEMPLATE(PS, PS_BYTES, MAD_PS_32) "\ |
| (.param.u" PS " %arg, .param.u" PS " %stack, .param.u" PS " %sz)\n\ |
| {\n\ |
| .reg.u32 %r<3>;\n\ |
| .reg.u" PS " %R<4>;\n\ |
| mov.u32 %r0, %tid.y;\n\ |
| mov.u32 %r1, " NTID_Y ";\n\ |
| mov.u32 %r2, %ctaid.x;\n\ |
| cvt.u" PS ".u32 %R1, %r0;\n\ |
| " MAD_PS_32 " %R1, %r1, %r2, %R1;\n\ |
| mov.u" PS " %R0, __nvptx_stacks;\n\ |
| " MAD_PS_32 " %R0, %r0, " PS_BYTES ", %R0;\n\ |
| ld.param.u" PS " %R2, [%stack];\n\ |
| ld.param.u" PS " %R3, [%sz];\n\ |
| add.u" PS " %R2, %R2, %R3;\n\ |
| mad.lo.u" PS " %R2, %R1, %R3, %R2;\n\ |
| st.shared.u" PS " [%R0], %R2;\n\ |
| mov.u" PS " %R0, __nvptx_uni;\n\ |
| " MAD_PS_32 " %R0, %r0, 4, %R0;\n\ |
| mov.u32 %r0, 0;\n\ |
| st.shared.u32 [%R0], %r0;\n\ |
| mov.u" PS " %R0, \0;\n\ |
| ld.param.u" PS " %R1, [%arg];\n\ |
| {\n\ |
| .param.u" PS " %P<2>;\n\ |
| st.param.u" PS " [%P0], %R0;\n\ |
| st.param.u" PS " [%P1], %R1;\n\ |
| call.uni gomp_nvptx_main, (%P0, %P1);\n\ |
| }\n\ |
| ret.uni;\n\ |
| }\n" |
| static const char entry64[] = ENTRY_TEMPLATE ("64", "8", "mad.wide.u32"); |
| static const char entry32[] = ENTRY_TEMPLATE ("32", "4", "mad.lo.u32  "); |
| #undef ENTRY_TEMPLATE |
| #undef NTID_Y |
| const char *entry_1 = TARGET_ABI64 ? entry64 : entry32; |
| /* Position ENTRY_2 after the embedded nul using strlen of the prefix. */ |
| const char *entry_2 = entry_1 + strlen (entry64) + 1; |
| fprintf (file, ".visible .entry %s%s%s%s", name, entry_1, orig, entry_2); |
| need_softstack_decl = need_unisimt_decl = true; |
| } |
| |
| /* Implement ASM_DECLARE_FUNCTION_NAME. Writes the start of a ptx |
| function, including local var decls and copies from the arguments to |
| local regs. */ |
| |
| void |
| nvptx_declare_function_name (FILE *file, const char *name, const_tree decl) |
| { |
| tree fntype = TREE_TYPE (decl); |
| tree result_type = TREE_TYPE (fntype); |
| int argno = 0; |
| |
| if (lookup_attribute ("omp target entrypoint", DECL_ATTRIBUTES (decl)) |
| && !lookup_attribute ("oacc function", DECL_ATTRIBUTES (decl))) |
| { |
| char *buf = (char *) alloca (strlen (name) + sizeof ("$impl")); |
| sprintf (buf, "%s$impl", name); |
| write_omp_entry (file, name, buf); |
| name = buf; |
| } |
| /* We construct the initial part of the function into a string |
| stream, in order to share the prototype writing code. */ |
| std::stringstream s; |
| write_fn_proto (s, true, name, decl); |
| s << "{\n"; |
| |
| bool return_in_mem = write_return_type (s, false, result_type); |
| if (return_in_mem) |
| argno = write_arg_type (s, 0, argno, ptr_type_node, true); |
| |
| /* Declare and initialize incoming arguments. */ |
| tree args = TYPE_ARG_TYPES (fntype); |
| bool prototyped = true; |
| if (!args) |
| { |
| args = DECL_ARGUMENTS (decl); |
| prototyped = false; |
| } |
| |
| for (; args != NULL_TREE; args = TREE_CHAIN (args)) |
| { |
| tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args); |
| |
| argno = write_arg_type (s, 0, argno, type, prototyped); |
| } |
| |
| if (stdarg_p (fntype)) |
| argno = write_arg_type (s, ARG_POINTER_REGNUM, argno, ptr_type_node, |
| true); |
| |
| if (DECL_STATIC_CHAIN (decl) || cfun->machine->has_chain) |
| write_arg_type (s, STATIC_CHAIN_REGNUM, |
| DECL_STATIC_CHAIN (decl) ? argno : -1, ptr_type_node, |
| true); |
| |
| fprintf (file, "%s", s.str().c_str()); |
| |
| /* Usually 'crtl->is_leaf' is computed during register allocator |
| initialization (which is not done on NVPTX) or for pressure-sensitive |
| optimizations. Initialize it here, except if already set. */ |
| if (!crtl->is_leaf) |
| crtl->is_leaf = leaf_function_p (); |
| |
| HOST_WIDE_INT sz = get_frame_size (); |
| bool need_frameptr = sz || cfun->machine->has_chain; |
| int alignment = crtl->stack_alignment_needed / BITS_PER_UNIT; |
| if (!TARGET_SOFT_STACK) |
| { |
| /* Declare a local var for outgoing varargs. */ |
| if (cfun->machine->has_varadic) |
| init_frame (file, STACK_POINTER_REGNUM, |
| UNITS_PER_WORD, crtl->outgoing_args_size); |
| |
| /* Declare a local variable for the frame. Force its size to be |
| DImode-compatible. */ |
| if (need_frameptr) |
| init_frame (file, FRAME_POINTER_REGNUM, alignment, |
| ROUND_UP (sz, GET_MODE_SIZE (DImode))); |
| } |
| else if (need_frameptr || cfun->machine->has_varadic || cfun->calls_alloca |
| || (cfun->machine->has_simtreg && !crtl->is_leaf)) |
| init_softstack_frame (file, alignment, sz); |
| |
| if (cfun->machine->has_simtreg) |
| { |
| unsigned HOST_WIDE_INT &simtsz = cfun->machine->simt_stack_size; |
| unsigned HOST_WIDE_INT &align = cfun->machine->simt_stack_align; |
| align = MAX (align, GET_MODE_SIZE (DImode)); |
| if (!crtl->is_leaf || cfun->calls_alloca) |
| simtsz = HOST_WIDE_INT_M1U; |
| if (simtsz == HOST_WIDE_INT_M1U) |
| simtsz = nvptx_softstack_size; |
| if (cfun->machine->has_softstack) |
| simtsz += POINTER_SIZE / 8; |
| simtsz = ROUND_UP (simtsz, GET_MODE_SIZE (DImode)); |
| if (align > GET_MODE_SIZE (DImode)) |
| simtsz += align - GET_MODE_SIZE (DImode); |
| if (simtsz) |
| fprintf (file, "\t.local.align 8 .b8 %%simtstack_ar[" |
| HOST_WIDE_INT_PRINT_DEC "];\n", simtsz); |
| } |
| |
| /* Restore the vector reduction partition register, if necessary. |
| FIXME: Find out when and why this is necessary, and fix it. */ |
| if (cfun->machine->red_partition) |
| regno_reg_rtx[REGNO (cfun->machine->red_partition)] |
| = cfun->machine->red_partition; |
| |
| /* Declare the pseudos we have as ptx registers. */ |
| int maxregs = max_reg_num (); |
| for (int i = LAST_VIRTUAL_REGISTER + 1; i < maxregs; i++) |
| { |
| if (regno_reg_rtx[i] != const0_rtx) |
| { |
| machine_mode mode = PSEUDO_REGNO_MODE (i); |
| machine_mode split = maybe_split_mode (mode); |
| |
| if (split_mode_p (mode)) |
| mode = split; |
| fprintf (file, "\t.reg%s ", nvptx_ptx_type_from_mode (mode, true)); |
| output_reg (file, i, split, -2); |
| fprintf (file, ";\n"); |
| } |
| } |
| |
| /* Emit axis predicates. */ |
| if (cfun->machine->axis_predicate[0]) |
| nvptx_init_axis_predicate (file, |
| REGNO (cfun->machine->axis_predicate[0]), "y"); |
| if (cfun->machine->axis_predicate[1]) |
| nvptx_init_axis_predicate (file, |
| REGNO (cfun->machine->axis_predicate[1]), "x"); |
| if (cfun->machine->unisimt_predicate |
| || (cfun->machine->has_simtreg && !crtl->is_leaf)) |
| nvptx_init_unisimt_predicate (file); |
| if (cfun->machine->bcast_partition || cfun->machine->sync_bar) |
| nvptx_init_oacc_workers (file); |
| } |
| |
| /* Output code for switching uniform-simt state. ENTERING indicates whether |
| we are entering or leaving non-uniform execution region. */ |
| |
| static void |
| nvptx_output_unisimt_switch (FILE *file, bool entering) |
| { |
| if (crtl->is_leaf && !cfun->machine->unisimt_predicate) |
| return; |
| fprintf (file, "\t{\n"); |
| fprintf (file, "\t\t.reg.u32 %%ustmp2;\n"); |
| fprintf (file, "\t\tmov.u32 %%ustmp2, %d;\n", entering ? -1 : 0); |
| if (cfun->machine->unisimt_outside_simt_predicate) |
| { |
| int pred_outside_simt |
| = REGNO (cfun->machine->unisimt_outside_simt_predicate); |
| fprintf (file, "\t\tmov.pred %%r%d, %d;\n", pred_outside_simt, |
| entering ? 0 : 1); |
| } |
| if (!crtl->is_leaf) |
| { |
| int loc = REGNO (cfun->machine->unisimt_location); |
| fprintf (file, "\t\tst.shared.u32 [%%r%d], %%ustmp2;\n", loc); |
| } |
| if (cfun->machine->unisimt_predicate) |
| { |
| int master = REGNO (cfun->machine->unisimt_master); |
| int pred = REGNO (cfun->machine->unisimt_predicate); |
| fprintf (file, "\t\tmov.u32 %%ustmp2, %%laneid;\n"); |
| fprintf (file, "\t\tmov.u32 %%r%d, %s;\n", |
| master, entering ? "%ustmp2" : "0"); |
| fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp2;\n", pred, master); |
| } |
| fprintf (file, "\t}\n"); |
| } |
| |
| /* Output code for allocating per-lane storage and switching soft-stack pointer. |
| ENTERING indicates whether we are entering or leaving non-uniform execution. |
| PTR is the register pointing to allocated storage, it is assigned to on |
| entering and used to restore state on leaving. SIZE and ALIGN are used only |
| on entering. */ |
| |
| static void |
| nvptx_output_softstack_switch (FILE *file, bool entering, |
| rtx ptr, rtx size, rtx align) |
| { |
| gcc_assert (REG_P (ptr) && !HARD_REGISTER_P (ptr)); |
| if (crtl->is_leaf && !cfun->machine->simt_stack_size) |
| return; |
| int bits = POINTER_SIZE, regno = REGNO (ptr); |
| fprintf (file, "\t{\n"); |
| if (entering) |
| { |
| fprintf (file, "\t\tcvta.local.u%d %%r%d, %%simtstack_ar + " |
| HOST_WIDE_INT_PRINT_DEC ";\n", bits, regno, |
| cfun->machine->simt_stack_size); |
| fprintf (file, "\t\tsub.u%d %%r%d, %%r%d, ", bits, regno, regno); |
| if (CONST_INT_P (size)) |
| fprintf (file, HOST_WIDE_INT_PRINT_DEC, |
| ROUND_UP (UINTVAL (size), GET_MODE_SIZE (DImode))); |
| else |
| output_reg (file, REGNO (size), VOIDmode); |
| fputs (";\n", file); |
| if (!CONST_INT_P (size) || UINTVAL (align) > GET_MODE_SIZE (DImode)) |
| fprintf (file, |
| "\t\tand.b%d %%r%d, %%r%d, -" HOST_WIDE_INT_PRINT_DEC ";\n", |
| bits, regno, regno, UINTVAL (align)); |
| } |
| if (cfun->machine->has_softstack) |
| { |
| const char *reg_stack = reg_names[STACK_POINTER_REGNUM]; |
| if (entering) |
| { |
| fprintf (file, "\t\tst.u%d [%%r%d + -%d], %s;\n", |
| bits, regno, bits / 8, reg_stack); |
| fprintf (file, "\t\tsub.u%d %s, %%r%d, %d;\n", |
| bits, reg_stack, regno, bits / 8); |
| } |
| else |
| { |
| fprintf (file, "\t\tld.u%d %s, [%%r%d + -%d];\n", |
| bits, reg_stack, regno, bits / 8); |
| } |
| nvptx_output_set_softstack (REGNO (stack_pointer_rtx)); |
| } |
| fprintf (file, "\t}\n"); |
| } |
| |
| /* Output code to enter non-uniform execution region. DEST is a register |
| to hold a per-lane allocation given by SIZE and ALIGN. */ |
| |
| const char * |
| nvptx_output_simt_enter (rtx dest, rtx size, rtx align) |
| { |
| nvptx_output_unisimt_switch (asm_out_file, true); |
| nvptx_output_softstack_switch (asm_out_file, true, dest, size, align); |
| return ""; |
| } |
| |
| /* Output code to leave non-uniform execution region. SRC is the register |
| holding per-lane storage previously allocated by omp_simt_enter insn. */ |
| |
| const char * |
| nvptx_output_simt_exit (rtx src) |
| { |
| nvptx_output_unisimt_switch (asm_out_file, false); |
| nvptx_output_softstack_switch (asm_out_file, false, src, NULL_RTX, NULL_RTX); |
| return ""; |
| } |
| |
| /* Output instruction that sets soft stack pointer in shared memory to the |
| value in register given by SRC_REGNO. */ |
| |
| const char * |
| nvptx_output_set_softstack (unsigned src_regno) |
| { |
| if (cfun->machine->has_softstack && !crtl->is_leaf) |
| { |
| fprintf (asm_out_file, "\tst.shared.u%d\t[%s], ", |
| POINTER_SIZE, reg_names[SOFTSTACK_SLOT_REGNUM]); |
| output_reg (asm_out_file, src_regno, VOIDmode); |
| fprintf (asm_out_file, ";\n"); |
| } |
| return ""; |
| } |
| |
| /* Output a return instruction. Also copy the return value to its outgoing |
| location. */ |
| |
| const char * |
| nvptx_output_return (void) |
| { |
| machine_mode mode = (machine_mode)cfun->machine->return_mode; |
| |
| if (mode != VOIDmode) |
| fprintf (asm_out_file, "\tst.param%s\t[%s_out], %s;\n", |
| nvptx_ptx_type_from_mode (mode, false), |
| reg_names[NVPTX_RETURN_REGNUM], |
| reg_names[NVPTX_RETURN_REGNUM]); |
| |
| return "ret;"; |
| } |
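| |
| /* For an SImode return value, and assuming the return register is named |
| "%value", this prints "st.param.u32 [%value_out], %value;" before the |
| final "ret;". */ |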
| |
| /* Terminate a function by writing a closing brace to FILE. */ |
| |
| void |
| nvptx_function_end (FILE *file) |
| { |
| fprintf (file, "}\n"); |
| } |
| |
| /* Decide whether we can make a sibling call to a function. For ptx, we |
| can't. */ |
| |
| static bool |
| nvptx_function_ok_for_sibcall (tree, tree) |
| { |
| return false; |
| } |
| |
| /* Return Dynamic ReAlignment Pointer RTX. For PTX there isn't any. */ |
| |
| static rtx |
| nvptx_get_drap_rtx (void) |
| { |
| if (TARGET_SOFT_STACK && stack_realign_drap) |
| return arg_pointer_rtx; |
| return NULL_RTX; |
| } |
| |
| /* Implement the TARGET_CALL_ARGS hook. Record information about one |
| argument to the next call. */ |
| |
| static void |
| nvptx_call_args (rtx arg, tree fntype) |
| { |
| if (!cfun->machine->doing_call) |
| { |
| cfun->machine->doing_call = true; |
| cfun->machine->is_varadic = false; |
| cfun->machine->num_args = 0; |
| |
| if (fntype && stdarg_p (fntype)) |
| { |
| cfun->machine->is_varadic = true; |
| cfun->machine->has_varadic = true; |
| cfun->machine->num_args++; |
| } |
| } |
| |
| if (REG_P (arg) && arg != pc_rtx) |
| { |
| cfun->machine->num_args++; |
| cfun->machine->call_args = alloc_EXPR_LIST (VOIDmode, arg, |
| cfun->machine->call_args); |
| } |
| } |
| |
| /* Implement the corresponding END_CALL_ARGS hook. Clear and free the |
| information we recorded. */ |
| |
| static void |
| nvptx_end_call_args (void) |
| { |
| cfun->machine->doing_call = false; |
| free_EXPR_LIST_list (&cfun->machine->call_args); |
| } |
| |
| /* Emit the sequence for a call to ADDRESS, setting RETVAL. Keep |
| track of whether calls involving static chains or varargs were seen |
| in the current function. |
| For libcalls, maintain a hash table of decls we have seen, and |
| record a function decl for later when encountering a new one. */ |
| |
| void |
| nvptx_expand_call (rtx retval, rtx address) |
| { |
| rtx callee = XEXP (address, 0); |
| rtx varargs = NULL_RTX; |
| unsigned parallel = 0; |
| |
| if (!call_insn_operand (callee, Pmode)) |
| { |
| callee = force_reg (Pmode, callee); |
| address = change_address (address, QImode, callee); |
| } |
| |
| if (GET_CODE (callee) == SYMBOL_REF) |
| { |
| tree decl = SYMBOL_REF_DECL (callee); |
| if (decl != NULL_TREE) |
| { |
| if (DECL_STATIC_CHAIN (decl)) |
| cfun->machine->has_chain = true; |
| |
| tree attr = oacc_get_fn_attrib (decl); |
| if (attr) |
| { |
| tree dims = TREE_VALUE (attr); |
| |
| parallel = GOMP_DIM_MASK (GOMP_DIM_MAX) - 1; |
| for (int ix = 0; ix != GOMP_DIM_MAX; ix++) |
| { |
| if (TREE_PURPOSE (dims) |
| && !integer_zerop (TREE_PURPOSE (dims))) |
| break; |
| /* Not on this axis. */ |
| parallel ^= GOMP_DIM_MASK (ix); |
| dims = TREE_CHAIN (dims); |
| } |
| } |
| } |
| } |
| |
| unsigned nargs = cfun->machine->num_args; |
| if (cfun->machine->is_varadic) |
| { |
| varargs = gen_reg_rtx (Pmode); |
| emit_move_insn (varargs, stack_pointer_rtx); |
| } |
| |
| rtvec vec = rtvec_alloc (nargs + 1); |
| rtx pat = gen_rtx_PARALLEL (VOIDmode, vec); |
| int vec_pos = 0; |
| |
| rtx call = gen_rtx_CALL (VOIDmode, address, const0_rtx); |
| rtx tmp_retval = retval; |
| if (retval) |
| { |
| if (!nvptx_register_operand (retval, GET_MODE (retval))) |
| tmp_retval = gen_reg_rtx (GET_MODE (retval)); |
| call = gen_rtx_SET (tmp_retval, call); |
| } |
| XVECEXP (pat, 0, vec_pos++) = call; |
| |
| /* Construct the call insn, including a USE for each argument pseudo |
| register. These will be used when printing the insn. */ |
| for (rtx arg = cfun->machine->call_args; arg; arg = XEXP (arg, 1)) |
| XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, XEXP (arg, 0)); |
| |
| if (varargs) |
| XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, varargs); |
| |
| gcc_assert (vec_pos == XVECLEN (pat, 0)); |
| |
| nvptx_emit_forking (parallel, true); |
| emit_call_insn (pat); |
| nvptx_emit_joining (parallel, true); |
| |
| if (tmp_retval != retval) |
| emit_move_insn (retval, tmp_retval); |
| } |
| |
| /* Emit a comparison COMPARE, and return the new test to be used in the |
| jump. */ |
| |
| rtx |
| nvptx_expand_compare (rtx compare) |
| { |
| rtx pred = gen_reg_rtx (BImode); |
| rtx cmp = gen_rtx_fmt_ee (GET_CODE (compare), BImode, |
| XEXP (compare, 0), XEXP (compare, 1)); |
| emit_insn (gen_rtx_SET (pred, cmp)); |
| return gen_rtx_NE (BImode, pred, const0_rtx); |
| } |
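| |
| /* E.g. an (lt:SI a b) comparison is rewritten as a BImode predicate |
|    register P assigned from the comparison, and the jump emitted by the |
|    caller then tests (ne P 0).  */ |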
| |
| /* Expand the oacc fork & join primitive into ptx-required unspecs. */ |
| |
| void |
| nvptx_expand_oacc_fork (unsigned mode) |
| { |
| nvptx_emit_forking (GOMP_DIM_MASK (mode), false); |
| } |
| |
| void |
| nvptx_expand_oacc_join (unsigned mode) |
| { |
| nvptx_emit_joining (GOMP_DIM_MASK (mode), false); |
| } |
| |
| /* Generate instruction(s) to unpack a 64-bit object into two 32-bit |
| objects. */ |
| |
| static rtx |
| nvptx_gen_unpack (rtx dst0, rtx dst1, rtx src) |
| { |
| rtx res; |
| |
| switch (GET_MODE (src)) |
| { |
| case E_DImode: |
| res = gen_unpackdisi2 (dst0, dst1, src); |
| break; |
| case E_DFmode: |
| res = gen_unpackdfsi2 (dst0, dst1, src); |
| break; |
| default: gcc_unreachable (); |
| } |
| return res; |
| } |
| |
| /* Generate instruction(s) to pack two 32-bit objects into a 64-bit |
| object. */ |
| |
| static rtx |
| nvptx_gen_pack (rtx dst, rtx src0, rtx src1) |
| { |
| rtx res; |
| |
| switch (GET_MODE (dst)) |
| { |
| case E_DImode: |
| res = gen_packsidi2 (dst, src0, src1); |
| break; |
| case E_DFmode: |
| res = gen_packsidf2 (dst, src0, src1); |
| break; |
| default: gcc_unreachable (); |
| } |
| return res; |
| } |
| |
| /* Generate an instruction or sequence to perform a warp shuffle of |
| register SRC into DST, with the source lane selected by IDX according |
| to the shuffle kind KIND. */ |
| |
| rtx |
| nvptx_gen_shuffle (rtx dst, rtx src, rtx idx, nvptx_shuffle_kind kind) |
| { |
| rtx res; |
| |
| switch (GET_MODE (dst)) |
| { |
| case E_DCmode: |
| case E_CDImode: |
| { |
| gcc_assert (GET_CODE (dst) == CONCAT); |
| gcc_assert (GET_CODE (src) == CONCAT); |
| rtx dst_real = XEXP (dst, 0); |
| rtx dst_imag = XEXP (dst, 1); |
| rtx src_real = XEXP (src, 0); |
| rtx src_imag = XEXP (src, 1); |
| |
| start_sequence (); |
| emit_insn (nvptx_gen_shuffle (dst_real, src_real, idx, kind)); |
| emit_insn (nvptx_gen_shuffle (dst_imag, src_imag, idx, kind)); |
| res = get_insns (); |
| end_sequence (); |
| } |
| break; |
| case E_SImode: |
| res = gen_nvptx_shufflesi (dst, src, idx, GEN_INT (kind)); |
| break; |
| case E_SFmode: |
| res = gen_nvptx_shufflesf (dst, src, idx, GEN_INT (kind)); |
| break; |
| case E_DImode: |
| case E_DFmode: |
| { |
| rtx tmp0 = gen_reg_rtx (SImode); |
| rtx tmp1 = gen_reg_rtx (SImode); |
| |
| start_sequence (); |
| emit_insn (nvptx_gen_unpack (tmp0, tmp1, src)); |
| emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind)); |
| emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind)); |
| emit_insn (nvptx_gen_pack (dst, tmp0, tmp1)); |
| res = get_insns (); |
| end_sequence (); |
| } |
| break; |
| case E_V2SImode: |
| { |
| rtx src0 = gen_rtx_SUBREG (SImode, src, 0); |
| rtx src1 = gen_rtx_SUBREG (SImode, src, 4); |
| rtx dst0 = gen_rtx_SUBREG (SImode, dst, 0); |
| rtx dst1 = gen_rtx_SUBREG (SImode, dst, 4); |
| rtx tmp0 = gen_reg_rtx (SImode); |
| rtx tmp1 = gen_reg_rtx (SImode); |
| start_sequence (); |
| emit_insn (gen_movsi (tmp0, src0)); |
| emit_insn (gen_movsi (tmp1, src1)); |
| emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind)); |
| emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind)); |
| emit_insn (gen_movsi (dst0, tmp0)); |
| emit_insn (gen_movsi (dst1, tmp1)); |
| res = get_insns (); |
| end_sequence (); |
| } |
| break; |
| case E_V2DImode: |
| { |
| rtx src0 = gen_rtx_SUBREG (DImode, src, 0); |
| rtx src1 = gen_rtx_SUBREG (DImode, src, 8); |
| rtx dst0 = gen_rtx_SUBREG (DImode, dst, 0); |
| rtx dst1 = gen_rtx_SUBREG (DImode, dst, 8); |
| rtx tmp0 = gen_reg_rtx (DImode); |
| rtx tmp1 = gen_reg_rtx (DImode); |
| start_sequence (); |
| emit_insn (gen_movdi (tmp0, src0)); |
| emit_insn (gen_movdi (tmp1, src1)); |
| emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind)); |
| emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind)); |
| emit_insn (gen_movdi (dst0, tmp0)); |
| emit_insn (gen_movdi (dst1, tmp1)); |
| res = get_insns (); |
| end_sequence (); |
| } |
| break; |
| case E_BImode: |
| { |
| rtx tmp = gen_reg_rtx (SImode); |
| |
| start_sequence (); |
| emit_insn (gen_sel_truesi (tmp, src, GEN_INT (1), const0_rtx)); |
| emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind)); |
| emit_insn (gen_rtx_SET (dst, gen_rtx_NE (BImode, tmp, const0_rtx))); |
| res = get_insns (); |
| end_sequence (); |
| } |
| break; |
| case E_QImode: |
| case E_HImode: |
| { |
| rtx tmp = gen_reg_rtx (SImode); |
| |
| start_sequence (); |
| emit_insn (gen_rtx_SET (tmp, gen_rtx_fmt_e (ZERO_EXTEND, SImode, src))); |
| emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind)); |
| emit_insn (gen_rtx_SET (dst, gen_rtx_fmt_e (TRUNCATE, GET_MODE (dst), |
| tmp))); |
| res = get_insns (); |
| end_sequence (); |
| } |
| break; |
| |
| default: |
| gcc_unreachable (); |
| } |
| return res; |
| } |
| |
| /* Generate an instruction or sequence to broadcast register REG |
| across the vectors of a single warp. */ |
| |
| static rtx |
| nvptx_gen_warp_bcast (rtx reg) |
| { |
| return nvptx_gen_shuffle (reg, reg, const0_rtx, SHUFFLE_IDX); |
| } |
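| |
| /* For illustration: a warp broadcast is just a SHUFFLE_IDX shuffle with |
|    lane index 0, so every active lane reads lane 0's value.  For an |
|    SImode register this ends up as something along the lines of |
| |
|      shfl.idx.b32 %r1, %r1, 0, 31; |
| |
|    (shfl.sync on newer PTX ISAs; register numbers are made up).  */ |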
| |
| /* Structure used when generating a worker-level spill or fill. */ |
| |
| struct broadcast_data_t |
| { |
| rtx base; /* Register holding base addr of buffer. */ |
| rtx ptr; /* Iteration var, if needed. */ |
| unsigned offset; /* Offset into worker buffer. */ |
| }; |
| |
| /* Direction of the spill/fill and looping setup/teardown indicator. */ |
| |
| enum propagate_mask |
| { |
| PM_read = 1 << 0, |
| PM_write = 1 << 1, |
| PM_loop_begin = 1 << 2, |
| PM_loop_end = 1 << 3, |
| |
| PM_read_write = PM_read | PM_write |
| }; |
| |
| /* Generate instruction(s) to spill or fill register REG to/from the |
| worker broadcast array. PM indicates what is to be done, REP |
| how many loop iterations will be executed (0 for not a loop). */ |
| |
| static rtx |
| nvptx_gen_shared_bcast (rtx reg, propagate_mask pm, unsigned rep, |
| broadcast_data_t *data, bool vector) |
| { |
| rtx res; |
| machine_mode mode = GET_MODE (reg); |
| |
| switch (mode) |
| { |
| case E_BImode: |
| { |
| rtx tmp = gen_reg_rtx (SImode); |
| |
| start_sequence (); |
| if (pm & PM_read) |
| emit_insn (gen_sel_truesi (tmp, reg, GEN_INT (1), const0_rtx)); |
| emit_insn (nvptx_gen_shared_bcast (tmp, pm, rep, data, vector)); |
| if (pm & PM_write) |
| emit_insn (gen_rtx_SET (reg, gen_rtx_NE (BImode, tmp, const0_rtx))); |
| res = get_insns (); |
| end_sequence (); |
| } |
| break; |
| |
| default: |
| { |
| rtx addr = data->ptr; |
| |
| if (!addr) |
| { |
| unsigned align = GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT; |
| |
| oacc_bcast_align = MAX (oacc_bcast_align, align); |
| data->offset = ROUND_UP (data->offset, align); |
| addr = data->base; |
| gcc_assert (data->base != NULL); |
| if (data->offset) |
| addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (data->offset)); |
| } |
| |
| addr = gen_rtx_MEM (mode, addr); |
| if (pm == PM_read) |
| res = gen_rtx_SET (addr, reg); |
| else if (pm == PM_write) |
| res = gen_rtx_SET (reg, addr); |
| else |
| gcc_unreachable (); |
| |
| if (data->ptr) |
| { |
| /* We're using a ptr, increment it. */ |
| start_sequence (); |
| |
| emit_insn (res); |
| emit_insn (gen_adddi3 (data->ptr, data->ptr, |
| GEN_INT (GET_MODE_SIZE (GET_MODE (reg))))); |
| res = get_insns (); |
| end_sequence (); |
| } |
| else |
| rep = 1; |
| data->offset += rep * GET_MODE_SIZE (GET_MODE (reg)); |
| } |
| break; |
| } |
| return res; |
| } |
| |
| /* Returns true if X is a valid address for use in a memory reference. */ |
| |
| static bool |
| nvptx_legitimate_address_p (machine_mode, rtx x, bool) |
| { |
| enum rtx_code code = GET_CODE (x); |
| |
| switch (code) |
| { |
| case REG: |
| return true; |
| |
| case PLUS: |
| if (REG_P (XEXP (x, 0)) && CONST_INT_P (XEXP (x, 1))) |
| return true; |
| return false; |
| |
| case CONST: |
| case SYMBOL_REF: |
| case LABEL_REF: |
| return true; |
| |
| default: |
| return false; |
| } |
| } |
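| |
| /* So the accepted forms are a bare register, a register plus a constant |
|    offset, and symbolic addresses, which show up in the emitted PTX as |
|    e.g. [%r1], [%r1+16] and [sym+8] (names illustrative).  */ |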
| |
| /* Machinery to output constant initializers. When beginning an |
| initializer, we decide on a fragment size (which is visible in ptx |
| in the type used), and then all initializer data is buffered until |
| a fragment is filled and ready to be written out. */ |
| |
| static struct |
| { |
| unsigned HOST_WIDE_INT mask; /* Mask for storing fragment. */ |
| unsigned HOST_WIDE_INT val; /* Current fragment value. */ |
| unsigned HOST_WIDE_INT remaining; /* Remaining bytes to be written |
| out. */ |
| unsigned size; /* Fragment size to accumulate. */ |
| unsigned offset; /* Offset within current fragment. */ |
| bool started; /* Whether we've output any initializer. */ |
| } init_frag; |
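| |
| /* For illustration (name and values made up): a 16-byte object emitted |
|    with a fragment size of 8 is printed as two .u64 fragments, roughly |
| |
|      .global .align 8 .u64 foo[2] = { 1234, 5678 }; |
| |
|    each fragment being accumulated in init_frag.val before being written |
|    out by output_init_frag below.  */ |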
| |
| /* The current fragment is full; write it out. SYM may provide a |
| symbolic reference we should output, in which case the fragment |
| value is the addend. */ |
| |
| static void |
| output_init_frag (rtx sym) |
| { |
| fprintf (asm_out_file, init_frag.started ? ", " : " = { "); |
| unsigned HOST_WIDE_INT val = init_frag.val; |
| |
| init_frag.started = true; |
| init_frag.val = 0; |
| init_frag.offset = 0; |
| init_frag.remaining--; |
| |
| if (sym) |
| { |
| bool function = (SYMBOL_REF_DECL (sym) |
| && (TREE_CODE (SYMBOL_REF_DECL (sym)) == FUNCTION_DECL)); |
| if (!function) |
| fprintf (asm_out_file, "generic("); |
| output_address (VOIDmode, sym); |
| if (!function) |
| fprintf (asm_out_file, ")"); |
| if (val) |
| fprintf (asm_out_file, " + "); |
| } |
| |
| if (!sym || val) |
| fprintf (asm_out_file, HOST_WIDE_INT_PRINT_DEC, val); |
| } |
| |
| /* Add value VAL of size SIZE to the data we're emitting, and keep |
| writing out chunks as they fill up. */ |
| |
| static void |
| nvptx_assemble_value (unsigned HOST_WIDE_INT val, unsigned size) |
| { |
| bool negative_p |
| = val & (HOST_WIDE_INT_1U << (HOST_BITS_PER_WIDE_INT - 1)); |
| |
| /* Avoid undefined behaviour. */ |
| if (size * BITS_PER_UNIT < HOST_BITS_PER_WIDE_INT) |
| val &= (HOST_WIDE_INT_1U << (size * BITS_PER_UNIT)) - 1; |
| |
| for (unsigned part = 0; size; size -= part) |
| { |
| if (part * BITS_PER_UNIT == HOST_BITS_PER_WIDE_INT) |
| /* Avoid undefined behaviour. */ |
| val = negative_p ? -1 : 0; |
| else |
| val >>= (part * BITS_PER_UNIT); |
| part = init_frag.size - init_frag.offset; |
| part = MIN (part, size); |
| |
| unsigned HOST_WIDE_INT partial |
| = val << (init_frag.offset * BITS_PER_UNIT); |
| init_frag.val |= partial & init_frag.mask; |
| init_frag.offset += part; |
| |
| if (init_frag.offset == init_frag.size) |
| output_init_frag (NULL); |
| } |
| } |
| |
| /* Target hook for assembling integer object X of size SIZE. */ |
| |
| static bool |
| nvptx_assemble_integer (rtx x, unsigned int size, int ARG_UNUSED (aligned_p)) |
| { |
| HOST_WIDE_INT val = 0; |
| |
| switch (GET_CODE (x)) |
| { |
| default: |
| /* Let the generic machinery figure it out, usually for a |
| CONST_WIDE_INT. */ |
| return false; |
| |
| case CONST_INT: |
| nvptx_assemble_value (INTVAL (x), size); |
| break; |
| |
| case CONST: |
| x = XEXP (x, 0); |
| gcc_assert (GET_CODE (x) == PLUS); |
| val = INTVAL (XEXP (x, 1)); |
| x = XEXP (x, 0); |
| gcc_assert (GET_CODE (x) == SYMBOL_REF); |
| gcc_fallthrough (); /* FALLTHROUGH */ |
| |
| case SYMBOL_REF: |
| gcc_assert (size == init_frag.size); |
| if (init_frag.offset) |
| sorry ("cannot emit unaligned pointers in ptx assembly"); |
| |
| nvptx_maybe_record_fnsym (x); |
| init_frag.val = val; |
| output_init_frag (x); |
| break; |
| } |
| |
| return true; |
| } |
| |
| /* Output SIZE zero bytes. We ignore the FILE argument since the |
| functions we're calling to perform the output just use |
| asm_out_file. */ |
| |
| void |
| nvptx_output_skip (FILE *, unsigned HOST_WIDE_INT size) |
| { |
| /* Finish the current fragment, if it's started. */ |
| if (init_frag.offset) |
| { |
| unsigned part = init_frag.size - init_frag.offset; |
| part = MIN (part, (unsigned)size); |
| size -= part; |
| nvptx_assemble_value (0, part); |
| } |
| |
| /* If this skip doesn't terminate the initializer, write as many |
| remaining pieces as possible directly. */ |
| if (size < init_frag.remaining * init_frag.size) |
| { |
| while (size >= init_frag.size) |
| { |
| size -= init_frag.size; |
| output_init_frag (NULL_RTX); |
| } |
| if (size) |
| nvptx_assemble_value (0, size); |
| } |
| } |
| |
| /* Output a string STR with length SIZE. As in nvptx_output_skip we |
| ignore the FILE arg. */ |
| |
| void |
| nvptx_output_ascii (FILE *, const char *str, unsigned HOST_WIDE_INT size) |
| { |
| for (unsigned HOST_WIDE_INT i = 0; i < size; i++) |
| nvptx_assemble_value (str[i], 1); |
| } |
| |
| /* Return true if TYPE is a record type whose last field is an array with |
| unspecified bound, i.e. a flexible array member. */ |
| |
| static bool |
| flexible_array_member_type_p (const_tree type) |
| { |
| if (TREE_CODE (type) != RECORD_TYPE) |
| return false; |
| |
| const_tree last_field = NULL_TREE; |
| for (const_tree f = TYPE_FIELDS (type); f; f = TREE_CHAIN (f)) |
| last_field = f; |
| |
| if (!last_field) |
| return false; |
| |
| const_tree last_field_type = TREE_TYPE (last_field); |
| if (TREE_CODE (last_field_type) != ARRAY_TYPE) |
| return false; |
| |
| return (! TYPE_DOMAIN (last_field_type) |
| || ! TYPE_MAX_VALUE (TYPE_DOMAIN (last_field_type))); |
| } |
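| |
| /* E.g. this returns true for |
| |
|      struct s { int len; char data[]; }; |
| |
|    whose trailing flexible array member has no upper bound.  */ |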
| |
| /* Emit a PTX variable decl and prepare for emission of its |
| initializer. NAME is the symbol name and SECTION the PTX data |
| area. The type is TYPE, object size SIZE and alignment is ALIGN. |
| The caller has already emitted any indentation and linkage |
| specifier. It is responsible for any initializer, terminating ; |
| and newline. SIZE is in bytes, ALIGN is in bits -- confusingly, |
| the opposite way round from how PTX wants them! */ |
| |
| static void |
| nvptx_assemble_decl_begin (FILE *file, const char *name, const char *section, |
| const_tree type, HOST_WIDE_INT size, unsigned align, |
| bool undefined = false) |
| { |
| bool atype = (TREE_CODE (type) == ARRAY_TYPE) |
| && (TYPE_DOMAIN (type) == NULL_TREE); |
| |
| if (undefined && flexible_array_member_type_p (type)) |
| { |
| size = 0; |
| atype = true; |
| } |
| |
| while (TREE_CODE (type) == ARRAY_TYPE) |
| type = TREE_TYPE (type); |
| |
| if (TREE_CODE (type) == VECTOR_TYPE |
| || TREE_CODE (type) == COMPLEX_TYPE) |
| /* Neither vector nor complex types can contain the other. */ |
| type = TREE_TYPE (type); |
| |
| unsigned HOST_WIDE_INT elt_size = int_size_in_bytes (type); |
| |
| /* Largest mode we're prepared to accept. For BLKmode types we |
| don't know if it'll contain pointer constants, so have to choose |
| pointer size, otherwise we can choose DImode. */ |
| machine_mode elt_mode = TYPE_MODE (type) == BLKmode ? Pmode : DImode; |
| |
| elt_size |= GET_MODE_SIZE (elt_mode); |
| elt_size &= -elt_size; /* Extract LSB set. */ |
| |
| init_frag.size = elt_size; |
| /* Avoid undefined shift behavior by using '2'. */ |
| init_frag.mask = ((unsigned HOST_WIDE_INT)2 |
| << (elt_size * BITS_PER_UNIT - 1)) - 1; |
| init_frag.val = 0; |
| init_frag.offset = 0; |
| init_frag.started = false; |
| /* Size might not be a multiple of elt size, if there's an |
| initialized trailing struct array with smaller type than |
| elt_size. */ |
| init_frag.remaining = (size + elt_size - 1) / elt_size; |
| |
| fprintf (file, "%s .align %d .u" HOST_WIDE_INT_PRINT_UNSIGNED " ", |
| section, align / BITS_PER_UNIT, |
| elt_size * BITS_PER_UNIT); |
| assemble_name (file, name); |
| |
| if (size) |
| /* We make everything an array, to simplify any initialization |
| emission. */ |
| fprintf (file, "[" HOST_WIDE_INT_PRINT_UNSIGNED "]", init_frag.remaining); |
| else if (atype) |
| fprintf (file, "[]"); |
| } |
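| |
| /* For illustration (name made up): for a global "int arr[4]" this prints |
|    roughly |
| |
|      .global .align 4 .u32 arr[4] |
| |
|    and leaves the initializer, terminating ';' and newline to the |
|    caller, as described above.  */ |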
| |
| /* Called when the initializer for a decl has been completely output through |
| combinations of the three functions above. */ |
| |
| static void |
| nvptx_assemble_decl_end (void) |
| { |
| if (init_frag.offset) |
| /* This can happen with a packed struct with trailing array member. */ |
| nvptx_assemble_value (0, init_frag.size - init_frag.offset); |
| fprintf (asm_out_file, init_frag.started ? " };\n" : ";\n"); |
| } |
| |
| /* Output an uninitialized common or file-scope variable. */ |
| |
| void |
| nvptx_output_aligned_decl (FILE *file, const char *name, |
| const_tree decl, HOST_WIDE_INT size, unsigned align) |
| { |
| write_var_marker (file, true, TREE_PUBLIC (decl), name); |
| |
| /* If this is public, it is common. The nearest thing we have to |
| common is weak. */ |
| fprintf (file, "\t%s", TREE_PUBLIC (decl) ? ".weak " : ""); |
| |
| nvptx_assemble_decl_begin (file, name, section_for_decl (decl), |
| TREE_TYPE (decl), size, align); |
| nvptx_assemble_decl_end (); |
| } |
| |
| /* Implement TARGET_ASM_DECLARE_CONSTANT_NAME. Begin the process of |
| writing a constant variable EXP with NAME and SIZE and its |
| initializer to FILE. */ |
| |
| static void |
| nvptx_asm_declare_constant_name (FILE *file, const char *name, |
| const_tree exp, HOST_WIDE_INT obj_size) |
| { |
| write_var_marker (file, true, false, name); |
| |
| fprintf (file, "\t"); |
| |
| tree type = TREE_TYPE (exp); |
| nvptx_assemble_decl_begin (file, name, ".const", type, obj_size, |
| TYPE_ALIGN (type)); |
| } |
| |
| /* Implement the ASM_DECLARE_OBJECT_NAME macro. Used to start writing |
| a variable DECL with NAME to FILE. */ |
| |
| void |
| nvptx_declare_object_name (FILE *file, const char *name, const_tree decl) |
| { |
| write_var_marker (file, true, TREE_PUBLIC (decl), name); |
| |
| fprintf (file, "\t%s", (!TREE_PUBLIC (decl) ? "" |
| : DECL_WEAK (decl) ? ".weak " : ".visible ")); |
| |
| tree type = TREE_TYPE (decl); |
| HOST_WIDE_INT obj_size = tree_to_shwi (DECL_SIZE_UNIT (decl)); |
| nvptx_assemble_decl_begin (file, name, section_for_decl (decl), |
| type, obj_size, DECL_ALIGN (decl)); |
| } |
| |
| /* Implement TARGET_ASM_GLOBALIZE_LABEL by doing nothing. */ |
| |
| static void |
| nvptx_globalize_label (FILE *, const char *) |
| { |
| } |
| |
| /* Implement TARGET_ASM_ASSEMBLE_UNDEFINED_DECL. Write an extern |
| declaration only for variable DECL with NAME to FILE. */ |
| |
| static void |
| nvptx_assemble_undefined_decl (FILE *file, const char *name, const_tree decl) |
| { |
| /* The middle end can place constant pool decls into the varpool as |
| undefined. Until that is fixed, catch the problem here. */ |
| if (DECL_IN_CONSTANT_POOL (decl)) |
| return; |
| |
| /* We support weak definitions, and hence have the right |
| ASM_WEAKEN_DECL definition. Diagnose the problem here. */ |
| if (DECL_WEAK (decl)) |
| error_at (DECL_SOURCE_LOCATION (decl), |
| "PTX does not support weak declarations" |
| " (only weak definitions)"); |
| write_var_marker (file, false, TREE_PUBLIC (decl), name); |
| |
| fprintf (file, "\t.extern "); |
| tree size = DECL_SIZE_UNIT (decl); |
| nvptx_assemble_decl_begin (file, name, section_for_decl (decl), |
| TREE_TYPE (decl), size ? tree_to_shwi (size) : 0, |
| DECL_ALIGN (decl), true); |
| nvptx_assemble_decl_end (); |
| } |
| |
| /* Output a pattern for a move instruction. */ |
| |
| const char * |
| nvptx_output_mov_insn (rtx dst, rtx src) |
| { |
| machine_mode dst_mode = GET_MODE (dst); |
| machine_mode src_mode = GET_MODE (src); |
| machine_mode dst_inner = (GET_CODE (dst) == SUBREG |
| ? GET_MODE (XEXP (dst, 0)) : dst_mode); |
| machine_mode src_inner = (GET_CODE (src) == SUBREG |
| ? GET_MODE (XEXP (src, 0)) : dst_mode); |
| |
| rtx sym = src; |
| if (GET_CODE (sym) == CONST) |
| sym = XEXP (XEXP (sym, 0), 0); |
| if (SYMBOL_REF_P (sym)) |
| { |
| if (SYMBOL_DATA_AREA (sym) != DATA_AREA_GENERIC) |
| return "%.\tcvta%D1%t0\t%0, %1;"; |
| nvptx_maybe_record_fnsym (sym); |
| } |
| |
| if (src_inner == dst_inner) |
| return "%.\tmov%t0\t%0, %1;"; |
| |
| if (CONSTANT_P (src)) |
| return (GET_MODE_CLASS (dst_inner) == MODE_INT |
| && GET_MODE_CLASS (src_inner) != MODE_FLOAT |
| ? "%.\tmov%t0\t%0, %1;" : "%.\tmov.b%T0\t%0, %1;"); |
| |
| if (GET_MODE_SIZE (dst_inner) == GET_MODE_SIZE (src_inner)) |
| { |
| if (GET_MODE_BITSIZE (dst_mode) == 128 |
| && GET_MODE_BITSIZE (src_mode) == 128) |
| { |
| /* mov.b128 is not supported. */ |
| if (dst_inner == V2DImode && src_inner == TImode) |
| return "%.\tmov.u64\t%0.x, %L1;\n\t%.\tmov.u64\t%0.y, %H1;"; |
| else if (dst_inner == TImode && src_inner == V2DImode) |
| return "%.\tmov.u64\t%L0, %1.x;\n\t%.\tmov.u64\t%H0, %1.y;"; |
| |
| gcc_unreachable (); |
| } |
| return "%.\tmov.b%T0\t%0, %1;"; |
| } |
| |
| if (GET_MODE_BITSIZE (src_inner) == 128 |
| && GET_MODE_BITSIZE (src_mode) == 64) |
| return "%.\tmov.b%T0\t%0, %1;"; |
| |
| return "%.\tcvt%t0%t1\t%0, %1;"; |
| } |
| |
| /* Output a pre/post barrier for MEM_OPERAND according to MEMMODEL. */ |
| |
| static void |
| nvptx_output_barrier (rtx *mem_operand, int memmodel, bool pre_p) |
| { |
| bool post_p = !pre_p; |
| |
| switch (memmodel) |
| { |
| case MEMMODEL_RELAXED: |
| return; |
| case MEMMODEL_CONSUME: |
| case MEMMODEL_ACQUIRE: |
| case MEMMODEL_SYNC_ACQUIRE: |
| if (post_p) |
| break; |
| return; |
| case MEMMODEL_RELEASE: |
| case MEMMODEL_SYNC_RELEASE: |
| if (pre_p) |
| break; |
| return; |
| case MEMMODEL_ACQ_REL: |
| case MEMMODEL_SEQ_CST: |
| case MEMMODEL_SYNC_SEQ_CST: |
| if (pre_p || post_p) |
| break; |
| return; |
| default: |
| gcc_unreachable (); |
| } |
| |
| output_asm_insn ("%.\tmembar%B0;", mem_operand); |
| } |
| |
| const char * |
| nvptx_output_atomic_insn (const char *asm_template, rtx *operands, int mem_pos, |
| int memmodel_pos) |
| { |
| nvptx_output_barrier (&operands[mem_pos], INTVAL (operands[memmodel_pos]), |
| true); |
| output_asm_insn (asm_template, operands); |
| nvptx_output_barrier (&operands[mem_pos], INTVAL (operands[memmodel_pos]), |
| false); |
| return ""; |
| } |
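| |
| /* For illustration (register numbers made up): an atomic add with |
|    MEMMODEL_SEQ_CST is bracketed by barriers on both sides, roughly |
| |
|      membar.sys; |
|      atom.add.u32 %r1, [%r2], %r3; |
|      membar.sys; |
| |
|    whereas MEMMODEL_RELAXED emits the atomic operation alone.  */ |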
| |
| static void nvptx_print_operand (FILE *, rtx, int); |
| |
| /* Output INSN, which is a call to CALLEE with result RESULT. For ptx, this |
| involves writing .param declarations and in/out copies into them. For |
| indirect calls, also write the .callprototype. */ |
| |
| const char * |
| nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx callee) |
| { |
| char buf[16]; |
| static int labelno; |
| bool needs_tgt = register_operand (callee, Pmode); |
| rtx pat = PATTERN (insn); |
| if (GET_CODE (pat) == COND_EXEC) |
| pat = COND_EXEC_CODE (pat); |
| int arg_end = XVECLEN (pat, 0); |
| tree decl = NULL_TREE; |
| |
| fprintf (asm_out_file, "\t{\n"); |
| if (result != NULL) |
| fprintf (asm_out_file, "\t\t.param%s %s_in;\n", |
| nvptx_ptx_type_from_mode (GET_MODE (result), false), |
| reg_names[NVPTX_RETURN_REGNUM]); |
| |
| /* Ensure we have a ptx declaration in the output if necessary. */ |
| if (GET_CODE (callee) == SYMBOL_REF) |
| { |
| decl = SYMBOL_REF_DECL (callee); |
| if (!decl |
| || (DECL_EXTERNAL (decl) && !TYPE_ARG_TYPES (TREE_TYPE (decl)))) |
| nvptx_record_libfunc (callee, result, pat); |
| else if (DECL_EXTERNAL (decl)) |
| nvptx_record_fndecl (decl); |
| } |
| |
| if (needs_tgt) |
| { |
| ASM_GENERATE_INTERNAL_LABEL (buf, "LCT", labelno); |
| labelno++; |
| ASM_OUTPUT_LABEL (asm_out_file, buf); |
| std::stringstream s; |
| write_fn_proto_from_insn (s, NULL, result, pat); |
| fputs (s.str().c_str(), asm_out_file); |
| } |
| |
| for (int argno = 1; argno < arg_end; argno++) |
| { |
| rtx t = XEXP (XVECEXP (pat, 0, argno), 0); |
| machine_mode mode = GET_MODE (t); |
| const char *ptx_type = nvptx_ptx_type_from_mode (mode, false); |
| |
| /* Mode splitting has already been done. */ |
| fprintf (asm_out_file, "\t\t.param%s %%out_arg%d;\n" |
| "\t\tst.param%s [%%out_arg%d], ", |
| ptx_type, argno, ptx_type, argno); |
| output_reg (asm_out_file, REGNO (t), VOIDmode); |
| fprintf (asm_out_file, ";\n"); |
| } |
| |
| /* The '.' stands for the call's predicate, if any. */ |
| nvptx_print_operand (asm_out_file, NULL_RTX, '.'); |
| fprintf (asm_out_file, "\t\tcall "); |
| if (result != NULL_RTX) |
| fprintf (asm_out_file, "(%s_in), ", reg_names[NVPTX_RETURN_REGNUM]); |
| |
| if (decl) |
| { |
| char *replaced_dots = NULL; |
| const char *name = get_fnname_from_decl (decl); |
| const char *replacement = nvptx_name_replacement (name); |
| if (replacement != name) |
| name = replacement; |
| else |
| { |
| replaced_dots = nvptx_replace_dot (name); |
| if (replaced_dots) |
| name = replaced_dots; |
| } |
| assemble_name (asm_out_file, name); |
| if (replaced_dots) |
| XDELETE (replaced_dots); |
| } |
| else |
| output_address (VOIDmode, callee); |
| |
| const char *open = "("; |
| for (int argno = 1; argno < arg_end; argno++) |
| { |
| fprintf (asm_out_file, ", %s%%out_arg%d", open, argno); |
| open = ""; |
| } |
| if (decl && DECL_STATIC_CHAIN (decl)) |
| { |
| fprintf (asm_out_file, ", %s%s", open, reg_names [STATIC_CHAIN_REGNUM]); |
| open = ""; |
| } |
| if (!open[0]) |
| fprintf (asm_out_file, ")"); |
| |
| if (needs_tgt) |
| { |
| fprintf (asm_out_file, ", "); |
| assemble_name (asm_out_file, buf); |
| } |
| fprintf (asm_out_file, ";\n"); |
| |
| if (find_reg_note (insn, REG_NORETURN, NULL)) |
| { |
| /* Noreturn functions confuse the PTX JIT, as it doesn't realize |
| the flow control barrier they imply. It can seg fault if it |
| encounters what looks like an unexitable loop. Emit a trailing |
| trap and exit, which it does grok. */ |
| fprintf (asm_out_file, "\t\ttrap; // (noreturn)\n"); |
| fprintf (asm_out_file, "\t\texit; // (noreturn)\n"); |
| } |
| |
| if (result) |
| { |
| static char rval[sizeof ("\tld.param%%t0\t%%0, [%%%s_in];\n\t}") + 8]; |
| |
| if (!rval[0]) |
| /* We must escape the '%' that starts RETURN_REGNUM. */ |
| sprintf (rval, "\tld.param%%t0\t%%0, [%%%s_in];\n\t}", |
| reg_names[NVPTX_RETURN_REGNUM]); |
| return rval; |
| } |
| |
| return "}"; |
| } |
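| |
| /* For illustration (symbol and register names made up): a call to |
|    "int foo (void *)" comes out roughly as |
| |
|      { |
|        .param.u32 %value_in; |
|        .param.u64 %out_arg1; |
|        st.param.u64 [%out_arg1], %r23; |
|        call (%value_in), foo, (%out_arg1); |
|        ld.param.u32 %r24, [%value_in]; |
|      } |
| |
|    with the trailing ld.param and closing brace supplied by the returned |
|    template.  */ |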
| |
| /* Implement TARGET_PRINT_OPERAND_PUNCT_VALID_P. */ |
| |
| static bool |
| nvptx_print_operand_punct_valid_p (unsigned char c) |
| { |
| return c == '.' || c == '#'; |
| } |
| |
| /* Subroutine of nvptx_print_operand; used to print a memory reference X to FILE. */ |
| |
| static void |
| nvptx_print_address_operand (FILE *file, rtx x, machine_mode) |
| { |
| rtx off; |
| if (GET_CODE (x) == CONST) |
| x = XEXP (x, 0); |
| switch (GET_CODE (x)) |
| { |
| case PLUS: |
| off = XEXP (x, 1); |
| output_address (VOIDmode, XEXP (x, 0)); |
| fprintf (file, "+"); |
| output_address (VOIDmode, off); |
| break; |
| |
| case SYMBOL_REF: |
| case LABEL_REF: |
| output_addr_const (file, x); |
| break; |
| |
| default: |
| gcc_assert (GET_CODE (x) != MEM); |
| nvptx_print_operand (file, x, 0); |
| break; |
| } |
| } |
| |
| /* Write assembly language output for the address ADDR to FILE. */ |
| |
| static void |
| nvptx_print_operand_address (FILE *file, machine_mode mode, rtx addr) |
| { |
| nvptx_print_address_operand (file, addr, mode); |
| } |
| |
| static nvptx_data_area |
| nvptx_mem_data_area (const_rtx x) |
| { |
| gcc_assert (GET_CODE (x) == MEM); |
| |
| const_rtx addr = XEXP (x, 0); |
| subrtx_iterator::array_type array; |
| FOR_EACH_SUBRTX (iter, array, addr, ALL) |
| if (SYMBOL_REF_P (*iter)) |
| return SYMBOL_DATA_AREA (*iter); |
| |
| return DATA_AREA_GENERIC; |
| } |
| |
| bool |
| nvptx_mem_maybe_shared_p (const_rtx x) |
| { |
| nvptx_data_area area = nvptx_mem_data_area (x); |
| return area == DATA_AREA_SHARED || area == DATA_AREA_GENERIC; |
| } |
| |
| /* Print an operand, X, to FILE, with an optional modifier in CODE. |
| |
| Meaning of CODE: |
| . -- print the predicate for the instruction or an empty string for an |
| unconditional one. |
| # -- print a rounding mode for the instruction |
| |
| A -- print a data area for a MEM |
| B -- print a memory-barrier scope suffix (.cta or .sys) for a MEM |
| c -- print an opcode suffix for a comparison operator, including a type code |
| D -- print a data area for an address operand |
| H -- print the high part of a mode-split register |
| j -- print "@" followed by the operand (a branch predicate) |
| J -- print "@!" followed by the operand (a negated branch predicate) |
| L -- print the low part of a mode-split register |
| S -- print a shuffle kind specified by CONST_INT |
| t -- print a type opcode suffix, promoting QImode to 32 bits |
| T -- print a type size in bits |
| u -- print a type opcode suffix without promotions. |
| x -- print a destination operand that may also be a bit bucket. */ |
| |
| static void |
| nvptx_print_operand (FILE *file, rtx x, int code) |
| { |
| if (code == '.') |
| { |
| x = current_insn_predicate; |
| if (x) |
| { |
| fputs ("@", file); |
| if (GET_CODE (x) == EQ) |
| fputs ("!", file); |
| output_reg (file, REGNO (XEXP (x, 0)), VOIDmode); |
| } |
| return; |
| } |
| else if (code == '#') |
| { |
| fputs (".rn", file); |
| return; |
| } |
| |
| enum rtx_code x_code = GET_CODE (x); |
| machine_mode mode = GET_MODE (x); |
| |
| switch (code) |
| { |
| case 'x': |
| if (current_output_insn != NULL |
| && find_reg_note (current_output_insn, REG_UNUSED, x) != NULL_RTX) |
| { |
| fputs ("_", file); |
| return; |
| } |
| goto common; |
| case 'B': |
| if (SYMBOL_REF_P (XEXP (x, 0))) |
| switch (SYMBOL_DATA_AREA (XEXP (x, 0))) |
| { |
| case DATA_AREA_GENERIC: |
| /* Assume worst-case: global. */ |
| gcc_fallthrough (); /* FALLTHROUGH. */ |
| case DATA_AREA_GLOBAL: |
| break; |
| case DATA_AREA_SHARED: |
| fputs (".cta", file); |
| return; |
| case DATA_AREA_LOCAL: |
| case DATA_AREA_CONST: |
| case DATA_AREA_PARAM: |
| default: |
| gcc_unreachable (); |
| } |
| |
| /* There are 2 cases where membar.sys differs from membar.gl: |
| - the host accesses global memory (e.g. systemwide atomics) |
| - 2 or more devices are set up in peer-to-peer mode, and one |
| peer can access the global memory of another peer. |
| Neither is currently supported by OpenMP/OpenACC on nvptx, but |
| that could change, so we default to membar.sys. We could support |
| this more optimally by adding DATA_AREA_SYS and then emitting |
| .gl for DATA_AREA_GLOBAL and .sys for DATA_AREA_SYS. */ |
| fputs (".sys", file); |
| return; |
| |
| case 'A': |
| x = XEXP (x, 0); |
| gcc_fallthrough (); /* FALLTHROUGH. */ |
| |
| case 'D': |
| if (GET_CODE (x) == CONST) |
| x = XEXP (x, 0); |
| if (GET_CODE (x) == PLUS) |
| x = XEXP (x, 0); |
| |
| if (GET_CODE (x) == SYMBOL_REF) |
| fputs (section_for_sym (x), file); |
| break; |
| |
| case 't': |
| case 'u': |
| if (x_code == SUBREG) |
| { |
| machine_mode inner_mode = GET_MODE (SUBREG_REG (x)); |
| if (VECTOR_MODE_P (inner_mode) |
| && (GET_MODE_SIZE (mode) |
| <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode)))) |
| mode = GET_MODE_INNER (inner_mode); |
| else if (split_mode_p (inner_mode)) |
| mode = maybe_split_mode (inner_mode); |
| else |
| mode = inner_mode; |
| } |
| fprintf (file, "%s", nvptx_ptx_type_from_mode (mode, code == 't')); |
| break; |
| |
| case 'H': |
| case 'L': |
| { |
| rtx inner_x = SUBREG_REG (x); |
| machine_mode inner_mode = GET_MODE (inner_x); |
| machine_mode split = maybe_split_mode (inner_mode); |
| |
| output_reg (file, REGNO (inner_x), split, |
| (code == 'H' |
| ? GET_MODE_SIZE (inner_mode) / 2 |
| : 0)); |
| } |
| break; |
| |
| case 'S': |
| { |
| nvptx_shuffle_kind kind = (nvptx_shuffle_kind) UINTVAL (x); |
| /* Same order as nvptx_shuffle_kind. */ |
| static const char *const kinds[] = |
| {".up", ".down", ".bfly", ".idx"}; |
| fputs (kinds[kind], file); |
| } |
| break; |
| |
| case 'T': |
| fprintf (file, "%d", GET_MODE_BITSIZE (mode)); |
| break; |
| |
| case 'j': |
| fprintf (file, "@"); |
| goto common; |
| |
| case 'J': |
| fprintf (file, "@!"); |
| goto common; |
| |
| case 'c': |
| mode = GET_MODE (XEXP (x, 0)); |
| switch (x_code) |
| { |
| case EQ: |
| fputs (".eq", file); |
| break; |
| case NE: |
| if (FLOAT_MODE_P (mode)) |
| fputs (".neu", file); |
| else |
| fputs (".ne", file); |
| break; |
| case LE: |
| case LEU: |
| fputs (".le", file); |
| break; |
| case GE: |
| case GEU: |
| fputs (".ge", file); |
| break; |
| case LT: |
| case LTU: |
| fputs (".lt", file); |
| break; |
| case GT: |
| case GTU: |
| fputs (".gt", file); |
| break; |
| case LTGT: |
| fputs (".ne", file); |
| break; |
| case UNEQ: |
| fputs (".equ", file); |
| break; |
| case UNLE: |
| fputs (".leu", file); |
| break; |
| case UNGE: |
| fputs (".geu", file); |
| break; |
| case UNLT: |
| fputs (".ltu", file); |
| break; |
| case UNGT: |
| fputs (".gtu", file); |
| break; |
| case UNORDERED: |
| fputs (".nan", file); |
| break; |
| case ORDERED: |
| fputs (".num", file); |
| break; |
| default: |
| gcc_unreachable (); |
| } |
| if (FLOAT_MODE_P (mode) |
| || x_code == EQ || x_code == NE |
| || x_code == GEU || x_code == GTU |
| || x_code == LEU || x_code == LTU) |
| fputs (nvptx_ptx_type_from_mode (mode, true), file); |
| else |
| fprintf (file, ".s%d", GET_MODE_BITSIZE (mode)); |
| break; |
| default: |
| common: |
| switch (x_code) |
| { |
| case SUBREG: |
| { |
| rtx inner_x = SUBREG_REG (x); |
| machine_mode inner_mode = GET_MODE (inner_x); |
| machine_mode split = maybe_split_mode (inner_mode); |
| |
| if (VECTOR_MODE_P (inner_mode) |
| && (GET_MODE_SIZE (mode) |
| <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode)))) |
| { |
| output_reg (file, REGNO (inner_x), VOIDmode); |
| fprintf (file, ".%s", SUBREG_BYTE (x) == 0 ? "x" : "y"); |
| } |
| else if (split_mode_p (inner_mode) |
| && (GET_MODE_SIZE (inner_mode) == GET_MODE_SIZE (mode))) |
| output_reg (file, REGNO (inner_x), split); |
| else |
| output_reg (file, REGNO (inner_x), split, SUBREG_BYTE (x)); |
| } |
| break; |
| |
| case REG: |
| output_reg (file, REGNO (x), maybe_split_mode (mode)); |
| break; |
| |
| case MEM: |
| fputc ('[', file); |
| nvptx_print_address_operand (file, XEXP (x, 0), mode); |
| fputc (']', file); |
| break; |
| |
| case CONST_INT: |
| output_addr_const (file, x); |
| break; |
| |
| case CONST: |
| case SYMBOL_REF: |
| case LABEL_REF: |
| /* We could use output_addr_const, but that can print things like |
| "x-8", which breaks ptxas. Need to ensure it is output as |
| "x+-8". */ |
| nvptx_print_address_operand (file, x, VOIDmode); |
| break; |
| |
| case CONST_DOUBLE: |
| long vals[2]; |
| real_to_target (vals, CONST_DOUBLE_REAL_VALUE (x), mode); |
| vals[0] &= 0xffffffff; |
| vals[1] &= 0xffffffff; |
| if (mode == SFmode) |
| fprintf (file, "0f%08lx", vals[0]); |
| else |
| fprintf (file, "0d%08lx%08lx", vals[1], vals[0]); |
| break; |
| |
| case CONST_VECTOR: |
| { |
| unsigned n = CO
|