| /* Machine description for AArch64 architecture. |
| Copyright (C) 2009-2021 Free Software Foundation, Inc. |
| Contributed by ARM Ltd. |
| |
| This file is part of GCC. |
| |
| GCC is free software; you can redistribute it and/or modify it |
| under the terms of the GNU General Public License as published by |
| the Free Software Foundation; either version 3, or (at your option) |
| any later version. |
| |
| GCC is distributed in the hope that it will be useful, but |
| WITHOUT ANY WARRANTY; without even the implied warranty of |
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| General Public License for more details. |
| |
| You should have received a copy of the GNU General Public License |
| along with GCC; see the file COPYING3. If not see |
| <http://www.gnu.org/licenses/>. */ |
| |
| #define IN_TARGET_CODE 1 |
| |
| #define INCLUDE_STRING |
| #define INCLUDE_ALGORITHM |
| #include "config.h" |
| #include "system.h" |
| #include "coretypes.h" |
| #include "backend.h" |
| #include "target.h" |
| #include "rtl.h" |
| #include "tree.h" |
| #include "memmodel.h" |
| #include "gimple.h" |
| #include "cfghooks.h" |
| #include "cfgloop.h" |
| #include "df.h" |
| #include "tm_p.h" |
| #include "stringpool.h" |
| #include "attribs.h" |
| #include "optabs.h" |
| #include "regs.h" |
| #include "emit-rtl.h" |
| #include "recog.h" |
| #include "cgraph.h" |
| #include "diagnostic.h" |
| #include "insn-attr.h" |
| #include "alias.h" |
| #include "fold-const.h" |
| #include "stor-layout.h" |
| #include "calls.h" |
| #include "varasm.h" |
| #include "output.h" |
| #include "flags.h" |
| #include "explow.h" |
| #include "expr.h" |
| #include "reload.h" |
| #include "langhooks.h" |
| #include "opts.h" |
| #include "gimplify.h" |
| #include "dwarf2.h" |
| #include "gimple-iterator.h" |
| #include "tree-vectorizer.h" |
| #include "aarch64-cost-tables.h" |
| #include "dumpfile.h" |
| #include "builtins.h" |
| #include "rtl-iter.h" |
| #include "tm-constrs.h" |
| #include "sched-int.h" |
| #include "target-globals.h" |
| #include "common/common-target.h" |
| #include "cfgrtl.h" |
| #include "selftest.h" |
| #include "selftest-rtl.h" |
| #include "rtx-vector-builder.h" |
| #include "intl.h" |
| #include "expmed.h" |
| #include "function-abi.h" |
| #include "gimple-pretty-print.h" |
| #include "tree-ssa-loop-niter.h" |
| #include "fractional-cost.h" |
| #include "rtlanal.h" |
| |
| /* This file should be included last. */ |
| #include "target-def.h" |
| |
| /* Defined for convenience. */ |
| #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT) |
| |
| /* Information about a legitimate vector immediate operand. */ |
| struct simd_immediate_info |
| { |
| enum insn_type { MOV, MVN, INDEX, PTRUE }; |
| enum modifier_type { LSL, MSL }; |
| |
| simd_immediate_info () {} |
| simd_immediate_info (scalar_float_mode, rtx); |
| simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT, |
| insn_type = MOV, modifier_type = LSL, |
| unsigned int = 0); |
| simd_immediate_info (scalar_mode, rtx, rtx); |
| simd_immediate_info (scalar_int_mode, aarch64_svpattern); |
| |
| /* The mode of the elements. */ |
| scalar_mode elt_mode; |
| |
| /* The instruction to use to move the immediate into a vector. */ |
| insn_type insn; |
| |
| union |
| { |
| /* For MOV and MVN. */ |
| struct |
| { |
| /* The value of each element. */ |
| rtx value; |
| |
| /* The kind of shift modifier to use, and the number of bits to shift. |
| This is (LSL, 0) if no shift is needed. */ |
| modifier_type modifier; |
| unsigned int shift; |
| } mov; |
| |
| /* For INDEX. */ |
| struct |
| { |
| /* The value of the first element and the step to be added for each |
| subsequent element. */ |
| rtx base, step; |
| } index; |
| |
| /* For PTRUE. */ |
| aarch64_svpattern pattern; |
| } u; |
| }; |
| |
| /* Construct a floating-point immediate in which each element has mode |
| ELT_MODE_IN and value VALUE_IN. */ |
| inline simd_immediate_info |
| ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in) |
| : elt_mode (elt_mode_in), insn (MOV) |
| { |
| u.mov.value = value_in; |
| u.mov.modifier = LSL; |
| u.mov.shift = 0; |
| } |
| |
| /* Construct an integer immediate in which each element has mode ELT_MODE_IN |
| and value VALUE_IN. The other parameters are as for the structure |
| fields. */ |
| inline simd_immediate_info |
| ::simd_immediate_info (scalar_int_mode elt_mode_in, |
| unsigned HOST_WIDE_INT value_in, |
| insn_type insn_in, modifier_type modifier_in, |
| unsigned int shift_in) |
| : elt_mode (elt_mode_in), insn (insn_in) |
| { |
| u.mov.value = gen_int_mode (value_in, elt_mode_in); |
| u.mov.modifier = modifier_in; |
| u.mov.shift = shift_in; |
| } |
| |
| /* Construct an integer immediate in which each element has mode ELT_MODE_IN |
| and where element I is equal to BASE_IN + I * STEP_IN. */ |
| inline simd_immediate_info |
| ::simd_immediate_info (scalar_mode elt_mode_in, rtx base_in, rtx step_in) |
| : elt_mode (elt_mode_in), insn (INDEX) |
| { |
| u.index.base = base_in; |
| u.index.step = step_in; |
| } |
| |
| /* Construct a predicate that controls elements of mode ELT_MODE_IN |
| and has PTRUE pattern PATTERN_IN. */ |
| inline simd_immediate_info |
| ::simd_immediate_info (scalar_int_mode elt_mode_in, |
| aarch64_svpattern pattern_in) |
| : elt_mode (elt_mode_in), insn (PTRUE) |
| { |
| u.pattern = pattern_in; |
| } |
| |
| namespace { |
| |
| /* Describes types that map to Pure Scalable Types (PSTs) in the AAPCS64. */ |
| class pure_scalable_type_info |
| { |
| public: |
| /* Represents the result of analyzing a type. All values are nonzero, |
| in the possibly forlorn hope that accidental conversions to bool |
| trigger a warning. */ |
| enum analysis_result |
| { |
| /* The type does not have an ABI identity; i.e. it doesn't contain |
| at least one object whose type is a Fundamental Data Type. */ |
| NO_ABI_IDENTITY = 1, |
| |
| /* The type is definitely a Pure Scalable Type. */ |
| IS_PST, |
| |
| /* The type is definitely not a Pure Scalable Type. */ |
| ISNT_PST, |
| |
| /* It doesn't matter for PCS purposes whether the type is a Pure |
| Scalable Type or not, since the type will be handled the same |
| way regardless. |
| |
| Specifically, this means that if the type is a Pure Scalable Type, |
| there aren't enough argument registers to hold it, and so it will |
| need to be passed or returned in memory. If the type isn't a |
| Pure Scalable Type, it's too big to be passed or returned in core |
| or SIMD&FP registers, and so again will need to go in memory. */ |
| DOESNT_MATTER |
| }; |
| |
| /* Aggregates of 17 bytes or more are normally passed and returned |
| in memory, so aggregates of that size can safely be analyzed as |
| DOESNT_MATTER. We need to be able to collect enough pieces to |
| represent a PST that is smaller than that. Since predicates are |
| 2 bytes in size for -msve-vector-bits=128, that means we need to be |
| able to store at least 8 pieces. |
| |
| We also need to be able to store enough pieces to represent |
| a single vector in each vector argument register and a single |
| predicate in each predicate argument register. This means that |
| we need at least 12 pieces. */ |
| static const unsigned int MAX_PIECES = NUM_FP_ARG_REGS + NUM_PR_ARG_REGS; |
| static_assert (MAX_PIECES >= 8, "Need to store at least 8 predicates"); |
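
  /* For example, the ACLE tuple type svfloat32x2_t is a Pure Scalable
     Type built from two Scalable Vector Types: it would be analyzed as
     IS_PST, occupies two Z registers, and is passed in registers rather
     than in memory when enough Z argument registers are free.
     (Illustrative example only.)  */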
| |
| /* Describes one piece of a PST. Each piece is one of: |
| |
| - a single Scalable Vector Type (SVT) |
| - a single Scalable Predicate Type (SPT) |
| - a PST containing 2, 3 or 4 SVTs, with no padding |
| |
| It either represents a single built-in type or a PST formed from |
| multiple homogeneous built-in types. */ |
| struct piece |
| { |
| rtx get_rtx (unsigned int, unsigned int) const; |
| |
| /* The number of vector and predicate registers that the piece |
| occupies. One of the two is always zero. */ |
| unsigned int num_zr; |
| unsigned int num_pr; |
| |
| /* The mode of the registers described above. */ |
| machine_mode mode; |
| |
| /* If this piece is formed from multiple homogeneous built-in types, |
| this is the mode of the built-in types, otherwise it is MODE. */ |
| machine_mode orig_mode; |
| |
| /* The offset in bytes of the piece from the start of the type. */ |
| poly_uint64_pod offset; |
| }; |
| |
| /* Divides types analyzed as IS_PST into individual pieces. The pieces |
| are in memory order. */ |
| auto_vec<piece, MAX_PIECES> pieces; |
| |
| unsigned int num_zr () const; |
| unsigned int num_pr () const; |
| |
| rtx get_rtx (machine_mode mode, unsigned int, unsigned int) const; |
| |
| analysis_result analyze (const_tree); |
| bool analyze_registers (const_tree); |
| |
| private: |
| analysis_result analyze_array (const_tree); |
| analysis_result analyze_record (const_tree); |
| void add_piece (const piece &); |
| }; |
| } |
| |
| /* The current code model. */ |
| enum aarch64_code_model aarch64_cmodel; |
| |
| /* The number of 64-bit elements in an SVE vector. */ |
| poly_uint16 aarch64_sve_vg; |
| |
| #ifdef HAVE_AS_TLS |
| #undef TARGET_HAVE_TLS |
| #define TARGET_HAVE_TLS 1 |
| #endif |
| |
| static bool aarch64_composite_type_p (const_tree, machine_mode); |
| static bool aarch64_return_in_memory_1 (const_tree); |
| static bool aarch64_vfp_is_call_or_return_candidate (machine_mode, |
| const_tree, |
| machine_mode *, int *, |
| bool *, bool); |
| static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED; |
| static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED; |
| static void aarch64_override_options_after_change (void); |
| static bool aarch64_vector_mode_supported_p (machine_mode); |
| static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool); |
| static bool aarch64_builtin_support_vector_misalignment (machine_mode mode, |
| const_tree type, |
| int misalignment, |
| bool is_packed); |
| static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64); |
| static bool aarch64_print_address_internal (FILE*, machine_mode, rtx, |
| aarch64_addr_query_type); |
| static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val); |
| |
| /* Major revision number of the ARM Architecture implemented by the target. */ |
| unsigned aarch64_architecture_version; |
| |
| /* The processor for which instructions should be scheduled. */ |
| enum aarch64_processor aarch64_tune = cortexa53; |
| |
| /* Mask to specify which instruction scheduling options should be used. */ |
| uint64_t aarch64_tune_flags = 0; |
| |
/* Global flag for PC-relative literal loads. */
| bool aarch64_pcrelative_literal_loads; |
| |
/* Global flag for whether the frame pointer is enabled. */
| bool aarch64_use_frame_pointer; |
| |
| #define BRANCH_PROTECT_STR_MAX 255 |
| char *accepted_branch_protection_string = NULL; |
| |
| static enum aarch64_parse_opt_result |
| aarch64_parse_branch_protection (const char*, char**); |
| |
| /* Support for command line parsing of boolean flags in the tuning |
| structures. */ |
| struct aarch64_flag_desc |
| { |
| const char* name; |
| unsigned int flag; |
| }; |
| |
| #define AARCH64_FUSION_PAIR(name, internal_name) \ |
| { name, AARCH64_FUSE_##internal_name }, |
| static const struct aarch64_flag_desc aarch64_fusible_pairs[] = |
| { |
| { "none", AARCH64_FUSE_NOTHING }, |
| #include "aarch64-fusion-pairs.def" |
| { "all", AARCH64_FUSE_ALL }, |
| { NULL, AARCH64_FUSE_NOTHING } |
| }; |
| |
| #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \ |
| { name, AARCH64_EXTRA_TUNE_##internal_name }, |
| static const struct aarch64_flag_desc aarch64_tuning_flags[] = |
| { |
| { "none", AARCH64_EXTRA_TUNE_NONE }, |
| #include "aarch64-tuning-flags.def" |
| { "all", AARCH64_EXTRA_TUNE_ALL }, |
| { NULL, AARCH64_EXTRA_TUNE_NONE } |
| }; |
| |
| /* Tuning parameters. */ |
| |
| static const struct cpu_addrcost_table generic_addrcost_table = |
| { |
| { |
| 1, /* hi */ |
| 0, /* si */ |
| 0, /* di */ |
| 1, /* ti */ |
| }, |
| 0, /* pre_modify */ |
| 0, /* post_modify */ |
| 0, /* post_modify_ld3_st3 */ |
| 0, /* post_modify_ld4_st4 */ |
| 0, /* register_offset */ |
| 0, /* register_sextend */ |
| 0, /* register_zextend */ |
| 0 /* imm_offset */ |
| }; |
| |
| static const struct cpu_addrcost_table exynosm1_addrcost_table = |
| { |
| { |
| 0, /* hi */ |
| 0, /* si */ |
| 0, /* di */ |
| 2, /* ti */ |
| }, |
| 0, /* pre_modify */ |
| 0, /* post_modify */ |
| 0, /* post_modify_ld3_st3 */ |
| 0, /* post_modify_ld4_st4 */ |
| 1, /* register_offset */ |
| 1, /* register_sextend */ |
| 2, /* register_zextend */ |
| 0, /* imm_offset */ |
| }; |
| |
| static const struct cpu_addrcost_table xgene1_addrcost_table = |
| { |
| { |
| 1, /* hi */ |
| 0, /* si */ |
| 0, /* di */ |
| 1, /* ti */ |
| }, |
| 1, /* pre_modify */ |
| 1, /* post_modify */ |
| 1, /* post_modify_ld3_st3 */ |
| 1, /* post_modify_ld4_st4 */ |
| 0, /* register_offset */ |
| 1, /* register_sextend */ |
| 1, /* register_zextend */ |
| 0, /* imm_offset */ |
| }; |
| |
| static const struct cpu_addrcost_table thunderx2t99_addrcost_table = |
| { |
| { |
| 1, /* hi */ |
| 1, /* si */ |
| 1, /* di */ |
| 2, /* ti */ |
| }, |
| 0, /* pre_modify */ |
| 0, /* post_modify */ |
| 0, /* post_modify_ld3_st3 */ |
| 0, /* post_modify_ld4_st4 */ |
| 2, /* register_offset */ |
| 3, /* register_sextend */ |
| 3, /* register_zextend */ |
| 0, /* imm_offset */ |
| }; |
| |
| static const struct cpu_addrcost_table thunderx3t110_addrcost_table = |
| { |
| { |
| 1, /* hi */ |
| 1, /* si */ |
| 1, /* di */ |
| 2, /* ti */ |
| }, |
| 0, /* pre_modify */ |
| 0, /* post_modify */ |
| 0, /* post_modify_ld3_st3 */ |
| 0, /* post_modify_ld4_st4 */ |
| 2, /* register_offset */ |
| 3, /* register_sextend */ |
| 3, /* register_zextend */ |
| 0, /* imm_offset */ |
| }; |
| |
| static const struct cpu_addrcost_table tsv110_addrcost_table = |
| { |
| { |
| 1, /* hi */ |
| 0, /* si */ |
| 0, /* di */ |
| 1, /* ti */ |
| }, |
| 0, /* pre_modify */ |
| 0, /* post_modify */ |
| 0, /* post_modify_ld3_st3 */ |
| 0, /* post_modify_ld4_st4 */ |
| 0, /* register_offset */ |
| 1, /* register_sextend */ |
| 1, /* register_zextend */ |
| 0, /* imm_offset */ |
| }; |
| |
| static const struct cpu_addrcost_table qdf24xx_addrcost_table = |
| { |
| { |
| 1, /* hi */ |
| 1, /* si */ |
| 1, /* di */ |
| 2, /* ti */ |
| }, |
| 1, /* pre_modify */ |
| 1, /* post_modify */ |
| 1, /* post_modify_ld3_st3 */ |
| 1, /* post_modify_ld4_st4 */ |
| 3, /* register_offset */ |
| 3, /* register_sextend */ |
| 3, /* register_zextend */ |
| 2, /* imm_offset */ |
| }; |
| |
| static const struct cpu_addrcost_table a64fx_addrcost_table = |
| { |
| { |
| 1, /* hi */ |
| 1, /* si */ |
| 1, /* di */ |
| 2, /* ti */ |
| }, |
| 0, /* pre_modify */ |
| 0, /* post_modify */ |
| 0, /* post_modify_ld3_st3 */ |
| 0, /* post_modify_ld4_st4 */ |
| 2, /* register_offset */ |
| 3, /* register_sextend */ |
| 3, /* register_zextend */ |
| 0, /* imm_offset */ |
| }; |
| |
| static const struct cpu_addrcost_table neoversev1_addrcost_table = |
| { |
| { |
| 1, /* hi */ |
| 0, /* si */ |
| 0, /* di */ |
| 1, /* ti */ |
| }, |
| 0, /* pre_modify */ |
| 0, /* post_modify */ |
| 3, /* post_modify_ld3_st3 */ |
| 3, /* post_modify_ld4_st4 */ |
| 0, /* register_offset */ |
| 0, /* register_sextend */ |
| 0, /* register_zextend */ |
| 0 /* imm_offset */ |
| }; |
| |
| static const struct cpu_regmove_cost generic_regmove_cost = |
| { |
| 1, /* GP2GP */ |
| /* Avoid the use of slow int<->fp moves for spilling by setting |
| their cost higher than memmov_cost. */ |
| 5, /* GP2FP */ |
| 5, /* FP2GP */ |
| 2 /* FP2FP */ |
| }; |
| |
| static const struct cpu_regmove_cost cortexa57_regmove_cost = |
| { |
| 1, /* GP2GP */ |
| /* Avoid the use of slow int<->fp moves for spilling by setting |
| their cost higher than memmov_cost. */ |
| 5, /* GP2FP */ |
| 5, /* FP2GP */ |
| 2 /* FP2FP */ |
| }; |
| |
| static const struct cpu_regmove_cost cortexa53_regmove_cost = |
| { |
| 1, /* GP2GP */ |
| /* Avoid the use of slow int<->fp moves for spilling by setting |
| their cost higher than memmov_cost. */ |
| 5, /* GP2FP */ |
| 5, /* FP2GP */ |
| 2 /* FP2FP */ |
| }; |
| |
| static const struct cpu_regmove_cost exynosm1_regmove_cost = |
| { |
| 1, /* GP2GP */ |
| /* Avoid the use of slow int<->fp moves for spilling by setting |
     their cost higher than memmov_cost (the actual costs are 4 and 9). */
| 9, /* GP2FP */ |
| 9, /* FP2GP */ |
| 1 /* FP2FP */ |
| }; |
| |
| static const struct cpu_regmove_cost thunderx_regmove_cost = |
| { |
| 2, /* GP2GP */ |
| 2, /* GP2FP */ |
| 6, /* FP2GP */ |
| 4 /* FP2FP */ |
| }; |
| |
| static const struct cpu_regmove_cost xgene1_regmove_cost = |
| { |
| 1, /* GP2GP */ |
| /* Avoid the use of slow int<->fp moves for spilling by setting |
| their cost higher than memmov_cost. */ |
| 8, /* GP2FP */ |
| 8, /* FP2GP */ |
| 2 /* FP2FP */ |
| }; |
| |
| static const struct cpu_regmove_cost qdf24xx_regmove_cost = |
| { |
| 2, /* GP2GP */ |
| /* Avoid the use of int<->fp moves for spilling. */ |
| 6, /* GP2FP */ |
| 6, /* FP2GP */ |
| 4 /* FP2FP */ |
| }; |
| |
| static const struct cpu_regmove_cost thunderx2t99_regmove_cost = |
| { |
| 1, /* GP2GP */ |
| /* Avoid the use of int<->fp moves for spilling. */ |
| 5, /* GP2FP */ |
| 6, /* FP2GP */ |
| 3, /* FP2FP */ |
| }; |
| |
| static const struct cpu_regmove_cost thunderx3t110_regmove_cost = |
| { |
| 1, /* GP2GP */ |
| /* Avoid the use of int<->fp moves for spilling. */ |
| 4, /* GP2FP */ |
| 5, /* FP2GP */ |
| 4 /* FP2FP */ |
| }; |
| |
| static const struct cpu_regmove_cost tsv110_regmove_cost = |
| { |
| 1, /* GP2GP */ |
| /* Avoid the use of slow int<->fp moves for spilling by setting |
| their cost higher than memmov_cost. */ |
| 2, /* GP2FP */ |
| 3, /* FP2GP */ |
| 2 /* FP2FP */ |
| }; |
| |
| static const struct cpu_regmove_cost a64fx_regmove_cost = |
| { |
| 1, /* GP2GP */ |
| /* Avoid the use of slow int<->fp moves for spilling by setting |
| their cost higher than memmov_cost. */ |
| 5, /* GP2FP */ |
| 7, /* FP2GP */ |
| 2 /* FP2FP */ |
| }; |
| |
| /* Generic costs for Advanced SIMD vector operations. */ |
| static const advsimd_vec_cost generic_advsimd_vector_cost = |
| { |
| 1, /* int_stmt_cost */ |
| 1, /* fp_stmt_cost */ |
| 0, /* ld2_st2_permute_cost */ |
| 0, /* ld3_st3_permute_cost */ |
| 0, /* ld4_st4_permute_cost */ |
| 2, /* permute_cost */ |
| 2, /* reduc_i8_cost */ |
| 2, /* reduc_i16_cost */ |
| 2, /* reduc_i32_cost */ |
| 2, /* reduc_i64_cost */ |
| 2, /* reduc_f16_cost */ |
| 2, /* reduc_f32_cost */ |
| 2, /* reduc_f64_cost */ |
| 2, /* store_elt_extra_cost */ |
| 2, /* vec_to_scalar_cost */ |
| 1, /* scalar_to_vec_cost */ |
| 1, /* align_load_cost */ |
| 1, /* unalign_load_cost */ |
| 1, /* unalign_store_cost */ |
| 1 /* store_cost */ |
| }; |
| |
| /* Generic costs for SVE vector operations. */ |
| static const sve_vec_cost generic_sve_vector_cost = |
| { |
| { |
| 1, /* int_stmt_cost */ |
| 1, /* fp_stmt_cost */ |
| 0, /* ld2_st2_permute_cost */ |
| 0, /* ld3_st3_permute_cost */ |
| 0, /* ld4_st4_permute_cost */ |
| 2, /* permute_cost */ |
| 2, /* reduc_i8_cost */ |
| 2, /* reduc_i16_cost */ |
| 2, /* reduc_i32_cost */ |
| 2, /* reduc_i64_cost */ |
| 2, /* reduc_f16_cost */ |
| 2, /* reduc_f32_cost */ |
| 2, /* reduc_f64_cost */ |
| 2, /* store_elt_extra_cost */ |
| 2, /* vec_to_scalar_cost */ |
| 1, /* scalar_to_vec_cost */ |
| 1, /* align_load_cost */ |
| 1, /* unalign_load_cost */ |
| 1, /* unalign_store_cost */ |
| 1 /* store_cost */ |
| }, |
| 2, /* clast_cost */ |
| 2, /* fadda_f16_cost */ |
| 2, /* fadda_f32_cost */ |
| 2, /* fadda_f64_cost */ |
| 4, /* gather_load_x32_cost */ |
| 2, /* gather_load_x64_cost */ |
| 1 /* scatter_store_elt_cost */ |
| }; |
| |
| /* Generic costs for vector insn classes. */ |
| static const struct cpu_vector_cost generic_vector_cost = |
| { |
| 1, /* scalar_int_stmt_cost */ |
| 1, /* scalar_fp_stmt_cost */ |
| 1, /* scalar_load_cost */ |
| 1, /* scalar_store_cost */ |
| 3, /* cond_taken_branch_cost */ |
| 1, /* cond_not_taken_branch_cost */ |
| &generic_advsimd_vector_cost, /* advsimd */ |
| &generic_sve_vector_cost, /* sve */ |
| nullptr /* issue_info */ |
| }; |
| |
| static const advsimd_vec_cost a64fx_advsimd_vector_cost = |
| { |
| 2, /* int_stmt_cost */ |
| 5, /* fp_stmt_cost */ |
| 0, /* ld2_st2_permute_cost */ |
| 0, /* ld3_st3_permute_cost */ |
| 0, /* ld4_st4_permute_cost */ |
| 3, /* permute_cost */ |
| 13, /* reduc_i8_cost */ |
| 13, /* reduc_i16_cost */ |
| 13, /* reduc_i32_cost */ |
| 13, /* reduc_i64_cost */ |
| 13, /* reduc_f16_cost */ |
| 13, /* reduc_f32_cost */ |
| 13, /* reduc_f64_cost */ |
| 13, /* store_elt_extra_cost */ |
| 13, /* vec_to_scalar_cost */ |
| 4, /* scalar_to_vec_cost */ |
| 6, /* align_load_cost */ |
| 6, /* unalign_load_cost */ |
| 1, /* unalign_store_cost */ |
| 1 /* store_cost */ |
| }; |
| |
| static const sve_vec_cost a64fx_sve_vector_cost = |
| { |
| { |
| 2, /* int_stmt_cost */ |
| 5, /* fp_stmt_cost */ |
| 0, /* ld2_st2_permute_cost */ |
| 0, /* ld3_st3_permute_cost */ |
| 0, /* ld4_st4_permute_cost */ |
| 3, /* permute_cost */ |
| 13, /* reduc_i8_cost */ |
| 13, /* reduc_i16_cost */ |
| 13, /* reduc_i32_cost */ |
| 13, /* reduc_i64_cost */ |
| 13, /* reduc_f16_cost */ |
| 13, /* reduc_f32_cost */ |
| 13, /* reduc_f64_cost */ |
| 13, /* store_elt_extra_cost */ |
| 13, /* vec_to_scalar_cost */ |
| 4, /* scalar_to_vec_cost */ |
| 6, /* align_load_cost */ |
| 6, /* unalign_load_cost */ |
| 1, /* unalign_store_cost */ |
| 1 /* store_cost */ |
| }, |
| 13, /* clast_cost */ |
| 13, /* fadda_f16_cost */ |
| 13, /* fadda_f32_cost */ |
| 13, /* fadda_f64_cost */ |
| 64, /* gather_load_x32_cost */ |
| 32, /* gather_load_x64_cost */ |
| 1 /* scatter_store_elt_cost */ |
| }; |
| |
| static const struct cpu_vector_cost a64fx_vector_cost = |
| { |
| 1, /* scalar_int_stmt_cost */ |
| 5, /* scalar_fp_stmt_cost */ |
| 4, /* scalar_load_cost */ |
| 1, /* scalar_store_cost */ |
| 3, /* cond_taken_branch_cost */ |
| 1, /* cond_not_taken_branch_cost */ |
| &a64fx_advsimd_vector_cost, /* advsimd */ |
| &a64fx_sve_vector_cost, /* sve */ |
| nullptr /* issue_info */ |
| }; |
| |
| static const advsimd_vec_cost qdf24xx_advsimd_vector_cost = |
| { |
| 1, /* int_stmt_cost */ |
| 3, /* fp_stmt_cost */ |
| 0, /* ld2_st2_permute_cost */ |
| 0, /* ld3_st3_permute_cost */ |
| 0, /* ld4_st4_permute_cost */ |
| 2, /* permute_cost */ |
| 1, /* reduc_i8_cost */ |
| 1, /* reduc_i16_cost */ |
| 1, /* reduc_i32_cost */ |
| 1, /* reduc_i64_cost */ |
| 1, /* reduc_f16_cost */ |
| 1, /* reduc_f32_cost */ |
| 1, /* reduc_f64_cost */ |
| 1, /* store_elt_extra_cost */ |
| 1, /* vec_to_scalar_cost */ |
| 1, /* scalar_to_vec_cost */ |
| 1, /* align_load_cost */ |
| 1, /* unalign_load_cost */ |
| 1, /* unalign_store_cost */ |
| 1 /* store_cost */ |
| }; |
| |
| /* QDF24XX costs for vector insn classes. */ |
| static const struct cpu_vector_cost qdf24xx_vector_cost = |
| { |
| 1, /* scalar_int_stmt_cost */ |
| 1, /* scalar_fp_stmt_cost */ |
| 1, /* scalar_load_cost */ |
| 1, /* scalar_store_cost */ |
| 3, /* cond_taken_branch_cost */ |
| 1, /* cond_not_taken_branch_cost */ |
| &qdf24xx_advsimd_vector_cost, /* advsimd */ |
| nullptr, /* sve */ |
| nullptr /* issue_info */ |
| }; |
| |
| |
| static const advsimd_vec_cost thunderx_advsimd_vector_cost = |
| { |
| 4, /* int_stmt_cost */ |
| 1, /* fp_stmt_cost */ |
| 0, /* ld2_st2_permute_cost */ |
| 0, /* ld3_st3_permute_cost */ |
| 0, /* ld4_st4_permute_cost */ |
| 4, /* permute_cost */ |
| 2, /* reduc_i8_cost */ |
| 2, /* reduc_i16_cost */ |
| 2, /* reduc_i32_cost */ |
| 2, /* reduc_i64_cost */ |
| 2, /* reduc_f16_cost */ |
| 2, /* reduc_f32_cost */ |
| 2, /* reduc_f64_cost */ |
| 2, /* store_elt_extra_cost */ |
| 2, /* vec_to_scalar_cost */ |
| 2, /* scalar_to_vec_cost */ |
| 3, /* align_load_cost */ |
| 5, /* unalign_load_cost */ |
| 5, /* unalign_store_cost */ |
| 1 /* store_cost */ |
| }; |
| |
| /* ThunderX costs for vector insn classes. */ |
| static const struct cpu_vector_cost thunderx_vector_cost = |
| { |
| 1, /* scalar_int_stmt_cost */ |
| 1, /* scalar_fp_stmt_cost */ |
| 3, /* scalar_load_cost */ |
| 1, /* scalar_store_cost */ |
| 3, /* cond_taken_branch_cost */ |
| 3, /* cond_not_taken_branch_cost */ |
| &thunderx_advsimd_vector_cost, /* advsimd */ |
| nullptr, /* sve */ |
| nullptr /* issue_info */ |
| }; |
| |
| static const advsimd_vec_cost tsv110_advsimd_vector_cost = |
| { |
| 2, /* int_stmt_cost */ |
| 2, /* fp_stmt_cost */ |
| 0, /* ld2_st2_permute_cost */ |
| 0, /* ld3_st3_permute_cost */ |
| 0, /* ld4_st4_permute_cost */ |
| 2, /* permute_cost */ |
| 3, /* reduc_i8_cost */ |
| 3, /* reduc_i16_cost */ |
| 3, /* reduc_i32_cost */ |
| 3, /* reduc_i64_cost */ |
| 3, /* reduc_f16_cost */ |
| 3, /* reduc_f32_cost */ |
| 3, /* reduc_f64_cost */ |
| 3, /* store_elt_extra_cost */ |
| 3, /* vec_to_scalar_cost */ |
| 2, /* scalar_to_vec_cost */ |
| 5, /* align_load_cost */ |
| 5, /* unalign_load_cost */ |
| 1, /* unalign_store_cost */ |
| 1 /* store_cost */ |
| }; |
| |
| static const struct cpu_vector_cost tsv110_vector_cost = |
| { |
| 1, /* scalar_int_stmt_cost */ |
| 1, /* scalar_fp_stmt_cost */ |
| 5, /* scalar_load_cost */ |
| 1, /* scalar_store_cost */ |
| 1, /* cond_taken_branch_cost */ |
| 1, /* cond_not_taken_branch_cost */ |
| &tsv110_advsimd_vector_cost, /* advsimd */ |
| nullptr, /* sve */ |
| nullptr /* issue_info */ |
| }; |
| |
| static const advsimd_vec_cost cortexa57_advsimd_vector_cost = |
| { |
| 2, /* int_stmt_cost */ |
| 2, /* fp_stmt_cost */ |
| 0, /* ld2_st2_permute_cost */ |
| 0, /* ld3_st3_permute_cost */ |
| 0, /* ld4_st4_permute_cost */ |
| 3, /* permute_cost */ |
| 8, /* reduc_i8_cost */ |
| 8, /* reduc_i16_cost */ |
| 8, /* reduc_i32_cost */ |
| 8, /* reduc_i64_cost */ |
| 8, /* reduc_f16_cost */ |
| 8, /* reduc_f32_cost */ |
| 8, /* reduc_f64_cost */ |
| 8, /* store_elt_extra_cost */ |
| 8, /* vec_to_scalar_cost */ |
| 8, /* scalar_to_vec_cost */ |
| 4, /* align_load_cost */ |
| 4, /* unalign_load_cost */ |
| 1, /* unalign_store_cost */ |
| 1 /* store_cost */ |
| }; |
| |
| /* Cortex-A57 costs for vector insn classes. */ |
| static const struct cpu_vector_cost cortexa57_vector_cost = |
| { |
| 1, /* scalar_int_stmt_cost */ |
| 1, /* scalar_fp_stmt_cost */ |
| 4, /* scalar_load_cost */ |
| 1, /* scalar_store_cost */ |
| 1, /* cond_taken_branch_cost */ |
| 1, /* cond_not_taken_branch_cost */ |
| &cortexa57_advsimd_vector_cost, /* advsimd */ |
| nullptr, /* sve */ |
| nullptr /* issue_info */ |
| }; |
| |
| static const advsimd_vec_cost exynosm1_advsimd_vector_cost = |
| { |
| 3, /* int_stmt_cost */ |
| 3, /* fp_stmt_cost */ |
| 0, /* ld2_st2_permute_cost */ |
| 0, /* ld3_st3_permute_cost */ |
| 0, /* ld4_st4_permute_cost */ |
| 3, /* permute_cost */ |
| 3, /* reduc_i8_cost */ |
| 3, /* reduc_i16_cost */ |
| 3, /* reduc_i32_cost */ |
| 3, /* reduc_i64_cost */ |
| 3, /* reduc_f16_cost */ |
| 3, /* reduc_f32_cost */ |
| 3, /* reduc_f64_cost */ |
| 3, /* store_elt_extra_cost */ |
| 3, /* vec_to_scalar_cost */ |
| 3, /* scalar_to_vec_cost */ |
| 5, /* align_load_cost */ |
| 5, /* unalign_load_cost */ |
| 1, /* unalign_store_cost */ |
| 1 /* store_cost */ |
| }; |
| |
| static const struct cpu_vector_cost exynosm1_vector_cost = |
| { |
| 1, /* scalar_int_stmt_cost */ |
| 1, /* scalar_fp_stmt_cost */ |
| 5, /* scalar_load_cost */ |
| 1, /* scalar_store_cost */ |
| 1, /* cond_taken_branch_cost */ |
| 1, /* cond_not_taken_branch_cost */ |
| &exynosm1_advsimd_vector_cost, /* advsimd */ |
| nullptr, /* sve */ |
| nullptr /* issue_info */ |
| }; |
| |
| static const advsimd_vec_cost xgene1_advsimd_vector_cost = |
| { |
| 2, /* int_stmt_cost */ |
| 2, /* fp_stmt_cost */ |
| 0, /* ld2_st2_permute_cost */ |
| 0, /* ld3_st3_permute_cost */ |
| 0, /* ld4_st4_permute_cost */ |
| 2, /* permute_cost */ |
| 4, /* reduc_i8_cost */ |
| 4, /* reduc_i16_cost */ |
| 4, /* reduc_i32_cost */ |
| 4, /* reduc_i64_cost */ |
| 4, /* reduc_f16_cost */ |
| 4, /* reduc_f32_cost */ |
| 4, /* reduc_f64_cost */ |
| 4, /* store_elt_extra_cost */ |
| 4, /* vec_to_scalar_cost */ |
| 4, /* scalar_to_vec_cost */ |
| 10, /* align_load_cost */ |
| 10, /* unalign_load_cost */ |
| 2, /* unalign_store_cost */ |
| 2 /* store_cost */ |
| }; |
| |
/* X-Gene 1 costs for vector insn classes. */
| static const struct cpu_vector_cost xgene1_vector_cost = |
| { |
| 1, /* scalar_int_stmt_cost */ |
| 1, /* scalar_fp_stmt_cost */ |
| 5, /* scalar_load_cost */ |
| 1, /* scalar_store_cost */ |
| 2, /* cond_taken_branch_cost */ |
| 1, /* cond_not_taken_branch_cost */ |
| &xgene1_advsimd_vector_cost, /* advsimd */ |
| nullptr, /* sve */ |
| nullptr /* issue_info */ |
| }; |
| |
| static const advsimd_vec_cost thunderx2t99_advsimd_vector_cost = |
| { |
| 4, /* int_stmt_cost */ |
| 5, /* fp_stmt_cost */ |
| 0, /* ld2_st2_permute_cost */ |
| 0, /* ld3_st3_permute_cost */ |
| 0, /* ld4_st4_permute_cost */ |
| 10, /* permute_cost */ |
| 6, /* reduc_i8_cost */ |
| 6, /* reduc_i16_cost */ |
| 6, /* reduc_i32_cost */ |
| 6, /* reduc_i64_cost */ |
| 6, /* reduc_f16_cost */ |
| 6, /* reduc_f32_cost */ |
| 6, /* reduc_f64_cost */ |
| 6, /* store_elt_extra_cost */ |
| 6, /* vec_to_scalar_cost */ |
| 5, /* scalar_to_vec_cost */ |
| 4, /* align_load_cost */ |
| 4, /* unalign_load_cost */ |
| 1, /* unalign_store_cost */ |
| 1 /* store_cost */ |
| }; |
| |
/* Costs for vector insn classes for Vulcan (ThunderX2 T99). */
| static const struct cpu_vector_cost thunderx2t99_vector_cost = |
| { |
| 1, /* scalar_int_stmt_cost */ |
| 6, /* scalar_fp_stmt_cost */ |
| 4, /* scalar_load_cost */ |
| 1, /* scalar_store_cost */ |
| 2, /* cond_taken_branch_cost */ |
| 1, /* cond_not_taken_branch_cost */ |
| &thunderx2t99_advsimd_vector_cost, /* advsimd */ |
| nullptr, /* sve */ |
| nullptr /* issue_info */ |
| }; |
| |
| static const advsimd_vec_cost thunderx3t110_advsimd_vector_cost = |
| { |
| 5, /* int_stmt_cost */ |
| 5, /* fp_stmt_cost */ |
| 0, /* ld2_st2_permute_cost */ |
| 0, /* ld3_st3_permute_cost */ |
| 0, /* ld4_st4_permute_cost */ |
| 10, /* permute_cost */ |
| 5, /* reduc_i8_cost */ |
| 5, /* reduc_i16_cost */ |
| 5, /* reduc_i32_cost */ |
| 5, /* reduc_i64_cost */ |
| 5, /* reduc_f16_cost */ |
| 5, /* reduc_f32_cost */ |
| 5, /* reduc_f64_cost */ |
| 5, /* store_elt_extra_cost */ |
| 5, /* vec_to_scalar_cost */ |
| 5, /* scalar_to_vec_cost */ |
| 4, /* align_load_cost */ |
| 4, /* unalign_load_cost */ |
| 4, /* unalign_store_cost */ |
| 4 /* store_cost */ |
| }; |
| |
| static const struct cpu_vector_cost thunderx3t110_vector_cost = |
| { |
| 1, /* scalar_int_stmt_cost */ |
| 5, /* scalar_fp_stmt_cost */ |
| 4, /* scalar_load_cost */ |
| 1, /* scalar_store_cost */ |
| 2, /* cond_taken_branch_cost */ |
| 1, /* cond_not_taken_branch_cost */ |
| &thunderx3t110_advsimd_vector_cost, /* advsimd */ |
| nullptr, /* sve */ |
| nullptr /* issue_info */ |
| }; |
| |
| |
| /* Generic costs for branch instructions. */ |
| static const struct cpu_branch_cost generic_branch_cost = |
| { |
| 1, /* Predictable. */ |
| 3 /* Unpredictable. */ |
| }; |
| |
| /* Generic approximation modes. */ |
| static const cpu_approx_modes generic_approx_modes = |
| { |
| AARCH64_APPROX_NONE, /* division */ |
| AARCH64_APPROX_NONE, /* sqrt */ |
| AARCH64_APPROX_NONE /* recip_sqrt */ |
| }; |
| |
| /* Approximation modes for Exynos M1. */ |
| static const cpu_approx_modes exynosm1_approx_modes = |
| { |
| AARCH64_APPROX_NONE, /* division */ |
| AARCH64_APPROX_ALL, /* sqrt */ |
| AARCH64_APPROX_ALL /* recip_sqrt */ |
| }; |
| |
| /* Approximation modes for X-Gene 1. */ |
| static const cpu_approx_modes xgene1_approx_modes = |
| { |
| AARCH64_APPROX_NONE, /* division */ |
| AARCH64_APPROX_NONE, /* sqrt */ |
| AARCH64_APPROX_ALL /* recip_sqrt */ |
| }; |
| |
| /* Generic prefetch settings (which disable prefetch). */ |
| static const cpu_prefetch_tune generic_prefetch_tune = |
| { |
| 0, /* num_slots */ |
| -1, /* l1_cache_size */ |
| -1, /* l1_cache_line_size */ |
| -1, /* l2_cache_size */ |
| true, /* prefetch_dynamic_strides */ |
| -1, /* minimum_stride */ |
| -1 /* default_opt_level */ |
| }; |
| |
| static const cpu_prefetch_tune exynosm1_prefetch_tune = |
| { |
| 0, /* num_slots */ |
| -1, /* l1_cache_size */ |
| 64, /* l1_cache_line_size */ |
| -1, /* l2_cache_size */ |
| true, /* prefetch_dynamic_strides */ |
| -1, /* minimum_stride */ |
| -1 /* default_opt_level */ |
| }; |
| |
| static const cpu_prefetch_tune qdf24xx_prefetch_tune = |
| { |
| 4, /* num_slots */ |
| 32, /* l1_cache_size */ |
| 64, /* l1_cache_line_size */ |
| 512, /* l2_cache_size */ |
| false, /* prefetch_dynamic_strides */ |
| 2048, /* minimum_stride */ |
| 3 /* default_opt_level */ |
| }; |
| |
| static const cpu_prefetch_tune thunderxt88_prefetch_tune = |
| { |
| 8, /* num_slots */ |
| 32, /* l1_cache_size */ |
| 128, /* l1_cache_line_size */ |
| 16*1024, /* l2_cache_size */ |
| true, /* prefetch_dynamic_strides */ |
| -1, /* minimum_stride */ |
| 3 /* default_opt_level */ |
| }; |
| |
| static const cpu_prefetch_tune thunderx_prefetch_tune = |
| { |
| 8, /* num_slots */ |
| 32, /* l1_cache_size */ |
| 128, /* l1_cache_line_size */ |
| -1, /* l2_cache_size */ |
| true, /* prefetch_dynamic_strides */ |
| -1, /* minimum_stride */ |
| -1 /* default_opt_level */ |
| }; |
| |
| static const cpu_prefetch_tune thunderx2t99_prefetch_tune = |
| { |
| 8, /* num_slots */ |
| 32, /* l1_cache_size */ |
| 64, /* l1_cache_line_size */ |
| 256, /* l2_cache_size */ |
| true, /* prefetch_dynamic_strides */ |
| -1, /* minimum_stride */ |
| -1 /* default_opt_level */ |
| }; |
| |
| static const cpu_prefetch_tune thunderx3t110_prefetch_tune = |
| { |
| 8, /* num_slots */ |
| 32, /* l1_cache_size */ |
| 64, /* l1_cache_line_size */ |
| 256, /* l2_cache_size */ |
| true, /* prefetch_dynamic_strides */ |
| -1, /* minimum_stride */ |
| -1 /* default_opt_level */ |
| }; |
| |
| static const cpu_prefetch_tune tsv110_prefetch_tune = |
| { |
| 0, /* num_slots */ |
| 64, /* l1_cache_size */ |
| 64, /* l1_cache_line_size */ |
| 512, /* l2_cache_size */ |
| true, /* prefetch_dynamic_strides */ |
| -1, /* minimum_stride */ |
| -1 /* default_opt_level */ |
| }; |
| |
| static const cpu_prefetch_tune xgene1_prefetch_tune = |
| { |
| 8, /* num_slots */ |
| 32, /* l1_cache_size */ |
| 64, /* l1_cache_line_size */ |
| 256, /* l2_cache_size */ |
| true, /* prefetch_dynamic_strides */ |
| -1, /* minimum_stride */ |
| -1 /* default_opt_level */ |
| }; |
| |
| static const cpu_prefetch_tune a64fx_prefetch_tune = |
| { |
| 8, /* num_slots */ |
| 64, /* l1_cache_size */ |
| 256, /* l1_cache_line_size */ |
| 32768, /* l2_cache_size */ |
| true, /* prefetch_dynamic_strides */ |
| -1, /* minimum_stride */ |
| -1 /* default_opt_level */ |
| }; |
| |
| static const struct tune_params generic_tunings = |
| { |
| &cortexa57_extra_costs, |
| &generic_addrcost_table, |
| &generic_regmove_cost, |
| &generic_vector_cost, |
| &generic_branch_cost, |
| &generic_approx_modes, |
| SVE_NOT_IMPLEMENTED, /* sve_width */ |
| 4, /* memmov_cost */ |
| 2, /* issue_rate */ |
| (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */ |
| "16:12", /* function_align. */ |
| "4", /* jump_align. */ |
| "8", /* loop_align. */ |
| 2, /* int_reassoc_width. */ |
| 4, /* fp_reassoc_width. */ |
| 1, /* vec_reassoc_width. */ |
| 2, /* min_div_recip_mul_sf. */ |
| 2, /* min_div_recip_mul_df. */ |
| 0, /* max_case_values. */ |
| tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ |
| /* Enabling AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS significantly benefits |
| Neoverse V1. It does not have a noticeable effect on A64FX and should |
| have at most a very minor effect on SVE2 cores. */ |
| (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS), /* tune_flags. */ |
| &generic_prefetch_tune |
| }; |
| |
| static const struct tune_params cortexa35_tunings = |
| { |
| &cortexa53_extra_costs, |
| &generic_addrcost_table, |
| &cortexa53_regmove_cost, |
| &generic_vector_cost, |
| &generic_branch_cost, |
| &generic_approx_modes, |
| SVE_NOT_IMPLEMENTED, /* sve_width */ |
| 4, /* memmov_cost */ |
| 1, /* issue_rate */ |
| (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD |
| | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */ |
| "16", /* function_align. */ |
| "4", /* jump_align. */ |
| "8", /* loop_align. */ |
| 2, /* int_reassoc_width. */ |
| 4, /* fp_reassoc_width. */ |
| 1, /* vec_reassoc_width. */ |
| 2, /* min_div_recip_mul_sf. */ |
| 2, /* min_div_recip_mul_df. */ |
| 0, /* max_case_values. */ |
| tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ |
| (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ |
| &generic_prefetch_tune |
| }; |
| |
| static const struct tune_params cortexa53_tunings = |
| { |
| &cortexa53_extra_costs, |
| &generic_addrcost_table, |
| &cortexa53_regmove_cost, |
| &generic_vector_cost, |
| &generic_branch_cost, |
| &generic_approx_modes, |
| SVE_NOT_IMPLEMENTED, /* sve_width */ |
| 4, /* memmov_cost */ |
| 2, /* issue_rate */ |
| (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD |
| | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */ |
| "16", /* function_align. */ |
| "4", /* jump_align. */ |
| "8", /* loop_align. */ |
| 2, /* int_reassoc_width. */ |
| 4, /* fp_reassoc_width. */ |
| 1, /* vec_reassoc_width. */ |
| 2, /* min_div_recip_mul_sf. */ |
| 2, /* min_div_recip_mul_df. */ |
| 0, /* max_case_values. */ |
| tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ |
| (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ |
| &generic_prefetch_tune |
| }; |
| |
| static const struct tune_params cortexa57_tunings = |
| { |
| &cortexa57_extra_costs, |
| &generic_addrcost_table, |
| &cortexa57_regmove_cost, |
| &cortexa57_vector_cost, |
| &generic_branch_cost, |
| &generic_approx_modes, |
| SVE_NOT_IMPLEMENTED, /* sve_width */ |
| 4, /* memmov_cost */ |
| 3, /* issue_rate */ |
| (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD |
| | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */ |
| "16", /* function_align. */ |
| "4", /* jump_align. */ |
| "8", /* loop_align. */ |
| 2, /* int_reassoc_width. */ |
| 4, /* fp_reassoc_width. */ |
| 1, /* vec_reassoc_width. */ |
| 2, /* min_div_recip_mul_sf. */ |
| 2, /* min_div_recip_mul_df. */ |
| 0, /* max_case_values. */ |
| tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ |
| (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */ |
| &generic_prefetch_tune |
| }; |
| |
| static const struct tune_params cortexa72_tunings = |
| { |
| &cortexa57_extra_costs, |
| &generic_addrcost_table, |
| &cortexa57_regmove_cost, |
| &cortexa57_vector_cost, |
| &generic_branch_cost, |
| &generic_approx_modes, |
| SVE_NOT_IMPLEMENTED, /* sve_width */ |
| 4, /* memmov_cost */ |
| 3, /* issue_rate */ |
| (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD |
| | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */ |
| "16", /* function_align. */ |
| "4", /* jump_align. */ |
| "8", /* loop_align. */ |
| 2, /* int_reassoc_width. */ |
| 4, /* fp_reassoc_width. */ |
| 1, /* vec_reassoc_width. */ |
| 2, /* min_div_recip_mul_sf. */ |
| 2, /* min_div_recip_mul_df. */ |
| 0, /* max_case_values. */ |
| tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ |
| (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ |
| &generic_prefetch_tune |
| }; |
| |
| static const struct tune_params cortexa73_tunings = |
| { |
| &cortexa57_extra_costs, |
| &generic_addrcost_table, |
| &cortexa57_regmove_cost, |
| &cortexa57_vector_cost, |
| &generic_branch_cost, |
| &generic_approx_modes, |
| SVE_NOT_IMPLEMENTED, /* sve_width */ |
| 4, /* memmov_cost. */ |
| 2, /* issue_rate. */ |
| (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD |
| | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */ |
| "16", /* function_align. */ |
| "4", /* jump_align. */ |
| "8", /* loop_align. */ |
| 2, /* int_reassoc_width. */ |
| 4, /* fp_reassoc_width. */ |
| 1, /* vec_reassoc_width. */ |
| 2, /* min_div_recip_mul_sf. */ |
| 2, /* min_div_recip_mul_df. */ |
| 0, /* max_case_values. */ |
| tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ |
| (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ |
| &generic_prefetch_tune |
| }; |
| |
| |
| |
| static const struct tune_params exynosm1_tunings = |
| { |
| &exynosm1_extra_costs, |
| &exynosm1_addrcost_table, |
| &exynosm1_regmove_cost, |
| &exynosm1_vector_cost, |
| &generic_branch_cost, |
| &exynosm1_approx_modes, |
| SVE_NOT_IMPLEMENTED, /* sve_width */ |
| 4, /* memmov_cost */ |
| 3, /* issue_rate */ |
| (AARCH64_FUSE_AES_AESMC), /* fusible_ops */ |
| "4", /* function_align. */ |
| "4", /* jump_align. */ |
| "4", /* loop_align. */ |
| 2, /* int_reassoc_width. */ |
| 4, /* fp_reassoc_width. */ |
| 1, /* vec_reassoc_width. */ |
| 2, /* min_div_recip_mul_sf. */ |
| 2, /* min_div_recip_mul_df. */ |
| 48, /* max_case_values. */ |
| tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ |
| (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ |
| &exynosm1_prefetch_tune |
| }; |
| |
| static const struct tune_params thunderxt88_tunings = |
| { |
| &thunderx_extra_costs, |
| &generic_addrcost_table, |
| &thunderx_regmove_cost, |
| &thunderx_vector_cost, |
| &generic_branch_cost, |
| &generic_approx_modes, |
| SVE_NOT_IMPLEMENTED, /* sve_width */ |
| 6, /* memmov_cost */ |
| 2, /* issue_rate */ |
| AARCH64_FUSE_ALU_BRANCH, /* fusible_ops */ |
| "8", /* function_align. */ |
| "8", /* jump_align. */ |
| "8", /* loop_align. */ |
| 2, /* int_reassoc_width. */ |
| 4, /* fp_reassoc_width. */ |
| 1, /* vec_reassoc_width. */ |
| 2, /* min_div_recip_mul_sf. */ |
| 2, /* min_div_recip_mul_df. */ |
| 0, /* max_case_values. */ |
| tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */ |
| (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */ |
| &thunderxt88_prefetch_tune |
| }; |
| |
| static const struct tune_params thunderx_tunings = |
| { |
| &thunderx_extra_costs, |
| &generic_addrcost_table, |
| &thunderx_regmove_cost, |
| &thunderx_vector_cost, |
| &generic_branch_cost, |
| &generic_approx_modes, |
| SVE_NOT_IMPLEMENTED, /* sve_width */ |
| 6, /* memmov_cost */ |
| 2, /* issue_rate */ |
| AARCH64_FUSE_ALU_BRANCH, /* fusible_ops */ |
| "8", /* function_align. */ |
| "8", /* jump_align. */ |
| "8", /* loop_align. */ |
| 2, /* int_reassoc_width. */ |
| 4, /* fp_reassoc_width. */ |
| 1, /* vec_reassoc_width. */ |
| 2, /* min_div_recip_mul_sf. */ |
| 2, /* min_div_recip_mul_df. */ |
| 0, /* max_case_values. */ |
| tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */ |
| (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW |
| | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */ |
| &thunderx_prefetch_tune |
| }; |
| |
| static const struct tune_params tsv110_tunings = |
| { |
| &tsv110_extra_costs, |
| &tsv110_addrcost_table, |
| &tsv110_regmove_cost, |
| &tsv110_vector_cost, |
| &generic_branch_cost, |
| &generic_approx_modes, |
| SVE_NOT_IMPLEMENTED, /* sve_width */ |
| 4, /* memmov_cost */ |
| 4, /* issue_rate */ |
| (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_ALU_BRANCH |
| | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */ |
| "16", /* function_align. */ |
| "4", /* jump_align. */ |
| "8", /* loop_align. */ |
| 2, /* int_reassoc_width. */ |
| 4, /* fp_reassoc_width. */ |
| 1, /* vec_reassoc_width. */ |
| 2, /* min_div_recip_mul_sf. */ |
| 2, /* min_div_recip_mul_df. */ |
| 0, /* max_case_values. */ |
| tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ |
| (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ |
| &tsv110_prefetch_tune |
| }; |
| |
| static const struct tune_params xgene1_tunings = |
| { |
| &xgene1_extra_costs, |
| &xgene1_addrcost_table, |
| &xgene1_regmove_cost, |
| &xgene1_vector_cost, |
| &generic_branch_cost, |
| &xgene1_approx_modes, |
| SVE_NOT_IMPLEMENTED, /* sve_width */ |
| 6, /* memmov_cost */ |
| 4, /* issue_rate */ |
| AARCH64_FUSE_NOTHING, /* fusible_ops */ |
| "16", /* function_align. */ |
| "16", /* jump_align. */ |
| "16", /* loop_align. */ |
| 2, /* int_reassoc_width. */ |
| 4, /* fp_reassoc_width. */ |
| 1, /* vec_reassoc_width. */ |
| 2, /* min_div_recip_mul_sf. */ |
| 2, /* min_div_recip_mul_df. */ |
| 17, /* max_case_values. */ |
| tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */ |
| (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */ |
| &xgene1_prefetch_tune |
| }; |
| |
| static const struct tune_params emag_tunings = |
| { |
| &xgene1_extra_costs, |
| &xgene1_addrcost_table, |
| &xgene1_regmove_cost, |
| &xgene1_vector_cost, |
| &generic_branch_cost, |
| &xgene1_approx_modes, |
| SVE_NOT_IMPLEMENTED, |
| 6, /* memmov_cost */ |
| 4, /* issue_rate */ |
| AARCH64_FUSE_NOTHING, /* fusible_ops */ |
| "16", /* function_align. */ |
| "16", /* jump_align. */ |
| "16", /* loop_align. */ |
| 2, /* int_reassoc_width. */ |
| 4, /* fp_reassoc_width. */ |
| 1, /* vec_reassoc_width. */ |
| 2, /* min_div_recip_mul_sf. */ |
| 2, /* min_div_recip_mul_df. */ |
| 17, /* max_case_values. */ |
| tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */ |
| (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */ |
| &xgene1_prefetch_tune |
| }; |
| |
| static const struct tune_params qdf24xx_tunings = |
| { |
| &qdf24xx_extra_costs, |
| &qdf24xx_addrcost_table, |
| &qdf24xx_regmove_cost, |
| &qdf24xx_vector_cost, |
| &generic_branch_cost, |
| &generic_approx_modes, |
| SVE_NOT_IMPLEMENTED, /* sve_width */ |
| 4, /* memmov_cost */ |
| 4, /* issue_rate */ |
| (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD |
   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
| "16", /* function_align. */ |
| "8", /* jump_align. */ |
| "16", /* loop_align. */ |
| 2, /* int_reassoc_width. */ |
| 4, /* fp_reassoc_width. */ |
| 1, /* vec_reassoc_width. */ |
| 2, /* min_div_recip_mul_sf. */ |
| 2, /* min_div_recip_mul_df. */ |
| 0, /* max_case_values. */ |
| tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ |
| AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags. */ |
| &qdf24xx_prefetch_tune |
| }; |
| |
/* Tuning structure for the Qualcomm Saphira core. Defaults to Falkor values
   for now. */
| static const struct tune_params saphira_tunings = |
| { |
| &generic_extra_costs, |
| &generic_addrcost_table, |
| &generic_regmove_cost, |
| &generic_vector_cost, |
| &generic_branch_cost, |
| &generic_approx_modes, |
| SVE_NOT_IMPLEMENTED, /* sve_width */ |
| 4, /* memmov_cost */ |
| 4, /* issue_rate */ |
| (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD |
   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
| "16", /* function_align. */ |
| "8", /* jump_align. */ |
| "16", /* loop_align. */ |
| 2, /* int_reassoc_width. */ |
| 4, /* fp_reassoc_width. */ |
| 1, /* vec_reassoc_width. */ |
| 2, /* min_div_recip_mul_sf. */ |
| 2, /* min_div_recip_mul_df. */ |
| 0, /* max_case_values. */ |
| tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ |
| (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ |
| &generic_prefetch_tune |
| }; |
| |
| static const struct tune_params thunderx2t99_tunings = |
| { |
| &thunderx2t99_extra_costs, |
| &thunderx2t99_addrcost_table, |
| &thunderx2t99_regmove_cost, |
| &thunderx2t99_vector_cost, |
| &generic_branch_cost, |
| &generic_approx_modes, |
| SVE_NOT_IMPLEMENTED, /* sve_width */ |
| 4, /* memmov_cost. */ |
| 4, /* issue_rate. */ |
| (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC |
| | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */ |
| "16", /* function_align. */ |
| "8", /* jump_align. */ |
| "16", /* loop_align. */ |
| 3, /* int_reassoc_width. */ |
| 2, /* fp_reassoc_width. */ |
| 2, /* vec_reassoc_width. */ |
| 2, /* min_div_recip_mul_sf. */ |
| 2, /* min_div_recip_mul_df. */ |
| 0, /* max_case_values. */ |
| tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ |
| (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ |
| &thunderx2t99_prefetch_tune |
| }; |
| |
| static const struct tune_params thunderx3t110_tunings = |
| { |
| &thunderx3t110_extra_costs, |
| &thunderx3t110_addrcost_table, |
| &thunderx3t110_regmove_cost, |
| &thunderx3t110_vector_cost, |
| &generic_branch_cost, |
| &generic_approx_modes, |
| SVE_NOT_IMPLEMENTED, /* sve_width */ |
| 4, /* memmov_cost. */ |
| 6, /* issue_rate. */ |
| (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC |
| | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */ |
| "16", /* function_align. */ |
| "8", /* jump_align. */ |
| "16", /* loop_align. */ |
| 3, /* int_reassoc_width. */ |
| 2, /* fp_reassoc_width. */ |
| 2, /* vec_reassoc_width. */ |
| 2, /* min_div_recip_mul_sf. */ |
| 2, /* min_div_recip_mul_df. */ |
| 0, /* max_case_values. */ |
| tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ |
| (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ |
| &thunderx3t110_prefetch_tune |
| }; |
| |
| static const struct tune_params neoversen1_tunings = |
| { |
| &cortexa76_extra_costs, |
| &generic_addrcost_table, |
| &generic_regmove_cost, |
| &cortexa57_vector_cost, |
| &generic_branch_cost, |
| &generic_approx_modes, |
| SVE_NOT_IMPLEMENTED, /* sve_width */ |
| 4, /* memmov_cost */ |
| 3, /* issue_rate */ |
| (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */ |
| "32:16", /* function_align. */ |
| "4", /* jump_align. */ |
| "32:16", /* loop_align. */ |
| 2, /* int_reassoc_width. */ |
| 4, /* fp_reassoc_width. */ |
| 2, /* vec_reassoc_width. */ |
| 2, /* min_div_recip_mul_sf. */ |
| 2, /* min_div_recip_mul_df. */ |
| 0, /* max_case_values. */ |
| tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ |
| (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */ |
| &generic_prefetch_tune |
| }; |
| |
| static const advsimd_vec_cost neoversev1_advsimd_vector_cost = |
| { |
| 2, /* int_stmt_cost */ |
| 2, /* fp_stmt_cost */ |
| 4, /* ld2_st2_permute_cost */ |
| 4, /* ld3_st3_permute_cost */ |
| 5, /* ld4_st4_permute_cost */ |
| 3, /* permute_cost */ |
| 4, /* reduc_i8_cost */ |
| 4, /* reduc_i16_cost */ |
| 2, /* reduc_i32_cost */ |
| 2, /* reduc_i64_cost */ |
| 6, /* reduc_f16_cost */ |
| 3, /* reduc_f32_cost */ |
| 2, /* reduc_f64_cost */ |
| 2, /* store_elt_extra_cost */ |
| /* This value is just inherited from the Cortex-A57 table. */ |
| 8, /* vec_to_scalar_cost */ |
| /* This depends very much on what the scalar value is and |
| where it comes from. E.g. some constants take two dependent |
| instructions or a load, while others might be moved from a GPR. |
| 4 seems to be a reasonable compromise in practice. */ |
| 4, /* scalar_to_vec_cost */ |
| 4, /* align_load_cost */ |
| 4, /* unalign_load_cost */ |
| /* Although stores have a latency of 2 and compete for the |
| vector pipes, in practice it's better not to model that. */ |
| 1, /* unalign_store_cost */ |
| 1 /* store_cost */ |
| }; |
| |
| static const sve_vec_cost neoversev1_sve_vector_cost = |
| { |
| { |
| 2, /* int_stmt_cost */ |
| 2, /* fp_stmt_cost */ |
| 4, /* ld2_st2_permute_cost */ |
| 7, /* ld3_st3_permute_cost */ |
| 8, /* ld4_st4_permute_cost */ |
| 3, /* permute_cost */ |
| /* Theoretically, a reduction involving 31 scalar ADDs could |
| complete in ~9 cycles and would have a cost of 31. [SU]ADDV |
| completes in 14 cycles, so give it a cost of 31 + 5. */ |
| 36, /* reduc_i8_cost */ |
| /* Likewise for 15 scalar ADDs (~5 cycles) vs. 12: 15 + 7. */ |
| 22, /* reduc_i16_cost */ |
| /* Likewise for 7 scalar ADDs (~3 cycles) vs. 10: 7 + 7. */ |
| 14, /* reduc_i32_cost */ |
| /* Likewise for 3 scalar ADDs (~2 cycles) vs. 10: 3 + 8. */ |
| 11, /* reduc_i64_cost */ |
| /* Theoretically, a reduction involving 15 scalar FADDs could |
| complete in ~9 cycles and would have a cost of 30. FADDV |
| completes in 13 cycles, so give it a cost of 30 + 4. */ |
| 34, /* reduc_f16_cost */ |
| /* Likewise for 7 scalar FADDs (~6 cycles) vs. 11: 14 + 5. */ |
| 19, /* reduc_f32_cost */ |
| /* Likewise for 3 scalar FADDs (~4 cycles) vs. 9: 6 + 5. */ |
| 11, /* reduc_f64_cost */ |
| 2, /* store_elt_extra_cost */ |
| /* This value is just inherited from the Cortex-A57 table. */ |
| 8, /* vec_to_scalar_cost */ |
| /* See the comment above the Advanced SIMD versions. */ |
| 4, /* scalar_to_vec_cost */ |
| 4, /* align_load_cost */ |
| 4, /* unalign_load_cost */ |
| /* Although stores have a latency of 2 and compete for the |
| vector pipes, in practice it's better not to model that. */ |
| 1, /* unalign_store_cost */ |
| 1 /* store_cost */ |
| }, |
| 3, /* clast_cost */ |
| 19, /* fadda_f16_cost */ |
| 11, /* fadda_f32_cost */ |
| 8, /* fadda_f64_cost */ |
| 32, /* gather_load_x32_cost */ |
| 16, /* gather_load_x64_cost */ |
| 3 /* scatter_store_elt_cost */ |
| }; |
| |
| static const aarch64_scalar_vec_issue_info neoversev1_scalar_issue_info = |
| { |
| 3, /* loads_stores_per_cycle */ |
| 2, /* stores_per_cycle */ |
| 4, /* general_ops_per_cycle */ |
| 0, /* fp_simd_load_general_ops */ |
| 1 /* fp_simd_store_general_ops */ |
| }; |
| |
| static const aarch64_advsimd_vec_issue_info neoversev1_advsimd_issue_info = |
| { |
| { |
| 3, /* loads_stores_per_cycle */ |
| 2, /* stores_per_cycle */ |
| 4, /* general_ops_per_cycle */ |
| 0, /* fp_simd_load_general_ops */ |
| 1 /* fp_simd_store_general_ops */ |
| }, |
| 2, /* ld2_st2_general_ops */ |
| 2, /* ld3_st3_general_ops */ |
| 3 /* ld4_st4_general_ops */ |
| }; |
| |
| static const aarch64_sve_vec_issue_info neoversev1_sve_issue_info = |
| { |
| { |
| { |
| 2, /* loads_per_cycle */ |
| 2, /* stores_per_cycle */ |
| 2, /* general_ops_per_cycle */ |
| 0, /* fp_simd_load_general_ops */ |
| 1 /* fp_simd_store_general_ops */ |
| }, |
| 2, /* ld2_st2_general_ops */ |
| 2, /* ld3_st3_general_ops */ |
| 3 /* ld4_st4_general_ops */ |
| }, |
| 1, /* pred_ops_per_cycle */ |
| 2, /* while_pred_ops */ |
| 2, /* int_cmp_pred_ops */ |
| 1, /* fp_cmp_pred_ops */ |
| 1, /* gather_scatter_pair_general_ops */ |
| 1 /* gather_scatter_pair_pred_ops */ |
| }; |
| |
| static const aarch64_vec_issue_info neoversev1_vec_issue_info = |
| { |
| &neoversev1_scalar_issue_info, |
| &neoversev1_advsimd_issue_info, |
| &neoversev1_sve_issue_info |
| }; |
| |
| /* Neoverse V1 costs for vector insn classes. */ |
| static const struct cpu_vector_cost neoversev1_vector_cost = |
| { |
| 1, /* scalar_int_stmt_cost */ |
| 2, /* scalar_fp_stmt_cost */ |
| 4, /* scalar_load_cost */ |
| 1, /* scalar_store_cost */ |
| 1, /* cond_taken_branch_cost */ |
| 1, /* cond_not_taken_branch_cost */ |
| &neoversev1_advsimd_vector_cost, /* advsimd */ |
| &neoversev1_sve_vector_cost, /* sve */ |
| &neoversev1_vec_issue_info /* issue_info */ |
| }; |
| |
| static const struct tune_params neoversev1_tunings = |
| { |
| &cortexa76_extra_costs, |
| &neoversev1_addrcost_table, |
| &generic_regmove_cost, |
| &neoversev1_vector_cost, |
| &generic_branch_cost, |
| &generic_approx_modes, |
| SVE_256, /* sve_width */ |
| 4, /* memmov_cost */ |
| 3, /* issue_rate */ |
| (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */ |
| "32:16", /* function_align. */ |
| "4", /* jump_align. */ |
| "32:16", /* loop_align. */ |
| 2, /* int_reassoc_width. */ |
| 4, /* fp_reassoc_width. */ |
| 2, /* vec_reassoc_width. */ |
| 2, /* min_div_recip_mul_sf. */ |
| 2, /* min_div_recip_mul_df. */ |
| 0, /* max_case_values. */ |
| tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ |
| (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS |
| | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS |
| | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT), /* tune_flags. */ |
| &generic_prefetch_tune |
| }; |
| |
| static const sve_vec_cost neoverse512tvb_sve_vector_cost = |
| { |
| { |
| 2, /* int_stmt_cost */ |
| 2, /* fp_stmt_cost */ |
| 4, /* ld2_st2_permute_cost */ |
| 5, /* ld3_st3_permute_cost */ |
| 5, /* ld4_st4_permute_cost */ |
| 3, /* permute_cost */ |
| /* Theoretically, a reduction involving 15 scalar ADDs could |
| complete in ~5 cycles and would have a cost of 15. Assume that |
| [SU]ADDV completes in 11 cycles and so give it a cost of 15 + 6. */ |
| 21, /* reduc_i8_cost */ |
| /* Likewise for 7 scalar ADDs (~3 cycles) vs. 9: 7 + 6. */ |
| 13, /* reduc_i16_cost */ |
| /* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 6. */ |
| 9, /* reduc_i32_cost */ |
| /* Likewise for 1 scalar ADD (1 cycle) vs. 8: 1 + 7. */ |
| 8, /* reduc_i64_cost */ |
| /* Theoretically, a reduction involving 7 scalar FADDs could |
| complete in ~6 cycles and would have a cost of 14. Assume that |
| FADDV completes in 8 cycles and so give it a cost of 14 + 2. */ |
| 16, /* reduc_f16_cost */ |
| /* Likewise for 3 scalar FADDs (~4 cycles) vs. 6: 6 + 2. */ |
| 8, /* reduc_f32_cost */ |
| /* Likewise for 1 scalar FADD (2 cycles) vs. 4: 2 + 2. */ |
| 4, /* reduc_f64_cost */ |
| 2, /* store_elt_extra_cost */ |
| /* This value is just inherited from the Cortex-A57 table. */ |
| 8, /* vec_to_scalar_cost */ |
| /* This depends very much on what the scalar value is and |
| where it comes from. E.g. some constants take two dependent |
| instructions or a load, while others might be moved from a GPR. |
| 4 seems to be a reasonable compromise in practice. */ |
| 4, /* scalar_to_vec_cost */ |
| 4, /* align_load_cost */ |
| 4, /* unalign_load_cost */ |
| /* Although stores generally have a latency of 2 and compete for the |
| vector pipes, in practice it's better not to model that. */ |
| 1, /* unalign_store_cost */ |
| 1 /* store_cost */ |
| }, |
| 3, /* clast_cost */ |
| 10, /* fadda_f16_cost */ |
| 6, /* fadda_f32_cost */ |
| 4, /* fadda_f64_cost */ |
| /* A strided Advanced SIMD x64 load would take two parallel FP loads |
| (6 cycles) plus an insertion (2 cycles). Assume a 64-bit SVE gather |
| is 1 cycle more. The Advanced SIMD version is costed as 2 scalar loads |
| (cost 8) and a vec_construct (cost 2). Add a full vector operation |
| (cost 2) to that, to avoid the difference being lost in rounding. |
| |
| There is no easy comparison between a strided Advanced SIMD x32 load |
| and an SVE 32-bit gather, but cost an SVE 32-bit gather as 1 vector |
| operation more than a 64-bit gather. */ |
| 14, /* gather_load_x32_cost */ |
| 12, /* gather_load_x64_cost */ |
| 3 /* scatter_store_elt_cost */ |
| }; |
| |
| static const aarch64_sve_vec_issue_info neoverse512tvb_sve_issue_info = |
| { |
| { |
| { |
| 3, /* loads_per_cycle */ |
| 2, /* stores_per_cycle */ |
| 4, /* general_ops_per_cycle */ |
| 0, /* fp_simd_load_general_ops */ |
| 1 /* fp_simd_store_general_ops */ |
| }, |
| 2, /* ld2_st2_general_ops */ |
| 2, /* ld3_st3_general_ops */ |
| 3 /* ld4_st4_general_ops */ |
| }, |
| 2, /* pred_ops_per_cycle */ |
| 2, /* while_pred_ops */ |
| 2, /* int_cmp_pred_ops */ |
| 1, /* fp_cmp_pred_ops */ |
| 1, /* gather_scatter_pair_general_ops */ |
| 1 /* gather_scatter_pair_pred_ops */ |
| }; |
| |
| static const aarch64_vec_issue_info neoverse512tvb_vec_issue_info = |
| { |
| &neoversev1_scalar_issue_info, |
| &neoversev1_advsimd_issue_info, |
| &neoverse512tvb_sve_issue_info |
| }; |
| |
| static const struct cpu_vector_cost neoverse512tvb_vector_cost = |
| { |
| 1, /* scalar_int_stmt_cost */ |
| 2, /* scalar_fp_stmt_cost */ |
| 4, /* scalar_load_cost */ |
| 1, /* scalar_store_cost */ |
| 1, /* cond_taken_branch_cost */ |
| 1, /* cond_not_taken_branch_cost */ |
| &neoversev1_advsimd_vector_cost, /* advsimd */ |
| &neoverse512tvb_sve_vector_cost, /* sve */ |
| &neoverse512tvb_vec_issue_info /* issue_info */ |
| }; |
| |
| static const struct tune_params neoverse512tvb_tunings = |
| { |
| &cortexa76_extra_costs, |
| &neoversev1_addrcost_table, |
| &generic_regmove_cost, |
| &neoverse512tvb_vector_cost, |
| &generic_branch_cost, |
| &generic_approx_modes, |
| SVE_128 | SVE_256, /* sve_width */ |
| 4, /* memmov_cost */ |
| 3, /* issue_rate */ |
| (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */ |
| "32:16", /* function_align. */ |
| "4", /* jump_align. */ |
| "32:16", /* loop_align. */ |
| 2, /* int_reassoc_width. */ |
| 4, /* fp_reassoc_width. */ |
| 2, /* vec_reassoc_width. */ |
| 2, /* min_div_recip_mul_sf. */ |
| 2, /* min_div_recip_mul_df. */ |
| 0, /* max_case_values. */ |
| tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ |
| (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS |
| | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS |
| | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT), /* tune_flags. */ |
| &generic_prefetch_tune |
| }; |
| |
| static const struct tune_params neoversen2_tunings = |
| { |
| &cortexa76_extra_costs, |
| &generic_addrcost_table, |
| &generic_regmove_cost, |
| &cortexa57_vector_cost, |
| &generic_branch_cost, |
| &generic_approx_modes, |
| SVE_128, /* sve_width */ |
| 4, /* memmov_cost */ |
| 3, /* issue_rate */ |
| (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */ |
| "32:16", /* function_align. */ |
| "4", /* jump_align. */ |
| "32:16", /* loop_align. */ |
| 2, /* int_reassoc_width. */ |
| 4, /* fp_reassoc_width. */ |
| 2, /* vec_reassoc_width. */ |
| 2, /* min_div_recip_mul_sf. */ |
| 2, /* min_div_recip_mul_df. */ |
| 0, /* max_case_values. */ |
| tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ |
| (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ |
| &generic_prefetch_tune |
| }; |
| |
| static const struct tune_params a64fx_tunings = |
| { |
| &a64fx_extra_costs, |
| &a64fx_addrcost_table, |
| &a64fx_regmove_cost, |
| &a64fx_vector_cost, |
| &generic_branch_cost, |
| &generic_approx_modes, |
| SVE_512, /* sve_width */ |
| 4, /* memmov_cost */ |
| 7, /* issue_rate */ |
| (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */ |
| "32", /* function_align. */ |
| "16", /* jump_align. */ |
| "32", /* loop_align. */ |
| 4, /* int_reassoc_width. */ |
| 2, /* fp_reassoc_width. */ |
| 2, /* vec_reassoc_width. */ |
| 2, /* min_div_recip_mul_sf. */ |
| 2, /* min_div_recip_mul_df. */ |
| 0, /* max_case_values. */ |
| tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ |
| (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ |
| &a64fx_prefetch_tune |
| }; |
| |
| /* Support for fine-grained override of the tuning structures. */ |
| struct aarch64_tuning_override_function |
| { |
| const char* name; |
| void (*parse_override)(const char*, struct tune_params*); |
| }; |
| |
| static void aarch64_parse_fuse_string (const char*, struct tune_params*); |
| static void aarch64_parse_tune_string (const char*, struct tune_params*); |
| static void aarch64_parse_sve_width_string (const char*, struct tune_params*); |
| |
| static const struct aarch64_tuning_override_function |
| aarch64_tuning_override_functions[] = |
| { |
| { "fuse", aarch64_parse_fuse_string }, |
| { "tune", aarch64_parse_tune_string }, |
| { "sve_width", aarch64_parse_sve_width_string }, |
| { NULL, NULL } |
| }; |
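| |
| /* These entries are matched by name against the developer option |
| -moverride (for example -moverride=sve_width=256, assuming the |
| documented name=value syntax), and the value after the '=' is passed |
| to the corresponding parse_override callback together with the tuning |
| structure being modified. */ |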
| |
| /* A processor implementing AArch64. */ |
| struct processor |
| { |
| const char *const name; |
| enum aarch64_processor ident; |
| enum aarch64_processor sched_core; |
| enum aarch64_arch arch; |
| unsigned architecture_version; |
| const uint64_t flags; |
| const struct tune_params *const tune; |
| }; |
| |
| /* Architectures implementing AArch64. */ |
| static const struct processor all_architectures[] = |
| { |
| #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \ |
| {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL}, |
| #include "aarch64-arches.def" |
| {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL} |
| }; |
| |
| /* Processor cores implementing AArch64. */ |
| static const struct processor all_cores[] = |
| { |
| #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \ |
| {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \ |
| all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \ |
| FLAGS, &COSTS##_tunings}, |
| #include "aarch64-cores.def" |
| {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8, |
| AARCH64_FL_FOR_ARCH8, &generic_tunings}, |
| {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL} |
| }; |
| |
| |
| /* Target specification. These are populated by the -march, -mtune, -mcpu |
| handling code or by target attributes. */ |
| static const struct processor *selected_arch; |
| static const struct processor *selected_cpu; |
| static const struct processor *selected_tune; |
| |
| enum aarch64_key_type aarch64_ra_sign_key = AARCH64_KEY_A; |
| |
| /* The current tuning set. */ |
| struct tune_params aarch64_tune_params = generic_tunings; |
| |
| /* Check whether an 'aarch64_vector_pcs' attribute is valid. */ |
| |
| static tree |
| handle_aarch64_vector_pcs_attribute (tree *node, tree name, tree, |
| int, bool *no_add_attrs) |
| { |
| /* Since we set fn_type_req to true, the caller should have checked |
| this for us. */ |
| gcc_assert (FUNC_OR_METHOD_TYPE_P (*node)); |
| switch ((arm_pcs) fntype_abi (*node).id ()) |
| { |
| case ARM_PCS_AAPCS64: |
| case ARM_PCS_SIMD: |
| return NULL_TREE; |
| |
| case ARM_PCS_SVE: |
| error ("the %qE attribute cannot be applied to an SVE function type", |
| name); |
| *no_add_attrs = true; |
| return NULL_TREE; |
| |
| case ARM_PCS_TLSDESC: |
| case ARM_PCS_UNKNOWN: |
| break; |
| } |
| gcc_unreachable (); |
| } |
| |
| /* Table of machine attributes. */ |
| static const struct attribute_spec aarch64_attribute_table[] = |
| { |
| /* { name, min_len, max_len, decl_req, type_req, fn_type_req, |
| affects_type_identity, handler, exclude } */ |
| { "aarch64_vector_pcs", 0, 0, false, true, true, true, |
| handle_aarch64_vector_pcs_attribute, NULL }, |
| { "arm_sve_vector_bits", 1, 1, false, true, false, true, |
| aarch64_sve::handle_arm_sve_vector_bits_attribute, |
| NULL }, |
| { "Advanced SIMD type", 1, 1, false, true, false, true, NULL, NULL }, |
| { "SVE type", 3, 3, false, true, false, true, NULL, NULL }, |
| { "SVE sizeless type", 0, 0, false, true, false, true, NULL, NULL }, |
| { NULL, 0, 0, false, false, false, false, NULL, NULL } |
| }; |
| |
| #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0) |
| |
| /* An ISA extension in the co-processor and main instruction set space. */ |
| struct aarch64_option_extension |
| { |
| const char *const name; |
| const unsigned long flags_on; |
| const unsigned long flags_off; |
| }; |
| |
| typedef enum aarch64_cond_code |
| { |
| AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL, |
| AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT, |
| AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV |
| } |
| aarch64_cc; |
| |
| #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1)) |
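| |
| /* The AArch64 condition encoding places each condition next to its |
| inverse, so flipping the low bit is enough. For example: |
| AARCH64_INVERSE_CONDITION_CODE (AARCH64_EQ) == AARCH64_NE |
| AARCH64_INVERSE_CONDITION_CODE (AARCH64_CS) == AARCH64_CC |
| AARCH64_INVERSE_CONDITION_CODE (AARCH64_GE) == AARCH64_LT */ |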
| |
| struct aarch64_branch_protect_type |
| { |
| /* The type's name that the user passes to the branch-protection option |
| string. */ |
| const char* name; |
| /* Function to handle the protection type and set global variables. |
| First argument is the string token corresponding to this type and the |
| second argument is the next token in the option string. |
| Return values: |
| * AARCH64_PARSE_OK: Handling was successful. |
| * AARCH64_PARSE_INVALID_ARG: The type is invalid in this context and the |
| caller should print an error. |
| * AARCH64_PARSE_INVALID_FEATURE: The type is invalid and the handler |
| prints its own error. */ |
| enum aarch64_parse_opt_result (*handler)(char*, char*); |
| /* A list of types that can follow this type in the option string. */ |
| const aarch64_branch_protect_type* subtypes; |
| unsigned int num_subtypes; |
| }; |
| |
| static enum aarch64_parse_opt_result |
| aarch64_handle_no_branch_protection (char* str, char* rest) |
| { |
| aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE; |
| aarch64_enable_bti = 0; |
| if (rest) |
| { |
| error ("unexpected %<%s%> after %<%s%>", rest, str); |
| return AARCH64_PARSE_INVALID_FEATURE; |
| } |
| return AARCH64_PARSE_OK; |
| } |
| |
| static enum aarch64_parse_opt_result |
| aarch64_handle_standard_branch_protection (char* str, char* rest) |
| { |
| aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF; |
| aarch64_ra_sign_key = AARCH64_KEY_A; |
| aarch64_enable_bti = 1; |
| if (rest) |
| { |
| error ("unexpected %<%s%> after %<%s%>", rest, str); |
| return AARCH64_PARSE_INVALID_FEATURE; |
| } |
| return AARCH64_PARSE_OK; |
| } |
| |
| static enum aarch64_parse_opt_result |
| aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED, |
| char* rest ATTRIBUTE_UNUSED) |
| { |
| aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF; |
| aarch64_ra_sign_key = AARCH64_KEY_A; |
| return AARCH64_PARSE_OK; |
| } |
| |
| static enum aarch64_parse_opt_result |
| aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED, |
| char* rest ATTRIBUTE_UNUSED) |
| { |
| aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL; |
| return AARCH64_PARSE_OK; |
| } |
| |
| static enum aarch64_parse_opt_result |
| aarch64_handle_pac_ret_b_key (char* str ATTRIBUTE_UNUSED, |
| char* rest ATTRIBUTE_UNUSED) |
| { |
| aarch64_ra_sign_key = AARCH64_KEY_B; |
| return AARCH64_PARSE_OK; |
| } |
| |
| static enum aarch64_parse_opt_result |
| aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED, |
| char* rest ATTRIBUTE_UNUSED) |
| { |
| aarch64_enable_bti = 1; |
| return AARCH64_PARSE_OK; |
| } |
| |
| static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = { |
| { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 }, |
| { "b-key", aarch64_handle_pac_ret_b_key, NULL, 0 }, |
| { NULL, NULL, NULL, 0 } |
| }; |
| |
| static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = { |
| { "none", aarch64_handle_no_branch_protection, NULL, 0 }, |
| { "standard", aarch64_handle_standard_branch_protection, NULL, 0 }, |
| { "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes, |
| ARRAY_SIZE (aarch64_pac_ret_subtypes) }, |
| { "bti", aarch64_handle_bti_protection, NULL, 0 }, |
| { NULL, NULL, NULL, 0 } |
| }; |
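| |
| /* For example, the documented option string "pac-ret+leaf+b-key" is |
| handled by aarch64_handle_pac_ret_protection followed by the "leaf" |
| and "b-key" subtype handlers above, whereas "none" and "standard" |
| accept no further tokens and report an error if any follow. */ |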
| |
| /* The condition codes of the processor, and the inverse function. */ |
| static const char * const aarch64_condition_codes[] = |
| { |
| "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc", |
| "hi", "ls", "ge", "lt", "gt", "le", "al", "nv" |
| }; |
| |
| /* The preferred condition codes for SVE conditions. */ |
| static const char *const aarch64_sve_condition_codes[] = |
| { |
| "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc", |
| "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv" |
| }; |
| |
| /* Return the assembly token for svpattern value VALUE. */ |
| |
| static const char * |
| svpattern_token (enum aarch64_svpattern pattern) |
| { |
| switch (pattern) |
| { |
| #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER; |
| AARCH64_FOR_SVPATTERN (CASE) |
| #undef CASE |
| case AARCH64_NUM_SVPATTERNS: |
| break; |
| } |
| gcc_unreachable (); |
| } |
| |
| /* Return the location of a piece that is known to be passed or returned |
| in registers. FIRST_ZR is the first unused vector argument register |
| and FIRST_PR is the first unused predicate argument register. */ |
| |
| rtx |
| pure_scalable_type_info::piece::get_rtx (unsigned int first_zr, |
| unsigned int first_pr) const |
| { |
| gcc_assert (VECTOR_MODE_P (mode) |
| && first_zr + num_zr <= V0_REGNUM + NUM_FP_ARG_REGS |
| && first_pr + num_pr <= P0_REGNUM + NUM_PR_ARG_REGS); |
| |
| if (num_zr > 0 && num_pr == 0) |
| return gen_rtx_REG (mode, first_zr); |
| |
| if (num_zr == 0 && num_pr == 1) |
| return gen_rtx_REG (mode, first_pr); |
| |
| gcc_unreachable (); |
| } |
| |
| /* Return the total number of vector registers required by the PST. */ |
| |
| unsigned int |
| pure_scalable_type_info::num_zr () const |
| { |
| unsigned int res = 0; |
| for (unsigned int i = 0; i < pieces.length (); ++i) |
| res += pieces[i].num_zr; |
| return res; |
| } |
| |
| /* Return the total number of predicate registers required by the PST. */ |
| |
| unsigned int |
| pure_scalable_type_info::num_pr () const |
| { |
| unsigned int res = 0; |
| for (unsigned int i = 0; i < pieces.length (); ++i) |
| res += pieces[i].num_pr; |
| return res; |
| } |
| |
| /* Return the location of a PST that is known to be passed or returned |
| in registers. FIRST_ZR is the first unused vector argument register |
| and FIRST_PR is the first unused predicate argument register. */ |
| |
| rtx |
| pure_scalable_type_info::get_rtx (machine_mode mode, |
| unsigned int first_zr, |
| unsigned int first_pr) const |
| { |
| /* Try to return a single REG if possible. This leads to better |
| code generation; it isn't required for correctness. */ |
| if (mode == pieces[0].mode) |
| { |
| gcc_assert (pieces.length () == 1); |
| return pieces[0].get_rtx (first_zr, first_pr); |
| } |
| |
| /* Build up a PARALLEL that contains the individual pieces. */ |
| rtvec rtxes = rtvec_alloc (pieces.length ()); |
| for (unsigned int i = 0; i < pieces.length (); ++i) |
| { |
| rtx reg = pieces[i].get_rtx (first_zr, first_pr); |
| rtx offset = gen_int_mode (pieces[i].offset, Pmode); |
| RTVEC_ELT (rtxes, i) = gen_rtx_EXPR_LIST (VOIDmode, reg, offset); |
| first_zr += pieces[i].num_zr; |
| first_pr += pieces[i].num_pr; |
| } |
| return gen_rtx_PARALLEL (mode, rtxes); |
| } |
| |
| /* Analyze whether TYPE is a Pure Scalable Type according to the rules |
| in the AAPCS64. */ |
| |
| pure_scalable_type_info::analysis_result |
| pure_scalable_type_info::analyze (const_tree type) |
| { |
| /* Prevent accidental reuse. */ |
| gcc_assert (pieces.is_empty ()); |
| |
| /* No code will be generated for erroneous types, so we won't establish |
| an ABI mapping. */ |
| if (type == error_mark_node) |
| return NO_ABI_IDENTITY; |
| |
| /* Zero-sized types disappear in the language->ABI mapping. */ |
| if (TYPE_SIZE (type) && integer_zerop (TYPE_SIZE (type))) |
| return NO_ABI_IDENTITY; |
| |
| /* Check for SVTs, SPTs, and built-in tuple types that map to PSTs. */ |
| piece p = {}; |
| if (aarch64_sve::builtin_type_p (type, &p.num_zr, &p.num_pr)) |
| { |
| machine_mode mode = TYPE_MODE_RAW (type); |
| gcc_assert (VECTOR_MODE_P (mode) |
| && (!TARGET_SVE || aarch64_sve_mode_p (mode))); |
| |
| p.mode = p.orig_mode = mode; |
| add_piece (p); |
| return IS_PST; |
| } |
| |
| /* Check for user-defined PSTs. */ |
| if (TREE_CODE (type) == ARRAY_TYPE) |
| return analyze_array (type); |
| if (TREE_CODE (type) == RECORD_TYPE) |
| return analyze_record (type); |
| |
| return ISNT_PST; |
| } |
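| |
| /* As an illustration (assuming the usual <arm_sve.h> type names): an ACLE |
| type such as svfloat32_t is recorded as a single piece with num_zr == 1 |
| and num_pr == 0, svbool_t with num_zr == 0 and num_pr == 1, and a tuple |
| type such as svint32x3_t with num_zr == 3; arrays and records built from |
| such types are handled by analyze_array and analyze_record below. */ |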
| |
| /* Analyze a type that is known not to be passed or returned in memory. |
| Return true if it has an ABI identity and is a Pure Scalable Type. */ |
| |
| bool |
| pure_scalable_type_info::analyze_registers (const_tree type) |
| { |
| analysis_result result = analyze (type); |
| gcc_assert (result != DOESNT_MATTER); |
| return result == IS_PST; |
| } |
| |
| /* Subroutine of analyze for handling ARRAY_TYPEs. */ |
| |
| pure_scalable_type_info::analysis_result |
| pure_scalable_type_info::analyze_array (const_tree type) |
| { |
| /* Analyze the element type. */ |
| pure_scalable_type_info element_info; |
| analysis_result result = element_info.analyze (TREE_TYPE (type)); |
| if (result != IS_PST) |
| return result; |
| |
| /* An array of unknown, flexible or variable length will be passed and |
| returned by reference whatever we do. */ |
| tree nelts_minus_one = array_type_nelts (type); |
| if (!tree_fits_uhwi_p (nelts_minus_one)) |
| return DOESNT_MATTER; |
| |
| /* Likewise if the array is constant-sized but too big to be interesting. |
| The double checks against MAX_PIECES are to protect against overflow. */ |
| unsigned HOST_WIDE_INT count = tree_to_uhwi (nelts_minus_one); |
| if (count > MAX_PIECES) |
| return DOESNT_MATTER; |
| count += 1; |
| if (count * element_info.pieces.length () > MAX_PIECES) |
| return DOESNT_MATTER; |
| |
| /* The above checks should have weeded out elements of unknown size. */ |
| poly_uint64 element_bytes; |
| if (!poly_int_tree_p (TYPE_SIZE_UNIT (TREE_TYPE (type)), &element_bytes)) |
| gcc_unreachable (); |
| |
| /* Build up the list of individual vectors and predicates. */ |
| gcc_assert (!element_info.pieces.is_empty ()); |
| for (unsigned int i = 0; i < count; ++i) |
| for (unsigned int j = 0; j < element_info.pieces.length (); ++j) |
| { |
| piece p = element_info.pieces[j]; |
| p.offset += i * element_bytes; |
| add_piece (p); |
| } |
| return IS_PST; |
| } |
| |
| /* Subroutine of analyze for handling RECORD_TYPEs. */ |
| |
| pure_scalable_type_info::analysis_result |
| pure_scalable_type_info::analyze_record (const_tree type) |
| { |
| for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field)) |
| { |
| if (TREE_CODE (field) != FIELD_DECL) |
| continue; |
| |
| /* Zero-sized fields disappear in the language->ABI mapping. */ |
| if (DECL_SIZE (field) && integer_zerop (DECL_SIZE (field))) |
| continue; |
| |
| /* All fields with an ABI identity must be PSTs for the record as |
| a whole to be a PST. If any individual field is too big to be |
| interesting then the record is too. */ |
| pure_scalable_type_info field_info; |
| analysis_result subresult = field_info.analyze (TREE_TYPE (field)); |
| if (subresult == NO_ABI_IDENTITY) |
| continue; |
| if (subresult != IS_PST) |
| return subresult; |
| |
| /* Since all previous fields are PSTs, we ought to be able to track |
| the field offset using poly_ints. */ |
| tree bitpos = bit_position (field); |
| gcc_assert (poly_int_tree_p (bitpos)); |
| |
| /* For the same reason, it shouldn't be possible to create a PST field |
| whose offset isn't byte-aligned. */ |
| poly_widest_int wide_bytepos = exact_div (wi::to_poly_widest (bitpos), |
| BITS_PER_UNIT); |
| |
| /* Punt if the record is too big to be interesting. */ |
| poly_uint64 bytepos; |
| if (!wide_bytepos.to_uhwi (&bytepos) |
| || pieces.length () + field_info.pieces.length () > MAX_PIECES) |
| return DOESNT_MATTER; |
| |
| /* Add the individual vectors and predicates in the field to the |
| record's list. */ |
| gcc_assert (!field_info.pieces.is_empty ()); |
| for (unsigned int i = 0; i < field_info.pieces.length (); ++i) |
| { |
| piece p = field_info.pieces[i]; |
| p.offset += bytepos; |
| add_piece (p); |
| } |
| } |
| /* Empty structures disappear in the language->ABI mapping. */ |
| return pieces.is_empty () ? NO_ABI_IDENTITY : IS_PST; |
| } |
| |
| /* Add P to the list of pieces in the type. */ |
| |
| void |
| pure_scalable_type_info::add_piece (const piece &p) |
| { |
| /* Try to fold the new piece into the previous one to form a |
| single-mode PST. For example, if we see three consecutive vectors |
| of the same mode, we can represent them using the corresponding |
| 3-tuple mode. |
| |
| This is purely an optimization. */ |
| if (!pieces.is_empty ()) |
| { |
| piece &prev = pieces.last (); |
| gcc_assert (VECTOR_MODE_P (p.mode) && VECTOR_MODE_P (prev.mode)); |
| unsigned int nelems1, nelems2; |
| if (prev.orig_mode == p.orig_mode |
| && known_eq (prev.offset + GET_MODE_SIZE (prev.mode), p.offset) |
| && constant_multiple_p (GET_MODE_NUNITS (prev.mode), |
| GET_MODE_NUNITS (p.orig_mode), &nelems1) |
| && constant_multiple_p (GET_MODE_NUNITS (p.mode), |
| GET_MODE_NUNITS (p.orig_mode), &nelems2) |
| && targetm.array_mode (p.orig_mode, |
| nelems1 + nelems2).exists (&prev.mode)) |
| { |
| prev.num_zr += p.num_zr; |
| prev.num_pr += p.num_pr; |
| return; |
| } |
| } |
| pieces.quick_push (p); |
| } |
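| |
| /* For example, two consecutive VNx16QImode pieces whose offsets differ by |
| exactly one SVE vector are folded into a single VNx32QImode piece with |
| num_zr == 2, assuming the target provides that 2-tuple mode (as |
| aarch64_array_mode below does). */ |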
| |
| /* Return true if at least one possible value of type TYPE includes at |
| least one object of Pure Scalable Type, in the sense of the AAPCS64. |
| |
| This is a relatively expensive test for some types, so it should |
| generally be made as late as possible. */ |
| |
| static bool |
| aarch64_some_values_include_pst_objects_p (const_tree type) |
| { |
| if (TYPE_SIZE (type) && integer_zerop (TYPE_SIZE (type))) |
| return false; |
| |
| if (aarch64_sve::builtin_type_p (type)) |
| return true; |
| |
| if (TREE_CODE (type) == ARRAY_TYPE || TREE_CODE (type) == COMPLEX_TYPE) |
| return aarch64_some_values_include_pst_objects_p (TREE_TYPE (type)); |
| |
| if (RECORD_OR_UNION_TYPE_P (type)) |
| for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field)) |
| if (TREE_CODE (field) == FIELD_DECL |
| && aarch64_some_values_include_pst_objects_p (TREE_TYPE (field))) |
| return true; |
| |
| return false; |
| } |
| |
| /* Return the descriptor of the SIMD ABI. */ |
| |
| static const predefined_function_abi & |
| aarch64_simd_abi (void) |
| { |
| predefined_function_abi &simd_abi = function_abis[ARM_PCS_SIMD]; |
| if (!simd_abi.initialized_p ()) |
| { |
| HARD_REG_SET full_reg_clobbers |
| = default_function_abi.full_reg_clobbers (); |
| for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) |
| if (FP_SIMD_SAVED_REGNUM_P (regno)) |
| CLEAR_HARD_REG_BIT (full_reg_clobbers, regno); |
| simd_abi.initialize (ARM_PCS_SIMD, full_reg_clobbers); |
| } |
| return simd_abi; |
| } |
| |
| /* Return the descriptor of the SVE PCS. */ |
| |
| static const predefined_function_abi & |
| aarch64_sve_abi (void) |
| { |
| predefined_function_abi &sve_abi = function_abis[ARM_PCS_SVE]; |
| if (!sve_abi.initialized_p ()) |
| { |
| HARD_REG_SET full_reg_clobbers |
| = default_function_abi.full_reg_clobbers (); |
| for (int regno = V8_REGNUM; regno <= V23_REGNUM; ++regno) |
| CLEAR_HARD_REG_BIT (full_reg_clobbers, regno); |
| for (int regno = P4_REGNUM; regno <= P15_REGNUM; ++regno) |
| CLEAR_HARD_REG_BIT (full_reg_clobbers, regno); |
| sve_abi.initialize (ARM_PCS_SVE, full_reg_clobbers); |
| } |
| return sve_abi; |
| } |
| |
| /* If X is an UNSPEC_SALT_ADDR expression, return the address that it |
| wraps, otherwise return X itself. */ |
| |
| static rtx |
| strip_salt (rtx x) |
| { |
| rtx search = x; |
| if (GET_CODE (search) == CONST) |
| search = XEXP (search, 0); |
| if (GET_CODE (search) == UNSPEC && XINT (search, 1) == UNSPEC_SALT_ADDR) |
| x = XVECEXP (search, 0, 0); |
| return x; |
| } |
| |
| /* Like strip_offset, but also strip any UNSPEC_SALT_ADDR from the |
| expression. */ |
| |
| static rtx |
| strip_offset_and_salt (rtx addr, poly_int64 *offset) |
| { |
| return strip_salt (strip_offset (addr, offset)); |
| } |
| |
| /* Generate code for a conditional branch whose destination is out of |
| range of a single conditional branch instruction (more than 1 MiB away). |
| OPERANDS are the insn operands and POS_LABEL is the index of the |
| destination label within them. Emit the short branch given by |
| BRANCH_FORMAT to a fresh local label (generated with prefix DEST), then |
| an unconditional branch to the original destination, and finally define |
| the local label; the caller normally passes the inverse of the original |
| condition so that the short branch skips the far branch. */ |
| const char * |
| aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest, |
| const char * branch_format) |
| { |
| rtx_code_label * tmp_label = gen_label_rtx (); |
| char label_buf[256]; |
| char buffer[128]; |
| ASM_GENERATE_INTERNAL_LABEL (label_buf, dest, |
| CODE_LABEL_NUMBER (tmp_label)); |
| const char *label_ptr = targetm.strip_name_encoding (label_buf); |
| rtx dest_label = operands[pos_label]; |
| operands[pos_label] = tmp_label; |
| |
| snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr); |
| output_asm_insn (buffer, operands); |
| |
| snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr); |
| operands[pos_label] = dest_label; |
| output_asm_insn (buffer, operands); |
| return ""; |
| } |
| |
| /* Report that MODE needs the FP or Advanced SIMD register file, which the |
| current command-line options make unavailable. */ |
| void |
| aarch64_err_no_fpadvsimd (machine_mode mode) |
| { |
| if (TARGET_GENERAL_REGS_ONLY) |
| { |
| if (FLOAT_MODE_P (mode)) |
| error ("%qs is incompatible with the use of floating-point types", |
| "-mgeneral-regs-only"); |
| else |
| error ("%qs is incompatible with the use of vector types", |
| "-mgeneral-regs-only"); |
| } |
| else if (FLOAT_MODE_P (mode)) |
| error ("%qs feature modifier is incompatible with the use of" |
| " floating-point types", "+nofp"); |
| else |
| error ("%qs feature modifier is incompatible with the use of" |
| " vector types", "+nofp"); |
| } |
| |
| /* Report when we try to do something that requires SVE when SVE is disabled. |
| This is an error of last resort and isn't very high-quality. It usually |
| involves attempts to measure the vector length in some way. */ |
| static void |
| aarch64_report_sve_required (void) |
| { |
| static bool reported_p = false; |
| |
| /* Avoid reporting a slew of messages for a single oversight. */ |
| if (reported_p) |
| return; |
| |
| error ("this operation requires the SVE ISA extension"); |
| inform (input_location, "you can enable SVE using the command-line" |
| " option %<-march%>, or by using the %<target%>" |
| " attribute or pragma"); |
| reported_p = true; |
| } |
| |
| /* Return true if REGNO is P0-P15 or one of the special FFR-related |
| registers. */ |
| inline bool |
| pr_or_ffr_regnum_p (unsigned int regno) |
| { |
| return PR_REGNUM_P (regno) || regno == FFR_REGNUM || regno == FFRT_REGNUM; |
| } |
| |
| /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS. |
| The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and |
| GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much |
| higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS |
| and GENERAL_REGS is lower than the memory cost (in this case the best class |
| is the lowest cost one). Using POINTER_AND_FP_REGS irrespective of its |
| cost results in bad allocations with many redundant int<->FP moves which |
| are expensive on various cores. |
| To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but |
| force a decision between FP_REGS and GENERAL_REGS. We use the allocno class |
| if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't |
| POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode. |
| The result of this is that it is no longer inefficient to have a higher |
| memory move cost than the register move cost. |
| */ |
| |
| static reg_class_t |
| aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class, |
| reg_class_t best_class) |
| { |
| machine_mode mode; |
| |
| if (!reg_class_subset_p (GENERAL_REGS, allocno_class) |
| || !reg_class_subset_p (FP_REGS, allocno_class)) |
| return allocno_class; |
| |
| if (!reg_class_subset_p (GENERAL_REGS, best_class) |
| || !reg_class_subset_p (FP_REGS, best_class)) |
| return best_class; |
| |
| mode = PSEUDO_REGNO_MODE (regno); |
| return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS; |
| } |
| |
| static unsigned int |
| aarch64_min_divisions_for_recip_mul (machine_mode mode) |
| { |
| if (GET_MODE_UNIT_SIZE (mode) == 4) |
| return aarch64_tune_params.min_div_recip_mul_sf; |
| return aarch64_tune_params.min_div_recip_mul_df; |
| } |
| |
| /* Return the reassociation width of treeop OPC with mode MODE. */ |
| static int |
| aarch64_reassociation_width (unsigned opc, machine_mode mode) |
| { |
| if (VECTOR_MODE_P (mode)) |
| return aarch64_tune_params.vec_reassoc_width; |
| if (INTEGRAL_MODE_P (mode)) |
| return aarch64_tune_params.int_reassoc_width; |
| /* Avoid reassociating floating point addition so we emit more FMAs. */ |
| if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR) |
| return aarch64_tune_params.fp_reassoc_width; |
| return 1; |
| } |
| |
| /* Provide a mapping from GCC register numbers to DWARF register numbers. */ |
| unsigned |
| aarch64_dbx_register_number (unsigned regno) |
| { |
| if (GP_REGNUM_P (regno)) |
| return AARCH64_DWARF_R0 + regno - R0_REGNUM; |
| else if (regno == SP_REGNUM) |
| return AARCH64_DWARF_SP; |
| else if (FP_REGNUM_P (regno)) |
| return AARCH64_DWARF_V0 + regno - V0_REGNUM; |
| else if (PR_REGNUM_P (regno)) |
| return AARCH64_DWARF_P0 + regno - P0_REGNUM; |
| else if (regno == VG_REGNUM) |
| return AARCH64_DWARF_VG; |
| |
| /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no |
| equivalent DWARF register. */ |
| return DWARF_FRAME_REGISTERS; |
| } |
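| |
| /* For example, x5 maps to AARCH64_DWARF_R0 + 5, v3 to AARCH64_DWARF_V0 + 3 |
| and p7 to AARCH64_DWARF_P0 + 7, while registers such as the condition |
| flags that have no DWARF equivalent yield DWARF_FRAME_REGISTERS. */ |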
| |
| /* If X is a CONST_DOUBLE, return its bit representation as a constant |
| integer, otherwise return X unmodified. */ |
| static rtx |
| aarch64_bit_representation (rtx x) |
| { |
| if (CONST_DOUBLE_P (x)) |
| x = gen_lowpart (int_mode_for_mode (GET_MODE (x)).require (), x); |
| return x; |
| } |
| |
| /* Return an estimate for the number of quadwords in an SVE vector. This is |
| equivalent to the number of Advanced SIMD vectors in an SVE vector. */ |
| static unsigned int |
| aarch64_estimated_sve_vq () |
| { |
| return estimated_poly_value (BITS_PER_SVE_VECTOR) / 128; |
| } |
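| |
| /* For example, assuming -msve-vector-bits=512 fixes the vector length as |
| documented, BITS_PER_SVE_VECTOR becomes a compile-time constant and this |
| returns 4; for scalable vectors the result follows the tuning-based |
| estimate of the vector length instead. */ |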
| |
| /* Return true if MODE is any of the Advanced SIMD structure modes. */ |
| static bool |
| aarch64_advsimd_struct_mode_p (machine_mode mode) |
| { |
| return (TARGET_SIMD |
| && (mode == OImode || mode == CImode || mode == XImode)); |
| } |
| |
| /* Return true if MODE is an SVE predicate mode. */ |
| static bool |
| aarch64_sve_pred_mode_p (machine_mode mode) |
| { |
| return (TARGET_SVE |
| && (mode == VNx16BImode |
| || mode == VNx8BImode |
| || mode == VNx4BImode |
| || mode == VNx2BImode)); |
| } |
| |
| /* Three mutually-exclusive flags describing a vector or predicate type. */ |
| const unsigned int VEC_ADVSIMD = 1; |
| const unsigned int VEC_SVE_DATA = 2; |
| const unsigned int VEC_SVE_PRED = 4; |
| /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate |
| a structure of 2, 3 or 4 vectors. */ |
| const unsigned int VEC_STRUCT = 8; |
| /* Can be used in combination with VEC_SVE_DATA to indicate that the |
| vector has fewer significant bytes than a full SVE vector. */ |
| const unsigned int VEC_PARTIAL = 16; |
| /* Useful combinations of the above. */ |
| const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED; |
| const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA; |
| |
| /* Return a set of flags describing the vector properties of mode MODE. |
| Ignore modes that are not supported by the current target. */ |
| static unsigned int |
| aarch64_classify_vector_mode (machine_mode mode) |
| { |
| if (aarch64_advsimd_struct_mode_p (mode)) |
| return VEC_ADVSIMD | VEC_STRUCT; |
| |
| if (aarch64_sve_pred_mode_p (mode)) |
| return VEC_SVE_PRED; |
| |
| /* Make the decision based on the mode's enum value rather than its |
| properties, so that we keep the correct classification regardless |
| of -msve-vector-bits. */ |
| switch (mode) |
| { |
| /* Partial SVE QI vectors. */ |
| case E_VNx2QImode: |
| case E_VNx4QImode: |
| case E_VNx8QImode: |
| /* Partial SVE HI vectors. */ |
| case E_VNx2HImode: |
| case E_VNx4HImode: |
| /* Partial SVE SI vector. */ |
| case E_VNx2SImode: |
| /* Partial SVE HF vectors. */ |
| case E_VNx2HFmode: |
| case E_VNx4HFmode: |
| /* Partial SVE BF vectors. */ |
| case E_VNx2BFmode: |
| case E_VNx4BFmode: |
| /* Partial SVE SF vector. */ |
| case E_VNx2SFmode: |
| return TARGET_SVE ? VEC_SVE_DATA | VEC_PARTIAL : 0; |
| |
| case E_VNx16QImode: |
| case E_VNx8HImode: |
| case E_VNx4SImode: |
| case E_VNx2DImode: |
| case E_VNx8BFmode: |
| case E_VNx8HFmode: |
| case E_VNx4SFmode: |
| case E_VNx2DFmode: |
| return TARGET_SVE ? VEC_SVE_DATA : 0; |
| |
| /* x2 SVE vectors. */ |
| case E_VNx32QImode: |
| case E_VNx16HImode: |
| case E_VNx8SImode: |
| case E_VNx4DImode: |
| case E_VNx16BFmode: |
| case E_VNx16HFmode: |
| case E_VNx8SFmode: |
| case E_VNx4DFmode: |
| /* x3 SVE vectors. */ |
| case E_VNx48QImode: |
| case E_VNx24HImode: |
| case E_VNx12SImode: |
| case E_VNx6DImode: |
| case E_VNx24BFmode: |
| case E_VNx24HFmode: |
| case E_VNx12SFmode: |
| case E_VNx6DFmode: |
| /* x4 SVE vectors. */ |
| case E_VNx64QImode: |
| case E_VNx32HImode: |
| case E_VNx16SImode: |
| case E_VNx8DImode: |
| case E_VNx32BFmode: |
| case E_VNx32HFmode: |
| case E_VNx16SFmode: |
| case E_VNx8DFmode: |
| return TARGET_SVE ? VEC_SVE_DATA | VEC_STRUCT : 0; |
| |
| /* 64-bit Advanced SIMD vectors. */ |
| case E_V8QImode: |
| case E_V4HImode: |
| case E_V2SImode: |
| /* ...E_V1DImode doesn't exist. */ |
| case E_V4HFmode: |
| case E_V4BFmode: |
| case E_V2SFmode: |
| case E_V1DFmode: |
| /* 128-bit Advanced SIMD vectors. */ |
| case E_V16QImode: |
| case E_V8HImode: |
| case E_V4SImode: |
| case E_V2DImode: |
| case E_V8HFmode: |
| case E_V8BFmode: |
| case E_V4SFmode: |
| case E_V2DFmode: |
| return TARGET_SIMD ? VEC_ADVSIMD : 0; |
| |
| default: |
| return 0; |
| } |
| } |
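| |
| /* For example, under the classification above: V4SImode is VEC_ADVSIMD, |
| VNx4SImode is VEC_SVE_DATA, VNx2SImode is VEC_SVE_DATA | VEC_PARTIAL |
| (32-bit elements in 64-bit containers), VNx8SImode is |
| VEC_SVE_DATA | VEC_STRUCT, and VNx4BImode is VEC_SVE_PRED, assuming the |
| corresponding target features are enabled; otherwise the result is 0. */ |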
| |
| /* Return true if MODE is any of the data vector modes, including |
| structure modes. */ |
| static bool |
| aarch64_vector_data_mode_p (machine_mode mode) |
| { |
| return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA; |
| } |
| |
| /* Return true if MODE is any form of SVE mode, including predicates, |
| vectors and structures. */ |
| bool |
| aarch64_sve_mode_p (machine_mode mode) |
| { |
| return aarch64_classify_vector_mode (mode) & VEC_ANY_SVE; |
| } |
| |
| /* Return true if MODE is an SVE data vector mode; either a single vector |
| or a structure of vectors. */ |
| static bool |
| aarch64_sve_data_mode_p (machine_mode mode) |
| { |
| return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA; |
| } |
| |
| /* Return the number of defined bytes in one constituent vector of |
| SVE mode MODE, which has vector flags VEC_FLAGS. */ |
| static poly_int64 |
| aarch64_vl_bytes (machine_mode mode, unsigned int vec_flags) |
| { |
| if (vec_flags & VEC_PARTIAL) |
| /* A single partial vector. */ |
| return GET_MODE_SIZE (mode); |
| |
| if (vec_flags & VEC_SVE_DATA) |
| /* A single vector or a tuple. */ |
| return BYTES_PER_SVE_VECTOR; |
| |
| /* A single predicate. */ |
| gcc_assert (vec_flags & VEC_SVE_PRED); |
| return BYTES_PER_SVE_PRED; |
| } |
| |
| /* Implement target hook TARGET_ARRAY_MODE. */ |
| static opt_machine_mode |
| aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems) |
| { |
| if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA |
| && IN_RANGE (nelems, 2, 4)) |
| return mode_for_vector (GET_MODE_INNER (mode), |
| GET_MODE_NUNITS (mode) * nelems); |
| |
| return opt_machine_mode (); |
| } |
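| |
| /* For example, aarch64_array_mode (VNx4SImode, 3) yields VNx12SImode, the |
| mode for a tuple of three SVE SI vectors, while element counts outside |
| 2..4 or non-SVE modes return no mode, leaving the choice to the generic |
| code. */ |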
| |
| /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */ |
| static bool |
| aarch64_array_mode_supported_p (machine_mode mode, |
| unsigned HOST_WIDE_INT nelems) |
| { |
| if (TARGET_SIMD |
| && (AARCH64_VALID_SIMD_QREG_MODE (mode) |
| || AARCH64_VALID_SIMD_DREG_MODE (mode)) |
| && (nelems >= 2 && nelems <= 4)) |
| return true; |
| |
| return false; |
| } |
| |
| /* MODE is some form of SVE vector mode. For data modes, return the number |
| of vector register bits that each element of MODE occupies, such as 64 |
| for both VNx2DImode and VNx2SImode (where each 32-bit value is stored |
| in a 64-bit container). For predicate modes, return the number of |
| data bits controlled by each significant predicate bit. */ |
| |
| static unsigned int |
| aarch64_sve_container_bits (machine_mode mode) |
| { |
| unsigned int vec_flags = aarch64_classify_vector_mode (mode); |
| poly_uint64 vector_bits = (vec_flags & (VEC_PARTIAL | VEC_SVE_PRED) |
| ? BITS_PER_SVE_VECTOR |
| : GET_MODE_BITSIZE (mode)); |
| return vector_element_size (vector_bits, GET_MODE_NUNITS (mode)); |
| } |
| |
| /* Return the SVE predicate mode to use for elements that have |
| ELEM_NBYTES bytes, if such a mode exists. */ |
| |
| opt_machine_mode |
| aarch64_sve_pred_mode (unsigned int elem_nbytes) |
| { |
| if (TARGET_SVE) |
| { |
| if (elem_nbytes == 1) |
| return VNx16BImode; |
| if (elem_nbytes == 2) |
| return VNx8BImode; |
| if (elem_nbytes == 4) |
| return VNx4BImode; |
| if (elem_nbytes == 8) |
| return VNx2BImode; |
| } |
| return opt_machine_mode (); |
| } |
| |
| /* Return the SVE predicate mode that should be used to control |
| SVE mode MODE. */ |
| |
| machine_mode |
| aarch64_sve_pred_mode (machine_mode mode) |
| { |
| unsigned int bits = aarch64_sve_container_bits (mode); |
| return aarch64_sve_pred_mode (bits / BITS_PER_UNIT).require (); |
| } |
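| |
| /* For example, VNx4SImode uses 32-bit containers and is controlled by |
| VNx4BImode, whereas the partial vector mode VNx2SImode keeps its 32-bit |
| elements in 64-bit containers and is therefore controlled by |
| VNx2BImode. */ |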
| |
| /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */ |
| |
| static opt_machine_mode |
| aarch64_get_mask_mode (machine_mode mode) |
| { |
| unsigned int vec_flags = aarch64_classify_vector_mode (mode); |
| if (vec_flags & VEC_SVE_DATA) |
| return aarch64_sve_pred_mode (mode); |
| |
| return default_get_mask_mode (mode); |
| } |
| |
| /* Return the SVE vector mode that has NUNITS elements of mode INNER_MODE. */ |
| |
| opt_machine_mode |
| aarch64_sve_data_mode (scalar_mode inner_mode, poly_uint64 nunits) |
| { |
| enum mode_class mclass = (is_a <scalar_float_mode> (inner_mode) |
| ? MODE_VECTOR_FLOAT : MODE_VECTOR_INT); |
| machine_mode mode; |
| FOR_EACH_MODE_IN_CLASS (mode, mclass) |
| if (inner_mode == GET_MODE_INNER (mode) |
| && known_eq (nunits, GET_MODE_NUNITS (mode)) |
| && aarch64_sve_data_mode_p (mode)) |
| return mode; |
| return opt_machine_mode (); |
| } |
| |
| /* Return the integer element mode associated with SVE mode MODE. */ |
| |
| static scalar_int_mode |
| aarch64_sve_element_int_mode (machine_mode mode) |
| { |
| poly_uint64 vector_bits = (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL |
| ? BITS_PER_SVE_VECTOR |
| : GET_MODE_BITSIZE (mode)); |
| unsigned int elt_bits = vector_element_size (vector_bits, |
| GET_MODE_NUNITS (mode)); |
| return int_mode_for_size (elt_bits, 0).require (); |
| } |
| |
| /* Return an integer element mode that contains exactly |
| aarch64_sve_container_bits (MODE) bits. This is wider than |
| aarch64_sve_element_int_mode if MODE is a partial vector, |
| otherwise it's the same. */ |
| |
| static scalar_int_mode |
| aarch64_sve_container_int_mode (machine_mode mode) |
| { |
| return int_mode_for_size (aarch64_sve_container_bits (mode), 0).require (); |
| } |
| |
| /* Return the integer vector mode associated with SVE mode MODE. |
| Unlike related_int_vector_mode, this can handle the case in which |
| MODE is a predicate (and thus has a different total size). */ |
| |
| machine_mode |
| aarch64_sve_int_mode (machine_mode mode) |
| { |
| scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode); |
| return aarch64_sve_data_mode (int_mode, GET_MODE_NUNITS (mode)).require (); |
| } |
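| |
| /* For example, aarch64_sve_int_mode (VNx4SFmode) is VNx4SImode, and the |
| predicate mode VNx4BImode also maps to VNx4SImode even though the |
| predicate itself has a much smaller total size. */ |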
| |
| /* Implement TARGET_VECTORIZE_RELATED_MODE. */ |
| |
| static opt_machine_mode |
| aarch64_vectorize_related_mode (machine_mode vector_mode, |
| scalar_mode element_mode, |
| poly_uint64 nunits) |
| { |
| unsigned int vec_flags = aarch64_classify_vector_mode (vector_mode); |
| |
| /* If we're operating on SVE vectors, try to return an SVE mode. */ |
| poly_uint64 sve_nunits; |
| if ((vec_flags & VEC_SVE_DATA) |
| && multiple_p (BYTES_PER_SVE_VECTOR, |
| GET_MODE_SIZE (element_mode), &sve_nunits)) |
| { |
| machine_mode sve_mode; |
| if (maybe_ne (nunits, 0U)) |
| { |
| /* Try to find a full or partial SVE mode with exactly |
| NUNITS units. */ |
| if (multiple_p (sve_nunits, nunits) |
| && aarch64_sve_data_mode (element_mode, |
| nunits).exists (&sve_mode)) |
| return sve_mode; |
| } |
| else |
| { |
| /* Take the preferred number of units from the number of bytes |
| that fit in VECTOR_MODE. We always start by "autodetecting" |
| a full vector mode with preferred_simd_mode, so vectors |
| chosen here will also be full vector modes. Then |
| autovectorize_vector_modes tries smaller starting modes |
| and thus smaller preferred numbers of units. */ |
| sve_nunits = ordered_min (sve_nunits, GET_MODE_SIZE (vector_mode)); |
| if (aarch64_sve_data_mode (element_mode, |
| sve_nunits).exists (&sve_mode)) |
| return sve_mode; |
| } |
| } |
| |
| /* Prefer to use 1 128-bit vector instead of 2 64-bit vectors. */ |
| if ((vec_flags & VEC_ADVSIMD) |
| && known_eq (nunits, 0U) |
| && known_eq (GET_MODE_BITSIZE (vector_mode), 64U) |
| && maybe_ge (GET_MODE_BITSIZE (element_mode) |
| * GET_MODE_NUNITS (vector_mode), 128U)) |
| { |
| machine_mode res = aarch64_simd_container_mode (element_mode, 128); |
| if (VECTOR_MODE_P (res)) |
| return res; |
| } |
| |
| return default_vectorize_related_mode (vector_mode, element_mode, nunits); |
| } |
| |
| /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations, |
| prefer to use the first arithmetic operand as the else value if |
| the else value doesn't matter, since that exactly matches the SVE |
| destructive merging form. For ternary operations we could either |
| pick the first operand and use FMAD-like instructions or the last |
| operand and use FMLA-like instructions; the latter seems more |
| natural. */ |
| |
| static tree |
| aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops) |
| { |
| return nops == 3 ? ops[2] : ops[0]; |
| } |
| |
| /* Implement TARGET_HARD_REGNO_NREGS. */ |
| |
| static unsigned int |
| aarch64_hard_regno_nregs (unsigned regno, machine_mode mode) |
| { |
| /* ??? Logically we should only need to provide a value when |
| HARD_REGNO_MODE_OK says that the combination is valid, |
| but at the moment we need to handle all modes. Just ignore |
| any runtime parts for registers that can't store them. */ |
| HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode)); |
| switch (aarch64_regno_regclass (regno)) |
| { |
| case FP_REGS: |
| case FP_LO_REGS: |
| case FP_LO8_REGS: |
| { |
| unsigned int vec_flags = aarch64_classify_vector_mode (mode); |
| if (vec_flags & VEC_SVE_DATA) |
| return exact_div (GET_MODE_SIZE (mode), |
| aarch64_vl_bytes (mode, vec_flags)).to_constant (); |
| return CEIL (lowest_size, UNITS_PER_VREG); |
| } |
| case PR_REGS: |
| case PR_LO_REGS: |
| case PR_HI_REGS: |
| case FFR_REGS: |
| case PR_AND_FFR_REGS: |
| return 1; |
| default: |
| return CEIL (lowest_size, UNITS_PER_WORD); |
| } |
| gcc_unreachable (); |
| } |
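| |
| /* For example, the SVE tuple mode VNx8SImode occupies two FP/SIMD |
| registers (its size is exactly twice aarch64_vl_bytes), whereas V2DImode |
| needs a single FP/SIMD register but two general registers (16 bytes at |
| UNITS_PER_WORD == 8). */ |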
| |
| /* Implement TARGET_HARD_REGNO_MODE_OK. */ |
| |
| static bool |
| aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode) |
| { |
| if (GET_MODE_CLASS (mode) == MODE_CC) |
| return regno == CC_REGNUM; |
| |
| if (regno == VG_REGNUM) |
| /* This must have the same size as _Unwind_Word. */ |
| return mode == DImode; |
| |
| unsigned int vec_flags = aarch64_classify_vector_mode (mode); |
| if (vec_flags & VEC_SVE_PRED) |
| return pr_or_ffr_regnum_p (regno); |
| |
| if (pr_or_ffr_regnum_p (regno)) |
| return false; |
| |
| if (regno == SP_REGNUM) |
| /* The purpose of comparing with ptr_mode is to support the |
| global register variable associated with the stack pointer |
| register via the syntax of asm ("wsp") in ILP32. */ |
| return mode == Pmode || mode == ptr_mode; |
| |
| if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM) |
| return mode == Pmode; |
| |
| if (GP_REGNUM_P (regno)) |
| { |
| if (vec_flags & VEC_ANY_SVE) |
| return false; |
| if (known_le (GET_MODE_SIZE (mode), 8)) |
| return true; |
| if (known_le (GET_MODE_SIZE (mode), 16)) |
| return (regno & 1) == 0; |
| } |
| else if (FP_REGNUM_P (regno)) |
| { |
| if (vec_flags & VEC_STRUCT) |
| return end_hard_regno (mode, regno) - 1 <= V31_REGNUM; |
| else |
| return !VECTOR_MODE_P (mode) || vec_flags != 0; |
| } |
| |
| return false; |
| } |
| |
| /* Return true if a function with type FNTYPE returns its value in |
| SVE vector or predicate registers. */ |
| |
| static bool |
| aarch64_returns_value_in_sve_regs_p (const_tree fntype) |
| { |
| tree return_type = TREE_TYPE (fntype); |
| |
| pure_scalable_type_info pst_info; |
| switch (pst_info.analyze (return_type)) |
| { |
| case pure_scalable_type_info::IS_PST: |
| return (pst_info.num_zr () <= NUM_FP_ARG_REGS |
| && pst_info.num_pr () <= NUM_PR_ARG_REGS); |
| |
| case pure_scalable_type_info::DOESNT_MATTER: |
| gcc_assert (aarch64_return_in_memory_1 (return_type)); |
| return false; |
| |
| case pure_scalable_type_info::NO_ABI_IDENTITY: |
| case pure_scalable_type_info::ISNT_PST: |
| return false; |
| } |
| gcc_unreachable (); |
| } |
| |
| /* Return true if a function with type FNTYPE takes arguments in |
| SVE vector or predicate registers. */ |
| |
| static bool |
| aarch64_takes_arguments_in_sve_regs_p (const_tree fntype) |
|