| /* Machine description for AArch64 architecture. |
| Copyright (C) 2009-2022 Free Software Foundation, Inc. |
| Contributed by ARM Ltd. |
| |
| This file is part of GCC. |
| |
| GCC is free software; you can redistribute it and/or modify it |
| under the terms of the GNU General Public License as published by |
| the Free Software Foundation; either version 3, or (at your option) |
| any later version. |
| |
| GCC is distributed in the hope that it will be useful, but |
| WITHOUT ANY WARRANTY; without even the implied warranty of |
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| General Public License for more details. |
| |
| You should have received a copy of the GNU General Public License |
| along with GCC; see the file COPYING3. If not see |
| <http://www.gnu.org/licenses/>. */ |
| |
| #define IN_TARGET_CODE 1 |
| |
| #define INCLUDE_STRING |
| #define INCLUDE_ALGORITHM |
| #include "config.h" |
| #include "system.h" |
| #include "coretypes.h" |
| #include "backend.h" |
| #include "target.h" |
| #include "rtl.h" |
| #include "tree.h" |
| #include "memmodel.h" |
| #include "gimple.h" |
| #include "cfghooks.h" |
| #include "cfgloop.h" |
| #include "df.h" |
| #include "tm_p.h" |
| #include "stringpool.h" |
| #include "attribs.h" |
| #include "optabs.h" |
| #include "regs.h" |
| #include "emit-rtl.h" |
| #include "recog.h" |
| #include "cgraph.h" |
| #include "diagnostic.h" |
| #include "insn-attr.h" |
| #include "alias.h" |
| #include "fold-const.h" |
| #include "stor-layout.h" |
| #include "calls.h" |
| #include "varasm.h" |
| #include "output.h" |
| #include "flags.h" |
| #include "explow.h" |
| #include "expr.h" |
| #include "reload.h" |
| #include "langhooks.h" |
| #include "opts.h" |
| #include "gimplify.h" |
| #include "dwarf2.h" |
| #include "gimple-iterator.h" |
| #include "tree-vectorizer.h" |
| #include "aarch64-cost-tables.h" |
| #include "dumpfile.h" |
| #include "builtins.h" |
| #include "rtl-iter.h" |
| #include "tm-constrs.h" |
| #include "sched-int.h" |
| #include "target-globals.h" |
| #include "common/common-target.h" |
| #include "cfgrtl.h" |
| #include "selftest.h" |
| #include "selftest-rtl.h" |
| #include "rtx-vector-builder.h" |
| #include "intl.h" |
| #include "expmed.h" |
| #include "function-abi.h" |
| #include "gimple-pretty-print.h" |
| #include "tree-ssa-loop-niter.h" |
| #include "fractional-cost.h" |
| #include "rtlanal.h" |
| #include "tree-dfa.h" |
| #include "asan.h" |
| #include "aarch64-feature-deps.h" |
| |
| /* This file should be included last. */ |
| #include "target-def.h" |
| |
| /* Defined for convenience. */ |
| #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT) |
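/* For instance, with the default LP64 ABI this evaluates to 64 / 8 = 8,
   and to 4 under -mabi=ilp32.  */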
| |
| /* Information about a legitimate vector immediate operand. */ |
| struct simd_immediate_info |
| { |
| enum insn_type { MOV, MVN, INDEX, PTRUE }; |
| enum modifier_type { LSL, MSL }; |
| |
| simd_immediate_info () {} |
| simd_immediate_info (scalar_float_mode, rtx); |
| simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT, |
| insn_type = MOV, modifier_type = LSL, |
| unsigned int = 0); |
| simd_immediate_info (scalar_mode, rtx, rtx); |
| simd_immediate_info (scalar_int_mode, aarch64_svpattern); |
| |
| /* The mode of the elements. */ |
| scalar_mode elt_mode; |
| |
| /* The instruction to use to move the immediate into a vector. */ |
| insn_type insn; |
| |
| union |
| { |
| /* For MOV and MVN. */ |
| struct |
| { |
| /* The value of each element. */ |
| rtx value; |
| |
| /* The kind of shift modifier to use, and the number of bits to shift. |
| This is (LSL, 0) if no shift is needed. */ |
| modifier_type modifier; |
| unsigned int shift; |
| } mov; |
| |
| /* For INDEX. */ |
| struct |
| { |
| /* The value of the first element and the step to be added for each |
| subsequent element. */ |
| rtx base, step; |
| } index; |
| |
| /* For PTRUE. */ |
| aarch64_svpattern pattern; |
| } u; |
| }; |
| |
| /* Construct a floating-point immediate in which each element has mode |
| ELT_MODE_IN and value VALUE_IN. */ |
| inline simd_immediate_info |
| ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in) |
| : elt_mode (elt_mode_in), insn (MOV) |
| { |
| u.mov.value = value_in; |
| u.mov.modifier = LSL; |
| u.mov.shift = 0; |
| } |
| |
| /* Construct an integer immediate in which each element has mode ELT_MODE_IN |
| and value VALUE_IN. The other parameters are as for the structure |
| fields. */ |
| inline simd_immediate_info |
| ::simd_immediate_info (scalar_int_mode elt_mode_in, |
| unsigned HOST_WIDE_INT value_in, |
| insn_type insn_in, modifier_type modifier_in, |
| unsigned int shift_in) |
| : elt_mode (elt_mode_in), insn (insn_in) |
| { |
| u.mov.value = gen_int_mode (value_in, elt_mode_in); |
| u.mov.modifier = modifier_in; |
| u.mov.shift = shift_in; |
| } |
| |
| /* Construct an integer immediate in which each element has mode ELT_MODE_IN |
| and where element I is equal to BASE_IN + I * STEP_IN. */ |
| inline simd_immediate_info |
| ::simd_immediate_info (scalar_mode elt_mode_in, rtx base_in, rtx step_in) |
| : elt_mode (elt_mode_in), insn (INDEX) |
| { |
| u.index.base = base_in; |
| u.index.step = step_in; |
| } |
| |
| /* Construct a predicate that controls elements of mode ELT_MODE_IN |
| and has PTRUE pattern PATTERN_IN. */ |
| inline simd_immediate_info |
| ::simd_immediate_info (scalar_int_mode elt_mode_in, |
| aarch64_svpattern pattern_in) |
| : elt_mode (elt_mode_in), insn (PTRUE) |
| { |
| u.pattern = pattern_in; |
| } |
| |
| namespace { |
| |
| /* Describes types that map to Pure Scalable Types (PSTs) in the AAPCS64. */ |
| class pure_scalable_type_info |
| { |
| public: |
| /* Represents the result of analyzing a type. All values are nonzero, |
| in the possibly forlorn hope that accidental conversions to bool |
| trigger a warning. */ |
| enum analysis_result |
| { |
| /* The type does not have an ABI identity; i.e. it doesn't contain |
| at least one object whose type is a Fundamental Data Type. */ |
| NO_ABI_IDENTITY = 1, |
| |
| /* The type is definitely a Pure Scalable Type. */ |
| IS_PST, |
| |
| /* The type is definitely not a Pure Scalable Type. */ |
| ISNT_PST, |
| |
| /* It doesn't matter for PCS purposes whether the type is a Pure |
| Scalable Type or not, since the type will be handled the same |
| way regardless. |
| |
| Specifically, this means that if the type is a Pure Scalable Type, |
| there aren't enough argument registers to hold it, and so it will |
| need to be passed or returned in memory. If the type isn't a |
| Pure Scalable Type, it's too big to be passed or returned in core |
| or SIMD&FP registers, and so again will need to go in memory. */ |
| DOESNT_MATTER |
| }; |
| |
| /* Aggregates of 17 bytes or more are normally passed and returned |
| in memory, so aggregates of that size can safely be analyzed as |
| DOESNT_MATTER. We need to be able to collect enough pieces to |
| represent a PST that is smaller than that. Since predicates are |
| 2 bytes in size for -msve-vector-bits=128, that means we need to be |
| able to store at least 8 pieces. |
| |
| We also need to be able to store enough pieces to represent |
| a single vector in each vector argument register and a single |
| predicate in each predicate argument register. This means that |
| we need at least 12 pieces. */ |
| static const unsigned int MAX_PIECES = NUM_FP_ARG_REGS + NUM_PR_ARG_REGS; |
| static_assert (MAX_PIECES >= 8, "Need to store at least 8 predicates"); |
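
  /* As an informal check of the arithmetic above (assuming the usual
     AAPCS64 counts of 8 SIMD&FP argument registers V0-V7 and 4 predicate
     argument registers P0-P3):

        16 bytes / 2 bytes per predicate  = 8 pieces
        NUM_FP_ARG_REGS + NUM_PR_ARG_REGS = 8 + 4 = 12 pieces

     so a MAX_PIECES of 12 also covers the 8-piece requirement checked by
     the static_assert above.  */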
| |
| /* Describes one piece of a PST. Each piece is one of: |
| |
| - a single Scalable Vector Type (SVT) |
| - a single Scalable Predicate Type (SPT) |
| - a PST containing 2, 3 or 4 SVTs, with no padding |
| |
| It either represents a single built-in type or a PST formed from |
| multiple homogeneous built-in types. */ |
| struct piece |
| { |
| rtx get_rtx (unsigned int, unsigned int) const; |
| |
| /* The number of vector and predicate registers that the piece |
| occupies. One of the two is always zero. */ |
| unsigned int num_zr; |
| unsigned int num_pr; |
| |
| /* The mode of the registers described above. */ |
| machine_mode mode; |
| |
| /* If this piece is formed from multiple homogeneous built-in types, |
| this is the mode of the built-in types, otherwise it is MODE. */ |
| machine_mode orig_mode; |
| |
| /* The offset in bytes of the piece from the start of the type. */ |
| poly_uint64_pod offset; |
| }; |
| |
| /* Divides types analyzed as IS_PST into individual pieces. The pieces |
| are in memory order. */ |
| auto_vec<piece, MAX_PIECES> pieces; |
| |
| unsigned int num_zr () const; |
| unsigned int num_pr () const; |
| |
| rtx get_rtx (machine_mode mode, unsigned int, unsigned int) const; |
| |
| analysis_result analyze (const_tree); |
| bool analyze_registers (const_tree); |
| |
| private: |
| analysis_result analyze_array (const_tree); |
| analysis_result analyze_record (const_tree); |
| void add_piece (const piece &); |
| }; |
| } |
| |
| /* The current code model. */ |
| enum aarch64_code_model aarch64_cmodel; |
| |
| /* The number of 64-bit elements in an SVE vector. */ |
| poly_uint16 aarch64_sve_vg; |
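/* For example, -msve-vector-bits=256 corresponds to the constant 4 here
   (256 / 64), whereas the default scalable setting leaves it as a
   runtime-indeterminate poly_int (2 + 2 * x, with x counting 128-bit
   granules beyond the minimum vector length).  */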
| |
| #ifdef HAVE_AS_TLS |
| #undef TARGET_HAVE_TLS |
| #define TARGET_HAVE_TLS 1 |
| #endif |
| |
| static bool aarch64_composite_type_p (const_tree, machine_mode); |
| static bool aarch64_return_in_memory_1 (const_tree); |
| static bool aarch64_vfp_is_call_or_return_candidate (machine_mode, |
| const_tree, |
| machine_mode *, int *, |
| bool *, bool); |
| static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED; |
| static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED; |
| static void aarch64_override_options_after_change (void); |
| static bool aarch64_vector_mode_supported_p (machine_mode); |
| static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool); |
| static bool aarch64_builtin_support_vector_misalignment (machine_mode mode, |
| const_tree type, |
| int misalignment, |
| bool is_packed); |
| static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64); |
| static bool aarch64_print_address_internal (FILE*, machine_mode, rtx, |
| aarch64_addr_query_type); |
| |
| /* The processor for which instructions should be scheduled. */ |
| enum aarch64_processor aarch64_tune = cortexa53; |
| |
| /* Mask to specify which instruction scheduling options should be used. */ |
| uint64_t aarch64_tune_flags = 0; |
| |
| /* Global flag for PC relative loads. */ |
| bool aarch64_pcrelative_literal_loads; |
| |
| /* Global flag for whether frame pointer is enabled. */ |
| bool aarch64_use_frame_pointer; |
| |
| #define BRANCH_PROTECT_STR_MAX 255 |
| char *accepted_branch_protection_string = NULL; |
| |
| static enum aarch64_parse_opt_result |
| aarch64_parse_branch_protection (const char*, char**); |
| |
| /* Support for command line parsing of boolean flags in the tuning |
| structures. */ |
| struct aarch64_flag_desc |
| { |
| const char* name; |
| unsigned int flag; |
| }; |
| |
| #define AARCH64_FUSION_PAIR(name, internal_name) \ |
| { name, AARCH64_FUSE_##internal_name }, |
| static const struct aarch64_flag_desc aarch64_fusible_pairs[] = |
| { |
| { "none", AARCH64_FUSE_NOTHING }, |
| #include "aarch64-fusion-pairs.def" |
| { "all", AARCH64_FUSE_ALL }, |
| { NULL, AARCH64_FUSE_NOTHING } |
| }; |
| |
| #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \ |
| { name, AARCH64_EXTRA_TUNE_##internal_name }, |
| static const struct aarch64_flag_desc aarch64_tuning_flags[] = |
| { |
| { "none", AARCH64_EXTRA_TUNE_NONE }, |
| #include "aarch64-tuning-flags.def" |
| { "all", AARCH64_EXTRA_TUNE_ALL }, |
| { NULL, AARCH64_EXTRA_TUNE_NONE } |
| }; |
| |
| /* Tuning parameters. */ |
| |
| static const struct cpu_addrcost_table generic_addrcost_table = |
| { |
| { |
| 1, /* hi */ |
| 0, /* si */ |
| 0, /* di */ |
| 1, /* ti */ |
| }, |
| 0, /* pre_modify */ |
| 0, /* post_modify */ |
| 0, /* post_modify_ld3_st3 */ |
| 0, /* post_modify_ld4_st4 */ |
| 0, /* register_offset */ |
| 0, /* register_sextend */ |
| 0, /* register_zextend */ |
| 0 /* imm_offset */ |
| }; |
| |
| static const struct cpu_addrcost_table exynosm1_addrcost_table = |
| { |
| { |
| 0, /* hi */ |
| 0, /* si */ |
| 0, /* di */ |
| 2, /* ti */ |
| }, |
| 0, /* pre_modify */ |
| 0, /* post_modify */ |
| 0, /* post_modify_ld3_st3 */ |
| 0, /* post_modify_ld4_st4 */ |
| 1, /* register_offset */ |
| 1, /* register_sextend */ |
| 2, /* register_zextend */ |
| 0, /* imm_offset */ |
| }; |
| |
| static const struct cpu_addrcost_table xgene1_addrcost_table = |
| { |
| { |
| 1, /* hi */ |
| 0, /* si */ |
| 0, /* di */ |
| 1, /* ti */ |
| }, |
| 1, /* pre_modify */ |
| 1, /* post_modify */ |
| 1, /* post_modify_ld3_st3 */ |
| 1, /* post_modify_ld4_st4 */ |
| 0, /* register_offset */ |
| 1, /* register_sextend */ |
| 1, /* register_zextend */ |
| 0, /* imm_offset */ |
| }; |
| |
| static const struct cpu_addrcost_table thunderx2t99_addrcost_table = |
| { |
| { |
| 1, /* hi */ |
| 1, /* si */ |
| 1, /* di */ |
| 2, /* ti */ |
| }, |
| 0, /* pre_modify */ |
| 0, /* post_modify */ |
| 0, /* post_modify_ld3_st3 */ |
| 0, /* post_modify_ld4_st4 */ |
| 2, /* register_offset */ |
| 3, /* register_sextend */ |
| 3, /* register_zextend */ |
| 0, /* imm_offset */ |
| }; |
| |
| static const struct cpu_addrcost_table thunderx3t110_addrcost_table = |
| { |
| { |
| 1, /* hi */ |
| 1, /* si */ |
| 1, /* di */ |
| 2, /* ti */ |
| }, |
| 0, /* pre_modify */ |
| 0, /* post_modify */ |
| 0, /* post_modify_ld3_st3 */ |
| 0, /* post_modify_ld4_st4 */ |
| 2, /* register_offset */ |
| 3, /* register_sextend */ |
| 3, /* register_zextend */ |
| 0, /* imm_offset */ |
| }; |
| |
| static const struct cpu_addrcost_table tsv110_addrcost_table = |
| { |
| { |
| 1, /* hi */ |
| 0, /* si */ |
| 0, /* di */ |
| 1, /* ti */ |
| }, |
| 0, /* pre_modify */ |
| 0, /* post_modify */ |
| 0, /* post_modify_ld3_st3 */ |
| 0, /* post_modify_ld4_st4 */ |
| 0, /* register_offset */ |
| 1, /* register_sextend */ |
| 1, /* register_zextend */ |
| 0, /* imm_offset */ |
| }; |
| |
| static const struct cpu_addrcost_table qdf24xx_addrcost_table = |
| { |
| { |
| 1, /* hi */ |
| 1, /* si */ |
| 1, /* di */ |
| 2, /* ti */ |
| }, |
| 1, /* pre_modify */ |
| 1, /* post_modify */ |
| 1, /* post_modify_ld3_st3 */ |
| 1, /* post_modify_ld4_st4 */ |
| 3, /* register_offset */ |
| 3, /* register_sextend */ |
| 3, /* register_zextend */ |
| 2, /* imm_offset */ |
| }; |
| |
| static const struct cpu_addrcost_table a64fx_addrcost_table = |
| { |
| { |
| 1, /* hi */ |
| 1, /* si */ |
| 1, /* di */ |
| 2, /* ti */ |
| }, |
| 0, /* pre_modify */ |
| 0, /* post_modify */ |
| 0, /* post_modify_ld3_st3 */ |
| 0, /* post_modify_ld4_st4 */ |
| 2, /* register_offset */ |
| 3, /* register_sextend */ |
| 3, /* register_zextend */ |
| 0, /* imm_offset */ |
| }; |
| |
| static const struct cpu_addrcost_table neoversev1_addrcost_table = |
| { |
| { |
| 1, /* hi */ |
| 0, /* si */ |
| 0, /* di */ |
| 1, /* ti */ |
| }, |
| 0, /* pre_modify */ |
| 0, /* post_modify */ |
| 3, /* post_modify_ld3_st3 */ |
| 3, /* post_modify_ld4_st4 */ |
| 0, /* register_offset */ |
| 0, /* register_sextend */ |
| 0, /* register_zextend */ |
| 0 /* imm_offset */ |
| }; |
| |
| static const struct cpu_addrcost_table neoversen2_addrcost_table = |
| { |
| { |
| 1, /* hi */ |
| 0, /* si */ |
| 0, /* di */ |
| 1, /* ti */ |
| }, |
| 0, /* pre_modify */ |
| 0, /* post_modify */ |
| 2, /* post_modify_ld3_st3 */ |
| 2, /* post_modify_ld4_st4 */ |
| 0, /* register_offset */ |
| 0, /* register_sextend */ |
| 0, /* register_zextend */ |
| 0 /* imm_offset */ |
| }; |
| |
| static const struct cpu_addrcost_table neoversev2_addrcost_table = |
| { |
| { |
| 1, /* hi */ |
| 0, /* si */ |
| 0, /* di */ |
| 1, /* ti */ |
| }, |
| 0, /* pre_modify */ |
| 0, /* post_modify */ |
| 2, /* post_modify_ld3_st3 */ |
| 2, /* post_modify_ld4_st4 */ |
| 0, /* register_offset */ |
| 0, /* register_sextend */ |
| 0, /* register_zextend */ |
| 0 /* imm_offset */ |
| }; |
| |
| static const struct cpu_regmove_cost generic_regmove_cost = |
| { |
| 1, /* GP2GP */ |
| /* Avoid the use of slow int<->fp moves for spilling by setting |
| their cost higher than memmov_cost. */ |
| 5, /* GP2FP */ |
| 5, /* FP2GP */ |
| 2 /* FP2FP */ |
| }; |
| |
| static const struct cpu_regmove_cost cortexa57_regmove_cost = |
| { |
| 1, /* GP2GP */ |
| /* Avoid the use of slow int<->fp moves for spilling by setting |
| their cost higher than memmov_cost. */ |
| 5, /* GP2FP */ |
| 5, /* FP2GP */ |
| 2 /* FP2FP */ |
| }; |
| |
| static const struct cpu_regmove_cost cortexa53_regmove_cost = |
| { |
| 1, /* GP2GP */ |
| /* Avoid the use of slow int<->fp moves for spilling by setting |
| their cost higher than memmov_cost. */ |
| 5, /* GP2FP */ |
| 5, /* FP2GP */ |
| 2 /* FP2FP */ |
| }; |
| |
| static const struct cpu_regmove_cost exynosm1_regmove_cost = |
| { |
| 1, /* GP2GP */ |
| /* Avoid the use of slow int<->fp moves for spilling by setting |
     their cost higher than memmov_cost (the actual costs are 4 and 9).  */
| 9, /* GP2FP */ |
| 9, /* FP2GP */ |
| 1 /* FP2FP */ |
| }; |
| |
| static const struct cpu_regmove_cost thunderx_regmove_cost = |
| { |
| 2, /* GP2GP */ |
| 2, /* GP2FP */ |
| 6, /* FP2GP */ |
| 4 /* FP2FP */ |
| }; |
| |
| static const struct cpu_regmove_cost xgene1_regmove_cost = |
| { |
| 1, /* GP2GP */ |
| /* Avoid the use of slow int<->fp moves for spilling by setting |
| their cost higher than memmov_cost. */ |
| 8, /* GP2FP */ |
| 8, /* FP2GP */ |
| 2 /* FP2FP */ |
| }; |
| |
| static const struct cpu_regmove_cost qdf24xx_regmove_cost = |
| { |
| 2, /* GP2GP */ |
| /* Avoid the use of int<->fp moves for spilling. */ |
| 6, /* GP2FP */ |
| 6, /* FP2GP */ |
| 4 /* FP2FP */ |
| }; |
| |
| static const struct cpu_regmove_cost thunderx2t99_regmove_cost = |
| { |
| 1, /* GP2GP */ |
| /* Avoid the use of int<->fp moves for spilling. */ |
| 5, /* GP2FP */ |
| 6, /* FP2GP */ |
| 3, /* FP2FP */ |
| }; |
| |
| static const struct cpu_regmove_cost thunderx3t110_regmove_cost = |
| { |
| 1, /* GP2GP */ |
| /* Avoid the use of int<->fp moves for spilling. */ |
| 4, /* GP2FP */ |
| 5, /* FP2GP */ |
| 4 /* FP2FP */ |
| }; |
| |
| static const struct cpu_regmove_cost tsv110_regmove_cost = |
| { |
| 1, /* GP2GP */ |
| /* Avoid the use of slow int<->fp moves for spilling by setting |
| their cost higher than memmov_cost. */ |
| 2, /* GP2FP */ |
| 3, /* FP2GP */ |
| 2 /* FP2FP */ |
| }; |
| |
| static const struct cpu_regmove_cost a64fx_regmove_cost = |
| { |
| 1, /* GP2GP */ |
| /* Avoid the use of slow int<->fp moves for spilling by setting |
| their cost higher than memmov_cost. */ |
| 5, /* GP2FP */ |
| 7, /* FP2GP */ |
| 2 /* FP2FP */ |
| }; |
| |
| static const struct cpu_regmove_cost neoversen2_regmove_cost = |
| { |
| 1, /* GP2GP */ |
  /* Spilling through int<->fp moves rather than to memory is preferred,
     so set realistic costs relative to memmov_cost.  */
| 3, /* GP2FP */ |
| 2, /* FP2GP */ |
| 2 /* FP2FP */ |
| }; |
| |
| static const struct cpu_regmove_cost neoversev1_regmove_cost = |
| { |
| 1, /* GP2GP */ |
  /* Spilling through int<->fp moves rather than to memory is preferred,
     so set realistic costs relative to memmov_cost.  */
| 3, /* GP2FP */ |
| 2, /* FP2GP */ |
| 2 /* FP2FP */ |
| }; |
| |
| static const struct cpu_regmove_cost neoversev2_regmove_cost = |
| { |
| 1, /* GP2GP */ |
  /* Spilling through int<->fp moves rather than to memory is preferred,
     so set realistic costs relative to memmov_cost.  */
| 3, /* GP2FP */ |
| 2, /* FP2GP */ |
| 2 /* FP2FP */ |
| }; |
| |
| /* Generic costs for Advanced SIMD vector operations. */ |
| static const advsimd_vec_cost generic_advsimd_vector_cost = |
| { |
| 1, /* int_stmt_cost */ |
| 1, /* fp_stmt_cost */ |
| 0, /* ld2_st2_permute_cost */ |
| 0, /* ld3_st3_permute_cost */ |
| 0, /* ld4_st4_permute_cost */ |
| 2, /* permute_cost */ |
| 2, /* reduc_i8_cost */ |
| 2, /* reduc_i16_cost */ |
| 2, /* reduc_i32_cost */ |
| 2, /* reduc_i64_cost */ |
| 2, /* reduc_f16_cost */ |
| 2, /* reduc_f32_cost */ |
| 2, /* reduc_f64_cost */ |
| 2, /* store_elt_extra_cost */ |
| 2, /* vec_to_scalar_cost */ |
| 1, /* scalar_to_vec_cost */ |
| 1, /* align_load_cost */ |
| 1, /* unalign_load_cost */ |
| 1, /* unalign_store_cost */ |
| 1 /* store_cost */ |
| }; |
| |
| /* Generic costs for SVE vector operations. */ |
| static const sve_vec_cost generic_sve_vector_cost = |
| { |
| { |
| 1, /* int_stmt_cost */ |
| 1, /* fp_stmt_cost */ |
| 0, /* ld2_st2_permute_cost */ |
| 0, /* ld3_st3_permute_cost */ |
| 0, /* ld4_st4_permute_cost */ |
| 2, /* permute_cost */ |
| 2, /* reduc_i8_cost */ |
| 2, /* reduc_i16_cost */ |
| 2, /* reduc_i32_cost */ |
| 2, /* reduc_i64_cost */ |
| 2, /* reduc_f16_cost */ |
| 2, /* reduc_f32_cost */ |
| 2, /* reduc_f64_cost */ |
| 2, /* store_elt_extra_cost */ |
| 2, /* vec_to_scalar_cost */ |
| 1, /* scalar_to_vec_cost */ |
| 1, /* align_load_cost */ |
| 1, /* unalign_load_cost */ |
| 1, /* unalign_store_cost */ |
| 1 /* store_cost */ |
| }, |
| 2, /* clast_cost */ |
| 2, /* fadda_f16_cost */ |
| 2, /* fadda_f32_cost */ |
| 2, /* fadda_f64_cost */ |
| 4, /* gather_load_x32_cost */ |
| 2, /* gather_load_x64_cost */ |
| 1 /* scatter_store_elt_cost */ |
| }; |
| |
| /* Generic costs for vector insn classes. */ |
| static const struct cpu_vector_cost generic_vector_cost = |
| { |
| 1, /* scalar_int_stmt_cost */ |
| 1, /* scalar_fp_stmt_cost */ |
| 1, /* scalar_load_cost */ |
| 1, /* scalar_store_cost */ |
| 3, /* cond_taken_branch_cost */ |
| 1, /* cond_not_taken_branch_cost */ |
| &generic_advsimd_vector_cost, /* advsimd */ |
| &generic_sve_vector_cost, /* sve */ |
| nullptr /* issue_info */ |
| }; |
| |
| static const advsimd_vec_cost a64fx_advsimd_vector_cost = |
| { |
| 2, /* int_stmt_cost */ |
| 5, /* fp_stmt_cost */ |
| 0, /* ld2_st2_permute_cost */ |
| 0, /* ld3_st3_permute_cost */ |
| 0, /* ld4_st4_permute_cost */ |
| 3, /* permute_cost */ |
| 13, /* reduc_i8_cost */ |
| 13, /* reduc_i16_cost */ |
| 13, /* reduc_i32_cost */ |
| 13, /* reduc_i64_cost */ |
| 13, /* reduc_f16_cost */ |
| 13, /* reduc_f32_cost */ |
| 13, /* reduc_f64_cost */ |
| 13, /* store_elt_extra_cost */ |
| 13, /* vec_to_scalar_cost */ |
| 4, /* scalar_to_vec_cost */ |
| 6, /* align_load_cost */ |
| 6, /* unalign_load_cost */ |
| 1, /* unalign_store_cost */ |
| 1 /* store_cost */ |
| }; |
| |
| static const sve_vec_cost a64fx_sve_vector_cost = |
| { |
| { |
| 2, /* int_stmt_cost */ |
| 5, /* fp_stmt_cost */ |
| 0, /* ld2_st2_permute_cost */ |
| 0, /* ld3_st3_permute_cost */ |
| 0, /* ld4_st4_permute_cost */ |
| 3, /* permute_cost */ |
| 13, /* reduc_i8_cost */ |
| 13, /* reduc_i16_cost */ |
| 13, /* reduc_i32_cost */ |
| 13, /* reduc_i64_cost */ |
| 13, /* reduc_f16_cost */ |
| 13, /* reduc_f32_cost */ |
| 13, /* reduc_f64_cost */ |
| 13, /* store_elt_extra_cost */ |
| 13, /* vec_to_scalar_cost */ |
| 4, /* scalar_to_vec_cost */ |
| 6, /* align_load_cost */ |
| 6, /* unalign_load_cost */ |
| 1, /* unalign_store_cost */ |
| 1 /* store_cost */ |
| }, |
| 13, /* clast_cost */ |
| 13, /* fadda_f16_cost */ |
| 13, /* fadda_f32_cost */ |
| 13, /* fadda_f64_cost */ |
| 64, /* gather_load_x32_cost */ |
| 32, /* gather_load_x64_cost */ |
| 1 /* scatter_store_elt_cost */ |
| }; |
| |
| static const struct cpu_vector_cost a64fx_vector_cost = |
| { |
| 1, /* scalar_int_stmt_cost */ |
| 5, /* scalar_fp_stmt_cost */ |
| 4, /* scalar_load_cost */ |
| 1, /* scalar_store_cost */ |
| 3, /* cond_taken_branch_cost */ |
| 1, /* cond_not_taken_branch_cost */ |
| &a64fx_advsimd_vector_cost, /* advsimd */ |
| &a64fx_sve_vector_cost, /* sve */ |
| nullptr /* issue_info */ |
| }; |
| |
| static const advsimd_vec_cost qdf24xx_advsimd_vector_cost = |
| { |
| 1, /* int_stmt_cost */ |
| 3, /* fp_stmt_cost */ |
| 0, /* ld2_st2_permute_cost */ |
| 0, /* ld3_st3_permute_cost */ |
| 0, /* ld4_st4_permute_cost */ |
| 2, /* permute_cost */ |
| 1, /* reduc_i8_cost */ |
| 1, /* reduc_i16_cost */ |
| 1, /* reduc_i32_cost */ |
| 1, /* reduc_i64_cost */ |
| 1, /* reduc_f16_cost */ |
| 1, /* reduc_f32_cost */ |
| 1, /* reduc_f64_cost */ |
| 1, /* store_elt_extra_cost */ |
| 1, /* vec_to_scalar_cost */ |
| 1, /* scalar_to_vec_cost */ |
| 1, /* align_load_cost */ |
| 1, /* unalign_load_cost */ |
| 1, /* unalign_store_cost */ |
| 1 /* store_cost */ |
| }; |
| |
| /* QDF24XX costs for vector insn classes. */ |
| static const struct cpu_vector_cost qdf24xx_vector_cost = |
| { |
| 1, /* scalar_int_stmt_cost */ |
| 1, /* scalar_fp_stmt_cost */ |
| 1, /* scalar_load_cost */ |
| 1, /* scalar_store_cost */ |
| 3, /* cond_taken_branch_cost */ |
| 1, /* cond_not_taken_branch_cost */ |
| &qdf24xx_advsimd_vector_cost, /* advsimd */ |
| nullptr, /* sve */ |
| nullptr /* issue_info */ |
| }; |
| |
| |
| static const advsimd_vec_cost thunderx_advsimd_vector_cost = |
| { |
| 4, /* int_stmt_cost */ |
| 1, /* fp_stmt_cost */ |
| 0, /* ld2_st2_permute_cost */ |
| 0, /* ld3_st3_permute_cost */ |
| 0, /* ld4_st4_permute_cost */ |
| 4, /* permute_cost */ |
| 2, /* reduc_i8_cost */ |
| 2, /* reduc_i16_cost */ |
| 2, /* reduc_i32_cost */ |
| 2, /* reduc_i64_cost */ |
| 2, /* reduc_f16_cost */ |
| 2, /* reduc_f32_cost */ |
| 2, /* reduc_f64_cost */ |
| 2, /* store_elt_extra_cost */ |
| 2, /* vec_to_scalar_cost */ |
| 2, /* scalar_to_vec_cost */ |
| 3, /* align_load_cost */ |
| 5, /* unalign_load_cost */ |
| 5, /* unalign_store_cost */ |
| 1 /* store_cost */ |
| }; |
| |
| /* ThunderX costs for vector insn classes. */ |
| static const struct cpu_vector_cost thunderx_vector_cost = |
| { |
| 1, /* scalar_int_stmt_cost */ |
| 1, /* scalar_fp_stmt_cost */ |
| 3, /* scalar_load_cost */ |
| 1, /* scalar_store_cost */ |
| 3, /* cond_taken_branch_cost */ |
| 3, /* cond_not_taken_branch_cost */ |
| &thunderx_advsimd_vector_cost, /* advsimd */ |
| nullptr, /* sve */ |
| nullptr /* issue_info */ |
| }; |
| |
| static const advsimd_vec_cost tsv110_advsimd_vector_cost = |
| { |
| 2, /* int_stmt_cost */ |
| 2, /* fp_stmt_cost */ |
| 0, /* ld2_st2_permute_cost */ |
| 0, /* ld3_st3_permute_cost */ |
| 0, /* ld4_st4_permute_cost */ |
| 2, /* permute_cost */ |
| 3, /* reduc_i8_cost */ |
| 3, /* reduc_i16_cost */ |
| 3, /* reduc_i32_cost */ |
| 3, /* reduc_i64_cost */ |
| 3, /* reduc_f16_cost */ |
| 3, /* reduc_f32_cost */ |
| 3, /* reduc_f64_cost */ |
| 3, /* store_elt_extra_cost */ |
| 3, /* vec_to_scalar_cost */ |
| 2, /* scalar_to_vec_cost */ |
| 5, /* align_load_cost */ |
| 5, /* unalign_load_cost */ |
| 1, /* unalign_store_cost */ |
| 1 /* store_cost */ |
| }; |
| |
| static const struct cpu_vector_cost tsv110_vector_cost = |
| { |
| 1, /* scalar_int_stmt_cost */ |
| 1, /* scalar_fp_stmt_cost */ |
| 5, /* scalar_load_cost */ |
| 1, /* scalar_store_cost */ |
| 1, /* cond_taken_branch_cost */ |
| 1, /* cond_not_taken_branch_cost */ |
| &tsv110_advsimd_vector_cost, /* advsimd */ |
| nullptr, /* sve */ |
| nullptr /* issue_info */ |
| }; |
| |
| static const advsimd_vec_cost cortexa57_advsimd_vector_cost = |
| { |
| 2, /* int_stmt_cost */ |
| 2, /* fp_stmt_cost */ |
| 0, /* ld2_st2_permute_cost */ |
| 0, /* ld3_st3_permute_cost */ |
| 0, /* ld4_st4_permute_cost */ |
| 3, /* permute_cost */ |
| 8, /* reduc_i8_cost */ |
| 8, /* reduc_i16_cost */ |
| 8, /* reduc_i32_cost */ |
| 8, /* reduc_i64_cost */ |
| 8, /* reduc_f16_cost */ |
| 8, /* reduc_f32_cost */ |
| 8, /* reduc_f64_cost */ |
| 8, /* store_elt_extra_cost */ |
| 8, /* vec_to_scalar_cost */ |
| 8, /* scalar_to_vec_cost */ |
| 4, /* align_load_cost */ |
| 4, /* unalign_load_cost */ |
| 1, /* unalign_store_cost */ |
| 1 /* store_cost */ |
| }; |
| |
| /* Cortex-A57 costs for vector insn classes. */ |
| static const struct cpu_vector_cost cortexa57_vector_cost = |
| { |
| 1, /* scalar_int_stmt_cost */ |
| 1, /* scalar_fp_stmt_cost */ |
| 4, /* scalar_load_cost */ |
| 1, /* scalar_store_cost */ |
| 1, /* cond_taken_branch_cost */ |
| 1, /* cond_not_taken_branch_cost */ |
| &cortexa57_advsimd_vector_cost, /* advsimd */ |
| nullptr, /* sve */ |
| nullptr /* issue_info */ |
| }; |
| |
| static const advsimd_vec_cost exynosm1_advsimd_vector_cost = |
| { |
| 3, /* int_stmt_cost */ |
| 3, /* fp_stmt_cost */ |
| 0, /* ld2_st2_permute_cost */ |
| 0, /* ld3_st3_permute_cost */ |
| 0, /* ld4_st4_permute_cost */ |
| 3, /* permute_cost */ |
| 3, /* reduc_i8_cost */ |
| 3, /* reduc_i16_cost */ |
| 3, /* reduc_i32_cost */ |
| 3, /* reduc_i64_cost */ |
| 3, /* reduc_f16_cost */ |
| 3, /* reduc_f32_cost */ |
| 3, /* reduc_f64_cost */ |
| 3, /* store_elt_extra_cost */ |
| 3, /* vec_to_scalar_cost */ |
| 3, /* scalar_to_vec_cost */ |
| 5, /* align_load_cost */ |
| 5, /* unalign_load_cost */ |
| 1, /* unalign_store_cost */ |
| 1 /* store_cost */ |
| }; |
| |
| static const struct cpu_vector_cost exynosm1_vector_cost = |
| { |
| 1, /* scalar_int_stmt_cost */ |
| 1, /* scalar_fp_stmt_cost */ |
| 5, /* scalar_load_cost */ |
| 1, /* scalar_store_cost */ |
| 1, /* cond_taken_branch_cost */ |
| 1, /* cond_not_taken_branch_cost */ |
| &exynosm1_advsimd_vector_cost, /* advsimd */ |
| nullptr, /* sve */ |
| nullptr /* issue_info */ |
| }; |
| |
| static const advsimd_vec_cost xgene1_advsimd_vector_cost = |
| { |
| 2, /* int_stmt_cost */ |
| 2, /* fp_stmt_cost */ |
| 0, /* ld2_st2_permute_cost */ |
| 0, /* ld3_st3_permute_cost */ |
| 0, /* ld4_st4_permute_cost */ |
| 2, /* permute_cost */ |
| 4, /* reduc_i8_cost */ |
| 4, /* reduc_i16_cost */ |
| 4, /* reduc_i32_cost */ |
| 4, /* reduc_i64_cost */ |
| 4, /* reduc_f16_cost */ |
| 4, /* reduc_f32_cost */ |
| 4, /* reduc_f64_cost */ |
| 4, /* store_elt_extra_cost */ |
| 4, /* vec_to_scalar_cost */ |
| 4, /* scalar_to_vec_cost */ |
| 10, /* align_load_cost */ |
| 10, /* unalign_load_cost */ |
| 2, /* unalign_store_cost */ |
| 2 /* store_cost */ |
| }; |
| |
/* X-Gene 1 costs for vector insn classes.  */
| static const struct cpu_vector_cost xgene1_vector_cost = |
| { |
| 1, /* scalar_int_stmt_cost */ |
| 1, /* scalar_fp_stmt_cost */ |
| 5, /* scalar_load_cost */ |
| 1, /* scalar_store_cost */ |
| 2, /* cond_taken_branch_cost */ |
| 1, /* cond_not_taken_branch_cost */ |
| &xgene1_advsimd_vector_cost, /* advsimd */ |
| nullptr, /* sve */ |
| nullptr /* issue_info */ |
| }; |
| |
| static const advsimd_vec_cost thunderx2t99_advsimd_vector_cost = |
| { |
| 4, /* int_stmt_cost */ |
| 5, /* fp_stmt_cost */ |
| 0, /* ld2_st2_permute_cost */ |
| 0, /* ld3_st3_permute_cost */ |
| 0, /* ld4_st4_permute_cost */ |
| 10, /* permute_cost */ |
| 6, /* reduc_i8_cost */ |
| 6, /* reduc_i16_cost */ |
| 6, /* reduc_i32_cost */ |
| 6, /* reduc_i64_cost */ |
| 6, /* reduc_f16_cost */ |
| 6, /* reduc_f32_cost */ |
| 6, /* reduc_f64_cost */ |
| 6, /* store_elt_extra_cost */ |
| 6, /* vec_to_scalar_cost */ |
| 5, /* scalar_to_vec_cost */ |
| 4, /* align_load_cost */ |
| 4, /* unalign_load_cost */ |
| 1, /* unalign_store_cost */ |
| 1 /* store_cost */ |
| }; |
| |
| /* Costs for vector insn classes for Vulcan. */ |
| static const struct cpu_vector_cost thunderx2t99_vector_cost = |
| { |
| 1, /* scalar_int_stmt_cost */ |
| 6, /* scalar_fp_stmt_cost */ |
| 4, /* scalar_load_cost */ |
| 1, /* scalar_store_cost */ |
| 2, /* cond_taken_branch_cost */ |
| 1, /* cond_not_taken_branch_cost */ |
| &thunderx2t99_advsimd_vector_cost, /* advsimd */ |
| nullptr, /* sve */ |
| nullptr /* issue_info */ |
| }; |
| |
| static const advsimd_vec_cost thunderx3t110_advsimd_vector_cost = |
| { |
| 5, /* int_stmt_cost */ |
| 5, /* fp_stmt_cost */ |
| 0, /* ld2_st2_permute_cost */ |
| 0, /* ld3_st3_permute_cost */ |
| 0, /* ld4_st4_permute_cost */ |
| 10, /* permute_cost */ |
| 5, /* reduc_i8_cost */ |
| 5, /* reduc_i16_cost */ |
| 5, /* reduc_i32_cost */ |
| 5, /* reduc_i64_cost */ |
| 5, /* reduc_f16_cost */ |
| 5, /* reduc_f32_cost */ |
| 5, /* reduc_f64_cost */ |
| 5, /* store_elt_extra_cost */ |
| 5, /* vec_to_scalar_cost */ |
| 5, /* scalar_to_vec_cost */ |
| 4, /* align_load_cost */ |
| 4, /* unalign_load_cost */ |
| 4, /* unalign_store_cost */ |
| 4 /* store_cost */ |
| }; |
| |
| static const struct cpu_vector_cost thunderx3t110_vector_cost = |
| { |
| 1, /* scalar_int_stmt_cost */ |
| 5, /* scalar_fp_stmt_cost */ |
| 4, /* scalar_load_cost */ |
| 1, /* scalar_store_cost */ |
| 2, /* cond_taken_branch_cost */ |
| 1, /* cond_not_taken_branch_cost */ |
| &thunderx3t110_advsimd_vector_cost, /* advsimd */ |
| nullptr, /* sve */ |
| nullptr /* issue_info */ |
| }; |
| |
| static const advsimd_vec_cost ampere1_advsimd_vector_cost = |
| { |
| 3, /* int_stmt_cost */ |
| 3, /* fp_stmt_cost */ |
| 0, /* ld2_st2_permute_cost */ |
| 0, /* ld3_st3_permute_cost */ |
| 0, /* ld4_st4_permute_cost */ |
| 2, /* permute_cost */ |
| 12, /* reduc_i8_cost */ |
| 9, /* reduc_i16_cost */ |
| 6, /* reduc_i32_cost */ |
| 5, /* reduc_i64_cost */ |
| 9, /* reduc_f16_cost */ |
| 6, /* reduc_f32_cost */ |
| 5, /* reduc_f64_cost */ |
| 8, /* store_elt_extra_cost */ |
| 6, /* vec_to_scalar_cost */ |
| 7, /* scalar_to_vec_cost */ |
| 5, /* align_load_cost */ |
| 5, /* unalign_load_cost */ |
| 2, /* unalign_store_cost */ |
| 2 /* store_cost */ |
| }; |
| |
| /* Ampere-1 costs for vector insn classes. */ |
| static const struct cpu_vector_cost ampere1_vector_cost = |
| { |
| 1, /* scalar_int_stmt_cost */ |
| 1, /* scalar_fp_stmt_cost */ |
| 4, /* scalar_load_cost */ |
| 1, /* scalar_store_cost */ |
| 1, /* cond_taken_branch_cost */ |
| 1, /* cond_not_taken_branch_cost */ |
  &ampere1_advsimd_vector_cost, /* advsimd */
| nullptr, /* sve */ |
| nullptr /* issue_info */ |
| }; |
| |
| /* Generic costs for branch instructions. */ |
| static const struct cpu_branch_cost generic_branch_cost = |
| { |
| 1, /* Predictable. */ |
| 3 /* Unpredictable. */ |
| }; |
| |
| /* Generic approximation modes. */ |
| static const cpu_approx_modes generic_approx_modes = |
| { |
| AARCH64_APPROX_NONE, /* division */ |
| AARCH64_APPROX_NONE, /* sqrt */ |
| AARCH64_APPROX_NONE /* recip_sqrt */ |
| }; |
| |
| /* Approximation modes for Exynos M1. */ |
| static const cpu_approx_modes exynosm1_approx_modes = |
| { |
| AARCH64_APPROX_NONE, /* division */ |
| AARCH64_APPROX_ALL, /* sqrt */ |
| AARCH64_APPROX_ALL /* recip_sqrt */ |
| }; |
| |
| /* Approximation modes for X-Gene 1. */ |
| static const cpu_approx_modes xgene1_approx_modes = |
| { |
| AARCH64_APPROX_NONE, /* division */ |
| AARCH64_APPROX_NONE, /* sqrt */ |
| AARCH64_APPROX_ALL /* recip_sqrt */ |
| }; |
| |
| /* Generic prefetch settings (which disable prefetch). */ |
| static const cpu_prefetch_tune generic_prefetch_tune = |
| { |
| 0, /* num_slots */ |
| -1, /* l1_cache_size */ |
| -1, /* l1_cache_line_size */ |
| -1, /* l2_cache_size */ |
| true, /* prefetch_dynamic_strides */ |
| -1, /* minimum_stride */ |
| -1 /* default_opt_level */ |
| }; |
| |
| static const cpu_prefetch_tune exynosm1_prefetch_tune = |
| { |
| 0, /* num_slots */ |
| -1, /* l1_cache_size */ |
| 64, /* l1_cache_line_size */ |
| -1, /* l2_cache_size */ |
| true, /* prefetch_dynamic_strides */ |
| -1, /* minimum_stride */ |
| -1 /* default_opt_level */ |
| }; |
| |
| static const cpu_prefetch_tune qdf24xx_prefetch_tune = |
| { |
| 4, /* num_slots */ |
| 32, /* l1_cache_size */ |
| 64, /* l1_cache_line_size */ |
| 512, /* l2_cache_size */ |
| false, /* prefetch_dynamic_strides */ |
| 2048, /* minimum_stride */ |
| 3 /* default_opt_level */ |
| }; |
| |
| static const cpu_prefetch_tune thunderxt88_prefetch_tune = |
| { |
| 8, /* num_slots */ |
| 32, /* l1_cache_size */ |
| 128, /* l1_cache_line_size */ |
| 16*1024, /* l2_cache_size */ |
| true, /* prefetch_dynamic_strides */ |
| -1, /* minimum_stride */ |
| 3 /* default_opt_level */ |
| }; |
| |
| static const cpu_prefetch_tune thunderx_prefetch_tune = |
| { |
| 8, /* num_slots */ |
| 32, /* l1_cache_size */ |
| 128, /* l1_cache_line_size */ |
| -1, /* l2_cache_size */ |
| true, /* prefetch_dynamic_strides */ |
| -1, /* minimum_stride */ |
| -1 /* default_opt_level */ |
| }; |
| |
| static const cpu_prefetch_tune thunderx2t99_prefetch_tune = |
| { |
| 8, /* num_slots */ |
| 32, /* l1_cache_size */ |
| 64, /* l1_cache_line_size */ |
| 256, /* l2_cache_size */ |
| true, /* prefetch_dynamic_strides */ |
| -1, /* minimum_stride */ |
| -1 /* default_opt_level */ |
| }; |
| |
| static const cpu_prefetch_tune thunderx3t110_prefetch_tune = |
| { |
| 8, /* num_slots */ |
| 32, /* l1_cache_size */ |
| 64, /* l1_cache_line_size */ |
| 256, /* l2_cache_size */ |
| true, /* prefetch_dynamic_strides */ |
| -1, /* minimum_stride */ |
| -1 /* default_opt_level */ |
| }; |
| |
| static const cpu_prefetch_tune tsv110_prefetch_tune = |
| { |
| 0, /* num_slots */ |
| 64, /* l1_cache_size */ |
| 64, /* l1_cache_line_size */ |
| 512, /* l2_cache_size */ |
| true, /* prefetch_dynamic_strides */ |
| -1, /* minimum_stride */ |
| -1 /* default_opt_level */ |
| }; |
| |
| static const cpu_prefetch_tune xgene1_prefetch_tune = |
| { |
| 8, /* num_slots */ |
| 32, /* l1_cache_size */ |
| 64, /* l1_cache_line_size */ |
| 256, /* l2_cache_size */ |
| true, /* prefetch_dynamic_strides */ |
| -1, /* minimum_stride */ |
| -1 /* default_opt_level */ |
| }; |
| |
| static const cpu_prefetch_tune a64fx_prefetch_tune = |
| { |
| 8, /* num_slots */ |
| 64, /* l1_cache_size */ |
| 256, /* l1_cache_line_size */ |
| 32768, /* l2_cache_size */ |
| true, /* prefetch_dynamic_strides */ |
| -1, /* minimum_stride */ |
| -1 /* default_opt_level */ |
| }; |
| |
| static const cpu_prefetch_tune ampere1_prefetch_tune = |
| { |
| 0, /* num_slots */ |
| 64, /* l1_cache_size */ |
| 64, /* l1_cache_line_size */ |
| 2048, /* l2_cache_size */ |
| true, /* prefetch_dynamic_strides */ |
| -1, /* minimum_stride */ |
| -1 /* default_opt_level */ |
| }; |
| |
| static const struct tune_params generic_tunings = |
| { |
| &cortexa57_extra_costs, |
| &generic_addrcost_table, |
| &generic_regmove_cost, |
| &generic_vector_cost, |
| &generic_branch_cost, |
| &generic_approx_modes, |
| SVE_NOT_IMPLEMENTED, /* sve_width */ |
| { 4, /* load_int. */ |
| 4, /* store_int. */ |
| 4, /* load_fp. */ |
| 4, /* store_fp. */ |
| 4, /* load_pred. */ |
| 4 /* store_pred. */ |
| }, /* memmov_cost. */ |
| 2, /* issue_rate */ |
| (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */ |
| "16:12", /* function_align. */ |
| "4", /* jump_align. */ |
| "8", /* loop_align. */ |
| 2, /* int_reassoc_width. */ |
| 4, /* fp_reassoc_width. */ |
| 1, /* fma_reassoc_width. */ |
| 1, /* vec_reassoc_width. */ |
| 2, /* min_div_recip_mul_sf. */ |
| 2, /* min_div_recip_mul_df. */ |
| 0, /* max_case_values. */ |
| tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ |
| /* Enabling AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS significantly benefits |
| Neoverse V1. It does not have a noticeable effect on A64FX and should |
| have at most a very minor effect on SVE2 cores. */ |
| (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS), /* tune_flags. */ |
| &generic_prefetch_tune |
| }; |
| |
| static const struct tune_params cortexa35_tunings = |
| { |
| &cortexa53_extra_costs, |
| &generic_addrcost_table, |
| &cortexa53_regmove_cost, |
| &generic_vector_cost, |
| &generic_branch_cost, |
| &generic_approx_modes, |
| SVE_NOT_IMPLEMENTED, /* sve_width */ |
| { 4, /* load_int. */ |
| 4, /* store_int. */ |
| 4, /* load_fp. */ |
| 4, /* store_fp. */ |
| 4, /* load_pred. */ |
| 4 /* store_pred. */ |
| }, /* memmov_cost. */ |
| 1, /* issue_rate */ |
| (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD |
| | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */ |
| "16", /* function_align. */ |
| "4", /* jump_align. */ |
| "8", /* loop_align. */ |
| 2, /* int_reassoc_width. */ |
| 4, /* fp_reassoc_width. */ |
| 1, /* fma_reassoc_width. */ |
| 1, /* vec_reassoc_width. */ |
| 2, /* min_div_recip_mul_sf. */ |
| 2, /* min_div_recip_mul_df. */ |
| 0, /* max_case_values. */ |
| tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ |
| (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ |
| &generic_prefetch_tune |
| }; |
| |
| static const struct tune_params cortexa53_tunings = |
| { |
| &cortexa53_extra_costs, |
| &generic_addrcost_table, |
| &cortexa53_regmove_cost, |
| &generic_vector_cost, |
| &generic_branch_cost, |
| &generic_approx_modes, |
| SVE_NOT_IMPLEMENTED, /* sve_width */ |
| { 4, /* load_int. */ |
| 4, /* store_int. */ |
| 4, /* load_fp. */ |
| 4, /* store_fp. */ |
| 4, /* load_pred. */ |
| 4 /* store_pred. */ |
| }, /* memmov_cost. */ |
| 2, /* issue_rate */ |
| (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD |
| | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */ |
| "16", /* function_align. */ |
| "4", /* jump_align. */ |
| "8", /* loop_align. */ |
| 2, /* int_reassoc_width. */ |
| 4, /* fp_reassoc_width. */ |
| 1, /* fma_reassoc_width. */ |
| 1, /* vec_reassoc_width. */ |
| 2, /* min_div_recip_mul_sf. */ |
| 2, /* min_div_recip_mul_df. */ |
| 0, /* max_case_values. */ |
| tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ |
| (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ |
| &generic_prefetch_tune |
| }; |
| |
| static const struct tune_params cortexa57_tunings = |
| { |
| &cortexa57_extra_costs, |
| &generic_addrcost_table, |
| &cortexa57_regmove_cost, |
| &cortexa57_vector_cost, |
| &generic_branch_cost, |
| &generic_approx_modes, |
| SVE_NOT_IMPLEMENTED, /* sve_width */ |
| { 4, /* load_int. */ |
| 4, /* store_int. */ |
| 4, /* load_fp. */ |
| 4, /* store_fp. */ |
| 4, /* load_pred. */ |
| 4 /* store_pred. */ |
| }, /* memmov_cost. */ |
| 3, /* issue_rate */ |
| (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD |
| | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */ |
| "16", /* function_align. */ |
| "4", /* jump_align. */ |
| "8", /* loop_align. */ |
| 2, /* int_reassoc_width. */ |
| 4, /* fp_reassoc_width. */ |
| 1, /* fma_reassoc_width. */ |
| 1, /* vec_reassoc_width. */ |
| 2, /* min_div_recip_mul_sf. */ |
| 2, /* min_div_recip_mul_df. */ |
| 0, /* max_case_values. */ |
| tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ |
| (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */ |
| &generic_prefetch_tune |
| }; |
| |
| static const struct tune_params cortexa72_tunings = |
| { |
| &cortexa57_extra_costs, |
| &generic_addrcost_table, |
| &cortexa57_regmove_cost, |
| &cortexa57_vector_cost, |
| &generic_branch_cost, |
| &generic_approx_modes, |
| SVE_NOT_IMPLEMENTED, /* sve_width */ |
| { 4, /* load_int. */ |
| 4, /* store_int. */ |
| 4, /* load_fp. */ |
| 4, /* store_fp. */ |
| 4, /* load_pred. */ |
| 4 /* store_pred. */ |
| }, /* memmov_cost. */ |
| 3, /* issue_rate */ |
| (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD |
| | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */ |
| "16", /* function_align. */ |
| "4", /* jump_align. */ |
| "8", /* loop_align. */ |
| 2, /* int_reassoc_width. */ |
| 4, /* fp_reassoc_width. */ |
| 1, /* fma_reassoc_width. */ |
| 1, /* vec_reassoc_width. */ |
| 2, /* min_div_recip_mul_sf. */ |
| 2, /* min_div_recip_mul_df. */ |
| 0, /* max_case_values. */ |
| tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ |
| (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ |
| &generic_prefetch_tune |
| }; |
| |
| static const struct tune_params cortexa73_tunings = |
| { |
| &cortexa57_extra_costs, |
| &generic_addrcost_table, |
| &cortexa57_regmove_cost, |
| &cortexa57_vector_cost, |
| &generic_branch_cost, |
| &generic_approx_modes, |
| SVE_NOT_IMPLEMENTED, /* sve_width */ |
| { 4, /* load_int. */ |
| 4, /* store_int. */ |
| 4, /* load_fp. */ |
| 4, /* store_fp. */ |
| 4, /* load_pred. */ |
| 4 /* store_pred. */ |
| }, /* memmov_cost. */ |
| 2, /* issue_rate. */ |
| (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD |
| | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */ |
| "16", /* function_align. */ |
| "4", /* jump_align. */ |
| "8", /* loop_align. */ |
| 2, /* int_reassoc_width. */ |
| 4, /* fp_reassoc_width. */ |
| 1, /* fma_reassoc_width. */ |
| 1, /* vec_reassoc_width. */ |
| 2, /* min_div_recip_mul_sf. */ |
| 2, /* min_div_recip_mul_df. */ |
| 0, /* max_case_values. */ |
| tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ |
| (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ |
| &generic_prefetch_tune |
| }; |
| |
| |
| |
| static const struct tune_params exynosm1_tunings = |
| { |
| &exynosm1_extra_costs, |
| &exynosm1_addrcost_table, |
| &exynosm1_regmove_cost, |
| &exynosm1_vector_cost, |
| &generic_branch_cost, |
| &exynosm1_approx_modes, |
| SVE_NOT_IMPLEMENTED, /* sve_width */ |
| { 4, /* load_int. */ |
| 4, /* store_int. */ |
| 4, /* load_fp. */ |
| 4, /* store_fp. */ |
| 4, /* load_pred. */ |
| 4 /* store_pred. */ |
| }, /* memmov_cost. */ |
| 3, /* issue_rate */ |
| (AARCH64_FUSE_AES_AESMC), /* fusible_ops */ |
| "4", /* function_align. */ |
| "4", /* jump_align. */ |
| "4", /* loop_align. */ |
| 2, /* int_reassoc_width. */ |
| 4, /* fp_reassoc_width. */ |
| 1, /* fma_reassoc_width. */ |
| 1, /* vec_reassoc_width. */ |
| 2, /* min_div_recip_mul_sf. */ |
| 2, /* min_div_recip_mul_df. */ |
| 48, /* max_case_values. */ |
| tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ |
| (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ |
| &exynosm1_prefetch_tune |
| }; |
| |
| static const struct tune_params thunderxt88_tunings = |
| { |
| &thunderx_extra_costs, |
| &generic_addrcost_table, |
| &thunderx_regmove_cost, |
| &thunderx_vector_cost, |
| &generic_branch_cost, |
| &generic_approx_modes, |
| SVE_NOT_IMPLEMENTED, /* sve_width */ |
| { 6, /* load_int. */ |
| 6, /* store_int. */ |
| 6, /* load_fp. */ |
| 6, /* store_fp. */ |
| 6, /* load_pred. */ |
| 6 /* store_pred. */ |
| }, /* memmov_cost. */ |
| 2, /* issue_rate */ |
| AARCH64_FUSE_ALU_BRANCH, /* fusible_ops */ |
| "8", /* function_align. */ |
| "8", /* jump_align. */ |
| "8", /* loop_align. */ |
| 2, /* int_reassoc_width. */ |
| 4, /* fp_reassoc_width. */ |
| 1, /* fma_reassoc_width. */ |
| 1, /* vec_reassoc_width. */ |
| 2, /* min_div_recip_mul_sf. */ |
| 2, /* min_div_recip_mul_df. */ |
| 0, /* max_case_values. */ |
| tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */ |
| (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */ |
| &thunderxt88_prefetch_tune |
| }; |
| |
| static const struct tune_params thunderx_tunings = |
| { |
| &thunderx_extra_costs, |
| &generic_addrcost_table, |
| &thunderx_regmove_cost, |
| &thunderx_vector_cost, |
| &generic_branch_cost, |
| &generic_approx_modes, |
| SVE_NOT_IMPLEMENTED, /* sve_width */ |
| { 6, /* load_int. */ |
| 6, /* store_int. */ |
| 6, /* load_fp. */ |
| 6, /* store_fp. */ |
| 6, /* load_pred. */ |
| 6 /* store_pred. */ |
| }, /* memmov_cost. */ |
| 2, /* issue_rate */ |
| AARCH64_FUSE_ALU_BRANCH, /* fusible_ops */ |
| "8", /* function_align. */ |
| "8", /* jump_align. */ |
| "8", /* loop_align. */ |
| 2, /* int_reassoc_width. */ |
| 4, /* fp_reassoc_width. */ |
| 1, /* fma_reassoc_width. */ |
| 1, /* vec_reassoc_width. */ |
| 2, /* min_div_recip_mul_sf. */ |
| 2, /* min_div_recip_mul_df. */ |
| 0, /* max_case_values. */ |
| tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */ |
| (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW |
| | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */ |
| &thunderx_prefetch_tune |
| }; |
| |
| static const struct tune_params tsv110_tunings = |
| { |
| &tsv110_extra_costs, |
| &tsv110_addrcost_table, |
| &tsv110_regmove_cost, |
| &tsv110_vector_cost, |
| &generic_branch_cost, |
| &generic_approx_modes, |
| SVE_NOT_IMPLEMENTED, /* sve_width */ |
| { 4, /* load_int. */ |
| 4, /* store_int. */ |
| 4, /* load_fp. */ |
| 4, /* store_fp. */ |
| 4, /* load_pred. */ |
| 4 /* store_pred. */ |
| }, /* memmov_cost. */ |
| 4, /* issue_rate */ |
| (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_ALU_BRANCH |
| | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */ |
| "16", /* function_align. */ |
| "4", /* jump_align. */ |
| "8", /* loop_align. */ |
| 2, /* int_reassoc_width. */ |
| 4, /* fp_reassoc_width. */ |
| 1, /* fma_reassoc_width. */ |
| 1, /* vec_reassoc_width. */ |
| 2, /* min_div_recip_mul_sf. */ |
| 2, /* min_div_recip_mul_df. */ |
| 0, /* max_case_values. */ |
| tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ |
| (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ |
| &tsv110_prefetch_tune |
| }; |
| |
| static const struct tune_params xgene1_tunings = |
| { |
| &xgene1_extra_costs, |
| &xgene1_addrcost_table, |
| &xgene1_regmove_cost, |
| &xgene1_vector_cost, |
| &generic_branch_cost, |
| &xgene1_approx_modes, |
| SVE_NOT_IMPLEMENTED, /* sve_width */ |
| { 6, /* load_int. */ |
| 6, /* store_int. */ |
| 6, /* load_fp. */ |
| 6, /* store_fp. */ |
| 6, /* load_pred. */ |
| 6 /* store_pred. */ |
| }, /* memmov_cost. */ |
| 4, /* issue_rate */ |
| AARCH64_FUSE_NOTHING, /* fusible_ops */ |
| "16", /* function_align. */ |
| "16", /* jump_align. */ |
| "16", /* loop_align. */ |
| 2, /* int_reassoc_width. */ |
| 4, /* fp_reassoc_width. */ |
| 1, /* fma_reassoc_width. */ |
| 1, /* vec_reassoc_width. */ |
| 2, /* min_div_recip_mul_sf. */ |
| 2, /* min_div_recip_mul_df. */ |
| 17, /* max_case_values. */ |
| tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */ |
| (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */ |
| &xgene1_prefetch_tune |
| }; |
| |
| static const struct tune_params emag_tunings = |
| { |
| &xgene1_extra_costs, |
| &xgene1_addrcost_table, |
| &xgene1_regmove_cost, |
| &xgene1_vector_cost, |
| &generic_branch_cost, |
| &xgene1_approx_modes, |
| SVE_NOT_IMPLEMENTED, |
| { 6, /* load_int. */ |
| 6, /* store_int. */ |
| 6, /* load_fp. */ |
| 6, /* store_fp. */ |
| 6, /* load_pred. */ |
| 6 /* store_pred. */ |
| }, /* memmov_cost. */ |
| 4, /* issue_rate */ |
| AARCH64_FUSE_NOTHING, /* fusible_ops */ |
| "16", /* function_align. */ |
| "16", /* jump_align. */ |
| "16", /* loop_align. */ |
| 2, /* int_reassoc_width. */ |
| 4, /* fp_reassoc_width. */ |
| 1, /* fma_reassoc_width. */ |
| 1, /* vec_reassoc_width. */ |
| 2, /* min_div_recip_mul_sf. */ |
| 2, /* min_div_recip_mul_df. */ |
| 17, /* max_case_values. */ |
| tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */ |
| (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */ |
| &xgene1_prefetch_tune |
| }; |
| |
| static const struct tune_params qdf24xx_tunings = |
| { |
| &qdf24xx_extra_costs, |
| &qdf24xx_addrcost_table, |
| &qdf24xx_regmove_cost, |
| &qdf24xx_vector_cost, |
| &generic_branch_cost, |
| &generic_approx_modes, |
| SVE_NOT_IMPLEMENTED, /* sve_width */ |
| { 4, /* load_int. */ |
| 4, /* store_int. */ |
| 4, /* load_fp. */ |
| 4, /* store_fp. */ |
| 4, /* load_pred. */ |
| 4 /* store_pred. */ |
| }, /* memmov_cost. */ |
| 4, /* issue_rate */ |
| (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD |
   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
| "16", /* function_align. */ |
| "8", /* jump_align. */ |
| "16", /* loop_align. */ |
| 2, /* int_reassoc_width. */ |
| 4, /* fp_reassoc_width. */ |
| 1, /* fma_reassoc_width. */ |
| 1, /* vec_reassoc_width. */ |
| 2, /* min_div_recip_mul_sf. */ |
| 2, /* min_div_recip_mul_df. */ |
| 0, /* max_case_values. */ |
| tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ |
| AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags. */ |
| &qdf24xx_prefetch_tune |
| }; |
| |
| /* Tuning structure for the Qualcomm Saphira core. Default to falkor values |
| for now. */ |
| static const struct tune_params saphira_tunings = |
| { |
| &generic_extra_costs, |
| &generic_addrcost_table, |
| &generic_regmove_cost, |
| &generic_vector_cost, |
| &generic_branch_cost, |
| &generic_approx_modes, |
| SVE_NOT_IMPLEMENTED, /* sve_width */ |
| { 4, /* load_int. */ |
| 4, /* store_int. */ |
| 4, /* load_fp. */ |
| 4, /* store_fp. */ |
| 4, /* load_pred. */ |
| 4 /* store_pred. */ |
| }, /* memmov_cost. */ |
| 4, /* issue_rate */ |
| (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD |
   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
| "16", /* function_align. */ |
| "8", /* jump_align. */ |
| "16", /* loop_align. */ |
| 2, /* int_reassoc_width. */ |
| 4, /* fp_reassoc_width. */ |
| 1, /* fma_reassoc_width. */ |
| 1, /* vec_reassoc_width. */ |
| 2, /* min_div_recip_mul_sf. */ |
| 2, /* min_div_recip_mul_df. */ |
| 0, /* max_case_values. */ |
| tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ |
| (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ |
| &generic_prefetch_tune |
| }; |
| |
| static const struct tune_params thunderx2t99_tunings = |
| { |
| &thunderx2t99_extra_costs, |
| &thunderx2t99_addrcost_table, |
| &thunderx2t99_regmove_cost, |
| &thunderx2t99_vector_cost, |
| &generic_branch_cost, |
| &generic_approx_modes, |
| SVE_NOT_IMPLEMENTED, /* sve_width */ |
| { 4, /* load_int. */ |
| 4, /* store_int. */ |
| 4, /* load_fp. */ |
| 4, /* store_fp. */ |
| 4, /* load_pred. */ |
| 4 /* store_pred. */ |
| }, /* memmov_cost. */ |
| 4, /* issue_rate. */ |
| (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC |
| | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */ |
| "16", /* function_align. */ |
| "8", /* jump_align. */ |
| "16", /* loop_align. */ |
| 3, /* int_reassoc_width. */ |
| 2, /* fp_reassoc_width. */ |
| 1, /* fma_reassoc_width. */ |
| 2, /* vec_reassoc_width. */ |
| 2, /* min_div_recip_mul_sf. */ |
| 2, /* min_div_recip_mul_df. */ |
| 0, /* max_case_values. */ |
| tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ |
| (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ |
| &thunderx2t99_prefetch_tune |
| }; |
| |
| static const struct tune_params thunderx3t110_tunings = |
| { |
| &thunderx3t110_extra_costs, |
| &thunderx3t110_addrcost_table, |
| &thunderx3t110_regmove_cost, |
| &thunderx3t110_vector_cost, |
| &generic_branch_cost, |
| &generic_approx_modes, |
| SVE_NOT_IMPLEMENTED, /* sve_width */ |
| { 4, /* load_int. */ |
| 4, /* store_int. */ |
| 4, /* load_fp. */ |
| 4, /* store_fp. */ |
| 4, /* load_pred. */ |
| 4 /* store_pred. */ |
| }, /* memmov_cost. */ |
| 6, /* issue_rate. */ |
| (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC |
| | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */ |
| "16", /* function_align. */ |
| "8", /* jump_align. */ |
| "16", /* loop_align. */ |
| 3, /* int_reassoc_width. */ |
| 2, /* fp_reassoc_width. */ |
| 1, /* fma_reassoc_width. */ |
| 2, /* vec_reassoc_width. */ |
| 2, /* min_div_recip_mul_sf. */ |
| 2, /* min_div_recip_mul_df. */ |
| 0, /* max_case_values. */ |
| tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ |
| (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ |
| &thunderx3t110_prefetch_tune |
| }; |
| |
| static const struct tune_params neoversen1_tunings = |
| { |
| &cortexa76_extra_costs, |
| &generic_addrcost_table, |
| &generic_regmove_cost, |
| &cortexa57_vector_cost, |
| &generic_branch_cost, |
| &generic_approx_modes, |
| SVE_NOT_IMPLEMENTED, /* sve_width */ |
| { 4, /* load_int. */ |
| 2, /* store_int. */ |
| 5, /* load_fp. */ |
| 2, /* store_fp. */ |
| 4, /* load_pred. */ |
| 4 /* store_pred. */ |
| }, /* memmov_cost. */ |
| 3, /* issue_rate */ |
| (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */ |
| "32:16", /* function_align. */ |
| "4", /* jump_align. */ |
| "32:16", /* loop_align. */ |
| 2, /* int_reassoc_width. */ |
| 4, /* fp_reassoc_width. */ |
| 1, /* fma_reassoc_width. */ |
| 2, /* vec_reassoc_width. */ |
| 2, /* min_div_recip_mul_sf. */ |
| 2, /* min_div_recip_mul_df. */ |
| 0, /* max_case_values. */ |
| tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ |
| (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */ |
| &generic_prefetch_tune |
| }; |
| |
| static const struct tune_params ampere1_tunings = |
| { |
  &ampere1_extra_costs,
| &generic_addrcost_table, |
| &generic_regmove_cost, |
  &ampere1_vector_cost,
| &generic_branch_cost, |
| &generic_approx_modes, |
| SVE_NOT_IMPLEMENTED, /* sve_width */ |
| { 4, /* load_int. */ |
| 4, /* store_int. */ |
| 4, /* load_fp. */ |
| 4, /* store_fp. */ |
| 4, /* load_pred. */ |
| 4 /* store_pred. */ |
| }, /* memmov_cost. */ |
| 4, /* issue_rate */ |
| (AARCH64_FUSE_ADRP_ADD | AARCH64_FUSE_AES_AESMC | |
| AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_MOVK_MOVK | |
| AARCH64_FUSE_ALU_BRANCH /* adds, ands, bics, ccmp, ccmn */ | |
| AARCH64_FUSE_CMP_BRANCH), |
| /* fusible_ops */ |
| "32", /* function_align. */ |
| "4", /* jump_align. */ |
| "32:16", /* loop_align. */ |
| 2, /* int_reassoc_width. */ |
| 4, /* fp_reassoc_width. */ |
| 1, /* fma_reassoc_width. */ |
| 2, /* vec_reassoc_width. */ |
| 2, /* min_div_recip_mul_sf. */ |
| 2, /* min_div_recip_mul_df. */ |
| 0, /* max_case_values. */ |
| tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ |
| (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ |
  &ampere1_prefetch_tune
| }; |
| |
| static const struct tune_params ampere1a_tunings = |
| { |
  &ampere1a_extra_costs,
| &generic_addrcost_table, |
| &generic_regmove_cost, |
  &ampere1_vector_cost,
| &generic_branch_cost, |
| &generic_approx_modes, |
| SVE_NOT_IMPLEMENTED, /* sve_width */ |
| { 4, /* load_int. */ |
| 4, /* store_int. */ |
| 4, /* load_fp. */ |
| 4, /* store_fp. */ |
| 4, /* load_pred. */ |
| 4 /* store_pred. */ |
| }, /* memmov_cost. */ |
| 4, /* issue_rate */ |
| (AARCH64_FUSE_ADRP_ADD | AARCH64_FUSE_AES_AESMC | |
| AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_MOVK_MOVK | |
| AARCH64_FUSE_ALU_BRANCH /* adds, ands, bics, ccmp, ccmn */ | |
| AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_ALU_CBZ | |
| AARCH64_FUSE_ADDSUB_2REG_CONST1), |
| /* fusible_ops */ |
| "32", /* function_align. */ |
| "4", /* jump_align. */ |
| "32:16", /* loop_align. */ |
| 2, /* int_reassoc_width. */ |
| 4, /* fp_reassoc_width. */ |
| 1, /* fma_reassoc_width. */ |
| 2, /* vec_reassoc_width. */ |
| 2, /* min_div_recip_mul_sf. */ |
| 2, /* min_div_recip_mul_df. */ |
| 0, /* max_case_values. */ |
| tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ |
| (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ |
| &ampere1_prefetch_tune |
| }; |
| |
| static const advsimd_vec_cost neoversev1_advsimd_vector_cost = |
| { |
| 2, /* int_stmt_cost */ |
| 2, /* fp_stmt_cost */ |
| 4, /* ld2_st2_permute_cost */ |
| 4, /* ld3_st3_permute_cost */ |
| 5, /* ld4_st4_permute_cost */ |
| 3, /* permute_cost */ |
| 4, /* reduc_i8_cost */ |
| 4, /* reduc_i16_cost */ |
| 2, /* reduc_i32_cost */ |
| 2, /* reduc_i64_cost */ |
| 6, /* reduc_f16_cost */ |
| 3, /* reduc_f32_cost */ |
| 2, /* reduc_f64_cost */ |
| 2, /* store_elt_extra_cost */ |
| /* This value is just inherited from the Cortex-A57 table. */ |
| 8, /* vec_to_scalar_cost */ |
| /* This depends very much on what the scalar value is and |
| where it comes from. E.g. some constants take two dependent |
| instructions or a load, while others might be moved from a GPR. |
| 4 seems to be a reasonable compromise in practice. */ |
| 4, /* scalar_to_vec_cost */ |
| 4, /* align_load_cost */ |
| 4, /* unalign_load_cost */ |
| /* Although stores have a latency of 2 and compete for the |
| vector pipes, in practice it's better not to model that. */ |
| 1, /* unalign_store_cost */ |
| 1 /* store_cost */ |
| }; |
| |
| static const sve_vec_cost neoversev1_sve_vector_cost = |
| { |
| { |
| 2, /* int_stmt_cost */ |
| 2, /* fp_stmt_cost */ |
| 4, /* ld2_st2_permute_cost */ |
| 7, /* ld3_st3_permute_cost */ |
| 8, /* ld4_st4_permute_cost */ |
| 3, /* permute_cost */ |
| /* Theoretically, a reduction involving 31 scalar ADDs could |
| complete in ~9 cycles and would have a cost of 31. [SU]ADDV |
| completes in 14 cycles, so give it a cost of 31 + 5. */ |
| 36, /* reduc_i8_cost */ |
| /* Likewise for 15 scalar ADDs (~5 cycles) vs. 12: 15 + 7. */ |
| 22, /* reduc_i16_cost */ |
| /* Likewise for 7 scalar ADDs (~3 cycles) vs. 10: 7 + 7. */ |
| 14, /* reduc_i32_cost */ |
| /* Likewise for 3 scalar ADDs (~2 cycles) vs. 10: 3 + 8. */ |
| 11, /* reduc_i64_cost */ |
| /* Theoretically, a reduction involving 15 scalar FADDs could |
| complete in ~9 cycles and would have a cost of 30. FADDV |
| completes in 13 cycles, so give it a cost of 30 + 4. */ |
| 34, /* reduc_f16_cost */ |
| /* Likewise for 7 scalar FADDs (~6 cycles) vs. 11: 14 + 5. */ |
| 19, /* reduc_f32_cost */ |
| /* Likewise for 3 scalar FADDs (~4 cycles) vs. 9: 6 + 5. */ |
| 11, /* reduc_f64_cost */ |
| 2, /* store_elt_extra_cost */ |
| /* This value is just inherited from the Cortex-A57 table. */ |
| 8, /* vec_to_scalar_cost */ |
| /* See the comment above the Advanced SIMD versions. */ |
| 4, /* scalar_to_vec_cost */ |
| 4, /* align_load_cost */ |
| 4, /* unalign_load_cost */ |
| /* Although stores have a latency of 2 and compete for the |
| vector pipes, in practice it's better not to model that. */ |
| 1, /* unalign_store_cost */ |
| 1 /* store_cost */ |
| }, |
| 3, /* clast_cost */ |
| 19, /* fadda_f16_cost */ |
| 11, /* fadda_f32_cost */ |
| 8, /* fadda_f64_cost */ |
| 32, /* gather_load_x32_cost */ |
| 16, /* gather_load_x64_cost */ |
| 3 /* scatter_store_elt_cost */ |
| }; |
| |
| static const aarch64_scalar_vec_issue_info neoversev1_scalar_issue_info = |
| { |
| 3, /* loads_stores_per_cycle */ |
| 2, /* stores_per_cycle */ |
| 4, /* general_ops_per_cycle */ |
| 0, /* fp_simd_load_general_ops */ |
| 1 /* fp_simd_store_general_ops */ |
| }; |
| |
| static const aarch64_advsimd_vec_issue_info neoversev1_advsimd_issue_info = |
| { |
| { |
| 3, /* loads_stores_per_cycle */ |
| 2, /* stores_per_cycle */ |
| 4, /* general_ops_per_cycle */ |
| 0, /* fp_simd_load_general_ops */ |
| 1 /* fp_simd_store_general_ops */ |
| }, |
| 2, /* ld2_st2_general_ops */ |
| 2, /* ld3_st3_general_ops */ |
| 3 /* ld4_st4_general_ops */ |
| }; |
| |
| static const aarch64_sve_vec_issue_info neoversev1_sve_issue_info = |
| { |
| { |
| { |
| 2, /* loads_per_cycle */ |
| 2, /* stores_per_cycle */ |
| 2, /* general_ops_per_cycle */ |
| 0, /* fp_simd_load_general_ops */ |
| 1 /* fp_simd_store_general_ops */ |
| }, |
| 2, /* ld2_st2_general_ops */ |
| 2, /* ld3_st3_general_ops */ |
| 3 /* ld4_st4_general_ops */ |
| }, |
| 1, /* pred_ops_per_cycle */ |
| 2, /* while_pred_ops */ |
| 2, /* int_cmp_pred_ops */ |
| 1, /* fp_cmp_pred_ops */ |
| 1, /* gather_scatter_pair_general_ops */ |
| 1 /* gather_scatter_pair_pred_ops */ |
| }; |
| |
| static const aarch64_vec_issue_info neoversev1_vec_issue_info = |
| { |
| &neoversev1_scalar_issue_info, |
| &neoversev1_advsimd_issue_info, |
| &neoversev1_sve_issue_info |
| }; |
| |
| /* Neoverse V1 costs for vector insn classes. */ |
| static const struct cpu_vector_cost neoversev1_vector_cost = |
| { |
| 1, /* scalar_int_stmt_cost */ |
| 2, /* scalar_fp_stmt_cost */ |
| 4, /* scalar_load_cost */ |
| 1, /* scalar_store_cost */ |
| 1, /* cond_taken_branch_cost */ |
| 1, /* cond_not_taken_branch_cost */ |
| &neoversev1_advsimd_vector_cost, /* advsimd */ |
| &neoversev1_sve_vector_cost, /* sve */ |
| &neoversev1_vec_issue_info /* issue_info */ |
| }; |
| |
| static const struct tune_params neoversev1_tunings = |
| { |
| &cortexa76_extra_costs, |
| &neoversev1_addrcost_table, |
| &neoversev1_regmove_cost, |
| &neoversev1_vector_cost, |
| &generic_branch_cost, |
| &generic_approx_modes, |
| SVE_256, /* sve_width */ |
| { 4, /* load_int. */ |
| 2, /* store_int. */ |
| 6, /* load_fp. */ |
| 2, /* store_fp. */ |
| 6, /* load_pred. */ |
| 1 /* store_pred. */ |
| }, /* memmov_cost. */ |
| 3, /* issue_rate */ |
| (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */ |
| "32:16", /* function_align. */ |
| "4", /* jump_align. */ |
| "32:16", /* loop_align. */ |
| 2, /* int_reassoc_width. */ |
| 4, /* fp_reassoc_width. */ |
| 4, /* fma_reassoc_width. */ |
| 2, /* vec_reassoc_width. */ |
| 2, /* min_div_recip_mul_sf. */ |
| 2, /* min_div_recip_mul_df. */ |
| 0, /* max_case_values. */ |
| tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ |
| (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS |
| | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS |
| | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT |
| | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */ |
| &generic_prefetch_tune |
| }; |
| |
| static const sve_vec_cost neoverse512tvb_sve_vector_cost = |
| { |
| { |
| 2, /* int_stmt_cost */ |
| 2, /* fp_stmt_cost */ |
| 4, /* ld2_st2_permute_cost */ |
| 5, /* ld3_st3_permute_cost */ |
| 5, /* ld4_st4_permute_cost */ |
| 3, /* permute_cost */ |
| /* Theoretically, a reduction involving 15 scalar ADDs could |
| complete in ~5 cycles and would have a cost of 15. Assume that |
| [SU]ADDV completes in 11 cycles and so give it a cost of 15 + 6. */ |
| 21, /* reduc_i8_cost */ |
| /* Likewise for 7 scalar ADDs (~3 cycles) vs. 9: 7 + 6. */ |
| 13, /* reduc_i16_cost */ |
| /* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 6. */ |
| 9, /* reduc_i32_cost */ |
| /* Likewise for 1 scalar ADD (1 cycle) vs. 8: 1 + 7. */ |
| 8, /* reduc_i64_cost */ |
| /* Theoretically, a reduction involving 7 scalar FADDs could |
| complete in ~6 cycles and would have a cost of 14. Assume that |
| FADDV completes in 8 cycles and so give it a cost of 14 + 2. */ |
| 16, /* reduc_f16_cost */ |
| /* Likewise for 3 scalar FADDs (~4 cycles) vs. 6: 6 + 2. */ |
| 8, /* reduc_f32_cost */ |
| /* Likewise for 1 scalar FADD (2 cycles) vs. 4: 2 + 2. */ |
| 4, /* reduc_f64_cost */ |
| 2, /* store_elt_extra_cost */ |
| /* This value is just inherited from the Cortex-A57 table. */ |
| 8, /* vec_to_scalar_cost */ |
| /* This depends very much on what the scalar value is and |
| where it comes from. E.g. some constants take two dependent |
| instructions or a load, while others might be moved from a GPR. |
| 4 seems to be a reasonable compromise in practice. */ |
| 4, /* scalar_to_vec_cost */ |
| 4, /* align_load_cost */ |
| 4, /* unalign_load_cost */ |
| /* Although stores generally have a latency of 2 and compete for the |
| vector pipes, in practice it's better not to model that. */ |
| 1, /* unalign_store_cost */ |
| 1 /* store_cost */ |
| }, |
| 3, /* clast_cost */ |
| 10, /* fadda_f16_cost */ |
| 6, /* fadda_f32_cost */ |
| 4, /* fadda_f64_cost */ |
| /* A strided Advanced SIMD x64 load would take two parallel FP loads |
| (6 cycles) plus an insertion (2 cycles). Assume a 64-bit SVE gather |
| is 1 cycle more. The Advanced SIMD version is costed as 2 scalar loads |
| (cost 8) and a vec_construct (cost 2). Add a full vector operation |
| (cost 2) to that, to avoid the difference being lost in rounding. |
| |
| There is no easy comparison between a strided Advanced SIMD x32 load |
| and an SVE 32-bit gather, but cost an SVE 32-bit gather as 1 vector |
| operation more than a 64-bit gather. */ |
| 14, /* gather_load_x32_cost */ |
| 12, /* gather_load_x64_cost */ |
| 3 /* scatter_store_elt_cost */ |
| }; |
| |
| static const aarch64_sve_vec_issue_info neoverse512tvb_sve_issue_info = |
| { |
| { |
| { |
| 3, /* loads_per_cycle */ |
| 2, /* stores_per_cycle */ |
| 4, /* general_ops_per_cycle */ |
| 0, /* fp_simd_load_general_ops */ |
| 1 /* fp_simd_store_general_ops */ |
| }, |
| 2, /* ld2_st2_general_ops */ |
| 2, /* ld3_st3_general_ops */ |
| 3 /* ld4_st4_general_ops */ |
| }, |
| 2, /* pred_ops_per_cycle */ |
| 2, /* while_pred_ops */ |
| 2, /* int_cmp_pred_ops */ |
| 1, /* fp_cmp_pred_ops */ |
| 1, /* gather_scatter_pair_general_ops */ |
| 1 /* gather_scatter_pair_pred_ops */ |
| }; |
| |
| static const aarch64_vec_issue_info neoverse512tvb_vec_issue_info = |
| { |
| &neoversev1_scalar_issue_info, |
| &neoversev1_advsimd_issue_info, |
| &neoverse512tvb_sve_issue_info |
| }; |
| |
| static const struct cpu_vector_cost neoverse512tvb_vector_cost = |
| { |
| 1, /* scalar_int_stmt_cost */ |
| 2, /* scalar_fp_stmt_cost */ |
| 4, /* scalar_load_cost */ |
| 1, /* scalar_store_cost */ |
| 1, /* cond_taken_branch_cost */ |
| 1, /* cond_not_taken_branch_cost */ |
| &neoversev1_advsimd_vector_cost, /* advsimd */ |
| &neoverse512tvb_sve_vector_cost, /* sve */ |
| &neoverse512tvb_vec_issue_info /* issue_info */ |
| }; |
| |
| static const struct tune_params neoverse512tvb_tunings = |
| { |
| &cortexa76_extra_costs, |
| &neoversev1_addrcost_table, |
| &neoversev1_regmove_cost, |
| &neoverse512tvb_vector_cost, |
| &generic_branch_cost, |
| &generic_approx_modes, |
| SVE_128 | SVE_256, /* sve_width */ |
| { 4, /* load_int. */ |
| 2, /* store_int. */ |
| 6, /* load_fp. */ |
| 2, /* store_fp. */ |
| 6, /* load_pred. */ |
| 1 /* store_pred. */ |
| }, /* memmov_cost. */ |
| 3, /* issue_rate */ |
| (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */ |
| "32:16", /* function_align. */ |
| "4", /* jump_align. */ |
| "32:16", /* loop_align. */ |
| 2, /* int_reassoc_width. */ |
| 4, /* fp_reassoc_width. */ |
| 4, /* fma_reassoc_width. */ |
| 2, /* vec_reassoc_width. */ |
| 2, /* min_div_recip_mul_sf. */ |
| 2, /* min_div_recip_mul_df. */ |
| 0, /* max_case_values. */ |
| tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ |
| (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS |
| | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS |
| | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT), /* tune_flags. */ |
| &generic_prefetch_tune |
| }; |
| |
| static const advsimd_vec_cost neoversen2_advsimd_vector_cost = |
| { |
| 2, /* int_stmt_cost */ |
| 2, /* fp_stmt_cost */ |
| 2, /* ld2_st2_permute_cost */ |
| 2, /* ld3_st3_permute_cost */ |
| 3, /* ld4_st4_permute_cost */ |
| 3, /* permute_cost */ |
| 4, /* reduc_i8_cost */ |
| 4, /* reduc_i16_cost */ |
| 2, /* reduc_i32_cost */ |
| 2, /* reduc_i64_cost */ |
| 6, /* reduc_f16_cost */ |
| 4, /* reduc_f32_cost */ |
| 2, /* reduc_f64_cost */ |
| 2, /* store_elt_extra_cost */ |
| /* This value is just inherited from the Cortex-A57 table. */ |
| 8, /* vec_to_scalar_cost */ |
| /* This depends very much on what the scalar value is and |
| where it comes from. E.g. some constants take two dependent |
| instructions or a load, while others might be moved from a GPR. |
| 4 seems to be a reasonable compromise in practice. */ |
| 4, /* scalar_to_vec_cost */ |
| 4, /* align_load_cost */ |
| 4, /* unalign_load_cost */ |
| /* Although stores have a latency of 2 and compete for the |
| vector pipes, in practice it's better not to model that. */ |
| 1, /* unalign_store_cost */ |
| 1 /* store_cost */ |
| }; |
| |
| static const sve_vec_cost neoversen2_sve_vector_cost = |
| { |
| { |
| 2, /* int_stmt_cost */ |
| 2, /* fp_stmt_cost */ |
| 3, /* ld2_st2_permute_cost */ |
| 4, /* ld3_st3_permute_cost */ |
| 4, /* ld4_st4_permute_cost */ |
| 3, /* permute_cost */ |
| /* Theoretically, a reduction involving 15 scalar ADDs could |
| complete in ~5 cycles and would have a cost of 15. [SU]ADDV |
| completes in 11 cycles, so give it a cost of 15 + 6. */ |
| 21, /* reduc_i8_cost */ |
| /* Likewise for 7 scalar ADDs (~3 cycles) vs. 9: 7 + 6. */ |
| 13, /* reduc_i16_cost */ |
| /* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 6. */ |
| 9, /* reduc_i32_cost */ |
| /* Likewise for 1 scalar ADD (~1 cycles) vs. 2: 1 + 1. */ |
| 2, /* reduc_i64_cost */ |
| /* Theoretically, a reduction involving 7 scalar FADDs could |
| complete in ~8 cycles and would have a cost of 14. FADDV |
| completes in 6 cycles, so give it a cost of 14 - 2. */ |
| 12, /* reduc_f16_cost */ |
| /* Likewise for 3 scalar FADDs (~4 cycles) vs. 4: 6 - 0. */ |
| 6, /* reduc_f32_cost */ |
| /* Likewise for 1 scalar FADD (~2 cycles) vs. 2: 2 - 0. */ |
| 2, /* reduc_f64_cost */ |
| 2, /* store_elt_extra_cost */ |
| /* This value is just inherited from the Cortex-A57 table. */ |
| 8, /* vec_to_scalar_cost */ |
| /* See the comment above the Advanced SIMD versions. */ |
| 4, /* scalar_to_vec_cost */ |
| 4, /* align_load_cost */ |
| 4, /* unalign_load_cost */ |
| /* Although stores have a latency of 2 and compete for the |
| vector pipes, in practice it's better not to model that. */ |
| 1, /* unalign_store_cost */ |
| 1 /* store_cost */ |
| }, |
| 3, /* clast_cost */ |
| 10, /* fadda_f16_cost */ |
| 6, /* fadda_f32_cost */ |
| 4, /* fadda_f64_cost */ |
| /* A strided Advanced SIMD x64 load would take two parallel FP loads |
| (8 cycles) plus an insertion (2 cycles). Assume a 64-bit SVE gather |
| is 1 cycle more. The Advanced SIMD version is costed as 2 scalar loads |
| (cost 8) and a vec_construct (cost 2). Add a full vector operation |
| (cost 2) to that, to avoid the difference being lost in rounding. |
| |
| There is no easy comparison between a strided Advanced SIMD x32 load |
| and an SVE 32-bit gather, but cost an SVE 32-bit gather as 1 vector |
| operation more than a 64-bit gather. */ |
| 14, /* gather_load_x32_cost */ |
| 12, /* gather_load_x64_cost */ |
| 3 /* scatter_store_elt_cost */ |
| }; |
| |
| static const aarch64_scalar_vec_issue_info neoversen2_scalar_issue_info = |
| { |
| 3, /* loads_stores_per_cycle */ |
| 2, /* stores_per_cycle */ |
| 4, /* general_ops_per_cycle */ |
| 0, /* fp_simd_load_general_ops */ |
| 1 /* fp_simd_store_general_ops */ |
| }; |
| |
| static const aarch64_advsimd_vec_issue_info neoversen2_advsimd_issue_info = |
| { |
| { |
| 3, /* loads_stores_per_cycle */ |
| 2, /* stores_per_cycle */ |
| 2, /* general_ops_per_cycle */ |
| 0, /* fp_simd_load_general_ops */ |
| 1 /* fp_simd_store_general_ops */ |
| }, |
| 2, /* ld2_st2_general_ops */ |
| 2, /* ld3_st3_general_ops */ |
| 3 /* ld4_st4_general_ops */ |
| }; |
| |
| static const aarch64_sve_vec_issue_info neoversen2_sve_issue_info = |
| { |
| { |
| { |
| 3, /* loads_per_cycle */ |
| 2, /* stores_per_cycle */ |
| 2, /* general_ops_per_cycle */ |
| 0, /* fp_simd_load_general_ops */ |
| 1 /* fp_simd_store_general_ops */ |
| }, |
| 2, /* ld2_st2_general_ops */ |
| 3, /* ld3_st3_general_ops */ |
| 3 /* ld4_st4_general_ops */ |
| }, |
| 2, /* pred_ops_per_cycle */ |
| 2, /* while_pred_ops */ |
| 2, /* int_cmp_pred_ops */ |
| 1, /* fp_cmp_pred_ops */ |
| 1, /* gather_scatter_pair_general_ops */ |
| 1 /* gather_scatter_pair_pred_ops */ |
| }; |
| |
| static const aarch64_vec_issue_info neoversen2_vec_issue_info = |
| { |
| &neoversen2_scalar_issue_info, |
| &neoversen2_advsimd_issue_info, |
| &neoversen2_sve_issue_info |
| }; |
| |
| /* Neoverse N2 costs for vector insn classes. */ |
| static const struct cpu_vector_cost neoversen2_vector_cost = |
| { |
| 1, /* scalar_int_stmt_cost */ |
| 2, /* scalar_fp_stmt_cost */ |
| 4, /* scalar_load_cost */ |
| 1, /* scalar_store_cost */ |
| 1, /* cond_taken_branch_cost */ |
| 1, /* cond_not_taken_branch_cost */ |
| &neoversen2_advsimd_vector_cost, /* advsimd */ |
| &neoversen2_sve_vector_cost, /* sve */ |
| &neoversen2_vec_issue_info /* issue_info */ |
| }; |
| |
| static const struct tune_params neoversen2_tunings = |
| { |
| &cortexa76_extra_costs, |
| &neoversen2_addrcost_table, |
| &neoversen2_regmove_cost, |
| &neoversen2_vector_cost, |
| &generic_branch_cost, |
| &generic_approx_modes, |
| SVE_128, /* sve_width */ |
| { 4, /* load_int. */ |
| 1, /* store_int. */ |
| 6, /* load_fp. */ |
| 2, /* store_fp. */ |
| 6, /* load_pred. */ |
| 1 /* store_pred. */ |
| }, /* memmov_cost. */ |
| 3, /* issue_rate */ |
| (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */ |
| "32:16", /* function_align. */ |
| "4", /* jump_align. */ |
| "32:16", /* loop_align. */ |
| 2, /* int_reassoc_width. */ |
| 4, /* fp_reassoc_width. */ |
| 1, /* fma_reassoc_width. */ |
| 2, /* vec_reassoc_width. */ |
| 2, /* min_div_recip_mul_sf. */ |
| 2, /* min_div_recip_mul_df. */ |
| 0, /* max_case_values. */ |
| tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ |
| (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND |
| | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS |
| | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS |
| | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT), /* tune_flags. */ |
| &generic_prefetch_tune |
| }; |
| |
| static const advsimd_vec_cost neoversev2_advsimd_vector_cost = |
| { |
| 2, /* int_stmt_cost */ |
| 2, /* fp_stmt_cost */ |
| 2, /* ld2_st2_permute_cost */ |
| 2, /* ld3_st3_permute_cost */ |
| 3, /* ld4_st4_permute_cost */ |
| 3, /* permute_cost */ |
| 4, /* reduc_i8_cost */ |
| 4, /* reduc_i16_cost */ |
| 2, /* reduc_i32_cost */ |
| 2, /* reduc_i64_cost */ |
| 6, /* reduc_f16_cost */ |
| 3, /* reduc_f32_cost */ |
| 2, /* reduc_f64_cost */ |
| 2, /* store_elt_extra_cost */ |
| /* This value is just inherited from the Cortex-A57 table. */ |
| 8, /* vec_to_scalar_cost */ |
| /* This depends very much on what the scalar value is and |
| where it comes from. E.g. some constants take two dependent |
| instructions or a load, while others might be moved from a GPR. |
| 4 seems to be a reasonable compromise in practice. */ |
| 4, /* scalar_to_vec_cost */ |
| 4, /* align_load_cost */ |
| 4, /* unalign_load_cost */ |
| /* Although stores have a latency of 2 and compete for the |
| vector pipes, in practice it's better not to model that. */ |
| 1, /* unalign_store_cost */ |
| 1 /* store_cost */ |
| }; |
| |
| static const sve_vec_cost neoversev2_sve_vector_cost = |
| { |
| { |
| 2, /* int_stmt_cost */ |
| 2, /* fp_stmt_cost */ |
| 3, /* ld2_st2_permute_cost */ |
| 3, /* ld3_st3_permute_cost */ |
| 4, /* ld4_st4_permute_cost */ |
| 3, /* permute_cost */ |
| /* Theoretically, a reduction involving 15 scalar ADDs could |
| complete in ~3 cycles and would have a cost of 15. [SU]ADDV |
| completes in 11 cycles, so give it a cost of 15 + 8. */ |
| 21, /* reduc_i8_cost */ |
| /* Likewise for 7 scalar ADDs (~2 cycles) vs. 9: 7 + 7. */ |
| 14, /* reduc_i16_cost */ |
| /* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 4. */ |
| 7, /* reduc_i32_cost */ |
| /* Likewise for 1 scalar ADD (~1 cycles) vs. 2: 1 + 1. */ |
| 2, /* reduc_i64_cost */ |
| /* Theoretically, a reduction involving 7 scalar FADDs could |
| complete in ~6 cycles and would have a cost of 14. FADDV |
| completes in 8 cycles, so give it a cost of 14 + 2. */ |
| 16, /* reduc_f16_cost */ |
| /* Likewise for 3 scalar FADDs (~4 cycles) vs. 6: 6 + 2. */ |
| 8, /* reduc_f32_cost */ |
| /* Likewise for 1 scalar FADD (~2 cycles) vs. 4: 2 + 2. */ |
| 4, /* reduc_f64_cost */ |
| 2, /* store_elt_extra_cost */ |
| /* This value is just inherited from the Cortex-A57 table. */ |
| 8, /* vec_to_scalar_cost */ |
| /* See the comment above the Advanced SIMD versions. */ |
| 4, /* scalar_to_vec_cost */ |
| 4, /* align_load_cost */ |
| 4, /* unalign_load_cost */ |
| /* Although stores have a latency of 2 and compete for the |
| vector pipes, in practice it's better not to model that. */ |
| 1, /* unalign_store_cost */ |
| 1 /* store_cost */ |
| }, |
| 3, /* clast_cost */ |
| 10, /* fadda_f16_cost */ |
| 6, /* fadda_f32_cost */ |
| 4, /* fadda_f64_cost */ |
| /* A strided Advanced SIMD x64 load would take two parallel FP loads |
| (8 cycles) plus an insertion (2 cycles). Assume a 64-bit SVE gather |
| is 1 cycle more. The Advanced SIMD version is costed as 2 scalar loads |
| (cost 8) and a vec_construct (cost 2). Add a full vector operation |
| (cost 2) to that, to avoid the difference being lost in rounding. |
| |
| There is no easy comparison between a strided Advanced SIMD x32 load |
| and an SVE 32-bit gather, but cost an SVE 32-bit gather as 1 vector |
| operation more than a 64-bit gather. */ |
| 14, /* gather_load_x32_cost */ |
| 12, /* gather_load_x64_cost */ |
| 3 /* scatter_store_elt_cost */ |
| }; |
| |
| static const aarch64_scalar_vec_issue_info neoversev2_scalar_issue_info = |
| { |
| 3, /* loads_stores_per_cycle */ |
| 2, /* stores_per_cycle */ |
| 6, /* general_ops_per_cycle */ |
| 0, /* fp_simd_load_general_ops */ |
| 1 /* fp_simd_store_general_ops */ |
| }; |
| |
| static const aarch64_advsimd_vec_issue_info neoversev2_advsimd_issue_info = |
| { |
| { |
| 3, /* loads_stores_per_cycle */ |
| 2, /* stores_per_cycle */ |
| 4, /* general_ops_per_cycle */ |
| 0, /* fp_simd_load_general_ops */ |
| 1 /* fp_simd_store_general_ops */ |
| }, |
| 2, /* ld2_st2_general_ops */ |
| 2, /* ld3_st3_general_ops */ |
| 3 /* ld4_st4_general_ops */ |
| }; |
| |
| static const aarch64_sve_vec_issue_info neoversev2_sve_issue_info = |
| { |
| { |
| { |
| 3, /* loads_per_cycle */ |
| 2, /* stores_per_cycle */ |
| 4, /* general_ops_per_cycle */ |
| 0, /* fp_simd_load_general_ops */ |
| 1 /* fp_simd_store_general_ops */ |
| }, |
| 2, /* ld2_st2_general_ops */ |
| 3, /* ld3_st3_general_ops */ |
| 3 /* ld4_st4_general_ops */ |
| }, |
| 2, /* pred_ops_per_cycle */ |
| 2, /* while_pred_ops */ |
| 2, /* int_cmp_pred_ops */ |
| 1, /* fp_cmp_pred_ops */ |
| 1, /* gather_scatter_pair_general_ops */ |
| 1 /* gather_scatter_pair_pred_ops */ |
| }; |
| |
| static const aarch64_vec_issue_info neoversev2_vec_issue_info = |
| { |
| &neoversev2_scalar_issue_info, |
| &neoversev2_advsimd_issue_info, |
| &neoversev2_sve_issue_info |
| }; |
| |
| /* Neoverse V2 costs for vector insn classes. */ |
| static const struct cpu_vector_cost neoversev2_vector_cost = |
| { |
| 1, /* scalar_int_stmt_cost */ |
| 2, /* scalar_fp_stmt_cost */ |
| 4, /* scalar_load_cost */ |
| 1, /* scalar_store_cost */ |
| 1, /* cond_taken_branch_cost */ |
| 1, /* cond_not_taken_branch_cost */ |
| &neoversev2_advsimd_vector_cost, /* advsimd */ |
| &neoversev2_sve_vector_cost, /* sve */ |
| &neoversev2_vec_issue_info /* issue_info */ |
| }; |
| |
| static const struct tune_params neoversev2_tunings = |
| { |
| &cortexa76_extra_costs, |
| &neoversev2_addrcost_table, |
| &neoversev2_regmove_cost, |
| &neoversev2_vector_cost, |
| &generic_branch_cost, |
| &generic_approx_modes, |
| SVE_128, /* sve_width */ |
| { 4, /* load_int. */ |
| 2, /* store_int. */ |
| 6, /* load_fp. */ |
| 1, /* store_fp. */ |
| 6, /* load_pred. */ |
| 2 /* store_pred. */ |
| }, /* memmov_cost. */ |
| 5, /* issue_rate */ |
| (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */ |
| "32:16", /* function_align. */ |
| "4", /* jump_align. */ |
| "32:16", /* loop_align. */ |
| 3, /* int_reassoc_width. */ |
| 6, /* fp_reassoc_width. */ |
| 4, /* fma_reassoc_width. */ |
| 3, /* vec_reassoc_width. */ |
| 2, /* min_div_recip_mul_sf. */ |
| 2, /* min_div_recip_mul_df. */ |
| 0, /* max_case_values. */ |
| tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ |
| (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND |
| | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS |
| | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS |
| | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT), /* tune_flags. */ |
| &generic_prefetch_tune |
| }; |
| |
| static const struct tune_params a64fx_tunings = |
| { |
| &a64fx_extra_costs, |
| &a64fx_addrcost_table, |
| &a64fx_regmove_cost, |
| &a64fx_vector_cost, |
| &generic_branch_cost, |
| &generic_approx_modes, |
| SVE_512, /* sve_width */ |
| { 4, /* load_int. */ |
| 4, /* store_int. */ |
| 4, /* load_fp. */ |
| 4, /* store_fp. */ |
| 4, /* load_pred. */ |
| 4 /* store_pred. */ |
| }, /* memmov_cost. */ |
| 7, /* issue_rate */ |
| (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */ |
| "32", /* function_align. */ |
| "16", /* jump_align. */ |
| "32", /* loop_align. */ |
| 4, /* int_reassoc_width. */ |
| 2, /* fp_reassoc_width. */ |
| 1, /* fma_reassoc_width. */ |
| 2, /* vec_reassoc_width. */ |
| 2, /* min_div_recip_mul_sf. */ |
| 2, /* min_div_recip_mul_df. */ |
| 0, /* max_case_values. */ |
| tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ |
| (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ |
| &a64fx_prefetch_tune |
| }; |
| |
| /* Support for fine-grained override of the tuning structures. */ |
| struct aarch64_tuning_override_function |
| { |
| const char* name; |
| void (*parse_override)(const char*, struct tune_params*); |
| }; |
| |
| static void aarch64_parse_fuse_string (const char*, struct tune_params*); |
| static void aarch64_parse_tune_string (const char*, struct tune_params*); |
| static void aarch64_parse_sve_width_string (const char*, struct tune_params*); |
| |
| static const struct aarch64_tuning_override_function |
| aarch64_tuning_override_functions[] = |
| { |
| { "fuse", aarch64_parse_fuse_string }, |
| { "tune", aarch64_parse_tune_string }, |
| { "sve_width", aarch64_parse_sve_width_string }, |
| { NULL, NULL } |
| }; |
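| |
| /* Illustrative note: these entries back the -moverride option. A |
| setting such as "sve_width=256" within that option is matched by name |
| against the table above and the text after the '=' is handed to the |
| corresponding parser (here aarch64_parse_sve_width_string) together |
| with the tune_params structure being overridden; the full option |
| syntax is described in the GCC manual rather than here. */ |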
| |
| /* A processor implementing AArch64. */ |
| struct processor |
| { |
| const char *name; |
| aarch64_processor ident; |
| aarch64_processor sched_core; |
| aarch64_arch arch; |
| aarch64_feature_flags flags; |
| const tune_params *tune; |
| }; |
| |
| /* Architectures implementing AArch64. */ |
| static CONSTEXPR const processor all_architectures[] = |
| { |
| #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, D, E) \ |
| {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, \ |
| feature_deps::ARCH_IDENT ().enable, NULL}, |
| #include "aarch64-arches.def" |
| {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, NULL} |
| }; |
| |
| /* Processor cores implementing AArch64. */ |
| static const struct processor all_cores[] = |
| { |
| #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, E, COSTS, G, H, I) \ |
| {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \ |
| feature_deps::cpu_##IDENT, &COSTS##_tunings}, |
| #include "aarch64-cores.def" |
| {"generic", generic, cortexa53, AARCH64_ARCH_V8A, |
| feature_deps::V8A ().enable, &generic_tunings}, |
| {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, NULL} |
| }; |
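| |
| /* Illustrative expansion of one all_cores entry, assuming a |
| hypothetical aarch64-cores.def line naming "cortex-a53" with ident |
| cortexa53 and tuning table cortexa53_tunings: |
| |
| {"cortex-a53", cortexa53, cortexa53, AARCH64_ARCH_V8A, |
| feature_deps::cpu_cortexa53, &cortexa53_tunings}, |
| |
| i.e. each entry carries the user-visible name, the core identifier, |
| the core to schedule for, the architecture, the enabled feature set |
| and the tuning parameters. */ |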
| |
| /* The current tuning set. */ |
| struct tune_params aarch64_tune_params = generic_tunings; |
| |
| /* Check whether an 'aarch64_vector_pcs' attribute is valid. */ |
| |
| static tree |
| handle_aarch64_vector_pcs_attribute (tree *node, tree name, tree, |
| int, bool *no_add_attrs) |
| { |
| /* Since we set fn_type_req to true, the caller should have checked |
| this for us. */ |
| gcc_assert (FUNC_OR_METHOD_TYPE_P (*node)); |
| switch ((arm_pcs) fntype_abi (*node).id ()) |
| { |
| case ARM_PCS_AAPCS64: |
| case ARM_PCS_SIMD: |
| return NULL_TREE; |
| |
| case ARM_PCS_SVE: |
| error ("the %qE attribute cannot be applied to an SVE function type", |
| name); |
| *no_add_attrs = true; |
| return NULL_TREE; |
| |
| case ARM_PCS_TLSDESC: |
| case ARM_PCS_UNKNOWN: |
| break; |
| } |
| gcc_unreachable (); |
| } |
| |
| /* Table of machine attributes. */ |
| static const struct attribute_spec aarch64_attribute_table[] = |
| { |
| /* { name, min_len, max_len, decl_req, type_req, fn_type_req, |
| affects_type_identity, handler, exclude } */ |
| { "aarch64_vector_pcs", 0, 0, false, true, true, true, |
| handle_aarch64_vector_pcs_attribute, NULL }, |
| { "arm_sve_vector_bits", 1, 1, false, true, false, true, |
| aarch64_sve::handle_arm_sve_vector_bits_attribute, |
| NULL }, |
| { "Advanced SIMD type", 1, 1, false, true, false, true, NULL, NULL }, |
| { "SVE type", 3, 3, false, true, false, true, NULL, NULL }, |
| { "SVE sizeless type", 0, 0, false, true, false, true, NULL, NULL }, |
| { NULL, 0, 0, false, false, false, false, NULL, NULL } |
| }; |
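| |
| /* For illustration (user-level usage, not part of the table itself): |
| the first entry corresponds to declarations such as |
| |
| void f (float *x) __attribute__ ((aarch64_vector_pcs)); |
| |
| which request the ARM_PCS_SIMD calling convention validated by the |
| handler above. The "Advanced SIMD type", "SVE type" and "SVE sizeless |
| type" attributes are attached internally to built-in vector types |
| rather than written by users. */ |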
| |
| /* An ISA extension in the co-processor and main instruction set space. */ |
| struct aarch64_option_extension |
| { |
| const char *const name; |
| const unsigned long flags_on; |
| const unsigned long flags_off; |
| }; |
| |
| typedef enum aarch64_cond_code |
| { |
| AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL, |
| AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT, |
| AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV |
| } |
| aarch64_cc; |
| |
| #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1)) |
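| |
| /* The enumeration above lists the codes in inverse pairs (EQ/NE, |
| CS/CC, ..., GT/LE), so flipping the low bit of a code yields its |
| inverse; for example AARCH64_INVERSE_CONDITION_CODE (AARCH64_GE) |
| is AARCH64_LT. */ |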
| |
| struct aarch64_branch_protect_type |
| { |
| /* The type's name that the user passes to the branch-protection option |
| string. */ |
| const char* name; |
| /* Function to handle the protection type and set global variables. |
| First argument is the string token corresponding with this type and the |
| second argument is the next token in the option string. |
| Return values: |
| * AARCH64_PARSE_OK: Handling was successful. |
| * AARCH64_PARSE_INVALID_ARG: The type is invalid in this context and the |
| caller should print an error. |
| * AARCH64_PARSE_INVALID_FEATURE: The type is invalid and the handler |
| prints its own error. */ |
| enum aarch64_parse_opt_result (*handler)(char*, char*); |
| /* A list of types that can follow this type in the option string. */ |
| const aarch64_branch_protect_type* subtypes; |
| unsigned int num_subtypes; |
| }; |
| |
| static enum aarch64_parse_opt_result |
| aarch64_handle_no_branch_protection (char* str, char* rest) |
| { |
| aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE; |
| aarch64_enable_bti = 0; |
| if (rest) |
| { |
| error ("unexpected %<%s%> after %<%s%>", rest, str); |
| return AARCH64_PARSE_INVALID_FEATURE; |
| } |
| return AARCH64_PARSE_OK; |
| } |
| |
| static enum aarch64_parse_opt_result |
| aarch64_handle_standard_branch_protection (char* str, char* rest) |
| { |
| aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF; |
| aarch64_ra_sign_key = AARCH64_KEY_A; |
| aarch64_enable_bti = 1; |
| if (rest) |
| { |
| error ("unexpected %<%s%> after %<%s%>", rest, str); |
| return AARCH64_PARSE_INVALID_FEATURE; |
| } |
| return AARCH64_PARSE_OK; |
| } |
| |
| static enum aarch64_parse_opt_result |
| aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED, |
| char* rest ATTRIBUTE_UNUSED) |
| { |
| aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF; |
| aarch64_ra_sign_key = AARCH64_KEY_A; |
| return AARCH64_PARSE_OK; |
| } |
| |
| static enum aarch64_parse_opt_result |
| aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED, |
| char* rest ATTRIBUTE_UNUSED) |
| { |
| aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL; |
| return AARCH64_PARSE_OK; |
| } |
| |
| static enum aarch64_parse_opt_result |
| aarch64_handle_pac_ret_b_key (char* str ATTRIBUTE_UNUSED, |
| char* rest ATTRIBUTE_UNUSED) |
| { |
| aarch64_ra_sign_key = AARCH64_KEY_B; |
| return AARCH64_PARSE_OK; |
| } |
| |
| static enum aarch64_parse_opt_result |
| aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED, |
| char* rest ATTRIBUTE_UNUSED) |
| { |
| aarch64_enable_bti = 1; |
| return AARCH64_PARSE_OK; |
| } |
| |
| static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = { |
| { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 }, |
| { "b-key", aarch64_handle_pac_ret_b_key, NULL, 0 }, |
| { NULL, NULL, NULL, 0 } |
| }; |
| |
| static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = { |
| { "none", aarch64_handle_no_branch_protection, NULL, 0 }, |
| { "standard", aarch64_handle_standard_branch_protection, NULL, 0 }, |
| { "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes, |
| ARRAY_SIZE (aarch64_pac_ret_subtypes) }, |
| { "bti", aarch64_handle_bti_protection, NULL, 0 }, |
| { NULL, NULL, NULL, 0 } |
| }; |
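| |
| /* A worked example of how these tables are consumed, assuming the |
| usual -mbranch-protection syntax: for |
| "-mbranch-protection=pac-ret+leaf" the parser invokes |
| aarch64_handle_pac_ret_protection for the "pac-ret" token and then |
| aarch64_handle_pac_ret_leaf for its "leaf" subtype, leaving |
| aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL and |
| aarch64_ra_sign_key == AARCH64_KEY_A. */ |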
| |
| /* The condition codes of the processor, and the inverse function. */ |
| static const char * const aarch64_condition_codes[] = |
| { |
| "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc", |
| "hi", "ls", "ge", "lt", "gt", "le", "al", "nv" |
| }; |
| |
| /* The preferred condition codes for SVE conditions. */ |
| static const char *const aarch64_sve_condition_codes[] = |
| { |
| "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc", |
| "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv" |
| }; |
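| |
| /* Each entry is the SVE alias for the condition at the same index in |
| aarch64_condition_codes (for example "none" is used in place of "eq" |
| and "any" in place of "ne" when testing the flags set by a predicate |
| result). */ |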
| |
| /* Return the assembly token for svpattern value PATTERN. */ |
| |
| static const char * |
| svpattern_token (enum aarch64_svpattern pattern) |
| { |
| switch (pattern) |
| { |
| #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER; |
| AARCH64_FOR_SVPATTERN (CASE) |
| #undef CASE |
| case AARCH64_NUM_SVPATTERNS: |
| break; |
| } |
| gcc_unreachable (); |
| } |
| |
| /* Return the location of a piece that is known to be passed or returned |
| in registers. FIRST_ZR is the first unused vector argument register |
| and FIRST_PR is the first unused predicate argument register. */ |
| |
| rtx |
| pure_scalable_type_info::piece::get_rtx (unsigned int first_zr, |
| unsigned int first_pr) const |
| { |
| gcc_assert (VECTOR_MODE_P (mode) |
| && first_zr + num_zr <= V0_REGNUM + NUM_FP_ARG_REGS |
| && first_pr + num_pr <= P0_REGNUM + NUM_PR_ARG_REGS); |
| |
| if (num_zr > 0 && num_pr == 0) |
| return gen_rtx_REG (mode, first_zr); |
| |
| if (num_zr == 0 && num_pr == 1) |
| return gen_rtx_REG (mode, first_pr); |
| |
| gcc_unreachable (); |
| } |
| |
| /* Return the total number of vector registers required by the PST. */ |
| |
| unsigned int |
| pure_scalable_type_info::num_zr () const |
| { |
| unsigned int res = 0; |
| for (unsigned int i = 0; i < pieces.length (); ++i) |
| res += pieces[i].num_zr; |
| return res; |
| } |
| |
| /* Return the total number of predicate registers required by the PST. */ |
| |
| unsigned int |
| pure_scalable_type_info::num_pr () const |
| { |
| unsigned int res = 0; |
| for (unsigned int i = 0; i < pieces.length (); ++i) |
| res += pieces[i].num_pr; |
| return res; |
| } |
| |
| /* Return the location of a PST that is known to be passed or returned |
| in registers. FIRST_ZR is the first unused vector argument register |
| and FIRST_PR is the first unused predicate argument register. */ |
| |
| rtx |
| pure_scalable_type_info::get_rtx (machine_mode mode, |
| unsigned int first_zr, |
| unsigned int first_pr) const |
| { |
| /* Try to return a single REG if possible. This leads to better |
| code generation; it isn't required for correctness. */ |
| if (mode == pieces[0].mode) |
| { |
| gcc_assert (pieces.length () == 1); |
| return pieces[0].get_rtx (first_zr, first_pr); |
| } |
| |
| /* Build up a PARALLEL that contains the individual pieces. */ |
| rtvec rtxes = rtvec_alloc (pieces.length ()); |
| for (unsigned int i = 0; i < pieces.length (); ++i) |
| { |
| rtx reg = pieces[i].get_rtx (first_zr, first_pr); |
| rtx offset = gen_int_mode (pieces[i].offset, Pmode); |
| RTVEC_ELT (rtxes, i) = gen_rtx_EXPR_LIST (VOIDmode, reg, offset); |
| first_zr += pieces[i].num_zr; |
| first_pr += pieces[i].num_pr; |
| } |
| return gen_rtx_PARALLEL (mode, rtxes); |
| } |
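| |
| /* Illustration only: when the PST cannot be returned as a single REG |
| (i.e. MODE is not the mode of the sole piece), the result is a |
| PARALLEL with one EXPR_LIST per piece, each pairing a vector or |
| predicate REG with that piece's byte offset within the type (a |
| poly_int-valued constant for offsets that scale with the SVE vector |
| length). */ |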
| |
| /* Analyze whether TYPE is a Pure Scalable Type according to the rules |
| in the AAPCS64. */ |
| |
| pure_scalable_type_info::analysis_result |
| pure_scalable_type_info::analyze (const_tree type) |
| { |
| /* Prevent accidental reuse. */ |
| gcc_assert (pieces.is_empty ()); |
| |
| /* No code will be generated for erroneous types, so we won't establish |
| an ABI mapping. */ |
| if (type == error_mark_node) |
| return NO_ABI_IDENTITY; |
| |
| /* Zero-sized types disappear in the language->ABI mapping. */ |
| if (TYPE_SIZE (type) && integer_zerop (TYPE_SIZE (type))) |
| return NO_ABI_IDENTITY; |
| |
| /* Check for SVTs, SPTs, and built-in tuple types that map to PSTs. */ |
| piece p = {}; |
| if (aarch64_sve::builtin_type_p (type, &p.num_zr, &p.num_pr)) |
| { |
| machine_mode mode = TYPE_MODE_RAW (type); |
| gcc_assert (VECTOR_MODE_P (mode) |
| && (!TARGET_SVE || aarch64_sve_mode_p (mode))); |
| |
| p.mode = p.orig_mode = mode; |
| add_piece (p); |
| return IS_PST; |
| } |
| |
| /* Check for user-defined PSTs. */ |
| if (TREE_CODE (type) == ARRAY_TYPE) |
| return analyze_array (type); |
| if (TREE_CODE (type) == RECORD_TYPE) |
| return analyze_record (type); |
| |
| return ISNT_PST; |
| } |
| |
| /* Analyze a type that is known not to be passed or returned in memory. |
| Return true if it has an ABI identity and is a Pure Scalable Type. */ |
| |
| bool |
| pure_scalable_type_info::analyze_registers (const_tree type) |
| { |
| analysis_result result = analyze (type); |
| gcc_assert (result != DOESNT_MATTER); |
| return result == IS_PST; |
| } |
| |
| /* Subroutine of analyze for handling ARRAY_TYPEs. */ |
| |
| pure_scalable_type_info::analysis_result |
| pure_scalable_type_info::analyze_array (const_tree type) |
| { |
| /* Analyze the element type. */ |
| pure_scalable_type_info element_info; |
| analysis_result result = element_info.analyze (TREE_TYPE (type)); |
| if (result != IS_PST) |
| return result; |
| |
| /* An array of unknown, flexible or variable length will be passed and |
| returned by reference whatever we do. */ |
| tree nelts_minus_one = array_type_nelts (type); |
| if (!tree_fits_uhwi_p (nelts_minus_one)) |
| return DOESNT_MATTER; |
| |
| /* Likewise if the array is constant-sized but too big to be interesting. |
| The double checks against MAX_PIECES are to protect against overflow. */ |
| unsigned HOST_WIDE_INT count = tree_to_uhwi (nelts_minus_one); |
| if (count > MAX_PIECES) |
| return DOESNT_MATTER; |
| count += 1; |
| if (count * element_info.pieces.length () > MAX_PIECES) |
| return DOESNT_MATTER; |
| |
| /* The above checks should have weeded out elements of unknown size. */ |
| poly_uint64 element_bytes; |
| if (!poly_int_tree_p (TYPE_SIZE_UNIT (TREE_TYPE (type)), &element_bytes)) |
| gcc_unreachable (); |
| |
| /* Build up the list of individual vectors and predicates. */ |
| gcc_assert (!element_info.pieces.is_empty ()); |
| for (unsigned int i = 0; i < count; ++i) |
| for (unsigned int j = 0; j < element_info.pieces.length (); ++j) |
| { |
| piece p = element_info.pieces[j]; |
| p.offset += i * element_bytes; |
| add_piece (p); |
| } |
| return IS_PST; |
| } |
| |
| /* Subroutine of analyze for handling RECORD_TYPEs. */ |
| |
| pure_scalable_type_info::analysis_result |
| pure_scalable_type_info::analyze_record (const_tree type) |
| { |
| for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field)) |
| { |
| if (TREE_CODE (field) != FIELD_DECL) |
| continue; |
| |
| /* Zero-sized fields disappear in the language->ABI mapping. */ |
| if (DECL_SIZE (field) && integer_zerop (DECL_SIZE (field))) |
| continue; |
| |
| /* All fields with an ABI identity must be PSTs for the record as |
| a whole to be a PST. If any individual field is too big to be |
| interesting then the record is too. */ |
| pure_scalable_type_info field_info; |
| analysis_result subresult = field_info.analyze (TREE_TYPE (field)); |
| if (subresult == NO_ABI_IDENTITY) |
| continue; |
| if (subresult != IS_PST) |
| return subresult; |
| |
| /* Since all previous fields are PSTs, we ought to be able to track |
| the field offset using poly_ints. */ |
| tree bitpos = bit_position (field); |
| gcc_assert (poly_int_tree_p (bitpos)); |
| |
| /* For the same reason, it shouldn't be possible to create a PST field |
| whose offset isn't byte-aligned. */ |
| poly_widest_int wide_bytepos = exact_div (wi::to_poly_widest (bitpos), |
| BITS_PER_UNIT); |
| |
| /* Punt if the record is too big to be interesting. */ |
| poly_uint64 bytepos; |
| if (!wide_bytepos.to_uhwi (&bytepos) |
| || pieces.length () + field_info.pieces.length () > MAX_PIECES) |
| return DOESNT_MATTER; |
| |
| /* Add the individual vectors and predicates in the field to the |
| record's list. */ |
| gcc_assert (!field_info.pieces.is_empty ()); |
| for (unsigned int i = 0; i < field_info.pieces.length (); ++i) |
| { |
| piece p = field_info.pieces[i]; |
| p.offset += bytepos; |
| add_piece (p); |
| } |
| } |
| /* Empty structures disappear in the language->ABI mapping. */ |
| return pieces.is_empty () ? NO_ABI_IDENTITY : IS_PST; |
| } |
| |
| /* Add P to the list of pieces in the type. */ |
| |
| void |
| pure_scalable_type_info::add_piece (const piece &p) |
| { |
| /* Try to fold the new piece into the previous one to form a |
| single-mode PST. For example, if we see three consecutive vectors |
| of the same mode, we can represent them using the corresponding |
| 3-tuple mode. |
| |
| This is purely an optimization. */ |
| if (!pieces.is_empty ()) |
| { |
| piece &prev = pieces.last (); |
| gcc_assert (VECTOR_MODE_P (p.mode) && VECTOR_MODE_P (prev.mode)); |
| unsigned int nelems1, nelems2; |
| if (prev.orig_mode == p.orig_mode |
| && known_eq (prev.offset + GET_MODE_SIZE (prev.mode), p.offset) |
| && constant_multiple_p (GET_MODE_NUNITS (prev.mode), |
| GET_MODE_NUNITS (p.orig_mode), &nelems1) |
| && constant_multiple_p (GET_MODE_NUNITS (p.mode), |
| GET_MODE_NUNITS (p.orig_mode), &nelems2) |
| && targetm.array_mode (p.orig_mode, |
| nelems1 + nelems2).exists (&prev.mode)) |
| { |
| prev.num_zr += p.num_zr; |
| prev.num_pr += p.num_pr; |
| return; |
| } |
| } |
| pieces.quick_push (p); |
| } |
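| |
| /* For example, assuming the target provides the corresponding SVE |
| tuple mode, two adjacent pieces of mode VNx4SI whose offsets differ |
| by exactly one vector's worth of bytes would be merged into a single |
| VNx8SI piece spanning both registers. */ |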
| |
| /* Return true if at least one possible value of type TYPE includes at |
| least one object of Pure Scalable Type, in the sense of the AAPCS64. |
| |
| This is a relatively expensive test for some types, so it should |
| generally be made as late as possible. */ |
| |
| static bool |
| aarch64_some_values_include_pst_objects_p (const_tree type) |
| { |
| if (TYPE_SIZE (type) && integer_zerop (TYPE_SIZE (type))) |
| return false; |
| |
| if (aarch64_sve::builtin_type_p (type)) |
| return true; |
| |
| if (TREE_CODE (type) == ARRAY_TYPE || TREE_CODE (type) == COMPLEX_TYPE) |
| return aarch64_some_values_include_pst_objects_p (TREE_TYPE (type)); |
| |
| if (RECORD_OR_UNION_TYPE_P (type)) |
| for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field)) |
| if (TREE_CODE (field) == FIELD_DECL |
| && aarch64_some_values_include_pst_objects_p (TREE_TYPE (field))) |
| return true; |
| |
| return false; |
| } |
| |
| /* Return the descriptor of the SIMD ABI. */ |
| |
| static const predefined_function_abi & |
| aarch64_simd_abi (void) |
| { |
| predefined_function_abi &simd_abi = function_abis[ARM_PCS_SIMD]; |
| if (!simd_abi.initialized_p ()) |
| { |
| HARD_REG_SET full_reg_clobbers |
| = default_function_abi.full_reg_clobbers (); |
| for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) |
| if (FP_SIMD_SAVED_REGNUM_P (regno)) |
| CLEAR_HARD_REG_BIT (full_reg_clobbers, regno); |
| simd_abi.initialize (ARM_PCS_SIMD, full_reg_clobbers); |
| } |
| return simd_abi; |
| } |
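| |
| /* Relative to the base AAPCS64, which only guarantees the low 64 bits |
| of V8-V15, the vector PCS constructed here treats V8-V23 (the range |
| selected by FP_SIMD_SAVED_REGNUM_P) as call-preserved in full. */ |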
| |
| /* Return the descriptor of the SVE PCS. */ |
| |
| static const predefined_function_abi & |
| aarch64_sve_abi (void) |
| { |
| predefined_function_abi &sve_abi = function_abis[ARM_PCS_SVE]; |
| if (!sve_abi.initialized_p ()) |
| { |
| HARD_REG_SET full_reg_clobbers |
| = default_function_abi.full_reg_clobbers (); |
| for (int regno = V8_REGNUM; regno <= V23_REGNUM; ++regno) |
| CLEAR_HARD_REG_BIT (full_reg_clobbers, regno); |
| for (int regno = P4_REGNUM; regno <= P15_REGNUM; ++regno) |
| CLEAR_HARD_REG_BIT (full_reg_clobbers, regno); |
| sve_abi.initialize (ARM_PCS_SVE, full_reg_clobbers); |
| } |
| return sve_abi; |
| } |
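| |
| /* Beyond the base PCS rules, the SVE PCS preserves the full Z8-Z23 |
| and the predicate registers P4-P15 across calls; the two loops above |
| express that by removing those registers from the clobber set. */ |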
| |
| /* If X is an UNSPEC_SALT_ADDR expression, return the address that it |
| wraps, otherwise return X itself. */ |
| |
| static rtx |
| strip_salt (rtx x) |
| { |
| rtx search = x; |
|