/* Machine description for AArch64 architecture.
Copyright (C) 2009-2022 Free Software Foundation, Inc.
Contributed by ARM Ltd.
This file is part of GCC.
GCC is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.
GCC is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3. If not see
<http://www.gnu.org/licenses/>. */
#define IN_TARGET_CODE 1
#define INCLUDE_STRING
#define INCLUDE_ALGORITHM
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "memmodel.h"
#include "gimple.h"
#include "cfghooks.h"
#include "cfgloop.h"
#include "df.h"
#include "tm_p.h"
#include "stringpool.h"
#include "attribs.h"
#include "optabs.h"
#include "regs.h"
#include "emit-rtl.h"
#include "recog.h"
#include "cgraph.h"
#include "diagnostic.h"
#include "insn-attr.h"
#include "alias.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "calls.h"
#include "varasm.h"
#include "output.h"
#include "flags.h"
#include "explow.h"
#include "expr.h"
#include "reload.h"
#include "langhooks.h"
#include "opts.h"
#include "gimplify.h"
#include "dwarf2.h"
#include "gimple-iterator.h"
#include "tree-vectorizer.h"
#include "aarch64-cost-tables.h"
#include "dumpfile.h"
#include "builtins.h"
#include "rtl-iter.h"
#include "tm-constrs.h"
#include "sched-int.h"
#include "target-globals.h"
#include "common/common-target.h"
#include "cfgrtl.h"
#include "selftest.h"
#include "selftest-rtl.h"
#include "rtx-vector-builder.h"
#include "intl.h"
#include "expmed.h"
#include "function-abi.h"
#include "gimple-pretty-print.h"
#include "tree-ssa-loop-niter.h"
#include "fractional-cost.h"
#include "rtlanal.h"
#include "tree-dfa.h"
#include "asan.h"
#include "aarch64-feature-deps.h"
/* This file should be included last. */
#include "target-def.h"
/* Defined for convenience. */
#define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
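/* For example, in the default LP64 configuration POINTER_SIZE is 64 and
   BITS_PER_UNIT is 8, so POINTER_BYTES evaluates to 8; with -mabi=ilp32,
   POINTER_SIZE is 32 and POINTER_BYTES is 4.  */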
/* Information about a legitimate vector immediate operand. */
struct simd_immediate_info
{
enum insn_type { MOV, MVN, INDEX, PTRUE };
enum modifier_type { LSL, MSL };
simd_immediate_info () {}
simd_immediate_info (scalar_float_mode, rtx);
simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
insn_type = MOV, modifier_type = LSL,
unsigned int = 0);
simd_immediate_info (scalar_mode, rtx, rtx);
simd_immediate_info (scalar_int_mode, aarch64_svpattern);
/* The mode of the elements. */
scalar_mode elt_mode;
/* The instruction to use to move the immediate into a vector. */
insn_type insn;
union
{
/* For MOV and MVN. */
struct
{
/* The value of each element. */
rtx value;
/* The kind of shift modifier to use, and the number of bits to shift.
This is (LSL, 0) if no shift is needed. */
modifier_type modifier;
unsigned int shift;
} mov;
/* For INDEX. */
struct
{
/* The value of the first element and the step to be added for each
subsequent element. */
rtx base, step;
} index;
/* For PTRUE. */
aarch64_svpattern pattern;
} u;
};
/* Construct a floating-point immediate in which each element has mode
ELT_MODE_IN and value VALUE_IN. */
inline simd_immediate_info
::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
: elt_mode (elt_mode_in), insn (MOV)
{
u.mov.value = value_in;
u.mov.modifier = LSL;
u.mov.shift = 0;
}
/* Construct an integer immediate in which each element has mode ELT_MODE_IN
and value VALUE_IN. The other parameters are as for the structure
fields. */
inline simd_immediate_info
::simd_immediate_info (scalar_int_mode elt_mode_in,
unsigned HOST_WIDE_INT value_in,
insn_type insn_in, modifier_type modifier_in,
unsigned int shift_in)
: elt_mode (elt_mode_in), insn (insn_in)
{
u.mov.value = gen_int_mode (value_in, elt_mode_in);
u.mov.modifier = modifier_in;
u.mov.shift = shift_in;
}
/* Construct an integer immediate in which each element has mode ELT_MODE_IN
and where element I is equal to BASE_IN + I * STEP_IN. */
inline simd_immediate_info
::simd_immediate_info (scalar_mode elt_mode_in, rtx base_in, rtx step_in)
: elt_mode (elt_mode_in), insn (INDEX)
{
u.index.base = base_in;
u.index.step = step_in;
}
/* Construct a predicate that controls elements of mode ELT_MODE_IN
and has PTRUE pattern PATTERN_IN. */
inline simd_immediate_info
::simd_immediate_info (scalar_int_mode elt_mode_in,
aarch64_svpattern pattern_in)
: elt_mode (elt_mode_in), insn (PTRUE)
{
u.pattern = pattern_in;
}
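/* Purely illustrative examples (these exact calls do not appear in this
   file); they show which constructor selects which insn_type:

     simd_immediate_info splat (QImode, 0x55);
	A MOV of 0x55 into every byte, with the implicit (LSL, 0) modifier.

     simd_immediate_info inverted (HImode, 0xab, simd_immediate_info::MVN,
				   simd_immediate_info::LSL, 8);
	An MVN-style immediate 0xab with an LSL #8 modifier per halfword.

     simd_immediate_info iota (SImode, const0_rtx, const1_rtx);
	An SVE INDEX immediate: first element 0, step 1 between elements.

     simd_immediate_info all (DImode, AARCH64_SV_ALL);
	An SVE PTRUE with pattern ALL over 64-bit elements.  */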
namespace {
/* Describes types that map to Pure Scalable Types (PSTs) in the AAPCS64. */
class pure_scalable_type_info
{
public:
/* Represents the result of analyzing a type. All values are nonzero,
in the possibly forlorn hope that accidental conversions to bool
trigger a warning. */
enum analysis_result
{
/* The type does not have an ABI identity; i.e. it doesn't contain
at least one object whose type is a Fundamental Data Type. */
NO_ABI_IDENTITY = 1,
/* The type is definitely a Pure Scalable Type. */
IS_PST,
/* The type is definitely not a Pure Scalable Type. */
ISNT_PST,
/* It doesn't matter for PCS purposes whether the type is a Pure
Scalable Type or not, since the type will be handled the same
way regardless.
Specifically, this means that if the type is a Pure Scalable Type,
there aren't enough argument registers to hold it, and so it will
need to be passed or returned in memory. If the type isn't a
Pure Scalable Type, it's too big to be passed or returned in core
or SIMD&FP registers, and so again will need to go in memory. */
DOESNT_MATTER
};
/* Aggregates of 17 bytes or more are normally passed and returned
in memory, so aggregates of that size can safely be analyzed as
DOESNT_MATTER. We need to be able to collect enough pieces to
represent a PST that is smaller than that. Since predicates are
2 bytes in size for -msve-vector-bits=128, that means we need to be
able to store at least 8 pieces.
We also need to be able to store enough pieces to represent
a single vector in each vector argument register and a single
predicate in each predicate argument register. This means that
we need at least 12 pieces. */
static const unsigned int MAX_PIECES = NUM_FP_ARG_REGS + NUM_PR_ARG_REGS;
static_assert (MAX_PIECES >= 8, "Need to store at least 8 predicates");
/* Describes one piece of a PST. Each piece is one of:
- a single Scalable Vector Type (SVT)
- a single Scalable Predicate Type (SPT)
- a PST containing 2, 3 or 4 SVTs, with no padding
It either represents a single built-in type or a PST formed from
multiple homogeneous built-in types. */
struct piece
{
rtx get_rtx (unsigned int, unsigned int) const;
/* The number of vector and predicate registers that the piece
occupies. One of the two is always zero. */
unsigned int num_zr;
unsigned int num_pr;
/* The mode of the registers described above. */
machine_mode mode;
/* If this piece is formed from multiple homogeneous built-in types,
this is the mode of the built-in types, otherwise it is MODE. */
machine_mode orig_mode;
/* The offset in bytes of the piece from the start of the type. */
poly_uint64_pod offset;
};
/* Divides types analyzed as IS_PST into individual pieces. The pieces
are in memory order. */
auto_vec<piece, MAX_PIECES> pieces;
unsigned int num_zr () const;
unsigned int num_pr () const;
rtx get_rtx (machine_mode mode, unsigned int, unsigned int) const;
analysis_result analyze (const_tree);
bool analyze_registers (const_tree);
private:
analysis_result analyze_array (const_tree);
analysis_result analyze_record (const_tree);
void add_piece (const piece &);
};
}
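/* A hypothetical caller of pure_scalable_type_info would typically do
   something like:

     pure_scalable_type_info pst_info;
     if (pst_info.analyze_registers (type))
       {
	 unsigned int nvec = pst_info.num_zr ();
	 unsigned int npred = pst_info.num_pr ();
	 ...
       }

   i.e. first check whether TYPE is a Pure Scalable Type that fits in the
   available argument registers and, if so, query how many Z and P
   registers its pieces occupy.  The real callers appear later in this
   file.  */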
/* The current code model. */
enum aarch64_code_model aarch64_cmodel;
/* The number of 64-bit elements in an SVE vector. */
poly_uint16 aarch64_sve_vg;
#ifdef HAVE_AS_TLS
#undef TARGET_HAVE_TLS
#define TARGET_HAVE_TLS 1
#endif
static bool aarch64_composite_type_p (const_tree, machine_mode);
static bool aarch64_return_in_memory_1 (const_tree);
static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
const_tree,
machine_mode *, int *,
bool *, bool);
static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
static void aarch64_override_options_after_change (void);
static bool aarch64_vector_mode_supported_p (machine_mode);
static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
const_tree type,
int misalignment,
bool is_packed);
static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
aarch64_addr_query_type);
/* The processor for which instructions should be scheduled. */
enum aarch64_processor aarch64_tune = cortexa53;
/* Mask to specify which instruction scheduling options should be used. */
uint64_t aarch64_tune_flags = 0;
/* Global flag for PC relative loads. */
bool aarch64_pcrelative_literal_loads;
/* Global flag for whether frame pointer is enabled. */
bool aarch64_use_frame_pointer;
#define BRANCH_PROTECT_STR_MAX 255
char *accepted_branch_protection_string = NULL;
static enum aarch64_parse_opt_result
aarch64_parse_branch_protection (const char*, char**);
/* Support for command line parsing of boolean flags in the tuning
structures. */
struct aarch64_flag_desc
{
const char* name;
unsigned int flag;
};
#define AARCH64_FUSION_PAIR(name, internal_name) \
{ name, AARCH64_FUSE_##internal_name },
static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
{
{ "none", AARCH64_FUSE_NOTHING },
#include "aarch64-fusion-pairs.def"
{ "all", AARCH64_FUSE_ALL },
{ NULL, AARCH64_FUSE_NOTHING }
};
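/* As an illustration of the expansion above: an entry such as
   AARCH64_FUSION_PAIR ("cmp+branch", CMP_BRANCH) in
   aarch64-fusion-pairs.def becomes { "cmp+branch", AARCH64_FUSE_CMP_BRANCH },
   giving the option parser a name-to-flag mapping for each fusion pair.
   The tuning-flag table below is built the same way from
   aarch64-tuning-flags.def.  */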
#define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
{ name, AARCH64_EXTRA_TUNE_##internal_name },
static const struct aarch64_flag_desc aarch64_tuning_flags[] =
{
{ "none", AARCH64_EXTRA_TUNE_NONE },
#include "aarch64-tuning-flags.def"
{ "all", AARCH64_EXTRA_TUNE_ALL },
{ NULL, AARCH64_EXTRA_TUNE_NONE }
};
/* Tuning parameters. */
static const struct cpu_addrcost_table generic_addrcost_table =
{
{
1, /* hi */
0, /* si */
0, /* di */
1, /* ti */
},
0, /* pre_modify */
0, /* post_modify */
0, /* post_modify_ld3_st3 */
0, /* post_modify_ld4_st4 */
0, /* register_offset */
0, /* register_sextend */
0, /* register_zextend */
0 /* imm_offset */
};
static const struct cpu_addrcost_table exynosm1_addrcost_table =
{
{
0, /* hi */
0, /* si */
0, /* di */
2, /* ti */
},
0, /* pre_modify */
0, /* post_modify */
0, /* post_modify_ld3_st3 */
0, /* post_modify_ld4_st4 */
1, /* register_offset */
1, /* register_sextend */
2, /* register_zextend */
0, /* imm_offset */
};
static const struct cpu_addrcost_table xgene1_addrcost_table =
{
{
1, /* hi */
0, /* si */
0, /* di */
1, /* ti */
},
1, /* pre_modify */
1, /* post_modify */
1, /* post_modify_ld3_st3 */
1, /* post_modify_ld4_st4 */
0, /* register_offset */
1, /* register_sextend */
1, /* register_zextend */
0, /* imm_offset */
};
static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
{
{
1, /* hi */
1, /* si */
1, /* di */
2, /* ti */
},
0, /* pre_modify */
0, /* post_modify */
0, /* post_modify_ld3_st3 */
0, /* post_modify_ld4_st4 */
2, /* register_offset */
3, /* register_sextend */
3, /* register_zextend */
0, /* imm_offset */
};
static const struct cpu_addrcost_table thunderx3t110_addrcost_table =
{
{
1, /* hi */
1, /* si */
1, /* di */
2, /* ti */
},
0, /* pre_modify */
0, /* post_modify */
0, /* post_modify_ld3_st3 */
0, /* post_modify_ld4_st4 */
2, /* register_offset */
3, /* register_sextend */
3, /* register_zextend */
0, /* imm_offset */
};
static const struct cpu_addrcost_table tsv110_addrcost_table =
{
{
1, /* hi */
0, /* si */
0, /* di */
1, /* ti */
},
0, /* pre_modify */
0, /* post_modify */
0, /* post_modify_ld3_st3 */
0, /* post_modify_ld4_st4 */
0, /* register_offset */
1, /* register_sextend */
1, /* register_zextend */
0, /* imm_offset */
};
static const struct cpu_addrcost_table qdf24xx_addrcost_table =
{
{
1, /* hi */
1, /* si */
1, /* di */
2, /* ti */
},
1, /* pre_modify */
1, /* post_modify */
1, /* post_modify_ld3_st3 */
1, /* post_modify_ld4_st4 */
3, /* register_offset */
3, /* register_sextend */
3, /* register_zextend */
2, /* imm_offset */
};
static const struct cpu_addrcost_table a64fx_addrcost_table =
{
{
1, /* hi */
1, /* si */
1, /* di */
2, /* ti */
},
0, /* pre_modify */
0, /* post_modify */
0, /* post_modify_ld3_st3 */
0, /* post_modify_ld4_st4 */
2, /* register_offset */
3, /* register_sextend */
3, /* register_zextend */
0, /* imm_offset */
};
static const struct cpu_addrcost_table neoversev1_addrcost_table =
{
{
1, /* hi */
0, /* si */
0, /* di */
1, /* ti */
},
0, /* pre_modify */
0, /* post_modify */
3, /* post_modify_ld3_st3 */
3, /* post_modify_ld4_st4 */
0, /* register_offset */
0, /* register_sextend */
0, /* register_zextend */
0 /* imm_offset */
};
static const struct cpu_addrcost_table neoversen2_addrcost_table =
{
{
1, /* hi */
0, /* si */
0, /* di */
1, /* ti */
},
0, /* pre_modify */
0, /* post_modify */
2, /* post_modify_ld3_st3 */
2, /* post_modify_ld4_st4 */
0, /* register_offset */
0, /* register_sextend */
0, /* register_zextend */
0 /* imm_offset */
};
static const struct cpu_addrcost_table neoversev2_addrcost_table =
{
{
1, /* hi */
0, /* si */
0, /* di */
1, /* ti */
},
0, /* pre_modify */
0, /* post_modify */
2, /* post_modify_ld3_st3 */
2, /* post_modify_ld4_st4 */
0, /* register_offset */
0, /* register_sextend */
0, /* register_zextend */
0 /* imm_offset */
};
static const struct cpu_regmove_cost generic_regmove_cost =
{
1, /* GP2GP */
/* Avoid the use of slow int<->fp moves for spilling by setting
their cost higher than memmov_cost. */
5, /* GP2FP */
5, /* FP2GP */
2 /* FP2FP */
};
static const struct cpu_regmove_cost cortexa57_regmove_cost =
{
1, /* GP2GP */
/* Avoid the use of slow int<->fp moves for spilling by setting
their cost higher than memmov_cost. */
5, /* GP2FP */
5, /* FP2GP */
2 /* FP2FP */
};
static const struct cpu_regmove_cost cortexa53_regmove_cost =
{
1, /* GP2GP */
/* Avoid the use of slow int<->fp moves for spilling by setting
their cost higher than memmov_cost. */
5, /* GP2FP */
5, /* FP2GP */
2 /* FP2FP */
};
static const struct cpu_regmove_cost exynosm1_regmove_cost =
{
1, /* GP2GP */
/* Avoid the use of slow int<->fp moves for spilling by setting
their cost higher than memmov_cost (the actual costs are 4 and 9). */
9, /* GP2FP */
9, /* FP2GP */
1 /* FP2FP */
};
static const struct cpu_regmove_cost thunderx_regmove_cost =
{
2, /* GP2GP */
2, /* GP2FP */
6, /* FP2GP */
4 /* FP2FP */
};
static const struct cpu_regmove_cost xgene1_regmove_cost =
{
1, /* GP2GP */
/* Avoid the use of slow int<->fp moves for spilling by setting
their cost higher than memmov_cost. */
8, /* GP2FP */
8, /* FP2GP */
2 /* FP2FP */
};
static const struct cpu_regmove_cost qdf24xx_regmove_cost =
{
2, /* GP2GP */
/* Avoid the use of int<->fp moves for spilling. */
6, /* GP2FP */
6, /* FP2GP */
4 /* FP2FP */
};
static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
{
1, /* GP2GP */
/* Avoid the use of int<->fp moves for spilling. */
5, /* GP2FP */
6, /* FP2GP */
3, /* FP2FP */
};
static const struct cpu_regmove_cost thunderx3t110_regmove_cost =
{
1, /* GP2GP */
/* Avoid the use of int<->fp moves for spilling. */
4, /* GP2FP */
5, /* FP2GP */
4 /* FP2FP */
};
static const struct cpu_regmove_cost tsv110_regmove_cost =
{
1, /* GP2GP */
/* Avoid the use of slow int<->fp moves for spilling by setting
their cost higher than memmov_cost. */
2, /* GP2FP */
3, /* FP2GP */
2 /* FP2FP */
};
static const struct cpu_regmove_cost a64fx_regmove_cost =
{
1, /* GP2GP */
/* Avoid the use of slow int<->fp moves for spilling by setting
their cost higher than memmov_cost. */
5, /* GP2FP */
7, /* FP2GP */
2 /* FP2FP */
};
static const struct cpu_regmove_cost neoversen2_regmove_cost =
{
1, /* GP2GP */
/* Spilling to int<->fp instead of memory is recommended so set
realistic costs compared to memmov_cost. */
3, /* GP2FP */
2, /* FP2GP */
2 /* FP2FP */
};
static const struct cpu_regmove_cost neoversev1_regmove_cost =
{
1, /* GP2GP */
/* Spilling to int<->fp instead of memory is recommended so set
realistic costs compared to memmov_cost. */
3, /* GP2FP */
2, /* FP2GP */
2 /* FP2FP */
};
static const struct cpu_regmove_cost neoversev2_regmove_cost =
{
1, /* GP2GP */
/* Spilling to int<->fp instead of memory is recommended so set
realistic costs compared to memmov_cost. */
3, /* GP2FP */
2, /* FP2GP */
2 /* FP2FP */
};
/* Generic costs for Advanced SIMD vector operations. */
static const advsimd_vec_cost generic_advsimd_vector_cost =
{
1, /* int_stmt_cost */
1, /* fp_stmt_cost */
0, /* ld2_st2_permute_cost */
0, /* ld3_st3_permute_cost */
0, /* ld4_st4_permute_cost */
2, /* permute_cost */
2, /* reduc_i8_cost */
2, /* reduc_i16_cost */
2, /* reduc_i32_cost */
2, /* reduc_i64_cost */
2, /* reduc_f16_cost */
2, /* reduc_f32_cost */
2, /* reduc_f64_cost */
2, /* store_elt_extra_cost */
2, /* vec_to_scalar_cost */
1, /* scalar_to_vec_cost */
1, /* align_load_cost */
1, /* unalign_load_cost */
1, /* unalign_store_cost */
1 /* store_cost */
};
/* Generic costs for SVE vector operations. */
static const sve_vec_cost generic_sve_vector_cost =
{
{
1, /* int_stmt_cost */
1, /* fp_stmt_cost */
0, /* ld2_st2_permute_cost */
0, /* ld3_st3_permute_cost */
0, /* ld4_st4_permute_cost */
2, /* permute_cost */
2, /* reduc_i8_cost */
2, /* reduc_i16_cost */
2, /* reduc_i32_cost */
2, /* reduc_i64_cost */
2, /* reduc_f16_cost */
2, /* reduc_f32_cost */
2, /* reduc_f64_cost */
2, /* store_elt_extra_cost */
2, /* vec_to_scalar_cost */
1, /* scalar_to_vec_cost */
1, /* align_load_cost */
1, /* unalign_load_cost */
1, /* unalign_store_cost */
1 /* store_cost */
},
2, /* clast_cost */
2, /* fadda_f16_cost */
2, /* fadda_f32_cost */
2, /* fadda_f64_cost */
4, /* gather_load_x32_cost */
2, /* gather_load_x64_cost */
1 /* scatter_store_elt_cost */
};
/* Generic costs for vector insn classes. */
static const struct cpu_vector_cost generic_vector_cost =
{
1, /* scalar_int_stmt_cost */
1, /* scalar_fp_stmt_cost */
1, /* scalar_load_cost */
1, /* scalar_store_cost */
3, /* cond_taken_branch_cost */
1, /* cond_not_taken_branch_cost */
&generic_advsimd_vector_cost, /* advsimd */
&generic_sve_vector_cost, /* sve */
nullptr /* issue_info */
};
static const advsimd_vec_cost a64fx_advsimd_vector_cost =
{
2, /* int_stmt_cost */
5, /* fp_stmt_cost */
0, /* ld2_st2_permute_cost */
0, /* ld3_st3_permute_cost */
0, /* ld4_st4_permute_cost */
3, /* permute_cost */
13, /* reduc_i8_cost */
13, /* reduc_i16_cost */
13, /* reduc_i32_cost */
13, /* reduc_i64_cost */
13, /* reduc_f16_cost */
13, /* reduc_f32_cost */
13, /* reduc_f64_cost */
13, /* store_elt_extra_cost */
13, /* vec_to_scalar_cost */
4, /* scalar_to_vec_cost */
6, /* align_load_cost */
6, /* unalign_load_cost */
1, /* unalign_store_cost */
1 /* store_cost */
};
static const sve_vec_cost a64fx_sve_vector_cost =
{
{
2, /* int_stmt_cost */
5, /* fp_stmt_cost */
0, /* ld2_st2_permute_cost */
0, /* ld3_st3_permute_cost */
0, /* ld4_st4_permute_cost */
3, /* permute_cost */
13, /* reduc_i8_cost */
13, /* reduc_i16_cost */
13, /* reduc_i32_cost */
13, /* reduc_i64_cost */
13, /* reduc_f16_cost */
13, /* reduc_f32_cost */
13, /* reduc_f64_cost */
13, /* store_elt_extra_cost */
13, /* vec_to_scalar_cost */
4, /* scalar_to_vec_cost */
6, /* align_load_cost */
6, /* unalign_load_cost */
1, /* unalign_store_cost */
1 /* store_cost */
},
13, /* clast_cost */
13, /* fadda_f16_cost */
13, /* fadda_f32_cost */
13, /* fadda_f64_cost */
64, /* gather_load_x32_cost */
32, /* gather_load_x64_cost */
1 /* scatter_store_elt_cost */
};
static const struct cpu_vector_cost a64fx_vector_cost =
{
1, /* scalar_int_stmt_cost */
5, /* scalar_fp_stmt_cost */
4, /* scalar_load_cost */
1, /* scalar_store_cost */
3, /* cond_taken_branch_cost */
1, /* cond_not_taken_branch_cost */
&a64fx_advsimd_vector_cost, /* advsimd */
&a64fx_sve_vector_cost, /* sve */
nullptr /* issue_info */
};
static const advsimd_vec_cost qdf24xx_advsimd_vector_cost =
{
1, /* int_stmt_cost */
3, /* fp_stmt_cost */
0, /* ld2_st2_permute_cost */
0, /* ld3_st3_permute_cost */
0, /* ld4_st4_permute_cost */
2, /* permute_cost */
1, /* reduc_i8_cost */
1, /* reduc_i16_cost */
1, /* reduc_i32_cost */
1, /* reduc_i64_cost */
1, /* reduc_f16_cost */
1, /* reduc_f32_cost */
1, /* reduc_f64_cost */
1, /* store_elt_extra_cost */
1, /* vec_to_scalar_cost */
1, /* scalar_to_vec_cost */
1, /* align_load_cost */
1, /* unalign_load_cost */
1, /* unalign_store_cost */
1 /* store_cost */
};
/* QDF24XX costs for vector insn classes. */
static const struct cpu_vector_cost qdf24xx_vector_cost =
{
1, /* scalar_int_stmt_cost */
1, /* scalar_fp_stmt_cost */
1, /* scalar_load_cost */
1, /* scalar_store_cost */
3, /* cond_taken_branch_cost */
1, /* cond_not_taken_branch_cost */
&qdf24xx_advsimd_vector_cost, /* advsimd */
nullptr, /* sve */
nullptr /* issue_info */
};
static const advsimd_vec_cost thunderx_advsimd_vector_cost =
{
4, /* int_stmt_cost */
1, /* fp_stmt_cost */
0, /* ld2_st2_permute_cost */
0, /* ld3_st3_permute_cost */
0, /* ld4_st4_permute_cost */
4, /* permute_cost */
2, /* reduc_i8_cost */
2, /* reduc_i16_cost */
2, /* reduc_i32_cost */
2, /* reduc_i64_cost */
2, /* reduc_f16_cost */
2, /* reduc_f32_cost */
2, /* reduc_f64_cost */
2, /* store_elt_extra_cost */
2, /* vec_to_scalar_cost */
2, /* scalar_to_vec_cost */
3, /* align_load_cost */
5, /* unalign_load_cost */
5, /* unalign_store_cost */
1 /* store_cost */
};
/* ThunderX costs for vector insn classes. */
static const struct cpu_vector_cost thunderx_vector_cost =
{
1, /* scalar_int_stmt_cost */
1, /* scalar_fp_stmt_cost */
3, /* scalar_load_cost */
1, /* scalar_store_cost */
3, /* cond_taken_branch_cost */
3, /* cond_not_taken_branch_cost */
&thunderx_advsimd_vector_cost, /* advsimd */
nullptr, /* sve */
nullptr /* issue_info */
};
static const advsimd_vec_cost tsv110_advsimd_vector_cost =
{
2, /* int_stmt_cost */
2, /* fp_stmt_cost */
0, /* ld2_st2_permute_cost */
0, /* ld3_st3_permute_cost */
0, /* ld4_st4_permute_cost */
2, /* permute_cost */
3, /* reduc_i8_cost */
3, /* reduc_i16_cost */
3, /* reduc_i32_cost */
3, /* reduc_i64_cost */
3, /* reduc_f16_cost */
3, /* reduc_f32_cost */
3, /* reduc_f64_cost */
3, /* store_elt_extra_cost */
3, /* vec_to_scalar_cost */
2, /* scalar_to_vec_cost */
5, /* align_load_cost */
5, /* unalign_load_cost */
1, /* unalign_store_cost */
1 /* store_cost */
};
static const struct cpu_vector_cost tsv110_vector_cost =
{
1, /* scalar_int_stmt_cost */
1, /* scalar_fp_stmt_cost */
5, /* scalar_load_cost */
1, /* scalar_store_cost */
1, /* cond_taken_branch_cost */
1, /* cond_not_taken_branch_cost */
&tsv110_advsimd_vector_cost, /* advsimd */
nullptr, /* sve */
nullptr /* issue_info */
};
static const advsimd_vec_cost cortexa57_advsimd_vector_cost =
{
2, /* int_stmt_cost */
2, /* fp_stmt_cost */
0, /* ld2_st2_permute_cost */
0, /* ld3_st3_permute_cost */
0, /* ld4_st4_permute_cost */
3, /* permute_cost */
8, /* reduc_i8_cost */
8, /* reduc_i16_cost */
8, /* reduc_i32_cost */
8, /* reduc_i64_cost */
8, /* reduc_f16_cost */
8, /* reduc_f32_cost */
8, /* reduc_f64_cost */
8, /* store_elt_extra_cost */
8, /* vec_to_scalar_cost */
8, /* scalar_to_vec_cost */
4, /* align_load_cost */
4, /* unalign_load_cost */
1, /* unalign_store_cost */
1 /* store_cost */
};
/* Cortex-A57 costs for vector insn classes. */
static const struct cpu_vector_cost cortexa57_vector_cost =
{
1, /* scalar_int_stmt_cost */
1, /* scalar_fp_stmt_cost */
4, /* scalar_load_cost */
1, /* scalar_store_cost */
1, /* cond_taken_branch_cost */
1, /* cond_not_taken_branch_cost */
&cortexa57_advsimd_vector_cost, /* advsimd */
nullptr, /* sve */
nullptr /* issue_info */
};
static const advsimd_vec_cost exynosm1_advsimd_vector_cost =
{
3, /* int_stmt_cost */
3, /* fp_stmt_cost */
0, /* ld2_st2_permute_cost */
0, /* ld3_st3_permute_cost */
0, /* ld4_st4_permute_cost */
3, /* permute_cost */
3, /* reduc_i8_cost */
3, /* reduc_i16_cost */
3, /* reduc_i32_cost */
3, /* reduc_i64_cost */
3, /* reduc_f16_cost */
3, /* reduc_f32_cost */
3, /* reduc_f64_cost */
3, /* store_elt_extra_cost */
3, /* vec_to_scalar_cost */
3, /* scalar_to_vec_cost */
5, /* align_load_cost */
5, /* unalign_load_cost */
1, /* unalign_store_cost */
1 /* store_cost */
};
static const struct cpu_vector_cost exynosm1_vector_cost =
{
1, /* scalar_int_stmt_cost */
1, /* scalar_fp_stmt_cost */
5, /* scalar_load_cost */
1, /* scalar_store_cost */
1, /* cond_taken_branch_cost */
1, /* cond_not_taken_branch_cost */
&exynosm1_advsimd_vector_cost, /* advsimd */
nullptr, /* sve */
nullptr /* issue_info */
};
static const advsimd_vec_cost xgene1_advsimd_vector_cost =
{
2, /* int_stmt_cost */
2, /* fp_stmt_cost */
0, /* ld2_st2_permute_cost */
0, /* ld3_st3_permute_cost */
0, /* ld4_st4_permute_cost */
2, /* permute_cost */
4, /* reduc_i8_cost */
4, /* reduc_i16_cost */
4, /* reduc_i32_cost */
4, /* reduc_i64_cost */
4, /* reduc_f16_cost */
4, /* reduc_f32_cost */
4, /* reduc_f64_cost */
4, /* store_elt_extra_cost */
4, /* vec_to_scalar_cost */
4, /* scalar_to_vec_cost */
10, /* align_load_cost */
10, /* unalign_load_cost */
2, /* unalign_store_cost */
2 /* store_cost */
};
/* X-Gene 1 costs for vector insn classes. */
static const struct cpu_vector_cost xgene1_vector_cost =
{
1, /* scalar_int_stmt_cost */
1, /* scalar_fp_stmt_cost */
5, /* scalar_load_cost */
1, /* scalar_store_cost */
2, /* cond_taken_branch_cost */
1, /* cond_not_taken_branch_cost */
&xgene1_advsimd_vector_cost, /* advsimd */
nullptr, /* sve */
nullptr /* issue_info */
};
static const advsimd_vec_cost thunderx2t99_advsimd_vector_cost =
{
4, /* int_stmt_cost */
5, /* fp_stmt_cost */
0, /* ld2_st2_permute_cost */
0, /* ld3_st3_permute_cost */
0, /* ld4_st4_permute_cost */
10, /* permute_cost */
6, /* reduc_i8_cost */
6, /* reduc_i16_cost */
6, /* reduc_i32_cost */
6, /* reduc_i64_cost */
6, /* reduc_f16_cost */
6, /* reduc_f32_cost */
6, /* reduc_f64_cost */
6, /* store_elt_extra_cost */
6, /* vec_to_scalar_cost */
5, /* scalar_to_vec_cost */
4, /* align_load_cost */
4, /* unalign_load_cost */
1, /* unalign_store_cost */
1 /* store_cost */
};
/* ThunderX2 T99 (Vulcan) costs for vector insn classes. */
static const struct cpu_vector_cost thunderx2t99_vector_cost =
{
1, /* scalar_int_stmt_cost */
6, /* scalar_fp_stmt_cost */
4, /* scalar_load_cost */
1, /* scalar_store_cost */
2, /* cond_taken_branch_cost */
1, /* cond_not_taken_branch_cost */
&thunderx2t99_advsimd_vector_cost, /* advsimd */
nullptr, /* sve */
nullptr /* issue_info */
};
static const advsimd_vec_cost thunderx3t110_advsimd_vector_cost =
{
5, /* int_stmt_cost */
5, /* fp_stmt_cost */
0, /* ld2_st2_permute_cost */
0, /* ld3_st3_permute_cost */
0, /* ld4_st4_permute_cost */
10, /* permute_cost */
5, /* reduc_i8_cost */
5, /* reduc_i16_cost */
5, /* reduc_i32_cost */
5, /* reduc_i64_cost */
5, /* reduc_f16_cost */
5, /* reduc_f32_cost */
5, /* reduc_f64_cost */
5, /* store_elt_extra_cost */
5, /* vec_to_scalar_cost */
5, /* scalar_to_vec_cost */
4, /* align_load_cost */
4, /* unalign_load_cost */
4, /* unalign_store_cost */
4 /* store_cost */
};
static const struct cpu_vector_cost thunderx3t110_vector_cost =
{
1, /* scalar_int_stmt_cost */
5, /* scalar_fp_stmt_cost */
4, /* scalar_load_cost */
1, /* scalar_store_cost */
2, /* cond_taken_branch_cost */
1, /* cond_not_taken_branch_cost */
&thunderx3t110_advsimd_vector_cost, /* advsimd */
nullptr, /* sve */
nullptr /* issue_info */
};
static const advsimd_vec_cost ampere1_advsimd_vector_cost =
{
3, /* int_stmt_cost */
3, /* fp_stmt_cost */
0, /* ld2_st2_permute_cost */
0, /* ld3_st3_permute_cost */
0, /* ld4_st4_permute_cost */
2, /* permute_cost */
12, /* reduc_i8_cost */
9, /* reduc_i16_cost */
6, /* reduc_i32_cost */
5, /* reduc_i64_cost */
9, /* reduc_f16_cost */
6, /* reduc_f32_cost */
5, /* reduc_f64_cost */
8, /* store_elt_extra_cost */
6, /* vec_to_scalar_cost */
7, /* scalar_to_vec_cost */
5, /* align_load_cost */
5, /* unalign_load_cost */
2, /* unalign_store_cost */
2 /* store_cost */
};
/* Ampere-1 costs for vector insn classes. */
static const struct cpu_vector_cost ampere1_vector_cost =
{
1, /* scalar_int_stmt_cost */
1, /* scalar_fp_stmt_cost */
4, /* scalar_load_cost */
1, /* scalar_store_cost */
1, /* cond_taken_branch_cost */
1, /* cond_not_taken_branch_cost */
&ampere1_advsimd_vector_cost, /* advsimd */
nullptr, /* sve */
nullptr /* issue_info */
};
/* Generic costs for branch instructions. */
static const struct cpu_branch_cost generic_branch_cost =
{
1, /* Predictable. */
3 /* Unpredictable. */
};
/* Generic approximation modes. */
static const cpu_approx_modes generic_approx_modes =
{
AARCH64_APPROX_NONE, /* division */
AARCH64_APPROX_NONE, /* sqrt */
AARCH64_APPROX_NONE /* recip_sqrt */
};
/* Approximation modes for Exynos M1. */
static const cpu_approx_modes exynosm1_approx_modes =
{
AARCH64_APPROX_NONE, /* division */
AARCH64_APPROX_ALL, /* sqrt */
AARCH64_APPROX_ALL /* recip_sqrt */
};
/* Approximation modes for X-Gene 1. */
static const cpu_approx_modes xgene1_approx_modes =
{
AARCH64_APPROX_NONE, /* division */
AARCH64_APPROX_NONE, /* sqrt */
AARCH64_APPROX_ALL /* recip_sqrt */
};
/* Generic prefetch settings (which disable prefetch). */
static const cpu_prefetch_tune generic_prefetch_tune =
{
0, /* num_slots */
-1, /* l1_cache_size */
-1, /* l1_cache_line_size */
-1, /* l2_cache_size */
true, /* prefetch_dynamic_strides */
-1, /* minimum_stride */
-1 /* default_opt_level */
};
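/* In these prefetch structures, a cache value of -1 means "no information":
   the corresponding --param (l1-cache-size, l1-cache-line-size,
   l2-cache-size, ...) is left at its default rather than being overridden,
   and a default_opt_level of -1 means software prefetching is not enabled
   by default at any optimization level.  */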
static const cpu_prefetch_tune exynosm1_prefetch_tune =
{
0, /* num_slots */
-1, /* l1_cache_size */
64, /* l1_cache_line_size */
-1, /* l2_cache_size */
true, /* prefetch_dynamic_strides */
-1, /* minimum_stride */
-1 /* default_opt_level */
};
static const cpu_prefetch_tune qdf24xx_prefetch_tune =
{
4, /* num_slots */
32, /* l1_cache_size */
64, /* l1_cache_line_size */
512, /* l2_cache_size */
false, /* prefetch_dynamic_strides */
2048, /* minimum_stride */
3 /* default_opt_level */
};
static const cpu_prefetch_tune thunderxt88_prefetch_tune =
{
8, /* num_slots */
32, /* l1_cache_size */
128, /* l1_cache_line_size */
16*1024, /* l2_cache_size */
true, /* prefetch_dynamic_strides */
-1, /* minimum_stride */
3 /* default_opt_level */
};
static const cpu_prefetch_tune thunderx_prefetch_tune =
{
8, /* num_slots */
32, /* l1_cache_size */
128, /* l1_cache_line_size */
-1, /* l2_cache_size */
true, /* prefetch_dynamic_strides */
-1, /* minimum_stride */
-1 /* default_opt_level */
};
static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
{
8, /* num_slots */
32, /* l1_cache_size */
64, /* l1_cache_line_size */
256, /* l2_cache_size */
true, /* prefetch_dynamic_strides */
-1, /* minimum_stride */
-1 /* default_opt_level */
};
static const cpu_prefetch_tune thunderx3t110_prefetch_tune =
{
8, /* num_slots */
32, /* l1_cache_size */
64, /* l1_cache_line_size */
256, /* l2_cache_size */
true, /* prefetch_dynamic_strides */
-1, /* minimum_stride */
-1 /* default_opt_level */
};
static const cpu_prefetch_tune tsv110_prefetch_tune =
{
0, /* num_slots */
64, /* l1_cache_size */
64, /* l1_cache_line_size */
512, /* l2_cache_size */
true, /* prefetch_dynamic_strides */
-1, /* minimum_stride */
-1 /* default_opt_level */
};
static const cpu_prefetch_tune xgene1_prefetch_tune =
{
8, /* num_slots */
32, /* l1_cache_size */
64, /* l1_cache_line_size */
256, /* l2_cache_size */
true, /* prefetch_dynamic_strides */
-1, /* minimum_stride */
-1 /* default_opt_level */
};
static const cpu_prefetch_tune a64fx_prefetch_tune =
{
8, /* num_slots */
64, /* l1_cache_size */
256, /* l1_cache_line_size */
32768, /* l2_cache_size */
true, /* prefetch_dynamic_strides */
-1, /* minimum_stride */
-1 /* default_opt_level */
};
static const cpu_prefetch_tune ampere1_prefetch_tune =
{
0, /* num_slots */
64, /* l1_cache_size */
64, /* l1_cache_line_size */
2048, /* l2_cache_size */
true, /* prefetch_dynamic_strides */
-1, /* minimum_stride */
-1 /* default_opt_level */
};
static const struct tune_params generic_tunings =
{
&cortexa57_extra_costs,
&generic_addrcost_table,
&generic_regmove_cost,
&generic_vector_cost,
&generic_branch_cost,
&generic_approx_modes,
SVE_NOT_IMPLEMENTED, /* sve_width */
{ 4, /* load_int. */
4, /* store_int. */
4, /* load_fp. */
4, /* store_fp. */
4, /* load_pred. */
4 /* store_pred. */
}, /* memmov_cost. */
2, /* issue_rate */
(AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
"16:12", /* function_align. */
"4", /* jump_align. */
"8", /* loop_align. */
2, /* int_reassoc_width. */
4, /* fp_reassoc_width. */
1, /* fma_reassoc_width. */
1, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
0, /* max_case_values. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
/* Enabling AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS significantly benefits
Neoverse V1. It does not have a noticeable effect on A64FX and should
have at most a very minor effect on SVE2 cores. */
(AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS), /* tune_flags. */
&generic_prefetch_tune
};
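/* Note that the alignment strings above use the same N[:M] syntax as
   -falign-functions and friends: "16:12", for instance, requests 16-byte
   alignment but only if it can be reached by skipping at most 12 bytes.  */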
static const struct tune_params cortexa35_tunings =
{
&cortexa53_extra_costs,
&generic_addrcost_table,
&cortexa53_regmove_cost,
&generic_vector_cost,
&generic_branch_cost,
&generic_approx_modes,
SVE_NOT_IMPLEMENTED, /* sve_width */
{ 4, /* load_int. */
4, /* store_int. */
4, /* load_fp. */
4, /* store_fp. */
4, /* load_pred. */
4 /* store_pred. */
}, /* memmov_cost. */
1, /* issue_rate */
(AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
| AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
"16", /* function_align. */
"4", /* jump_align. */
"8", /* loop_align. */
2, /* int_reassoc_width. */
4, /* fp_reassoc_width. */
1, /* fma_reassoc_width. */
1, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
0, /* max_case_values. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
&generic_prefetch_tune
};
static const struct tune_params cortexa53_tunings =
{
&cortexa53_extra_costs,
&generic_addrcost_table,
&cortexa53_regmove_cost,
&generic_vector_cost,
&generic_branch_cost,
&generic_approx_modes,
SVE_NOT_IMPLEMENTED, /* sve_width */
{ 4, /* load_int. */
4, /* store_int. */
4, /* load_fp. */
4, /* store_fp. */
4, /* load_pred. */
4 /* store_pred. */
}, /* memmov_cost. */
2, /* issue_rate */
(AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
| AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
"16", /* function_align. */
"4", /* jump_align. */
"8", /* loop_align. */
2, /* int_reassoc_width. */
4, /* fp_reassoc_width. */
1, /* fma_reassoc_width. */
1, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
0, /* max_case_values. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
&generic_prefetch_tune
};
static const struct tune_params cortexa57_tunings =
{
&cortexa57_extra_costs,
&generic_addrcost_table,
&cortexa57_regmove_cost,
&cortexa57_vector_cost,
&generic_branch_cost,
&generic_approx_modes,
SVE_NOT_IMPLEMENTED, /* sve_width */
{ 4, /* load_int. */
4, /* store_int. */
4, /* load_fp. */
4, /* store_fp. */
4, /* load_pred. */
4 /* store_pred. */
}, /* memmov_cost. */
3, /* issue_rate */
(AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
| AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
"16", /* function_align. */
"4", /* jump_align. */
"8", /* loop_align. */
2, /* int_reassoc_width. */
4, /* fp_reassoc_width. */
1, /* fma_reassoc_width. */
1, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
0, /* max_case_values. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
&generic_prefetch_tune
};
static const struct tune_params cortexa72_tunings =
{
&cortexa57_extra_costs,
&generic_addrcost_table,
&cortexa57_regmove_cost,
&cortexa57_vector_cost,
&generic_branch_cost,
&generic_approx_modes,
SVE_NOT_IMPLEMENTED, /* sve_width */
{ 4, /* load_int. */
4, /* store_int. */
4, /* load_fp. */
4, /* store_fp. */
4, /* load_pred. */
4 /* store_pred. */
}, /* memmov_cost. */
3, /* issue_rate */
(AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
| AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
"16", /* function_align. */
"4", /* jump_align. */
"8", /* loop_align. */
2, /* int_reassoc_width. */
4, /* fp_reassoc_width. */
1, /* fma_reassoc_width. */
1, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
0, /* max_case_values. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
&generic_prefetch_tune
};
static const struct tune_params cortexa73_tunings =
{
&cortexa57_extra_costs,
&generic_addrcost_table,
&cortexa57_regmove_cost,
&cortexa57_vector_cost,
&generic_branch_cost,
&generic_approx_modes,
SVE_NOT_IMPLEMENTED, /* sve_width */
{ 4, /* load_int. */
4, /* store_int. */
4, /* load_fp. */
4, /* store_fp. */
4, /* load_pred. */
4 /* store_pred. */
}, /* memmov_cost. */
2, /* issue_rate. */
(AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
| AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
"16", /* function_align. */
"4", /* jump_align. */
"8", /* loop_align. */
2, /* int_reassoc_width. */
4, /* fp_reassoc_width. */
1, /* fma_reassoc_width. */
1, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
0, /* max_case_values. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
&generic_prefetch_tune
};
static const struct tune_params exynosm1_tunings =
{
&exynosm1_extra_costs,
&exynosm1_addrcost_table,
&exynosm1_regmove_cost,
&exynosm1_vector_cost,
&generic_branch_cost,
&exynosm1_approx_modes,
SVE_NOT_IMPLEMENTED, /* sve_width */
{ 4, /* load_int. */
4, /* store_int. */
4, /* load_fp. */
4, /* store_fp. */
4, /* load_pred. */
4 /* store_pred. */
}, /* memmov_cost. */
3, /* issue_rate */
(AARCH64_FUSE_AES_AESMC), /* fusible_ops */
"4", /* function_align. */
"4", /* jump_align. */
"4", /* loop_align. */
2, /* int_reassoc_width. */
4, /* fp_reassoc_width. */
1, /* fma_reassoc_width. */
1, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
48, /* max_case_values. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
&exynosm1_prefetch_tune
};
static const struct tune_params thunderxt88_tunings =
{
&thunderx_extra_costs,
&generic_addrcost_table,
&thunderx_regmove_cost,
&thunderx_vector_cost,
&generic_branch_cost,
&generic_approx_modes,
SVE_NOT_IMPLEMENTED, /* sve_width */
{ 6, /* load_int. */
6, /* store_int. */
6, /* load_fp. */
6, /* store_fp. */
6, /* load_pred. */
6 /* store_pred. */
}, /* memmov_cost. */
2, /* issue_rate */
AARCH64_FUSE_ALU_BRANCH, /* fusible_ops */
"8", /* function_align. */
"8", /* jump_align. */
"8", /* loop_align. */
2, /* int_reassoc_width. */
4, /* fp_reassoc_width. */
1, /* fma_reassoc_width. */
1, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
0, /* max_case_values. */
tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
&thunderxt88_prefetch_tune
};
static const struct tune_params thunderx_tunings =
{
&thunderx_extra_costs,
&generic_addrcost_table,
&thunderx_regmove_cost,
&thunderx_vector_cost,
&generic_branch_cost,
&generic_approx_modes,
SVE_NOT_IMPLEMENTED, /* sve_width */
{ 6, /* load_int. */
6, /* store_int. */
6, /* load_fp. */
6, /* store_fp. */
6, /* load_pred. */
6 /* store_pred. */
}, /* memmov_cost. */
2, /* issue_rate */
AARCH64_FUSE_ALU_BRANCH, /* fusible_ops */
"8", /* function_align. */
"8", /* jump_align. */
"8", /* loop_align. */
2, /* int_reassoc_width. */
4, /* fp_reassoc_width. */
1, /* fma_reassoc_width. */
1, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
0, /* max_case_values. */
tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
| AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
&thunderx_prefetch_tune
};
static const struct tune_params tsv110_tunings =
{
&tsv110_extra_costs,
&tsv110_addrcost_table,
&tsv110_regmove_cost,
&tsv110_vector_cost,
&generic_branch_cost,
&generic_approx_modes,
SVE_NOT_IMPLEMENTED, /* sve_width */
{ 4, /* load_int. */
4, /* store_int. */
4, /* load_fp. */
4, /* store_fp. */
4, /* load_pred. */
4 /* store_pred. */
}, /* memmov_cost. */
4, /* issue_rate */
(AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_ALU_BRANCH
| AARCH64_FUSE_ALU_CBZ), /* fusible_ops */
"16", /* function_align. */
"4", /* jump_align. */
"8", /* loop_align. */
2, /* int_reassoc_width. */
4, /* fp_reassoc_width. */
1, /* fma_reassoc_width. */
1, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
0, /* max_case_values. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
&tsv110_prefetch_tune
};
static const struct tune_params xgene1_tunings =
{
&xgene1_extra_costs,
&xgene1_addrcost_table,
&xgene1_regmove_cost,
&xgene1_vector_cost,
&generic_branch_cost,
&xgene1_approx_modes,
SVE_NOT_IMPLEMENTED, /* sve_width */
{ 6, /* load_int. */
6, /* store_int. */
6, /* load_fp. */
6, /* store_fp. */
6, /* load_pred. */
6 /* store_pred. */
}, /* memmov_cost. */
4, /* issue_rate */
AARCH64_FUSE_NOTHING, /* fusible_ops */
"16", /* function_align. */
"16", /* jump_align. */
"16", /* loop_align. */
2, /* int_reassoc_width. */
4, /* fp_reassoc_width. */
1, /* fma_reassoc_width. */
1, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
17, /* max_case_values. */
tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
&xgene1_prefetch_tune
};
static const struct tune_params emag_tunings =
{
&xgene1_extra_costs,
&xgene1_addrcost_table,
&xgene1_regmove_cost,
&xgene1_vector_cost,
&generic_branch_cost,
&xgene1_approx_modes,
SVE_NOT_IMPLEMENTED,
{ 6, /* load_int. */
6, /* store_int. */
6, /* load_fp. */
6, /* store_fp. */
6, /* load_pred. */
6 /* store_pred. */
}, /* memmov_cost. */
4, /* issue_rate */
AARCH64_FUSE_NOTHING, /* fusible_ops */
"16", /* function_align. */
"16", /* jump_align. */
"16", /* loop_align. */
2, /* int_reassoc_width. */
4, /* fp_reassoc_width. */
1, /* fma_reassoc_width. */
1, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
17, /* max_case_values. */
tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
&xgene1_prefetch_tune
};
static const struct tune_params qdf24xx_tunings =
{
&qdf24xx_extra_costs,
&qdf24xx_addrcost_table,
&qdf24xx_regmove_cost,
&qdf24xx_vector_cost,
&generic_branch_cost,
&generic_approx_modes,
SVE_NOT_IMPLEMENTED, /* sve_width */
{ 4, /* load_int. */
4, /* store_int. */
4, /* load_fp. */
4, /* store_fp. */
4, /* load_pred. */
4 /* store_pred. */
}, /* memmov_cost. */
4, /* issue_rate */
(AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
| AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
"16", /* function_align. */
"8", /* jump_align. */
"16", /* loop_align. */
2, /* int_reassoc_width. */
4, /* fp_reassoc_width. */
1, /* fma_reassoc_width. */
1, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
0, /* max_case_values. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags. */
&qdf24xx_prefetch_tune
};
/* Tuning structure for the Qualcomm Saphira core. Default to falkor values
for now. */
static const struct tune_params saphira_tunings =
{
&generic_extra_costs,
&generic_addrcost_table,
&generic_regmove_cost,
&generic_vector_cost,
&generic_branch_cost,
&generic_approx_modes,
SVE_NOT_IMPLEMENTED, /* sve_width */
{ 4, /* load_int. */
4, /* store_int. */
4, /* load_fp. */
4, /* store_fp. */
4, /* load_pred. */
4 /* store_pred. */
}, /* memmov_cost. */
4, /* issue_rate */
(AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
| AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
"16", /* function_align. */
"8", /* jump_align. */
"16", /* loop_align. */
2, /* int_reassoc_width. */
4, /* fp_reassoc_width. */
1, /* fma_reassoc_width. */
1, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
0, /* max_case_values. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
&generic_prefetch_tune
};
static const struct tune_params thunderx2t99_tunings =
{
&thunderx2t99_extra_costs,
&thunderx2t99_addrcost_table,
&thunderx2t99_regmove_cost,
&thunderx2t99_vector_cost,
&generic_branch_cost,
&generic_approx_modes,
SVE_NOT_IMPLEMENTED, /* sve_width */
{ 4, /* load_int. */
4, /* store_int. */
4, /* load_fp. */
4, /* store_fp. */
4, /* load_pred. */
4 /* store_pred. */
}, /* memmov_cost. */
4, /* issue_rate. */
(AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_ALU_CBZ), /* fusible_ops */
"16", /* function_align. */
"8", /* jump_align. */
"16", /* loop_align. */
3, /* int_reassoc_width. */
2, /* fp_reassoc_width. */
1, /* fma_reassoc_width. */
2, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
0, /* max_case_values. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
&thunderx2t99_prefetch_tune
};
static const struct tune_params thunderx3t110_tunings =
{
&thunderx3t110_extra_costs,
&thunderx3t110_addrcost_table,
&thunderx3t110_regmove_cost,
&thunderx3t110_vector_cost,
&generic_branch_cost,
&generic_approx_modes,
SVE_NOT_IMPLEMENTED, /* sve_width */
{ 4, /* load_int. */
4, /* store_int. */
4, /* load_fp. */
4, /* store_fp. */
4, /* load_pred. */
4 /* store_pred. */
}, /* memmov_cost. */
6, /* issue_rate. */
(AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_ALU_CBZ), /* fusible_ops */
"16", /* function_align. */
"8", /* jump_align. */
"16", /* loop_align. */
3, /* int_reassoc_width. */
2, /* fp_reassoc_width. */
1, /* fma_reassoc_width. */
2, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
0, /* max_case_values. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
&thunderx3t110_prefetch_tune
};
static const struct tune_params neoversen1_tunings =
{
&cortexa76_extra_costs,
&generic_addrcost_table,
&generic_regmove_cost,
&cortexa57_vector_cost,
&generic_branch_cost,
&generic_approx_modes,
SVE_NOT_IMPLEMENTED, /* sve_width */
{ 4, /* load_int. */
2, /* store_int. */
5, /* load_fp. */
2, /* store_fp. */
4, /* load_pred. */
4 /* store_pred. */
}, /* memmov_cost. */
3, /* issue_rate */
(AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
"32:16", /* function_align. */
"4", /* jump_align. */
"32:16", /* loop_align. */
2, /* int_reassoc_width. */
4, /* fp_reassoc_width. */
1, /* fma_reassoc_width. */
2, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
0, /* max_case_values. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
&generic_prefetch_tune
};
static const struct tune_params ampere1_tunings =
{
&ampere1_extra_costs,
&generic_addrcost_table,
&generic_regmove_cost,
&ampere1_vector_cost,
&generic_branch_cost,
&generic_approx_modes,
SVE_NOT_IMPLEMENTED, /* sve_width */
{ 4, /* load_int. */
4, /* store_int. */
4, /* load_fp. */
4, /* store_fp. */
4, /* load_pred. */
4 /* store_pred. */
}, /* memmov_cost. */
4, /* issue_rate */
(AARCH64_FUSE_ADRP_ADD | AARCH64_FUSE_AES_AESMC |
AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_MOVK_MOVK |
AARCH64_FUSE_ALU_BRANCH /* adds, ands, bics, ccmp, ccmn */ |
AARCH64_FUSE_CMP_BRANCH),
/* fusible_ops */
"32", /* function_align. */
"4", /* jump_align. */
"32:16", /* loop_align. */
2, /* int_reassoc_width. */
4, /* fp_reassoc_width. */
1, /* fma_reassoc_width. */
2, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
0, /* max_case_values. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
&ampere1_prefetch_tune
};
static const struct tune_params ampere1a_tunings =
{
&ampere1a_extra_costs,
&generic_addrcost_table,
&generic_regmove_cost,
&ampere1_vector_cost,
&generic_branch_cost,
&generic_approx_modes,
SVE_NOT_IMPLEMENTED, /* sve_width */
{ 4, /* load_int. */
4, /* store_int. */
4, /* load_fp. */
4, /* store_fp. */
4, /* load_pred. */
4 /* store_pred. */
}, /* memmov_cost. */
4, /* issue_rate */
(AARCH64_FUSE_ADRP_ADD | AARCH64_FUSE_AES_AESMC |
AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_MOVK_MOVK |
AARCH64_FUSE_ALU_BRANCH /* adds, ands, bics, ccmp, ccmn */ |
AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_ALU_CBZ |
AARCH64_FUSE_ADDSUB_2REG_CONST1),
/* fusible_ops */
"32", /* function_align. */
"4", /* jump_align. */
"32:16", /* loop_align. */
2, /* int_reassoc_width. */
4, /* fp_reassoc_width. */
1, /* fma_reassoc_width. */
2, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
0, /* max_case_values. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
&ampere1_prefetch_tune
};
static const advsimd_vec_cost neoversev1_advsimd_vector_cost =
{
2, /* int_stmt_cost */
2, /* fp_stmt_cost */
4, /* ld2_st2_permute_cost */
4, /* ld3_st3_permute_cost */
5, /* ld4_st4_permute_cost */
3, /* permute_cost */
4, /* reduc_i8_cost */
4, /* reduc_i16_cost */
2, /* reduc_i32_cost */
2, /* reduc_i64_cost */
6, /* reduc_f16_cost */
3, /* reduc_f32_cost */
2, /* reduc_f64_cost */
2, /* store_elt_extra_cost */
/* This value is just inherited from the Cortex-A57 table. */
8, /* vec_to_scalar_cost */
/* This depends very much on what the scalar value is and
where it comes from. E.g. some constants take two dependent
instructions or a load, while others might be moved from a GPR.
4 seems to be a reasonable compromise in practice. */
4, /* scalar_to_vec_cost */
4, /* align_load_cost */
4, /* unalign_load_cost */
/* Although stores have a latency of 2 and compete for the
vector pipes, in practice it's better not to model that. */
1, /* unalign_store_cost */
1 /* store_cost */
};
static const sve_vec_cost neoversev1_sve_vector_cost =
{
{
2, /* int_stmt_cost */
2, /* fp_stmt_cost */
4, /* ld2_st2_permute_cost */
7, /* ld3_st3_permute_cost */
8, /* ld4_st4_permute_cost */
3, /* permute_cost */
/* Theoretically, a reduction involving 31 scalar ADDs could
complete in ~9 cycles and would have a cost of 31. [SU]ADDV
completes in 14 cycles, so give it a cost of 31 + 5. */
36, /* reduc_i8_cost */
/* Likewise for 15 scalar ADDs (~5 cycles) vs. 12: 15 + 7. */
22, /* reduc_i16_cost */
/* Likewise for 7 scalar ADDs (~3 cycles) vs. 10: 7 + 7. */
14, /* reduc_i32_cost */
/* Likewise for 3 scalar ADDs (~2 cycles) vs. 10: 3 + 8. */
11, /* reduc_i64_cost */
/* Theoretically, a reduction involving 15 scalar FADDs could
complete in ~9 cycles and would have a cost of 30. FADDV
completes in 13 cycles, so give it a cost of 30 + 4. */
34, /* reduc_f16_cost */
/* Likewise for 7 scalar FADDs (~6 cycles) vs. 11: 14 + 5. */
19, /* reduc_f32_cost */
/* Likewise for 3 scalar FADDs (~4 cycles) vs. 9: 6 + 5. */
11, /* reduc_f64_cost */
2, /* store_elt_extra_cost */
/* This value is just inherited from the Cortex-A57 table. */
8, /* vec_to_scalar_cost */
/* See the comment above the Advanced SIMD versions. */
4, /* scalar_to_vec_cost */
4, /* align_load_cost */
4, /* unalign_load_cost */
/* Although stores have a latency of 2 and compete for the
vector pipes, in practice it's better not to model that. */
1, /* unalign_store_cost */
1 /* store_cost */
},
3, /* clast_cost */
19, /* fadda_f16_cost */
11, /* fadda_f32_cost */
8, /* fadda_f64_cost */
32, /* gather_load_x32_cost */
16, /* gather_load_x64_cost */
3 /* scatter_store_elt_cost */
};
static const aarch64_scalar_vec_issue_info neoversev1_scalar_issue_info =
{
3, /* loads_stores_per_cycle */
2, /* stores_per_cycle */
4, /* general_ops_per_cycle */
0, /* fp_simd_load_general_ops */
1 /* fp_simd_store_general_ops */
};
static const aarch64_advsimd_vec_issue_info neoversev1_advsimd_issue_info =
{
{
3, /* loads_stores_per_cycle */
2, /* stores_per_cycle */
4, /* general_ops_per_cycle */
0, /* fp_simd_load_general_ops */
1 /* fp_simd_store_general_ops */
},
2, /* ld2_st2_general_ops */
2, /* ld3_st3_general_ops */
3 /* ld4_st4_general_ops */
};
static const aarch64_sve_vec_issue_info neoversev1_sve_issue_info =
{
{
{
2, /* loads_per_cycle */
2, /* stores_per_cycle */
2, /* general_ops_per_cycle */
0, /* fp_simd_load_general_ops */
1 /* fp_simd_store_general_ops */
},
2, /* ld2_st2_general_ops */
2, /* ld3_st3_general_ops */
3 /* ld4_st4_general_ops */
},
1, /* pred_ops_per_cycle */
2, /* while_pred_ops */
2, /* int_cmp_pred_ops */
1, /* fp_cmp_pred_ops */
1, /* gather_scatter_pair_general_ops */
1 /* gather_scatter_pair_pred_ops */
};
static const aarch64_vec_issue_info neoversev1_vec_issue_info =
{
&neoversev1_scalar_issue_info,
&neoversev1_advsimd_issue_info,
&neoversev1_sve_issue_info
};
/* Neoverse V1 costs for vector insn classes. */
static const struct cpu_vector_cost neoversev1_vector_cost =
{
1, /* scalar_int_stmt_cost */
2, /* scalar_fp_stmt_cost */
4, /* scalar_load_cost */
1, /* scalar_store_cost */
1, /* cond_taken_branch_cost */
1, /* cond_not_taken_branch_cost */
&neoversev1_advsimd_vector_cost, /* advsimd */
&neoversev1_sve_vector_cost, /* sve */
&neoversev1_vec_issue_info /* issue_info */
};
static const struct tune_params neoversev1_tunings =
{
&cortexa76_extra_costs,
&neoversev1_addrcost_table,
&neoversev1_regmove_cost,
&neoversev1_vector_cost,
&generic_branch_cost,
&generic_approx_modes,
SVE_256, /* sve_width */
{ 4, /* load_int. */
2, /* store_int. */
6, /* load_fp. */
2, /* store_fp. */
6, /* load_pred. */
1 /* store_pred. */
}, /* memmov_cost. */
3, /* issue_rate */
(AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
"32:16", /* function_align. */
"4", /* jump_align. */
"32:16", /* loop_align. */
2, /* int_reassoc_width. */
4, /* fp_reassoc_width. */
4, /* fma_reassoc_width. */
2, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
0, /* max_case_values. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
| AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
| AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
| AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
&generic_prefetch_tune
};
static const sve_vec_cost neoverse512tvb_sve_vector_cost =
{
{
2, /* int_stmt_cost */
2, /* fp_stmt_cost */
4, /* ld2_st2_permute_cost */
5, /* ld3_st3_permute_cost */
5, /* ld4_st4_permute_cost */
3, /* permute_cost */
/* Theoretically, a reduction involving 15 scalar ADDs could
complete in ~5 cycles and would have a cost of 15. Assume that
[SU]ADDV completes in 11 cycles and so give it a cost of 15 + 6. */
21, /* reduc_i8_cost */
/* Likewise for 7 scalar ADDs (~3 cycles) vs. 9: 7 + 6. */
13, /* reduc_i16_cost */
/* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 6. */
9, /* reduc_i32_cost */
/* Likewise for 1 scalar ADD (1 cycle) vs. 8: 1 + 7. */
8, /* reduc_i64_cost */
/* Theoretically, a reduction involving 7 scalar FADDs could
complete in ~6 cycles and would have a cost of 14. Assume that
FADDV completes in 8 cycles and so give it a cost of 14 + 2. */
16, /* reduc_f16_cost */
/* Likewise for 3 scalar FADDs (~4 cycles) vs. 6: 6 + 2. */
8, /* reduc_f32_cost */
/* Likewise for 1 scalar FADD (2 cycles) vs. 4: 2 + 2. */
4, /* reduc_f64_cost */
2, /* store_elt_extra_cost */
/* This value is just inherited from the Cortex-A57 table. */
8, /* vec_to_scalar_cost */
/* This depends very much on what the scalar value is and
where it comes from. E.g. some constants take two dependent
instructions or a load, while others might be moved from a GPR.
4 seems to be a reasonable compromise in practice. */
4, /* scalar_to_vec_cost */
4, /* align_load_cost */
4, /* unalign_load_cost */
/* Although stores generally have a latency of 2 and compete for the
vector pipes, in practice it's better not to model that. */
1, /* unalign_store_cost */
1 /* store_cost */
},
3, /* clast_cost */
10, /* fadda_f16_cost */
6, /* fadda_f32_cost */
4, /* fadda_f64_cost */
/* A strided Advanced SIMD x64 load would take two parallel FP loads
(6 cycles) plus an insertion (2 cycles). Assume a 64-bit SVE gather
is 1 cycle more. The Advanced SIMD version is costed as 2 scalar loads
(cost 8) and a vec_construct (cost 2). Add a full vector operation
(cost 2) to that, to avoid the difference being lost in rounding.
There is no easy comparison between a strided Advanced SIMD x32 load
and an SVE 32-bit gather, but cost an SVE 32-bit gather as 1 vector
operation more than a 64-bit gather. */
14, /* gather_load_x32_cost */
12, /* gather_load_x64_cost */
3 /* scatter_store_elt_cost */
};
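/* SVE issue information when tuning for neoverse-512tvb. */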
static const aarch64_sve_vec_issue_info neoverse512tvb_sve_issue_info =
{
{
{
3, /* loads_per_cycle */
2, /* stores_per_cycle */
4, /* general_ops_per_cycle */
0, /* fp_simd_load_general_ops */
1 /* fp_simd_store_general_ops */
},
2, /* ld2_st2_general_ops */
2, /* ld3_st3_general_ops */
3 /* ld4_st4_general_ops */
},
2, /* pred_ops_per_cycle */
2, /* while_pred_ops */
2, /* int_cmp_pred_ops */
1, /* fp_cmp_pred_ops */
1, /* gather_scatter_pair_general_ops */
1 /* gather_scatter_pair_pred_ops */
};
static const aarch64_vec_issue_info neoverse512tvb_vec_issue_info =
{
&neoversev1_scalar_issue_info,
&neoversev1_advsimd_issue_info,
&neoverse512tvb_sve_issue_info
};
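/* Neoverse 512TVB costs for vector insn classes. */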
static const struct cpu_vector_cost neoverse512tvb_vector_cost =
{
1, /* scalar_int_stmt_cost */
2, /* scalar_fp_stmt_cost */
4, /* scalar_load_cost */
1, /* scalar_store_cost */
1, /* cond_taken_branch_cost */
1, /* cond_not_taken_branch_cost */
&neoversev1_advsimd_vector_cost, /* advsimd */
&neoverse512tvb_sve_vector_cost, /* sve */
&neoverse512tvb_vec_issue_info /* issue_info */
};
static const struct tune_params neoverse512tvb_tunings =
{
&cortexa76_extra_costs,
&neoversev1_addrcost_table,
&neoversev1_regmove_cost,
&neoverse512tvb_vector_cost,
&generic_branch_cost,
&generic_approx_modes,
SVE_128 | SVE_256, /* sve_width */
{ 4, /* load_int. */
2, /* store_int. */
6, /* load_fp. */
2, /* store_fp. */
6, /* load_pred. */
1 /* store_pred. */
}, /* memmov_cost. */
3, /* issue_rate */
(AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
"32:16", /* function_align. */
"4", /* jump_align. */
"32:16", /* loop_align. */
2, /* int_reassoc_width. */
4, /* fp_reassoc_width. */
4, /* fma_reassoc_width. */
2, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
0, /* max_case_values. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
| AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
| AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT), /* tune_flags. */
&generic_prefetch_tune
};
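/* Advanced SIMD costs for vector insn classes when tuning for Neoverse N2. */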
static const advsimd_vec_cost neoversen2_advsimd_vector_cost =
{
2, /* int_stmt_cost */
2, /* fp_stmt_cost */
2, /* ld2_st2_permute_cost */
2, /* ld3_st3_permute_cost */
3, /* ld4_st4_permute_cost */
3, /* permute_cost */
4, /* reduc_i8_cost */
4, /* reduc_i16_cost */
2, /* reduc_i32_cost */
2, /* reduc_i64_cost */
6, /* reduc_f16_cost */
4, /* reduc_f32_cost */
2, /* reduc_f64_cost */
2, /* store_elt_extra_cost */
/* This value is just inherited from the Cortex-A57 table. */
8, /* vec_to_scalar_cost */
/* This depends very much on what the scalar value is and
where it comes from. E.g. some constants take two dependent
instructions or a load, while others might be moved from a GPR.
4 seems to be a reasonable compromise in practice. */
4, /* scalar_to_vec_cost */
4, /* align_load_cost */
4, /* unalign_load_cost */
/* Although stores have a latency of 2 and compete for the
vector pipes, in practice it's better not to model that. */
1, /* unalign_store_cost */
1 /* store_cost */
};
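/* SVE costs for vector insn classes when tuning for Neoverse N2. */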
static const sve_vec_cost neoversen2_sve_vector_cost =
{
{
2, /* int_stmt_cost */
2, /* fp_stmt_cost */
3, /* ld2_st2_permute_cost */
4, /* ld3_st3_permute_cost */
4, /* ld4_st4_permute_cost */
3, /* permute_cost */
/* Theoretically, a reduction involving 15 scalar ADDs could
complete in ~5 cycles and would have a cost of 15. [SU]ADDV
completes in 11 cycles, so give it a cost of 15 + 6. */
21, /* reduc_i8_cost */
/* Likewise for 7 scalar ADDs (~3 cycles) vs. 9: 7 + 6. */
13, /* reduc_i16_cost */
/* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 6. */
9, /* reduc_i32_cost */
/* Likewise for 1 scalar ADD (~1 cycle) vs. 2: 1 + 1. */
2, /* reduc_i64_cost */
/* Theoretically, a reduction involving 7 scalar FADDs could
complete in ~8 cycles and would have a cost of 14. FADDV
completes in 6 cycles, so give it a cost of 14 - 2. */
12, /* reduc_f16_cost */
/* Likewise for 3 scalar FADDs (~4 cycles) vs. 4: 6 - 0. */
6, /* reduc_f32_cost */
/* Likewise for 1 scalar FADD (~2 cycles) vs. 2: 2 - 0. */
2, /* reduc_f64_cost */
2, /* store_elt_extra_cost */
/* This value is just inherited from the Cortex-A57 table. */
8, /* vec_to_scalar_cost */
/* See the comment above the Advanced SIMD versions. */
4, /* scalar_to_vec_cost */
4, /* align_load_cost */
4, /* unalign_load_cost */
/* Although stores have a latency of 2 and compete for the
vector pipes, in practice it's better not to model that. */
1, /* unalign_store_cost */
1 /* store_cost */
},
3, /* clast_cost */
10, /* fadda_f16_cost */
6, /* fadda_f32_cost */
4, /* fadda_f64_cost */
/* A strided Advanced SIMD x64 load would take two parallel FP loads
(8 cycles) plus an insertion (2 cycles). Assume a 64-bit SVE gather
is 1 cycle more. The Advanced SIMD version is costed as 2 scalar loads
(cost 8) and a vec_construct (cost 2). Add a full vector operation
(cost 2) to that, to avoid the difference being lost in rounding.
There is no easy comparison between a strided Advanced SIMD x32 load
and an SVE 32-bit gather, but cost an SVE 32-bit gather as 1 vector
operation more than a 64-bit gather. */
14, /* gather_load_x32_cost */
12, /* gather_load_x64_cost */
3 /* scatter_store_elt_cost */
};
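/* Scalar, Advanced SIMD and SVE issue information when tuning for
Neoverse N2. */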
static const aarch64_scalar_vec_issue_info neoversen2_scalar_issue_info =
{
3, /* loads_stores_per_cycle */
2, /* stores_per_cycle */
4, /* general_ops_per_cycle */
0, /* fp_simd_load_general_ops */
1 /* fp_simd_store_general_ops */
};
static const aarch64_advsimd_vec_issue_info neoversen2_advsimd_issue_info =
{
{
3, /* loads_stores_per_cycle */
2, /* stores_per_cycle */
2, /* general_ops_per_cycle */
0, /* fp_simd_load_general_ops */
1 /* fp_simd_store_general_ops */
},
2, /* ld2_st2_general_ops */
2, /* ld3_st3_general_ops */
3 /* ld4_st4_general_ops */
};
static const aarch64_sve_vec_issue_info neoversen2_sve_issue_info =
{
{
{
3, /* loads_per_cycle */
2, /* stores_per_cycle */
2, /* general_ops_per_cycle */
0, /* fp_simd_load_general_ops */
1 /* fp_simd_store_general_ops */
},
2, /* ld2_st2_general_ops */
3, /* ld3_st3_general_ops */
3 /* ld4_st4_general_ops */
},
2, /* pred_ops_per_cycle */
2, /* while_pred_ops */
2, /* int_cmp_pred_ops */
1, /* fp_cmp_pred_ops */
1, /* gather_scatter_pair_general_ops */
1 /* gather_scatter_pair_pred_ops */
};
static const aarch64_vec_issue_info neoversen2_vec_issue_info =
{
&neoversen2_scalar_issue_info,
&neoversen2_advsimd_issue_info,
&neoversen2_sve_issue_info
};
/* Neoverse N2 costs for vector insn classes. */
static const struct cpu_vector_cost neoversen2_vector_cost =
{
1, /* scalar_int_stmt_cost */
2, /* scalar_fp_stmt_cost */
4, /* scalar_load_cost */
1, /* scalar_store_cost */
1, /* cond_taken_branch_cost */
1, /* cond_not_taken_branch_cost */
&neoversen2_advsimd_vector_cost, /* advsimd */
&neoversen2_sve_vector_cost, /* sve */
&neoversen2_vec_issue_info /* issue_info */
};
static const struct tune_params neoversen2_tunings =
{
&cortexa76_extra_costs,
&neoversen2_addrcost_table,
&neoversen2_regmove_cost,
&neoversen2_vector_cost,
&generic_branch_cost,
&generic_approx_modes,
SVE_128, /* sve_width */
{ 4, /* load_int. */
1, /* store_int. */
6, /* load_fp. */
2, /* store_fp. */
6, /* load_pred. */
1 /* store_pred. */
}, /* memmov_cost. */
3, /* issue_rate */
(AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
"32:16", /* function_align. */
"4", /* jump_align. */
"32:16", /* loop_align. */
2, /* int_reassoc_width. */
4, /* fp_reassoc_width. */
1, /* fma_reassoc_width. */
2, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
0, /* max_case_values. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
| AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
| AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
| AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT), /* tune_flags. */
&generic_prefetch_tune
};
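/* Advanced SIMD costs for vector insn classes when tuning for Neoverse V2. */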
static const advsimd_vec_cost neoversev2_advsimd_vector_cost =
{
2, /* int_stmt_cost */
2, /* fp_stmt_cost */
2, /* ld2_st2_permute_cost */
2, /* ld3_st3_permute_cost */
3, /* ld4_st4_permute_cost */
3, /* permute_cost */
4, /* reduc_i8_cost */
4, /* reduc_i16_cost */
2, /* reduc_i32_cost */
2, /* reduc_i64_cost */
6, /* reduc_f16_cost */
3, /* reduc_f32_cost */
2, /* reduc_f64_cost */
2, /* store_elt_extra_cost */
/* This value is just inherited from the Cortex-A57 table. */
8, /* vec_to_scalar_cost */
/* This depends very much on what the scalar value is and
where it comes from. E.g. some constants take two dependent
instructions or a load, while others might be moved from a GPR.
4 seems to be a reasonable compromise in practice. */
4, /* scalar_to_vec_cost */
4, /* align_load_cost */
4, /* unalign_load_cost */
/* Although stores have a latency of 2 and compete for the
vector pipes, in practice it's better not to model that. */
1, /* unalign_store_cost */
1 /* store_cost */
};
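/* SVE costs for vector insn classes when tuning for Neoverse V2. */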
static const sve_vec_cost neoversev2_sve_vector_cost =
{
{
2, /* int_stmt_cost */
2, /* fp_stmt_cost */
3, /* ld2_st2_permute_cost */
3, /* ld3_st3_permute_cost */
4, /* ld4_st4_permute_cost */
3, /* permute_cost */
/* Theoretically, a reduction involving 15 scalar ADDs could
complete in ~3 cycles and would have a cost of 15. [SU]ADDV
completes in 11 cycles, so give it a cost of 15 + 8. */
21, /* reduc_i8_cost */
/* Likewise for 7 scalar ADDs (~2 cycles) vs. 9: 7 + 7. */
14, /* reduc_i16_cost */
/* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 4. */
7, /* reduc_i32_cost */
/* Likewise for 1 scalar ADD (~1 cycle) vs. 2: 1 + 1. */
2, /* reduc_i64_cost */
/* Theoretically, a reduction involving 7 scalar FADDs could
complete in ~6 cycles and would have a cost of 14. FADDV
completes in 8 cycles, so give it a cost of 14 + 2. */
16, /* reduc_f16_cost */
/* Likewise for 3 scalar FADDs (~4 cycles) vs. 6: 6 + 2. */
8, /* reduc_f32_cost */
/* Likewise for 1 scalar FADD (~2 cycles) vs. 4: 2 + 2. */
4, /* reduc_f64_cost */
2, /* store_elt_extra_cost */
/* This value is just inherited from the Cortex-A57 table. */
8, /* vec_to_scalar_cost */
/* See the comment above the Advanced SIMD versions. */
4, /* scalar_to_vec_cost */
4, /* align_load_cost */
4, /* unalign_load_cost */
/* Although stores have a latency of 2 and compete for the
vector pipes, in practice it's better not to model that. */
1, /* unalign_store_cost */
1 /* store_cost */
},
3, /* clast_cost */
10, /* fadda_f16_cost */
6, /* fadda_f32_cost */
4, /* fadda_f64_cost */
/* A strided Advanced SIMD x64 load would take two parallel FP loads
(8 cycles) plus an insertion (2 cycles). Assume a 64-bit SVE gather
is 1 cycle more. The Advanced SIMD version is costed as 2 scalar loads
(cost 8) and a vec_construct (cost 2). Add a full vector operation
(cost 2) to that, to avoid the difference being lost in rounding.
There is no easy comparison between a strided Advanced SIMD x32 load
and an SVE 32-bit gather, but cost an SVE 32-bit gather as 1 vector
operation more than a 64-bit gather. */
14, /* gather_load_x32_cost */
12, /* gather_load_x64_cost */
3 /* scatter_store_elt_cost */
};
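/* Scalar, Advanced SIMD and SVE issue information when tuning for
Neoverse V2. */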
static const aarch64_scalar_vec_issue_info neoversev2_scalar_issue_info =
{
3, /* loads_stores_per_cycle */
2, /* stores_per_cycle */
6, /* general_ops_per_cycle */
0, /* fp_simd_load_general_ops */
1 /* fp_simd_store_general_ops */
};
static const aarch64_advsimd_vec_issue_info neoversev2_advsimd_issue_info =
{
{
3, /* loads_stores_per_cycle */
2, /* stores_per_cycle */
4, /* general_ops_per_cycle */
0, /* fp_simd_load_general_ops */
1 /* fp_simd_store_general_ops */
},
2, /* ld2_st2_general_ops */
2, /* ld3_st3_general_ops */
3 /* ld4_st4_general_ops */
};
static const aarch64_sve_vec_issue_info neoversev2_sve_issue_info =
{
{
{
3, /* loads_per_cycle */
2, /* stores_per_cycle */
4, /* general_ops_per_cycle */
0, /* fp_simd_load_general_ops */
1 /* fp_simd_store_general_ops */
},
2, /* ld2_st2_general_ops */
3, /* ld3_st3_general_ops */
3 /* ld4_st4_general_ops */
},
2, /* pred_ops_per_cycle */
2, /* while_pred_ops */
2, /* int_cmp_pred_ops */
1, /* fp_cmp_pred_ops */
1, /* gather_scatter_pair_general_ops */
1 /* gather_scatter_pair_pred_ops */
};
static const aarch64_vec_issue_info neoversev2_vec_issue_info =
{
&neoversev2_scalar_issue_info,
&neoversev2_advsimd_issue_info,
&neoversev2_sve_issue_info
};
/* Neoverse V2 costs for vector insn classes. */
static const struct cpu_vector_cost neoversev2_vector_cost =
{
1, /* scalar_int_stmt_cost */
2, /* scalar_fp_stmt_cost */
4, /* scalar_load_cost */
1, /* scalar_store_cost */
1, /* cond_taken_branch_cost */
1, /* cond_not_taken_branch_cost */
&neoversev2_advsimd_vector_cost, /* advsimd */
&neoversev2_sve_vector_cost, /* sve */
&neoversev2_vec_issue_info /* issue_info */
};
static const struct tune_params neoversev2_tunings =
{
&cortexa76_extra_costs,
&neoversev2_addrcost_table,
&neoversev2_regmove_cost,
&neoversev2_vector_cost,
&generic_branch_cost,
&generic_approx_modes,
SVE_128, /* sve_width */
{ 4, /* load_int. */
2, /* store_int. */
6, /* load_fp. */
1, /* store_fp. */
6, /* load_pred. */
2 /* store_pred. */
}, /* memmov_cost. */
5, /* issue_rate */
(AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
"32:16", /* function_align. */
"4", /* jump_align. */
"32:16", /* loop_align. */
3, /* int_reassoc_width. */
6, /* fp_reassoc_width. */
4, /* fma_reassoc_width. */
3, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
0, /* max_case_values. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
| AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
| AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
| AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT), /* tune_flags. */
&generic_prefetch_tune
};
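/* Tuning parameters for the Fujitsu A64FX. */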
static const struct tune_params a64fx_tunings =
{
&a64fx_extra_costs,
&a64fx_addrcost_table,
&a64fx_regmove_cost,
&a64fx_vector_cost,
&generic_branch_cost,
&generic_approx_modes,
SVE_512, /* sve_width */
{ 4, /* load_int. */
4, /* store_int. */
4, /* load_fp. */
4, /* store_fp. */
4, /* load_pred. */
4 /* store_pred. */
}, /* memmov_cost. */
7, /* issue_rate */
(AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
"32", /* function_align. */
"16", /* jump_align. */
"32", /* loop_align. */
4, /* int_reassoc_width. */
2, /* fp_reassoc_width. */
1, /* fma_reassoc_width. */
2, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
0, /* max_case_values. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
&a64fx_prefetch_tune
};
/* Support for fine-grained override of the tuning structures. */
struct aarch64_tuning_override_function
{
const char* name;
void (*parse_override)(const char*, struct tune_params*);
};
static void aarch64_parse_fuse_string (const char*, struct tune_params*);
static void aarch64_parse_tune_string (const char*, struct tune_params*);
static void aarch64_parse_sve_width_string (const char*, struct tune_params*);
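/* The recognized -moverride= keywords and the functions used to parse
their arguments. */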
static const struct aarch64_tuning_override_function
aarch64_tuning_override_functions[] =
{
{ "fuse", aarch64_parse_fuse_string },
{ "tune", aarch64_parse_tune_string },
{ "sve_width", aarch64_parse_sve_width_string },
{ NULL, NULL }
};
/* A processor implementing AArch64. */
struct processor
{
const char *name;
aarch64_processor ident;
aarch64_processor sched_core;
aarch64_arch arch;
aarch64_feature_flags flags;
const tune_params *tune;
};
/* Architectures implementing AArch64. */
static CONSTEXPR const processor all_architectures[] =
{
#define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, D, E) \
{NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, \
feature_deps::ARCH_IDENT ().enable, NULL},
#include "aarch64-arches.def"
{NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, NULL}
};
/* Processor cores implementing AArch64. */
static const struct processor all_cores[] =
{
#define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, E, COSTS, G, H, I) \
{NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
feature_deps::cpu_##IDENT, &COSTS##_tunings},
#include "aarch64-cores.def"
{"generic", generic, cortexa53, AARCH64_ARCH_V8A,
feature_deps::V8A ().enable, &generic_tunings},
{NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, NULL}
};
/* The current tuning set. */
struct tune_params aarch64_tune_params = generic_tunings;
/* Check whether an 'aarch64_vector_pcs' attribute is valid. */
static tree
handle_aarch64_vector_pcs_attribute (tree *node, tree name, tree,
int, bool *no_add_attrs)
{
/* Since we set fn_type_req to true, the caller should have checked
this for us. */
gcc_assert (FUNC_OR_METHOD_TYPE_P (*node));
switch ((arm_pcs) fntype_abi (*node).id ())
{
case ARM_PCS_AAPCS64:
case ARM_PCS_SIMD:
return NULL_TREE;
case ARM_PCS_SVE:
error ("the %qE attribute cannot be applied to an SVE function type",
name);
*no_add_attrs = true;
return NULL_TREE;
case ARM_PCS_TLSDESC:
case ARM_PCS_UNKNOWN:
break;
}
gcc_unreachable ();
}
/* Table of machine attributes. */
static const struct attribute_spec aarch64_attribute_table[] =
{
/* { name, min_len, max_len, decl_req, type_req, fn_type_req,
affects_type_identity, handler, exclude } */
{ "aarch64_vector_pcs", 0, 0, false, true, true, true,
handle_aarch64_vector_pcs_attribute, NULL },
{ "arm_sve_vector_bits", 1, 1, false, true, false, true,
aarch64_sve::handle_arm_sve_vector_bits_attribute,
NULL },
{ "Advanced SIMD type", 1, 1, false, true, false, true, NULL, NULL },
{ "SVE type", 3, 3, false, true, false, true, NULL, NULL },
{ "SVE sizeless type", 0, 0, false, true, false, true, NULL, NULL },
{ NULL, 0, 0, false, false, false, false, NULL, NULL }
};
/* An ISA extension in the co-processor and main instruction set space. */
struct aarch64_option_extension
{
const char *const name;
const unsigned long flags_on;
const unsigned long flags_off;
};
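/* The condition codes understood by conditional instructions, listed in
encoding order so that AARCH64_INVERSE_CONDITION_CODE below can invert
a code by flipping its low bit (EQ/NE, CS/CC, and so on). */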
typedef enum aarch64_cond_code
{
AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
}
aarch64_cc;
#define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
struct aarch64_branch_protect_type
{
/* The type's name that the user passes to the branch-protection option
string. */
const char* name;
/* Function to handle the protection type and set global variables.
The first argument is the string token corresponding to this type and the
second argument is the next token in the option string.
Return values:
* AARCH64_PARSE_OK: Handling was successful.
* AARCH64_PARSE_INVALID_ARG: The type is invalid in this context and the
caller should print an error.
* AARCH64_PARSE_INVALID_FEATURE: The type is invalid and the handler
prints its own error. */
enum aarch64_parse_opt_result (*handler)(char*, char*);
/* A list of types that can follow this type in the option string. */
const aarch64_branch_protect_type* subtypes;
unsigned int num_subtypes;
};
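/* Handle the "none" branch-protection type: disable both return-address
signing and BTI. "none" must be the only token in the string, so
complain about any trailing REST. */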
static enum aarch64_parse_opt_result
aarch64_handle_no_branch_protection (char* str, char* rest)
{
aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
aarch64_enable_bti = 0;
if (rest)
{
error ("unexpected %<%s%> after %<%s%>", rest, str);
return AARCH64_PARSE_INVALID_FEATURE;
}
return AARCH64_PARSE_OK;
}
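/* Handle the "standard" branch-protection type: sign the return address
of non-leaf functions with the A key and enable BTI. Like "none", it
cannot be combined with other tokens. */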
static enum aarch64_parse_opt_result
aarch64_handle_standard_branch_protection (char* str, char* rest)
{
aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
aarch64_ra_sign_key = AARCH64_KEY_A;
aarch64_enable_bti = 1;
if (rest)
{
error ("unexpected %<%s%> after %<%s%>", rest, str);
return AARCH64_PARSE_INVALID_FEATURE;
}
return AARCH64_PARSE_OK;
}
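/* Handle the "pac-ret" type: sign the return address of non-leaf
functions with the A key by default; the "leaf" and "b-key" subtypes
below refine this. */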
static enum aarch64_parse_opt_result
aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED,
char* rest ATTRIBUTE_UNUSED)
{
aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
aarch64_ra_sign_key = AARCH64_KEY_A;
return AARCH64_PARSE_OK;
}
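/* Handle the "leaf" subtype of "pac-ret": extend return-address signing
to leaf functions as well. */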
static enum aarch64_parse_opt_result
aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED,
char* rest ATTRIBUTE_UNUSED)
{
aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL;
return AARCH64_PARSE_OK;
}
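/* Handle the "b-key" subtype of "pac-ret": sign return addresses with
the B key instead of the A key. */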
static enum aarch64_parse_opt_result
aarch64_handle_pac_ret_b_key (char* str ATTRIBUTE_UNUSED,
char* rest ATTRIBUTE_UNUSED)
{
aarch64_ra_sign_key = AARCH64_KEY_B;
return AARCH64_PARSE_OK;
}
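/* Handle the "bti" type: enable Branch Target Identification. */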
static enum aarch64_parse_opt_result
aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED,
char* rest ATTRIBUTE_UNUSED)
{
aarch64_enable_bti = 1;
return AARCH64_PARSE_OK;
}
static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = {
{ "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 },
{ "b-key", aarch64_handle_pac_ret_b_key, NULL, 0 },
{ NULL, NULL, NULL, 0 }
};
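/* The top-level branch-protection types accepted by -mbranch-protection=.
Types are separated by '+', e.g. -mbranch-protection=pac-ret+leaf selects
"pac-ret" here and its "leaf" subtype above. */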
static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = {
{ "none", aarch64_handle_no_branch_protection, NULL, 0 },
{ "standard", aarch64_handle_standard_branch_protection, NULL, 0 },
{ "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes,
ARRAY_SIZE (aarch64_pac_ret_subtypes) },
{ "bti", aarch64_handle_bti_protection, NULL, 0 },
{ NULL, NULL, NULL, 0 }
};
/* The condition codes of the processor, and the inverse function. */
static const char * const aarch64_condition_codes[] =
{
"eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
"hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
};
/* The preferred condition codes for SVE conditions. */
static const char *const aarch64_sve_condition_codes[] =
{
"none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
"pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
};
/* Return the assembly token for svpattern value PATTERN. */
static const char *
svpattern_token (enum aarch64_svpattern pattern)
{
switch (pattern)
{
#define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
AARCH64_FOR_SVPATTERN (CASE)
#undef CASE
case AARCH64_NUM_SVPATTERNS:
break;
}
gcc_unreachable ();
}
/* Return the location of a piece that is known to be passed or returned
in registers. FIRST_ZR is the first unused vector argument register
and FIRST_PR is the first unused predicate argument register. */
rtx
pure_scalable_type_info::piece::get_rtx (unsigned int first_zr,
unsigned int first_pr) const
{
gcc_assert (VECTOR_MODE_P (mode)
&& first_zr + num_zr <= V0_REGNUM + NUM_FP_ARG_REGS
&& first_pr + num_pr <= P0_REGNUM + NUM_PR_ARG_REGS);
if (num_zr > 0 && num_pr == 0)
return gen_rtx_REG (mode, first_zr);
if (num_zr == 0 && num_pr == 1)
return gen_rtx_REG (mode, first_pr);
gcc_unreachable ();
}
/* Return the total number of vector registers required by the PST. */
unsigned int
pure_scalable_type_info::num_zr () const
{
unsigned int res = 0;
for (unsigned int i = 0; i < pieces.length (); ++i)
res += pieces[i].num_zr;
return res;
}
/* Return the total number of predicate registers required by the PST. */
unsigned int
pure_scalable_type_info::num_pr () const
{
unsigned int res = 0;
for (unsigned int i = 0; i < pieces.length (); ++i)
res += pieces[i].num_pr;
return res;
}
/* Return the location of a PST that is known to be passed or returned
in registers. FIRST_ZR is the first unused vector argument register
and FIRST_PR is the first unused predicate argument register. */
rtx
pure_scalable_type_info::get_rtx (machine_mode mode,
unsigned int first_zr,
unsigned int first_pr) const
{
/* Try to return a single REG if possible. This leads to better
code generation; it isn't required for correctness. */
if (mode == pieces[0].mode)
{
gcc_assert (pieces.length () == 1);
return pieces[0].get_rtx (first_zr, first_pr);
}
/* Build up a PARALLEL that contains the individual pieces. */
rtvec rtxes = rtvec_alloc (pieces.length ());
for (unsigned int i = 0; i < pieces.length (); ++i)
{
rtx reg = pieces[i].get_rtx (first_zr, first_pr);
rtx offset = gen_int_mode (pieces[i].offset, Pmode);
RTVEC_ELT (rtxes, i) = gen_rtx_EXPR_LIST (VOIDmode, reg, offset);
first_zr += pieces[i].num_zr;
first_pr += pieces[i].num_pr;
}
return gen_rtx_PARALLEL (mode, rtxes);
}
/* Analyze whether TYPE is a Pure Scalable Type according to the rules
in the AAPCS64. */
pure_scalable_type_info::analysis_result
pure_scalable_type_info::analyze (const_tree type)
{
/* Prevent accidental reuse. */
gcc_assert (pieces.is_empty ());
/* No code will be generated for erroneous types, so we won't establish
an ABI mapping. */
if (type == error_mark_node)
return NO_ABI_IDENTITY;
/* Zero-sized types disappear in the language->ABI mapping. */
if (TYPE_SIZE (type) && integer_zerop (TYPE_SIZE (type)))
return NO_ABI_IDENTITY;
/* Check for SVTs, SPTs, and built-in tuple types that map to PSTs. */
piece p = {};
if (aarch64_sve::builtin_type_p (type, &p.num_zr, &p.num_pr))
{
machine_mode mode = TYPE_MODE_RAW (type);
gcc_assert (VECTOR_MODE_P (mode)
&& (!TARGET_SVE || aarch64_sve_mode_p (mode)));
p.mode = p.orig_mode = mode;
add_piece (p);
return IS_PST;
}
/* Check for user-defined PSTs. */
if (TREE_CODE (type) == ARRAY_TYPE)
return analyze_array (type);
if (TREE_CODE (type) == RECORD_TYPE)
return analyze_record (type);
return ISNT_PST;
}
/* Analyze a type that is known not to be passed or returned in memory.
Return true if it has an ABI identity and is a Pure Scalable Type. */
bool
pure_scalable_type_info::analyze_registers (const_tree type)
{
analysis_result result = analyze (type);
gcc_assert (result != DOESNT_MATTER);
return result == IS_PST;
}
/* Subroutine of analyze for handling ARRAY_TYPEs. */
pure_scalable_type_info::analysis_result
pure_scalable_type_info::analyze_array (const_tree type)
{
/* Analyze the element type. */
pure_scalable_type_info element_info;
analysis_result result = element_info.analyze (TREE_TYPE (type));
if (result != IS_PST)
return result;
/* An array of unknown, flexible or variable length will be passed and
returned by reference whatever we do. */
tree nelts_minus_one = array_type_nelts (type);
if (!tree_fits_uhwi_p (nelts_minus_one))
return DOESNT_MATTER;
/* Likewise if the array is constant-sized but too big to be interesting.
The double checks against MAX_PIECES are to protect against overflow. */
unsigned HOST_WIDE_INT count = tree_to_uhwi (nelts_minus_one);
if (count > MAX_PIECES)
return DOESNT_MATTER;
count += 1;
if (count * element_info.pieces.length () > MAX_PIECES)
return DOESNT_MATTER;
/* The above checks should have weeded out elements of unknown size. */
poly_uint64 element_bytes;
if (!poly_int_tree_p (TYPE_SIZE_UNIT (TREE_TYPE (type)), &element_bytes))
gcc_unreachable ();
/* Build up the list of individual vectors and predicates. */
gcc_assert (!element_info.pieces.is_empty ());
for (unsigned int i = 0; i < count; ++i)
for (unsigned int j = 0; j < element_info.pieces.length (); ++j)
{
piece p = element_info.pieces[j];
p.offset += i * element_bytes;
add_piece (p);
}
return IS_PST;
}
/* Subroutine of analyze for handling RECORD_TYPEs. */
pure_scalable_type_info::analysis_result
pure_scalable_type_info::analyze_record (const_tree type)
{
for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
{
if (TREE_CODE (field) != FIELD_DECL)
continue;
/* Zero-sized fields disappear in the language->ABI mapping. */
if (DECL_SIZE (field) && integer_zerop (DECL_SIZE (field)))
continue;
/* All fields with an ABI identity must be PSTs for the record as
a whole to be a PST. If any individual field is too big to be
interesting then the record is too. */
pure_scalable_type_info field_info;
analysis_result subresult = field_info.analyze (TREE_TYPE (field));
if (subresult == NO_ABI_IDENTITY)
continue;
if (subresult != IS_PST)
return subresult;
/* Since all previous fields are PSTs, we ought to be able to track
the field offset using poly_ints. */
tree bitpos = bit_position (field);
gcc_assert (poly_int_tree_p (bitpos));
/* For the same reason, it shouldn't be possible to create a PST field
whose offset isn't byte-aligned. */
poly_widest_int wide_bytepos = exact_div (wi::to_poly_widest (bitpos),
BITS_PER_UNIT);
/* Punt if the record is too big to be interesting. */
poly_uint64 bytepos;
if (!wide_bytepos.to_uhwi (&bytepos)
|| pieces.length () + field_info.pieces.length () > MAX_PIECES)
return DOESNT_MATTER;
/* Add the individual vectors and predicates in the field to the
record's list. */
gcc_assert (!field_info.pieces.is_empty ());
for (unsigned int i = 0; i < field_info.pieces.length (); ++i)
{
piece p = field_info.pieces[i];
p.offset += bytepos;
add_piece (p);
}
}
/* Empty structures disappear in the language->ABI mapping. */
return pieces.is_empty () ? NO_ABI_IDENTITY : IS_PST;
}
/* Add P to the list of pieces in the type. */
void
pure_scalable_type_info::add_piece (const piece &p)
{
/* Try to fold the new piece into the previous one to form a
single-mode PST. For example, if we see three consecutive vectors
of the same mode, we can represent them using the corresponding
3-tuple mode.
This is purely an optimization. */
if (!pieces.is_empty ())
{
piece &prev = pieces.last ();
gcc_assert (VECTOR_MODE_P (p.mode) && VECTOR_MODE_P (prev.mode));
unsigned int nelems1, nelems2;
if (prev.orig_mode == p.orig_mode
&& known_eq (prev.offset + GET_MODE_SIZE (prev.mode), p.offset)
&& constant_multiple_p (GET_MODE_NUNITS (prev.mode),
GET_MODE_NUNITS (p.orig_mode), &nelems1)
&& constant_multiple_p (GET_MODE_NUNITS (p.mode),
GET_MODE_NUNITS (p.orig_mode), &nelems2)
&& targetm.array_mode (p.orig_mode,
nelems1 + nelems2).exists (&prev.mode))
{
prev.num_zr += p.num_zr;
prev.num_pr += p.num_pr;
return;
}
}
pieces.quick_push (p);
}
/* Return true if at least one possible value of type TYPE includes at
least one object of Pure Scalable Type, in the sense of the AAPCS64.
This is a relatively expensive test for some types, so it should
generally be made as late as possible. */
static bool
aarch64_some_values_include_pst_objects_p (const_tree type)
{
if (TYPE_SIZE (type) && integer_zerop (TYPE_SIZE (type)))
return false;
if (aarch64_sve::builtin_type_p (type))
return true;
if (TREE_CODE (type) == ARRAY_TYPE || TREE_CODE (type) == COMPLEX_TYPE)
return aarch64_some_values_include_pst_objects_p (TREE_TYPE (type));
if (RECORD_OR_UNION_TYPE_P (type))
for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
if (TREE_CODE (field) == FIELD_DECL
&& aarch64_some_values_include_pst_objects_p (TREE_TYPE (field)))
return true;
return false;
}
/* Return the descriptor of the SIMD ABI. */
static const predefined_function_abi &
aarch64_simd_abi (void)
{
predefined_function_abi &simd_abi = function_abis[ARM_PCS_SIMD];
if (!simd_abi.initialized_p ())
{
HARD_REG_SET full_reg_clobbers
= default_function_abi.full_reg_clobbers ();
for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
if (FP_SIMD_SAVED_REGNUM_P (regno))
CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
simd_abi.initialize (ARM_PCS_SIMD, full_reg_clobbers);
}
return simd_abi;
}
/* Return the descriptor of the SVE PCS. */
static const predefined_function_abi &
aarch64_sve_abi (void)
{
predefined_function_abi &sve_abi = function_abis[ARM_PCS_SVE];
if (!sve_abi.initialized_p ())
{
HARD_REG_SET full_reg_clobbers
= default_function_abi.full_reg_clobbers ();
for (int regno = V8_REGNUM; regno <= V23_REGNUM; ++regno)
CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
for (int regno = P4_REGNUM; regno <= P15_REGNUM; ++regno)
CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
sve_abi.initialize (ARM_PCS_SVE, full_reg_clobbers);
}
return sve_abi;
}
/* If X is an UNSPEC_SALT_ADDR expression, return the address that it
wraps, otherwise return X itself. */
static rtx
strip_salt (rtx x)
{
rtx search = x;