/* Machine description for AArch64 architecture.
Copyright (C) 2009-2021 Free Software Foundation, Inc.
Contributed by ARM Ltd.
This file is part of GCC.
GCC is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.
GCC is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3. If not see
<http://www.gnu.org/licenses/>. */
#define IN_TARGET_CODE 1
#define INCLUDE_STRING
#define INCLUDE_ALGORITHM
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "memmodel.h"
#include "gimple.h"
#include "cfghooks.h"
#include "cfgloop.h"
#include "df.h"
#include "tm_p.h"
#include "stringpool.h"
#include "attribs.h"
#include "optabs.h"
#include "regs.h"
#include "emit-rtl.h"
#include "recog.h"
#include "cgraph.h"
#include "diagnostic.h"
#include "insn-attr.h"
#include "alias.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "calls.h"
#include "varasm.h"
#include "output.h"
#include "flags.h"
#include "explow.h"
#include "expr.h"
#include "reload.h"
#include "langhooks.h"
#include "opts.h"
#include "gimplify.h"
#include "dwarf2.h"
#include "gimple-iterator.h"
#include "tree-vectorizer.h"
#include "aarch64-cost-tables.h"
#include "dumpfile.h"
#include "builtins.h"
#include "rtl-iter.h"
#include "tm-constrs.h"
#include "sched-int.h"
#include "target-globals.h"
#include "common/common-target.h"
#include "cfgrtl.h"
#include "selftest.h"
#include "selftest-rtl.h"
#include "rtx-vector-builder.h"
#include "intl.h"
#include "expmed.h"
#include "function-abi.h"
#include "gimple-pretty-print.h"
#include "tree-ssa-loop-niter.h"
#include "fractional-cost.h"
#include "rtlanal.h"
/* This file should be included last. */
#include "target-def.h"
/* Defined for convenience. */
#define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
/* Information about a legitimate vector immediate operand. */
struct simd_immediate_info
{
enum insn_type { MOV, MVN, INDEX, PTRUE };
enum modifier_type { LSL, MSL };
simd_immediate_info () {}
simd_immediate_info (scalar_float_mode, rtx);
simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
insn_type = MOV, modifier_type = LSL,
unsigned int = 0);
simd_immediate_info (scalar_mode, rtx, rtx);
simd_immediate_info (scalar_int_mode, aarch64_svpattern);
/* The mode of the elements. */
scalar_mode elt_mode;
/* The instruction to use to move the immediate into a vector. */
insn_type insn;
union
{
/* For MOV and MVN. */
struct
{
/* The value of each element. */
rtx value;
/* The kind of shift modifier to use, and the number of bits to shift.
This is (LSL, 0) if no shift is needed. */
modifier_type modifier;
unsigned int shift;
} mov;
/* For INDEX. */
struct
{
/* The value of the first element and the step to be added for each
subsequent element. */
rtx base, step;
} index;
/* For PTRUE. */
aarch64_svpattern pattern;
} u;
};
/* Construct a floating-point immediate in which each element has mode
ELT_MODE_IN and value VALUE_IN. */
inline simd_immediate_info
::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
: elt_mode (elt_mode_in), insn (MOV)
{
u.mov.value = value_in;
u.mov.modifier = LSL;
u.mov.shift = 0;
}
/* Construct an integer immediate in which each element has mode ELT_MODE_IN
and value VALUE_IN. The other parameters are as for the structure
fields. */
inline simd_immediate_info
::simd_immediate_info (scalar_int_mode elt_mode_in,
unsigned HOST_WIDE_INT value_in,
insn_type insn_in, modifier_type modifier_in,
unsigned int shift_in)
: elt_mode (elt_mode_in), insn (insn_in)
{
u.mov.value = gen_int_mode (value_in, elt_mode_in);
u.mov.modifier = modifier_in;
u.mov.shift = shift_in;
}
/* Construct an integer immediate in which each element has mode ELT_MODE_IN
and where element I is equal to BASE_IN + I * STEP_IN. */
inline simd_immediate_info
::simd_immediate_info (scalar_mode elt_mode_in, rtx base_in, rtx step_in)
: elt_mode (elt_mode_in), insn (INDEX)
{
u.index.base = base_in;
u.index.step = step_in;
}
/* Construct a predicate that controls elements of mode ELT_MODE_IN
and has PTRUE pattern PATTERN_IN. */
inline simd_immediate_info
::simd_immediate_info (scalar_int_mode elt_mode_in,
aarch64_svpattern pattern_in)
: elt_mode (elt_mode_in), insn (PTRUE)
{
u.pattern = pattern_in;
}
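/* The constructors above only describe how a constant could be
   materialized; the instructions themselves are generated elsewhere.
   As a purely illustrative example, a 16-bit element whose inverted
   value fits an 8-bit immediate shifted left by 8 could be described
   as:

     simd_immediate_info info (HImode, 0x55, simd_immediate_info::MVN,
                               simd_immediate_info::LSL, 8);

   i.e. each element equals the bitwise NOT of (0x55 << 8).  The output
   code then chooses the instruction from INFO.insn and its operands
   from INFO.u.  */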
namespace {
/* Describes types that map to Pure Scalable Types (PSTs) in the AAPCS64. */
class pure_scalable_type_info
{
public:
/* Represents the result of analyzing a type. All values are nonzero,
in the possibly forlorn hope that accidental conversions to bool
trigger a warning. */
enum analysis_result
{
/* The type does not have an ABI identity; i.e. it doesn't contain
at least one object whose type is a Fundamental Data Type. */
NO_ABI_IDENTITY = 1,
/* The type is definitely a Pure Scalable Type. */
IS_PST,
/* The type is definitely not a Pure Scalable Type. */
ISNT_PST,
/* It doesn't matter for PCS purposes whether the type is a Pure
Scalable Type or not, since the type will be handled the same
way regardless.
Specifically, this means that if the type is a Pure Scalable Type,
there aren't enough argument registers to hold it, and so it will
need to be passed or returned in memory. If the type isn't a
Pure Scalable Type, it's too big to be passed or returned in core
or SIMD&FP registers, and so again will need to go in memory. */
DOESNT_MATTER
};
/* Aggregates of 17 bytes or more are normally passed and returned
in memory, so aggregates of that size can safely be analyzed as
DOESNT_MATTER. We need to be able to collect enough pieces to
represent a PST that is smaller than that. Since predicates are
2 bytes in size for -msve-vector-bits=128, that means we need to be
able to store at least 8 pieces.
We also need to be able to store enough pieces to represent
a single vector in each vector argument register and a single
predicate in each predicate argument register. This means that
we need at least 12 pieces. */
static const unsigned int MAX_PIECES = NUM_FP_ARG_REGS + NUM_PR_ARG_REGS;
static_assert (MAX_PIECES >= 8, "Need to store at least 8 predicates");
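/* In this port NUM_FP_ARG_REGS is 8 (V0-V7) and NUM_PR_ARG_REGS is 4
   (P0-P3), so MAX_PIECES is 12, which satisfies both of the
   requirements described above.  */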
/* Describes one piece of a PST. Each piece is one of:
- a single Scalable Vector Type (SVT)
- a single Scalable Predicate Type (SPT)
- a PST containing 2, 3 or 4 SVTs, with no padding
It either represents a single built-in type or a PST formed from
multiple homogeneous built-in types. */
struct piece
{
rtx get_rtx (unsigned int, unsigned int) const;
/* The number of vector and predicate registers that the piece
occupies. One of the two is always zero. */
unsigned int num_zr;
unsigned int num_pr;
/* The mode of the registers described above. */
machine_mode mode;
/* If this piece is formed from multiple homogeneous built-in types,
this is the mode of the built-in types, otherwise it is MODE. */
machine_mode orig_mode;
/* The offset in bytes of the piece from the start of the type. */
poly_uint64_pod offset;
};
/* Divides types analyzed as IS_PST into individual pieces. The pieces
are in memory order. */
auto_vec<piece, MAX_PIECES> pieces;
unsigned int num_zr () const;
unsigned int num_pr () const;
rtx get_rtx (machine_mode mode, unsigned int, unsigned int) const;
analysis_result analyze (const_tree);
bool analyze_registers (const_tree);
private:
analysis_result analyze_array (const_tree);
analysis_result analyze_record (const_tree);
void add_piece (const piece &);
};
}
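/* A typical use of pure_scalable_type_info (sketched here for reference;
   the real callers appear later in this file) is roughly:

     pure_scalable_type_info pst_info;
     if (pst_info.analyze (type) == pure_scalable_type_info::IS_PST)
       {
         ... query pst_info.num_zr (), pst_info.num_pr ()
             and walk pst_info.pieces ...
       }

   where TYPE is the tree whose AAPCS64 classification is needed.  */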
/* The current code model. */
enum aarch64_code_model aarch64_cmodel;
/* The number of 64-bit elements in an SVE vector. */
poly_uint16 aarch64_sve_vg;
#ifdef HAVE_AS_TLS
#undef TARGET_HAVE_TLS
#define TARGET_HAVE_TLS 1
#endif
static bool aarch64_composite_type_p (const_tree, machine_mode);
static bool aarch64_return_in_memory_1 (const_tree);
static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
const_tree,
machine_mode *, int *,
bool *, bool);
static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
static void aarch64_override_options_after_change (void);
static bool aarch64_vector_mode_supported_p (machine_mode);
static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
const_tree type,
int misalignment,
bool is_packed);
static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
aarch64_addr_query_type);
static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
/* Major revision number of the ARM Architecture implemented by the target. */
unsigned aarch64_architecture_version;
/* The processor for which instructions should be scheduled. */
enum aarch64_processor aarch64_tune = cortexa53;
/* Mask to specify which instruction scheduling options should be used. */
uint64_t aarch64_tune_flags = 0;
/* Global flag for PC relative loads. */
bool aarch64_pcrelative_literal_loads;
/* Global flag for whether frame pointer is enabled. */
bool aarch64_use_frame_pointer;
#define BRANCH_PROTECT_STR_MAX 255
char *accepted_branch_protection_string = NULL;
static enum aarch64_parse_opt_result
aarch64_parse_branch_protection (const char*, char**);
/* Support for command line parsing of boolean flags in the tuning
structures. */
struct aarch64_flag_desc
{
const char* name;
unsigned int flag;
};
#define AARCH64_FUSION_PAIR(name, internal_name) \
{ name, AARCH64_FUSE_##internal_name },
static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
{
{ "none", AARCH64_FUSE_NOTHING },
#include "aarch64-fusion-pairs.def"
{ "all", AARCH64_FUSE_ALL },
{ NULL, AARCH64_FUSE_NOTHING }
};
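/* As an example of the expansion above, a .def entry such as
   AARCH64_FUSION_PAIR ("mov+movk", MOV_MOVK) becomes the table element
   { "mov+movk", AARCH64_FUSE_MOV_MOVK }, so the strings accepted by
   aarch64_parse_fuse_string (see below) track the .def file
   automatically.  */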
#define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
{ name, AARCH64_EXTRA_TUNE_##internal_name },
static const struct aarch64_flag_desc aarch64_tuning_flags[] =
{
{ "none", AARCH64_EXTRA_TUNE_NONE },
#include "aarch64-tuning-flags.def"
{ "all", AARCH64_EXTRA_TUNE_ALL },
{ NULL, AARCH64_EXTRA_TUNE_NONE }
};
/* Tuning parameters. */
static const struct cpu_addrcost_table generic_addrcost_table =
{
{
1, /* hi */
0, /* si */
0, /* di */
1, /* ti */
},
0, /* pre_modify */
0, /* post_modify */
0, /* post_modify_ld3_st3 */
0, /* post_modify_ld4_st4 */
0, /* register_offset */
0, /* register_sextend */
0, /* register_zextend */
0 /* imm_offset */
};
static const struct cpu_addrcost_table exynosm1_addrcost_table =
{
{
0, /* hi */
0, /* si */
0, /* di */
2, /* ti */
},
0, /* pre_modify */
0, /* post_modify */
0, /* post_modify_ld3_st3 */
0, /* post_modify_ld4_st4 */
1, /* register_offset */
1, /* register_sextend */
2, /* register_zextend */
0, /* imm_offset */
};
static const struct cpu_addrcost_table xgene1_addrcost_table =
{
{
1, /* hi */
0, /* si */
0, /* di */
1, /* ti */
},
1, /* pre_modify */
1, /* post_modify */
1, /* post_modify_ld3_st3 */
1, /* post_modify_ld4_st4 */
0, /* register_offset */
1, /* register_sextend */
1, /* register_zextend */
0, /* imm_offset */
};
static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
{
{
1, /* hi */
1, /* si */
1, /* di */
2, /* ti */
},
0, /* pre_modify */
0, /* post_modify */
0, /* post_modify_ld3_st3 */
0, /* post_modify_ld4_st4 */
2, /* register_offset */
3, /* register_sextend */
3, /* register_zextend */
0, /* imm_offset */
};
static const struct cpu_addrcost_table thunderx3t110_addrcost_table =
{
{
1, /* hi */
1, /* si */
1, /* di */
2, /* ti */
},
0, /* pre_modify */
0, /* post_modify */
0, /* post_modify_ld3_st3 */
0, /* post_modify_ld4_st4 */
2, /* register_offset */
3, /* register_sextend */
3, /* register_zextend */
0, /* imm_offset */
};
static const struct cpu_addrcost_table tsv110_addrcost_table =
{
{
1, /* hi */
0, /* si */
0, /* di */
1, /* ti */
},
0, /* pre_modify */
0, /* post_modify */
0, /* post_modify_ld3_st3 */
0, /* post_modify_ld4_st4 */
0, /* register_offset */
1, /* register_sextend */
1, /* register_zextend */
0, /* imm_offset */
};
static const struct cpu_addrcost_table qdf24xx_addrcost_table =
{
{
1, /* hi */
1, /* si */
1, /* di */
2, /* ti */
},
1, /* pre_modify */
1, /* post_modify */
1, /* post_modify_ld3_st3 */
1, /* post_modify_ld4_st4 */
3, /* register_offset */
3, /* register_sextend */
3, /* register_zextend */
2, /* imm_offset */
};
static const struct cpu_addrcost_table a64fx_addrcost_table =
{
{
1, /* hi */
1, /* si */
1, /* di */
2, /* ti */
},
0, /* pre_modify */
0, /* post_modify */
0, /* post_modify_ld3_st3 */
0, /* post_modify_ld4_st4 */
2, /* register_offset */
3, /* register_sextend */
3, /* register_zextend */
0, /* imm_offset */
};
static const struct cpu_addrcost_table neoversev1_addrcost_table =
{
{
1, /* hi */
0, /* si */
0, /* di */
1, /* ti */
},
0, /* pre_modify */
0, /* post_modify */
3, /* post_modify_ld3_st3 */
3, /* post_modify_ld4_st4 */
0, /* register_offset */
0, /* register_sextend */
0, /* register_zextend */
0 /* imm_offset */
};
static const struct cpu_regmove_cost generic_regmove_cost =
{
1, /* GP2GP */
/* Avoid the use of slow int<->fp moves for spilling by setting
their cost higher than memmov_cost. */
5, /* GP2FP */
5, /* FP2GP */
2 /* FP2FP */
};
static const struct cpu_regmove_cost cortexa57_regmove_cost =
{
1, /* GP2GP */
/* Avoid the use of slow int<->fp moves for spilling by setting
their cost higher than memmov_cost. */
5, /* GP2FP */
5, /* FP2GP */
2 /* FP2FP */
};
static const struct cpu_regmove_cost cortexa53_regmove_cost =
{
1, /* GP2GP */
/* Avoid the use of slow int<->fp moves for spilling by setting
their cost higher than memmov_cost. */
5, /* GP2FP */
5, /* FP2GP */
2 /* FP2FP */
};
static const struct cpu_regmove_cost exynosm1_regmove_cost =
{
1, /* GP2GP */
/* Avoid the use of slow int<->fp moves for spilling by setting
their cost higher than memmov_cost (the actual costs are 4 and 9). */
9, /* GP2FP */
9, /* FP2GP */
1 /* FP2FP */
};
static const struct cpu_regmove_cost thunderx_regmove_cost =
{
2, /* GP2GP */
2, /* GP2FP */
6, /* FP2GP */
4 /* FP2FP */
};
static const struct cpu_regmove_cost xgene1_regmove_cost =
{
1, /* GP2GP */
/* Avoid the use of slow int<->fp moves for spilling by setting
their cost higher than memmov_cost. */
8, /* GP2FP */
8, /* FP2GP */
2 /* FP2FP */
};
static const struct cpu_regmove_cost qdf24xx_regmove_cost =
{
2, /* GP2GP */
/* Avoid the use of int<->fp moves for spilling. */
6, /* GP2FP */
6, /* FP2GP */
4 /* FP2FP */
};
static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
{
1, /* GP2GP */
/* Avoid the use of int<->fp moves for spilling. */
5, /* GP2FP */
6, /* FP2GP */
3, /* FP2FP */
};
static const struct cpu_regmove_cost thunderx3t110_regmove_cost =
{
1, /* GP2GP */
/* Avoid the use of int<->fp moves for spilling. */
4, /* GP2FP */
5, /* FP2GP */
4 /* FP2FP */
};
static const struct cpu_regmove_cost tsv110_regmove_cost =
{
1, /* GP2GP */
/* Avoid the use of slow int<->fp moves for spilling by setting
their cost higher than memmov_cost. */
2, /* GP2FP */
3, /* FP2GP */
2 /* FP2FP */
};
static const struct cpu_regmove_cost a64fx_regmove_cost =
{
1, /* GP2GP */
/* Avoid the use of slow int<->fp moves for spilling by setting
their cost higher than memmov_cost. */
5, /* GP2FP */
7, /* FP2GP */
2 /* FP2FP */
};
/* Generic costs for Advanced SIMD vector operations. */
static const advsimd_vec_cost generic_advsimd_vector_cost =
{
1, /* int_stmt_cost */
1, /* fp_stmt_cost */
0, /* ld2_st2_permute_cost */
0, /* ld3_st3_permute_cost */
0, /* ld4_st4_permute_cost */
2, /* permute_cost */
2, /* reduc_i8_cost */
2, /* reduc_i16_cost */
2, /* reduc_i32_cost */
2, /* reduc_i64_cost */
2, /* reduc_f16_cost */
2, /* reduc_f32_cost */
2, /* reduc_f64_cost */
2, /* store_elt_extra_cost */
2, /* vec_to_scalar_cost */
1, /* scalar_to_vec_cost */
1, /* align_load_cost */
1, /* unalign_load_cost */
1, /* unalign_store_cost */
1 /* store_cost */
};
/* Generic costs for SVE vector operations. */
static const sve_vec_cost generic_sve_vector_cost =
{
{
1, /* int_stmt_cost */
1, /* fp_stmt_cost */
0, /* ld2_st2_permute_cost */
0, /* ld3_st3_permute_cost */
0, /* ld4_st4_permute_cost */
2, /* permute_cost */
2, /* reduc_i8_cost */
2, /* reduc_i16_cost */
2, /* reduc_i32_cost */
2, /* reduc_i64_cost */
2, /* reduc_f16_cost */
2, /* reduc_f32_cost */
2, /* reduc_f64_cost */
2, /* store_elt_extra_cost */
2, /* vec_to_scalar_cost */
1, /* scalar_to_vec_cost */
1, /* align_load_cost */
1, /* unalign_load_cost */
1, /* unalign_store_cost */
1 /* store_cost */
},
2, /* clast_cost */
2, /* fadda_f16_cost */
2, /* fadda_f32_cost */
2, /* fadda_f64_cost */
4, /* gather_load_x32_cost */
2, /* gather_load_x64_cost */
1 /* scatter_store_elt_cost */
};
/* Generic costs for vector insn classes. */
static const struct cpu_vector_cost generic_vector_cost =
{
1, /* scalar_int_stmt_cost */
1, /* scalar_fp_stmt_cost */
1, /* scalar_load_cost */
1, /* scalar_store_cost */
3, /* cond_taken_branch_cost */
1, /* cond_not_taken_branch_cost */
&generic_advsimd_vector_cost, /* advsimd */
&generic_sve_vector_cost, /* sve */
nullptr /* issue_info */
};
static const advsimd_vec_cost a64fx_advsimd_vector_cost =
{
2, /* int_stmt_cost */
5, /* fp_stmt_cost */
0, /* ld2_st2_permute_cost */
0, /* ld3_st3_permute_cost */
0, /* ld4_st4_permute_cost */
3, /* permute_cost */
13, /* reduc_i8_cost */
13, /* reduc_i16_cost */
13, /* reduc_i32_cost */
13, /* reduc_i64_cost */
13, /* reduc_f16_cost */
13, /* reduc_f32_cost */
13, /* reduc_f64_cost */
13, /* store_elt_extra_cost */
13, /* vec_to_scalar_cost */
4, /* scalar_to_vec_cost */
6, /* align_load_cost */
6, /* unalign_load_cost */
1, /* unalign_store_cost */
1 /* store_cost */
};
static const sve_vec_cost a64fx_sve_vector_cost =
{
{
2, /* int_stmt_cost */
5, /* fp_stmt_cost */
0, /* ld2_st2_permute_cost */
0, /* ld3_st3_permute_cost */
0, /* ld4_st4_permute_cost */
3, /* permute_cost */
13, /* reduc_i8_cost */
13, /* reduc_i16_cost */
13, /* reduc_i32_cost */
13, /* reduc_i64_cost */
13, /* reduc_f16_cost */
13, /* reduc_f32_cost */
13, /* reduc_f64_cost */
13, /* store_elt_extra_cost */
13, /* vec_to_scalar_cost */
4, /* scalar_to_vec_cost */
6, /* align_load_cost */
6, /* unalign_load_cost */
1, /* unalign_store_cost */
1 /* store_cost */
},
13, /* clast_cost */
13, /* fadda_f16_cost */
13, /* fadda_f32_cost */
13, /* fadda_f64_cost */
64, /* gather_load_x32_cost */
32, /* gather_load_x64_cost */
1 /* scatter_store_elt_cost */
};
static const struct cpu_vector_cost a64fx_vector_cost =
{
1, /* scalar_int_stmt_cost */
5, /* scalar_fp_stmt_cost */
4, /* scalar_load_cost */
1, /* scalar_store_cost */
3, /* cond_taken_branch_cost */
1, /* cond_not_taken_branch_cost */
&a64fx_advsimd_vector_cost, /* advsimd */
&a64fx_sve_vector_cost, /* sve */
nullptr /* issue_info */
};
static const advsimd_vec_cost qdf24xx_advsimd_vector_cost =
{
1, /* int_stmt_cost */
3, /* fp_stmt_cost */
0, /* ld2_st2_permute_cost */
0, /* ld3_st3_permute_cost */
0, /* ld4_st4_permute_cost */
2, /* permute_cost */
1, /* reduc_i8_cost */
1, /* reduc_i16_cost */
1, /* reduc_i32_cost */
1, /* reduc_i64_cost */
1, /* reduc_f16_cost */
1, /* reduc_f32_cost */
1, /* reduc_f64_cost */
1, /* store_elt_extra_cost */
1, /* vec_to_scalar_cost */
1, /* scalar_to_vec_cost */
1, /* align_load_cost */
1, /* unalign_load_cost */
1, /* unalign_store_cost */
1 /* store_cost */
};
/* QDF24XX costs for vector insn classes. */
static const struct cpu_vector_cost qdf24xx_vector_cost =
{
1, /* scalar_int_stmt_cost */
1, /* scalar_fp_stmt_cost */
1, /* scalar_load_cost */
1, /* scalar_store_cost */
3, /* cond_taken_branch_cost */
1, /* cond_not_taken_branch_cost */
&qdf24xx_advsimd_vector_cost, /* advsimd */
nullptr, /* sve */
nullptr /* issue_info */
};
static const advsimd_vec_cost thunderx_advsimd_vector_cost =
{
4, /* int_stmt_cost */
1, /* fp_stmt_cost */
0, /* ld2_st2_permute_cost */
0, /* ld3_st3_permute_cost */
0, /* ld4_st4_permute_cost */
4, /* permute_cost */
2, /* reduc_i8_cost */
2, /* reduc_i16_cost */
2, /* reduc_i32_cost */
2, /* reduc_i64_cost */
2, /* reduc_f16_cost */
2, /* reduc_f32_cost */
2, /* reduc_f64_cost */
2, /* store_elt_extra_cost */
2, /* vec_to_scalar_cost */
2, /* scalar_to_vec_cost */
3, /* align_load_cost */
5, /* unalign_load_cost */
5, /* unalign_store_cost */
1 /* store_cost */
};
/* ThunderX costs for vector insn classes. */
static const struct cpu_vector_cost thunderx_vector_cost =
{
1, /* scalar_int_stmt_cost */
1, /* scalar_fp_stmt_cost */
3, /* scalar_load_cost */
1, /* scalar_store_cost */
3, /* cond_taken_branch_cost */
3, /* cond_not_taken_branch_cost */
&thunderx_advsimd_vector_cost, /* advsimd */
nullptr, /* sve */
nullptr /* issue_info */
};
static const advsimd_vec_cost tsv110_advsimd_vector_cost =
{
2, /* int_stmt_cost */
2, /* fp_stmt_cost */
0, /* ld2_st2_permute_cost */
0, /* ld3_st3_permute_cost */
0, /* ld4_st4_permute_cost */
2, /* permute_cost */
3, /* reduc_i8_cost */
3, /* reduc_i16_cost */
3, /* reduc_i32_cost */
3, /* reduc_i64_cost */
3, /* reduc_f16_cost */
3, /* reduc_f32_cost */
3, /* reduc_f64_cost */
3, /* store_elt_extra_cost */
3, /* vec_to_scalar_cost */
2, /* scalar_to_vec_cost */
5, /* align_load_cost */
5, /* unalign_load_cost */
1, /* unalign_store_cost */
1 /* store_cost */
};
static const struct cpu_vector_cost tsv110_vector_cost =
{
1, /* scalar_int_stmt_cost */
1, /* scalar_fp_stmt_cost */
5, /* scalar_load_cost */
1, /* scalar_store_cost */
1, /* cond_taken_branch_cost */
1, /* cond_not_taken_branch_cost */
&tsv110_advsimd_vector_cost, /* advsimd */
nullptr, /* sve */
nullptr /* issue_info */
};
static const advsimd_vec_cost cortexa57_advsimd_vector_cost =
{
2, /* int_stmt_cost */
2, /* fp_stmt_cost */
0, /* ld2_st2_permute_cost */
0, /* ld3_st3_permute_cost */
0, /* ld4_st4_permute_cost */
3, /* permute_cost */
8, /* reduc_i8_cost */
8, /* reduc_i16_cost */
8, /* reduc_i32_cost */
8, /* reduc_i64_cost */
8, /* reduc_f16_cost */
8, /* reduc_f32_cost */
8, /* reduc_f64_cost */
8, /* store_elt_extra_cost */
8, /* vec_to_scalar_cost */
8, /* scalar_to_vec_cost */
4, /* align_load_cost */
4, /* unalign_load_cost */
1, /* unalign_store_cost */
1 /* store_cost */
};
/* Cortex-A57 costs for vector insn classes. */
static const struct cpu_vector_cost cortexa57_vector_cost =
{
1, /* scalar_int_stmt_cost */
1, /* scalar_fp_stmt_cost */
4, /* scalar_load_cost */
1, /* scalar_store_cost */
1, /* cond_taken_branch_cost */
1, /* cond_not_taken_branch_cost */
&cortexa57_advsimd_vector_cost, /* advsimd */
nullptr, /* sve */
nullptr /* issue_info */
};
static const advsimd_vec_cost exynosm1_advsimd_vector_cost =
{
3, /* int_stmt_cost */
3, /* fp_stmt_cost */
0, /* ld2_st2_permute_cost */
0, /* ld3_st3_permute_cost */
0, /* ld4_st4_permute_cost */
3, /* permute_cost */
3, /* reduc_i8_cost */
3, /* reduc_i16_cost */
3, /* reduc_i32_cost */
3, /* reduc_i64_cost */
3, /* reduc_f16_cost */
3, /* reduc_f32_cost */
3, /* reduc_f64_cost */
3, /* store_elt_extra_cost */
3, /* vec_to_scalar_cost */
3, /* scalar_to_vec_cost */
5, /* align_load_cost */
5, /* unalign_load_cost */
1, /* unalign_store_cost */
1 /* store_cost */
};
static const struct cpu_vector_cost exynosm1_vector_cost =
{
1, /* scalar_int_stmt_cost */
1, /* scalar_fp_stmt_cost */
5, /* scalar_load_cost */
1, /* scalar_store_cost */
1, /* cond_taken_branch_cost */
1, /* cond_not_taken_branch_cost */
&exynosm1_advsimd_vector_cost, /* advsimd */
nullptr, /* sve */
nullptr /* issue_info */
};
static const advsimd_vec_cost xgene1_advsimd_vector_cost =
{
2, /* int_stmt_cost */
2, /* fp_stmt_cost */
0, /* ld2_st2_permute_cost */
0, /* ld3_st3_permute_cost */
0, /* ld4_st4_permute_cost */
2, /* permute_cost */
4, /* reduc_i8_cost */
4, /* reduc_i16_cost */
4, /* reduc_i32_cost */
4, /* reduc_i64_cost */
4, /* reduc_f16_cost */
4, /* reduc_f32_cost */
4, /* reduc_f64_cost */
4, /* store_elt_extra_cost */
4, /* vec_to_scalar_cost */
4, /* scalar_to_vec_cost */
10, /* align_load_cost */
10, /* unalign_load_cost */
2, /* unalign_store_cost */
2 /* store_cost */
};
/* X-Gene 1 costs for vector insn classes. */
static const struct cpu_vector_cost xgene1_vector_cost =
{
1, /* scalar_int_stmt_cost */
1, /* scalar_fp_stmt_cost */
5, /* scalar_load_cost */
1, /* scalar_store_cost */
2, /* cond_taken_branch_cost */
1, /* cond_not_taken_branch_cost */
&xgene1_advsimd_vector_cost, /* advsimd */
nullptr, /* sve */
nullptr /* issue_info */
};
static const advsimd_vec_cost thunderx2t99_advsimd_vector_cost =
{
4, /* int_stmt_cost */
5, /* fp_stmt_cost */
0, /* ld2_st2_permute_cost */
0, /* ld3_st3_permute_cost */
0, /* ld4_st4_permute_cost */
10, /* permute_cost */
6, /* reduc_i8_cost */
6, /* reduc_i16_cost */
6, /* reduc_i32_cost */
6, /* reduc_i64_cost */
6, /* reduc_f16_cost */
6, /* reduc_f32_cost */
6, /* reduc_f64_cost */
6, /* store_elt_extra_cost */
6, /* vec_to_scalar_cost */
5, /* scalar_to_vec_cost */
4, /* align_load_cost */
4, /* unalign_load_cost */
1, /* unalign_store_cost */
1 /* store_cost */
};
/* ThunderX2 T99 (Vulcan) costs for vector insn classes. */
static const struct cpu_vector_cost thunderx2t99_vector_cost =
{
1, /* scalar_int_stmt_cost */
6, /* scalar_fp_stmt_cost */
4, /* scalar_load_cost */
1, /* scalar_store_cost */
2, /* cond_taken_branch_cost */
1, /* cond_not_taken_branch_cost */
&thunderx2t99_advsimd_vector_cost, /* advsimd */
nullptr, /* sve */
nullptr /* issue_info */
};
static const advsimd_vec_cost thunderx3t110_advsimd_vector_cost =
{
5, /* int_stmt_cost */
5, /* fp_stmt_cost */
0, /* ld2_st2_permute_cost */
0, /* ld3_st3_permute_cost */
0, /* ld4_st4_permute_cost */
10, /* permute_cost */
5, /* reduc_i8_cost */
5, /* reduc_i16_cost */
5, /* reduc_i32_cost */
5, /* reduc_i64_cost */
5, /* reduc_f16_cost */
5, /* reduc_f32_cost */
5, /* reduc_f64_cost */
5, /* store_elt_extra_cost */
5, /* vec_to_scalar_cost */
5, /* scalar_to_vec_cost */
4, /* align_load_cost */
4, /* unalign_load_cost */
4, /* unalign_store_cost */
4 /* store_cost */
};
static const struct cpu_vector_cost thunderx3t110_vector_cost =
{
1, /* scalar_int_stmt_cost */
5, /* scalar_fp_stmt_cost */
4, /* scalar_load_cost */
1, /* scalar_store_cost */
2, /* cond_taken_branch_cost */
1, /* cond_not_taken_branch_cost */
&thunderx3t110_advsimd_vector_cost, /* advsimd */
nullptr, /* sve */
nullptr /* issue_info */
};
/* Generic costs for branch instructions. */
static const struct cpu_branch_cost generic_branch_cost =
{
1, /* Predictable. */
3 /* Unpredictable. */
};
/* Generic approximation modes. */
static const cpu_approx_modes generic_approx_modes =
{
AARCH64_APPROX_NONE, /* division */
AARCH64_APPROX_NONE, /* sqrt */
AARCH64_APPROX_NONE /* recip_sqrt */
};
/* Approximation modes for Exynos M1. */
static const cpu_approx_modes exynosm1_approx_modes =
{
AARCH64_APPROX_NONE, /* division */
AARCH64_APPROX_ALL, /* sqrt */
AARCH64_APPROX_ALL /* recip_sqrt */
};
/* Approximation modes for X-Gene 1. */
static const cpu_approx_modes xgene1_approx_modes =
{
AARCH64_APPROX_NONE, /* division */
AARCH64_APPROX_NONE, /* sqrt */
AARCH64_APPROX_ALL /* recip_sqrt */
};
/* Generic prefetch settings (which disable prefetch). */
static const cpu_prefetch_tune generic_prefetch_tune =
{
0, /* num_slots */
-1, /* l1_cache_size */
-1, /* l1_cache_line_size */
-1, /* l2_cache_size */
true, /* prefetch_dynamic_strides */
-1, /* minimum_stride */
-1 /* default_opt_level */
};
static const cpu_prefetch_tune exynosm1_prefetch_tune =
{
0, /* num_slots */
-1, /* l1_cache_size */
64, /* l1_cache_line_size */
-1, /* l2_cache_size */
true, /* prefetch_dynamic_strides */
-1, /* minimum_stride */
-1 /* default_opt_level */
};
static const cpu_prefetch_tune qdf24xx_prefetch_tune =
{
4, /* num_slots */
32, /* l1_cache_size */
64, /* l1_cache_line_size */
512, /* l2_cache_size */
false, /* prefetch_dynamic_strides */
2048, /* minimum_stride */
3 /* default_opt_level */
};
static const cpu_prefetch_tune thunderxt88_prefetch_tune =
{
8, /* num_slots */
32, /* l1_cache_size */
128, /* l1_cache_line_size */
16*1024, /* l2_cache_size */
true, /* prefetch_dynamic_strides */
-1, /* minimum_stride */
3 /* default_opt_level */
};
static const cpu_prefetch_tune thunderx_prefetch_tune =
{
8, /* num_slots */
32, /* l1_cache_size */
128, /* l1_cache_line_size */
-1, /* l2_cache_size */
true, /* prefetch_dynamic_strides */
-1, /* minimum_stride */
-1 /* default_opt_level */
};
static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
{
8, /* num_slots */
32, /* l1_cache_size */
64, /* l1_cache_line_size */
256, /* l2_cache_size */
true, /* prefetch_dynamic_strides */
-1, /* minimum_stride */
-1 /* default_opt_level */
};
static const cpu_prefetch_tune thunderx3t110_prefetch_tune =
{
8, /* num_slots */
32, /* l1_cache_size */
64, /* l1_cache_line_size */
256, /* l2_cache_size */
true, /* prefetch_dynamic_strides */
-1, /* minimum_stride */
-1 /* default_opt_level */
};
static const cpu_prefetch_tune tsv110_prefetch_tune =
{
0, /* num_slots */
64, /* l1_cache_size */
64, /* l1_cache_line_size */
512, /* l2_cache_size */
true, /* prefetch_dynamic_strides */
-1, /* minimum_stride */
-1 /* default_opt_level */
};
static const cpu_prefetch_tune xgene1_prefetch_tune =
{
8, /* num_slots */
32, /* l1_cache_size */
64, /* l1_cache_line_size */
256, /* l2_cache_size */
true, /* prefetch_dynamic_strides */
-1, /* minimum_stride */
-1 /* default_opt_level */
};
static const cpu_prefetch_tune a64fx_prefetch_tune =
{
8, /* num_slots */
64, /* l1_cache_size */
256, /* l1_cache_line_size */
32768, /* l2_cache_size */
true, /* prefetch_dynamic_strides */
-1, /* minimum_stride */
-1 /* default_opt_level */
};
static const struct tune_params generic_tunings =
{
&cortexa57_extra_costs,
&generic_addrcost_table,
&generic_regmove_cost,
&generic_vector_cost,
&generic_branch_cost,
&generic_approx_modes,
SVE_NOT_IMPLEMENTED, /* sve_width */
4, /* memmov_cost */
2, /* issue_rate */
(AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
"16:12", /* function_align. */
"4", /* jump_align. */
"8", /* loop_align. */
2, /* int_reassoc_width. */
4, /* fp_reassoc_width. */
1, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
0, /* max_case_values. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
/* Enabling AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS significantly benefits
Neoverse V1. It does not have a noticeable effect on A64FX and should
have at most a very minor effect on SVE2 cores. */
(AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS), /* tune_flags. */
&generic_prefetch_tune
};
static const struct tune_params cortexa35_tunings =
{
&cortexa53_extra_costs,
&generic_addrcost_table,
&cortexa53_regmove_cost,
&generic_vector_cost,
&generic_branch_cost,
&generic_approx_modes,
SVE_NOT_IMPLEMENTED, /* sve_width */
4, /* memmov_cost */
1, /* issue_rate */
(AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
| AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
"16", /* function_align. */
"4", /* jump_align. */
"8", /* loop_align. */
2, /* int_reassoc_width. */
4, /* fp_reassoc_width. */
1, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
0, /* max_case_values. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
&generic_prefetch_tune
};
static const struct tune_params cortexa53_tunings =
{
&cortexa53_extra_costs,
&generic_addrcost_table,
&cortexa53_regmove_cost,
&generic_vector_cost,
&generic_branch_cost,
&generic_approx_modes,
SVE_NOT_IMPLEMENTED, /* sve_width */
4, /* memmov_cost */
2, /* issue_rate */
(AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
| AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
"16", /* function_align. */
"4", /* jump_align. */
"8", /* loop_align. */
2, /* int_reassoc_width. */
4, /* fp_reassoc_width. */
1, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
0, /* max_case_values. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
&generic_prefetch_tune
};
static const struct tune_params cortexa57_tunings =
{
&cortexa57_extra_costs,
&generic_addrcost_table,
&cortexa57_regmove_cost,
&cortexa57_vector_cost,
&generic_branch_cost,
&generic_approx_modes,
SVE_NOT_IMPLEMENTED, /* sve_width */
4, /* memmov_cost */
3, /* issue_rate */
(AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
| AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
"16", /* function_align. */
"4", /* jump_align. */
"8", /* loop_align. */
2, /* int_reassoc_width. */
4, /* fp_reassoc_width. */
1, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
0, /* max_case_values. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
&generic_prefetch_tune
};
static const struct tune_params cortexa72_tunings =
{
&cortexa57_extra_costs,
&generic_addrcost_table,
&cortexa57_regmove_cost,
&cortexa57_vector_cost,
&generic_branch_cost,
&generic_approx_modes,
SVE_NOT_IMPLEMENTED, /* sve_width */
4, /* memmov_cost */
3, /* issue_rate */
(AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
| AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
"16", /* function_align. */
"4", /* jump_align. */
"8", /* loop_align. */
2, /* int_reassoc_width. */
4, /* fp_reassoc_width. */
1, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
0, /* max_case_values. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
&generic_prefetch_tune
};
static const struct tune_params cortexa73_tunings =
{
&cortexa57_extra_costs,
&generic_addrcost_table,
&cortexa57_regmove_cost,
&cortexa57_vector_cost,
&generic_branch_cost,
&generic_approx_modes,
SVE_NOT_IMPLEMENTED, /* sve_width */
4, /* memmov_cost. */
2, /* issue_rate. */
(AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
| AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
"16", /* function_align. */
"4", /* jump_align. */
"8", /* loop_align. */
2, /* int_reassoc_width. */
4, /* fp_reassoc_width. */
1, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
0, /* max_case_values. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
&generic_prefetch_tune
};
static const struct tune_params exynosm1_tunings =
{
&exynosm1_extra_costs,
&exynosm1_addrcost_table,
&exynosm1_regmove_cost,
&exynosm1_vector_cost,
&generic_branch_cost,
&exynosm1_approx_modes,
SVE_NOT_IMPLEMENTED, /* sve_width */
4, /* memmov_cost */
3, /* issue_rate */
(AARCH64_FUSE_AES_AESMC), /* fusible_ops */
"4", /* function_align. */
"4", /* jump_align. */
"4", /* loop_align. */
2, /* int_reassoc_width. */
4, /* fp_reassoc_width. */
1, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
48, /* max_case_values. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
&exynosm1_prefetch_tune
};
static const struct tune_params thunderxt88_tunings =
{
&thunderx_extra_costs,
&generic_addrcost_table,
&thunderx_regmove_cost,
&thunderx_vector_cost,
&generic_branch_cost,
&generic_approx_modes,
SVE_NOT_IMPLEMENTED, /* sve_width */
6, /* memmov_cost */
2, /* issue_rate */
AARCH64_FUSE_ALU_BRANCH, /* fusible_ops */
"8", /* function_align. */
"8", /* jump_align. */
"8", /* loop_align. */
2, /* int_reassoc_width. */
4, /* fp_reassoc_width. */
1, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
0, /* max_case_values. */
tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
&thunderxt88_prefetch_tune
};
static const struct tune_params thunderx_tunings =
{
&thunderx_extra_costs,
&generic_addrcost_table,
&thunderx_regmove_cost,
&thunderx_vector_cost,
&generic_branch_cost,
&generic_approx_modes,
SVE_NOT_IMPLEMENTED, /* sve_width */
6, /* memmov_cost */
2, /* issue_rate */
AARCH64_FUSE_ALU_BRANCH, /* fusible_ops */
"8", /* function_align. */
"8", /* jump_align. */
"8", /* loop_align. */
2, /* int_reassoc_width. */
4, /* fp_reassoc_width. */
1, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
0, /* max_case_values. */
tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
| AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
&thunderx_prefetch_tune
};
static const struct tune_params tsv110_tunings =
{
&tsv110_extra_costs,
&tsv110_addrcost_table,
&tsv110_regmove_cost,
&tsv110_vector_cost,
&generic_branch_cost,
&generic_approx_modes,
SVE_NOT_IMPLEMENTED, /* sve_width */
4, /* memmov_cost */
4, /* issue_rate */
(AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_ALU_BRANCH
| AARCH64_FUSE_ALU_CBZ), /* fusible_ops */
"16", /* function_align. */
"4", /* jump_align. */
"8", /* loop_align. */
2, /* int_reassoc_width. */
4, /* fp_reassoc_width. */
1, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
0, /* max_case_values. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
&tsv110_prefetch_tune
};
static const struct tune_params xgene1_tunings =
{
&xgene1_extra_costs,
&xgene1_addrcost_table,
&xgene1_regmove_cost,
&xgene1_vector_cost,
&generic_branch_cost,
&xgene1_approx_modes,
SVE_NOT_IMPLEMENTED, /* sve_width */
6, /* memmov_cost */
4, /* issue_rate */
AARCH64_FUSE_NOTHING, /* fusible_ops */
"16", /* function_align. */
"16", /* jump_align. */
"16", /* loop_align. */
2, /* int_reassoc_width. */
4, /* fp_reassoc_width. */
1, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
17, /* max_case_values. */
tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
&xgene1_prefetch_tune
};
static const struct tune_params emag_tunings =
{
&xgene1_extra_costs,
&xgene1_addrcost_table,
&xgene1_regmove_cost,
&xgene1_vector_cost,
&generic_branch_cost,
&xgene1_approx_modes,
SVE_NOT_IMPLEMENTED,
6, /* memmov_cost */
4, /* issue_rate */
AARCH64_FUSE_NOTHING, /* fusible_ops */
"16", /* function_align. */
"16", /* jump_align. */
"16", /* loop_align. */
2, /* int_reassoc_width. */
4, /* fp_reassoc_width. */
1, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
17, /* max_case_values. */
tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
&xgene1_prefetch_tune
};
static const struct tune_params qdf24xx_tunings =
{
&qdf24xx_extra_costs,
&qdf24xx_addrcost_table,
&qdf24xx_regmove_cost,
&qdf24xx_vector_cost,
&generic_branch_cost,
&generic_approx_modes,
SVE_NOT_IMPLEMENTED, /* sve_width */
4, /* memmov_cost */
4, /* issue_rate */
(AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
| AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
"16", /* function_align. */
"8", /* jump_align. */
"16", /* loop_align. */
2, /* int_reassoc_width. */
4, /* fp_reassoc_width. */
1, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
0, /* max_case_values. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags. */
&qdf24xx_prefetch_tune
};
/* Tuning structure for the Qualcomm Saphira core. Default to falkor values
for now. */
static const struct tune_params saphira_tunings =
{
&generic_extra_costs,
&generic_addrcost_table,
&generic_regmove_cost,
&generic_vector_cost,
&generic_branch_cost,
&generic_approx_modes,
SVE_NOT_IMPLEMENTED, /* sve_width */
4, /* memmov_cost */
4, /* issue_rate */
(AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
| AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
"16", /* function_align. */
"8", /* jump_align. */
"16", /* loop_align. */
2, /* int_reassoc_width. */
4, /* fp_reassoc_width. */
1, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
0, /* max_case_values. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
&generic_prefetch_tune
};
static const struct tune_params thunderx2t99_tunings =
{
&thunderx2t99_extra_costs,
&thunderx2t99_addrcost_table,
&thunderx2t99_regmove_cost,
&thunderx2t99_vector_cost,
&generic_branch_cost,
&generic_approx_modes,
SVE_NOT_IMPLEMENTED, /* sve_width */
4, /* memmov_cost. */
4, /* issue_rate. */
(AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_ALU_CBZ), /* fusible_ops */
"16", /* function_align. */
"8", /* jump_align. */
"16", /* loop_align. */
3, /* int_reassoc_width. */
2, /* fp_reassoc_width. */
2, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
0, /* max_case_values. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
&thunderx2t99_prefetch_tune
};
static const struct tune_params thunderx3t110_tunings =
{
&thunderx3t110_extra_costs,
&thunderx3t110_addrcost_table,
&thunderx3t110_regmove_cost,
&thunderx3t110_vector_cost,
&generic_branch_cost,
&generic_approx_modes,
SVE_NOT_IMPLEMENTED, /* sve_width */
4, /* memmov_cost. */
6, /* issue_rate. */
(AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_ALU_CBZ), /* fusible_ops */
"16", /* function_align. */
"8", /* jump_align. */
"16", /* loop_align. */
3, /* int_reassoc_width. */
2, /* fp_reassoc_width. */
2, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
0, /* max_case_values. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
&thunderx3t110_prefetch_tune
};
static const struct tune_params neoversen1_tunings =
{
&cortexa76_extra_costs,
&generic_addrcost_table,
&generic_regmove_cost,
&cortexa57_vector_cost,
&generic_branch_cost,
&generic_approx_modes,
SVE_NOT_IMPLEMENTED, /* sve_width */
4, /* memmov_cost */
3, /* issue_rate */
(AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
"32:16", /* function_align. */
"4", /* jump_align. */
"32:16", /* loop_align. */
2, /* int_reassoc_width. */
4, /* fp_reassoc_width. */
2, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
0, /* max_case_values. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
&generic_prefetch_tune
};
static const advsimd_vec_cost neoversev1_advsimd_vector_cost =
{
2, /* int_stmt_cost */
2, /* fp_stmt_cost */
4, /* ld2_st2_permute_cost */
4, /* ld3_st3_permute_cost */
5, /* ld4_st4_permute_cost */
3, /* permute_cost */
4, /* reduc_i8_cost */
4, /* reduc_i16_cost */
2, /* reduc_i32_cost */
2, /* reduc_i64_cost */
6, /* reduc_f16_cost */
3, /* reduc_f32_cost */
2, /* reduc_f64_cost */
2, /* store_elt_extra_cost */
/* This value is just inherited from the Cortex-A57 table. */
8, /* vec_to_scalar_cost */
/* This depends very much on what the scalar value is and
where it comes from. E.g. some constants take two dependent
instructions or a load, while others might be moved from a GPR.
4 seems to be a reasonable compromise in practice. */
4, /* scalar_to_vec_cost */
4, /* align_load_cost */
4, /* unalign_load_cost */
/* Although stores have a latency of 2 and compete for the
vector pipes, in practice it's better not to model that. */
1, /* unalign_store_cost */
1 /* store_cost */
};
static const sve_vec_cost neoversev1_sve_vector_cost =
{
{
2, /* int_stmt_cost */
2, /* fp_stmt_cost */
4, /* ld2_st2_permute_cost */
7, /* ld3_st3_permute_cost */
8, /* ld4_st4_permute_cost */
3, /* permute_cost */
/* Theoretically, a reduction involving 31 scalar ADDs could
complete in ~9 cycles and would have a cost of 31. [SU]ADDV
completes in 14 cycles, so give it a cost of 31 + 5. */
36, /* reduc_i8_cost */
/* Likewise for 15 scalar ADDs (~5 cycles) vs. 12: 15 + 7. */
22, /* reduc_i16_cost */
/* Likewise for 7 scalar ADDs (~3 cycles) vs. 10: 7 + 7. */
14, /* reduc_i32_cost */
/* Likewise for 3 scalar ADDs (~2 cycles) vs. 10: 3 + 8. */
11, /* reduc_i64_cost */
/* Theoretically, a reduction involving 15 scalar FADDs could
complete in ~9 cycles and would have a cost of 30. FADDV
completes in 13 cycles, so give it a cost of 30 + 4. */
34, /* reduc_f16_cost */
/* Likewise for 7 scalar FADDs (~6 cycles) vs. 11: 14 + 5. */
19, /* reduc_f32_cost */
/* Likewise for 3 scalar FADDs (~4 cycles) vs. 9: 6 + 5. */
11, /* reduc_f64_cost */
2, /* store_elt_extra_cost */
/* This value is just inherited from the Cortex-A57 table. */
8, /* vec_to_scalar_cost */
/* See the comment above the Advanced SIMD versions. */
4, /* scalar_to_vec_cost */
4, /* align_load_cost */
4, /* unalign_load_cost */
/* Although stores have a latency of 2 and compete for the
vector pipes, in practice it's better not to model that. */
1, /* unalign_store_cost */
1 /* store_cost */
},
3, /* clast_cost */
19, /* fadda_f16_cost */
11, /* fadda_f32_cost */
8, /* fadda_f64_cost */
32, /* gather_load_x32_cost */
16, /* gather_load_x64_cost */
3 /* scatter_store_elt_cost */
};
static const aarch64_scalar_vec_issue_info neoversev1_scalar_issue_info =
{
3, /* loads_stores_per_cycle */
2, /* stores_per_cycle */
4, /* general_ops_per_cycle */
0, /* fp_simd_load_general_ops */
1 /* fp_simd_store_general_ops */
};
static const aarch64_advsimd_vec_issue_info neoversev1_advsimd_issue_info =
{
{
3, /* loads_stores_per_cycle */
2, /* stores_per_cycle */
4, /* general_ops_per_cycle */
0, /* fp_simd_load_general_ops */
1 /* fp_simd_store_general_ops */
},
2, /* ld2_st2_general_ops */
2, /* ld3_st3_general_ops */
3 /* ld4_st4_general_ops */
};
static const aarch64_sve_vec_issue_info neoversev1_sve_issue_info =
{
{
{
2, /* loads_per_cycle */
2, /* stores_per_cycle */
2, /* general_ops_per_cycle */
0, /* fp_simd_load_general_ops */
1 /* fp_simd_store_general_ops */
},
2, /* ld2_st2_general_ops */
2, /* ld3_st3_general_ops */
3 /* ld4_st4_general_ops */
},
1, /* pred_ops_per_cycle */
2, /* while_pred_ops */
2, /* int_cmp_pred_ops */
1, /* fp_cmp_pred_ops */
1, /* gather_scatter_pair_general_ops */
1 /* gather_scatter_pair_pred_ops */
};
static const aarch64_vec_issue_info neoversev1_vec_issue_info =
{
&neoversev1_scalar_issue_info,
&neoversev1_advsimd_issue_info,
&neoversev1_sve_issue_info
};
/* Neoverse V1 costs for vector insn classes. */
static const struct cpu_vector_cost neoversev1_vector_cost =
{
1, /* scalar_int_stmt_cost */
2, /* scalar_fp_stmt_cost */
4, /* scalar_load_cost */
1, /* scalar_store_cost */
1, /* cond_taken_branch_cost */
1, /* cond_not_taken_branch_cost */
&neoversev1_advsimd_vector_cost, /* advsimd */
&neoversev1_sve_vector_cost, /* sve */
&neoversev1_vec_issue_info /* issue_info */
};
static const struct tune_params neoversev1_tunings =
{
&cortexa76_extra_costs,
&neoversev1_addrcost_table,
&generic_regmove_cost,
&neoversev1_vector_cost,
&generic_branch_cost,
&generic_approx_modes,
SVE_256, /* sve_width */
4, /* memmov_cost */
3, /* issue_rate */
(AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
"32:16", /* function_align. */
"4", /* jump_align. */
"32:16", /* loop_align. */
2, /* int_reassoc_width. */
4, /* fp_reassoc_width. */
2, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
0, /* max_case_values. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
| AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
| AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT), /* tune_flags. */
&generic_prefetch_tune
};
static const sve_vec_cost neoverse512tvb_sve_vector_cost =
{
{
2, /* int_stmt_cost */
2, /* fp_stmt_cost */
4, /* ld2_st2_permute_cost */
5, /* ld3_st3_permute_cost */
5, /* ld4_st4_permute_cost */
3, /* permute_cost */
/* Theoretically, a reduction involving 15 scalar ADDs could
complete in ~5 cycles and would have a cost of 15. Assume that
[SU]ADDV completes in 11 cycles and so give it a cost of 15 + 6. */
21, /* reduc_i8_cost */
/* Likewise for 7 scalar ADDs (~3 cycles) vs. 9: 7 + 6. */
13, /* reduc_i16_cost */
/* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 6. */
9, /* reduc_i32_cost */
/* Likewise for 1 scalar ADD (1 cycle) vs. 8: 1 + 7. */
8, /* reduc_i64_cost */
/* Theoretically, a reduction involving 7 scalar FADDs could
complete in ~6 cycles and would have a cost of 14. Assume that
FADDV completes in 8 cycles and so give it a cost of 14 + 2. */
16, /* reduc_f16_cost */
/* Likewise for 3 scalar FADDs (~4 cycles) vs. 6: 6 + 2. */
8, /* reduc_f32_cost */
/* Likewise for 1 scalar FADD (2 cycles) vs. 4: 2 + 2. */
4, /* reduc_f64_cost */
2, /* store_elt_extra_cost */
/* This value is just inherited from the Cortex-A57 table. */
8, /* vec_to_scalar_cost */
/* This depends very much on what the scalar value is and
where it comes from. E.g. some constants take two dependent
instructions or a load, while others might be moved from a GPR.
4 seems to be a reasonable compromise in practice. */
4, /* scalar_to_vec_cost */
4, /* align_load_cost */
4, /* unalign_load_cost */
/* Although stores generally have a latency of 2 and compete for the
vector pipes, in practice it's better not to model that. */
1, /* unalign_store_cost */
1 /* store_cost */
},
3, /* clast_cost */
10, /* fadda_f16_cost */
6, /* fadda_f32_cost */
4, /* fadda_f64_cost */
/* A strided Advanced SIMD x64 load would take two parallel FP loads
(6 cycles) plus an insertion (2 cycles). Assume a 64-bit SVE gather
is 1 cycle more. The Advanced SIMD version is costed as 2 scalar loads
(cost 8) and a vec_construct (cost 2). Add a full vector operation
(cost 2) to that, to avoid the difference being lost in rounding.
There is no easy comparison between a strided Advanced SIMD x32 load
and an SVE 32-bit gather, but cost an SVE 32-bit gather as 1 vector
operation more than a 64-bit gather. */
14, /* gather_load_x32_cost */
12, /* gather_load_x64_cost */
3 /* scatter_store_elt_cost */
};
static const aarch64_sve_vec_issue_info neoverse512tvb_sve_issue_info =
{
{
{
3, /* loads_per_cycle */
2, /* stores_per_cycle */
4, /* general_ops_per_cycle */
0, /* fp_simd_load_general_ops */
1 /* fp_simd_store_general_ops */
},
2, /* ld2_st2_general_ops */
2, /* ld3_st3_general_ops */
3 /* ld4_st4_general_ops */
},
2, /* pred_ops_per_cycle */
2, /* while_pred_ops */
2, /* int_cmp_pred_ops */
1, /* fp_cmp_pred_ops */
1, /* gather_scatter_pair_general_ops */
1 /* gather_scatter_pair_pred_ops */
};
static const aarch64_vec_issue_info neoverse512tvb_vec_issue_info =
{
&neoversev1_scalar_issue_info,
&neoversev1_advsimd_issue_info,
&neoverse512tvb_sve_issue_info
};
static const struct cpu_vector_cost neoverse512tvb_vector_cost =
{
1, /* scalar_int_stmt_cost */
2, /* scalar_fp_stmt_cost */
4, /* scalar_load_cost */
1, /* scalar_store_cost */
1, /* cond_taken_branch_cost */
1, /* cond_not_taken_branch_cost */
&neoversev1_advsimd_vector_cost, /* advsimd */
&neoverse512tvb_sve_vector_cost, /* sve */
&neoverse512tvb_vec_issue_info /* issue_info */
};
static const struct tune_params neoverse512tvb_tunings =
{
&cortexa76_extra_costs,
&neoversev1_addrcost_table,
&generic_regmove_cost,
&neoverse512tvb_vector_cost,
&generic_branch_cost,
&generic_approx_modes,
SVE_128 | SVE_256, /* sve_width */
4, /* memmov_cost */
3, /* issue_rate */
(AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
"32:16", /* function_align. */
"4", /* jump_align. */
"32:16", /* loop_align. */
2, /* int_reassoc_width. */
4, /* fp_reassoc_width. */
2, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
0, /* max_case_values. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
| AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
| AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT), /* tune_flags. */
&generic_prefetch_tune
};
static const struct tune_params neoversen2_tunings =
{
&cortexa76_extra_costs,
&generic_addrcost_table,
&generic_regmove_cost,
&cortexa57_vector_cost,
&generic_branch_cost,
&generic_approx_modes,
SVE_128, /* sve_width */
4, /* memmov_cost */
3, /* issue_rate */
(AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
"32:16", /* function_align. */
"4", /* jump_align. */
"32:16", /* loop_align. */
2, /* int_reassoc_width. */
4, /* fp_reassoc_width. */
2, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
0, /* max_case_values. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
&generic_prefetch_tune
};
static const struct tune_params a64fx_tunings =
{
&a64fx_extra_costs,
&a64fx_addrcost_table,
&a64fx_regmove_cost,
&a64fx_vector_cost,
&generic_branch_cost,
&generic_approx_modes,
SVE_512, /* sve_width */
4, /* memmov_cost */
7, /* issue_rate */
(AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
"32", /* function_align. */
"16", /* jump_align. */
"32", /* loop_align. */
4, /* int_reassoc_width. */
2, /* fp_reassoc_width. */
2, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
0, /* max_case_values. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
&a64fx_prefetch_tune
};
/* Support for fine-grained override of the tuning structures. */
struct aarch64_tuning_override_function
{
const char* name;
void (*parse_override)(const char*, struct tune_params*);
};
static void aarch64_parse_fuse_string (const char*, struct tune_params*);
static void aarch64_parse_tune_string (const char*, struct tune_params*);
static void aarch64_parse_sve_width_string (const char*, struct tune_params*);
static const struct aarch64_tuning_override_function
aarch64_tuning_override_functions[] =
{
{ "fuse", aarch64_parse_fuse_string },
{ "tune", aarch64_parse_tune_string },
{ "sve_width", aarch64_parse_sve_width_string },
{ NULL, NULL }
};
/* A processor implementing AArch64. */
struct processor
{
const char *const name;
enum aarch64_processor ident;
enum aarch64_processor sched_core;
enum aarch64_arch arch;
unsigned architecture_version;
const uint64_t flags;
const struct tune_params *const tune;
};
/* Architectures implementing AArch64. */
static const struct processor all_architectures[] =
{
#define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
{NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
#include "aarch64-arches.def"
{NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
};
/* Processor cores implementing AArch64. */
static const struct processor all_cores[] =
{
#define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
{NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
FLAGS, &COSTS##_tunings},
#include "aarch64-cores.def"
{"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
AARCH64_FL_FOR_ARCH8, &generic_tunings},
{NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
};
/* Target specification. These are populated by the -march, -mtune, -mcpu
handling code or by target attributes. */
static const struct processor *selected_arch;
static const struct processor *selected_cpu;
static const struct processor *selected_tune;
enum aarch64_key_type aarch64_ra_sign_key = AARCH64_KEY_A;
/* The current tuning set. */
struct tune_params aarch64_tune_params = generic_tunings;
/* Check whether an 'aarch64_vector_pcs' attribute is valid. */
static tree
handle_aarch64_vector_pcs_attribute (tree *node, tree name, tree,
int, bool *no_add_attrs)
{
/* Since we set fn_type_req to true, the caller should have checked
this for us. */
gcc_assert (FUNC_OR_METHOD_TYPE_P (*node));
switch ((arm_pcs) fntype_abi (*node).id ())
{
case ARM_PCS_AAPCS64:
case ARM_PCS_SIMD:
return NULL_TREE;
case ARM_PCS_SVE:
error ("the %qE attribute cannot be applied to an SVE function type",
name);
*no_add_attrs = true;
return NULL_TREE;
case ARM_PCS_TLSDESC:
case ARM_PCS_UNKNOWN:
break;
}
gcc_unreachable ();
}
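/* For reference, the attribute handled above is the one written in
   source code as, for example:

     void f (float *x) __attribute__ ((aarch64_vector_pcs));

   which requests the Advanced SIMD vector PCS for F; as shown above,
   the attribute is rejected on SVE function types.  */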
/* Table of machine attributes. */
static const struct attribute_spec aarch64_attribute_table[] =
{
/* { name, min_len, max_len, decl_req, type_req, fn_type_req,
affects_type_identity, handler, exclude } */
{ "aarch64_vector_pcs", 0, 0, false, true, true, true,
handle_aarch64_vector_pcs_attribute, NULL },
{ "arm_sve_vector_bits", 1, 1, false, true, false, true,
aarch64_sve::handle_arm_sve_vector_bits_attribute,
NULL },
{ "Advanced SIMD type", 1, 1, false, true, false, true, NULL, NULL },
{ "SVE type", 3, 3, false, true, false, true, NULL, NULL },
{ "SVE sizeless type", 0, 0, false, true, false, true, NULL, NULL },
{ NULL, 0, 0, false, false, false, false, NULL, NULL }
};
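/* For illustration only (these appear in user code, not in the compiler
itself), the two user-visible attributes above are typically written as:
void vec_fn (float *) __attribute__ ((aarch64_vector_pcs));
typedef svint32_t fixed_int32_t __attribute__ ((arm_sve_vector_bits (256)));
where vec_fn and fixed_int32_t are placeholder names, svint32_t comes from
<arm_sve.h>, and the second form additionally requires a matching
-msve-vector-bits setting. */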
#define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
/* An ISA extension in the co-processor and main instruction set space. */
struct aarch64_option_extension
{
const char *const name;
const unsigned long flags_on;
const unsigned long flags_off;
};
typedef enum aarch64_cond_code
{
AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
}
aarch64_cc;
#define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
struct aarch64_branch_protect_type
{
/* The type's name that the user passes to the branch-protection option
string. */
const char* name;
/* Function to handle the protection type and set global variables.
First argument is the string token corresponding with this type and the
second argument is the next token in the option string.
Return values:
* AARCH64_PARSE_OK: Handling was successful.
* AARCH64_PARSE_INVALID_ARG: The type is invalid in this context and the
caller should print an error.
* AARCH64_PARSE_INVALID_FEATURE: The type is invalid and the handler prints
its own error. */
enum aarch64_parse_opt_result (*handler)(char*, char*);
/* A list of types that can follow this type in the option string. */
const aarch64_branch_protect_type* subtypes;
unsigned int num_subtypes;
};
static enum aarch64_parse_opt_result
aarch64_handle_no_branch_protection (char* str, char* rest)
{
aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
aarch64_enable_bti = 0;
if (rest)
{
error ("unexpected %<%s%> after %<%s%>", rest, str);
return AARCH64_PARSE_INVALID_FEATURE;
}
return AARCH64_PARSE_OK;
}
static enum aarch64_parse_opt_result
aarch64_handle_standard_branch_protection (char* str, char* rest)
{
aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
aarch64_ra_sign_key = AARCH64_KEY_A;
aarch64_enable_bti = 1;
if (rest)
{
error ("unexpected %<%s%> after %<%s%>", rest, str);
return AARCH64_PARSE_INVALID_FEATURE;
}
return AARCH64_PARSE_OK;
}
static enum aarch64_parse_opt_result
aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED,
char* rest ATTRIBUTE_UNUSED)
{
aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
aarch64_ra_sign_key = AARCH64_KEY_A;
return AARCH64_PARSE_OK;
}
static enum aarch64_parse_opt_result
aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED,
char* rest ATTRIBUTE_UNUSED)
{
aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL;
return AARCH64_PARSE_OK;
}
static enum aarch64_parse_opt_result
aarch64_handle_pac_ret_b_key (char* str ATTRIBUTE_UNUSED,
char* rest ATTRIBUTE_UNUSED)
{
aarch64_ra_sign_key = AARCH64_KEY_B;
return AARCH64_PARSE_OK;
}
static enum aarch64_parse_opt_result
aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED,
char* rest ATTRIBUTE_UNUSED)
{
aarch64_enable_bti = 1;
return AARCH64_PARSE_OK;
}
static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = {
{ "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 },
{ "b-key", aarch64_handle_pac_ret_b_key, NULL, 0 },
{ NULL, NULL, NULL, 0 }
};
static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = {
{ "none", aarch64_handle_no_branch_protection, NULL, 0 },
{ "standard", aarch64_handle_standard_branch_protection, NULL, 0 },
{ "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes,
ARRAY_SIZE (aarch64_pac_ret_subtypes) },
{ "bti", aarch64_handle_bti_protection, NULL, 0 },
{ NULL, NULL, NULL, 0 }
};
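/* As an illustration of how these tables are consumed (assuming the option
string has already been split into its '+'-separated tokens): for
-mbranch-protection=pac-ret+leaf+b-key, the "pac-ret" handler selects scope
NON_LEAF with key A, the "leaf" subtype widens the scope to ALL and the
"b-key" subtype switches to key B, while "standard" on its own is
equivalent to pac-ret plus bti. */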
/* The condition codes of the processor, and the inverse function. */
static const char * const aarch64_condition_codes[] =
{
"eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
"hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
};
/* The preferred condition codes for SVE conditions. */
static const char *const aarch64_sve_condition_codes[] =
{
"none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
"pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
};
/* Return the assembly token for svpattern value VALUE. */
static const char *
svpattern_token (enum aarch64_svpattern pattern)
{
switch (pattern)
{
#define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
AARCH64_FOR_SVPATTERN (CASE)
#undef CASE
case AARCH64_NUM_SVPATTERNS:
break;
}
gcc_unreachable ();
}
/* Return the location of a piece that is known to be passed or returned
in registers. FIRST_ZR is the first unused vector argument register
and FIRST_PR is the first unused predicate argument register. */
rtx
pure_scalable_type_info::piece::get_rtx (unsigned int first_zr,
unsigned int first_pr) const
{
gcc_assert (VECTOR_MODE_P (mode)
&& first_zr + num_zr <= V0_REGNUM + NUM_FP_ARG_REGS
&& first_pr + num_pr <= P0_REGNUM + NUM_PR_ARG_REGS);
if (num_zr > 0 && num_pr == 0)
return gen_rtx_REG (mode, first_zr);
if (num_zr == 0 && num_pr == 1)
return gen_rtx_REG (mode, first_pr);
gcc_unreachable ();
}
/* Return the total number of vector registers required by the PST. */
unsigned int
pure_scalable_type_info::num_zr () const
{
unsigned int res = 0;
for (unsigned int i = 0; i < pieces.length (); ++i)
res += pieces[i].num_zr;
return res;
}
/* Return the total number of predicate registers required by the PST. */
unsigned int
pure_scalable_type_info::num_pr () const
{
unsigned int res = 0;
for (unsigned int i = 0; i < pieces.length (); ++i)
res += pieces[i].num_pr;
return res;
}
/* Return the location of a PST that is known to be passed or returned
in registers. FIRST_ZR is the first unused vector argument register
and FIRST_PR is the first unused predicate argument register. */
rtx
pure_scalable_type_info::get_rtx (machine_mode mode,
unsigned int first_zr,
unsigned int first_pr) const
{
/* Try to return a single REG if possible. This leads to better
code generation; it isn't required for correctness. */
if (mode == pieces[0].mode)
{
gcc_assert (pieces.length () == 1);
return pieces[0].get_rtx (first_zr, first_pr);
}
/* Build up a PARALLEL that contains the individual pieces. */
rtvec rtxes = rtvec_alloc (pieces.length ());
for (unsigned int i = 0; i < pieces.length (); ++i)
{
rtx reg = pieces[i].get_rtx (first_zr, first_pr);
rtx offset = gen_int_mode (pieces[i].offset, Pmode);
RTVEC_ELT (rtxes, i) = gen_rtx_EXPR_LIST (VOIDmode, reg, offset);
first_zr += pieces[i].num_zr;
first_pr += pieces[i].num_pr;
}
return gen_rtx_PARALLEL (mode, rtxes);
}
/* Analyze whether TYPE is a Pure Scalable Type according to the rules
in the AAPCS64. */
pure_scalable_type_info::analysis_result
pure_scalable_type_info::analyze (const_tree type)
{
/* Prevent accidental reuse. */
gcc_assert (pieces.is_empty ());
/* No code will be generated for erroneous types, so we won't establish
an ABI mapping. */
if (type == error_mark_node)
return NO_ABI_IDENTITY;
/* Zero-sized types disappear in the language->ABI mapping. */
if (TYPE_SIZE (type) && integer_zerop (TYPE_SIZE (type)))
return NO_ABI_IDENTITY;
/* Check for SVTs, SPTs, and built-in tuple types that map to PSTs. */
piece p = {};
if (aarch64_sve::builtin_type_p (type, &p.num_zr, &p.num_pr))
{
machine_mode mode = TYPE_MODE_RAW (type);
gcc_assert (VECTOR_MODE_P (mode)
&& (!TARGET_SVE || aarch64_sve_mode_p (mode)));
p.mode = p.orig_mode = mode;
add_piece (p);
return IS_PST;
}
/* Check for user-defined PSTs. */
if (TREE_CODE (type) == ARRAY_TYPE)
return analyze_array (type);
if (TREE_CODE (type) == RECORD_TYPE)
return analyze_record (type);
return ISNT_PST;
}
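/* Two illustrative inputs for the analysis above, assuming TARGET_SVE and
the <arm_sve.h>/<arm_neon.h> types: a struct containing an svfloat32_t and
an svbool_t is IS_PST (one vector piece plus one predicate piece), whereas
an Advanced SIMD type such as float32x4_t is neither an SVE built-in type
nor an array or record, so it is ISNT_PST. */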
/* Analyze a type that is known not to be passed or returned in memory.
Return true if it has an ABI identity and is a Pure Scalable Type. */
bool
pure_scalable_type_info::analyze_registers (const_tree type)
{
analysis_result result = analyze (type);
gcc_assert (result != DOESNT_MATTER);
return result == IS_PST;
}
/* Subroutine of analyze for handling ARRAY_TYPEs. */
pure_scalable_type_info::analysis_result
pure_scalable_type_info::analyze_array (const_tree type)
{
/* Analyze the element type. */
pure_scalable_type_info element_info;
analysis_result result = element_info.analyze (TREE_TYPE (type));
if (result != IS_PST)
return result;
/* An array of unknown, flexible or variable length will be passed and
returned by reference whatever we do. */
tree nelts_minus_one = array_type_nelts (type);
if (!tree_fits_uhwi_p (nelts_minus_one))
return DOESNT_MATTER;
/* Likewise if the array is constant-sized but too big to be interesting.
The double checks against MAX_PIECES are to protect against overflow. */
unsigned HOST_WIDE_INT count = tree_to_uhwi (nelts_minus_one);
if (count > MAX_PIECES)
return DOESNT_MATTER;
count += 1;
if (count * element_info.pieces.length () > MAX_PIECES)
return DOESNT_MATTER;
/* The above checks should have weeded out elements of unknown size. */
poly_uint64 element_bytes;
if (!poly_int_tree_p (TYPE_SIZE_UNIT (TREE_TYPE (type)), &element_bytes))
gcc_unreachable ();
/* Build up the list of individual vectors and predicates. */
gcc_assert (!element_info.pieces.is_empty ());
for (unsigned int i = 0; i < count; ++i)
for (unsigned int j = 0; j < element_info.pieces.length (); ++j)
{
piece p = element_info.pieces[j];
p.offset += i * element_bytes;
add_piece (p);
}
return IS_PST;
}
/* Subroutine of analyze for handling RECORD_TYPEs. */
pure_scalable_type_info::analysis_result
pure_scalable_type_info::analyze_record (const_tree type)
{
for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
{
if (TREE_CODE (field) != FIELD_DECL)
continue;
/* Zero-sized fields disappear in the language->ABI mapping. */
if (DECL_SIZE (field) && integer_zerop (DECL_SIZE (field)))
continue;
/* All fields with an ABI identity must be PSTs for the record as
a whole to be a PST. If any individual field is too big to be
interesting then the record is too. */
pure_scalable_type_info field_info;
analysis_result subresult = field_info.analyze (TREE_TYPE (field));
if (subresult == NO_ABI_IDENTITY)
continue;
if (subresult != IS_PST)
return subresult;
/* Since all previous fields are PSTs, we ought to be able to track
the field offset using poly_ints. */
tree bitpos = bit_position (field);
gcc_assert (poly_int_tree_p (bitpos));
/* For the same reason, it shouldn't be possible to create a PST field
whose offset isn't byte-aligned. */
poly_widest_int wide_bytepos = exact_div (wi::to_poly_widest (bitpos),
BITS_PER_UNIT);
/* Punt if the record is too big to be interesting. */
poly_uint64 bytepos;
if (!wide_bytepos.to_uhwi (&bytepos)
|| pieces.length () + field_info.pieces.length () > MAX_PIECES)
return DOESNT_MATTER;
/* Add the individual vectors and predicates in the field to the
record's list. */
gcc_assert (!field_info.pieces.is_empty ());
for (unsigned int i = 0; i < field_info.pieces.length (); ++i)
{
piece p = field_info.pieces[i];
p.offset += bytepos;
add_piece (p);
}
}
/* Empty structures disappear in the language->ABI mapping. */
return pieces.is_empty () ? NO_ABI_IDENTITY : IS_PST;
}
/* Add P to the list of pieces in the type. */
void
pure_scalable_type_info::add_piece (const piece &p)
{
/* Try to fold the new piece into the previous one to form a
single-mode PST. For example, if we see three consecutive vectors
of the same mode, we can represent them using the corresponding
3-tuple mode.
This is purely an optimization. */
if (!pieces.is_empty ())
{
piece &prev = pieces.last ();
gcc_assert (VECTOR_MODE_P (p.mode) && VECTOR_MODE_P (prev.mode));
unsigned int nelems1, nelems2;
if (prev.orig_mode == p.orig_mode
&& known_eq (prev.offset + GET_MODE_SIZE (prev.mode), p.offset)
&& constant_multiple_p (GET_MODE_NUNITS (prev.mode),
GET_MODE_NUNITS (p.orig_mode), &nelems1)
&& constant_multiple_p (GET_MODE_NUNITS (p.mode),
GET_MODE_NUNITS (p.orig_mode), &nelems2)
&& targetm.array_mode (p.orig_mode,
nelems1 + nelems2).exists (&prev.mode))
{
prev.num_zr += p.num_zr;
prev.num_pr += p.num_pr;
return;
}
}
pieces.quick_push (p);
}
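/* For example (purely illustrative), analyzing svfloat32_t[3] produces
three consecutive VNx4SFmode pieces; the folding above first merges two of
them into a VNx8SFmode piece and then all three into a single VNx12SFmode
piece. */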
/* Return true if at least one possible value of type TYPE includes at
least one object of Pure Scalable Type, in the sense of the AAPCS64.
This is a relatively expensive test for some types, so it should
generally be made as late as possible. */
static bool
aarch64_some_values_include_pst_objects_p (const_tree type)
{
if (TYPE_SIZE (type) && integer_zerop (TYPE_SIZE (type)))
return false;
if (aarch64_sve::builtin_type_p (type))
return true;
if (TREE_CODE (type) == ARRAY_TYPE || TREE_CODE (type) == COMPLEX_TYPE)
return aarch64_some_values_include_pst_objects_p (TREE_TYPE (type));
if (RECORD_OR_UNION_TYPE_P (type))
for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
if (TREE_CODE (field) == FIELD_DECL
&& aarch64_some_values_include_pst_objects_p (TREE_TYPE (field)))
return true;
return false;
}
/* Return the descriptor of the SIMD ABI. */
static const predefined_function_abi &
aarch64_simd_abi (void)
{
predefined_function_abi &simd_abi = function_abis[ARM_PCS_SIMD];
if (!simd_abi.initialized_p ())
{
HARD_REG_SET full_reg_clobbers
= default_function_abi.full_reg_clobbers ();
for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
if (FP_SIMD_SAVED_REGNUM_P (regno))
CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
simd_abi.initialize (ARM_PCS_SIMD, full_reg_clobbers);
}
return simd_abi;
}
/* Return the descriptor of the SVE PCS. */
static const predefined_function_abi &
aarch64_sve_abi (void)
{
predefined_function_abi &sve_abi = function_abis[ARM_PCS_SVE];
if (!sve_abi.initialized_p ())
{
HARD_REG_SET full_reg_clobbers
= default_function_abi.full_reg_clobbers ();
for (int regno = V8_REGNUM; regno <= V23_REGNUM; ++regno)
CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
for (int regno = P4_REGNUM; regno <= P15_REGNUM; ++regno)
CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
sve_abi.initialize (ARM_PCS_SVE, full_reg_clobbers);
}
return sve_abi;
}
/* If X is an UNSPEC_SALT_ADDR expression, return the address that it
wraps, otherwise return X itself. */
static rtx
strip_salt (rtx x)
{
rtx search = x;
if (GET_CODE (search) == CONST)
search = XEXP (search, 0);
if (GET_CODE (search) == UNSPEC && XINT (search, 1) == UNSPEC_SALT_ADDR)
x = XVECEXP (search, 0, 0);
return x;
}
/* Like strip_offset, but also strip any UNSPEC_SALT_ADDR from the
expression. */
static rtx
strip_offset_and_salt (rtx addr, poly_int64 *offset)
{
return strip_salt (strip_offset (addr, offset));
}
/* Generate the far-branch sequence that makes conditional branches work in
functions over 1 MiB: emit the branch given by BRANCH_FORMAT to a local
label and follow it with an unconditional branch to the original
destination. */
const char *
aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
const char * branch_format)
{
rtx_code_label * tmp_label = gen_label_rtx ();
char label_buf[256];
char buffer[128];
ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
CODE_LABEL_NUMBER (tmp_label));
const char *label_ptr = targetm.strip_name_encoding (label_buf);
rtx dest_label = operands[pos_label];
operands[pos_label] = tmp_label;
snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
output_asm_insn (buffer, operands);
snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
operands[pos_label] = dest_label;
output_asm_insn (buffer, operands);
return "";
}
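/* A sketch of the expected output, assuming the caller passes the inverted
condition in BRANCH_FORMAT (as the far-branch alternatives in the .md
patterns do): a far "b.eq target" becomes
b.ne .Lbcond<N>
b target
.Lbcond<N>:
where .Lbcond<N> is the internal label generated from DEST. */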
void
aarch64_err_no_fpadvsimd (machine_mode mode)
{
if (TARGET_GENERAL_REGS_ONLY)
if (FLOAT_MODE_P (mode))
error ("%qs is incompatible with the use of floating-point types",
"-mgeneral-regs-only");
else
error ("%qs is incompatible with the use of vector types",
"-mgeneral-regs-only");
else
if (FLOAT_MODE_P (mode))
error ("%qs feature modifier is incompatible with the use of"
" floating-point types", "+nofp");
else
error ("%qs feature modifier is incompatible with the use of"
" vector types", "+nofp");
}
/* Report when we try to do something that requires SVE when SVE is disabled.
This is an error of last resort and isn't very high-quality. It usually
involves attempts to measure the vector length in some way. */
static void
aarch64_report_sve_required (void)
{
static bool reported_p = false;
/* Avoid reporting a slew of messages for a single oversight. */
if (reported_p)
return;
error ("this operation requires the SVE ISA extension");
inform (input_location, "you can enable SVE using the command-line"
" option %<-march%>, or by using the %<target%>"
" attribute or pragma");
reported_p = true;
}
/* Return true if REGNO is P0-P15 or one of the special FFR-related
registers. */
inline bool
pr_or_ffr_regnum_p (unsigned int regno)
{
return PR_REGNUM_P (regno) || regno == FFR_REGNUM || regno == FFRT_REGNUM;
}
/* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
and GENERAL_REGS is lower than the memory cost (in this case the best class
is the lowest cost one). Using POINTER_AND_FP_REGS irrespective of its
cost results in bad allocations with many redundant int<->FP moves, which
are expensive on various cores.
To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
The result of this is that it is no longer inefficient to have a higher
memory move cost than the register move cost.
*/
static reg_class_t
aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
reg_class_t best_class)
{
machine_mode mode;
if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
|| !reg_class_subset_p (FP_REGS, allocno_class))
return allocno_class;
if (!reg_class_subset_p (GENERAL_REGS, best_class)
|| !reg_class_subset_p (FP_REGS, best_class))
return best_class;
mode = PSEUDO_REGNO_MODE (regno);
return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
}
static unsigned int
aarch64_min_divisions_for_recip_mul (machine_mode mode)
{
if (GET_MODE_UNIT_SIZE (mode) == 4)
return aarch64_tune_params.min_div_recip_mul_sf;
return aarch64_tune_params.min_div_recip_mul_df;
}
/* Return the reassociation width of treeop OPC with mode MODE. */
static int
aarch64_reassociation_width (unsigned opc, machine_mode mode)
{
if (VECTOR_MODE_P (mode))
return aarch64_tune_params.vec_reassoc_width;
if (INTEGRAL_MODE_P (mode))
return aarch64_tune_params.int_reassoc_width;
/* Avoid reassociating floating point addition so we emit more FMAs. */
if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
return aarch64_tune_params.fp_reassoc_width;
return 1;
}
/* Provide a mapping from GCC register numbers to DWARF register numbers. */
unsigned
aarch64_dbx_register_number (unsigned regno)
{
if (GP_REGNUM_P (regno))
return AARCH64_DWARF_R0 + regno - R0_REGNUM;
else if (regno == SP_REGNUM)
return AARCH64_DWARF_SP;
else if (FP_REGNUM_P (regno))
return AARCH64_DWARF_V0 + regno - V0_REGNUM;
else if (PR_REGNUM_P (regno))
return AARCH64_DWARF_P0 + regno - P0_REGNUM;
else if (regno == VG_REGNUM)
return AARCH64_DWARF_VG;
/* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
equivalent DWARF register. */
return DWARF_FRAME_REGISTERS;
}
/* If X is a CONST_DOUBLE, return its bit representation as a constant
integer, otherwise return X unmodified. */
static rtx
aarch64_bit_representation (rtx x)
{
if (CONST_DOUBLE_P (x))
x = gen_lowpart (int_mode_for_mode (GET_MODE (x)).require (), x);
return x;
}
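/* For example, the SFmode constant 1.0 is rewritten as (const_int
0x3f800000), its IEEE single-precision bit pattern. */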
/* Return an estimate for the number of quadwords in an SVE vector. This is
equivalent to the number of Advanced SIMD vectors in an SVE vector. */
static unsigned int
aarch64_estimated_sve_vq ()
{
return estimated_poly_value (BITS_PER_SVE_VECTOR) / 128;
}
/* Return true if MODE is any of the Advanced SIMD structure modes. */
static bool
aarch64_advsimd_struct_mode_p (machine_mode mode)
{
return (TARGET_SIMD
&& (mode == OImode || mode == CImode || mode == XImode));
}
/* Return true if MODE is an SVE predicate mode. */
static bool
aarch64_sve_pred_mode_p (machine_mode mode)
{
return (TARGET_SVE
&& (mode == VNx16BImode
|| mode == VNx8BImode
|| mode == VNx4BImode
|| mode == VNx2BImode));
}
/* Three mutually-exclusive flags describing a vector or predicate type. */
const unsigned int VEC_ADVSIMD = 1;
const unsigned int VEC_SVE_DATA = 2;
const unsigned int VEC_SVE_PRED = 4;
/* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
a structure of 2, 3 or 4 vectors. */
const unsigned int VEC_STRUCT = 8;
/* Can be used in combination with VEC_SVE_DATA to indicate that the
vector has fewer significant bytes than a full SVE vector. */
const unsigned int VEC_PARTIAL = 16;
/* Useful combinations of the above. */
const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
/* Return a set of flags describing the vector properties of mode MODE.
Ignore modes that are not supported by the current target. */
static unsigned int
aarch64_classify_vector_mode (machine_mode mode)
{
if (aarch64_advsimd_struct_mode_p (mode))
return VEC_ADVSIMD | VEC_STRUCT;
if (aarch64_sve_pred_mode_p (mode))
return VEC_SVE_PRED;
/* Make the decision based on the mode's enum value rather than its
properties, so that we keep the correct classification regardless
of -msve-vector-bits. */
switch (mode)
{
/* Partial SVE QI vectors. */
case E_VNx2QImode:
case E_VNx4QImode:
case E_VNx8QImode:
/* Partial SVE HI vectors. */
case E_VNx2HImode:
case E_VNx4HImode:
/* Partial SVE SI vector. */
case E_VNx2SImode:
/* Partial SVE HF vectors. */
case E_VNx2HFmode:
case E_VNx4HFmode:
/* Partial SVE BF vectors. */
case E_VNx2BFmode:
case E_VNx4BFmode:
/* Partial SVE SF vector. */
case E_VNx2SFmode:
return TARGET_SVE ? VEC_SVE_DATA | VEC_PARTIAL : 0;
case E_VNx16QImode:
case E_VNx8HImode:
case E_VNx4SImode:
case E_VNx2DImode:
case E_VNx8BFmode:
case E_VNx8HFmode:
case E_VNx4SFmode:
case E_VNx2DFmode:
return TARGET_SVE ? VEC_SVE_DATA : 0;
/* x2 SVE vectors. */
case E_VNx32QImode:
case E_VNx16HImode:
case E_VNx8SImode:
case E_VNx4DImode:
case E_VNx16BFmode:
case E_VNx16HFmode:
case E_VNx8SFmode:
case E_VNx4DFmode:
/* x3 SVE vectors. */
case E_VNx48QImode:
case E_VNx24HImode:
case E_VNx12SImode:
case E_VNx6DImode:
case E_VNx24BFmode:
case E_VNx24HFmode:
case E_VNx12SFmode:
case E_VNx6DFmode:
/* x4 SVE vectors. */
case E_VNx64QImode:
case E_VNx32HImode:
case E_VNx16SImode:
case E_VNx8DImode:
case E_VNx32BFmode:
case E_VNx32HFmode:
case E_VNx16SFmode:
case E_VNx8DFmode:
return TARGET_SVE ? VEC_SVE_DATA | VEC_STRUCT : 0;
/* 64-bit Advanced SIMD vectors. */
case E_V8QImode:
case E_V4HImode:
case E_V2SImode:
/* ...E_V1DImode doesn't exist. */
case E_V4HFmode:
case E_V4BFmode:
case E_V2SFmode:
case E_V1DFmode:
/* 128-bit Advanced SIMD vectors. */
case E_V16QImode:
case E_V8HImode:
case E_V4SImode:
case E_V2DImode:
case E_V8HFmode:
case E_V8BFmode:
case E_V4SFmode:
case E_V2DFmode:
return TARGET_SIMD ? VEC_ADVSIMD : 0;
default:
return 0;
}
}
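/* Some illustrative classifications, assuming the relevant target features
are enabled: V4SImode is VEC_ADVSIMD, VNx4SImode is VEC_SVE_DATA,
VNx2SImode is VEC_SVE_DATA | VEC_PARTIAL (32-bit elements in 64-bit
containers), VNx8SImode is VEC_SVE_DATA | VEC_STRUCT (an x2 tuple) and
VNx4BImode is VEC_SVE_PRED. */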
/* Return true if MODE is any of the data vector modes, including
structure modes. */
static bool
aarch64_vector_data_mode_p (machine_mode mode)
{
return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
}
/* Return true if MODE is any form of SVE mode, including predicates,
vectors and structures. */
bool
aarch64_sve_mode_p (machine_mode mode)
{
return aarch64_classify_vector_mode (mode) & VEC_ANY_SVE;
}
/* Return true if MODE is an SVE data vector mode; either a single vector
or a structure of vectors. */
static bool
aarch64_sve_data_mode_p (machine_mode mode)
{
return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
}
/* Return the number of defined bytes in one constituent vector of
SVE mode MODE, which has vector flags VEC_FLAGS. */
static poly_int64
aarch64_vl_bytes (machine_mode mode, unsigned int vec_flags)
{
if (vec_flags & VEC_PARTIAL)
/* A single partial vector. */
return GET_MODE_SIZE (mode);
if (vec_flags & VEC_SVE_DATA)
/* A single vector or a tuple. */
return BYTES_PER_SVE_VECTOR;
/* A single predicate. */
gcc_assert (vec_flags & VEC_SVE_PRED);
return BYTES_PER_SVE_PRED;
}
/* Implement target hook TARGET_ARRAY_MODE. */
static opt_machine_mode
aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
{
if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
&& IN_RANGE (nelems, 2, 4))
return mode_for_vector (GET_MODE_INNER (mode),
GET_MODE_NUNITS (mode) * nelems);
return opt_machine_mode ();
}
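/* For example, an array of three VNx4SImode vectors is given the mode
VNx12SImode, one of the x3 SVE tuple modes classified above. */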
/* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
static bool
aarch64_array_mode_supported_p (machine_mode mode,
unsigned HOST_WIDE_INT nelems)
{
if (TARGET_SIMD
&& (AARCH64_VALID_SIMD_QREG_MODE (mode)
|| AARCH64_VALID_SIMD_DREG_MODE (mode))
&& (nelems >= 2 && nelems <= 4))
return true;
return false;
}
/* MODE is some form of SVE vector mode. For data modes, return the number
of vector register bits that each element of MODE occupies, such as 64
for both VNx2DImode and VNx2SImode (where each 32-bit value is stored
in a 64-bit container). For predicate modes, return the number of
data bits controlled by each significant predicate bit. */
static unsigned int
aarch64_sve_container_bits (machine_mode mode)
{
unsigned int vec_flags = aarch64_classify_vector_mode (mode);
poly_uint64 vector_bits = (vec_flags & (VEC_PARTIAL | VEC_SVE_PRED)
? BITS_PER_SVE_VECTOR
: GET_MODE_BITSIZE (mode));
return vector_element_size (vector_bits, GET_MODE_NUNITS (mode));
}
/* Return the SVE predicate mode to use for elements that have
ELEM_NBYTES bytes, if such a mode exists. */
opt_machine_mode
aarch64_sve_pred_mode (unsigned int elem_nbytes)
{
if (TARGET_SVE)
{
if (elem_nbytes == 1)
return VNx16BImode;
if (elem_nbytes == 2)
return VNx8BImode;
if (elem_nbytes == 4)
return VNx4BImode;
if (elem_nbytes == 8)
return VNx2BImode;
}
return opt_machine_mode ();
}
/* Return the SVE predicate mode that should be used to control
SVE mode MODE. */
machine_mode
aarch64_sve_pred_mode (machine_mode mode)
{
unsigned int bits = aarch64_sve_container_bits (mode);
return aarch64_sve_pred_mode (bits / BITS_PER_UNIT).require ();
}
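/* For example, VNx4SImode is controlled by VNx4BImode, and the partial
vector mode VNx2SImode (32-bit elements in 64-bit containers) is likewise
controlled by a predicate with 64-bit granules, VNx2BImode. */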
/* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
static opt_machine_mode
aarch64_get_mask_mode (machine_mode mode)
{
unsigned int vec_flags = aarch64_classify_vector_mode (mode);
if (vec_flags & VEC_SVE_DATA)
return aarch64_sve_pred_mode (mode);
return default_get_mask_mode (mode);
}
/* Return the SVE vector mode that has NUNITS elements of mode INNER_MODE. */
opt_machine_mode
aarch64_sve_data_mode (scalar_mode inner_mode, poly_uint64 nunits)
{
enum mode_class mclass = (is_a <scalar_float_mode> (inner_mode)
? MODE_VECTOR_FLOAT : MODE_VECTOR_INT);
machine_mode mode;
FOR_EACH_MODE_IN_CLASS (mode, mclass)
if (inner_mode == GET_MODE_INNER (mode)
&& known_eq (nunits, GET_MODE_NUNITS (mode))
&& aarch64_sve_data_mode_p (mode))
return mode;
return opt_machine_mode ();
}
/* Return the integer element mode associated with SVE mode MODE. */
static scalar_int_mode
aarch64_sve_element_int_mode (machine_mode mode)
{
poly_uint64 vector_bits = (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
? BITS_PER_SVE_VECTOR
: GET_MODE_BITSIZE (mode));
unsigned int elt_bits = vector_element_size (vector_bits,
GET_MODE_NUNITS (mode));
return int_mode_for_size (elt_bits, 0).require ();
}
/* Return an integer element mode that contains exactly
aarch64_sve_container_bits (MODE) bits. This is wider than
aarch64_sve_element_int_mode if MODE is a partial vector,
otherwise it's the same. */
static scalar_int_mode
aarch64_sve_container_int_mode (machine_mode mode)
{
return int_mode_for_size (aarch64_sve_container_bits (mode), 0).require ();
}
/* Return the integer vector mode associated with SVE mode MODE.
Unlike related_int_vector_mode, this can handle the case in which
MODE is a predicate (and thus has a different total size). */
machine_mode
aarch64_sve_int_mode (machine_mode mode)
{
scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
return aarch64_sve_data_mode (int_mode, GET_MODE_NUNITS (mode)).require ();
}
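/* For example, VNx4SFmode maps to VNx4SImode, while the predicate mode
VNx8BImode maps to VNx8HImode (one 16-bit integer element per significant
predicate bit). */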
/* Implement TARGET_VECTORIZE_RELATED_MODE. */
static opt_machine_mode
aarch64_vectorize_related_mode (machine_mode vector_mode,
scalar_mode element_mode,
poly_uint64 nunits)
{
unsigned int vec_flags = aarch64_classify_vector_mode (vector_mode);
/* If we're operating on SVE vectors, try to return an SVE mode. */
poly_uint64 sve_nunits;
if ((vec_flags & VEC_SVE_DATA)
&& multiple_p (BYTES_PER_SVE_VECTOR,
GET_MODE_SIZE (element_mode), &sve_nunits))
{
machine_mode sve_mode;
if (maybe_ne (nunits, 0U))
{
/* Try to find a full or partial SVE mode with exactly
NUNITS units. */
if (multiple_p (sve_nunits, nunits)
&& aarch64_sve_data_mode (element_mode,
nunits).exists (&sve_mode))
return sve_mode;
}
else
{
/* Take the preferred number of units from the number of bytes
that fit in VECTOR_MODE. We always start by "autodetecting"
a full vector mode with preferred_simd_mode, so vectors
chosen here will also be full vector modes. Then
autovectorize_vector_modes tries smaller starting modes
and thus smaller preferred numbers of units. */
sve_nunits = ordered_min (sve_nunits, GET_MODE_SIZE (vector_mode));
if (aarch64_sve_data_mode (element_mode,
sve_nunits).exists (&sve_mode))
return sve_mode;
}
}
/* Prefer to use 1 128-bit vector instead of 2 64-bit vectors. */
if ((vec_flags & VEC_ADVSIMD)
&& known_eq (nunits, 0U)
&& known_eq (GET_MODE_BITSIZE (vector_mode), 64U)
&& maybe_ge (GET_MODE_BITSIZE (element_mode)
* GET_MODE_NUNITS (vector_mode), 128U))
{
machine_mode res = aarch64_simd_container_mode (element_mode, 128);
if (VECTOR_MODE_P (res))
return res;
}
return default_vectorize_related_mode (vector_mode, element_mode, nunits);
}
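/* Two illustrative cases: asking for the mode related to VNx8HImode with
SImode elements and no fixed element count yields VNx4SImode, the full SVE
SImode vector; for the 64-bit Advanced SIMD mode V2SImode with DImode
elements, the 128-bit V2DImode is returned, since V1DImode does not
exist. */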
/* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
prefer to use the first arithmetic operand as the else value if
the else value doesn't matter, since that exactly matches the SVE
destructive merging form. For ternary operations we could either
pick the first operand and use FMAD-like instructions or the last
operand and use FMLA-like instructions; the latter seems more
natural. */
static tree
aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
{
return nops == 3 ? ops[2] : ops[0];
}
/* Implement TARGET_HARD_REGNO_NREGS. */
static unsigned int
aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
{
/* ??? Logically we should only need to provide a value when
HARD_REGNO_MODE_OK says that the combination is valid,
but at the moment we need to handle all modes. Just ignore
any runtime parts for registers that can't store them. */
HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
switch (aarch64_regno_regclass (regno))
{
case FP_REGS:
case FP_LO_REGS:
case FP_LO8_REGS:
{
unsigned int vec_flags = aarch64_classify_vector_mode (mode);
if (vec_flags & VEC_SVE_DATA)
return exact_div (GET_MODE_SIZE (mode),
aarch64_vl_bytes (mode, vec_flags)).to_constant ();
return CEIL (lowest_size, UNITS_PER_VREG);
}
case PR_REGS:
case PR_LO_REGS:
case PR_HI_REGS:
case FFR_REGS:
case PR_AND_FFR_REGS:
return 1;
default:
return CEIL (lowest_size, UNITS_PER_WORD);
}
gcc_unreachable ();
}
/* Implement TARGET_HARD_REGNO_MODE_OK. */
static bool
aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
{
if (GET_MODE_CLASS (mode) == MODE_CC)
return regno == CC_REGNUM;
if (regno == VG_REGNUM)
/* This must have the same size as _Unwind_Word. */
return mode == DImode;
unsigned int vec_flags = aarch64_classify_vector_mode (mode);
if (vec_flags & VEC_SVE_PRED)
return pr_or_ffr_regnum_p (regno);
if (pr_or_ffr_regnum_p (regno))
return false;
if (regno == SP_REGNUM)
/* Comparing against ptr_mode as well supports global register
variables that are associated with the stack pointer register
via asm ("wsp") in ILP32. */
return mode == Pmode || mode == ptr_mode;
if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
return mode == Pmode;
if (GP_REGNUM_P (regno))
{
if (vec_flags & VEC_ANY_SVE)
return false;
if (known_le (GET_MODE_SIZE (mode), 8))
return true;
if (known_le (GET_MODE_SIZE (mode), 16))
return (regno & 1) == 0;
}
else if (FP_REGNUM_P (regno))
{
if (vec_flags & VEC_STRUCT)
return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
else
return !VECTOR_MODE_P (mode) || vec_flags != 0;
}
return false;
}
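/* Two consequences of the checks above: in the general registers, TImode
requires an even register number so that it can occupy a consecutive pair,
and SVE predicate modes such as VNx16BImode are only valid in P0-P15, the
FFR and the FFR token register. */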
/* Return true if a function with type FNTYPE returns its value in
SVE vector or predicate registers. */
static bool
aarch64_returns_value_in_sve_regs_p (const_tree fntype)
{
tree return_type = TREE_TYPE (fntype);
pure_scalable_type_info pst_info;
switch (pst_info.analyze (return_type))
{
case pure_scalable_type_info::IS_PST:
return (pst_info.num_zr () <= NUM_FP_ARG_REGS
&& pst_info.num_pr () <= NUM_PR_ARG_REGS);
case pure_scalable_type_info::DOESNT_MATTER:
gcc_assert (aarch64_return_in_memory_1 (return_type));
return false;
case pure_scalable_type_info::NO_ABI_IDENTITY:
case pure_scalable_type_info::ISNT_PST:
return false;
}
gcc_unreachable ();
}
/* Return true if a function with type FNTYPE takes arguments in
SVE vector or predicate registers. */
static bool
aarch64_takes_arguments_in_sve_regs_p (const_tree fntype)