/* Output routines for GCC for ARM.
Copyright (C) 1991-2015 Free Software Foundation, Inc.
Contributed by Pieter `Tiggr' Schoenmakers (rcpieter@win.tue.nl)
and Martin Simmons (@harleqn.co.uk).
More major hacks by Richard Earnshaw (rearnsha@arm.com).
This file is part of GCC.
GCC is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published
by the Free Software Foundation; either version 3, or (at your
option) any later version.
GCC is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
License for more details.
You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3. If not see
<http://www.gnu.org/licenses/>. */
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "hash-table.h"
#include "tm.h"
#include "rtl.h"
#include "hash-set.h"
#include "machmode.h"
#include "vec.h"
#include "double-int.h"
#include "input.h"
#include "alias.h"
#include "symtab.h"
#include "wide-int.h"
#include "inchash.h"
#include "tree.h"
#include "fold-const.h"
#include "stringpool.h"
#include "stor-layout.h"
#include "calls.h"
#include "varasm.h"
#include "obstack.h"
#include "regs.h"
#include "hard-reg-set.h"
#include "insn-config.h"
#include "conditions.h"
#include "output.h"
#include "insn-attr.h"
#include "flags.h"
#include "reload.h"
#include "function.h"
#include "hashtab.h"
#include "statistics.h"
#include "real.h"
#include "fixed-value.h"
#include "expmed.h"
#include "dojump.h"
#include "explow.h"
#include "emit-rtl.h"
#include "stmt.h"
#include "expr.h"
#include "insn-codes.h"
#include "optabs.h"
#include "diagnostic-core.h"
#include "recog.h"
#include "predict.h"
#include "dominance.h"
#include "cfg.h"
#include "cfgrtl.h"
#include "cfganal.h"
#include "lcm.h"
#include "cfgbuild.h"
#include "cfgcleanup.h"
#include "basic-block.h"
#include "hash-map.h"
#include "is-a.h"
#include "plugin-api.h"
#include "ipa-ref.h"
#include "cgraph.h"
#include "ggc.h"
#include "except.h"
#include "tm_p.h"
#include "target.h"
#include "sched-int.h"
#include "target-def.h"
#include "debug.h"
#include "langhooks.h"
#include "df.h"
#include "intl.h"
#include "libfuncs.h"
#include "params.h"
#include "opts.h"
#include "dumpfile.h"
#include "gimple-expr.h"
#include "builtins.h"
#include "tm-constrs.h"
#include "rtl-iter.h"
#include "sched-int.h"
/* Forward definitions of types. */
typedef struct minipool_node Mnode;
typedef struct minipool_fixup Mfix;
void (*arm_lang_output_object_attributes_hook)(void);
struct four_ints
{
int i[4];
};
/* Forward function declarations. */
static bool arm_const_not_ok_for_debug_p (rtx);
static bool arm_needs_doubleword_align (machine_mode, const_tree);
static int arm_compute_static_chain_stack_bytes (void);
static arm_stack_offsets *arm_get_frame_offsets (void);
static void arm_add_gc_roots (void);
static int arm_gen_constant (enum rtx_code, machine_mode, rtx,
HOST_WIDE_INT, rtx, rtx, int, int);
static unsigned bit_count (unsigned long);
static int arm_address_register_rtx_p (rtx, int);
static int arm_legitimate_index_p (machine_mode, rtx, RTX_CODE, int);
static int thumb2_legitimate_index_p (machine_mode, rtx, int);
static int thumb1_base_register_rtx_p (rtx, machine_mode, int);
static rtx arm_legitimize_address (rtx, rtx, machine_mode);
static reg_class_t arm_preferred_reload_class (rtx, reg_class_t);
static rtx thumb_legitimize_address (rtx, rtx, machine_mode);
inline static int thumb1_index_register_rtx_p (rtx, int);
static int thumb_far_jump_used_p (void);
static bool thumb_force_lr_save (void);
static unsigned arm_size_return_regs (void);
static bool arm_assemble_integer (rtx, unsigned int, int);
static void arm_print_operand (FILE *, rtx, int);
static void arm_print_operand_address (FILE *, rtx);
static bool arm_print_operand_punct_valid_p (unsigned char code);
static const char *fp_const_from_val (REAL_VALUE_TYPE *);
static arm_cc get_arm_condition_code (rtx);
static HOST_WIDE_INT int_log2 (HOST_WIDE_INT);
static const char *output_multi_immediate (rtx *, const char *, const char *,
int, HOST_WIDE_INT);
static const char *shift_op (rtx, HOST_WIDE_INT *);
static struct machine_function *arm_init_machine_status (void);
static void thumb_exit (FILE *, int);
static HOST_WIDE_INT get_jump_table_size (rtx_jump_table_data *);
static Mnode *move_minipool_fix_forward_ref (Mnode *, Mnode *, HOST_WIDE_INT);
static Mnode *add_minipool_forward_ref (Mfix *);
static Mnode *move_minipool_fix_backward_ref (Mnode *, Mnode *, HOST_WIDE_INT);
static Mnode *add_minipool_backward_ref (Mfix *);
static void assign_minipool_offsets (Mfix *);
static void arm_print_value (FILE *, rtx);
static void dump_minipool (rtx_insn *);
static int arm_barrier_cost (rtx);
static Mfix *create_fix_barrier (Mfix *, HOST_WIDE_INT);
static void push_minipool_barrier (rtx_insn *, HOST_WIDE_INT);
static void push_minipool_fix (rtx_insn *, HOST_WIDE_INT, rtx *,
machine_mode, rtx);
static void arm_reorg (void);
static void note_invalid_constants (rtx_insn *, HOST_WIDE_INT, int);
static unsigned long arm_compute_save_reg0_reg12_mask (void);
static unsigned long arm_compute_save_reg_mask (void);
static unsigned long arm_isr_value (tree);
static unsigned long arm_compute_func_type (void);
static tree arm_handle_fndecl_attribute (tree *, tree, tree, int, bool *);
static tree arm_handle_pcs_attribute (tree *, tree, tree, int, bool *);
static tree arm_handle_isr_attribute (tree *, tree, tree, int, bool *);
#if TARGET_DLLIMPORT_DECL_ATTRIBUTES
static tree arm_handle_notshared_attribute (tree *, tree, tree, int, bool *);
#endif
static void arm_output_function_epilogue (FILE *, HOST_WIDE_INT);
static void arm_output_function_prologue (FILE *, HOST_WIDE_INT);
static int arm_comp_type_attributes (const_tree, const_tree);
static void arm_set_default_type_attributes (tree);
static int arm_adjust_cost (rtx_insn *, rtx, rtx_insn *, int);
static int arm_sched_reorder (FILE *, int, rtx_insn **, int *, int);
static int optimal_immediate_sequence (enum rtx_code code,
unsigned HOST_WIDE_INT val,
struct four_ints *return_sequence);
static int optimal_immediate_sequence_1 (enum rtx_code code,
unsigned HOST_WIDE_INT val,
struct four_ints *return_sequence,
int i);
static int arm_get_strip_length (int);
static bool arm_function_ok_for_sibcall (tree, tree);
static machine_mode arm_promote_function_mode (const_tree,
machine_mode, int *,
const_tree, int);
static bool arm_return_in_memory (const_tree, const_tree);
static rtx arm_function_value (const_tree, const_tree, bool);
static rtx arm_libcall_value_1 (machine_mode);
static rtx arm_libcall_value (machine_mode, const_rtx);
static bool arm_function_value_regno_p (const unsigned int);
static void arm_internal_label (FILE *, const char *, unsigned long);
static void arm_output_mi_thunk (FILE *, tree, HOST_WIDE_INT, HOST_WIDE_INT,
tree);
static bool arm_have_conditional_execution (void);
static bool arm_cannot_force_const_mem (machine_mode, rtx);
static bool arm_legitimate_constant_p (machine_mode, rtx);
static bool arm_rtx_costs_1 (rtx, enum rtx_code, int*, bool);
static bool arm_size_rtx_costs (rtx, enum rtx_code, enum rtx_code, int *);
static bool arm_slowmul_rtx_costs (rtx, enum rtx_code, enum rtx_code, int *, bool);
static bool arm_fastmul_rtx_costs (rtx, enum rtx_code, enum rtx_code, int *, bool);
static bool arm_xscale_rtx_costs (rtx, enum rtx_code, enum rtx_code, int *, bool);
static bool arm_9e_rtx_costs (rtx, enum rtx_code, enum rtx_code, int *, bool);
static bool arm_rtx_costs (rtx, int, int, int, int *, bool);
static int arm_address_cost (rtx, machine_mode, addr_space_t, bool);
static int arm_register_move_cost (machine_mode, reg_class_t, reg_class_t);
static int arm_memory_move_cost (machine_mode, reg_class_t, bool);
static void emit_constant_insn (rtx cond, rtx pattern);
static rtx_insn *emit_set_insn (rtx, rtx);
static rtx emit_multi_reg_push (unsigned long, unsigned long);
static int arm_arg_partial_bytes (cumulative_args_t, machine_mode,
tree, bool);
static rtx arm_function_arg (cumulative_args_t, machine_mode,
const_tree, bool);
static void arm_function_arg_advance (cumulative_args_t, machine_mode,
const_tree, bool);
static unsigned int arm_function_arg_boundary (machine_mode, const_tree);
static rtx aapcs_allocate_return_reg (machine_mode, const_tree,
const_tree);
static rtx aapcs_libcall_value (machine_mode);
static int aapcs_select_return_coproc (const_tree, const_tree);
#ifdef OBJECT_FORMAT_ELF
static void arm_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
static void arm_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
#endif
#ifndef ARM_PE
static void arm_encode_section_info (tree, rtx, int);
#endif
static void arm_file_end (void);
static void arm_file_start (void);
static void arm_setup_incoming_varargs (cumulative_args_t, machine_mode,
tree, int *, int);
static bool arm_pass_by_reference (cumulative_args_t,
machine_mode, const_tree, bool);
static bool arm_promote_prototypes (const_tree);
static bool arm_default_short_enums (void);
static bool arm_align_anon_bitfield (void);
static bool arm_return_in_msb (const_tree);
static bool arm_must_pass_in_stack (machine_mode, const_tree);
static bool arm_return_in_memory (const_tree, const_tree);
#if ARM_UNWIND_INFO
static void arm_unwind_emit (FILE *, rtx_insn *);
static bool arm_output_ttype (rtx);
static void arm_asm_emit_except_personality (rtx);
static void arm_asm_init_sections (void);
#endif
static rtx arm_dwarf_register_span (rtx);
static tree arm_cxx_guard_type (void);
static bool arm_cxx_guard_mask_bit (void);
static tree arm_get_cookie_size (tree);
static bool arm_cookie_has_size (void);
static bool arm_cxx_cdtor_returns_this (void);
static bool arm_cxx_key_method_may_be_inline (void);
static void arm_cxx_determine_class_data_visibility (tree);
static bool arm_cxx_class_data_always_comdat (void);
static bool arm_cxx_use_aeabi_atexit (void);
static void arm_init_libfuncs (void);
static tree arm_build_builtin_va_list (void);
static void arm_expand_builtin_va_start (tree, rtx);
static tree arm_gimplify_va_arg_expr (tree, tree, gimple_seq *, gimple_seq *);
static void arm_option_override (void);
static unsigned HOST_WIDE_INT arm_shift_truncation_mask (machine_mode);
static bool arm_macro_fusion_p (void);
static bool arm_cannot_copy_insn_p (rtx_insn *);
static int arm_issue_rate (void);
static int arm_first_cycle_multipass_dfa_lookahead (void);
static int arm_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *, int);
static void arm_output_dwarf_dtprel (FILE *, int, rtx) ATTRIBUTE_UNUSED;
static bool arm_output_addr_const_extra (FILE *, rtx);
static bool arm_allocate_stack_slots_for_args (void);
static bool arm_warn_func_return (tree);
static const char *arm_invalid_parameter_type (const_tree t);
static const char *arm_invalid_return_type (const_tree t);
static tree arm_promoted_type (const_tree t);
static tree arm_convert_to_type (tree type, tree expr);
static bool arm_scalar_mode_supported_p (machine_mode);
static bool arm_frame_pointer_required (void);
static bool arm_can_eliminate (const int, const int);
static void arm_asm_trampoline_template (FILE *);
static void arm_trampoline_init (rtx, tree, rtx);
static rtx arm_trampoline_adjust_address (rtx);
static rtx arm_pic_static_addr (rtx orig, rtx reg);
static bool cortex_a9_sched_adjust_cost (rtx_insn *, rtx, rtx_insn *, int *);
static bool xscale_sched_adjust_cost (rtx_insn *, rtx, rtx_insn *, int *);
static bool fa726te_sched_adjust_cost (rtx_insn *, rtx, rtx_insn *, int *);
static bool arm_array_mode_supported_p (machine_mode,
unsigned HOST_WIDE_INT);
static machine_mode arm_preferred_simd_mode (machine_mode);
static bool arm_class_likely_spilled_p (reg_class_t);
static HOST_WIDE_INT arm_vector_alignment (const_tree type);
static bool arm_vector_alignment_reachable (const_tree type, bool is_packed);
static bool arm_builtin_support_vector_misalignment (machine_mode mode,
const_tree type,
int misalignment,
bool is_packed);
static void arm_conditional_register_usage (void);
static reg_class_t arm_preferred_rename_class (reg_class_t rclass);
static unsigned int arm_autovectorize_vector_sizes (void);
static int arm_default_branch_cost (bool, bool);
static int arm_cortex_a5_branch_cost (bool, bool);
static int arm_cortex_m_branch_cost (bool, bool);
static int arm_cortex_m7_branch_cost (bool, bool);
static bool arm_vectorize_vec_perm_const_ok (machine_mode vmode,
const unsigned char *sel);
static bool aarch_macro_fusion_pair_p (rtx_insn*, rtx_insn*);
static int arm_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
tree vectype,
int misalign ATTRIBUTE_UNUSED);
static unsigned arm_add_stmt_cost (void *data, int count,
enum vect_cost_for_stmt kind,
struct _stmt_vec_info *stmt_info,
int misalign,
enum vect_cost_model_location where);
static void arm_canonicalize_comparison (int *code, rtx *op0, rtx *op1,
bool op0_preserve_value);
static unsigned HOST_WIDE_INT arm_asan_shadow_offset (void);
static void arm_sched_fusion_priority (rtx_insn *, int, int *, int*);
/* Table of machine attributes. */
static const struct attribute_spec arm_attribute_table[] =
{
/* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
affects_type_identity } */
/* Function calls made to this symbol must be done indirectly, because
it may lie outside of the 26 bit addressing range of a normal function
call. */
{ "long_call", 0, 0, false, true, true, NULL, false },
/* Whereas these functions are always known to reside within the 26 bit
addressing range. */
{ "short_call", 0, 0, false, true, true, NULL, false },
/* Specify the procedure call conventions for a function. */
{ "pcs", 1, 1, false, true, true, arm_handle_pcs_attribute,
false },
/* Interrupt Service Routines have special prologue and epilogue requirements. */
{ "isr", 0, 1, false, false, false, arm_handle_isr_attribute,
false },
{ "interrupt", 0, 1, false, false, false, arm_handle_isr_attribute,
false },
{ "naked", 0, 0, true, false, false, arm_handle_fndecl_attribute,
false },
#ifdef ARM_PE
/* ARM/PE has three new attributes:
interfacearm - ?
dllexport - for exporting a function/variable that will live in a dll
dllimport - for importing a function/variable from a dll
Microsoft allows multiple declspecs in one __declspec, separating
them with spaces. We do NOT support this. Instead, use __declspec
multiple times.
*/
{ "dllimport", 0, 0, true, false, false, NULL, false },
{ "dllexport", 0, 0, true, false, false, NULL, false },
{ "interfacearm", 0, 0, true, false, false, arm_handle_fndecl_attribute,
false },
#elif TARGET_DLLIMPORT_DECL_ATTRIBUTES
{ "dllimport", 0, 0, false, false, false, handle_dll_attribute, false },
{ "dllexport", 0, 0, false, false, false, handle_dll_attribute, false },
{ "notshared", 0, 0, false, true, false, arm_handle_notshared_attribute,
false },
#endif
{ NULL, 0, 0, false, false, false, NULL, false }
};
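/* Illustrative only (an assumption about typical use, not something this
file relies on): the attributes registered above appear in user code as,
e.g.,
void far_func (void) __attribute__ ((long_call));
void irq_handler (void) __attribute__ ((interrupt ("IRQ")));
double vfp_callee (double) __attribute__ ((pcs ("aapcs-vfp")));
"long_call"/"short_call" only affect how calls to the symbol are emitted,
"isr"/"interrupt" select the special prologue/epilogue handling, and "pcs"
overrides the procedure call convention for that declaration. */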
/* Initialize the GCC target structure. */
#if TARGET_DLLIMPORT_DECL_ATTRIBUTES
#undef TARGET_MERGE_DECL_ATTRIBUTES
#define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
#endif
#undef TARGET_LEGITIMIZE_ADDRESS
#define TARGET_LEGITIMIZE_ADDRESS arm_legitimize_address
#undef TARGET_LRA_P
#define TARGET_LRA_P hook_bool_void_true
#undef TARGET_ATTRIBUTE_TABLE
#define TARGET_ATTRIBUTE_TABLE arm_attribute_table
#undef TARGET_ASM_FILE_START
#define TARGET_ASM_FILE_START arm_file_start
#undef TARGET_ASM_FILE_END
#define TARGET_ASM_FILE_END arm_file_end
#undef TARGET_ASM_ALIGNED_SI_OP
#define TARGET_ASM_ALIGNED_SI_OP NULL
#undef TARGET_ASM_INTEGER
#define TARGET_ASM_INTEGER arm_assemble_integer
#undef TARGET_PRINT_OPERAND
#define TARGET_PRINT_OPERAND arm_print_operand
#undef TARGET_PRINT_OPERAND_ADDRESS
#define TARGET_PRINT_OPERAND_ADDRESS arm_print_operand_address
#undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
#define TARGET_PRINT_OPERAND_PUNCT_VALID_P arm_print_operand_punct_valid_p
#undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
#define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA arm_output_addr_const_extra
#undef TARGET_ASM_FUNCTION_PROLOGUE
#define TARGET_ASM_FUNCTION_PROLOGUE arm_output_function_prologue
#undef TARGET_ASM_FUNCTION_EPILOGUE
#define TARGET_ASM_FUNCTION_EPILOGUE arm_output_function_epilogue
#undef TARGET_OPTION_OVERRIDE
#define TARGET_OPTION_OVERRIDE arm_option_override
#undef TARGET_COMP_TYPE_ATTRIBUTES
#define TARGET_COMP_TYPE_ATTRIBUTES arm_comp_type_attributes
#undef TARGET_SCHED_MACRO_FUSION_P
#define TARGET_SCHED_MACRO_FUSION_P arm_macro_fusion_p
#undef TARGET_SCHED_MACRO_FUSION_PAIR_P
#define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
#undef TARGET_SET_DEFAULT_TYPE_ATTRIBUTES
#define TARGET_SET_DEFAULT_TYPE_ATTRIBUTES arm_set_default_type_attributes
#undef TARGET_SCHED_ADJUST_COST
#define TARGET_SCHED_ADJUST_COST arm_adjust_cost
#undef TARGET_SCHED_REORDER
#define TARGET_SCHED_REORDER arm_sched_reorder
#undef TARGET_REGISTER_MOVE_COST
#define TARGET_REGISTER_MOVE_COST arm_register_move_cost
#undef TARGET_MEMORY_MOVE_COST
#define TARGET_MEMORY_MOVE_COST arm_memory_move_cost
#undef TARGET_ENCODE_SECTION_INFO
#ifdef ARM_PE
#define TARGET_ENCODE_SECTION_INFO arm_pe_encode_section_info
#else
#define TARGET_ENCODE_SECTION_INFO arm_encode_section_info
#endif
#undef TARGET_STRIP_NAME_ENCODING
#define TARGET_STRIP_NAME_ENCODING arm_strip_name_encoding
#undef TARGET_ASM_INTERNAL_LABEL
#define TARGET_ASM_INTERNAL_LABEL arm_internal_label
#undef TARGET_FUNCTION_OK_FOR_SIBCALL
#define TARGET_FUNCTION_OK_FOR_SIBCALL arm_function_ok_for_sibcall
#undef TARGET_FUNCTION_VALUE
#define TARGET_FUNCTION_VALUE arm_function_value
#undef TARGET_LIBCALL_VALUE
#define TARGET_LIBCALL_VALUE arm_libcall_value
#undef TARGET_FUNCTION_VALUE_REGNO_P
#define TARGET_FUNCTION_VALUE_REGNO_P arm_function_value_regno_p
#undef TARGET_ASM_OUTPUT_MI_THUNK
#define TARGET_ASM_OUTPUT_MI_THUNK arm_output_mi_thunk
#undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
#define TARGET_ASM_CAN_OUTPUT_MI_THUNK default_can_output_mi_thunk_no_vcall
#undef TARGET_RTX_COSTS
#define TARGET_RTX_COSTS arm_rtx_costs
#undef TARGET_ADDRESS_COST
#define TARGET_ADDRESS_COST arm_address_cost
#undef TARGET_SHIFT_TRUNCATION_MASK
#define TARGET_SHIFT_TRUNCATION_MASK arm_shift_truncation_mask
#undef TARGET_VECTOR_MODE_SUPPORTED_P
#define TARGET_VECTOR_MODE_SUPPORTED_P arm_vector_mode_supported_p
#undef TARGET_ARRAY_MODE_SUPPORTED_P
#define TARGET_ARRAY_MODE_SUPPORTED_P arm_array_mode_supported_p
#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE arm_preferred_simd_mode
#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
arm_autovectorize_vector_sizes
#undef TARGET_MACHINE_DEPENDENT_REORG
#define TARGET_MACHINE_DEPENDENT_REORG arm_reorg
#undef TARGET_INIT_BUILTINS
#define TARGET_INIT_BUILTINS arm_init_builtins
#undef TARGET_EXPAND_BUILTIN
#define TARGET_EXPAND_BUILTIN arm_expand_builtin
#undef TARGET_BUILTIN_DECL
#define TARGET_BUILTIN_DECL arm_builtin_decl
#undef TARGET_INIT_LIBFUNCS
#define TARGET_INIT_LIBFUNCS arm_init_libfuncs
#undef TARGET_PROMOTE_FUNCTION_MODE
#define TARGET_PROMOTE_FUNCTION_MODE arm_promote_function_mode
#undef TARGET_PROMOTE_PROTOTYPES
#define TARGET_PROMOTE_PROTOTYPES arm_promote_prototypes
#undef TARGET_PASS_BY_REFERENCE
#define TARGET_PASS_BY_REFERENCE arm_pass_by_reference
#undef TARGET_ARG_PARTIAL_BYTES
#define TARGET_ARG_PARTIAL_BYTES arm_arg_partial_bytes
#undef TARGET_FUNCTION_ARG
#define TARGET_FUNCTION_ARG arm_function_arg
#undef TARGET_FUNCTION_ARG_ADVANCE
#define TARGET_FUNCTION_ARG_ADVANCE arm_function_arg_advance
#undef TARGET_FUNCTION_ARG_BOUNDARY
#define TARGET_FUNCTION_ARG_BOUNDARY arm_function_arg_boundary
#undef TARGET_SETUP_INCOMING_VARARGS
#define TARGET_SETUP_INCOMING_VARARGS arm_setup_incoming_varargs
#undef TARGET_ALLOCATE_STACK_SLOTS_FOR_ARGS
#define TARGET_ALLOCATE_STACK_SLOTS_FOR_ARGS arm_allocate_stack_slots_for_args
#undef TARGET_ASM_TRAMPOLINE_TEMPLATE
#define TARGET_ASM_TRAMPOLINE_TEMPLATE arm_asm_trampoline_template
#undef TARGET_TRAMPOLINE_INIT
#define TARGET_TRAMPOLINE_INIT arm_trampoline_init
#undef TARGET_TRAMPOLINE_ADJUST_ADDRESS
#define TARGET_TRAMPOLINE_ADJUST_ADDRESS arm_trampoline_adjust_address
#undef TARGET_WARN_FUNC_RETURN
#define TARGET_WARN_FUNC_RETURN arm_warn_func_return
#undef TARGET_DEFAULT_SHORT_ENUMS
#define TARGET_DEFAULT_SHORT_ENUMS arm_default_short_enums
#undef TARGET_ALIGN_ANON_BITFIELD
#define TARGET_ALIGN_ANON_BITFIELD arm_align_anon_bitfield
#undef TARGET_NARROW_VOLATILE_BITFIELD
#define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
#undef TARGET_CXX_GUARD_TYPE
#define TARGET_CXX_GUARD_TYPE arm_cxx_guard_type
#undef TARGET_CXX_GUARD_MASK_BIT
#define TARGET_CXX_GUARD_MASK_BIT arm_cxx_guard_mask_bit
#undef TARGET_CXX_GET_COOKIE_SIZE
#define TARGET_CXX_GET_COOKIE_SIZE arm_get_cookie_size
#undef TARGET_CXX_COOKIE_HAS_SIZE
#define TARGET_CXX_COOKIE_HAS_SIZE arm_cookie_has_size
#undef TARGET_CXX_CDTOR_RETURNS_THIS
#define TARGET_CXX_CDTOR_RETURNS_THIS arm_cxx_cdtor_returns_this
#undef TARGET_CXX_KEY_METHOD_MAY_BE_INLINE
#define TARGET_CXX_KEY_METHOD_MAY_BE_INLINE arm_cxx_key_method_may_be_inline
#undef TARGET_CXX_USE_AEABI_ATEXIT
#define TARGET_CXX_USE_AEABI_ATEXIT arm_cxx_use_aeabi_atexit
#undef TARGET_CXX_DETERMINE_CLASS_DATA_VISIBILITY
#define TARGET_CXX_DETERMINE_CLASS_DATA_VISIBILITY \
arm_cxx_determine_class_data_visibility
#undef TARGET_CXX_CLASS_DATA_ALWAYS_COMDAT
#define TARGET_CXX_CLASS_DATA_ALWAYS_COMDAT arm_cxx_class_data_always_comdat
#undef TARGET_RETURN_IN_MSB
#define TARGET_RETURN_IN_MSB arm_return_in_msb
#undef TARGET_RETURN_IN_MEMORY
#define TARGET_RETURN_IN_MEMORY arm_return_in_memory
#undef TARGET_MUST_PASS_IN_STACK
#define TARGET_MUST_PASS_IN_STACK arm_must_pass_in_stack
#if ARM_UNWIND_INFO
#undef TARGET_ASM_UNWIND_EMIT
#define TARGET_ASM_UNWIND_EMIT arm_unwind_emit
/* EABI unwinding tables use a different format for the typeinfo tables. */
#undef TARGET_ASM_TTYPE
#define TARGET_ASM_TTYPE arm_output_ttype
#undef TARGET_ARM_EABI_UNWINDER
#define TARGET_ARM_EABI_UNWINDER true
#undef TARGET_ASM_EMIT_EXCEPT_PERSONALITY
#define TARGET_ASM_EMIT_EXCEPT_PERSONALITY arm_asm_emit_except_personality
#undef TARGET_ASM_INIT_SECTIONS
#define TARGET_ASM_INIT_SECTIONS arm_asm_init_sections
#endif /* ARM_UNWIND_INFO */
#undef TARGET_DWARF_REGISTER_SPAN
#define TARGET_DWARF_REGISTER_SPAN arm_dwarf_register_span
#undef TARGET_CANNOT_COPY_INSN_P
#define TARGET_CANNOT_COPY_INSN_P arm_cannot_copy_insn_p
#ifdef HAVE_AS_TLS
#undef TARGET_HAVE_TLS
#define TARGET_HAVE_TLS true
#endif
#undef TARGET_HAVE_CONDITIONAL_EXECUTION
#define TARGET_HAVE_CONDITIONAL_EXECUTION arm_have_conditional_execution
#undef TARGET_LEGITIMATE_CONSTANT_P
#define TARGET_LEGITIMATE_CONSTANT_P arm_legitimate_constant_p
#undef TARGET_CANNOT_FORCE_CONST_MEM
#define TARGET_CANNOT_FORCE_CONST_MEM arm_cannot_force_const_mem
#undef TARGET_MAX_ANCHOR_OFFSET
#define TARGET_MAX_ANCHOR_OFFSET 4095
/* The minimum is set such that the total size of the block
for a particular anchor spans offsets -4088 through 4095, i.e.
4088 + 1 + 4095 = 8184 bytes, which is divisible by eight,
ensuring natural spacing of anchors. */
#undef TARGET_MIN_ANCHOR_OFFSET
#define TARGET_MIN_ANCHOR_OFFSET -4088
#undef TARGET_SCHED_ISSUE_RATE
#define TARGET_SCHED_ISSUE_RATE arm_issue_rate
#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
arm_first_cycle_multipass_dfa_lookahead
#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
arm_first_cycle_multipass_dfa_lookahead_guard
#undef TARGET_MANGLE_TYPE
#define TARGET_MANGLE_TYPE arm_mangle_type
#undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
#define TARGET_ATOMIC_ASSIGN_EXPAND_FENV arm_atomic_assign_expand_fenv
#undef TARGET_BUILD_BUILTIN_VA_LIST
#define TARGET_BUILD_BUILTIN_VA_LIST arm_build_builtin_va_list
#undef TARGET_EXPAND_BUILTIN_VA_START
#define TARGET_EXPAND_BUILTIN_VA_START arm_expand_builtin_va_start
#undef TARGET_GIMPLIFY_VA_ARG_EXPR
#define TARGET_GIMPLIFY_VA_ARG_EXPR arm_gimplify_va_arg_expr
#ifdef HAVE_AS_TLS
#undef TARGET_ASM_OUTPUT_DWARF_DTPREL
#define TARGET_ASM_OUTPUT_DWARF_DTPREL arm_output_dwarf_dtprel
#endif
#undef TARGET_LEGITIMATE_ADDRESS_P
#define TARGET_LEGITIMATE_ADDRESS_P arm_legitimate_address_p
#undef TARGET_PREFERRED_RELOAD_CLASS
#define TARGET_PREFERRED_RELOAD_CLASS arm_preferred_reload_class
#undef TARGET_INVALID_PARAMETER_TYPE
#define TARGET_INVALID_PARAMETER_TYPE arm_invalid_parameter_type
#undef TARGET_INVALID_RETURN_TYPE
#define TARGET_INVALID_RETURN_TYPE arm_invalid_return_type
#undef TARGET_PROMOTED_TYPE
#define TARGET_PROMOTED_TYPE arm_promoted_type
#undef TARGET_CONVERT_TO_TYPE
#define TARGET_CONVERT_TO_TYPE arm_convert_to_type
#undef TARGET_SCALAR_MODE_SUPPORTED_P
#define TARGET_SCALAR_MODE_SUPPORTED_P arm_scalar_mode_supported_p
#undef TARGET_FRAME_POINTER_REQUIRED
#define TARGET_FRAME_POINTER_REQUIRED arm_frame_pointer_required
#undef TARGET_CAN_ELIMINATE
#define TARGET_CAN_ELIMINATE arm_can_eliminate
#undef TARGET_CONDITIONAL_REGISTER_USAGE
#define TARGET_CONDITIONAL_REGISTER_USAGE arm_conditional_register_usage
#undef TARGET_CLASS_LIKELY_SPILLED_P
#define TARGET_CLASS_LIKELY_SPILLED_P arm_class_likely_spilled_p
#undef TARGET_VECTORIZE_BUILTINS
#define TARGET_VECTORIZE_BUILTINS
#undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
#define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
arm_builtin_vectorized_function
#undef TARGET_VECTOR_ALIGNMENT
#define TARGET_VECTOR_ALIGNMENT arm_vector_alignment
#undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
#define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
arm_vector_alignment_reachable
#undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
#define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
arm_builtin_support_vector_misalignment
#undef TARGET_PREFERRED_RENAME_CLASS
#define TARGET_PREFERRED_RENAME_CLASS \
arm_preferred_rename_class
#undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
#define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
arm_vectorize_vec_perm_const_ok
#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
arm_builtin_vectorization_cost
#undef TARGET_VECTORIZE_ADD_STMT_COST
#define TARGET_VECTORIZE_ADD_STMT_COST arm_add_stmt_cost
#undef TARGET_CANONICALIZE_COMPARISON
#define TARGET_CANONICALIZE_COMPARISON \
arm_canonicalize_comparison
#undef TARGET_ASAN_SHADOW_OFFSET
#define TARGET_ASAN_SHADOW_OFFSET arm_asan_shadow_offset
#undef MAX_INSN_PER_IT_BLOCK
#define MAX_INSN_PER_IT_BLOCK (arm_restrict_it ? 1 : 4)
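/* Background note on the value above: a Thumb-2 IT block can normally
predicate up to four following instructions, but when -mrestrict-it is in
effect (following the ARMv8 deprecation rules for IT blocks) only a single
instruction per block is emitted, hence the limit of 1. */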
#undef TARGET_CAN_USE_DOLOOP_P
#define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
#undef TARGET_CONST_NOT_OK_FOR_DEBUG_P
#define TARGET_CONST_NOT_OK_FOR_DEBUG_P arm_const_not_ok_for_debug_p
#undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
#define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
#undef TARGET_SCHED_FUSION_PRIORITY
#define TARGET_SCHED_FUSION_PRIORITY arm_sched_fusion_priority
struct gcc_target targetm = TARGET_INITIALIZER;
/* Obstack for minipool constant handling. */
static struct obstack minipool_obstack;
static char * minipool_startobj;
/* The maximum number of insns skipped which
will be conditionalised if possible. */
static int max_insns_skipped = 5;
extern FILE * asm_out_file;
/* True if we are currently building a constant table. */
int making_const_table;
/* The processor for which instructions should be scheduled. */
enum processor_type arm_tune = arm_none;
/* The current tuning set. */
const struct tune_params *current_tune;
/* Which floating point hardware to schedule for. */
int arm_fpu_attr;
/* Which floating point hardware to use. */
const struct arm_fpu_desc *arm_fpu_desc;
/* Used for Thumb call_via trampolines. */
rtx thumb_call_via_label[14];
static int thumb_call_reg_needed;
/* The bits in this mask specify which
instructions we are allowed to generate. */
unsigned long insn_flags = 0;
/* The bits in this mask specify which instruction scheduling options should
be used. */
unsigned long tune_flags = 0;
/* The highest ARM architecture version supported by the
target. */
enum base_architecture arm_base_arch = BASE_ARCH_0;
/* The following are used in the arm.md file as equivalents to bits
in the above two flag variables. */
/* Nonzero if this chip supports the ARM Architecture 3M extensions. */
int arm_arch3m = 0;
/* Nonzero if this chip supports the ARM Architecture 4 extensions. */
int arm_arch4 = 0;
/* Nonzero if this chip supports the ARM Architecture 4t extensions. */
int arm_arch4t = 0;
/* Nonzero if this chip supports the ARM Architecture 5 extensions. */
int arm_arch5 = 0;
/* Nonzero if this chip supports the ARM Architecture 5E extensions. */
int arm_arch5e = 0;
/* Nonzero if this chip supports the ARM Architecture 6 extensions. */
int arm_arch6 = 0;
/* Nonzero if this chip supports the ARM 6K extensions. */
int arm_arch6k = 0;
/* Nonzero if instructions present in ARMv6-M can be used. */
int arm_arch6m = 0;
/* Nonzero if this chip supports the ARM 7 extensions. */
int arm_arch7 = 0;
/* Nonzero if instructions not present in the 'M' profile can be used. */
int arm_arch_notm = 0;
/* Nonzero if instructions present in ARMv7E-M can be used. */
int arm_arch7em = 0;
/* Nonzero if instructions present in ARMv8 can be used. */
int arm_arch8 = 0;
/* Nonzero if this chip can benefit from load scheduling. */
int arm_ld_sched = 0;
/* Nonzero if this chip is a StrongARM. */
int arm_tune_strongarm = 0;
/* Nonzero if this chip supports Intel Wireless MMX technology. */
int arm_arch_iwmmxt = 0;
/* Nonzero if this chip supports Intel Wireless MMX2 technology. */
int arm_arch_iwmmxt2 = 0;
/* Nonzero if this chip is an XScale. */
int arm_arch_xscale = 0;
/* Nonzero if tuning for XScale. */
int arm_tune_xscale = 0;
/* Nonzero if we want to tune for stores that access the write-buffer.
This typically means an ARM6 or ARM7 with MMU or MPU. */
int arm_tune_wbuf = 0;
/* Nonzero if tuning for Cortex-A9. */
int arm_tune_cortex_a9 = 0;
/* Nonzero if generating Thumb instructions. */
int thumb_code = 0;
/* Nonzero if generating Thumb-1 instructions. */
int thumb1_code = 0;
/* Nonzero if we should define __THUMB_INTERWORK__ in the
preprocessor.
XXX This is a bit of a hack; it's intended to help work around
problems in GLD, which doesn't understand that armv5t code is
interworking clean. */
int arm_cpp_interwork = 0;
/* Nonzero if chip supports Thumb 2. */
int arm_arch_thumb2;
/* Nonzero if chip supports the integer division instructions
(in ARM and in Thumb state, respectively). */
int arm_arch_arm_hwdiv;
int arm_arch_thumb_hwdiv;
/* Nonzero if this chip supports the Large Physical Address Extension. */
int arm_arch_lpae;
/* Nonzero if chip disallows volatile memory access in IT block. */
int arm_arch_no_volatile_ce;
/* Nonzero if we should use Neon to handle 64-bit operations rather
than core registers. */
int prefer_neon_for_64bits = 0;
/* Nonzero if we shouldn't use literal pools. */
bool arm_disable_literal_pool = false;
/* In case of a PRE_INC, POST_INC, PRE_DEC, POST_DEC memory reference,
we must report the mode of the memory reference from
TARGET_PRINT_OPERAND to TARGET_PRINT_OPERAND_ADDRESS. */
machine_mode output_memory_reference_mode;
/* The register number to be used for the PIC offset register. */
unsigned arm_pic_register = INVALID_REGNUM;
enum arm_pcs arm_pcs_default;
/* For an explanation of these variables, see final_prescan_insn below. */
int arm_ccfsm_state;
/* arm_current_cc is also used for Thumb-2 cond_exec blocks. */
enum arm_cond_code arm_current_cc;
rtx arm_target_insn;
int arm_target_label;
/* The number of conditionally executed insns, including the current insn. */
int arm_condexec_count = 0;
/* A bitmask specifying the patterns for the IT block.
Zero means do not output an IT block before this insn. */
int arm_condexec_mask = 0;
/* The number of bits used in arm_condexec_mask. */
int arm_condexec_masklen = 0;
/* Nonzero if chip supports the ARMv8 CRC instructions. */
int arm_arch_crc = 0;
/* Nonzero if the core has a very small, high-latency, multiply unit. */
int arm_m_profile_small_mul = 0;
/* The condition codes of the ARM, and the inverse function. */
static const char * const arm_condition_codes[] =
{
"eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
"hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
};
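/* Note on the ordering above: each condition sits next to its logical
inverse (eq/ne, cs/cc, mi/pl, ...), so the "inverse function" mentioned in
the comment reduces to flipping the low bit of the index, which is how the
ARM_INVERSE_CONDITION_CODE macro in arm.h computes it. */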
/* The register numbers in sequence, for passing to arm_gen_load_multiple. */
int arm_regs_in_sequence[] =
{
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
};
#define ARM_LSL_NAME (TARGET_UNIFIED_ASM ? "lsl" : "asl")
#define streq(string1, string2) (strcmp (string1, string2) == 0)
#define THUMB2_WORK_REGS (0xff & ~( (1 << THUMB_HARD_FRAME_POINTER_REGNUM) \
| (1 << SP_REGNUM) | (1 << PC_REGNUM) \
| (1 << PIC_OFFSET_TABLE_REGNUM)))
/* Initialization code. */
struct processors
{
const char *const name;
enum processor_type core;
const char *arch;
enum base_architecture base_arch;
const unsigned long flags;
const struct tune_params *const tune;
};
#define ARM_PREFETCH_NOT_BENEFICIAL 0, -1, -1
#define ARM_PREFETCH_BENEFICIAL(prefetch_slots,l1_size,l1_line_size) \
prefetch_slots, \
l1_size, \
l1_line_size
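/* For example, ARM_PREFETCH_NOT_BENEFICIAL fills the three prefetch fields
of a tuning entry with 0, -1, -1 (prefetching not worthwhile), while the
Cortex-A9 entry below uses ARM_PREFETCH_BENEFICIAL (4,32,32), i.e.
prefetch_slots = 4, l1_size = 32 and l1_line_size = 32. */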
/* arm generic vectorizer costs. */
static const
struct cpu_vec_costs arm_default_vec_cost = {
1, /* scalar_stmt_cost. */
1, /* scalar_load_cost. */
1, /* scalar_store_cost. */
1, /* vec_stmt_cost. */
1, /* vec_to_scalar_cost. */
1, /* scalar_to_vec_cost. */
1, /* vec_align_load_cost. */
1, /* vec_unalign_load_cost. */
1, /* vec_unalign_store_cost. */
1, /* vec_store_cost. */
3, /* cond_taken_branch_cost. */
1, /* cond_not_taken_branch_cost. */
};
/* Cost tables for AArch32 + AArch64 cores should go in aarch-cost-tables.h */
#include "aarch-cost-tables.h"
const struct cpu_cost_table cortexa9_extra_costs =
{
/* ALU */
{
0, /* arith. */
0, /* logical. */
0, /* shift. */
COSTS_N_INSNS (1), /* shift_reg. */
COSTS_N_INSNS (1), /* arith_shift. */
COSTS_N_INSNS (2), /* arith_shift_reg. */
0, /* log_shift. */
COSTS_N_INSNS (1), /* log_shift_reg. */
COSTS_N_INSNS (1), /* extend. */
COSTS_N_INSNS (2), /* extend_arith. */
COSTS_N_INSNS (1), /* bfi. */
COSTS_N_INSNS (1), /* bfx. */
0, /* clz. */
0, /* rev. */
0, /* non_exec. */
true /* non_exec_costs_exec. */
},
{
/* MULT SImode */
{
COSTS_N_INSNS (3), /* simple. */
COSTS_N_INSNS (3), /* flag_setting. */
COSTS_N_INSNS (2), /* extend. */
COSTS_N_INSNS (3), /* add. */
COSTS_N_INSNS (2), /* extend_add. */
COSTS_N_INSNS (30) /* idiv. No HW div on Cortex A9. */
},
/* MULT DImode */
{
0, /* simple (N/A). */
0, /* flag_setting (N/A). */
COSTS_N_INSNS (4), /* extend. */
0, /* add (N/A). */
COSTS_N_INSNS (4), /* extend_add. */
0 /* idiv (N/A). */
}
},
/* LD/ST */
{
COSTS_N_INSNS (2), /* load. */
COSTS_N_INSNS (2), /* load_sign_extend. */
COSTS_N_INSNS (2), /* ldrd. */
COSTS_N_INSNS (2), /* ldm_1st. */
1, /* ldm_regs_per_insn_1st. */
2, /* ldm_regs_per_insn_subsequent. */
COSTS_N_INSNS (5), /* loadf. */
COSTS_N_INSNS (5), /* loadd. */
COSTS_N_INSNS (1), /* load_unaligned. */
COSTS_N_INSNS (2), /* store. */
COSTS_N_INSNS (2), /* strd. */
COSTS_N_INSNS (2), /* stm_1st. */
1, /* stm_regs_per_insn_1st. */
2, /* stm_regs_per_insn_subsequent. */
COSTS_N_INSNS (1), /* storef. */
COSTS_N_INSNS (1), /* stored. */
COSTS_N_INSNS (1) /* store_unaligned. */
},
{
/* FP SFmode */
{
COSTS_N_INSNS (14), /* div. */
COSTS_N_INSNS (4), /* mult. */
COSTS_N_INSNS (7), /* mult_addsub. */
COSTS_N_INSNS (30), /* fma. */
COSTS_N_INSNS (3), /* addsub. */
COSTS_N_INSNS (1), /* fpconst. */
COSTS_N_INSNS (1), /* neg. */
COSTS_N_INSNS (3), /* compare. */
COSTS_N_INSNS (3), /* widen. */
COSTS_N_INSNS (3), /* narrow. */
COSTS_N_INSNS (3), /* toint. */
COSTS_N_INSNS (3), /* fromint. */
COSTS_N_INSNS (3) /* roundint. */
},
/* FP DFmode */
{
COSTS_N_INSNS (24), /* div. */
COSTS_N_INSNS (5), /* mult. */
COSTS_N_INSNS (8), /* mult_addsub. */
COSTS_N_INSNS (30), /* fma. */
COSTS_N_INSNS (3), /* addsub. */
COSTS_N_INSNS (1), /* fpconst. */
COSTS_N_INSNS (1), /* neg. */
COSTS_N_INSNS (3), /* compare. */
COSTS_N_INSNS (3), /* widen. */
COSTS_N_INSNS (3), /* narrow. */
COSTS_N_INSNS (3), /* toint. */
COSTS_N_INSNS (3), /* fromint. */
COSTS_N_INSNS (3) /* roundint. */
}
},
/* Vector */
{
COSTS_N_INSNS (1) /* alu. */
}
};
const struct cpu_cost_table cortexa8_extra_costs =
{
/* ALU */
{
0, /* arith. */
0, /* logical. */
COSTS_N_INSNS (1), /* shift. */
0, /* shift_reg. */
COSTS_N_INSNS (1), /* arith_shift. */
0, /* arith_shift_reg. */
COSTS_N_INSNS (1), /* log_shift. */
0, /* log_shift_reg. */
0, /* extend. */
0, /* extend_arith. */
0, /* bfi. */
0, /* bfx. */
0, /* clz. */
0, /* rev. */
0, /* non_exec. */
true /* non_exec_costs_exec. */
},
{
/* MULT SImode */
{
COSTS_N_INSNS (1), /* simple. */
COSTS_N_INSNS (1), /* flag_setting. */
COSTS_N_INSNS (1), /* extend. */
COSTS_N_INSNS (1), /* add. */
COSTS_N_INSNS (1), /* extend_add. */
COSTS_N_INSNS (30) /* idiv. No HW div on Cortex A8. */
},
/* MULT DImode */
{
0, /* simple (N/A). */
0, /* flag_setting (N/A). */
COSTS_N_INSNS (2), /* extend. */
0, /* add (N/A). */
COSTS_N_INSNS (2), /* extend_add. */
0 /* idiv (N/A). */
}
},
/* LD/ST */
{
COSTS_N_INSNS (1), /* load. */
COSTS_N_INSNS (1), /* load_sign_extend. */
COSTS_N_INSNS (1), /* ldrd. */
COSTS_N_INSNS (1), /* ldm_1st. */
1, /* ldm_regs_per_insn_1st. */
2, /* ldm_regs_per_insn_subsequent. */
COSTS_N_INSNS (1), /* loadf. */
COSTS_N_INSNS (1), /* loadd. */
COSTS_N_INSNS (1), /* load_unaligned. */
COSTS_N_INSNS (1), /* store. */
COSTS_N_INSNS (1), /* strd. */
COSTS_N_INSNS (1), /* stm_1st. */
1, /* stm_regs_per_insn_1st. */
2, /* stm_regs_per_insn_subsequent. */
COSTS_N_INSNS (1), /* storef. */
COSTS_N_INSNS (1), /* stored. */
COSTS_N_INSNS (1) /* store_unaligned. */
},
{
/* FP SFmode */
{
COSTS_N_INSNS (36), /* div. */
COSTS_N_INSNS (11), /* mult. */
COSTS_N_INSNS (20), /* mult_addsub. */
COSTS_N_INSNS (30), /* fma. */
COSTS_N_INSNS (9), /* addsub. */
COSTS_N_INSNS (3), /* fpconst. */
COSTS_N_INSNS (3), /* neg. */
COSTS_N_INSNS (6), /* compare. */
COSTS_N_INSNS (4), /* widen. */
COSTS_N_INSNS (4), /* narrow. */
COSTS_N_INSNS (8), /* toint. */
COSTS_N_INSNS (8), /* fromint. */
COSTS_N_INSNS (8) /* roundint. */
},
/* FP DFmode */
{
COSTS_N_INSNS (64), /* div. */
COSTS_N_INSNS (16), /* mult. */
COSTS_N_INSNS (25), /* mult_addsub. */
COSTS_N_INSNS (30), /* fma. */
COSTS_N_INSNS (9), /* addsub. */
COSTS_N_INSNS (3), /* fpconst. */
COSTS_N_INSNS (3), /* neg. */
COSTS_N_INSNS (6), /* compare. */
COSTS_N_INSNS (6), /* widen. */
COSTS_N_INSNS (6), /* narrow. */
COSTS_N_INSNS (8), /* toint. */
COSTS_N_INSNS (8), /* fromint. */
COSTS_N_INSNS (8) /* roundint. */
}
},
/* Vector */
{
COSTS_N_INSNS (1) /* alu. */
}
};
const struct cpu_cost_table cortexa5_extra_costs =
{
/* ALU */
{
0, /* arith. */
0, /* logical. */
COSTS_N_INSNS (1), /* shift. */
COSTS_N_INSNS (1), /* shift_reg. */
COSTS_N_INSNS (1), /* arith_shift. */
COSTS_N_INSNS (1), /* arith_shift_reg. */
COSTS_N_INSNS (1), /* log_shift. */
COSTS_N_INSNS (1), /* log_shift_reg. */
COSTS_N_INSNS (1), /* extend. */
COSTS_N_INSNS (1), /* extend_arith. */
COSTS_N_INSNS (1), /* bfi. */
COSTS_N_INSNS (1), /* bfx. */
COSTS_N_INSNS (1), /* clz. */
COSTS_N_INSNS (1), /* rev. */
0, /* non_exec. */
true /* non_exec_costs_exec. */
},
{
/* MULT SImode */
{
0, /* simple. */
COSTS_N_INSNS (1), /* flag_setting. */
COSTS_N_INSNS (1), /* extend. */
COSTS_N_INSNS (1), /* add. */
COSTS_N_INSNS (1), /* extend_add. */
COSTS_N_INSNS (7) /* idiv. */
},
/* MULT DImode */
{
0, /* simple (N/A). */
0, /* flag_setting (N/A). */
COSTS_N_INSNS (1), /* extend. */
0, /* add. */
COSTS_N_INSNS (2), /* extend_add. */
0 /* idiv (N/A). */
}
},
/* LD/ST */
{
COSTS_N_INSNS (1), /* load. */
COSTS_N_INSNS (1), /* load_sign_extend. */
COSTS_N_INSNS (6), /* ldrd. */
COSTS_N_INSNS (1), /* ldm_1st. */
1, /* ldm_regs_per_insn_1st. */
2, /* ldm_regs_per_insn_subsequent. */
COSTS_N_INSNS (2), /* loadf. */
COSTS_N_INSNS (4), /* loadd. */
COSTS_N_INSNS (1), /* load_unaligned. */
COSTS_N_INSNS (1), /* store. */
COSTS_N_INSNS (3), /* strd. */
COSTS_N_INSNS (1), /* stm_1st. */
1, /* stm_regs_per_insn_1st. */
2, /* stm_regs_per_insn_subsequent. */
COSTS_N_INSNS (2), /* storef. */
COSTS_N_INSNS (2), /* stored. */
COSTS_N_INSNS (1) /* store_unaligned. */
},
{
/* FP SFmode */
{
COSTS_N_INSNS (15), /* div. */
COSTS_N_INSNS (3), /* mult. */
COSTS_N_INSNS (7), /* mult_addsub. */
COSTS_N_INSNS (7), /* fma. */
COSTS_N_INSNS (3), /* addsub. */
COSTS_N_INSNS (3), /* fpconst. */
COSTS_N_INSNS (3), /* neg. */
COSTS_N_INSNS (3), /* compare. */
COSTS_N_INSNS (3), /* widen. */
COSTS_N_INSNS (3), /* narrow. */
COSTS_N_INSNS (3), /* toint. */
COSTS_N_INSNS (3), /* fromint. */
COSTS_N_INSNS (3) /* roundint. */
},
/* FP DFmode */
{
COSTS_N_INSNS (30), /* div. */
COSTS_N_INSNS (6), /* mult. */
COSTS_N_INSNS (10), /* mult_addsub. */
COSTS_N_INSNS (7), /* fma. */
COSTS_N_INSNS (3), /* addsub. */
COSTS_N_INSNS (3), /* fpconst. */
COSTS_N_INSNS (3), /* neg. */
COSTS_N_INSNS (3), /* compare. */
COSTS_N_INSNS (3), /* widen. */
COSTS_N_INSNS (3), /* narrow. */
COSTS_N_INSNS (3), /* toint. */
COSTS_N_INSNS (3), /* fromint. */
COSTS_N_INSNS (3) /* roundint. */
}
},
/* Vector */
{
COSTS_N_INSNS (1) /* alu. */
}
};
const struct cpu_cost_table cortexa7_extra_costs =
{
/* ALU */
{
0, /* arith. */
0, /* logical. */
COSTS_N_INSNS (1), /* shift. */
COSTS_N_INSNS (1), /* shift_reg. */
COSTS_N_INSNS (1), /* arith_shift. */
COSTS_N_INSNS (1), /* arith_shift_reg. */
COSTS_N_INSNS (1), /* log_shift. */
COSTS_N_INSNS (1), /* log_shift_reg. */
COSTS_N_INSNS (1), /* extend. */
COSTS_N_INSNS (1), /* extend_arith. */
COSTS_N_INSNS (1), /* bfi. */
COSTS_N_INSNS (1), /* bfx. */
COSTS_N_INSNS (1), /* clz. */
COSTS_N_INSNS (1), /* rev. */
0, /* non_exec. */
true /* non_exec_costs_exec. */
},
{
/* MULT SImode */
{
0, /* simple. */
COSTS_N_INSNS (1), /* flag_setting. */
COSTS_N_INSNS (1), /* extend. */
COSTS_N_INSNS (1), /* add. */
COSTS_N_INSNS (1), /* extend_add. */
COSTS_N_INSNS (7) /* idiv. */
},
/* MULT DImode */
{
0, /* simple (N/A). */
0, /* flag_setting (N/A). */
COSTS_N_INSNS (1), /* extend. */
0, /* add. */
COSTS_N_INSNS (2), /* extend_add. */
0 /* idiv (N/A). */
}
},
/* LD/ST */
{
COSTS_N_INSNS (1), /* load. */
COSTS_N_INSNS (1), /* load_sign_extend. */
COSTS_N_INSNS (3), /* ldrd. */
COSTS_N_INSNS (1), /* ldm_1st. */
1, /* ldm_regs_per_insn_1st. */
2, /* ldm_regs_per_insn_subsequent. */
COSTS_N_INSNS (2), /* loadf. */
COSTS_N_INSNS (2), /* loadd. */
COSTS_N_INSNS (1), /* load_unaligned. */
COSTS_N_INSNS (1), /* store. */
COSTS_N_INSNS (3), /* strd. */
COSTS_N_INSNS (1), /* stm_1st. */
1, /* stm_regs_per_insn_1st. */
2, /* stm_regs_per_insn_subsequent. */
COSTS_N_INSNS (2), /* storef. */
COSTS_N_INSNS (2), /* stored. */
COSTS_N_INSNS (1) /* store_unaligned. */
},
{
/* FP SFmode */
{
COSTS_N_INSNS (15), /* div. */
COSTS_N_INSNS (3), /* mult. */
COSTS_N_INSNS (7), /* mult_addsub. */
COSTS_N_INSNS (7), /* fma. */
COSTS_N_INSNS (3), /* addsub. */
COSTS_N_INSNS (3), /* fpconst. */
COSTS_N_INSNS (3), /* neg. */
COSTS_N_INSNS (3), /* compare. */
COSTS_N_INSNS (3), /* widen. */
COSTS_N_INSNS (3), /* narrow. */
COSTS_N_INSNS (3), /* toint. */
COSTS_N_INSNS (3), /* fromint. */
COSTS_N_INSNS (3) /* roundint. */
},
/* FP DFmode */
{
COSTS_N_INSNS (30), /* div. */
COSTS_N_INSNS (6), /* mult. */
COSTS_N_INSNS (10), /* mult_addsub. */
COSTS_N_INSNS (7), /* fma. */
COSTS_N_INSNS (3), /* addsub. */
COSTS_N_INSNS (3), /* fpconst. */
COSTS_N_INSNS (3), /* neg. */
COSTS_N_INSNS (3), /* compare. */
COSTS_N_INSNS (3), /* widen. */
COSTS_N_INSNS (3), /* narrow. */
COSTS_N_INSNS (3), /* toint. */
COSTS_N_INSNS (3), /* fromint. */
COSTS_N_INSNS (3) /* roundint. */
}
},
/* Vector */
{
COSTS_N_INSNS (1) /* alu. */
}
};
const struct cpu_cost_table cortexa12_extra_costs =
{
/* ALU */
{
0, /* arith. */
0, /* logical. */
0, /* shift. */
COSTS_N_INSNS (1), /* shift_reg. */
COSTS_N_INSNS (1), /* arith_shift. */
COSTS_N_INSNS (1), /* arith_shift_reg. */
COSTS_N_INSNS (1), /* log_shift. */
COSTS_N_INSNS (1), /* log_shift_reg. */
0, /* extend. */
COSTS_N_INSNS (1), /* extend_arith. */
0, /* bfi. */
COSTS_N_INSNS (1), /* bfx. */
COSTS_N_INSNS (1), /* clz. */
COSTS_N_INSNS (1), /* rev. */
0, /* non_exec. */
true /* non_exec_costs_exec. */
},
/* MULT SImode */
{
{
COSTS_N_INSNS (2), /* simple. */
COSTS_N_INSNS (3), /* flag_setting. */
COSTS_N_INSNS (2), /* extend. */
COSTS_N_INSNS (3), /* add. */
COSTS_N_INSNS (2), /* extend_add. */
COSTS_N_INSNS (18) /* idiv. */
},
/* MULT DImode */
{
0, /* simple (N/A). */
0, /* flag_setting (N/A). */
COSTS_N_INSNS (3), /* extend. */
0, /* add (N/A). */
COSTS_N_INSNS (3), /* extend_add. */
0 /* idiv (N/A). */
}
},
/* LD/ST */
{
COSTS_N_INSNS (3), /* load. */
COSTS_N_INSNS (3), /* load_sign_extend. */
COSTS_N_INSNS (3), /* ldrd. */
COSTS_N_INSNS (3), /* ldm_1st. */
1, /* ldm_regs_per_insn_1st. */
2, /* ldm_regs_per_insn_subsequent. */
COSTS_N_INSNS (3), /* loadf. */
COSTS_N_INSNS (3), /* loadd. */
0, /* load_unaligned. */
0, /* store. */
0, /* strd. */
0, /* stm_1st. */
1, /* stm_regs_per_insn_1st. */
2, /* stm_regs_per_insn_subsequent. */
COSTS_N_INSNS (2), /* storef. */
COSTS_N_INSNS (2), /* stored. */
0 /* store_unaligned. */
},
{
/* FP SFmode */
{
COSTS_N_INSNS (17), /* div. */
COSTS_N_INSNS (4), /* mult. */
COSTS_N_INSNS (8), /* mult_addsub. */
COSTS_N_INSNS (8), /* fma. */
COSTS_N_INSNS (4), /* addsub. */
COSTS_N_INSNS (2), /* fpconst. */
COSTS_N_INSNS (2), /* neg. */
COSTS_N_INSNS (2), /* compare. */
COSTS_N_INSNS (4), /* widen. */
COSTS_N_INSNS (4), /* narrow. */
COSTS_N_INSNS (4), /* toint. */
COSTS_N_INSNS (4), /* fromint. */
COSTS_N_INSNS (4) /* roundint. */
},
/* FP DFmode */
{
COSTS_N_INSNS (31), /* div. */
COSTS_N_INSNS (4), /* mult. */
COSTS_N_INSNS (8), /* mult_addsub. */
COSTS_N_INSNS (8), /* fma. */
COSTS_N_INSNS (4), /* addsub. */
COSTS_N_INSNS (2), /* fpconst. */
COSTS_N_INSNS (2), /* neg. */
COSTS_N_INSNS (2), /* compare. */
COSTS_N_INSNS (4), /* widen. */
COSTS_N_INSNS (4), /* narrow. */
COSTS_N_INSNS (4), /* toint. */
COSTS_N_INSNS (4), /* fromint. */
COSTS_N_INSNS (4) /* roundint. */
}
},
/* Vector */
{
COSTS_N_INSNS (1) /* alu. */
}
};
const struct cpu_cost_table cortexa15_extra_costs =
{
/* ALU */
{
0, /* arith. */
0, /* logical. */
0, /* shift. */
0, /* shift_reg. */
COSTS_N_INSNS (1), /* arith_shift. */
COSTS_N_INSNS (1), /* arith_shift_reg. */
COSTS_N_INSNS (1), /* log_shift. */
COSTS_N_INSNS (1), /* log_shift_reg. */
0, /* extend. */
COSTS_N_INSNS (1), /* extend_arith. */
COSTS_N_INSNS (1), /* bfi. */
0, /* bfx. */
0, /* clz. */
0, /* rev. */
0, /* non_exec. */
true /* non_exec_costs_exec. */
},
/* MULT SImode */
{
{
COSTS_N_INSNS (2), /* simple. */
COSTS_N_INSNS (3), /* flag_setting. */
COSTS_N_INSNS (2), /* extend. */
COSTS_N_INSNS (2), /* add. */
COSTS_N_INSNS (2), /* extend_add. */
COSTS_N_INSNS (18) /* idiv. */
},
/* MULT DImode */
{
0, /* simple (N/A). */
0, /* flag_setting (N/A). */
COSTS_N_INSNS (3), /* extend. */
0, /* add (N/A). */
COSTS_N_INSNS (3), /* extend_add. */
0 /* idiv (N/A). */
}
},
/* LD/ST */
{
COSTS_N_INSNS (3), /* load. */
COSTS_N_INSNS (3), /* load_sign_extend. */
COSTS_N_INSNS (3), /* ldrd. */
COSTS_N_INSNS (4), /* ldm_1st. */
1, /* ldm_regs_per_insn_1st. */
2, /* ldm_regs_per_insn_subsequent. */
COSTS_N_INSNS (4), /* loadf. */
COSTS_N_INSNS (4), /* loadd. */
0, /* load_unaligned. */
0, /* store. */
0, /* strd. */
COSTS_N_INSNS (1), /* stm_1st. */
1, /* stm_regs_per_insn_1st. */
2, /* stm_regs_per_insn_subsequent. */
0, /* storef. */
0, /* stored. */
0 /* store_unaligned. */
},
{
/* FP SFmode */
{
COSTS_N_INSNS (17), /* div. */
COSTS_N_INSNS (4), /* mult. */
COSTS_N_INSNS (8), /* mult_addsub. */
COSTS_N_INSNS (8), /* fma. */
COSTS_N_INSNS (4), /* addsub. */
COSTS_N_INSNS (2), /* fpconst. */
COSTS_N_INSNS (2), /* neg. */
COSTS_N_INSNS (5), /* compare. */
COSTS_N_INSNS (4), /* widen. */
COSTS_N_INSNS (4), /* narrow. */
COSTS_N_INSNS (4), /* toint. */
COSTS_N_INSNS (4), /* fromint. */
COSTS_N_INSNS (4) /* roundint. */
},
/* FP DFmode */
{
COSTS_N_INSNS (31), /* div. */
COSTS_N_INSNS (4), /* mult. */
COSTS_N_INSNS (8), /* mult_addsub. */
COSTS_N_INSNS (8), /* fma. */
COSTS_N_INSNS (4), /* addsub. */
COSTS_N_INSNS (2), /* fpconst. */
COSTS_N_INSNS (2), /* neg. */
COSTS_N_INSNS (2), /* compare. */
COSTS_N_INSNS (4), /* widen. */
COSTS_N_INSNS (4), /* narrow. */
COSTS_N_INSNS (4), /* toint. */
COSTS_N_INSNS (4), /* fromint. */
COSTS_N_INSNS (4) /* roundint. */
}
},
/* Vector */
{
COSTS_N_INSNS (1) /* alu. */
}
};
const struct cpu_cost_table v7m_extra_costs =
{
/* ALU */
{
0, /* arith. */
0, /* logical. */
0, /* shift. */
0, /* shift_reg. */
0, /* arith_shift. */
COSTS_N_INSNS (1), /* arith_shift_reg. */
0, /* log_shift. */
COSTS_N_INSNS (1), /* log_shift_reg. */
0, /* extend. */
COSTS_N_INSNS (1), /* extend_arith. */
0, /* bfi. */
0, /* bfx. */
0, /* clz. */
0, /* rev. */
COSTS_N_INSNS (1), /* non_exec. */
false /* non_exec_costs_exec. */
},
{
/* MULT SImode */
{
COSTS_N_INSNS (1), /* simple. */
COSTS_N_INSNS (1), /* flag_setting. */
COSTS_N_INSNS (2), /* extend. */
COSTS_N_INSNS (1), /* add. */
COSTS_N_INSNS (3), /* extend_add. */
COSTS_N_INSNS (8) /* idiv. */
},
/* MULT DImode */
{
0, /* simple (N/A). */
0, /* flag_setting (N/A). */
COSTS_N_INSNS (2), /* extend. */
0, /* add (N/A). */
COSTS_N_INSNS (3), /* extend_add. */
0 /* idiv (N/A). */
}
},
/* LD/ST */
{
COSTS_N_INSNS (2), /* load. */
0, /* load_sign_extend. */
COSTS_N_INSNS (3), /* ldrd. */
COSTS_N_INSNS (2), /* ldm_1st. */
1, /* ldm_regs_per_insn_1st. */
1, /* ldm_regs_per_insn_subsequent. */
COSTS_N_INSNS (2), /* loadf. */
COSTS_N_INSNS (3), /* loadd. */
COSTS_N_INSNS (1), /* load_unaligned. */
COSTS_N_INSNS (2), /* store. */
COSTS_N_INSNS (3), /* strd. */
COSTS_N_INSNS (2), /* stm_1st. */
1, /* stm_regs_per_insn_1st. */
1, /* stm_regs_per_insn_subsequent. */
COSTS_N_INSNS (2), /* storef. */
COSTS_N_INSNS (3), /* stored. */
COSTS_N_INSNS (1) /* store_unaligned. */
},
{
/* FP SFmode */
{
COSTS_N_INSNS (7), /* div. */
COSTS_N_INSNS (2), /* mult. */
COSTS_N_INSNS (5), /* mult_addsub. */
COSTS_N_INSNS (3), /* fma. */
COSTS_N_INSNS (1), /* addsub. */
0, /* fpconst. */
0, /* neg. */
0, /* compare. */
0, /* widen. */
0, /* narrow. */
0, /* toint. */
0, /* fromint. */
0 /* roundint. */
},
/* FP DFmode */
{
COSTS_N_INSNS (15), /* div. */
COSTS_N_INSNS (5), /* mult. */
COSTS_N_INSNS (7), /* mult_addsub. */
COSTS_N_INSNS (7), /* fma. */
COSTS_N_INSNS (3), /* addsub. */
0, /* fpconst. */
0, /* neg. */
0, /* compare. */
0, /* widen. */
0, /* narrow. */
0, /* toint. */
0, /* fromint. */
0 /* roundint. */
}
},
/* Vector */
{
COSTS_N_INSNS (1) /* alu. */
}
};
#define ARM_FUSE_NOTHING (0)
#define ARM_FUSE_MOVW_MOVT (1 << 0)
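/* These bits describe instruction pairs that a core can macro-fuse. They
are passed in the "Fuseable pairs of instructions" slot of the tune_params
entries below (the Cortex-A53 and Cortex-A57 tables use ARM_FUSE_MOVW_MOVT)
and are consulted by aarch_macro_fusion_pair_p, the
TARGET_SCHED_MACRO_FUSION_PAIR_P hook, when deciding whether a movw/movt
pair should be kept adjacent by the scheduler. */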
const struct tune_params arm_slowmul_tune =
{
arm_slowmul_rtx_costs,
NULL,
NULL, /* Sched adj cost. */
3, /* Constant limit. */
5, /* Max cond insns. */
ARM_PREFETCH_NOT_BENEFICIAL,
true, /* Prefer constant pool. */
arm_default_branch_cost,
false, /* Prefer LDRD/STRD. */
{true, true}, /* Prefer non short circuit. */
&arm_default_vec_cost, /* Vectorizer costs. */
false, /* Prefer Neon for 64-bits bitops. */
false, false, /* Prefer 32-bit encodings. */
false, /* Prefer Neon for stringops. */
8, /* Maximum insns to inline memset. */
ARM_FUSE_NOTHING, /* Fuseable pairs of instructions. */
ARM_SCHED_AUTOPREF_OFF /* Sched L2 autopref. */
};
const struct tune_params arm_fastmul_tune =
{
arm_fastmul_rtx_costs,
NULL,
NULL, /* Sched adj cost. */
1, /* Constant limit. */
5, /* Max cond insns. */
ARM_PREFETCH_NOT_BENEFICIAL,
true, /* Prefer constant pool. */
arm_default_branch_cost,
false, /* Prefer LDRD/STRD. */
{true, true}, /* Prefer non short circuit. */
&arm_default_vec_cost, /* Vectorizer costs. */
false, /* Prefer Neon for 64-bits bitops. */
false, false, /* Prefer 32-bit encodings. */
false, /* Prefer Neon for stringops. */
8, /* Maximum insns to inline memset. */
ARM_FUSE_NOTHING, /* Fuseable pairs of instructions. */
ARM_SCHED_AUTOPREF_OFF /* Sched L2 autopref. */
};
/* StrongARM has early execution of branches, so a sequence that is worth
skipping is shorter. Set max_insns_skipped to a lower value. */
const struct tune_params arm_strongarm_tune =
{
arm_fastmul_rtx_costs,
NULL,
NULL, /* Sched adj cost. */
1, /* Constant limit. */
3, /* Max cond insns. */
ARM_PREFETCH_NOT_BENEFICIAL,
true, /* Prefer constant pool. */
arm_default_branch_cost,
false, /* Prefer LDRD/STRD. */
{true, true}, /* Prefer non short circuit. */
&arm_default_vec_cost, /* Vectorizer costs. */
false, /* Prefer Neon for 64-bits bitops. */
false, false, /* Prefer 32-bit encodings. */
false, /* Prefer Neon for stringops. */
8, /* Maximum insns to inline memset. */
ARM_FUSE_NOTHING, /* Fuseable pairs of instructions. */
ARM_SCHED_AUTOPREF_OFF /* Sched L2 autopref. */
};
const struct tune_params arm_xscale_tune =
{
arm_xscale_rtx_costs,
NULL,
xscale_sched_adjust_cost,
2, /* Constant limit. */
3, /* Max cond insns. */
ARM_PREFETCH_NOT_BENEFICIAL,
true, /* Prefer constant pool. */
arm_default_branch_cost,
false, /* Prefer LDRD/STRD. */
{true, true}, /* Prefer non short circuit. */
&arm_default_vec_cost, /* Vectorizer costs. */
false, /* Prefer Neon for 64-bits bitops. */
false, false, /* Prefer 32-bit encodings. */
false, /* Prefer Neon for stringops. */
8, /* Maximum insns to inline memset. */
ARM_FUSE_NOTHING, /* Fuseable pairs of instructions. */
ARM_SCHED_AUTOPREF_OFF /* Sched L2 autopref. */
};
const struct tune_params arm_9e_tune =
{
arm_9e_rtx_costs,
NULL,
NULL, /* Sched adj cost. */
1, /* Constant limit. */
5, /* Max cond insns. */
ARM_PREFETCH_NOT_BENEFICIAL,
true, /* Prefer constant pool. */
arm_default_branch_cost,
false, /* Prefer LDRD/STRD. */
{true, true}, /* Prefer non short circuit. */
&arm_default_vec_cost, /* Vectorizer costs. */
false, /* Prefer Neon for 64-bits bitops. */
false, false, /* Prefer 32-bit encodings. */
false, /* Prefer Neon for stringops. */
8, /* Maximum insns to inline memset. */
ARM_FUSE_NOTHING, /* Fuseable pairs of instructions. */
ARM_SCHED_AUTOPREF_OFF /* Sched L2 autopref. */
};
const struct tune_params arm_v6t2_tune =
{
arm_9e_rtx_costs,
NULL,
NULL, /* Sched adj cost. */
1, /* Constant limit. */
5, /* Max cond insns. */
ARM_PREFETCH_NOT_BENEFICIAL,
false, /* Prefer constant pool. */
arm_default_branch_cost,
false, /* Prefer LDRD/STRD. */
{true, true}, /* Prefer non short circuit. */
&arm_default_vec_cost, /* Vectorizer costs. */
false, /* Prefer Neon for 64-bits bitops. */
false, false, /* Prefer 32-bit encodings. */
false, /* Prefer Neon for stringops. */
8, /* Maximum insns to inline memset. */
ARM_FUSE_NOTHING, /* Fuseable pairs of instructions. */
ARM_SCHED_AUTOPREF_OFF /* Sched L2 autopref. */
};
/* Generic Cortex tuning. Use more specific tunings if appropriate. */
const struct tune_params arm_cortex_tune =
{
arm_9e_rtx_costs,
&generic_extra_costs,
NULL, /* Sched adj cost. */
1, /* Constant limit. */
5, /* Max cond insns. */
ARM_PREFETCH_NOT_BENEFICIAL,
false, /* Prefer constant pool. */
arm_default_branch_cost,
false, /* Prefer LDRD/STRD. */
{true, true}, /* Prefer non short circuit. */
&arm_default_vec_cost, /* Vectorizer costs. */
false, /* Prefer Neon for 64-bits bitops. */
false, false, /* Prefer 32-bit encodings. */
false, /* Prefer Neon for stringops. */
8, /* Maximum insns to inline memset. */
ARM_FUSE_NOTHING, /* Fuseable pairs of instructions. */
ARM_SCHED_AUTOPREF_OFF /* Sched L2 autopref. */
};
const struct tune_params arm_cortex_a8_tune =
{
arm_9e_rtx_costs,
&cortexa8_extra_costs,
NULL, /* Sched adj cost. */
1, /* Constant limit. */
5, /* Max cond insns. */
ARM_PREFETCH_NOT_BENEFICIAL,
false, /* Prefer constant pool. */
arm_default_branch_cost,
false, /* Prefer LDRD/STRD. */
{true, true}, /* Prefer non short circuit. */
&arm_default_vec_cost, /* Vectorizer costs. */
false, /* Prefer Neon for 64-bits bitops. */
false, false, /* Prefer 32-bit encodings. */
true, /* Prefer Neon for stringops. */
8, /* Maximum insns to inline memset. */
ARM_FUSE_NOTHING, /* Fuseable pairs of instructions. */
ARM_SCHED_AUTOPREF_OFF /* Sched L2 autopref. */
};
const struct tune_params arm_cortex_a7_tune =
{
arm_9e_rtx_costs,
&cortexa7_extra_costs,
NULL,
1, /* Constant limit. */
5, /* Max cond insns. */
ARM_PREFETCH_NOT_BENEFICIAL,
false, /* Prefer constant pool. */
arm_default_branch_cost,
false, /* Prefer LDRD/STRD. */
{true, true}, /* Prefer non short circuit. */
&arm_default_vec_cost, /* Vectorizer costs. */
false, /* Prefer Neon for 64-bits bitops. */
false, false, /* Prefer 32-bit encodings. */
true, /* Prefer Neon for stringops. */
8, /* Maximum insns to inline memset. */
ARM_FUSE_NOTHING, /* Fuseable pairs of instructions. */
ARM_SCHED_AUTOPREF_OFF /* Sched L2 autopref. */
};
const struct tune_params arm_cortex_a15_tune =
{
arm_9e_rtx_costs,
&cortexa15_extra_costs,
NULL, /* Sched adj cost. */
1, /* Constant limit. */
2, /* Max cond insns. */
ARM_PREFETCH_NOT_BENEFICIAL,
false, /* Prefer constant pool. */
arm_default_branch_cost,
true, /* Prefer LDRD/STRD. */
{true, true}, /* Prefer non short circuit. */
&arm_default_vec_cost, /* Vectorizer costs. */
false, /* Prefer Neon for 64-bits bitops. */
true, true, /* Prefer 32-bit encodings. */
true, /* Prefer Neon for stringops. */
8, /* Maximum insns to inline memset. */
ARM_FUSE_NOTHING, /* Fuseable pairs of instructions. */
ARM_SCHED_AUTOPREF_FULL /* Sched L2 autopref. */
};
const struct tune_params arm_cortex_a53_tune =
{
arm_9e_rtx_costs,
&cortexa53_extra_costs,
NULL, /* Scheduler cost adjustment. */
1, /* Constant limit. */
5, /* Max cond insns. */
ARM_PREFETCH_NOT_BENEFICIAL,
false, /* Prefer constant pool. */
arm_default_branch_cost,
false, /* Prefer LDRD/STRD. */
{true, true}, /* Prefer non short circuit. */
&arm_default_vec_cost, /* Vectorizer costs. */
false, /* Prefer Neon for 64-bits bitops. */
false, false, /* Prefer 32-bit encodings. */
true, /* Prefer Neon for stringops. */
8, /* Maximum insns to inline memset. */
ARM_FUSE_MOVW_MOVT, /* Fuseable pairs of instructions. */
ARM_SCHED_AUTOPREF_OFF /* Sched L2 autopref. */
};
const struct tune_params arm_cortex_a57_tune =
{
arm_9e_rtx_costs,
&cortexa57_extra_costs,
NULL, /* Scheduler cost adjustment. */
1, /* Constant limit. */
2, /* Max cond insns. */
ARM_PREFETCH_NOT_BENEFICIAL,
false, /* Prefer constant pool. */
arm_default_branch_cost,
true, /* Prefer LDRD/STRD. */
{true, true}, /* Prefer non short circuit. */
&arm_default_vec_cost, /* Vectorizer costs. */
false, /* Prefer Neon for 64-bits bitops. */
true, true, /* Prefer 32-bit encodings. */
true, /* Prefer Neon for stringops. */
8, /* Maximum insns to inline memset. */
ARM_FUSE_MOVW_MOVT, /* Fuseable pairs of instructions. */
ARM_SCHED_AUTOPREF_FULL /* Sched L2 autopref. */
};
const struct tune_params arm_xgene1_tune =
{
arm_9e_rtx_costs,
&xgene1_extra_costs,
NULL, /* Scheduler cost adjustment. */
1, /* Constant limit. */
2, /* Max cond insns. */
ARM_PREFETCH_NOT_BENEFICIAL,
false, /* Prefer constant pool. */
arm_default_branch_cost,
true, /* Prefer LDRD/STRD. */
{true, true}, /* Prefer non short circuit. */
&arm_default_vec_cost, /* Vectorizer costs. */
false, /* Prefer Neon for 64-bits bitops. */
true, true, /* Prefer 32-bit encodings. */
false, /* Prefer Neon for stringops. */
32, /* Maximum insns to inline memset. */
ARM_FUSE_NOTHING, /* Fuseable pairs of instructions. */
ARM_SCHED_AUTOPREF_OFF /* Sched L2 autopref. */
};
/* Branches can be dual-issued on Cortex-A5, so conditional execution is
less appealing. Set max_insns_skipped to a low value. */
const struct tune_params arm_cortex_a5_tune =
{
arm_9e_rtx_costs,
&cortexa5_extra_costs,
NULL, /* Sched adj cost. */
1, /* Constant limit. */
1, /* Max cond insns. */
ARM_PREFETCH_NOT_BENEFICIAL,
false, /* Prefer constant pool. */
arm_cortex_a5_branch_cost,
false, /* Prefer LDRD/STRD. */
{false, false}, /* Prefer non short circuit. */
&arm_default_vec_cost, /* Vectorizer costs. */
false, /* Prefer Neon for 64-bits bitops. */
false, false, /* Prefer 32-bit encodings. */
true, /* Prefer Neon for stringops. */
8, /* Maximum insns to inline memset. */
ARM_FUSE_NOTHING, /* Fuseable pairs of instructions. */
ARM_SCHED_AUTOPREF_OFF /* Sched L2 autopref. */
};
const struct tune_params arm_cortex_a9_tune =
{
arm_9e_rtx_costs,
&cortexa9_extra_costs,
cortex_a9_sched_adjust_cost,
1, /* Constant limit. */
5, /* Max cond insns. */
ARM_PREFETCH_BENEFICIAL(4,32,32),
false, /* Prefer constant pool. */
arm_default_branch_cost,
false, /* Prefer LDRD/STRD. */
{true, true}, /* Prefer non short circuit. */
&arm_default_vec_cost, /* Vectorizer costs. */
false, /* Prefer Neon for 64-bits bitops. */
false, false, /* Prefer 32-bit encodings. */
false, /* Prefer Neon for stringops. */
8, /* Maximum insns to inline memset. */
ARM_FUSE_NOTHING, /* Fuseable pairs of instructions. */
ARM_SCHED_AUTOPREF_OFF /* Sched L2 autopref. */
};
const struct tune_params arm_cortex_a12_tune =
{
arm_9e_rtx_costs,
&cortexa12_extra_costs,
NULL, /* Sched adj cost. */
1, /* Constant limit. */
2, /* Max cond insns. */
ARM_PREFETCH_NOT_BENEFICIAL,
false, /* Prefer constant pool. */
arm_default_branch_cost,
true, /* Prefer LDRD/STRD. */
{true, true}, /* Prefer non short circuit. */
&arm_default_vec_cost, /* Vectorizer costs. */
false, /* Prefer Neon for 64-bits bitops. */
true, true, /* Prefer 32-bit encodings. */
true, /* Prefer Neon for stringops. */
8, /* Maximum insns to inline memset. */
ARM_FUSE_MOVW_MOVT, /* Fuseable pairs of instructions. */
ARM_SCHED_AUTOPREF_OFF /* Sched L2 autopref. */
};
/* armv7m tuning.  On Cortex-M4 cores for example, MOVW/MOVT each take a
single cycle to execute, so building a constant that way costs two cycles.
An LDR from the constant pool also takes two cycles to execute, but mildly
increases pipelining opportunity (consecutive loads/stores can be pipelined
together, saving one cycle), and may also improve icache utilisation.
Hence we prefer the constant pool for such processors.  */
const struct tune_params arm_v7m_tune =
{
arm_9e_rtx_costs,
&v7m_extra_costs,
NULL, /* Sched adj cost. */
1, /* Constant limit. */
2, /* Max cond insns. */
ARM_PREFETCH_NOT_BENEFICIAL,
true, /* Prefer constant pool. */
arm_cortex_m_branch_cost,
false, /* Prefer LDRD/STRD. */
{false, false}, /* Prefer non short circuit. */
&arm_default_vec_cost, /* Vectorizer costs. */
false, /* Prefer Neon for 64-bits bitops. */
false, false, /* Prefer 32-bit encodings. */
false, /* Prefer Neon for stringops. */
8, /* Maximum insns to inline memset. */
ARM_FUSE_NOTHING, /* Fuseable pairs of instructions. */
ARM_SCHED_AUTOPREF_OFF /* Sched L2 autopref. */
};
/* Cortex-M7 tuning. */
const struct tune_params arm_cortex_m7_tune =
{
arm_9e_rtx_costs,
&v7m_extra_costs,
NULL, /* Sched adj cost. */
0, /* Constant limit. */
1, /* Max cond insns. */
ARM_PREFETCH_NOT_BENEFICIAL,
true, /* Prefer constant pool. */
arm_cortex_m7_branch_cost,
false, /* Prefer LDRD/STRD. */
{true, true}, /* Prefer non short circuit. */
&arm_default_vec_cost, /* Vectorizer costs. */
false, /* Prefer Neon for 64-bits bitops. */
false, false, /* Prefer 32-bit encodings. */
false, /* Prefer Neon for stringops. */
8, /* Maximum insns to inline memset. */
ARM_FUSE_NOTHING, /* Fuseable pairs of instructions. */
ARM_SCHED_AUTOPREF_OFF /* Sched L2 autopref. */
};
/* The arm_v6m_tune is duplicated from arm_cortex_tune, rather than
arm_v6t2_tune. It is used for cortex-m0, cortex-m1 and cortex-m0plus. */
const struct tune_params arm_v6m_tune =
{
arm_9e_rtx_costs,
NULL,
NULL, /* Sched adj cost. */
1, /* Constant limit. */
5, /* Max cond insns. */
ARM_PREFETCH_NOT_BENEFICIAL,
false, /* Prefer constant pool. */
arm_default_branch_cost,
false, /* Prefer LDRD/STRD. */
{false, false}, /* Prefer non short circuit. */
&arm_default_vec_cost, /* Vectorizer costs. */
false, /* Prefer Neon for 64-bits bitops. */
false, false, /* Prefer 32-bit encodings. */
false, /* Prefer Neon for stringops. */
8, /* Maximum insns to inline memset. */
ARM_FUSE_NOTHING, /* Fuseable pairs of instructions. */
ARM_SCHED_AUTOPREF_OFF /* Sched L2 autopref. */
};
const struct tune_params arm_fa726te_tune =
{
arm_9e_rtx_costs,
NULL,
fa726te_sched_adjust_cost,
1, /* Constant limit. */
5, /* Max cond insns. */
ARM_PREFETCH_NOT_BENEFICIAL,
true, /* Prefer constant pool. */
arm_default_branch_cost,
false, /* Prefer LDRD/STRD. */
{true, true}, /* Prefer non short circuit. */
&arm_default_vec_cost, /* Vectorizer costs. */
false, /* Prefer Neon for 64-bits bitops. */
false, false, /* Prefer 32-bit encodings. */
false, /* Prefer Neon for stringops. */
8, /* Maximum insns to inline memset. */
ARM_FUSE_NOTHING, /* Fuseable pairs of instructions. */
ARM_SCHED_AUTOPREF_OFF /* Sched L2 autopref. */
};
/* Not all of these give usefully different compilation alternatives,
but there is no simple way of generalizing them. */
static const struct processors all_cores[] =
{
/* ARM Cores */
#define ARM_CORE(NAME, X, IDENT, ARCH, FLAGS, COSTS) \
{NAME, IDENT, #ARCH, BASE_ARCH_##ARCH, \
FLAGS | FL_FOR_ARCH##ARCH, &arm_##COSTS##_tune},
#include "arm-cores.def"
#undef ARM_CORE
{NULL, arm_none, NULL, BASE_ARCH_0, 0, NULL}
};
static const struct processors all_architectures[] =
{
/* ARM Architectures */
/* We don't specify tuning costs here as it will be figured out
from the core. */
#define ARM_ARCH(NAME, CORE, ARCH, FLAGS) \
{NAME, CORE, #ARCH, BASE_ARCH_##ARCH, FLAGS, NULL},
#include "arm-arches.def"
#undef ARM_ARCH
{NULL, arm_none, NULL, BASE_ARCH_0, 0, NULL}
};
/* These are populated as commandline arguments are processed, or NULL
if not specified. */
static const struct processors *arm_selected_arch;
static const struct processors *arm_selected_cpu;
static const struct processors *arm_selected_tune;
/* The name of the preprocessor macro to define for this architecture. */
char arm_arch_name[] = "__ARM_ARCH_0UNK__";
/* Available values for -mfpu=. */
static const struct arm_fpu_desc all_fpus[] =
{
#define ARM_FPU(NAME, MODEL, REV, VFP_REGS, NEON, FP16, CRYPTO) \
{ NAME, MODEL, REV, VFP_REGS, NEON, FP16, CRYPTO },
#include "arm-fpus.def"
#undef ARM_FPU
};
/* Supported TLS relocations. */
enum tls_reloc {
TLS_GD32,
TLS_LDM32,
TLS_LDO32,
TLS_IE32,
TLS_LE32,
TLS_DESCSEQ /* GNU scheme */
};
/* The maximum number of insns to be used when loading a constant. */
inline static int
arm_constant_limit (bool size_p)
{
return size_p ? 1 : current_tune->constant_limit;
}
/* Emit an insn that's a simple single-set. Both the operands must be known
to be valid. */
inline static rtx_insn *
emit_set_insn (rtx x, rtx y)
{
return emit_insn (gen_rtx_SET (VOIDmode, x, y));
}
/* Return the number of bits set in VALUE. */
static unsigned
bit_count (unsigned long value)
{
unsigned long count = 0;
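/* Each iteration below clears the least-significant set bit, so the loop
   runs once per set bit (Kernighan's method).  */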
while (value)
{
count++;
value &= value - 1; /* Clear the least-significant set bit. */
}
return count;
}
typedef struct
{
machine_mode mode;
const char *name;
} arm_fixed_mode_set;
/* A small helper for setting fixed-point library libfuncs. */
static void
arm_set_fixed_optab_libfunc (optab optable, machine_mode mode,
const char *funcname, const char *modename,
int num_suffix)
{
char buffer[50];
if (num_suffix == 0)
sprintf (buffer, "__gnu_%s%s", funcname, modename);
else
sprintf (buffer, "__gnu_%s%s%d", funcname, modename, num_suffix);
set_optab_libfunc (optable, mode, buffer);
}
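/* For example, registering add_optab for SQmode with num_suffix 3 builds
   the libcall name "__gnu_addsq3", matching the __gnu_-prefixed fixed-point
   helpers set up below.  */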
static void
arm_set_fixed_conv_libfunc (convert_optab optable, machine_mode to,
machine_mode from, const char *funcname,
const char *toname, const char *fromname)
{
char buffer[50];
const char *maybe_suffix_2 = "";
/* Follow the logic for selecting a "2" suffix in fixed-bit.h. */
if (ALL_FIXED_POINT_MODE_P (from) && ALL_FIXED_POINT_MODE_P (to)
&& UNSIGNED_FIXED_POINT_MODE_P (from) == UNSIGNED_FIXED_POINT_MODE_P (to)
&& ALL_FRACT_MODE_P (from) == ALL_FRACT_MODE_P (to))
maybe_suffix_2 = "2";
sprintf (buffer, "__gnu_%s%s%s%s", funcname, fromname, toname,
maybe_suffix_2);
set_conv_libfunc (optable, to, from, buffer);
}
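/* For example, a fract_optab conversion from SQmode to DQmode is registered
   under the name "__gnu_fractsqdq2"; the "2" suffix is chosen because both
   modes are signed fract modes, per the fixed-bit.h rules above.  */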
/* Set up library functions unique to ARM. */
static void
arm_init_libfuncs (void)
{
/* For Linux, we have access to kernel support for atomic operations. */
if (arm_abi == ARM_ABI_AAPCS_LINUX)
init_sync_libfuncs (MAX_SYNC_LIBFUNC_SIZE);
/* There are no special library functions unless we are using the
ARM BPABI. */
if (!TARGET_BPABI)
return;
/* The functions below are described in Section 4 of the "Run-Time
ABI for the ARM architecture", Version 1.0. */
/* Double-precision floating-point arithmetic. Table 2. */
set_optab_libfunc (add_optab, DFmode, "__aeabi_dadd");
set_optab_libfunc (sdiv_optab, DFmode, "__aeabi_ddiv");
set_optab_libfunc (smul_optab, DFmode, "__aeabi_dmul");
set_optab_libfunc (neg_optab, DFmode, "__aeabi_dneg");
set_optab_libfunc (sub_optab, DFmode, "__aeabi_dsub");
/* Double-precision comparisons. Table 3. */
set_optab_libfunc (eq_optab, DFmode, "__aeabi_dcmpeq");
set_optab_libfunc (ne_optab, DFmode, NULL);
set_optab_libfunc (lt_optab, DFmode, "__aeabi_dcmplt");
set_optab_libfunc (le_optab, DFmode, "__aeabi_dcmple");
set_optab_libfunc (ge_optab, DFmode, "__aeabi_dcmpge");
set_optab_libfunc (gt_optab, DFmode, "__aeabi_dcmpgt");
set_optab_libfunc (unord_optab, DFmode, "__aeabi_dcmpun");
/* Single-precision floating-point arithmetic. Table 4. */
set_optab_libfunc (add_optab, SFmode, "__aeabi_fadd");
set_optab_libfunc (sdiv_optab, SFmode, "__aeabi_fdiv");
set_optab_libfunc (smul_optab, SFmode, "__aeabi_fmul");
set_optab_libfunc (neg_optab, SFmode, "__aeabi_fneg");
set_optab_libfunc (sub_optab, SFmode, "__aeabi_fsub");
/* Single-precision comparisons. Table 5. */
set_optab_libfunc (eq_optab, SFmode, "__aeabi_fcmpeq");
set_optab_libfunc (ne_optab, SFmode, NULL);
set_optab_libfunc (lt_optab, SFmode, "__aeabi_fcmplt");
set_optab_libfunc (le_optab, SFmode, "__aeabi_fcmple");
set_optab_libfunc (ge_optab, SFmode, "__aeabi_fcmpge");
set_optab_libfunc (gt_optab, SFmode, "__aeabi_fcmpgt");
set_optab_libfunc (unord_optab, SFmode, "__aeabi_fcmpun");
/* Floating-point to integer conversions. Table 6. */
set_conv_libfunc (sfix_optab, SImode, DFmode, "__aeabi_d2iz");
set_conv_libfunc (ufix_optab, SImode, DFmode, "__aeabi_d2uiz");
set_conv_libfunc (sfix_optab, DImode, DFmode, "__aeabi_d2lz");
set_conv_libfunc (ufix_optab, DImode, DFmode, "__aeabi_d2ulz");
set_conv_libfunc (sfix_optab, SImode, SFmode, "__aeabi_f2iz");
set_conv_libfunc (ufix_optab, SImode, SFmode, "__aeabi_f2uiz");
set_conv_libfunc (sfix_optab, DImode, SFmode, "__aeabi_f2lz");
set_conv_libfunc (ufix_optab, DImode, SFmode, "__aeabi_f2ulz");
/* Conversions between floating types. Table 7. */
set_conv_libfunc (trunc_optab, SFmode, DFmode, "__aeabi_d2f");
set_conv_libfunc (sext_optab, DFmode, SFmode, "__aeabi_f2d");
/* Integer to floating-point conversions. Table 8. */
set_conv_libfunc (sfloat_optab, DFmode, SImode, "__aeabi_i2d");
set_conv_libfunc (ufloat_optab, DFmode, SImode, "__aeabi_ui2d");
set_conv_libfunc (sfloat_optab, DFmode, DImode, "__aeabi_l2d");
set_conv_libfunc (ufloat_optab, DFmode, DImode, "__aeabi_ul2d");
set_conv_libfunc (sfloat_optab, SFmode, SImode, "__aeabi_i2f");
set_conv_libfunc (ufloat_optab, SFmode, SImode, "__aeabi_ui2f");
set_conv_libfunc (sfloat_optab, SFmode, DImode, "__aeabi_l2f");
set_conv_libfunc (ufloat_optab, SFmode, DImode, "__aeabi_ul2f");
/* Long long. Table 9. */
set_optab_libfunc (smul_optab, DImode, "__aeabi_lmul");
set_optab_libfunc (sdivmod_optab, DImode, "__aeabi_ldivmod");
set_optab_libfunc (udivmod_optab, DImode, "__aeabi_uldivmod");
set_optab_libfunc (ashl_optab, DImode, "__aeabi_llsl");
set_optab_libfunc (lshr_optab, DImode, "__aeabi_llsr");
set_optab_libfunc (ashr_optab, DImode, "__aeabi_lasr");
set_optab_libfunc (cmp_optab, DImode, "__aeabi_lcmp");
set_optab_libfunc (ucmp_optab, DImode, "__aeabi_ulcmp");
/* Integer (32/32->32) division. \S 4.3.1. */
set_optab_libfunc (sdivmod_optab, SImode, "__aeabi_idivmod");
set_optab_libfunc (udivmod_optab, SImode, "__aeabi_uidivmod");
/* The divmod functions are designed so that they can be used for
plain division, even though they return both the quotient and the
remainder. The quotient is returned in the usual location (i.e.,
r0 for SImode, {r0, r1} for DImode), just as would be expected
for an ordinary division routine. Because the AAPCS calling
conventions specify that all of { r0, r1, r2, r3 } are
call-clobbered registers, there is no need to tell the compiler
explicitly that those registers are clobbered by these
routines. */
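/* For example, __aeabi_idivmod returns the quotient in r0 and the remainder
   in r1, so a plain SImode division can call it and simply ignore r1.  */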
set_optab_libfunc (sdiv_optab, DImode, "__aeabi_ldivmod");
set_optab_libfunc (udiv_optab, DImode, "__aeabi_uldivmod");
/* For SImode division the ABI provides div-without-mod routines,
which are faster. */
set_optab_libfunc (sdiv_optab, SImode, "__aeabi_idiv");
set_optab_libfunc (udiv_optab, SImode, "__aeabi_uidiv");
/* We don't have mod libcalls. Fortunately gcc knows how to use the
divmod libcalls instead. */
set_optab_libfunc (smod_optab, DImode, NULL);
set_optab_libfunc (umod_optab, DImode, NULL);
set_optab_libfunc (smod_optab, SImode, NULL);
set_optab_libfunc (umod_optab, SImode, NULL);
/* Half-precision float operations. The compiler handles all operations
with NULL libfuncs by converting to SFmode.  */
switch (arm_fp16_format)
{
case ARM_FP16_FORMAT_IEEE:
case ARM_FP16_FORMAT_ALTERNATIVE:
/* Conversions. */
set_conv_libfunc (trunc_optab, HFmode, SFmode,
(arm_fp16_format == ARM_FP16_FORMAT_IEEE
? "__gnu_f2h_ieee"
: "__gnu_f2h_alternative"));
set_conv_libfunc (sext_optab, SFmode, HFmode,
(arm_fp16_format == ARM_FP16_FORMAT_IEEE
? "__gnu_h2f_ieee"
: "__gnu_h2f_alternative"));
/* Arithmetic. */
set_optab_libfunc (add_optab, HFmode, NULL);
set_optab_libfunc (sdiv_optab, HFmode, NULL);
set_optab_libfunc (smul_optab, HFmode, NULL);
set_optab_libfunc (neg_optab, HFmode, NULL);
set_optab_libfunc (sub_optab, HFmode, NULL);
/* Comparisons. */
set_optab_libfunc (eq_optab, HFmode, NULL);
set_optab_libfunc (ne_optab, HFmode, NULL);
set_optab_libfunc (lt_optab, HFmode, NULL);
set_optab_libfunc (le_optab, HFmode, NULL);
set_optab_libfunc (ge_optab, HFmode, NULL);
set_optab_libfunc (gt_optab, HFmode, NULL);
set_optab_libfunc (unord_optab, HFmode, NULL);
break;
default:
break;
}
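/* With the NULL entries above, an HFmode operation such as an addition is,
   in effect, expanded by widening the operands via __gnu_h2f_*, doing the
   work in SFmode, and truncating the result back via __gnu_f2h_*.  */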
/* Use names prefixed with __gnu_ for fixed-point helper functions. */
{
const arm_fixed_mode_set fixed_arith_modes[] =
{
{ QQmode, "qq" },
{ UQQmode, "uqq" },
{ HQmode, "hq" },
{ UHQmode, "uhq" },
{ SQmode, "sq" },
{ USQmode, "usq" },
{ DQmode, "dq" },
{ UDQmode, "udq" },
{ TQmode, "tq" },
{ UTQmode, "utq" },
{ HAmode, "ha" },
{ UHAmode, "uha" },
{ SAmode, "sa" },
{ USAmode, "usa" },
{ DAmode, "da" },
{ UDAmode, "uda" },
{ TAmode, "ta" },
{ UTAmode, "uta" }
};
const arm_fixed_mode_set fixed_conv_modes[] =
{
{ QQmode, "qq" },
{ UQQmode, "uqq" },
{ HQmode, "hq" },
{ UHQmode, "uhq" },
{ SQmode, "sq" },
{ USQmode, "usq" },
{ DQmode, "dq" },
{ UDQmode, "udq" },
{ TQmode, "tq" },
{ UTQmode, "utq" },
{ HAmode, "ha" },
{ UHAmode, "uha" },
{ SAmode, "sa" },
{ USAmode, "usa" },
{ DAmode, "da" },
{ UDAmode, "uda" },
{ TAmode, "ta" },
{ UTAmode, "uta" },
{ QImode, "qi" },
{ HImode, "hi" },
{ SImode, "si" },
{ DImode, "di" },
{ TImode, "ti" },
{ SFmode, "sf" },
{ DFmode, "df" }
};
unsigned int i, j;
for (i = 0; i < ARRAY_SIZE (fixed_arith_modes); i++)
{
arm_set_fixed_optab_libfunc (add_optab, fixed_arith_modes[i].mode,
"add", fixed_arith_modes[i].name, 3);
arm_set_fixed_optab_libfunc (ssadd_optab, fixed_arith_modes[i].mode,
"ssadd", fixed_arith_modes[i].name, 3);
arm_set_fixed_optab_libfunc (usadd_optab, fixed_arith_modes[i].mode,
"usadd", fixed_arith_modes[i].name, 3);
arm_set_fixed_optab_libfunc (sub_optab, fixed_arith_modes[i].mode,
"sub", fixed_arith_modes[i].name, 3);
arm_set_fixed_optab_libfunc (sssub_optab, fixed_arith_modes[i].mode,
"sssub", fixed_arith_modes[i].name, 3);
arm_set_fixed_optab_libfunc (ussub_optab, fixed_arith_modes[i].mode,
"ussub", fixed_arith_modes[i].name, 3);
arm_set_fixed_optab_libfunc (smul_optab, fixed_arith_modes[i].mode,
"mul", fixed_arith_modes[i].name, 3);
arm_set_fixed_optab_libfunc (ssmul_optab, fixed_arith_modes[i].mode,
"ssmul", fixed_arith_modes[i].name, 3);
arm_set_fixed_optab_libfunc (usmul_optab, fixed_arith_modes[i].mode,
"usmul", fixed_arith_modes[i].name, 3);
arm_set_fixed_optab_libfunc (sdiv_optab, fixed_arith_modes[i].mode,
"div", fixed_arith_modes[i].name, 3);
arm_set_fixed_optab_libfunc (udiv_optab, fixed_arith_modes[i].mode,
"udiv", fixed_arith_modes[i].name, 3);
arm_set_fixed_optab_libfunc (ssdiv_optab, fixed_arith_modes[i].mode,
"ssdiv", fixed_arith_modes[i].name, 3);
arm_set_fixed_optab_libfunc (usdiv_optab, fixed_arith_modes[i].mode,
"usdiv", fixed_arith_modes[i].name, 3);
arm_set_fixed_optab_libfunc (neg_optab, fixed_arith_modes[i].mode,
"neg", fixed_arith_modes[i].name, 2);
arm_set_fixed_optab_libfunc (ssneg_optab, fixed_arith_modes[i].mode,
"ssneg", fixed_arith_modes[i].name, 2);
arm_set_fixed_optab_libfunc (usneg_optab, fixed_arith_modes[i].mode,
"usneg", fixed_arith_modes[i].name, 2);
arm_set_fixed_optab_libfunc (ashl_optab, fixed_arith_modes[i].mode,
"ashl", fixed_arith_modes[i].name, 3);
arm_set_fixed_optab_libfunc (ashr_optab, fixed_arith_modes[i].mode,
"ashr", fixed_arith_modes[i].name, 3);
arm_set_fixed_optab_libfunc (lshr_optab, fixed_arith_modes[i].mode,
"lshr", fixed_arith_modes[i].name, 3);
arm_set_fixed_optab_libfunc (ssashl_optab, fixed_arith_modes[i].mode,
"ssashl", fixed_arith_modes[i].name, 3);
arm_set_fixed_optab_libfunc (usashl_optab, fixed_arith_modes[i].mode,
"usashl", fixed_arith_modes[i].name, 3);
arm_set_fixed_optab_libfunc (cmp_optab, fixed_arith_modes[i].mode,
"cmp", fixed_arith_modes[i].name, 2);
}
for (i = 0; i < ARRAY_SIZE (fixed_conv_modes); i++)
for (j = 0; j < ARRAY_SIZE (fixed_conv_modes); j++)
{
if (i == j
|| (!ALL_FIXED_POINT_MODE_P (fixed_conv_modes[i].mode)
&& !ALL_FIXED_POINT_MODE_P (fixed_conv_modes[j].mode)))
continue;
arm_set_fixed_conv_libfunc (fract_optab, fixed_conv_modes[i].mode,
fixed_conv_modes[j].mode, "fract",
fixed_conv_modes[i].name,
fixed_conv_modes[j].name);
arm_set_fixed_conv_libfunc (satfract_optab,
fixed_conv_modes[i].mode,
fixed_conv_modes[j].mode, "satfract",
fixed_conv_modes[i].name,
fixed_conv_modes[j].name);
arm_set_fixed_conv_libfunc (fractuns_optab,
fixed_conv_modes[i].mode,
fixed_conv_modes[j].mode, "fractuns",
fixed_conv_modes[i].name,
fixed_conv_modes[j].name);
arm_set_fixed_conv_libfunc (satfractuns_optab,
fixed_conv_modes[i].mode,
fixed_conv_modes[j].mode, "satfractuns",
fixed_conv_modes[i].name,
fixed_conv_modes[j].name);
}
}
if (TARGET_AAPCS_BASED)
synchronize_libfunc = init_one_libfunc ("__sync_synchronize");
}
/* On AAPCS systems, this is the "struct __va_list". */
static GTY(()) tree va_list_type;
/* Return the type to use as __builtin_va_list. */
static tree
arm_build_builtin_va_list (void)
{
tree va_list_name;
tree ap_field;
if (!TARGET_AAPCS_BASED)
return std_build_builtin_va_list ();
/* AAPCS \S 7.1.4 requires that va_list be a typedef for a type
defined as:
struct __va_list
{
void *__ap;
};
The C Library ABI further reinforces this definition in \S
4.1.
We must follow this definition exactly. The structure tag
name is visible in C++ mangled names, and thus forms a part
of the ABI. The field name may be used by people who
#include <stdarg.h>. */
/* Create the type. */
va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
/* Give it the required name. */
va_list_name = build_decl (BUILTINS_LOCATION,
TYPE_DECL,
get_identifier ("__va_list"),
va_list_type);
DECL_ARTIFICIAL (va_list_name) = 1;
TYPE_NAME (va_list_type) = va_list_name;
TYPE_STUB_DECL (va_list_type) = va_list_name;
/* Create the __ap field. */
ap_field = build_decl (BUILTINS_LOCATION,
FIELD_DECL,
get_identifier ("__ap"),
ptr_type_node);
DECL_ARTIFICIAL (ap_field) = 1;
DECL_FIELD_CONTEXT (ap_field) = va_list_type;
TYPE_FIELDS (va_list_type) = ap_field;
/* Compute its layout. */
layout_type (va_list_type);
return va_list_type;
}
/* Return an expression of type "void *" pointing to the next
available argument in a variable-argument list. VALIST is the
user-level va_list object, of type __builtin_va_list. */
static tree
arm_extract_valist_ptr (tree valist)
{
if (TREE_TYPE (valist) == error_mark_node)
return error_mark_node;
/* On an AAPCS target, the pointer is stored within "struct
va_list". */
if (TARGET_AAPCS_BASED)
{
tree ap_field = TYPE_FIELDS (TREE_TYPE (valist));
valist = build3 (COMPONENT_REF, TREE_TYPE (ap_field),
valist, ap_field, NULL_TREE);
}
return valist;
}
/* Implement TARGET_EXPAND_BUILTIN_VA_START. */
static void
arm_expand_builtin_va_start (tree valist, rtx nextarg)
{
valist = arm_extract_valist_ptr (valist);
std_expand_builtin_va_start (valist, nextarg);
}
/* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
static tree
arm_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
gimple_seq *post_p)
{
valist = arm_extract_valist_ptr (valist);
return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
}
/* Fix up any incompatible options that the user has specified. */
static void
arm_option_override (void)
{
arm_selected_arch = NULL;
arm_selected_cpu = NULL;
arm_selected_tune = NULL;
if (global_options_set.x_arm_arch_option)
arm_selected_arch = &all_architectures[arm_arch_option];
if (global_options_set.x_arm_cpu_option)
{
arm_selected_cpu = &all_cores[(int) arm_cpu_option];
arm_selected_tune = &all_cores[(int) arm_cpu_option];
}
if (global_options_set.x_arm_tune_option)
arm_selected_tune = &all_cores[(int) arm_tune_option];
#ifdef SUBTARGET_OVERRIDE_OPTIONS
SUBTARGET_OVERRIDE_OPTIONS;
#endif
if (arm_selected_arch)
{
if (arm_selected_cpu)
{
/* Check for conflict between mcpu and march. */
if ((arm_selected_cpu->flags ^ arm_selected_arch->flags) & ~FL_TUNE)
{
warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
arm_selected_cpu->name, arm_selected_arch->name);
/* -march wins for code generation.
-mcpu wins for default tuning. */
if (!arm_selected_tune)
arm_selected_tune = arm_selected_cpu;
arm_selected_cpu = arm_selected_arch;
}
else
/* -mcpu wins. */
arm_selected_arch = NULL;
}
else
/* Pick a CPU based on the architecture. */
arm_selected_cpu = arm_selected_arch;
}
/* If the user did not specify a processor, choose one for them. */
if (!arm_selected_cpu)
{
const struct processors * sel;
unsigned int sought;
arm_selected_cpu = &all_cores[TARGET_CPU_DEFAULT];
if (!arm_selected_cpu->name)
{
#ifdef SUBTARGET_CPU_DEFAULT
/* Use the subtarget default CPU if none was specified by
configure. */
arm_selected_cpu = &all_cores[SUBTARGET_CPU_DEFAULT];
#endif
/* Default to ARM6. */
if (!arm_selected_cpu->name)
arm_selected_cpu = &all_cores[arm6];
}
sel = arm_selected_cpu;
insn_flags = sel->flags;
/* Now check to see if the user has specified some command line
switches that require certain abilities from the cpu.  */
sought = 0;
if (TARGET_INTERWORK || TARGET_THUMB)
{
sought |= (FL_THUMB | FL_MODE32);
/* There are no ARM processors that support both APCS-26 and
interworking. Therefore we force FL_MODE26 to be removed
from insn_flags here (if it was set), so that the search
below will always be able to find a compatible processor. */
insn_flags &= ~FL_MODE26;
}
if (sought != 0 && ((sought & insn_flags) != sought))
{
/* Try to locate a CPU type that supports all of the abilities
of the default CPU, plus the extra abilities requested by
the user. */
for (sel = all_cores; sel->name != NULL; sel++)
if ((sel->flags & sought) == (sought | insn_flags))
break;
if (sel->name == NULL)
{
unsigned current_bit_count = 0;
const struct processors * best_fit = NULL;
/* Ideally we would like to issue an error message here
saying that it was not possible to find a CPU compatible
with the default CPU, but which also supports the command
line options specified by the programmer, and so they
ought to use the -mcpu=<name> command line option to
override the default CPU type.
If we cannot find a cpu that has both the
characteristics of the default cpu and the given
command line options we scan the array again looking
for a best match. */
for (sel = all_cores; sel->name != NULL; sel++)
if ((sel->flags & sought) == sought)
{
unsigned count;
count = bit_count (sel->flags & insn_flags);
if (count >= current_bit_count)
{
best_fit = sel;
current_bit_count = count;
}
}
gcc_assert (best_fit);
sel = best_fit;
}
arm_selected_cpu = sel;
}
}
gcc_assert (arm_selected_cpu);
/* The selected cpu may be an architecture, so lookup tuning by core ID. */
if (!arm_selected_tune)
arm_selected_tune = &all_cores[arm_selected_cpu->core];
sprintf (arm_arch_name, "__ARM_ARCH_%s__", arm_selected_cpu->arch);
insn_flags = arm_selected_cpu->flags;
arm_base_arch = arm_selected_cpu->base_arch;
arm_tune = arm_selected_tune->core;
tune_flags = arm_selected_tune->flags;
current_tune = arm_selected_tune->tune;
/* Make sure that the processor choice does not conflict with any of the
other command line choices. */
if (TARGET_ARM && !(insn_flags & FL_NOTM))
error ("target CPU does not support ARM mode");
/* BPABI targets use linker tricks to allow interworking on cores
without thumb support. */
if (TARGET_INTERWORK && !((insn_flags & FL_THUMB) || TARGET_BPABI))
{
warning (0, "target CPU does not support interworking" );
target_flags &= ~MASK_INTERWORK;
}
if (TARGET_THUMB && !(insn_flags & FL_THUMB))
{
warning (0, "target CPU does not support THUMB instructions");
target_flags &= ~MASK_THUMB;
}
if (TARGET_APCS_FRAME && TARGET_THUMB)
{
/* warning (0, "ignoring -mapcs-frame because -mthumb was used"); */
target_flags &= ~MASK_APCS_FRAME;
}
/* Callee super interworking implies thumb interworking. Adding
this to the flags here simplifies the logic elsewhere. */
if (TARGET_THUMB && TARGET_CALLEE_INTERWORKING)
target_flags |= MASK_INTERWORK;
/* TARGET_BACKTRACE calls leaf_function_p, which causes a crash if done
from here where no function is being compiled currently. */
if ((TARGET_TPCS_FRAME || TARGET_TPCS_LEAF_FRAME) && TARGET_ARM)
warning (0, "enabling backtrace support is only meaningful when compiling for the Thumb");
if (TARGET_ARM && TARGET_CALLEE_INTERWORKING)
warning (0, "enabling callee interworking support is only meaningful when compiling for the Thumb");
if (TARGET_APCS_STACK && !TARGET_APCS_FRAME)
{
warning (0, "-mapcs-stack-check incompatible with -mno-apcs-frame");
target_flags |= MASK_APCS_FRAME;
}
if (TARGET_POKE_FUNCTION_NAME)
target_flags |= MASK_APCS_FRAME;
if (TARGET_APCS_REENT && flag_pic)
error ("-fpic and -mapcs-reent are incompatible");
if (TARGET_APCS_REENT)
warning (0, "APCS reentrant code not supported. Ignored");
/* If this target is normally configured to use APCS frames, warn if they
are turned off and debugging is turned on. */
if (TARGET_ARM
&& write_symbols != NO_DEBUG
&& !TARGET_APCS_FRAME
&& (TARGET_DEFAULT & MASK_APCS_FRAME))
warning (0, "-g with -mno-apcs-frame may not give sensible debugging");
if (TARGET_APCS_FLOAT)
warning (0, "passing floating point arguments in fp regs not yet supported");
/* Initialize boolean versions of the flags, for use in the arm.md file. */
arm_arch3m = (insn_flags & FL_ARCH3M) != 0;
arm_arch4 = (insn_flags & FL_ARCH4) != 0;
arm_arch4t = arm_arch4 & ((insn_flags & FL_THUMB) != 0);
arm_arch5 = (insn_flags & FL_ARCH5) != 0;
arm_arch5e = (insn_flags & FL_ARCH5E) != 0;
arm_arch6 = (insn_flags & FL_ARCH6) != 0;
arm_arch6k = (insn_flags & FL_ARCH6K) != 0;
arm_arch_notm = (insn_flags & FL_NOTM) != 0;
arm_arch6m = arm_arch6 && !arm_arch_notm;
arm_arch7 = (insn_flags & FL_ARCH7) != 0;
arm_arch7em = (insn_flags & FL_ARCH7EM) != 0;
arm_arch8 = (insn_flags & FL_ARCH8) != 0;
arm_arch_thumb2 = (insn_flags & FL_THUMB2) != 0;
arm_arch_xscale = (insn_flags & FL_XSCALE) != 0;
arm_ld_sched = (tune_flags & FL_LDSCHED) != 0;
arm_tune_strongarm = (tune_flags & FL_STRONG) != 0;
thumb_code = TARGET_ARM == 0;
thumb1_code = TARGET_THUMB1 != 0;
arm_tune_wbuf = (tune_flags & FL_WBUF) != 0;
arm_tune_xscale = (tune_flags & FL_XSCALE) != 0;
arm_arch_iwmmxt = (insn_flags & FL_IWMMXT) != 0;
arm_arch_iwmmxt2 = (insn_flags & FL_IWMMXT2) != 0;
arm_arch_thumb_hwdiv = (insn_flags & FL_THUMB_DIV) != 0;
arm_arch_arm_hwdiv = (insn_flags & FL_ARM_DIV) != 0;
arm_arch_lpae = (insn_flags & FL_LPAE) != 0;
arm_arch_no_volatile_ce = (insn_flags & FL_NO_VOLATILE_CE) != 0;
arm_tune_cortex_a9 = (arm_tune == cortexa9) != 0;
arm_arch_crc = (insn_flags & FL_CRC32) != 0;
arm_m_profile_small_mul = (insn_flags & FL_SMALLMUL) != 0;
if (arm_restrict_it == 2)
arm_restrict_it = arm_arch8 && TARGET_THUMB2;
if (!TARGET_THUMB2)
arm_restrict_it = 0;
/* If we are not using the default (ARM mode) section anchor offset
ranges, then set the correct ranges now. */
if (TARGET_THUMB1)
{
/* Thumb-1 LDR instructions cannot have negative offsets.
Permissible positive offset ranges are 5-bit (for byte loads),
6-bit (for halfword loads), or 7-bit (for word loads).
Empirical results suggest a 7-bit anchor range gives the best
overall code size. */
targetm.min_anchor_offset = 0;
targetm.max_anchor_offset = 127;
}
else if (TARGET_THUMB2)
{
/* The minimum is set such that the total size of the block
for a particular anchor is 248 + 1 + 4095 bytes, which is
divisible by eight, ensuring natural spacing of anchors. */
targetm.min_anchor_offset = -248;
targetm.max_anchor_offset = 4095;
}
/* V5 code we generate is completely interworking capable, so we turn off
TARGET_INTERWORK here to avoid many tests later on. */
/* XXX However, we must pass the right pre-processor defines to CPP
or GLD can get confused. This is a hack. */
if (TARGET_INTERWORK)
arm_cpp_interwork = 1;
if (arm_arch5)
target_flags &= ~MASK_INTERWORK;
if (TARGET_IWMMXT && !ARM_DOUBLEWORD_ALIGN)
error ("iwmmxt requires an AAPCS compatible ABI for proper operation");
if (TARGET_IWMMXT_ABI && !TARGET_IWMMXT)
error ("iwmmxt abi requires an iwmmxt capable cpu");
if (!global_options_set.x_arm_fpu_index)
{
const char *target_fpu_name;
bool ok;
#ifdef FPUTYPE_DEFAULT
target_fpu_name = FPUTYPE_DEFAULT;
#else
target_fpu_name = "vfp";
#endif
ok = opt_enum_arg_to_value (OPT_mfpu_, target_fpu_name, &arm_fpu_index,
CL_TARGET);
gcc_assert (ok);
}
arm_fpu_desc = &all_fpus[arm_fpu_index];
switch (arm_fpu_desc->model)
{
case ARM_FP_MODEL_VFP:
arm_fpu_attr = FPU_VFP;
break;
default:
gcc_unreachable();
}
if (TARGET_AAPCS_BASED)
{
if (TARGET_CALLER_INTERWORKING)
error ("AAPCS does not support -mcaller-super-interworking");
else
if (TARGET_CALLEE_INTERWORKING)
error ("AAPCS does not support -mcallee-super-interworking");
}
/* iWMMXt and NEON are incompatible. */
if (TARGET_IWMMXT && TARGET_NEON)
error ("iWMMXt and NEON are incompatible");
/* iWMMXt unsupported under Thumb mode. */
if (TARGET_THUMB && TARGET_IWMMXT)
error ("iWMMXt unsupported under Thumb mode");
/* __fp16 support currently assumes the core has ldrh. */
if (!arm_arch4 && arm_fp16_format != ARM_FP16_FORMAT_NONE)
sorry ("__fp16 and no ldrh");
/* If soft-float is specified then don't use FPU. */
if (TARGET_SOFT_FLOAT)
arm_fpu_attr = FPU_NONE;
if (TARGET_AAPCS_BASED)
{
if (arm_abi == ARM_ABI_IWMMXT)
arm_pcs_default = ARM_PCS_AAPCS_IWMMXT;
else if (arm_float_abi == ARM_FLOAT_ABI_HARD
&& TARGET_HARD_FLOAT
&& TARGET_VFP)
arm_pcs_default = ARM_PCS_AAPCS_VFP;
else
arm_pcs_default = ARM_PCS_AAPCS;
}
else
{
if (arm_float_abi == ARM_FLOAT_ABI_HARD && TARGET_VFP)
sorry ("-mfloat-abi=hard and VFP");
if (arm_abi == ARM_ABI_APCS)
arm_pcs_default = ARM_PCS_APCS;
else
arm_pcs_default = ARM_PCS_ATPCS;
}
/* For arm2/3 there is no need to do any scheduling if we are doing
software floating-point. */
if (TARGET_SOFT_FLOAT && (tune_flags & FL_MODE32) == 0)
flag_schedule_insns = flag_schedule_insns_after_reload = 0;
/* Use the cp15 method if it is available. */
if (target_thread_pointer == TP_AUTO)
{
if (arm_arch6k && !TARGET_THUMB1)
target_thread_pointer = TP_CP15;
else
target_thread_pointer = TP_SOFT;
}
if (TARGET_HARD_TP && TARGET_THUMB1)
error ("can not use -mtp=cp15 with 16-bit Thumb");
/* Override the default structure alignment for AAPCS ABI. */
if (!global_options_set.x_arm_structure_size_boundary)
{
if (TARGET_AAPCS_BASED)
arm_structure_size_boundary = 8;
}
else
{
if (arm_structure_size_boundary != 8
&& arm_structure_size_boundary != 32
&& !(ARM_DOUBLEWORD_ALIGN && arm_structure_size_boundary == 64))
{
if (ARM_DOUBLEWORD_ALIGN)
warning (0,
"structure size boundary can only be set to 8, 32 or 64");
else
warning (0, "structure size boundary can only be set to 8 or 32");
arm_structure_size_boundary
= (TARGET_AAPCS_BASED ? 8 : DEFAULT_STRUCTURE_SIZE_BOUNDARY);
}
}
if (!TARGET_ARM && TARGET_VXWORKS_RTP && flag_pic)
{
error ("RTP PIC is incompatible with Thumb");
flag_pic = 0;
}
/* If stack checking is disabled, we can use r10 as the PIC register,
which keeps r9 available. The EABI specifies r9 as the PIC register. */
if (flag_pic && TARGET_SINGLE_PIC_BASE)
{
if (TARGET_VXWORKS_RTP)
warning (0, "RTP PIC is incompatible with -msingle-pic-base");
arm_pic_register = (TARGET_APCS_STACK || TARGET_AAPCS_BASED) ? 9 : 10;
}
if (flag_pic && TARGET_VXWORKS_RTP)
arm_pic_register = 9;
if (arm_pic_register_string != NULL)
{
int pic_register = decode_reg_name (arm_pic_register_string);
if (!flag_pic)
warning (0, "-mpic-register= is useless without -fpic");
/* Prevent the user from choosing an obviously stupid PIC register. */
else if (pic_register < 0 || call_used_regs[pic_register]
|| pic_register == HARD_FRAME_POINTER_REGNUM
|| pic_register == STACK_POINTER_REGNUM
|| pic_register >= PC_REGNUM
|| (TARGET_VXWORKS_RTP
&& (unsigned int) pic_register != arm_pic_register))
error ("unable to use '%s' for PIC register", arm_pic_register_string);
else
arm_pic_register = pic_register;
}
if (TARGET_VXWORKS_RTP
&& !global_options_set.x_arm_pic_data_is_text_relative)
arm_pic_data_is_text_relative = 0;
/* Enable -mfix-cortex-m3-ldrd by default for Cortex-M3 cores. */
if (fix_cm3_ldrd == 2)
{
if (arm_selected_cpu->core == cortexm3)
fix_cm3_ldrd = 1;
else
fix_cm3_ldrd = 0;
}
/* Enable -munaligned-access by default for
- all ARMv6 architecture-based processors
- ARMv7-A, ARMv7-R, and ARMv7-M architecture-based processors.
- ARMv8 architecture-based processors.
Disable -munaligned-access by default for
- all pre-ARMv6 architecture-based processors
- ARMv6-M architecture-based processors. */
if (unaligned_access == 2)
{
if (arm_arch6 && (arm_arch_notm || arm_arch7))
unaligned_access = 1;
else
unaligned_access = 0;
}
else if (unaligned_access == 1
&& !(arm_arch6 && (arm_arch_notm || arm_arch7)))
{
warning (0, "target CPU does not support unaligned accesses");
unaligned_access = 0;
}
if (TARGET_THUMB1 && flag_schedule_insns)
{
/* Don't warn since it's on by default in -O2. */
flag_schedule_insns = 0;
}
if (optimize_size)
{
/* If optimizing for size, bump the number of instructions that we
are prepared to conditionally execute (even on a StrongARM). */
max_insns_skipped = 6;
/* For THUMB2, we limit the conditional sequence to one IT block. */
if (TARGET_THUMB2)
max_insns_skipped = MAX_INSN_PER_IT_BLOCK;
}
else
max_insns_skipped = current_tune->max_insns_skipped;
/* Hot/Cold partitioning is not currently supported, since we can't
handle literal pool placement in that case. */
if (flag_reorder_blocks_and_partition)
{
inform (input_location,
"-freorder-blocks-and-partition not supported on this architecture");
flag_reorder_blocks_and_partition = 0;
flag_reorder_blocks = 1;
}
if (flag_pic)
/* Hoisting PIC address calculations more aggressively provides a small,
but measurable, size reduction for PIC code. Therefore, we decrease
the bar for unrestricted expression hoisting to the cost of PIC address
calculation, which is 2 instructions. */
maybe_set_param_value (PARAM_GCSE_UNRESTRICTED_COST, 2,
global_options.x_param_values,
global_options_set.x_param_values);
/* ARM EABI defaults to strict volatile bitfields. */
if (TARGET_AAPCS_BASED && flag_strict_volatile_bitfields < 0
&& abi_version_at_least(2))
flag_strict_volatile_bitfields = 1;
/* Enable software prefetching at -O3 for CPUs that have prefetch, where we have
deemed it beneficial (signified by setting num_prefetch_slots to 1 or more).  */
if (flag_prefetch_loop_arrays < 0
&& HAVE_prefetch
&& optimize >= 3
&& current_tune->num_prefetch_slots > 0)
flag_prefetch_loop_arrays = 1;
/* Set up parameters to be used in prefetching algorithm. Do not override the
defaults unless we are tuning for a core we have researched values for. */
if (current_tune->num_prefetch_slots > 0)
maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
current_tune->num_prefetch_slots,
global_options.x_param_values,
global_options_set.x_param_values);
if (current_tune->l1_cache_line_size >= 0)
maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
current_tune->l1_cache_line_size,
global_options.x_param_values,
global_options_set.x_param_values);
if (current_tune->l1_cache_size >= 0)
maybe_set_param_value (PARAM_L1_CACHE_SIZE,
current_tune->l1_cache_size,
global_options.x_param_values,
global_options_set.x_param_values);
/* Use Neon to perform 64-bit operations rather than core
registers. */
prefer_neon_for_64bits = current_tune->prefer_neon_for_64bits;
if (use_neon_for_64bits == 1)
prefer_neon_for_64bits = true;
/* Use the alternative scheduling-pressure algorithm by default. */
maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
global_options.x_param_values,
global_options_set.x_param_values);
/* Look through ready list and all of queue for instructions
relevant for L2 auto-prefetcher. */
int param_sched_autopref_queue_depth;
if (current_tune->sched_autopref == ARM_SCHED_AUTOPREF_OFF)
param_sched_autopref_queue_depth = -1;
else if (current_tune->sched_autopref == ARM_SCHED_AUTOPREF_RANK)
param_sched_autopref_queue_depth = 0;
else if (current_tune->sched_autopref == ARM_SCHED_AUTOPREF_FULL)
param_sched_autopref_queue_depth = max_insn_queue_index + 1;
else
gcc_unreachable ();
maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
param_sched_autopref_queue_depth,
global_options.x_param_values,
global_options_set.x_param_values);
/* Disable shrink-wrap when optimizing function for size, since it tends to
generate additional returns. */
if (optimize_function_for_size_p (cfun) && TARGET_THUMB2)
flag_shrink_wrap = false;
/* TBD: Dwarf info for apcs frame is not handled yet. */
if (TARGET_APCS_FRAME)
flag_shrink_wrap = false;
/* We only support -mslow-flash-data on armv7-m targets. */
if (target_slow_flash_data
&& ((!(arm_arch7 && !arm_arch_notm) && !arm_arch7em)
|| (TARGET_THUMB1 || flag_pic || TARGET_NEON)))
error ("-mslow-flash-data only supports non-pic code on armv7-m targets");
/* Currently, for slow flash data, we just disable literal pools. */
if (target_slow_flash_data)
arm_disable_literal_pool = true;
/* Thumb2 inline assembly code should always use unified syntax.
This will apply to ARM and Thumb1 eventually. */
if (TARGET_THUMB2)
inline_asm_unified = 1;
/* Disable scheduling fusion by default if the target is not an ARMv7
processor or does not prefer LDRD/STRD.  */
if (flag_schedule_fusion == 2
&& (!arm_arch7 || !current_tune->prefer_ldrd_strd))
flag_schedule_fusion = 0;
/* In Thumb1 mode, we emit the epilogue in RTL, but the last insn
- epilogue_insns - does not accurately model the corresponding insns
emitted in the asm file. In particular, see the comment in thumb_exit
'Find out how many of the (return) argument registers we can corrupt'.
As a consequence, the epilogue may clobber registers without fipa-ra
finding out about it. Therefore, disable fipa-ra in Thumb1 mode.
TODO: Accurately model clobbers for epilogue_insns and reenable
fipa-ra. */
if (TARGET_THUMB1)
flag_ipa_ra = 0;
/* Register global variables with the garbage collector. */
arm_add_gc_roots ();
}
static void
arm_add_gc_roots (void)
{
gcc_obstack_init(&minipool_obstack);
minipool_startobj = (char *) obstack_alloc (&minipool_obstack, 0);
}
/* A table of known ARM exception types.
For use with the interrupt function attribute. */
typedef struct
{
const char *const arg;
const unsigned long return_value;
}
isr_attribute_arg;
static const isr_attribute_arg isr_attribute_args [] =
{
{ "IRQ", ARM_FT_ISR },
{ "irq", ARM_FT_ISR },
{ "FIQ", ARM_FT_FIQ },
{ "fiq", ARM_FT_FIQ },
{ "ABORT", ARM_FT_ISR },
{ "abort", ARM_FT_ISR },
{ "ABORT", ARM_FT_ISR },
{ "abort", ARM_FT_ISR },
{ "UNDEF", ARM_FT_EXCEPTION },
{ "undef", ARM_FT_EXCEPTION },
{ "SWI", ARM_FT_EXCEPTION },
{ "swi", ARM_FT_EXCEPTION },
{ NULL, ARM_FT_NORMAL }
};
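/* For example, a handler declared as
     void __attribute__ ((interrupt ("IRQ"))) irq_handler (void);
   is mapped to ARM_FT_ISR through this table.  */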
/* Returns the (interrupt) function type of the current
function, or ARM_FT_UNKNOWN if the type cannot be determined. */
static unsigned long
arm_isr_value (tree argument)
{
const isr_attribute_arg * ptr;
const char * arg;
if (!arm_arch_notm)
return ARM_FT_NORMAL | ARM_FT_STACKALIGN;
/* No argument - default to IRQ. */
if (argument == NULL_TREE)
return ARM_FT_ISR;
/* Get the value of the argument. */
if (TREE_VALUE (argument) == NULL_TREE
|| TREE_CODE (TREE_VALUE (argument)) != STRING_CST)
return ARM_FT_UNKNOWN;
arg = TREE_STRING_POINTER (TREE_VALUE (argument));
/* Check it against the list of known arguments. */
for (ptr = isr_attribute_args; ptr->arg != NULL; ptr++)
if (streq (arg, ptr->arg))
return ptr->return_value;
/* An unrecognized interrupt type. */
return ARM_FT_UNKNOWN;
}
/* Computes the type of the current function. */
static unsigned long
arm_compute_func_type (void)
{
unsigned long type = ARM_FT_UNKNOWN;
tree a;
tree attr;
gcc_assert (TREE_CODE (current_function_decl) == FUNCTION_DECL);
/* Decide if the current function is volatile. Such functions
never return, and many memory cycles can be saved by not storing
register values that will never be needed again. This optimization
was added to speed up context switching in a kernel application. */
if (optimize > 0
&& (TREE_NOTHROW (current_function_decl)
|| !(flag_unwind_tables
|| (flag_exceptions
&& arm_except_unwind_info (&global_options) != UI_SJLJ)))
&& TREE_THIS_VOLATILE (current_function_decl))
type |= ARM_FT_VOLATILE;
if (cfun->static_chain_decl != NULL)
type |= ARM_FT_NESTED;
attr = DECL_ATTRIBUTES (current_function_decl);
a = lookup_attribute ("naked", attr);
if (a != NULL_TREE)
type |= ARM_FT_NAKED;
a = lookup_attribute ("isr", attr);
if (a == NULL_TREE)
a = lookup_attribute ("interrupt", attr);
if (a == NULL_TREE)
type |= TARGET_INTERWORK ? ARM_FT_INTERWORKED : ARM_FT_NORMAL;
else
type |= arm_isr_value (TREE_VALUE (a));
return type;
}
/* Returns the type of the current function. */
unsigned long
arm_current_func_type (void)
{
if (ARM_FUNC_TYPE (cfun->machine->func_type) == ARM_FT_UNKNOWN)
cfun->machine->func_type = arm_compute_func_type ();
return cfun->machine->func_type;
}
bool
arm_allocate_stack_slots_for_args (void)
{
/* Naked functions should not allocate stack slots for arguments. */
return !IS_NAKED (arm_current_func_type ());
}
static bool
arm_warn_func_return (tree decl)
{
/* Naked functions are implemented entirely in assembly, including the
return sequence, so suppress warnings about this. */
return lookup_attribute ("naked", DECL_ATTRIBUTES (decl)) == NULL_TREE;
}
/* Output assembler code for a block containing the constant parts
of a trampoline, leaving space for the variable parts.
On the ARM, (if r8 is the static chain regnum, and remembering that
referencing pc adds an offset of 8) the trampoline looks like:
ldr r8, [pc, #0]
ldr pc, [pc]
.word static chain value
.word function's address
XXX FIXME: When the trampoline returns, r8 will be clobbered. */
static void
arm_asm_trampoline_template (FILE *f)
{
if (TARGET_ARM)
{
asm_fprintf (f, "\tldr\t%r, [%r, #0]\n", STATIC_CHAIN_REGNUM, PC_REGNUM);
asm_fprintf (f, "\tldr\t%r, [%r, #0]\n", PC_REGNUM, PC_REGNUM);
}
else if (TARGET_THUMB2)
{
/* The Thumb-2 trampoline is similar to the arm implementation.
Unlike 16-bit Thumb, we enter the stub in thumb mode. */
asm_fprintf (f, "\tldr.w\t%r, [%r, #4]\n",
STATIC_CHAIN_REGNUM, PC_REGNUM);
asm_fprintf (f, "\tldr.w\t%r, [%r, #4]\n", PC_REGNUM, PC_REGNUM);
}
else
{
ASM_OUTPUT_ALIGN (f, 2);
fprintf (f, "\t.code\t16\n");
fprintf (f, ".Ltrampoline_start:\n");
asm_fprintf (f, "\tpush\t{r0, r1}\n");
asm_fprintf (f, "\tldr\tr0, [%r, #8]\n", PC_REGNUM);
asm_fprintf (f, "\tmov\t%r, r0\n", STATIC_CHAIN_REGNUM);
asm_fprintf (f, "\tldr\tr0, [%r, #8]\n", PC_REGNUM);
asm_fprintf (f, "\tstr\tr0, [%r, #4]\n", SP_REGNUM);
asm_fprintf (f, "\tpop\t{r0, %r}\n", PC_REGNUM);
}
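/* Emit the two placeholder words (static chain value and function address)
   that arm_trampoline_init overwrites at run time.  */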
assemble_aligned_integer (UNITS_PER_WORD, const0_rtx);
assemble_aligned_integer (UNITS_PER_WORD, const0_rtx);
}
/* Emit RTL insns to initialize the variable parts of a trampoline. */
static void
arm_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
{
rtx fnaddr, mem, a_tramp;
emit_block_move (m_tramp, assemble_trampoline_template (),
GEN_INT (TRAMPOLINE_SIZE), BLOCK_OP_NORMAL);
mem = adjust_address (m_tramp, SImode, TARGET_32BIT ? 8 : 12);
emit_move_insn (mem, chain_value);
mem = adjust_address (m_tramp, SImode, TARGET_32BIT ? 12 : 16);
fnaddr = XEXP (DECL_RTL (fndecl), 0);
emit_move_insn (mem, fnaddr);
a_tramp = XEXP (m_tramp, 0);
emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
LCT_NORMAL, VOIDmode, 2, a_tramp, Pmode,
plus_constant (Pmode, a_tramp, TRAMPOLINE_SIZE), Pmode);
}
/* Thumb trampolines should be entered in thumb mode, so set
the bottom bit of the address. */
static rtx
arm_trampoline_adjust_address (rtx addr)
{
if (TARGET_THUMB)
addr = expand_simple_binop (Pmode, IOR, addr, const1_rtx,
NULL, 0, OPTAB_LIB_WIDEN);
return addr;
}
/* Return 1 if it is possible to return using a single instruction.
If SIBLING is non-null, this is a test for a return before a sibling
call. SIBLING is the call insn, so we can examine its register usage. */
int
use_return_insn (int iscond, rtx sibling)
{
int regno;
unsigned int func_type;
unsigned long saved_int_regs;
unsigned HOST_WIDE_INT stack_adjust;
arm_stack_offsets *offsets;
/* Never use a return instruction before reload has run. */
if (!reload_completed)
return 0;
func_type = arm_current_func_type ();
/* Naked, volatile and stack alignment functions need special
consideration. */
if (func_type & (ARM_FT_VOLATILE | ARM_FT_NAKED | ARM_FT_STACKALIGN))
return 0;
/* So do interrupt functions that use the frame pointer and Thumb
interrupt functions. */
if (IS_INTERRUPT (func_type) && (frame_pointer_needed || TARGET_THUMB))
return 0;
if (TARGET_LDRD && current_tune->prefer_ldrd_strd
&& !optimize_function_for_size_p (cfun))
return 0;
offsets = arm_get_frame_offsets ();
stack_adjust = offsets->outgoing_args - offsets->saved_regs;
/* As do variadic functions. */
if (crtl->args.pretend_args_size
|| cfun->machine->uses_anonymous_args
/* Or if the function calls __builtin_eh_return () */
|| crtl->calls_eh_return
/* Or if the function calls alloca */
|| cfun->calls_alloca
/* Or if there is a stack adjustment. However, if the stack pointer
is saved on the stack, we can use a pre-incrementing stack load. */
|| !(stack_adjust == 0 || (TARGET_APCS_FRAME && frame_pointer_needed
&& stack_adjust == 4)))
return 0;
saved_int_regs = offsets->saved_regs_mask;
/* Unfortunately, the insn
ldmib sp, {..., sp, ...}
triggers a bug on most SA-110 based devices, such that the stack
pointer won't be correctly restored if the instruction takes a
page fault. We work around this problem by popping r3 along with
the other registers, since that is never slower than executing
another instruction.
We test for !arm_arch5 here, because code for any architecture
less than this could potentially be run on one of the buggy
chips. */
if (stack_adjust == 4 && !arm_arch5 && TARGET_ARM)
{
/* Validate that r3 is a call-clobbered register (always true in
the default abi) ... */
if (!call_used_regs[3])
return 0;
/* ... that it isn't being used for a return value ... */
if (arm_size_return_regs () >= (4 * UNITS_PER_WORD))
return 0;
/* ... or for a tail-call argument ... */
if (sibling)
{
gcc_assert (CALL_P (sibling));
if (find_regno_fusage (sibling, USE, 3))
return 0;
}
/* ... and that there are no call-saved registers in r0-r2
(always true in the default ABI). */
if (saved_int_regs & 0x7)
return 0;
}
/* Can't be done if interworking with Thumb, and any registers have been
stacked. */
if (TARGET_INTERWORK && saved_int_regs != 0 && !IS_INTERRUPT(func_type))
return 0;
/* On StrongARM, conditional returns are expensive if they aren't
taken and multiple registers have been stacked. */
if (iscond && arm_tune_strongarm)
{
/* Conditional return when just the LR is stored is a simple
conditional-load instruction, that's not expensive. */
if (saved_int_regs != 0 && saved_int_regs != (1 << LR_REGNUM))
return 0;
if (flag_pic
&& arm_pic_register != INVALID_REGNUM
&& df_regs_ever_live_p (PIC_OFFSET_TABLE_REGNUM))
return 0;
}
/* If there are saved registers but the LR isn't saved, then we need
two instructions for the return. */
if (saved_int_regs && !(saved_int_regs & (1 << LR_REGNUM)))
return 0;
/* Can't be done if any of the VFP regs are pushed,
since this also requires an insn. */
if (TARGET_HARD_FLOAT && TARGET_VFP)
for (regno = FIRST_VFP_REGNUM; regno <= LAST_VFP_REGNUM; regno++)
if (df_regs_ever_live_p (regno) && !call_used_regs[regno])
return 0;
if (TARGET_REALLY_IWMMXT)
for (regno = FIRST_IWMMXT_REGNUM; regno <= LAST_IWMMXT_REGNUM; regno++)
if (df_regs_ever_live_p (regno) && ! call_used_regs[regno])
return 0;
return 1;
}
/* Return TRUE if we should try to use a simple_return insn, i.e. perform
shrink-wrapping if possible. This is the case if we need to emit a
prologue, which we can test by looking at the offsets. */
bool
use_simple_return_p (void)
{
arm_stack_offsets *offsets;
offsets = arm_get_frame_offsets ();
return offsets->outgoing_args != 0;
}
/* Return TRUE if int I is a valid immediate ARM constant. */
int
const_ok_for_arm (HOST_WIDE_INT i)
{
int lowbit;
/* For machines with >32 bit HOST_WIDE_INT, the bits above bit 31 must
be all zero, or all one. */
if ((i & ~(unsigned HOST_WIDE_INT) 0xffffffff) != 0
&& ((i & ~(unsigned HOST_WIDE_INT) 0xffffffff)
!= ((~(unsigned HOST_WIDE_INT) 0)
& ~(unsigned HOST_WIDE_INT) 0xffffffff)))
return FALSE;
i &= (unsigned HOST_WIDE_INT) 0xffffffff;
/* Fast return for 0 and small values. We must do this for zero, since
the code below can't handle that one case. */
if ((i & ~(unsigned HOST_WIDE_INT) 0xff) == 0)
return TRUE;
/* Get the number of trailing zeros. */
lowbit = ffs((int) i) - 1;
/* Only even shifts are allowed in ARM mode so round down to the
nearest even number. */
if (TARGET_ARM)
lowbit &= ~1;
if ((i & ~(((unsigned HOST_WIDE_INT) 0xff) << lowbit)) == 0)
return TRUE;
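/* For example, 0xf000000f is a valid ARM immediate: it is 0xff rotated
   right by four bits, and is accepted by the masks below.  */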
if (TARGET_ARM)
{
/* Allow rotated constants in ARM mode. */
if (lowbit <= 4
&& ((i & ~0xc000003f) == 0
|| (i & ~0xf000000f) == 0
|| (i & ~0xfc000003) == 0))
return TRUE;
}
else
{
HOST_WIDE_INT v;
/* Allow repeated patterns 0x00XY00XY or 0xXYXYXYXY. */
v = i & 0xff;
v |= v << 16;
if (i == v || i == (v | (v << 8)))
return TRUE;
/* Allow repeated pattern 0xXY00XY00. */
v = i & 0xff00;
v |= v << 16;
if (i == v)
return TRUE;
}
return FALSE;
}
/* Return true if I is a valid constant for the operation CODE. */
int
const_ok_for_op (HOST_WIDE_INT i, enum rtx_code code)
{
if (const_ok_for_arm (i))
return 1;
switch (code)
{
case SET:
/* See if we can use movw. */
if (arm_arch_thumb2 && (i & 0xffff0000) == 0)
return 1;
else
/* Otherwise, try mvn. */
return const_ok_for_arm (ARM_SIGN_EXTEND (~i));
case PLUS:
/* See if we can use addw or subw. */
if (TARGET_THUMB2
&& ((i & 0xfffff000) == 0
|| ((-i) & 0xfffff000) == 0))
return 1;
/* else fall through. */
case COMPARE:
case EQ:
case NE:
case GT:
case LE:
case LT:
case GE:
case GEU:
case LTU:
case GTU:
case LEU:
case UNORDERED:
case ORDERED:
case UNEQ:
case UNGE:
case UNLT:
case UNGT:
case UNLE:
return const_ok_for_arm (ARM_SIGN_EXTEND (-i));
case MINUS: /* Should only occur with (MINUS I reg) => rsb */
case XOR:
return 0;
case IOR:
if (TARGET_THUMB2)
return const_ok_for_arm (ARM_SIGN_EXTEND (~i));
return 0;
case AND:
return const_ok_for_arm (ARM_SIGN_EXTEND (~i));
default:
gcc_unreachable ();
}
}
/* Return true if I is a valid di mode constant for the operation CODE. */
int
const_ok_for_dimode_op (HOST_WIDE_INT i, enum rtx_code code)
{
HOST_WIDE_INT hi_val = (i >> 32) & 0xFFFFFFFF;
HOST_WIDE_INT lo_val = i & 0xFFFFFFFF;
rtx hi = GEN_INT (hi_val);
rtx lo = GEN_INT (lo_val);
if (TARGET_THUMB1)
return 0;
switch (code)
{
case AND:
case IOR:
case XOR:
return (const_ok_for_op (hi_val, code) || hi_val == 0xFFFFFFFF)
&& (const_ok_for_op (lo_val, code) || lo_val == 0xFFFFFFFF);
case PLUS:
return arm_not_operand (hi, SImode) && arm_add_operand (lo, SImode);
default:
return 0;
}
}
/* Emit a sequence of insns to handle a large constant.
CODE is the code of the operation required, it can be any of SET, PLUS,
IOR, AND, XOR, MINUS;
MODE is the mode in which the operation is being performed;
VAL is the integer to operate on;
SOURCE is the other operand (a register, or a null-pointer for SET);
SUBTARGETS means it is safe to create scratch registers if that will
either produce a simpler sequence, or we will want to cse the values.
Return value is the number of insns emitted. */
/* ??? Tweak this for thumb2. */
int
arm_split_constant (enum rtx_code code, machine_mode mode, rtx insn,
HOST_WIDE_INT val, rtx target, rtx source, int subtargets)
{
rtx cond;
if (insn && GET_CODE (PATTERN (insn)) == COND_EXEC)
cond = COND_EXEC_TEST (PATTERN (insn));
else
cond = NULL_RTX;
if (subtargets || code == SET
|| (REG_P (target) && REG_P (source)
&& REGNO (target) != REGNO (source)))
{
/* After arm_reorg has been called, we can't fix up expensive
constants by pushing them into memory so we must synthesize
them in-line, regardless of the cost. This is only likely to
be more costly on chips that have load delay slots and we are
compiling without running the scheduler (so no splitting
occurred before the final instruction emission).
Ref: gcc -O1 -mcpu=strongarm gcc.c-torture/compile/980506-2.c
*/
if (!cfun->machine->after_arm_reorg
&& !cond
&& (arm_gen_constant (code, mode, NULL_RTX, val, target, source,
1, 0)
> (arm_constant_limit (optimize_function_for_size_p (cfun))
+ (code != SET))))
{
if (code == SET)
{
/* Currently SET is the only monadic value for CODE; all
the rest are dyadic.  */
if (TARGET_USE_MOVT)
arm_emit_movpair (target, GEN_INT (val));
else
emit_set_insn (target, GEN_INT (val));
return 1;
}
else
{
rtx temp = subtargets ? gen_reg_rtx (mode) : target;
if (TARGET_USE_MOVT)
arm_emit_movpair (temp, GEN_INT (val));
else
emit_set_insn (temp, GEN_INT (val));
/* For MINUS, the value is subtracted from, since we never
have subtraction of a constant. */
if (code == MINUS)
emit_set_insn (target, gen_rtx_MINUS (mode, temp, source));
else
emit_set_insn (target,
gen_rtx_fmt_ee (code, mode, source, temp));
return 2;
}
}
}
return arm_gen_constant (code, mode, cond, val, target, source, subtargets,
1);
}
/* Return a sequence of integers, in RETURN_SEQUENCE, that fit into
ARM/THUMB2 immediates and add up to VAL.
The function's return value gives the number of insns required. */
static int
optimal_immediate_sequence (enum rtx_code code, unsigned HOST_WIDE_INT val,
struct four_ints *return_sequence)
{
int best_consecutive_zeros = 0;
int i;
int best_start = 0;
int insns1, insns2;
struct four_ints tmp_sequence;
/* If we aren't targeting ARM, the best place to start is always at
the bottom, otherwise look more closely. */
if (TARGET_ARM)
{
for (i = 0; i < 32; i += 2)
{
int consecutive_zeros = 0;
if (!(val & (3 << i)))
{
while ((i < 32) && !(val & (3 << i)))
{
consecutive_zeros += 2;
i += 2;
}
if (consecutive_zeros > best_consecutive_zeros)
{
best_consecutive_zeros = consecutive_zeros;
best_start = i - consecutive_zeros;
}
i -= 2;
}
}
}
/* So long as it won't require any more insns to do so, it's
desirable to emit a small constant (in bits 0...9) in the last
insn. This way there is more chance that it can be combined with
a later addressing insn to form a pre-indexed load or store
operation. Consider:
*((volatile int *)0xe0000100) = 1;
*((volatile int *)0xe0000110) = 2;
We want this to wind up as:
mov rA, #0xe0000000
mov rB, #1
str rB, [rA, #0x100]
mov rB, #2
str rB, [rA, #0x110]
rather than having to synthesize both large constants from scratch.
Therefore, we calculate how many insns would be required to emit
the constant starting from `best_start', and also starting from
zero (i.e. with bit 31 first to be output). If `best_start' doesn't
yield a shorter sequence, we may as well use zero. */
insns1 = optimal_immediate_sequence_1 (code, val, return_sequence, best_start);
if (best_start != 0
&& ((((unsigned HOST_WIDE_INT) 1) << best_start) < val))
{
insns2 = optimal_immediate_sequence_1 (code, val, &tmp_sequence, 0);
if (insns2 <= insns1)
{
*return_sequence = tmp_sequence;
insns1 = insns2;
}
}
return insns1;
}
/* As for optimal_immediate_sequence, but starting at bit-position I. */
static int
optimal_immediate_sequence_1 (enum rtx_code code, unsigned HOST_WIDE_INT val,
struct four_ints *return_sequence, int i)
{
int remainder = val & 0xffffffff;
int insns = 0;
/* Try and find a way of doing the job in either two or three
instructions.
In ARM mode we can use 8-bit constants, rotated to any 2-bit aligned
location. We start at position I. This may be the MSB, or
optimal_immediate_sequence may have positioned it at the largest block
of zeros that are aligned on a 2-bit boundary. We then fill up the temps,
wrapping around to the top of the word when we drop off the bottom.
In the worst case this code should produce no more than four insns.
In Thumb2 mode, we can use 32/16-bit replicated constants, and 8-bit
constants, shifted to any arbitrary location. We should always start
at the MSB. */
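/* For example, in ARM mode the SET constant 0x000F0F00 is too wide for a
single rotated 8-bit immediate, but it splits into 0x000F0000 plus
0x00000F00, each of which is valid, giving a two-instruction sequence. */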
do
{
int end;
unsigned int b1, b2, b3, b4;
unsigned HOST_WIDE_INT result;
int loc;
gcc_assert (insns < 4);
if (i <= 0)
i += 32;
/* First, find the next normal 12/8-bit shifted/rotated immediate. */
if (remainder & ((TARGET_ARM ? (3 << (i - 2)) : (1 << (i - 1)))))
{
loc = i;
if (i <= 12 && TARGET_THUMB2 && code == PLUS)
/* We can use addw/subw for the last 12 bits. */
result = remainder;
else
{
/* Use an 8-bit shifted/rotated immediate. */
end = i - 8;
if (end < 0)
end += 32;
result = remainder & ((0x0ff << end)
| ((i < end) ? (0xff >> (32 - end))
: 0));
i -= 8;
}
}
else
{
/* Arm allows rotates by a multiple of two. Thumb-2 allows
arbitrary shifts. */
i -= TARGET_ARM ? 2 : 1;
continue;
}
/* Next, see if we can do a better job with a thumb2 replicated
constant.
We do it this way around to catch the cases like 0x01F001E0 where
two 8-bit immediates would work, but a replicated constant would
make it worse.
TODO: 16-bit constants that don't clear all the bits, but still win.
TODO: Arithmetic splitting for set/add/sub, rather than bitwise. */
if (TARGET_THUMB2)
{
b1 = (remainder & 0xff000000) >> 24;
b2 = (remainder & 0x00ff0000) >> 16;
b3 = (remainder & 0x0000ff00) >> 8;
b4 = remainder & 0xff;
if (loc > 24)
{
/* The 8-bit immediate already found clears b1 (and maybe b2),
but must leave b3 and b4 alone. */
/* First try to find a 32-bit replicated constant that clears
almost everything. We can assume that we can't do it in one,
or else we wouldn't be here. */
unsigned int tmp = b1 & b2 & b3 & b4;
unsigned int tmp2 = tmp + (tmp << 8) + (tmp << 16)
+ (tmp << 24);
unsigned int matching_bytes = (tmp == b1) + (tmp == b2)
+ (tmp == b3) + (tmp == b4);
if (tmp
&& (matching_bytes >= 3
|| (matching_bytes == 2
&& const_ok_for_op (remainder & ~tmp2, code))))
{
/* At least 3 of the bytes match, and the fourth has at
least as many bits set, or two of the bytes match
and it will only require one more insn to finish. */
result = tmp2;
i = tmp != b1 ? 32
: tmp != b2 ? 24
: tmp != b3 ? 16
: 8;
}
/* Second, try to find a 16-bit replicated constant that can
leave three of the bytes clear. If b2 or b4 is already
zero, then we can. If the 8-bit from above would not
clear b2 anyway, then we still win. */
else if (b1 == b3 && (!b2 || !b4
|| (remainder & 0x00ff0000 & ~result)))
{
result = remainder & 0xff00ff00;
i = 24;
}
}
else if (loc > 16)
{
/* The 8-bit immediate already found clears b2 (and maybe b3)
and we don't get here unless b1 is already clear, but it will
leave b4 unchanged. */
/* If we can clear b2 and b4 at once, then we win, since the
8-bits couldn't possibly reach that far. */
if (b2 == b4)
{
result = remainder & 0x00ff00ff;
i = 16;
}
}
}
return_sequence->i[insns++] = result;
remainder &= ~result;
if (code == SET || code == MINUS)
code = PLUS;
}
while (remainder);
return insns;
}
/* Emit an instruction with the indicated PATTERN. If COND is
non-NULL, conditionalize the execution of the instruction on COND
being true. */
static void
emit_constant_insn (rtx cond, rtx pattern)
{
if (cond)
pattern = gen_rtx_COND_EXEC (VOIDmode, copy_rtx (cond), pattern);
emit_insn (pattern);
}
/* As above, but extra parameter GENERATE which, if clear, suppresses
RTL generation. */
static int
arm_gen_constant (enum rtx_code code, machine_mode mode, rtx cond,
HOST_WIDE_INT val, rtx target, rtx source, int subtargets,
int generate)
{
int can_invert = 0;
int can_negate = 0;
int final_invert = 0;
int i;
int set_sign_bit_copies = 0;
int clear_sign_bit_copies = 0;
int clear_zero_bit_copies = 0;
int set_zero_bit_copies = 0;
int insns = 0, neg_insns, inv_insns;
unsigned HOST_WIDE_INT temp1, temp2;
unsigned HOST_WIDE_INT remainder = val & 0xffffffff;
struct four_ints *immediates;
struct four_ints pos_immediates, neg_immediates, inv_immediates;
/* Find out which operations are safe for a given CODE. Also do a quick
check for degenerate cases; these can occur when DImode operations
are split. */
switch (code)
{
case SET:
can_invert = 1;
break;
case PLUS:
can_negate = 1;
break;
case IOR:
if (remainder == 0xffffffff)
{
if (generate)
emit_constant_insn (cond,
gen_rtx_SET (VOIDmode, target,
GEN_INT (ARM_SIGN_EXTEND (val))));
return 1;
}
if (remainder == 0)
{
if (reload_completed && rtx_equal_p (target, source))
return 0;
if (generate)
emit_constant_insn (cond,
gen_rtx_SET (VOIDmode, target, source));
return 1;
}
break;
case AND:
if (remainder == 0)
{
if (generate)
emit_constant_insn (cond,
gen_rtx_SET (VOIDmode, target, const0_rtx));
return 1;
}
if (remainder == 0xffffffff)
{
if (reload_completed && rtx_equal_p (target, source))
return 0;
if (generate)
emit_constant_insn (cond,
gen_rtx_SET (VOIDmode, target, source));
return 1;
}
can_invert = 1;
break;
case XOR:
if (remainder == 0)
{
if (reload_completed && rtx_equal_p (target, source))
return 0;
if (generate)
emit_constant_insn (cond,
gen_rtx_SET (VOIDmode, target, source));
return 1;
}
if (remainder == 0xffffffff)
{
if (generate)
emit_constant_insn (cond,
gen_rtx_SET (VOIDmode, target,
gen_rtx_NOT (mode, source)));
return 1;
}
final_invert = 1;
break;
case MINUS:
/* We treat MINUS as (val - source), since (source - val) is always
passed as (source + (-val)). */
if (remainder == 0)
{
if (generate)
emit_constant_insn (cond,
gen_rtx_SET (VOIDmode, target,
gen_rtx_NEG (mode, source)));
return 1;
}
if (const_ok_for_arm (val))
{
if (generate)
emit_constant_insn (cond,
gen_rtx_SET (VOIDmode, target,
gen_rtx_MINUS (mode, GEN_INT (val),
source)));
return 1;
}
break;
default:
gcc_unreachable ();
}
/* If we can do it in one insn get out quickly. */
if (const_ok_for_op (val, code))
{
if (generate)
emit_constant_insn (cond,
gen_rtx_SET (VOIDmode, target,
(source
? gen_rtx_fmt_ee (code, mode, source,
GEN_INT (val))
: GEN_INT (val))));
return 1;
}
/* On targets with UXTH/UBFX, we can deal with AND (2^N)-1 in a single
insn. */
if (code == AND && (i = exact_log2 (remainder + 1)) > 0
&& (arm_arch_thumb2 || (i == 16 && arm_arch6 && mode == SImode)))
{
if (generate)
{
if (mode == SImode && i == 16)
/* Use UXTH in preference to UBFX, since on Thumb2 it's a
smaller insn. */
emit_constant_insn (cond,
gen_zero_extendhisi2
(target, gen_lowpart (HImode, source)));
else
/* The extzv_t2 pattern only supports SImode, but we can coerce the operands
into that mode. */
emit_constant_insn (cond,
gen_extzv_t2 (gen_lowpart (SImode, target),
gen_lowpart (SImode, source),
GEN_INT (i), const0_rtx));
}
return 1;
}
/* Calculate a few attributes that may be useful for specific
optimizations. */
/* Count number of leading zeros. */
for (i = 31; i >= 0; i--)
{
if ((remainder & (1 << i)) == 0)
clear_sign_bit_copies++;
else
break;
}
/* Count number of leading 1's. */
for (i = 31; i >= 0; i--)
{
if ((remainder & (1 << i)) != 0)
set_sign_bit_copies++;
else
break;
}
/* Count number of trailing zeros. */
for (i = 0; i <= 31; i++)
{
if ((remainder & (1 << i)) == 0)
clear_zero_bit_copies++;
else
break;
}
/* Count number of trailing 1's. */
for (i = 0; i <= 31; i++)
{
if ((remainder & (1 << i)) != 0)
set_zero_bit_copies++;
else
break;
}
switch (code)
{
case SET:
/* See if we can do this by sign_extending a constant that is known
to be negative. This is a good way of doing it, since the shift
may well merge into a subsequent insn. */
if (set_sign_bit_copies > 1)
{
if (const_ok_for_arm
(temp1 = ARM_SIGN_EXTEND (remainder
<< (set_sign_bit_copies - 1))))
{
if (generate)
{
rtx new_src = subtargets ? gen_reg_rtx (mode) : target;
emit_constant_insn (cond,
gen_rtx_SET (VOIDmode, new_src,
GEN_INT (temp1)));
emit_constant_insn (cond,
gen_ashrsi3 (target, new_src,
GEN_INT (set_sign_bit_copies - 1)));
}
return 2;
}
/* For an inverted constant, we will need to set the low bits,
these will be shifted out of harm's way. */
temp1 |= (1 << (set_sign_bit_copies - 1)) - 1;
if (const_ok_for_arm (~temp1))
{
if (generate)
{
rtx new_src = subtargets ? gen_reg_rtx (mode) : target;
emit_constant_insn (cond,
gen_rtx_SET (VOIDmode, new_src,
GEN_INT (temp1)));
emit_constant_insn (cond,
gen_ashrsi3 (target, new_src,
GEN_INT (set_sign_bit_copies - 1)));
}
return 2;
}
}
/* See if we can calculate the value as the difference between two
valid immediates. */
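/* E.g. 0x0001ffff is not a valid immediate, but both 0x00020000 and 1 are,
so the value can be built as a mov of 0x20000 followed by a subtract
of 1. */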
if (clear_sign_bit_copies + clear_zero_bit_copies <= 16)
{
int topshift = clear_sign_bit_copies & ~1;
temp1 = ARM_SIGN_EXTEND ((remainder + (0x00800000 >> topshift))
& (0xff000000 >> topshift));
/* If temp1 is zero, then that means the 9 most significant
bits of remainder were 1 and we've caused it to overflow.
When topshift is 0 we don't need to do anything since we
can borrow from 'bit 32'. */
if (temp1 == 0 && topshift != 0)
temp1 = 0x80000000 >> (topshift - 1);
temp2 = ARM_SIGN_EXTEND (temp1 - remainder);
if (const_ok_for_arm (temp2))
{
if (generate)
{
rtx new_src = subtargets ? gen_reg_rtx (mode) : target;
emit_constant_insn (cond,
gen_rtx_SET (VOIDmode, new_src,
GEN_INT (temp1)));
emit_constant_insn (cond,
gen_addsi3 (target, new_src,
GEN_INT (-temp2)));
}
return 2;
}
}
/* See if we can generate this by setting the bottom (or the top)
16 bits, and then shifting these into the other half of the
word. We only look for the simplest cases, to do more would cost
too much. Be careful, however, not to generate this when the
alternative would take fewer insns. */
if (val & 0xffff0000)
{
temp1 = remainder & 0xffff0000;
temp2 = remainder & 0x0000ffff;
/* Overlaps outside this range are best done using other methods. */
for (i = 9; i < 24; i++)
{
if ((((temp2 | (temp2 << i)) & 0xffffffff) == remainder)
&& !const_ok_for_arm (temp2))
{
rtx new_src = (subtargets
? (generate ? gen_reg_rtx (mode) : NULL_RTX)
: target);
insns = arm_gen_constant (code, mode, cond, temp2, new_src,
source, subtargets, generate);
source = new_src;
if (generate)
emit_constant_insn
(cond,
gen_rtx_SET
(VOIDmode, target,
gen_rtx_IOR (mode,
gen_rtx_ASHIFT (mode, source,
GEN_INT (i)),
source)));
return insns + 1;
}
}
/* Don't duplicate cases already considered. */
for (i = 17; i < 24; i++)
{
if (((temp1 | (temp1 >> i)) == remainder)
&& !const_ok_for_arm (temp1))
{
rtx new_src = (subtargets
? (generate ? gen_reg_rtx (mode) : NULL_RTX)
: target);
insns = arm_gen_constant (code, mode, cond, temp1, new_src,
source, subtargets, generate);
source = new_src;
if (generate)
emit_constant_insn
(cond,
gen_rtx_SET (VOIDmode, target,
gen_rtx_IOR
(mode,
gen_rtx_LSHIFTRT (mode, source,
GEN_INT (i)),
source)));
return insns + 1;
}
}
}
break;
case IOR:
case XOR:
/* If we have IOR or XOR, and the constant can be loaded in a
single instruction, and we can find a temporary to put it in,
then this can be done in two instructions instead of 3-4. */
if (subtargets
/* TARGET can't be NULL if SUBTARGETS is 0 */
|| (reload_completed && !reg_mentioned_p (target, source)))
{
if (const_ok_for_arm (ARM_SIGN_EXTEND (~val)))
{
if (generate)
{
rtx sub = subtargets ? gen_reg_rtx (mode) : target;
emit_constant_insn (cond,
gen_rtx_SET (VOIDmode, sub,
GEN_INT (val)));
emit_constant_insn (cond,
gen_rtx_SET (VOIDmode, target,
gen_rtx_fmt_ee (code, mode,
source, sub)));
}
return 2;
}
}
if (code == XOR)
break;
/* Convert.
x = y | constant (which is composed of set_sign_bit_copies leading 1s
followed by 0s, e.g. 0xfff00000)
x = ~(~(y ashift set_sign_bit_copies) lshiftrt set_sign_bit_copies)
This can be done in 2 instructions by using shifts with mov or mvn.
e.g. for
x = x | 0xfff00000;
we generate.
mvn r0, r0, asl #12
mvn r0, r0, lsr #12 */
if (set_sign_bit_copies > 8
&& (val & (-1 << (32 - set_sign_bit_copies))) == val)
{
if (generate)
{
rtx sub = subtargets ? gen_reg_rtx (mode) : target;
rtx shift = GEN_INT (set_sign_bit_copies);
emit_constant_insn
(cond,
gen_rtx_SET (VOIDmode, sub,
gen_rtx_NOT (mode,
gen_rtx_ASHIFT (mode,
source,
shift))));
emit_constant_insn
(cond,
gen_rtx_SET (VOIDmode, target,
gen_rtx_NOT (mode,
gen_rtx_LSHIFTRT (mode, sub,
shift))));
}
return 2;
}
/* Convert
x = y | constant (which has set_zero_bit_copies number of trailing ones).
to
x = ~((~y lshiftrt set_zero_bit_copies) ashift set_zero_bit_copies).
E.g. for r0 = r0 | 0xfff:
mvn r0, r0, lsr #12
mvn r0, r0, asl #12
*/
if (set_zero_bit_copies > 8
&& (remainder & ((1 << set_zero_bit_copies) - 1)) == remainder)
{
if (generate)
{
rtx sub = subtargets ? gen_reg_rtx (mode) : target;
rtx shift = GEN_INT (set_zero_bit_copies);
emit_constant_insn
(cond,
gen_rtx_SET (VOIDmode, sub,
gen_rtx_NOT (mode,
gen_rtx_LSHIFTRT (mode,
source,
shift))));
emit_constant_insn
(cond,
gen_rtx_SET (VOIDmode, target,
gen_rtx_NOT (mode,
gen_rtx_ASHIFT (mode, sub,
shift))));
}
return 2;
}
/* This will never be reached for Thumb2 because orn is a valid
instruction. This is for Thumb1 and the ARM 32 bit cases.
x = y | constant (such that ~constant is a valid constant)
Transform this to
x = ~(~y & ~constant).
*/
if (const_ok_for_arm (temp1 = ARM_SIGN_EXTEND (~val)))
{
if (generate)
{
rtx sub = subtargets ? gen_reg_rtx (mode) : target;
emit_constant_insn (cond,
gen_rtx_SET (VOIDmode, sub,
gen_rtx_NOT (mode, source)));
source = sub;
if (subtargets)
sub = gen_reg_rtx (mode);
emit_constant_insn (cond,
gen_rtx_SET (VOIDmode, sub,
gen_rtx_AND (mode, source,
GEN_INT (temp1))));
emit_constant_insn (cond,
gen_rtx_SET (VOIDmode, target,
gen_rtx_NOT (mode, sub)));
}
return 3;
}
break;
case AND:
/* See if two shifts will do 2 or more insns' worth of work. */
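/* For example, when the UXTH/UBFX path above is unavailable, x & 0xffff can
be done as a left shift by 16 followed by a right shift by 16, with no
constant-building insns at all. */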
if (clear_sign_bit_copies >= 16 && clear_sign_bit_copies < 24)
{
HOST_WIDE_INT shift_mask = ((0xffffffff
<< (32 - clear_sign_bit_copies))
& 0xffffffff);
if ((remainder | shift_mask) != 0xffffffff)
{
HOST_WIDE_INT new_val
= ARM_SIGN_EXTEND (remainder | shift_mask);
if (generate)
{
rtx new_src = subtargets ? gen_reg_rtx (mode) : target;
insns = arm_gen_constant (AND, SImode, cond, new_val,
new_src, source, subtargets, 1);
source = new_src;
}
else
{
rtx targ = subtargets ? NULL_RTX : target;
insns = arm_gen_constant (AND, mode, cond, new_val,
targ, source, subtargets, 0);
}
}
if (generate)
{
rtx new_src = subtargets ? gen_reg_rtx (mode) : target;
rtx shift = GEN_INT (clear_sign_bit_copies);
emit_insn (gen_ashlsi3 (new_src, source, shift));
emit_insn (gen_lshrsi3 (target, new_src, shift));
}
return insns + 2;
}
if (clear_zero_bit_copies >= 16 && clear_zero_bit_copies < 24)
{
HOST_WIDE_INT shift_mask = (1 << clear_zero_bit_copies) - 1;
if ((remainder | shift_mask) != 0xffffffff)
{
HOST_WIDE_INT new_val
= ARM_SIGN_EXTEND (remainder | shift_mask);
if (generate)
{
rtx new_src = subtargets ? gen_reg_rtx (mode) : target;
insns = arm_gen_constant (AND, mode, cond, new_val,
new_src, source, subtargets, 1);
source = new_src;
}
else
{
rtx targ = subtargets ? NULL_RTX : target;
insns = arm_gen_constant (AND, mode, cond, new_val,
targ, source, subtargets, 0);
}
}
if (generate)
{
rtx new_src = subtargets ? gen_reg_rtx (mode) : target;
rtx shift = GEN_INT (clear_zero_bit_copies);
emit_insn (gen_lshrsi3 (new_src, source, shift));
emit_insn (gen_ashlsi3 (target, new_src, shift));
}
return insns + 2;
}
break;
default:
break;
}
/* Calculate what the instruction sequences would be if we generated it
normally, negated, or inverted. */
if (code == AND)
/* AND cannot be split into multiple insns, so invert and use BIC. */
insns = 99;
else
insns = optimal_immediate_sequence (code, remainder, &pos_immediates);
if (can_negate)
neg_insns = optimal_immediate_sequence (code, (-remainder) & 0xffffffff,
&neg_immediates);
else
neg_insns = 99;
if (can_invert || final_invert)
inv_insns = optimal_immediate_sequence (code, remainder ^ 0xffffffff,
&inv_immediates);
else
inv_insns = 99;
immediates = &pos_immediates;
/* Is the negated immediate sequence more efficient? */
if (neg_insns < insns && neg_insns <= inv_insns)
{
insns = neg_insns;
immediates = &neg_immediates;
}
else
can_negate = 0;
/* Is the inverted immediate sequence more efficient?
We must allow for an extra NOT instruction for XOR operations, although
there is some chance that the final 'mvn' will get optimized later. */
if ((inv_insns + 1) < insns || (!final_invert && inv_insns < insns))
{
insns = inv_insns;
immediates = &inv_immediates;
}
else
{
can_invert = 0;
final_invert = 0;
}
/* Now output the chosen sequence as instructions. */
if (generate)
{
for (i = 0; i < insns; i++)
{
rtx new_src, temp1_rtx;
temp1 = immediates->i[i];
if (code == SET || code == MINUS)
new_src = (subtargets ? gen_reg_rtx (mode) : target);
else if ((final_invert || i < (insns - 1)) && subtargets)
new_src = gen_reg_rtx (mode);
else
new_src = target;
if (can_invert)
temp1 = ~temp1;
else if (can_negate)
temp1 = -temp1;
temp1 = trunc_int_for_mode (temp1, mode);
temp1_rtx = GEN_INT (temp1);
if (code == SET)
;
else if (code == MINUS)
temp1_rtx = gen_rtx_MINUS (mode, temp1_rtx, source);
else
temp1_rtx = gen_rtx_fmt_ee (code, mode, source, temp1_rtx);
emit_constant_insn (cond,
gen_rtx_SET (VOIDmode, new_src,
temp1_rtx));
source = new_src;
if (code == SET)
{
can_negate = can_invert;
can_invert = 0;
code = PLUS;
}
else if (code == MINUS)
code = PLUS;
}
}
if (final_invert)
{
if (generate)
emit_constant_insn (cond, gen_rtx_SET (VOIDmode, target,
gen_rtx_NOT (mode, source)));
insns++;
}
return insns;
}
/* Canonicalize a comparison so that we are more likely to recognize it.
This can be done for a few constant compares, where we can make the
immediate value easier to load. */
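/* For example, (x > 0xffffff) becomes (x >= 0x1000000): 0x1000000 is a valid
immediate, whereas neither 0xffffff nor its negation is. */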
static void
arm_canonicalize_comparison (int *code, rtx *op0, rtx *op1,
bool op0_preserve_value)
{
machine_mode mode;
unsigned HOST_WIDE_INT i, maxval;
mode = GET_MODE (*op0);
if (mode == VOIDmode)
mode = GET_MODE (*op1);
maxval = (((unsigned HOST_WIDE_INT) 1) << (GET_MODE_BITSIZE(mode) - 1)) - 1;
/* For DImode, we have GE/LT/GEU/LTU comparisons. In ARM mode
we can also use cmp/cmpeq for GTU/LEU. GT/LE must be either
reversed or (for constant OP1) adjusted to GE/LT. Similarly
for GTU/LEU in Thumb mode. */
if (mode == DImode)
{
if (*code == GT || *code == LE
|| (!TARGET_ARM && (*code == GTU || *code == LEU)))
{
/* Missing comparison. First try to use an available
comparison. */
if (CONST_INT_P (*op1))
{
i = INTVAL (*op1);
switch (*code)
{
case GT:
case LE:
if (i != maxval
&& arm_const_double_by_immediates (GEN_INT (i + 1)))
{
*op1 = GEN_INT (i + 1);
*code = *code == GT ? GE : LT;
return;
}
break;
case GTU:
case LEU:
if (i != ~((unsigned HOST_WIDE_INT) 0)
&& arm_const_double_by_immediates (GEN_INT (i + 1)))
{
*op1 = GEN_INT (i + 1);
*code = *code == GTU ? GEU : LTU;
return;
}
break;
default:
gcc_unreachable ();
}
}
/* If that did not work, reverse the condition. */
if (!op0_preserve_value)
{
std::swap (*op0, *op1);
*code = (int)swap_condition ((enum rtx_code)*code);
}
}
return;
}
/* If *op0 is (zero_extend:SI (subreg:QI (reg:SI) 0)) and comparing
with const0_rtx, change it to (and:SI (reg:SI) (const_int 255)),
to facilitate possible combining with a cmp into 'ands'. */
if (mode == SImode
&& GET_CODE (*op0) == ZERO_EXTEND
&& GET_CODE (XEXP (*op0, 0)) == SUBREG
&& GET_MODE (XEXP (*op0, 0)) == QImode
&& GET_MODE (SUBREG_REG (XEXP (*op0, 0))) == SImode
&& subreg_lowpart_p (XEXP (*op0, 0))
&& *op1 == const0_rtx)
*op0 = gen_rtx_AND (SImode, SUBREG_REG (XEXP (*op0, 0)),
GEN_INT (255));
/* Comparisons smaller than DImode. Only adjust comparisons against
an out-of-range constant. */
if (!CONST_INT_P (*op1)
|| const_ok_for_arm (INTVAL (*op1))
|| const_ok_for_arm (- INTVAL (*op1)))
return;
i = INTVAL (*op1);
switch (*code)
{
case EQ:
case NE:
return;
case GT:
case LE:
if (i != maxval
&& (const_ok_for_arm (i + 1) || const_ok_for_arm (-(i + 1))))
{
*op1 = GEN_INT (ARM_SIGN_EXTEND (i + 1));
*code = *code == GT ? GE : LT;
return;
}
break;
case GE:
case LT:
if (i != ~maxval
&& (const_ok_for_arm (i - 1) || const_ok_for_arm (-(i - 1))))
{
*op1 = GEN_INT (i - 1);
*code = *code == GE ? GT : LE;
return;
}
break;
case GTU:
case LEU:
if (i != ~((unsigned HOST_WIDE_INT) 0)
&& (const_ok_for_arm (i + 1) || const_ok_for_arm (-(i + 1))))
{
*op1 = GEN_INT (ARM_SIGN_EXTEND (i + 1));
*code = *code == GTU ? GEU : LTU;
return;
}
break;
case GEU:
case LTU:
if (i != 0
&& (const_ok_for_arm (i - 1) || const_ok_for_arm (-(i - 1))))
{
*op1 = GEN_INT (i - 1);
*code = *code == GEU ? GTU : LEU;
return;
}
break;
default:
gcc_unreachable ();
}
}
/* Define how to find the value returned by a function. */
static rtx
arm_function_value(const_tree type, const_tree func,
bool outgoing ATTRIBUTE_UNUSED)
{
machine_mode mode;
int unsignedp ATTRIBUTE_UNUSED;
rtx r ATTRIBUTE_UNUSED;
mode = TYPE_MODE (type);
if (TARGET_AAPCS_BASED)
return aapcs_allocate_return_reg (mode, type, func);
/* Promote integer types. */
if (INTEGRAL_TYPE_P (type))
mode = arm_promote_function_mode (type, mode, &unsignedp, func, 1);
/* Promotes small structs returned in a register to full-word size
for big-endian AAPCS. */
if (arm_return_in_msb (type))
{
HOST_WIDE_INT size = int_size_in_bytes (type);
if (size % UNITS_PER_WORD != 0)
{
size += UNITS_PER_WORD - size % UNITS_PER_WORD;
mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 0);
}
}
return arm_libcall_value_1 (mode);
}
/* libcall hashtable helpers. */
struct libcall_hasher : typed_noop_remove <rtx_def>
{
typedef rtx_def value_type;
typedef rtx_def compare_type;
static inline hashval_t hash (const value_type *);
static inline bool equal (const value_type *, const compare_type *);
static inline void remove (value_type *);
};
inline bool
libcall_hasher::equal (const value_type *p1, const compare_type *p2)
{
return rtx_equal_p (p1, p2);
}
inline hashval_t
libcall_hasher::hash (const value_type *p1)
{
return hash_rtx (p1, VOIDmode, NULL, NULL, FALSE);
}
typedef hash_table<libcall_hasher> libcall_table_type;
static void
add_libcall (libcall_table_type *htab, rtx libcall)
{
*htab->find_slot (libcall, INSERT) = libcall;
}
static bool
arm_libcall_uses_aapcs_base (const_rtx libcall)
{
static bool init_done = false;
static libcall_table_type *libcall_htab = NULL;
if (!init_done)
{
init_done = true;
libcall_htab = new libcall_table_type (31);
add_libcall (libcall_htab,
convert_optab_libfunc (sfloat_optab, SFmode, SImode));
add_libcall (libcall_htab,
convert_optab_libfunc (sfloat_optab, DFmode, SImode));
add_libcall (libcall_htab,
convert_optab_libfunc (sfloat_optab, SFmode, DImode));
add_libcall (libcall_htab,
convert_optab_libfunc (sfloat_optab, DFmode, DImode));
add_libcall (libcall_htab,
convert_optab_libfunc (ufloat_optab, SFmode, SImode));
add_libcall (libcall_htab,
convert_optab_libfunc (ufloat_optab, DFmode, SImode));
add_libcall (libcall_htab,
convert_optab_libfunc (ufloat_optab, SFmode, DImode));
add_libcall (libcall_htab,
convert_optab_libfunc (ufloat_optab, DFmode, DImode));
add_libcall (libcall_htab,
convert_optab_libfunc (sext_optab, SFmode, HFmode));
add_libcall (libcall_htab,
convert_optab_libfunc (trunc_optab, HFmode, SFmode));
add_libcall (libcall_htab,
convert_optab_libfunc (sfix_optab, SImode, DFmode));
add_libcall (libcall_htab,
convert_optab_libfunc (ufix_optab, SImode, DFmode));
add_libcall (libcall_htab,
convert_optab_libfunc (sfix_optab, DImode, DFmode));
add_libcall (libcall_htab,
convert_optab_libfunc (ufix_optab, DImode, DFmode));
add_libcall (libcall_htab,
convert_optab_libfunc (sfix_optab, DImode, SFmode));
add_libcall (libcall_htab,
convert_optab_libfunc (ufix_optab, DImode, SFmode));
/* Values from double-precision helper functions are returned in core
registers if the selected core only supports single-precision
arithmetic, even if we are using the hard-float ABI. The same is
true for single-precision helpers, but we will never be using the
hard-float ABI on a CPU which doesn't support single-precision
operations in hardware. */
add_libcall (libcall_htab, optab_libfunc (add_optab, DFmode));
add_libcall (libcall_htab, optab_libfunc (sdiv_optab, DFmode));
add_libcall (libcall_htab, optab_libfunc (smul_optab, DFmode));
add_libcall (libcall_htab, optab_libfunc (neg_optab, DFmode));
add_libcall (libcall_htab, optab_libfunc (sub_optab, DFmode));
add_libcall (libcall_htab, optab_libfunc (eq_optab, DFmode));
add_libcall (libcall_htab, optab_libfunc (lt_optab, DFmode));
add_libcall (libcall_htab, optab_libfunc (le_optab, DFmode));
add_libcall (libcall_htab, optab_libfunc (ge_optab, DFmode));
add_libcall (libcall_htab, optab_libfunc (gt_optab, DFmode));
add_libcall (libcall_htab, optab_libfunc (unord_optab, DFmode));
add_libcall (libcall_htab, convert_optab_libfunc (sext_optab, DFmode,
SFmode));
add_libcall (libcall_htab, convert_optab_libfunc (trunc_optab, SFmode,
DFmode));
}
return libcall && libcall_htab->find (libcall) != NULL;
}
static rtx
arm_libcall_value_1 (machine_mode mode)
{
if (TARGET_AAPCS_BASED)
return aapcs_libcall_value (mode);
else if (TARGET_IWMMXT_ABI
&& arm_vector_mode_supported_p (mode))
return gen_rtx_REG (mode, FIRST_IWMMXT_REGNUM);
else
return gen_rtx_REG (mode, ARG_REGISTER (1));
}
/* Define how to find the value returned by a library function
assuming the value has mode MODE. */
static rtx
arm_libcall_value (machine_mode mode, const_rtx libcall)
{
if (TARGET_AAPCS_BASED && arm_pcs_default != ARM_PCS_AAPCS
&& GET_MODE_CLASS (mode) == MODE_FLOAT)
{
/* The following libcalls return their result in integer registers,
even though they return a floating point value. */
if (arm_libcall_uses_aapcs_base (libcall))
return gen_rtx_REG (mode, ARG_REGISTER(1));
}
return arm_libcall_value_1 (mode);
}
/* Implement TARGET_FUNCTION_VALUE_REGNO_P. */
static bool
arm_function_value_regno_p (const unsigned int regno)
{
if (regno == ARG_REGISTER (1)
|| (TARGET_32BIT
&& TARGET_AAPCS_BASED
&& TARGET_VFP
&& TARGET_HARD_FLOAT
&& regno == FIRST_VFP_REGNUM)
|| (TARGET_IWMMXT_ABI
&& regno == FIRST_IWMMXT_REGNUM))
return true;
return false;
}
/* Determine the amount of memory needed to store the possible return
registers of an untyped call. */
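/* (Used when expanding untyped calls such as __builtin_apply: the base
16 bytes covers r0-r3, with extra space added for VFP and iWMMXt return
registers when those ABIs are in use.) */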
int
arm_apply_result_size (void)
{
int size = 16;
if (TARGET_32BIT)
{
if (TARGET_HARD_FLOAT_ABI && TARGET_VFP)
size += 32;
if (TARGET_IWMMXT_ABI)
size += 8;
}
return size;
}
/* Decide whether TYPE should be returned in memory (true)
or in a register (false). FNTYPE is the type of the function making
the call. */
static bool
arm_return_in_memory (const_tree type, const_tree fntype)
{
HOST_WIDE_INT size;
size = int_size_in_bytes (type); /* Negative if not fixed size. */
if (TARGET_AAPCS_BASED)
{
/* Simple, non-aggregate types (i.e. not including vectors and
complex) are always returned in a register (or registers).
We don't care about which register here, so we can short-cut
some of the detail. */
if (!AGGREGATE_TYPE_P (type)
&& TREE_CODE (type) != VECTOR_TYPE
&& TREE_CODE (type) != COMPLEX_TYPE)
return false;
/* Any return value that is no larger than one word can be
returned in r0. */
if (((unsigned HOST_WIDE_INT) size) <= UNITS_PER_WORD)
return false;
/* Check any available co-processors to see if they accept the
type as a register candidate (VFP, for example, can return
some aggregates in consecutive registers). These aren't
available if the call is variadic. */
if (aapcs_select_return_coproc (type, fntype) >= 0)
return false;
/* Vector values should be returned using ARM registers, not
memory (unless they're over 16 bytes, which will break since
we only have four call-clobbered registers to play with). */
if (TREE_CODE (type) == VECTOR_TYPE)
return (size < 0 || size > (4 * UNITS_PER_WORD));
/* The rest go in memory. */
return true;
}
if (TREE_CODE (type) == VECTOR_TYPE)
return (size < 0 || size > (4 * UNITS_PER_WORD));
if (!AGGREGATE_TYPE_P (type) &&
(TREE_CODE (type) != VECTOR_TYPE))
/* All simple types are returned in registers. */
return false;
if (arm_abi != ARM_ABI_APCS)
{
/* ATPCS and later return aggregate types in memory only if they are
larger than a word (or are variable size). */
return (size < 0 || size > UNITS_PER_WORD);
}
/* For the arm-wince targets we choose to be compatible with Microsoft's
ARM and Thumb compilers, which always return aggregates in memory. */
#ifndef ARM_WINCE
/* All structures/unions bigger than one word are returned in memory.
Also catch the case where int_size_in_bytes returns -1. In this case
the aggregate is either huge or of variable size, and in either case
we will want to return it via memory and not in a register. */
if (size < 0 || size > UNITS_PER_WORD)
return true;
if (TREE_CODE (type) == RECORD_TYPE)
{
tree field;
/* For a struct the APCS says that we only return in a register
if the type is 'integer like' and every addressable element
has an offset of zero. For practical purposes this means
that the structure can have at most one non bit-field element
and that this element must be the first one in the structure. */
/* Find the first field, ignoring non FIELD_DECL things which will
have been created by C++. */
for (field = TYPE_FIELDS (type);
field && TREE_CODE (field) != FIELD_DECL;
field = DECL_CHAIN (field))
continue;
if (field == NULL)
return false; /* An empty structure. Allowed by an extension to ANSI C. */
/* Check that the first field is valid for returning in a register. */
/* ... Floats are not allowed */
if (FLOAT_TYPE_P (TREE_TYPE (field)))
return true;
/* ... Aggregates that are not themselves valid for returning in
a register are not allowed. */
if (arm_return_in_memory (TREE_TYPE (field), NULL_TREE))
return true;
/* Now check the remaining fields, if any. Only bitfields are allowed,
since they are not addressable. */
for (field = DECL_CHAIN (field);
field;
field = DECL_CHAIN (field))
{
if (TREE_CODE (field) != FIELD_DECL)
continue;
if (!DECL_BIT_FIELD_TYPE (field))
return true;
}
return false;
}
if (TREE_CODE (type) == UNION_TYPE)
{
tree field;
/* Unions can be returned in registers if every element is
integral, or can be returned in an integer register. */
for (field = TYPE_FIELDS (type);
field;
field = DECL_CHAIN (field))
{
if (TREE_CODE (field) != FIELD_DECL)
continue;
if (FLOAT_TYPE_P (TREE_TYPE (field)))
return true;
if (arm_return_in_memory (TREE_TYPE (field), NULL_TREE))
return true;
}
return false;
}
#endif /* not ARM_WINCE */
/* Return all other types in memory. */
return true;
}
const struct pcs_attribute_arg
{
const char *arg;
enum arm_pcs value;
} pcs_attribute_args[] =
{
{"aapcs", ARM_PCS_AAPCS},
{"aapcs-vfp", ARM_PCS_AAPCS_VFP},
#if 0
/* We could recognize these, but changes would be needed elsewhere
* to implement them. */
{"aapcs-iwmmxt", ARM_PCS_AAPCS_IWMMXT},
{"atpcs", ARM_PCS_ATPCS},
{"apcs", ARM_PCS_APCS},
#endif
{NULL, ARM_PCS_UNKNOWN}
};
static enum arm_pcs
arm_pcs_from_attribute (tree attr)
{
const struct pcs_attribute_arg *ptr;
const char *arg;
/* Get the value of the argument. */
if (TREE_VALUE (attr) == NULL_TREE
|| TREE_CODE (TREE_VALUE (attr)) != STRING_CST)
return ARM_PCS_UNKNOWN;
arg = TREE_STRING_POINTER (TREE_VALUE (attr));
/* Check it against the list of known arguments. */
for (ptr = pcs_attribute_args; ptr->arg != NULL; ptr++)
if (streq (arg, ptr->arg))
return ptr->value;
/* An unrecognized PCS name. */
return ARM_PCS_UNKNOWN;
}
/* Get the PCS variant to use for this call. TYPE is the function's type
specification, DECL is the specific declaration. DECL may be null if
the call could be indirect or if this is a library call. */
static enum arm_pcs
arm_get_pcs_model (const_tree type, const_tree decl)
{
bool user_convention = false;
enum arm_pcs user_pcs = arm_pcs_default;
tree attr;
gcc_assert (type);
attr = lookup_attribute ("pcs", TYPE_ATTRIBUTES (type));
if (attr)
{
user_pcs = arm_pcs_from_attribute (TREE_VALUE (attr));
user_convention = true;
}
if (TARGET_AAPCS_BASED)
{
/* Detect varargs functions. These always use the base rules
(no argument is ever a candidate for a co-processor
register). */
bool base_rules = stdarg_p (type);
if (user_convention)
{
if (user_pcs > ARM_PCS_AAPCS_LOCAL)
sorry ("non-AAPCS derived PCS variant");
else if (base_rules && user_pcs != ARM_PCS_AAPCS)
error ("variadic functions must use the base AAPCS variant");
}
if (base_rules)
return ARM_PCS_AAPCS;
else if (user_convention)
return user_pcs;
else if (decl && flag_unit_at_a_time)
{
/* Local functions never leak outside this compilation unit,
so we are free to use whatever conventions are
appropriate. */
/* FIXME: remove CONST_CAST_TREE when cgraph is constified. */
cgraph_local_info *i = cgraph_node::local_info (CONST_CAST_TREE(decl));
if (i && i->local)
return ARM_PCS_AAPCS_LOCAL;
}
}
else if (user_convention && user_pcs != arm_pcs_default)
sorry ("PCS variant");
/* For everything else we use the target's default. */
return arm_pcs_default;
}
static void
aapcs_vfp_cum_init (CUMULATIVE_ARGS *pcum ATTRIBUTE_UNUSED,
const_tree fntype ATTRIBUTE_UNUSED,
rtx libcall ATTRIBUTE_UNUSED,
const_tree fndecl ATTRIBUTE_UNUSED)
{
/* Record the unallocated VFP registers. */
pcum->aapcs_vfp_regs_free = (1 << NUM_VFP_ARG_REGS) - 1;
pcum->aapcs_vfp_reg_alloc = 0;
}
/* Walk down the type tree of TYPE counting consecutive base elements.
If *MODEP is VOIDmode, then set it to the first valid floating point
type. If a non-floating point type is found, or if a floating point
type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
otherwise return the count in the sub-tree. */
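/* For example, struct { double x, y; } gives a count of 2 with *MODEP set to
DFmode (a homogeneous aggregate of two doubles), whereas
struct { float f; double d; } is rejected because the element modes
differ. */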
static int
aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
{
machine_mode mode;
HOST_WIDE_INT size;
switch (TREE_CODE (type))
{
case REAL_TYPE:
mode = TYPE_MODE (type);
if (mode != DFmode && mode != SFmode)
return -1;
if (*modep == VOIDmode)
*modep = mode;
if (*modep == mode)
return 1;
break;
case COMPLEX_TYPE:
mode = TYPE_MODE (TREE_TYPE (type));
if (mode != DFmode && mode != SFmode)
return -1;
if (*modep == VOIDmode)
*modep = mode;
if (*modep == mode)
return 2;
break;
case VECTOR_TYPE:
/* Use V2SImode and V4SImode as representatives of all 64-bit
and 128-bit vector types, whether or not those modes are
supported with the present options. */
size = int_size_in_bytes (type);
switch (size)
{
case 8:
mode = V2SImode;
break;
case 16:
mode = V4SImode;
break;
default:
return -1;
}
if (*modep == VOIDmode)
*modep = mode;
/* Vector modes are considered to be opaque: two vectors are
equivalent for the purposes of being homogeneous aggregates
if they are the same size. */
if (*modep == mode)
return 1;
break;
case ARRAY_TYPE:
{
int count;
tree index = TYPE_DOMAIN (type);
/* Can't handle incomplete types nor sizes that are not
fixed. */
if (!COMPLETE_TYPE_P (type)
|| TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
return -1;
count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
if (count == -1
|| !index
|| !TYPE_MAX_VALUE (index)
|| !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
|| !TYPE_MIN_VALUE (index)
|| !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
|| count < 0)
return -1;
count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
- tree_to_uhwi (TYPE_MIN_VALUE (index)));
/* There must be no padding. */
if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
return -1;
return count;
}
case RECORD_TYPE:
{
int count = 0;
int sub_count;
tree field;
/* Can't handle incomplete types nor sizes that are not
fixed. */
if (!COMPLETE_TYPE_P (type)
|| TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
return -1;
for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
{
if (TREE_CODE (field) != FIELD_DECL)
continue;
sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
if (sub_count < 0)
return -1;
count += sub_count;
}
/* There must be no padding. */
if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
return -1;
return count;
}
case UNION_TYPE:
case QUAL_UNION_TYPE:
{
/* These aren't very interesting except in a degenerate case. */
int count = 0;
int sub_count;
tree field;
/* Can't handle incomplete types nor sizes that are not
fixed. */
if (!COMPLETE_TYPE_P (type)
|| TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
return -1;
for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
{
if (TREE_CODE (field) != FIELD_DECL)
continue;
sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
if (sub_count < 0)
return -1;
count = count > sub_count ? count : sub_count;
}
/* There must be no padding. */
if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
return -1;
return count;
}
default:
break;
}
return -1;
}
/* Return true if PCS_VARIANT should use VFP registers. */
static bool
use_vfp_abi (enum arm_pcs pcs_variant, bool is_double)
{
if (pcs_variant == ARM_PCS_AAPCS_VFP)
{
static bool seen_thumb1_vfp = false;
if (TARGET_THUMB1 && !seen_thumb1_vfp)
{
sorry ("Thumb-1 hard-float VFP ABI");
/* sorry() is not immediately fatal, so only display this once. */
seen_thumb1_vfp = true;
}
return true;
}
if (pcs_variant != ARM_PCS_AAPCS_LOCAL)
return false;
return (TARGET_32BIT && TARGET_VFP && TARGET_HARD_FLOAT &&
(TARGET_VFP_DOUBLE || !is_double));
}
/* Return true if an argument whose type is TYPE, or mode is MODE, is
suitable for passing or returning in VFP registers for the PCS
variant selected. If it is, then *BASE_MODE is updated to contain
a machine mode describing each element of the argument's type and
*COUNT to hold the number of such elements. */
static bool
aapcs_vfp_is_call_or_return_candidate (enum arm_pcs pcs_variant,
machine_mode mode, const_tree type,
machine_mode *base_mode, int *count)
{
machine_mode new_mode = VOIDmode;
/* If we have the type information, prefer that to working things
out from the mode. */
if (type)
{
int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
if (ag_count > 0 && ag_count <= 4)
*count = ag_count;
else
return false;
}
else if (GET_MODE_CLASS (mode) == MODE_FLOAT
|| GET_MODE_CLASS (mode) == MODE_VECTOR_INT
|| GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
{
*count = 1;
new_mode = mode;
}
else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
{
*count = 2;
new_mode = (mode == DCmode ? DFmode : SFmode);
}
else
return false;
if (!use_vfp_abi (pcs_variant, ARM_NUM_REGS (new_mode) > 1))
return false;
*base_mode = new_mode;
return true;
}
static bool
aapcs_vfp_is_return_candidate (enum arm_pcs pcs_variant,
machine_mode mode, const_tree type)
{
int count ATTRIBUTE_UNUSED;
machine_mode ag_mode ATTRIBUTE_UNUSED;
if (!use_vfp_abi (pcs_variant, false))
return false;
return aapcs_vfp_is_call_or_return_candidate (pcs_variant, mode, type,
&ag_mode, &count);
}
static bool
aapcs_vfp_is_call_candidate (CUMULATIVE_ARGS *pcum, machine_mode mode,
const_tree type)
{
if (!use_vfp_abi (pcum->pcs_variant, false))
return false;
return aapcs_vfp_is_call_or_return_candidate (pcum->pcs_variant, mode, type,
&pcum->aapcs_vfp_rmode,
&pcum->aapcs_vfp_rcount);
}
static bool
aapcs_vfp_allocate (CUMULATIVE_ARGS *pcum, machine_mode mode,
const_tree type ATTRIBUTE_UNUSED)
{
int shift = GET_MODE_SIZE (pcum->aapcs_vfp_rmode) / GET_MODE_SIZE (SFmode);
unsigned mask = (1 << (shift * pcum->aapcs_vfp_rcount)) - 1;
int regno;
for (regno = 0; regno < NUM_VFP_ARG_REGS; regno += shift)
if (((pcum->aapcs_vfp_regs_free >> regno) & mask) == mask)
{
pcum->aapcs_vfp_reg_alloc = mask << regno;
if (mode == BLKmode
|| (mode == TImode && ! TARGET_NEON)
|| ! arm_hard_regno_mode_ok (FIRST_VFP_REGNUM + regno, mode))
{
int i;
int rcount = pcum->aapcs_vfp_rcount;
int rshift = shift;
machine_mode rmode = pcum->aapcs_vfp_rmode;
rtx par;
if (!TARGET_NEON)
{
/* Avoid using unsupported vector modes. */
if (rmode == V2SImode)
rmode = DImode;
else if (rmode == V4SImode)
{
rmode = DImode;
rcount *= 2;
rshift /= 2;
}
}
par = gen_rtx_PARALLEL (mode, rtvec_alloc (rcount));
for (i = 0; i < rcount; i++)
{
rtx tmp = gen_rtx_REG (rmode,
FIRST_VFP_REGNUM + regno + i * rshift);
tmp = gen_rtx_EXPR_LIST
(VOIDmode, tmp,
GEN_INT (i * GET_MODE_SIZE (rmode)));
XVECEXP (par, 0, i) = tmp;
}
pcum->aapcs_reg = par;
}
else
pcum->aapcs_reg = gen_rtx_REG (mode, FIRST_VFP_REGNUM + regno);
return true;
}
return false;
}
static rtx
aapcs_vfp_allocate_return_reg (enum arm_pcs pcs_variant ATTRIBUTE_UNUSED,
machine_mode mode,
const_tree type ATTRIBUTE_UNUSED)
{
if (!use_vfp_abi (pcs_variant, false))
return NULL;
if (mode == BLKmode || (mode == TImode && !TARGET_NEON))
{
int count;
machine_mode ag_mode;
int i;
rtx par;
int shift;
aapcs_vfp_is_call_or_return_candidate (pcs_variant, mode, type,
&ag_mode, &count);
if (!TARGET_NEON)
{
if (ag_mode == V2SImode)
ag_mode = DImode;
else if (ag_mode == V4SImode)
{
ag_mode = DImode;
count *= 2;
}
}
shift = GET_MODE_SIZE(ag_mode) / GET_MODE_SIZE(SFmode);
par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
for (i = 0; i < count; i++)
{
rtx tmp = gen_rtx_REG (ag_mode, FIRST_VFP_REGNUM + i * shift);
tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
GEN_INT (i * GET_MODE_SIZE (ag_mode)));
XVECEXP (par, 0, i) = tmp;
}
return par;
}
return gen_rtx_REG (mode, FIRST_VFP_REGNUM);
}
static void
aapcs_vfp_advance (CUMULATIVE_ARGS *pcum ATTRIBUTE_UNUSED,
machine_mode mode ATTRIBUTE_UNUSED,
const_tree type ATTRIBUTE_UNUSED)
{
pcum->aapcs_vfp_regs_free &= ~pcum->aapcs_vfp_reg_alloc;
pcum->aapcs_vfp_reg_alloc = 0;
return;
}
#define AAPCS_CP(X) \
{ \
aapcs_ ## X ## _cum_init, \
aapcs_ ## X ## _is_call_candidate, \
aapcs_ ## X ## _allocate, \
aapcs_ ## X ## _is_return_candidate, \
aapcs_ ## X ## _allocate_return_reg, \
aapcs_ ## X ## _advance \
}
/* Table of co-processors that can be used to pass arguments in
registers. Ideally no argument should be a candidate for more than
one co-processor table entry, but the table is processed in order
and stops after the first match. If that entry then fails to put
the argument into a co-processor register, the argument will go on
the stack. */
static struct
{
/* Initialize co-processor related state in CUMULATIVE_ARGS structure. */
void (*cum_init) (CUMULATIVE_ARGS *, const_tree, rtx, const_tree);
/* Return true if an argument of mode MODE (or type TYPE if MODE is
BLKmode) is a candidate for this co-processor's registers; this
function should ignore any position-dependent state in
CUMULATIVE_ARGS and only use call-type dependent information. */
bool (*is_call_candidate) (CUMULATIVE_ARGS *, machine_mode, const_tree);
/* Return true if the argument does get a co-processor register; it
should set aapcs_reg to an RTX of the register allocated as is
required for a return from FUNCTION_ARG. */
bool (*allocate) (CUMULATIVE_ARGS *, machine_mode, const_tree);
/* Return true if a result of mode MODE (or type TYPE if MODE is
BLKmode) can be returned in this co-processor's registers. */
bool (*is_return_candidate) (enum arm_pcs, machine_mode, const_tree);
/* Allocate and return an RTX element to hold the return type of a
call, this routine must not fail and will only be called if
is_return_candidate returned true with the same parameters. */
rtx (*allocate_return_reg) (enum arm_pcs, machine_mode, const_tree);
/* Finish processing this argument and prepare to start processing
the next one. */
void (*advance) (CUMULATIVE_ARGS *, machine_mode, const_tree);
} aapcs_cp_arg_layout[ARM_NUM_COPROC_SLOTS] =
{
AAPCS_CP(vfp)
};
#undef AAPCS_CP
static int
aapcs_select_call_coproc (CUMULATIVE_ARGS *pcum, machine_mode mode,
const_tree type)
{
int i;
for (i = 0; i < ARM_NUM_COPROC_SLOTS; i++)
if (aapcs_cp_arg_layout[i].is_call_candidate (pcum, mode, type))
return i;
return -1;
}
static int
aapcs_select_return_coproc (const_tree type, const_tree fntype)
{
/* We aren't passed a decl, so we can't check that a call is local.
However, it isn't clear that that would be a win anyway, since it
might limit some tail-calling opportunities. */
enum arm_pcs pcs_variant;
if (fntype)
{
const_tree fndecl = NULL_TREE;
if (TREE_CODE (fntype) == FUNCTION_DECL)
{
fndecl = fntype;
fntype = TREE_TYPE (fntype);
}
pcs_variant = arm_get_pcs_model (fntype, fndecl);
}
else
pcs_variant = arm_pcs_default;
if (pcs_variant != ARM_PCS_AAPCS)
{
int i;
for (i = 0; i < ARM_NUM_COPROC_SLOTS; i++)
if (aapcs_cp_arg_layout[i].is_return_candidate (pcs_variant,
TYPE_MODE (type),
type))
return i;
}
return -1;
}
static rtx
aapcs_allocate_return_reg (machine_mode mode, const_tree type,
const_tree fntype)
{
/* We aren't passed a decl, so we can't check that a call is local.
However, it isn't clear that that would be a win anyway, since it
might limit some tail-calling opportunities. */
enum arm_pcs pcs_variant;
int unsignedp ATTRIBUTE_UNUSED;
if (fntype)
{
const_tree fndecl = NULL_TREE;
if (TREE_CODE (fntype) == FUNCTION_DECL)
{
fndecl = fntype;
fntype = TREE_TYPE (fntype);
}
pcs_variant = arm_get_pcs_model (fntype, fndecl);
}
else
pcs_variant = arm_pcs_default;
/* Promote integer types. */
if (type && INTEGRAL_TYPE_P (type))
mode = arm_promote_function_mode (type, mode, &unsignedp, fntype, 1);
if (pcs_variant != ARM_PCS_AAPCS)
{
int i;
for (i = 0; i < ARM_NUM_COPROC_SLOTS; i++)
if (aapcs_cp_arg_layout[i].is_return_candidate (pcs_variant, mode,
type))
return aapcs_cp_arg_layout[i].allocate_return_reg (pcs_variant,
mode, type);
}
/* Promotes small structs returned in a register to full-word size
for big-endian AAPCS. */
if (type && arm_return_in_msb (type))
{
HOST_WIDE_INT size = int_size_in_bytes (type);
if (size % UNITS_PER_WORD != 0)
{
size += UNITS_PER_WORD - size % UNITS_PER_WORD;
mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 0);
}
}
return gen_rtx_REG (mode, R0_REGNUM);
}
static rtx
aapcs_libcall_value (machine_mode mode)
{
if (BYTES_BIG_ENDIAN && ALL_FIXED_POINT_MODE_P (mode)
&& GET_MODE_SIZE (mode) <= 4)
mode = SImode;
return aapcs_allocate_return_reg (mode, NULL_TREE, NULL_TREE);
}
/* Lay out a function argument using the AAPCS rules. The rule
numbers referred to here are those in the AAPCS. */
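/* For example, for f (int, double) when the double is not a VFP candidate,
the int goes in r0; the double needs doubleword alignment, so rule C3
rounds the NCRN up from 1 to 2 and rule C4 places it in r2/r3. */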
static void
aapcs_layout_arg (CUMULATIVE_ARGS *pcum, machine_mode mode,
const_tree type, bool named)
{
int nregs, nregs2;
int ncrn;
/* We only need to do this once per argument. */
if (pcum->aapcs_arg_processed)
return;
pcum->aapcs_arg_processed = true;
/* Special case: if named is false then we are handling an incoming
anonymous argument which is on the stack. */
if (!named)
return;
/* Is this a potential co-processor register candidate? */
if (pcum->pcs_variant != ARM_PCS_AAPCS)
{
int slot = aapcs_select_call_coproc (pcum, mode, type);
pcum->aapcs_cprc_slot = slot;
/* We don't have to apply any of the rules from part B of the
preparation phase, these are handled elsewhere in the
compiler. */
if (slot >= 0)
{
/* A Co-processor register candidate goes either in its own
class of registers or on the stack. */
if (!pcum->aapcs_cprc_failed[slot])
{
/* C1.cp - Try to allocate the argument to co-processor
registers. */
if (aapcs_cp_arg_layout[slot].allocate (pcum, mode, type))
return;
/* C2.cp - Put the argument on the stack and note that we
can't assign any more candidates in this slot. We also
need to note that we have allocated stack space, so that
we won't later try to split a non-cprc candidate between
core registers and the stack. */
pcum->aapcs_cprc_failed[slot] = true;
pcum->can_split = false;
}
/* We didn't get a register, so this argument goes on the
stack. */
gcc_assert (pcum->can_split == false);
return;
}
}
/* C3 - For double-word aligned arguments, round the NCRN up to the
next even number. */
ncrn = pcum->aapcs_ncrn;
if ((ncrn & 1) && arm_needs_doubleword_align (mode, type))
ncrn++;
nregs = ARM_NUM_REGS2(mode, type);
/* Sigh, this test should really assert that nregs > 0, but a GCC
extension allows empty structs and then gives them empty size; it
then allows such a structure to be passed by value. For some of
the code below we have to pretend that such an argument has
non-zero size so that we 'locate' it correctly either in
registers or on the stack. */
gcc_assert (nregs >= 0);
nregs2 = nregs ? nregs : 1;
/* C4 - Argument fits entirely in core registers. */
if (ncrn + nregs2 <= NUM_ARG_REGS)
{
pcum->aapcs_reg = gen_rtx_REG (mode, ncrn);
pcum->aapcs_next_ncrn = ncrn + nregs;
return;
}
/* C5 - Some core registers left and there are no arguments already
on the stack: split this argument between the remaining core
registers and the stack. */
if (ncrn < NUM_ARG_REGS && pcum->can_split)
{
pcum->aapcs_reg = gen_rtx_REG (mode, ncrn);
pcum->aapcs_next_ncrn = NUM_ARG_REGS;
pcum->aapcs_partial = (NUM_ARG_REGS - ncrn) * UNITS_PER_WORD;
return;
}
/* C6 - NCRN is set to 4. */
pcum->aapcs_next_ncrn = NUM_ARG_REGS;
/* C7,C8 - argument goes on the stack. We have nothing to do here. */
return;
}
/* Initialize a variable CUM of type CUMULATIVE_ARGS
for a call to a function whose data type is FNTYPE.
For a library call, FNTYPE is NULL. */
void
arm_init_cumulative_args (CUMULATIVE_ARGS *pcum, tree fntype,
rtx libname,
tree fndecl ATTRIBUTE_UNUSED)
{
/* Long call handling. */
if (fntype)
pcum->pcs_variant = arm_get_pcs_model (fntype, fndecl);
else
pcum->pcs_variant = arm_pcs_default;
if (pcum->pcs_variant <= ARM_PCS_AAPCS_LOCAL)
{
if (arm_libcall_uses_aapcs_base (libname))
pcum->pcs_variant = ARM_PCS_AAPCS;
pcum->aapcs_ncrn = pcum->aapcs_next_ncrn = 0;
pcum->aapcs_reg = NULL_RTX;
pcum->aapcs_partial = 0;
pcum->aapcs_arg_processed = false;
pcum->aapcs_cprc_slot = -1;
pcum->can_split = true;
if (pcum->pcs_variant != ARM_PCS_AAPCS)
{
int i;
for (i = 0; i < ARM_NUM_COPROC_SLOTS; i++)
{
pcum->aapcs_cprc_failed[i] = false;
aapcs_cp_arg_layout[i].cum_init (pcum, fntype, libname, fndecl);
}
}
return;
}
/* Legacy ABIs */
/* On the ARM, the offset starts at 0. */
pcum->nregs = 0;
pcum->iwmmxt_nregs = 0;
pcum->can_split = true;
/* Varargs vectors are treated the same as long long.
named_count avoids having to change the way arm handles 'named' */
pcum->named_count = 0;
pcum->nargs = 0;
if (TARGET_REALLY_IWMMXT && fntype)
{
tree fn_arg;
for (fn_arg = TYPE_ARG_TYPES (fntype);
fn_arg;
fn_arg = TREE_CHAIN (fn_arg))
pcum->named_count += 1;
if (! pcum->named_count)
pcum->named_count = INT_MAX;
}
}
/* Return true if mode/type need doubleword alignment. */
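/* On AAPCS targets, for example, long long and double have 64-bit alignment,
which exceeds PARM_BOUNDARY (32 bits), so such arguments are placed in
even-numbered register pairs or doubleword-aligned stack slots by the
callers of this function. */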
static bool
arm_needs_doubleword_align (machine_mode mode, const_tree type)
{
if (!type)
return PARM_BOUNDARY < GET_MODE_ALIGNMENT (mode);
/* Scalar and vector types: Use natural alignment, i.e. of base type. */
if (!AGGREGATE_TYPE_P (type))
return TYPE_ALIGN (TYPE_MAIN_VARIANT (type)) > PARM_BOUNDARY;
/* Array types: Use member alignment of element type. */
if (TREE_CODE (type) == ARRAY_TYPE)
return TYPE_ALIGN (TREE_TYPE (type)) > PARM_BOUNDARY;
/* Record/aggregate types: Use greatest member alignment of any member. */
for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
if (DECL_ALIGN (field) > PARM_BOUNDARY)
return true;
return false;
}
/* Determine where to put an argument to a function.
Value is zero to push the argument on the stack,
or a hard register in which to store the argument.
MODE is the argument's machine mode.
TYPE is the data type of the argument (as a tree).
This is null for libcalls where that information may
not be available.
CUM is a variable of type CUMULATIVE_ARGS which gives info about
the preceding args and about the function being called.
NAMED is nonzero if this argument is a named parameter
(otherwise it is an extra parameter matching an ellipsis).
On the ARM, normally the first 16 bytes are passed in registers r0-r3; all
other arguments are passed on the stack. If (NAMED == 0) (which happens
only in assign_parms, since TARGET_SETUP_INCOMING_VARARGS is
defined), say it is passed on the stack (function_prologue will
indeed make it be passed on the stack if necessary). */
static rtx
arm_function_arg (cumulative_args_t pcum_v, machine_mode mode,
const_tree type, bool named)
{
CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
int nregs;
/* Handle the special case quickly. Pick an arbitrary value for op2 of
a call insn (op3 of a call_value insn). */
if (mode == VOIDmode)
return const0_rtx;
if (pcum->pcs_variant <= ARM_PCS_AAPCS_LOCAL)
{
aapcs_layout_arg (pcum, mode, type, named);
return pcum->aapcs_reg;
}
/* Varargs vectors are treated the same as long long.
named_count avoids having to change the way arm handles 'named' */
if (TARGET_IWMMXT_ABI
&& arm_vector_mode_supported_p (mode)
&& pcum->named_count > pcum->nargs + 1)
{
if (pcum->iwmmxt_nregs <= 9)
return gen_rtx_REG (mode, pcum->iwmmxt_nregs + FIRST_IWMMXT_REGNUM);
else
{
pcum->can_split = false;
return NULL_RTX;
}
}
/* Put doubleword aligned quantities in even register pairs. */
if (pcum->nregs & 1
&& ARM_DOUBLEWORD_ALIGN
&& arm_needs_doubleword_align (mode, type))
pcum->nregs++;
/* Only allow splitting an arg between regs and memory if all preceding
args were allocated to regs. For args passed by reference we only count
the reference pointer. */
if (pcum->can_split)
nregs = 1;
else
nregs = ARM_NUM_REGS2 (mode, type);
if (!named || pcum->nregs + nregs > NUM_ARG_REGS)
return NULL_RTX;
return gen_rtx_REG (mode, pcum->nregs);
}
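/* Implement TARGET_FUNCTION_ARG_BOUNDARY. Arguments that need doubleword
alignment are aligned to DOUBLEWORD_ALIGNMENT; everything else gets the
default PARM_BOUNDARY. */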
static unsigned int
arm_function_arg_boundary (machine_mode mode, const_tree type)
{
return (ARM_DOUBLEWORD_ALIGN && arm_needs_doubleword_align (mode, type)
? DOUBLEWORD_ALIGNMENT
: PARM_BOUNDARY);
}
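/* Implement TARGET_ARG_PARTIAL_BYTES. Return the number of bytes of an
argument that are passed in registers when the remainder of the argument
goes on the stack (an argument may be split across r0-r3 and the stack). */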
static int
arm_arg_partial_bytes (cumulative_args_t pcum_v, machine_mode mode,
tree type, bool named)
{
CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
int nregs = pcum->nregs;
if (pcum->pcs_variant <= ARM_PCS_AAPCS_LOCAL)
{
aapcs_layout_arg (pcum, mode, type, named);
return pcum->aapcs_partial;
}
if (TARGET_IWMMXT_ABI && arm_vector_mode_supported_p (mode))
return 0;
if (NUM_ARG_REGS > nregs
&& (NUM_ARG_REGS < nregs + ARM_NUM_REGS2 (mode, type))
&& pcum->can_split)
return (NUM_ARG_REGS - nregs) * UNITS_PER_WORD;
return 0;
}
/* Update the data in PCUM to advance over an argument
of mode MODE and data type TYPE.
(TYPE is null for libcalls where that information may not be available.) */
static void
arm_function_arg_advance (cumulative_args_t pcum_v, machine_mode mode,
const_tree type, bool named)
{
CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
if (pcum->pcs_variant <= ARM_PCS_AAPCS_LOCAL)
{
aapcs_layout_arg (pcum, mode, type, named);
if (pcum->aapcs_cprc_slot >= 0)
{
aapcs_cp_arg_layout[pcum->aapcs_cprc_slot].advance (pcum, mode,
type);
pcum->aapcs_cprc_slot = -1;
}
/* Generic stuff. */
pcum->aapcs_arg_processed = false;
pcum->aapcs_ncrn = pcum->aapcs_next_ncrn;
pcum->aapcs_reg = NULL_RTX;
pcum->aapcs_partial = 0;
}
else
{
pcum->nargs += 1;
if (arm_vector_mode_supported_p (mode)
&& pcum->named_count > pcum->nargs
&& TARGET_IWMMXT_ABI)
pcum->iwmmxt_nregs += 1;
else
pcum->nregs += ARM_NUM_REGS2 (mode, type);
}
}
/* Variable sized types are passed by reference. This is a GCC
extension to the ARM ABI. */
static bool
arm_pass_by_reference (cumulative_args_t cum ATTRIBUTE_UNUSED,
machine_mode mode ATTRIBUTE_UNUSED,
const_tree type, bool named ATTRIBUTE_UNUSED)
{
return type && TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST;
}
/* Encode the current state of the #pragma [no_]long_calls. */
typedef enum
{
OFF, /* No #pragma [no_]long_calls is in effect. */
LONG, /* #pragma long_calls is in effect. */
SHORT /* #pragma no_long_calls is in effect. */
} arm_pragma_enum;
static arm_pragma_enum arm_pragma_long_calls = OFF;
void
arm_pr_long_calls (struct cpp_reader * pfile ATTRIBUTE_UNUSED)
{
arm_pragma_long_calls = LONG;
}
void
arm_pr_no_long_calls (struct cpp_reader * pfile ATTRIBUTE_UNUSED)
{
arm_pragma_long_calls = SHORT;
}
void
arm_pr_long_calls_off (struct cpp_reader * pfile ATTRIBUTE_UNUSED)
{
arm_pragma_long_calls = OFF;
}
/* Handle an attribute requiring a FUNCTION_DECL;
arguments as in struct attribute_spec.handler. */
static tree
arm_handle_fndecl_attribute (tree *node, tree name, tree args ATTRIBUTE_UNUSED,
int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
{
if (TREE_CODE (*node) != FUNCTION_DECL)
{
warning (OPT_Wattributes, "%qE attribute only applies to functions",
name);
*no_add_attrs = true;
}
return NULL_TREE;
}
/* Handle an "interrupt" or "isr" attribute;
arguments as in struct attribute_spec.handler. */
static tree
arm_handle_isr_attribute (tree *node, tree name, tree args, int flags,
bool *no_add_attrs)
{
if (DECL_P (*node))
{
if (TREE_CODE (*node) != FUNCTION_DECL)
{
warning (OPT_Wattributes, "%qE attribute only applies to functions",
name);
*no_add_attrs = true;
}
/* FIXME: the argument if any is checked for type attributes;
should it be checked for decl ones? */
}
else
{
if (TREE_CODE (*node) == FUNCTION_TYPE
|| TREE_CODE (*node) == METHOD_TYPE)
{
if (arm_isr_value (args) == ARM_FT_UNKNOWN)
{
warning (OPT_Wattributes, "%qE attribute ignored",
name);
*no_add_attrs = true;
}
}
else if (TREE_CODE (*node) == POINTER_TYPE
&& (TREE_CODE (TREE_TYPE (*node)) == FUNCTION_TYPE
|| TREE_CODE (TREE_TYPE (*node)) == METHOD_TYPE)
&& arm_isr_value (args) != ARM_FT_UNKNOWN)
{
*node = build_variant_type_copy (*node);
TREE_TYPE (*node) = build_type_attribute_variant
(TREE_TYPE (*node),
tree_cons (name, args, TYPE_ATTRIBUTES (TREE_TYPE (*node))));
*no_add_attrs = true;
}
else
{
/* Possibly pass this attribute on from the type to a decl. */
if (flags & ((int) ATTR_FLAG_DECL_NEXT
| (int) ATTR_FLAG_FUNCTION_NEXT
| (int) ATTR_FLAG_ARRAY_NEXT))
{
*no_add_attrs = true;
return tree_cons (name, args, NULL_TREE);
}
else
{
warning (OPT_Wattributes, "%qE attribute ignored",
name);
}
}
}
return NULL_TREE;
}
/* Handle a "pcs" attribute; arguments as in struct
attribute_spec.handler. */
static tree
arm_handle_pcs_attribute (tree *node ATTRIBUTE_UNUSED, tree name, tree args,
int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
{
if (arm_pcs_from_attribute (args) == ARM_PCS_UNKNOWN)
{
warning (OPT_Wattributes, "%qE attribute ignored", name);
*no_add_attrs = true;
}
return NULL_TREE;
}
#if TARGET_DLLIMPORT_DECL_ATTRIBUTES
/* Handle the "notshared" attribute. This attribute is another way of
requesting hidden visibility. ARM's compiler supports
"__declspec(notshared)"; we support the same thing via an
attribute. */
static tree
arm_handle_notshared_attribute (tree *node,
tree name ATTRIBUTE_UNUSED,
tree args ATTRIBUTE_UNUSED,
int flags ATTRIBUTE_UNUSED,
bool *no_add_attrs)
{
tree decl = TYPE_NAME (*node);
if (decl)
{
DECL_VISIBILITY (decl) = VISIBILITY_HIDDEN;
DECL_VISIBILITY_SPECIFIED (decl) = 1;
*no_add_attrs = false;
}
return NULL_TREE;
}
#endif
/* Return 0 if the attributes for two types are incompatible, 1 if they
are compatible, and 2 if they are nearly compatible (which causes a
warning to be generated). */
static int
arm_comp_type_attributes (const_tree type1, const_tree type2)
{
int l1, l2, s1, s2;
/* Check for mismatch of non-default calling convention. */
if (TREE_CODE (type1) != FUNCTION_TYPE)
return 1;
/* Check for mismatched call attributes. */
l1 = lookup_attribute ("long_call", TYPE_ATTRIBUTES (type1)) != NULL;
l2 = lookup_attribute ("long_call", TYPE_ATTRIBUTES (type2)) != NULL;
s1 = lookup_attribute ("short_call", TYPE_ATTRIBUTES (type1)) != NULL;
s2 = lookup_attribute ("short_call", TYPE_ATTRIBUTES (type2)) != NULL;
/* Only bother to check if an attribute is defined. */
if (l1 | l2 | s1 | s2)
{
/* If one type has an attribute, the other must have the same attribute. */
if ((l1 != l2) || (s1 != s2))
return 0;
/* Disallow mixed attributes. */
if ((l1 & s2) || (l2 & s1))
return 0;
}
/* Check for mismatched ISR attribute. */
l1 = lookup_attribute ("isr", TYPE_ATTRIBUTES (type1)) != NULL;
if (! l1)
l1 = lookup_attribute ("interrupt", TYPE_ATTRIBUTES (type1)) != NULL;
l2 = lookup_attribute ("isr", TYPE_ATTRIBUTES (type2)) != NULL;
if (! l2)
l2 = lookup_attribute ("interrupt", TYPE_ATTRIBUTES (type2)) != NULL;
if (l1 != l2)
return 0;
return 1;
}
/* Assigns default attributes to newly defined type. This is used to
set short_call/long_call attributes for function types of
functions defined inside corresponding #pragma scopes. */
static void
arm_set_default_type_attributes (tree type)
{
/* Add __attribute__ ((long_call)) to all functions when inside
#pragma long_calls, or __attribute__ ((short_call)) when inside
#pragma no_long_calls. */
if (TREE_CODE (type) == FUNCTION_TYPE || TREE_CODE (type) == METHOD_TYPE)
{
tree type_attr_list, attr_name;
type_attr_list = TYPE_ATTRIBUTES (type);
if (arm_pragma_long_calls == LONG)
attr_name = get_identifier ("long_call");
else if (arm_pragma_long_calls == SHORT)
attr_name = get_identifier ("short_call");
else
return;
type_attr_list = tree_cons (attr_name, NULL_TREE, type_attr_list);
TYPE_ATTRIBUTES (type) = type_attr_list;
}
}
/* Return true if DECL is known to be linked into section SECTION. */
static bool
arm_function_in_section_p (tree decl, section *section)
{
/* We can only be certain about the prevailing symbol definition. */
if (!decl_binds_to_current_def_p (decl))
return false;
/* If DECL_SECTION_NAME is set, assume it is trustworthy. */
if (!DECL_SECTION_NAME (decl))
{
/* Make sure that we will not create a unique section for DECL. */
if (flag_function_sections || DECL_COMDAT_GROUP (decl))
return false;
}
return function_section (decl) == section;
}
/* Return nonzero if a 32-bit "long_call" should be generated for
a call from the current function to DECL. We generate a long_call
if the function:
a. has an __attribute__ ((long_call))
or b. is within the scope of a #pragma long_calls
or c. the -mlong-calls command line switch has been specified
However we do not generate a long call if the function:
d. has an __attribute__ ((short_call))
or e. is inside the scope of a #pragma no_long_calls
or f. is defined in the same section as the current function. */
bool
arm_is_long_call_p (tree decl)
{
tree attrs;
if (!decl)
return TARGET_LONG_CALLS;
attrs = TYPE_ATTRIBUTES (TREE_TYPE (decl));
if (lookup_attribute ("short_call", attrs))
return false;
/* For "f", be conservative, and only cater for cases in which the
whole of the current function is placed in the same section. */
if (!flag_reorder_blocks_and_partition
&& TREE_CODE (decl) == FUNCTION_DECL
&& arm_function_in_section_p (decl, current_function_section ()))
return false;
if (lookup_attribute ("long_call", attrs))
return true;
return TARGET_LONG_CALLS;
}
/* Return nonzero if it is ok to make a tail-call to DECL. */
static bool
arm_function_ok_for_sibcall (tree decl, tree exp)
{
unsigned long func_type;
if (cfun->machine->sibcall_blocked)
return false;
/* Never tailcall something if we are generating code for Thumb-1. */
if (TARGET_THUMB1)
return false;
/* The PIC register is live on entry to VxWorks PLT entries, so we
must make the call before restoring the PIC register. */
if (TARGET_VXWORKS_RTP && flag_pic && decl && !targetm.binds_local_p (decl))
return false;
/* If we are interworking and the function is not declared static
then we can't tail-call it unless we know that it exists in this
compilation unit (since it might be a Thumb routine). */
if (TARGET_INTERWORK && decl && TREE_PUBLIC (decl)
&& !TREE_ASM_WRITTEN (decl))
return false;
func_type = arm_current_func_type ();
/* Never tailcall from an ISR routine - it needs a special exit sequence. */
if (IS_INTERRUPT (func_type))
return false;
if (!VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
{
/* Check that the return value locations are the same. For
example that we aren't returning a value from the sibling in
a VFP register but then need to transfer it to a core
register. */
rtx a, b;
tree decl_or_type = decl;
/* If it is an indirect function pointer, get the function type. */
if (!decl)
decl_or_type = TREE_TYPE (TREE_TYPE (CALL_EXPR_FN (exp)));
a = arm_function_value (TREE_TYPE (exp), decl_or_type, false);
b = arm_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
cfun->decl, false);
if (!rtx_equal_p (a, b))
return false;
}
/* Never tailcall if function may be called with a misaligned SP. */
if (IS_STACKALIGN (func_type))
return false;
/* The AAPCS says that, on bare-metal, calls to unresolved weak
references should become a NOP. Don't convert such calls into
sibling calls. */
if (TARGET_AAPCS_BASED
&& arm_abi == ARM_ABI_AAPCS
&& decl
&& DECL_WEAK (decl))
return false;
/* Everything else is ok. */
return true;
}
/* Addressing mode support functions. */
/* Return nonzero if X is a legitimate immediate operand when compiling
for PIC. We know that X satisfies CONSTANT_P and flag_pic is true. */
int
legitimate_pic_operand_p (rtx x)
{
if (GET_CODE (x) == SYMBOL_REF
|| (GET_CODE (x) == CONST
&& GET_CODE (XEXP (x, 0)) == PLUS
&& GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
return 0;
return 1;
}
/* Record that the current function needs a PIC register. Initialize
cfun->machine->pic_reg if we have not already done so. */
static void
require_pic_register (void)
{
/* A lot of the logic here is made obscure by the fact that this
routine gets called as part of the rtx cost estimation process.
We don't want those calls to affect any assumptions about the real
function; and further, we can't call entry_of_function() until we
start the real expansion process. */
if (!crtl->uses_pic_offset_table)
{
gcc_assert (can_create_pseudo_p ());
if (arm_pic_register != INVALID_REGNUM
&& !(TARGET_THUMB1 && arm_pic_register > LAST_LO_REGNUM))
{
if (!cfun->machine->pic_reg)
cfun->machine->pic_reg = gen_rtx_REG (Pmode, arm_pic_register);
/* Play games to avoid marking the function as needing pic
if we are being called as part of the cost-estimation
process. */
if (current_ir_type () != IR_GIMPLE || currently_expanding_to_rtl)
crtl->uses_pic_offset_table = 1;
}
else
{
rtx_insn *seq, *insn;
if (!cfun->machine->pic_reg)
cfun->machine->pic_reg = gen_reg_rtx (Pmode);
/* Play games to avoid marking the function as needing pic
if we are being called as part of the cost-estimation
process. */
if (current_ir_type () != IR_GIMPLE || currently_expanding_to_rtl)
{
crtl->uses_pic_offset_table = 1;
start_sequence ();
if (TARGET_THUMB1 && arm_pic_register != INVALID_REGNUM
&& arm_pic_register > LAST_LO_REGNUM)
emit_move_insn (cfun->machine->pic_reg,
gen_rtx_REG (Pmode, arm_pic_register));
else
arm_load_pic_register (0UL);
seq = get_insns ();
end_sequence ();
for (insn = seq; insn; insn = NEXT_INSN (insn))
if (INSN_P (insn))
INSN_LOCATION (insn) = prologue_location;
/* We can be called during expansion of PHI nodes, where
we can't yet emit instructions directly in the final
insn stream. Queue the insns on the entry edge, they will
be committed after everything else is expanded. */
insert_insn_on_edge (seq,
single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (cfun)));
}
}
}
}
rtx
legitimize_pic_address (rtx orig, machine_mode mode, rtx reg)
{
if (GET_CODE (orig) == SYMBOL_REF
|| GET_CODE (orig) == LABEL_REF)
{
rtx insn;
if (reg == 0)
{
gcc_assert (can_create_pseudo_p ());
reg = gen_reg_rtx (Pmode);
}
/* VxWorks does not impose a fixed gap between segments; the run-time
gap can be different from the object-file gap. We therefore can't
use GOTOFF unless we are absolutely sure that the symbol is in the
same segment as the GOT. Unfortunately, the flexibility of linker
scripts means that we can't be sure of that in general, so assume
that GOTOFF is never valid on VxWorks. */
/* References to weak symbols cannot be resolved locally: they
may be overridden by a non-weak definition at link time. */
if ((GET_CODE (orig) == LABEL_REF
|| (GET_CODE (orig) == SYMBOL_REF
&& SYMBOL_REF_LOCAL_P (orig)
&& (SYMBOL_REF_DECL (orig)
? !DECL_WEAK (SYMBOL_REF_DECL (orig)) : 1)))
&& NEED_GOT_RELOC
&& arm_pic_data_is_text_relative)
insn = arm_pic_static_addr (orig, reg);
else
{
rtx pat;
rtx mem;
/* If this function doesn't have a pic register, create one now. */
require_pic_register ();
pat = gen_calculate_pic_address (reg, cfun->machine->pic_reg, orig);
/* Make the MEM as close to a constant as possible. */
mem = SET_SRC (pat);
gcc_assert (MEM_P (mem) && !MEM_VOLATILE_P (mem));
MEM_READONLY_P (mem) = 1;
MEM_NOTRAP_P (mem) = 1;
insn = emit_insn (pat);
}
/* Put a REG_EQUAL note on this insn, so that it can be optimized
by loop. */
set_unique_reg_note (insn, REG_EQUAL, orig);
return reg;
}
else if (GET_CODE (orig) == CONST)
{
rtx base, offset;
if (GET_CODE (XEXP (orig, 0)) == PLUS
&& XEXP (XEXP (orig, 0), 0) == cfun->machine->pic_reg)
return orig;
/* Handle the case where we have: const (UNSPEC_TLS). */
if (GET_CODE (XEXP (orig, 0)) == UNSPEC
&& XINT (XEXP (orig, 0), 1) == UNSPEC_TLS)
return orig;
/* Handle the case where we have:
const (plus (UNSPEC_TLS) (ADDEND)). The ADDEND must be a
CONST_INT. */
if (GET_CODE (XEXP (orig, 0)) == PLUS
&& GET_CODE (XEXP (XEXP (orig, 0), 0)) == UNSPEC
&& XINT (XEXP (XEXP (orig, 0), 0), 1) == UNSPEC_TLS)
{
gcc_assert (CONST_INT_P (XEXP (XEXP (orig, 0), 1)));
return orig;
}
if (reg == 0)
{
gcc_assert (can_create_pseudo_p ());
reg = gen_reg_rtx (Pmode);
}
gcc_assert (GET_CODE (XEXP (orig, 0)) == PLUS);
base = legitimize_pic_address (XEXP (XEXP (orig, 0), 0), Pmode, reg);
offset = legitimize_pic_address (XEXP (XEXP (orig, 0), 1), Pmode,
base == reg ? 0 : reg);
if (CONST_INT_P (offset))
{
/* The base register doesn't really matter, we only want to
test the index for the appropriate mode. */
if (!arm_legitimate_index_p (mode, offset, SET, 0))
{
gcc_assert (can_create_pseudo_p ());
offset = force_reg (Pmode, offset);
}
if (CONST_INT_P (offset))
return plus_constant (Pmode, base, INTVAL (offset));
}
if (GET_MODE_SIZE (mode) > 4
&& (GET_MODE_CLASS (mode) == MODE_INT
|| TARGET_SOFT_FLOAT))
{
emit_insn (gen_addsi3 (reg, base, offset));
return reg;
}
return gen_rtx_PLUS (Pmode, base, offset);
}
return orig;
}
/* Find a spare register to use during the prolog of a function. */
static int
thumb_find_work_register (unsigned long pushed_regs_mask)
{
int reg;
/* Check the argument registers first as these are call-used. The
register allocation order means that sometimes r3 might be used
but earlier argument registers might not, so check them all. */
for (reg = LAST_ARG_REGNUM; reg >= 0; reg --)
if (!df_regs_ever_live_p (reg))
return reg;
/* Before going on to check the call-saved registers we can try a couple
more ways of deducing that r3 is available. The first is when we are
pushing anonymous arguments onto the stack and we have fewer than 4
registers' worth of fixed arguments (*). In this case r3 will be part of
the variable argument list and so we can be sure that it will be
pushed right at the start of the function. Hence it will be available
for the rest of the prologue.
(*): i.e. crtl->args.pretend_args_size is greater than 0. */
if (cfun->machine->uses_anonymous_args
&& crtl->args.pretend_args_size > 0)
return LAST_ARG_REGNUM;
/* The other case is when we have fixed arguments but fewer than 4 registers'
worth. In this case r3 might be used in the body of the function, but
it is not being used to convey an argument into the function. In theory
we could just check crtl->args.size to see how many bytes are
being passed in argument registers, but it seems that it is unreliable.
Sometimes it will have the value 0 when in fact arguments are being
passed. (See testcase execute/20021111-1.c for an example). So we also
check the args_info.nregs field as well. The problem with this field is
that it makes no allowances for arguments that are passed to the
function but which are not used. Hence we could miss an opportunity
when a function has an unused argument in r3. But it is better to be
safe than to be sorry. */
if (! cfun->machine->uses_anonymous_args
&& crtl->args.size >= 0
&& crtl->args.size <= (LAST_ARG_REGNUM * UNITS_PER_WORD)
&& (TARGET_AAPCS_BASED
? crtl->args.info.aapcs_ncrn < 4
: crtl->args.info.nregs < 4))
return LAST_ARG_REGNUM;
/* Otherwise look for a call-saved register that is going to be pushed. */
for (reg = LAST_LO_REGNUM; reg > LAST_ARG_REGNUM; reg --)
if (pushed_regs_mask & (1 << reg))
return reg;
if (TARGET_THUMB2)
{
/* Thumb-2 can use high regs. */
for (reg = FIRST_HI_REGNUM; reg < 15; reg ++)
if (pushed_regs_mask & (1 << reg))
return reg;
}
/* Something went wrong - thumb_compute_save_reg_mask()
should have arranged for a suitable register to be pushed. */
gcc_unreachable ();
}
static GTY(()) int pic_labelno;
/* Generate code to load the PIC register. In thumb mode SCRATCH is a
low register. */
void
arm_load_pic_register (unsigned long saved_regs ATTRIBUTE_UNUSED)
{
rtx l1, labelno, pic_tmp, pic_rtx, pic_reg;
if (crtl->uses_pic_offset_table == 0 || TARGET_SINGLE_PIC_BASE)
return;
gcc_assert (flag_pic);
pic_reg = cfun->machine->pic_reg;
if (TARGET_VXWORKS_RTP)
{
pic_rtx = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE);
pic_rtx = gen_rtx_CONST (Pmode, pic_rtx);
emit_insn (gen_pic_load_addr_32bit (pic_reg, pic_rtx));
emit_insn (gen_rtx_SET (Pmode, pic_reg, gen_rtx_MEM (Pmode, pic_reg)));
pic_tmp = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX);
emit_insn (gen_pic_offset_arm (pic_reg, pic_reg, pic_tmp));
}
else
{
/* We use an UNSPEC rather than a LABEL_REF because this label
never appears in the code stream. */
labelno = GEN_INT (pic_labelno++);
l1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, labelno), UNSPEC_PIC_LABEL);
l1 = gen_rtx_CONST (VOIDmode, l1);
/* On the ARM the PC register contains 'dot + 8' at the time of the
addition, on the Thumb it is 'dot + 4'. */
pic_rtx = plus_constant (Pmode, l1, TARGET_ARM ? 8 : 4);
pic_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, pic_rtx),
UNSPEC_GOTSYM_OFF);
pic_rtx = gen_rtx_CONST (Pmode, pic_rtx);
if (TARGET_32BIT)
{
emit_insn (gen_pic_load_addr_unified (pic_reg, pic_rtx, labelno));
}
else /* TARGET_THUMB1 */
{
if (arm_pic_register != INVALID_REGNUM
&& REGNO (pic_reg) > LAST_LO_REGNUM)
{
/* We will have pushed the pic register, so we should always be
able to find a work register. */
pic_tmp = gen_rtx_REG (SImode,
thumb_find_work_register (saved_regs));
emit_insn (gen_pic_load_addr_thumb1 (pic_tmp, pic_rtx));
emit_insn (gen_movsi (pic_offset_table_rtx, pic_tmp));
emit_insn (gen_pic_add_dot_plus_four (pic_reg, pic_reg, labelno));
}
else if (arm_pic_register != INVALID_REGNUM
&& arm_pic_register > LAST_LO_REGNUM
&& REGNO (pic_reg) <= LAST_LO_REGNUM)
{
emit_insn (gen_pic_load_addr_unified (pic_reg, pic_rtx, labelno));
emit_move_insn (gen_rtx_REG (Pmode, arm_pic_register), pic_reg);
emit_use (gen_rtx_REG (Pmode, arm_pic_register));
}
else
emit_insn (gen_pic_load_addr_unified (pic_reg, pic_rtx, labelno));
}
}
/* Need to emit this whether or not we obey regdecls,
since setjmp/longjmp can cause life info to screw up. */
emit_use (pic_reg);
}
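/* For reference (illustrative only; the exact insns depend on the target
   variant and later passes), the non-VxWorks case above amounts to loading
   the displacement of the GOT from a point just past the load (pc + 8 in
   ARM state, pc + 4 in Thumb state) and then adding the pc, roughly:

	ldr	rPIC, .LPICn		@ GOT_base - (.LPICm + 8)
   .LPICm:
	add	rPIC, pc, rPIC  */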
/* Generate code to load the address of a static var when flag_pic is set. */
static rtx
arm_pic_static_addr (rtx orig, rtx reg)
{
rtx l1, labelno, offset_rtx, insn;
gcc_assert (flag_pic);
/* We use an UNSPEC rather than a LABEL_REF because this label
never appears in the code stream. */
labelno = GEN_INT (pic_labelno++);
l1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, labelno), UNSPEC_PIC_LABEL);
l1 = gen_rtx_CONST (VOIDmode, l1);
/* On the ARM the PC register contains 'dot + 8' at the time of the
addition, on the Thumb it is 'dot + 4'. */
offset_rtx = plus_constant (Pmode, l1, TARGET_ARM ? 8 : 4);
offset_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (2, orig, offset_rtx),
UNSPEC_SYMBOL_OFFSET);
offset_rtx = gen_rtx_CONST (Pmode, offset_rtx);
insn = emit_insn (gen_pic_load_addr_unified (reg, offset_rtx, labelno));
return insn;
}
/* Return nonzero if X is valid as an ARM state addressing register. */
static int
arm_address_register_rtx_p (rtx x, int strict_p)
{
int regno;
if (!REG_P (x))
return 0;
regno = REGNO (x);
if (strict_p)
return ARM_REGNO_OK_FOR_BASE_P (regno);
return (regno <= LAST_ARM_REGNUM
|| regno >= FIRST_PSEUDO_REGISTER
|| regno == FRAME_POINTER_REGNUM
|| regno == ARG_POINTER_REGNUM);
}
/* Return TRUE if this rtx is the difference of a symbol and a label,
and will reduce to a PC-relative relocation in the object file.
Expressions like this can be left alone when generating PIC, rather
than forced through the GOT. */
static int
pcrel_constant_p (rtx x)
{
if (GET_CODE (x) == MINUS)
return symbol_mentioned_p (XEXP (x, 0)) && label_mentioned_p (XEXP (x, 1));
return FALSE;
}
/* Return true if X will surely end up in an index register after next
splitting pass. */
static bool
will_be_in_index_register (const_rtx x)
{
/* arm.md: calculate_pic_address will split this into a register. */
return GET_CODE (x) == UNSPEC && (XINT (x, 1) == UNSPEC_PIC_SYM);
}
/* Return nonzero if X is a valid ARM state address operand. */
int
arm_legitimate_address_outer_p (machine_mode mode, rtx x, RTX_CODE outer,
int strict_p)
{
bool use_ldrd;
enum rtx_code code = GET_CODE (x);
if (arm_address_register_rtx_p (x, strict_p))
return 1;
use_ldrd = (TARGET_LDRD
&& (mode == DImode
|| (mode == DFmode && (TARGET_SOFT_FLOAT || TARGET_VFP))));
if (code == POST_INC || code == PRE_DEC
|| ((code == PRE_INC || code == POST_DEC)
&& (use_ldrd || GET_MODE_SIZE (mode) <= 4)))
return arm_address_register_rtx_p (XEXP (x, 0), strict_p);
else if ((code == POST_MODIFY || code == PRE_MODIFY)
&& arm_address_register_rtx_p (XEXP (x, 0), strict_p)
&& GET_CODE (XEXP (x, 1)) == PLUS
&& rtx_equal_p (XEXP (XEXP (x, 1), 0), XEXP (x, 0)))
{
rtx addend = XEXP (XEXP (x, 1), 1);
/* Don't allow ldrd post-increment by register because it's hard
to fix up invalid register choices. */
if (use_ldrd
&& GET_CODE (x) == POST_MODIFY
&& REG_P (addend))
return 0;
return ((use_ldrd || GET_MODE_SIZE (mode) <= 4)
&& arm_legitimate_index_p (mode, addend, outer, strict_p));
}
/* After reload constants split into minipools will have addresses
from a LABEL_REF. */
else if (reload_completed
&& (code == LABEL_REF
|| (code == CONST
&& GET_CODE (XEXP (x, 0)) == PLUS
&& GET_CODE (XEXP (XEXP (x, 0), 0)) == LABEL_REF
&& CONST_INT_P (XEXP (XEXP (x, 0), 1)))))
return 1;
else if (mode == TImode || (TARGET_NEON && VALID_NEON_STRUCT_MODE (mode)))
return 0;
else if (code == PLUS)
{
rtx xop0 = XEXP (x, 0);
rtx xop1 = XEXP (x, 1);
return ((arm_address_register_rtx_p (xop0, strict_p)
&& ((CONST_INT_P (xop1)
&& arm_legitimate_index_p (mode, xop1, outer, strict_p))
|| (!strict_p && will_be_in_index_register (xop1))))
|| (arm_address_register_rtx_p (xop1, strict_p)
&& arm_legitimate_index_p (mode, xop0, outer, strict_p)));
}
#if 0
/* Reload currently can't handle MINUS, so disable this for now */
else if (GET_CODE (x) == MINUS)
{
rtx xop0 = XEXP (x, 0);
rtx xop1 = XEXP (x, 1);
return (arm_address_register_rtx_p (xop0, strict_p)
&& arm_legitimate_index_p (mode, xop1, outer, strict_p));
}
#endif
else if (GET_MODE_CLASS (mode) != MODE_FLOAT
&& code == SYMBOL_REF
&& CONSTANT_POOL_ADDRESS_P (x)
&& ! (flag_pic
&& symbol_mentioned_p (get_pool_constant (x))
&& ! pcrel_constant_p (get_pool_constant (x))))
return 1;
return 0;
}
/* Return nonzero if X is a valid Thumb-2 address operand. */
static int
thumb2_legitimate_address_p (machine_mode mode, rtx x, int strict_p)
{
bool use_ldrd;
enum rtx_code code = GET_CODE (x);
if (arm_address_register_rtx_p (x, strict_p))
return 1;
use_ldrd = (TARGET_LDRD
&& (mode == DImode
|| (mode == DFmode && (TARGET_SOFT_FLOAT || TARGET_VFP))));
if (code == POST_INC || code == PRE_DEC
|| ((code == PRE_INC || code == POST_DEC)
&& (use_ldrd || GET_MODE_SIZE (mode) <= 4)))
return arm_address_register_rtx_p (XEXP (x, 0), strict_p);
else if ((code == POST_MODIFY || code == PRE_MODIFY)
&& arm_address_register_rtx_p (XEXP (x, 0), strict_p)
&& GET_CODE (XEXP (x, 1)) == PLUS
&& rtx_equal_p (XEXP (XEXP (x, 1), 0), XEXP (x, 0)))
{
/* Thumb-2 only has autoincrement by constant. */
rtx addend = XEXP (XEXP (x, 1), 1);
HOST_WIDE_INT offset;
if (!CONST_INT_P (addend))
return 0;
offset = INTVAL(addend);
if (GET_MODE_SIZE (mode) <= 4)
return (offset > -256 && offset < 256);
return (use_ldrd && offset > -1024 && offset < 1024
&& (offset & 3) == 0);
}
/* After reload constants split into minipools will have addresses
from a LABEL_REF. */
else if (reload_completed
&& (code == LABEL_REF
|| (code == CONST
&& GET_CODE (XEXP (x, 0)) == PLUS
&& GET_CODE (XEXP (XEXP (x, 0), 0)) == LABEL_REF
&& CONST_INT_P (XEXP (XEXP (x, 0), 1)))))
return 1;
else if (mode == TImode || (TARGET_NEON && VALID_NEON_STRUCT_MODE (mode)))
return 0;
else if (code == PLUS)
{
rtx xop0 = XEXP (x, 0);
rtx xop1 = XEXP (x, 1);
return ((arm_address_register_rtx_p (xop0, strict_p)
&& (thumb2_legitimate_index_p (mode, xop1, strict_p)
|| (!strict_p && will_be_in_index_register (xop1))))
|| (arm_address_register_rtx_p (xop1, strict_p)
&& thumb2_legitimate_index_p (mode, xop0, strict_p)));
}
/* Normally we can assign constant values to target registers without
the help of the constant pool. But there are cases where we have to
use the constant pool, for example:
1) assigning a label to a register;
2) sign-extending an 8-bit value to 32 bits and then assigning it to
a register.
A constant pool access of the form:
(set (reg r0) (mem (symbol_ref (".LC0"))))
will cause the use of the literal pool (later, in arm_reorg).
So here we mark such a form as invalid, and the compiler will then
adjust it into:
(set (reg r0) (symbol_ref (".LC0")))
(set (reg r0) (mem (reg r0))).
No extra register is required, and (mem (reg r0)) won't cause the use
of literal pools. */
else if (arm_disable_literal_pool && code == SYMBOL_REF
&& CONSTANT_POOL_ADDRESS_P (x))
return 0;
else if (GET_MODE_CLASS (mode) != MODE_FLOAT
&& code == SYMBOL_REF
&& CONSTANT_POOL_ADDRESS_P (x)
&& ! (flag_pic
&& symbol_mentioned_p (get_pool_constant (x))
&& ! pcrel_constant_p (get_pool_constant (x))))
return 1;
return 0;
}
/* Return nonzero if INDEX is valid for an address index operand in
ARM state. */
static int
arm_legitimate_index_p (machine_mode mode, rtx index, RTX_CODE outer,
int strict_p)
{
HOST_WIDE_INT range;
enum rtx_code code = GET_CODE (index);
/* Standard coprocessor addressing modes. */
if (TARGET_HARD_FLOAT
&& TARGET_VFP
&& (mode == SFmode || mode == DFmode))
return (code == CONST_INT && INTVAL (index) < 1024
&& INTVAL (index) > -1024
&& (INTVAL (index) & 3) == 0);
/* For quad modes, we restrict the constant offset to be slightly less
than what the instruction format permits. We do this because for
quad mode moves, we will actually decompose them into two separate
double-mode reads or writes. INDEX must therefore be a valid
(double-mode) offset and so should INDEX+8. */
if (TARGET_NEON && VALID_NEON_QREG_MODE (mode))
return (code == CONST_INT
&& INTVAL (index) < 1016
&& INTVAL (index) > -1024
&& (INTVAL (index) & 3) == 0);
/* We have no such constraint on double mode offsets, so we permit the
full range of the instruction format. */
if (TARGET_NEON && VALID_NEON_DREG_MODE (mode))
return (code == CONST_INT
&& INTVAL (index) < 1024
&& INTVAL (index) > -1024
&& (INTVAL (index) & 3) == 0);
if (TARGET_REALLY_IWMMXT && VALID_IWMMXT_REG_MODE (mode))
return (code == CONST_INT
&& INTVAL (index) < 1024
&& INTVAL (index) > -1024
&& (INTVAL (index) & 3) == 0);
if (arm_address_register_rtx_p (index, strict_p)
&& (GET_MODE_SIZE (mode) <= 4))
return 1;
if (mode == DImode || mode == DFmode)
{
if (code == CONST_INT)
{
HOST_WIDE_INT val = INTVAL (index);
if (TARGET_LDRD)
return val > -256 && val < 256;
else
return val > -4096 && val < 4092;
}
return TARGET_LDRD && arm_address_register_rtx_p (index, strict_p);
}
if (GET_MODE_SIZE (mode) <= 4
&& ! (arm_arch4
&& (mode == HImode
|| mode == HFmode
|| (mode == QImode && outer == SIGN_EXTEND))))
{
if (code == MULT)
{
rtx xiop0 = XEXP (index, 0);
rtx xiop1 = XEXP (index, 1);
return ((arm_address_register_rtx_p (xiop0, strict_p)
&& power_of_two_operand (xiop1, SImode))
|| (arm_address_register_rtx_p (xiop1, strict_p)
&& power_of_two_operand (xiop0, SImode)));
}
else if (code == LSHIFTRT || code == ASHIFTRT
|| code == ASHIFT || code == ROTATERT)
{
rtx op = XEXP (index, 1);
return (arm_address_register_rtx_p (XEXP (index, 0), strict_p)
&& CONST_INT_P (op)
&& INTVAL (op) > 0
&& INTVAL (op) <= 31);
}
}
/* For ARM v4 we may be doing a sign-extend operation during the
load. */
if (arm_arch4)
{
if (mode == HImode
|| mode == HFmode
|| (outer == SIGN_EXTEND && mode == QImode))
range = 256;
else
range = 4096;
}
else
range = (mode == HImode || mode == HFmode) ? 4095 : 4096;
return (code == CONST_INT
&& INTVAL (index) < range
&& INTVAL (index) > -range);
}
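/* As a rough summary of the above (illustrative, not exhaustive): in ARM
   state a word load may use [Rn, #imm] with -4096 < imm < 4096, or a
   scaled register form such as [Rn, Rm, lsl #2]; halfword and
   sign-extended byte accesses on ARMv4 and later are limited to a +/-255
   immediate and do not accept a shifted index.  */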
/* Return true if OP is a valid index scaling factor for Thumb-2 address
index operand. i.e. 1, 2, 4 or 8. */
static bool
thumb2_index_mul_operand (rtx op)
{
HOST_WIDE_INT val;
if (!CONST_INT_P (op))
return false;
val = INTVAL(op);
return (val == 1 || val == 2 || val == 4 || val == 8);
}
/* Return nonzero if INDEX is a valid Thumb-2 address index operand. */
static int
thumb2_legitimate_index_p (machine_mode mode, rtx index, int strict_p)
{
enum rtx_code code = GET_CODE (index);
/* ??? Combine arm and thumb2 coprocessor addressing modes. */
/* Standard coprocessor addressing modes. */
if (TARGET_HARD_FLOAT
&& TARGET_VFP
&& (mode == SFmode || mode == DFmode))
return (code == CONST_INT && INTVAL (index) < 1024
/* Thumb-2 allows only a > -256 index range for its core register
load/stores. Since we allow SF/DF in core registers, we have
to use the intersection between -256~4096 (core) and -1024~1024
(coprocessor). */
&& INTVAL (index) > -256
&& (INTVAL (index) & 3) == 0);
if (TARGET_REALLY_IWMMXT && VALID_IWMMXT_REG_MODE (mode))
{
/* For DImode assume values will usually live in core regs
and only allow LDRD addressing modes. */
if (!TARGET_LDRD || mode != DImode)
return (code == CONST_INT
&& INTVAL (index) < 1024
&& INTVAL (index) > -1024
&& (INTVAL (index) & 3) == 0);
}
/* For quad modes, we restrict the constant offset to be slightly less
than what the instruction format permits. We do this because for
quad mode moves, we will actually decompose them into two separate
double-mode reads or writes. INDEX must therefore be a valid
(double-mode) offset and so should INDEX+8. */
if (TARGET_NEON && VALID_NEON_QREG_MODE (mode))
return (code == CONST_INT
&& INTVAL (index) < 1016
&& INTVAL (index) > -1024
&& (INTVAL (index) & 3) == 0);
/* We have no such constraint on double mode offsets, so we permit the
full range of the instruction format. */
if (TARGET_NEON && VALID_NEON_DREG_MODE (mode))
return (code == CONST_INT
&& INTVAL (index) < 1024
&& INTVAL (index) > -1024
&& (INTVAL (index) & 3) == 0);
if (arm_address_register_rtx_p (index, strict_p)
&& (GET_MODE_SIZE (mode) <= 4))
return 1;
if (mode == DImode || mode == DFmode)
{
if (code == CONST_INT)
{
HOST_WIDE_INT val = INTVAL (index);
/* ??? Can we assume ldrd for thumb2? */
/* Thumb-2 ldrd only has reg+const addressing modes. */
/* ldrd supports offsets of +-1020.
However the ldr fallback does not. */
return val > -256 && val < 256 && (val & 3) == 0;
}
else
return 0;
}
if (code == MULT)
{
rtx xiop0 = XEXP (index, 0);
rtx xiop1 = XEXP (index, 1);
return ((arm_address_register_rtx_p (xiop0, strict_p)
&& thumb2_index_mul_operand (xiop1))
|| (arm_address_register_rtx_p (xiop1, strict_p)
&& thumb2_index_mul_operand (xiop0)));
}
else if (code == ASHIFT)
{
rtx op = XEXP (index, 1);
return (arm_address_register_rtx_p (XEXP (index, 0), strict_p)
&& CONST_INT_P (op)
&& INTVAL (op) > 0
&& INTVAL (op) <= 3);
}
return (code == CONST_INT
&& INTVAL (index) < 4096
&& INTVAL (index) > -256);
}
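/* Illustrative summary of the Thumb-2 cases above: plain loads accept a
   register index scaled by 1, 2, 4 or 8 (e.g. [Rn, Rm, lsl #2]), or an
   immediate in the asymmetric range (-256, 4096); DImode/DFmode falls
   back to an ldrd-style immediate that is a multiple of 4 strictly
   within +/-256.  */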
/* Return nonzero if X is valid as a 16-bit Thumb state base register. */
static int
thumb1_base_register_rtx_p (rtx x, machine_mode mode, int strict_p)
{
int regno;
if (!REG_P (x))
return 0;
regno = REGNO (x);
if (strict_p)
return THUMB1_REGNO_MODE_OK_FOR_BASE_P (regno, mode);
return (regno <= LAST_LO_REGNUM
|| regno > LAST_VIRTUAL_REGISTER
|| regno == FRAME_POINTER_REGNUM
|| (GET_MODE_SIZE (mode) >= 4
&& (regno == STACK_POINTER_REGNUM
|| regno >= FIRST_PSEUDO_REGISTER
|| x == hard_frame_pointer_rtx
|| x == arg_pointer_rtx)));
}
/* Return nonzero if x is a legitimate index register. This is the case
for any base register that can access a QImode object. */
inline static int
thumb1_index_register_rtx_p (rtx x, int strict_p)
{
return thumb1_base_register_rtx_p (x, QImode, strict_p);
}
/* Return nonzero if x is a legitimate 16-bit Thumb-state address.
The AP may be eliminated to either the SP or the FP, so we use the
least common denominator, e.g. SImode, and offsets from 0 to 64.
??? Verify whether the above is the right approach.
??? Also, the FP may be eliminated to the SP, so perhaps that
needs special handling also.
??? Look at how the mips16 port solves this problem. It probably uses
better ways to solve some of these problems.
Although it is not incorrect, we don't accept QImode and HImode
addresses based on the frame pointer or arg pointer until the
reload pass starts. This is so that eliminating such addresses
into stack based ones won't produce impossible code. */
int
thumb1_legitimate_address_p (machine_mode mode, rtx x, int strict_p)
{
/* ??? Not clear if this is right. Experiment. */
if (GET_MODE_SIZE (mode) < 4
&& !(reload_in_progress || reload_completed)
&& (reg_mentioned_p (frame_pointer_rtx, x)
|| reg_mentioned_p (arg_pointer_rtx, x)
|| reg_mentioned_p (virtual_incoming_args_rtx, x)
|| reg_mentioned_p (virtual_outgoing_args_rtx, x)
|| reg_mentioned_p (virtual_stack_dynamic_rtx, x)
|| reg_mentioned_p (virtual_stack_vars_rtx, x)))
return 0;
/* Accept any base register. SP only in SImode or larger. */
else if (thumb1_base_register_rtx_p (x, mode, strict_p))
return 1;
/* This is PC relative data before arm_reorg runs. */
else if (GET_MODE_SIZE (mode) >= 4 && CONSTANT_P (x)
&& GET_CODE (x) == SYMBOL_REF
&& CONSTANT_POOL_ADDRESS_P (x) && !flag_pic)
return 1;
/* This is PC relative data after arm_reorg runs. */
else if ((GET_MODE_SIZE (mode) >= 4 || mode == HFmode)
&& reload_completed
&& (GET_CODE (x) == LABEL_REF
|| (GET_CODE (x) == CONST
&& GET_CODE (XEXP (x, 0)) == PLUS
&& GET_CODE (XEXP (XEXP (x, 0), 0)) == LABEL_REF
&& CONST_INT_P (XEXP (XEXP (x, 0), 1)))))
return 1;
/* Post-inc indexing only supported for SImode and larger. */
else if (GET_CODE (x) == POST_INC && GET_MODE_SIZE (mode) >= 4
&& thumb1_index_register_rtx_p (XEXP (x, 0), strict_p))
return 1;
else if (GET_CODE (x) == PLUS)
{
/* REG+REG address can be any two index registers. */
/* We disallow FRAME+REG addressing since we know that FRAME
will be replaced with STACK, and SP relative addressing only
permits SP+OFFSET. */
if (GET_MODE_SIZE (mode) <= 4
&& XEXP (x, 0) != frame_pointer_rtx
&& XEXP (x, 1) != frame_pointer_rtx
&& thumb1_index_register_rtx_p (XEXP (x, 0), strict_p)
&& (thumb1_index_register_rtx_p (XEXP (x, 1), strict_p)
|| (!strict_p && will_be_in_index_register (XEXP (x, 1)))))
return 1;
/* REG+const has 5-7 bit offset for non-SP registers. */
else if ((thumb1_index_register_rtx_p (XEXP (x, 0), strict_p)
|| XEXP (x, 0) == arg_pointer_rtx)
&& CONST_INT_P (XEXP (x, 1))
&& thumb_legitimate_offset_p (mode, INTVAL (XEXP (x, 1))))
return 1;
/* REG+const has 10-bit offset for SP, but only SImode and
larger is supported. */
/* ??? Should probably check for DI/DFmode overflow here
just like GO_IF_LEGITIMATE_OFFSET does. */
else if (REG_P (XEXP (x, 0))
&& REGNO (XEXP (x, 0)) == STACK_POINTER_REGNUM
&& GET_MODE_SIZE (mode) >= 4
&& CONST_INT_P (XEXP (x, 1))
&& INTVAL (XEXP (x, 1)) >= 0
&& INTVAL (XEXP (x, 1)) + GET_MODE_SIZE (mode) <= 1024
&& (INTVAL (XEXP (x, 1)) & 3) == 0)
return 1;
else if (REG_P (XEXP (x, 0))
&& (REGNO (XEXP (x, 0)) == FRAME_POINTER_REGNUM
|| REGNO (XEXP (x, 0)) == ARG_POINTER_REGNUM
|| (REGNO (XEXP (x, 0)) >= FIRST_VIRTUAL_REGISTER
&& REGNO (XEXP (x, 0))
<= LAST_VIRTUAL_POINTER_REGISTER))
&& GET_MODE_SIZE (mode) >= 4
&& CONST_INT_P (XEXP (x, 1))
&& (INTVAL (XEXP (x, 1)) & 3) == 0)
return 1;
}
else if (GET_MODE_CLASS (mode) != MODE_FLOAT
&& GET_MODE_SIZE (mode) == 4
&& GET_CODE (x) == SYMBOL_REF
&& CONSTANT_POOL_ADDRESS_P (x)
&& ! (flag_pic
&& symbol_mentioned_p (get_pool_constant (x))
&& ! pcrel_constant_p (get_pool_constant (x))))
return 1;
return 0;
}
/* Return nonzero if VAL can be used as an offset in a Thumb-state address
instruction of mode MODE. */
int
thumb_legitimate_offset_p (machine_mode mode, HOST_WIDE_INT val)
{
switch (GET_MODE_SIZE (mode))
{
case 1:
return val >= 0 && val < 32;
case 2:
return val >= 0 && val < 64 && (val & 1) == 0;
default:
return (val >= 0
&& (val + GET_MODE_SIZE (mode)) <= 128
&& (val & 3) == 0);
}
}
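/* Worked examples of the ranges above: QImode accepts offsets 0..31,
   HImode accepts even offsets 0..62, and SImode accepts multiples of 4
   from 0..124 (offset + size must not exceed 128).  */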
bool
arm_legitimate_address_p (machine_mode mode, rtx x, bool strict_p)
{
if (TARGET_ARM)
return arm_legitimate_address_outer_p (mode, x, SET, strict_p);
else if (TARGET_THUMB2)
return thumb2_legitimate_address_p (mode, x, strict_p);
else /* if (TARGET_THUMB1) */
return thumb1_legitimate_address_p (mode, x, strict_p);
}
/* Worker function for TARGET_PREFERRED_RELOAD_CLASS.
Given an rtx X being reloaded into a reg required to be
in class CLASS, return the class of reg to actually use.
In general this is just CLASS, but for the Thumb core registers and
immediate constants we prefer a LO_REGS class or a subset. */
static reg_class_t
arm_preferred_reload_class (rtx x ATTRIBUTE_UNUSED, reg_class_t rclass)
{
if (TARGET_32BIT)
return rclass;
else
{
if (rclass == GENERAL_REGS)
return LO_REGS;
else
return rclass;
}
}
/* Build the SYMBOL_REF for __tls_get_addr. */
static GTY(()) rtx tls_get_addr_libfunc;
static rtx
get_tls_get_addr (void)
{
if (!tls_get_addr_libfunc)
tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
return tls_get_addr_libfunc;
}
rtx
arm_load_tp (rtx target)
{
if (!target)
target = gen_reg_rtx (SImode);
if (TARGET_HARD_TP)
{
/* Can return in any reg. */
emit_insn (gen_load_tp_hard (target));
}
else
{
/* Always returned in r0. Immediately copy the result into a pseudo,
otherwise other uses of r0 (e.g. setting up function arguments) may
clobber the value. */
rtx tmp;
emit_insn (gen_load_tp_soft ());
tmp = gen_rtx_REG (SImode, R0_REGNUM);
emit_move_insn (target, tmp);
}
return target;
}
static rtx
load_tls_operand (rtx x, rtx reg)
{
rtx tmp;
if (reg == NULL_RTX)
reg = gen_reg_rtx (SImode);
tmp = gen_rtx_CONST (SImode, x);
emit_move_insn (reg, tmp);
return reg;
}
static rtx
arm_call_tls_get_addr (rtx x, rtx reg, rtx *valuep, int reloc)
{
rtx insns, label, labelno, sum;
gcc_assert (reloc != TLS_DESCSEQ);
start_sequence ();
labelno = GEN_INT (pic_labelno++);
label = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, labelno), UNSPEC_PIC_LABEL);
label = gen_rtx_CONST (VOIDmode, label);
sum = gen_rtx_UNSPEC (Pmode,
gen_rtvec (4, x, GEN_INT (reloc), label,
GEN_INT (TARGET_ARM ? 8 : 4)),
UNSPEC_TLS);
reg = load_tls_operand (sum, reg);
if (TARGET_ARM)
emit_insn (gen_pic_add_dot_plus_eight (reg, reg, labelno));
else
emit_insn (gen_pic_add_dot_plus_four (reg, reg, labelno));
*valuep = emit_library_call_value (get_tls_get_addr (), NULL_RTX,
LCT_PURE, /* LCT_CONST? */
Pmode, 1, reg, Pmode);
insns = get_insns ();
end_sequence ();
return insns;
}
static rtx
arm_tls_descseq_addr (rtx x, rtx reg)
{
rtx labelno = GEN_INT (pic_labelno++);
rtx label = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, labelno), UNSPEC_PIC_LABEL);
rtx sum = gen_rtx_UNSPEC (Pmode,
gen_rtvec (4, x, GEN_INT (TLS_DESCSEQ),
gen_rtx_CONST (VOIDmode, label),
GEN_INT (!TARGET_ARM)),
UNSPEC_TLS);
rtx reg0 = load_tls_operand (sum, gen_rtx_REG (SImode, R0_REGNUM));
emit_insn (gen_tlscall (x, labelno));
if (!reg)
reg = gen_reg_rtx (SImode);
else
gcc_assert (REGNO (reg) != R0_REGNUM);
emit_move_insn (reg, reg0);
return reg;
}
rtx
legitimize_tls_address (rtx x, rtx reg)
{
rtx dest, tp, label, labelno, sum, insns, ret, eqv, addend;
unsigned int model = SYMBOL_REF_TLS_MODEL (x);
switch (model)
{
case TLS_MODEL_GLOBAL_DYNAMIC:
if (TARGET_GNU2_TLS)
{
reg = arm_tls_descseq_addr (x, reg);
tp = arm_load_tp (NULL_RTX);
dest = gen_rtx_PLUS (Pmode, tp, reg);
}
else
{
/* Original scheme */
insns = arm_call_tls_get_addr (x, reg, &ret, TLS_GD32);
dest = gen_reg_rtx (Pmode);
emit_libcall_block (insns, dest, ret, x);
}
return dest;
case TLS_MODEL_LOCAL_DYNAMIC:
if (TARGET_GNU2_TLS)
{
reg = arm_tls_descseq_addr (x, reg);
tp = arm_load_tp (NULL_RTX);
dest = gen_rtx_PLUS (Pmode, tp, reg);
}
else
{
insns = arm_call_tls_get_addr (x, reg, &ret, TLS_LDM32);
/* Attach a unique REG_EQUIV, to allow the RTL optimizers to
share the LDM result with other LD model accesses. */
eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const1_rtx),
UNSPEC_TLS);
dest = gen_reg_rtx (Pmode);
emit_libcall_block (insns, dest, ret, eqv);
/* Load the addend. */
addend = gen_rtx_UNSPEC (Pmode, gen_rtvec (2, x,
GEN_INT (TLS_LDO32)),
UNSPEC_TLS);
addend = force_reg (SImode, gen_rtx_CONST (SImode, addend));
dest = gen_rtx_PLUS (Pmode, dest, addend);
}
return dest;
case TLS_MODEL_INITIAL_EXEC:
labelno = GEN_INT (pic_labelno++);
label = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, labelno), UNSPEC_PIC_LABEL);
label = gen_rtx_CONST (VOIDmode, label);
sum = gen_rtx_UNSPEC (Pmode,
gen_rtvec (4, x, GEN_INT (TLS_IE32), label,
GEN_INT (TARGET_ARM ? 8 : 4)),
UNSPEC_TLS);
reg = load_tls_operand (sum, reg);
if (TARGET_ARM)
emit_insn (gen_tls_load_dot_plus_eight (reg, reg, labelno));
else if (TARGET_THUMB2)
emit_insn (gen_tls_load_dot_plus_four (reg, NULL, reg, labelno));
else
{
emit_insn (gen_pic_add_dot_plus_four (reg, reg, labelno));
emit_move_insn (reg, gen_const_mem (SImode, reg));
}
tp = arm_load_tp (NULL_RTX);
return gen_rtx_PLUS (Pmode, tp, reg);
case TLS_MODEL_LOCAL_EXEC:
tp = arm_load_tp (NULL_RTX);
reg = gen_rtx_UNSPEC (Pmode,
gen_rtvec (2, x, GEN_INT (TLS_LE32)),
UNSPEC_TLS);
reg = force_reg (SImode, gen_rtx_CONST (SImode, reg));
return gen_rtx_PLUS (Pmode, tp, reg);
default:
abort ();
}
}
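/* Informal summary of the sequences built above: local-exec reduces to
   thread-pointer + link-time constant (tp + tpoff); initial-exec first
   loads that offset from the GOT; the two dynamic models obtain the
   address by calling __tls_get_addr, either directly (the traditional
   GD/LD scheme) or via the TLS descriptor sequence when TARGET_GNU2_TLS.  */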
/* Try machine-dependent ways of modifying an illegitimate address
to be legitimate. If we find one, return the new, valid address. */
rtx
arm_legitimize_address (rtx x, rtx orig_x, machine_mode mode)
{
if (arm_tls_referenced_p (x))
{
rtx addend = NULL;
if (GET_CODE (x) == CONST && GET_CODE (XEXP (x, 0)) == PLUS)
{
addend = XEXP (XEXP (x, 0), 1);
x = XEXP (XEXP (x, 0), 0);
}
if (GET_CODE (x) != SYMBOL_REF)
return x;
gcc_assert (SYMBOL_REF_TLS_MODEL (x) != 0);
x = legitimize_tls_address (x, NULL_RTX);
if (addend)
{
x = gen_rtx_PLUS (SImode, x, addend);
orig_x = x;
}
else
return x;
}
if (!TARGET_ARM)
{
/* TODO: legitimize_address for Thumb2. */
if (TARGET_THUMB2)
return x;
return thumb_legitimize_address (x, orig_x, mode);
}
if (GET_CODE (x) == PLUS)
{
rtx xop0 = XEXP (x, 0);
rtx xop1 = XEXP (x, 1);
if (CONSTANT_P (xop0) && !symbol_mentioned_p (xop0))
xop0 = force_reg (SImode, xop0);
if (CONSTANT_P (xop1) && !CONST_INT_P (xop1)
&& !symbol_mentioned_p (xop1))
xop1 = force_reg (SImode, xop1);
if (ARM_BASE_REGISTER_RTX_P (xop0)
&& CONST_INT_P (xop1))
{
HOST_WIDE_INT n, low_n;
rtx base_reg, val;
n = INTVAL (xop1);
/* VFP addressing modes actually allow greater offsets, but for
now we just stick with the lowest common denominator. */
if (mode == DImode
|| ((TARGET_SOFT_FLOAT || TARGET_VFP) && mode == DFmode))
{
low_n = n & 0x0f;
n &= ~0x0f;
if (low_n > 4)
{
n += 16;
low_n -= 16;
}
}
else
{
low_n = ((mode) == TImode ? 0
: n >= 0 ? (n & 0xfff) : -((-n) & 0xfff));
n -= low_n;
}
base_reg = gen_reg_rtx (SImode);
val = force_operand (plus_constant (Pmode, xop0, n), NULL_RTX);
emit_move_insn (base_reg, val);
x = plus_constant (Pmode, base_reg, low_n);
}
else if (xop0 != XEXP (x, 0) || xop1 != XEXP (x, 1))
x = gen_rtx_PLUS (SImode, xop0, xop1);
}
/* XXX We don't allow MINUS any more -- see comment in
arm_legitimate_address_outer_p (). */
else if (GET_CODE (x) == MINUS)
{
rtx xop0 = XEXP (x, 0);
rtx xop1 = XEXP (x, 1);
if (CONSTANT_P (xop0))
xop0 = force_reg (SImode, xop0);
if (CONSTANT_P (xop1) && ! symbol_mentioned_p (xop1))
xop1 = force_reg (SImode, xop1);
if (xop0 != XEXP (x, 0) || xop1 != XEXP (x, 1))
x = gen_rtx_MINUS (SImode, xop0, xop1);
}
/* Make sure to take full advantage of the pre-indexed addressing mode
with absolute addresses which often allows for the base register to
be factorized for multiple adjacent memory references, and it might
even allow the minipool to be avoided entirely. */
else if (CONST_INT_P (x) && optimize > 0)
{
unsigned int bits;
HOST_WIDE_INT mask, base, index;
rtx base_reg;
/* ldr and ldrb can use a 12-bit index, ldrsb and the rest can only
use an 8-bit index. So let's use a 12-bit index for SImode only and
hope that arm_gen_constant will enable ldrb to use more bits. */
bits = (mode == SImode) ? 12 : 8;
mask = (1 << bits) - 1;
base = INTVAL (x) & ~mask;
index = INTVAL (x) & mask;
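/* For example, with SImode (a 12-bit index) and x == 0xF123: base becomes
   0xF000 and index 0x123; only four bits are set in the base, which is
   below the threshold tested next, so the address is rewritten as
   (reg holding 0xF000) + 0x123.  */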
if (bit_count (base & 0xffffffff) > (32 - bits)/2)
{
/* It'll most probably be more efficient to generate the base
with more bits set and use a negative index instead. */
base |= mask;
index -= mask;
}
base_reg = force_reg (SImode, GEN_INT (base));
x = plus_constant (Pmode, base_reg, index);
}
if (flag_pic)
{
/* We need to find and carefully transform any SYMBOL and LABEL
references; so go back to the original address expression. */
rtx new_x = legitimize_pic_address (orig_x, mode, NULL_RTX);
if (new_x != orig_x)
x = new_x;
}
return x;
}
/* Try machine-dependent ways of modifying an illegitimate Thumb address
to be legitimate. If we find one, return the new, valid address. */
rtx
thumb_legitimize_address (rtx x, rtx orig_x, machine_mode mode)
{
if (GET_CODE (x) == PLUS
&& CONST_INT_P (XEXP (x, 1))
&& (INTVAL (XEXP (x, 1)) >= 32 * GET_MODE_SIZE (mode)
|| INTVAL (XEXP (x, 1)) < 0))
{
rtx xop0 = XEXP (x, 0);
rtx xop1 = XEXP (x, 1);
HOST_WIDE_INT offset = INTVAL (xop1);
/* Try to fold the offset into a biasing of the base register and
then offsetting that. Don't do this when optimizing for space
since it can cause too many CSEs. */
if (optimize_size && offset >= 0
&& offset < 256 + 31 * GET_MODE_SIZE (mode))
{
HOST_WIDE_INT delta;
if (offset >= 256)
delta = offset - (256 - GET_MODE_SIZE (mode));
else if (offset < 32 * GET_MODE_SIZE (mode) + 8)
delta = 31 * GET_MODE_SIZE (mode);
else
delta = offset & (~31 * GET_MODE_SIZE (mode));
xop0 = force_operand (plus_constant (Pmode, xop0, offset - delta),
NULL_RTX);
x = plus_constant (Pmode, xop0, delta);
}
else if (offset < 0 && offset > -256)
/* Small negative offsets are best done with a subtract before the
dereference, forcing these into a register normally takes two
instructions. */
x = force_operand (x, NULL_RTX);
else
{
/* For the remaining cases, force the constant into a register. */
xop1 = force_reg (SImode, xop1);
x = gen_rtx_PLUS (SImode, xop0, xop1);
}
}
else if (GET_CODE (x) == PLUS
&& s_register_operand (XEXP (x, 1), SImode)
&& !s_register_operand (XEXP (x, 0), SImode))
{
rtx xop0 = force_operand (XEXP (x, 0), NULL_RTX);
x = gen_rtx_PLUS (SImode, xop0, XEXP (x, 1));
}
if (flag_pic)
{
/* We need to find and carefully transform any SYMBOL and LABEL
references; so go back to the original address expression. */
rtx new_x = legitimize_pic_address (orig_x, mode, NULL_RTX);
if (new_x != orig_x)
x = new_x;
}
return x;
}
bool
arm_legitimize_reload_address (rtx *p,
machine_mode mode,
int opnum, int type,
int ind_levels ATTRIBUTE_UNUSED)
{
/* We must recognize output that we have already generated ourselves. */
if (GET_CODE (*p) == PLUS
&& GET_CODE (XEXP (*p, 0)) == PLUS
&& REG_P (XEXP (XEXP (*p, 0), 0))
&& CONST_INT_P (XEXP (XEXP (*p, 0), 1))
&& CONST_INT_P (XEXP (*p, 1)))
{
push_reload (XEXP (*p, 0), NULL_RTX, &XEXP (*p, 0), NULL,
MODE_BASE_REG_CLASS (mode), GET_MODE (*p),
VOIDmode, 0, 0, opnum, (enum reload_type) type);
return true;
}
if (GET_CODE (*p) == PLUS
&& REG_P (XEXP (*p, 0))
&& ARM_REGNO_OK_FOR_BASE_P (REGNO (XEXP (*p, 0)))
/* If the base register is equivalent to a constant, let the generic
code handle it. Otherwise we will run into problems if a future
reload pass decides to rematerialize the constant. */
&& !reg_equiv_constant (ORIGINAL_REGNO (XEXP (*p, 0)))
&& CONST_INT_P (XEXP (*p, 1)))
{
HOST_WIDE_INT val = INTVAL (XEXP (*p, 1));
HOST_WIDE_INT low, high;
/* Detect coprocessor load/stores. */
bool coproc_p = ((TARGET_HARD_FLOAT
&& TARGET_VFP
&& (mode == SFmode || mode == DFmode))
|| (TARGET_REALLY_IWMMXT
&& VALID_IWMMXT_REG_MODE (mode))
|| (TARGET_NEON
&& (VALID_NEON_DREG_MODE (mode)
|| VALID_NEON_QREG_MODE (mode))));
/* For some conditions, bail out when lower two bits are unaligned. */
if ((val & 0x3) != 0
/* Coprocessor load/store indexes are 8-bits + '00' appended. */
&& (coproc_p
/* For DI, and DF under soft-float: */
|| ((mode == DImode || mode == DFmode)
/* Without ldrd, we use stm/ldm, which does not
fare well with unaligned bits. */
&& (! TARGET_LDRD
/* Thumb-2 ldrd/strd is [-1020,+1020] in steps of 4. */
|| TARGET_THUMB2))))
return false;
/* When breaking down a [reg+index] reload address into [(reg+high)+low],
of which the (reg+high) gets turned into a reload add insn,
we try to decompose the index into high/low values that can often
also lead to better reload CSE.
For example:
ldr r0, [r2, #4100] // Offset too large
ldr r1, [r2, #4104] // Offset too large
is best reloaded as:
add t1, r2, #4096
ldr r0, [t1, #4]
add t2, r2, #4096
ldr r1, [t2, #8]
which post-reload CSE can simplify in most cases to eliminate the
second add instruction:
add t1, r2, #4096
ldr r0, [t1, #4]
ldr r1, [t1, #8]
The idea here is that we want to split out the bits of the constant
as a mask, rather than as subtracting the maximum offset that the
respective type of load/store used can handle.
When encountering negative offsets, we can still utilize it even if
the overall offset is positive; sometimes this may lead to an immediate
that can be constructed with fewer instructions.
For example:
ldr r0, [r2, #0x3FFFFC]
This is best reloaded as:
add t1, r2, #0x400000
ldr r0, [t1, #-4]
The trick for spotting this for a load insn with N bits of offset
(i.e. bits N-1:0) is to look at bit N; if it is set, then choose a
negative offset that is going to make bit N and all the bits below
it become zero in the remainder part.
The SIGN_MAG_LOW_ADDR_BITS macro below implements this, with respect
to sign-magnitude addressing (i.e. separate +- bit, or 1's complement),
used in most cases of ARM load/store instructions. */
#define SIGN_MAG_LOW_ADDR_BITS(VAL, N) \
(((VAL) & ((1 << (N)) - 1)) \
? (((VAL) & ((1 << ((N) + 1)) - 1)) ^ (1 << (N))) - (1 << (N)) \
: 0)
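/* For instance, applying this to the 0x3FFFFC example above with N == 12:
   low = ((0x3FFFFC & 0x1FFF) ^ 0x1000) - 0x1000 = 0xFFC - 0x1000 = -4,
   and high = val - low = 0x400000, which can be added in a single
   instruction.  */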
if (coproc_p)
{
low = SIGN_MAG_LOW_ADDR_BITS (val, 10);
/* NEON quad-word load/stores are made of two double-word accesses,
so the valid index range is reduced by 8. Treat as 9-bit range if
we go over it. */
if (TARGET_NEON && VALID_NEON_QREG_MODE (mode) && low >= 1016)
low = SIGN_MAG_LOW_ADDR_BITS (val, 9);
}
else if (GET_MODE_SIZE (mode) == 8)
{
if (TARGET_LDRD)
low = (TARGET_THUMB2
? SIGN_MAG_LOW_ADDR_BITS (val, 10)
: SIGN_MAG_LOW_ADDR_BITS (val, 8));
else
/* For pre-ARMv5TE (without ldrd), we use ldm/stm(db/da/ib)
to access doublewords. The supported load/store offsets are
-8, -4, and 4, which we try to produce here. */
low = ((val & 0xf) ^ 0x8) - 0x8;
}
else if (GET_MODE_SIZE (mode) < 8)
{
/* NEON element load/stores do not have an offset. */
if (TARGET_NEON_FP16 && mode == HFmode)
return false;
if (TARGET_THUMB2)
{
/* Thumb-2 has an asymmetrical index range of (-256,4096).
Try the wider 12-bit range first, and re-try if the result
is out of range. */
low = SIGN_MAG_LOW_ADDR_BITS (val, 12);
if (low < -255)
low = SIGN_MAG_LOW_ADDR_BITS (val, 8);
}
else
{
if (mode == HImode || mode == HFmode)
{
if (arm_arch4)
low = SIGN_MAG_LOW_ADDR_BITS (val, 8);
else
{
/* The storehi/movhi_bytes fallbacks can use only
[-4094,+4094] of the full ldrb/strb index range. */
low = SIGN_MAG_LOW_ADDR_BITS (val, 12);
if (low == 4095 || low == -4095)
return false;
}
}
else
low = SIGN_MAG_LOW_ADDR_BITS (val, 12);
}
}
else
return false;
high = ((((val - low) & (unsigned HOST_WIDE_INT) 0xffffffff)
^ (unsigned HOST_WIDE_INT) 0x80000000)
- (unsigned HOST_WIDE_INT) 0x80000000);
/* Check for overflow or zero */
if (low == 0 || high == 0 || (high + low != val))
return false;
/* Reload the high part into a base reg; leave the low part
in the mem.
Note that replacing this gen_rtx_PLUS with plus_constant is
wrong in this case because we rely on the
(plus (plus reg c1) c2) structure being preserved so that
XEXP (*p, 0) in push_reload below uses the correct term. */
*p = gen_rtx_PLUS (GET_MODE (*p),
gen_rtx_PLUS (GET_MODE (*p), XEXP (*p, 0),
GEN_INT (high)),
GEN_INT (low));
push_reload (XEXP (*p, 0), NULL_RTX, &XEXP (*p, 0), NULL,
MODE_BASE_REG_CLASS (mode), GET_MODE (*p),
VOIDmode, 0, 0, opnum, (enum reload_type) type);
return true;
}
return false;
}
rtx
thumb_legitimize_reload_address (rtx *x_p,
machine_mode mode,
int opnum, int type,
int ind_levels ATTRIBUTE_UNUSED)
{
rtx x = *x_p;
if (GET_CODE (x) == PLUS
&& GET_MODE_SIZE (mode) < 4
&& REG_P (XEXP (x, 0))
&& XEXP (x, 0) == stack_pointer_rtx
&& CONST_INT_P (XEXP (x, 1))
&& !thumb_legitimate_offset_p (mode, INTVAL (XEXP (x, 1))))
{
rtx orig_x = x;
x = copy_rtx (x);
push_reload (orig_x, NULL_RTX, x_p, NULL, MODE_BASE_REG_CLASS (mode),
Pmode, VOIDmode, 0, 0, opnum, (enum reload_type) type);
return x;
}
/* If both registers are hi-regs, then it's better to reload the
entire expression rather than each register individually. That
only requires one reload register rather than two. */
if (GET_CODE (x) == PLUS
&& REG_P (XEXP (x, 0))
&& REG_P (XEXP (x, 1))
&& !REG_MODE_OK_FOR_REG_BASE_P (XEXP (x, 0), mode)
&& !REG_MODE_OK_FOR_REG_BASE_P (XEXP (x, 1), mode))
{
rtx orig_x = x;
x = copy_rtx (x);
push_reload (orig_x, NULL_RTX, x_p, NULL, MODE_BASE_REG_CLASS (mode),
Pmode, VOIDmode, 0, 0, opnum, (enum reload_type) type);
return x;
}
return NULL;
}
/* Return TRUE if X contains any TLS symbol references. */
bool
arm_tls_referenced_p (rtx x)
{
if (! TARGET_HAVE_TLS)
return false;
subrtx_iterator::array_type array;
FOR_EACH_SUBRTX (iter, array, x, ALL)
{
const_rtx x = *iter;
if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
return true;
/* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
TLS offsets, not real symbol references. */
if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
iter.skip_subrtxes ();
}
return false;
}
/* Implement TARGET_LEGITIMATE_CONSTANT_P.
On the ARM, allow any integer (invalid ones are removed later by insn
patterns), nice doubles and symbol_refs which refer to the function's
constant pool XXX.
When generating pic allow anything. */
static bool
arm_legitimate_constant_p_1 (machine_mode, rtx x)
{
return flag_pic || !label_mentioned_p (x);
}
static bool
thumb_legitimate_constant_p (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
{
return (CONST_INT_P (x)
|| CONST_DOUBLE_P (x)
|| CONSTANT_ADDRESS_P (x)
|| flag_pic);
}
static bool
arm_legitimate_constant_p (machine_mode mode, rtx x)
{
return (!arm_cannot_force_const_mem (mode, x)
&& (TARGET_32BIT
? arm_legitimate_constant_p_1 (mode, x)
: thumb_legitimate_constant_p (mode, x)));
}
/* Implement TARGET_CANNOT_FORCE_CONST_MEM. */
static bool
arm_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
{
rtx base, offset;
if (ARM_OFFSETS_MUST_BE_WITHIN_SECTIONS_P)
{
split_const (x, &base, &offset);
if (GET_CODE (base) == SYMBOL_REF
&& !offset_within_block_p (base, INTVAL (offset)))
return true;
}
return arm_tls_referenced_p (x);
}
#define REG_OR_SUBREG_REG(X) \
(REG_P (X) \
|| (GET_CODE (X) == SUBREG && REG_P (SUBREG_REG (X))))
#define REG_OR_SUBREG_RTX(X) \
(REG_P (X) ? (X) : SUBREG_REG (X))
static inline int
thumb1_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer)
{
machine_mode mode = GET_MODE (x);
int total, words;
switch (code)
{
case ASHIFT:
case ASHIFTRT:
case LSHIFTRT:
case ROTATERT:
return (mode == SImode) ? COSTS_N_INSNS (1) : COSTS_N_INSNS (2);
case PLUS:
case MINUS:
case COMPARE:
case NEG:
case NOT:
return COSTS_N_INSNS (1);
case MULT:
if (CONST_INT_P (XEXP (x, 1)))
{
int cycles = 0;
unsigned HOST_WIDE_INT i = INTVAL (XEXP (x, 1));
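/* Rough cost heuristic: assume the multiply consumes about two bits
   of the constant multiplier per cycle, so count the number of
   shift-by-two steps needed to exhaust it.  */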
while (i)
{
i >>= 2;
cycles++;
}
return COSTS_N_INSNS (2) + cycles;
}
return COSTS_N_INSNS (1) + 16;
case SET:
/* A SET doesn't have a mode, so let's look at the SET_DEST to get
the mode. */
words = ARM_NUM_INTS (GET_MODE_SIZE (GET_MODE (SET_DEST (x))));
return (COSTS_N_INSNS (words)
+ 4 * ((MEM_P (SET_SRC (x)))
+ MEM_P (SET_DEST (x))));
case CONST_INT:
if (outer == SET)
{
if ((unsigned HOST_WIDE_INT) INTVAL (x) < 256)
return 0;
if (thumb_shiftable_const (INTVAL (x)))
return COSTS_N_INSNS (2);
return COSTS_N_INSNS (3);
}
else if ((outer == PLUS || outer == COMPARE)
&& INTVAL (x) < 256 && INTVAL (x) > -256)
return 0;
else if ((outer == IOR || outer == XOR || outer == AND)
&& INTVAL (x) < 256 && INTVAL (x) >= -256)
return COSTS_N_INSNS (1);
else if (outer == AND)
{
int i;
/* This duplicates the tests in the andsi3 expander. */
for (i = 9; i <= 31; i++)
if ((((HOST_WIDE_INT) 1) << i) - 1 == INTVAL (x)
|| (((HOST_WIDE_INT) 1) << i) - 1 == ~INTVAL (x))
return COSTS_N_INSNS (2);
}
else if (outer == ASHIFT || outer == ASHIFTRT
|| outer == LSHIFTRT)
return 0;
return COSTS_N_INSNS (2);
case CONST:
case CONST_DOUBLE:
case LABEL_REF:
case SYMBOL_REF:
return COSTS_N_INSNS (3);
case UDIV:
case UMOD:
case DIV:
case MOD:
return 100;
case TRUNCATE:
return 99;
case AND:
case XOR:
case IOR:
/* XXX guess. */
return 8;
case MEM:
/* XXX another guess. */
/* Memory costs quite a lot for the first word, but subsequent words
load at the equivalent of a single insn each. */
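/* For example, with UNITS_PER_WORD == 4 this gives 10 for an SImode
load and 14 for a DImode load, plus 4 more when the address is a
constant pool reference.  */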
return (10 + 4 * ((GET_MODE_SIZE (mode) - 1) / UNITS_PER_WORD)
+ ((GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x))
? 4 : 0));
case IF_THEN_ELSE:
/* XXX a guess. */
if (GET_CODE (XEXP (x, 1)) == PC || GET_CODE (XEXP (x, 2)) == PC)
return 14;
return 2;
case SIGN_EXTEND:
case ZERO_EXTEND:
total = mode == DImode ? COSTS_N_INSNS (1) : 0;
total += thumb1_rtx_costs (XEXP (x, 0), GET_CODE (XEXP (x, 0)), code);
if (mode == SImode)
return total;
if (arm_arch6)
return total + COSTS_N_INSNS (1);
/* Assume a two-shift sequence. Increase the cost slightly so
we prefer actual shifts over an extend operation. */
return total + 1 + COSTS_N_INSNS (2);
default:
return 99;
}
}
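/* Estimate the cost of computing X, which appears within an expression
with code OUTER.  Store the estimate in *TOTAL and return TRUE if the
cost is final, or FALSE if the caller should also recurse into the
operands of X.  SPEED is true when optimizing for speed rather than
size.  */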
static inline bool
arm_rtx_costs_1 (rtx x, enum rtx_code outer, int* total, bool speed)
{
machine_mode mode = GET_MODE (x);
enum rtx_code subcode;
rtx operand;
enum rtx_code code = GET_CODE (x);
*total = 0;
switch (code)
{
case MEM:
/* Memory costs quite a lot for the first word, but subsequent words
load at the equivalent of a single insn each. */
*total = COSTS_N_INSNS (2 + ARM_NUM_REGS (mode));
return true;
case DIV:
case MOD:
case UDIV:
case UMOD:
if (TARGET_HARD_FLOAT && mode == SFmode)
*total = COSTS_N_INSNS (2);
else if (TARGET_HARD_FLOAT && mode == DFmode && !TARGET_VFP_SINGLE)
*total = COSTS_N_INSNS (4);
else
*total = COSTS_N_INSNS (20);
return false;
case ROTATE:
if (REG_P (XEXP (x, 1)))
*total = COSTS_N_INSNS (1); /* Need to subtract from 32 */
else if (!CONST_INT_P (XEXP (x, 1)))
*total = rtx_cost (XEXP (x, 1), code, 1, speed);
/* Fall through */
case ROTATERT:
if (mode != SImode)
{
*total += COSTS_N_INSNS (4);
return true;
}
/* Fall through */
case ASHIFT: case LSHIFTRT: case ASHIFTRT:
*total += rtx_cost (XEXP (x, 0), code, 0, speed);
if (mode == DImode)
{
*total += COSTS_N_INSNS (3);
return true;
}
*total += COSTS_N_INSNS (1);
/* Increase the cost of complex shifts because they aren't any faster,
and they reduce dual-issue opportunities. */
if (arm_tune_cortex_a9
&& outer != SET && !CONST_INT_P (XEXP (x, 1)))
++*total;
return true;
case MINUS:
if (mode == DImode)
{
*total = COSTS_N_INSNS (ARM_NUM_REGS (mode));
if (CONST_INT_P (XEXP (x, 0))
&& const_ok_for_arm (INTVAL (XEXP (x, 0))))
{
*total += rtx_cost (XEXP (x, 1), code, 1, speed);
return true;
}
if (CONST_INT_P (XEXP (x, 1))
&& const_ok_for_arm (INTVAL (XEXP (x, 1))))
{
*total += rtx_cost (XEXP (x, 0), code, 0, speed);
return true;
}
return false;
}
if (GET_MODE_CLASS (mode) == MODE_FLOAT)
{
if (TARGET_HARD_FLOAT
&& (mode == SFmode
|| (mode == DFmode && !TARGET_VFP_SINGLE)))
{
*total = COSTS_N_INSNS (1);
if (CONST_DOUBLE_P (XEXP (x, 0))
&& arm_const_double_rtx (XEXP (x, 0)))
{
*total += rtx_cost (XEXP (x, 1), code, 1, speed);
return true;
}
if (CONST_DOUBLE_P (XEXP (x, 1))
&& arm_const_double_rtx (XEXP (x, 1)))
{
*total += rtx_cost (XEXP (x, 0), code, 0, speed);
return true;
}
return false;
}
*total = COSTS_N_INSNS (20);
return false;
}
*total = COSTS_N_INSNS (1);
if (CONST_INT_P (XEXP (x, 0))
&& const_ok_for_arm (INTVAL (XEXP (x, 0))))
{
*total += rtx_cost (XEXP (x, 1), code, 1, speed);
return true;
}
subcode = GET_CODE (XEXP (x, 1));
if (subcode == ASHIFT || subcode == ASHIFTRT
|| subcode == LSHIFTRT
|| subcode == ROTATE || subcode == ROTATERT)
{
*total += rtx_cost (XEXP (x, 0), code, 0, speed);
*total += rtx_cost (XEXP (XEXP (x, 1), 0), subcode, 0, speed);
return true;
}
/* A shift as a part of RSB costs no more than RSB itself. */
if (GET_CODE (XEXP (x, 0)) == MULT
&& power_of_two_operand (XEXP (XEXP (x, 0), 1), SImode))
{
*total += rtx_cost (XEXP (XEXP (x, 0), 0), code, 0, speed);
*total += rtx_cost (XEXP (x, 1), code, 1, speed);
return true;
}
if (subcode == MULT
&& power_of_two_operand (XEXP (XEXP (x, 1), 1), SImode))
{
*total += rtx_cost (XEXP (x, 0), code, 0, speed);
*total += rtx_cost (XEXP (XEXP (x, 1), 0), subcode, 0, speed);
return true;
}
if (GET_RTX_CLASS (GET_CODE (XEXP (x, 1))) == RTX_COMPARE
|| GET_RTX_CLASS (GET_CODE (XEXP (x, 1))) == RTX_COMM_COMPARE)
{
*total = COSTS_N_INSNS (1) + rtx_cost (XEXP (x, 0), code, 0, speed);
if (REG_P (XEXP (XEXP (x, 1), 0))
&& REGNO (XEXP (XEXP (x, 1), 0)) != CC_REGNUM)
*total += COSTS_N_INSNS (1);
return true;
}
/* Fall through */
case PLUS:
if (code == PLUS && arm_arch6 && mode == SImode
&& (GET_CODE (XEXP (x, 0)) == ZERO_EXTEND
|| GET_CODE (XEXP (x, 0)) == SIGN_EXTEND))
{
*total = COSTS_N_INSNS (1);
*total += rtx_cost (XEXP (XEXP (x, 0), 0), GET_CODE (XEXP (x, 0)),
0, speed);
*total += rtx_cost (XEXP (x, 1), code, 1, speed);
return true;
}
/* MLA: All arguments must be registers.  We filter out
multiplication by a power of two, so that we fall through to
the code below. */
if (GET_CODE (XEXP (x, 0)) == MULT
&& !power_of_two_operand (XEXP (XEXP (x, 0), 1), SImode))
{
/* The cost comes from the cost of the multiply. */
return false;
}
if (GET_MODE_CLASS (mode) == MODE_FLOAT)
{
if (TARGET_HARD_FLOAT
&& (mode == SFmode
|| (mode == DFmode && !TARGET_VFP_SINGLE)))
{
*total = COSTS_N_INSNS (1);
if (CONST_DOUBLE_P (XEXP (x, 1))
&& arm_const_double_rtx (XEXP (x, 1)))
{
*total += rtx_cost (XEXP (x, 0), code, 0, speed);
return true;
}
return false;
}
*total = COSTS_N_INSNS (20);
return false;
}
if (GET_RTX_CLASS (GET_CODE (XEXP (x, 0))) == RTX_COMPARE
|| GET_RTX_CLASS (GET_CODE (XEXP (x, 0))) == RTX_COMM_COMPARE)
{
*total = COSTS_N_INSNS (1) + rtx_cost (XEXP (x, 1), code, 1, speed);
if (REG_P (XEXP (XEXP (x, 0), 0))
&& REGNO (XEXP (XEXP (x, 0), 0)) != CC_REGNUM)
*total += COSTS_N_INSNS (1);
return true;
}
/* Fall through */
case AND: case XOR: case IOR:
/* Normally the frame registers will be split into reg+const during
reload, so it is a bad idea to combine them with other instructions,
since then they might not be moved outside of loops. As a compromise
we allow integration with ops that have a constant as their second
operand. */
if (REG_OR_SUBREG_REG (XEXP (x, 0))
&& ARM_FRAME_RTX (REG_OR_SUBREG_RTX (XEXP (x, 0)))
&& !CONST_INT_P (XEXP (x, 1)))
*total = COSTS_N_INSNS (1);
if (mode == DImode)
{
*total += COSTS_N_INSNS (2);
if (CONST_INT_P (XEXP (x, 1))
&& const_ok_for_op (INTVAL (XEXP (x, 1)), code))
{
*total += rtx_cost (XEXP (x, 0), code, 0, speed);
return true;
}
return false;
}
*total += COSTS_N_INSNS (1);
if (CONST_INT_P (XEXP (x, 1))
&& const_ok_for_op (INTVAL (XEXP (x, 1)), code))
{
*total += rtx_cost (XEXP (x, 0), code, 0, speed);
return true;
}
subcode = GET_CODE (XEXP (x, 0));
if (subcode == ASHIFT || subcode == ASHIFTRT
|| subcode == LSHIFTRT
|| subcode == ROTATE || subcode == ROTATERT)
{
*total += rtx_cost (XEXP (x, 1), code, 1, speed);
*total += rtx_cost (XEXP (XEXP (x, 0), 0), subcode, 0, speed);
return true;
}
if (subcode == MULT
&& power_of_two_operand (XEXP (XEXP (x, 0), 1), SImode))
{
*total += rtx_cost (XEXP (x, 1), code, 1, speed);
*total += rtx_cost (XEXP (XEXP (x, 0), 0), subcode, 0, speed);
return true;
}
if (subcode == UMIN || subcode == UMAX
|| subcode == SMIN || subcode == SMAX)
{
*total = COSTS_N_INSNS (3);
return true;
}
return false;
case MULT:
/* This should have been handled by the CPU specific routines. */
gcc_unreachable ();
case TRUNCATE:
if (arm_arch3m && mode == SImode
&& GET_CODE (XEXP (x, 0)) == LSHIFTRT
&& GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
&& (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0))
== GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)))
&& (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
|| GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND))
{
*total = rtx_cost (XEXP (XEXP (x, 0), 0), LSHIFTRT, 0, speed);
return true;
}
*total = COSTS_N_INSNS (2); /* Plus the cost of the MULT */
return false;
case NEG:
if (GET_MODE_CLASS (mode) == MODE_FLOAT)
{
if (TARGET_HARD_FLOAT
&& (mode == SFmode
|| (mode == DFmode && !TARGET_VFP_SINGLE)))
{
*total = COSTS_N_INSNS (1);
return false;
}
*total = COSTS_N_INSNS (2);
return false;
}
/* Fall through */
case NOT:
*total = COSTS_N_INSNS (ARM_NUM_REGS(mode));
if (mode == SImode && code == NOT)
{
subcode = GET_CODE (XEXP (x, 0));
if (subcode == ASHIFT || subcode == ASHIFTRT
|| subcode == LSHIFTRT
|| subcode == ROTATE || subcode == ROTATERT
|| (subcode == MULT
&& power_of_two_operand (XEXP (XEXP (x, 0), 1), SImode)))
{
*total += rtx_cost (XEXP (XEXP (x, 0), 0), subcode, 0, speed);
/* Register shifts cost an extra cycle. */
if (!CONST_INT_P (XEXP (XEXP (x, 0), 1)))
*total += COSTS_N_INSNS (1) + rtx_cost (XEXP (XEXP (x, 0), 1),
subcode, 1, speed);
return true;
}
}
return false;
case IF_THEN_ELSE:
if (GET_CODE (XEXP (x, 1)) == PC || GET_CODE (XEXP (x, 2)) == PC)
{
*total = COSTS_N_INSNS (4);
return true;
}
operand = XEXP (x, 0);
if (!((GET_RTX_CLASS (GET_CODE (operand)) == RTX_COMPARE
|| GET_RTX_CLASS (GET_CODE (operand)) == RTX_COMM_COMPARE)
&& REG_P (XEXP (operand, 0))
&& REGNO (XEXP (operand, 0)) == CC_REGNUM))
*total += COSTS_N_INSNS (1);
*total += (rtx_cost (XEXP (x, 1), code, 1, speed)
+ rtx_cost (XEXP (x, 2), code, 2, speed));
return true;
case NE:
if (mode == SImode && XEXP (x, 1) == const0_rtx)
{
*total = COSTS_N_INSNS (2) + rtx_cost (XEXP (x, 0), code, 0, speed);
return true;
}
goto scc_insn;
case GE:
if ((!REG_P (XEXP (x, 0)) || REGNO (XEXP (x, 0)) != CC_REGNUM)
&& mode == SImode && XEXP (x, 1) == const0_rtx)
{
*total = COSTS_N_INSNS (2) + rtx_cost (XEXP (x, 0), code, 0, speed);
return true;
}
goto scc_insn;
case LT:
if ((!REG_P (XEXP (x, 0)) || REGNO (XEXP (x, 0)) != CC_REGNUM)
&& mode == SImode && XEXP (x, 1) == const0_rtx)
{
*total = COSTS_N_INSNS (1) + rtx_cost (XEXP (x, 0), code, 0, speed);
return true;
}
goto scc_insn;
case EQ:
case GT:
case LE:
case GEU:
case LTU:
case GTU:
case LEU:
case UNORDERED:
case ORDERED:
case UNEQ:
case UNGE:
case UNLT:
case UNGT:
case UNLE:
scc_insn:
/* SCC insns.  If the comparison has already been performed, they
cost 2 instructions.  Otherwise they need an additional comparison
before them. */
*total = COSTS_N_INSNS (2);
if (REG_P (XEXP (x, 0)) && REGNO (XEXP (x, 0)) == CC_REGNUM)
{
return true;
}
/* Fall through */
case COMPARE:
if (REG_P (XEXP (x, 0)) && REGNO (XEXP (x, 0)) == CC_REGNUM)
{
*total = 0;
return true;
}
*total += COSTS_N_INSNS (1);
if (CONST_INT_P (XEXP (x, 1))
&& const_ok_for_op (INTVAL (XEXP (x, 1)), code))
{
*total += rtx_cost (XEXP (x, 0), code, 0, speed);
return true;
}
subcode = GET_CODE (XEXP (x, 0));
if (subcode == ASHIFT || subcode == ASHIFTRT
|| subcode == LSHIFTRT
|| subcode == ROTATE || subcode == ROTATERT)
{
*total += rtx_cost (XEXP (x, 1), code, 1, speed);
*total += rtx_cost (XEXP (XEXP (x, 0), 0), subcode, 0, speed);
return true;
}
if (subcode == MULT
&& power_of_two_operand (XEXP (XEXP (x, 0), 1), SImode))
{
*total += rtx_cost (XEXP (x, 1), code, 1, speed);
*total += rtx_cost (XEXP (XEXP (x, 0), 0), subcode, 0, speed);
return true;
}
return false;
case UMIN:
case UMAX:
case SMIN:
case SMAX:
*total = COSTS_N_INSNS (2) + rtx_cost (XEXP (x, 0), code, 0, speed);
if (!CONST_INT_P (XEXP (x, 1))
|| !const_ok_for_arm (INTVAL (XEXP (x, 1))))
*total += rtx_cost (XEXP (x, 1), code, 1, speed);
return true;
case ABS:
if (GET_MODE_CLASS (mode) == MODE_FLOAT)
{
if (TARGET_HARD_FLOAT
&& (mode == SFmode
|| (mode == DFmode && !TARGET_VFP_SINGLE)))
{
*total = COSTS_N_INSNS (1);
return false;
}
*total = COSTS_N_INSNS (20);
return false;
}
*total = COSTS_N_INSNS (1);
if (mode == DImode)
*total += COSTS_N_INSNS (3);
return false;
case SIGN_EXTEND:
case ZERO_EXTEND:
*total = 0;
if (GET_MODE_CLASS (mode) == MODE_INT)
{
rtx op = XEXP (x, 0);
machine_mode opmode = GET_MODE (op);
if (mode == DImode)
*total += COSTS_N_INSNS (1);
if (opmode != SImode)
{
if (MEM_P (op))
{
/* If !arm_arch4, we use one of the extendhisi2_mem
or movhi_bytes patterns for HImode. For a QImode
sign extension, we first zero-extend from memory
and then perform a shift sequence. */
if (!arm_arch4 && (opmode != QImode || code == SIGN_EXTEND))
*total += COSTS_N_INSNS (2);
}
else if (arm_arch6)
*total += COSTS_N_INSNS (1);
/* We don't have the necessary insn, so we need to perform some
other operation. */
else if (TARGET_ARM && code == ZERO_EXTEND && mode == QImode)
/* An and with constant 255. */
*total += COSTS_N_INSNS (1);
else
/* A shift sequence. Increase costs slightly to avoid
combining two shifts into an extend operation. */
*total += COSTS_N_INSNS (2) + 1;
}
return false;
}
switch (GET_MODE (XEXP (x, 0)))
{
case V8QImode:
case V4HImode:
case V2SImode:
case V4QImode:
case V2HImode:
*total = COSTS_N_INSNS (1);
return false;
default:
gcc_unreachable ();
}
gcc_unreachable ();
case ZERO_EXTRACT:
case SIGN_EXTRACT:
*total = COSTS_N_INSNS (1) + rtx_cost (XEXP (x, 0), code, 0, speed);
return true;
case CONST_INT:
if (const_ok_for_arm (INTVAL (x))
|| const_ok_for_arm (~INTVAL (x)))
*total = COSTS_N_INSNS (1);
else
*total = COSTS_N_INSNS (arm_gen_constant (SET, mode, NULL_RTX,
INTVAL (x), NULL_RTX,
NULL_RTX, 0, 0));
return true;
case CONST:
case LABEL_REF:
case SYMBOL_REF:
*total = COSTS_N_INSNS (3);
return true;
case HIGH:
*total = COSTS_N_INSNS (1);
return true;
case LO_SUM:
*total = COSTS_N_INSNS (1);
*total += rtx_cost (XEXP (x, 0), code, 0, speed);
return true;
case CONST_DOUBLE:
if (TARGET_HARD_FLOAT && vfp3_const_double_rtx (x)
&& (mode == SFmode || !TARGET_VFP_SINGLE))
*total = COSTS_N_INSNS (1);
else
*total = COSTS_N_INSNS (4);
return true;
case SET:
/* The vec_extract patterns accept memory operands that require an
address reload. Account for the cost of that reload to give the
auto-inc-dec pass an incentive to try to replace them. */
if (TARGET_NEON && MEM_P (SET_DEST (x))
&& GET_CODE (SET_SRC (x)) == VEC_SELECT)
{
*total = rtx_cost (SET_DEST (x), code, 0, speed);
if (!neon_vector_mem_operand (SET_DEST (x), 2, true))
*total += COSTS_N_INSNS (1);
return true;
}
/* Likewise for the vec_set patterns. */
if (TARGET_NEON && GET_CODE (SET_SRC (x)) == VEC_MERGE
&& GET_CODE (XEXP (SET_SRC (x), 0)) == VEC_DUPLICATE
&& MEM_P (XEXP (XEXP (SET_SRC (x), 0), 0)))
{
rtx mem = XEXP (XEXP (SET_SRC (x), 0), 0);
*total = rtx_cost (mem, code, 0, speed);
if (!neon_vector_mem_operand (mem, 2, true))
*total += COSTS_N_INSNS (1);
return true;
}
return false;
case UNSPEC:
/* We cost this as high as our memory costs to allow this to
be hoisted from loops. */
if (XINT (x, 1) == UNSPEC_PIC_UNIFIED)
{
*total = COSTS_N_INSNS (2 + ARM_NUM_REGS (mode));
}
return true;
case CONST_VECTOR:
if (TARGET_NEON
&& TARGET_HARD_FLOAT
&& outer == SET
&& (VALID_NEON_DREG_MODE (mode) || VALID_NEON_QREG_MODE (mode))
&& neon_immediate_valid_for_move (x, mode, NULL, NULL))
*total = COSTS_N_INSNS (1);
else
*total = COSTS_N_INSNS (4);
return true;
default:
*total = COSTS_N_INSNS (4);
return false;
}
}
/* Estimate the size cost of thumb1 instructions.
For now most of the code is copied from thumb1_rtx_costs.  We need
finer-grained tuning when we have more related test cases. */
static inline int
thumb1_size_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer)
{
machine_mode mode = GET_MODE (x);
int words;
switch (code)
{
case ASHIFT:
case ASHIFTRT:
case LSHIFTRT:
case ROTATERT:
return (mode == SImode) ? COSTS_N_INSNS (1) : COSTS_N_INSNS (2);
case PLUS:
case MINUS:
/* Thumb-1 needs two instructions to implement the shiftadd/shiftsub0/
shiftsub1 operations generated by RTL expansion, especially when
expanding a multiplication. */
if ((GET_CODE (XEXP (x, 0)) == MULT
&& power_of_two_operand (XEXP (XEXP (x,0),1), SImode))
|| (GET_CODE (XEXP (x, 1)) == MULT
&& power_of_two_operand (XEXP (XEXP (x, 1), 1), SImode)))
return COSTS_N_INSNS (2);
/* Deliberately fall through for normal RTXs. */
case COMPARE:
case NEG:
case NOT:
return COSTS_N_INSNS (1);
case MULT:
if (CONST_INT_P (XEXP (x, 1)))
{
/* The Thumb-1 mul instruction cannot operate on a constant; we must
load the constant into a register first. */
int const_size = thumb1_size_rtx_costs (XEXP (x, 1), CONST_INT, SET);
/* For targets that have a very small and high-latency multiply
unit, we prefer to synthesize the multiplication with up to 5
instructions, giving a good balance between size and performance. */
if (arm_arch6m && arm_m_profile_small_mul)
return COSTS_N_INSNS (5);
else
return COSTS_N_INSNS (1) + const_size;
}
return COSTS_N_INSNS (1);
case SET:
/* A SET doesn't have a mode, so let's look at the SET_DEST to get
the mode. */
words = ARM_NUM_INTS (GET_MODE_SIZE (GET_MODE (SET_DEST (x))));
return COSTS_N_INSNS (words)
+ COSTS_N_INSNS (1) * (satisfies_constraint_J (SET_SRC (x))
|| satisfies_constraint_K (SET_SRC (x))
/* thumb1_movdi_insn. */
|| ((words > 1) && MEM_P (SET_SRC (x))));
case CONST_INT:
if (outer == SET)
{
if ((unsigned HOST_WIDE_INT) INTVAL (x) < 256)
return COSTS_N_INSNS (1);
/* See split "TARGET_THUMB1 && satisfies_constraint_J". */
if (INTVAL (x) >= -255 && INTVAL (x) <= -1)
return COSTS_N_INSNS (2);
/* See split "TARGET_THUMB1 && satisfies_constraint_K". */
if (thumb_shiftable_const (INTVAL (x)))
return COSTS_N_INSNS (2);
return COSTS_N_INSNS (3);
}
else if ((outer == PLUS || outer == COMPARE)
&& INTVAL (x) < 256 && INTVAL (x) > -256)
return 0;
else if ((outer == IOR || outer == XOR || outer == AND)
&& INTVAL (x) < 256 && INTVAL (x) >= -256)
return COSTS_N_INSNS (1);
else if (outer == AND)
{
int i;
/* This duplicates the tests in the andsi3 expander. */
for (i = 9; i <= 31; i++)
if ((((HOST_WIDE_INT) 1) << i) - 1 == INTVAL (x)
|| (((HOST_WIDE_INT) 1) << i) - 1 == ~INTVAL (x))
return COSTS_N_INSNS (2);
}
else if (outer == ASHIFT || outer == ASHIFTRT
|| outer == LSHIFTRT)
return 0;
return COSTS_N_INSNS (2);
case CONST:
case CONST_DOUBLE:
case LABEL_REF:
case SYMBOL_REF:
return COSTS_N_INSNS (3);
case UDIV:
case UMOD:
case DIV:
case MOD:
return 100;
case TRUNCATE:
return 99;
case AND:
case XOR:
case IOR:
return COSTS_N_INSNS (1);
case MEM:
return (COSTS_N_INSNS (1)
+ COSTS_N_INSNS (1)
* ((GET_MODE_SIZE (mode) - 1) / UNITS_PER_WORD)
+ ((GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x))
? COSTS_N_INSNS (1) : 0));
case IF_THEN_ELSE:
/* XXX a guess. */
if (GET_CODE (XEXP (x, 1)) == PC || GET_CODE (XEXP (x, 2)) == PC)
return 14;
return 2;
case ZERO_EXTEND:
/* XXX still guessing. */
switch (GET_MODE (XEXP (x, 0)))
{
case QImode:
return (1 + (mode == DImode ? 4 : 0)
+ (MEM_P (XEXP (x, 0)) ? 10 : 0));
case HImode:
return (4 + (mode == DImode ? 4 : 0)
+ (MEM_P (XEXP (x, 0)) ? 10 : 0));
case SImode:
return (1 + (MEM_P (XEXP (x, 0)) ? 10 : 0));
default:
return 99;
}
default:
return 99;
}
}
/* RTX costs when optimizing for size. */
static bool
arm_size_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer_code,
int *total)
{
machine_mode mode = GET_MODE (x);
if (TARGET_THUMB1)
{
*total = thumb1_size_rtx_costs (x, code, outer_code);
return true;
}
/* FIXME: This makes no attempt to prefer narrow Thumb-2 instructions. */
switch (code)
{
case MEM:
/* A memory access costs 1 insn if the mode is small, or the address is
a single register, otherwise it costs one insn per word. */
if (REG_P (XEXP (x, 0)))
*total = COSTS_N_INSNS (1);
else if (flag_pic
&& GET_CODE (XEXP (x, 0)) == PLUS
&& will_be_in_index_register (XEXP (XEXP (x, 0), 1)))
/* This will be split into two instructions.
See arm.md:calculate_pic_address. */
*total = COSTS_N_INSNS (2);
else
*total = COSTS_N_INSNS (ARM_NUM_REGS (mode));
return true;
case DIV:
case MOD:
case UDIV:
case UMOD:
/* Needs a libcall, so it costs about this. */
*total = COSTS_N_INSNS (2);
return false;
case ROTATE:
if (mode == SImode && REG_P (XEXP (x, 1)))
{
*total = COSTS_N_INSNS (2) + rtx_cost (XEXP (x, 0), code, 0, false);
return true;
}
/* Fall through */
case ROTATERT:
case ASHIFT:
case LSHIFTRT:
case ASHIFTRT:
if (mode == DImode && CONST_INT_P (XEXP (x, 1)))
{
*total = COSTS_N_INSNS (3) + rtx_cost (XEXP (x, 0), code, 0, false);
return true;
}
else if (mode == SImode)
{
*total = COSTS_N_INSNS (1) + rtx_cost (XEXP (x, 0), code, 0, false);
/* Slightly disparage register shifts, but not by much. */
if (!CONST_INT_P (XEXP (x, 1)))
*total += 1 + rtx_cost (XEXP (x, 1), code, 1, false);
return true;
}
/* Needs a libcall. */
*total = COSTS_N_INSNS (2);
return false;
case MINUS:
if (TARGET_HARD_FLOAT && GET_MODE_CLASS (mode) == MODE_FLOAT
&& (mode == SFmode || !TARGET_VFP_SINGLE))
{
*total = COSTS_N_INSNS (1);
return false;
}
if (mode == SImode)
{
enum rtx_code subcode0 = GET_CODE (XEXP (x, 0));
enum rtx_code subcode1 = GET_CODE (XEXP (x, 1));
if (subcode0 == ROTATE || subcode0 == ROTATERT || subcode0 == ASHIFT
|| subcode0 == LSHIFTRT || subcode0 == ASHIFTRT
|| subcode1 == ROTATE || subcode1 == ROTATERT
|| subcode1 == ASHIFT || subcode1 == LSHIFTRT
|| subcode1 == ASHIFTRT)
{
/* It's just the cost of the two operands. */
*total = 0;
return false;
}
*total = COSTS_N_INSNS (1);
return false;
}
*total = COSTS_N_INSNS (ARM_NUM_REGS (mode));
return false;
case PLUS:
if (TARGET_HARD_FLOAT && GET_MODE_CLASS (mode) == MODE_FLOAT
&& (mode == SFmode || !TARGET_VFP_SINGLE))
{
*total = COSTS_N_INSNS (1);
return false;
}
/* A shift as a part of ADD costs nothing. */
if (GET_CODE (XEXP (x, 0)) == MULT
&& power_of_two_operand (XEXP (XEXP (x, 0), 1), SImode))
{
*total = COSTS_N_INSNS (TARGET_THUMB2 ? 2 : 1);
*total += rtx_cost (XEXP (XEXP (x, 0), 0), code, 0, false);
*total += rtx_cost (XEXP (x, 1), code, 1, false);
return true;
}
/* Fall through */
case AND: case XOR: case IOR:
if (mode == SImode)
{
enum rtx_code subcode = GET_CODE (XEXP (x, 0));
if (subcode == ROTATE || subcode == ROTATERT || subcode == ASHIFT
|| subcode == LSHIFTRT || subcode == ASHIFTRT
|| (code == AND && subcode == NOT))
{
/* It's just the cost of the two operands. */
*total = 0;
return false;
}
}
*total = COSTS_N_INSNS (ARM_NUM_REGS (mode));
return false;
case MULT:
*total = COSTS_N_INSNS (ARM_NUM_REGS (mode));
return false;
case NEG:
if (TARGET_HARD_FLOAT && GET_MODE_CLASS (mode) == MODE_FLOAT
&& (mode == SFmode || !TARGET_VFP_SINGLE))
{
*total = COSTS_N_INSNS (1);
return false;
}
/* Fall through */
case NOT:
*total = COSTS_N_INSNS (ARM_NUM_REGS (mode));
return false;
case IF_THEN_ELSE:
*total = 0;
return false;
case COMPARE:
if (cc_register (XEXP (x, 0), VOIDmode))
*total = 0;
else
*total = COSTS_N_INSNS (1);
return false;
case ABS:
if (TARGET_HARD_FLOAT && GET_MODE_CLASS (mode) == MODE_FLOAT
&& (mode == SFmode || !TARGET_VFP_SINGLE))
*total = COSTS_N_INSNS (1);
else
*total = COSTS_N_INSNS (1 + ARM_NUM_REGS (mode));
return false;
case SIGN_EXTEND:
case ZERO_EXTEND:
return arm_rtx_costs_1 (x, outer_code, total, 0);
case CONST_INT:
if (const_ok_for_arm (INTVAL (x)))
/* A multiplication by a constant requires another instruction
to load the constant to a register. */
*total = COSTS_N_INSNS ((outer_code == SET || outer_code == MULT)
? 1 : 0);
else if (const_ok_for_arm (~INTVAL (x)))
*total = COSTS_N_INSNS (outer_code == AND ? 0 : 1);
else if (const_ok_for_arm (-INTVAL (x)))
{
if (outer_code == COMPARE || outer_code == PLUS
|| outer_code == MINUS)
*total = 0;
else
*total = COSTS_N_INSNS (1);
}
else
*total = COSTS_N_INSNS (2);
return true;
case CONST:
case LABEL_REF:
case SYMBOL_REF:
*total = COSTS_N_INSNS (2);
return true;
case CONST_DOUBLE:
*total = COSTS_N_INSNS (4);
return true;
case CONST_VECTOR:
if (TARGET_NEON
&& TARGET_HARD_FLOAT
&& outer_code == SET
&& (VALID_NEON_DREG_MODE (mode) || VALID_NEON_QREG_MODE (mode))
&& neon_immediate_valid_for_move (x, mode, NULL, NULL))
*total = COSTS_N_INSNS (1);
else
*total = COSTS_N_INSNS (4);
return true;
case HIGH:
case LO_SUM:
/* We prefer constant pool entries to MOVW/MOVT pairs, so bump the
cost of these slightly. */
*total = COSTS_N_INSNS (1) + 1;
return true;
case SET:
return false;
default:
if (mode != VOIDmode)
*total = COSTS_N_INSNS (ARM_NUM_REGS (mode));
else
*total = COSTS_N_INSNS (4); /* Who knows? */
return false;
}
}
/* Helper function for arm_rtx_costs. If the operand is a valid shift
operand, then return the operand that is being shifted.  If the shift
amount is not a constant, set *SHIFT_REG to the rtx of the shift amount.
Return NULL if OP is not a shifter operand. */
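/* For example, (mult (reg) (const_int 8)) is treated as (reg) shifted
left by three, while for (ashift (reg A) (reg B)) the shift amount
register B is stored in *SHIFT_REG and A is returned.  */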
static rtx
shifter_op_p (rtx op, rtx *shift_reg)
{
enum rtx_code code = GET_CODE (op);
if (code == MULT && CONST_INT_P (XEXP (op, 1))
&& exact_log2 (INTVAL (XEXP (op, 1))) > 0)
return XEXP (op, 0);
else if (code == ROTATE && CONST_INT_P (XEXP (op, 1)))
return XEXP (op, 0);
else if (code == ROTATERT || code == ASHIFT || code == LSHIFTRT
|| code == ASHIFTRT)
{
if (!CONST_INT_P (XEXP (op, 1)))
*shift_reg = XEXP (op, 1);
return XEXP (op, 0);
}
return NULL;
}
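/* Cost an UNSPEC expression X.  Store the estimate in *COST and return
TRUE if the calculation is final, or FALSE if the caller should also
cost the operands.  SPEED_P selects speed rather than size costs.  */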
static bool
arm_unspec_cost (rtx x, enum rtx_code /* outer_code */, bool speed_p, int *cost)
{
const struct cpu_cost_table *extra_cost = current_tune->insn_extra_cost;
gcc_assert (GET_CODE (x) == UNSPEC);
switch (XINT (x, 1))
{
case UNSPEC_UNALIGNED_LOAD:
/* We can only do unaligned loads into the integer unit, and we can't
use LDM or LDRD. */
*cost = COSTS_N_INSNS (ARM_NUM_REGS (GET_MODE (x)));
if (speed_p)
*cost += (ARM_NUM_REGS (GET_MODE (x)) * extra_cost->ldst.load
+ extra_cost->ldst.load_unaligned);
#ifdef NOT_YET
*cost += arm_address_cost (XEXP (XVECEXP (x, 0, 0), 0), GET_MODE (x),
ADDR_SPACE_GENERIC, speed_p);
#endif
return true;
case UNSPEC_UNALIGNED_STORE:
*cost = COSTS_N_INSNS (ARM_NUM_REGS (GET_MODE (x)));
if (speed_p)
*cost += (ARM_NUM_REGS (GET_MODE (x)) * extra_cost->ldst.store
+ extra_cost->ldst.store_unaligned);
*cost += rtx_cost (XVECEXP (x, 0, 0), UNSPEC, 0, speed_p);
#ifdef NOT_YET
*cost += arm_address_cost (XEXP (XVECEXP (x, 0, 0), 0), GET_MODE (x),
ADDR_SPACE_GENERIC, speed_p);
#endif
return true;
case UNSPEC_VRINTZ:
case UNSPEC_VRINTP:
case UNSPEC_VRINTM:
case UNSPEC_VRINTR:
case UNSPEC_VRINTX:
case UNSPEC_VRINTA:
*cost = COSTS_N_INSNS (1);
if (speed_p)
*cost += extra_cost->fp[GET_MODE (x) == DFmode].roundint;
return true;
default:
*cost = COSTS_N_INSNS (2);
break;
}
return false;
}
/* Cost of a libcall.  We assume one insn per argument, an amount for the
call itself (one insn at -Os) and then one insn for processing the
result. */
#define LIBCALL_COST(N) COSTS_N_INSNS (N + (speed_p ? 18 : 2))
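/* For example, a two-argument libcall is costed as LIBCALL_COST (2),
which is COSTS_N_INSNS (20) when optimizing for speed and
COSTS_N_INSNS (4) when optimizing for size.  */
/* Helper for the narrow-mode (QImode/HImode) PLUS and MINUS cases below:
if operand IDX of X is a left-shift shifter operand, add the cost of
the shifted operand and of the other operand to *cost and return TRUE
from the containing cost function.  */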
#define HANDLE_NARROW_SHIFT_ARITH(OP, IDX) \
do \
{ \
shift_op = shifter_op_p (XEXP (x, IDX), &shift_reg); \
if (shift_op != NULL \
&& arm_rtx_shift_left_p (XEXP (x, IDX))) \
{ \
if (shift_reg) \
{ \
if (speed_p) \
*cost += extra_cost->alu.arith_shift_reg; \
*cost += rtx_cost (shift_reg, ASHIFT, 1, speed_p); \
} \
else if (speed_p) \
*cost += extra_cost->alu.arith_shift; \
\
*cost += (rtx_cost (shift_op, ASHIFT, 0, speed_p) \
+ rtx_cost (XEXP (x, 1 - IDX), \
OP, 1, speed_p)); \
return true; \
} \
} \
while (0);
/* RTX costs. Make an estimate of the cost of executing the operation
X, which is contained within an operation with code OUTER_CODE.
SPEED_P indicates whether the cost desired is the performance cost,
or the size cost. The estimate is stored in COST and the return
value is TRUE if the cost calculation is final, or FALSE if the
caller should recurse through the operands of X to add additional
costs.
We currently make no attempt to model the size savings of Thumb-2
16-bit instructions. At the normal points in compilation where
this code is called we have no measure of whether the condition
flags are live or not, and thus no realistic way to determine what
the size will eventually be. */
static bool
arm_new_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer_code,
const struct cpu_cost_table *extra_cost,
int *cost, bool speed_p)
{
machine_mode mode = GET_MODE (x);
if (TARGET_THUMB1)
{
if (speed_p)
*cost = thumb1_rtx_costs (x, code, outer_code);
else
*cost = thumb1_size_rtx_costs (x, code, outer_code);
return true;
}
switch (code)
{
case SET:
*cost = 0;
/* SET RTXs don't have a mode so we get it from the destination. */
mode = GET_MODE (SET_DEST (x));
if (REG_P (SET_SRC (x))
&& REG_P (SET_DEST (x)))
{
/* Assume that most copies can be done with a single insn,
unless we don't have HW FP, in which case everything
larger than word mode will require two insns. */
*cost = COSTS_N_INSNS (((!TARGET_HARD_FLOAT
&& GET_MODE_SIZE (mode) > 4)
|| mode == DImode)
? 2 : 1);
/* Conditional register moves can be encoded
in 16 bits in Thumb mode. */
if (!speed_p && TARGET_THUMB && outer_code == COND_EXEC)
*cost >>= 1;
return true;
}
if (CONST_INT_P (SET_SRC (x)))
{
/* Handle CONST_INT here, since the value doesn't have a mode
and we would otherwise be unable to work out the true cost. */
*cost = rtx_cost (SET_DEST (x), SET, 0, speed_p);
outer_code = SET;
/* Slightly lower the cost of setting a core reg to a constant.
This helps break up chains and allows for better scheduling. */
if (REG_P (SET_DEST (x))
&& REGNO (SET_DEST (x)) <= LR_REGNUM)
*cost -= 1;
x = SET_SRC (x);
/* Immediate moves with an immediate in the range [0, 255] can be
encoded in 16 bits in Thumb mode. */
if (!speed_p && TARGET_THUMB && GET_MODE (x) == SImode
&& INTVAL (x) >= 0 && INTVAL (x) <= 255)
*cost >>= 1;
goto const_int_cost;
}
return false;
case MEM:
/* A memory access costs 1 insn if the mode is small, or the address is
a single register, otherwise it costs one insn per word. */
if (REG_P (XEXP (x, 0)))
*cost = COSTS_N_INSNS (1);
else if (flag_pic
&& GET_CODE (XEXP (x, 0)) == PLUS
&& will_be_in_index_register (XEXP (XEXP (x, 0), 1)))
/* This will be split into two instructions.
See arm.md:calculate_pic_address. */
*cost = COSTS_N_INSNS (2);
else
*cost = COSTS_N_INSNS (ARM_NUM_REGS (mode));
/* For speed optimizations, add the costs of the address and
accessing memory. */
if (speed_p)
#ifdef NOT_YET
*cost += (extra_cost->ldst.load
+ arm_address_cost (XEXP (x, 0), mode,
ADDR_SPACE_GENERIC, speed_p));
#else
*cost += extra_cost->ldst.load;
#endif
return true;
case PARALLEL:
{
/* Calculations of LDM costs are complex. We assume an initial cost
(ldm_1st) which will load the number of registers mentioned in
ldm_regs_per_insn_1st registers; then each additional
ldm_regs_per_insn_subsequent registers cost one more insn. The
formula for N regs is thus:
ldm_1st + COSTS_N_INSNS ((max (N - ldm_regs_per_insn_1st, 0)
+ ldm_regs_per_insn_subsequent - 1)
/ ldm_regs_per_insn_subsequent).
Additional costs may also be added for addressing. A similar
formula is used for STM. */
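/* For example, with ldm_regs_per_insn_1st == 4 and
ldm_regs_per_insn_subsequent == 2 (illustrative figures only), an
8-register LDM is costed at ldm_1st + COSTS_N_INSNS ((4 + 2 - 1) / 2)
= ldm_1st + COSTS_N_INSNS (2).  */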
bool is_ldm = load_multiple_operation (x, SImode);
bool is_stm = store_multiple_operation (x, SImode);
*cost = COSTS_N_INSNS (1);
if (is_ldm || is_stm)
{
if (speed_p)
{
HOST_WIDE_INT nregs = XVECLEN (x, 0);
HOST_WIDE_INT regs_per_insn_1st = is_ldm
? extra_cost->ldst.ldm_regs_per_insn_1st
: extra_cost->ldst.stm_regs_per_insn_1st;
HOST_WIDE_INT regs_per_insn_sub = is_ldm
? extra_cost->ldst.ldm_regs_per_insn_subsequent
: extra_cost->ldst.stm_regs_per_insn_subsequent;
*cost += regs_per_insn_1st
+ COSTS_N_INSNS (((MAX (nregs - regs_per_insn_1st, 0))
+ regs_per_insn_sub - 1)
/ regs_per_insn_sub);
return true;
}
}
return false;
}
case DIV:
case UDIV:
if (TARGET_HARD_FLOAT && GET_MODE_CLASS (mode) == MODE_FLOAT
&& (mode == SFmode || !TARGET_VFP_SINGLE))
*cost = COSTS_N_INSNS (speed_p
? extra_cost->fp[mode != SFmode].div : 1);
else if (mode == SImode && TARGET_IDIV)
*cost = COSTS_N_INSNS (speed_p ? extra_cost->mult[0].idiv : 1);
else
*cost = LIBCALL_COST (2);
return false; /* All arguments must be in registers. */
case MOD:
case UMOD:
*cost = LIBCALL_COST (2);
return false; /* All arguments must be in registers. */
case ROTATE:
if (mode == SImode && REG_P (XEXP (x, 1)))
{
*cost = (COSTS_N_INSNS (2)
+ rtx_cost (XEXP (x, 0), code, 0, speed_p));
if (speed_p)
*cost += extra_cost->alu.shift_reg;
return true;
}
/* Fall through */
case ROTATERT:
case ASHIFT:
case LSHIFTRT:
case ASHIFTRT:
if (mode == DImode && CONST_INT_P (XEXP (x, 1)))
{
*cost = (COSTS_N_INSNS (3)
+ rtx_cost (XEXP (x, 0), code, 0, speed_p));
if (speed_p)
*cost += 2 * extra_cost->alu.shift;
return true;
}
else if (mode == SImode)
{
*cost = (COSTS_N_INSNS (1)
+ rtx_cost (XEXP (x, 0), code, 0, speed_p));
/* Slightly disparage register shifts at -Os, but not by much. */
if (!CONST_INT_P (XEXP (x, 1)))
*cost += ((speed_p ? extra_cost->alu.shift_reg : 1)
+ rtx_cost (XEXP (x, 1), code, 1, speed_p));
return true;
}
else if (GET_MODE_CLASS (mode) == MODE_INT
&& GET_MODE_SIZE (mode) < 4)
{
if (code == ASHIFT)
{
*cost = (COSTS_N_INSNS (1)
+ rtx_cost (XEXP (x, 0), code, 0, speed_p));
/* Slightly disparage register shifts at -Os, but not by
much. */
if (!CONST_INT_P (XEXP (x, 1)))
*cost += ((speed_p ? extra_cost->alu.shift_reg : 1)
+ rtx_cost (XEXP (x, 1), code, 1, speed_p));
}
else if (code == LSHIFTRT || code == ASHIFTRT)
{
if (arm_arch_thumb2 && CONST_INT_P (XEXP (x, 1)))
{
/* Can use SBFX/UBFX. */
*cost = COSTS_N_INSNS (1);
if (speed_p)
*cost += extra_cost->alu.bfx;
*cost += rtx_cost (XEXP (x, 0), code, 0, speed_p);
}
else
{
*cost = COSTS_N_INSNS (2);
*cost += rtx_cost (XEXP (x, 0), code, 0, speed_p);
if (speed_p)
{
if (CONST_INT_P (XEXP (x, 1)))
*cost += 2 * extra_cost->alu.shift;
else
*cost += (extra_cost->alu.shift
+ extra_cost->alu.shift_reg);
}
else
/* Slightly disparage register shifts. */
*cost += !CONST_INT_P (XEXP (x, 1));
}
}
else /* Rotates. */
{
*cost = COSTS_N_INSNS (3 + !CONST_INT_P (XEXP (x, 1)));
*cost += rtx_cost (XEXP (x, 0), code, 0, speed_p);
if (speed_p)
{
if (CONST_INT_P (XEXP (x, 1)))
*cost += (2 * extra_cost->alu.shift
+ extra_cost->alu.log_shift);
else
*cost += (extra_cost->alu.shift
+ extra_cost->alu.shift_reg
+ extra_cost->alu.log_shift_reg);
}
}
return true;
}
*cost = LIBCALL_COST (2);
return false;
case BSWAP:
if (arm_arch6)
{
if (mode == SImode)
{
*cost = COSTS_N_INSNS (1);
if (speed_p)
*cost += extra_cost->alu.rev;
return false;
}
}
else
{
/* No rev instruction available. Look at arm_legacy_rev
and thumb_legacy_rev for the form of RTL used then. */
if (TARGET_THUMB)
{
*cost = COSTS_N_INSNS (10);
if (speed_p)
{
*cost += 6 * extra_cost->alu.shift;
*cost += 3 * extra_cost->alu.logical;
}
}
else
{
*cost = COSTS_N_INSNS (5);
if (speed_p)
{
*cost += 2 * extra_cost->alu.shift;
*cost += extra_cost->alu.arith_shift;
*cost += 2 * extra_cost->alu.logical;
}
}
return true;
}
return false;
case MINUS:
if (TARGET_HARD_FLOAT && GET_MODE_CLASS (mode) == MODE_FLOAT
&& (mode == SFmode || !TARGET_VFP_SINGLE))
{
*cost = COSTS_N_INSNS (1);
if (GET_CODE (XEXP (x, 0)) == MULT
|| GET_CODE (XEXP (x, 1)) == MULT)
{
rtx mul_op0, mul_op1, sub_op;
if (speed_p)
*cost += extra_cost->fp[mode != SFmode].mult_addsub;
if (GET_CODE (XEXP (x, 0)) == MULT)
{
mul_op0 = XEXP (XEXP (x, 0), 0);
mul_op1 = XEXP (XEXP (x, 0), 1);
sub_op = XEXP (x, 1);
}
else
{
mul_op0 = XEXP (XEXP (x, 1), 0);
mul_op1 = XEXP (XEXP (x, 1), 1);
sub_op = XEXP (x, 0);
}
/* The first operand of the multiply may be optionally
negated. */
if (GET_CODE (mul_op0) == NEG)
mul_op0 = XEXP (mul_op0, 0);
*cost += (rtx_cost (mul_op0, code, 0, speed_p)
+ rtx_cost (mul_op1, code, 0, speed_p)
+ rtx_cost (sub_op, code, 0, speed_p));
return true;
}
if (speed_p)
*cost += extra_cost->fp[mode != SFmode].addsub;
return false;
}
if (mode == SImode)
{
rtx shift_by_reg = NULL;
rtx shift_op;
rtx non_shift_op;
*cost = COSTS_N_INSNS (1);
shift_op = shifter_op_p (XEXP (x, 0), &shift_by_reg);
if (shift_op == NULL)
{
shift_op = shifter_op_p (XEXP (x, 1), &shift_by_reg);
non_shift_op = XEXP (x, 0);
}
else
non_shift_op = XEXP (x, 1);
if (shift_op != NULL)
{
if (shift_by_reg != NULL)
{
if (speed_p)
*cost += extra_cost->alu.arith_shift_reg;
*cost += rtx_cost (shift_by_reg, code, 0, speed_p);
}
else if (speed_p)
*cost += extra_cost->alu.arith_shift;
*cost += (rtx_cost (shift_op, code, 0, speed_p)
+ rtx_cost (non_shift_op, code, 0, speed_p));
return true;
}
if (arm_arch_thumb2
&& GET_CODE (XEXP (x, 1)) == MULT)
{
/* MLS. */
if (speed_p)
*cost += extra_cost->mult[0].add;
*cost += (rtx_cost (XEXP (x, 0), MINUS, 0, speed_p)
+ rtx_cost (XEXP (XEXP (x, 1), 0), MULT, 0, speed_p)
+ rtx_cost (XEXP (XEXP (x, 1), 1), MULT, 1, speed_p));
return true;
}
if (CONST_INT_P (XEXP (x, 0)))
{
int insns = arm_gen_constant (MINUS, SImode, NULL_RTX,
INTVAL (XEXP (x, 0)), NULL_RTX,
NULL_RTX, 1, 0);
*cost = COSTS_N_INSNS (insns);
if (speed_p)
*cost += insns * extra_cost->alu.arith;
*cost += rtx_cost (XEXP (x, 1), code, 1, speed_p);
return true;
}
else if (speed_p)
*cost += extra_cost->alu.arith;
return false;
}
if (GET_MODE_CLASS (mode) == MODE_INT
&& GET_MODE_SIZE (mode) < 4)
{
rtx shift_op, shift_reg;
shift_reg = NULL;
/* We check both sides of the MINUS for shifter operands since,
unlike PLUS, it's not commutative. */
HANDLE_NARROW_SHIFT_ARITH (MINUS, 0)
HANDLE_NARROW_SHIFT_ARITH (MINUS, 1)
/* Slightly disparage, as we might need to widen the result. */
*cost = 1 + COSTS_N_INSNS (1);
if (speed_p)
*cost += extra_cost->alu.arith;
if (CONST_INT_P (XEXP (x, 0)))
{
*cost += rtx_cost (XEXP (x, 1), code, 1, speed_p);
return true;
}
return false;
}
if (mode == DImode)
{
*cost = COSTS_N_INSNS (2);
if (GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
{
rtx op1 = XEXP (x, 1);
if (speed_p)
*cost += 2 * extra_cost->alu.arith;
if (GET_CODE (op1) == ZERO_EXTEND)
*cost += rtx_cost (XEXP (op1, 0), ZERO_EXTEND, 0, speed_p);
else
*cost += rtx_cost (op1, MINUS, 1, speed_p);
*cost += rtx_cost (XEXP (XEXP (x, 0), 0), ZERO_EXTEND,
0, speed_p);
return true;
}
else if (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
{
if (speed_p)
*cost += extra_cost->alu.arith + extra_cost->alu.arith_shift;
*cost += (rtx_cost (XEXP (XEXP (x, 0), 0), SIGN_EXTEND,
0, speed_p)
+ rtx_cost (XEXP (x, 1), MINUS, 1, speed_p));
return true;
}
else if (GET_CODE (XEXP (x, 1)) == ZERO_EXTEND
|| GET_CODE (XEXP (x, 1)) == SIGN_EXTEND)
{
if (speed_p)
*cost += (extra_cost->alu.arith
+ (GET_CODE (XEXP (x, 1)) == ZERO_EXTEND
? extra_cost->alu.arith
: extra_cost->alu.arith_shift));
*cost += (rtx_cost (XEXP (x, 0), MINUS, 0, speed_p)
+ rtx_cost (XEXP (XEXP (x, 1), 0),
GET_CODE (XEXP (x, 1)), 0, speed_p));
return true;
}
if (speed_p)
*cost += 2 * extra_cost->alu.arith;
return false;
}
/* Vector mode? */
*cost = LIBCALL_COST (2);
return false;
case PLUS:
if (TARGET_HARD_FLOAT && GET_MODE_CLASS (mode) == MODE_FLOAT
&& (mode == SFmode || !TARGET_VFP_SINGLE))
{
*cost = COSTS_N_INSNS (1);
if (GET_CODE (XEXP (x, 0)) == MULT)
{
rtx mul_op0, mul_op1, add_op;
if (speed_p)
*cost += extra_cost->fp[mode != SFmode].mult_addsub;
mul_op0 = XEXP (XEXP (x, 0), 0);
mul_op1 = XEXP (XEXP (x, 0), 1);
add_op = XEXP (x, 1);
*cost += (rtx_cost (mul_op0, code, 0, speed_p)
+ rtx_cost (mul_op1, code, 0, speed_p)
+ rtx_cost (add_op, code, 0, speed_p));
return true;
}
if (speed_p)
*cost += extra_cost->fp[mode != SFmode].addsub;
return false;
}
else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
{
*cost = LIBCALL_COST (2);
return false;
}
/* Narrow modes can be synthesized in SImode, but the range
of useful sub-operations is limited. Check for shift operations
on one of the operands. Only left shifts can be used in the
narrow modes. */
if (GET_MODE_CLASS (mode) == MODE_INT
&& GET_MODE_SIZE (mode) < 4)
{
rtx shift_op, shift_reg;
shift_reg = NULL;
HANDLE_NARROW_SHIFT_ARITH (PLUS, 0)
if (CONST_INT_P (XEXP (x, 1)))
{
int insns = arm_gen_constant (PLUS, SImode, NULL_RTX,
INTVAL (XEXP (x, 1)), NULL_RTX,
NULL_RTX, 1, 0);
*cost = COSTS_N_INSNS (insns);
if (speed_p)
*cost += insns * extra_cost->alu.arith;
/* Slightly penalize a narrow operation as the result may
need widening. */
*cost += 1 + rtx_cost (XEXP (x, 0), PLUS, 0, speed_p);
return true;
}
/* Slightly penalize a narrow operation as the result may
need widening. */
*cost = 1 + COSTS_N_INSNS (1);
if (speed_p)
*cost += extra_cost->alu.arith;
return false;
}
if (mode == SImode)
{
rtx shift_op, shift_reg;
*cost = COSTS_N_INSNS (1);
if (TARGET_INT_SIMD
&& (GET_CODE (XEXP (x, 0)) == ZERO_EXTEND
|| GET_CODE (XEXP (x, 0)) == SIGN_EXTEND))
{
/* UXTA[BH] or SXTA[BH]. */
if (speed_p)
*cost += extra_cost->alu.extend_arith;
*cost += (rtx_cost (XEXP (XEXP (x, 0), 0), ZERO_EXTEND, 0,
speed_p)
+ rtx_cost (XEXP (x, 1), PLUS, 0, speed_p));
return true;
}
shift_reg = NULL;
shift_op = shifter_op_p (XEXP (x, 0), &shift_reg);
if (shift_op != NULL)
{
if (shift_reg)
{
if (speed_p)
*cost += extra_cost->alu.arith_shift_reg;
*cost += rtx_cost (shift_reg, ASHIFT, 1, speed_p);
}
else if (speed_p)
*cost += extra_cost->alu.arith_shift;
*cost += (rtx_cost (shift_op, ASHIFT, 0, speed_p)
+ rtx_cost (XEXP (x, 1), PLUS, 1, speed_p));
return true;
}
if (GET_CODE (XEXP (x, 0)) == MULT)
{
rtx mul_op = XEXP (x, 0);
*cost = COSTS_N_INSNS (1);
if (TARGET_DSP_MULTIPLY
&& ((GET_CODE (XEXP (mul_op, 0)) == SIGN_EXTEND
&& (GET_CODE (XEXP (mul_op, 1)) == SIGN_EXTEND
|| (GET_CODE (XEXP (mul_op, 1)) == ASHIFTRT
&& CONST_INT_P (XEXP (XEXP (mul_op, 1), 1))
&& INTVAL (XEXP (XEXP (mul_op, 1), 1)) == 16)))
|| (GET_CODE (XEXP (mul_op, 0)) == ASHIFTRT
&& CONST_INT_P (XEXP (XEXP (mul_op, 0), 1))
&& INTVAL (XEXP (XEXP (mul_op, 0), 1)) == 16
&& (GET_CODE (XEXP (mul_op, 1)) == SIGN_EXTEND
|| (GET_CODE (XEXP (mul_op, 1)) == ASHIFTRT
&& CONST_INT_P (XEXP (XEXP (mul_op, 1), 1))
&& (INTVAL (XEXP (XEXP (mul_op, 1), 1))
== 16))))))
{
/* SMLA[BT][BT]. */
if (speed_p)
*cost += extra_cost->mult[0].extend_add;
*cost += (rtx_cost (XEXP (XEXP (mul_op, 0), 0),
SIGN_EXTEND, 0, speed_p)
+ rtx_cost (XEXP (XEXP (mul_op, 1), 0),
SIGN_EXTEND, 0, speed_p)
+ rtx_cost (XEXP (x, 1), PLUS, 1, speed_p));
return true;
}
if (speed_p)
*cost += extra_cost->mult[0].add;
*cost += (rtx_cost (XEXP (mul_op, 0), MULT, 0, speed_p)
+ rtx_cost (XEXP (mul_op, 1), MULT, 1, speed_p)
+ rtx_cost (XEXP (x, 1), PLUS, 1, speed_p));
return true;
}
if (CONST_INT_P (XEXP (x, 1)))
{
int insns = arm_gen_constant (PLUS, SImode, NULL_RTX,
INTVAL (XEXP (x, 1)), NULL_RTX,
NULL_RTX, 1, 0);
*cost = COSTS_N_INSNS (insns);
if (speed_p)
*cost += insns * extra_cost->alu.arith;
*cost += rtx_cost (XEXP (x, 0), PLUS, 0, speed_p);
return true;
}
else if (speed_p)
*cost += extra_cost->alu.arith;
return false;
}
if (mode == DImode)
{
if (arm_arch3m
&& GET_CODE (XEXP (x, 0)) == MULT
&& ((GET_CODE (XEXP (XEXP (x, 0), 0)) == ZERO_EXTEND
&& GET_CODE (XEXP (XEXP (x, 0), 1)) == ZERO_EXTEND)
|| (GET_CODE (XEXP (XEXP (x, 0), 0)) == SIGN_EXTEND
&& GET_CODE (XEXP (XEXP (x, 0), 1)) == SIGN_EXTEND)))
{
*cost = COSTS_N_INSNS (1);
if (speed_p)
*cost += extra_cost->mult[1].extend_add;
*cost += (rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0),
ZERO_EXTEND, 0, speed_p)
+ rtx_cost (XEXP (XEXP (XEXP (x, 0), 1), 0),
ZERO_EXTEND, 0, speed_p)
+ rtx_cost (XEXP (x, 1), PLUS, 1, speed_p));
return true;
}
*cost = COSTS_N_INSNS (2);
if (GET_CODE (XEXP (x, 0)) == ZERO_EXTEND
|| GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
{
if (speed_p)
*cost += (extra_cost->alu.arith
+ (GET_CODE (XEXP (x, 0)) == ZERO_EXTEND
? extra_cost->alu.arith
: extra_cost->alu.arith_shift));
*cost += (rtx_cost (XEXP (XEXP (x, 0), 0), ZERO_EXTEND, 0,
speed_p)
+ rtx_cost (XEXP (x, 1), PLUS, 1, speed_p));
return true;
}
if (speed_p)
*cost += 2 * extra_cost->alu.arith;
return false;
}
/* Vector mode? */
*cost = LIBCALL_COST (2);
return false;
case IOR:
if (mode == SImode && arm_arch6 && aarch_rev16_p (x))
{
*cost = COSTS_N_INSNS (1);
if (speed_p)
*cost += extra_cost->alu.rev;
return true;
}
/* Fall through. */
case AND: case XOR:
if (mode == SImode)
{
enum rtx_code subcode = GET_CODE (XEXP (x, 0));
rtx op0 = XEXP (x, 0);
rtx shift_op, shift_reg;
*cost = COSTS_N_INSNS (1);
if (subcode == NOT
&& (code == AND
|| (code == IOR && TARGET_THUMB2)))
op0 = XEXP (op0, 0);
shift_reg = NULL;
shift_op = shifter_op_p (op0, &shift_reg);
if (shift_op != NULL)
{
if (shift_reg)
{
if (speed_p)
*cost += extra_cost->alu.log_shift_reg;
*cost += rtx_cost (shift_reg, ASHIFT, 1, speed_p);
}
else if (speed_p)
*cost += extra_cost->alu.log_shift;
*cost += (rtx_cost (shift_op, ASHIFT, 0, speed_p)
+ rtx_cost (XEXP (x, 1), code, 1, speed_p));
return true;
}
if (CONST_INT_P (XEXP (x, 1)))
{
int insns = arm_gen_constant (code, SImode, NULL_RTX,
INTVAL (XEXP (x, 1)), NULL_RTX,
NULL_RTX, 1, 0);
*cost = COSTS_N_INSNS (insns);
if (speed_p)
*cost += insns * extra_cost->alu.logical;
*cost += rtx_cost (op0, code, 0, speed_p);
return true;
}
if (speed_p)
*cost += extra_cost->alu.logical;
*cost += (rtx_cost (op0, code, 0, speed_p)
+ rtx_cost (XEXP (x, 1), code, 1, speed_p));
return true;
}
if (mode == DImode)
{
rtx op0 = XEXP (x, 0);
enum rtx_code subcode = GET_CODE (op0);
*cost = COSTS_N_INSNS (2);
if (subcode == NOT
&& (code == AND
|| (code == IOR && TARGET_THUMB2)))
op0 = XEXP (op0, 0);
if (GET_CODE (op0) == ZERO_EXTEND)
{
if (speed_p)
*cost += 2 * extra_cost->alu.logical;
*cost += (rtx_cost (XEXP (op0, 0), ZERO_EXTEND, 0, speed_p)
+ rtx_cost (XEXP (x, 1), code, 0, speed_p));
return true;
}
else if (GET_CODE (op0) == SIGN_EXTEND)
{
if (speed_p)
*cost += extra_cost->alu.logical + extra_cost->alu.log_shift;
*cost += (rtx_cost (XEXP (op0, 0), SIGN_EXTEND, 0, speed_p)
+ rtx_cost (XEXP (x, 1), code, 0, speed_p));
return true;
}
if (speed_p)
*cost += 2 * extra_cost->alu.logical;
return true;
}
/* Vector mode? */
*cost = LIBCALL_COST (2);
return false;
case MULT:
if (TARGET_HARD_FLOAT && GET_MODE_CLASS (mode) == MODE_FLOAT
&& (mode == SFmode || !TARGET_VFP_SINGLE))
{
rtx op0 = XEXP (x, 0);
*cost = COSTS_N_INSNS (1);
if (GET_CODE (op0) == NEG && !flag_rounding_math)
op0 = XEXP (op0, 0);
if (speed_p)
*cost += extra_cost->fp[mode != SFmode].mult;
*cost += (rtx_cost (op0, MULT, 0, speed_p)
+ rtx_cost (XEXP (x, 1), MULT, 1, speed_p));
return true;
}
else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
{
*cost = LIBCALL_COST (2);
return false;
}
if (mode == SImode)
{
*cost = COSTS_N_INSNS (1);
if (TARGET_DSP_MULTIPLY
&& ((GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
&& (GET_CODE (XEXP (x, 1)) == SIGN_EXTEND
|| (GET_CODE (XEXP (x, 1)) == ASHIFTRT
&& CONST_INT_P (XEXP (XEXP (x, 1), 1))
&& INTVAL (XEXP (XEXP (x, 1), 1)) == 16)))
|| (GET_CODE (XEXP (x, 0)) == ASHIFTRT
&& CONST_INT_P (XEXP (XEXP (x, 0), 1))
&& INTVAL (XEXP (XEXP (x, 0), 1)) == 16
&& (GET_CODE (XEXP (x, 1)) == SIGN_EXTEND
|| (GET_CODE (XEXP (x, 1)) == ASHIFTRT
&& CONST_INT_P (XEXP (XEXP (x, 1), 1))
&& (INTVAL (XEXP (XEXP (x, 1), 1))
== 16))))))
{
/* SMUL[TB][TB]. */
if (speed_p)
*cost += extra_cost->mult[0].extend;
*cost += rtx_cost (XEXP (XEXP (x, 0), 0),
SIGN_EXTEND, 0, speed_p);
*cost += rtx_cost (XEXP (XEXP (x, 1), 0),
SIGN_EXTEND, 1, speed_p);
return true;
}
if (speed_p)
*cost += extra_cost->mult[0].simple;
return false;
}
if (mode == DImode)
{
if (arm_arch3m
&& ((GET_CODE (XEXP (x, 0)) == ZERO_EXTEND
&& GET_CODE (XEXP (x, 1)) == ZERO_EXTEND)
|| (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
&& GET_CODE (XEXP (x, 1)) == SIGN_EXTEND)))
{
*cost = COSTS_N_INSNS (1);
if (speed_p)
*cost += extra_cost->mult[1].extend;
*cost += (rtx_cost (XEXP (XEXP (x, 0), 0),
ZERO_EXTEND, 0, speed_p)
+ rtx_cost (XEXP (XEXP (x, 1), 0),
ZERO_EXTEND, 0, speed_p));
return true;
}
*cost = LIBCALL_COST (2);
return false;
}
/* Vector mode? */
*cost = LIBCALL_COST (2);
return false;
case NEG:
if (TARGET_HARD_FLOAT && GET_MODE_CLASS (mode) == MODE_FLOAT
&& (mode == SFmode || !TARGET_VFP_SINGLE))
{
if (GET_CODE (XEXP (x, 0)) == MULT)
{
/* VNMUL. */
*cost = rtx_cost (XEXP (x, 0), NEG, 0, speed_p);
return true;
}
*cost = COSTS_N_INSNS (1);
if (speed_p)
*cost += extra_cost->fp[mode != SFmode].neg;
return false;
}
else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
{
*cost = LIBCALL_COST (1);
return false;
}
if (mode == SImode)
{
if (GET_CODE (XEXP (x, 0)) == ABS)
{
*cost = COSTS_N_INSNS (2);
/* Assume the non-flag-changing variant. */
if (speed_p)
*cost += (extra_cost->alu.log_shift
+ extra_cost->alu.arith_shift);
*cost += rtx_cost (XEXP (XEXP (x, 0), 0), ABS, 0, speed_p);
return true;
}
if (GET_RTX_CLASS (GET_CODE (XEXP (x, 0))) == RTX_COMPARE
|| GET_RTX_CLASS (GET_CODE (XEXP (x, 0))) == RTX_COMM_COMPARE)
{
*cost = COSTS_N_INSNS (2);
/* No extra cost for MOV imm and MVN imm. */
/* If the comparison op is using the flags, there's no further
cost, otherwise we need to add the cost of the comparison. */
if (!(REG_P (XEXP (XEXP (x, 0), 0))
&& REGNO (XEXP (XEXP (x, 0), 0)) == CC_REGNUM
&& XEXP (XEXP (x, 0), 1) == const0_rtx))
{
*cost += (COSTS_N_INSNS (1)
+ rtx_cost (XEXP (XEXP (x, 0), 0), COMPARE, 0,
speed_p)
+ rtx_cost (XEXP (XEXP (x, 0), 1), COMPARE, 1,
speed_p));
if (speed_p)
*cost += extra_cost->alu.arith;
}
return true;
}
*cost = COSTS_N_INSNS (1);
if (speed_p)
*cost += extra_cost->alu.arith;
return false;
}
if (GET_MODE_CLASS (mode) == MODE_INT
&& GET_MODE_SIZE (mode) < 4)
{
/* Slightly disparage, as we might need an extend operation. */
*cost = 1 + COSTS_N_INSNS (1);
if (speed_p)
*cost += extra_cost->alu.arith;
return false;
}
if (mode == DImode)
{
*cost = COSTS_N_INSNS (2);
if (speed_p)
*cost += 2 * extra_cost->alu.arith;
return false;
}
/* Vector mode? */
*cost = LIBCALL_COST (1);
return false;
case NOT:
if (mode == SImode)
{
rtx shift_op;
rtx shift_reg = NULL;
*cost = COSTS_N_INSNS (1);
shift_op = shifter_op_p (XEXP (x, 0), &shift_reg);
if (shift_op)
{
if (shift_reg != NULL)
{
if (speed_p)
*cost += extra_cost->alu.log_shift_reg;
*cost += rtx_cost (shift_reg, ASHIFT, 1, speed_p);
}
else if (speed_p)
*cost += extra_cost->alu.log_shift;
*cost += rtx_cost (shift_op, ASHIFT, 0, speed_p);
return true;
}
if (speed_p)
*cost += extra_cost->alu.logical;
return false;
}
if (mode == DImode)
{
*cost = COSTS_N_INSNS (2);
return false;
}
/* Vector mode? */
*cost += LIBCALL_COST (1);
return false;
case IF_THEN_ELSE:
{
if (GET_CODE (XEXP (x, 1)) == PC || GET_CODE (XEXP (x, 2)) == PC)
{
*cost = COSTS_N_INSNS (4);
return true;
}
int op1cost = rtx_cost (XEXP (x, 1), SET, 1, speed_p);
int op2cost = rtx_cost (XEXP (x, 2), SET, 1, speed_p);
*cost = rtx_cost (XEXP (x, 0), IF_THEN_ELSE, 0, speed_p);
/* Assume that if one arm of the if_then_else is a register,
it will be tied to the result and the conditional insn
will be eliminated. */
if (REG_P (XEXP (x, 1)))
*cost += op2cost;
else if (REG_P (XEXP (x, 2)))
*cost += op1cost;
else
{
if (speed_p)
{
if (extra_cost->alu.non_exec_costs_exec)
*cost += op1cost + op2cost + extra_cost->alu.non_exec;
else
*cost += MAX (op1cost, op2cost) + extra_cost->alu.non_exec;
}
else
*cost += op1cost + op2cost;
}
}
return true;
case COMPARE:
if (cc_register (XEXP (x, 0), VOIDmode) && XEXP (x, 1) == const0_rtx)
*cost = 0;
else
{
machine_mode op0mode;
/* We'll mostly assume that the cost of a compare is the cost of the
LHS. However, there are some notable exceptions. */
/* Floating point compares are never done as side-effects. */
op0mode = GET_MODE (XEXP (x, 0));
if (TARGET_HARD_FLOAT && GET_MODE_CLASS (op0mode) == MODE_FLOAT
&& (op0mode == SFmode || !TARGET_VFP_SINGLE))
{
*cost = COSTS_N_INSNS (1);
if (speed_p)
*cost += extra_cost->fp[op0mode != SFmode].compare;
if (XEXP (x, 1) == CONST0_RTX (op0mode))
{
*cost += rtx_cost (XEXP (x, 0), code, 0, speed_p);
return true;
}
return false;
}
else if (GET_MODE_CLASS (op0mode) == MODE_FLOAT)
{
*cost = LIBCALL_COST (2);
return false;
}
/* DImode compares normally take two insns. */
if (op0mode == DImode)
{
*cost = COSTS_N_INSNS (2);
if (speed_p)
*cost += 2 * extra_cost->alu.arith;
return false;
}
if (op0mode == SImode)
{
rtx shift_op;
rtx shift_reg;
if (XEXP (x, 1) == const0_rtx
&& !(REG_P (XEXP (x, 0))
|| (GET_CODE (XEXP (x, 0)) == SUBREG
&& REG_P (SUBREG_REG (XEXP (x, 0))))))
{
*cost = rtx_cost (XEXP (x, 0), COMPARE, 0, speed_p);
/* Multiply operations that set the flags are often
significantly more expensive. */
if (speed_p
&& GET_CODE (XEXP (x, 0)) == MULT
&& !power_of_two_operand (XEXP (XEXP (x, 0), 1), mode))
*cost += extra_cost->mult[0].flag_setting;
if (speed_p
&& GET_CODE (XEXP (x, 0)) == PLUS
&& GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
&& !power_of_two_operand (XEXP (XEXP (XEXP (x, 0),
0), 1), mode))
*cost += extra_cost->mult[0].flag_setting;
return true;
}
shift_reg = NULL;
shift_op = shifter_op_p (XEXP (x, 0), &shift_reg);
if (shift_op != NULL)
{
*cost = COSTS_N_INSNS (1);
if (shift_reg != NULL)
{
*cost += rtx_cost (shift_reg, ASHIFT, 1, speed_p);
if (speed_p)
*cost += extra_cost->alu.arith_shift_reg;
}
else if (speed_p)
*cost += extra_cost->alu.arith_shift;
*cost += (rtx_cost (shift_op, ASHIFT, 0, speed_p)
+ rtx_cost (XEXP (x, 1), COMPARE, 1, speed_p));
return true;
}
*cost = COSTS_N_INSNS (1);
if (speed_p)
*cost += extra_cost->alu.arith;
if (CONST_INT_P (XEXP (x, 1))
&& const_ok_for_op (INTVAL (XEXP (x, 1)), COMPARE))
{
*cost += rtx_cost (XEXP (x, 0), COMPARE, 0, speed_p);
return true;
}
return false;
}
/* Vector mode? */
*cost = LIBCALL_COST (2);
return false;
}
return true;
case EQ:
case NE:
case LT:
case LE:
case GT:
case GE:
case LTU:
case LEU:
case GEU:
case GTU:
case ORDERED:
case UNORDERED:
case UNEQ:
case UNLE:
case UNLT:
case UNGE:
case UNGT:
case LTGT:
if (outer_code == SET)
{
/* Is it a store-flag operation? */
if (REG_P (XEXP (x, 0)) && REGNO (XEXP (x, 0)) == CC_REGNUM
&& XEXP (x, 1) == const0_rtx)
{
/* Thumb also needs an IT insn. */
*cost = COSTS_N_INSNS (TARGET_THUMB ? 3 : 2);
return true;
}
if (XEXP (x, 1) == const0_rtx)
{
switch (code)
{
case LT:
/* LSR Rd, Rn, #31. */
*cost = COSTS_N_INSNS (1);
if (speed_p)
*cost += extra_cost->alu.shift;
break;
case EQ:
/* RSBS T1, Rn, #0
ADC Rd, Rn, T1. */
case NE:
/* SUBS T1, Rn, #1
SBC Rd, Rn, T1. */
*cost = COSTS_N_INSNS (2);
break;
case LE:
/* RSBS T1, Rn, Rn, LSR #31
ADC Rd, Rn, T1. */
*cost = COSTS_N_INSNS (2);
if (speed_p)
*cost += extra_cost->alu.arith_shift;
break;
case GT:
/* RSB Rd, Rn, Rn, ASR #1
LSR Rd, Rd, #31. */
*cost = COSTS_N_INSNS (2);
if (speed_p)
*cost += (extra_cost->alu.arith_shift
+ extra_cost->alu.shift);
break;
case GE:
/* ASR Rd, Rn, #31
ADD Rd, Rn, #1. */
*cost = COSTS_N_INSNS (2);
if (speed_p)
*cost += extra_cost->alu.shift;
break;
default:
/* Remaining cases are either meaningless or would take
three insns anyway. */
*cost = COSTS_N_INSNS (3);
break;
}
*cost += rtx_cost (XEXP (x, 0), code, 0, speed_p);
return true;
}
else
{
*cost = COSTS_N_INSNS (TARGET_THUMB ? 4 : 3);
if (CONST_INT_P (XEXP (x, 1))
&& const_ok_for_op (INTVAL (XEXP (x, 1)), COMPARE))
{
*cost += rtx_cost (XEXP (x, 0), code, 0, speed_p);
return true;
}
return false;
}
}
/* Not directly inside a set. If it involves the condition code
register it must be the condition for a branch, cond_exec or
I_T_E operation. Since the comparison is performed elsewhere
this is just the control part which has no additional
cost. */
else if (REG_P (XEXP (x, 0)) && REGNO (XEXP (x, 0)) == CC_REGNUM
&& XEXP (x, 1) == const0_rtx)
{
*cost = 0;
return true;
}
return false;
case ABS:
if (TARGET_HARD_FLOAT && GET_MODE_CLASS (mode) == MODE_FLOAT
&& (mode == SFmode || !TARGET_VFP_SINGLE))
{
*cost = COSTS_N_INSNS (1);
if (speed_p)
*cost += extra_cost->fp[mode != SFmode].neg;
return false;
}
else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
{
*cost = LIBCALL_COST (1);
return false;
}
if (mode == SImode)
{
*cost = COSTS_N_INSNS (1);
if (speed_p)
*cost += extra_cost->alu.log_shift + extra_cost->alu.arith_shift;
return false;
}
/* Vector mode? */
*cost = LIBCALL_COST (1);
return false;
case SIGN_EXTEND:
if ((arm_arch4 || GET_MODE (XEXP (x, 0)) == SImode)
&& MEM_P (XEXP (x, 0)))
{
*cost = rtx_cost (XEXP (x, 0), code, 0, speed_p);
if (mode == DImode)
*cost += COSTS_N_INSNS (1);
if (!speed_p)
return true;
if (GET_MODE (XEXP (x, 0)) == SImode)
*cost += extra_cost->ldst.load;
else
*cost += extra_cost->ldst.load_sign_extend;
if (mode == DImode)
*cost += extra_cost->alu.shift;
return true;
}
/* Widening from less than 32 bits requires an extend operation. */
if (GET_MODE (XEXP (x, 0)) != SImode && arm_arch6)
{
/* We have SXTB/SXTH. */
*cost = COSTS_N_INSNS (1);
*cost += rtx_cost (XEXP (x, 0), code, 0, speed_p);
if (speed_p)
*cost += extra_cost->alu.extend;
}
else if (GET_MODE (XEXP (x, 0)) != SImode)
{
/* Needs two shifts. */
*cost = COSTS_N_INSNS (2);
*cost += rtx_cost (XEXP (x, 0), code, 0, speed_p);
if (speed_p)
*cost += 2 * extra_cost->alu.shift;
}
/* Widening beyond 32 bits requires one more insn. */
if (mode == DImode)
{
*cost += COSTS_N_INSNS (1);
if (speed_p)
*cost += extra_cost->alu.shift;
}
return true;
case ZERO_EXTEND:
if ((arm_arch4
|| GET_MODE (XEXP (x, 0)) == SImode
|| GET_MODE (XEXP (x, 0)) == QImode)
&& MEM_P (XEXP (x, 0)))
{
*cost = rtx_cost (XEXP (x, 0), code, 0, speed_p);
if (mode == DImode)
*cost += COSTS_N_INSNS (1); /* No speed penalty. */
return true;
}
/* Widening from less than 32-bits requires an extend operation. */
if (GET_MODE (XEXP (x, 0)) == QImode)
{
/* UXTB can be a shorter instruction in Thumb2, but it might
be slower than the AND Rd, Rn, #255 alternative. When
optimizing for speed it should never be slower to use
AND, and we don't really model 16-bit vs 32-bit insns
here. */
*cost = COSTS_N_INSNS (1);
if (speed_p)
*cost += extra_cost->alu.logical;
}
else if (GET_MODE (XEXP (x, 0)) != SImode && arm_arch6)
{
/* We have UXTB/UXTH. */
*cost = COSTS_N_INSNS (1);
*cost += rtx_cost (XEXP (x, 0), code, 0, speed_p);
if (speed_p)
*cost += extra_cost->alu.extend;
}
else if (GET_MODE (XEXP (x, 0)) != SImode)
{
/* Needs two shifts. It's marginally preferable to use
shifts rather than two BIC instructions as the second
shift may merge with a subsequent insn as a shifter
op. */
*cost = COSTS_N_INSNS (2);
*cost += rtx_cost (XEXP (x, 0), code, 0, speed_p);
if (speed_p)
*cost += 2 * extra_cost->alu.shift;
}
else /* GET_MODE (XEXP (x, 0)) == SImode. */
*cost = COSTS_N_INSNS (1);
/* Widening beyond 32-bits requires one more insn. */
if (mode == DImode)
{
*cost += COSTS_N_INSNS (1); /* No speed penalty. */
}
return true;
case CONST_INT:
*cost = 0;
/* CONST_INT has no mode, so we cannot tell for sure how many
insns are really going to be needed. The best we can do is
look at the value passed. If it fits in SImode, then assume
that's the mode it will be used for. Otherwise assume it
will be used in DImode. */
if (INTVAL (x) == trunc_int_for_mode (INTVAL (x), SImode))
mode = SImode;
else
mode = DImode;
/* Avoid blowing up in arm_gen_constant (). */
if (!(outer_code == PLUS
|| outer_code == AND
|| outer_code == IOR
|| outer_code == XOR
|| outer_code == MINUS))
outer_code = SET;
const_int_cost:
if (mode == SImode)
{
*cost += COSTS_N_INSNS (arm_gen_constant (outer_code, SImode, NULL,
INTVAL (x), NULL, NULL,
0, 0));
/* Extra costs? */
}
else
{
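/* DImode constant: cost the low and high 32-bit halves separately,
since each half needs its own constant-building sequence. */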
*cost += COSTS_N_INSNS (arm_gen_constant
(outer_code, SImode, NULL,
trunc_int_for_mode (INTVAL (x), SImode),
NULL, NULL, 0, 0)
+ arm_gen_constant (outer_code, SImode, NULL,
INTVAL (x) >> 32, NULL,
NULL, 0, 0));
/* Extra costs? */
}
return true;
case CONST:
case LABEL_REF:
case SYMBOL_REF:
if (speed_p)
{
if (arm_arch_thumb2 && !flag_pic)
*cost = COSTS_N_INSNS (2);
else
*cost = COSTS_N_INSNS (1) + extra_cost->ldst.load;
}
else
*cost = COSTS_N_INSNS (2);
if (flag_pic)
{
*cost += COSTS_N_INSNS (1);
if (speed_p)
*cost += extra_cost->alu.arith;
}
return true;
case CONST_FIXED:
*cost = COSTS_N_INSNS (4);
/* Fixme. */
return true;
case CONST_DOUBLE:
if (TARGET_HARD_FLOAT && GET_MODE_CLASS (mode) == MODE_FLOAT
&& (mode == SFmode || !TARGET_VFP_SINGLE))
{
if (vfp3_const_double_rtx (x))
{
*cost = COSTS_N_INSNS (1);
if (speed_p)
*cost += extra_cost->fp[mode == DFmode].fpconst;
return true;
}
if (speed_p)
{
*cost = COSTS_N_INSNS (1);
if (mode == DFmode)
*cost += extra_cost->ldst.loadd;
else
*cost += extra_cost->ldst.loadf;
}
else
*cost = COSTS_N_INSNS (2 + (mode == DFmode));
return true;
}
*cost = COSTS_N_INSNS (4);
return true;
case CONST_VECTOR:
/* Fixme. */
if (TARGET_NEON
&& TARGET_HARD_FLOAT
&& (VALID_NEON_DREG_MODE (mode) || VALID_NEON_QREG_MODE (mode))
&& neon_immediate_valid_for_move (x, mode, NULL, NULL))
*cost = COSTS_N_INSNS (1);
else
*cost = COSTS_N_INSNS (4);
return true;
case HIGH:
case LO_SUM:
*cost = COSTS_N_INSNS (1);
/* When optimizing for size, we prefer constant pool entries to
MOVW/MOVT pairs, so bump the cost of these slightly. */
if (!speed_p)
*cost += 1;
return true;
case CLZ:
*cost = COSTS_N_INSNS (1);
if (speed_p)
*cost += extra_cost->alu.clz;
return false;
case SMIN:
if (XEXP (x, 1) == const0_rtx)
{
*cost = COSTS_N_INSNS (1);
if (speed_p)
*cost += extra_cost->alu.log_shift;
*cost += rtx_cost (XEXP (x, 0), code, 0, speed_p);
return true;
}
/* Fall through. */
case SMAX:
case UMIN:
case UMAX:
*cost = COSTS_N_INSNS (2);
return false;
case TRUNCATE:
if (GET_CODE (XEXP (x, 0)) == ASHIFTRT
&& CONST_INT_P (XEXP (XEXP (x, 0), 1))
&& INTVAL (XEXP (XEXP (x, 0), 1)) == 32
&& GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
&& ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
&& GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND)
|| (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
&& (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1))
== ZERO_EXTEND))))
{
*cost = COSTS_N_INSNS (1);
if (speed_p)
*cost += extra_cost->mult[1].extend;
*cost += (rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0), ZERO_EXTEND, 0,
speed_p)
+ rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 1), ZERO_EXTEND,
0, speed_p));
return true;
}
*cost = LIBCALL_COST (1);
return false;
case UNSPEC:
return arm_unspec_cost (x, outer_code, speed_p, cost);
case PC:
/* Reading the PC is like reading any other register. Writing it
is more expensive, but we take that into account elsewhere. */
*cost = 0;
return true;
case ZERO_EXTRACT:
/* TODO: Simple zero_extract of bottom bits using AND. */
/* Fall through. */
case SIGN_EXTRACT:
if (arm_arch6
&& mode == SImode
&& CONST_INT_P (XEXP (x, 1))
&& CONST_INT_P (XEXP (x, 2)))
{
*cost = COSTS_N_INSNS (1);
if (speed_p)
*cost += extra_cost->alu.bfx;
*cost += rtx_cost (XEXP (x, 0), code, 0, speed_p);
return true;
}
/* Without UBFX/SBFX, need to resort to shift operations. */
*cost = COSTS_N_INSNS (2);
if (speed_p)
*cost += 2 * extra_cost->alu.shift;
*cost += rtx_cost (XEXP (x, 0), ASHIFT, 0, speed_p);
return true;
case FLOAT_EXTEND:
if (TARGET_HARD_FLOAT)
{
*cost = COSTS_N_INSNS (1);
if (speed_p)
*cost += extra_cost->fp[mode == DFmode].widen;
if (!TARGET_FPU_ARMV8
&& GET_MODE (XEXP (x, 0)) == HFmode)
{
/* Pre v8, widening HF->DF is a two-step process, first
widening to SFmode. */
*cost += COSTS_N_INSNS (1);
if (speed_p)
*cost += extra_cost->fp[0].widen;
}
*cost += rtx_cost (XEXP (x, 0), code, 0, speed_p);
return true;
}
*cost = LIBCALL_COST (1);
return false;
case FLOAT_TRUNCATE:
if (TARGET_HARD_FLOAT)
{
*cost = COSTS_N_INSNS (1);
if (speed_p)
*cost += extra_cost->fp[mode == DFmode].narrow;
*cost += rtx_cost (XEXP (x, 0), code, 0, speed_p);
return true;
/* Vector modes? */
}
*cost = LIBCALL_COST (1);
return false;
case FMA:
if (TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FMA)
{
rtx op0 = XEXP (x, 0);
rtx op1 = XEXP (x, 1);
rtx op2 = XEXP (x, 2);
*cost = COSTS_N_INSNS (1);
/* vfms or vfnma. */
if (GET_CODE (op0) == NEG)
op0 = XEXP (op0, 0);
/* vfnms or vfnma. */
if (GET_CODE (op2) == NEG)
op2 = XEXP (op2, 0);
*cost += rtx_cost (op0, FMA, 0, speed_p);
*cost += rtx_cost (op1, FMA, 1, speed_p);
*cost += rtx_cost (op2, FMA, 2, speed_p);
if (speed_p)
*cost += extra_cost->fp[mode == DFmode].fma;
return true;
}
*cost = LIBCALL_COST (3);
return false;
case FIX:
case UNSIGNED_FIX:
if (TARGET_HARD_FLOAT)
{
if (GET_MODE_CLASS (mode) == MODE_INT)
{
*cost = COSTS_N_INSNS (1);
if (speed_p)
*cost += extra_cost->fp[GET_MODE (XEXP (x, 0)) == DFmode].toint;
/* Strip off the 'cost' of rounding towards zero. */
if (GET_CODE (XEXP (x, 0)) == FIX)
*cost += rtx_cost (XEXP (XEXP (x, 0), 0), code, 0, speed_p);
else
*cost += rtx_cost (XEXP (x, 0), code, 0, speed_p);
/* ??? Increase the cost to deal with transferring from
FP -> CORE registers? */
return true;
}
else if (GET_MODE_CLASS (mode) == MODE_FLOAT
&& TARGET_FPU_ARMV8)
{
*cost = COSTS_N_INSNS (1);
if (speed_p)
*cost += extra_cost->fp[mode == DFmode].roundint;
return false;
}
/* Vector costs? */
}
*cost = LIBCALL_COST (1);
return false;
case FLOAT:
case UNSIGNED_FLOAT:
if (TARGET_HARD_FLOAT)
{
/* ??? Increase the cost to deal with transferring from CORE
-> FP registers? */
*cost = COSTS_N_INSNS (1);
if (speed_p)
*cost += extra_cost->fp[mode == DFmode].fromint;
return false;
}
*cost = LIBCALL_COST (1);
return false;
case CALL:
*cost = COSTS_N_INSNS (1);
return true;
case ASM_OPERANDS:
{
/* Just a guess. Guess number of instructions in the asm
plus one insn per input. Always a minimum of COSTS_N_INSNS (1)
though (see PR60663). */
int asm_length = MAX (1, asm_str_count (ASM_OPERANDS_TEMPLATE (x)));
int num_operands = ASM_OPERANDS_INPUT_LENGTH (x);
*cost = COSTS_N_INSNS (asm_length + num_operands);
return true;
}
default:
if (mode != VOIDmode)
*cost = COSTS_N_INSNS (ARM_NUM_REGS (mode));
else
*cost = COSTS_N_INSNS (4); /* Who knows? */
return false;
}
}
#undef HANDLE_NARROW_SHIFT_ARITH
/* Top-level RTX cost hook: dispatch to the size-cost, per-core, or
table-driven cost functions as appropriate. */
static bool
arm_rtx_costs (rtx x, int code, int outer_code, int opno ATTRIBUTE_UNUSED,
int *total, bool speed)
{
bool result;
if (TARGET_OLD_RTX_COSTS
|| (!current_tune->insn_extra_cost && !TARGET_NEW_GENERIC_COSTS))
{
/* Old way. (Deprecated.) */
if (!speed)
result = arm_size_rtx_costs (x, (enum rtx_code) code,
(enum rtx_code) outer_code, total);
else
result = current_tune->rtx_costs (x, (enum rtx_code) code,
(enum rtx_code) outer_code, total,
speed);
}
else
{
/* New way. */
if (current_tune->insn_extra_cost)
result = arm_new_rtx_costs (x, (enum rtx_code) code,
(enum rtx_code) outer_code,
current_tune->insn_extra_cost,
total, speed);
/* TARGET_NEW_GENERIC_COSTS && !TARGET_OLD_RTX_COSTS
&& current_tune->insn_extra_cost == NULL. */
else
result = arm_new_rtx_costs (x, (enum rtx_code) code,
(enum rtx_code) outer_code,
&generic_extra_costs, total, speed);
}
if (dump_file && (dump_flags & TDF_DETAILS))
{
print_rtl_single (dump_file, x);
fprintf (dump_file, "\n%s cost: %d (%s)\n", speed ? "Hot" : "Cold",
*total, result ? "final" : "partial");
}
return result;
}
/* RTX costs for cores with a slow MUL implementation. Thumb-2 is not
supported on any "slowmul" cores, so it can be ignored. */
static bool
arm_slowmul_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer_code,
int *total, bool speed)
{
machine_mode mode = GET_MODE (x);
if (TARGET_THUMB)
{
*total = thumb1_rtx_costs (x, code, outer_code);
return true;
}
switch (code)
{
case MULT:
if (GET_MODE_CLASS (mode) == MODE_FLOAT
|| mode == DImode)
{
*total = COSTS_N_INSNS (20);
return false;
}
if (CONST_INT_P (XEXP (x, 1)))
{
unsigned HOST_WIDE_INT i = (INTVAL (XEXP (x, 1))
& (unsigned HOST_WIDE_INT) 0xffffffff);
int cost, const_ok = const_ok_for_arm (i);
int j, booth_unit_size;
/* Tune as appropriate. */
cost = const_ok ? 4 : 8;
booth_unit_size = 2;
for (j = 0; i && j < 32; j += booth_unit_size)
{
i >>= booth_unit_size;
cost++;
}
*total = COSTS_N_INSNS (cost);
*total += rtx_cost (XEXP (x, 0), code, 0, speed);
return true;
}
*total = COSTS_N_INSNS (20);
return false;
default:
return arm_rtx_costs_1 (x, outer_code, total, speed);
}
}
/* RTX cost for cores with a fast multiply unit (M variants). */
static bool
arm_fastmul_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer_code,
int *total, bool speed)
{
machine_mode mode = GET_MODE (x);
if (TARGET_THUMB1)
{
*total = thumb1_rtx_costs (x, code, outer_code);
return true;
}
/* ??? Should Thumb-2 use different costs? */
switch (code)
{
case MULT:
/* There is no point basing this on the tuning, since it is always the
fast variant if it exists at all. */
if (mode == DImode
&& (GET_CODE (XEXP (x, 0)) == GET_CODE (XEXP (x, 1)))
&& (GET_CODE (XEXP (x, 0)) == ZERO_EXTEND
|| GET_CODE (XEXP (x, 0)) == SIGN_EXTEND))
{
*total = COSTS_N_INSNS (2);
return false;
}
if (mode == DImode)
{
*total = COSTS_N_INSNS (5);
return false;
}
if (CONST_INT_P (XEXP (x, 1)))
{
unsigned HOST_WIDE_INT i = (INTVAL (XEXP (x, 1))
& (unsigned HOST_WIDE_INT) 0xffffffff);
int cost, const_ok = const_ok_for_arm (i);
int j, booth_unit_size;
/* Tune as appropriate. */
cost = const_ok ? 4 : 8;
booth_unit_size = 8;
for (j = 0; i && j < 32; j += booth_unit_size)
{
i >>= booth_unit_size;
cost++;
}
*total = COSTS_N_INSNS (cost);
return false;
}
if (mode == SImode)
{
*total = COSTS_N_INSNS (4);
return false;
}
if (GET_MODE_CLASS (mode) == MODE_FLOAT)
{
if (TARGET_HARD_FLOAT
&& (mode == SFmode
|| (mode == DFmode && !TARGET_VFP_SINGLE)))
{
*total = COSTS_N_INSNS (1);
return false;
}
}
/* Requires a lib call.  */
*total = COSTS_N_INSNS (20);
return false;
default:
return arm_rtx_costs_1 (x, outer_code, total, speed);
}
}
/* RTX cost for XScale CPUs. Thumb-2 is not supported on any xscale cores,
so it can be ignored. */
static bool
arm_xscale_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer_code,
int *total, bool speed)
{
machine_mode mode = GET_MODE (x);
if (TARGET_THUMB)
{
*total = thumb1_rtx_costs (x, code, outer_code);
return true;
}
switch (code)
{
case COMPARE:
if (GET_CODE (XEXP (x, 0)) != MULT)
return arm_rtx_costs_1 (x, outer_code, total, speed);
/* A COMPARE of a MULT is slow on XScale; the muls instruction
will stall until the multiplication is complete. */
*total = COSTS_N_INSNS (3);
return false;
case MULT:
/* There is no point basing this on the tuning, since it is always the
fast variant if it exists at all. */
if (mode == DImode
&& (GET_CODE (XEXP (x, 0)) == GET_CODE (XEXP (x, 1)))
&& (GET_CODE (XEXP (x, 0)) == ZERO_EXTEND
|| GET_CODE (XEXP (x, 0)) == SIGN_EXTEND))
{
*total = COSTS_N_INSNS (2);
return false;
}
if (mode == DImode)
{
*total = COSTS_N_INSNS (5);
return false;
}
if (CONST_INT_P (XEXP (x, 1)))
{
/* If operand 1 is a constant we can more accurately
calculate the cost of the multiply. The multiplier can
retire 15 bits on the first cycle and a further 12 on the
second. We do, of course, have to load the constant into
a register first. */
unsigned HOST_WIDE_INT i = INTVAL (XEXP (x, 1));
/* There's a general overhead of one cycle. */
int cost = 1;
unsigned HOST_WIDE_INT masked_const;
if (i & 0x80000000)
i = ~i;
i &= (unsigned HOST_WIDE_INT) 0xffffffff;
masked_const = i & 0xffff8000;
if (masked_const != 0)
{
cost++;
masked_const = i & 0xf8000000;
if (masked_const != 0)
cost++;
}
*total = COSTS_N_INSNS (cost);
return false;
}
if (mode == SImode)
{
*total = COSTS_N_INSNS (3);
return false;
}
/* Requires a lib call.  */
*total = COSTS_N_INSNS (20);
return false;
default:
return arm_rtx_costs_1 (x, outer_code, total, speed);
}
}
/* RTX costs for 9e (and later) cores. */
static bool
arm_9e_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer_code,
int *total, bool speed)
{
machine_mode mode = GET_MODE (x);
if (TARGET_THUMB1)
{
switch (code)
{
case MULT:
/* Small multiply: 32 cycles for an integer multiply inst. */
if (arm_arch6m && arm_m_profile_small_mul)
*total = COSTS_N_INSNS (32);
else
*total = COSTS_N_INSNS (3);
return true;
default:
*total = thumb1_rtx_costs (x, code, outer_code);
return true;
}
}
switch (code)
{
case MULT:
/* There is no point basing this on the tuning, since it is always the
fast variant if it exists at all. */
if (mode == DImode
&& (GET_CODE (XEXP (x, 0)) == GET_CODE (XEXP (x, 1)))
&& (GET_CODE (XEXP (x, 0)) == ZERO_EXTEND
|| GET_CODE (XEXP (x, 0)) == SIGN_EXTEND))
{
*total = COSTS_N_INSNS (2);
return false;
}
if (mode == DImode)
{
*total = COSTS_N_INSNS (5);
return false;
}
if (mode == SImode)
{
*total = COSTS_N_INSNS (2);
return false;
}
if (GET_MODE_CLASS (mode) == MODE_FLOAT)
{
if (TARGET_HARD_FLOAT
&& (mode == SFmode
|| (mode == DFmode && !TARGET_VFP_SINGLE)))
{
*total = COSTS_N_INSNS (1);
return false;
}
}
*total = COSTS_N_INSNS (20);
return false;
default:
return arm_rtx_costs_1 (x, outer_code, total, speed);
}
}
/* All address computations that can be done are free, but rtx cost returns
the same for practically all of them. So we weight the different types
of address here in the order (most pref first):
PRE/POST_INC/DEC, SHIFT or NON-INT sum, INT sum, REG, MEM or LABEL. */
static inline int
arm_arm_address_cost (rtx x)
{
enum rtx_code c = GET_CODE (x);
if (c == PRE_INC || c == PRE_DEC || c == POST_INC || c == POST_DEC)
return 0;
if (c == MEM || c == LABEL_REF || c == SYMBOL_REF)
return 10;
if (c == PLUS)
{
if (CONST_INT_P (XEXP (x, 1)))
return 2;
if (ARITHMETIC_P (XEXP (x, 0)) || ARITHMETIC_P (XEXP (x, 1)))
return 3;
return 4;
}
return 6;
}
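/* Address cost for Thumb-1: a plain register, or a register plus a
constant offset, is the cheapest form; anything else costs slightly more. */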
static inline int
arm_thumb_address_cost (rtx x)
{
enum rtx_code c = GET_CODE (x);
if (c == REG)
return 1;
if (c == PLUS
&& REG_P (XEXP (x, 0))
&& CONST_INT_P (XEXP (x, 1)))
return 1;
return 2;
}
static int
arm_address_cost (rtx x, machine_mode mode ATTRIBUTE_UNUSED,
addr_space_t as ATTRIBUTE_UNUSED, bool speed ATTRIBUTE_UNUSED)
{
return TARGET_32BIT ? arm_arm_address_cost (x) : arm_thumb_address_cost (x);
}
/* Adjust cost hook for XScale. */
static bool
xscale_sched_adjust_cost (rtx_insn *insn, rtx link, rtx_insn *dep, int * cost)
{
/* Some true dependencies can have a higher cost depending
on precisely how certain input operands are used. */
if (REG_NOTE_KIND (link) == 0
&& recog_memoized (insn) >= 0
&& recog_memoized (dep) >= 0)
{
int shift_opnum = get_attr_shift (insn);
enum attr_type attr_type = get_attr_type (dep);
/* If nonzero, SHIFT_OPNUM contains the operand number of a shifted
operand for INSN. If we have a shifted input operand and the
instruction we depend on is another ALU instruction, then we may
have to account for an additional stall. */
if (shift_opnum != 0
&& (attr_type == TYPE_ALU_SHIFT_IMM
|| attr_type == TYPE_ALUS_SHIFT_IMM
|| attr_type == TYPE_LOGIC_SHIFT_IMM
|| attr_type == TYPE_LOGICS_SHIFT_IMM
|| attr_type == TYPE_ALU_SHIFT_REG
|| attr_type == TYPE_ALUS_SHIFT_REG
|| attr_type == TYPE_LOGIC_SHIFT_REG
|| attr_type == TYPE_LOGICS_SHIFT_REG
|| attr_type == TYPE_MOV_SHIFT
|| attr_type == TYPE_MVN_SHIFT
|| attr_type == TYPE_MOV_SHIFT_REG
|| attr_type == TYPE_MVN_SHIFT_REG))
{
rtx shifted_operand;
int opno;
/* Get the shifted operand. */
extract_insn (insn);
shifted_operand = recog_data.operand[shift_opnum];
/* Iterate over all the operands in DEP. If we write an operand
that overlaps with SHIFTED_OPERAND, then we must increase the
cost of this dependency. */
extract_insn (dep);
preprocess_constraints (dep);
for (opno = 0; opno < recog_data.n_operands; opno++)
{
/* We can ignore strict inputs. */
if (recog_data.operand_type[opno] == OP_IN)
continue;
if (reg_overlap_mentioned_p (recog_data.operand[opno],
shifted_operand))
{
*cost = 2;
return false;
}
}
}
}
return true;
}
/* Adjust cost hook for Cortex A9. */
static bool
cortex_a9_sched_adjust_cost (rtx_insn *insn, rtx link, rtx_insn *dep, int * cost)
{
switch (REG_NOTE_KIND (link))
{
case REG_DEP_ANTI:
*cost = 0;
return false;
case REG_DEP_TRUE:
case REG_DEP_OUTPUT:
if (recog_memoized (insn) >= 0
&& recog_memoized (dep) >= 0)
{
if (GET_CODE (PATTERN (insn)) == SET)
{
if (GET_MODE_CLASS
(GET_MODE (SET_DEST (PATTERN (insn)))) == MODE_FLOAT
|| GET_MODE_CLASS
(GET_MODE (SET_SRC (PATTERN (insn)))) == MODE_FLOAT)
{
enum attr_type attr_type_insn = get_attr_type (insn);
enum attr_type attr_type_dep = get_attr_type (dep);
/* By default all dependencies of the form
s0 = s0 <op> s1
s0 = s0 <op> s2
have an extra latency of 1 cycle because
of the input and output dependency in this
case. However, this gets modeled as a true
dependency, hence all these checks. */
if (REG_P (SET_DEST (PATTERN (insn)))
&& REG_P (SET_DEST (PATTERN (dep)))
&& reg_overlap_mentioned_p (SET_DEST (PATTERN (insn)),
SET_DEST (PATTERN (dep))))
{
/* FMACS is a special case where the dependent
instruction can be issued 3 cycles before
the normal latency in case of an output
dependency. */
if ((attr_type_insn == TYPE_FMACS
|| attr_type_insn == TYPE_FMACD)
&& (attr_type_dep == TYPE_FMACS
|| attr_type_dep == TYPE_FMACD))
{
if (REG_NOTE_KIND (link) == REG_DEP_OUTPUT)
*cost = insn_default_latency (dep) - 3;
else
*cost = insn_default_latency (dep);
return false;
}
else
{
if (REG_NOTE_KIND (link) == REG_DEP_OUTPUT)
*cost = insn_default_latency (dep) + 1;
else
*cost = insn_default_latency (dep);
}
return false;
}
}
}
}
break;
default:
gcc_unreachable ();
}
return true;
}
/* Adjust cost hook for FA726TE. */
static bool
fa726te_sched_adjust_cost (rtx_insn *insn, rtx link, rtx_insn *dep, int * cost)
{
/* For FA726TE, a true dependency on the CPSR (i.e. a flag-setting insn
followed by a predicated one) has a penalty of 3 cycles. */
if (REG_NOTE_KIND (link) == REG_DEP_TRUE
&& recog_memoized (insn) >= 0
&& recog_memoized (dep) >= 0
&& get_attr_conds (dep) == CONDS_SET)
{
/* Use of carry (e.g. 64-bit arithmetic) in ALU: 3-cycle latency. */
if (get_attr_conds (insn) == CONDS_USE
&& get_attr_type (insn) != TYPE_BRANCH)
{
*cost = 3;
return false;
}
if (GET_CODE (PATTERN (insn)) == COND_EXEC
|| get_attr_conds (insn) == CONDS_USE)
{
*cost = 0;
return false;
}
}
return true;
}
/* Implement TARGET_REGISTER_MOVE_COST.
Moves between VFP_REGS and GENERAL_REGS are a single insn, but
that insn is typically more expensive than a single memory access. We set
the cost to less than two memory accesses so that floating
point to integer conversion does not go through memory. */
int
arm_register_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
reg_class_t from, reg_class_t to)
{
if (TARGET_32BIT)
{
if ((IS_VFP_CLASS (from) && !IS_VFP_CLASS (to))
|| (!IS_VFP_CLASS (from) && IS_VFP_CLASS (to)))
return 15;
else if ((from == IWMMXT_REGS && to != IWMMXT_REGS)
|| (from != IWMMXT_REGS && to == IWMMXT_REGS))
return 4;
else if (from == IWMMXT_GR_REGS || to == IWMMXT_GR_REGS)
return 20;
else
return 2;
}
else
{
if (from == HI_REGS || to == HI_REGS)
return 4;
else
return 2;
}
}
/* Implement TARGET_MEMORY_MOVE_COST. */
int
arm_memory_move_cost (machine_mode mode, reg_class_t rclass,
bool in ATTRIBUTE_UNUSED)
{
if (TARGET_32BIT)
return 10;
else
{
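/* Thumb-1: cost scales with the access size; non-LO_REGS classes are
costed double (e.g. an SImode move costs 8 for LO_REGS, 16 otherwise). */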
if (GET_MODE_SIZE (mode) < 4)
return 8;
else
return ((2 * GET_MODE_SIZE (mode)) * (rclass == LO_REGS ? 1 : 2));
}
}
/* Vectorizer cost model implementation. */
/* Implement targetm.vectorize.builtin_vectorization_cost. */
static int
arm_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
tree vectype,
int misalign ATTRIBUTE_UNUSED)
{
unsigned elements;
switch (type_of_cost)
{
case scalar_stmt:
return current_tune->vec_costs->scalar_stmt_cost;
case scalar_load:
return current_tune->vec_costs->scalar_load_cost;
case scalar_store:
return current_tune->vec_costs->scalar_store_cost;
case vector_stmt:
return current_tune->vec_costs->vec_stmt_cost;
case vector_load:
return current_tune->vec_costs->vec_align_load_cost;
case vector_store:
return current_tune->vec_costs->vec_store_cost;
case vec_to_scalar:
return current_tune->vec_costs->vec_to_scalar_cost;
case scalar_to_vec:
return current_tune->vec_costs->scalar_to_vec_cost;
case unaligned_load:
return current_tune->vec_costs->vec_unalign_load_cost;
case unaligned_store:
return current_tune->vec_costs->vec_unalign_store_cost;
case cond_branch_taken:
return current_tune->vec_costs->cond_taken_branch_cost;
case cond_branch_not_taken:
return current_tune->vec_costs->cond_not_taken_branch_cost;
case vec_perm:
case vec_promote_demote:
return current_tune->vec_costs->vec_stmt_cost;
case vec_construct:
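/* Rough estimate of building a vector element by element,
e.g. a 4-element vector costs 4/2 + 1 = 3. */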
elements = TYPE_VECTOR_SUBPARTS (vectype);
return elements / 2 + 1;
default:
gcc_unreachable ();
}
}
/* Implement targetm.vectorize.add_stmt_cost. */
static unsigned
arm_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
struct _stmt_vec_info *stmt_info, int misalign,
enum vect_cost_model_location where)
{
unsigned *cost = (unsigned *) data;
unsigned retval = 0;
if (flag_vect_cost_model)
{
tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
int stmt_cost = arm_builtin_vectorization_cost (kind, vectype, misalign);
/* Statements in an inner loop relative to the loop being
vectorized are weighted more heavily. The value here is
arbitrary and could potentially be improved with analysis. */
if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
count *= 50; /* FIXME. */
retval = (unsigned) (count * stmt_cost);
cost[where] += retval;
}
return retval;
}
/* Return true if and only if this insn can dual-issue only as older. */
static bool
cortexa7_older_only (rtx_insn *insn)
{
if (recog_memoized (insn) < 0)
return false;
switch (get_attr_type (insn))
{
case TYPE_ALU_DSP_REG:
case TYPE_ALU_SREG:
case TYPE_ALUS_SREG:
case TYPE_LOGIC_REG:
case TYPE_LOGICS_REG:
case TYPE_ADC_REG:
case TYPE_ADCS_REG:
case TYPE_ADR:
case TYPE_BFM:
case TYPE_REV:
case TYPE_MVN_REG:
case TYPE_SHIFT_IMM:
case TYPE_SHIFT_REG:
case TYPE_LOAD_BYTE:
case TYPE_LOAD1:
case TYPE_STORE1:
case TYPE_FFARITHS:
case TYPE_FADDS:
case TYPE_FFARITHD:
case TYPE_FADDD:
case TYPE_FMOV:
case TYPE_F_CVT:
case TYPE_FCMPS:
case TYPE_FCMPD:
case TYPE_FCONSTS:
case TYPE_FCONSTD:
case TYPE_FMULS:
case TYPE_FMACS:
case TYPE_FMULD:
case TYPE_FMACD:
case TYPE_FDIVS:
case TYPE_FDIVD:
case TYPE_F_MRC:
case TYPE_F_MRRC:
case TYPE_F_FLAG:
case TYPE_F_LOADS:
case TYPE_F_STORES:
return true;
default:
return false;
}
}
/* Return true if and only if this insn can dual-issue as younger. */
static bool
cortexa7_younger (FILE *file, int verbose, rtx_insn *insn)
{
if (recog_memoized (insn) < 0)
{
if (verbose > 5)
fprintf (file, ";; not cortexa7_younger %d\n", INSN_UID (insn));
return false;
}
switch (get_attr_type (insn))
{
case TYPE_ALU_IMM:
case TYPE_ALUS_IMM:
case TYPE_LOGIC_IMM:
case TYPE_LOGICS_IMM:
case TYPE_EXTEND:
case TYPE_MVN_IMM:
case TYPE_MOV_IMM:
case TYPE_MOV_REG:
case TYPE_MOV_SHIFT:
case TYPE_MOV_SHIFT_REG:
case TYPE_BRANCH:
case TYPE_CALL:
return true;
default:
return false;
}
}
/* Look for an instruction that can dual issue only as an older
instruction, and move it in front of any instructions that can
dual-issue as younger, while preserving the relative order of all
other instructions in the ready list. This is a heuristic to help
dual-issue in later cycles, by postponing issue of more flexible
instructions. This heuristic may affect dual issue opportunities
in the current cycle. */
static void
cortexa7_sched_reorder (FILE *file, int verbose, rtx_insn **ready,
int *n_readyp, int clock)
{
int i;
int first_older_only = -1, first_younger = -1;
if (verbose > 5)
fprintf (file,
";; sched_reorder for cycle %d with %d insns in ready list\n",
clock,
*n_readyp);
/* Traverse the ready list from the head (the instruction to issue
first), looking for the first instruction that can issue as
younger and the first instruction that can dual-issue only as
older. */
for (i = *n_readyp - 1; i >= 0; i--)
{
rtx_insn *insn = ready[i];
if (cortexa7_older_only (insn))
{
first_older_only = i;
if (verbose > 5)
fprintf (file, ";; reorder older found %d\n", INSN_UID (insn));
break;
}
else if (cortexa7_younger (file, verbose, insn) && first_younger == -1)
first_younger = i;
}
/* Nothing to reorder because either no younger insn found or insn
that can dual-issue only as older appears before any insn that
can dual-issue as younger. */
if (first_younger == -1)
{
if (verbose > 5)
fprintf (file, ";; sched_reorder nothing to reorder as no younger\n");
return;
}
/* Nothing to reorder because no older-only insn in the ready list. */
if (first_older_only == -1)
{
if (verbose > 5)
fprintf (file, ";; sched_reorder nothing to reorder as no older_only\n");
return;
}
/* Move first_older_only insn before first_younger. */
if (verbose > 5)
fprintf (file, ";; cortexa7_sched_reorder insn %d before %d\n",
INSN_UID (ready[first_older_only]),
INSN_UID (ready[first_younger]));
rtx_insn *first_older_only_insn = ready [first_older_only];
for (i = first_older_only; i < first_younger; i++)
{
ready[i] = ready[i+1];
}
ready[i] = first_older_only_insn;
return;
}
/* Implement TARGET_SCHED_REORDER. */
static int
arm_sched_reorder (FILE *file, int verbose, rtx_insn **ready, int *n_readyp,
int clock)
{
switch (arm_tune)
{
case cortexa7:
cortexa7_sched_reorder (file, verbose, ready, n_readyp, clock);
break;
default:
/* Do nothing for other cores. */
break;
}
return arm_issue_rate ();
}
/* This function implements the target macro TARGET_SCHED_ADJUST_COST.
It corrects the value of COST based on the relationship between
INSN and DEP through the dependence LINK. It returns the new
value. There is a per-core adjust_cost hook to adjust scheduler costs
and the per-core hook can choose to completely override the generic
adjust_cost function. Only put bits of code into arm_adjust_cost that
are common across all cores. */
static int
arm_adjust_cost (rtx_insn *insn, rtx link, rtx_insn *dep, int cost)
{
rtx i_pat, d_pat;
/* When generating Thumb-1 code, we want to place flag-setting operations
close to a conditional branch which depends on them, so that we can
omit the comparison. */
if (TARGET_THUMB1
&& REG_NOTE_KIND (link) == 0
&& recog_memoized (insn) == CODE_FOR_cbranchsi4_insn
&& recog_memoized (dep) >= 0
&& get_attr_conds (dep) == CONDS_SET)
return 0;
if (current_tune->sched_adjust_cost != NULL)
{
if (!current_tune->sched_adjust_cost (insn, link, dep, &cost))
return cost;
}
/* XXX Is this strictly true? */
if (REG_NOTE_KIND (link) == REG_DEP_ANTI
|| REG_NOTE_KIND (link) == REG_DEP_OUTPUT)
return 0;
/* Call insns don't incur a stall, even if they follow a load. */
if (REG_NOTE_KIND (link) == 0
&& CALL_P (insn))
return 1;
if ((i_pat = single_set (insn)) != NULL
&& MEM_P (SET_SRC (i_pat))
&& (d_pat = single_set (dep)) != NULL
&& MEM_P (SET_DEST (d_pat)))
{
rtx src_mem = XEXP (SET_SRC (i_pat), 0);
/* This is a load after a store, there is no conflict if the load reads
from a cached area. Assume that loads from the stack, and from the
constant pool are cached, and that others will miss. This is a
hack. */
if ((GET_CODE (src_mem) == SYMBOL_REF
&& CONSTANT_POOL_ADDRESS_P (src_mem))
|| reg_mentioned_p (stack_pointer_rtx, src_mem)
|| reg_mentioned_p (frame_pointer_rtx, src_mem)
|| reg_mentioned_p (hard_frame_pointer_rtx, src_mem))
return 1;
}
return cost;
}
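/* Return the maximum number of insns that may be conditionally executed
in one block (the value of max_insns_skipped). */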
int
arm_max_conditional_execute (void)
{
return max_insns_skipped;
}
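/* Default branch cost: cheap (1) when optimizing Thumb-2 for size,
otherwise 4 on 32-bit targets; 2 (or 0 when not optimizing) for Thumb-1. */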
static int
arm_default_branch_cost (bool speed_p, bool predictable_p ATTRIBUTE_UNUSED)
{
if (TARGET_32BIT)
return (TARGET_THUMB2 && !speed_p) ? 1 : 4;
else
return (optimize > 0) ? 2 : 0;
}
static int
arm_cortex_a5_branch_cost (bool speed_p, bool predictable_p)
{
return speed_p ? 0 : arm_default_branch_cost (speed_p, predictable_p);
}
/* Thumb-2 branches are relatively cheap on Cortex-M processors ("1 + P cycles"
on Cortex-M4, where P varies from 1 to 3 according to some criteria), since
sequences of non-executed instructions in IT blocks probably take the same
amount of time as executed instructions (and the IT instruction itself takes
space in icache). This function was experimentally determined to give good
results on a popular embedded benchmark. */
static int
arm_cortex_m_branch_cost (bool speed_p, bool predictable_p)
{
return (TARGET_32BIT && speed_p) ? 1
: arm_default_branch_cost (speed_p, predictable_p);
}
static int
arm_cortex_m7_branch_cost (bool speed_p, bool predictable_p)
{
return speed_p ? 0 : arm_default_branch_cost (speed_p, predictable_p);
}
static bool fp_consts_inited = false;
static REAL_VALUE_TYPE value_fp0;
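/* Initialize the FP constant(s) checked by arm_const_double_rtx. */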
static void
init_fp_table (void)
{
REAL_VALUE_TYPE r;
r = REAL_VALUE_ATOF ("0", DFmode);
value_fp0 = r;
fp_consts_inited = true;
}
/* Return TRUE if rtx X is a valid immediate FP constant. */
int
arm_const_double_rtx (rtx x)
{
REAL_VALUE_TYPE r;
if (!fp_consts_inited)
init_fp_table ();
REAL_VALUE_FROM_CONST_DOUBLE (r, x);
if (REAL_VALUE_MINUS_ZERO (r))
return 0;
if (REAL_VALUES_EQUAL (r, value_fp0))
return 1;
return 0;
}
/* VFPv3 has a fairly wide range of representable immediates, formed from
"quarter-precision" floating-point values. These can be evaluated using this
formula (with ^ for exponentiation):
-1^s * n * 2^-r
Where 's' is a sign bit (0/1), 'n' and 'r' are integers such that
16 <= n <= 31 and 0 <= r <= 7.
These values are mapped onto an 8-bit integer ABCDEFGH s.t.
- A (most-significant) is the sign bit.
- BCD are the exponent (encoded as r XOR 3).
- EFGH are the mantissa (encoded as n - 16).
*/
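/* For example, 1.0 = 16 * 2^-4, so s = 0, n = 16 and r = 4, giving the
encoding 0 111 0000 (0x70). */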
/* Return an integer index for a VFPv3 immediate operand X suitable for the
fconst[sd] instruction, or -1 if X isn't suitable. */
static int
vfp3_const_double_index (rtx x)
{
REAL_VALUE_TYPE r, m;
int sign, exponent;
unsigned HOST_WIDE_INT mantissa, mant_hi;
unsigned HOST_WIDE_INT mask;
int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
bool fail;
if (!TARGET_VFP3 || !CONST_DOUBLE_P (x))
return -1;
REAL_VALUE_FROM_CONST_DOUBLE (r, x);
/* We can't represent these things, so detect them first. */
if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r) || REAL_VALUE_MINUS_ZERO (r))
return -1;
/* Extract sign, exponent and mantissa. */
sign = REAL_VALUE_NEGATIVE (r) ? 1 : 0;
r = real_value_abs (&r);
exponent = REAL_EXP (&r);
/* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
highest (sign) bit, with a fixed binary point at bit point_pos.
WARNING: If there's ever a VFP version which uses more than 2 * H_W_I - 1
bits for the mantissa, this may fail (low bits would be lost). */
real_ldexp (&m, &r, point_pos - exponent);
wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
mantissa = w.elt (0);
mant_hi = w.elt (1);
/* If there are bits set in the low part of the mantissa, we can't
represent this value. */
if (mantissa != 0)
return -1;
/* Now make it so that mantissa contains the most-significant bits, and move
the point_pos to indicate that the least-significant bits have been
discarded. */
point_pos -= HOST_BITS_PER_WIDE_INT;
mantissa = mant_hi;
/* We can permit four significant bits of mantissa only, plus a high bit
which is always 1. */
mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
if ((mantissa & mask) != 0)
return -1;
/* Now we know the mantissa is in range, chop off the unneeded bits. */
mantissa >>= point_pos - 5;
/* The mantissa may be zero. Disallow that case. (It's possible to load the
floating-point immediate zero with Neon using an integer-zero load, but
that case is handled elsewhere.) */
if (mantissa == 0)
return -1;
gcc_assert (mantissa >= 16 && mantissa <= 31);
/* The value of 5 here would be 4 if GCC used IEEE754-like encoding (where
normalized significands are in the range [1, 2). (Our mantissa is shifted
left 4 places at this point relative to normalized IEEE754 values). GCC
internally uses [0.5, 1) (see real.c), so the exponent returned from
REAL_EXP must be altered. */
exponent = 5 - exponent;
if (exponent < 0 || exponent > 7)
return -1;
/* Sign, mantissa and exponent are now in the correct form to plug into the
formula described in the comment above. */
return (sign << 7) | ((exponent ^ 3) << 4) | (mantissa - 16);
}
/* Return TRUE if rtx X is a valid immediate VFPv3 constant. */
int
vfp3_const_double_rtx (rtx x)
{
if (!TARGET_VFP3)
return 0;
return vfp3_const_double_index (x) != -1;
}
/* Recognize immediates which can be used in various Neon instructions. Legal
immediates are described by the following table (for VMVN variants, the
bitwise inverse of the constant shown is recognized. In either case, VMOV
is output and the correct instruction to use for a given constant is chosen
by the assembler). The constant shown is replicated across all elements of
the destination vector.
insn elems variant constant (binary)
---- ----- ------- -----------------
vmov i32 0 00000000 00000000 00000000 abcdefgh
vmov i32 1 00000000 00000000 abcdefgh 00000000
vmov i32 2 00000000 abcdefgh 00000000 00000000
vmov i32 3 abcdefgh 00000000 00000000 00000000
vmov i16 4 00000000 abcdefgh
vmov i16 5 abcdefgh 00000000
vmvn i32 6 00000000 00000000 00000000 abcdefgh
vmvn i32 7 00000000 00000000 abcdefgh 00000000
vmvn i32 8 00000000 abcdefgh 00000000 00000000
vmvn i32 9 abcdefgh 00000000 00000000 00000000
vmvn i16 10 00000000 abcdefgh
vmvn i16 11 abcdefgh 00000000
vmov i32 12 00000000 00000000 abcdefgh 11111111
vmvn i32 13 00000000 00000000 abcdefgh 11111111
vmov i32 14 00000000 abcdefgh 11111111 11111111
vmvn i32 15 00000000 abcdefgh 11111111 11111111
vmov i8 16 abcdefgh
vmov i64 17 aaaaaaaa bbbbbbbb cccccccc dddddddd
eeeeeeee ffffffff gggggggg hhhhhhhh
vmov f32 18 aBbbbbbc defgh000 00000000 00000000
vmov f32 19 00000000 00000000 00000000 00000000
For case 18, B = !b. Representable values are exactly those accepted by
vfp3_const_double_index, but are output as floating-point numbers rather
than indices.
For case 19, we will change it to vmov.i32 when assembling.
Variants 0-5 (inclusive) may also be used as immediates for the second
operand of VORR/VBIC instructions.
The INVERSE argument causes the bitwise inverse of the given operand to be
recognized instead (used for recognizing legal immediates for the VAND/VORN
pseudo-instructions). If INVERSE is true, the value placed in *MODCONST is
*not* inverted (i.e. the pseudo-instruction forms vand/vorn should still be
output, rather than the real insns vbic/vorr).
INVERSE makes no difference to the recognition of float vectors.
The return value is the variant of immediate as shown in the above table, or
-1 if the given value doesn't match any of the listed patterns.
*/
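/* For example, a vector whose 32-bit elements all equal 0x00AB0000 matches
variant 2 above and can be loaded with a single vmov.i32. */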
static int
neon_valid_immediate (rtx op, machine_mode mode, int inverse,
rtx *modconst, int *elementwidth)
{
#define CHECK(STRIDE, ELSIZE, CLASS, TEST) \
matches = 1; \
for (i = 0; i < idx; i += (STRIDE)) \
if (!(TEST)) \
matches = 0; \
if (matches) \
{ \
immtype = (CLASS); \
elsize = (ELSIZE); \
break; \
}
unsigned int i, elsize = 0, idx = 0, n_elts;
unsigned int innersize;
unsigned char bytes[16];
int immtype = -1, matches;
unsigned int invmask = inverse ? 0xff : 0;
bool vector = GET_CODE (op) == CONST_VECTOR;
if (vector)
{
n_elts = CONST_VECTOR_NUNITS (op);
innersize = GET_MODE_SIZE (GET_MODE_INNER (mode));
}
else
{
n_elts = 1;
if (mode == VOIDmode)
mode = DImode;
innersize = GET_MODE_SIZE (mode);
}
/* Vectors of float constants. */
if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
{
rtx el0 = CONST_VECTOR_ELT (op, 0);
REAL_VALUE_TYPE r0;
if (!vfp3_const_double_rtx (el0) && el0 != CONST0_RTX (GET_MODE (el0)))
return -1;
REAL_VALUE_FROM_CONST_DOUBLE (r0, el0);
for (i = 1; i < n_elts; i++)
{
rtx elt = CONST_VECTOR_ELT (op, i);
REAL_VALUE_TYPE re;
REAL_VALUE_FROM_CONST_DOUBLE (re, elt);
if (!REAL_VALUES_EQUAL (r0, re))
return -1;
}
if (modconst)
*modconst = CONST_VECTOR_ELT (op, 0);
if (elementwidth)
*elementwidth = 0;
if (el0 == CONST0_RTX (GET_MODE (el0)))
return 19;
else
return 18;
}
/* Splat vector constant out into a byte vector. */
for (i = 0; i < n_elts; i++)
{
rtx el = vector ? CONST_VECTOR_ELT (op, i) : op;
unsigned HOST_WIDE_INT elpart;
unsigned int part, parts;
if (CONST_INT_P (el))
{
elpart = INTVAL (el);
parts = 1;
}
else if (CONST_DOUBLE_P (el))
{
elpart = CONST_DOUBLE_LOW (el);
parts = 2;
}
else
gcc_unreachable ();
for (part = 0; part < parts; part++)
{
unsigned int byte;
for (byte = 0; byte < innersize; byte++)
{
bytes[idx++] = (elpart & 0xff) ^ invmask;
elpart >>= BITS_PER_UNIT;
}
if (CONST_DOUBLE_P (el))
elpart = CONST_DOUBLE_HIGH (el);
}
}
/* Sanity check. */
gcc_assert (idx == GET_MODE_SIZE (mode));
do
{
CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
&& bytes[i + 2] == 0 && bytes[i + 3] == 0);
CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
&& bytes[i + 2] == 0 && bytes[i + 3] == 0);
CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
&& bytes[i + 2] == bytes[2] && bytes[i + 3] == 0);
CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
&& bytes[i + 2] == 0 && bytes[i + 3] == bytes[3]);
CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0);
CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1]);
CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
&& bytes[i + 2] == 0xff && bytes[i + 3] == 0xff);
CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
&& bytes[i + 2] == 0xff && bytes[i + 3] == 0xff);
CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
&& bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff);
CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
&& bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3]);
CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff);
CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1]);
CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
&& bytes[i + 2] == 0 && bytes[i + 3] == 0);
CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
&& bytes[i + 2] == 0xff && bytes[i + 3] == 0xff);
CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
&& bytes[i + 2] == bytes[2] && bytes[i + 3] == 0);
CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
&& bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff);
CHECK (1, 8, 16, bytes[i] == bytes[0]);
CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
&& bytes[i] == bytes[(i + 8) % idx]);
}
while (0);
if (immtype == -1)
return -1;
if (elementwidth)
*elementwidth = elsize;
if (modconst)
{
unsigned HOST_WIDE_INT imm = 0;
/* Un-invert bytes of recognized vector, if necessary. */
if (invmask != 0)
for (i = 0; i < idx; i++)
bytes[i] ^= invmask;
if (immtype == 17)
{
/* FIXME: Broken on 32-bit H_W_I hosts. */
gcc_assert (sizeof (HOST_WIDE_INT) == 8);
for (i = 0; i < 8; i++)
imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
<< (i * BITS_PER_UNIT);
*modconst = GEN_INT (imm);
}
else
{
unsigned HOST_WIDE_INT imm = 0;
for (i = 0; i < elsize / BITS_PER_UNIT; i++)
imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
*modconst = GEN_INT (imm);
}
}
return immtype;
#undef CHECK
}
/* Return TRUE if rtx X is legal for use as either a Neon VMOV (or, implicitly,
VMVN) immediate. Write back width per element to *ELEMENTWIDTH (or zero for
float elements), and a modified constant (whatever should be output for a
VMOV) in *MODCONST. */
int
neon_immediate_valid_for_move (rtx op, machine_mode mode,
rtx *modconst, int *elementwidth)
{
rtx tmpconst;
int tmpwidth;
int retval = neon_valid_immediate (op, mode, 0, &tmpconst, &tmpwidth);
if (retval == -1)
return 0;
if (modconst)
*modconst = tmpconst;
if (elementwidth)
*elementwidth = tmpwidth;
return 1;
}
/* Return TRUE if rtx X is legal for use in a VORR or VBIC instruction. If
the immediate is valid, write a constant suitable for using as an operand
to VORR/VBIC/VAND/VORN to *MODCONST and the corresponding element width to
*ELEMENTWIDTH. See neon_valid_immediate for description of INVERSE. */
int
neon_immediate_valid_for_logic (rtx op, machine_mode mode, int inverse,
rtx *modconst, int *elementwidth)
{
rtx tmpconst;
int tmpwidth;
int retval = neon_valid_immediate (op, mode, inverse, &tmpconst, &tmpwidth);
if (retval < 0 || retval > 5)
return 0;
if (modconst)
*modconst = tmpconst;
if (elementwidth)
*elementwidth = tmpwidth;
return 1;
}
/* Return TRUE if rtx OP is legal for use in a VSHR or VSHL instruction. If
the immediate is valid, write a constant suitable for using as an operand
to VSHR/VSHL to *MODCONST and the corresponding element width to
*ELEMENTWIDTH. ISLEFTSHIFT selects between left and right shifts, which
have different range limitations. */
int
neon_immediate_valid_for_shift (rtx op, machine_mode mode,
rtx *modconst, int *elementwidth,
bool isleftshift)
{
unsigned int innersize = GET_MODE_SIZE (GET_MODE_INNER (mode));
unsigned int n_elts = CONST_VECTOR_NUNITS (op), i;
unsigned HOST_WIDE_INT last_elt = 0;
unsigned HOST_WIDE_INT maxshift;
/* Split vector constant out into a byte vector. */
for (i = 0; i < n_elts; i++)
{
rtx el = CONST_VECTOR_ELT (op, i);
unsigned HOST_WIDE_INT elpart;
if (CONST_INT_P (el))
elpart = INTVAL (el);
else if (CONST_DOUBLE_P (el))
return 0;
else
gcc_unreachable ();
if (i != 0 && elpart != last_elt)
return 0;
last_elt = elpart;
}
/* Shift less than element size. */
maxshift = innersize * 8;
if (isleftshift)
{
/* Left shift immediate value can be from 0 to <size>-1. */
if (last_elt >= maxshift)
return 0;
}
else
{
/* Right shift immediate value can be from 1 to <size>. */
if (last_elt == 0 || last_elt > maxshift)
return 0;
}
if (elementwidth)
*elementwidth = innersize * 8;
if (modconst)
*modconst = CONST_VECTOR_ELT (op, 0);
return 1;
}
/* Return a string suitable for output of Neon immediate logic operation
MNEM. */
char *
neon_output_logic_immediate (const char *mnem, rtx *op2, machine_mode mode,
int inverse, int quad)
{
int width, is_valid;
static char templ[40];
is_valid = neon_immediate_valid_for_logic (*op2, mode, inverse, op2, &width);
gcc_assert (is_valid != 0);
if (quad)
sprintf (templ, "%s.i%d\t%%q0, %%2", mnem, width);
else
sprintf (templ, "%s.i%d\t%%P0, %%2", mnem, width);
return templ;
}
/* Return a string suitable for output of Neon immediate shift operation
(VSHR or VSHL) MNEM. */
char *
neon_output_shift_immediate (const char *mnem, char sign, rtx *op2,
machine_mode mode, int quad,
bool isleftshift)
{
int width, is_valid;
static char templ[40];
is_valid = neon_immediate_valid_for_shift (*op2, mode, op2, &width, isleftshift);
gcc_assert (is_valid != 0);
if (quad)
sprintf (templ, "%s.%c%d\t%%q0, %%q1, %%2", mnem, sign, width);
else
sprintf (templ, "%s.%c%d\t%%P0, %%P1, %%2", mnem, sign, width);
return templ;
}
/* Output a sequence of pairwise operations to implement a reduction.
NOTE: We do "too much work" here, because pairwise operations work on two
registers' worth of operands in one go. Unfortunately, I don't think we can
exploit those extra calculations to do the full operation in fewer steps.
Although all vector elements of the result but the first are ignored, we
actually calculate the same result in each of the elements. An alternative
such as initially loading a vector with zero to use as each of the second
operands would use up an additional register and take an extra instruction,
for no particular gain. */
void
neon_pairwise_reduce (rtx op0, rtx op1, machine_mode mode,
rtx (*reduc) (rtx, rtx, rtx))
{
machine_mode inner = GET_MODE_INNER (mode);
unsigned int i, parts = GET_MODE_SIZE (mode) / GET_MODE_SIZE (inner);
rtx tmpsum = op1;
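/* Halve the number of live partial sums each step; e.g. an 8-element
vector takes three pairwise ops (i = 4, 2, 1). */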
for (i = parts / 2; i >= 1; i /= 2)
{
rtx dest = (i == 1) ? op0 : gen_reg_rtx (mode);
emit_insn (reduc (dest, tmpsum, tmpsum));
tmpsum = dest;
}
}
/* If VALS is a vector constant that can be loaded into a register
using VDUP, generate instructions to do so and return an RTX to
assign to the register. Otherwise return NULL_RTX. */
static rtx
neon_vdup_constant (rtx vals)
{
machine_mode mode = GET_MODE (vals);
machine_mode inner_mode = GET_MODE_INNER (mode);
int n_elts = GET_MODE_NUNITS (mode);
bool all_same = true;
rtx x;
int i;
if (GET_CODE (vals) != CONST_VECTOR || GET_MODE_SIZE (inner_mode) > 4)
return NULL_RTX;
for (i = 0; i < n_elts; ++i)
{
x = XVECEXP (vals, 0, i);
if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
all_same = false;
}
if (!all_same)
/* The elements are not all the same. We could handle repeating
patterns of a mode larger than INNER_MODE here (e.g. int8x8_t
{0, C, 0, C, 0, C, 0, C} which can be loaded using
vdup.i16). */
return NULL_RTX;
/* We can load this constant by using VDUP and a constant in a
single ARM register. This will be cheaper than a vector
load. */
x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, 0));
return gen_rtx_VEC_DUPLICATE (mode, x);
}
/* Generate code to load VALS, which is a PARALLEL containing only
constants (for vec_init) or CONST_VECTOR, efficiently into a
register. Returns an RTX to copy into the register, or NULL_RTX
for a PARALLEL that cannot be converted into a CONST_VECTOR. */
rtx
neon_make_constant (rtx vals)
{
machine_mode mode = GET_MODE (vals);
rtx target;
rtx const_vec = NULL_RTX;
int n_elts = GET_MODE_NUNITS (mode);
int n_const = 0;
int i;
if (GET_CODE (vals) == CONST_VECTOR)
const_vec = vals;
else if (GET_CODE (vals) == PARALLEL)
{
/* A CONST_VECTOR must contain only CONST_INTs and
CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
Only store valid constants in a CONST_VECTOR. */
for (i = 0; i < n_elts; ++i)
{
rtx x = XVECEXP (vals, 0, i);
if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
n_const++;
}
if (n_const == n_elts)
const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
}
else
gcc_unreachable ();
if (const_vec != NULL
&& neon_immediate_valid_for_move (const_vec, mode, NULL, NULL))
/* Load using VMOV. On Cortex-A8 this takes one cycle. */
return const_vec;
else if ((target = neon_vdup_constant (vals)) != NULL_RTX)
/* Loaded using VDUP. On Cortex-A8 the VDUP takes one NEON
pipeline cycle; creating the constant takes one or two ARM
pipeline cycles. */
return target;
else if (const_vec != NULL_RTX)
/* Load from constant pool. On Cortex-A8 this takes two cycles
(for either double or quad vectors). We cannot take advantage
of single-cycle VLD1 because we need a PC-relative addressing
mode. */
return const_vec;
else
/* A PARALLEL containing something not valid inside CONST_VECTOR.
We cannot construct an initializer. */
return NULL_RTX;
}
/* Initialize vector TARGET to VALS. */
void
neon_expand_vector_init (rtx target, rtx vals)
{
machine_mode mode = GET_MODE (target);
machine_mode inner_mode = GET_MODE_INNER (mode);
int n_elts = GET_MODE_NUNITS (mode);
int n_var = 0, one_var = -1;
bool all_same = true;
rtx x, mem;
int i;
for (i = 0; i < n_elts; ++i)
{
x = XVECEXP (vals, 0, i);
if (!CONSTANT_P (x))
++n_var, one_var = i;
if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
all_same = false;
}
if (n_var == 0)
{
rtx constant = neon_make_constant (vals);
if (constant != NULL_RTX)
{
emit_move_insn (target, constant);
return;
}
}
/* Splat a single non-constant element if we can. */
if (all_same && GET_MODE_SIZE (inner_mode) <= 4)
{
x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, 0));
emit_insn (gen_rtx_SET (VOIDmode, target,
gen_rtx_VEC_DUPLICATE (mode, x)));
return;
}
/* One field is non-constant. Load constant then overwrite varying
field. This is more efficient than using the stack. */
if (n_var == 1)
{
rtx copy = copy_rtx (vals);
rtx index = GEN_INT (one_var);
/* Load constant part of vector, substitute neighboring value for
varying element. */
XVECEXP (copy, 0, one_var) = XVECEXP (vals, 0, (one_var + 1) % n_elts);
neon_expand_vector_init (target, copy);
/* Insert variable. */
x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, one_var));
switch (mode)
{
case V8QImode:
emit_insn (gen_neon_vset_lanev8qi (target, x, target, index));
break;
case V16QImode:
emit_insn (gen_neon_vset_lanev16qi (target, x, target, index));
break;
case V4HImode:
emit_insn (gen_neon_vset_lanev4hi (target, x, target, index));
break;
case V8HImode:
emit_insn (gen_neon_vset_lanev8hi (target, x, target, index));
break;
case V2SImode:
emit_insn (gen_neon_vset_lanev2si (target, x, target, index));
break;
case V4SImode:
emit_insn (gen_neon_vset_lanev4si (target, x, target, index));
break;
case V2SFmode:
emit_insn (gen_neon_vset_lanev2sf (target, x, target, index));
break;
case V4SFmode:
emit_insn (gen_neon_vset_lanev4sf (target, x, target, index));
break;
case V2DImode:
emit_insn (gen_neon_vset_lanev2di (target, x, target, index));
break;
default:
gcc_unreachable ();
}
return;
}
/* Construct the vector in memory one field at a time
and load the whole vector. */
mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
for (i = 0; i < n_elts; i++)
emit_move_insn (adjust_address_nv (mem, inner_mode,
i * GET_MODE_SIZE (inner_mode)),
XVECEXP (vals, 0, i));
emit_move_insn (target, mem);
}
/* Ensure OPERAND lies between LOW (inclusive) and HIGH (exclusive). Raise
ERR if it doesn't. FIXME: NEON bounds checks occur late in compilation, so
reported source locations are bogus. */
static void
bounds_check (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
const char *err)
{
HOST_WIDE_INT lane;
gcc_assert (CONST_INT_P (operand));
lane = INTVAL (operand);
if (lane < low || lane >= high)
error (err);
}
/* Bounds-check lanes. */
void
neon_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high)
{
bounds_check (operand, low, high, "lane out of range");
}
/* Bounds-check constants. */
void
neon_const_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high)
{
bounds_check (operand, low, high, "constant out of range");
}
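/* Return the width in bits of one element of MODE (the full register
width for DImode). */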
HOST_WIDE_INT
neon_element_bits (machine_mode mode)
{
if (mode == DImode)
return GET_MODE_BITSIZE (mode);
else
return GET_MODE_BITSIZE (GET_MODE_INNER (mode));
}
/* Predicates for `match_operand' and `match_operator'. */
/* Return TRUE if OP is a valid coprocessor memory address pattern.
WB is true if full writeback address modes are allowed and is false
if limited writeback address modes (POST_INC and PRE_DEC) are
allowed. */
int
arm_coproc_mem_operand (rtx op, bool wb)
{
rtx ind;
/* Reject eliminable registers. */
if (! (reload_in_progress || reload_completed || lra_in_progress)
&& ( reg_mentioned_p (frame_pointer_rtx, op)
|| reg_mentioned_p (arg_pointer_rtx, op)
|| reg_mentioned_p (virtual_incoming_args_rtx, op)
|| reg_mentioned_p (virtual_outgoing_args_rtx, op)
|| reg_mentioned_p (virtual_stack_dynamic_rtx, op)
|| reg_mentioned_p (virtual_stack_vars_rtx, op)))
return FALSE;
/* Constants are converted into offsets from labels. */
if (!MEM_P (op))
return FALSE;
ind = XEXP (op, 0);
if (reload_completed
&& (GET_CODE (ind) == LABEL_REF
|| (GET_CODE (ind) == CONST
&& GET_CODE (XEXP (ind, 0)) == PLUS
&& GET_CODE (XEXP (XEXP (ind, 0), 0)) == LABEL_REF
&& CONST_INT_P (XEXP (XEXP (ind, 0), 1)))))
return TRUE;
/* Match: (mem (reg)). */
if (REG_P (ind))
return arm_address_register_rtx_p (ind, 0);
/* Autoincrement addressing modes. POST_INC and PRE_DEC are
acceptable in any case (subject to verification by
arm_address_register_rtx_p). We need WB to be true to accept
PRE_INC and POST_DEC. */
if (GET_CODE (ind) == POST_INC
|| GET_CODE (ind) == PRE_DEC
|| (wb
&& (GET_CODE (ind) == PRE_INC
|| GET_CODE (ind) == POST_DEC)))
return arm_address_register_rtx_p (XEXP (ind, 0), 0);
if (wb
&& (GET_CODE (ind) == POST_MODIFY || GET_CODE (ind) == PRE_MODIFY)
&& arm_address_register_rtx_p (XEXP (ind, 0), 0)
&& GET_CODE (XEXP (ind, 1)) == PLUS
&& rtx_equal_p (XEXP (XEXP (ind, 1), 0), XEXP (ind, 0)))
ind = XEXP (ind, 1);
/* Match:
(plus (reg)
(const)). */
if (GET_CODE (ind) == PLUS
&& REG_P (XEXP (ind, 0))
&& REG_MODE_OK_FOR_BASE_P (XEXP (ind, 0), VOIDmode)
&& CONST_INT_P (XEXP (ind, 1))
&& INTVAL (XEXP (ind, 1)) > -1024
&& INTVAL (XEXP (ind, 1)) < 1024
&& (INTVAL (XEXP (ind, 1)) & 3) == 0)
return TRUE;
return FALSE;
}
/* Return TRUE if OP is a memory operand which we can load or store a vector
to/from. TYPE is one of the following values:
0 - Vector load/store (vldr)
1 - Core registers (ldm)
2 - Element/structure loads (vld1)
*/
int
neon_vector_mem_operand (rtx op, int type, bool strict)
{
rtx ind;
/* Reject eliminable registers. */
if (! (reload_in_progress || reload_completed)
&& ( reg_mentioned_p (frame_pointer_rtx, op)
|| reg_mentioned_p (arg_pointer_rtx, op)
|| reg_mentioned_p (virtual_incoming_args_rtx, op)
|| reg_mentioned_p (virtual_outgoing_args_rtx, op)
|| reg_mentioned_p (virtual_stack_dynamic_rtx, op)
|| reg_mentioned_p (virtual_stack_vars_rtx, op)))
return !strict;
/* Constants are converted into offsets from labels. */
if (!MEM_P (op))
return FALSE;
ind = XEXP (op, 0);
if (reload_completed
&& (GET_CODE (ind) == LABEL_REF
|| (GET_CODE (ind) == CONST
&& GET_CODE (XEXP (ind, 0)) == PLUS
&& GET_CODE (XEXP (XEXP (ind, 0), 0)) == LABEL_REF
&& CONST_INT_P (XEXP (XEXP (ind, 0), 1)))))
return TRUE;
/* Match: (mem (reg)). */
if (REG_P (ind))
return arm_address_register_rtx_p (ind, 0);
/* Allow post-increment with Neon registers. */
if ((type != 1 && GET_CODE (ind) == POST_INC)
|| (type == 0 && GET_CODE (ind) == PRE_DEC))
return arm_address_register_rtx_p (XEXP (ind, 0), 0);
/* Allow post-increment by register for VLDn */
if (type == 2 && GET_CODE (ind) == POST_MODIFY
&& GET_CODE (XEXP (ind, 1)) == PLUS
&& REG_P (XEXP (XEXP (ind, 1), 1)))
return true;
/* Match:
(plus (reg)
(const)). */
if (type == 0
&& GET_CODE (ind) == PLUS
&& REG_P (XEXP (ind, 0))
&& REG_MODE_OK_FOR_BASE_P (XEXP (ind, 0), VOIDmode)
&& CONST_INT_P (XEXP (ind, 1))
&& INTVAL (XEXP (ind, 1)) > -1024
/* For quad modes, we restrict the constant offset to be slightly less
than what the instruction format permits. We have no such constraint
on double mode offsets. (This must match arm_legitimate_index_p.) */
&& (INTVAL (XEXP (ind, 1))
< (VALID_NEON_QREG_MODE (GET_MODE (op))? 1016 : 1024))
&& (INTVAL (XEXP (ind, 1)) & 3) == 0)
return TRUE;
return FALSE;
}
/* Return TRUE if OP is a mem suitable for loading/storing a Neon struct
type. */
int
neon_struct_mem_operand (rtx op)
{
rtx ind;
/* Reject eliminable registers. */
if (! (reload_in_progress || reload_completed)
&& ( reg_mentioned_p (frame_pointer_rtx, op)
|| reg_mentioned_p (arg_pointer_rtx, op)
|| reg_mentioned_p (virtual_incoming_args_rtx, op)
|| reg_mentioned_p (virtual_outgoing_args_rtx, op)
|| reg_mentioned_p (virtual_stack_dynamic_rtx, op)
|| reg_mentioned_p (virtual_stack_vars_rtx, op)))
return FALSE;
/* Constants are converted into offsets from labels. */
if (!MEM_P (op))
return FALSE;
ind = XEXP (op, 0);
if (reload_completed
&& (GET_CODE (ind) == LABEL_REF
|| (GET_CODE (ind) == CONST
&& GET_CODE (XEXP (ind, 0)) == PLUS
&& GET_CODE (XEXP (XEXP (ind, 0), 0)) == LABEL_REF
&& CONST_INT_P (XEXP (XEXP (ind, 0), 1)))))
return TRUE;
/* Match: (mem (reg)). */
if (REG_P (ind))
return arm_address_register_rtx_p (ind, 0);
/* vldm/vstm allows POST_INC (ia) and PRE_DEC (db). */
if (GET_CODE (ind) == POST_INC
|| GET_CODE (ind) == PRE_DEC)
return arm_address_register_rtx_p (XEXP (ind, 0), 0);
return FALSE;
}
/* Return true if X is a register that will be eliminated later on. */
int
arm_eliminable_register (rtx x)
{
return REG_P (x) && (REGNO (x) == FRAME_POINTER_REGNUM
|| REGNO (x) == ARG_POINTER_REGNUM
|| (REGNO (x) >= FIRST_VIRTUAL_REGISTER
&& REGNO (x) <= LAST_VIRTUAL_REGISTER));
}
/* Return GENERAL_REGS if a scratch register is required to reload X to/from
coprocessor registers. Otherwise return NO_REGS. */
enum reg_class
coproc_secondary_reload_class (machine_mode mode, rtx x, bool wb)
{
if (mode == HFmode)
{
if (!TARGET_NEON_FP16)
return GENERAL_REGS;
if (s_register_operand (x, mode) || neon_vector_mem_operand (x, 2, true))
return NO_REGS;
return GENERAL_REGS;
}
/* The neon move patterns handle all legitimate vector and struct
addresses. */
if (TARGET_NEON
&& (MEM_P (x) || GET_CODE (x) == CONST_VECTOR)
&& (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
|| GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT
|| VALID_NEON_STRUCT_MODE (mode)))
return NO_REGS;
if (arm_coproc_mem_operand (x, wb) || s_register_operand (x, mode))
return NO_REGS;
return GENERAL_REGS;
}
/* Values which must be returned in the most-significant end of the return
register. */
static bool
arm_return_in_msb (const_tree valtype)
{
return (TARGET_AAPCS_BASED
&& BYTES_BIG_ENDIAN
&& (AGGREGATE_TYPE_P (valtype)
|| TREE_CODE (valtype) == COMPLEX_TYPE
|| FIXED_POINT_TYPE_P (valtype)));
}
/* Return TRUE if X references a SYMBOL_REF. */
int
symbol_mentioned_p (rtx x)
{
const char * fmt;
int i;
if (GET_CODE (x) == SYMBOL_REF)
return 1;
/* UNSPEC_TLS entries for a symbol include the SYMBOL_REF, but they
are constant offsets, not symbols. */
if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
return 0;
fmt = GET_RTX_FORMAT (GET_CODE (x));
for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
{
if (fmt[i] == 'E')
{
int j;
for (j = XVECLEN (x, i) - 1; j >= 0; j--)
if (symbol_mentioned_p (XVECEXP (x, i, j)))
return 1;
}
else if (fmt[i] == 'e' && symbol_mentioned_p (XEXP (x, i)))
return 1;
}
return 0;
}
/* Return TRUE if X references a LABEL_REF. */
int
label_mentioned_p (rtx x)
{
const char * fmt;
int i;
if (GET_CODE (x) == LABEL_REF)
return 1;
/* UNSPEC_TLS entries for a symbol include a LABEL_REF for the referencing
instruction, but they are constant offsets, not symbols. */
if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
return 0;
fmt = GET_RTX_FORMAT (GET_CODE (x));
for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
{
if (fmt[i] == 'E')
{
int j;
for (j = XVECLEN (x, i) - 1; j >= 0; j--)
if (label_mentioned_p (XVECEXP (x, i, j)))
return 1;
}
else if (fmt[i] == 'e' && label_mentioned_p (XEXP (x, i)))
return 1;
}
return 0;
}
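/* Return TRUE if X is an UNSPEC_TLS unspec, possibly wrapped in a CONST
(i.e. a reference to thread-local storage).  */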
int
tls_mentioned_p (rtx x)
{
switch (GET_CODE (x))
{
case CONST:
return tls_mentioned_p (XEXP (x, 0));
case UNSPEC:
if (XINT (x, 1) == UNSPEC_TLS)
return 1;
default:
return 0;
}
}
/* Must not copy any rtx that uses a pc-relative address. */
static bool
arm_cannot_copy_insn_p (rtx_insn *insn)
{
/* The tls call insn cannot be copied, as it is paired with a data
word. */
if (recog_memoized (insn) == CODE_FOR_tlscall)
return true;
subrtx_iterator::array_type array;
FOR_EACH_SUBRTX (iter, array, PATTERN (insn), ALL)
{
const_rtx x = *iter;
if (GET_CODE (x) == UNSPEC
&& (XINT (x, 1) == UNSPEC_PIC_BASE
|| XINT (x, 1) == UNSPEC_PIC_UNIFIED))
return true;
}
return false;
}
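/* Return the comparison code under which the first operand of a min/max
rtx X is selected; e.g. (smax a b) yields the same value as
(if_then_else (ge a b) a b).  */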
enum rtx_code
minmax_code (rtx x)
{
enum rtx_code code = GET_CODE (x);
switch (code)
{
case SMAX:
return GE;
case SMIN:
return LE;
case UMIN:
return LEU;
case UMAX:
return GEU;
default:
gcc_unreachable ();
}
}
/* Match a pair of min/max operators that can be implemented via usat/ssat.  */
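/* As a worked example of the check below: bounds [0, 255] give log == 8,
so *MASK is set to 8 and *SIGNED_SAT to false (a usat #8 candidate), while
bounds [-128, 127] give log == 7, so *MASK becomes 8 and *SIGNED_SAT true
(an ssat #8 candidate).  */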
bool
arm_sat_operator_match (rtx lo_bound, rtx hi_bound,
int *mask, bool *signed_sat)
{
/* The high bound must be a power of two minus one. */
int log = exact_log2 (INTVAL (hi_bound) + 1);
if (log == -1)
return false;
/* The low bound is either zero (for usat) or one less than the
negation of the high bound (for ssat). */
if (INTVAL (lo_bound) == 0)
{
if (mask)
*mask = log;
if (signed_sat)
*signed_sat = false;
return true;
}
if (INTVAL (lo_bound) == -INTVAL (hi_bound) - 1)
{
if (mask)
*mask = log + 1;
if (signed_sat)
*signed_sat = true;
return true;
}
return false;
}
/* Return 1 if memory locations are adjacent. */
int
adjacent_mem_locations (rtx a, rtx b)
{
/* We don't guarantee to preserve the order of these memory refs. */
if (volatile_refs_p (a) || volatile_refs_p (b))
return 0;
if ((REG_P (XEXP (a, 0))
|| (GET_CODE (XEXP (a, 0)) == PLUS
&& CONST_INT_P (XEXP (XEXP (a, 0), 1))))
&& (REG_P (XEXP (b, 0))
|| (GET_CODE (XEXP (b, 0)) == PLUS
&& CONST_INT_P (XEXP (XEXP (b, 0), 1)))))
{
HOST_WIDE_INT val0 = 0, val1 = 0;
rtx reg0, reg1;
int val_diff;
if (GET_CODE (XEXP (a, 0)) == PLUS)
{
reg0 = XEXP (XEXP (a, 0), 0);
val0 = INTVAL (XEXP (XEXP (a, 0), 1));
}
else
reg0 = XEXP (a, 0);
if (GET_CODE (XEXP (b, 0)) == PLUS)
{
reg1 = XEXP (XEXP (b, 0), 0);
val1 = INTVAL (XEXP (XEXP (b, 0), 1));
}
else
reg1 = XEXP (b, 0);
/* Don't accept any offset that will require multiple
instructions to handle, since this would cause the
arith_adjacentmem pattern to output an overlong sequence. */
if (!const_ok_for_op (val0, PLUS) || !const_ok_for_op (val1, PLUS))
return 0;
/* Don't allow an eliminable register: register elimination can make
the offset too large. */
if (arm_eliminable_register (reg0))
return 0;
val_diff = val1 - val0;
if (arm_ld_sched)
{
/* If the target has load delay slots, then there's no benefit
to using an ldm instruction unless the offset is zero and
we are optimizing for size. */
return (optimize_size && (REGNO (reg0) == REGNO (reg1))
&& (val0 == 0 || val1 == 0 || val0 == 4 || val1 == 4)
&& (val_diff == 4 || val_diff == -4));
}
return ((REGNO (reg0) == REGNO (reg1))
&& (val_diff == 4 || val_diff == -4));
}
return 0;
}
/* Return true if OP is a valid load or store multiple operation. LOAD is true
for load operations, false for store operations. CONSECUTIVE is true
if the register numbers in the operation must be consecutive in the register
bank.  RETURN_PC is true if the value is to be loaded into the PC.
The pattern we are trying to match for load is:
[(SET (R_d0) (MEM (PLUS (addr) (offset))))
(SET (R_d1) (MEM (PLUS (addr) (offset + <reg_increment>))))
:
:
(SET (R_dn) (MEM (PLUS (addr) (offset + n * <reg_increment>))))
]
where
1. If offset is 0, first insn should be (SET (R_d0) (MEM (src_addr))).
2. REGNO (R_d0) < REGNO (R_d1) < ... < REGNO (R_dn).
3. If consecutive is TRUE, then for kth register being loaded,
REGNO (R_dk) = REGNO (R_d0) + k.
The pattern for store is similar. */
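/* As an illustrative sketch (the exact rtl comes from the md patterns), a
three-register load with write-back, roughly "ldmia r0!, {r1, r2, r3}",
would be presented as a PARALLEL of the form
[(set (reg r0) (plus (reg r0) (const_int 12)))
(set (reg r1) (mem (reg r0)))
(set (reg r2) (mem (plus (reg r0) (const_int 4))))
(set (reg r3) (mem (plus (reg r0) (const_int 8))))].  */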
bool
ldm_stm_operation_p (rtx op, bool load, machine_mode mode,
bool consecutive, bool return_pc)
{
HOST_WIDE_INT count = XVECLEN (op, 0);
rtx reg, mem, addr;
unsigned regno;
unsigned first_regno;
HOST_WIDE_INT i = 1, base = 0, offset = 0;
rtx elt;
bool addr_reg_in_reglist = false;
bool update = false;
int reg_increment;
int offset_adj;
int regs_per_val;
/* If not in SImode, then registers must be consecutive
(e.g., VLDM instructions for DFmode). */
gcc_assert ((mode == SImode) || consecutive);
/* Setting return_pc for stores is illegal. */
gcc_assert (!return_pc || load);
/* Set up the increments and the regs per val based on the mode. */
reg_increment = GET_MODE_SIZE (mode);
regs_per_val = reg_increment / 4;
offset_adj = return_pc ? 1 : 0;
if (count <= 1
|| GET_CODE (XVECEXP (op, 0, offset_adj)) != SET
|| (load && !REG_P (SET_DEST (XVECEXP (op, 0, offset_adj)))))
return false;
/* Check if this is a write-back. */
elt = XVECEXP (op, 0, offset_adj);
if (GET_CODE (SET_SRC (elt)) == PLUS)
{
i++;
base = 1;
update = true;
/* The offset adjustment must be the number of registers being
popped times the size of a single register. */
if (!REG_P (SET_DEST (elt))
|| !REG_P (XEXP (SET_SRC (elt), 0))
|| (REGNO (SET_DEST (elt)) != REGNO (XEXP (SET_SRC (elt), 0)))
|| !CONST_INT_P (XEXP (SET_SRC (elt), 1))
|| INTVAL (XEXP (SET_SRC (elt), 1)) !=
((count - 1 - offset_adj) * reg_increment))
return false;
}
i = i + offset_adj;
base = base + offset_adj;
/* Perform a quick check so we don't blow up below. If only one reg is loaded,
success depends on the type: VLDM can do just one reg,
LDM must do at least two. */
if ((count <= i) && (mode == SImode))
return false;
elt = XVECEXP (op, 0, i - 1);
if (GET_CODE (elt) != SET)
return false;
if (load)
{
reg = SET_DEST (elt);
mem = SET_SRC (elt);
}
else
{
reg = SET_SRC (elt);
mem = SET_DEST (elt);
}
if (!REG_P (reg) || !MEM_P (mem))
return false;
regno = REGNO (reg);
first_regno = regno;
addr = XEXP (mem, 0);
if (GET_CODE (addr) == PLUS)
{
if (!CONST_INT_P (XEXP (addr, 1)))
return false;
offset = INTVAL (XEXP (addr, 1));
addr = XEXP (addr, 0);
}
if (!REG_P (addr))
return false;
/* Don't allow SP to be loaded unless it is also the base register. It
guarantees that SP is reset correctly when an LDM instruction
is interrupted. Otherwise, we might end up with a corrupt stack. */
if (load && (REGNO (reg) == SP_REGNUM) && (REGNO (addr) != SP_REGNUM))
return false;
for (; i < count; i++)
{
elt = XVECEXP (op, 0, i);
if (GET_CODE (elt) != SET)
return false;
if (load)
{
reg = SET_DEST (elt);
mem = SET_SRC (elt);
}
else
{
reg = SET_SRC (elt);
mem = SET_DEST (elt);
}
if (!REG_P (reg)
|| GET_MODE (reg) != mode
|| REGNO (reg) <= regno
|| (consecutive
&& (REGNO (reg) !=
(unsigned int) (first_regno + regs_per_val * (i - base))))
/* Don't allow SP to be loaded unless it is also the base register. It
guarantees that SP is reset correctly when an LDM instruction
is interrupted. Otherwise, we might end up with a corrupt stack. */
|| (load && (REGNO (reg) == SP_REGNUM) && (REGNO (addr) != SP_REGNUM))
|| !MEM_P (mem)
|| GET_MODE (mem) != mode
|| ((GET_CODE (XEXP (mem, 0)) != PLUS
|| !rtx_equal_p (XEXP (XEXP (mem, 0), 0), addr)
|| !CONST_INT_P (XEXP (XEXP (mem, 0), 1))
|| (INTVAL (XEXP (XEXP (mem, 0), 1)) !=
offset + (i - base) * reg_increment))
&& (!REG_P (XEXP (mem, 0))
|| offset + (i - base) * reg_increment != 0)))
return false;
regno = REGNO (reg);
if (regno == REGNO (addr))
addr_reg_in_reglist = true;
}
if (load)
{
if (update && addr_reg_in_reglist)
return false;
/* For Thumb-1, the address register is always modified, either by write-back
or by an explicit load.  If the pattern does not describe an update,
then the address register must be in the list of loaded registers. */
if (TARGET_THUMB1)
return update || addr_reg_in_reglist;
}
return true;
}
/* Return true iff it would be profitable to turn a sequence of NOPS loads
or stores (depending on IS_STORE) into a load-multiple or store-multiple
instruction. ADD_OFFSET is nonzero if the base address register needs
to be modified with an add instruction before we can use it. */
static bool
multiple_operation_profitable_p (bool is_store ATTRIBUTE_UNUSED,
int nops, HOST_WIDE_INT add_offset)
{
/* For ARM8, ARM9 and StrongARM, 2 ldr instructions are faster than an ldm
if the offset isn't small enough. The reason 2 ldrs are faster
is because these ARMs are able to do more than one cache access
in a single cycle. The ARM9 and StrongARM have Harvard caches,
whilst the ARM8 has a double bandwidth cache. This means that
these cores can do both an instruction fetch and a data fetch in
a single cycle, so the trick of calculating the address into a
scratch register (one of the result regs) and then doing a load
multiple actually becomes slower (and no smaller in code size).
That is the transformation
ldr rd1, [rbase + offset]
ldr rd2, [rbase + offset + 4]
to
add rd1, rbase, offset
ldmia rd1, {rd1, rd2}
produces worse code -- '3 cycles + any stalls on rd2' instead of
'2 cycles + any stalls on rd2'. On ARMs with only one cache
access per cycle, the first sequence could never complete in less
than 6 cycles, whereas the ldm sequence would only take 5 and
would make better use of sequential accesses if not hitting the
cache.
We cheat here and test 'arm_ld_sched' which we currently know to
only be true for the ARM8, ARM9 and StrongARM. If this ever
changes, then the test below needs to be reworked. */
if (nops == 2 && arm_ld_sched && add_offset != 0)
return false;
/* XScale has load-store double instructions, but they have stricter
alignment requirements than load-store multiple, so we cannot
use them.
For XScale ldm requires 2 + NREGS cycles to complete and blocks
the pipeline until completion.
NREGS CYCLES
1 3
2 4
3 5
4 6
An ldr instruction takes 1-3 cycles, but does not block the
pipeline.
NREGS CYCLES
1 1-3
2 2-6
3 3-9
4 4-12
Best case ldr will always win. However, the more ldr instructions
we issue, the less likely we are to be able to schedule them well.
Using ldr instructions also increases code size.
As a compromise, we use ldr for counts of 1 or 2 regs, and ldm
for counts of 3 or 4 regs. */
if (nops <= 2 && arm_tune_xscale && !optimize_size)
return false;
return true;
}
/* Subroutine of load_multiple_sequence and store_multiple_sequence.
Given an array of UNSORTED_OFFSETS, of which there are NOPS, compute
an array ORDER which describes the sequence to use when accessing the
offsets that produces an ascending order. In this sequence, each
offset must be larger by exactly 4 than the previous one. ORDER[0]
must have been filled in with the lowest offset by the caller.
If UNSORTED_REGS is nonnull, it is an array of register numbers that
we use to verify that ORDER produces an ascending order of registers.
Return true if it was possible to construct such an order, false if
not. */
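/* For example, with NOPS == 4, UNSORTED_OFFSETS == {8, 0, 4, 12} and
ORDER[0] == 1 (the index of the lowest offset), the loop below produces
ORDER == {1, 2, 0, 3}; offsets {8, 0, 4, 16} would fail because no offset
is exactly 4 above 8.  */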
static bool
compute_offset_order (int nops, HOST_WIDE_INT *unsorted_offsets, int *order,
int *unsorted_regs)
{
int i;
for (i = 1; i < nops; i++)
{
int j;
order[i] = order[i - 1];
for (j = 0; j < nops; j++)
if (unsorted_offsets[j] == unsorted_offsets[order[i - 1]] + 4)
{
/* We must find exactly one offset that is higher than the
previous one by 4. */
if (order[i] != order[i - 1])
return false;
order[i] = j;
}
if (order[i] == order[i - 1])
return false;
/* The register numbers must be ascending. */
if (unsorted_regs != NULL
&& unsorted_regs[order[i]] <= unsorted_regs[order[i - 1]])
return false;
}
return true;
}
/* Used to determine in a peephole whether a sequence of load
instructions can be changed into a load-multiple instruction.
NOPS is the number of separate load instructions we are examining. The
first NOPS entries in OPERANDS are the destination registers, the
next NOPS entries are memory operands. If this function is
successful, *BASE is set to the common base register of the memory
accesses; *LOAD_OFFSET is set to the first memory location's offset
from that base register.
REGS is an array filled in with the destination register numbers.
SAVED_ORDER (if nonnull) is an array filled in with an order that maps
insn numbers to an ascending order of loads.  If CHECK_REGS is true,
the sequence of registers in REGS matches the loads from ascending memory
locations, and the function verifies that the register numbers are
themselves ascending. If CHECK_REGS is false, the register numbers
are stored in the order they are found in the operands. */
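/* As a sketch of the intent: four peepholed loads such as
ldr r0, [r4]; ldr r1, [r4, #4]; ldr r2, [r4, #8]; ldr r3, [r4, #12]
yield *BASE == 4 (r4), *LOAD_OFFSET == 0 and REGS == {0, 1, 2, 3}, and the
return value selects ldmia (case 1 below).  */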
static int
load_multiple_sequence (rtx *operands, int nops, int *regs, int *saved_order,
int *base, HOST_WIDE_INT *load_offset, bool check_regs)
{
int unsorted_regs[MAX_LDM_STM_OPS];
HOST_WIDE_INT unsorted_offsets[MAX_LDM_STM_OPS];
int order[MAX_LDM_STM_OPS];
rtx base_reg_rtx = NULL;
int base_reg = -1;
int i, ldm_case;
/* Can only handle up to MAX_LDM_STM_OPS insns at present, though could be
easily extended if required. */
gcc_assert (nops >= 2 && nops <= MAX_LDM_STM_OPS);
memset (order, 0, MAX_LDM_STM_OPS * sizeof (int));
/* Loop over the operands and check that the memory references are
suitable (i.e. immediate offsets from the same base register). At
the same time, extract the target register, and the memory
offsets. */
for (i = 0; i < nops; i++)
{
rtx reg;
rtx offset;
/* Convert a subreg of a mem into the mem itself. */
if (GET_CODE (operands[nops + i]) == SUBREG)
operands[nops + i] = alter_subreg (operands + (nops + i), true);
gcc_assert (MEM_P (operands[nops + i]));
/* Don't reorder volatile memory references; it doesn't seem worth
looking for the case where the order is ok anyway. */
if (MEM_VOLATILE_P (operands[nops + i]))
return 0;
offset = const0_rtx;
if ((REG_P (reg = XEXP (operands[nops + i], 0))
|| (GET_CODE (reg) == SUBREG
&& REG_P (reg = SUBREG_REG (reg))))
|| (GET_CODE (XEXP (operands[nops + i], 0)) == PLUS
&& ((REG_P (reg = XEXP (XEXP (operands[nops + i], 0), 0)))
|| (GET_CODE (reg) == SUBREG
&& REG_P (reg = SUBREG_REG (reg))))
&& (CONST_INT_P (offset
= XEXP (XEXP (operands[nops + i], 0), 1)))))
{
if (i == 0)
{
base_reg = REGNO (reg);
base_reg_rtx = reg;
if (TARGET_THUMB1 && base_reg > LAST_LO_REGNUM)
return 0;
}
else if (base_reg != (int) REGNO (reg))
/* Not addressed from the same base register. */
return 0;
unsorted_regs[i] = (REG_P (operands[i])
? REGNO (operands[i])
: REGNO (SUBREG_REG (operands[i])));
/* If it isn't an integer register, or if it overwrites the
base register but isn't the last insn in the list, then
we can't do this. */
if (unsorted_regs[i] < 0
|| (TARGET_THUMB1 && unsorted_regs[i] > LAST_LO_REGNUM)
|| unsorted_regs[i] > 14
|| (i != nops - 1 && unsorted_regs[i] == base_reg))
return 0;
/* Don't allow SP to be loaded unless it is also the base
register. It guarantees that SP is reset correctly when
an LDM instruction is interrupted. Otherwise, we might
end up with a corrupt stack. */
if (unsorted_regs[i] == SP_REGNUM && base_reg != SP_REGNUM)
return 0;
unsorted_offsets[i] = INTVAL (offset);
if (i == 0 || unsorted_offsets[i] < unsorted_offsets[order[0]])
order[0] = i;
}
else
/* Not a suitable memory address. */
return 0;
}
/* All the useful information has now been extracted from the
operands into unsorted_regs and unsorted_offsets; additionally,
order[0] has been set to the lowest offset in the list. Sort
the offsets into order, verifying that they are adjacent, and
check that the register numbers are ascending. */
if (!compute_offset_order (nops, unsorted_offsets, order,
check_regs ? unsorted_regs : NULL))
return 0;
if (saved_order)
memcpy (saved_order, order, sizeof order);
if (base)
{
*base = base_reg;
for (i = 0; i < nops; i++)
regs[i] = unsorted_regs[check_regs ? order[i] : i];
*load_offset = unsorted_offsets[order[0]];
}
if (TARGET_THUMB1
&& !peep2_reg_dead_p (nops, base_reg_rtx))
return 0;
if (unsorted_offsets[order[0]] == 0)
ldm_case = 1; /* ldmia */
else if (TARGET_ARM && unsorted_offsets[order[0]] == 4)
ldm_case = 2; /* ldmib */
else if (TARGET_ARM && unsorted_offsets[order[nops - 1]] == 0)
ldm_case = 3; /* ldmda */
else if (TARGET_32BIT && unsorted_offsets[order[nops - 1]] == -4)
ldm_case = 4; /* ldmdb */
else if (const_ok_for_arm (unsorted_offsets[order[0]])
|| const_ok_for_arm (-unsorted_offsets[order[0]]))
ldm_case = 5;
else
return 0;
if (!multiple_operation_profitable_p (false, nops,
ldm_case == 5
? unsorted_offsets[order[0]] : 0))
return 0;
return ldm_case;
}
/* Used to determine in a peephole whether a sequence of store instructions can
be changed into a store-multiple instruction.
NOPS is the number of separate store instructions we are examining.
NOPS_TOTAL is the total number of instructions recognized by the peephole
pattern.
The first NOPS entries in OPERANDS are the source registers, the next
NOPS entries are memory operands. If this function is successful, *BASE is
set to the common base register of the memory accesses; *LOAD_OFFSET is set
to the first memory location's offset from that base register. REGS is an
array filled in with the source register numbers, REG_RTXS (if nonnull) is
likewise filled with the corresponding rtx's.
SAVED_ORDER (if nonnull) is an array filled in with an order that maps insn
numbers to an ascending order of stores.
If CHECK_REGS is true, the sequence of registers in *REGS matches the stores
from ascending memory locations, and the function verifies that the register
numbers are themselves ascending. If CHECK_REGS is false, the register
numbers are stored in the order they are found in the operands. */
static int
store_multiple_sequence (rtx *operands, int nops, int nops_total,
int *regs, rtx *reg_rtxs, int *saved_order, int *base,
HOST_WIDE_INT *load_offset, bool check_regs)
{
int unsorted_regs[MAX_LDM_STM_OPS];
rtx unsorted_reg_rtxs[MAX_LDM_STM_OPS];
HOST_WIDE_INT unsorted_offsets[MAX_LDM_STM_OPS];
int order[MAX_LDM_STM_OPS];
int base_reg = -1;
rtx base_reg_rtx = NULL;
int i, stm_case;
/* Write-back of the base register is currently only supported for Thumb-1.  */
int base_writeback = TARGET_THUMB1;
/* Can only handle up to MAX_LDM_STM_OPS insns at present, though could be
easily extended if required. */
gcc_assert (nops >= 2 && nops <= MAX_LDM_STM_OPS);
memset (order, 0, MAX_LDM_STM_OPS * sizeof (int));
/* Loop over the operands and check that the memory references are
suitable (i.e. immediate offsets from the same base register). At
the same time, extract the target register, and the memory
offsets. */
for (i = 0; i < nops; i++)
{
rtx reg;
rtx offset;
/* Convert a subreg of a mem into the mem itself. */
if (GET_CODE (operands[nops + i]) == SUBREG)
operands[nops + i] = alter_subreg (operands + (nops + i), true);
gcc_assert (MEM_P (operands[nops + i]));
/* Don't reorder volatile memory references; it doesn't seem worth
looking for the case where the order is ok anyway. */
if (MEM_VOLATILE_P (operands[nops + i]))
return 0;
offset = const0_rtx;
if ((REG_P (reg = XEXP (operands[nops + i], 0))
|| (GET_CODE (reg) == SUBREG
&& REG_P (reg = SUBREG_REG (reg))))
|| (GET_CODE (XEXP (operands[nops + i], 0)) == PLUS
&& ((REG_P (reg = XEXP (XEXP (operands[nops + i], 0), 0)))
|| (GET_CODE (reg) == SUBREG
&& REG_P (reg = SUBREG_REG (reg))))
&& (CONST_INT_P (offset
= XEXP (XEXP (operands[nops + i], 0), 1)))))
{
unsorted_reg_rtxs[i] = (REG_P (operands[i])
? operands[i] : SUBREG_REG (operands[i]));
unsorted_regs[i] = REGNO (unsorted_reg_rtxs[i]);
if (i == 0)
{
base_reg = REGNO (reg);
base_reg_rtx = reg;
if (TARGET_THUMB1 && base_reg > LAST_LO_REGNUM)
return 0;
}
else if (base_reg != (int) REGNO (reg))
/* Not addressed from the same base register. */
return 0;
/* If it isn't an integer register, then we can't do this. */
if (unsorted_regs[i] < 0
|| (TARGET_THUMB1 && unsorted_regs[i] > LAST_LO_REGNUM)
/* The effects are unpredictable if the base register is
both updated and stored. */
|| (base_writeback && unsorted_regs[i] == base_reg)
|| (TARGET_THUMB2 && unsorted_regs[i] == SP_REGNUM)
|| unsorted_regs[i] > 14)
return 0;
unsorted_offsets[i] = INTVAL (offset);
if (i == 0 || unsorted_offsets[i] < unsorted_offsets[order[0]])
order[0] = i;
}
else
/* Not a suitable memory address. */
return 0;
}
/* All the useful information has now been extracted from the
operands into unsorted_regs and unsorted_offsets; additionally,
order[0] has been set to the lowest offset in the list. Sort
the offsets into order, verifying that they are adjacent, and
check that the register numbers are ascending. */
if (!compute_offset_order (nops, unsorted_offsets, order,
check_regs ? unsorted_regs : NULL))
return 0;
if (saved_order)
memcpy (saved_order, order, sizeof order);
if (base)
{
*base = base_reg;
for (i = 0; i < nops; i++)
{
regs[i] = unsorted_regs[check_regs ? order[i] : i];
if (reg_rtxs)
reg_rtxs[i] = unsorted_reg_rtxs[check_regs ? order[i] : i];
}
*load_offset = unsorted_offsets[order[0]];
}
if (TARGET_THUMB1
&& !peep2_reg_dead_p (nops_total, base_reg_rtx))
return 0;
if (unsorted_offsets[order[0]] == 0)
stm_case = 1; /* stmia */
else if (TARGET_ARM && unsorted_offsets[order[0]] == 4)
stm_case = 2; /* stmib */
else if (TARGET_ARM && unsorted_offsets[order[nops - 1]] == 0)
stm_case = 3; /* stmda */
else if (TARGET_32BIT && unsorted_offsets[order[nops - 1]] == -4)
stm_case = 4; /* stmdb */
else
return 0;
if (!multiple_operation_profitable_p (false, nops, 0))
return 0;
return stm_case;
}
/* Routines for use in generating RTL. */
/* Generate a load-multiple instruction. COUNT is the number of loads in
the instruction; REGS and MEMS are arrays containing the operands.
BASEREG is the base register to be used in addressing the memory operands.
WBACK_OFFSET is nonzero if the instruction should update the base
register. */
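/* When a multiple operation is judged profitable this returns a single
PARALLEL of SETs (preceded by the base-register update when WBACK_OFFSET
is nonzero); otherwise it falls back to emitting the equivalent sequence
of individual SImode moves and returns that sequence.  */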
static rtx
arm_gen_load_multiple_1 (int count, int *regs, rtx *mems, rtx basereg,
HOST_WIDE_INT wback_offset)
{
int i = 0, j;
rtx result;
if (!multiple_operation_profitable_p (false, count, 0))
{
rtx seq;
start_sequence ();
for (i = 0; i < count; i++)
emit_move_insn (gen_rtx_REG (SImode, regs[i]), mems[i]);
if (wback_offset != 0)
emit_move_insn (basereg, plus_constant (Pmode, basereg, wback_offset));
seq = get_insns ();
end_sequence ();
return seq;
}
result = gen_rtx_PARALLEL (VOIDmode,
rtvec_alloc (count + (wback_offset != 0 ? 1 : 0)));
if (wback_offset != 0)
{
XVECEXP (result, 0, 0)
= gen_rtx_SET (VOIDmode, basereg,
plus_constant (Pmode, basereg, wback_offset));
i = 1;
count++;
}
for (j = 0; i < count; i++, j++)
XVECEXP (result, 0, i)
= gen_rtx_SET (VOIDmode, gen_rtx_REG (SImode, regs[j]), mems[j]);
return result;
}
/* Generate a store-multiple instruction. COUNT is the number of stores in
the instruction; REGS and MEMS are arrays containing the operands.
BASEREG is the base register to be used in addressing the memory operands.
WBACK_OFFSET is nonzero if the instruction should update the base
register. */
static rtx
arm_gen_store_multiple_1 (int count, int *regs, rtx *mems, rtx basereg,
HOST_WIDE_INT wback_offset)
{
int i = 0, j;
rtx result;
if (GET_CODE (basereg) == PLUS)
basereg = XEXP (basereg, 0);
if (!multiple_operation_profitable_p (false, count, 0))
{
rtx seq;
start_sequence ();
for (i = 0; i < count; i++)
emit_move_insn (mems[i], gen_rtx_REG (SImode, regs[i]));
if (wback_offset != 0)
emit_move_insn (basereg, plus_constant (Pmode, basereg, wback_offset));
seq = get_insns ();
end_sequence ();
return seq;
}
result = gen_rtx_PARALLEL (VOIDmode,
rtvec_alloc (count + (wback_offset != 0 ? 1 : 0)));
if (wback_offset != 0)
{
XVECEXP (result, 0, 0)
= gen_rtx_SET (VOIDmode, basereg,
plus_constant (Pmode, basereg, wback_offset));
i = 1;
count++;
}
for (j = 0; i < count; i++, j++)
XVECEXP (result, 0, i)
= gen_rtx_SET (VOIDmode, mems[j], gen_rtx_REG (SImode, regs[j]));
return result;
}
/* Generate either a load-multiple or a store-multiple instruction. This
function can be used in situations where we can start with a single MEM
rtx and adjust its address upwards.
COUNT is the number of operations in the instruction, not counting a
possible update of the base register. REGS is an array containing the
register operands.
BASEREG is the base register to be used in addressing the memory operands,
which are constructed from BASEMEM.
WRITE_BACK specifies whether the generated instruction should include an
update of the base register.
OFFSETP is used to pass an offset to and from this function; this offset
is not used when constructing the address (instead BASEMEM should have an
appropriate offset in its address), it is used only for setting
MEM_OFFSET.  It is updated only if WRITE_BACK is true.  */
static rtx
arm_gen_multiple_op (bool is_load, int *regs, int count, rtx basereg,
bool write_back, rtx basemem, HOST_WIDE_INT *offsetp)
{
rtx mems[MAX_LDM_STM_OPS];
HOST_WIDE_INT offset = *offsetp;
int i;
gcc_assert (count <= MAX_LDM_STM_OPS);
if (GET_CODE (basereg) == PLUS)
basereg = XEXP (basereg, 0);
for (i = 0; i < count; i++)
{
rtx addr = plus_constant (Pmode, basereg, i * 4);
mems[i] = adjust_automodify_address_nv (basemem, SImode, addr, offset);
offset += 4;
}
if (write_back)
*offsetp = offset;
if (is_load)
return arm_gen_load_multiple_1 (count, regs, mems, basereg,
write_back ? 4 * count : 0);
else
return arm_gen_store_multiple_1 (count, regs, mems, basereg,
write_back ? 4 * count : 0);
}
rtx
arm_gen_load_multiple (int *regs, int count, rtx basereg, int write_back,
rtx basemem, HOST_WIDE_INT *offsetp)
{
return arm_gen_multiple_op (TRUE, regs, count, basereg, write_back, basemem,
offsetp);
}
rtx
arm_gen_store_multiple (int *regs, int count, rtx basereg, int write_back,
rtx basemem, HOST_WIDE_INT *offsetp)
{
return arm_gen_multiple_op (FALSE, regs, count, basereg, write_back, basemem,
offsetp);
}
/* Called from a peephole2 expander to turn a sequence of loads into an
LDM instruction. OPERANDS are the operands found by the peephole matcher;
NOPS indicates how many separate loads we are trying to combine. SORT_REGS
is true if we can reorder the registers because they are used commutatively
subsequently.
Returns true iff we could generate a new instruction. */
bool
gen_ldm_seq (rtx *operands, int nops, bool sort_regs)
{
int regs[MAX_LDM_STM_OPS], mem_order[MAX_LDM_STM_OPS];
rtx mems[MAX_LDM_STM_OPS];
int i, j, base_reg;
rtx base_reg_rtx;
HOST_WIDE_INT offset;
int write_back = FALSE;
int ldm_case;
rtx addr;
ldm_case = load_multiple_sequence (operands, nops, regs, mem_order,
&base_reg, &offset, !sort_regs);
if (ldm_case == 0)
return false;
if (sort_regs)
for (i = 0; i < nops - 1; i++)
for (j = i + 1; j < nops; j++)
if (regs[i] > regs[j])
{
int t = regs[i];
regs[i] = regs[j];
regs[j] = t;
}
base_reg_rtx = gen_rtx_REG (Pmode, base_reg);
if (TARGET_THUMB1)
{
gcc_assert (peep2_reg_dead_p (nops, base_reg_rtx));
gcc_assert (ldm_case == 1 || ldm_case == 5);
write_back = TRUE;
}
if (ldm_case == 5)
{
rtx newbase = TARGET_THUMB1 ? base_reg_rtx : gen_rtx_REG (SImode, regs[0]);
emit_insn (gen_addsi3 (newbase, base_reg_rtx, GEN_INT (offset)));
offset = 0;
if (!TARGET_THUMB1)
{
base_reg = regs[0];
base_reg_rtx = newbase;
}
}
for (i = 0; i < nops; i++)
{
addr = plus_constant (Pmode, base_reg_rtx, offset + i * 4);
mems[i] = adjust_automodify_address_nv (operands[nops + mem_order[i]],
SImode, addr, 0);
}
emit_insn (arm_gen_load_multiple_1 (nops, regs, mems, base_reg_rtx,
write_back ? offset + i * 4 : 0));
return true;
}
/* Called from a peephole2 expander to turn a sequence of stores into an
STM instruction. OPERANDS are the operands found by the peephole matcher;
NOPS indicates how many separate stores we are trying to combine.
Returns true iff we could generate a new instruction. */
bool
gen_stm_seq (rtx *operands, int nops)
{
int i;
int regs[MAX_LDM_STM_OPS], mem_order[MAX_LDM_STM_OPS];
rtx mems[MAX_LDM_STM_OPS];
int base_reg;
rtx base_reg_rtx;
HOST_WIDE_INT offset;
int write_back = FALSE;
int stm_case;
rtx addr;
bool base_reg_dies;
stm_case = store_multiple_sequence (operands, nops, nops, regs, NULL,
mem_order, &base_reg, &offset, true);
if (stm_case == 0)
return false;
base_reg_rtx = gen_rtx_REG (Pmode, base_reg);
base_reg_dies = peep2_reg_dead_p (nops, base_reg_rtx);
if (TARGET_THUMB1)
{
gcc_assert (base_reg_dies);
write_back = TRUE;
}
if (stm_case == 5)
{
gcc_assert (base_reg_dies);
emit_insn (gen_addsi3 (base_reg_rtx, base_reg_rtx, GEN_INT (offset)));
offset = 0;
}
addr = plus_constant (Pmode, base_reg_rtx, offset);
for (i = 0; i < nops; i++)
{
addr = plus_constant (Pmode, base_reg_rtx, offset + i * 4);
mems[i] = adjust_automodify_address_nv (operands[nops + mem_order[i]],
SImode, addr, 0);
}
emit_insn (arm_gen_store_multiple_1 (nops, regs, mems, base_reg_rtx,
write_back ? offset + i * 4 : 0));
return true;
}
/* Called from a peephole2 expander to turn a sequence of stores that are
preceded by constant loads into an STM instruction. OPERANDS are the
operands found by the peephole matcher; NOPS indicates how many
separate stores we are trying to combine; there are 2 * NOPS
instructions in the peephole.
Returns true iff we could generate a new instruction. */
bool
gen_const_stm_seq (rtx *operands, int nops)
{
int regs[MAX_LDM_STM_OPS], sorted_regs[MAX_LDM_STM_OPS];
int reg_order[MAX_LDM_STM_OPS], mem_order[MAX_LDM_STM_OPS];
rtx reg_rtxs[MAX_LDM_STM_OPS], orig_reg_rtxs[MAX_LDM_STM_OPS];
rtx mems[MAX_LDM_STM_OPS];
int base_reg;
rtx base_reg_rtx;
HOST_WIDE_INT offset;
int write_back = FALSE;
int stm_case;
rtx addr;
bool base_reg_dies;
int i, j;
HARD_REG_SET allocated;
stm_case = store_multiple_sequence (operands, nops, 2 * nops, regs, reg_rtxs,
mem_order, &base_reg, &offset, false);
if (stm_case == 0)
return false;
memcpy (orig_reg_rtxs, reg_rtxs, sizeof orig_reg_rtxs);
/* If the same register is used more than once, try to find a free
register. */
CLEAR_HARD_REG_SET (allocated);
for (i = 0; i < nops; i++)
{
for (j = i + 1; j < nops; j++)
if (regs[i] == regs[j])
{
rtx t = peep2_find_free_register (0, nops * 2,
TARGET_THUMB1 ? "l" : "r",
SImode, &allocated);
if (t == NULL_RTX)
return false;
reg_rtxs[i] = t;
regs[i] = REGNO (t);
}
}
/* Compute an ordering that maps the register numbers to an ascending
sequence. */
reg_order[0] = 0;
for (i = 0; i < nops; i++)
if (regs[i] < regs[reg_order[0]])
reg_order[0] = i;
for (i = 1; i < nops; i++)
{
int this_order = reg_order[i - 1];
for (j = 0; j < nops; j++)
if (regs[j] > regs[reg_order[i - 1]]
&& (this_order == reg_order[i - 1]
|| regs[j] < regs[this_order]))
this_order = j;
reg_order[i] = this_order;
}
/* Ensure that registers that must be live after the instruction end
up with the correct value. */
for (i = 0; i < nops; i++)
{
int this_order = reg_order[i];
if ((this_order != mem_order[i]
|| orig_reg_rtxs[this_order] != reg_rtxs[this_order])
&& !peep2_reg_dead_p (nops * 2, orig_reg_rtxs[this_order]))
return false;
}
/* Load the constants. */
for (i = 0; i < nops; i++)
{
rtx op = operands[2 * nops + mem_order[i]];
sorted_regs[i] = regs[reg_order[i]];
emit_move_insn (reg_rtxs[reg_order[i]], op);
}
base_reg_rtx = gen_rtx_REG (Pmode, base_reg);
base_reg_dies = peep2_reg_dead_p (nops * 2, base_reg_rtx);
if (TARGET_THUMB1)
{
gcc_assert (base_reg_dies);
write_back = TRUE;
}
if (stm_case == 5)
{
gcc_assert (base_reg_dies);
emit_insn (gen_addsi3 (base_reg_rtx, base_reg_rtx, GEN_INT (offset)));
offset = 0;
}
addr = plus_constant (Pmode, base_reg_rtx, offset);
for (i = 0; i < nops; i++)
{
addr = plus_constant (Pmode, base_reg_rtx, offset + i * 4);
mems[i] = adjust_automodify_address_nv (operands[nops + mem_order[i]],
SImode, addr, 0);
}
emit_insn (arm_gen_store_multiple_1 (nops, sorted_regs, mems, base_reg_rtx,
write_back ? offset + i * 4 : 0));
return true;
}
/* Copy a block of memory using plain ldr/str/ldrh/strh instructions, to permit
unaligned copies on processors which support unaligned semantics for those
instructions. INTERLEAVE_FACTOR can be used to attempt to hide load latency
(using more registers) by doing e.g. load/load/store/store for a factor of 2.
An interleave factor of 1 (the minimum) will perform no interleaving.
Load/store multiple are used for aligned addresses where possible. */
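/* For instance (an illustrative sketch, not the exact emitted rtl), with
INTERLEAVE_FACTOR == 2 and both source and destination unaligned, each
8-byte chunk is copied roughly as
ldr rA, [src]; ldr rB, [src, #4]; str rA, [dst]; str rB, [dst, #4]
via the unaligned load/store patterns, whereas an aligned side uses
ldmia/stmia instead.  */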
static void
arm_block_move_unaligned_straight (rtx dstbase, rtx srcbase,
HOST_WIDE_INT length,
unsigned int interleave_factor)
{
rtx *regs = XALLOCAVEC (rtx, interleave_factor);
int *regnos = XALLOCAVEC (int, interleave_factor);
HOST_WIDE_INT block_size_bytes = interleave_factor * UNITS_PER_WORD;
HOST_WIDE_INT i, j;
HOST_WIDE_INT remaining = length, words;
rtx halfword_tmp = NULL, byte_tmp = NULL;
rtx dst, src;
bool src_aligned = MEM_ALIGN (srcbase) >= BITS_PER_WORD;
bool dst_aligned = MEM_ALIGN (dstbase) >= BITS_PER_WORD;
HOST_WIDE_INT srcoffset, dstoffset;
HOST_WIDE_INT src_autoinc, dst_autoinc;
rtx mem, addr;
gcc_assert (1 <= interleave_factor && interleave_factor <= 4);
/* Use hard registers if we have aligned source or destination so we can use
load/store multiple with contiguous registers. */
if (dst_aligned || src_aligned)
for (i = 0; i < interleave_factor; i++)
regs[i] = gen_rtx_REG (SImode, i);
else
for (i = 0; i < interleave_factor; i++)
regs[i] = gen_reg_rtx (SImode);
dst = copy_addr_to_reg (XEXP (dstbase, 0));
src = copy_addr_to_reg (XEXP (srcbase, 0));
srcoffset = dstoffset = 0;
/* Calls to arm_gen_load_multiple and arm_gen_store_multiple update SRC/DST.
For copying the last bytes we want to subtract this offset again. */
src_autoinc = dst_autoinc = 0;
for (i = 0; i < interleave_factor; i++)
regnos[i] = i;
/* Copy BLOCK_SIZE_BYTES chunks. */
for (i = 0; i + block_size_bytes <= length; i += block_size_bytes)
{
/* Load words. */
if (src_aligned && interleave_factor > 1)
{
emit_insn (arm_gen_load_multiple (regnos, interleave_factor, src,
TRUE, srcbase, &srcoffset));
src_autoinc += UNITS_PER_WORD * interleave_factor;
}
else
{
for (j = 0; j < interleave_factor; j++)
{
addr = plus_constant (Pmode, src, (srcoffset + j * UNITS_PER_WORD
- src_autoinc));
mem = adjust_automodify_address (srcbase, SImode, addr,
srcoffset + j * UNITS_PER_WORD);
emit_insn (gen_unaligned_loadsi (regs[j], mem));
}
srcoffset += block_size_bytes;
}
/* Store words. */
if (dst_aligned && interleave_factor > 1)
{
emit_insn (arm_gen_store_multiple (regnos, interleave_factor, dst,
TRUE, dstbase, &dstoffset));
dst_autoinc += UNITS_PER_WORD * interleave_factor;
}
else
{
for (j = 0; j < interleave_factor; j++)
{
addr = plus_constant (Pmode, dst, (dstoffset + j * UNITS_PER_WORD
- dst_autoinc));
mem = adjust_automodify_address (dstbase, SImode, addr,
dstoffset + j * UNITS_PER_WORD);
emit_insn (gen_unaligned_storesi (mem, regs[j]));
}
dstoffset += block_size_bytes;
}
remaining -= block_size_bytes;
}
/* Copy any whole words left (note these aren't interleaved with any
subsequent halfword/byte load/stores in the interests of simplicity). */
words = remaining / UNITS_PER_WORD;
gcc_assert (words < interleave_factor);
if (src_aligned && words > 1)
{
emit_insn (arm_gen_load_multiple (regnos, words, src, TRUE, srcbase,
&srcoffset));
src_autoinc += UNITS_PER_WORD * words;
}
else
{
for (j = 0; j < words; j++)
{
addr = plus_constant (Pmode, src,
srcoffset + j * UNITS_PER_WORD - src_autoinc);
mem = adjust_automodify_address (srcbase, SImode, addr,
srcoffset + j * UNITS_PER_WORD);
emit_insn (gen_unaligned_loadsi (regs[j], mem));
}
srcoffset += words * UNITS_PER_WORD;
}
if (dst_aligned && words > 1)
{
emit_insn (arm_gen_store_multiple (regnos, words, dst, TRUE, dstbase,
&dstoffset));
dst_autoinc += words * UNITS_PER_WORD;
}
else
{
for (j = 0; j < words; j++)
{
addr = plus_constant (Pmode, dst,
dstoffset + j * UNITS_PER_WORD - dst_autoinc);
mem = adjust_automodify_address (dstbase, SImode, addr,
dstoffset + j * UNITS_PER_WORD);
emit_insn (gen_unaligned_storesi (mem, regs[j]));
}
dstoffset += words * UNITS_PER_WORD;
}
remaining -= words * UNITS_PER_WORD;
gcc_assert (remaining < 4);
/* Copy a halfword if necessary. */
if (remaining >= 2)
{
halfword_tmp = gen_reg_rtx (SImode);
addr = plus_constant (Pmode, src, srcoffset - src_autoinc);
mem = adjust_automodify_address (srcbase, HImode, addr, srcoffset);
emit_insn (gen_unaligned_loadhiu (halfword_tmp, mem));
/* Either write out immediately, or delay until we've loaded the last
byte, depending on interleave factor. */
if (interleave_factor == 1)
{
addr = plus_constant (Pmode, dst, dstoffset - dst_autoinc);
mem = adjust_automodify_address (dstbase, HImode, addr, dstoffset);
emit_insn (gen_unaligned_storehi (mem,
gen_lowpart (HImode, halfword_tmp)));
halfword_tmp = NULL;
dstoffset += 2;
}
remaining -= 2;
srcoffset += 2;
}
gcc_assert (remaining < 2);
/* Copy last byte. */
if ((remaining & 1) != 0)
{
byte_tmp = gen_reg_rtx (SImode);
addr = plus_constant (Pmode, src, srcoffset - src_autoinc);
mem = adjust_automodify_address (srcbase, QImode, addr, srcoffset);
emit_move_insn (gen_lowpart (QImode, byte_tmp), mem);
if (interleave_factor == 1)
{
addr = plus_constant (Pmode, dst, dstoffset - dst_autoinc);
mem = adjust_automodify_address (dstbase, QImode, addr, dstoffset);
emit_move_insn (mem, gen_lowpart (QImode, byte_tmp));
byte_tmp = NULL;
dstoffset++;
}
remaining--;
srcoffset++;
}
/* Store last halfword if we haven't done so already. */
if (halfword_tmp)
{
addr = plus_constant (Pmode, dst, dstoffset - dst_autoinc);
mem = adjust_automodify_address (dstbase, HImode, addr, dstoffset);
emit_insn (gen_unaligned_storehi (mem,
gen_lowpart (HImode, halfword_tmp)));
dstoffset += 2;
}
/* Likewise for last byte. */
if (byte_tmp)
{
addr = plus_constant (Pmode, dst, dstoffset - dst_autoinc);
mem = adjust_automodify_address (dstbase, QImode, addr, dstoffset);
emit_move_insn (mem, gen_lowpart (QImode, byte_tmp));
dstoffset++;
}
gcc_assert (remaining == 0 && srcoffset == dstoffset);
}
/* From mips_adjust_block_mem:
Helper function for doing a loop-based block operation on memory
reference MEM. Each iteration of the loop will operate on LENGTH
bytes of MEM.
Create a new base register for use within the loop and point it to
the start of MEM. Create a new memory reference that uses this
register. Store them in *LOOP_REG and *LOOP_MEM respectively. */
static void
arm_adjust_block_mem (rtx mem, HOST_WIDE_INT length, rtx *loop_reg,
rtx *loop_mem)
{
*loop_reg = copy_addr_to_reg (XEXP (mem, 0));
/* Although the new mem does not refer to a known location,
it does keep up to LENGTH bytes of alignment. */
*loop_mem = change_address (mem, BLKmode, *loop_reg);
set_mem_align (*loop_mem, MIN (MEM_ALIGN (mem), length * BITS_PER_UNIT));
}
/* From mips_block_move_loop:
Move LENGTH bytes from SRC to DEST using a loop that moves BYTES_PER_ITER
bytes at a time. LENGTH must be at least BYTES_PER_ITER. Assume that
the memory regions do not overlap. */
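/* In outline: the loop body below copies BYTES_PER_ITER bytes using
arm_block_move_unaligned_straight, advances the source and destination
pointers, and branches back until SRC_REG reaches FINAL_SRC; any
LENGTH % BYTES_PER_ITER remainder is then copied straight-line.  */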
static void
arm_block_move_unaligned_loop (rtx dest, rtx src, HOST_WIDE_INT length,
unsigned int interleave_factor,
HOST_WIDE_INT bytes_per_iter)
{
rtx src_reg, dest_reg, final_src, test;
HOST_WIDE_INT leftover;
leftover = length % bytes_per_iter;
length -= leftover;
/* Create registers and memory references for use within the loop. */
arm_adjust_block_mem (src, bytes_per_iter, &src_reg, &src);
arm_adjust_block_mem (dest, bytes_per_iter, &dest_reg, &dest);
/* Calculate the value that SRC_REG should have after the last iteration of
the loop. */
final_src = expand_simple_binop (Pmode, PLUS, src_reg, GEN_INT (length),
0, 0, OPTAB_WIDEN);
/* Emit the start of the loop. */
rtx_code_label *label = gen_label_rtx ();
emit_label (label);
/* Emit the loop body. */
arm_block_move_unaligned_straight (dest, src, bytes_per_iter,
interleave_factor);
/* Move on to the next block. */
emit_move_insn (src_reg, plus_constant (Pmode, src_reg, bytes_per_iter));
emit_move_insn (dest_reg, plus_constant (Pmode, dest_reg, bytes_per_iter));
/* Emit the loop condition. */
test = gen_rtx_NE (VOIDmode, src_reg, final_src);
emit_jump_insn (gen_cbranchsi4 (test, src_reg, final_src, label));
/* Mop up any left-over bytes. */
if (leftover)
arm_block_move_unaligned_straight (dest, src, leftover, interleave_factor);
}
/* Emit a block move when either the source or destination is unaligned (not
aligned to a four-byte boundary). This may need further tuning depending on
core type, optimize_size setting, etc. */
static int
arm_movmemqi_unaligned (rtx *operands)
{
HOST_WIDE_INT length = INTVAL (operands[2]);
if (optimize_size)
{
bool src_aligned = MEM_ALIGN (operands[1]) >= BITS_PER_WORD;
bool dst_aligned = MEM_ALIGN (operands[0]) >= BITS_PER_WORD;
/* Inlined memcpy using ldr/str/ldrh/strh can be quite big: try to limit
size of code if optimizing for size. We'll use ldm/stm if src_aligned
or dst_aligned though: allow more interleaving in those cases since the
resulting code can be smaller. */
unsigned int interleave_factor = (src_aligned || dst_aligned) ? 2 : 1;
HOST_WIDE_INT bytes_per_iter = (src_aligned || dst_aligned) ? 8 : 4;
if (length > 12)
arm_block_move_unaligned_loop (operands[0], operands[1], length,
interleave_factor, bytes_per_iter);
else
arm_block_move_unaligned_straight (operands[0], operands[1], length,
interleave_factor);
}
else
{
/* Note that the loop created by arm_block_move_unaligned_loop may be
subject to loop unrolling, which makes tuning this condition a little
redundant. */
if (length > 32)
arm_block_move_unaligned_loop (operands[0], operands[1], length, 4, 16);
else
arm_block_move_unaligned_straight (operands[0], operands[1], length, 4);
}
return 1;
}
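/* Expand a constant-length block copy using word loads and stores (and
ldm/stm where profitable).  OPERANDS[0] and OPERANDS[1] are the destination
and source memory references, OPERANDS[2] is the length in bytes and
OPERANDS[3] the known alignment.  Return 1 if the copy was expanded inline,
0 if the caller should fall back to another strategy.  */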
int
arm_gen_movmemqi (rtx *operands)
{
HOST_WIDE_INT in_words_to_go, out_words_to_go, last_bytes;
HOST_WIDE_INT srcoffset, dstoffset;
int i;
rtx src, dst, srcbase, dstbase;
rtx part_bytes_reg = NULL;
rtx mem;
if (!CONST_INT_P (operands[2])
|| !CONST_INT_P (operands[3])
|| INTVAL (operands[2]) > 64)
return 0;
if (unaligned_access && (INTVAL (operands[3]) & 3) != 0)
return arm_movmemqi_unaligned (operands);
if (INTVAL (operands[3]) & 3)
return 0;
dstbase = operands[0];
srcbase = operands[1];
dst = copy_to_mode_reg (SImode, XEXP (dstbase, 0));
src = copy_to_mode_reg (SImode, XEXP (srcbase, 0));
in_words_to_go = ARM_NUM_INTS (INTVAL (operands[2]));
out_words_to_go = INTVAL (operands[2]) / 4;
last_bytes = INTVAL (operands[2]) & 3;
dstoffset = srcoffset = 0;
if (out_words_to_go != in_words_to_go && ((in_words_to_go - 1) & 3) != 0)
part_bytes_reg = gen_rtx_REG (SImode, (in_words_to_go - 1) & 3);
for (i = 0; in_words_to_go >= 2; i+=4)
{
if (in_words_to_go > 4)
emit_insn (arm_gen_load_multiple (arm_regs_in_sequence, 4, src,
TRUE, srcbase, &srcoffset));
else
emit_insn (arm_gen_load_multiple (arm_regs_in_sequence, in_words_to_go,
src, FALSE, srcbase,
&srcoffset));
if (out_words_to_go)
{
if (out_words_to_go > 4)
emit_insn (arm_gen_store_multiple (arm_regs_in_sequence, 4, dst,
TRUE, dstbase, &dstoffset));
else if (out_words_to_go != 1)
emit_insn (arm_gen_store_multiple (arm_regs_in_sequence,
out_words_to_go, dst,
(last_bytes == 0
? FALSE : TRUE),
dstbase, &dstoffset));
else
{
mem = adjust_automodify_address (dstbase, SImode, dst, dstoffset);
emit_move_insn (mem, gen_rtx_REG (SImode, R0_REGNUM));
if (last_bytes != 0)
{
emit_insn (gen_addsi3 (dst, dst, GEN_INT (4)));
dstoffset += 4;
}
}
}
in_words_to_go -= in_words_to_go < 4 ? in_words_to_go : 4;
out_words_to_go -= out_words_to_go < 4 ? out_words_to_go : 4;
}
/* OUT_WORDS_TO_GO will be zero here if there are byte stores to do. */
if (out_words_to_go)
{
rtx sreg;
mem = adjust_automodify_address (srcbase, SImode, src, srcoffset);
sreg = copy_to_reg (mem);
mem = adjust_automodify_address (dstbase, SImode, dst, dstoffset);
emit_move_insn (mem, sreg);
in_words_to_go--;
gcc_assert (!in_words_to_go); /* Sanity check */
}
if (in_words_to_go)
{
gcc_assert (in_words_to_go > 0);
mem = adjust_automodify_address (srcbase, SImode, src, srcoffset);
part_bytes_reg = copy_to_mode_reg (SImode, mem);
}
gcc_assert (!last_bytes || part_bytes_reg);
if (BYTES_BIG_ENDIAN && last_bytes)
{
rtx tmp = gen_reg_rtx (SImode);
/* The bytes we want are in the top end of the word. */
emit_insn (gen_lshrsi3 (tmp, part_bytes_reg,
GEN_INT (8 * (4 - last_bytes))));
part_bytes_reg = tmp;
while (last_bytes)
{
mem = adjust_automodify_address (dstbase, QImode,
plus_constant (Pmode, dst,
last_bytes - 1),
dstoffset + last_bytes - 1);
emit_move_insn (mem, gen_lowpart (QImode, part_bytes_reg));
if (--last_bytes)
{
tmp = gen_reg_rtx (SImode);
emit_insn (gen_lshrsi3 (tmp, part_bytes_reg, GEN_INT (8)));
part_bytes_reg = tmp;
}
}
}
else
{
if (last_bytes > 1)
{
mem = adjust_automodify_address (dstbase, HImode, dst, dstoffset);
emit_move_insn (mem, gen_lowpart (HImode, part_bytes_reg));
last_bytes -= 2;
if (last_bytes)
{
rtx tmp = gen_reg_rtx (SImode);
emit_insn (gen_addsi3 (dst, dst, const2_rtx));
emit_insn (gen_lshrsi3 (tmp, part_bytes_reg, GEN_INT (16)));
part_bytes_reg = tmp;
dstoffset += 2;
}
}
if (last_bytes)
{
mem = adjust_automodify_address (dstbase, QImode, dst, dstoffset);
emit_move_insn (mem, gen_lowpart (QImode, part_bytes_reg));
}
}
return 1;
}
/* Helper for gen_movmem_ldrd_strd. Increase the address of memory rtx
by mode size. */
inline static rtx
next_consecutive_mem (rtx mem)
{
machine_mode mode = GET_MODE (mem);
HOST_WIDE_INT offset = GET_MODE_SIZE (mode);
rtx addr = plus_constant (Pmode, XEXP (mem, 0), offset);
return adjust_automodify_address (mem, mode, addr, offset);
}
/* Copy using LDRD/STRD instructions whenever possible.
Returns true upon success. */
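/* For example, a 14-byte copy with word-aligned source and destination is
expanded below as one DImode move (typically an ldrd/strd pair), one SImode
move (ldr/str) and one HImode move (ldrh/strh); a misaligned side instead
uses the unaligned load/store patterns when unaligned access is
permitted.  */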
bool
gen_movmem_ldrd_strd (rtx *operands)
{
unsigned HOST_WIDE_INT len;
HOST_WIDE_INT align;
rtx src, dst, base;
rtx reg0;
bool src_aligned, dst_aligned;
bool src_volatile, dst_volatile;
gcc_assert (CONST_INT_P (operands[2]));
gcc_assert (CONST_INT_P (operands[3]));
len = UINTVAL (operands[2]);
if (len > 64)
return false;
/* Maximum alignment we can assume for both src and dst buffers. */
align = INTVAL (operands[3]);
if ((!unaligned_access) && (len >= 4) && ((align & 3) != 0))
return false;
/* Place src and dst addresses in registers
and update the corresponding mem rtx. */
dst = operands[0];
dst_volatile = MEM_VOLATILE_P (dst);
dst_aligned = MEM_ALIGN (dst) >= BITS_PER_WORD;
base = copy_to_mode_reg (SImode, XEXP (dst, 0));
dst = adjust_automodify_address (dst, VOIDmode, base, 0);
src = operands[1];
src_volatile = MEM_VOLATILE_P (src);
src_aligned = MEM_ALIGN (src) >= BITS_PER_WORD;
base = copy_to_mode_reg (SImode, XEXP (src, 0));
src = adjust_automodify_address (src, VOIDmode, base, 0);
if (!unaligned_access && !(src_aligned && dst_aligned))
return false;
if (src_volatile || dst_volatile)
return false;
/* If we cannot generate any LDRD/STRD, try to generate LDM/STM. */
if (!(dst_aligned || src_aligned))
return arm_gen_movmemqi (operands);
src = adjust_address (src, DImode, 0);
dst = adjust_address (dst, DImode, 0);
while (len >= 8)
{
len -= 8;
reg0 = gen_reg_rtx (DImode);
if (src_aligned)
emit_move_insn (reg0, src);
else
emit_insn (gen_unaligned_loaddi (reg0, src));
if (dst_aligned)
emit_move_insn (dst, reg0);
else
emit_insn (gen_unaligned_storedi (dst, reg0));
src = next_consecutive_mem (src);
dst = next_consecutive_mem (dst);
}
gcc_assert (len < 8);
if (len >= 4)
{
/* At least one word but less than a double-word left to copy.  Copy a word.  */
reg0 = gen_reg_rtx (SImode);
src = adjust_address (src, SImode, 0);
dst = adjust_address (dst, SImode, 0);
if (src_aligned)
emit_move_insn (reg0, src);
else
emit_insn (gen_unaligned_loadsi (reg0, src));
if (dst_aligned)
emit_move_insn (dst, reg0);
else
emit_insn (gen_unaligned_storesi (dst, reg0));
src = next_consecutive_mem (src);
dst = next_consecutive_mem (dst);
len -= 4;
}
if (len == 0)
return true;
/* Copy the remaining bytes. */
if (len >= 2)
{
dst = adjust_address (dst, HImode, 0);
src = adjust_address (src, HImode, 0);
reg0 = gen_reg_rtx (SImode);
if (src_aligned)
emit_insn (gen_zero_extendhisi2 (reg0, src));
else
emit_insn (gen_unaligned_loadhiu (reg0, src));
if (dst_aligned)
emit_insn (gen_movhi (dst, gen_lowpart(HImode, reg0)));
else
emit_insn (gen_unaligned_storehi (dst, gen_lowpart (HImode, reg0)));
src = next_consecutive_mem (src);
dst = next_consecutive_mem (dst);
if (len == 2)
return true;
}
dst = adjust_address (dst, QImode, 0);
src = adjust_address (src, QImode, 0);
reg0 = gen_reg_rtx (QImode);
emit_move_insn (reg0, src);
emit_move_insn (dst, reg0);
return true;
}
/* Select a dominance comparison mode if possible for a test of the general
form (OP (COND_OR (X) (Y)) (const_int 0)). We support three forms.
COND_OR == DOM_CC_X_AND_Y => (X && Y)
COND_OR == DOM_CC_NX_OR_Y => ((! X) || Y)
COND_OR == DOM_CC_X_OR_Y => (X || Y)
In all cases OP will be either EQ or NE, but we don't need to know which
here. If we are unable to support a dominance comparison we return
CC mode. This will then fail to match for the RTL expressions that
generate this call. */
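/* For example (roughly), for X == (eq a b) and Y == (le c d) with
COND_OR == DOM_CC_X_OR_Y, COND1 is EQ and COND2 is LE; every value
satisfying EQ also satisfies LE, so the pair is accepted and the switch
below yields CC_DLEmode.  */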
machine_mode
arm_select_dominance_cc_mode (rtx x, rtx y, HOST_WIDE_INT cond_or)
{
enum rtx_code cond1, cond2;
int swapped = 0;
/* Currently we will probably get the wrong result if the individual
comparisons are not simple. This also ensures that it is safe to
reverse a comparison if necessary. */
if ((arm_select_cc_mode (cond1 = GET_CODE (x), XEXP (x, 0), XEXP (x, 1))
!= CCmode)
|| (arm_select_cc_mode (cond2 = GET_CODE (y), XEXP (y, 0), XEXP (y, 1))
!= CCmode))
return CCmode;
/* The if_then_else variant of this tests the second condition if the
first passes, but is true if the first fails. Reverse the first
condition to get a true "inclusive-or" expression. */
if (cond_or == DOM_CC_NX_OR_Y)
cond1 = reverse_condition (cond1);
/* If the comparisons are not equal, and one doesn't dominate the other,
then we can't do this. */
if (cond1 != cond2
&& !comparison_dominates_p (cond1, cond2)
&& (swapped = 1, !comparison_dominates_p (cond2, cond1)))
return CCmode;
if (swapped)
std::swap (cond1, cond2);
switch (cond1)
{
case EQ:
if (cond_or == DOM_CC_X_AND_Y)
return CC_DEQmode;
switch (cond2)
{
case EQ: return CC_DEQmode;
case LE: return CC_DLEmode;
case LEU: return CC_DLEUmode;
case GE: return CC_DGEmode;
case GEU: return CC_DGEUmode;
default: gcc_unreachable ();
}
case LT:
if (cond_or == DOM_CC_X_AND_Y)
return CC_DLTmode;
switch (cond2)
{
case LT:
return CC_DLTmode;
case LE:
return CC_DLEmode;
case NE:
return CC_DNEmode;
default:
gcc_unreachable ();
}
case GT:
if (cond_or == DOM_CC_X_AND_Y)
return CC_DGTmode;
switch (cond2)
{
case GT:
return CC_DGTmode;
case GE:
return CC_DGEmode;
case NE:
return CC_DNEmode;
default:
gcc_unreachable ();
}
case LTU:
if (cond_or == DOM_CC_X_AND_Y)
return CC_DLTUmode;
switch (cond2)
{