| /* Copyright (C) 2006-2015 Free Software Foundation, Inc. |
| |
| This file is free software; you can redistribute it and/or modify it under |
| the terms of the GNU General Public License as published by the Free |
| Software Foundation; either version 3 of the License, or (at your option) |
| any later version. |
| |
| This file is distributed in the hope that it will be useful, but WITHOUT |
| ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
| FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
| for more details. |
| |
| You should have received a copy of the GNU General Public License |
| along with GCC; see the file COPYING3. If not see |
| <http://www.gnu.org/licenses/>. */ |
| |
| #include "config.h" |
| #include "system.h" |
| #include "coretypes.h" |
| #include "tm.h" |
| #include "rtl.h" |
| #include "regs.h" |
| #include "hard-reg-set.h" |
| #include "insn-config.h" |
| #include "conditions.h" |
| #include "insn-attr.h" |
| #include "flags.h" |
| #include "recog.h" |
| #include "obstack.h" |
| #include "hash-set.h" |
| #include "machmode.h" |
| #include "vec.h" |
| #include "double-int.h" |
| #include "input.h" |
| #include "alias.h" |
| #include "symtab.h" |
| #include "wide-int.h" |
| #include "inchash.h" |
| #include "tree.h" |
| #include "fold-const.h" |
| #include "stringpool.h" |
| #include "stor-layout.h" |
| #include "calls.h" |
| #include "varasm.h" |
| #include "hashtab.h" |
| #include "function.h" |
| #include "statistics.h" |
| #include "real.h" |
| #include "fixed-value.h" |
| #include "expmed.h" |
| #include "dojump.h" |
| #include "explow.h" |
| #include "emit-rtl.h" |
| #include "stmt.h" |
| #include "expr.h" |
| #include "insn-codes.h" |
| #include "optabs.h" |
| #include "except.h" |
| #include "output.h" |
| #include "predict.h" |
| #include "dominance.h" |
| #include "cfg.h" |
| #include "cfgrtl.h" |
| #include "cfganal.h" |
| #include "lcm.h" |
| #include "cfgbuild.h" |
| #include "cfgcleanup.h" |
| #include "basic-block.h" |
| #include "diagnostic-core.h" |
| #include "ggc.h" |
| #include "tm_p.h" |
| #include "target.h" |
| #include "target-def.h" |
| #include "langhooks.h" |
| #include "reload.h" |
| #include "sched-int.h" |
| #include "params.h" |
| #include "hash-table.h" |
| #include "tree-ssa-alias.h" |
| #include "internal-fn.h" |
| #include "gimple-fold.h" |
| #include "tree-eh.h" |
| #include "gimple-expr.h" |
| #include "is-a.h" |
| #include "gimple.h" |
| #include "gimplify.h" |
| #include "tm-constrs.h" |
| #include "sbitmap.h" |
| #include "df.h" |
| #include "ddg.h" |
| #include "timevar.h" |
| #include "dumpfile.h" |
| #include "cfgloop.h" |
| #include "builtins.h" |
| #include "rtl-iter.h" |
| |
| /* Builtin types, data and prototypes. */ |
| |
| enum spu_builtin_type_index |
| { |
| SPU_BTI_END_OF_PARAMS, |
| |
| /* We create new type nodes for these. */ |
| SPU_BTI_V16QI, |
| SPU_BTI_V8HI, |
| SPU_BTI_V4SI, |
| SPU_BTI_V2DI, |
| SPU_BTI_V4SF, |
| SPU_BTI_V2DF, |
| SPU_BTI_UV16QI, |
| SPU_BTI_UV8HI, |
| SPU_BTI_UV4SI, |
| SPU_BTI_UV2DI, |
| |
| /* A 16-byte type. (Implemented with V16QI_type_node) */ |
| SPU_BTI_QUADWORD, |
| |
| /* These all correspond to intSI_type_node */ |
| SPU_BTI_7, |
| SPU_BTI_S7, |
| SPU_BTI_U7, |
| SPU_BTI_S10, |
| SPU_BTI_S10_4, |
| SPU_BTI_U14, |
| SPU_BTI_16, |
| SPU_BTI_S16, |
| SPU_BTI_S16_2, |
| SPU_BTI_U16, |
| SPU_BTI_U16_2, |
| SPU_BTI_U18, |
| |
| /* These correspond to the standard types */ |
| SPU_BTI_INTQI, |
| SPU_BTI_INTHI, |
| SPU_BTI_INTSI, |
| SPU_BTI_INTDI, |
| |
| SPU_BTI_UINTQI, |
| SPU_BTI_UINTHI, |
| SPU_BTI_UINTSI, |
| SPU_BTI_UINTDI, |
| |
| SPU_BTI_FLOAT, |
| SPU_BTI_DOUBLE, |
| |
| SPU_BTI_VOID, |
| SPU_BTI_PTR, |
| |
| SPU_BTI_MAX |
| }; |
| |
| #define V16QI_type_node (spu_builtin_types[SPU_BTI_V16QI]) |
| #define V8HI_type_node (spu_builtin_types[SPU_BTI_V8HI]) |
| #define V4SI_type_node (spu_builtin_types[SPU_BTI_V4SI]) |
| #define V2DI_type_node (spu_builtin_types[SPU_BTI_V2DI]) |
| #define V4SF_type_node (spu_builtin_types[SPU_BTI_V4SF]) |
| #define V2DF_type_node (spu_builtin_types[SPU_BTI_V2DF]) |
| #define unsigned_V16QI_type_node (spu_builtin_types[SPU_BTI_UV16QI]) |
| #define unsigned_V8HI_type_node (spu_builtin_types[SPU_BTI_UV8HI]) |
| #define unsigned_V4SI_type_node (spu_builtin_types[SPU_BTI_UV4SI]) |
| #define unsigned_V2DI_type_node (spu_builtin_types[SPU_BTI_UV2DI]) |
| |
| static GTY(()) tree spu_builtin_types[SPU_BTI_MAX]; |
| |
| struct spu_builtin_range |
| { |
| int low, high; |
| }; |
| |
| static struct spu_builtin_range spu_builtin_range[] = { |
| {-0x40ll, 0x7fll}, /* SPU_BTI_7 */ |
| {-0x40ll, 0x3fll}, /* SPU_BTI_S7 */ |
| {0ll, 0x7fll}, /* SPU_BTI_U7 */ |
| {-0x200ll, 0x1ffll}, /* SPU_BTI_S10 */ |
| {-0x2000ll, 0x1fffll}, /* SPU_BTI_S10_4 */ |
| {0ll, 0x3fffll}, /* SPU_BTI_U14 */ |
| {-0x8000ll, 0xffffll}, /* SPU_BTI_16 */ |
| {-0x8000ll, 0x7fffll}, /* SPU_BTI_S16 */ |
| {-0x20000ll, 0x1ffffll}, /* SPU_BTI_S16_2 */ |
| {0ll, 0xffffll}, /* SPU_BTI_U16 */ |
| {0ll, 0x3ffffll}, /* SPU_BTI_U16_2 */ |
| {0ll, 0x3ffffll}, /* SPU_BTI_U18 */ |
| }; |
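| /* Note: the unsuffixed widths accept the union of the signed and |
| unsigned ranges, e.g. SPU_BTI_7 spans -0x40..0x7f, which is exactly |
| SPU_BTI_S7 and SPU_BTI_U7 combined; likewise SPU_BTI_16 relative to |
| S16/U16. */ |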
| |
| |
| /* Target specific attribute specifications. */ |
| char regs_ever_allocated[FIRST_PSEUDO_REGISTER]; |
| |
| /* Prototypes and external defs. */ |
| static int get_pipe (rtx_insn *insn); |
| static int spu_naked_function_p (tree func); |
| static int mem_is_padded_component_ref (rtx x); |
| static void fix_range (const char *); |
| static rtx spu_expand_load (rtx, rtx, rtx, int); |
| |
| /* Which instruction set architecture to use. */ |
| int spu_arch; |
| /* Which cpu are we tuning for. */ |
| int spu_tune; |
| |
| /* The hardware requires 8 insns between a hint and the branch it |
| affects. This variable describes how many rtl instructions the |
| compiler needs to see before inserting a hint, and then the compiler |
| will insert enough nops to make it at least 8 insns. The default is |
| for the compiler to allow up to 2 nops to be emitted. The nops are |
| inserted in pairs, so we round down. */ |
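| /* 8 insns x 4 bytes = 32 bytes; allowing the default 2 nops (8 bytes) |
| leaves a minimum hint distance of 24 bytes. */ |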
| int spu_hint_dist = (8*4) - (2*4); |
| |
| enum spu_immediate { |
| SPU_NONE, |
| SPU_IL, |
| SPU_ILA, |
| SPU_ILH, |
| SPU_ILHU, |
| SPU_ORI, |
| SPU_ORHI, |
| SPU_ORBI, |
| SPU_IOHL |
| }; |
| enum immediate_class |
| { |
| IC_POOL, /* constant pool */ |
| IC_IL1, /* one il* instruction */ |
| IC_IL2, /* both ilhu and iohl instructions */ |
| IC_IL1s, /* one il* instruction */ |
| IC_IL2s, /* both ilhu and iohl instructions */ |
| IC_FSMBI, /* the fsmbi instruction */ |
| IC_CPAT, /* one of the c*d instructions */ |
| IC_FSMBI2 /* fsmbi plus 1 other instruction */ |
| }; |
| |
| static enum spu_immediate which_immediate_load (HOST_WIDE_INT val); |
| static enum spu_immediate which_logical_immediate (HOST_WIDE_INT val); |
| static int cpat_info(unsigned char *arr, int size, int *prun, int *pstart); |
| static enum immediate_class classify_immediate (rtx op, |
| machine_mode mode); |
| |
| /* Pointer mode for __ea references. */ |
| #define EAmode (spu_ea_model != 32 ? DImode : SImode) |
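| /* spu_ea_model reflects the -mea32/-mea64 option, so __ea pointers |
| are SImode under the 32-bit model and DImode under the 64-bit one. */ |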
| |
| |
| /* Define the structure for the machine field in struct function. */ |
| struct GTY(()) machine_function |
| { |
| /* Register to use for PIC accesses. */ |
| rtx pic_reg; |
| }; |
| |
| /* How to allocate a 'struct machine_function'. */ |
| static struct machine_function * |
| spu_init_machine_status (void) |
| { |
| return ggc_cleared_alloc<machine_function> (); |
| } |
| |
| /* Implement TARGET_OPTION_OVERRIDE. */ |
| static void |
| spu_option_override (void) |
| { |
| /* Set up function hooks. */ |
| init_machine_status = spu_init_machine_status; |
| |
| /* Small loops will be completely peeled at -O3. For SPU it is more |
| important to keep code small by default. */ |
| if (!flag_unroll_loops && !flag_peel_loops) |
| maybe_set_param_value (PARAM_MAX_COMPLETELY_PEEL_TIMES, 4, |
| global_options.x_param_values, |
| global_options_set.x_param_values); |
| |
| flag_omit_frame_pointer = 1; |
| |
| /* Functions must be 8-byte aligned so we correctly handle dual issue. */ |
| if (align_functions < 8) |
| align_functions = 8; |
| |
| spu_hint_dist = 8*4 - spu_max_nops*4; |
| if (spu_hint_dist < 0) |
| spu_hint_dist = 0; |
| |
| if (spu_fixed_range_string) |
| fix_range (spu_fixed_range_string); |
| |
| /* Determine processor architectural level. */ |
| if (spu_arch_string) |
| { |
| if (strcmp (&spu_arch_string[0], "cell") == 0) |
| spu_arch = PROCESSOR_CELL; |
| else if (strcmp (&spu_arch_string[0], "celledp") == 0) |
| spu_arch = PROCESSOR_CELLEDP; |
| else |
| error ("bad value (%s) for -march= switch", spu_arch_string); |
| } |
| |
| /* Determine processor to tune for. */ |
| if (spu_tune_string) |
| { |
| if (strcmp (&spu_tune_string[0], "cell") == 0) |
| spu_tune = PROCESSOR_CELL; |
| else if (strcmp (&spu_tune_string[0], "celledp") == 0) |
| spu_tune = PROCESSOR_CELLEDP; |
| else |
| error ("bad value (%s) for -mtune= switch", spu_tune_string); |
| } |
| |
| /* Change defaults according to the processor architecture. */ |
| if (spu_arch == PROCESSOR_CELLEDP) |
| { |
| /* If no command line option has been otherwise specified, change |
| the default to -mno-safe-hints on celledp -- only the original |
| Cell/B.E. processors require this workaround. */ |
| if (!(target_flags_explicit & MASK_SAFE_HINTS)) |
| target_flags &= ~MASK_SAFE_HINTS; |
| } |
| |
| REAL_MODE_FORMAT (SFmode) = &spu_single_format; |
| } |
| |
| /* True if MODE is valid for the target. By "valid", we mean able to |
| be manipulated in non-trivial ways. In particular, this means all |
| the arithmetic is supported. */ |
| static bool |
| spu_scalar_mode_supported_p (machine_mode mode) |
| { |
| switch (mode) |
| { |
| case QImode: |
| case HImode: |
| case SImode: |
| case SFmode: |
| case DImode: |
| case TImode: |
| case DFmode: |
| return true; |
| |
| default: |
| return false; |
| } |
| } |
| |
| /* Similarly for vector modes. "Supported" here is less strict: at |
| least some operations are supported, and callers need to check the |
| optabs or builtins for further details. */ |
| static bool |
| spu_vector_mode_supported_p (machine_mode mode) |
| { |
| switch (mode) |
| { |
| case V16QImode: |
| case V8HImode: |
| case V4SImode: |
| case V2DImode: |
| case V4SFmode: |
| case V2DFmode: |
| return true; |
| |
| default: |
| return false; |
| } |
| } |
| |
| /* GCC assumes that in a paradoxical SUBREG the inner mode occupies the |
| least significant bytes of the outer mode. This function returns |
| TRUE for the SUBREGs where this is correct. */ |
| int |
| valid_subreg (rtx op) |
| { |
| machine_mode om = GET_MODE (op); |
| machine_mode im = GET_MODE (SUBREG_REG (op)); |
| return om != VOIDmode && im != VOIDmode |
| && (GET_MODE_SIZE (im) == GET_MODE_SIZE (om) |
| || (GET_MODE_SIZE (im) <= 4 && GET_MODE_SIZE (om) <= 4) |
| || (GET_MODE_SIZE (im) >= 16 && GET_MODE_SIZE (om) >= 16)); |
| } |
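| /* E.g. (subreg:SI (reg:QI)) and (subreg:HI (reg:SI)) are accepted |
| since both modes fit in one 32-bit slot, while (subreg:DI (reg:SI)) |
| is rejected because the sizes straddle the 4-byte boundary. */ |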
| |
| /* When insv and ext[sz]v are passed a TI SUBREG, we want to strip it |
| off and adjust the start offset. */ |
| static rtx |
| adjust_operand (rtx op, HOST_WIDE_INT * start) |
| { |
| machine_mode mode; |
| int op_size; |
| /* Strip any paradoxical SUBREG. */ |
| if (GET_CODE (op) == SUBREG |
| && (GET_MODE_BITSIZE (GET_MODE (op)) |
| > GET_MODE_BITSIZE (GET_MODE (SUBREG_REG (op))))) |
| { |
| if (start) |
| *start -= |
| GET_MODE_BITSIZE (GET_MODE (op)) - |
| GET_MODE_BITSIZE (GET_MODE (SUBREG_REG (op))); |
| op = SUBREG_REG (op); |
| } |
| /* If it is smaller than SI, widen to SImode via the SUBREG added below. */ |
| op_size = GET_MODE_BITSIZE (GET_MODE (op)); |
| if (op_size < 32) |
| { |
| if (start) |
| *start += 32 - op_size; |
| op_size = 32; |
| } |
| /* If it is not a MODE_INT (and/or it is smaller than SI) add a SUBREG. */ |
| mode = mode_for_size (op_size, MODE_INT, 0); |
| if (mode != GET_MODE (op)) |
| op = gen_rtx_SUBREG (mode, op, 0); |
| return op; |
| } |
| |
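| /* Expand a bit-field extraction. ops[0] is the TImode destination, |
| ops[1] the source (a MEM, an integer SUBREG, or a TImode register), |
| ops[2] the field width in bits, and ops[3] the start bit. */ |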
| void |
| spu_expand_extv (rtx ops[], int unsignedp) |
| { |
| rtx dst = ops[0], src = ops[1]; |
| HOST_WIDE_INT width = INTVAL (ops[2]); |
| HOST_WIDE_INT start = INTVAL (ops[3]); |
| HOST_WIDE_INT align_mask; |
| rtx s0, s1, mask, r0; |
| |
| gcc_assert (REG_P (dst) && GET_MODE (dst) == TImode); |
| |
| if (MEM_P (src)) |
| { |
| /* First, determine if we need 1 TImode load or 2. We need only 1 |
| if the bits being extracted do not cross the alignment boundary |
| as determined by the MEM and its address. */ |
| |
| align_mask = -MEM_ALIGN (src); |
| if ((start & align_mask) == ((start + width - 1) & align_mask)) |
| { |
| /* Alignment is sufficient for 1 load. */ |
| s0 = gen_reg_rtx (TImode); |
| r0 = spu_expand_load (s0, 0, src, start / 8); |
| start &= 7; |
| if (r0) |
| emit_insn (gen_rotqby_ti (s0, s0, r0)); |
| } |
| else |
| { |
| /* Need 2 loads. */ |
| s0 = gen_reg_rtx (TImode); |
| s1 = gen_reg_rtx (TImode); |
| r0 = spu_expand_load (s0, s1, src, start / 8); |
| start &= 7; |
| |
| gcc_assert (start + width <= 128); |
| if (r0) |
| { |
| rtx r1 = gen_reg_rtx (SImode); |
| mask = gen_reg_rtx (TImode); |
| emit_move_insn (mask, GEN_INT (-1)); |
| emit_insn (gen_rotqby_ti (s0, s0, r0)); |
| emit_insn (gen_rotqby_ti (s1, s1, r0)); |
| if (GET_CODE (r0) == CONST_INT) |
| r1 = GEN_INT (INTVAL (r0) & 15); |
| else |
| emit_insn (gen_andsi3 (r1, r0, GEN_INT (15))); |
| emit_insn (gen_shlqby_ti (mask, mask, r1)); |
| emit_insn (gen_selb (s0, s1, s0, mask)); |
| } |
| } |
| |
| } |
| else if (GET_CODE (src) == SUBREG) |
| { |
| rtx r = SUBREG_REG (src); |
| gcc_assert (REG_P (r) && SCALAR_INT_MODE_P (GET_MODE (r))); |
| s0 = gen_reg_rtx (TImode); |
| if (GET_MODE_SIZE (GET_MODE (r)) < GET_MODE_SIZE (TImode)) |
| emit_insn (gen_rtx_SET (VOIDmode, s0, gen_rtx_ZERO_EXTEND (TImode, r))); |
| else |
| emit_move_insn (s0, src); |
| } |
| else |
| { |
| gcc_assert (REG_P (src) && GET_MODE (src) == TImode); |
| s0 = gen_reg_rtx (TImode); |
| emit_move_insn (s0, src); |
| } |
| |
| /* Now s0 is TImode and contains the bits to extract at start. */ |
| |
| if (start) |
| emit_insn (gen_rotlti3 (s0, s0, GEN_INT (start))); |
| |
| if (128 - width) |
| s0 = expand_shift (RSHIFT_EXPR, TImode, s0, 128 - width, s0, unsignedp); |
| |
| emit_move_insn (dst, s0); |
| } |
| |
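| /* Expand a bit-field insertion. ops[0] is the destination, ops[1] |
| the field width in bits, ops[2] the start bit, and ops[3] the value |
| to insert. */ |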
| void |
| spu_expand_insv (rtx ops[]) |
| { |
| HOST_WIDE_INT width = INTVAL (ops[1]); |
| HOST_WIDE_INT start = INTVAL (ops[2]); |
| HOST_WIDE_INT maskbits; |
| machine_mode dst_mode; |
| rtx dst = ops[0], src = ops[3]; |
| int dst_size; |
| rtx mask; |
| rtx shift_reg; |
| int shift; |
| |
| |
| if (GET_CODE (ops[0]) == MEM) |
| dst = gen_reg_rtx (TImode); |
| else |
| dst = adjust_operand (dst, &start); |
| dst_mode = GET_MODE (dst); |
| dst_size = GET_MODE_BITSIZE (GET_MODE (dst)); |
| |
| if (CONSTANT_P (src)) |
| { |
| machine_mode m = |
| (width <= 32 ? SImode : width <= 64 ? DImode : TImode); |
| src = force_reg (m, convert_to_mode (m, src, 0)); |
| } |
| src = adjust_operand (src, 0); |
| |
| mask = gen_reg_rtx (dst_mode); |
| shift_reg = gen_reg_rtx (dst_mode); |
| shift = dst_size - start - width; |
| |
| /* It's not safe to use subreg here because the compiler assumes |
| that the SUBREG_REG is right justified in the SUBREG. */ |
| convert_move (shift_reg, src, 1); |
| |
| if (shift > 0) |
| { |
| switch (dst_mode) |
| { |
| case SImode: |
| emit_insn (gen_ashlsi3 (shift_reg, shift_reg, GEN_INT (shift))); |
| break; |
| case DImode: |
| emit_insn (gen_ashldi3 (shift_reg, shift_reg, GEN_INT (shift))); |
| break; |
| case TImode: |
| emit_insn (gen_ashlti3 (shift_reg, shift_reg, GEN_INT (shift))); |
| break; |
| default: |
| abort (); |
| } |
| } |
| else if (shift < 0) |
| abort (); |
| |
| switch (dst_size) |
| { |
| case 32: |
| maskbits = (-1ll << (32 - width - start)); |
| if (start) |
| maskbits += (1ll << (32 - start)); |
| emit_move_insn (mask, GEN_INT (maskbits)); |
| break; |
| case 64: |
| maskbits = (-1ll << (64 - width - start)); |
| if (start) |
| maskbits += (1ll << (64 - start)); |
| emit_move_insn (mask, GEN_INT (maskbits)); |
| break; |
| case 128: |
| { |
| unsigned char arr[16]; |
| int i = start / 8; |
| memset (arr, 0, sizeof (arr)); |
| arr[i] = 0xff >> (start & 7); |
| for (i++; i <= (start + width - 1) / 8; i++) |
| arr[i] = 0xff; |
| arr[i - 1] &= 0xff << (7 - ((start + width - 1) & 7)); |
| emit_move_insn (mask, array_to_constant (TImode, arr)); |
| } |
| break; |
| default: |
| abort (); |
| } |
| if (GET_CODE (ops[0]) == MEM) |
| { |
| rtx low = gen_reg_rtx (SImode); |
| rtx rotl = gen_reg_rtx (SImode); |
| rtx mask0 = gen_reg_rtx (TImode); |
| rtx addr; |
| rtx addr0; |
| rtx addr1; |
| rtx mem; |
| |
| addr = force_reg (Pmode, XEXP (ops[0], 0)); |
| addr0 = gen_rtx_AND (Pmode, addr, GEN_INT (-16)); |
| emit_insn (gen_andsi3 (low, addr, GEN_INT (15))); |
| emit_insn (gen_negsi2 (rotl, low)); |
| emit_insn (gen_rotqby_ti (shift_reg, shift_reg, rotl)); |
| emit_insn (gen_rotqmby_ti (mask0, mask, rotl)); |
| mem = change_address (ops[0], TImode, addr0); |
| set_mem_alias_set (mem, 0); |
| emit_move_insn (dst, mem); |
| emit_insn (gen_selb (dst, dst, shift_reg, mask0)); |
| if (start + width > MEM_ALIGN (ops[0])) |
| { |
| rtx shl = gen_reg_rtx (SImode); |
| rtx mask1 = gen_reg_rtx (TImode); |
| rtx dst1 = gen_reg_rtx (TImode); |
| rtx mem1; |
| addr1 = plus_constant (Pmode, addr, 16); |
| addr1 = gen_rtx_AND (Pmode, addr1, GEN_INT (-16)); |
| emit_insn (gen_subsi3 (shl, GEN_INT (16), low)); |
| emit_insn (gen_shlqby_ti (mask1, mask, shl)); |
| mem1 = change_address (ops[0], TImode, addr1); |
| set_mem_alias_set (mem1, 0); |
| emit_move_insn (dst1, mem1); |
| emit_insn (gen_selb (dst1, dst1, shift_reg, mask1)); |
| emit_move_insn (mem1, dst1); |
| } |
| emit_move_insn (mem, dst); |
| } |
| else |
| emit_insn (gen_selb (dst, copy_rtx (dst), shift_reg, mask)); |
| } |
| |
| |
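| /* Expand a block move. ops[0] and ops[1] are the destination and |
| source MEMs, ops[2] the byte count, and ops[3] the alignment. |
| Return 1 if the move was expanded inline, 0 to let the generic |
| movmem machinery handle it. */ |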
| int |
| spu_expand_block_move (rtx ops[]) |
| { |
| HOST_WIDE_INT bytes, align, offset; |
| rtx src, dst, sreg, dreg, target; |
| int i; |
| if (GET_CODE (ops[2]) != CONST_INT |
| || GET_CODE (ops[3]) != CONST_INT |
| || INTVAL (ops[2]) > (HOST_WIDE_INT) (MOVE_RATIO (optimize_insn_for_speed_p ()) * 8)) |
| return 0; |
| |
| bytes = INTVAL (ops[2]); |
| align = INTVAL (ops[3]); |
| |
| if (bytes <= 0) |
| return 1; |
| |
| dst = ops[0]; |
| src = ops[1]; |
| |
| if (align == 16) |
| { |
| for (offset = 0; offset + 16 <= bytes; offset += 16) |
| { |
| dst = adjust_address (ops[0], V16QImode, offset); |
| src = adjust_address (ops[1], V16QImode, offset); |
| emit_move_insn (dst, src); |
| } |
| if (offset < bytes) |
| { |
| rtx mask; |
| unsigned char arr[16] = { 0 }; |
| for (i = 0; i < bytes - offset; i++) |
| arr[i] = 0xff; |
| dst = adjust_address (ops[0], V16QImode, offset); |
| src = adjust_address (ops[1], V16QImode, offset); |
| mask = gen_reg_rtx (V16QImode); |
| sreg = gen_reg_rtx (V16QImode); |
| dreg = gen_reg_rtx (V16QImode); |
| target = gen_reg_rtx (V16QImode); |
| emit_move_insn (mask, array_to_constant (V16QImode, arr)); |
| emit_move_insn (dreg, dst); |
| emit_move_insn (sreg, src); |
| emit_insn (gen_selb (target, dreg, sreg, mask)); |
| emit_move_insn (dst, target); |
| } |
| return 1; |
| } |
| return 0; |
| } |
| |
| enum spu_comp_code |
| { SPU_EQ, SPU_GT, SPU_GTU }; |
| |
| int spu_comp_icode[12][3] = { |
| {CODE_FOR_ceq_qi, CODE_FOR_cgt_qi, CODE_FOR_clgt_qi}, |
| {CODE_FOR_ceq_hi, CODE_FOR_cgt_hi, CODE_FOR_clgt_hi}, |
| {CODE_FOR_ceq_si, CODE_FOR_cgt_si, CODE_FOR_clgt_si}, |
| {CODE_FOR_ceq_di, CODE_FOR_cgt_di, CODE_FOR_clgt_di}, |
| {CODE_FOR_ceq_ti, CODE_FOR_cgt_ti, CODE_FOR_clgt_ti}, |
| {CODE_FOR_ceq_sf, CODE_FOR_cgt_sf, 0}, |
| {CODE_FOR_ceq_df, CODE_FOR_cgt_df, 0}, |
| {CODE_FOR_ceq_v16qi, CODE_FOR_cgt_v16qi, CODE_FOR_clgt_v16qi}, |
| {CODE_FOR_ceq_v8hi, CODE_FOR_cgt_v8hi, CODE_FOR_clgt_v8hi}, |
| {CODE_FOR_ceq_v4si, CODE_FOR_cgt_v4si, CODE_FOR_clgt_v4si}, |
| {CODE_FOR_ceq_v4sf, CODE_FOR_cgt_v4sf, 0}, |
| {CODE_FOR_ceq_v2df, CODE_FOR_cgt_v2df, 0}, |
| }; |
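| /* Rows are indexed by operand mode in the order assigned by the |
| switch in spu_emit_branch_or_set (QI, HI, SI, DI, TI, SF, DF, V16QI, |
| V8HI, V4SI, V4SF, V2DF); columns by spu_comp_code. A 0 entry means |
| there is no pattern, e.g. no unsigned compare for the float modes. */ |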
| |
| /* Emit a compare for CODE, followed by a branch (is_set == 0), an |
| integer set of the result (is_set == 1), or a select between two |
| values (is_set == 2). GCC can figure this out too if we don't |
| provide all variations of compares, but since GCC always wants to |
| use WORD_MODE, we can generate better code in most cases if we do |
| it ourselves. */ |
| void |
| spu_emit_branch_or_set (int is_set, rtx cmp, rtx operands[]) |
| { |
| int reverse_compare = 0; |
| int reverse_test = 0; |
| rtx compare_result, eq_result; |
| rtx comp_rtx, eq_rtx; |
| machine_mode comp_mode; |
| machine_mode op_mode; |
| enum spu_comp_code scode, eq_code; |
| enum insn_code ior_code; |
| enum rtx_code code = GET_CODE (cmp); |
| rtx op0 = XEXP (cmp, 0); |
| rtx op1 = XEXP (cmp, 1); |
| int index; |
| int eq_test = 0; |
| |
| /* When op1 is a CONST_INT change (X >= C) to (X > C-1), |
| and so on, to keep the constant in operand 1. */ |
| if (GET_CODE (op1) == CONST_INT) |
| { |
| HOST_WIDE_INT val = INTVAL (op1) - 1; |
| if (trunc_int_for_mode (val, GET_MODE (op0)) == val) |
| switch (code) |
| { |
| case GE: |
| op1 = GEN_INT (val); |
| code = GT; |
| break; |
| case LT: |
| op1 = GEN_INT (val); |
| code = LE; |
| break; |
| case GEU: |
| op1 = GEN_INT (val); |
| code = GTU; |
| break; |
| case LTU: |
| op1 = GEN_INT (val); |
| code = LEU; |
| break; |
| default: |
| break; |
| } |
| } |
| |
| /* However, if we generate an integer result, performing a reverse test |
| would require an extra negation, so avoid that where possible. */ |
| if (GET_CODE (op1) == CONST_INT && is_set == 1) |
| { |
| HOST_WIDE_INT val = INTVAL (op1) + 1; |
| if (trunc_int_for_mode (val, GET_MODE (op0)) == val) |
| switch (code) |
| { |
| case LE: |
| op1 = GEN_INT (val); |
| code = LT; |
| break; |
| case LEU: |
| op1 = GEN_INT (val); |
| code = LTU; |
| break; |
| default: |
| break; |
| } |
| } |
| |
| comp_mode = SImode; |
| op_mode = GET_MODE (op0); |
| |
| switch (code) |
| { |
| case GE: |
| scode = SPU_GT; |
| if (HONOR_NANS (op_mode)) |
| { |
| reverse_compare = 0; |
| reverse_test = 0; |
| eq_test = 1; |
| eq_code = SPU_EQ; |
| } |
| else |
| { |
| reverse_compare = 1; |
| reverse_test = 1; |
| } |
| break; |
| case LE: |
| scode = SPU_GT; |
| if (HONOR_NANS (op_mode)) |
| { |
| reverse_compare = 1; |
| reverse_test = 0; |
| eq_test = 1; |
| eq_code = SPU_EQ; |
| } |
| else |
| { |
| reverse_compare = 0; |
| reverse_test = 1; |
| } |
| break; |
| case LT: |
| reverse_compare = 1; |
| reverse_test = 0; |
| scode = SPU_GT; |
| break; |
| case GEU: |
| reverse_compare = 1; |
| reverse_test = 1; |
| scode = SPU_GTU; |
| break; |
| case LEU: |
| reverse_compare = 0; |
| reverse_test = 1; |
| scode = SPU_GTU; |
| break; |
| case LTU: |
| reverse_compare = 1; |
| reverse_test = 0; |
| scode = SPU_GTU; |
| break; |
| case NE: |
| reverse_compare = 0; |
| reverse_test = 1; |
| scode = SPU_EQ; |
| break; |
| |
| case EQ: |
| scode = SPU_EQ; |
| break; |
| case GT: |
| scode = SPU_GT; |
| break; |
| case GTU: |
| scode = SPU_GTU; |
| break; |
| default: |
| scode = SPU_EQ; |
| break; |
| } |
| |
| switch (op_mode) |
| { |
| case QImode: |
| index = 0; |
| comp_mode = QImode; |
| break; |
| case HImode: |
| index = 1; |
| comp_mode = HImode; |
| break; |
| case SImode: |
| index = 2; |
| break; |
| case DImode: |
| index = 3; |
| break; |
| case TImode: |
| index = 4; |
| break; |
| case SFmode: |
| index = 5; |
| break; |
| case DFmode: |
| index = 6; |
| break; |
| case V16QImode: |
| index = 7; |
| comp_mode = op_mode; |
| break; |
| case V8HImode: |
| index = 8; |
| comp_mode = op_mode; |
| break; |
| case V4SImode: |
| index = 9; |
| comp_mode = op_mode; |
| break; |
| case V4SFmode: |
| index = 10; |
| comp_mode = V4SImode; |
| break; |
| case V2DFmode: |
| index = 11; |
| comp_mode = V2DImode; |
| break; |
| case V2DImode: |
| default: |
| abort (); |
| } |
| |
| if (GET_MODE (op1) == DFmode |
| && (scode != SPU_GT && scode != SPU_EQ)) |
| abort (); |
| |
| if (is_set == 0 && op1 == const0_rtx |
| && (GET_MODE (op0) == SImode |
| || GET_MODE (op0) == HImode |
| || GET_MODE (op0) == QImode) && scode == SPU_EQ) |
| { |
| /* Don't need to set a register with the result when we are |
| comparing against zero and branching. */ |
| reverse_test = !reverse_test; |
| compare_result = op0; |
| } |
| else |
| { |
| compare_result = gen_reg_rtx (comp_mode); |
| |
| if (reverse_compare) |
| { |
| rtx t = op1; |
| op1 = op0; |
| op0 = t; |
| } |
| |
| if (spu_comp_icode[index][scode] == 0) |
| abort (); |
| |
| if (!(*insn_data[spu_comp_icode[index][scode]].operand[1].predicate) |
| (op0, op_mode)) |
| op0 = force_reg (op_mode, op0); |
| if (!(*insn_data[spu_comp_icode[index][scode]].operand[2].predicate) |
| (op1, op_mode)) |
| op1 = force_reg (op_mode, op1); |
| comp_rtx = GEN_FCN (spu_comp_icode[index][scode]) (compare_result, |
| op0, op1); |
| if (comp_rtx == 0) |
| abort (); |
| emit_insn (comp_rtx); |
| |
| if (eq_test) |
| { |
| eq_result = gen_reg_rtx (comp_mode); |
| eq_rtx = GEN_FCN (spu_comp_icode[index][eq_code]) (eq_result, |
| op0, op1); |
| if (eq_rtx == 0) |
| abort (); |
| emit_insn (eq_rtx); |
| ior_code = optab_handler (ior_optab, comp_mode); |
| gcc_assert (ior_code != CODE_FOR_nothing); |
| emit_insn (GEN_FCN (ior_code) |
| (compare_result, compare_result, eq_result)); |
| } |
| } |
| |
| if (is_set == 0) |
| { |
| rtx bcomp; |
| rtx loc_ref; |
| |
| /* We don't have branch on QI compare insns, so we convert the |
| QI compare result to a HI result. */ |
| if (comp_mode == QImode) |
| { |
| rtx old_res = compare_result; |
| compare_result = gen_reg_rtx (HImode); |
| comp_mode = HImode; |
| emit_insn (gen_extendqihi2 (compare_result, old_res)); |
| } |
| |
| if (reverse_test) |
| bcomp = gen_rtx_EQ (comp_mode, compare_result, const0_rtx); |
| else |
| bcomp = gen_rtx_NE (comp_mode, compare_result, const0_rtx); |
| |
| loc_ref = gen_rtx_LABEL_REF (VOIDmode, operands[3]); |
| emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, |
| gen_rtx_IF_THEN_ELSE (VOIDmode, bcomp, |
| loc_ref, pc_rtx))); |
| } |
| else if (is_set == 2) |
| { |
| rtx target = operands[0]; |
| int compare_size = GET_MODE_BITSIZE (comp_mode); |
| int target_size = GET_MODE_BITSIZE (GET_MODE (target)); |
| machine_mode mode = mode_for_size (target_size, MODE_INT, 0); |
| rtx select_mask; |
| rtx op_t = operands[2]; |
| rtx op_f = operands[3]; |
| |
| /* The result of the comparison can be SI, HI or QI mode. Create a |
| mask based on that result. */ |
| if (target_size > compare_size) |
| { |
| select_mask = gen_reg_rtx (mode); |
| emit_insn (gen_extend_compare (select_mask, compare_result)); |
| } |
| else if (target_size < compare_size) |
| select_mask = |
| gen_rtx_SUBREG (mode, compare_result, |
| (compare_size - target_size) / BITS_PER_UNIT); |
| else if (comp_mode != mode) |
| select_mask = gen_rtx_SUBREG (mode, compare_result, 0); |
| else |
| select_mask = compare_result; |
| |
| if (GET_MODE (target) != GET_MODE (op_t) |
| || GET_MODE (target) != GET_MODE (op_f)) |
| abort (); |
| |
| if (reverse_test) |
| emit_insn (gen_selb (target, op_t, op_f, select_mask)); |
| else |
| emit_insn (gen_selb (target, op_f, op_t, select_mask)); |
| } |
| else |
| { |
| rtx target = operands[0]; |
| if (reverse_test) |
| emit_insn (gen_rtx_SET (VOIDmode, compare_result, |
| gen_rtx_NOT (comp_mode, compare_result))); |
| if (GET_MODE (target) == SImode && GET_MODE (compare_result) == HImode) |
| emit_insn (gen_extendhisi2 (target, compare_result)); |
| else if (GET_MODE (target) == SImode |
| && GET_MODE (compare_result) == QImode) |
| emit_insn (gen_extend_compare (target, compare_result)); |
| else |
| emit_move_insn (target, compare_result); |
| } |
| } |
| |
| HOST_WIDE_INT |
| const_double_to_hwint (rtx x) |
| { |
| HOST_WIDE_INT val; |
| REAL_VALUE_TYPE rv; |
| if (GET_MODE (x) == SFmode) |
| { |
| REAL_VALUE_FROM_CONST_DOUBLE (rv, x); |
| REAL_VALUE_TO_TARGET_SINGLE (rv, val); |
| } |
| else if (GET_MODE (x) == DFmode) |
| { |
| long l[2]; |
| REAL_VALUE_FROM_CONST_DOUBLE (rv, x); |
| REAL_VALUE_TO_TARGET_DOUBLE (rv, l); |
| val = l[0]; |
| val = (val << 32) | (l[1] & 0xffffffff); |
| } |
| else |
| abort (); |
| return val; |
| } |
| |
| rtx |
| hwint_to_const_double (machine_mode mode, HOST_WIDE_INT v) |
| { |
| long tv[2]; |
| REAL_VALUE_TYPE rv; |
| gcc_assert (mode == SFmode || mode == DFmode); |
| |
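| /* The (v << 32) >> 32 idiom below sign-extends the low 32 bits, |
| relying on arithmetic right shift of the signed HOST_WIDE_INT. */ |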
| if (mode == SFmode) |
| tv[0] = (v << 32) >> 32; |
| else if (mode == DFmode) |
| { |
| tv[1] = (v << 32) >> 32; |
| tv[0] = v >> 32; |
| } |
| real_from_target (&rv, tv, mode); |
| return CONST_DOUBLE_FROM_REAL_VALUE (rv, mode); |
| } |
| |
| void |
| print_operand_address (FILE * file, register rtx addr) |
| { |
| rtx reg; |
| rtx offset; |
| |
| if (GET_CODE (addr) == AND |
| && GET_CODE (XEXP (addr, 1)) == CONST_INT |
| && INTVAL (XEXP (addr, 1)) == -16) |
| addr = XEXP (addr, 0); |
| |
| switch (GET_CODE (addr)) |
| { |
| case REG: |
| fprintf (file, "0(%s)", reg_names[REGNO (addr)]); |
| break; |
| |
| case PLUS: |
| reg = XEXP (addr, 0); |
| offset = XEXP (addr, 1); |
| if (GET_CODE (offset) == REG) |
| { |
| fprintf (file, "%s,%s", reg_names[REGNO (reg)], |
| reg_names[REGNO (offset)]); |
| } |
| else if (GET_CODE (offset) == CONST_INT) |
| { |
| fprintf (file, HOST_WIDE_INT_PRINT_DEC "(%s)", |
| INTVAL (offset), reg_names[REGNO (reg)]); |
| } |
| else |
| abort (); |
| break; |
| |
| case CONST: |
| case LABEL_REF: |
| case SYMBOL_REF: |
| case CONST_INT: |
| output_addr_const (file, addr); |
| break; |
| |
| default: |
| debug_rtx (addr); |
| abort (); |
| } |
| } |
| |
| void |
| print_operand (FILE * file, rtx x, int code) |
| { |
| machine_mode mode = GET_MODE (x); |
| HOST_WIDE_INT val; |
| unsigned char arr[16]; |
| int xcode = GET_CODE (x); |
| int i, info; |
| if (GET_MODE (x) == VOIDmode) |
| switch (code) |
| { |
| case 'L': /* 128 bits, signed */ |
| case 'm': /* 128 bits, signed */ |
| case 'T': /* 128 bits, signed */ |
| case 't': /* 128 bits, signed */ |
| mode = TImode; |
| break; |
| case 'K': /* 64 bits, signed */ |
| case 'k': /* 64 bits, signed */ |
| case 'D': /* 64 bits, signed */ |
| case 'd': /* 64 bits, signed */ |
| mode = DImode; |
| break; |
| case 'J': /* 32 bits, signed */ |
| case 'j': /* 32 bits, signed */ |
| case 's': /* 32 bits, signed */ |
| case 'S': /* 32 bits, signed */ |
| mode = SImode; |
| break; |
| } |
| switch (code) |
| { |
| |
| case 'j': /* 32 bits, signed */ |
| case 'k': /* 64 bits, signed */ |
| case 'm': /* 128 bits, signed */ |
| if (xcode == CONST_INT |
| || xcode == CONST_DOUBLE || xcode == CONST_VECTOR) |
| { |
| gcc_assert (logical_immediate_p (x, mode)); |
| constant_to_array (mode, x, arr); |
| val = (arr[0] << 24) | (arr[1] << 16) | (arr[2] << 8) | arr[3]; |
| val = trunc_int_for_mode (val, SImode); |
| switch (which_logical_immediate (val)) |
| { |
| case SPU_ORI: |
| break; |
| case SPU_ORHI: |
| fprintf (file, "h"); |
| break; |
| case SPU_ORBI: |
| fprintf (file, "b"); |
| break; |
| default: |
| gcc_unreachable(); |
| } |
| } |
| else |
| gcc_unreachable(); |
| return; |
| |
| case 'J': /* 32 bits, signed */ |
| case 'K': /* 64 bits, signed */ |
| case 'L': /* 128 bits, signed */ |
| if (xcode == CONST_INT |
| || xcode == CONST_DOUBLE || xcode == CONST_VECTOR) |
| { |
| gcc_assert (logical_immediate_p (x, mode) |
| || iohl_immediate_p (x, mode)); |
| constant_to_array (mode, x, arr); |
| val = (arr[0] << 24) | (arr[1] << 16) | (arr[2] << 8) | arr[3]; |
| val = trunc_int_for_mode (val, SImode); |
| switch (which_logical_immediate (val)) |
| { |
| case SPU_ORI: |
| case SPU_IOHL: |
| break; |
| case SPU_ORHI: |
| val = trunc_int_for_mode (val, HImode); |
| break; |
| case SPU_ORBI: |
| val = trunc_int_for_mode (val, QImode); |
| break; |
| default: |
| gcc_unreachable(); |
| } |
| fprintf (file, HOST_WIDE_INT_PRINT_DEC, val); |
| } |
| else |
| gcc_unreachable(); |
| return; |
| |
| case 't': /* 128 bits, signed */ |
| case 'd': /* 64 bits, signed */ |
| case 's': /* 32 bits, signed */ |
| if (CONSTANT_P (x)) |
| { |
| enum immediate_class c = classify_immediate (x, mode); |
| switch (c) |
| { |
| case IC_IL1: |
| constant_to_array (mode, x, arr); |
| val = (arr[0] << 24) | (arr[1] << 16) | (arr[2] << 8) | arr[3]; |
| val = trunc_int_for_mode (val, SImode); |
| switch (which_immediate_load (val)) |
| { |
| case SPU_IL: |
| break; |
| case SPU_ILA: |
| fprintf (file, "a"); |
| break; |
| case SPU_ILH: |
| fprintf (file, "h"); |
| break; |
| case SPU_ILHU: |
| fprintf (file, "hu"); |
| break; |
| default: |
| gcc_unreachable (); |
| } |
| break; |
| case IC_CPAT: |
| constant_to_array (mode, x, arr); |
| cpat_info (arr, GET_MODE_SIZE (mode), &info, 0); |
| if (info == 1) |
| fprintf (file, "b"); |
| else if (info == 2) |
| fprintf (file, "h"); |
| else if (info == 4) |
| fprintf (file, "w"); |
| else if (info == 8) |
| fprintf (file, "d"); |
| break; |
| case IC_IL1s: |
| if (xcode == CONST_VECTOR) |
| { |
| x = CONST_VECTOR_ELT (x, 0); |
| xcode = GET_CODE (x); |
| } |
| if (xcode == SYMBOL_REF || xcode == LABEL_REF || xcode == CONST) |
| fprintf (file, "a"); |
| else if (xcode == HIGH) |
| fprintf (file, "hu"); |
| break; |
| case IC_FSMBI: |
| case IC_FSMBI2: |
| case IC_IL2: |
| case IC_IL2s: |
| case IC_POOL: |
| abort (); |
| } |
| } |
| else |
| gcc_unreachable (); |
| return; |
| |
| case 'T': /* 128 bits, signed */ |
| case 'D': /* 64 bits, signed */ |
| case 'S': /* 32 bits, signed */ |
| if (CONSTANT_P (x)) |
| { |
| enum immediate_class c = classify_immediate (x, mode); |
| switch (c) |
| { |
| case IC_IL1: |
| constant_to_array (mode, x, arr); |
| val = (arr[0] << 24) | (arr[1] << 16) | (arr[2] << 8) | arr[3]; |
| val = trunc_int_for_mode (val, SImode); |
| switch (which_immediate_load (val)) |
| { |
| case SPU_IL: |
| case SPU_ILA: |
| break; |
| case SPU_ILH: |
| case SPU_ILHU: |
| val = trunc_int_for_mode (((arr[0] << 8) | arr[1]), HImode); |
| break; |
| default: |
| gcc_unreachable (); |
| } |
| fprintf (file, HOST_WIDE_INT_PRINT_DEC, val); |
| break; |
| case IC_FSMBI: |
| constant_to_array (mode, x, arr); |
| val = 0; |
| for (i = 0; i < 16; i++) |
| { |
| val <<= 1; |
| val |= arr[i] & 1; |
| } |
| print_operand (file, GEN_INT (val), 0); |
| break; |
| case IC_CPAT: |
| constant_to_array (mode, x, arr); |
| cpat_info (arr, GET_MODE_SIZE (mode), 0, &info); |
| fprintf (file, HOST_WIDE_INT_PRINT_DEC, (HOST_WIDE_INT)info); |
| break; |
| case IC_IL1s: |
| if (xcode == HIGH) |
| x = XEXP (x, 0); |
| if (GET_CODE (x) == CONST_VECTOR) |
| x = CONST_VECTOR_ELT (x, 0); |
| output_addr_const (file, x); |
| if (xcode == HIGH) |
| fprintf (file, "@h"); |
| break; |
| case IC_IL2: |
| case IC_IL2s: |
| case IC_FSMBI2: |
| case IC_POOL: |
| abort (); |
| } |
| } |
| else |
| gcc_unreachable (); |
| return; |
| |
| case 'C': |
| if (xcode == CONST_INT) |
| { |
| /* Only the 4 least significant bits are relevant for the |
| generate-control-word (c*d) instructions. */ |
| fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x) & 15); |
| return; |
| } |
| break; |
| |
| case 'M': /* print code for c*d */ |
| if (GET_CODE (x) == CONST_INT) |
| switch (INTVAL (x)) |
| { |
| case 1: |
| fprintf (file, "b"); |
| break; |
| case 2: |
| fprintf (file, "h"); |
| break; |
| case 4: |
| fprintf (file, "w"); |
| break; |
| case 8: |
| fprintf (file, "d"); |
| break; |
| default: |
| gcc_unreachable(); |
| } |
| else |
| gcc_unreachable(); |
| return; |
| |
| case 'N': /* Negate the operand */ |
| if (xcode == CONST_INT) |
| fprintf (file, HOST_WIDE_INT_PRINT_DEC, -INTVAL (x)); |
| else if (xcode == CONST_VECTOR) |
| fprintf (file, HOST_WIDE_INT_PRINT_DEC, |
| -INTVAL (CONST_VECTOR_ELT (x, 0))); |
| return; |
| |
| case 'I': /* enable/disable interrupts */ |
| if (xcode == CONST_INT) |
| fprintf (file, "%s", INTVAL (x) == 0 ? "d" : "e"); |
| return; |
| |
| case 'b': /* branch modifiers */ |
| if (xcode == REG) |
| fprintf (file, "%s", GET_MODE (x) == HImode ? "h" : ""); |
| else if (COMPARISON_P (x)) |
| fprintf (file, "%s", xcode == NE ? "n" : ""); |
| return; |
| |
| case 'i': /* indirect call */ |
| if (xcode == MEM) |
| { |
| if (GET_CODE (XEXP (x, 0)) == REG) |
| /* Used in indirect function calls. */ |
| fprintf (file, "%s", reg_names[REGNO (XEXP (x, 0))]); |
| else |
| output_address (XEXP (x, 0)); |
| } |
| return; |
| |
| case 'p': /* load/store */ |
| if (xcode == MEM) |
| { |
| x = XEXP (x, 0); |
| xcode = GET_CODE (x); |
| } |
| if (xcode == AND) |
| { |
| x = XEXP (x, 0); |
| xcode = GET_CODE (x); |
| } |
| if (xcode == REG) |
| fprintf (file, "d"); |
| else if (xcode == CONST_INT) |
| fprintf (file, "a"); |
| else if (xcode == CONST || xcode == SYMBOL_REF || xcode == LABEL_REF) |
| fprintf (file, "r"); |
| else if (xcode == PLUS || xcode == LO_SUM) |
| { |
| if (GET_CODE (XEXP (x, 1)) == REG) |
| fprintf (file, "x"); |
| else |
| fprintf (file, "d"); |
| } |
| return; |
| |
| case 'e': |
| val = xcode == CONST_INT ? INTVAL (x) : INTVAL (CONST_VECTOR_ELT (x, 0)); |
| val &= 0x7; |
| output_addr_const (file, GEN_INT (val)); |
| return; |
| |
| case 'f': |
| val = xcode == CONST_INT ? INTVAL (x) : INTVAL (CONST_VECTOR_ELT (x, 0)); |
| val &= 0x1f; |
| output_addr_const (file, GEN_INT (val)); |
| return; |
| |
| case 'g': |
| val = xcode == CONST_INT ? INTVAL (x) : INTVAL (CONST_VECTOR_ELT (x, 0)); |
| val &= 0x3f; |
| output_addr_const (file, GEN_INT (val)); |
| return; |
| |
| case 'h': |
| val = xcode == CONST_INT ? INTVAL (x) : INTVAL (CONST_VECTOR_ELT (x, 0)); |
| val = (val >> 3) & 0x1f; |
| output_addr_const (file, GEN_INT (val)); |
| return; |
| |
| case 'E': |
| val = xcode == CONST_INT ? INTVAL (x) : INTVAL (CONST_VECTOR_ELT (x, 0)); |
| val = -val; |
| val &= 0x7; |
| output_addr_const (file, GEN_INT (val)); |
| return; |
| |
| case 'F': |
| val = xcode == CONST_INT ? INTVAL (x) : INTVAL (CONST_VECTOR_ELT (x, 0)); |
| val = -val; |
| val &= 0x1f; |
| output_addr_const (file, GEN_INT (val)); |
| return; |
| |
| case 'G': |
| val = xcode == CONST_INT ? INTVAL (x) : INTVAL (CONST_VECTOR_ELT (x, 0)); |
| val = -val; |
| val &= 0x3f; |
| output_addr_const (file, GEN_INT (val)); |
| return; |
| |
| case 'H': |
| val = xcode == CONST_INT ? INTVAL (x) : INTVAL (CONST_VECTOR_ELT (x, 0)); |
| val = -(val & -8ll); |
| val = (val >> 3) & 0x1f; |
| output_addr_const (file, GEN_INT (val)); |
| return; |
| |
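| /* 'v' and 'w': reassemble the IEEE single-precision exponent bits |
| that straddle arr[0] and arr[1], and print the unbiased exponent |
| (bias 127), negated for 'w'. */ |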
| case 'v': |
| case 'w': |
| constant_to_array (mode, x, arr); |
| val = (((arr[0] << 1) + (arr[1] >> 7)) & 0xff) - 127; |
| output_addr_const (file, GEN_INT (code == 'w' ? -val : val)); |
| return; |
| |
| case 0: |
| if (xcode == REG) |
| fprintf (file, "%s", reg_names[REGNO (x)]); |
| else if (xcode == MEM) |
| output_address (XEXP (x, 0)); |
| else if (xcode == CONST_VECTOR) |
| print_operand (file, CONST_VECTOR_ELT (x, 0), 0); |
| else |
| output_addr_const (file, x); |
| return; |
| |
| /* unused letters |
| o qr u yz |
| AB OPQR UVWXYZ */ |
| default: |
| output_operand_lossage ("invalid %%xn code"); |
| } |
| gcc_unreachable (); |
| } |
| |
| /* For PIC mode we've reserved PIC_OFFSET_TABLE_REGNUM, which is a |
| call-saved register. For leaf functions it is more efficient to use |
| a volatile (caller-saved) register instead, because then we won't |
| need to save and restore the pic register. This routine is only |
| valid after register allocation is completed, so we can pick an |
| unused register. */ |
| static rtx |
| get_pic_reg (void) |
| { |
| if (!reload_completed && !reload_in_progress) |
| abort (); |
| |
| /* If we've already made the decision, we need to keep with it. Once we've |
| decided to use LAST_ARG_REGNUM, future calls to df_regs_ever_live_p may |
| return true since the register is now live; this should not cause us to |
| "switch back" to using pic_offset_table_rtx. */ |
| if (!cfun->machine->pic_reg) |
| { |
| if (crtl->is_leaf && !df_regs_ever_live_p (LAST_ARG_REGNUM)) |
| cfun->machine->pic_reg = gen_rtx_REG (SImode, LAST_ARG_REGNUM); |
| else |
| cfun->machine->pic_reg = pic_offset_table_rtx; |
| } |
| |
| return cfun->machine->pic_reg; |
| } |
| |
| /* Split constant addresses to handle cases that are too large. |
| Add in the pic register when in PIC mode. |
| Split immediates that require more than 1 instruction. |
| Return nonzero if a replacement sequence was emitted. */ |
| int |
| spu_split_immediate (rtx * ops) |
| { |
| machine_mode mode = GET_MODE (ops[0]); |
| enum immediate_class c = classify_immediate (ops[1], mode); |
| |
| switch (c) |
| { |
| case IC_IL2: |
| { |
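| /* An IC_IL2 constant is built with two instructions per 32-bit |
| slot: an ilhu to load the high halfword and an iohl to IOR in the |
| low halfword, so e.g. 0x12345678 becomes roughly "ilhu rt,0x1234" |
| then "iohl rt,0x5678". arrhi/arrlo hold the two halves. */ |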
| unsigned char arrhi[16]; |
| unsigned char arrlo[16]; |
| rtx to, temp, hi, lo; |
| int i; |
| machine_mode imode = mode; |
| /* We need to do reals as ints because the constant used in the |
| IOR might not be a legitimate real constant. */ |
| imode = int_mode_for_mode (mode); |
| constant_to_array (mode, ops[1], arrhi); |
| if (imode != mode) |
| to = simplify_gen_subreg (imode, ops[0], mode, 0); |
| else |
| to = ops[0]; |
| temp = !can_create_pseudo_p () ? to : gen_reg_rtx (imode); |
| for (i = 0; i < 16; i += 4) |
| { |
| arrlo[i + 2] = arrhi[i + 2]; |
| arrlo[i + 3] = arrhi[i + 3]; |
| arrlo[i + 0] = arrlo[i + 1] = 0; |
| arrhi[i + 2] = arrhi[i + 3] = 0; |
| } |
| hi = array_to_constant (imode, arrhi); |
| lo = array_to_constant (imode, arrlo); |
| emit_move_insn (temp, hi); |
| emit_insn (gen_rtx_SET |
| (VOIDmode, to, gen_rtx_IOR (imode, temp, lo))); |
| return 1; |
| } |
| case IC_FSMBI2: |
| { |
| unsigned char arr_fsmbi[16]; |
| unsigned char arr_andbi[16]; |
| rtx to, reg_fsmbi, reg_and; |
| int i; |
| machine_mode imode = mode; |
| /* We need to do reals as ints because the constant used in the |
| AND might not be a legitimate real constant. */ |
| imode = int_mode_for_mode (mode); |
| constant_to_array (mode, ops[1], arr_fsmbi); |
| if (imode != mode) |
| to = simplify_gen_subreg (imode, ops[0], GET_MODE (ops[0]), 0); |
| else |
| to = ops[0]; |
| for (i = 0; i < 16; i++) |
| if (arr_fsmbi[i] != 0) |
| { |
| arr_andbi[0] = arr_fsmbi[i]; |
| arr_fsmbi[i] = 0xff; |
| } |
| for (i = 1; i < 16; i++) |
| arr_andbi[i] = arr_andbi[0]; |
| reg_fsmbi = array_to_constant (imode, arr_fsmbi); |
| reg_and = array_to_constant (imode, arr_andbi); |
| emit_move_insn (to, reg_fsmbi); |
| emit_insn (gen_rtx_SET |
| (VOIDmode, to, gen_rtx_AND (imode, to, reg_and))); |
| return 1; |
| } |
| case IC_POOL: |
| if (reload_in_progress || reload_completed) |
| { |
| rtx mem = force_const_mem (mode, ops[1]); |
| if (TARGET_LARGE_MEM) |
| { |
| rtx addr = gen_rtx_REG (Pmode, REGNO (ops[0])); |
| emit_move_insn (addr, XEXP (mem, 0)); |
| mem = replace_equiv_address (mem, addr); |
| } |
| emit_move_insn (ops[0], mem); |
| return 1; |
| } |
| break; |
| case IC_IL1s: |
| case IC_IL2s: |
| if (reload_completed && GET_CODE (ops[1]) != HIGH) |
| { |
| if (c == IC_IL2s) |
| { |
| emit_move_insn (ops[0], gen_rtx_HIGH (mode, ops[1])); |
| emit_move_insn (ops[0], gen_rtx_LO_SUM (mode, ops[0], ops[1])); |
| } |
| else if (flag_pic) |
| emit_insn (gen_pic (ops[0], ops[1])); |
| if (flag_pic) |
| { |
| rtx pic_reg = get_pic_reg (); |
| emit_insn (gen_addsi3 (ops[0], ops[0], pic_reg)); |
| } |
| return flag_pic || c == IC_IL2s; |
| } |
| break; |
| case IC_IL1: |
| case IC_FSMBI: |
| case IC_CPAT: |
| break; |
| } |
| return 0; |
| } |
| |
| /* SAVING is TRUE when we are generating the actual load and store |
| instructions for REGNO. When determining the size of the stack |
| needed for saving registers we must allocate enough space for the |
| worst case, because we don't always have the information early |
| enough to avoid allocating it. But we can at least eliminate the |
| actual loads and stores during the prologue/epilogue. */ |
| static int |
| need_to_save_reg (int regno, int saving) |
| { |
| if (df_regs_ever_live_p (regno) && !call_used_regs[regno]) |
| return 1; |
| if (flag_pic |
| && regno == PIC_OFFSET_TABLE_REGNUM |
| && (!saving || cfun->machine->pic_reg == pic_offset_table_rtx)) |
| return 1; |
| return 0; |
| } |
| |
| /* This function is only correct from local register allocation |
| onward. */ |
| int |
| spu_saved_regs_size (void) |
| { |
| int reg_save_size = 0; |
| int regno; |
| |
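| /* Each saved register occupies a full 16-byte quadword, since the |
| frame load/store helpers below use V4SImode accesses. */ |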
| for (regno = FIRST_PSEUDO_REGISTER - 1; regno >= 0; --regno) |
| if (need_to_save_reg (regno, 0)) |
| reg_save_size += 0x10; |
| return reg_save_size; |
| } |
| |
| static rtx_insn * |
| frame_emit_store (int regno, rtx addr, HOST_WIDE_INT offset) |
| { |
| rtx reg = gen_rtx_REG (V4SImode, regno); |
| rtx mem = |
| gen_frame_mem (V4SImode, gen_rtx_PLUS (Pmode, addr, GEN_INT (offset))); |
| return emit_insn (gen_movv4si (mem, reg)); |
| } |
| |
| static rtx_insn * |
| frame_emit_load (int regno, rtx addr, HOST_WIDE_INT offset) |
| { |
| rtx reg = gen_rtx_REG (V4SImode, regno); |
| rtx mem = |
| gen_frame_mem (V4SImode, gen_rtx_PLUS (Pmode, addr, GEN_INT (offset))); |
| return emit_insn (gen_movv4si (reg, mem)); |
| } |
| |
| /* This happens after reload, so we need to expand it. */ |
| static rtx_insn * |
| frame_emit_add_imm (rtx dst, rtx src, HOST_WIDE_INT imm, rtx scratch) |
| { |
| rtx_insn *insn; |
| if (satisfies_constraint_K (GEN_INT (imm))) |
| { |
| insn = emit_insn (gen_addsi3 (dst, src, GEN_INT (imm))); |
| } |
| else |
| { |
| emit_insn (gen_movsi (scratch, gen_int_mode (imm, SImode))); |
| insn = emit_insn (gen_addsi3 (dst, src, scratch)); |
| if (REGNO (src) == REGNO (scratch)) |
| abort (); |
| } |
| return insn; |
| } |
| |
| /* Return nonzero if this function is known to have a null epilogue. */ |
| |
| int |
| direct_return (void) |
| { |
| if (reload_completed) |
| { |
| if (cfun->static_chain_decl == 0 |
| && (spu_saved_regs_size () |
| + get_frame_size () |
| + crtl->outgoing_args_size |
| + crtl->args.pretend_args_size == 0) |
| && crtl->is_leaf) |
| return 1; |
| } |
| return 0; |
| } |
| |
| /* |
| The stack frame looks like this: |
| +-------------+ |
| | incoming | |
| | args | |
| AP -> +-------------+ |
| | $lr save | |
| +-------------+ |
| prev SP | back chain | |
| +-------------+ |
| | var args | |
| | reg save | crtl->args.pretend_args_size bytes |
| +-------------+ |
| | ... | |
| | saved regs | spu_saved_regs_size() bytes |
| FP -> +-------------+ |
| | ... | |
| | vars | get_frame_size() bytes |
| HFP -> +-------------+ |
| | ... | |
| | outgoing | |
| | args | crtl->outgoing_args_size bytes |
| +-------------+ |
| | $lr of next | |
| | frame | |
| +-------------+ |
| | back chain | |
| SP -> +-------------+ |
| |
| */ |
| void |
| spu_expand_prologue (void) |
| { |
| HOST_WIDE_INT size = get_frame_size (), offset, regno; |
| HOST_WIDE_INT total_size; |
| HOST_WIDE_INT saved_regs_size; |
| rtx sp_reg = gen_rtx_REG (Pmode, STACK_POINTER_REGNUM); |
| rtx scratch_reg_0, scratch_reg_1; |
| rtx_insn *insn; |
| rtx real; |
| |
| if (flag_pic && optimize == 0 && !cfun->machine->pic_reg) |
| cfun->machine->pic_reg = pic_offset_table_rtx; |
| |
| if (spu_naked_function_p (current_function_decl)) |
| return; |
| |
| scratch_reg_0 = gen_rtx_REG (SImode, LAST_ARG_REGNUM + 1); |
| scratch_reg_1 = gen_rtx_REG (SImode, LAST_ARG_REGNUM + 2); |
| |
| saved_regs_size = spu_saved_regs_size (); |
| total_size = size + saved_regs_size |
| + crtl->outgoing_args_size |
| + crtl->args.pretend_args_size; |
| |
| if (!crtl->is_leaf |
| || cfun->calls_alloca || total_size > 0) |
| total_size += STACK_POINTER_OFFSET; |
| |
| /* Save this first because code after this might use the link |
| register as a scratch register. */ |
| if (!crtl->is_leaf) |
| { |
| insn = frame_emit_store (LINK_REGISTER_REGNUM, sp_reg, 16); |
| RTX_FRAME_RELATED_P (insn) = 1; |
| } |
| |
| if (total_size > 0) |
| { |
| offset = -crtl->args.pretend_args_size; |
| for (regno = 0; regno < FIRST_PSEUDO_REGISTER; ++regno) |
| if (need_to_save_reg (regno, 1)) |
| { |
| offset -= 16; |
| insn = frame_emit_store (regno, sp_reg, offset); |
| RTX_FRAME_RELATED_P (insn) = 1; |
| } |
| } |
| |
| if (flag_pic && cfun->machine->pic_reg) |
| { |
| rtx pic_reg = cfun->machine->pic_reg; |
| insn = emit_insn (gen_load_pic_offset (pic_reg, scratch_reg_0)); |
| insn = emit_insn (gen_subsi3 (pic_reg, pic_reg, scratch_reg_0)); |
| } |
| |
| if (total_size > 0) |
| { |
| if (flag_stack_check) |
| { |
| /* We compare against total_size-1 because |
| ($sp >= total_size) <=> ($sp > total_size-1) */ |
| rtx scratch_v4si = gen_rtx_REG (V4SImode, REGNO (scratch_reg_0)); |
| rtx sp_v4si = gen_rtx_REG (V4SImode, STACK_POINTER_REGNUM); |
| rtx size_v4si = spu_const (V4SImode, total_size - 1); |
| if (!satisfies_constraint_K (GEN_INT (total_size - 1))) |
| { |
| emit_move_insn (scratch_v4si, size_v4si); |
| size_v4si = scratch_v4si; |
| } |
| emit_insn (gen_cgt_v4si (scratch_v4si, sp_v4si, size_v4si)); |
| emit_insn (gen_vec_extractv4si |
| (scratch_reg_0, scratch_v4si, GEN_INT (1))); |
| emit_insn (gen_spu_heq (scratch_reg_0, GEN_INT (0))); |
| } |
| |
| /* Adjust the stack pointer, and make sure scratch_reg_0 contains |
| the value of the previous $sp because we save it as the back |
| chain. */ |
| if (total_size <= 2000) |
| { |
| /* In this case we save the back chain first. */ |
| insn = frame_emit_store (STACK_POINTER_REGNUM, sp_reg, -total_size); |
| insn = |
| frame_emit_add_imm (sp_reg, sp_reg, -total_size, scratch_reg_0); |
| } |
| else |
| { |
| insn = emit_move_insn (scratch_reg_0, sp_reg); |
| insn = |
| frame_emit_add_imm (sp_reg, sp_reg, -total_size, scratch_reg_1); |
| } |
| RTX_FRAME_RELATED_P (insn) = 1; |
| real = gen_addsi3 (sp_reg, sp_reg, GEN_INT (-total_size)); |
| add_reg_note (insn, REG_FRAME_RELATED_EXPR, real); |
| |
| if (total_size > 2000) |
| { |
| /* Save the back chain pointer. */ |
| insn = frame_emit_store (REGNO (scratch_reg_0), sp_reg, 0); |
| } |
| |
| if (frame_pointer_needed) |
| { |
| rtx fp_reg = gen_rtx_REG (Pmode, HARD_FRAME_POINTER_REGNUM); |
| HOST_WIDE_INT fp_offset = STACK_POINTER_OFFSET |
| + crtl->outgoing_args_size; |
| /* Set the new frame_pointer */ |
| insn = frame_emit_add_imm (fp_reg, sp_reg, fp_offset, scratch_reg_0); |
| RTX_FRAME_RELATED_P (insn) = 1; |
| real = gen_addsi3 (fp_reg, sp_reg, GEN_INT (fp_offset)); |
| add_reg_note (insn, REG_FRAME_RELATED_EXPR, real); |
| REGNO_POINTER_ALIGN (HARD_FRAME_POINTER_REGNUM) = STACK_BOUNDARY; |
| } |
| } |
| |
| if (flag_stack_usage_info) |
| current_function_static_stack_size = total_size; |
| } |
| |
| void |
| spu_expand_epilogue (bool sibcall_p) |
| { |
| int size = get_frame_size (), offset, regno; |
| HOST_WIDE_INT saved_regs_size, total_size; |
| rtx sp_reg = gen_rtx_REG (Pmode, STACK_POINTER_REGNUM); |
| rtx scratch_reg_0; |
| |
| if (spu_naked_function_p (current_function_decl)) |
| return; |
| |
| scratch_reg_0 = gen_rtx_REG (SImode, LAST_ARG_REGNUM + 1); |
| |
| saved_regs_size = spu_saved_regs_size (); |
| total_size = size + saved_regs_size |
| + crtl->outgoing_args_size |
| + crtl->args.pretend_args_size; |
| |
| if (!crtl->is_leaf |
| || cfun->calls_alloca || total_size > 0) |
| total_size += STACK_POINTER_OFFSET; |
| |
| if (total_size > 0) |
| { |
| if (cfun->calls_alloca) |
| frame_emit_load (STACK_POINTER_REGNUM, sp_reg, 0); |
| else |
| frame_emit_add_imm (sp_reg, sp_reg, total_size, scratch_reg_0); |
| |
| |
| if (saved_regs_size > 0) |
| { |
| offset = -crtl->args.pretend_args_size; |
| for (regno = 0; regno < FIRST_PSEUDO_REGISTER; ++regno) |
| if (need_to_save_reg (regno, 1)) |
| { |
| offset -= 0x10; |
| frame_emit_load (regno, sp_reg, offset); |
| } |
| } |
| } |
| |
| if (!crtl->is_leaf) |
| frame_emit_load (LINK_REGISTER_REGNUM, sp_reg, 16); |
| |
| if (!sibcall_p) |
| { |
| emit_use (gen_rtx_REG (SImode, LINK_REGISTER_REGNUM)); |
| emit_jump_insn (gen__return ()); |
| } |
| } |
| |
| rtx |
| spu_return_addr (int count, rtx frame ATTRIBUTE_UNUSED) |
| { |
| if (count != 0) |
| return 0; |
| /* This is inefficient because it ends up copying to a save-register |
| which then gets saved even though $lr has already been saved. But |
| it does generate better code for leaf functions and we don't need |
| to use RETURN_ADDRESS_POINTER_REGNUM to get it working. It's only |
| used for __builtin_return_address anyway, so maybe we don't care if |
| it's inefficient. */ |
| return get_hard_reg_initial_val (Pmode, LINK_REGISTER_REGNUM); |
| } |
| |
| |
| /* Given VAL, generate a constant appropriate for MODE. |
| If MODE is a vector mode, every element will be VAL. |
| For TImode, VAL will be zero extended to 128 bits. */ |
| rtx |
| spu_const (machine_mode mode, HOST_WIDE_INT val) |
| { |
| rtx inner; |
| rtvec v; |
| int units, i; |
| |
| gcc_assert (GET_MODE_CLASS (mode) == MODE_INT |
| || GET_MODE_CLASS (mode) == MODE_FLOAT |
| || GET_MODE_CLASS (mode) == MODE_VECTOR_INT |
| || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT); |
| |
| if (GET_MODE_CLASS (mode) == MODE_INT) |
| return immed_double_const (val, 0, mode); |
| |
| /* val is the bit representation of the float */ |
| if (GET_MODE_CLASS (mode) == MODE_FLOAT) |
| return hwint_to_const_double (mode, val); |
| |
| if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT) |
| inner = immed_double_const (val, 0, GET_MODE_INNER (mode)); |
| else |
| inner = hwint_to_const_double (GET_MODE_INNER (mode), val); |
| |
| units = GET_MODE_NUNITS (mode); |
| |
| v = rtvec_alloc (units); |
| |
| for (i = 0; i < units; ++i) |
| RTVEC_ELT (v, i) = inner; |
| |
| return gen_rtx_CONST_VECTOR (mode, v); |
| } |
| |
| /* Create a MODE vector constant from 4 ints. */ |
| rtx |
| spu_const_from_ints(machine_mode mode, int a, int b, int c, int d) |
| { |
| unsigned char arr[16]; |
| arr[0] = (a >> 24) & 0xff; |
| arr[1] = (a >> 16) & 0xff; |
| arr[2] = (a >> 8) & 0xff; |
| arr[3] = (a >> 0) & 0xff; |
| arr[4] = (b >> 24) & 0xff; |
| arr[5] = (b >> 16) & 0xff; |
| arr[6] = (b >> 8) & 0xff; |
| arr[7] = (b >> 0) & 0xff; |
| arr[8] = (c >> 24) & 0xff; |
| arr[9] = (c >> 16) & 0xff; |
| arr[10] = (c >> 8) & 0xff; |
| arr[11] = (c >> 0) & 0xff; |
| arr[12] = (d >> 24) & 0xff; |
| arr[13] = (d >> 16) & 0xff; |
| arr[14] = (d >> 8) & 0xff; |
| arr[15] = (d >> 0) & 0xff; |
| return array_to_constant(mode, arr); |
| } |
| |
| /* Branch hint support. */ |
| |
| /* An array of these is used to propagate hints to predecessor blocks. */ |
| struct spu_bb_info |
| { |
| rtx_insn *prop_jump; /* propagated from another block */ |
| int bb_index; /* the original block. */ |
| }; |
| static struct spu_bb_info *spu_bb_info; |
| |
| #define STOP_HINT_P(INSN) \ |
| (CALL_P(INSN) \ |
| || INSN_CODE(INSN) == CODE_FOR_divmodsi4 \ |
| || INSN_CODE(INSN) == CODE_FOR_udivmodsi4) |
| |
| /* 1 when RTX is a hinted branch or its target. We keep track of |
| what has been hinted so the safe-hint code can test it easily. */ |
| #define HINTED_P(RTX) \ |
| (RTL_FLAG_CHECK3("HINTED_P", (RTX), CODE_LABEL, JUMP_INSN, CALL_INSN)->unchanging) |
| |
| /* 1 when RTX is an insn that must be scheduled on an even boundary. */ |
| #define SCHED_ON_EVEN_P(RTX) \ |
| (RTL_FLAG_CHECK2("SCHED_ON_EVEN_P", (RTX), JUMP_INSN, CALL_INSN)->in_struct) |
| |
| /* Emit a nop for INSN such that the two will dual issue. This assumes |
| INSN is 8-byte aligned. When INSN is inline asm we emit an lnop. |
| We check for TImode to handle a MULTI1 insn which has dual issued its |
| first instruction. get_pipe returns -1 for MULTI0 or inline asm. */ |
| static void |
| emit_nop_for_insn (rtx_insn *insn) |
| { |
| int p; |
| rtx_insn *new_insn; |
| |
| /* We need to handle JUMP_TABLE_DATA separately. */ |
| if (JUMP_TABLE_DATA_P (insn)) |
| { |
| new_insn = emit_insn_after (gen_lnop(), insn); |
| recog_memoized (new_insn); |
| INSN_LOCATION (new_insn) = UNKNOWN_LOCATION; |
| return; |
| } |
| |
| p = get_pipe (insn); |
| if ((CALL_P (insn) || JUMP_P (insn)) && SCHED_ON_EVEN_P (insn)) |
| new_insn = emit_insn_after (gen_lnop (), insn); |
| else if (p == 1 && GET_MODE (insn) == TImode) |
| { |
| new_insn = emit_insn_before (gen_nopn (GEN_INT (127)), insn); |
| PUT_MODE (new_insn, TImode); |
| PUT_MODE (insn, VOIDmode); |
| } |
| else |
| new_insn = emit_insn_after (gen_lnop (), insn); |
| recog_memoized (new_insn); |
| INSN_LOCATION (new_insn) = INSN_LOCATION (insn); |
| } |
| |
| /* Insert nops in basic blocks to meet dual issue alignment |
| requirements. Also make sure hbrp and hint instructions are at least |
| one cycle apart, possibly inserting a nop. */ |
| static void |
| pad_bb(void) |
| { |
| rtx_insn *insn, *next_insn, *prev_insn, *hbr_insn = 0; |
| int length; |
| int addr; |
| |
| /* This sets up INSN_ADDRESSES. */ |
| shorten_branches (get_insns ()); |
| |
| /* Keep track of length added by nops. */ |
| length = 0; |
| |
| prev_insn = 0; |
| insn = get_insns (); |
| if (!active_insn_p (insn)) |
| insn = next_active_insn (insn); |
| for (; insn; insn = next_insn) |
| { |
| next_insn = next_active_insn (insn); |
| if (INSN_CODE (insn) == CODE_FOR_iprefetch |
| || INSN_CODE (insn) == CODE_FOR_hbr) |
| { |
| if (hbr_insn) |
| { |
| int a0 = INSN_ADDRESSES (INSN_UID (hbr_insn)); |
| int a1 = INSN_ADDRESSES (INSN_UID (insn)); |
| if ((a1 - a0 == 8 && GET_MODE (insn) != TImode) |
| || (a1 - a0 == 4)) |
| { |
| prev_insn = emit_insn_before (gen_lnop (), insn); |
| PUT_MODE (prev_insn, GET_MODE (insn)); |
| PUT_MODE (insn, TImode); |
| INSN_LOCATION (prev_insn) = INSN_LOCATION (insn); |
| length += 4; |
| } |
| } |
| hbr_insn = insn; |
| } |
| if (INSN_CODE (insn) == CODE_FOR_blockage && next_insn) |
| { |
| if (GET_MODE (insn) == TImode) |
| PUT_MODE (next_insn, TImode); |
| insn = next_insn; |
| next_insn = next_active_insn (insn); |
| } |
| addr = INSN_ADDRESSES (INSN_UID (insn)); |
| if ((CALL_P (insn) || JUMP_P (insn)) && SCHED_ON_EVEN_P (insn)) |
| { |
| if (((addr + length) & 7) != 0) |
| { |
| emit_nop_for_insn (prev_insn); |
| length += 4; |
| } |
| } |
| else if (GET_MODE (insn) == TImode |
| && ((next_insn && GET_MODE (next_insn) != TImode) |
| || get_attr_type (insn) == TYPE_MULTI0) |
| && ((addr + length) & 7) != 0) |
| { |
| /* prev_insn will always be set because the first insn is |
| always 8-byte aligned. */ |
| emit_nop_for_insn (prev_insn); |
| length += 4; |
| } |
| prev_insn = insn; |
| } |
| } |
| |
| |
| /* Routines for branch hints. */ |
| |
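| /* Emit a hbr insn for BRANCH (which jumps to TARGET) before insn |
| BEFORE, where DISTANCE is the byte distance from the hint to the |
| branch, and mark the block in BLOCKS so the CFG can be fixed up |
| afterwards. The emitted sequence is roughly (schematic only; the |
| exact mnemonics depend on the kind of branch): |
| |
| hbr .Lhint, target # hint that the branch at .Lhint goes to target |
| ... # at least 8 insns, padded with nops when necessary |
| .Lhint: br target # the hinted branch |
| */ |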
| static void |
| spu_emit_branch_hint (rtx_insn *before, rtx_insn *branch, rtx target, |
| int distance, sbitmap blocks) |
| { |
| rtx branch_label = 0; |
| rtx_insn *hint; |
| rtx_insn *insn; |
| rtx_jump_table_data *table; |
| |
| if (before == 0 || branch == 0 || target == 0) |
| return; |
| |
| /* While scheduling we require hints to be no further than 600 bytes |
| from the branch, so we need to enforce that here too. */ |
| if (distance > 600) |
| return; |
| |
| /* If BEFORE is a basic block note, put the hint after the note. */ |
| if (NOTE_INSN_BASIC_BLOCK_P (before)) |
| before = NEXT_INSN (before); |
| |
| branch_label = gen_label_rtx (); |
| LABEL_NUSES (branch_label)++; |
| LABEL_PRESERVE_P (branch_label) = 1; |
| insn = emit_label_before (branch_label, branch); |
| branch_label = gen_rtx_LABEL_REF (VOIDmode, branch_label); |
| bitmap_set_bit (blocks, BLOCK_FOR_INSN (branch)->index); |
| |
| hint = emit_insn_before (gen_hbr (branch_label, target), before); |
| recog_memoized (hint); |
| INSN_LOCATION (hint) = INSN_LOCATION (branch); |
| HINTED_P (branch) = 1; |
| |
| if (GET_CODE (target) == LABEL_REF) |
| HINTED_P (XEXP (target, 0)) = 1; |
| else if (tablejump_p (branch, 0, &table)) |
| { |
| rtvec vec; |
| int j; |
| if (GET_CODE (PATTERN (table)) == ADDR_VEC) |
| vec = XVEC (PATTERN (table), 0); |
| else |
| vec = XVEC (PATTERN (table), 1); |
| for (j = GET_NUM_ELEM (vec) - 1; j >= 0; --j) |
| HINTED_P (XEXP (RTVEC_ELT (vec, j), 0)) = 1; |
| } |
| |
| if (distance >= 588) |
| { |
| /* Make sure the hint isn't scheduled any earlier than this point, |
| which could push it too far away for the branch offset to fit. */ |
| insn = emit_insn_before (gen_blockage (), hint); |
| recog_memoized (insn); |
| INSN_LOCATION (insn) = INSN_LOCATION (hint); |
| } |
| else if (distance <= 8 * 4) |
| { |
| /* To guarantee at least 8 insns between the hint and branch we |
| insert nops. */ |
| int d; |
| for (d = distance; d < 8 * 4; d += 4) |
| { |
| insn = |
| emit_insn_after (gen_nopn_nv (gen_rtx_REG (SImode, 127)), hint); |
| recog_memoized (insn); |
| INSN_LOCATION (insn) = INSN_LOCATION (hint); |
| } |
| |
| /* Make sure any nops inserted aren't scheduled before the hint. */ |
| insn = emit_insn_after (gen_blockage (), hint); |
| recog_memoized (insn); |
| INSN_LOCATION (insn) = INSN_LOCATION (hint); |
| |
| /* Make sure any nops inserted aren't scheduled after the call. */ |
| if (CALL_P (branch) && distance < 8 * 4) |
| { |
| insn = emit_insn_before (gen_blockage (), branch); |
| recog_memoized (insn); |
| INSN_LOCATION (insn) = INSN_LOCATION (branch); |
| } |
| } |
| } |
| |
| /* Return 0 if we don't want a hint for this branch. Otherwise return |
| the rtx for the branch target. */ |
| static rtx |
| get_branch_target (rtx_insn *branch) |
| { |
| if (JUMP_P (branch)) |
| { |
| rtx set, src; |
| |
| /* Return statements */ |
| if (GET_CODE (PATTERN (branch)) == RETURN) |
| return gen_rtx_REG (SImode, LINK_REGISTER_REGNUM); |
| |
| /* ASM GOTOs. */ |
| if (extract_asm_operands (PATTERN (branch)) != NULL) |
| return NULL; |
| |
| set = single_set (branch); |
| src = SET_SRC (set); |
| if (GET_CODE (SET_DEST (set)) != PC) |
| abort (); |
| |
| if (GET_CODE (src) == IF_THEN_ELSE) |
| { |
| rtx lab = 0; |
| rtx note = find_reg_note (branch, REG_BR_PROB, 0); |
| if (note) |
| { |
| /* If the more probable case is not a fall through, then |
| try a branch hint. */ |
| int prob = XINT (note, 0); |
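| /* PROB is a fraction of REG_BR_PROB_BASE (10000), so these tests |
| hint the arm predicted taken more than 60% of the time. */ |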
| if (prob > (REG_BR_PROB_BASE * 6 / 10) |
| && GET_CODE (XEXP (src, 1)) != PC) |
| lab = XEXP (src, 1); |
| else if (prob < (REG_BR_PROB_BASE * 4 / 10) |
| && GET_CODE (XEXP (src, 2)) != PC) |
| lab = XEXP (src, 2); |
| } |
| if (lab) |
| { |
| if (GET_CODE (lab) == RETURN) |
| return gen_rtx_REG (SImode, LINK_REGISTER_REGNUM); |
| return lab; |
| } |
| return 0; |
| } |
| |
| return src; |
| } |
| else if (CALL_P (branch)) |
| { |
| rtx call; |
| /* All of our call patterns are in a PARALLEL and the CALL is |
| the first pattern in the PARALLEL. */ |
| if (GET_CODE (PATTERN (branch)) != PARALLEL) |
| abort (); |
| call = XVECEXP (PATTERN (branch), 0, 0); |
| if (GET_CODE (call) == SET) |
| call = SET_SRC (call); |
| if (GET_CODE (call) != CALL) |
| abort (); |
| return XEXP (XEXP (call, 0), 0); |
| } |
| return 0; |
| } |
| |
| /* The special $hbr register is used to prevent the insn scheduler from |
| moving hbr insns across instructions which invalidate them. It |
| should only be used in a clobber, and this function searches for |
| insns which clobber it. */ |
| static bool |
| insn_clobbers_hbr (rtx_insn *insn) |
| { |
| if (INSN_P (insn) |
| && GET_CODE (PATTERN (insn)) == PARALLEL) |
| { |
| rtx parallel = PATTERN (insn); |
| rtx clobber; |
| int j; |
| for (j = XVECLEN (parallel, 0) - 1; j >= 0; j--) |
| { |
| clobber = XVECEXP (parallel, 0, j); |
| if (GET_CODE (clobber) == CLOBBER |
| && GET_CODE (XEXP (clobber, 0)) == REG |
| && REGNO (XEXP (clobber, 0)) == HBR_REGNUM) |
| return 1; |
| } |
| } |
| return 0; |
| } |
| |
| /* Search up to 32 insns starting at FIRST: |
| - at any kind of hinted branch, just return |
| - at any unconditional branch in the first 15 insns, just return |
| - at a call or indirect branch, after the first 15 insns, force it to |
| an even address and return |
| - at any unconditional branch, after the first 15 insns, force it to |
| an even address. |
| At the end of the search, insert an hbrp within 4 insns of FIRST, |
| and a second hbrp within 16 insns of FIRST. |
| */ |
| static void |
| insert_hbrp_for_ilb_runout (rtx_insn *first) |
| { |
| rtx_insn *insn, *before_4 = 0, *before_16 = 0; |
| int addr = 0, length, first_addr = -1; |
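| /* Addresses of the first and second hbrp insns seen, initialized to |
| a sentinel well beyond the 32-insn search window. */ |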
| int hbrp_addr0 = 128 * 4, hbrp_addr1 = 128 * 4; |
| int insert_lnop_after = 0; |
| for (insn = first; insn; insn = NEXT_INSN (insn)) |
| if (INSN_P (insn)) |
| { |
| if (first_addr == -1) |
| first_addr = INSN_ADDRESSES (INSN_UID (insn)); |
| addr = INSN_ADDRESSES (INSN_UID (insn)) - first_addr; |
| length = get_attr_length (insn); |
| |
| if (before_4 == 0 && addr + length >= 4 * 4) |
| before_4 = insn; |
| /* We test for 14 instructions because the first hbrp will add |
| up to 2 instructions. */ |
| if (before_16 == 0 && addr + length >= 14 * 4) |
| before_16 = insn; |
| |
| if (INSN_CODE (insn) == CODE_FOR_hbr) |
| { |
| /* Make sure an hbrp is at least 2 cycles away from a hint. |
| Insert an lnop after the hbrp when necessary. */ |
| if (before_4 == 0 && addr > 0) |
| { |
| before_4 = insn; |
| insert_lnop_after |= 1; |
| } |
| else if (before_4 && addr <= 4 * 4) |
| insert_lnop_after |= 1; |
| if (before_16 == 0 && addr > 10 * 4) |
| { |
| before_16 = insn; |
| insert_lnop_after |= 2; |
| } |
| else if (before_16 && addr <= 14 * 4) |
| insert_lnop_after |= 2; |
| } |
| |
| if (INSN_CODE (insn) == CODE_FOR_iprefetch) |
| { |
| if (addr < hbrp_addr0) |
| hbrp_addr0 = addr; |
| else if (addr < hbrp_addr1) |
| hbrp_addr1 = addr; |
| } |
| |
| if (CALL_P (insn) || JUMP_P (insn)) |
| { |
| if (HINTED_P (insn)) |
| return; |
| |
| /* Any branch after the first 15 insns should be on an even |
| address to avoid a special case branch. There might be |
| some nops and/or hbrps inserted, so we test after 10 |
| insns. */ |
| if (addr > 10 * 4) |
| SCHED_ON_EVEN_P (insn) = 1; |
| } |
| |
| if (CALL_P (insn) || tablejump_p (insn, 0, 0)) |
| return; |
| |
| |
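| /* We scanned a full 32-insn window without hitting a hinted branch, |
| call, or barrier; insert the needed hbrp insns now. */ |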
| if (addr + length >= 32 * 4) |
| { |
| gcc_assert (before_4 && before_16); |
| if (hbrp_addr0 > 4 * 4) |
| { |
| insn = |
| emit_insn_before (gen_iprefetch (GEN_INT (1)), before_4); |
| recog_memoized (insn); |
| INSN_LOCATION (insn) = INSN_LOCATION (before_4); |
| INSN_ADDRESSES_NEW (insn, |
| INSN_ADDRESSES (INSN_UID (before_4))); |
| PUT_MODE (insn, GET_MODE (before_4)); |
| PUT_MODE (before_4, TImode); |
| if (insert_lnop_after & 1) |
| { |
| insn = emit_insn_before (gen_lnop (), before_4); |
| recog_memoized (insn); |
| INSN_LOCATION (insn) = INSN_LOCATION (before_4); |
| INSN_ADDRESSES_NEW (insn, |
| INSN_ADDRESSES (INSN_UID (before_4))); |
| PUT_MODE (insn, TImode); |
| } |
| } |
| if ((hbrp_addr0 <= 4 * 4 || hbrp_addr0 > 16 * 4) |
| && hbrp_addr1 > 16 * 4) |
| { |
| insn = |
| emit_insn_before (gen_iprefetch (GEN_INT (2)), before_16); |
| recog_memoized (insn); |
| INSN_LOCATION (insn) = INSN_LOCATION (before_16); |
| INSN_ADDRESSES_NEW (insn, |
| INSN_ADDRESSES (INSN_UID (before_16))); |
| PUT_MODE (insn, GET_MODE (before_16)); |
| PUT_MODE (before_16, TImode); |
| if (insert_lnop_after & 2) |
| { |
| insn = emit_insn_before (gen_lnop (), before_16); |
| recog_memoized (insn); |
| INSN_LOCATION (insn) = INSN_LOCATION (before_16); |
| INSN_ADDRESSES_NEW (insn, |
| INSN_ADDRESSES (INSN_UID |
| (before_16))); |
| PUT_MODE (insn, TImode); |
| } |
| } |
| return; |
| } |
| } |
| else if (BARRIER_P (insn)) |
| return; |
| |
| } |
| |
| /* The SPU might hang when it executes 48 inline instructions after a |
| hinted branch jumps to its hinted target. The beginning of a |
| function and the return from a call might have been hinted, and |
| must be handled as well. To prevent a hang we insert 2 hbrps. The |
| first should be within 6 insns of the branch target. The second |
| should be within 22 insns of the branch target. When determining |
| if hbrps are necessary, we look for only 32 inline instructions, |
| because up to 12 nops and 4 hbrps could be inserted. Similarly, |
| when inserting new hbrps, we insert them within 4 and 16 insns of |
| the target. */ |
| static void |
| insert_hbrp (void) |
| { |
| rtx_insn *insn; |
| if (TARGET_SAFE_HINTS) |
| { |
| shorten_branches (get_insns ()); |
| /* Insert hbrp at beginning of function */ |
| insn = next_active_insn (get_insns ()); |
| if (insn) |
| insert_hbrp_for_ilb_runout (insn); |
| /* Insert hbrp after hinted targets. */ |
| for (insn = get_insns (); insn; insn = NEXT_INSN (insn)) |
| if ((LABEL_P (insn) && HINTED_P (insn)) || CALL_P (insn)) |
| insert_hbrp_for_ilb_runout (next_active_insn (insn)); |
| } |
| } |
| |
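| /* Nonzero while spu_machine_dependent_reorg is running, so the |
| scheduler hooks can behave differently during the reorg pass. */ |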
| static int in_spu_reorg; |
| |
| static void |
| spu_var_tracking (void) |
| { |
| if (flag_var_tracking) |
| { |
| df_analyze (); |
| timevar_push (TV_VAR_TRACKING); |
| variable_tracking_main (); |
| timevar_pop (TV_VAR_TRACKING); |
| df_finish_pass (false); |
| } |
| } |
| |
| /* Insert branch hints. There are no branch optimizations after this |
| pass, so it's safe to set our branch hints now. */ |
| static void |
| spu_machine_dependent_reorg (void) |
| { |
| sbitmap blocks; |
| basic_block bb; |
| rtx_insn *branch, *insn; |
| rtx branch_target = 0; |
| int branch_addr = 0, insn_addr, required_dist = 0; |
| int i; |
| unsigned int j; |
| |
| if (!TARGET_BRANCH_HINTS || optimize == 0) |
| { |
| /* We still do it for unoptimized code because an external |
| function might have hinted a call or return. */ |
| compute_bb_for_insn (); |
| insert_hbrp (); |
| pad_bb (); |
| spu_var_tracking (); |
| free_bb_for_insn (); |
| return; |
| } |
| |
| blocks = sbitmap_alloc (last_basic_block_for_fn (cfun)); |
| bitmap_clear (blocks); |
| |
| in_spu_reorg = 1; |
| compute_bb_for_insn (); |
| |
| /* (Re-)discover loops so that bb->loop_father can be used |
| in the analysis below. */ |
| loop_optimizer_init (AVOID_CFG_MODIFICATIONS); |
| |
| compact_blocks (); |
| |
| spu_bb_info = |
| (struct spu_bb_info *) xcalloc (n_basic_blocks_for_fn (cfun), |
| sizeof (struct spu_bb_info)); |
| |
| /* We need exact insn addresses and lengths. */ |
| shorten_branches (get_insns ()); |
| |
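| /* Walk the blocks from last to first; a hint recorded in spu_bb_info |
| for a lower-numbered predecessor is then still pending when the |
| loop reaches that block. */ |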
| for (i = n_basic_blocks_for_fn (cfun) - 1; i >= 0; i--) |
| { |
| bb = BASIC_BLOCK_FOR_FN (cfun, i); |
| branch = 0; |
| if (spu_bb_info[i].prop_jump) |
| { |
| branch = spu_bb_info[i].prop_jump; |
| branch_target = get_branch_target (branch); |
| branch_addr = INSN_ADDRESSES (INSN_UID (branch)); |
| required_dist = spu_hint_dist; |
| } |
| /* Search from the end of the block to its beginning. In this loop, |
| find jumps which need a hint and emit the hint only when: |
| - it's an indirect branch and we're at the insn which sets |
| the register |
| - we're at an insn that will invalidate the hint. e.g., a |
| call, another hint insn, inline asm that clobbers $hbr, and |
| some inlined operations (divmodsi4). Don't consider jumps |
| because they are only at the end of a block and are |
| considered when we are deciding whether to propagate |
| - we're getting too far away from the branch. The hbr insns |
| only have a signed 10-bit offset. |
| We go back as far as possible so the branch will be considered |
| for propagation when we get to the beginning of the block. */ |
| for (insn = BB_END (bb); insn; insn = PREV_INSN (insn)) |
| { |
| if (INSN_P (insn)) |
| { |
| insn_addr = INSN_ADDRESSES (INSN_UID (insn)); |
| if (branch |
| && ((GET_CODE (branch_target) == REG |
| && set_of (branch_target, insn) != NULL_RTX) |
| || insn_clobbers_hbr (insn) |
| || branch_addr - insn_addr > 600)) |
| { |
| rtx_insn *next = NEXT_INSN (insn); |
| int next_addr = INSN_ADDRESSES (INSN_UID (next)); |
| if (insn != BB_END (bb) |
| && branch_addr - next_addr >= required_dist) |
| { |
| if (dump_file) |
| fprintf (dump_file, |
| "hint for %i in block %i before %i\n", |
| INSN_UID (branch), bb->index, |
| INSN_UID (next)); |
| spu_emit_branch_hint (next, branch, branch_target, |
| branch_addr - next_addr, blocks); |
| } |
| branch = 0; |
| } |
| |
| /* JUMP_P will only be true at the end of a block. When |
| branch is already set it means we've previously decided |
| to propagate a hint for that branch into this block. */ |
| if (CALL_P (insn) || (JUMP_P (insn) && !branch)) |
| { |
| branch = 0; |
| if ((branch_target = get_branch_target (insn))) |
| { |
| branch = insn; |
| branch_addr = insn_addr; |
| required_dist = spu_hint_dist; |
| } |
| } |
| } |
| if (insn == BB_HEAD (bb)) |
| break; |
| } |
| |
| if (branch) |
| { |
| /* If we haven't emitted a hint for this branch yet, it might |
| be profitable to emit it in one of the predecessor blocks, |
| especially for loops. */ |
| rtx_insn *bbend; |
| basic_block prev = 0, prop = 0, prev2 = 0; |
| int loop_exit = 0, simple_loop = 0; |
| int next_addr = INSN_ADDRESSES (INSN_UID (NEXT_INSN (insn))); |
| |
| for (j = 0; j < EDGE_COUNT (bb->preds); j++) |
| if (EDGE_PRED (bb, j)->flags & EDGE_FALLTHRU) |
| prev = EDGE_PRED (bb, j)->src; |
| else |
| prev2 = EDGE_PRED (bb, j)->src; |
| |
| for (j = 0; j < EDGE_COUNT (bb->succs); j++) |
| if (EDGE_SUCC (bb, j)->flags & EDGE_LOOP_EXIT) |
| loop_exit = 1; |
| else if (EDGE_SUCC (bb, j)->dest == bb) |
| simple_loop = 1; |
| |
| /* If this branch is a loop exit then propagate to previous |
| fallthru block. This catches the cases when it is a simple |
| loop or when there is an initial branch into the loop. */ |
| if (prev && (loop_exit || simple_loop) |
| && bb_loop_depth (prev) <= bb_loop_depth (bb)) |
| prop = prev; |
| |
| /* If there is only one adjacent predecessor, don't propagate |
| outside this loop. */ |
| else if (prev && single_pred_p (bb) |
| && prev->loop_father == bb->loop_father) |
| prop = prev; |
| |
| /* If this is the JOIN block of a simple IF-THEN, propagate |
| the hint to the HEADER block. */ |
| else if (prev && prev2 |
| && EDGE_COUNT (bb->preds) == 2 |
| && EDGE_COUNT (prev->preds) == 1 |
| && EDGE_PRED (prev, 0)->src == prev2 |
| && prev2->loop_father == bb->loop_father |
| && GET_CODE (branch_target) != REG) |
| prop = prev; |
| |
| /* Don't propagate when: |
| - this is a simple loop and the hint would be too far |
| - this is not a simple loop and there are 16 insns in |
| this block already |
| - the predecessor block ends in a branch that will be |
| hinted |
| - the predecessor block ends in an insn that invalidates |
| the hint */ |
| if (prop |
| && prop->index >= 0 |
| && (bbend = BB_END (prop)) |
| && branch_addr - INSN_ADDRESSES (INSN_UID (bbend)) < |
| (simple_loop ? 600 : 16 * 4) && get_branch_target (bbend) == 0 |
| && (JUMP_P (bbend) || !insn_clobbers_hbr (bbend))) |
| { |
| if (dump_file) |
| fprintf (dump_file, "propagate from %i to %i (loop depth %i) " |
| "for %i (loop_exit %i simple_loop %i dist %i)\n", |
| bb->index, prop->index, bb_loop_depth (bb), |
| INSN_UID (branch), loop_exit, simple_loop, |
| branch_addr - INSN_ADDRESSES (INSN_UID (bbend))); |
| |
| spu_bb_info[prop->index].prop_jump = branch; |
| spu_bb_info[prop->index].bb_index = i; |
| } |
| else if (branch_addr - next_addr >= required_dist) |
| { |
| if (dump_file) |
| fprintf (dump_file, "hint for %i in block %i before %i\n", |
| INSN_UID (branch), bb->index, |
| INSN_UID (NEXT_INSN (insn))); |
| spu_emit_branch_hint (NEXT_INSN (insn), branch, branch_target, |
| branch_addr - next_addr, blocks); |
| } |
| branch = 0; |
| } |
| } |
| free (spu_bb_info); |
| |
| if (!bitmap_empty_p (blocks)) |
| find_many_sub_basic_blocks (blocks); |
| |
| /* We have to schedule to make sure alignment is ok. */ |
| FOR_EACH_BB_FN (bb, cfun) bb->flags &= ~BB_DISABLE_SCHEDULE; |
| |
| /* The hints need to be scheduled, so run the scheduler again. */ |
| schedule_insns (); |
| df_finish_pass (true); |
| |
| insert_hbrp (); |
| |
| pad_bb (); |
| |
| for (insn = get_insns (); insn; insn = NEXT_INSN (insn)) |
| if (NONJUMP_INSN_P (insn) && INSN_CODE (insn) == CODE_FOR_hbr) |
| { |
| /* Adjust the LABEL_REF in a hint when we have inserted a nop |
| between its branch label and the branch. We don't move the |
| label because GCC expects it at the beginning of the block. */ |
| rtx unspec = SET_SRC (XVECEXP (PATTERN (insn), 0, 0)); |
| rtx label_ref = XVECEXP (unspec, 0, 0); |
| rtx_insn *label = as_a <rtx_insn *> (XEXP (label_ref, 0)); |
| rtx_insn *branch; |
| int offset = 0; |
| for (branch = NEXT_INSN (label); |
| !JUMP_P (branch) && !CALL_P (branch); |
| branch = NEXT_INSN (branch)) |
| if (NONJUMP_INSN_P (branch)) |
| offset += get_attr_length (branch); |
| if (offset > 0) |
| XVECEXP (unspec, 0, 0) = plus_constant (Pmode, label_ref, offset); |
| } |
| |
| spu_var_tracking (); |
| |
| loop_optimizer_finalize (); |
| |
| free_bb_for_insn (); |
| |
| in_spu_reorg = 0; |
| } |
| |
| |
| /* Insn scheduling routines, primarily for dual issue. */ |
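| |
| /* The SPU can issue up to two insns per cycle, one to the even |
| pipeline and one to the odd pipeline. */ |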
| static int |
| spu_sched_issue_rate (void) |
| { |
| return 2; |
| } |
| |
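| /* Return 1 when INSN is a load or store, i.e. when it occupies the |
| load/store unit. */ |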
| static int |
| uses_ls_unit (rtx_insn *insn) |
| { |
| rtx set = single_set (insn); |
| if (set != 0 |
| && (GET_CODE (SET_DEST (set)) == MEM |
| || GET_CODE (SET_SRC (set)) == MEM)) |
| return 1; |
| return 0; |
| } |
| |
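| /* Return which pipe INSN issues on: 0 for the even pipe (fixed- and |
| floating-point arithmetic), 1 for the odd pipe (loads, stores, |
| branches, shuffles), -1 for MULTI0 and inline asm, -2 for |
| TYPE_CONVERT. */ |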
| static int |
| get_pipe (rtx_insn *insn) |
| { |
| enum attr_type t; |
| /* Handle inline asm */ |
| if (INSN_CODE (insn) == -1) |
| return -1; |
| t = get_attr_type (insn); |
| switch (t) |
| { |
| case TYPE_CONVERT: |
| return -2; |
| case TYPE_MULTI0: |
| return -1; |
| |
| case TYPE_FX2: |
| case TYPE_FX3: |
| case TYPE_SPR: |
| case TYPE_NOP: |
| case TYPE_FXB: |
| case TYPE_FPD: |
| case TYPE_FP6: |
| case TYPE_FP7: |
| return 0; |
| |
| case TYPE_LNOP: |
| case TYPE_SHUF: |
| case TYPE_LOAD: |
| case TYPE_STORE: |
| case TYPE_BR: |
| case TYPE_MULTI1: |
| case TYPE_HBR: |
| case TYPE_IPREFETCH: |
| return 1; |
| default: |
| abort (); |
| } |
| } |
| |
| |
| /* haifa-sched.c has a static variable that keeps track of the current |
| cycle. It is passed to spu_sched_reorder, and we record it here for |
| use by spu_sched_variable_issue. It won't be accurate if the |
| scheduler updates its clock_var between the two calls. */ |
| static int clock_var; |
| |
| /* This is used to keep track of insn alignment. Set to 0 at the |
| beginning of each block and increased by the "length" attr of each |
| insn scheduled. */ |
| static int spu_sched_length; |
| |
| /* Record when we've issued pipe0 and pipe1 insns so we can reorder the |
| ready list appropriately in spu_sched_reorder(). */ |
| static int pipe0_clock; |
| static int pipe1_clock; |
| |
| static int prev_clock_var; |
| |
| static int prev_priority; |
| |
| /* The SPU needs to load the next ilb (instruction line buffer) |
| sometime during the execution of the previous ilb. There is a |
| potential conflict if every cycle has a load or store. To avoid |
| the conflict we make sure the load/store unit is free for at least |
| one cycle during the execution of insns in the previous ilb. */ |
| static int spu_ls_first; |
| static int prev_ls_clock; |
| |
| static void |
| spu_sched_init_global (FILE *file ATTRIBUTE_UNUSED, int verbose ATTRIBUTE_UNUSED, |
| int max_ready ATTRIBUTE_UNUSED) |
| { |
| spu_sched_length = 0; |
| } |
| |
| static void |
| spu_sched_init (FILE *file ATTRIBUTE_UNUSED, int verbose ATTRIBUTE_UNUSED, |
| int max_ready ATTRIBUTE_UNUSED) |
| { |
| if (align_labels > 4 || align_loops > 4 || align_jumps > 4) |
| { |
| /* When any block might be at least 8-byte aligned, assume they |
| will all be at least 8-byte aligned to make sure dual issue |
| works out correctly. */ |
| spu_sched_length = 0; |
| } |
| spu_ls_first = INT_MAX; |
| clock_var = -1; |
| prev_ls_clock = -1; |
| pipe0_clock = -1; |
| pipe1_clock = -1; |
| prev_clock_var = -1; |
| prev_priority = -1; |
| } |
| |
| static int |
| spu_sched_variable_issue (FILE *file ATTRIBUTE_UNUSED, |
| int verbose ATTRIBUTE_UNUSED, |
| rtx_insn *insn, int more) |
| { |
| int len; |
| int p; |
| if (GET_CODE (PATTERN (insn)) == USE |
| || GET_CODE (PATTERN (insn)) == CLOBBER |
| || (len = get_attr_length (insn)) == 0) |
| return more; |
| |
| spu_sched_length += len; |
| |
| /* Reset on inline asm */ |
| if (INSN_CODE (insn) == -1) |
| { |
| spu_ls_first = INT_MAX; |
| pipe0_clock = -1; |
| pipe1_clock = -1; |
| return 0; |
| } |
| p = get_pipe (insn); |
| if (p == 0) |
| pipe0_clock = clock_var; |
| else |
| pipe1_clock = clock_var; |
| |
| if (in_spu_reorg) |
| { |
| if (clock_var - prev_ls_clock > 1 |
| || INSN_CODE (insn) == CODE_FOR_iprefetch) |
| spu_ls_first = INT_MAX; |
| if (uses_ls_unit (insn)) |
| { |
| if (spu_ls_first == INT_MAX) |
| spu_ls_first = spu_sched_length; |
| prev_ls_clock = clock_var; |
| } |
| |
| /* The scheduler hasn't inserted the nop, but we will later on. |
| Include those nops in spu_sched_length. */ |
| if (prev_clock_var == clock_var && (spu_sched_length & 7)) |
| spu_sched_length += 4; |
| prev_clock_var = clock_var; |
| |
| /* MORE is -1 when called from spu_sched_reorder for new insns |
| that don't have INSN_PRIORITY. */ |
| if (more >= 0) |
| prev_priority = INSN_PRIORITY (insn); |
| } |
| |
| /* Always try issuing more insns. spu_sched_reorder will decide |
| when the cycle should be advanced. */ |
| return 1; |
| } |
| |
| /* This function is called for both TARGET_SCHED_REORDER and |
| TARGET_SCHED_REORDER2. */ |
| static int |
| spu_sched_reorder (FILE *file ATTRIBUTE_UNUSED, int verbose ATTRIBUTE_UNUSED, |
| rtx_insn **ready, int *nreadyp, int clock) |
| { |
| int i, nready = *nreadyp; |
| int pipe_0, pipe_1, pipe_hbrp, pipe_ls, schedule_i; |
| rtx_insn *insn; |
| |
| clock_var = clock; |
| |
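| /* A pipe 1 insn issues second within a cycle, so once one has been |
| issued this cycle the pair is complete and the cycle can advance. */ |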
| if (nready <= 0 || pipe1_clock >= clock) |
| return 0; |
| |
| /* Find any rtl insns that don't generate assembly insns and schedule |
| them first. */ |
| for (i = nready - 1; i >= 0; i--) |
| { |
| insn = ready[i]; |
| if (INSN_CODE (insn) == -1 |
| || INSN_CODE (insn) == CODE_FOR_blockage |
| || (INSN_P (insn) && get_attr_length (insn) == 0)) |
| { |
| ready[i] = ready[nready - 1]; |
| ready[nready - 1] = insn; |
| return 1; |
| } |
| } |
| |
| pipe_0 = pipe_1 = pipe_hbrp = pipe_ls = schedule_i = -1; |
| for (i = 0; i < nready; i++) |
| if (INSN_CODE (ready[i]) != -1) |
| { |
| insn = ready[i]; |
| switch (get_attr_type (insn)) |
| { |
| default: |
| case TYPE_MULTI0: |
| case TYPE_CONVERT: |
| case TYPE_FX2: |
| case TYPE_FX3: |
| case TYPE_SPR: |
| case TYPE_NOP: |
| case TYPE_FXB: |
| case TYPE_FPD: |
| case TYPE_FP6: |
| case TYPE_FP7: |
| pipe_0 = i; |
| break; |
| case TYPE_LOAD: |
| case TYPE_STORE: |
| pipe_ls = i; |
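| /* Loads and stores also issue on pipe 1; fall through. */ |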
| case TYPE_LNOP: |
| case TYPE_SHUF: |
| case TYPE_BR: |
| case TYPE_MULTI1: |
| case TYPE_HBR: |
| pipe_1 = i; |
| break; |
| case TYPE_IPREFETCH: |
| pipe_hbrp = i; |
| break; |
| } |
| } |
| |
| /* In the first scheduling phase, schedule loads and stores together |
| to increase the chance they will get merged during postreload CSE. */ |
| if (!reload_completed && pipe_ls >= 0) |
| { |
| insn = ready[pipe_ls]; |
| ready[pipe_ls] = ready[nready - 1]; |
| ready[nready - 1] = insn; |
| return 1; |
| } |
| |
| /* If there is an hbrp ready, prefer it over other pipe 1 insns. */ |
| if (pipe_hbrp >= 0) |
| pipe_1 = pipe_hbrp; |
| |
| /* When we have loads/stores in every cycle of the last 15 insns and |
| we are about to schedule another load/store, emit an hbrp insn |
| instead. */ |
| if (in_spu_reorg |
| && spu_sched_length - spu_ls_first >= 4 * 15 |
| && !(pipe0_clock < clock && pipe_0 >= 0) && pipe_1 == pipe_ls) |
| { |
| insn = sched_emit_insn (gen_iprefetch (GEN_INT (3))); |
| recog_memoized (insn); |
| if (pipe0_clock < clock) |
| PUT_MODE (insn, TImode); |
| spu_sched_variable_issue (file, verbose, insn, -1); |
| return 0; |
| } |
| |
| /* In general, we want to emit nops to increase dual issue, but dual |
| issue isn't faster when one of the insns could be scheduled later |
| without affecting the critical path. We look at INSN_PRIORITY to |
| make a good guess, but it isn't perfect, so -mdual-nops=n can be |
| used to tune it. */ |
| if (in_spu_reorg && spu_dual_nops < 10) |
| { |
| /* When we are at an even address and we are not issuing nops to |
| improve scheduling then we need to advance the cycle. */ |
| if ((spu_sched_length & 7) == 0 && prev_clock_var == clock |
| && (spu_dual_nops == 0 |
| || (pipe_1 != -1 |
| && prev_priority > |
| INSN_PRIORITY (ready[pipe_1]) + spu_dual_nops))) |
| return 0; |
| |
| /* When at an odd address, schedule the highest priority insn |
| without considering pipeline. */ |
| if ((spu_sched_length & 7) == 4 && prev_clock_var != clock |
| && (spu_dual_nops == 0 |
| || (prev_priority > |
| INSN_PRIORITY (ready[nready - 1]) + spu_dual_nops))) |
| return 1; |
| } |
| |
| |
| /* We haven't issued a pipe0 insn yet this cycle; if there is a |
| pipe0 insn in the ready list, schedule it. */ |
| if (pipe0_clock < clock && pipe_0 >= 0) |
| schedule_i = pipe_0; |
| |
| /* Either we've scheduled a pipe0 insn already or there is no pipe0 |
| insn to schedule. Put a pipe1 insn at the front of the ready list. */ |
| else |
| schedule_i = pipe_1; |
| |
| if (schedule_i > -1) |
| { |
| insn = ready[schedule_i]; |
| ready[schedule_i] = ready[nready - 1]; |
| ready[nready - 1] = insn; |
| return 1; |
| } |
| return 0; |
| } |
| |
| /* INSN is dependent on DEP_INSN. */ |
| static int |
| spu_sched_adjust_cost (rtx_insn *insn, rtx link, rtx_insn *dep_insn, int cost) |
| { |
| rtx set; |
| |
| /* The blockage pattern is used to prevent instructions from being |
| moved across it and has no cost. */ |
| if (INSN_CODE (insn) == CODE_FOR_blockage |
| |