/* Copyright (C) 2006-2015 Free Software Foundation, Inc.
This file is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your option)
any later version.
This file is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
for more details.
You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3. If not see
<http://www.gnu.org/licenses/>. */
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "tm.h"
#include "rtl.h"
#include "regs.h"
#include "hard-reg-set.h"
#include "insn-config.h"
#include "conditions.h"
#include "insn-attr.h"
#include "flags.h"
#include "recog.h"
#include "obstack.h"
#include "hash-set.h"
#include "machmode.h"
#include "vec.h"
#include "double-int.h"
#include "input.h"
#include "alias.h"
#include "symtab.h"
#include "wide-int.h"
#include "inchash.h"
#include "tree.h"
#include "fold-const.h"
#include "stringpool.h"
#include "stor-layout.h"
#include "calls.h"
#include "varasm.h"
#include "hashtab.h"
#include "function.h"
#include "statistics.h"
#include "real.h"
#include "fixed-value.h"
#include "expmed.h"
#include "dojump.h"
#include "explow.h"
#include "emit-rtl.h"
#include "stmt.h"
#include "expr.h"
#include "insn-codes.h"
#include "optabs.h"
#include "except.h"
#include "output.h"
#include "predict.h"
#include "dominance.h"
#include "cfg.h"
#include "cfgrtl.h"
#include "cfganal.h"
#include "lcm.h"
#include "cfgbuild.h"
#include "cfgcleanup.h"
#include "basic-block.h"
#include "diagnostic-core.h"
#include "ggc.h"
#include "tm_p.h"
#include "target.h"
#include "target-def.h"
#include "langhooks.h"
#include "reload.h"
#include "sched-int.h"
#include "params.h"
#include "hash-table.h"
#include "tree-ssa-alias.h"
#include "internal-fn.h"
#include "gimple-fold.h"
#include "tree-eh.h"
#include "gimple-expr.h"
#include "is-a.h"
#include "gimple.h"
#include "gimplify.h"
#include "tm-constrs.h"
#include "sbitmap.h"
#include "df.h"
#include "ddg.h"
#include "timevar.h"
#include "dumpfile.h"
#include "cfgloop.h"
#include "builtins.h"
#include "rtl-iter.h"
/* Builtin types, data and prototypes. */
enum spu_builtin_type_index
{
SPU_BTI_END_OF_PARAMS,
/* We create new type nodes for these. */
SPU_BTI_V16QI,
SPU_BTI_V8HI,
SPU_BTI_V4SI,
SPU_BTI_V2DI,
SPU_BTI_V4SF,
SPU_BTI_V2DF,
SPU_BTI_UV16QI,
SPU_BTI_UV8HI,
SPU_BTI_UV4SI,
SPU_BTI_UV2DI,
/* A 16-byte type. (Implemented with V16QI_type_node) */
SPU_BTI_QUADWORD,
/* These all correspond to intSI_type_node */
SPU_BTI_7,
SPU_BTI_S7,
SPU_BTI_U7,
SPU_BTI_S10,
SPU_BTI_S10_4,
SPU_BTI_U14,
SPU_BTI_16,
SPU_BTI_S16,
SPU_BTI_S16_2,
SPU_BTI_U16,
SPU_BTI_U16_2,
SPU_BTI_U18,
/* These correspond to the standard types */
SPU_BTI_INTQI,
SPU_BTI_INTHI,
SPU_BTI_INTSI,
SPU_BTI_INTDI,
SPU_BTI_UINTQI,
SPU_BTI_UINTHI,
SPU_BTI_UINTSI,
SPU_BTI_UINTDI,
SPU_BTI_FLOAT,
SPU_BTI_DOUBLE,
SPU_BTI_VOID,
SPU_BTI_PTR,
SPU_BTI_MAX
};
#define V16QI_type_node (spu_builtin_types[SPU_BTI_V16QI])
#define V8HI_type_node (spu_builtin_types[SPU_BTI_V8HI])
#define V4SI_type_node (spu_builtin_types[SPU_BTI_V4SI])
#define V2DI_type_node (spu_builtin_types[SPU_BTI_V2DI])
#define V4SF_type_node (spu_builtin_types[SPU_BTI_V4SF])
#define V2DF_type_node (spu_builtin_types[SPU_BTI_V2DF])
#define unsigned_V16QI_type_node (spu_builtin_types[SPU_BTI_UV16QI])
#define unsigned_V8HI_type_node (spu_builtin_types[SPU_BTI_UV8HI])
#define unsigned_V4SI_type_node (spu_builtin_types[SPU_BTI_UV4SI])
#define unsigned_V2DI_type_node (spu_builtin_types[SPU_BTI_UV2DI])
static GTY(()) tree spu_builtin_types[SPU_BTI_MAX];
struct spu_builtin_range
{
int low, high;
};
static struct spu_builtin_range spu_builtin_range[] = {
{-0x40ll, 0x7fll}, /* SPU_BTI_7 */
{-0x40ll, 0x3fll}, /* SPU_BTI_S7 */
{0ll, 0x7fll}, /* SPU_BTI_U7 */
{-0x200ll, 0x1ffll}, /* SPU_BTI_S10 */
{-0x2000ll, 0x1fffll}, /* SPU_BTI_S10_4 */
{0ll, 0x3fffll}, /* SPU_BTI_U14 */
{-0x8000ll, 0xffffll}, /* SPU_BTI_16 */
{-0x8000ll, 0x7fffll}, /* SPU_BTI_S16 */
{-0x20000ll, 0x1ffffll}, /* SPU_BTI_S16_2 */
{0ll, 0xffffll}, /* SPU_BTI_U16 */
{0ll, 0x3ffffll}, /* SPU_BTI_U16_2 */
{0ll, 0x3ffffll}, /* SPU_BTI_U18 */
};
/* Target specific attribute specifications. */
char regs_ever_allocated[FIRST_PSEUDO_REGISTER];
/* Prototypes and external defs. */
static int get_pipe (rtx_insn *insn);
static int spu_naked_function_p (tree func);
static int mem_is_padded_component_ref (rtx x);
static void fix_range (const char *);
static rtx spu_expand_load (rtx, rtx, rtx, int);
/* Which instruction set architecture to use. */
int spu_arch;
/* Which cpu are we tuning for. */
int spu_tune;
/* The hardware requires 8 insns between a hint and the branch it
affects. This variable describes how many rtl instructions the
compiler needs to see before inserting a hint, and then the compiler
will insert enough nops to make it at least 8 insns. The default is
for the compiler to allow up to 2 nops to be emitted. The nops are
inserted in pairs, so we round down. */
int spu_hint_dist = (8*4) - (2*4);
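/* The SPU instructions that can load or combine an immediate value;
SPU_NONE means no single instruction applies. */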
enum spu_immediate {
SPU_NONE,
SPU_IL,
SPU_ILA,
SPU_ILH,
SPU_ILHU,
SPU_ORI,
SPU_ORHI,
SPU_ORBI,
SPU_IOHL
};
enum immediate_class
{
IC_POOL, /* constant pool */
IC_IL1, /* one il* instruction */
IC_IL2, /* both ilhu and iohl instructions */
IC_IL1s, /* one il* instruction */
IC_IL2s, /* both ilhu and iohl instructions */
IC_FSMBI, /* the fsmbi instruction */
IC_CPAT, /* one of the c*d instructions */
IC_FSMBI2 /* fsmbi plus 1 other instruction */
};
static enum spu_immediate which_immediate_load (HOST_WIDE_INT val);
static enum spu_immediate which_logical_immediate (HOST_WIDE_INT val);
static int cpat_info(unsigned char *arr, int size, int *prun, int *pstart);
static enum immediate_class classify_immediate (rtx op,
machine_mode mode);
/* Pointer mode for __ea references. */
#define EAmode (spu_ea_model != 32 ? DImode : SImode)
/* Define the structure for the machine field in struct function. */
struct GTY(()) machine_function
{
/* Register to use for PIC accesses. */
rtx pic_reg;
};
/* How to allocate a 'struct machine_function'. */
static struct machine_function *
spu_init_machine_status (void)
{
return ggc_cleared_alloc<machine_function> ();
}
/* Implement TARGET_OPTION_OVERRIDE. */
static void
spu_option_override (void)
{
/* Set up function hooks. */
init_machine_status = spu_init_machine_status;
/* Small loops will be unpeeled at -O3. For SPU it is more important
to keep code small by default. */
if (!flag_unroll_loops && !flag_peel_loops)
maybe_set_param_value (PARAM_MAX_COMPLETELY_PEEL_TIMES, 4,
global_options.x_param_values,
global_options_set.x_param_values);
flag_omit_frame_pointer = 1;
/* Functions must be 8-byte aligned so we correctly handle dual issue. */
if (align_functions < 8)
align_functions = 8;
spu_hint_dist = 8*4 - spu_max_nops*4;
if (spu_hint_dist < 0)
spu_hint_dist = 0;
if (spu_fixed_range_string)
fix_range (spu_fixed_range_string);
/* Determine processor architectural level. */
if (spu_arch_string)
{
if (strcmp (&spu_arch_string[0], "cell") == 0)
spu_arch = PROCESSOR_CELL;
else if (strcmp (&spu_arch_string[0], "celledp") == 0)
spu_arch = PROCESSOR_CELLEDP;
else
error ("bad value (%s) for -march= switch", spu_arch_string);
}
/* Determine processor to tune for. */
if (spu_tune_string)
{
if (strcmp (&spu_tune_string[0], "cell") == 0)
spu_tune = PROCESSOR_CELL;
else if (strcmp (&spu_tune_string[0], "celledp") == 0)
spu_tune = PROCESSOR_CELLEDP;
else
error ("bad value (%s) for -mtune= switch", spu_tune_string);
}
/* Change defaults according to the processor architecture. */
if (spu_arch == PROCESSOR_CELLEDP)
{
/* If no command line option has been otherwise specified, change
the default to -mno-safe-hints on celledp -- only the original
Cell/B.E. processors require this workaround. */
if (!(target_flags_explicit & MASK_SAFE_HINTS))
target_flags &= ~MASK_SAFE_HINTS;
}
REAL_MODE_FORMAT (SFmode) = &spu_single_format;
}
/* Handle an attribute requiring a FUNCTION_DECL; arguments as in
struct attribute_spec.handler. */
/* True if MODE is valid for the target. By "valid", we mean able to
be manipulated in non-trivial ways. In particular, this means all
the arithmetic is supported. */
static bool
spu_scalar_mode_supported_p (machine_mode mode)
{
switch (mode)
{
case QImode:
case HImode:
case SImode:
case SFmode:
case DImode:
case TImode:
case DFmode:
return true;
default:
return false;
}
}
/* Similarly for vector modes. "Supported" here is less strict. At
least some operations are supported; need to check optabs or builtins
for further details. */
static bool
spu_vector_mode_supported_p (machine_mode mode)
{
switch (mode)
{
case V16QImode:
case V8HImode:
case V4SImode:
case V2DImode:
case V4SFmode:
case V2DFmode:
return true;
default:
return false;
}
}
/* GCC assumes that in a paradoxical SUBREG the inner mode occupies the
least significant bytes of the outer mode. This function returns
TRUE for the SUBREGs where this is correct. */
int
valid_subreg (rtx op)
{
machine_mode om = GET_MODE (op);
machine_mode im = GET_MODE (SUBREG_REG (op));
return om != VOIDmode && im != VOIDmode
&& (GET_MODE_SIZE (im) == GET_MODE_SIZE (om)
|| (GET_MODE_SIZE (im) <= 4 && GET_MODE_SIZE (om) <= 4)
|| (GET_MODE_SIZE (im) >= 16 && GET_MODE_SIZE (om) >= 16));
}
/* When insv and ext[sz]v are passed a TI SUBREG, we want to strip it off
and adjust the start offset. */
static rtx
adjust_operand (rtx op, HOST_WIDE_INT * start)
{
machine_mode mode;
int op_size;
/* Strip any paradoxical SUBREG. */
if (GET_CODE (op) == SUBREG
&& (GET_MODE_BITSIZE (GET_MODE (op))
> GET_MODE_BITSIZE (GET_MODE (SUBREG_REG (op)))))
{
if (start)
*start -=
GET_MODE_BITSIZE (GET_MODE (op)) -
GET_MODE_BITSIZE (GET_MODE (SUBREG_REG (op)));
op = SUBREG_REG (op);
}
/* If it is smaller than SI, ensure a SUBREG. */
op_size = GET_MODE_BITSIZE (GET_MODE (op));
if (op_size < 32)
{
if (start)
*start += 32 - op_size;
op_size = 32;
}
/* If it is not a MODE_INT (and/or it is smaller than SI) add a SUBREG. */
mode = mode_for_size (op_size, MODE_INT, 0);
if (mode != GET_MODE (op))
op = gen_rtx_SUBREG (mode, op, 0);
return op;
}
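/* Expand a bit-field extract. ops[0] is the TImode destination,
ops[1] the source (a MEM, a TImode register, or a SUBREG of a scalar
integer register), ops[2] the field width in bits and ops[3] the
starting bit. UNSIGNEDP selects zero rather than sign extension. */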
void
spu_expand_extv (rtx ops[], int unsignedp)
{
rtx dst = ops[0], src = ops[1];
HOST_WIDE_INT width = INTVAL (ops[2]);
HOST_WIDE_INT start = INTVAL (ops[3]);
HOST_WIDE_INT align_mask;
rtx s0, s1, mask, r0;
gcc_assert (REG_P (dst) && GET_MODE (dst) == TImode);
if (MEM_P (src))
{
/* First, determine if we need 1 TImode load or 2. We need only 1
if the bits being extracted do not cross the alignment boundary
as determined by the MEM and its address. */
align_mask = -MEM_ALIGN (src);
if ((start & align_mask) == ((start + width - 1) & align_mask))
{
/* Alignment is sufficient for 1 load. */
s0 = gen_reg_rtx (TImode);
r0 = spu_expand_load (s0, 0, src, start / 8);
start &= 7;
if (r0)
emit_insn (gen_rotqby_ti (s0, s0, r0));
}
else
{
/* Need 2 loads. */
s0 = gen_reg_rtx (TImode);
s1 = gen_reg_rtx (TImode);
r0 = spu_expand_load (s0, s1, src, start / 8);
start &= 7;
gcc_assert (start + width <= 128);
if (r0)
{
rtx r1 = gen_reg_rtx (SImode);
mask = gen_reg_rtx (TImode);
emit_move_insn (mask, GEN_INT (-1));
emit_insn (gen_rotqby_ti (s0, s0, r0));
emit_insn (gen_rotqby_ti (s1, s1, r0));
if (GET_CODE (r0) == CONST_INT)
r1 = GEN_INT (INTVAL (r0) & 15);
else
emit_insn (gen_andsi3 (r1, r0, GEN_INT (15)));
emit_insn (gen_shlqby_ti (mask, mask, r1));
emit_insn (gen_selb (s0, s1, s0, mask));
}
}
}
else if (GET_CODE (src) == SUBREG)
{
rtx r = SUBREG_REG (src);
gcc_assert (REG_P (r) && SCALAR_INT_MODE_P (GET_MODE (r)));
s0 = gen_reg_rtx (TImode);
if (GET_MODE_SIZE (GET_MODE (r)) < GET_MODE_SIZE (TImode))
emit_insn (gen_rtx_SET (VOIDmode, s0, gen_rtx_ZERO_EXTEND (TImode, r)));
else
emit_move_insn (s0, src);
}
else
{
gcc_assert (REG_P (src) && GET_MODE (src) == TImode);
s0 = gen_reg_rtx (TImode);
emit_move_insn (s0, src);
}
/* Now s0 is TImode and contains the bits to extract at start. */
if (start)
emit_insn (gen_rotlti3 (s0, s0, GEN_INT (start)));
if (128 - width)
s0 = expand_shift (RSHIFT_EXPR, TImode, s0, 128 - width, s0, unsignedp);
emit_move_insn (dst, s0);
}
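/* Expand a bit-field insert: store the value ops[3] into the field of
ops[0] which is ops[1] bits wide and starts at bit ops[2]. */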
void
spu_expand_insv (rtx ops[])
{
HOST_WIDE_INT width = INTVAL (ops[1]);
HOST_WIDE_INT start = INTVAL (ops[2]);
HOST_WIDE_INT maskbits;
machine_mode dst_mode;
rtx dst = ops[0], src = ops[3];
int dst_size;
rtx mask;
rtx shift_reg;
int shift;
if (GET_CODE (ops[0]) == MEM)
dst = gen_reg_rtx (TImode);
else
dst = adjust_operand (dst, &start);
dst_mode = GET_MODE (dst);
dst_size = GET_MODE_BITSIZE (GET_MODE (dst));
if (CONSTANT_P (src))
{
machine_mode m =
(width <= 32 ? SImode : width <= 64 ? DImode : TImode);
src = force_reg (m, convert_to_mode (m, src, 0));
}
src = adjust_operand (src, 0);
mask = gen_reg_rtx (dst_mode);
shift_reg = gen_reg_rtx (dst_mode);
shift = dst_size - start - width;
/* It's not safe to use subreg here because the compiler assumes
that the SUBREG_REG is right justified in the SUBREG. */
convert_move (shift_reg, src, 1);
if (shift > 0)
{
switch (dst_mode)
{
case SImode:
emit_insn (gen_ashlsi3 (shift_reg, shift_reg, GEN_INT (shift)));
break;
case DImode:
emit_insn (gen_ashldi3 (shift_reg, shift_reg, GEN_INT (shift)));
break;
case TImode:
emit_insn (gen_ashlti3 (shift_reg, shift_reg, GEN_INT (shift)));
break;
default:
abort ();
}
}
else if (shift < 0)
abort ();
switch (dst_size)
{
case 32:
maskbits = (-1ll << (32 - width - start));
if (start)
maskbits += (1ll << (32 - start));
emit_move_insn (mask, GEN_INT (maskbits));
break;
case 64:
maskbits = (-1ll << (64 - width - start));
if (start)
maskbits += (1ll << (64 - start));
emit_move_insn (mask, GEN_INT (maskbits));
break;
case 128:
{
unsigned char arr[16];
int i = start / 8;
memset (arr, 0, sizeof (arr));
arr[i] = 0xff >> (start & 7);
for (i++; i <= (start + width - 1) / 8; i++)
arr[i] = 0xff;
arr[i - 1] &= 0xff << (7 - ((start + width - 1) & 7));
emit_move_insn (mask, array_to_constant (TImode, arr));
}
break;
default:
abort ();
}
if (GET_CODE (ops[0]) == MEM)
{
rtx low = gen_reg_rtx (SImode);
rtx rotl = gen_reg_rtx (SImode);
rtx mask0 = gen_reg_rtx (TImode);
rtx addr;
rtx addr0;
rtx addr1;
rtx mem;
addr = force_reg (Pmode, XEXP (ops[0], 0));
addr0 = gen_rtx_AND (Pmode, addr, GEN_INT (-16));
emit_insn (gen_andsi3 (low, addr, GEN_INT (15)));
emit_insn (gen_negsi2 (rotl, low));
emit_insn (gen_rotqby_ti (shift_reg, shift_reg, rotl));
emit_insn (gen_rotqmby_ti (mask0, mask, rotl));
mem = change_address (ops[0], TImode, addr0);
set_mem_alias_set (mem, 0);
emit_move_insn (dst, mem);
emit_insn (gen_selb (dst, dst, shift_reg, mask0));
if (start + width > MEM_ALIGN (ops[0]))
{
rtx shl = gen_reg_rtx (SImode);
rtx mask1 = gen_reg_rtx (TImode);
rtx dst1 = gen_reg_rtx (TImode);
rtx mem1;
addr1 = plus_constant (Pmode, addr, 16);
addr1 = gen_rtx_AND (Pmode, addr1, GEN_INT (-16));
emit_insn (gen_subsi3 (shl, GEN_INT (16), low));
emit_insn (gen_shlqby_ti (mask1, mask, shl));
mem1 = change_address (ops[0], TImode, addr1);
set_mem_alias_set (mem1, 0);
emit_move_insn (dst1, mem1);
emit_insn (gen_selb (dst1, dst1, shift_reg, mask1));
emit_move_insn (mem1, dst1);
}
emit_move_insn (mem, dst);
}
else
emit_insn (gen_selb (dst, copy_rtx (dst), shift_reg, mask));
}
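/* Expand a block move. ops[0] is the destination MEM, ops[1] the
source MEM, ops[2] the byte count and ops[3] the alignment. Return 1
if the move was handled here, 0 to let the caller fall back to a
library call. Only small, 16-byte-aligned copies are handled. */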
int
spu_expand_block_move (rtx ops[])
{
HOST_WIDE_INT bytes, align, offset;
rtx src, dst, sreg, dreg, target;
int i;
if (GET_CODE (ops[2]) != CONST_INT
|| GET_CODE (ops[3]) != CONST_INT
|| INTVAL (ops[2]) > (HOST_WIDE_INT) (MOVE_RATIO (optimize_insn_for_speed_p ()) * 8))
return 0;
bytes = INTVAL (ops[2]);
align = INTVAL (ops[3]);
if (bytes <= 0)
return 1;
dst = ops[0];
src = ops[1];
if (align == 16)
{
for (offset = 0; offset + 16 <= bytes; offset += 16)
{
dst = adjust_address (ops[0], V16QImode, offset);
src = adjust_address (ops[1], V16QImode, offset);
emit_move_insn (dst, src);
}
if (offset < bytes)
{
rtx mask;
unsigned char arr[16] = { 0 };
for (i = 0; i < bytes - offset; i++)
arr[i] = 0xff;
dst = adjust_address (ops[0], V16QImode, offset);
src = adjust_address (ops[1], V16QImode, offset);
mask = gen_reg_rtx (V16QImode);
sreg = gen_reg_rtx (V16QImode);
dreg = gen_reg_rtx (V16QImode);
target = gen_reg_rtx (V16QImode);
emit_move_insn (mask, array_to_constant (V16QImode, arr));
emit_move_insn (dreg, dst);
emit_move_insn (sreg, src);
emit_insn (gen_selb (target, dreg, sreg, mask));
emit_move_insn (dst, target);
}
return 1;
}
return 0;
}
enum spu_comp_code
{ SPU_EQ, SPU_GT, SPU_GTU };
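/* Instruction codes for the compare patterns, indexed first by operand
mode (QI, HI, SI, DI, TI, SF, DF, V16QI, V8HI, V4SI, V4SF, V2DF) and
then by spu_comp_code. A zero entry means the comparison is not
available in that mode. */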
int spu_comp_icode[12][3] = {
{CODE_FOR_ceq_qi, CODE_FOR_cgt_qi, CODE_FOR_clgt_qi},
{CODE_FOR_ceq_hi, CODE_FOR_cgt_hi, CODE_FOR_clgt_hi},
{CODE_FOR_ceq_si, CODE_FOR_cgt_si, CODE_FOR_clgt_si},
{CODE_FOR_ceq_di, CODE_FOR_cgt_di, CODE_FOR_clgt_di},
{CODE_FOR_ceq_ti, CODE_FOR_cgt_ti, CODE_FOR_clgt_ti},
{CODE_FOR_ceq_sf, CODE_FOR_cgt_sf, 0},
{CODE_FOR_ceq_df, CODE_FOR_cgt_df, 0},
{CODE_FOR_ceq_v16qi, CODE_FOR_cgt_v16qi, CODE_FOR_clgt_v16qi},
{CODE_FOR_ceq_v8hi, CODE_FOR_cgt_v8hi, CODE_FOR_clgt_v8hi},
{CODE_FOR_ceq_v4si, CODE_FOR_cgt_v4si, CODE_FOR_clgt_v4si},
{CODE_FOR_ceq_v4sf, CODE_FOR_cgt_v4sf, 0},
{CODE_FOR_ceq_v2df, CODE_FOR_cgt_v2df, 0},
};
/* Generate a compare for CODE and emit the corresponding branch, set,
or select: IS_SET is 0 when emitting a branch, 1 when setting an
integer result, and 2 when emitting a conditional select. GCC could
figure the compares out too if we didn't provide all the variations,
but since GCC always wants to use WORD_MODE we can generate better
code in most cases by doing it ourselves. */
void
spu_emit_branch_or_set (int is_set, rtx cmp, rtx operands[])
{
int reverse_compare = 0;
int reverse_test = 0;
rtx compare_result, eq_result;
rtx comp_rtx, eq_rtx;
machine_mode comp_mode;
machine_mode op_mode;
enum spu_comp_code scode, eq_code;
enum insn_code ior_code;
enum rtx_code code = GET_CODE (cmp);
rtx op0 = XEXP (cmp, 0);
rtx op1 = XEXP (cmp, 1);
int index;
int eq_test = 0;
/* When op1 is a CONST_INT change (X >= C) to (X > C-1),
and so on, to keep the constant in operand 1. */
if (GET_CODE (op1) == CONST_INT)
{
HOST_WIDE_INT val = INTVAL (op1) - 1;
if (trunc_int_for_mode (val, GET_MODE (op0)) == val)
switch (code)
{
case GE:
op1 = GEN_INT (val);
code = GT;
break;
case LT:
op1 = GEN_INT (val);
code = LE;
break;
case GEU:
op1 = GEN_INT (val);
code = GTU;
break;
case LTU:
op1 = GEN_INT (val);
code = LEU;
break;
default:
break;
}
}
/* However, if we generate an integer result, performing a reverse test
would require an extra negation, so avoid that where possible. */
if (GET_CODE (op1) == CONST_INT && is_set == 1)
{
HOST_WIDE_INT val = INTVAL (op1) + 1;
if (trunc_int_for_mode (val, GET_MODE (op0)) == val)
switch (code)
{
case LE:
op1 = GEN_INT (val);
code = LT;
break;
case LEU:
op1 = GEN_INT (val);
code = LTU;
break;
default:
break;
}
}
comp_mode = SImode;
op_mode = GET_MODE (op0);
switch (code)
{
case GE:
scode = SPU_GT;
if (HONOR_NANS (op_mode))
{
reverse_compare = 0;
reverse_test = 0;
eq_test = 1;
eq_code = SPU_EQ;
}
else
{
reverse_compare = 1;
reverse_test = 1;
}
break;
case LE:
scode = SPU_GT;
if (HONOR_NANS (op_mode))
{
reverse_compare = 1;
reverse_test = 0;
eq_test = 1;
eq_code = SPU_EQ;
}
else
{
reverse_compare = 0;
reverse_test = 1;
}
break;
case LT:
reverse_compare = 1;
reverse_test = 0;
scode = SPU_GT;
break;
case GEU:
reverse_compare = 1;
reverse_test = 1;
scode = SPU_GTU;
break;
case LEU:
reverse_compare = 0;
reverse_test = 1;
scode = SPU_GTU;
break;
case LTU:
reverse_compare = 1;
reverse_test = 0;
scode = SPU_GTU;
break;
case NE:
reverse_compare = 0;
reverse_test = 1;
scode = SPU_EQ;
break;
case EQ:
scode = SPU_EQ;
break;
case GT:
scode = SPU_GT;
break;
case GTU:
scode = SPU_GTU;
break;
default:
scode = SPU_EQ;
break;
}
switch (op_mode)
{
case QImode:
index = 0;
comp_mode = QImode;
break;
case HImode:
index = 1;
comp_mode = HImode;
break;
case SImode:
index = 2;
break;
case DImode:
index = 3;
break;
case TImode:
index = 4;
break;
case SFmode:
index = 5;
break;
case DFmode:
index = 6;
break;
case V16QImode:
index = 7;
comp_mode = op_mode;
break;
case V8HImode:
index = 8;
comp_mode = op_mode;
break;
case V4SImode:
index = 9;
comp_mode = op_mode;
break;
case V4SFmode:
index = 10;
comp_mode = V4SImode;
break;
case V2DFmode:
index = 11;
comp_mode = V2DImode;
break;
case V2DImode:
default:
abort ();
}
if (GET_MODE (op1) == DFmode
&& (scode != SPU_GT && scode != SPU_EQ))
abort ();
if (is_set == 0 && op1 == const0_rtx
&& (GET_MODE (op0) == SImode
|| GET_MODE (op0) == HImode
|| GET_MODE (op0) == QImode) && scode == SPU_EQ)
{
/* Don't need to set a register with the result when we are
comparing against zero and branching. */
reverse_test = !reverse_test;
compare_result = op0;
}
else
{
compare_result = gen_reg_rtx (comp_mode);
if (reverse_compare)
{
rtx t = op1;
op1 = op0;
op0 = t;
}
if (spu_comp_icode[index][scode] == 0)
abort ();
if (!(*insn_data[spu_comp_icode[index][scode]].operand[1].predicate)
(op0, op_mode))
op0 = force_reg (op_mode, op0);
if (!(*insn_data[spu_comp_icode[index][scode]].operand[2].predicate)
(op1, op_mode))
op1 = force_reg (op_mode, op1);
comp_rtx = GEN_FCN (spu_comp_icode[index][scode]) (compare_result,
op0, op1);
if (comp_rtx == 0)
abort ();
emit_insn (comp_rtx);
if (eq_test)
{
eq_result = gen_reg_rtx (comp_mode);
eq_rtx = GEN_FCN (spu_comp_icode[index][eq_code]) (eq_result,
op0, op1);
if (eq_rtx == 0)
abort ();
emit_insn (eq_rtx);
ior_code = optab_handler (ior_optab, comp_mode);
gcc_assert (ior_code != CODE_FOR_nothing);
emit_insn (GEN_FCN (ior_code)
(compare_result, compare_result, eq_result));
}
}
if (is_set == 0)
{
rtx bcomp;
rtx loc_ref;
/* We don't have branch on QI compare insns, so we convert the
QI compare result to a HI result. */
if (comp_mode == QImode)
{
rtx old_res = compare_result;
compare_result = gen_reg_rtx (HImode);
comp_mode = HImode;
emit_insn (gen_extendqihi2 (compare_result, old_res));
}
if (reverse_test)
bcomp = gen_rtx_EQ (comp_mode, compare_result, const0_rtx);
else
bcomp = gen_rtx_NE (comp_mode, compare_result, const0_rtx);
loc_ref = gen_rtx_LABEL_REF (VOIDmode, operands[3]);
emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx,
gen_rtx_IF_THEN_ELSE (VOIDmode, bcomp,
loc_ref, pc_rtx)));
}
else if (is_set == 2)
{
rtx target = operands[0];
int compare_size = GET_MODE_BITSIZE (comp_mode);
int target_size = GET_MODE_BITSIZE (GET_MODE (target));
machine_mode mode = mode_for_size (target_size, MODE_INT, 0);
rtx select_mask;
rtx op_t = operands[2];
rtx op_f = operands[3];
/* The result of the comparison can be SI, HI or QI mode. Create a
mask based on that result. */
if (target_size > compare_size)
{
select_mask = gen_reg_rtx (mode);
emit_insn (gen_extend_compare (select_mask, compare_result));
}
else if (target_size < compare_size)
select_mask =
gen_rtx_SUBREG (mode, compare_result,
(compare_size - target_size) / BITS_PER_UNIT);
else if (comp_mode != mode)
select_mask = gen_rtx_SUBREG (mode, compare_result, 0);
else
select_mask = compare_result;
if (GET_MODE (target) != GET_MODE (op_t)
|| GET_MODE (target) != GET_MODE (op_f))
abort ();
if (reverse_test)
emit_insn (gen_selb (target, op_t, op_f, select_mask));
else
emit_insn (gen_selb (target, op_f, op_t, select_mask));
}
else
{
rtx target = operands[0];
if (reverse_test)
emit_insn (gen_rtx_SET (VOIDmode, compare_result,
gen_rtx_NOT (comp_mode, compare_result)));
if (GET_MODE (target) == SImode && GET_MODE (compare_result) == HImode)
emit_insn (gen_extendhisi2 (target, compare_result));
else if (GET_MODE (target) == SImode
&& GET_MODE (compare_result) == QImode)
emit_insn (gen_extend_compare (target, compare_result));
else
emit_move_insn (target, compare_result);
}
}
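/* Return the target bit pattern of the SFmode or DFmode CONST_DOUBLE X
as a HOST_WIDE_INT. */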
HOST_WIDE_INT
const_double_to_hwint (rtx x)
{
HOST_WIDE_INT val;
REAL_VALUE_TYPE rv;
if (GET_MODE (x) == SFmode)
{
REAL_VALUE_FROM_CONST_DOUBLE (rv, x);
REAL_VALUE_TO_TARGET_SINGLE (rv, val);
}
else if (GET_MODE (x) == DFmode)
{
long l[2];
REAL_VALUE_FROM_CONST_DOUBLE (rv, x);
REAL_VALUE_TO_TARGET_DOUBLE (rv, l);
val = l[0];
val = (val << 32) | (l[1] & 0xffffffff);
}
else
abort ();
return val;
}
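/* The inverse of const_double_to_hwint: build an SFmode or DFmode
CONST_DOUBLE whose target bit pattern is V. */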
rtx
hwint_to_const_double (machine_mode mode, HOST_WIDE_INT v)
{
long tv[2];
REAL_VALUE_TYPE rv;
gcc_assert (mode == SFmode || mode == DFmode);
if (mode == SFmode)
tv[0] = (v << 32) >> 32;
else if (mode == DFmode)
{
tv[1] = (v << 32) >> 32;
tv[0] = v >> 32;
}
real_from_target (&rv, tv, mode);
return CONST_DOUBLE_FROM_REAL_VALUE (rv, mode);
}
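/* Output the address ADDR in assembler syntax: a plain register,
register plus register, register plus constant, or a symbolic
constant. An AND with -16 (used to force quadword alignment) is
stripped off first. */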
void
print_operand_address (FILE * file, register rtx addr)
{
rtx reg;
rtx offset;
if (GET_CODE (addr) == AND
&& GET_CODE (XEXP (addr, 1)) == CONST_INT
&& INTVAL (XEXP (addr, 1)) == -16)
addr = XEXP (addr, 0);
switch (GET_CODE (addr))
{
case REG:
fprintf (file, "0(%s)", reg_names[REGNO (addr)]);
break;
case PLUS:
reg = XEXP (addr, 0);
offset = XEXP (addr, 1);
if (GET_CODE (offset) == REG)
{
fprintf (file, "%s,%s", reg_names[REGNO (reg)],
reg_names[REGNO (offset)]);
}
else if (GET_CODE (offset) == CONST_INT)
{
fprintf (file, HOST_WIDE_INT_PRINT_DEC "(%s)",
INTVAL (offset), reg_names[REGNO (reg)]);
}
else
abort ();
break;
case CONST:
case LABEL_REF:
case SYMBOL_REF:
case CONST_INT:
output_addr_const (file, addr);
break;
default:
debug_rtx (addr);
abort ();
}
}
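/* Output operand X in assembler syntax. CODE selects one of several
target-specific encodings of immediates and addressing forms; the
meaning of each letter is noted in the cases below. */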
void
print_operand (FILE * file, rtx x, int code)
{
machine_mode mode = GET_MODE (x);
HOST_WIDE_INT val;
unsigned char arr[16];
int xcode = GET_CODE (x);
int i, info;
if (GET_MODE (x) == VOIDmode)
switch (code)
{
case 'L': /* 128 bits, signed */
case 'm': /* 128 bits, signed */
case 'T': /* 128 bits, signed */
case 't': /* 128 bits, signed */
mode = TImode;
break;
case 'K': /* 64 bits, signed */
case 'k': /* 64 bits, signed */
case 'D': /* 64 bits, signed */
case 'd': /* 64 bits, signed */
mode = DImode;
break;
case 'J': /* 32 bits, signed */
case 'j': /* 32 bits, signed */
case 's': /* 32 bits, signed */
case 'S': /* 32 bits, signed */
mode = SImode;
break;
}
switch (code)
{
case 'j': /* 32 bits, signed */
case 'k': /* 64 bits, signed */
case 'm': /* 128 bits, signed */
if (xcode == CONST_INT
|| xcode == CONST_DOUBLE || xcode == CONST_VECTOR)
{
gcc_assert (logical_immediate_p (x, mode));
constant_to_array (mode, x, arr);
val = (arr[0] << 24) | (arr[1] << 16) | (arr[2] << 8) | arr[3];
val = trunc_int_for_mode (val, SImode);
switch (which_logical_immediate (val))
{
case SPU_ORI:
break;
case SPU_ORHI:
fprintf (file, "h");
break;
case SPU_ORBI:
fprintf (file, "b");
break;
default:
gcc_unreachable();
}
}
else
gcc_unreachable();
return;
case 'J': /* 32 bits, signed */
case 'K': /* 64 bits, signed */
case 'L': /* 128 bits, signed */
if (xcode == CONST_INT
|| xcode == CONST_DOUBLE || xcode == CONST_VECTOR)
{
gcc_assert (logical_immediate_p (x, mode)
|| iohl_immediate_p (x, mode));
constant_to_array (mode, x, arr);
val = (arr[0] << 24) | (arr[1] << 16) | (arr[2] << 8) | arr[3];
val = trunc_int_for_mode (val, SImode);
switch (which_logical_immediate (val))
{
case SPU_ORI:
case SPU_IOHL:
break;
case SPU_ORHI:
val = trunc_int_for_mode (val, HImode);
break;
case SPU_ORBI:
val = trunc_int_for_mode (val, QImode);
break;
default:
gcc_unreachable();
}
fprintf (file, HOST_WIDE_INT_PRINT_DEC, val);
}
else
gcc_unreachable();
return;
case 't': /* 128 bits, signed */
case 'd': /* 64 bits, signed */
case 's': /* 32 bits, signed */
if (CONSTANT_P (x))
{
enum immediate_class c = classify_immediate (x, mode);
switch (c)
{
case IC_IL1:
constant_to_array (mode, x, arr);
val = (arr[0] << 24) | (arr[1] << 16) | (arr[2] << 8) | arr[3];
val = trunc_int_for_mode (val, SImode);
switch (which_immediate_load (val))
{
case SPU_IL:
break;
case SPU_ILA:
fprintf (file, "a");
break;
case SPU_ILH:
fprintf (file, "h");
break;
case SPU_ILHU:
fprintf (file, "hu");
break;
default:
gcc_unreachable ();
}
break;
case IC_CPAT:
constant_to_array (mode, x, arr);
cpat_info (arr, GET_MODE_SIZE (mode), &info, 0);
if (info == 1)
fprintf (file, "b");
else if (info == 2)
fprintf (file, "h");
else if (info == 4)
fprintf (file, "w");
else if (info == 8)
fprintf (file, "d");
break;
case IC_IL1s:
if (xcode == CONST_VECTOR)
{
x = CONST_VECTOR_ELT (x, 0);
xcode = GET_CODE (x);
}
if (xcode == SYMBOL_REF || xcode == LABEL_REF || xcode == CONST)
fprintf (file, "a");
else if (xcode == HIGH)
fprintf (file, "hu");
break;
case IC_FSMBI:
case IC_FSMBI2:
case IC_IL2:
case IC_IL2s:
case IC_POOL:
abort ();
}
}
else
gcc_unreachable ();
return;
case 'T': /* 128 bits, signed */
case 'D': /* 64 bits, signed */
case 'S': /* 32 bits, signed */
if (CONSTANT_P (x))
{
enum immediate_class c = classify_immediate (x, mode);
switch (c)
{
case IC_IL1:
constant_to_array (mode, x, arr);
val = (arr[0] << 24) | (arr[1] << 16) | (arr[2] << 8) | arr[3];
val = trunc_int_for_mode (val, SImode);
switch (which_immediate_load (val))
{
case SPU_IL:
case SPU_ILA:
break;
case SPU_ILH:
case SPU_ILHU:
val = trunc_int_for_mode (((arr[0] << 8) | arr[1]), HImode);
break;
default:
gcc_unreachable ();
}
fprintf (file, HOST_WIDE_INT_PRINT_DEC, val);
break;
case IC_FSMBI:
constant_to_array (mode, x, arr);
val = 0;
for (i = 0; i < 16; i++)
{
val <<= 1;
val |= arr[i] & 1;
}
print_operand (file, GEN_INT (val), 0);
break;
case IC_CPAT:
constant_to_array (mode, x, arr);
cpat_info (arr, GET_MODE_SIZE (mode), 0, &info);
fprintf (file, HOST_WIDE_INT_PRINT_DEC, (HOST_WIDE_INT)info);
break;
case IC_IL1s:
if (xcode == HIGH)
x = XEXP (x, 0);
if (GET_CODE (x) == CONST_VECTOR)
x = CONST_VECTOR_ELT (x, 0);
output_addr_const (file, x);
if (xcode == HIGH)
fprintf (file, "@h");
break;
case IC_IL2:
case IC_IL2s:
case IC_FSMBI2:
case IC_POOL:
abort ();
}
}
else
gcc_unreachable ();
return;
case 'C':
if (xcode == CONST_INT)
{
/* Only the 4 least significant bits are relevant for the generate
control word instructions. */
fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x) & 15);
return;
}
break;
case 'M': /* print code for c*d */
if (GET_CODE (x) == CONST_INT)
switch (INTVAL (x))
{
case 1:
fprintf (file, "b");
break;
case 2:
fprintf (file, "h");
break;
case 4:
fprintf (file, "w");
break;
case 8:
fprintf (file, "d");
break;
default:
gcc_unreachable();
}
else
gcc_unreachable();
return;
case 'N': /* Negate the operand */
if (xcode == CONST_INT)
fprintf (file, HOST_WIDE_INT_PRINT_DEC, -INTVAL (x));
else if (xcode == CONST_VECTOR)
fprintf (file, HOST_WIDE_INT_PRINT_DEC,
-INTVAL (CONST_VECTOR_ELT (x, 0)));
return;
case 'I': /* enable/disable interrupts */
if (xcode == CONST_INT)
fprintf (file, "%s", INTVAL (x) == 0 ? "d" : "e");
return;
case 'b': /* branch modifiers */
if (xcode == REG)
fprintf (file, "%s", GET_MODE (x) == HImode ? "h" : "");
else if (COMPARISON_P (x))
fprintf (file, "%s", xcode == NE ? "n" : "");
return;
case 'i': /* indirect call */
if (xcode == MEM)
{
if (GET_CODE (XEXP (x, 0)) == REG)
/* Used in indirect function calls. */
fprintf (file, "%s", reg_names[REGNO (XEXP (x, 0))]);
else
output_address (XEXP (x, 0));
}
return;
case 'p': /* load/store */
if (xcode == MEM)
{
x = XEXP (x, 0);
xcode = GET_CODE (x);
}
if (xcode == AND)
{
x = XEXP (x, 0);
xcode = GET_CODE (x);
}
if (xcode == REG)
fprintf (file, "d");
else if (xcode == CONST_INT)
fprintf (file, "a");
else if (xcode == CONST || xcode == SYMBOL_REF || xcode == LABEL_REF)
fprintf (file, "r");
else if (xcode == PLUS || xcode == LO_SUM)
{
if (GET_CODE (XEXP (x, 1)) == REG)
fprintf (file, "x");
else
fprintf (file, "d");
}
return;
case 'e':
val = xcode == CONST_INT ? INTVAL (x) : INTVAL (CONST_VECTOR_ELT (x, 0));
val &= 0x7;
output_addr_const (file, GEN_INT (val));
return;
case 'f':
val = xcode == CONST_INT ? INTVAL (x) : INTVAL (CONST_VECTOR_ELT (x, 0));
val &= 0x1f;
output_addr_const (file, GEN_INT (val));
return;
case 'g':
val = xcode == CONST_INT ? INTVAL (x) : INTVAL (CONST_VECTOR_ELT (x, 0));
val &= 0x3f;
output_addr_const (file, GEN_INT (val));
return;
case 'h':
val = xcode == CONST_INT ? INTVAL (x) : INTVAL (CONST_VECTOR_ELT (x, 0));
val = (val >> 3) & 0x1f;
output_addr_const (file, GEN_INT (val));
return;
case 'E':
val = xcode == CONST_INT ? INTVAL (x) : INTVAL (CONST_VECTOR_ELT (x, 0));
val = -val;
val &= 0x7;
output_addr_const (file, GEN_INT (val));
return;
case 'F':
val = xcode == CONST_INT ? INTVAL (x) : INTVAL (CONST_VECTOR_ELT (x, 0));
val = -val;
val &= 0x1f;
output_addr_const (file, GEN_INT (val));
return;
case 'G':
val = xcode == CONST_INT ? INTVAL (x) : INTVAL (CONST_VECTOR_ELT (x, 0));
val = -val;
val &= 0x3f;
output_addr_const (file, GEN_INT (val));
return;
case 'H':
val = xcode == CONST_INT ? INTVAL (x) : INTVAL (CONST_VECTOR_ELT (x, 0));
val = -(val & -8ll);
val = (val >> 3) & 0x1f;
output_addr_const (file, GEN_INT (val));
return;
case 'v':
case 'w':
constant_to_array (mode, x, arr);
val = (((arr[0] << 1) + (arr[1] >> 7)) & 0xff) - 127;
output_addr_const (file, GEN_INT (code == 'w' ? -val : val));
return;
case 0:
if (xcode == REG)
fprintf (file, "%s", reg_names[REGNO (x)]);
else if (xcode == MEM)
output_address (XEXP (x, 0));
else if (xcode == CONST_VECTOR)
print_operand (file, CONST_VECTOR_ELT (x, 0), 0);
else
output_addr_const (file, x);
return;
/* unused letters
o qr u yz
AB OPQR UVWXYZ */
default:
output_operand_lossage ("invalid %%xn code");
}
gcc_unreachable ();
}
/* For PIC mode we've reserved PIC_OFFSET_TABLE_REGNUM, which is a
caller saved register. For leaf functions it is more efficient to
use a volatile register because we won't need to save and restore the
pic register. This routine is only valid after register allocation
is completed, so we can pick an unused register. */
static rtx
get_pic_reg (void)
{
if (!reload_completed && !reload_in_progress)
abort ();
/* If we've already made the decision, we need to keep with it. Once we've
decided to use LAST_ARG_REGNUM, future calls to df_regs_ever_live_p may
return true since the register is now live; this should not cause us to
"switch back" to using pic_offset_table_rtx. */
if (!cfun->machine->pic_reg)
{
if (crtl->is_leaf && !df_regs_ever_live_p (LAST_ARG_REGNUM))
cfun->machine->pic_reg = gen_rtx_REG (SImode, LAST_ARG_REGNUM);
else
cfun->machine->pic_reg = pic_offset_table_rtx;
}
return cfun->machine->pic_reg;
}
/* Split constant addresses to handle cases that are too large.
Add in the pic register when in PIC mode.
Split immediates that require more than 1 instruction. */
int
spu_split_immediate (rtx * ops)
{
machine_mode mode = GET_MODE (ops[0]);
enum immediate_class c = classify_immediate (ops[1], mode);
switch (c)
{
case IC_IL2:
{
unsigned char arrhi[16];
unsigned char arrlo[16];
rtx to, temp, hi, lo;
int i;
machine_mode imode = mode;
/* We need to do reals as ints because the constant used in the
IOR might not be a legitimate real constant. */
imode = int_mode_for_mode (mode);
constant_to_array (mode, ops[1], arrhi);
if (imode != mode)
to = simplify_gen_subreg (imode, ops[0], mode, 0);
else
to = ops[0];
temp = !can_create_pseudo_p () ? to : gen_reg_rtx (imode);
for (i = 0; i < 16; i += 4)
{
arrlo[i + 2] = arrhi[i + 2];
arrlo[i + 3] = arrhi[i + 3];
arrlo[i + 0] = arrlo[i + 1] = 0;
arrhi[i + 2] = arrhi[i + 3] = 0;
}
hi = array_to_constant (imode, arrhi);
lo = array_to_constant (imode, arrlo);
emit_move_insn (temp, hi);
emit_insn (gen_rtx_SET
(VOIDmode, to, gen_rtx_IOR (imode, temp, lo)));
return 1;
}
case IC_FSMBI2:
{
unsigned char arr_fsmbi[16];
unsigned char arr_andbi[16];
rtx to, reg_fsmbi, reg_and;
int i;
machine_mode imode = mode;
/* We need to do reals as ints because the constant used in the
* AND might not be a legitimate real constant. */
imode = int_mode_for_mode (mode);
constant_to_array (mode, ops[1], arr_fsmbi);
if (imode != mode)
to = simplify_gen_subreg(imode, ops[0], GET_MODE (ops[0]), 0);
else
to = ops[0];
for (i = 0; i < 16; i++)
if (arr_fsmbi[i] != 0)
{
arr_andbi[0] = arr_fsmbi[i];
arr_fsmbi[i] = 0xff;
}
for (i = 1; i < 16; i++)
arr_andbi[i] = arr_andbi[0];
reg_fsmbi = array_to_constant (imode, arr_fsmbi);
reg_and = array_to_constant (imode, arr_andbi);
emit_move_insn (to, reg_fsmbi);
emit_insn (gen_rtx_SET
(VOIDmode, to, gen_rtx_AND (imode, to, reg_and)));
return 1;
}
case IC_POOL:
if (reload_in_progress || reload_completed)
{
rtx mem = force_const_mem (mode, ops[1]);
if (TARGET_LARGE_MEM)
{
rtx addr = gen_rtx_REG (Pmode, REGNO (ops[0]));
emit_move_insn (addr, XEXP (mem, 0));
mem = replace_equiv_address (mem, addr);
}
emit_move_insn (ops[0], mem);
return 1;
}
break;
case IC_IL1s:
case IC_IL2s:
if (reload_completed && GET_CODE (ops[1]) != HIGH)
{
if (c == IC_IL2s)
{
emit_move_insn (ops[0], gen_rtx_HIGH (mode, ops[1]));
emit_move_insn (ops[0], gen_rtx_LO_SUM (mode, ops[0], ops[1]));
}
else if (flag_pic)
emit_insn (gen_pic (ops[0], ops[1]));
if (flag_pic)
{
rtx pic_reg = get_pic_reg ();
emit_insn (gen_addsi3 (ops[0], ops[0], pic_reg));
}
return flag_pic || c == IC_IL2s;
}
break;
case IC_IL1:
case IC_FSMBI:
case IC_CPAT:
break;
}
return 0;
}
/* SAVING is TRUE when we are generating the actual load and store
instructions for REGNO. When determining the size of the stack
needed for saving registers we must allocate enough space for the
worst case, because we don't always have the information early enough
to not allocate it. But we can at least eliminate the actual loads
and stores during the prologue/epilogue. */
static int
need_to_save_reg (int regno, int saving)
{
if (df_regs_ever_live_p (regno) && !call_used_regs[regno])
return 1;
if (flag_pic
&& regno == PIC_OFFSET_TABLE_REGNUM
&& (!saving || cfun->machine->pic_reg == pic_offset_table_rtx))
return 1;
return 0;
}
/* This function is only correct starting with local register
allocation */
int
spu_saved_regs_size (void)
{
int reg_save_size = 0;
int regno;
for (regno = FIRST_PSEUDO_REGISTER - 1; regno >= 0; --regno)
if (need_to_save_reg (regno, 0))
reg_save_size += 0x10;
return reg_save_size;
}
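/* Emit a store of register REGNO to the stack slot at ADDR + OFFSET.
V4SImode is used because every SPU register is 128 bits wide. */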
static rtx_insn *
frame_emit_store (int regno, rtx addr, HOST_WIDE_INT offset)
{
rtx reg = gen_rtx_REG (V4SImode, regno);
rtx mem =
gen_frame_mem (V4SImode, gen_rtx_PLUS (Pmode, addr, GEN_INT (offset)));
return emit_insn (gen_movv4si (mem, reg));
}
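/* Emit a load of register REGNO from the stack slot at ADDR + OFFSET. */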
static rtx_insn *
frame_emit_load (int regno, rtx addr, HOST_WIDE_INT offset)
{
rtx reg = gen_rtx_REG (V4SImode, regno);
rtx mem =
gen_frame_mem (V4SImode, gen_rtx_PLUS (Pmode, addr, GEN_INT (offset)));
return emit_insn (gen_movv4si (reg, mem));
}
/* This happens after reload, so we need to expand it. */
static rtx_insn *
frame_emit_add_imm (rtx dst, rtx src, HOST_WIDE_INT imm, rtx scratch)
{
rtx_insn *insn;
if (satisfies_constraint_K (GEN_INT (imm)))
{
insn = emit_insn (gen_addsi3 (dst, src, GEN_INT (imm)));
}
else
{
emit_insn (gen_movsi (scratch, gen_int_mode (imm, SImode)));
insn = emit_insn (gen_addsi3 (dst, src, scratch));
if (REGNO (src) == REGNO (scratch))
abort ();
}
return insn;
}
/* Return nonzero if this function is known to have a null epilogue. */
int
direct_return (void)
{
if (reload_completed)
{
if (cfun->static_chain_decl == 0
&& (spu_saved_regs_size ()
+ get_frame_size ()
+ crtl->outgoing_args_size
+ crtl->args.pretend_args_size == 0)
&& crtl->is_leaf)
return 1;
}
return 0;
}
/*
The stack frame looks like this:
+-------------+
| incoming |
| args |
AP -> +-------------+
| $lr save |
+-------------+
prev SP | back chain |
+-------------+
| var args |
| reg save | crtl->args.pretend_args_size bytes
+-------------+
| ... |
| saved regs | spu_saved_regs_size() bytes
FP -> +-------------+
| ... |
| vars | get_frame_size() bytes
HFP -> +-------------+
| ... |
| outgoing |
| args | crtl->outgoing_args_size bytes
+-------------+
| $lr of next |
| frame |
+-------------+
| back chain |
SP -> +-------------+
*/
void
spu_expand_prologue (void)
{
HOST_WIDE_INT size = get_frame_size (), offset, regno;
HOST_WIDE_INT total_size;
HOST_WIDE_INT saved_regs_size;
rtx sp_reg = gen_rtx_REG (Pmode, STACK_POINTER_REGNUM);
rtx scratch_reg_0, scratch_reg_1;
rtx_insn *insn;
rtx real;
if (flag_pic && optimize == 0 && !cfun->machine->pic_reg)
cfun->machine->pic_reg = pic_offset_table_rtx;
if (spu_naked_function_p (current_function_decl))
return;
scratch_reg_0 = gen_rtx_REG (SImode, LAST_ARG_REGNUM + 1);
scratch_reg_1 = gen_rtx_REG (SImode, LAST_ARG_REGNUM + 2);
saved_regs_size = spu_saved_regs_size ();
total_size = size + saved_regs_size
+ crtl->outgoing_args_size
+ crtl->args.pretend_args_size;
if (!crtl->is_leaf
|| cfun->calls_alloca || total_size > 0)
total_size += STACK_POINTER_OFFSET;
/* Save this first because code after this might use the link
register as a scratch register. */
if (!crtl->is_leaf)
{
insn = frame_emit_store (LINK_REGISTER_REGNUM, sp_reg, 16);
RTX_FRAME_RELATED_P (insn) = 1;
}
if (total_size > 0)
{
offset = -crtl->args.pretend_args_size;
for (regno = 0; regno < FIRST_PSEUDO_REGISTER; ++regno)
if (need_to_save_reg (regno, 1))
{
offset -= 16;
insn = frame_emit_store (regno, sp_reg, offset);
RTX_FRAME_RELATED_P (insn) = 1;
}
}
if (flag_pic && cfun->machine->pic_reg)
{
rtx pic_reg = cfun->machine->pic_reg;
insn = emit_insn (gen_load_pic_offset (pic_reg, scratch_reg_0));
insn = emit_insn (gen_subsi3 (pic_reg, pic_reg, scratch_reg_0));
}
if (total_size > 0)
{
if (flag_stack_check)
{
/* We compare against total_size-1 because
($sp >= total_size) <=> ($sp > total_size-1) */
rtx scratch_v4si = gen_rtx_REG (V4SImode, REGNO (scratch_reg_0));
rtx sp_v4si = gen_rtx_REG (V4SImode, STACK_POINTER_REGNUM);
rtx size_v4si = spu_const (V4SImode, total_size - 1);
if (!satisfies_constraint_K (GEN_INT (total_size - 1)))
{
emit_move_insn (scratch_v4si, size_v4si);
size_v4si = scratch_v4si;
}
emit_insn (gen_cgt_v4si (scratch_v4si, sp_v4si, size_v4si));
emit_insn (gen_vec_extractv4si
(scratch_reg_0, scratch_v4si, GEN_INT (1)));
emit_insn (gen_spu_heq (scratch_reg_0, GEN_INT (0)));
}
/* Adjust the stack pointer, and make sure scratch_reg_0 contains
the value of the previous $sp because we save it as the back
chain. */
if (total_size <= 2000)
{
/* In this case we save the back chain first. */
insn = frame_emit_store (STACK_POINTER_REGNUM, sp_reg, -total_size);
insn =
frame_emit_add_imm (sp_reg, sp_reg, -total_size, scratch_reg_0);
}
else
{
insn = emit_move_insn (scratch_reg_0, sp_reg);
insn =
frame_emit_add_imm (sp_reg, sp_reg, -total_size, scratch_reg_1);
}
RTX_FRAME_RELATED_P (insn) = 1;
real = gen_addsi3 (sp_reg, sp_reg, GEN_INT (-total_size));
add_reg_note (insn, REG_FRAME_RELATED_EXPR, real);
if (total_size > 2000)
{
/* Save the back chain ptr */
insn = frame_emit_store (REGNO (scratch_reg_0), sp_reg, 0);
}
if (frame_pointer_needed)
{
rtx fp_reg = gen_rtx_REG (Pmode, HARD_FRAME_POINTER_REGNUM);
HOST_WIDE_INT fp_offset = STACK_POINTER_OFFSET
+ crtl->outgoing_args_size;
/* Set the new frame_pointer */
insn = frame_emit_add_imm (fp_reg, sp_reg, fp_offset, scratch_reg_0);
RTX_FRAME_RELATED_P (insn) = 1;
real = gen_addsi3 (fp_reg, sp_reg, GEN_INT (fp_offset));
add_reg_note (insn, REG_FRAME_RELATED_EXPR, real);
REGNO_POINTER_ALIGN (HARD_FRAME_POINTER_REGNUM) = STACK_BOUNDARY;
}
}
if (flag_stack_usage_info)
current_function_static_stack_size = total_size;
}
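/* Emit the function epilogue: restore the stack pointer, reload any
saved registers and the link register, and emit the return unless
SIBCALL_P. */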
void
spu_expand_epilogue (bool sibcall_p)
{
int size = get_frame_size (), offset, regno;
HOST_WIDE_INT saved_regs_size, total_size;
rtx sp_reg = gen_rtx_REG (Pmode, STACK_POINTER_REGNUM);
rtx scratch_reg_0;
if (spu_naked_function_p (current_function_decl))
return;
scratch_reg_0 = gen_rtx_REG (SImode, LAST_ARG_REGNUM + 1);
saved_regs_size = spu_saved_regs_size ();
total_size = size + saved_regs_size
+ crtl->outgoing_args_size
+ crtl->args.pretend_args_size;
if (!crtl->is_leaf
|| cfun->calls_alloca || total_size > 0)
total_size += STACK_POINTER_OFFSET;
if (total_size > 0)
{
if (cfun->calls_alloca)
frame_emit_load (STACK_POINTER_REGNUM, sp_reg, 0);
else
frame_emit_add_imm (sp_reg, sp_reg, total_size, scratch_reg_0);
if (saved_regs_size > 0)
{
offset = -crtl->args.pretend_args_size;
for (regno = 0; regno < FIRST_PSEUDO_REGISTER; ++regno)
if (need_to_save_reg (regno, 1))
{
offset -= 0x10;
frame_emit_load (regno, sp_reg, offset);
}
}
}
if (!crtl->is_leaf)
frame_emit_load (LINK_REGISTER_REGNUM, sp_reg, 16);
if (!sibcall_p)
{
emit_use (gen_rtx_REG (SImode, LINK_REGISTER_REGNUM));
emit_jump_insn (gen__return ());
}
}
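/* Return the return address of the current function, for
__builtin_return_address; only COUNT == 0 is handled. */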
rtx
spu_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
{
if (count != 0)
return 0;
/* This is inefficient because it ends up copying to a save-register
which then gets saved even though $lr has already been saved. But
it does generate better code for leaf functions and we don't need
to use RETURN_ADDRESS_POINTER_REGNUM to get it working. It's only
used for __builtin_return_address anyway, so maybe we don't care if
it's inefficient. */
return get_hard_reg_initial_val (Pmode, LINK_REGISTER_REGNUM);
}
/* Given VAL, generate a constant appropriate for MODE.
If MODE is a vector mode, every element will be VAL.
For TImode, VAL will be zero extended to 128 bits. */
rtx
spu_const (machine_mode mode, HOST_WIDE_INT val)
{
rtx inner;
rtvec v;
int units, i;
gcc_assert (GET_MODE_CLASS (mode) == MODE_INT
|| GET_MODE_CLASS (mode) == MODE_FLOAT
|| GET_MODE_CLASS (mode) == MODE_VECTOR_INT
|| GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT);
if (GET_MODE_CLASS (mode) == MODE_INT)
return immed_double_const (val, 0, mode);
/* val is the bit representation of the float */
if (GET_MODE_CLASS (mode) == MODE_FLOAT)
return hwint_to_const_double (mode, val);
if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
inner = immed_double_const (val, 0, GET_MODE_INNER (mode));
else
inner = hwint_to_const_double (GET_MODE_INNER (mode), val);
units = GET_MODE_NUNITS (mode);
v = rtvec_alloc (units);
for (i = 0; i < units; ++i)
RTVEC_ELT (v, i) = inner;
return gen_rtx_CONST_VECTOR (mode, v);
}
/* Create a MODE vector constant from 4 ints. */
rtx
spu_const_from_ints(machine_mode mode, int a, int b, int c, int d)
{
unsigned char arr[16];
arr[0] = (a >> 24) & 0xff;
arr[1] = (a >> 16) & 0xff;
arr[2] = (a >> 8) & 0xff;
arr[3] = (a >> 0) & 0xff;
arr[4] = (b >> 24) & 0xff;
arr[5] = (b >> 16) & 0xff;
arr[6] = (b >> 8) & 0xff;
arr[7] = (b >> 0) & 0xff;
arr[8] = (c >> 24) & 0xff;
arr[9] = (c >> 16) & 0xff;
arr[10] = (c >> 8) & 0xff;
arr[11] = (c >> 0) & 0xff;
arr[12] = (d >> 24) & 0xff;
arr[13] = (d >> 16) & 0xff;
arr[14] = (d >> 8) & 0xff;
arr[15] = (d >> 0) & 0xff;
return array_to_constant(mode, arr);
}
/* branch hint stuff */
/* An array of these is used to propagate hints to predecessor blocks. */
struct spu_bb_info
{
rtx_insn *prop_jump; /* propagated from another block */
int bb_index; /* the original block. */
};
static struct spu_bb_info *spu_bb_info;
#define STOP_HINT_P(INSN) \
(CALL_P(INSN) \
|| INSN_CODE(INSN) == CODE_FOR_divmodsi4 \
|| INSN_CODE(INSN) == CODE_FOR_udivmodsi4)
/* 1 when RTX is a hinted branch or its target. We keep track of
what has been hinted so the safe-hint code can test it easily. */
#define HINTED_P(RTX) \
(RTL_FLAG_CHECK3("HINTED_P", (RTX), CODE_LABEL, JUMP_INSN, CALL_INSN)->unchanging)
/* 1 when RTX is an insn that must be scheduled on an even boundary. */
#define SCHED_ON_EVEN_P(RTX) \
(RTL_FLAG_CHECK2("SCHED_ON_EVEN_P", (RTX), JUMP_INSN, CALL_INSN)->in_struct)
/* Emit a nop for INSN such that the two will dual issue. This assumes
INSN is 8-byte aligned. When INSN is inline asm we emit an lnop.
We check for TImode to handle a MULTI1 insn which has dual issued its
first instruction. get_pipe returns -1 for MULTI0 or inline asm. */
static void
emit_nop_for_insn (rtx_insn *insn)
{
int p;
rtx_insn *new_insn;
/* We need to handle JUMP_TABLE_DATA separately. */
if (JUMP_TABLE_DATA_P (insn))
{
new_insn = emit_insn_after (gen_lnop(), insn);
recog_memoized (new_insn);
INSN_LOCATION (new_insn) = UNKNOWN_LOCATION;
return;
}
p = get_pipe (insn);
if ((CALL_P (insn) || JUMP_P (insn)) && SCHED_ON_EVEN_P (insn))
new_insn = emit_insn_after (gen_lnop (), insn);
else if (p == 1 && GET_MODE (insn) == TImode)
{
new_insn = emit_insn_before (gen_nopn (GEN_INT (127)), insn);
PUT_MODE (new_insn, TImode);
PUT_MODE (insn, VOIDmode);
}
else
new_insn = emit_insn_after (gen_lnop (), insn);
recog_memoized (new_insn);
INSN_LOCATION (new_insn) = INSN_LOCATION (insn);
}
/* Insert nops in basic blocks to meet dual issue alignment
requirements. Also make sure hbrp and hint instructions are at least
one cycle apart, possibly inserting a nop. */
static void
pad_bb(void)
{
rtx_insn *insn, *next_insn, *prev_insn, *hbr_insn = 0;
int length;
int addr;
/* This sets up INSN_ADDRESSES. */
shorten_branches (get_insns ());
/* Keep track of length added by nops. */
length = 0;
prev_insn = 0;
insn = get_insns ();
if (!active_insn_p (insn))
insn = next_active_insn (insn);
for (; insn; insn = next_insn)
{
next_insn = next_active_insn (insn);
if (INSN_CODE (insn) == CODE_FOR_iprefetch
|| INSN_CODE (insn) == CODE_FOR_hbr)
{
if (hbr_insn)
{
int a0 = INSN_ADDRESSES (INSN_UID (hbr_insn));
int a1 = INSN_ADDRESSES (INSN_UID (insn));
if ((a1 - a0 == 8 && GET_MODE (insn) != TImode)
|| (a1 - a0 == 4))
{
prev_insn = emit_insn_before (gen_lnop (), insn);
PUT_MODE (prev_insn, GET_MODE (insn));
PUT_MODE (insn, TImode);
INSN_LOCATION (prev_insn) = INSN_LOCATION (insn);
length += 4;
}
}
hbr_insn = insn;
}
if (INSN_CODE (insn) == CODE_FOR_blockage && next_insn)
{
if (GET_MODE (insn) == TImode)
PUT_MODE (next_insn, TImode);
insn = next_insn;
next_insn = next_active_insn (insn);
}
addr = INSN_ADDRESSES (INSN_UID (insn));
if ((CALL_P (insn) || JUMP_P (insn)) && SCHED_ON_EVEN_P (insn))
{
if (((addr + length) & 7) != 0)
{
emit_nop_for_insn (prev_insn);
length += 4;
}
}
else if (GET_MODE (insn) == TImode
&& ((next_insn && GET_MODE (next_insn) != TImode)
|| get_attr_type (insn) == TYPE_MULTI0)
&& ((addr + length) & 7) != 0)
{
/* prev_insn will always be set because the first insn is
always 8-byte aligned. */
emit_nop_for_insn (prev_insn);
length += 4;
}
prev_insn = insn;
}
}
/* Routines for branch hints. */
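/* Emit a branch hint (hbr) before BEFORE, hinting that BRANCH will
jump to TARGET, which is DISTANCE bytes away. BLOCKS records which
basic blocks contain a hinted branch. */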
static void
spu_emit_branch_hint (rtx_insn *before, rtx_insn *branch, rtx target,
int distance, sbitmap blocks)
{
rtx branch_label = 0;
rtx_insn *hint;
rtx_insn *insn;
rtx_jump_table_data *table;
if (before == 0 || branch == 0 || target == 0)
return;
/* While scheduling we require hints to be no further than 600 bytes
from the branch, so we need to enforce that here too. */
if (distance > 600)
return;
/* If BEFORE is a basic block note, emit the hint after the note. */
if (NOTE_INSN_BASIC_BLOCK_P (before))
before = NEXT_INSN (before);
branch_label = gen_label_rtx ();
LABEL_NUSES (branch_label)++;
LABEL_PRESERVE_P (branch_label) = 1;
insn = emit_label_before (branch_label, branch);
branch_label = gen_rtx_LABEL_REF (VOIDmode, branch_label);
bitmap_set_bit (blocks, BLOCK_FOR_INSN (branch)->index);
hint = emit_insn_before (gen_hbr (branch_label, target), before);
recog_memoized (hint);
INSN_LOCATION (hint) = INSN_LOCATION (branch);
HINTED_P (branch) = 1;
if (GET_CODE (target) == LABEL_REF)
HINTED_P (XEXP (target, 0)) = 1;
else if (tablejump_p (branch, 0, &table))
{
rtvec vec;
int j;
if (GET_CODE (PATTERN (table)) == ADDR_VEC)
vec = XVEC (PATTERN (table), 0);
else
vec = XVEC (PATTERN (table), 1);
for (j = GET_NUM_ELEM (vec) - 1; j >= 0; --j)
HINTED_P (XEXP (RTVEC_ELT (vec, j), 0)) = 1;
}
if (distance >= 588)
{
/* Make sure the hint isn't scheduled any earlier than this point,
which could make it too far for the branch offset to fit. */
insn = emit_insn_before (gen_blockage (), hint);
recog_memoized (insn);
INSN_LOCATION (insn) = INSN_LOCATION (hint);
}
else if (distance <= 8 * 4)
{
/* To guarantee at least 8 insns between the hint and branch we
insert nops. */
int d;
for (d = distance; d < 8 * 4; d += 4)
{
insn =
emit_insn_after (gen_nopn_nv (gen_rtx_REG (SImode, 127)), hint);
recog_memoized (insn);
INSN_LOCATION (insn) = INSN_LOCATION (hint);
}
/* Make sure any nops inserted aren't scheduled before the hint. */
insn = emit_insn_after (gen_blockage (), hint);
recog_memoized (insn);
INSN_LOCATION (insn) = INSN_LOCATION (hint);
/* Make sure any nops inserted aren't scheduled after the call. */
if (CALL_P (branch) && distance < 8 * 4)
{
insn = emit_insn_before (gen_blockage (), branch);
recog_memoized (insn);
INSN_LOCATION (insn) = INSN_LOCATION (branch);
}
}
}
/* Returns 0 if we don't want a hint for this branch. Otherwise return
the rtx for the branch target. */
static rtx
get_branch_target (rtx_insn *branch)
{
if (JUMP_P (branch))
{
rtx set, src;
/* Return statements */
if (GET_CODE (PATTERN (branch)) == RETURN)
return gen_rtx_REG (SImode, LINK_REGISTER_REGNUM);
/* ASM GOTOs. */
if (extract_asm_operands (PATTERN (branch)) != NULL)
return NULL;
set = single_set (branch);
src = SET_SRC (set);
if (GET_CODE (SET_DEST (set)) != PC)
abort ();
if (GET_CODE (src) == IF_THEN_ELSE)
{
rtx lab = 0;
rtx note = find_reg_note (branch, REG_BR_PROB, 0);
if (note)
{
/* If the more probable case is not a fall through, then
try a branch hint. */
int prob = XINT (note, 0);
if (prob > (REG_BR_PROB_BASE * 6 / 10)
&& GET_CODE (XEXP (src, 1)) != PC)
lab = XEXP (src, 1);
else if (prob < (REG_BR_PROB_BASE * 4 / 10)
&& GET_CODE (XEXP (src, 2)) != PC)
lab = XEXP (src, 2);
}
if (lab)
{
if (GET_CODE (lab) == RETURN)
return gen_rtx_REG (SImode, LINK_REGISTER_REGNUM);
return lab;
}
return 0;
}
return src;
}
else if (CALL_P (branch))
{
rtx call;
/* All of our call patterns are in a PARALLEL and the CALL is
the first pattern in the PARALLEL. */
if (GET_CODE (PATTERN (branch)) != PARALLEL)
abort ();
call = XVECEXP (PATTERN (branch), 0, 0);
if (GET_CODE (call) == SET)
call = SET_SRC (call);
if (GET_CODE (call) != CALL)
abort ();
return XEXP (XEXP (call, 0), 0);
}
return 0;
}
/* The special $hbr register is used to prevent the insn scheduler from
moving hbr insns across instructions which invalidate them. It
should only be used in a clobber, and this function searches for
insns which clobber it. */
static bool
insn_clobbers_hbr (rtx_insn *insn)
{
if (INSN_P (insn)
&& GET_CODE (PATTERN (insn)) == PARALLEL)
{
rtx parallel = PATTERN (insn);
rtx clobber;
int j;
for (j = XVECLEN (parallel, 0) - 1; j >= 0; j--)
{
clobber = XVECEXP (parallel, 0, j);
if (GET_CODE (clobber) == CLOBBER
&& GET_CODE (XEXP (clobber, 0)) == REG
&& REGNO (XEXP (clobber, 0)) == HBR_REGNUM)
return 1;
}
}
return 0;
}
/* Search up to 32 insns starting at FIRST:
- at any kind of hinted branch, just return
- at any unconditional branch in the first 15 insns, just return
- at a call or indirect branch, after the first 15 insns, force it to
an even address and return
- at any unconditional branch, after the first 15 insns, force it to
an even address.
At the end of the search, insert an hbrp within 4 insns of FIRST,
and an hbrp within 16 instructions of FIRST.
*/
static void
insert_hbrp_for_ilb_runout (rtx_insn *first)
{
rtx_insn *insn, *before_4 = 0, *before_16 = 0;
int addr = 0, length, first_addr = -1;
int hbrp_addr0 = 128 * 4, hbrp_addr1 = 128 * 4;
int insert_lnop_after = 0;
for (insn = first; insn; insn = NEXT_INSN (insn))
if (INSN_P (insn))
{
if (first_addr == -1)
first_addr = INSN_ADDRESSES (INSN_UID (insn));
addr = INSN_ADDRESSES (INSN_UID (insn)) - first_addr;
length = get_attr_length (insn);
if (before_4 == 0 && addr + length >= 4 * 4)
before_4 = insn;
/* We test for 14 instructions because the first hbrp will add
up to 2 instructions. */
if (before_16 == 0 && addr + length >= 14 * 4)
before_16 = insn;
if (INSN_CODE (insn) == CODE_FOR_hbr)
{
/* Make sure an hbrp is at least 2 cycles away from a hint.
Insert an lnop after the hbrp when necessary. */
if (before_4 == 0 && addr > 0)
{
before_4 = insn;
insert_lnop_after |= 1;
}
else if (before_4 && addr <= 4 * 4)
insert_lnop_after |= 1;
if (before_16 == 0 && addr > 10 * 4)
{
before_16 = insn;
insert_lnop_after |= 2;
}
else if (before_16 && addr <= 14 * 4)
insert_lnop_after |= 2;
}
if (INSN_CODE (insn) == CODE_FOR_iprefetch)
{
if (addr < hbrp_addr0)
hbrp_addr0 = addr;
else if (addr < hbrp_addr1)
hbrp_addr1 = addr;
}
if (CALL_P (insn) || JUMP_P (insn))
{
if (HINTED_P (insn))
return;
/* Any branch after the first 15 insns should be on an even
address to avoid a special case branch. There might be
some nops and/or hbrps inserted, so we test after 10
insns. */
if (addr > 10 * 4)
SCHED_ON_EVEN_P (insn) = 1;
}
if (CALL_P (insn) || tablejump_p (insn, 0, 0))
return;
if (addr + length >= 32 * 4)
{
gcc_assert (before_4 && before_16);
if (hbrp_addr0 > 4 * 4)
{
insn =
emit_insn_before (gen_iprefetch (GEN_INT (1)), before_4);
recog_memoized (insn);
INSN_LOCATION (insn) = INSN_LOCATION (before_4);
INSN_ADDRESSES_NEW (insn,
INSN_ADDRESSES (INSN_UID (before_4)));
PUT_MODE (insn, GET_MODE (before_4));
PUT_MODE (before_4, TImode);
if (insert_lnop_after & 1)
{
insn = emit_insn_before (gen_lnop (), before_4);
recog_memoized (insn);
INSN_LOCATION (insn) = INSN_LOCATION (before_4);
INSN_ADDRESSES_NEW (insn,
INSN_ADDRESSES (INSN_UID (before_4)));
PUT_MODE (insn, TImode);
}
}
if ((hbrp_addr0 <= 4 * 4 || hbrp_addr0 > 16 * 4)
&& hbrp_addr1 > 16 * 4)
{
insn =
emit_insn_before (gen_iprefetch (GEN_INT (2)), before_16);
recog_memoized (insn);
INSN_LOCATION (insn) = INSN_LOCATION (before_16);
INSN_ADDRESSES_NEW (insn,
INSN_ADDRESSES (INSN_UID (before_16)));
PUT_MODE (insn, GET_MODE (before_16));
PUT_MODE (before_16, TImode);
if (insert_lnop_after & 2)
{
insn = emit_insn_before (gen_lnop (), before_16);
recog_memoized (insn);
INSN_LOCATION (insn) = INSN_LOCATION (before_16);
INSN_ADDRESSES_NEW (insn,
INSN_ADDRESSES (INSN_UID
(before_16)));
PUT_MODE (insn, TImode);
}
}
return;
}
}
else if (BARRIER_P (insn))
return;
}
/* The SPU might hang when it executes 48 inline instructions after a
hinted branch jumps to its hinted target. The beginning of a
function and the return from a call might have been hinted, and
must be handled as well. To prevent a hang we insert 2 hbrps. The
first should be within 6 insns of the branch target. The second
should be within 22 insns of the branch target. When determining
if hbrps are necessary, we look for only 32 inline instructions,
because up to 12 nops and 4 hbrps could be inserted. Similarly,
when inserting new hbrps, we insert them within 4 and 16 insns of
the target. */
static void
insert_hbrp (void)
{
rtx_insn *insn;
if (TARGET_SAFE_HINTS)
{
shorten_branches (get_insns ());
/* Insert an hbrp at the beginning of the function. */
insn = next_active_insn (get_insns ());
if (insn)
insert_hbrp_for_ilb_runout (insn);
/* Insert hbrp after hinted targets. */
for (insn = get_insns (); insn; insn = NEXT_INSN (insn))
if ((LABEL_P (insn) && HINTED_P (insn)) || CALL_P (insn))
insert_hbrp_for_ilb_runout (next_active_insn (insn));
}
}
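/* Nonzero while spu_machine_dependent_reorg is running; the scheduling
hooks below use this to enable the reorg-specific heuristics (hbrp
emission and dual-issue nop accounting). */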
static int in_spu_reorg;
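/* Run variable tracking (when enabled) from within machine reorg, after
the insns have been rearranged, so the debug info reflects the final
insn order. */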
static void
spu_var_tracking (void)
{
if (flag_var_tracking)
{
df_analyze ();
timevar_push (TV_VAR_TRACKING);
variable_tracking_main ();
timevar_pop (TV_VAR_TRACKING);
df_finish_pass (false);
}
}
/* Insert branch hints. There are no branch optimizations after this
pass, so it's safe to set our branch hints now. */
static void
spu_machine_dependent_reorg (void)
{
sbitmap blocks;
basic_block bb;
rtx_insn *branch, *insn;
rtx branch_target = 0;
int branch_addr = 0, insn_addr, required_dist = 0;
int i;
unsigned int j;
if (!TARGET_BRANCH_HINTS || optimize == 0)
{
/* We still do it for unoptimized code because an external
function might have hinted a call or return. */
compute_bb_for_insn ();
insert_hbrp ();
pad_bb ();
spu_var_tracking ();
free_bb_for_insn ();
return;
}
blocks = sbitmap_alloc (last_basic_block_for_fn (cfun));
bitmap_clear (blocks);
in_spu_reorg = 1;
compute_bb_for_insn ();
/* (Re-)discover loops so that bb->loop_father can be used
in the analysis below. */
loop_optimizer_init (AVOID_CFG_MODIFICATIONS);
compact_blocks ();
spu_bb_info =
(struct spu_bb_info *) xcalloc (n_basic_blocks_for_fn (cfun),
sizeof (struct spu_bb_info));
/* We need exact insn addresses and lengths. */
shorten_branches (get_insns ());
for (i = n_basic_blocks_for_fn (cfun) - 1; i >= 0; i--)
{
bb = BASIC_BLOCK_FOR_FN (cfun, i);
branch = 0;
if (spu_bb_info[i].prop_jump)
{
branch = spu_bb_info[i].prop_jump;
branch_target = get_branch_target (branch);
branch_addr = INSN_ADDRESSES (INSN_UID (branch));
required_dist = spu_hint_dist;
}
/* Search from the end of a block to the beginning. In this loop, find
jumps which need a branch hint and emit the hint only when:
- it's an indirect branch and we're at the insn which sets
the register
- we're at an insn that will invalidate the hint, e.g. a
call, another hint insn, inline asm that clobbers $hbr, or
some inlined operations (divmodsi4). Don't consider jumps
because they are only at the end of a block and are
considered when we are deciding whether to propagate
- we're getting too far away from the branch; the hbr insns
only have a signed 10-bit offset.
We go back as far as possible so the branch will be considered
for propagation when we get to the beginning of the block. */
for (insn = BB_END (bb); insn; insn = PREV_INSN (insn))
{
if (INSN_P (insn))
{
insn_addr = INSN_ADDRESSES (INSN_UID (insn));
if (branch
&& ((GET_CODE (branch_target) == REG
&& set_of (branch_target, insn) != NULL_RTX)
|| insn_clobbers_hbr (insn)
|| branch_addr - insn_addr > 600))
{
rtx_insn *next = NEXT_INSN (insn);
int next_addr = INSN_ADDRESSES (INSN_UID (next));
if (insn != BB_END (bb)
&& branch_addr - next_addr >= required_dist)
{
if (dump_file)
fprintf (dump_file,
"hint for %i in block %i before %i\n",
INSN_UID (branch), bb->index,
INSN_UID (next));
spu_emit_branch_hint (next, branch, branch_target,
branch_addr - next_addr, blocks);
}
branch = 0;
}
/* JUMP_P will only be true at the end of a block. When
branch is already set it means we've previously decided
to propagate a hint for that branch into this block. */
if (CALL_P (insn) || (JUMP_P (insn) && !branch))
{
branch = 0;
if ((branch_target = get_branch_target (insn)))
{
branch = insn;
branch_addr = insn_addr;
required_dist = spu_hint_dist;
}
}
}
if (insn == BB_HEAD (bb))
break;
}
if (branch)
{
/* If we haven't emitted a hint for this branch yet, it might
be profitable to emit it in one of the predecessor blocks,
especially for loops. */
rtx_insn *bbend;
basic_block prev = 0, prop = 0, prev2 = 0;
int loop_exit = 0, simple_loop = 0;
int next_addr = INSN_ADDRESSES (INSN_UID (NEXT_INSN (insn)));
for (j = 0; j < EDGE_COUNT (bb->preds); j++)
if (EDGE_PRED (bb, j)->flags & EDGE_FALLTHRU)
prev = EDGE_PRED (bb, j)->src;
else
prev2 = EDGE_PRED (bb, j)->src;
for (j = 0; j < EDGE_COUNT (bb->succs); j++)
if (EDGE_SUCC (bb, j)->flags & EDGE_LOOP_EXIT)
loop_exit = 1;
else if (EDGE_SUCC (bb, j)->dest == bb)
simple_loop = 1;
/* If this branch is a loop exit then propagate the hint to the
previous fallthru block. This catches the cases where it is a
simple loop or where there is an initial branch into the loop. */
if (prev && (loop_exit || simple_loop)
&& bb_loop_depth (prev) <= bb_loop_depth (bb))
prop = prev;
/* If there is only one adjacent predecessor, propagate to it, as
long as that does not take the hint outside this loop. */
else if (prev && single_pred_p (bb)
&& prev->loop_father == bb->loop_father)
prop = prev;
/* If this is the JOIN block of a simple IF-THEN then
propagate the hint to the HEADER block. */
else if (prev && prev2
&& EDGE_COUNT (bb->preds) == 2
&& EDGE_COUNT (prev->preds) == 1
&& EDGE_PRED (prev, 0)->src == prev2
&& prev2->loop_father == bb->loop_father
&& GET_CODE (branch_target) != REG)
prop = prev;
/* Don't propagate when:
- this is a simple loop and the hint would be too far
- this is not a simple loop and there are 16 insns in
this block already
- the predecessor block ends in a branch that will be
hinted
- the predecessor block ends in an insn that invalidates
the hint */
if (prop
&& prop->index >= 0
&& (bbend = BB_END (prop))
&& branch_addr - INSN_ADDRESSES (INSN_UID (bbend)) <
(simple_loop ? 600 : 16 * 4) && get_branch_target (bbend) == 0
&& (JUMP_P (bbend) || !insn_clobbers_hbr (bbend)))
{
if (dump_file)
fprintf (dump_file, "propagate from %i to %i (loop depth %i) "
"for %i (loop_exit %i simple_loop %i dist %i)\n",
bb->index, prop->index, bb_loop_depth (bb),
INSN_UID (branch), loop_exit, simple_loop,
branch_addr - INSN_ADDRESSES (INSN_UID (bbend)));
spu_bb_info[prop->index].prop_jump = branch;
spu_bb_info[prop->index].bb_index = i;
}
else if (branch_addr - next_addr >= required_dist)
{
if (dump_file)
fprintf (dump_file, "hint for %i in block %i before %i\n",
INSN_UID (branch), bb->index,
INSN_UID (NEXT_INSN (insn)));
spu_emit_branch_hint (NEXT_INSN (insn), branch, branch_target,
branch_addr - next_addr, blocks);
}
branch = 0;
}
}
free (spu_bb_info);
if (!bitmap_empty_p (blocks))
find_many_sub_basic_blocks (blocks);
/* We have to schedule to make sure alignment is ok. */
FOR_EACH_BB_FN (bb, cfun) bb->flags &= ~BB_DISABLE_SCHEDULE;
/* The hints need to be scheduled, so run the scheduler again. */
schedule_insns ();
df_finish_pass (true);
insert_hbrp ();
pad_bb ();
for (insn = get_insns (); insn; insn = NEXT_INSN (insn))
if (NONJUMP_INSN_P (insn) && INSN_CODE (insn) == CODE_FOR_hbr)
{
/* Adjust the LABEL_REF in a hint when we have inserted a nop
between its branch label and the branch. We don't move the
label because GCC expects it at the beginning of the block. */
rtx unspec = SET_SRC (XVECEXP (PATTERN (insn), 0, 0));
rtx label_ref = XVECEXP (unspec, 0, 0);
rtx_insn *label = as_a <rtx_insn *> (XEXP (label_ref, 0));
rtx_insn *branch;
int offset = 0;
for (branch = NEXT_INSN (label);
!JUMP_P (branch) && !CALL_P (branch);
branch = NEXT_INSN (branch))
if (NONJUMP_INSN_P (branch))
offset += get_attr_length (branch);
if (offset > 0)
XVECEXP (unspec, 0, 0) = plus_constant (Pmode, label_ref, offset);
}
spu_var_tracking ();
loop_optimizer_finalize ();
free_bb_for_insn ();
in_spu_reorg = 0;
}
/* Insn scheduling routines, primarily for dual issue. */
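/* The SPU can dual issue: at most one pipe0 and one pipe1 instruction
per cycle, so report an issue rate of 2 to the scheduler. */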
static int
spu_sched_issue_rate (void)
{
return 2;
}
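/* Return 1 if INSN is a load or store, i.e. it occupies the
load/store unit. */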
static int
uses_ls_unit (rtx_insn *insn)
{
rtx set = single_set (insn);
if (set != 0
&& (GET_CODE (SET_DEST (set)) == MEM
|| GET_CODE (SET_SRC (set)) == MEM))
return 1;
return 0;
}
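/* Classify INSN by execution pipeline: return 0 for pipe0 types,
1 for pipe1 types (loads, stores, shuffles, branches, hints),
-1 for inline asm and TYPE_MULTI0, and -2 for TYPE_CONVERT. */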
static int
get_pipe (rtx_insn *insn)
{
enum attr_type t;
/* Handle inline asm. */
if (INSN_CODE (insn) == -1)
return -1;
t = get_attr_type (insn);
switch (t)
{
case TYPE_CONVERT:
return -2;
case TYPE_MULTI0:
return -1;
case TYPE_FX2:
case TYPE_FX3:
case TYPE_SPR:
case TYPE_NOP:
case TYPE_FXB:
case TYPE_FPD:
case TYPE_FP6:
case TYPE_FP7:
return 0;
case TYPE_LNOP:
case TYPE_SHUF:
case TYPE_LOAD:
case TYPE_STORE:
case TYPE_BR:
case TYPE_MULTI1:
case TYPE_HBR:
case TYPE_IPREFETCH:
return 1;
default:
abort ();
}
}
/* haifa-sched.c has a static variable that keeps track of the current
cycle. It is passed to spu_sched_reorder, and we record it here for
use by spu_sched_variable_issue. It won't be accurate if the
scheduler updates its clock_var between the two calls. */
static int clock_var;
/* This is used to keep track of insn alignment. Set to 0 at the
beginning of each block and increased by the "length" attr of each
insn scheduled. */
static int spu_sched_length;
/* Record when we've issued pipe0 and pipe1 insns so we can reorder the
ready list appropriately in spu_sched_reorder(). */
static int pipe0_clock;
static int pipe1_clock;
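/* The cycle and INSN_PRIORITY of the most recently issued insn, used
by the dual-issue nop heuristic in spu_sched_reorder(). */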
static int prev_clock_var;
static int prev_priority;
/* The SPU needs to load the next ilb sometime during the execution of
the previous ilb. There is a potential conflict if every cycle has a
load or store. To avoid the conflict we make sure the load/store
unit is free for at least one cycle during the execution of insns in
the previous ilb. */
static int spu_ls_first;
static int prev_ls_clock;
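/* Scheduler hook run once before scheduling the whole function; start
the alignment tracking at 0. */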
static void
spu_sched_init_global (FILE *file ATTRIBUTE_UNUSED, int verbose ATTRIBUTE_UNUSED,
int max_ready ATTRIBUTE_UNUSED)
{
spu_sched_length = 0;
}
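/* Scheduler hook run at the start of each block; reset the per-block
state used by the dual-issue and load/store heuristics. */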
static void
spu_sched_init (FILE *file ATTRIBUTE_UNUSED, int verbose ATTRIBUTE_UNUSED,
int max_ready ATTRIBUTE_UNUSED)
{
if (align_labels > 4 || align_loops > 4 || align_jumps > 4)
{
/* When any block might be at least 8-byte aligned, assume all
blocks are, to make sure dual issue works out correctly. */
spu_sched_length = 0;
}
spu_ls_first = INT_MAX;
clock_var = -1;
prev_ls_clock = -1;
pipe0_clock = -1;
pipe1_clock = -1;
prev_clock_var = -1;
prev_priority = -1;
}
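/* Scheduler hook called after INSN is issued; update the alignment,
pipe and load/store bookkeeping used by the heuristics above and
below. */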
static int
spu_sched_variable_issue (FILE *file ATTRIBUTE_UNUSED,
int verbose ATTRIBUTE_UNUSED,
rtx_insn *insn, int more)
{
int len;
int p;
if (GET_CODE (PATTERN (insn)) == USE
|| GET_CODE (PATTERN (insn)) == CLOBBER
|| (len = get_attr_length (insn)) == 0)
return more;
spu_sched_length += len;
/* Reset on inline asm. */
if (INSN_CODE (insn) == -1)
{
spu_ls_first = INT_MAX;
pipe0_clock = -1;
pipe1_clock = -1;
return 0;
}
p = get_pipe (insn);
if (p == 0)
pipe0_clock = clock_var;
else
pipe1_clock = clock_var;
if (in_spu_reorg)
{
if (clock_var - prev_ls_clock > 1
|| INSN_CODE (insn) == CODE_FOR_iprefetch)
spu_ls_first = INT_MAX;
if (uses_ls_unit (insn))
{
if (spu_ls_first == INT_MAX)
spu_ls_first = spu_sched_length;
prev_ls_clock = clock_var;
}
/* The scheduler hasn't inserted the nop, but we will later on.
Include those nops in spu_sched_length. */
if (prev_clock_var == clock_var && (spu_sched_length & 7))
spu_sched_length += 4;
prev_clock_var = clock_var;
/* more is -1 when called from spu_sched_reorder for new insns
that don't have an INSN_PRIORITY. */
if (more >= 0)
prev_priority = INSN_PRIORITY (insn);
}
/* Always try issuing more insns. spu_sched_reorder will decide
when the cycle should be advanced. */
return 1;
}
/* This function is called for both TARGET_SCHED_REORDER and
TARGET_SCHED_REORDER2. */
static int
spu_sched_reorder (FILE *file ATTRIBUTE_UNUSED, int verbose ATTRIBUTE_UNUSED,
rtx_insn **ready, int *nreadyp, int clock)
{
int i, nready = *nreadyp;
int pipe_0, pipe_1, pipe_hbrp, pipe_ls, schedule_i;
rtx_insn *insn;
clock_var = clock;
if (nready <= 0 || pipe1_clock >= clock)
return 0;
/* Find any rtl insns that don't generate assembly insns and schedule
them first. */
for (i = nready - 1; i >= 0; i--)
{
insn = ready[i];
if (INSN_CODE (insn) == -1
|| INSN_CODE (insn) == CODE_FOR_blockage
|| (INSN_P (insn) && get_attr_length (insn) == 0))
{
ready[i] = ready[nready - 1];
ready[nready - 1] = insn;
return 1;
}
}
pipe_0 = pipe_1 = pipe_hbrp = pipe_ls = schedule_i = -1;
for (i = 0; i < nready; i++)
if (INSN_CODE (ready[i]) != -1)
{
insn = ready[i];
switch (get_attr_type (insn))
{
default:
case TYPE_MULTI0:
case TYPE_CONVERT:
case TYPE_FX2:
case TYPE_FX3:
case TYPE_SPR:
case TYPE_NOP:
case TYPE_FXB:
case TYPE_FPD:
case TYPE_FP6:
case TYPE_FP7:
pipe_0 = i;
break;
case TYPE_LOAD:
case TYPE_STORE:
pipe_ls = i;
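/* Fall through: loads and stores also issue on pipe 1. */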
case TYPE_LNOP:
case TYPE_SHUF:
case TYPE_BR:
case TYPE_MULTI1:
case TYPE_HBR:
pipe_1 = i;
break;
case TYPE_IPREFETCH:
pipe_hbrp = i;
break;
}
}
/* In the first scheduling phase, schedule loads and stores together
to increase the chance they will get merged during postreload CSE. */
if (!reload_completed && pipe_ls >= 0)
{
insn = ready[pipe_ls];
ready[pipe_ls] = ready[nready - 1];
ready[nready - 1] = insn;
return 1;
}
/* If there is an hbrp ready, prefer it over other pipe 1 insns. */
if (pipe_hbrp >= 0)
pipe_1 = pipe_hbrp;
/* When we have loads/stores in every cycle of the last 15 insns and
we are about to schedule another load/store, emit an hbrp insn
instead. */
if (in_spu_reorg
&& spu_sched_length - spu_ls_first >= 4 * 15
&& !(pipe0_clock < clock && pipe_0 >= 0) && pipe_1 == pipe_ls)
{
insn = sched_emit_insn (gen_iprefetch (GEN_INT (3)));
recog_memoized (insn);
if (pipe0_clock < clock)
PUT_MODE (insn, TImode);
spu_sched_variable_issue (file, verbose, insn, -1);
return 0;
}
/* In general, we want to emit nops to increase dual issue, but dual
issue isn't faster when one of the insns could be scheduled later
without affecting the critical path. We look at INSN_PRIORITY to
make a good guess, but it isn't perfect, so -mdual-nops=n can be
used to tune it. */
if (in_spu_reorg && spu_dual_nops < 10)
{
/* When we are at an even address and we are not issuing nops to
improve scheduling then we need to advance the cycle. */
if ((spu_sched_length & 7) == 0 && prev_clock_var == clock
&& (spu_dual_nops == 0
|| (pipe_1 != -1
&& prev_priority >
INSN_PRIORITY (ready[pipe_1]) + spu_dual_nops)))
return 0;
/* When at an odd address, schedule the highest priority insn
without considering pipeline. */
if ((spu_sched_length & 7) == 4 && prev_clock_var != clock
&& (spu_dual_nops == 0
|| (prev_priority >
INSN_PRIORITY (ready[nready - 1]) + spu_dual_nops)))
return 1;
}
/* We haven't issued a pipe0 insn yet this cycle; if there is a
pipe0 insn in the ready list, schedule it. */
if (pipe0_clock < clock && pipe_0 >= 0)
schedule_i = pipe_0;
/* Either we've scheduled a pipe0 insn already or there is no pipe0
insn to schedule. Put a pipe1 insn at the front of the ready list. */
else
schedule_i = pipe_1;
if (schedule_i > -1)
{
insn = ready[schedule_i];
ready[schedule_i] = ready[nready - 1];
ready[nready - 1] = insn;
return 1;
}
return 0;
}
/* INSN is dependent on DEP_INSN. */
static int
spu_sched_adjust_cost (rtx_insn *insn, rtx link, rtx_insn *dep_insn, int cost)
{
rtx set;
/* The blockage pattern is used to prevent instructions from being
moved across it and has no cost. */
if (INSN_CODE (insn) == CODE_FOR_blockage