blob: 56dd99b551195c4809d15e2bb558f4cf59e7f7f7 [file] [log] [blame]
/* Copyright (C) 1988-2021 Free Software Foundation, Inc.
This file is part of GCC.
GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.
GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3. If not see
<http://www.gnu.org/licenses/>. */
#define IN_TARGET_CODE 1
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "rtl.h"
#include "tree.h"
#include "memmodel.h"
#include "gimple.h"
#include "cfghooks.h"
#include "cfgloop.h"
#include "df.h"
#include "tm_p.h"
#include "stringpool.h"
#include "expmed.h"
#include "optabs.h"
#include "regs.h"
#include "emit-rtl.h"
#include "recog.h"
#include "cgraph.h"
#include "diagnostic.h"
#include "cfgbuild.h"
#include "alias.h"
#include "fold-const.h"
#include "attribs.h"
#include "calls.h"
#include "stor-layout.h"
#include "varasm.h"
#include "output.h"
#include "insn-attr.h"
#include "flags.h"
#include "except.h"
#include "explow.h"
#include "expr.h"
#include "cfgrtl.h"
#include "common/common-target.h"
#include "langhooks.h"
#include "reload.h"
#include "gimplify.h"
#include "dwarf2.h"
#include "tm-constrs.h"
#include "cselib.h"
#include "sched-int.h"
#include "opts.h"
#include "tree-pass.h"
#include "context.h"
#include "pass_manager.h"
#include "target-globals.h"
#include "gimple-iterator.h"
#include "tree-vectorizer.h"
#include "shrink-wrap.h"
#include "builtins.h"
#include "rtl-iter.h"
#include "tree-iterator.h"
#include "dbgcnt.h"
#include "case-cfn-macros.h"
#include "dojump.h"
#include "fold-const-call.h"
#include "tree-vrp.h"
#include "tree-ssanames.h"
#include "selftest.h"
#include "selftest-rtl.h"
#include "print-rtl.h"
#include "intl.h"
#include "ifcvt.h"
#include "symbol-summary.h"
#include "ipa-prop.h"
#include "ipa-fnsummary.h"
#include "wide-int-bitmask.h"
#include "tree-vector-builder.h"
#include "debug.h"
#include "dwarf2out.h"
#include "i386-options.h"
#include "i386-builtins.h"
#include "i386-expand.h"
/* Split one or more double-mode RTL references into pairs of half-mode
references. The RTL can be REG, offsettable MEM, integer constant, or
CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
split and "num" is its length. lo_half and hi_half are output arrays
that parallel "operands". */
void
split_double_mode (machine_mode mode, rtx operands[],
int num, rtx lo_half[], rtx hi_half[])
{
machine_mode half_mode;
unsigned int byte;
rtx mem_op = NULL_RTX;
int mem_num = 0;
switch (mode)
{
case E_TImode:
half_mode = DImode;
break;
case E_DImode:
half_mode = SImode;
break;
case E_P2HImode:
half_mode = HImode;
break;
case E_P2QImode:
half_mode = QImode;
break;
default:
gcc_unreachable ();
}
byte = GET_MODE_SIZE (half_mode);
while (num--)
{
rtx op = operands[num];
/* simplify_subreg refuse to split volatile memory addresses,
but we still have to handle it. */
if (MEM_P (op))
{
if (mem_op && rtx_equal_p (op, mem_op))
{
lo_half[num] = lo_half[mem_num];
hi_half[num] = hi_half[mem_num];
}
else
{
mem_op = op;
mem_num = num;
lo_half[num] = adjust_address (op, half_mode, 0);
hi_half[num] = adjust_address (op, half_mode, byte);
}
}
else
{
lo_half[num] = simplify_gen_subreg (half_mode, op,
GET_MODE (op) == VOIDmode
? mode : GET_MODE (op), 0);
rtx tmp = simplify_gen_subreg (half_mode, op,
GET_MODE (op) == VOIDmode
? mode : GET_MODE (op), byte);
/* simplify_gen_subreg will return NULL RTX for the
high half of the paradoxical subreg. */
hi_half[num] = tmp ? tmp : gen_reg_rtx (half_mode);
}
}
}
/* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
for the target. */
void
ix86_expand_clear (rtx dest)
{
rtx tmp;
/* We play register width games, which are only valid after reload. */
gcc_assert (reload_completed);
/* Avoid HImode and its attendant prefix byte. */
if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
dest = gen_rtx_REG (SImode, REGNO (dest));
tmp = gen_rtx_SET (dest, const0_rtx);
if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
{
rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
}
emit_insn (tmp);
}
/* Return true if V can be broadcasted from an integer of WIDTH bits
which is returned in VAL_BROADCAST. Otherwise, return false. */
static bool
ix86_broadcast (HOST_WIDE_INT v, unsigned int width,
HOST_WIDE_INT &val_broadcast)
{
wide_int val = wi::uhwi (v, HOST_BITS_PER_WIDE_INT);
val_broadcast = wi::extract_uhwi (val, 0, width);
for (unsigned int i = width; i < HOST_BITS_PER_WIDE_INT; i += width)
{
HOST_WIDE_INT each = wi::extract_uhwi (val, i, width);
if (val_broadcast != each)
return false;
}
val_broadcast = sext_hwi (val_broadcast, width);
return true;
}
/* Convert the CONST_WIDE_INT operand OP to broadcast in MODE. */
static rtx
ix86_convert_const_wide_int_to_broadcast (machine_mode mode, rtx op)
{
/* Don't use integer vector broadcast if we can't move from GPR to SSE
register directly. */
if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
return nullptr;
/* Convert CONST_WIDE_INT to a non-standard SSE constant integer
broadcast only if vector broadcast is available. */
if (!TARGET_AVX
|| !CONST_WIDE_INT_P (op)
|| standard_sse_constant_p (op, mode))
return nullptr;
HOST_WIDE_INT val = CONST_WIDE_INT_ELT (op, 0);
HOST_WIDE_INT val_broadcast;
scalar_int_mode broadcast_mode;
if (TARGET_AVX2
&& ix86_broadcast (val, GET_MODE_BITSIZE (QImode),
val_broadcast))
broadcast_mode = QImode;
else if (TARGET_AVX2
&& ix86_broadcast (val, GET_MODE_BITSIZE (HImode),
val_broadcast))
broadcast_mode = HImode;
else if (ix86_broadcast (val, GET_MODE_BITSIZE (SImode),
val_broadcast))
broadcast_mode = SImode;
else if (TARGET_64BIT
&& ix86_broadcast (val, GET_MODE_BITSIZE (DImode),
val_broadcast))
broadcast_mode = DImode;
else
return nullptr;
/* Check if OP can be broadcasted from VAL. */
for (int i = 1; i < CONST_WIDE_INT_NUNITS (op); i++)
if (val != CONST_WIDE_INT_ELT (op, i))
return nullptr;
unsigned int nunits = (GET_MODE_SIZE (mode)
/ GET_MODE_SIZE (broadcast_mode));
machine_mode vector_mode;
if (!mode_for_vector (broadcast_mode, nunits).exists (&vector_mode))
gcc_unreachable ();
rtx target = ix86_gen_scratch_sse_rtx (vector_mode);
bool ok = ix86_expand_vector_init_duplicate (false, vector_mode,
target,
GEN_INT (val_broadcast));
gcc_assert (ok);
target = lowpart_subreg (mode, target, vector_mode);
return target;
}
void
ix86_expand_move (machine_mode mode, rtx operands[])
{
rtx op0, op1;
rtx tmp, addend = NULL_RTX;
enum tls_model model;
op0 = operands[0];
op1 = operands[1];
/* Avoid complex sets of likely spilled hard registers before reload. */
if (!ix86_hardreg_mov_ok (op0, op1))
{
tmp = gen_reg_rtx (mode);
operands[0] = tmp;
ix86_expand_move (mode, operands);
operands[0] = op0;
operands[1] = tmp;
op1 = tmp;
}
switch (GET_CODE (op1))
{
case CONST:
tmp = XEXP (op1, 0);
if (GET_CODE (tmp) != PLUS
|| GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF)
break;
op1 = XEXP (tmp, 0);
addend = XEXP (tmp, 1);
/* FALLTHRU */
case SYMBOL_REF:
model = SYMBOL_REF_TLS_MODEL (op1);
if (model)
op1 = legitimize_tls_address (op1, model, true);
else if (ix86_force_load_from_GOT_p (op1))
{
/* Load the external function address via GOT slot to avoid PLT. */
op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1),
(TARGET_64BIT
? UNSPEC_GOTPCREL
: UNSPEC_GOT));
op1 = gen_rtx_CONST (Pmode, op1);
op1 = gen_const_mem (Pmode, op1);
set_mem_alias_set (op1, ix86_GOT_alias_set ());
}
else
{
tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX);
if (tmp)
{
op1 = tmp;
if (!addend)
break;
}
else
{
op1 = operands[1];
break;
}
}
if (addend)
{
op1 = force_operand (op1, NULL_RTX);
op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
op0, 1, OPTAB_DIRECT);
}
else
op1 = force_operand (op1, op0);
if (op1 == op0)
return;
op1 = convert_to_mode (mode, op1, 1);
default:
break;
}
if ((flag_pic || MACHOPIC_INDIRECT)
&& symbolic_operand (op1, mode))
{
if (TARGET_MACHO && !TARGET_64BIT)
{
#if TARGET_MACHO
/* dynamic-no-pic */
if (MACHOPIC_INDIRECT)
{
rtx temp = (op0 && REG_P (op0) && mode == Pmode)
? op0 : gen_reg_rtx (Pmode);
op1 = machopic_indirect_data_reference (op1, temp);
if (MACHOPIC_PURE)
op1 = machopic_legitimize_pic_address (op1, mode,
temp == op1 ? 0 : temp);
}
if (op0 != op1 && GET_CODE (op0) != MEM)
{
rtx insn = gen_rtx_SET (op0, op1);
emit_insn (insn);
return;
}
if (GET_CODE (op0) == MEM)
op1 = force_reg (Pmode, op1);
else
{
rtx temp = op0;
if (GET_CODE (temp) != REG)
temp = gen_reg_rtx (Pmode);
temp = legitimize_pic_address (op1, temp);
if (temp == op0)
return;
op1 = temp;
}
/* dynamic-no-pic */
#endif
}
else
{
if (MEM_P (op0))
op1 = force_reg (mode, op1);
else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
{
rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
op1 = legitimize_pic_address (op1, reg);
if (op0 == op1)
return;
op1 = convert_to_mode (mode, op1, 1);
}
}
}
else
{
if (MEM_P (op0)
&& (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
|| !push_operand (op0, mode))
&& MEM_P (op1))
op1 = force_reg (mode, op1);
if (push_operand (op0, mode)
&& ! general_no_elim_operand (op1, mode))
op1 = copy_to_mode_reg (mode, op1);
/* Force large constants in 64bit compilation into register
to get them CSEed. */
if (can_create_pseudo_p ()
&& (mode == DImode) && TARGET_64BIT
&& immediate_operand (op1, mode)
&& !x86_64_zext_immediate_operand (op1, VOIDmode)
&& !register_operand (op0, mode)
&& optimize)
op1 = copy_to_mode_reg (mode, op1);
if (can_create_pseudo_p ())
{
if (CONST_DOUBLE_P (op1))
{
/* If we are loading a floating point constant to a
register, force the value to memory now, since we'll
get better code out the back end. */
op1 = validize_mem (force_const_mem (mode, op1));
if (!register_operand (op0, mode))
{
rtx temp = gen_reg_rtx (mode);
emit_insn (gen_rtx_SET (temp, op1));
emit_move_insn (op0, temp);
return;
}
}
else if (GET_MODE_SIZE (mode) >= 16)
{
rtx tmp = ix86_convert_const_wide_int_to_broadcast
(GET_MODE (op0), op1);
if (tmp != nullptr)
op1 = tmp;
}
}
}
emit_insn (gen_rtx_SET (op0, op1));
}
/* OP is a memref of CONST_VECTOR, return scalar constant mem
if CONST_VECTOR is a vec_duplicate, else return NULL. */
static rtx
ix86_broadcast_from_constant (machine_mode mode, rtx op)
{
int nunits = GET_MODE_NUNITS (mode);
if (nunits < 2)
return nullptr;
/* Don't use integer vector broadcast if we can't move from GPR to SSE
register directly. */
if (!TARGET_INTER_UNIT_MOVES_TO_VEC
&& INTEGRAL_MODE_P (mode))
return nullptr;
/* Convert CONST_VECTOR to a non-standard SSE constant integer
broadcast only if vector broadcast is available. */
if (!(TARGET_AVX2
|| (TARGET_AVX
&& (GET_MODE_INNER (mode) == SImode
|| GET_MODE_INNER (mode) == DImode))
|| FLOAT_MODE_P (mode))
|| standard_sse_constant_p (op, mode))
return nullptr;
/* Don't broadcast from a 64-bit integer constant in 32-bit mode.
We can still put 64-bit integer constant in memory when
avx512 embed broadcast is available. */
if (GET_MODE_INNER (mode) == DImode && !TARGET_64BIT
&& (!TARGET_AVX512F
|| (GET_MODE_SIZE (mode) < 64 && !TARGET_AVX512VL)))
return nullptr;
if (GET_MODE_INNER (mode) == TImode)
return nullptr;
rtx constant = get_pool_constant (XEXP (op, 0));
if (GET_CODE (constant) != CONST_VECTOR)
return nullptr;
/* There could be some rtx like
(mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1")))
but with "*.LC1" refer to V2DI constant vector. */
if (GET_MODE (constant) != mode)
{
constant = simplify_subreg (mode, constant, GET_MODE (constant),
0);
if (constant == nullptr || GET_CODE (constant) != CONST_VECTOR)
return nullptr;
}
rtx first = XVECEXP (constant, 0, 0);
for (int i = 1; i < nunits; ++i)
{
rtx tmp = XVECEXP (constant, 0, i);
/* Vector duplicate value. */
if (!rtx_equal_p (tmp, first))
return nullptr;
}
return first;
}
void
ix86_expand_vector_move (machine_mode mode, rtx operands[])
{
rtx op0 = operands[0], op1 = operands[1];
/* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for IA MCU
psABI since the biggest alignment is 4 byte for IA MCU psABI. */
unsigned int align = (TARGET_IAMCU
? GET_MODE_BITSIZE (mode)
: GET_MODE_ALIGNMENT (mode));
if (push_operand (op0, VOIDmode))
op0 = emit_move_resolve_push (mode, op0);
/* Force constants other than zero into memory. We do not know how
the instructions used to build constants modify the upper 64 bits
of the register, once we have that information we may be able
to handle some of them more efficiently. */
if (can_create_pseudo_p ()
&& (CONSTANT_P (op1)
|| (SUBREG_P (op1)
&& CONSTANT_P (SUBREG_REG (op1))))
&& ((register_operand (op0, mode)
&& !standard_sse_constant_p (op1, mode))
/* ix86_expand_vector_move_misalign() does not like constants. */
|| (SSE_REG_MODE_P (mode)
&& MEM_P (op0)
&& MEM_ALIGN (op0) < align)))
{
if (SUBREG_P (op1))
{
machine_mode imode = GET_MODE (SUBREG_REG (op1));
rtx r = force_const_mem (imode, SUBREG_REG (op1));
if (r)
r = validize_mem (r);
else
r = force_reg (imode, SUBREG_REG (op1));
op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
}
else
{
machine_mode mode = GET_MODE (op0);
rtx tmp = ix86_convert_const_wide_int_to_broadcast
(mode, op1);
if (tmp == nullptr)
op1 = validize_mem (force_const_mem (mode, op1));
else
op1 = tmp;
}
}
if (can_create_pseudo_p ()
&& GET_MODE_SIZE (mode) >= 16
&& VECTOR_MODE_P (mode)
&& (MEM_P (op1)
&& SYMBOL_REF_P (XEXP (op1, 0))
&& CONSTANT_POOL_ADDRESS_P (XEXP (op1, 0))))
{
rtx first = ix86_broadcast_from_constant (mode, op1);
if (first != nullptr)
{
/* Broadcast to XMM/YMM/ZMM register from an integer
constant or scalar mem. */
op1 = gen_reg_rtx (mode);
if (FLOAT_MODE_P (mode)
|| (!TARGET_64BIT && GET_MODE_INNER (mode) == DImode))
first = force_const_mem (GET_MODE_INNER (mode), first);
bool ok = ix86_expand_vector_init_duplicate (false, mode,
op1, first);
gcc_assert (ok);
emit_move_insn (op0, op1);
return;
}
}
/* We need to check memory alignment for SSE mode since attribute
can make operands unaligned. */
if (can_create_pseudo_p ()
&& SSE_REG_MODE_P (mode)
&& ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
|| (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
{
rtx tmp[2];
/* ix86_expand_vector_move_misalign() does not like both
arguments in memory. */
if (!register_operand (op0, mode)
&& !register_operand (op1, mode))
{
rtx scratch = ix86_gen_scratch_sse_rtx (mode);
emit_move_insn (scratch, op1);
op1 = scratch;
}
tmp[0] = op0; tmp[1] = op1;
ix86_expand_vector_move_misalign (mode, tmp);
return;
}
/* If operand0 is a hard register, make operand1 a pseudo. */
if (can_create_pseudo_p ()
&& !ix86_hardreg_mov_ok (op0, op1))
{
rtx tmp = gen_reg_rtx (GET_MODE (op0));
emit_move_insn (tmp, op1);
emit_move_insn (op0, tmp);
return;
}
/* Make operand1 a register if it isn't already. */
if (can_create_pseudo_p ()
&& !register_operand (op0, mode)
&& !register_operand (op1, mode))
{
rtx tmp = ix86_gen_scratch_sse_rtx (GET_MODE (op0));
emit_move_insn (tmp, op1);
emit_move_insn (op0, tmp);
return;
}
emit_insn (gen_rtx_SET (op0, op1));
}
/* Split 32-byte AVX unaligned load and store if needed. */
static void
ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
{
rtx m;
rtx (*extract) (rtx, rtx, rtx);
machine_mode mode;
if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
|| (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
{
emit_insn (gen_rtx_SET (op0, op1));
return;
}
rtx orig_op0 = NULL_RTX;
mode = GET_MODE (op0);
switch (GET_MODE_CLASS (mode))
{
case MODE_VECTOR_INT:
case MODE_INT:
if (mode != V32QImode)
{
if (!MEM_P (op0))
{
orig_op0 = op0;
op0 = gen_reg_rtx (V32QImode);
}
else
op0 = gen_lowpart (V32QImode, op0);
op1 = gen_lowpart (V32QImode, op1);
mode = V32QImode;
}
break;
case MODE_VECTOR_FLOAT:
break;
default:
gcc_unreachable ();
}
switch (mode)
{
default:
gcc_unreachable ();
case E_V32QImode:
extract = gen_avx_vextractf128v32qi;
mode = V16QImode;
break;
case E_V16HFmode:
extract = gen_avx_vextractf128v16hf;
mode = V8HFmode;
break;
case E_V8SFmode:
extract = gen_avx_vextractf128v8sf;
mode = V4SFmode;
break;
case E_V4DFmode:
extract = gen_avx_vextractf128v4df;
mode = V2DFmode;
break;
}
if (MEM_P (op1))
{
rtx r = gen_reg_rtx (mode);
m = adjust_address (op1, mode, 0);
emit_move_insn (r, m);
m = adjust_address (op1, mode, 16);
r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
emit_move_insn (op0, r);
}
else if (MEM_P (op0))
{
m = adjust_address (op0, mode, 0);
emit_insn (extract (m, op1, const0_rtx));
m = adjust_address (op0, mode, 16);
emit_insn (extract (m, copy_rtx (op1), const1_rtx));
}
else
gcc_unreachable ();
if (orig_op0)
emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
}
/* Implement the movmisalign patterns for SSE. Non-SSE modes go
straight to ix86_expand_vector_move. */
/* Code generation for scalar reg-reg moves of single and double precision data:
if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
movaps reg, reg
else
movss reg, reg
if (x86_sse_partial_reg_dependency == true)
movapd reg, reg
else
movsd reg, reg
Code generation for scalar loads of double precision data:
if (x86_sse_split_regs == true)
movlpd mem, reg (gas syntax)
else
movsd mem, reg
Code generation for unaligned packed loads of single precision data
(x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
if (x86_sse_unaligned_move_optimal)
movups mem, reg
if (x86_sse_partial_reg_dependency == true)
{
xorps reg, reg
movlps mem, reg
movhps mem+8, reg
}
else
{
movlps mem, reg
movhps mem+8, reg
}
Code generation for unaligned packed loads of double precision data
(x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
if (x86_sse_unaligned_move_optimal)
movupd mem, reg
if (x86_sse_split_regs == true)
{
movlpd mem, reg
movhpd mem+8, reg
}
else
{
movsd mem, reg
movhpd mem+8, reg
}
*/
void
ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
{
rtx op0, op1, m;
op0 = operands[0];
op1 = operands[1];
/* Use unaligned load/store for AVX512 or when optimizing for size. */
if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
{
emit_insn (gen_rtx_SET (op0, op1));
return;
}
if (TARGET_AVX)
{
if (GET_MODE_SIZE (mode) == 32)
ix86_avx256_split_vector_move_misalign (op0, op1);
else
/* Always use 128-bit mov<mode>_internal pattern for AVX. */
emit_insn (gen_rtx_SET (op0, op1));
return;
}
if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
|| TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
{
emit_insn (gen_rtx_SET (op0, op1));
return;
}
/* ??? If we have typed data, then it would appear that using
movdqu is the only way to get unaligned data loaded with
integer type. */
if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
{
emit_insn (gen_rtx_SET (op0, op1));
return;
}
if (MEM_P (op1))
{
if (TARGET_SSE2 && mode == V2DFmode)
{
rtx zero;
/* When SSE registers are split into halves, we can avoid
writing to the top half twice. */
if (TARGET_SSE_SPLIT_REGS)
{
emit_clobber (op0);
zero = op0;
}
else
{
/* ??? Not sure about the best option for the Intel chips.
The following would seem to satisfy; the register is
entirely cleared, breaking the dependency chain. We
then store to the upper half, with a dependency depth
of one. A rumor has it that Intel recommends two movsd
followed by an unpacklpd, but this is unconfirmed. And
given that the dependency depth of the unpacklpd would
still be one, I'm not sure why this would be better. */
zero = CONST0_RTX (V2DFmode);
}
m = adjust_address (op1, DFmode, 0);
emit_insn (gen_sse2_loadlpd (op0, zero, m));
m = adjust_address (op1, DFmode, 8);
emit_insn (gen_sse2_loadhpd (op0, op0, m));
}
else
{
rtx t;
if (mode != V4SFmode)
t = gen_reg_rtx (V4SFmode);
else
t = op0;
if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
emit_move_insn (t, CONST0_RTX (V4SFmode));
else
emit_clobber (t);
m = adjust_address (op1, V2SFmode, 0);
emit_insn (gen_sse_loadlps (t, t, m));
m = adjust_address (op1, V2SFmode, 8);
emit_insn (gen_sse_loadhps (t, t, m));
if (mode != V4SFmode)
emit_move_insn (op0, gen_lowpart (mode, t));
}
}
else if (MEM_P (op0))
{
if (TARGET_SSE2 && mode == V2DFmode)
{
m = adjust_address (op0, DFmode, 0);
emit_insn (gen_sse2_storelpd (m, op1));
m = adjust_address (op0, DFmode, 8);
emit_insn (gen_sse2_storehpd (m, op1));
}
else
{
if (mode != V4SFmode)
op1 = gen_lowpart (V4SFmode, op1);
m = adjust_address (op0, V2SFmode, 0);
emit_insn (gen_sse_storelps (m, op1));
m = adjust_address (op0, V2SFmode, 8);
emit_insn (gen_sse_storehps (m, copy_rtx (op1)));
}
}
else
gcc_unreachable ();
}
/* Move bits 64:95 to bits 32:63. */
void
ix86_move_vector_high_sse_to_mmx (rtx op)
{
rtx mask = gen_rtx_PARALLEL (VOIDmode,
gen_rtvec (4, GEN_INT (0), GEN_INT (2),
GEN_INT (0), GEN_INT (0)));
rtx dest = lowpart_subreg (V4SImode, op, GET_MODE (op));
op = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
rtx insn = gen_rtx_SET (dest, op);
emit_insn (insn);
}
/* Split MMX pack with signed/unsigned saturation with SSE/SSE2. */
void
ix86_split_mmx_pack (rtx operands[], enum rtx_code code)
{
rtx op0 = operands[0];
rtx op1 = operands[1];
rtx op2 = operands[2];
machine_mode dmode = GET_MODE (op0);
machine_mode smode = GET_MODE (op1);
machine_mode inner_dmode = GET_MODE_INNER (dmode);
machine_mode inner_smode = GET_MODE_INNER (smode);
/* Get the corresponding SSE mode for destination. */
int nunits = 16 / GET_MODE_SIZE (inner_dmode);
machine_mode sse_dmode = mode_for_vector (GET_MODE_INNER (dmode),
nunits).require ();
machine_mode sse_half_dmode = mode_for_vector (GET_MODE_INNER (dmode),
nunits / 2).require ();
/* Get the corresponding SSE mode for source. */
nunits = 16 / GET_MODE_SIZE (inner_smode);
machine_mode sse_smode = mode_for_vector (GET_MODE_INNER (smode),
nunits).require ();
/* Generate SSE pack with signed/unsigned saturation. */
rtx dest = lowpart_subreg (sse_dmode, op0, GET_MODE (op0));
op1 = lowpart_subreg (sse_smode, op1, GET_MODE (op1));
op2 = lowpart_subreg (sse_smode, op2, GET_MODE (op2));
op1 = gen_rtx_fmt_e (code, sse_half_dmode, op1);
op2 = gen_rtx_fmt_e (code, sse_half_dmode, op2);
rtx insn = gen_rtx_SET (dest, gen_rtx_VEC_CONCAT (sse_dmode,
op1, op2));
emit_insn (insn);
ix86_move_vector_high_sse_to_mmx (op0);
}
/* Split MMX punpcklXX/punpckhXX with SSE punpcklXX. */
void
ix86_split_mmx_punpck (rtx operands[], bool high_p)
{
rtx op0 = operands[0];
rtx op1 = operands[1];
rtx op2 = operands[2];
machine_mode mode = GET_MODE (op0);
rtx mask;
/* The corresponding SSE mode. */
machine_mode sse_mode, double_sse_mode;
switch (mode)
{
case E_V4QImode:
case E_V8QImode:
sse_mode = V16QImode;
double_sse_mode = V32QImode;
mask = gen_rtx_PARALLEL (VOIDmode,
gen_rtvec (16,
GEN_INT (0), GEN_INT (16),
GEN_INT (1), GEN_INT (17),
GEN_INT (2), GEN_INT (18),
GEN_INT (3), GEN_INT (19),
GEN_INT (4), GEN_INT (20),
GEN_INT (5), GEN_INT (21),
GEN_INT (6), GEN_INT (22),
GEN_INT (7), GEN_INT (23)));
break;
case E_V4HImode:
case E_V2HImode:
sse_mode = V8HImode;
double_sse_mode = V16HImode;
mask = gen_rtx_PARALLEL (VOIDmode,
gen_rtvec (8,
GEN_INT (0), GEN_INT (8),
GEN_INT (1), GEN_INT (9),
GEN_INT (2), GEN_INT (10),
GEN_INT (3), GEN_INT (11)));
break;
case E_V2SImode:
sse_mode = V4SImode;
double_sse_mode = V8SImode;
mask = gen_rtx_PARALLEL (VOIDmode,
gen_rtvec (4,
GEN_INT (0), GEN_INT (4),
GEN_INT (1), GEN_INT (5)));
break;
case E_V2SFmode:
sse_mode = V4SFmode;
double_sse_mode = V8SFmode;
mask = gen_rtx_PARALLEL (VOIDmode,
gen_rtvec (4,
GEN_INT (0), GEN_INT (4),
GEN_INT (1), GEN_INT (5)));
break;
default:
gcc_unreachable ();
}
/* Generate SSE punpcklXX. */
rtx dest = lowpart_subreg (sse_mode, op0, GET_MODE (op0));
op1 = lowpart_subreg (sse_mode, op1, GET_MODE (op1));
op2 = lowpart_subreg (sse_mode, op2, GET_MODE (op2));
op1 = gen_rtx_VEC_CONCAT (double_sse_mode, op1, op2);
op2 = gen_rtx_VEC_SELECT (sse_mode, op1, mask);
rtx insn = gen_rtx_SET (dest, op2);
emit_insn (insn);
/* Move high bits to low bits. */
if (high_p)
{
if (sse_mode == V4SFmode)
{
mask = gen_rtx_PARALLEL (VOIDmode,
gen_rtvec (4, GEN_INT (2), GEN_INT (3),
GEN_INT (4), GEN_INT (5)));
op2 = gen_rtx_VEC_CONCAT (V8SFmode, dest, dest);
op1 = gen_rtx_VEC_SELECT (V4SFmode, op2, mask);
}
else
{
int sz = GET_MODE_SIZE (mode);
if (sz == 4)
mask = gen_rtx_PARALLEL (VOIDmode,
gen_rtvec (4, GEN_INT (1), GEN_INT (0),
GEN_INT (0), GEN_INT (1)));
else if (sz == 8)
mask = gen_rtx_PARALLEL (VOIDmode,
gen_rtvec (4, GEN_INT (2), GEN_INT (3),
GEN_INT (0), GEN_INT (1)));
else
gcc_unreachable ();
dest = lowpart_subreg (V4SImode, dest, GET_MODE (dest));
op1 = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
}
insn = gen_rtx_SET (dest, op1);
emit_insn (insn);
}
}
/* Helper function of ix86_fixup_binary_operands to canonicalize
operand order. Returns true if the operands should be swapped. */
static bool
ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode,
rtx operands[])
{
rtx dst = operands[0];
rtx src1 = operands[1];
rtx src2 = operands[2];
/* If the operation is not commutative, we can't do anything. */
if (GET_RTX_CLASS (code) != RTX_COMM_ARITH
&& GET_RTX_CLASS (code) != RTX_COMM_COMPARE)
return false;
/* Highest priority is that src1 should match dst. */
if (rtx_equal_p (dst, src1))
return false;
if (rtx_equal_p (dst, src2))
return true;
/* Next highest priority is that immediate constants come second. */
if (immediate_operand (src2, mode))
return false;
if (immediate_operand (src1, mode))
return true;
/* Lowest priority is that memory references should come second. */
if (MEM_P (src2))
return false;
if (MEM_P (src1))
return true;
return false;
}
/* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
destination to use for the operation. If different from the true
destination in operands[0], a copy operation will be required. */
rtx
ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode,
rtx operands[])
{
rtx dst = operands[0];
rtx src1 = operands[1];
rtx src2 = operands[2];
/* Canonicalize operand order. */
if (ix86_swap_binary_operands_p (code, mode, operands))
{
/* It is invalid to swap operands of different modes. */
gcc_assert (GET_MODE (src1) == GET_MODE (src2));
std::swap (src1, src2);
}
/* Both source operands cannot be in memory. */
if (MEM_P (src1) && MEM_P (src2))
{
/* Optimization: Only read from memory once. */
if (rtx_equal_p (src1, src2))
{
src2 = force_reg (mode, src2);
src1 = src2;
}
else if (rtx_equal_p (dst, src1))
src2 = force_reg (mode, src2);
else
src1 = force_reg (mode, src1);
}
/* If the destination is memory, and we do not have matching source
operands, do things in registers. */
if (MEM_P (dst) && !rtx_equal_p (dst, src1))
dst = gen_reg_rtx (mode);
/* Source 1 cannot be a constant. */
if (CONSTANT_P (src1))
src1 = force_reg (mode, src1);
/* Source 1 cannot be a non-matching memory. */
if (MEM_P (src1) && !rtx_equal_p (dst, src1))
src1 = force_reg (mode, src1);
/* Improve address combine. */
if (code == PLUS
&& GET_MODE_CLASS (mode) == MODE_INT
&& MEM_P (src2))
src2 = force_reg (mode, src2);
operands[1] = src1;
operands[2] = src2;
return dst;
}
/* Similarly, but assume that the destination has already been
set up properly. */
void
ix86_fixup_binary_operands_no_copy (enum rtx_code code,
machine_mode mode, rtx operands[])
{
rtx dst = ix86_fixup_binary_operands (code, mode, operands);
gcc_assert (dst == operands[0]);
}
/* Attempt to expand a binary operator. Make the expansion closer to the
actual machine, then just general_operand, which will allow 3 separate
memory references (one output, two input) in a single insn. */
void
ix86_expand_binary_operator (enum rtx_code code, machine_mode mode,
rtx operands[])
{
rtx src1, src2, dst, op, clob;
dst = ix86_fixup_binary_operands (code, mode, operands);
src1 = operands[1];
src2 = operands[2];
/* Emit the instruction. */
op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
if (reload_completed
&& code == PLUS
&& !rtx_equal_p (dst, src1))
{
/* This is going to be an LEA; avoid splitting it later. */
emit_insn (op);
}
else
{
clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
}
/* Fix up the destination if needed. */
if (dst != operands[0])
emit_move_insn (operands[0], dst);
}
/* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
the given OPERANDS. */
void
ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode,
rtx operands[])
{
rtx op1 = NULL_RTX, op2 = NULL_RTX;
if (SUBREG_P (operands[1]))
{
op1 = operands[1];
op2 = operands[2];
}
else if (SUBREG_P (operands[2]))
{
op1 = operands[2];
op2 = operands[1];
}
/* Optimize (__m128i) d | (__m128i) e and similar code
when d and e are float vectors into float vector logical
insn. In C/C++ without using intrinsics there is no other way
to express vector logical operation on float vectors than
to cast them temporarily to integer vectors. */
if (op1
&& !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
&& (SUBREG_P (op2) || GET_CODE (op2) == CONST_VECTOR)
&& GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
&& GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
&& SUBREG_BYTE (op1) == 0
&& (GET_CODE (op2) == CONST_VECTOR
|| (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
&& SUBREG_BYTE (op2) == 0))
&& can_create_pseudo_p ())
{
rtx dst;
switch (GET_MODE (SUBREG_REG (op1)))
{
case E_V4SFmode:
case E_V8SFmode:
case E_V16SFmode:
case E_V2DFmode:
case E_V4DFmode:
case E_V8DFmode:
dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
if (GET_CODE (op2) == CONST_VECTOR)
{
op2 = gen_lowpart (GET_MODE (dst), op2);
op2 = force_reg (GET_MODE (dst), op2);
}
else
{
op1 = operands[1];
op2 = SUBREG_REG (operands[2]);
if (!vector_operand (op2, GET_MODE (dst)))
op2 = force_reg (GET_MODE (dst), op2);
}
op1 = SUBREG_REG (op1);
if (!vector_operand (op1, GET_MODE (dst)))
op1 = force_reg (GET_MODE (dst), op1);
emit_insn (gen_rtx_SET (dst,
gen_rtx_fmt_ee (code, GET_MODE (dst),
op1, op2)));
emit_move_insn (operands[0], gen_lowpart (mode, dst));
return;
default:
break;
}
}
if (!vector_operand (operands[1], mode))
operands[1] = force_reg (mode, operands[1]);
if (!vector_operand (operands[2], mode))
operands[2] = force_reg (mode, operands[2]);
ix86_fixup_binary_operands_no_copy (code, mode, operands);
emit_insn (gen_rtx_SET (operands[0],
gen_rtx_fmt_ee (code, mode, operands[1],
operands[2])));
}
/* Return TRUE or FALSE depending on whether the binary operator meets the
appropriate constraints. */
bool
ix86_binary_operator_ok (enum rtx_code code, machine_mode mode,
rtx operands[3])
{
rtx dst = operands[0];
rtx src1 = operands[1];
rtx src2 = operands[2];
/* Both source operands cannot be in memory. */
if ((MEM_P (src1) || bcst_mem_operand (src1, mode))
&& (MEM_P (src2) || bcst_mem_operand (src2, mode)))
return false;
/* Canonicalize operand order for commutative operators. */
if (ix86_swap_binary_operands_p (code, mode, operands))
std::swap (src1, src2);
/* If the destination is memory, we must have a matching source operand. */
if (MEM_P (dst) && !rtx_equal_p (dst, src1))
return false;
/* Source 1 cannot be a constant. */
if (CONSTANT_P (src1))
return false;
/* Source 1 cannot be a non-matching memory. */
if (MEM_P (src1) && !rtx_equal_p (dst, src1))
/* Support "andhi/andsi/anddi" as a zero-extending move. */
return (code == AND
&& (mode == HImode
|| mode == SImode
|| (TARGET_64BIT && mode == DImode))
&& satisfies_constraint_L (src2));
return true;
}
/* Attempt to expand a unary operator. Make the expansion closer to the
actual machine, then just general_operand, which will allow 2 separate
memory references (one output, one input) in a single insn. */
void
ix86_expand_unary_operator (enum rtx_code code, machine_mode mode,
rtx operands[])
{
bool matching_memory = false;
rtx src, dst, op, clob;
dst = operands[0];
src = operands[1];
/* If the destination is memory, and we do not have matching source
operands, do things in registers. */
if (MEM_P (dst))
{
if (rtx_equal_p (dst, src))
matching_memory = true;
else
dst = gen_reg_rtx (mode);
}
/* When source operand is memory, destination must match. */
if (MEM_P (src) && !matching_memory)
src = force_reg (mode, src);
/* Emit the instruction. */
op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src));
if (code == NOT)
emit_insn (op);
else
{
clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
}
/* Fix up the destination if needed. */
if (dst != operands[0])
emit_move_insn (operands[0], dst);
}
/* Predict just emitted jump instruction to be taken with probability PROB. */
static void
predict_jump (int prob)
{
rtx_insn *insn = get_last_insn ();
gcc_assert (JUMP_P (insn));
add_reg_br_prob_note (insn, profile_probability::from_reg_br_prob_base (prob));
}
/* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
divisor are within the range [0-255]. */
void
ix86_split_idivmod (machine_mode mode, rtx operands[],
bool unsigned_p)
{
rtx_code_label *end_label, *qimode_label;
rtx div, mod;
rtx_insn *insn;
rtx scratch, tmp0, tmp1, tmp2;
rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
switch (mode)
{
case E_SImode:
if (GET_MODE (operands[0]) == SImode)
{
if (GET_MODE (operands[1]) == SImode)
gen_divmod4_1 = unsigned_p ? gen_udivmodsi4_1 : gen_divmodsi4_1;
else
gen_divmod4_1
= unsigned_p ? gen_udivmodsi4_zext_2 : gen_divmodsi4_zext_2;
}
else
gen_divmod4_1
= unsigned_p ? gen_udivmodsi4_zext_1 : gen_divmodsi4_zext_1;
break;
case E_DImode:
gen_divmod4_1 = unsigned_p ? gen_udivmoddi4_1 : gen_divmoddi4_1;
break;
default:
gcc_unreachable ();
}
end_label = gen_label_rtx ();
qimode_label = gen_label_rtx ();
scratch = gen_reg_rtx (mode);
/* Use 8bit unsigned divimod if dividend and divisor are within
the range [0-255]. */
emit_move_insn (scratch, operands[2]);
scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
scratch, 1, OPTAB_DIRECT);
emit_insn (gen_test_ccno_1 (mode, scratch, GEN_INT (-0x100)));
tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
gen_rtx_LABEL_REF (VOIDmode, qimode_label),
pc_rtx);
insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0));
predict_jump (REG_BR_PROB_BASE * 50 / 100);
JUMP_LABEL (insn) = qimode_label;
/* Generate original signed/unsigned divimod. */
emit_insn (gen_divmod4_1 (operands[0], operands[1],
operands[2], operands[3]));
/* Branch to the end. */
emit_jump_insn (gen_jump (end_label));
emit_barrier ();
/* Generate 8bit unsigned divide. */
emit_label (qimode_label);
/* Don't use operands[0] for result of 8bit divide since not all
registers support QImode ZERO_EXTRACT. */
tmp0 = lowpart_subreg (HImode, scratch, mode);
tmp1 = lowpart_subreg (HImode, operands[2], mode);
tmp2 = lowpart_subreg (QImode, operands[3], mode);
emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
if (unsigned_p)
{
div = gen_rtx_UDIV (mode, operands[2], operands[3]);
mod = gen_rtx_UMOD (mode, operands[2], operands[3]);
}
else
{
div = gen_rtx_DIV (mode, operands[2], operands[3]);
mod = gen_rtx_MOD (mode, operands[2], operands[3]);
}
if (mode == SImode)
{
if (GET_MODE (operands[0]) != SImode)
div = gen_rtx_ZERO_EXTEND (DImode, div);
if (GET_MODE (operands[1]) != SImode)
mod = gen_rtx_ZERO_EXTEND (DImode, mod);
}
/* Extract remainder from AH. */
scratch = gen_lowpart (GET_MODE (operands[1]), scratch);
tmp1 = gen_rtx_ZERO_EXTRACT (GET_MODE (operands[1]), scratch,
GEN_INT (8), GEN_INT (8));
insn = emit_move_insn (operands[1], tmp1);
set_unique_reg_note (insn, REG_EQUAL, mod);
/* Zero extend quotient from AL. */
tmp1 = gen_lowpart (QImode, tmp0);
insn = emit_insn (gen_extend_insn
(operands[0], tmp1,
GET_MODE (operands[0]), QImode, 1));
set_unique_reg_note (insn, REG_EQUAL, div);
emit_label (end_label);
}
/* Emit x86 binary operand CODE in mode MODE, where the first operand
matches destination. RTX includes clobber of FLAGS_REG. */
void
ix86_emit_binop (enum rtx_code code, machine_mode mode,
rtx dst, rtx src)
{
rtx op, clob;
op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src));
clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
}
/* Return true if regno1 def is nearest to the insn. */
static bool
find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2)
{
rtx_insn *prev = insn;
rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn));
if (insn == start)
return false;
while (prev && prev != start)
{
if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
{
prev = PREV_INSN (prev);
continue;
}
if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
return true;
else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
return false;
prev = PREV_INSN (prev);
}
/* None of the regs is defined in the bb. */
return false;
}
/* INSN_UID of the last insn emitted by zero store peephole2s. */
int ix86_last_zero_store_uid;
/* Split lea instructions into a sequence of instructions
which are executed on ALU to avoid AGU stalls.
It is assumed that it is allowed to clobber flags register
at lea position. */
void
ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode)
{
unsigned int regno0, regno1, regno2;
struct ix86_address parts;
rtx target, tmp;
int ok, adds;
ok = ix86_decompose_address (operands[1], &parts);
gcc_assert (ok);
target = gen_lowpart (mode, operands[0]);
regno0 = true_regnum (target);
regno1 = INVALID_REGNUM;
regno2 = INVALID_REGNUM;
if (parts.base)
{
parts.base = gen_lowpart (mode, parts.base);
regno1 = true_regnum (parts.base);
}
if (parts.index)
{
parts.index = gen_lowpart (mode, parts.index);
regno2 = true_regnum (parts.index);
}
if (parts.disp)
parts.disp = gen_lowpart (mode, parts.disp);
if (parts.scale > 1)
{
/* Case r1 = r1 + ... */
if (regno1 == regno0)
{
/* If we have a case r1 = r1 + C * r2 then we
should use multiplication which is very
expensive. Assume cost model is wrong if we
have such case here. */
gcc_assert (regno2 != regno0);
for (adds = parts.scale; adds > 0; adds--)
ix86_emit_binop (PLUS, mode, target, parts.index);
}
else
{
/* r1 = r2 + r3 * C case. Need to move r3 into r1. */
if (regno0 != regno2)
emit_insn (gen_rtx_SET (target, parts.index));
/* Use shift for scaling, but emit it as MULT instead
to avoid it being immediately peephole2 optimized back
into lea. */
ix86_emit_binop (MULT, mode, target, GEN_INT (parts.scale));
if (parts.base)
ix86_emit_binop (PLUS, mode, target, parts.base);
if (parts.disp && parts.disp != const0_rtx)
ix86_emit_binop (PLUS, mode, target, parts.disp);
}
}
else if (!parts.base && !parts.index)
{
gcc_assert(parts.disp);
emit_insn (gen_rtx_SET (target, parts.disp));
}
else
{
if (!parts.base)
{
if (regno0 != regno2)
emit_insn (gen_rtx_SET (target, parts.index));
}
else if (!parts.index)
{
if (regno0 != regno1)
emit_insn (gen_rtx_SET (target, parts.base));
}
else
{
if (regno0 == regno1)
tmp = parts.index;
else if (regno0 == regno2)
tmp = parts.base;
else
{
rtx tmp1;
/* Find better operand for SET instruction, depending
on which definition is farther from the insn. */
if (find_nearest_reg_def (insn, regno1, regno2))
tmp = parts.index, tmp1 = parts.base;
else
tmp = parts.base, tmp1 = parts.index;
emit_insn (gen_rtx_SET (target, tmp));
if (parts.disp && parts.disp != const0_rtx)
ix86_emit_binop (PLUS, mode, target, parts.disp);
ix86_emit_binop (PLUS, mode, target, tmp1);
return;
}
ix86_emit_binop (PLUS, mode, target, tmp);
}
if (parts.disp && parts.disp != const0_rtx)
ix86_emit_binop (PLUS, mode, target, parts.disp);
}
}
/* Post-reload splitter for converting an SF or DFmode value in an
SSE register into an unsigned SImode. */
void
ix86_split_convert_uns_si_sse (rtx operands[])
{
machine_mode vecmode;
rtx value, large, zero_or_two31, input, two31, x;
large = operands[1];
zero_or_two31 = operands[2];
input = operands[3];
two31 = operands[4];
vecmode = GET_MODE (large);
value = gen_rtx_REG (vecmode, REGNO (operands[0]));
/* Load up the value into the low element. We must ensure that the other
elements are valid floats -- zero is the easiest such value. */
if (MEM_P (input))
{
if (vecmode == V4SFmode)
emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
else
emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
}
else
{
input = gen_rtx_REG (vecmode, REGNO (input));
emit_move_insn (value, CONST0_RTX (vecmode));
if (vecmode == V4SFmode)
emit_insn (gen_sse_movss (value, value, input));
else
emit_insn (gen_sse2_movsd (value, value, input));
}
emit_move_insn (large, two31);
emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
x = gen_rtx_fmt_ee (LE, vecmode, large, value);
emit_insn (gen_rtx_SET (large, x));
x = gen_rtx_AND (vecmode, zero_or_two31, large);
emit_insn (gen_rtx_SET (zero_or_two31, x));
x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
emit_insn (gen_rtx_SET (value, x));
large = gen_rtx_REG (V4SImode, REGNO (large));
emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
x = gen_rtx_REG (V4SImode, REGNO (value));
if (vecmode == V4SFmode)
emit_insn (gen_fix_truncv4sfv4si2 (x, value));
else
emit_insn (gen_sse2_cvttpd2dq (x, value));
value = x;
emit_insn (gen_xorv4si3 (value, value, large));
}
static bool ix86_expand_vector_init_one_nonzero (bool mmx_ok,
machine_mode mode, rtx target,
rtx var, int one_var);
/* Convert an unsigned DImode value into a DFmode, using only SSE.
Expects the 64-bit DImode to be supplied in a pair of integral
registers. Requires SSE2; will use SSE3 if available. For x86_32,
-mfpmath=sse, !optimize_size only. */
void
ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
{
REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
rtx int_xmm, fp_xmm;
rtx biases, exponents;
rtx x;
int_xmm = gen_reg_rtx (V4SImode);
if (TARGET_INTER_UNIT_MOVES_TO_VEC)
emit_insn (gen_movdi_to_sse (int_xmm, input));
else if (TARGET_SSE_SPLIT_REGS)
{
emit_clobber (int_xmm);
emit_move_insn (gen_lowpart (DImode, int_xmm), input);
}
else
{
x = gen_reg_rtx (V2DImode);
ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
}
x = gen_rtx_CONST_VECTOR (V4SImode,
gen_rtvec (4, GEN_INT (0x43300000UL),
GEN_INT (0x45300000UL),
const0_rtx, const0_rtx));
exponents = validize_mem (force_const_mem (V4SImode, x));
/* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
/* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
Similarly (0x45300000UL ## fp_value_hi_xmm) yields
(0x1.0p84 + double(fp_value_hi_xmm)).
Note these exponents differ by 32. */
fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
/* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
real_ldexp (&bias_lo_rvt, &dconst1, 52);
real_ldexp (&bias_hi_rvt, &dconst1, 84);
biases = const_double_from_real_value (bias_lo_rvt, DFmode);
x = const_double_from_real_value (bias_hi_rvt, DFmode);
biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
biases = validize_mem (force_const_mem (V2DFmode, biases));
emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
/* Add the upper and lower DFmode values together. */
if (TARGET_SSE3)
emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
else
{
x = copy_to_mode_reg (V2DFmode, fp_xmm);
emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
}
ix86_expand_vector_extract (false, target, fp_xmm, 0);
}
/* Not used, but eases macroization of patterns. */
void
ix86_expand_convert_uns_sixf_sse (rtx, rtx)
{
gcc_unreachable ();
}
static rtx ix86_expand_sse_fabs (rtx op0, rtx *smask);
/* Convert an unsigned SImode value into a DFmode. Only currently used
for SSE, but applicable anywhere. */
void
ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
{
REAL_VALUE_TYPE TWO31r;
rtx x, fp;
x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
NULL, 1, OPTAB_DIRECT);
fp = gen_reg_rtx (DFmode);
emit_insn (gen_floatsidf2 (fp, x));
real_ldexp (&TWO31r, &dconst1, 31);
x = const_double_from_real_value (TWO31r, DFmode);
x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
/* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
if (HONOR_SIGNED_ZEROS (DFmode) && flag_rounding_math)
x = ix86_expand_sse_fabs (x, NULL);
if (x != target)
emit_move_insn (target, x);
}
/* Convert a signed DImode value into a DFmode. Only used for SSE in
32-bit mode; otherwise we have a direct convert instruction. */
void
ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
{
REAL_VALUE_TYPE TWO32r;
rtx fp_lo, fp_hi, x;
fp_lo = gen_reg_rtx (DFmode);
fp_hi = gen_reg_rtx (DFmode);
emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
real_ldexp (&TWO32r, &dconst1, 32);
x = const_double_from_real_value (TWO32r, DFmode);
fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
0, OPTAB_DIRECT);
if (x != target)
emit_move_insn (target, x);
}
/* Convert an unsigned SImode value into a SFmode, using only SSE.
For x86_32, -mfpmath=sse, !optimize_size only. */
void
ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
{
REAL_VALUE_TYPE ONE16r;
rtx fp_hi, fp_lo, int_hi, int_lo, x;
real_ldexp (&ONE16r, &dconst1, 16);
x = const_double_from_real_value (ONE16r, SFmode);
int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
NULL, 0, OPTAB_DIRECT);
int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
NULL, 0, OPTAB_DIRECT);
fp_hi = gen_reg_rtx (SFmode);
fp_lo = gen_reg_rtx (SFmode);
emit_insn (gen_floatsisf2 (fp_hi, int_hi));
emit_insn (gen_floatsisf2 (fp_lo, int_lo));
if (TARGET_FMA)
{
x = validize_mem (force_const_mem (SFmode, x));
fp_hi = gen_rtx_FMA (SFmode, fp_hi, x, fp_lo);
emit_move_insn (target, fp_hi);
}
else
{
fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
0, OPTAB_DIRECT);
fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
0, OPTAB_DIRECT);
if (!rtx_equal_p (target, fp_hi))
emit_move_insn (target, fp_hi);
}
}
/* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
a vector of unsigned ints VAL to vector of floats TARGET. */
void
ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
{
rtx tmp[8];
REAL_VALUE_TYPE TWO16r;
machine_mode intmode = GET_MODE (val);
machine_mode fltmode = GET_MODE (target);
rtx (*cvt) (rtx, rtx);
if (intmode == V4SImode)
cvt = gen_floatv4siv4sf2;
else
cvt = gen_floatv8siv8sf2;
tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
tmp[0] = force_reg (intmode, tmp[0]);
tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
OPTAB_DIRECT);
tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
NULL_RTX, 1, OPTAB_DIRECT);
tmp[3] = gen_reg_rtx (fltmode);
emit_insn (cvt (tmp[3], tmp[1]));
tmp[4] = gen_reg_rtx (fltmode);
emit_insn (cvt (tmp[4], tmp[2]));
real_ldexp (&TWO16r, &dconst1, 16);
tmp[5] = const_double_from_real_value (TWO16r, SFmode);
tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
if (TARGET_FMA)
{
tmp[6] = gen_rtx_FMA (fltmode, tmp[4], tmp[5], tmp[3]);
emit_move_insn (target, tmp[6]);
}
else
{
tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5],
NULL_RTX, 1, OPTAB_DIRECT);
tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6],
target, 1, OPTAB_DIRECT);
if (tmp[7] != target)
emit_move_insn (target, tmp[7]);
}
}
/* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
This is done by doing just signed conversion if < 0x1p31, and otherwise by
subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
rtx
ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
{
REAL_VALUE_TYPE TWO31r;
rtx two31r, tmp[4];
machine_mode mode = GET_MODE (val);
machine_mode scalarmode = GET_MODE_INNER (mode);
machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
rtx (*cmp) (rtx, rtx, rtx, rtx);
int i;
for (i = 0; i < 3; i++)
tmp[i] = gen_reg_rtx (mode);
real_ldexp (&TWO31r, &dconst1, 31);
two31r = const_double_from_real_value (TWO31r, scalarmode);
two31r = ix86_build_const_vector (mode, 1, two31r);
two31r = force_reg (mode, two31r);
switch (mode)
{
case E_V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
case E_V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
case E_V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
case E_V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
default: gcc_unreachable ();
}
tmp[3] = gen_rtx_LE (mode, two31r, val);
emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
0, OPTAB_DIRECT);
if (intmode == V4SImode || TARGET_AVX2)
*xorp = expand_simple_binop (intmode, ASHIFT,
gen_lowpart (intmode, tmp[0]),
GEN_INT (31), NULL_RTX, 0,
OPTAB_DIRECT);
else
{
rtx two31 = gen_int_mode (HOST_WIDE_INT_1U << 31, SImode);
two31 = ix86_build_const_vector (intmode, 1, two31);
*xorp = expand_simple_binop (intmode, AND,
gen_lowpart (intmode, tmp[0]),
two31, NULL_RTX, 0,
OPTAB_DIRECT);
}
return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
0, OPTAB_DIRECT);
}
/* Generate code for floating point ABS or NEG. */
void
ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode,
rtx operands[])
{
rtx set, dst, src;
bool use_sse = false;
bool vector_mode = VECTOR_MODE_P (mode);
machine_mode vmode = mode;
rtvec par;
if (vector_mode || mode == TFmode || mode == HFmode)
{
use_sse = true;
if (mode == HFmode)
vmode = V8HFmode;
}
else if (TARGET_SSE_MATH)
{
use_sse = SSE_FLOAT_MODE_P (mode);
if (mode == SFmode)
vmode = V4SFmode;
else if (mode == DFmode)
vmode = V2DFmode;
}
dst = operands[0];
src = operands[1];
set = gen_rtx_fmt_e (code, mode, src);
set = gen_rtx_SET (dst, set);
if (use_sse)
{
rtx mask, use, clob;
/* NEG and ABS performed with SSE use bitwise mask operations.
Create the appropriate mask now. */
mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
use = gen_rtx_USE (VOIDmode, mask);
if (vector_mode || mode == TFmode)
par = gen_rtvec (2, set, use);
else
{
clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
par = gen_rtvec (3, set, use, clob);
}
}
else
{
rtx clob;
/* Changing of sign for FP values is doable using integer unit too. */
clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
par = gen_rtvec (2, set, clob);
}
emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
}
/* Deconstruct a floating point ABS or NEG operation
with integer registers into integer operations. */
void
ix86_split_fp_absneg_operator (enum rtx_code code, machine_mode mode,
rtx operands[])
{
enum rtx_code absneg_op;
rtx dst, set;
gcc_assert (operands_match_p (operands[0], operands[1]));
switch (mode)
{
case E_SFmode:
dst = gen_lowpart (SImode, operands[0]);
if (code == ABS)
{
set = gen_int_mode (0x7fffffff, SImode);
absneg_op = AND;
}
else
{
set = gen_int_mode (0x80000000, SImode);
absneg_op = XOR;
}
set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
break;
case E_DFmode:
if (TARGET_64BIT)
{
dst = gen_lowpart (DImode, operands[0]);
dst = gen_rtx_ZERO_EXTRACT (DImode, dst, const1_rtx, GEN_INT (63));
if (code == ABS)
set = const0_rtx;
else
set = gen_rtx_NOT (DImode, dst);
}
else
{
dst = gen_highpart (SImode, operands[0]);
if (code == ABS)
{
set = gen_int_mode (0x7fffffff, SImode);
absneg_op = AND;
}
else
{
set = gen_int_mode (0x80000000, SImode);
absneg_op = XOR;
}
set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
}
break;
case E_XFmode:
dst = gen_rtx_REG (SImode,
REGNO (operands[0]) + (TARGET_64BIT ? 1 : 2));
if (code == ABS)
{
set = GEN_INT (0x7fff);
absneg_op = AND;
}
else
{
set = GEN_INT (0x8000);
absneg_op = XOR;
}
set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
break;
default:
gcc_unreachable ();
}
set = gen_rtx_SET (dst, set);
rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
rtvec par = gen_rtvec (2, set, clob);
emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
}
/* Expand a copysign operation. Special case operand 0 being a constant. */
void
ix86_expand_copysign (rtx operands[])
{
machine_mode mode, vmode;
rtx dest, op0, op1, mask, op2, op3;
mode = GET_MODE (operands[0]);
if (mode == HFmode)
vmode = V8HFmode;
else if (mode == SFmode)
vmode = V4SFmode;
else if (mode == DFmode)
vmode = V2DFmode;
else if (mode == TFmode)
vmode = mode;
else
gcc_unreachable ();
if (rtx_equal_p (operands[1], operands[2]))
{
emit_move_insn (operands[0], operands[1]);
return;
}
dest = lowpart_subreg (vmode, operands[0], mode);
op1 = lowpart_subreg (vmode, operands[2], mode);
mask = ix86_build_signbit_mask (vmode, 0, 0);
if (CONST_DOUBLE_P (operands[1]))
{
op0 = simplify_unary_operation (ABS, mode, operands[1], mode);
/* Optimize for 0, simplify b = copy_signf (0.0f, a) to b = mask & a. */
if (op0 == CONST0_RTX (mode))
{
emit_move_insn (dest, gen_rtx_AND (vmode, mask, op1));
return;
}
if (GET_MODE_SIZE (mode) < 16)
op0 = ix86_build_const_vector (vmode, false, op0);
op0 = force_reg (vmode, op0);
}
else
op0 = lowpart_subreg (vmode, operands[1], mode);
op2 = gen_reg_rtx (vmode);
op3 = gen_reg_rtx (vmode);
emit_move_insn (op2, gen_rtx_AND (vmode,
gen_rtx_NOT (vmode, mask),
op0));
emit_move_insn (op3, gen_rtx_AND (vmode, mask, op1));
emit_move_insn (dest, gen_rtx_IOR (vmode, op2, op3));
}
/* Expand an xorsign operation. */
void
ix86_expand_xorsign (rtx operands[])
{
machine_mode mode, vmode;
rtx dest, op0, op1, mask, x, temp;
dest = operands[0];
op0 = operands[1];
op1 = operands[2];
mode = GET_MODE (dest);
if (mode == HFmode)
vmode = V8HFmode;
else if (mode == SFmode)
vmode = V4SFmode;
else if (mode == DFmode)
vmode = V2DFmode;
else
gcc_unreachable ();
temp = gen_reg_rtx (vmode);
mask = ix86_build_signbit_mask (vmode, 0, 0);
op1 = lowpart_subreg (vmode, op1, mode);
x = gen_rtx_AND (vmode, op1, mask);
emit_insn (gen_rtx_SET (temp, x));
op0 = lowpart_subreg (vmode, op0, mode);
x = gen_rtx_XOR (vmode, temp, op0);
dest = lowpart_subreg (vmode, dest, mode);
emit_insn (gen_rtx_SET (dest, x));
}
static rtx ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1);
void
ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
{
machine_mode mode = GET_MODE (op0);
rtx tmp;
/* Handle special case - vector comparsion with boolean result, transform
it using ptest instruction. */
if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
{
rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;
gcc_assert (code == EQ || code == NE);
/* Generate XOR since we can't check that one operand is zero vector. */
tmp = gen_reg_rtx (mode);
emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
tmp = gen_lowpart (p_mode, tmp);
emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG),
gen_rtx_UNSPEC (CCmode,
gen_rtvec (2, tmp, tmp),
UNSPEC_PTEST)));
tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
gen_rtx_LABEL_REF (VOIDmode, label),
pc_rtx);
emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
return;
}
switch (mode)
{
case E_HFmode:
case E_SFmode:
case E_DFmode:
case E_XFmode:
case E_QImode:
case E_HImode:
case E_SImode:
simple:
tmp = ix86_expand_compare (code, op0, op1);
tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
gen_rtx_LABEL_REF (VOIDmode, label),
pc_rtx);
emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
return;
case E_DImode:
if (TARGET_64BIT)
goto simple;
/* For 32-bit target DI comparison may be performed on
SSE registers. To allow this we should avoid split
to SI mode which is achieved by doing xor in DI mode
and then comparing with zero (which is recognized by
STV pass). We don't compare using xor when optimizing
for size. */
if (!optimize_insn_for_size_p ()
&& TARGET_STV
&& (code == EQ || code == NE))
{
op0 = force_reg (mode, gen_rtx_XOR (mode, op0, op1));
op1 = const0_rtx;
}
/* FALLTHRU */
case E_TImode:
/* Expand DImode branch into multiple compare+branch. */
{
rtx lo[2], hi[2];
rtx_code_label *label2;
enum rtx_code code1, code2, code3;
machine_mode submode;
if (CONSTANT_P (op0) && !CONSTANT_P (op1))
{
std::swap (op0, op1);
code = swap_condition (code);
}
split_double_mode (mode, &op0, 1, lo+0, hi+0);
split_double_mode (mode, &op1, 1, lo+1, hi+1);
submode = mode == DImode ? SImode : DImode;
/* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
avoid two branches. This costs one extra insn, so disable when
optimizing for size. */
if ((code == EQ || code == NE)
&& (!optimize_insn_for_size_p ()
|| hi[1] == const0_rtx || lo[1] == const0_rtx))
{
rtx xor0, xor1;
xor1 = hi[0];
if (hi[1] != const0_rtx)
xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
NULL_RTX, 0, OPTAB_WIDEN);
xor0 = lo[0];
if (lo[1] != const0_rtx)
xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
NULL_RTX, 0, OPTAB_WIDEN);
tmp = expand_binop (submode, ior_optab, xor1, xor0,
NULL_RTX, 0, OPTAB_WIDEN);
ix86_expand_branch (code, tmp, const0_rtx, label);
return;
}
/* Otherwise, if we are doing less-than or greater-or-equal-than,
op1 is a constant and the low word is zero, then we can just
examine the high word. Similarly for low word -1 and
less-or-equal-than or greater-than. */
if (CONST_INT_P (hi[1]))
switch (code)
{
case LT: case LTU: case GE: case GEU:
if (lo[1] == const0_rtx)
{
ix86_expand_branch (code, hi[0], hi[1], label);
return;
}
break;
case LE: case LEU: case GT: case GTU:
if (lo[1] == constm1_rtx)
{
ix86_expand_branch (code, hi[0], hi[1], label);
return;
}
break;
default:
break;
}
/* Emulate comparisons that do not depend on Zero flag with
double-word subtraction. Note that only Overflow, Sign
and Carry flags are valid, so swap arguments and condition
of comparisons that would otherwise test Zero flag. */
switch (code)
{
case LE: case LEU: case GT: case GTU:
std::swap (lo[0], lo[1]);
std::swap (hi[0], hi[1]);
code = swap_condition (code);
/* FALLTHRU */
case LT: case LTU: case GE: case GEU:
{
bool uns = (code == LTU || code == GEU);
rtx (*sbb_insn) (machine_mode, rtx, rtx, rtx)
= uns ? gen_sub3_carry_ccc : gen_sub3_carry_ccgz;
if (!nonimmediate_operand (lo[0], submode))
lo[0] = force_reg (submode, lo[0]);
if (!x86_64_general_operand (lo[1], submode))
lo[1] = force_reg (submode, lo[1]);
if (!register_operand (hi[0], submode))
hi[0] = force_reg (submode, hi[0]);
if ((uns && !nonimmediate_operand (hi[1], submode))
|| (!uns && !x86_64_general_operand (hi[1], submode)))
hi[1] = force_reg (submode, hi[1]);
emit_insn (gen_cmp_1 (submode, lo[0], lo[1]));
tmp = gen_rtx_SCRATCH (submode);
emit_insn (sbb_insn (submode, tmp, hi[0], hi[1]));
tmp = gen_rtx_REG (uns ? CCCmode : CCGZmode, FLAGS_REG);
ix86_expand_branch (code, tmp, const0_rtx, label);
return;
}
default:
break;
}
/* Otherwise, we need two or three jumps. */
label2 = gen_label_rtx ();
code1 = code;
code2 = swap_condition (code);
code3 = unsigned_condition (code);
switch (code)
{
case LT: case GT: case LTU: case GTU:
break;
case LE: code1 = LT; code2 = GT; break;
case GE: code1 = GT; code2 = LT; break;
case LEU: code1 = LTU; code2 = GTU; break;
case GEU: code1 = GTU; code2 = LTU; break;
case EQ: code1 = UNKNOWN; code2 = NE; break;
case NE: code2 = UNKNOWN; break;
default:
gcc_unreachable ();
}
/*
* a < b =>
* if (hi(a) < hi(b)) goto true;
* if (hi(a) > hi(b)) goto false;
* if (lo(a) < lo(b)) goto true;
* false:
*/
if (code1 != UNKNOWN)
ix86_expand_branch (code1, hi[0], hi[1], label);
if (code2 != UNKNOWN)
ix86_expand_branch (code2, hi[0], hi[1], label2);
ix86_expand_branch (code3, lo[0], lo[1], label);
if (code2 != UNKNOWN)
emit_label (label2);
return;
}
default:
gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
goto simple;
}
}
/* Figure out whether to use unordered fp comparisons. */
static bool
ix86_unordered_fp_compare (enum rtx_code code)
{
if (!TARGET_IEEE_FP)
return false;
switch (code)
{
case LT:
case LE:
case GT:
case GE:
case LTGT:
return false;
case EQ:
case NE:
case UNORDERED:
case ORDERED:
case UNLT:
case UNLE:
case UNGT:
case UNGE:
case UNEQ:
return true;
default:
gcc_unreachable ();
}
}
/* Return a comparison we can do and that it is equivalent to
swap_condition (code) apart possibly from orderedness.
But, never change orderedness if TARGET_IEEE_FP, returning
UNKNOWN in that case if necessary. */
static enum rtx_code
ix86_fp_swap_condition (enum rtx_code code)
{
switch (code)
{
case GT: /* GTU - CF=0 & ZF=0 */
return TARGET_IEEE_FP ? UNKNOWN : UNLT;
case GE: /* GEU - CF=0 */
return TARGET_IEEE_FP ? UNKNOWN : UNLE;
case UNLT: /* LTU - CF=1 */
return TARGET_IEEE_FP ? UNKNOWN : GT;
case UNLE: /* LEU - CF=1 | ZF=1 */
return TARGET_IEEE_FP ? UNKNOWN : GE;
default:
return swap_condition (code);
}
}
/* Return cost of comparison CODE using the best strategy for performance.
All following functions do use number of instructions as a cost metrics.
In future this should be tweaked to compute bytes for optimize_size and
take into account performance of various instructions on various CPUs. */
static int
ix86_fp_comparison_cost (enum rtx_code code)
{
int arith_cost;
/* The cost of code using bit-twiddling on %ah. */
switch (code)
{
case UNLE:
case UNLT:
case LTGT:
case GT:
case GE:
case UNORDERED:
case ORDERED:
case UNEQ:
arith_cost = 4;
break;
case LT:
case NE:
case EQ:
case UNGE:
arith_cost = TARGET_IEEE_FP ? 5 : 4;
break;
case LE:
case UNGT:
arith_cost = TARGET_IEEE_FP ? 6 : 4;
break;
default:
gcc_unreachable ();
}
switch (ix86_fp_comparison_strategy (code))
{
case IX86_FPCMP_COMI:
return arith_cost > 4 ? 3 : 2;
case IX86_FPCMP_SAHF:
return arith_cost > 4 ? 4 : 3;
default:
return arith_cost;
}
}
/* Swap, force into registers, or otherwise massage the two operands
to a fp comparison. The operands are updated in place; the new
comparison code is returned. */
static enum rtx_code
ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
{
bool unordered_compare = ix86_unordered_fp_compare (code);
rtx op0 = *pop0, op1 = *pop1;
machine_mode op_mode = GET_MODE (op0);
bool is_sse = SSE_FLOAT_MODE_SSEMATH_OR_HF_P (op_mode);
/* All of the unordered compare instructions only work on registers.
The same is true of the fcomi compare instructions. The XFmode
compare instructions require registers except when comparing
against zero or when converting operand 1 from fixed point to
floating point. */
if (!is_sse
&& (unordered_compare
|| (op_mode == XFmode
&& ! (standard_80387_constant_p (op0) == 1
|| standard_80387_constant_p (op1) == 1)
&& GET_CODE (op1) != FLOAT)
|| ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
{
op0 = force_reg (op_mode, op0);
op1 = force_reg (op_mode, op1);
}
else
{
/* %%% We only allow op1 in memory; op0 must be st(0). So swap
things around if they appear profitable, otherwise force op0
into a register. */
if (standard_80387_constant_p (op0) == 0
|| (MEM_P (op0)
&& ! (standard_80387_constant_p (op1) == 0
|| MEM_P (op1))))
{
enum rtx_code new_code = ix86_fp_swap_condition (code);
if (new_code != UNKNOWN)
{
std::swap (op0, op1);
code = new_code;
}
}
if (!REG_P (op0))
op0 = force_reg (op_mode, op0);
if (CONSTANT_P (op1))
{
int tmp = standard_80387_constant_p (op1);
if (tmp == 0)
op1 = validize_mem (force_const_mem (op_mode, op1));
else if (tmp == 1)
{
if (TARGET_CMOVE)
op1 = force_reg (op_mode, op1);
}
else
op1 = force_reg (op_mode, op1);
}
}
/* Try to rearrange the comparison to make it cheaper. */
if (ix86_fp_comparison_cost (code)
> ix86_fp_comparison_cost (swap_condition (code))
&& (REG_P (op1) || can_create_pseudo_p ()))
{
std::swap (op0, op1);
code = swap_condition (code);
if (!REG_P (op0))
op0 = force_reg (op_mode, op0);
}
*pop0 = op0;
*pop1 = op1;
return code;
}
/* Generate insn patterns to do a floating point compare of OPERANDS. */
static rtx
ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1)
{
bool unordered_compare = ix86_unordered_fp_compare (code);
machine_mode cmp_mode;
rtx tmp, scratch;
code = ix86_prepare_fp_compare_args (code, &op0, &op1);
tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
if (unordered_compare)
tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
/* Do fcomi/sahf based test when profitable. */
switch (ix86_fp_comparison_strategy (code))
{
case IX86_FPCMP_COMI:
cmp_mode = CCFPmode;
emit_insn (gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp));
break;
case IX86_FPCMP_SAHF:
cmp_mode = CCFPmode;
tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
scratch = gen_reg_rtx (HImode);
emit_insn (gen_rtx_SET (scratch, tmp));
emit_insn (gen_x86_sahf_1 (scratch));
break;
case IX86_FPCMP_ARITH:
cmp_mode = CCNOmode;
tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
scratch = gen_reg_rtx (HImode);
emit_insn (gen_rtx_SET (scratch, tmp));
/* In the unordered case, we have to check C2 for NaN's, which
doesn't happen to work out to anything nice combination-wise.
So do some bit twiddling on the value we've got in AH to come
up with an appropriate set of condition codes. */
switch (code)
{
case GT:
case UNGT:
if (code == GT || !TARGET_IEEE_FP)
{
emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
code = EQ;
}
else
{
emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
cmp_mode = CCmode;
code = GEU;
}
break;
case LT:
case UNLT:
if (code == LT && TARGET_IEEE_FP)
{
emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
cmp_mode = CCmode;
code = EQ;
}
else
{
emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx));
code = NE;
}
break;
case GE:
case UNGE:
if (code == GE || !TARGET_IEEE_FP)
{
emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05)));
code = EQ;
}
else
{
emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx));
code = NE;
}
break;
case LE:
case UNLE:
if (code == LE && TARGET_IEEE_FP)
{
emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
cmp_mode = CCmode;
code = LTU;
}
else
{
emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
code = NE;
}
break;
case EQ:
case UNEQ:
if (code == EQ && TARGET_IEEE_FP)
{
emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
cmp_mode = CCmode;
code = EQ;
}
else
{
emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
code = NE;
}
break;
case NE:
case LTGT:
if (code == NE && TARGET_IEEE_FP)
{
emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
emit_insn (gen_xorqi_ext_1_cc (scratch, scratch,
GEN_INT (0x40)));
code = NE;
}
else
{
emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
code = EQ;
}
break;
case UNORDERED:
emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
code = NE;
break;
case ORDERED:
emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
code = EQ;
break;
default:
gcc_unreachable ();
}
break;
default:
gcc_unreachable();
}
/* Return the test that should be put into the flags user, i.e.