| /* Copyright (C) 1988-2021 Free Software Foundation, Inc. |
| |
| This file is part of GCC. |
| |
| GCC is free software; you can redistribute it and/or modify |
| it under the terms of the GNU General Public License as published by |
| the Free Software Foundation; either version 3, or (at your option) |
| any later version. |
| |
| GCC is distributed in the hope that it will be useful, |
| but WITHOUT ANY WARRANTY; without even the implied warranty of |
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| GNU General Public License for more details. |
| |
| You should have received a copy of the GNU General Public License |
| along with GCC; see the file COPYING3. If not see |
| <http://www.gnu.org/licenses/>. */ |
| |
| #define IN_TARGET_CODE 1 |
| |
| #include "config.h" |
| #include "system.h" |
| #include "coretypes.h" |
| #include "backend.h" |
| #include "rtl.h" |
| #include "tree.h" |
| #include "memmodel.h" |
| #include "gimple.h" |
| #include "cfghooks.h" |
| #include "cfgloop.h" |
| #include "df.h" |
| #include "tm_p.h" |
| #include "stringpool.h" |
| #include "expmed.h" |
| #include "optabs.h" |
| #include "regs.h" |
| #include "emit-rtl.h" |
| #include "recog.h" |
| #include "cgraph.h" |
| #include "diagnostic.h" |
| #include "cfgbuild.h" |
| #include "alias.h" |
| #include "fold-const.h" |
| #include "attribs.h" |
| #include "calls.h" |
| #include "stor-layout.h" |
| #include "varasm.h" |
| #include "output.h" |
| #include "insn-attr.h" |
| #include "flags.h" |
| #include "except.h" |
| #include "explow.h" |
| #include "expr.h" |
| #include "cfgrtl.h" |
| #include "common/common-target.h" |
| #include "langhooks.h" |
| #include "reload.h" |
| #include "gimplify.h" |
| #include "dwarf2.h" |
| #include "tm-constrs.h" |
| #include "cselib.h" |
| #include "sched-int.h" |
| #include "opts.h" |
| #include "tree-pass.h" |
| #include "context.h" |
| #include "pass_manager.h" |
| #include "target-globals.h" |
| #include "gimple-iterator.h" |
| #include "tree-vectorizer.h" |
| #include "shrink-wrap.h" |
| #include "builtins.h" |
| #include "rtl-iter.h" |
| #include "tree-iterator.h" |
| #include "dbgcnt.h" |
| #include "case-cfn-macros.h" |
| #include "dojump.h" |
| #include "fold-const-call.h" |
| #include "tree-vrp.h" |
| #include "tree-ssanames.h" |
| #include "selftest.h" |
| #include "selftest-rtl.h" |
| #include "print-rtl.h" |
| #include "intl.h" |
| #include "ifcvt.h" |
| #include "symbol-summary.h" |
| #include "ipa-prop.h" |
| #include "ipa-fnsummary.h" |
| #include "wide-int-bitmask.h" |
| #include "tree-vector-builder.h" |
| #include "debug.h" |
| #include "dwarf2out.h" |
| #include "i386-options.h" |
| #include "i386-builtins.h" |
| #include "i386-expand.h" |
| |
| /* Split one or more double-mode RTL references into pairs of half-mode |
| references. The RTL can be REG, offsettable MEM, integer constant, or |
| CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to |
| split and "num" is its length. lo_half and hi_half are output arrays |
| that parallel "operands". */ |
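| /* For example, splitting a TImode pseudo (reg:TI 100) yields |
| (subreg:DI (reg:TI 100) 0) as the low half and |
| (subreg:DI (reg:TI 100) 8) as the high half, while a TImode MEM is |
| split into two adjacent DImode MEMs at offsets 0 and 8. */ |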
| |
| void |
| split_double_mode (machine_mode mode, rtx operands[], |
| int num, rtx lo_half[], rtx hi_half[]) |
| { |
| machine_mode half_mode; |
| unsigned int byte; |
| rtx mem_op = NULL_RTX; |
| int mem_num = 0; |
| |
| switch (mode) |
| { |
| case E_TImode: |
| half_mode = DImode; |
| break; |
| case E_DImode: |
| half_mode = SImode; |
| break; |
| case E_P2HImode: |
| half_mode = HImode; |
| break; |
| case E_P2QImode: |
| half_mode = QImode; |
| break; |
| default: |
| gcc_unreachable (); |
| } |
| |
| byte = GET_MODE_SIZE (half_mode); |
| |
| while (num--) |
| { |
| rtx op = operands[num]; |
| |
| /* simplify_subreg refuses to split volatile memory addresses, |
| but we still have to handle them. */ |
| if (MEM_P (op)) |
| { |
| if (mem_op && rtx_equal_p (op, mem_op)) |
| { |
| lo_half[num] = lo_half[mem_num]; |
| hi_half[num] = hi_half[mem_num]; |
| } |
| else |
| { |
| mem_op = op; |
| mem_num = num; |
| lo_half[num] = adjust_address (op, half_mode, 0); |
| hi_half[num] = adjust_address (op, half_mode, byte); |
| } |
| } |
| else |
| { |
| lo_half[num] = simplify_gen_subreg (half_mode, op, |
| GET_MODE (op) == VOIDmode |
| ? mode : GET_MODE (op), 0); |
| |
| rtx tmp = simplify_gen_subreg (half_mode, op, |
| GET_MODE (op) == VOIDmode |
| ? mode : GET_MODE (op), byte); |
| /* simplify_gen_subreg will return NULL RTX for the |
| high half of the paradoxical subreg. */ |
| hi_half[num] = tmp ? tmp : gen_reg_rtx (half_mode); |
| } |
| } |
| } |
| |
| /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate |
| for the target. */ |
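| /* For instance, clearing %eax normally becomes "xor %eax, %eax" wrapped |
| in a PARALLEL with a FLAGS_REG clobber; only when TARGET_USE_MOV0 is set |
| and we are not optimizing for size does it stay a plain "mov $0, %eax". */ |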
| |
| void |
| ix86_expand_clear (rtx dest) |
| { |
| rtx tmp; |
| |
| /* We play register width games, which are only valid after reload. */ |
| gcc_assert (reload_completed); |
| |
| /* Avoid HImode and its attendant prefix byte. */ |
| if (GET_MODE_SIZE (GET_MODE (dest)) < 4) |
| dest = gen_rtx_REG (SImode, REGNO (dest)); |
| tmp = gen_rtx_SET (dest, const0_rtx); |
| |
| if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ()) |
| { |
| rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG)); |
| tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob)); |
| } |
| |
| emit_insn (tmp); |
| } |
| |
| /* Return true if V can be broadcast from an integer of WIDTH bits, |
| which is returned in VAL_BROADCAST. Otherwise, return false. */ |
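| /* E.g. 0x1234567812345678 can be broadcast from the 32-bit value |
| 0x12345678, but not from any narrower width. */ |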
| |
| static bool |
| ix86_broadcast (HOST_WIDE_INT v, unsigned int width, |
| HOST_WIDE_INT &val_broadcast) |
| { |
| wide_int val = wi::uhwi (v, HOST_BITS_PER_WIDE_INT); |
| val_broadcast = wi::extract_uhwi (val, 0, width); |
| for (unsigned int i = width; i < HOST_BITS_PER_WIDE_INT; i += width) |
| { |
| HOST_WIDE_INT each = wi::extract_uhwi (val, i, width); |
| if (val_broadcast != each) |
| return false; |
| } |
| val_broadcast = sext_hwi (val_broadcast, width); |
| return true; |
| } |
| |
| /* Convert the CONST_WIDE_INT operand OP into a broadcast in MODE, or |
| return NULL if that is not possible. */ |
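| /* A sketch of the idea: a 256-bit CONST_WIDE_INT whose 64-bit elements |
| all equal 0x0101010101010101 can typically be materialized as a |
| vpbroadcastb of the byte 0x01 instead of a constant pool load, provided |
| AVX2 and direct GPR-to-SSE moves are available. */ |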
| |
| static rtx |
| ix86_convert_const_wide_int_to_broadcast (machine_mode mode, rtx op) |
| { |
| /* Don't use integer vector broadcast if we can't move from GPR to SSE |
| register directly. */ |
| if (!TARGET_INTER_UNIT_MOVES_TO_VEC) |
| return nullptr; |
| |
| /* Convert CONST_WIDE_INT to a non-standard SSE constant integer |
| broadcast only if vector broadcast is available. */ |
| if (!TARGET_AVX |
| || !CONST_WIDE_INT_P (op) |
| || standard_sse_constant_p (op, mode)) |
| return nullptr; |
| |
| HOST_WIDE_INT val = CONST_WIDE_INT_ELT (op, 0); |
| HOST_WIDE_INT val_broadcast; |
| scalar_int_mode broadcast_mode; |
| if (TARGET_AVX2 |
| && ix86_broadcast (val, GET_MODE_BITSIZE (QImode), |
| val_broadcast)) |
| broadcast_mode = QImode; |
| else if (TARGET_AVX2 |
| && ix86_broadcast (val, GET_MODE_BITSIZE (HImode), |
| val_broadcast)) |
| broadcast_mode = HImode; |
| else if (ix86_broadcast (val, GET_MODE_BITSIZE (SImode), |
| val_broadcast)) |
| broadcast_mode = SImode; |
| else if (TARGET_64BIT |
| && ix86_broadcast (val, GET_MODE_BITSIZE (DImode), |
| val_broadcast)) |
| broadcast_mode = DImode; |
| else |
| return nullptr; |
| |
| /* Check if OP can be broadcast from VAL. */ |
| for (int i = 1; i < CONST_WIDE_INT_NUNITS (op); i++) |
| if (val != CONST_WIDE_INT_ELT (op, i)) |
| return nullptr; |
| |
| unsigned int nunits = (GET_MODE_SIZE (mode) |
| / GET_MODE_SIZE (broadcast_mode)); |
| machine_mode vector_mode; |
| if (!mode_for_vector (broadcast_mode, nunits).exists (&vector_mode)) |
| gcc_unreachable (); |
| rtx target = ix86_gen_scratch_sse_rtx (vector_mode); |
| bool ok = ix86_expand_vector_init_duplicate (false, vector_mode, |
| target, |
| GEN_INT (val_broadcast)); |
| gcc_assert (ok); |
| target = lowpart_subreg (mode, target, vector_mode); |
| return target; |
| } |
| |
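| /* Expand a scalar move of MODE from operands[1] to operands[0], |
| legitimizing TLS, GOT and other symbolic addresses and forcing awkward |
| constants into registers or memory as needed. */ |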
| void |
| ix86_expand_move (machine_mode mode, rtx operands[]) |
| { |
| rtx op0, op1; |
| rtx tmp, addend = NULL_RTX; |
| enum tls_model model; |
| |
| op0 = operands[0]; |
| op1 = operands[1]; |
| |
| /* Avoid complex sets of likely spilled hard registers before reload. */ |
| if (!ix86_hardreg_mov_ok (op0, op1)) |
| { |
| tmp = gen_reg_rtx (mode); |
| operands[0] = tmp; |
| ix86_expand_move (mode, operands); |
| operands[0] = op0; |
| operands[1] = tmp; |
| op1 = tmp; |
| } |
| |
| switch (GET_CODE (op1)) |
| { |
| case CONST: |
| tmp = XEXP (op1, 0); |
| |
| if (GET_CODE (tmp) != PLUS |
| || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF) |
| break; |
| |
| op1 = XEXP (tmp, 0); |
| addend = XEXP (tmp, 1); |
| /* FALLTHRU */ |
| |
| case SYMBOL_REF: |
| model = SYMBOL_REF_TLS_MODEL (op1); |
| |
| if (model) |
| op1 = legitimize_tls_address (op1, model, true); |
| else if (ix86_force_load_from_GOT_p (op1)) |
| { |
| /* Load the external function address via GOT slot to avoid PLT. */ |
| op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1), |
| (TARGET_64BIT |
| ? UNSPEC_GOTPCREL |
| : UNSPEC_GOT)); |
| op1 = gen_rtx_CONST (Pmode, op1); |
| op1 = gen_const_mem (Pmode, op1); |
| set_mem_alias_set (op1, ix86_GOT_alias_set ()); |
| } |
| else |
| { |
| tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX); |
| if (tmp) |
| { |
| op1 = tmp; |
| if (!addend) |
| break; |
| } |
| else |
| { |
| op1 = operands[1]; |
| break; |
| } |
| } |
| |
| if (addend) |
| { |
| op1 = force_operand (op1, NULL_RTX); |
| op1 = expand_simple_binop (Pmode, PLUS, op1, addend, |
| op0, 1, OPTAB_DIRECT); |
| } |
| else |
| op1 = force_operand (op1, op0); |
| |
| if (op1 == op0) |
| return; |
| |
| op1 = convert_to_mode (mode, op1, 1); |
| |
| default: |
| break; |
| } |
| |
| if ((flag_pic || MACHOPIC_INDIRECT) |
| && symbolic_operand (op1, mode)) |
| { |
| if (TARGET_MACHO && !TARGET_64BIT) |
| { |
| #if TARGET_MACHO |
| /* dynamic-no-pic */ |
| if (MACHOPIC_INDIRECT) |
| { |
| rtx temp = (op0 && REG_P (op0) && mode == Pmode) |
| ? op0 : gen_reg_rtx (Pmode); |
| op1 = machopic_indirect_data_reference (op1, temp); |
| if (MACHOPIC_PURE) |
| op1 = machopic_legitimize_pic_address (op1, mode, |
| temp == op1 ? 0 : temp); |
| } |
| if (op0 != op1 && GET_CODE (op0) != MEM) |
| { |
| rtx insn = gen_rtx_SET (op0, op1); |
| emit_insn (insn); |
| return; |
| } |
| if (GET_CODE (op0) == MEM) |
| op1 = force_reg (Pmode, op1); |
| else |
| { |
| rtx temp = op0; |
| if (GET_CODE (temp) != REG) |
| temp = gen_reg_rtx (Pmode); |
| temp = legitimize_pic_address (op1, temp); |
| if (temp == op0) |
| return; |
| op1 = temp; |
| } |
| /* dynamic-no-pic */ |
| #endif |
| } |
| else |
| { |
| if (MEM_P (op0)) |
| op1 = force_reg (mode, op1); |
| else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode))) |
| { |
| rtx reg = can_create_pseudo_p () ? NULL_RTX : op0; |
| op1 = legitimize_pic_address (op1, reg); |
| if (op0 == op1) |
| return; |
| op1 = convert_to_mode (mode, op1, 1); |
| } |
| } |
| } |
| else |
| { |
| if (MEM_P (op0) |
| && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode) |
| || !push_operand (op0, mode)) |
| && MEM_P (op1)) |
| op1 = force_reg (mode, op1); |
| |
| if (push_operand (op0, mode) |
| && ! general_no_elim_operand (op1, mode)) |
| op1 = copy_to_mode_reg (mode, op1); |
| |
| /* Force large constants in 64-bit compilation into a register |
| so they can be CSEd. */ |
| if (can_create_pseudo_p () |
| && (mode == DImode) && TARGET_64BIT |
| && immediate_operand (op1, mode) |
| && !x86_64_zext_immediate_operand (op1, VOIDmode) |
| && !register_operand (op0, mode) |
| && optimize) |
| op1 = copy_to_mode_reg (mode, op1); |
| |
| if (can_create_pseudo_p ()) |
| { |
| if (CONST_DOUBLE_P (op1)) |
| { |
| /* If we are loading a floating point constant into a |
| register, force the value to memory now, since we'll |
| get better code out of the back end. */ |
| |
| op1 = validize_mem (force_const_mem (mode, op1)); |
| if (!register_operand (op0, mode)) |
| { |
| rtx temp = gen_reg_rtx (mode); |
| emit_insn (gen_rtx_SET (temp, op1)); |
| emit_move_insn (op0, temp); |
| return; |
| } |
| } |
| else if (GET_MODE_SIZE (mode) >= 16) |
| { |
| rtx tmp = ix86_convert_const_wide_int_to_broadcast |
| (GET_MODE (op0), op1); |
| if (tmp != nullptr) |
| op1 = tmp; |
| } |
| } |
| } |
| |
| emit_insn (gen_rtx_SET (op0, op1)); |
| } |
| |
| /* OP is a memref of a CONST_VECTOR; return the duplicated scalar |
| constant if the CONST_VECTOR is a vec_duplicate, else return NULL. */ |
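| /* For instance, a V8SImode constant pool vector {5, 5, 5, 5, 5, 5, 5, 5} |
| yields the scalar constant 5, which the caller can then broadcast with a |
| single broadcast instruction (e.g. vpbroadcastd) instead of loading the |
| full 32-byte constant. */ |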
| static rtx |
| ix86_broadcast_from_constant (machine_mode mode, rtx op) |
| { |
| int nunits = GET_MODE_NUNITS (mode); |
| if (nunits < 2) |
| return nullptr; |
| |
| /* Don't use integer vector broadcast if we can't move from GPR to SSE |
| register directly. */ |
| if (!TARGET_INTER_UNIT_MOVES_TO_VEC |
| && INTEGRAL_MODE_P (mode)) |
| return nullptr; |
| |
| /* Convert CONST_VECTOR to a non-standard SSE constant integer |
| broadcast only if vector broadcast is available. */ |
| if (!(TARGET_AVX2 |
| || (TARGET_AVX |
| && (GET_MODE_INNER (mode) == SImode |
| || GET_MODE_INNER (mode) == DImode)) |
| || FLOAT_MODE_P (mode)) |
| || standard_sse_constant_p (op, mode)) |
| return nullptr; |
| |
| /* Don't broadcast from a 64-bit integer constant in 32-bit mode. |
| We can still put the 64-bit integer constant in memory when |
| AVX512 embedded broadcast is available. */ |
| if (GET_MODE_INNER (mode) == DImode && !TARGET_64BIT |
| && (!TARGET_AVX512F |
| || (GET_MODE_SIZE (mode) < 64 && !TARGET_AVX512VL))) |
| return nullptr; |
| |
| if (GET_MODE_INNER (mode) == TImode) |
| return nullptr; |
| |
| rtx constant = get_pool_constant (XEXP (op, 0)); |
| if (GET_CODE (constant) != CONST_VECTOR) |
| return nullptr; |
| |
| /* There could be some rtx like |
| (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1"))) |
| but with "*.LC1" referring to a V2DI constant vector. */ |
| if (GET_MODE (constant) != mode) |
| { |
| constant = simplify_subreg (mode, constant, GET_MODE (constant), |
| 0); |
| if (constant == nullptr || GET_CODE (constant) != CONST_VECTOR) |
| return nullptr; |
| } |
| |
| rtx first = XVECEXP (constant, 0, 0); |
| |
| for (int i = 1; i < nunits; ++i) |
| { |
| rtx tmp = XVECEXP (constant, 0, i); |
| /* Vector duplicate value. */ |
| if (!rtx_equal_p (tmp, first)) |
| return nullptr; |
| } |
| |
| return first; |
| } |
| |
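| /* Expand a vector move of MODE from operands[1] to operands[0], forcing |
| non-trivial constants into the constant pool, turning duplicated constant |
| vectors into broadcasts, and routing under-aligned SSE operands through |
| ix86_expand_vector_move_misalign. */ |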
| void |
| ix86_expand_vector_move (machine_mode mode, rtx operands[]) |
| { |
| rtx op0 = operands[0], op1 = operands[1]; |
| /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for the IA MCU |
| psABI, since the biggest alignment there is 4 bytes. */ |
| unsigned int align = (TARGET_IAMCU |
| ? GET_MODE_BITSIZE (mode) |
| : GET_MODE_ALIGNMENT (mode)); |
| |
| if (push_operand (op0, VOIDmode)) |
| op0 = emit_move_resolve_push (mode, op0); |
| |
| /* Force constants other than zero into memory. We do not know how |
| the instructions used to build constants modify the upper 64 bits |
| of the register; once we have that information we may be able |
| to handle some of them more efficiently. */ |
| if (can_create_pseudo_p () |
| && (CONSTANT_P (op1) |
| || (SUBREG_P (op1) |
| && CONSTANT_P (SUBREG_REG (op1)))) |
| && ((register_operand (op0, mode) |
| && !standard_sse_constant_p (op1, mode)) |
| /* ix86_expand_vector_move_misalign() does not like constants. */ |
| || (SSE_REG_MODE_P (mode) |
| && MEM_P (op0) |
| && MEM_ALIGN (op0) < align))) |
| { |
| if (SUBREG_P (op1)) |
| { |
| machine_mode imode = GET_MODE (SUBREG_REG (op1)); |
| rtx r = force_const_mem (imode, SUBREG_REG (op1)); |
| if (r) |
| r = validize_mem (r); |
| else |
| r = force_reg (imode, SUBREG_REG (op1)); |
| op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1)); |
| } |
| else |
| { |
| machine_mode mode = GET_MODE (op0); |
| rtx tmp = ix86_convert_const_wide_int_to_broadcast |
| (mode, op1); |
| if (tmp == nullptr) |
| op1 = validize_mem (force_const_mem (mode, op1)); |
| else |
| op1 = tmp; |
| } |
| } |
| |
| if (can_create_pseudo_p () |
| && GET_MODE_SIZE (mode) >= 16 |
| && VECTOR_MODE_P (mode) |
| && (MEM_P (op1) |
| && SYMBOL_REF_P (XEXP (op1, 0)) |
| && CONSTANT_POOL_ADDRESS_P (XEXP (op1, 0)))) |
| { |
| rtx first = ix86_broadcast_from_constant (mode, op1); |
| if (first != nullptr) |
| { |
| /* Broadcast to XMM/YMM/ZMM register from an integer |
| constant or scalar mem. */ |
| op1 = gen_reg_rtx (mode); |
| if (FLOAT_MODE_P (mode) |
| || (!TARGET_64BIT && GET_MODE_INNER (mode) == DImode)) |
| first = force_const_mem (GET_MODE_INNER (mode), first); |
| bool ok = ix86_expand_vector_init_duplicate (false, mode, |
| op1, first); |
| gcc_assert (ok); |
| emit_move_insn (op0, op1); |
| return; |
| } |
| } |
| |
| /* We need to check memory alignment for SSE modes since attributes |
| can make operands unaligned. */ |
| if (can_create_pseudo_p () |
| && SSE_REG_MODE_P (mode) |
| && ((MEM_P (op0) && (MEM_ALIGN (op0) < align)) |
| || (MEM_P (op1) && (MEM_ALIGN (op1) < align)))) |
| { |
| rtx tmp[2]; |
| |
| /* ix86_expand_vector_move_misalign() does not like both |
| arguments in memory. */ |
| if (!register_operand (op0, mode) |
| && !register_operand (op1, mode)) |
| { |
| rtx scratch = ix86_gen_scratch_sse_rtx (mode); |
| emit_move_insn (scratch, op1); |
| op1 = scratch; |
| } |
| |
| tmp[0] = op0; tmp[1] = op1; |
| ix86_expand_vector_move_misalign (mode, tmp); |
| return; |
| } |
| |
| /* Make operand1 a register if it isn't already. */ |
| if (can_create_pseudo_p () |
| && !register_operand (op0, mode) |
| && !register_operand (op1, mode)) |
| { |
| rtx tmp = ix86_gen_scratch_sse_rtx (GET_MODE (op0)); |
| emit_move_insn (tmp, op1); |
| emit_move_insn (op0, tmp); |
| return; |
| } |
| |
| emit_insn (gen_rtx_SET (op0, op1)); |
| } |
| |
| /* Split 32-byte AVX unaligned load and store if needed. */ |
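| /* E.g. an unaligned 32-byte load is emitted as a 16-byte load of the low |
| half followed by a VEC_CONCAT with the upper 16 bytes, and an unaligned |
| 32-byte store as two vextractf128 stores, whenever the corresponding |
| TARGET_AVX256_SPLIT_UNALIGNED_LOAD/STORE tuning flag is set. */ |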
| |
| static void |
| ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1) |
| { |
| rtx m; |
| rtx (*extract) (rtx, rtx, rtx); |
| machine_mode mode; |
| |
| if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD) |
| || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE)) |
| { |
| emit_insn (gen_rtx_SET (op0, op1)); |
| return; |
| } |
| |
| rtx orig_op0 = NULL_RTX; |
| mode = GET_MODE (op0); |
| switch (GET_MODE_CLASS (mode)) |
| { |
| case MODE_VECTOR_INT: |
| case MODE_INT: |
| if (mode != V32QImode) |
| { |
| if (!MEM_P (op0)) |
| { |
| orig_op0 = op0; |
| op0 = gen_reg_rtx (V32QImode); |
| } |
| else |
| op0 = gen_lowpart (V32QImode, op0); |
| op1 = gen_lowpart (V32QImode, op1); |
| mode = V32QImode; |
| } |
| break; |
| case MODE_VECTOR_FLOAT: |
| break; |
| default: |
| gcc_unreachable (); |
| } |
| |
| switch (mode) |
| { |
| default: |
| gcc_unreachable (); |
| case E_V32QImode: |
| extract = gen_avx_vextractf128v32qi; |
| mode = V16QImode; |
| break; |
| case E_V16HFmode: |
| extract = gen_avx_vextractf128v16hf; |
| mode = V8HFmode; |
| break; |
| case E_V8SFmode: |
| extract = gen_avx_vextractf128v8sf; |
| mode = V4SFmode; |
| break; |
| case E_V4DFmode: |
| extract = gen_avx_vextractf128v4df; |
| mode = V2DFmode; |
| break; |
| } |
| |
| if (MEM_P (op1)) |
| { |
| rtx r = gen_reg_rtx (mode); |
| m = adjust_address (op1, mode, 0); |
| emit_move_insn (r, m); |
| m = adjust_address (op1, mode, 16); |
| r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m); |
| emit_move_insn (op0, r); |
| } |
| else if (MEM_P (op0)) |
| { |
| m = adjust_address (op0, mode, 0); |
| emit_insn (extract (m, op1, const0_rtx)); |
| m = adjust_address (op0, mode, 16); |
| emit_insn (extract (m, copy_rtx (op1), const1_rtx)); |
| } |
| else |
| gcc_unreachable (); |
| |
| if (orig_op0) |
| emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0)); |
| } |
| |
| /* Implement the movmisalign patterns for SSE. Non-SSE modes go |
| straight to ix86_expand_vector_move. */ |
| /* Code generation for scalar reg-reg moves of single and double precision data: |
| if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true) |
| movaps reg, reg |
| else |
| movss reg, reg |
| if (x86_sse_partial_reg_dependency == true) |
| movapd reg, reg |
| else |
| movsd reg, reg |
| |
| Code generation for scalar loads of double precision data: |
| if (x86_sse_split_regs == true) |
| movlpd mem, reg (gas syntax) |
| else |
| movsd mem, reg |
| |
| Code generation for unaligned packed loads of single precision data |
| (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency): |
| if (x86_sse_unaligned_move_optimal) |
| movups mem, reg |
| |
| if (x86_sse_partial_reg_dependency == true) |
| { |
| xorps reg, reg |
| movlps mem, reg |
| movhps mem+8, reg |
| } |
| else |
| { |
| movlps mem, reg |
| movhps mem+8, reg |
| } |
| |
| Code generation for unaligned packed loads of double precision data |
| (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs): |
| if (x86_sse_unaligned_move_optimal) |
| movupd mem, reg |
| |
| if (x86_sse_split_regs == true) |
| { |
| movlpd mem, reg |
| movhpd mem+8, reg |
| } |
| else |
| { |
| movsd mem, reg |
| movhpd mem+8, reg |
| } |
| */ |
| |
| void |
| ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[]) |
| { |
| rtx op0, op1, m; |
| |
| op0 = operands[0]; |
| op1 = operands[1]; |
| |
| /* Use unaligned load/store for AVX512 or when optimizing for size. */ |
| if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ()) |
| { |
| emit_insn (gen_rtx_SET (op0, op1)); |
| return; |
| } |
| |
| if (TARGET_AVX) |
| { |
| if (GET_MODE_SIZE (mode) == 32) |
| ix86_avx256_split_vector_move_misalign (op0, op1); |
| else |
| /* Always use 128-bit mov<mode>_internal pattern for AVX. */ |
| emit_insn (gen_rtx_SET (op0, op1)); |
| return; |
| } |
| |
| if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL |
| || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL) |
| { |
| emit_insn (gen_rtx_SET (op0, op1)); |
| return; |
| } |
| |
| /* ??? If we have typed data, then it would appear that using |
| movdqu is the only way to get unaligned data loaded with |
| integer type. */ |
| if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT) |
| { |
| emit_insn (gen_rtx_SET (op0, op1)); |
| return; |
| } |
| |
| if (MEM_P (op1)) |
| { |
| if (TARGET_SSE2 && mode == V2DFmode) |
| { |
| rtx zero; |
| |
| /* When SSE registers are split into halves, we can avoid |
| writing to the top half twice. */ |
| if (TARGET_SSE_SPLIT_REGS) |
| { |
| emit_clobber (op0); |
| zero = op0; |
| } |
| else |
| { |
| /* ??? Not sure about the best option for the Intel chips. |
| The following would seem to satisfy; the register is |
| entirely cleared, breaking the dependency chain. We |
| then store to the upper half, with a dependency depth |
| of one. A rumor has it that Intel recommends two movsd |
| followed by an unpacklpd, but this is unconfirmed. And |
| given that the dependency depth of the unpacklpd would |
| still be one, I'm not sure why this would be better. */ |
| zero = CONST0_RTX (V2DFmode); |
| } |
| |
| m = adjust_address (op1, DFmode, 0); |
| emit_insn (gen_sse2_loadlpd (op0, zero, m)); |
| m = adjust_address (op1, DFmode, 8); |
| emit_insn (gen_sse2_loadhpd (op0, op0, m)); |
| } |
| else |
| { |
| rtx t; |
| |
| if (mode != V4SFmode) |
| t = gen_reg_rtx (V4SFmode); |
| else |
| t = op0; |
| |
| if (TARGET_SSE_PARTIAL_REG_DEPENDENCY) |
| emit_move_insn (t, CONST0_RTX (V4SFmode)); |
| else |
| emit_clobber (t); |
| |
| m = adjust_address (op1, V2SFmode, 0); |
| emit_insn (gen_sse_loadlps (t, t, m)); |
| m = adjust_address (op1, V2SFmode, 8); |
| emit_insn (gen_sse_loadhps (t, t, m)); |
| if (mode != V4SFmode) |
| emit_move_insn (op0, gen_lowpart (mode, t)); |
| } |
| } |
| else if (MEM_P (op0)) |
| { |
| if (TARGET_SSE2 && mode == V2DFmode) |
| { |
| m = adjust_address (op0, DFmode, 0); |
| emit_insn (gen_sse2_storelpd (m, op1)); |
| m = adjust_address (op0, DFmode, 8); |
| emit_insn (gen_sse2_storehpd (m, op1)); |
| } |
| else |
| { |
| if (mode != V4SFmode) |
| op1 = gen_lowpart (V4SFmode, op1); |
| |
| m = adjust_address (op0, V2SFmode, 0); |
| emit_insn (gen_sse_storelps (m, op1)); |
| m = adjust_address (op0, V2SFmode, 8); |
| emit_insn (gen_sse_storehps (m, copy_rtx (op1))); |
| } |
| } |
| else |
| gcc_unreachable (); |
| } |
| |
| /* Move bits 64:95 to bits 32:63. */ |
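| /* Concretely, OP is viewed as V4SImode and shuffled with the permutation |
| {0, 2, 0, 0}, so element 2 (bits 64:95) lands in element 1 (bits 32:63). */ |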
| |
| void |
| ix86_move_vector_high_sse_to_mmx (rtx op) |
| { |
| rtx mask = gen_rtx_PARALLEL (VOIDmode, |
| gen_rtvec (4, GEN_INT (0), GEN_INT (2), |
| GEN_INT (0), GEN_INT (0))); |
| rtx dest = lowpart_subreg (V4SImode, op, GET_MODE (op)); |
| op = gen_rtx_VEC_SELECT (V4SImode, dest, mask); |
| rtx insn = gen_rtx_SET (dest, op); |
| emit_insn (insn); |
| } |
| |
| /* Split an MMX pack with signed/unsigned saturation using SSE/SSE2 |
| instructions. */ |
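| /* For the V4HImode source / V8QImode destination case, for example, the |
| operands are viewed as V8HImode, packed into a V16QImode value whose |
| useful 32-bit halves end up in bits 0:31 and 64:95, and |
| ix86_move_vector_high_sse_to_mmx then moves bits 64:95 down to bits |
| 32:63 to form the 64-bit MMX result. */ |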
| |
| void |
| ix86_split_mmx_pack (rtx operands[], enum rtx_code code) |
| { |
| rtx op0 = operands[0]; |
| rtx op1 = operands[1]; |
| rtx op2 = operands[2]; |
| |
| machine_mode dmode = GET_MODE (op0); |
| machine_mode smode = GET_MODE (op1); |
| machine_mode inner_dmode = GET_MODE_INNER (dmode); |
| machine_mode inner_smode = GET_MODE_INNER (smode); |
| |
| /* Get the corresponding SSE mode for destination. */ |
| int nunits = 16 / GET_MODE_SIZE (inner_dmode); |
| machine_mode sse_dmode = mode_for_vector (GET_MODE_INNER (dmode), |
| nunits).require (); |
| machine_mode sse_half_dmode = mode_for_vector (GET_MODE_INNER (dmode), |
| nunits / 2).require (); |
| |
| /* Get the corresponding SSE mode for source. */ |
| nunits = 16 / GET_MODE_SIZE (inner_smode); |
| machine_mode sse_smode = mode_for_vector (GET_MODE_INNER (smode), |
| nunits).require (); |
| |
| /* Generate SSE pack with signed/unsigned saturation. */ |
| rtx dest = lowpart_subreg (sse_dmode, op0, GET_MODE (op0)); |
| op1 = lowpart_subreg (sse_smode, op1, GET_MODE (op1)); |
| op2 = lowpart_subreg (sse_smode, op2, GET_MODE (op2)); |
| |
| op1 = gen_rtx_fmt_e (code, sse_half_dmode, op1); |
| op2 = gen_rtx_fmt_e (code, sse_half_dmode, op2); |
| rtx insn = gen_rtx_SET (dest, gen_rtx_VEC_CONCAT (sse_dmode, |
| op1, op2)); |
| emit_insn (insn); |
| |
| ix86_move_vector_high_sse_to_mmx (op0); |
| } |
| |
| /* Split MMX punpcklXX/punpckhXX with SSE punpcklXX. */ |
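| /* For example, a V8QImode punpcklbw is carried out as a V16QImode |
| interleave: the operands are concatenated into a V32QImode value and a |
| vec_select with indices {0, 16, 1, 17, ...} picks the interleaved low |
| bytes; for the high-part variants an extra shuffle then moves the high |
| half of the result down to the low bits. */ |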
| |
| void |
| ix86_split_mmx_punpck (rtx operands[], bool high_p) |
| { |
| rtx op0 = operands[0]; |
| rtx op1 = operands[1]; |
| rtx op2 = operands[2]; |
| machine_mode mode = GET_MODE (op0); |
| rtx mask; |
| /* The corresponding SSE mode. */ |
| machine_mode sse_mode, double_sse_mode; |
| |
| switch (mode) |
| { |
| case E_V4QImode: |
| case E_V8QImode: |
| sse_mode = V16QImode; |
| double_sse_mode = V32QImode; |
| mask = gen_rtx_PARALLEL (VOIDmode, |
| gen_rtvec (16, |
| GEN_INT (0), GEN_INT (16), |
| GEN_INT (1), GEN_INT (17), |
| GEN_INT (2), GEN_INT (18), |
| GEN_INT (3), GEN_INT (19), |
| GEN_INT (4), GEN_INT (20), |
| GEN_INT (5), GEN_INT (21), |
| GEN_INT (6), GEN_INT (22), |
| GEN_INT (7), GEN_INT (23))); |
| break; |
| |
| case E_V4HImode: |
| case E_V2HImode: |
| sse_mode = V8HImode; |
| double_sse_mode = V16HImode; |
| mask = gen_rtx_PARALLEL (VOIDmode, |
| gen_rtvec (8, |
| GEN_INT (0), GEN_INT (8), |
| GEN_INT (1), GEN_INT (9), |
| GEN_INT (2), GEN_INT (10), |
| GEN_INT (3), GEN_INT (11))); |
| break; |
| |
| case E_V2SImode: |
| sse_mode = V4SImode; |
| double_sse_mode = V8SImode; |
| mask = gen_rtx_PARALLEL (VOIDmode, |
| gen_rtvec (4, |
| GEN_INT (0), GEN_INT (4), |
| GEN_INT (1), GEN_INT (5))); |
| break; |
| |
| case E_V2SFmode: |
| sse_mode = V4SFmode; |
| double_sse_mode = V8SFmode; |
| mask = gen_rtx_PARALLEL (VOIDmode, |
| gen_rtvec (4, |
| GEN_INT (0), GEN_INT (4), |
| GEN_INT (1), GEN_INT (5))); |
| break; |
| |
| default: |
| gcc_unreachable (); |
| } |
| |
| /* Generate SSE punpcklXX. */ |
| rtx dest = lowpart_subreg (sse_mode, op0, GET_MODE (op0)); |
| op1 = lowpart_subreg (sse_mode, op1, GET_MODE (op1)); |
| op2 = lowpart_subreg (sse_mode, op2, GET_MODE (op2)); |
| |
| op1 = gen_rtx_VEC_CONCAT (double_sse_mode, op1, op2); |
| op2 = gen_rtx_VEC_SELECT (sse_mode, op1, mask); |
| rtx insn = gen_rtx_SET (dest, op2); |
| emit_insn (insn); |
| |
| /* Move high bits to low bits. */ |
| if (high_p) |
| { |
| if (sse_mode == V4SFmode) |
| { |
| mask = gen_rtx_PARALLEL (VOIDmode, |
| gen_rtvec (4, GEN_INT (2), GEN_INT (3), |
| GEN_INT (4), GEN_INT (5))); |
| op2 = gen_rtx_VEC_CONCAT (V8SFmode, dest, dest); |
| op1 = gen_rtx_VEC_SELECT (V4SFmode, op2, mask); |
| } |
| else |
| { |
| int sz = GET_MODE_SIZE (mode); |
| |
| if (sz == 4) |
| mask = gen_rtx_PARALLEL (VOIDmode, |
| gen_rtvec (4, GEN_INT (1), GEN_INT (0), |
| GEN_INT (0), GEN_INT (1))); |
| else if (sz == 8) |
| mask = gen_rtx_PARALLEL (VOIDmode, |
| gen_rtvec (4, GEN_INT (2), GEN_INT (3), |
| GEN_INT (0), GEN_INT (1))); |
| else |
| gcc_unreachable (); |
| |
| dest = lowpart_subreg (V4SImode, dest, GET_MODE (dest)); |
| op1 = gen_rtx_VEC_SELECT (V4SImode, dest, mask); |
| } |
| |
| insn = gen_rtx_SET (dest, op1); |
| emit_insn (insn); |
| } |
| } |
| |
| /* Helper function of ix86_fixup_binary_operands to canonicalize |
| operand order. Returns true if the operands should be swapped. */ |
| |
| static bool |
| ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode, |
| rtx operands[]) |
| { |
| rtx dst = operands[0]; |
| rtx src1 = operands[1]; |
| rtx src2 = operands[2]; |
| |
| /* If the operation is not commutative, we can't do anything. */ |
| if (GET_RTX_CLASS (code) != RTX_COMM_ARITH |
| && GET_RTX_CLASS (code) != RTX_COMM_COMPARE) |
| return false; |
| |
| /* Highest priority is that src1 should match dst. */ |
| if (rtx_equal_p (dst, src1)) |
| return false; |
| if (rtx_equal_p (dst, src2)) |
| return true; |
| |
| /* Next highest priority is that immediate constants come second. */ |
| if (immediate_operand (src2, mode)) |
| return false; |
| if (immediate_operand (src1, mode)) |
| return true; |
| |
| /* Lowest priority is that memory references should come second. */ |
| if (MEM_P (src2)) |
| return false; |
| if (MEM_P (src1)) |
| return true; |
| |
| return false; |
| } |
| |
| |
| /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the |
| destination to use for the operation. If different from the true |
| destination in operands[0], a copy operation will be required. */ |
| |
| rtx |
| ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode, |
| rtx operands[]) |
| { |
| rtx dst = operands[0]; |
| rtx src1 = operands[1]; |
| rtx src2 = operands[2]; |
| |
| /* Canonicalize operand order. */ |
| if (ix86_swap_binary_operands_p (code, mode, operands)) |
| { |
| /* It is invalid to swap operands of different modes. */ |
| gcc_assert (GET_MODE (src1) == GET_MODE (src2)); |
| |
| std::swap (src1, src2); |
| } |
| |
| /* The two source operands cannot both be in memory. */ |
| if (MEM_P (src1) && MEM_P (src2)) |
| { |
| /* Optimization: Only read from memory once. */ |
| if (rtx_equal_p (src1, src2)) |
| { |
| src2 = force_reg (mode, src2); |
| src1 = src2; |
| } |
| else if (rtx_equal_p (dst, src1)) |
| src2 = force_reg (mode, src2); |
| else |
| src1 = force_reg (mode, src1); |
| } |
| |
| /* If the destination is memory, and we do not have matching source |
| operands, do things in registers. */ |
| if (MEM_P (dst) && !rtx_equal_p (dst, src1)) |
| dst = gen_reg_rtx (mode); |
| |
| /* Source 1 cannot be a constant. */ |
| if (CONSTANT_P (src1)) |
| src1 = force_reg (mode, src1); |
| |
| /* Source 1 cannot be a non-matching memory. */ |
| if (MEM_P (src1) && !rtx_equal_p (dst, src1)) |
| src1 = force_reg (mode, src1); |
| |
| /* Improve address combine. */ |
| if (code == PLUS |
| && GET_MODE_CLASS (mode) == MODE_INT |
| && MEM_P (src2)) |
| src2 = force_reg (mode, src2); |
| |
| operands[1] = src1; |
| operands[2] = src2; |
| return dst; |
| } |
| |
| /* Similarly, but assume that the destination has already been |
| set up properly. */ |
| |
| void |
| ix86_fixup_binary_operands_no_copy (enum rtx_code code, |
| machine_mode mode, rtx operands[]) |
| { |
| rtx dst = ix86_fixup_binary_operands (code, mode, operands); |
| gcc_assert (dst == operands[0]); |
| } |
| |
| /* Attempt to expand a binary operator. Make the expansion closer to the |
| actual machine, rather than just general_operand, which would allow 3 |
| separate memory references (one output, two inputs) in a single insn. */ |
| |
| void |
| ix86_expand_binary_operator (enum rtx_code code, machine_mode mode, |
| rtx operands[]) |
| { |
| rtx src1, src2, dst, op, clob; |
| |
| dst = ix86_fixup_binary_operands (code, mode, operands); |
| src1 = operands[1]; |
| src2 = operands[2]; |
| |
| /* Emit the instruction. */ |
| |
| op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2)); |
| |
| if (reload_completed |
| && code == PLUS |
| && !rtx_equal_p (dst, src1)) |
| { |
| /* This is going to be an LEA; avoid splitting it later. */ |
| emit_insn (op); |
| } |
| else |
| { |
| clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG)); |
| emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob))); |
| } |
| |
| /* Fix up the destination if needed. */ |
| if (dst != operands[0]) |
| emit_move_insn (operands[0], dst); |
| } |
| |
| /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with |
| the given OPERANDS. */ |
| |
| void |
| ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode, |
| rtx operands[]) |
| { |
| rtx op1 = NULL_RTX, op2 = NULL_RTX; |
| if (SUBREG_P (operands[1])) |
| { |
| op1 = operands[1]; |
| op2 = operands[2]; |
| } |
| else if (SUBREG_P (operands[2])) |
| { |
| op1 = operands[2]; |
| op2 = operands[1]; |
| } |
| /* Optimize (__m128i) d | (__m128i) e and similar code |
| when d and e are float vectors into a float vector logical |
| insn. In C/C++ without using intrinsics there is no other way |
| to express a vector logical operation on float vectors than |
| to cast them temporarily to integer vectors. */ |
| if (op1 |
| && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL |
| && (SUBREG_P (op2) || GET_CODE (op2) == CONST_VECTOR) |
| && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT |
| && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode) |
| && SUBREG_BYTE (op1) == 0 |
| && (GET_CODE (op2) == CONST_VECTOR |
| || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2)) |
| && SUBREG_BYTE (op2) == 0)) |
| && can_create_pseudo_p ()) |
| { |
| rtx dst; |
| switch (GET_MODE (SUBREG_REG (op1))) |
| { |
| case E_V4SFmode: |
| case E_V8SFmode: |
| case E_V16SFmode: |
| case E_V2DFmode: |
| case E_V4DFmode: |
| case E_V8DFmode: |
| dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1))); |
| if (GET_CODE (op2) == CONST_VECTOR) |
| { |
| op2 = gen_lowpart (GET_MODE (dst), op2); |
| op2 = force_reg (GET_MODE (dst), op2); |
| } |
| else |
| { |
| op1 = operands[1]; |
| op2 = SUBREG_REG (operands[2]); |
| if (!vector_operand (op2, GET_MODE (dst))) |
| op2 = force_reg (GET_MODE (dst), op2); |
| } |
| op1 = SUBREG_REG (op1); |
| if (!vector_operand (op1, GET_MODE (dst))) |
| op1 = force_reg (GET_MODE (dst), op1); |
| emit_insn (gen_rtx_SET (dst, |
| gen_rtx_fmt_ee (code, GET_MODE (dst), |
| op1, op2))); |
| emit_move_insn (operands[0], gen_lowpart (mode, dst)); |
| return; |
| default: |
| break; |
| } |
| } |
| if (!vector_operand (operands[1], mode)) |
| operands[1] = force_reg (mode, operands[1]); |
| if (!vector_operand (operands[2], mode)) |
| operands[2] = force_reg (mode, operands[2]); |
| ix86_fixup_binary_operands_no_copy (code, mode, operands); |
| emit_insn (gen_rtx_SET (operands[0], |
| gen_rtx_fmt_ee (code, mode, operands[1], |
| operands[2]))); |
| } |
| |
| /* Return TRUE or FALSE depending on whether the binary operator meets the |
| appropriate constraints. */ |
| |
| bool |
| ix86_binary_operator_ok (enum rtx_code code, machine_mode mode, |
| rtx operands[3]) |
| { |
| rtx dst = operands[0]; |
| rtx src1 = operands[1]; |
| rtx src2 = operands[2]; |
| |
| /* The two source operands cannot both be in memory. */ |
| if ((MEM_P (src1) || bcst_mem_operand (src1, mode)) |
| && (MEM_P (src2) || bcst_mem_operand (src2, mode))) |
| return false; |
| |
| /* Canonicalize operand order for commutative operators. */ |
| if (ix86_swap_binary_operands_p (code, mode, operands)) |
| std::swap (src1, src2); |
| |
| /* If the destination is memory, we must have a matching source operand. */ |
| if (MEM_P (dst) && !rtx_equal_p (dst, src1)) |
| return false; |
| |
| /* Source 1 cannot be a constant. */ |
| if (CONSTANT_P (src1)) |
| return false; |
| |
| /* Source 1 cannot be a non-matching memory. */ |
| if (MEM_P (src1) && !rtx_equal_p (dst, src1)) |
| /* Support "andhi/andsi/anddi" as a zero-extending move. */ |
| return (code == AND |
| && (mode == HImode |
| || mode == SImode |
| || (TARGET_64BIT && mode == DImode)) |
| && satisfies_constraint_L (src2)); |
| |
| return true; |
| } |
| |
| /* Attempt to expand a unary operator. Make the expansion closer to the |
| actual machine, rather than just general_operand, which would allow 2 |
| separate memory references (one output, one input) in a single insn. */ |
| |
| void |
| ix86_expand_unary_operator (enum rtx_code code, machine_mode mode, |
| rtx operands[]) |
| { |
| bool matching_memory = false; |
| rtx src, dst, op, clob; |
| |
| dst = operands[0]; |
| src = operands[1]; |
| |
| /* If the destination is memory, and we do not have matching source |
| operands, do things in registers. */ |
| if (MEM_P (dst)) |
| { |
| if (rtx_equal_p (dst, src)) |
| matching_memory = true; |
| else |
| dst = gen_reg_rtx (mode); |
| } |
| |
| /* When source operand is memory, destination must match. */ |
| if (MEM_P (src) && !matching_memory) |
| src = force_reg (mode, src); |
| |
| /* Emit the instruction. */ |
| |
| op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src)); |
| |
| if (code == NOT) |
| emit_insn (op); |
| else |
| { |
| clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG)); |
| emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob))); |
| } |
| |
| /* Fix up the destination if needed. */ |
| if (dst != operands[0]) |
| emit_move_insn (operands[0], dst); |
| } |
| |
| /* Predict the just-emitted jump instruction to be taken with probability PROB. */ |
| |
| static void |
| predict_jump (int prob) |
| { |
| rtx_insn *insn = get_last_insn (); |
| gcc_assert (JUMP_P (insn)); |
| add_reg_br_prob_note (insn, profile_probability::from_reg_br_prob_base (prob)); |
| } |
| |
| /* Split a 32-bit/64-bit divmod with an 8-bit unsigned divmod if the |
| dividend and divisor are both within the range [0, 255]. */ |
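| /* Roughly, the emitted sequence is: |
| or op3, scratch ; scratch = op2 | op3 |
| test $-0x100, scratch |
| je .Lqimode ; both values fit in 8 bits |
| <full 32/64-bit divmod> |
| jmp .Lend |
| .Lqimode: |
| <8-bit divb; quotient in AL, remainder in AH> |
| .Lend: |
| with the branch predicted 50/50. */ |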
| |
| void |
| ix86_split_idivmod (machine_mode mode, rtx operands[], |
| bool unsigned_p) |
| { |
| rtx_code_label *end_label, *qimode_label; |
| rtx div, mod; |
| rtx_insn *insn; |
| rtx scratch, tmp0, tmp1, tmp2; |
| rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx); |
| |
| switch (mode) |
| { |
| case E_SImode: |
| if (GET_MODE (operands[0]) == SImode) |
| { |
| if (GET_MODE (operands[1]) == SImode) |
| gen_divmod4_1 = unsigned_p ? gen_udivmodsi4_1 : gen_divmodsi4_1; |
| else |
| gen_divmod4_1 |
| = unsigned_p ? gen_udivmodsi4_zext_2 : gen_divmodsi4_zext_2; |
| } |
| else |
| gen_divmod4_1 |
| = unsigned_p ? gen_udivmodsi4_zext_1 : gen_divmodsi4_zext_1; |
| break; |
| |
| case E_DImode: |
| gen_divmod4_1 = unsigned_p ? gen_udivmoddi4_1 : gen_divmoddi4_1; |
| break; |
| |
| default: |
| gcc_unreachable (); |
| } |
| |
| end_label = gen_label_rtx (); |
| qimode_label = gen_label_rtx (); |
| |
| scratch = gen_reg_rtx (mode); |
| |
| /* Use 8-bit unsigned divmod if the dividend and divisor are within |
| the range [0, 255]. */ |
| emit_move_insn (scratch, operands[2]); |
| scratch = expand_simple_binop (mode, IOR, scratch, operands[3], |
| scratch, 1, OPTAB_DIRECT); |
| emit_insn (gen_test_ccno_1 (mode, scratch, GEN_INT (-0x100))); |
| tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG); |
| tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx); |
| tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0, |
| gen_rtx_LABEL_REF (VOIDmode, qimode_label), |
| pc_rtx); |
| insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0)); |
| predict_jump (REG_BR_PROB_BASE * 50 / 100); |
| JUMP_LABEL (insn) = qimode_label; |
| |
| /* Generate the original signed/unsigned divmod. */ |
| emit_insn (gen_divmod4_1 (operands[0], operands[1], |
| operands[2], operands[3])); |
| |
| /* Branch to the end. */ |
| emit_jump_insn (gen_jump (end_label)); |
| emit_barrier (); |
| |
| /* Generate 8bit unsigned divide. */ |
| emit_label (qimode_label); |
| /* Don't use operands[0] for result of 8bit divide since not all |
| registers support QImode ZERO_EXTRACT. */ |
| tmp0 = lowpart_subreg (HImode, scratch, mode); |
| tmp1 = lowpart_subreg (HImode, operands[2], mode); |
| tmp2 = lowpart_subreg (QImode, operands[3], mode); |
| emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2)); |
| |
| if (unsigned_p) |
| { |
| div = gen_rtx_UDIV (mode, operands[2], operands[3]); |
| mod = gen_rtx_UMOD (mode, operands[2], operands[3]); |
| } |
| else |
| { |
| div = gen_rtx_DIV (mode, operands[2], operands[3]); |
| mod = gen_rtx_MOD (mode, operands[2], operands[3]); |
| } |
| if (mode == SImode) |
| { |
| if (GET_MODE (operands[0]) != SImode) |
| div = gen_rtx_ZERO_EXTEND (DImode, div); |
| if (GET_MODE (operands[1]) != SImode) |
| mod = gen_rtx_ZERO_EXTEND (DImode, mod); |
| } |
| |
| /* Extract remainder from AH. */ |
| scratch = gen_lowpart (GET_MODE (operands[1]), scratch); |
| tmp1 = gen_rtx_ZERO_EXTRACT (GET_MODE (operands[1]), scratch, |
| GEN_INT (8), GEN_INT (8)); |
| insn = emit_move_insn (operands[1], tmp1); |
| set_unique_reg_note (insn, REG_EQUAL, mod); |
| |
| /* Zero extend quotient from AL. */ |
| tmp1 = gen_lowpart (QImode, tmp0); |
| insn = emit_insn (gen_extend_insn |
| (operands[0], tmp1, |
| GET_MODE (operands[0]), QImode, 1)); |
| set_unique_reg_note (insn, REG_EQUAL, div); |
| |
| emit_label (end_label); |
| } |
| |
| /* Emit the x86 binary operator CODE in mode MODE, where the first source |
| operand matches the destination DST. The emitted RTX includes a clobber |
| of FLAGS_REG. */ |
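| /* E.g. ix86_emit_binop (PLUS, SImode, dst, src) emits |
| (parallel [(set dst (plus:SI dst src)) |
| (clobber (reg:CC FLAGS_REG))]). */ |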
| |
| void |
| ix86_emit_binop (enum rtx_code code, machine_mode mode, |
| rtx dst, rtx src) |
| { |
| rtx op, clob; |
| |
| op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src)); |
| clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG)); |
| |
| emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob))); |
| } |
| |
| /* Return true if the definition of REGNO1 is nearer to INSN than the |
| definition of REGNO2. */ |
| |
| static bool |
| find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2) |
| { |
| rtx_insn *prev = insn; |
| rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn)); |
| |
| if (insn == start) |
| return false; |
| while (prev && prev != start) |
| { |
| if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev)) |
| { |
| prev = PREV_INSN (prev); |
| continue; |
| } |
| if (insn_defines_reg (regno1, INVALID_REGNUM, prev)) |
| return true; |
| else if (insn_defines_reg (regno2, INVALID_REGNUM, prev)) |
| return false; |
| prev = PREV_INSN (prev); |
| } |
| |
| /* None of the regs is defined in the bb. */ |
| return false; |
| } |
| |
| /* INSN_UID of the last insn emitted by zero store peephole2s. */ |
| int ix86_last_zero_store_uid; |
| |
| /* Split lea instructions into a sequence of instructions |
| which are executed on the ALU to avoid AGU stalls. |
| It is assumed that the flags register may be clobbered |
| at the position of the lea. */ |
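| /* A rough example: "lea 4(%rbx,%rcx,2), %rax" may be split into |
| mov %rcx, %rax |
| add %rax, %rax ; scale by 2, emitted as MULT at first |
| add %rbx, %rax |
| add $4, %rax |
| assuming %rax, %rbx and %rcx are all distinct. */ |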
| |
| void |
| ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode) |
| { |
| unsigned int regno0, regno1, regno2; |
| struct ix86_address parts; |
| rtx target, tmp; |
| int ok, adds; |
| |
| ok = ix86_decompose_address (operands[1], &parts); |
| gcc_assert (ok); |
| |
| target = gen_lowpart (mode, operands[0]); |
| |
| regno0 = true_regnum (target); |
| regno1 = INVALID_REGNUM; |
| regno2 = INVALID_REGNUM; |
| |
| if (parts.base) |
| { |
| parts.base = gen_lowpart (mode, parts.base); |
| regno1 = true_regnum (parts.base); |
| } |
| |
| if (parts.index) |
| { |
| parts.index = gen_lowpart (mode, parts.index); |
| regno2 = true_regnum (parts.index); |
| } |
| |
| if (parts.disp) |
| parts.disp = gen_lowpart (mode, parts.disp); |
| |
| if (parts.scale > 1) |
| { |
| /* Case r1 = r1 + ... */ |
| if (regno1 == regno0) |
| { |
| /* If we had a case like r1 = r1 + C * r1 we would |
| have to use multiplication, which is very |
| expensive. Assume the cost model is wrong if |
| such a case reaches this point. */ |
| gcc_assert (regno2 != regno0); |
| |
| for (adds = parts.scale; adds > 0; adds--) |
| ix86_emit_binop (PLUS, mode, target, parts.index); |
| } |
| else |
| { |
| /* r1 = r2 + r3 * C case. Need to move r3 into r1. */ |
| if (regno0 != regno2) |
| emit_insn (gen_rtx_SET (target, parts.index)); |
| |
| /* Use shift for scaling, but emit it as MULT instead |
| to avoid it being immediately peephole2 optimized back |
| into lea. */ |
| ix86_emit_binop (MULT, mode, target, GEN_INT (parts.scale)); |
| |
| if (parts.base) |
| ix86_emit_binop (PLUS, mode, target, parts.base); |
| |
| if (parts.disp && parts.disp != const0_rtx) |
| ix86_emit_binop (PLUS, mode, target, parts.disp); |
| } |
| } |
| else if (!parts.base && !parts.index) |
| { |
| gcc_assert(parts.disp); |
| emit_insn (gen_rtx_SET (target, parts.disp)); |
| } |
| else |
| { |
| if (!parts.base) |
| { |
| if (regno0 != regno2) |
| emit_insn (gen_rtx_SET (target, parts.index)); |
| } |
| else if (!parts.index) |
| { |
| if (regno0 != regno1) |
| emit_insn (gen_rtx_SET (target, parts.base)); |
| } |
| else |
| { |
| if (regno0 == regno1) |
| tmp = parts.index; |
| else if (regno0 == regno2) |
| tmp = parts.base; |
| else |
| { |
| rtx tmp1; |
| |
| /* Find better operand for SET instruction, depending |
| on which definition is farther from the insn. */ |
| if (find_nearest_reg_def (insn, regno1, regno2)) |
| tmp = parts.index, tmp1 = parts.base; |
| else |
| tmp = parts.base, tmp1 = parts.index; |
| |
| emit_insn (gen_rtx_SET (target, tmp)); |
| |
| if (parts.disp && parts.disp != const0_rtx) |
| ix86_emit_binop (PLUS, mode, target, parts.disp); |
| |
| ix86_emit_binop (PLUS, mode, target, tmp1); |
| return; |
| } |
| |
| ix86_emit_binop (PLUS, mode, target, tmp); |
| } |
| |
| if (parts.disp && parts.disp != const0_rtx) |
| ix86_emit_binop (PLUS, mode, target, parts.disp); |
| } |
| } |
| |
| /* Post-reload splitter for converting an SF or DFmode value in an |
| SSE register into an unsigned SImode. */ |
| |
| void |
| ix86_split_convert_uns_si_sse (rtx operands[]) |
| { |
| machine_mode vecmode; |
| rtx value, large, zero_or_two31, input, two31, x; |
| |
| large = operands[1]; |
| zero_or_two31 = operands[2]; |
| input = operands[3]; |
| two31 = operands[4]; |
| vecmode = GET_MODE (large); |
| value = gen_rtx_REG (vecmode, REGNO (operands[0])); |
| |
| /* Load up the value into the low element. We must ensure that the other |
| elements are valid floats -- zero is the easiest such value. */ |
| if (MEM_P (input)) |
| { |
| if (vecmode == V4SFmode) |
| emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input)); |
| else |
| emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input)); |
| } |
| else |
| { |
| input = gen_rtx_REG (vecmode, REGNO (input)); |
| emit_move_insn (value, CONST0_RTX (vecmode)); |
| if (vecmode == V4SFmode) |
| emit_insn (gen_sse_movss (value, value, input)); |
| else |
| emit_insn (gen_sse2_movsd (value, value, input)); |
| } |
| |
| emit_move_insn (large, two31); |
| emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31); |
| |
| x = gen_rtx_fmt_ee (LE, vecmode, large, value); |
| emit_insn (gen_rtx_SET (large, x)); |
| |
| x = gen_rtx_AND (vecmode, zero_or_two31, large); |
| emit_insn (gen_rtx_SET (zero_or_two31, x)); |
| |
| x = gen_rtx_MINUS (vecmode, value, zero_or_two31); |
| emit_insn (gen_rtx_SET (value, x)); |
| |
| large = gen_rtx_REG (V4SImode, REGNO (large)); |
| emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31))); |
| |
| x = gen_rtx_REG (V4SImode, REGNO (value)); |
| if (vecmode == V4SFmode) |
| emit_insn (gen_fix_truncv4sfv4si2 (x, value)); |
| else |
| emit_insn (gen_sse2_cvttpd2dq (x, value)); |
| value = x; |
| |
| emit_insn (gen_xorv4si3 (value, value, large)); |
| } |
| |
| static bool ix86_expand_vector_init_one_nonzero (bool mmx_ok, |
| machine_mode mode, rtx target, |
| rtx var, int one_var); |
| |
| /* Convert an unsigned DImode value into a DFmode, using only SSE. |
| Expects the 64-bit DImode to be supplied in a pair of integral |
| registers. Requires SSE2; will use SSE3 if available. For x86_32, |
| -mfpmath=sse, !optimize_size only. */ |
| |
| void |
| ix86_expand_convert_uns_didf_sse (rtx target, rtx input) |
| { |
| REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt; |
| rtx int_xmm, fp_xmm; |
| rtx biases, exponents; |
| rtx x; |
| |
| int_xmm = gen_reg_rtx (V4SImode); |
| if (TARGET_INTER_UNIT_MOVES_TO_VEC) |
| emit_insn (gen_movdi_to_sse (int_xmm, input)); |
| else if (TARGET_SSE_SPLIT_REGS) |
| { |
| emit_clobber (int_xmm); |
| emit_move_insn (gen_lowpart (DImode, int_xmm), input); |
| } |
| else |
| { |
| x = gen_reg_rtx (V2DImode); |
| ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0); |
| emit_move_insn (int_xmm, gen_lowpart (V4SImode, x)); |
| } |
| |
| x = gen_rtx_CONST_VECTOR (V4SImode, |
| gen_rtvec (4, GEN_INT (0x43300000UL), |
| GEN_INT (0x45300000UL), |
| const0_rtx, const0_rtx)); |
| exponents = validize_mem (force_const_mem (V4SImode, x)); |
| |
| /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */ |
| emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents)); |
| |
| /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm) |
| yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)). |
| Similarly (0x45300000UL ## fp_value_hi_xmm) yields |
| (0x1.0p84 + double(fp_value_hi_xmm)). |
| Note these exponents differ by 32. */ |
| |
| fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm)); |
| |
| /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values |
| in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */ |
| real_ldexp (&bias_lo_rvt, &dconst1, 52); |
| real_ldexp (&bias_hi_rvt, &dconst1, 84); |
| biases = const_double_from_real_value (bias_lo_rvt, DFmode); |
| x = const_double_from_real_value (bias_hi_rvt, DFmode); |
| biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x)); |
| biases = validize_mem (force_const_mem (V2DFmode, biases)); |
| emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases)); |
| |
| /* Add the upper and lower DFmode values together. */ |
| if (TARGET_SSE3) |
| emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm)); |
| else |
| { |
| x = copy_to_mode_reg (V2DFmode, fp_xmm); |
| emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm)); |
| emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x)); |
| } |
| |
| ix86_expand_vector_extract (false, target, fp_xmm, 0); |
| } |
| |
| /* Not used, but eases macroization of patterns. */ |
| void |
| ix86_expand_convert_uns_sixf_sse (rtx, rtx) |
| { |
| gcc_unreachable (); |
| } |
| |
| static rtx ix86_expand_sse_fabs (rtx op0, rtx *smask); |
| |
| /* Convert an unsigned SImode value into a DFmode value. Currently only |
| used for SSE, but applicable anywhere. */ |
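| /* The trick: compute X = INPUT - 0x80000000 with 32-bit wrap-around, |
| convert X with the signed SImode->DFmode instruction, and add 0x1p31 |
| back in DFmode; the result is exact for every 32-bit input. */ |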
| |
| void |
| ix86_expand_convert_uns_sidf_sse (rtx target, rtx input) |
| { |
| REAL_VALUE_TYPE TWO31r; |
| rtx x, fp; |
| |
| x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1), |
| NULL, 1, OPTAB_DIRECT); |
| |
| fp = gen_reg_rtx (DFmode); |
| emit_insn (gen_floatsidf2 (fp, x)); |
| |
| real_ldexp (&TWO31r, &dconst1, 31); |
| x = const_double_from_real_value (TWO31r, DFmode); |
| |
| x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT); |
| |
| /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */ |
| if (HONOR_SIGNED_ZEROS (DFmode) && flag_rounding_math) |
| x = ix86_expand_sse_fabs (x, NULL); |
| |
| if (x != target) |
| emit_move_insn (target, x); |
| } |
| |
| /* Convert a signed DImode value into a DFmode. Only used for SSE in |
| 32-bit mode; otherwise we have a direct convert instruction. */ |
| |
| void |
| ix86_expand_convert_sign_didf_sse (rtx target, rtx input) |
| { |
| REAL_VALUE_TYPE TWO32r; |
| rtx fp_lo, fp_hi, x; |
| |
| fp_lo = gen_reg_rtx (DFmode); |
| fp_hi = gen_reg_rtx (DFmode); |
| |
| emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input))); |
| |
| real_ldexp (&TWO32r, &dconst1, 32); |
| x = const_double_from_real_value (TWO32r, DFmode); |
| fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT); |
| |
| ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input)); |
| |
| x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target, |
| 0, OPTAB_DIRECT); |
| if (x != target) |
| emit_move_insn (target, x); |
| } |
| |
| /* Convert an unsigned SImode value into an SFmode value, using only SSE. |
| For x86_32, -mfpmath=sse, !optimize_size only. */ |
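| /* The input is split as INPUT = HI * 0x10000 + LO with HI = INPUT >> 16 |
| and LO = INPUT & 0xffff, both halves are converted exactly with the |
| signed SImode->SFmode instruction, and the result is recombined as |
| HI * 0x1p16 + LO (using FMA when available). */ |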
| void |
| ix86_expand_convert_uns_sisf_sse (rtx target, rtx input) |
| { |
| REAL_VALUE_TYPE ONE16r; |
| rtx fp_hi, fp_lo, int_hi, int_lo, x; |
| |
| real_ldexp (&ONE16r, &dconst1, 16); |
| x = const_double_from_real_value (ONE16r, SFmode); |
| int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff), |
| NULL, 0, OPTAB_DIRECT); |
| int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16), |
| NULL, 0, OPTAB_DIRECT); |
| fp_hi = gen_reg_rtx (SFmode); |
| fp_lo = gen_reg_rtx (SFmode); |
| emit_insn (gen_floatsisf2 (fp_hi, int_hi)); |
| emit_insn (gen_floatsisf2 (fp_lo, int_lo)); |
| if (TARGET_FMA) |
| { |
| x = validize_mem (force_const_mem (SFmode, x)); |
| fp_hi = gen_rtx_FMA (SFmode, fp_hi, x, fp_lo); |
| emit_move_insn (target, fp_hi); |
| } |
| else |
| { |
| fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi, |
| 0, OPTAB_DIRECT); |
| fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target, |
| 0, OPTAB_DIRECT); |
| if (!rtx_equal_p (target, fp_hi)) |
| emit_move_insn (target, fp_hi); |
| } |
| } |
| |
| /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert |
| the vector of unsigned ints VAL to the vector of floats TARGET. */ |
| |
| void |
| ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val) |
| { |
| rtx tmp[8]; |
| REAL_VALUE_TYPE TWO16r; |
| machine_mode intmode = GET_MODE (val); |
| machine_mode fltmode = GET_MODE (target); |
| rtx (*cvt) (rtx, rtx); |
| |
| if (intmode == V4SImode) |
| cvt = gen_floatv4siv4sf2; |
| else |
| cvt = gen_floatv8siv8sf2; |
| tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff)); |
| tmp[0] = force_reg (intmode, tmp[0]); |
| tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1, |
| OPTAB_DIRECT); |
| tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16), |
| NULL_RTX, 1, OPTAB_DIRECT); |
| tmp[3] = gen_reg_rtx (fltmode); |
| emit_insn (cvt (tmp[3], tmp[1])); |
| tmp[4] = gen_reg_rtx (fltmode); |
| emit_insn (cvt (tmp[4], tmp[2])); |
| real_ldexp (&TWO16r, &dconst1, 16); |
| tmp[5] = const_double_from_real_value (TWO16r, SFmode); |
| tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5])); |
| if (TARGET_FMA) |
| { |
| tmp[6] = gen_rtx_FMA (fltmode, tmp[4], tmp[5], tmp[3]); |
| emit_move_insn (target, tmp[6]); |
| } |
| else |
| { |
| tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], |
| NULL_RTX, 1, OPTAB_DIRECT); |
| tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], |
| target, 1, OPTAB_DIRECT); |
| if (tmp[7] != target) |
| emit_move_insn (target, tmp[7]); |
| } |
| } |
| |
| /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc* |
| pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*. |
| This is done by doing just signed conversion if < 0x1p31, and otherwise by |
| subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */ |
| |
| rtx |
| ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp) |
| { |
| REAL_VALUE_TYPE TWO31r; |
| rtx two31r, tmp[4]; |
| machine_mode mode = GET_MODE (val); |
| machine_mode scalarmode = GET_MODE_INNER (mode); |
| machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode; |
| rtx (*cmp) (rtx, rtx, rtx, rtx); |
| int i; |
| |
| for (i = 0; i < 3; i++) |
| tmp[i] = gen_reg_rtx (mode); |
| real_ldexp (&TWO31r, &dconst1, 31); |
| two31r = const_double_from_real_value (TWO31r, scalarmode); |
| two31r = ix86_build_const_vector (mode, 1, two31r); |
| two31r = force_reg (mode, two31r); |
| switch (mode) |
| { |
| case E_V8SFmode: cmp = gen_avx_maskcmpv8sf3; break; |
| case E_V4SFmode: cmp = gen_sse_maskcmpv4sf3; break; |
| case E_V4DFmode: cmp = gen_avx_maskcmpv4df3; break; |
| case E_V2DFmode: cmp = gen_sse2_maskcmpv2df3; break; |
| default: gcc_unreachable (); |
| } |
| tmp[3] = gen_rtx_LE (mode, two31r, val); |
| emit_insn (cmp (tmp[0], two31r, val, tmp[3])); |
| tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1], |
| 0, OPTAB_DIRECT); |
| if (intmode == V4SImode || TARGET_AVX2) |
| *xorp = expand_simple_binop (intmode, ASHIFT, |
| gen_lowpart (intmode, tmp[0]), |
| GEN_INT (31), NULL_RTX, 0, |
| OPTAB_DIRECT); |
| else |
| { |
| rtx two31 = gen_int_mode (HOST_WIDE_INT_1U << 31, SImode); |
| two31 = ix86_build_const_vector (intmode, 1, two31); |
| *xorp = expand_simple_binop (intmode, AND, |
| gen_lowpart (intmode, tmp[0]), |
| two31, NULL_RTX, 0, |
| OPTAB_DIRECT); |
| } |
| return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2], |
| 0, OPTAB_DIRECT); |
| } |
| |
| /* Generate code for floating point ABS or NEG. */ |
| |
| void |
| ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode, |
| rtx operands[]) |
| { |
| rtx set, dst, src; |
| bool use_sse = false; |
| bool vector_mode = VECTOR_MODE_P (mode); |
| machine_mode vmode = mode; |
| rtvec par; |
| |
| if (vector_mode || mode == TFmode || mode == HFmode) |
| { |
| use_sse = true; |
| if (mode == HFmode) |
| vmode = V8HFmode; |
| } |
| else if (TARGET_SSE_MATH) |
| { |
| use_sse = SSE_FLOAT_MODE_P (mode); |
| if (mode == SFmode) |
| vmode = V4SFmode; |
| else if (mode == DFmode) |
| vmode = V2DFmode; |
| } |
| |
| dst = operands[0]; |
| src = operands[1]; |
| |
| set = gen_rtx_fmt_e (code, mode, src); |
| set = gen_rtx_SET (dst, set); |
| |
| if (use_sse) |
| { |
| rtx mask, use, clob; |
| |
| /* NEG and ABS performed with SSE use bitwise mask operations. |
| Create the appropriate mask now. */ |
| mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS); |
| use = gen_rtx_USE (VOIDmode, mask); |
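|       /* TFmode and vector operations always stay in SSE registers, so no |
|          flags clobber is needed; the remaining scalar patterns may be |
|          split into integer-unit operations, which do clobber the flags.  */ |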
| if (vector_mode || mode == TFmode) |
| par = gen_rtvec (2, set, use); |
| else |
| { |
| clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG)); |
| par = gen_rtvec (3, set, use, clob); |
| } |
| } |
| else |
| { |
| rtx clob; |
| |
|       /* The sign of an FP value can also be changed using the integer unit.  */ |
| clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG)); |
| par = gen_rtvec (2, set, clob); |
| } |
| |
| emit_insn (gen_rtx_PARALLEL (VOIDmode, par)); |
| } |
| |
| /* Deconstruct a floating point ABS or NEG operation |
| with integer registers into integer operations. */ |
| |
| void |
| ix86_split_fp_absneg_operator (enum rtx_code code, machine_mode mode, |
| rtx operands[]) |
| { |
| enum rtx_code absneg_op; |
| rtx dst, set; |
| |
| gcc_assert (operands_match_p (operands[0], operands[1])); |
| |
| switch (mode) |
| { |
| case E_SFmode: |
| dst = gen_lowpart (SImode, operands[0]); |
| |
| if (code == ABS) |
| { |
| set = gen_int_mode (0x7fffffff, SImode); |
| absneg_op = AND; |
| } |
| else |
| { |
| set = gen_int_mode (0x80000000, SImode); |
| absneg_op = XOR; |
| } |
| set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set); |
| break; |
| |
| case E_DFmode: |
| if (TARGET_64BIT) |
| { |
| dst = gen_lowpart (DImode, operands[0]); |
| dst = gen_rtx_ZERO_EXTRACT (DImode, dst, const1_rtx, GEN_INT (63)); |
| |
| if (code == ABS) |
| set = const0_rtx; |
| else |
| set = gen_rtx_NOT (DImode, dst); |
| } |
| else |
| { |
| dst = gen_highpart (SImode, operands[0]); |
| |
| if (code == ABS) |
| { |
| set = gen_int_mode (0x7fffffff, SImode); |
| absneg_op = AND; |
| } |
| else |
| { |
| set = gen_int_mode (0x80000000, SImode); |
| absneg_op = XOR; |
| } |
| set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set); |
| } |
| break; |
| |
| case E_XFmode: |
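|       /* The sign and exponent of an XFmode value occupy bits 64-79, so the |
|          word holding the sign bit is the third SImode word on 32-bit |
|          targets, or the low SImode part of the second register on 64-bit |
|          targets; only its low 16 bits matter.  */ |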
| dst = gen_rtx_REG (SImode, |
| REGNO (operands[0]) + (TARGET_64BIT ? 1 : 2)); |
| if (code == ABS) |
| { |
| set = GEN_INT (0x7fff); |
| absneg_op = AND; |
| } |
| else |
| { |
| set = GEN_INT (0x8000); |
| absneg_op = XOR; |
| } |
| set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set); |
| break; |
| |
| default: |
| gcc_unreachable (); |
| } |
| |
| set = gen_rtx_SET (dst, set); |
| |
| rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG)); |
| rtvec par = gen_rtvec (2, set, clob); |
| |
| emit_insn (gen_rtx_PARALLEL (VOIDmode, par)); |
| } |
| |
| /* Expand a copysign operation.  Special case the first source operand |
|    (operands[1]) being a constant.  */ |
| |
| void |
| ix86_expand_copysign (rtx operands[]) |
| { |
| machine_mode mode, vmode; |
| rtx dest, op0, op1, mask, op2, op3; |
| |
| mode = GET_MODE (operands[0]); |
| |
| if (mode == HFmode) |
| vmode = V8HFmode; |
| else if (mode == SFmode) |
| vmode = V4SFmode; |
| else if (mode == DFmode) |
| vmode = V2DFmode; |
| else if (mode == TFmode) |
| vmode = mode; |
| else |
| gcc_unreachable (); |
| |
| if (rtx_equal_p (operands[1], operands[2])) |
| { |
| emit_move_insn (operands[0], operands[1]); |
| return; |
| } |
| |
| dest = lowpart_subreg (vmode, operands[0], mode); |
| op1 = lowpart_subreg (vmode, operands[2], mode); |
| mask = ix86_build_signbit_mask (vmode, 0, 0); |
| |
| if (CONST_DOUBLE_P (operands[1])) |
| { |
| op0 = simplify_unary_operation (ABS, mode, operands[1], mode); |
|       /* Optimize for 0, simplify b = copysignf (0.0f, a) to b = mask & a.  */ |
| if (op0 == CONST0_RTX (mode)) |
| { |
| emit_move_insn (dest, gen_rtx_AND (vmode, mask, op1)); |
| return; |
| } |
| |
| if (GET_MODE_SIZE (mode) < 16) |
| op0 = ix86_build_const_vector (vmode, false, op0); |
| op0 = force_reg (vmode, op0); |
| } |
| else |
| op0 = lowpart_subreg (vmode, operands[1], mode); |
| |
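|   /* dest = (operands[1] & ~mask) | (operands[2] & mask), i.e. take the |
|      magnitude from operands[1] and the sign bit from operands[2].  */ |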
| op2 = gen_reg_rtx (vmode); |
| op3 = gen_reg_rtx (vmode); |
| emit_move_insn (op2, gen_rtx_AND (vmode, |
| gen_rtx_NOT (vmode, mask), |
| op0)); |
| emit_move_insn (op3, gen_rtx_AND (vmode, mask, op1)); |
| emit_move_insn (dest, gen_rtx_IOR (vmode, op2, op3)); |
| } |
| |
| /* Expand an xorsign operation. */ |
| |
| void |
| ix86_expand_xorsign (rtx operands[]) |
| { |
| machine_mode mode, vmode; |
| rtx dest, op0, op1, mask, x, temp; |
| |
| dest = operands[0]; |
| op0 = operands[1]; |
| op1 = operands[2]; |
| |
| mode = GET_MODE (dest); |
| |
| if (mode == HFmode) |
| vmode = V8HFmode; |
| else if (mode == SFmode) |
| vmode = V4SFmode; |
| else if (mode == DFmode) |
| vmode = V2DFmode; |
| else |
| gcc_unreachable (); |
| |
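|   /* dest = op0 ^ (op1 & signbit mask): the sign of op0 is flipped exactly |
|      when op1 is negative.  */ |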
| temp = gen_reg_rtx (vmode); |
| mask = ix86_build_signbit_mask (vmode, 0, 0); |
| |
| op1 = lowpart_subreg (vmode, op1, mode); |
| x = gen_rtx_AND (vmode, op1, mask); |
| emit_insn (gen_rtx_SET (temp, x)); |
| |
| op0 = lowpart_subreg (vmode, op0, mode); |
| x = gen_rtx_XOR (vmode, temp, op0); |
| |
| dest = lowpart_subreg (vmode, dest, mode); |
| emit_insn (gen_rtx_SET (dest, x)); |
| } |
| |
| static rtx ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1); |
| |
| void |
| ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label) |
| { |
| machine_mode mode = GET_MODE (op0); |
| rtx tmp; |
| |
|   /* Handle special case - vector comparison with boolean result, transform |
|      it using the ptest instruction.  */ |
| if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT) |
| { |
| rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG); |
| machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode; |
| |
| gcc_assert (code == EQ || code == NE); |
|       /* Generate XOR since we can't check that one operand is a zero vector.  */ |
| tmp = gen_reg_rtx (mode); |
| emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1))); |
| tmp = gen_lowpart (p_mode, tmp); |
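|       /* ptest of the XOR result against itself sets ZF exactly when all |
|          elements are zero, i.e. when op0 == op1.  */ |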
| emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG), |
| gen_rtx_UNSPEC (CCmode, |
| gen_rtvec (2, tmp, tmp), |
| UNSPEC_PTEST))); |
| tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx); |
| tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp, |
| gen_rtx_LABEL_REF (VOIDmode, label), |
| pc_rtx); |
| emit_jump_insn (gen_rtx_SET (pc_rtx, tmp)); |
| return; |
| } |
| |
| switch (mode) |
| { |
| case E_HFmode: |
| case E_SFmode: |
| case E_DFmode: |
| case E_XFmode: |
| case E_QImode: |
| case E_HImode: |
| case E_SImode: |
| simple: |
| tmp = ix86_expand_compare (code, op0, op1); |
| tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp, |
| gen_rtx_LABEL_REF (VOIDmode, label), |
| pc_rtx); |
| emit_jump_insn (gen_rtx_SET (pc_rtx, tmp)); |
| return; |
| |
| case E_DImode: |
| if (TARGET_64BIT) |
| goto simple; |
|       /* For a 32-bit target, a DImode comparison may be performed in |
|          SSE registers.  To allow this we avoid splitting to SImode, |
|          which is achieved by doing the xor in DImode and then comparing |
|          with zero (recognized by the STV pass).  We don't compare using |
|          xor when optimizing for size.  */ |
| if (!optimize_insn_for_size_p () |
| && TARGET_STV |
| && (code == EQ || code == NE)) |
| { |
| op0 = force_reg (mode, gen_rtx_XOR (mode, op0, op1)); |
| op1 = const0_rtx; |
| } |
| /* FALLTHRU */ |
| case E_TImode: |
|       /* Expand a double-word branch into multiple compare+branch.  */ |
| { |
| rtx lo[2], hi[2]; |
| rtx_code_label *label2; |
| enum rtx_code code1, code2, code3; |
| machine_mode submode; |
| |
| if (CONSTANT_P (op0) && !CONSTANT_P (op1)) |
| { |
| std::swap (op0, op1); |
| code = swap_condition (code); |
| } |
| |
| split_double_mode (mode, &op0, 1, lo+0, hi+0); |
| split_double_mode (mode, &op1, 1, lo+1, hi+1); |
| |
| submode = mode == DImode ? SImode : DImode; |
| |
| /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to |
| avoid two branches. This costs one extra insn, so disable when |
| optimizing for size. */ |
| |
| if ((code == EQ || code == NE) |
| && (!optimize_insn_for_size_p () |
| || hi[1] == const0_rtx || lo[1] == const0_rtx)) |
| { |
| rtx xor0, xor1; |
| |
| xor1 = hi[0]; |
| if (hi[1] != const0_rtx) |
| xor1 = expand_binop (submode, xor_optab, xor1, hi[1], |
| NULL_RTX, 0, OPTAB_WIDEN); |
| |
| xor0 = lo[0]; |
| if (lo[1] != const0_rtx) |
| xor0 = expand_binop (submode, xor_optab, xor0, lo[1], |
| NULL_RTX, 0, OPTAB_WIDEN); |
| |
| tmp = expand_binop (submode, ior_optab, xor1, xor0, |
| NULL_RTX, 0, OPTAB_WIDEN); |
| |
| ix86_expand_branch (code, tmp, const0_rtx, label); |
| return; |
| } |
| |
|         /* Otherwise, if we are doing a less-than or greater-or-equal |
|            comparison, op1 is a constant and the low word is zero, then |
|            we can just examine the high word.  Similarly for a low word |
|            of -1 with less-or-equal or greater-than.  */ |
| |
| if (CONST_INT_P (hi[1])) |
| switch (code) |
| { |
| case LT: case LTU: case GE: case GEU: |
| if (lo[1] == const0_rtx) |
| { |
| ix86_expand_branch (code, hi[0], hi[1], label); |
| return; |
| } |
| break; |
| case LE: case LEU: case GT: case GTU: |
| if (lo[1] == constm1_rtx) |
| { |
| ix86_expand_branch (code, hi[0], hi[1], label); |
| return; |
| } |
| break; |
| default: |
| break; |
| } |
| |
|         /* Emulate comparisons that do not depend on the Zero flag with |
|            double-word subtraction.  Only the Overflow, Sign and Carry |
|            flags are valid, so swap the arguments and condition of |
|            comparisons that would otherwise test the Zero flag.  */ |
| |
| switch (code) |
| { |
| case LE: case LEU: case GT: case GTU: |
| std::swap (lo[0], lo[1]); |
| std::swap (hi[0], hi[1]); |
| code = swap_condition (code); |
| /* FALLTHRU */ |
| |
| case LT: case LTU: case GE: case GEU: |
| { |
| bool uns = (code == LTU || code == GEU); |
| rtx (*sbb_insn) (machine_mode, rtx, rtx, rtx) |
| = uns ? gen_sub3_carry_ccc : gen_sub3_carry_ccgz; |
| |
| if (!nonimmediate_operand (lo[0], submode)) |
| lo[0] = force_reg (submode, lo[0]); |
| if (!x86_64_general_operand (lo[1], submode)) |
| lo[1] = force_reg (submode, lo[1]); |
| |
| if (!register_operand (hi[0], submode)) |
| hi[0] = force_reg (submode, hi[0]); |
| if ((uns && !nonimmediate_operand (hi[1], submode)) |
| || (!uns && !x86_64_general_operand (hi[1], submode))) |
| hi[1] = force_reg (submode, hi[1]); |
| |
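|               /* Compare the low words first; the sbb of the high words |
|                  into a scratch then leaves the flags describing the whole |
|                  double-word comparison.  */ |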
| emit_insn (gen_cmp_1 (submode, lo[0], lo[1])); |
| |
| tmp = gen_rtx_SCRATCH (submode); |
| emit_insn (sbb_insn (submode, tmp, hi[0], hi[1])); |
| |
| tmp = gen_rtx_REG (uns ? CCCmode : CCGZmode, FLAGS_REG); |
| ix86_expand_branch (code, tmp, const0_rtx, label); |
| return; |
| } |
| |
| default: |
| break; |
| } |
| |
| /* Otherwise, we need two or three jumps. */ |
| |
| label2 = gen_label_rtx (); |
| |
| code1 = code; |
| code2 = swap_condition (code); |
| code3 = unsigned_condition (code); |
| |
| switch (code) |
| { |
| case LT: case GT: case LTU: case GTU: |
| break; |
| |
| case LE: code1 = LT; code2 = GT; break; |
| case GE: code1 = GT; code2 = LT; break; |
| case LEU: code1 = LTU; code2 = GTU; break; |
| case GEU: code1 = GTU; code2 = LTU; break; |
| |
| case EQ: code1 = UNKNOWN; code2 = NE; break; |
| case NE: code2 = UNKNOWN; break; |
| |
| default: |
| gcc_unreachable (); |
| } |
| |
| /* |
| * a < b => |
| * if (hi(a) < hi(b)) goto true; |
| * if (hi(a) > hi(b)) goto false; |
| * if (lo(a) < lo(b)) goto true; |
| * false: |
| */ |
| |
| if (code1 != UNKNOWN) |
| ix86_expand_branch (code1, hi[0], hi[1], label); |
| if (code2 != UNKNOWN) |
| ix86_expand_branch (code2, hi[0], hi[1], label2); |
| |
| ix86_expand_branch (code3, lo[0], lo[1], label); |
| |
| if (code2 != UNKNOWN) |
| emit_label (label2); |
| return; |
| } |
| |
| default: |
| gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC); |
| goto simple; |
| } |
| } |
| |
| /* Figure out whether to use unordered fp comparisons. */ |
| |
| static bool |
| ix86_unordered_fp_compare (enum rtx_code code) |
| { |
| if (!TARGET_IEEE_FP) |
| return false; |
| |
| switch (code) |
| { |
| case LT: |
| case LE: |
| case GT: |
| case GE: |
| case LTGT: |
| return false; |
| |
| case EQ: |
| case NE: |
| |
| case UNORDERED: |
| case ORDERED: |
| case UNLT: |
| case UNLE: |
| case UNGT: |
| case UNGE: |
| case UNEQ: |
| return true; |
| |
| default: |
| gcc_unreachable (); |
| } |
| } |
| |
| /* Return a comparison we can do that is equivalent to |
|    swap_condition (code), except possibly for orderedness.  Never |
|    change orderedness if TARGET_IEEE_FP, returning UNKNOWN in that |
|    case if necessary.  */ |
| |
| static enum rtx_code |
| ix86_fp_swap_condition (enum rtx_code code) |
| { |
| switch (code) |
| { |
| case GT: /* GTU - CF=0 & ZF=0 */ |
| return TARGET_IEEE_FP ? UNKNOWN : UNLT; |
| case GE: /* GEU - CF=0 */ |
| return TARGET_IEEE_FP ? UNKNOWN : UNLE; |
| case UNLT: /* LTU - CF=1 */ |
| return TARGET_IEEE_FP ? UNKNOWN : GT; |
| case UNLE: /* LEU - CF=1 | ZF=1 */ |
| return TARGET_IEEE_FP ? UNKNOWN : GE; |
| default: |
| return swap_condition (code); |
| } |
| } |
| |
| /* Return the cost of comparison CODE using the best strategy for |
|    performance.  All the following functions use the number of |
|    instructions as the cost metric.  In the future this should be |
|    tweaked to compute bytes for optimize_size and to take into account |
|    the performance of various instructions on various CPUs.  */ |
| |
| static int |
| ix86_fp_comparison_cost (enum rtx_code code) |
| { |
| int arith_cost; |
| |
| /* The cost of code using bit-twiddling on %ah. */ |
| switch (code) |
| { |
| case UNLE: |
| case UNLT: |
| case LTGT: |
| case GT: |
| case GE: |
| case UNORDERED: |
| case ORDERED: |
| case UNEQ: |
| arith_cost = 4; |
| break; |
| case LT: |
| case NE: |
| case EQ: |
| case UNGE: |
| arith_cost = TARGET_IEEE_FP ? 5 : 4; |
| break; |
| case LE: |
| case UNGT: |
| arith_cost = TARGET_IEEE_FP ? 6 : 4; |
| break; |
| default: |
| gcc_unreachable (); |
| } |
| |
| switch (ix86_fp_comparison_strategy (code)) |
| { |
| case IX86_FPCMP_COMI: |
| return arith_cost > 4 ? 3 : 2; |
| case IX86_FPCMP_SAHF: |
| return arith_cost > 4 ? 4 : 3; |
| default: |
| return arith_cost; |
| } |
| } |
| |
| /* Swap, force into registers, or otherwise massage the two operands |
|    to an fp comparison.  The operands are updated in place; the new |
|    comparison code is returned.  */ |
| |
| static enum rtx_code |
| ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1) |
| { |
| bool unordered_compare = ix86_unordered_fp_compare (code); |
| rtx op0 = *pop0, op1 = *pop1; |
| machine_mode op_mode = GET_MODE (op0); |
| bool is_sse = SSE_FLOAT_MODE_SSEMATH_OR_HF_P (op_mode); |
| |
| /* All of the unordered compare instructions only work on registers. |
| The same is true of the fcomi compare instructions. The XFmode |
| compare instructions require registers except when comparing |
| against zero or when converting operand 1 from fixed point to |
| floating point. */ |
| |
| if (!is_sse |
| && (unordered_compare |
| || (op_mode == XFmode |
| && ! (standard_80387_constant_p (op0) == 1 |
| || standard_80387_constant_p (op1) == 1) |
| && GET_CODE (op1) != FLOAT) |
| || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI)) |
| { |
| op0 = force_reg (op_mode, op0); |
| op1 = force_reg (op_mode, op1); |
| } |
| else |
| { |
| /* %%% We only allow op1 in memory; op0 must be st(0). So swap |
| things around if they appear profitable, otherwise force op0 |
| into a register. */ |
| |
| if (standard_80387_constant_p (op0) == 0 |
| || (MEM_P (op0) |
| && ! (standard_80387_constant_p (op1) == 0 |
| || MEM_P (op1)))) |
| { |
| enum rtx_code new_code = ix86_fp_swap_condition (code); |
| if (new_code != UNKNOWN) |
| { |
| std::swap (op0, op1); |
| code = new_code; |
| } |
| } |
| |
| if (!REG_P (op0)) |
| op0 = force_reg (op_mode, op0); |
| |
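|       /* A constant op1 that the 387 cannot load directly must be compared |
|          from memory; loadable constants are forced into a register (the |
|          fldz constant only when cmov is available).  */ |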
| if (CONSTANT_P (op1)) |
| { |
| int tmp = standard_80387_constant_p (op1); |
| if (tmp == 0) |
| op1 = validize_mem (force_const_mem (op_mode, op1)); |
| else if (tmp == 1) |
| { |
| if (TARGET_CMOVE) |
| op1 = force_reg (op_mode, op1); |
| } |
| else |
| op1 = force_reg (op_mode, op1); |
| } |
| } |
| |
| /* Try to rearrange the comparison to make it cheaper. */ |
| if (ix86_fp_comparison_cost (code) |
| > ix86_fp_comparison_cost (swap_condition (code)) |
| && (REG_P (op1) || can_create_pseudo_p ())) |
| { |
| std::swap (op0, op1); |
| code = swap_condition (code); |
| if (!REG_P (op0)) |
| op0 = force_reg (op_mode, op0); |
| } |
| |
| *pop0 = op0; |
| *pop1 = op1; |
| return code; |
| } |
| |
| /* Generate insn patterns to do a floating point compare of OPERANDS. */ |
| |
| static rtx |
| ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1) |
| { |
| bool unordered_compare = ix86_unordered_fp_compare (code); |
| machine_mode cmp_mode; |
| rtx tmp, scratch; |
| |
| code = ix86_prepare_fp_compare_args (code, &op0, &op1); |
| |
| tmp = gen_rtx_COMPARE (CCFPmode, op0, op1); |
| if (unordered_compare) |
| tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP); |
| |
| /* Do fcomi/sahf based test when profitable. */ |
| switch (ix86_fp_comparison_strategy (code)) |
| { |
| case IX86_FPCMP_COMI: |
| cmp_mode = CCFPmode; |
| emit_insn (gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp)); |
| break; |
| |
| case IX86_FPCMP_SAHF: |
| cmp_mode = CCFPmode; |
| tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW); |
| scratch = gen_reg_rtx (HImode); |
| emit_insn (gen_rtx_SET (scratch, tmp)); |
| emit_insn (gen_x86_sahf_1 (scratch)); |
| break; |
| |
| case IX86_FPCMP_ARITH: |
| cmp_mode = CCNOmode; |
| tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW); |
| scratch = gen_reg_rtx (HImode); |
| emit_insn (gen_rtx_SET (scratch, tmp)); |
| |
|       /* In the unordered case, we have to check C2 for NaNs, which |
|          doesn't work out to anything nice combination-wise.  So do |
|          some bit twiddling on the value we've got in AH to come up |
|          with an appropriate set of condition codes.  */ |
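|       /* In AH, C0 is bit 0 (0x01), C2 is bit 2 (0x04) and C3 is bit 6 |
|          (0x40) of the stored status word, so testing 0x45 checks all |
|          three at once.  */ |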
| |
| switch (code) |
| { |
| case GT: |
| case UNGT: |
| if (code == GT || !TARGET_IEEE_FP) |
| { |
| emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45))); |
| code = EQ; |
| } |
| else |
| { |
| emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45))); |
| emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx)); |
| emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44))); |
| cmp_mode = CCmode; |
| code = GEU; |
| } |
| break; |
| case LT: |
| case UNLT: |
| if (code == LT && TARGET_IEEE_FP) |
| { |
| emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45))); |
| emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx)); |
| cmp_mode = CCmode; |
| code = EQ; |
| } |
| else |
| { |
| emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx)); |
| code = NE; |
| } |
| break; |
| case GE: |
| case UNGE: |
| if (code == GE || !TARGET_IEEE_FP) |
| { |
| emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05))); |
| code = EQ; |
| } |
| else |
| { |
| emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45))); |
| emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx)); |
| code = NE; |
| } |
| break; |
| case LE: |
| case UNLE: |
| if (code == LE && TARGET_IEEE_FP) |
| { |
| emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45))); |
| emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx)); |
| emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40))); |
| cmp_mode = CCmode; |
| code = LTU; |
| } |
| else |
| { |
| emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45))); |
| code = NE; |
| } |
| break; |
| case EQ: |
| case UNEQ: |
| if (code == EQ && TARGET_IEEE_FP) |
| { |
| emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45))); |
| emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40))); |
| cmp_mode = CCmode; |
| code = EQ; |
| } |
| else |
| { |
| emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40))); |
| code = NE; |
| } |
| break; |
| case NE: |
| case LTGT: |
| if (code == NE && TARGET_IEEE_FP) |
| { |
| emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45))); |
| emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, |
| GEN_INT (0x40))); |
| code = NE; |
| } |
| else |
| { |
| emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40))); |
| code = EQ; |
| } |
| break; |
| |
| case UNORDERED: |
| emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04))); |
| code = NE; |
| break; |
| case ORDERED: |
| emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04))); |
| code = EQ; |
| break; |
| |
| default: |
| gcc_unreachable (); |
| } |
| break; |
| |
| default: |
|       gcc_unreachable (); |
| } |
| |
| /* Return the test that should be put into the flags user, i.e. |
| the bcc, scc, or cmov instruction. */ |
| return gen_rtx_fmt_ee (code, VOIDmode, |
| gen_rtx_REG (cmp_mode, FLAGS_REG), |
| const0_rtx); |
| } |
| |
| /* Generate insn patterns to do an integer compare of OPERANDS. */ |
| |
| static rtx |
| ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1) |
| { |
| machine_mode cmpmode; |
| rtx tmp, flags; |
| |
| /* Swap operands to emit carry flag comparison. */ |
| if ((code == GTU || code == LEU) |
| && nonimmediate_operand (op1, VOIDmode)) |
| { |
| std::swap (op0, op1); |
| code = swap_condition (code); |
| } |
| |
| cmpmode = SELECT_CC_MODE (code, op0, op1); |
| flags = gen_rtx_REG (cmpmode, FLAGS_REG); |
| |
| /* This is very simple, but making the interface the same as in the |
| FP case makes the rest of the code easier. */ |
| tmp = gen_rtx_COMPARE (cmpmode, op0, op1); |
| emit_insn (gen_rtx_SET (flags, tmp)); |
| |
| /* Return the test that should be put into the flags user, i.e. |
| the bcc, scc, or cmov instruction. */ |
| return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx); |
| } |
| |
| static rtx |
| ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1) |
| { |
| rtx ret; |
| |
| if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC) |
| ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1); |
| |
| else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0))) |
| { |
| gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0))); |
| ret = ix86_expand_fp_compare (code, op0, op1); |
| } |
| else |
| ret = ix86_expand_int_compare (code, op0, op1); |
| |
| return ret; |
| } |
| |
| void |
| ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1) |
| { |
| rtx ret; |
| |
| gcc_assert (GET_MODE (dest) == QImode); |
| |
| ret = ix86_expand_compare (code, op0, op1); |
| PUT_MODE (ret, QImode); |
| emit_insn (gen_rtx_SET (dest, ret)); |
| } |
| |
| /* Expand a comparison setting or clearing the carry flag.  Return true |
|    when successful and set *POP to the comparison operation.  */ |
| static bool |
| ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop) |
| { |
| machine_mode mode |
| = GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1); |
| |
|   /* Do not handle double-mode compares that go through the special path.  */ |
| if (mode == (TARGET_64BIT ? TImode : DImode)) |
| return false; |
| |
| if (SCALAR_FLOAT_MODE_P (mode)) |
| { |
| rtx compare_op; |
| rtx_insn *compare_seq; |
| |
| gcc_assert (!DECIMAL_FLOAT_MODE_P (mode)); |
| |
|       /* Shortcut: the following common codes never translate |
|          into carry-flag compares.  */ |
| if (code == EQ || code == NE || code == UNEQ || code == LTGT |
| || code == ORDERED || code == UNORDERED) |
| return false; |
| |
|       /* These comparisons require the Zero flag; swap operands so |
|          they won't need it.  */ |
| if ((code == GT || code == UNLE || code == LE || code == UNGT) |
| && !TARGET_IEEE_FP) |
| { |
| std::swap (op0, op1); |
| code = swap_condition (code); |
| } |
| |
|       /* Try to expand the comparison and verify that we end up with a |
|          carry-flag-based comparison.  This fails only when we decide |
|          to expand the comparison using arithmetic, which is not a |
|          common scenario.  */ |
| start_sequence (); |
| compare_op = ix86_expand_fp_compare (code, op0, op1); |
| compare_seq = get_insns (); |
| end_sequence (); |
| |
| if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode) |
| code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op)); |
| else |
| code = GET_CODE (compare_op); |
| |
| if (code != LTU && code != GEU) |
| return false; |
| |
| emit_insn (compare_seq); |
| *pop = compare_op; |
| return true; |
| } |
| |
| if (!INTEGRAL_MODE_P (mode)) |
| return false; |
| |
| switch (code) |
| { |
| case LTU: |
| case GEU: |
| break; |
| |
| /* Convert a==0 into (unsigned)a<1. */ |
| case EQ: |
| case NE: |
| if (op1 != const0_rtx) |
| return false; |
| op1 = const1_rtx; |
| code = (code == EQ ? LTU : GEU); |
| break; |
| |
|       /* Convert a>b into b<a or a>=b+1.  */ |
| case GTU: |
| case LEU: |
| if (CONST_INT_P (op1)) |
| { |
| op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0)); |
|           /* Bail out on overflow.  We can still swap the operands, but |
|              that would force loading the constant into a register.  */ |
| if (op1 == const0_rtx |
| || !x86_64_immediate_operand (op1, GET_MODE (op1))) |
| return false; |
| code = (code == GTU ? GEU : LTU); |
| } |
| else |
| { |
| std::swap (op0, op1); |
| code = (code == GTU ? LTU : GEU); |
| } |
| break; |
| |
| /* Convert a>=0 into (unsigned)a<0x80000000. */ |
| case LT: |
| case GE: |
| if (mode == DImode || op1 != const0_rtx) |
| return false; |
| op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode); |
| code = (code == LT ? GEU : LTU); |
| break; |
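|       /* Similarly convert a>-1 into (unsigned)a<0x80000000.  */ |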
| case LE: |
| case GT: |
| if (mode == DImode || op1 != constm1_rtx) |
| return false; |
| op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode); |
| code = (code == LE ? GEU : LTU); |
| break; |
| |
| default: |
| return false; |
| } |
|   /* Swapping operands may cause a constant to appear as the first operand.  */ |
| if (!nonimmediate_operand (op0, VOIDmode)) |
| { |
| if (!can_create_pseudo_p ()) |
| return false; |
| op0 = force_reg (mode, op0); |
| } |
| *pop = ix86_expand_compare (code, op0, op1); |
| gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU); |
| return true; |
| } |
| |
| /* Expand conditional increment or decrement using adc/sbb instructions. |
|    The default case using setcc followed by the conditional move can be |
|    done by generic code.  */ |
| bool |
| ix86_expand_int_addcc (rtx operands[]) |
| { |
| enum rtx_code code = GET_CODE (operands[1]); |
| rtx flags; |
| rtx (*insn) (machine_mode, rtx, rtx, rtx, rtx, rtx); |
| rtx compare_op; |
| rtx val = const0_rtx; |
| bool fpcmp = false; |
| machine_mode mode; |
| rtx op0 = XEXP (operands[1], 0); |
| rtx op1 = XEXP (operands[1], 1); |
| |
| if (operands[3] != const1_rtx |
| && operands[3] != constm1_rtx) |
| return false; |
| if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op)) |
| return false; |
| code = GET_CODE (compare_op); |
| |
| flags = XEXP (compare_op, 0); |
| |
| if (GET_MODE (flags) == CCFPmode) |
| { |
| fpcmp = true; |
| code = ix86_fp_compare_code_to_integer (code); |
| } |
| |
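|   /* The carry flag is set for the LTU form of the comparison; for GEU, |
|      use -1 as the constant operand and reverse the condition stored in |
|      COMPARE_OP, so that the adc/sbb changes the result by exactly one |
|      only when the original condition holds.  */ |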
| if (code != LTU) |
| { |
| val = constm1_rtx; |
| if (fpcmp) |
| PUT_CODE (compare_op, |
| reverse_condition_maybe_unordered |
| (GET_CODE (compare_op))); |
| else |
| PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op))); |
| } |
| |
| mode = GET_MODE (operands[0]); |
| |
| /* Construct either adc or sbb insn. */ |
| if ((code == LTU) == (operands[3] == constm1_rtx)) |
| insn = gen_sub3_carry; |
| else |
| insn = gen_add3_carry; |
| |
| emit_insn (insn (mode, operands[0], operands[2], val, flags, compare_op)); |
| |
| return true; |
| } |
| |
| bool |
| ix86_expand_int_movcc (rtx operands[]) |
| { |
| enum rtx_code code = GET_CODE (operands[1]), compare_code; |
| rtx_insn *compare_seq; |
| rtx compare_op; |
| machine_mode mode = GET_MODE (operands[0]); |
| bool sign_bit_compare_p = false; |
| rtx op0 = XEXP (operands[1], 0); |
| rtx op1 = XEXP (operands[1], 1); |
| |
| if (GET_MODE (op0) == TImode |
| || (GET_MODE (op0) == DImode |
| && !TARGET_64BIT)) |
| return false; |
| |
| start_sequence (); |
| compare_op = ix86_expand_compare (code, op0, op1); |
| compare_seq = get_insns (); |
| end_sequence (); |
| |
| compare_code = GET_CODE (compare_op); |
| |
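|   /* Comparisons against 0 or -1 that depend only on the sign bit of op0 |
|      can use the shift-based sequences below.  */ |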
| if ((op1 == const0_rtx && (code == GE || code == LT)) |
| || (op1 == constm1_rtx && (code == GT || code == LE))) |
| sign_bit_compare_p = true; |
| |
| /* Don't attempt mode expansion here -- if we had to expand 5 or 6 |
| HImode insns, we'd be swallowed in word prefix ops. */ |
| |
| if ((mode != HImode || TARGET_FAST_PREFIX) |
| && (mode != (TARGET_64BIT ? TImode : DImode)) |
| && CONST_INT_P (operands[2]) |
| && CONST_INT_P (operands[3])) |
| { |
| rtx out = operands[0]; |
| HOST_WIDE_INT ct = INTVAL (operands[2]); |
| HOST_WIDE_INT cf = INTVAL (operands[3]); |
| HOST_WIDE_INT diff; |
| |
| diff = ct - cf; |
|       /* Sign bit compares are better done using shifts than by |
|          using sbb.  */ |
| if (sign_bit_compare_p |
| || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op)) |
| { |
| /* Detect overlap between destination and compare sources. */ |
| rtx tmp = out; |
| |
| if (!sign_bit_compare_p) |
| { |
| rtx flags; |
| bool fpcmp = false; |
| |
| compare_code = GET_CODE (compare_op); |
| |
| flags = XEXP (compare_op, 0); |
| |
| if (GET_MODE (flags) == CCFPmode) |
| { |
| fpcmp = true; |
| compare_code |
| = ix86_fp_compare_code_to_integer (compare_code); |
| } |
| |
|               /* To simplify the rest of the code, restrict to the GEU case.  */ |
| if (compare_code == LTU) |
| { |
| std::swap (ct, cf); |
| compare_code = reverse_condition (compare_code); |
| code = reverse_condition (code); |
| } |
| else |
| { |
| if (fpcmp) |
| PUT_CODE (compare_op, |
| reverse_condition_maybe_unordered |
| (GET_CODE (compare_op))); |
| else |
| PUT_CODE (compare_op, |
| reverse_condition (GET_CODE (compare_op))); |
| } |
| diff = ct - cf; |
| |
|