blob: 93644be9cb3c49912ea19f30b869a1f674676602 [file] [log] [blame]
/* Costs of operations of individual x86 CPUs.
Copyright (C) 1988-2021 Free Software Foundation, Inc.
This file is part of GCC.
GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.
GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.
You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
<http://www.gnu.org/licenses/>. */
/* Processor costs (relative to an add) */
/* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */
#define COSTS_N_BYTES(N) ((N) * 2)
#define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}
static stringop_algs ix86_size_memcpy[2] = {
{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
static stringop_algs ix86_size_memset[2] = {
{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
const
struct processor_costs ix86_size_cost = {/* costs for tuning for size */
{
/* Start of register allocator costs. integer->integer move cost is 2. */
2, /* cost for loading QImode using movzbl */
{2, 2, 2}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{2, 2, 2}, /* cost of storing integer registers */
2, /* cost of reg,reg fld/fst */
{2, 2, 2}, /* cost of loading fp registers
in SFmode, DFmode and XFmode */
{2, 2, 2}, /* cost of storing fp registers
in SFmode, DFmode and XFmode */
3, /* cost of moving MMX register */
{3, 3}, /* cost of loading MMX registers
in SImode and DImode */
{3, 3}, /* cost of storing MMX registers
in SImode and DImode */
3, 3, 3, /* cost of moving XMM,YMM,ZMM register */
{3, 3, 3, 3, 3}, /* cost of loading SSE registers
in 32,64,128,256 and 512-bit */
{3, 3, 3, 3, 3}, /* cost of storing SSE registers
in 32,64,128,256 and 512-bit */
3, 3, /* SSE->integer and integer->SSE moves */
3, 3, /* mask->integer and integer->mask moves */
{2, 2, 2}, /* cost of loading mask register
in QImode, HImode, SImode. */
{2, 2, 2}, /* cost if storing mask register
in QImode, HImode, SImode. */
2, /* cost of moving mask register. */
/* End of register allocator costs. */
},
COSTS_N_BYTES (2), /* cost of an add instruction */
COSTS_N_BYTES (3), /* cost of a lea instruction */
COSTS_N_BYTES (2), /* variable shift costs */
COSTS_N_BYTES (3), /* constant shift costs */
{COSTS_N_BYTES (3), /* cost of starting multiply for QI */
COSTS_N_BYTES (3), /* HI */
COSTS_N_BYTES (3), /* SI */
COSTS_N_BYTES (3), /* DI */
COSTS_N_BYTES (5)}, /* other */
0, /* cost of multiply per each bit set */
{COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
COSTS_N_BYTES (3), /* HI */
COSTS_N_BYTES (3), /* SI */
COSTS_N_BYTES (3), /* DI */
COSTS_N_BYTES (5)}, /* other */
COSTS_N_BYTES (3), /* cost of movsx */
COSTS_N_BYTES (3), /* cost of movzx */
0, /* "large" insn */
2, /* MOVE_RATIO */
2, /* CLEAR_RATIO */
{2, 2, 2}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{2, 2, 2}, /* cost of storing integer registers */
{3, 3, 3, 3, 3}, /* cost of loading SSE register
in 32bit, 64bit, 128bit, 256bit and 512bit */
{3, 3, 3, 3, 3}, /* cost of storing SSE register
in 32bit, 64bit, 128bit, 256bit and 512bit */
{3, 3, 3, 3, 3}, /* cost of unaligned SSE load
in 128bit, 256bit and 512bit */
{3, 3, 3, 3, 3}, /* cost of unaligned SSE store
in 128bit, 256bit and 512bit */
3, 3, 3, /* cost of moving XMM,YMM,ZMM register */
3, /* cost of moving SSE register to integer. */
5, 0, /* Gather load static, per_elt. */
5, 0, /* Gather store static, per_elt. */
0, /* size of l1 cache */
0, /* size of l2 cache */
0, /* size of prefetch block */
0, /* number of parallel prefetches */
2, /* Branch cost */
COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
COSTS_N_BYTES (2), /* cost of FMUL instruction. */
COSTS_N_BYTES (2), /* cost of FDIV instruction. */
COSTS_N_BYTES (2), /* cost of FABS instruction. */
COSTS_N_BYTES (2), /* cost of FCHS instruction. */
COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
COSTS_N_BYTES (2), /* cost of cheap SSE instruction. */
COSTS_N_BYTES (2), /* cost of ADDSS/SD SUBSS/SD insns. */
COSTS_N_BYTES (2), /* cost of MULSS instruction. */
COSTS_N_BYTES (2), /* cost of MULSD instruction. */
COSTS_N_BYTES (2), /* cost of FMA SS instruction. */
COSTS_N_BYTES (2), /* cost of FMA SD instruction. */
COSTS_N_BYTES (2), /* cost of DIVSS instruction. */
COSTS_N_BYTES (2), /* cost of DIVSD instruction. */
COSTS_N_BYTES (2), /* cost of SQRTSS instruction. */
COSTS_N_BYTES (2), /* cost of SQRTSD instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
ix86_size_memcpy,
ix86_size_memset,
COSTS_N_BYTES (1), /* cond_taken_branch_cost. */
COSTS_N_BYTES (1), /* cond_not_taken_branch_cost. */
NULL, /* Loop alignment. */
NULL, /* Jump alignment. */
NULL, /* Label alignment. */
NULL, /* Func alignment. */
};
/* Processor costs (relative to an add) */
static stringop_algs i386_memcpy[2] = {
{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
DUMMY_STRINGOP_ALGS};
static stringop_algs i386_memset[2] = {
{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
DUMMY_STRINGOP_ALGS};
static const
struct processor_costs i386_cost = { /* 386 specific costs */
{
/* Start of register allocator costs. integer->integer move cost is 2. */
4, /* cost for loading QImode using movzbl */
{2, 4, 2}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{2, 4, 2}, /* cost of storing integer registers */
2, /* cost of reg,reg fld/fst */
{8, 8, 8}, /* cost of loading fp registers
in SFmode, DFmode and XFmode */
{8, 8, 8}, /* cost of storing fp registers
in SFmode, DFmode and XFmode */
2, /* cost of moving MMX register */
{4, 8}, /* cost of loading MMX registers
in SImode and DImode */
{4, 8}, /* cost of storing MMX registers
in SImode and DImode */
2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
{4, 8, 16, 32, 64}, /* cost of loading SSE registers
in 32,64,128,256 and 512-bit */
{4, 8, 16, 32, 64}, /* cost of storing SSE registers
in 32,64,128,256 and 512-bit */
3, 3, /* SSE->integer and integer->SSE moves */
3, 3, /* mask->integer and integer->mask moves */
{2, 4, 2}, /* cost of loading mask register
in QImode, HImode, SImode. */
{2, 4, 2}, /* cost if storing mask register
in QImode, HImode, SImode. */
2, /* cost of moving mask register. */
/* End of register allocator costs. */
},
COSTS_N_INSNS (1), /* cost of an add instruction */
COSTS_N_INSNS (1), /* cost of a lea instruction */
COSTS_N_INSNS (3), /* variable shift costs */
COSTS_N_INSNS (2), /* constant shift costs */
{COSTS_N_INSNS (6), /* cost of starting multiply for QI */
COSTS_N_INSNS (6), /* HI */
COSTS_N_INSNS (6), /* SI */
COSTS_N_INSNS (6), /* DI */
COSTS_N_INSNS (6)}, /* other */
COSTS_N_INSNS (1), /* cost of multiply per each bit set */
{COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
COSTS_N_INSNS (23), /* HI */
COSTS_N_INSNS (23), /* SI */
COSTS_N_INSNS (23), /* DI */
COSTS_N_INSNS (23)}, /* other */
COSTS_N_INSNS (3), /* cost of movsx */
COSTS_N_INSNS (2), /* cost of movzx */
15, /* "large" insn */
3, /* MOVE_RATIO */
3, /* CLEAR_RATIO */
{2, 4, 2}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{2, 4, 2}, /* cost of storing integer registers */
{4, 8, 16, 32, 64}, /* cost of loading SSE register
in 32bit, 64bit, 128bit, 256bit and 512bit */
{4, 8, 16, 32, 64}, /* cost of storing SSE register
in 32bit, 64bit, 128bit, 256bit and 512bit */
{4, 8, 16, 32, 64}, /* cost of unaligned loads. */
{4, 8, 16, 32, 64}, /* cost of unaligned stores. */
2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
3, /* cost of moving SSE register to integer. */
4, 4, /* Gather load static, per_elt. */
4, 4, /* Gather store static, per_elt. */
0, /* size of l1 cache */
0, /* size of l2 cache */
0, /* size of prefetch block */
0, /* number of parallel prefetches */
1, /* Branch cost */
COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
COSTS_N_INSNS (27), /* cost of FMUL instruction. */
COSTS_N_INSNS (88), /* cost of FDIV instruction. */
COSTS_N_INSNS (22), /* cost of FABS instruction. */
COSTS_N_INSNS (24), /* cost of FCHS instruction. */
COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
COSTS_N_INSNS (23), /* cost of ADDSS/SD SUBSS/SD insns. */
COSTS_N_INSNS (27), /* cost of MULSS instruction. */
COSTS_N_INSNS (27), /* cost of MULSD instruction. */
COSTS_N_INSNS (27), /* cost of FMA SS instruction. */
COSTS_N_INSNS (27), /* cost of FMA SD instruction. */
COSTS_N_INSNS (88), /* cost of DIVSS instruction. */
COSTS_N_INSNS (88), /* cost of DIVSD instruction. */
COSTS_N_INSNS (122), /* cost of SQRTSS instruction. */
COSTS_N_INSNS (122), /* cost of SQRTSD instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
i386_memcpy,
i386_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
"4", /* Loop alignment. */
"4", /* Jump alignment. */
NULL, /* Label alignment. */
"4", /* Func alignment. */
};
static stringop_algs i486_memcpy[2] = {
{rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
DUMMY_STRINGOP_ALGS};
static stringop_algs i486_memset[2] = {
{rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
DUMMY_STRINGOP_ALGS};
static const
struct processor_costs i486_cost = { /* 486 specific costs */
{
/* Start of register allocator costs. integer->integer move cost is 2. */
4, /* cost for loading QImode using movzbl */
{2, 4, 2}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{2, 4, 2}, /* cost of storing integer registers */
2, /* cost of reg,reg fld/fst */
{8, 8, 8}, /* cost of loading fp registers
in SFmode, DFmode and XFmode */
{8, 8, 8}, /* cost of storing fp registers
in SFmode, DFmode and XFmode */
2, /* cost of moving MMX register */
{4, 8}, /* cost of loading MMX registers
in SImode and DImode */
{4, 8}, /* cost of storing MMX registers
in SImode and DImode */
2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
{4, 8, 16, 32, 64}, /* cost of loading SSE registers
in 32,64,128,256 and 512-bit */
{4, 8, 16, 32, 64}, /* cost of storing SSE registers
in 32,64,128,256 and 512-bit */
3, 3, /* SSE->integer and integer->SSE moves */
3, 3, /* mask->integer and integer->mask moves */
{2, 4, 2}, /* cost of loading mask register
in QImode, HImode, SImode. */
{2, 4, 2}, /* cost if storing mask register
in QImode, HImode, SImode. */
2, /* cost of moving mask register. */
/* End of register allocator costs. */
},
COSTS_N_INSNS (1), /* cost of an add instruction */
COSTS_N_INSNS (1), /* cost of a lea instruction */
COSTS_N_INSNS (3), /* variable shift costs */
COSTS_N_INSNS (2), /* constant shift costs */
{COSTS_N_INSNS (12), /* cost of starting multiply for QI */
COSTS_N_INSNS (12), /* HI */
COSTS_N_INSNS (12), /* SI */
COSTS_N_INSNS (12), /* DI */
COSTS_N_INSNS (12)}, /* other */
1, /* cost of multiply per each bit set */
{COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
COSTS_N_INSNS (40), /* HI */
COSTS_N_INSNS (40), /* SI */
COSTS_N_INSNS (40), /* DI */
COSTS_N_INSNS (40)}, /* other */
COSTS_N_INSNS (3), /* cost of movsx */
COSTS_N_INSNS (2), /* cost of movzx */
15, /* "large" insn */
3, /* MOVE_RATIO */
3, /* CLEAR_RATIO */
{2, 4, 2}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{2, 4, 2}, /* cost of storing integer registers */
{4, 8, 16, 32, 64}, /* cost of loading SSE register
in 32bit, 64bit, 128bit, 256bit and 512bit */
{4, 8, 16, 32, 64}, /* cost of storing SSE register
in 32bit, 64bit, 128bit, 256bit and 512bit */
{4, 8, 16, 32, 64}, /* cost of unaligned loads. */
{4, 8, 16, 32, 64}, /* cost of unaligned stores. */
2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
3, /* cost of moving SSE register to integer. */
4, 4, /* Gather load static, per_elt. */
4, 4, /* Gather store static, per_elt. */
4, /* size of l1 cache. 486 has 8kB cache
shared for code and data, so 4kB is
not really precise. */
4, /* size of l2 cache */
0, /* size of prefetch block */
0, /* number of parallel prefetches */
1, /* Branch cost */
COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
COSTS_N_INSNS (16), /* cost of FMUL instruction. */
COSTS_N_INSNS (73), /* cost of FDIV instruction. */
COSTS_N_INSNS (3), /* cost of FABS instruction. */
COSTS_N_INSNS (3), /* cost of FCHS instruction. */
COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
COSTS_N_INSNS (8), /* cost of ADDSS/SD SUBSS/SD insns. */
COSTS_N_INSNS (16), /* cost of MULSS instruction. */
COSTS_N_INSNS (16), /* cost of MULSD instruction. */
COSTS_N_INSNS (16), /* cost of FMA SS instruction. */
COSTS_N_INSNS (16), /* cost of FMA SD instruction. */
COSTS_N_INSNS (73), /* cost of DIVSS instruction. */
COSTS_N_INSNS (74), /* cost of DIVSD instruction. */
COSTS_N_INSNS (83), /* cost of SQRTSS instruction. */
COSTS_N_INSNS (83), /* cost of SQRTSD instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
i486_memcpy,
i486_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
"16", /* Loop alignment. */
"16", /* Jump alignment. */
"0:0:8", /* Label alignment. */
"16", /* Func alignment. */
};
static stringop_algs pentium_memcpy[2] = {
{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
DUMMY_STRINGOP_ALGS};
static stringop_algs pentium_memset[2] = {
{libcall, {{-1, rep_prefix_4_byte, false}}},
DUMMY_STRINGOP_ALGS};
static const
struct processor_costs pentium_cost = {
{
/* Start of register allocator costs. integer->integer move cost is 2. */
6, /* cost for loading QImode using movzbl */
{2, 4, 2}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{2, 4, 2}, /* cost of storing integer registers */
2, /* cost of reg,reg fld/fst */
{2, 2, 6}, /* cost of loading fp registers
in SFmode, DFmode and XFmode */
{4, 4, 6}, /* cost of storing fp registers
in SFmode, DFmode and XFmode */
8, /* cost of moving MMX register */
{8, 8}, /* cost of loading MMX registers
in SImode and DImode */
{8, 8}, /* cost of storing MMX registers
in SImode and DImode */
2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
{4, 8, 16, 32, 64}, /* cost of loading SSE registers
in 32,64,128,256 and 512-bit */
{4, 8, 16, 32, 64}, /* cost of storing SSE registers
in 32,64,128,256 and 512-bit */
3, 3, /* SSE->integer and integer->SSE moves */
3, 3, /* mask->integer and integer->mask moves */
{2, 4, 2}, /* cost of loading mask register
in QImode, HImode, SImode. */
{2, 4, 2}, /* cost if storing mask register
in QImode, HImode, SImode. */
2, /* cost of moving mask register. */
/* End of register allocator costs. */
},
COSTS_N_INSNS (1), /* cost of an add instruction */
COSTS_N_INSNS (1), /* cost of a lea instruction */
COSTS_N_INSNS (4), /* variable shift costs */
COSTS_N_INSNS (1), /* constant shift costs */
{COSTS_N_INSNS (11), /* cost of starting multiply for QI */
COSTS_N_INSNS (11), /* HI */
COSTS_N_INSNS (11), /* SI */
COSTS_N_INSNS (11), /* DI */
COSTS_N_INSNS (11)}, /* other */
0, /* cost of multiply per each bit set */
{COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
COSTS_N_INSNS (25), /* HI */
COSTS_N_INSNS (25), /* SI */
COSTS_N_INSNS (25), /* DI */
COSTS_N_INSNS (25)}, /* other */
COSTS_N_INSNS (3), /* cost of movsx */
COSTS_N_INSNS (2), /* cost of movzx */
8, /* "large" insn */
6, /* MOVE_RATIO */
6, /* CLEAR_RATIO */
{2, 4, 2}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{2, 4, 2}, /* cost of storing integer registers */
{4, 8, 16, 32, 64}, /* cost of loading SSE register
in 32bit, 64bit, 128bit, 256bit and 512bit */
{4, 8, 16, 32, 64}, /* cost of storing SSE register
in 32bit, 64bit, 128bit, 256bit and 512bit */
{4, 8, 16, 32, 64}, /* cost of unaligned loads. */
{4, 8, 16, 32, 64}, /* cost of unaligned stores. */
2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
3, /* cost of moving SSE register to integer. */
4, 4, /* Gather load static, per_elt. */
4, 4, /* Gather store static, per_elt. */
8, /* size of l1 cache. */
8, /* size of l2 cache */
0, /* size of prefetch block */
0, /* number of parallel prefetches */
2, /* Branch cost */
COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
COSTS_N_INSNS (3), /* cost of FMUL instruction. */
COSTS_N_INSNS (39), /* cost of FDIV instruction. */
COSTS_N_INSNS (1), /* cost of FABS instruction. */
COSTS_N_INSNS (1), /* cost of FCHS instruction. */
COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
COSTS_N_INSNS (3), /* cost of MULSS instruction. */
COSTS_N_INSNS (3), /* cost of MULSD instruction. */
COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
COSTS_N_INSNS (39), /* cost of DIVSS instruction. */
COSTS_N_INSNS (39), /* cost of DIVSD instruction. */
COSTS_N_INSNS (70), /* cost of SQRTSS instruction. */
COSTS_N_INSNS (70), /* cost of SQRTSD instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
pentium_memcpy,
pentium_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
"16:8:8", /* Loop alignment. */
"16:8:8", /* Jump alignment. */
"0:0:8", /* Label alignment. */
"16", /* Func alignment. */
};
static const
struct processor_costs lakemont_cost = {
{
/* Start of register allocator costs. integer->integer move cost is 2. */
6, /* cost for loading QImode using movzbl */
{2, 4, 2}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{2, 4, 2}, /* cost of storing integer registers */
2, /* cost of reg,reg fld/fst */
{2, 2, 6}, /* cost of loading fp registers
in SFmode, DFmode and XFmode */
{4, 4, 6}, /* cost of storing fp registers
in SFmode, DFmode and XFmode */
8, /* cost of moving MMX register */
{8, 8}, /* cost of loading MMX registers
in SImode and DImode */
{8, 8}, /* cost of storing MMX registers
in SImode and DImode */
2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
{4, 8, 16, 32, 64}, /* cost of loading SSE registers
in 32,64,128,256 and 512-bit */
{4, 8, 16, 32, 64}, /* cost of storing SSE registers
in 32,64,128,256 and 512-bit */
3, 3, /* SSE->integer and integer->SSE moves */
3, 3, /* mask->integer and integer->mask moves */
{2, 4, 2}, /* cost of loading mask register
in QImode, HImode, SImode. */
{2, 4, 2}, /* cost if storing mask register
in QImode, HImode, SImode. */
2, /* cost of moving mask register. */
/* End of register allocator costs. */
},
COSTS_N_INSNS (1), /* cost of an add instruction */
COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
COSTS_N_INSNS (1), /* variable shift costs */
COSTS_N_INSNS (1), /* constant shift costs */
{COSTS_N_INSNS (11), /* cost of starting multiply for QI */
COSTS_N_INSNS (11), /* HI */
COSTS_N_INSNS (11), /* SI */
COSTS_N_INSNS (11), /* DI */
COSTS_N_INSNS (11)}, /* other */
0, /* cost of multiply per each bit set */
{COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
COSTS_N_INSNS (25), /* HI */
COSTS_N_INSNS (25), /* SI */
COSTS_N_INSNS (25), /* DI */
COSTS_N_INSNS (25)}, /* other */
COSTS_N_INSNS (3), /* cost of movsx */
COSTS_N_INSNS (2), /* cost of movzx */
8, /* "large" insn */
17, /* MOVE_RATIO */
6, /* CLEAR_RATIO */
{2, 4, 2}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{2, 4, 2}, /* cost of storing integer registers */
{4, 8, 16, 32, 64}, /* cost of loading SSE register
in 32bit, 64bit, 128bit, 256bit and 512bit */
{4, 8, 16, 32, 64}, /* cost of storing SSE register
in 32bit, 64bit, 128bit, 256bit and 512bit */
{4, 8, 16, 32, 64}, /* cost of unaligned loads. */
{4, 8, 16, 32, 64}, /* cost of unaligned stores. */
2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
3, /* cost of moving SSE register to integer. */
4, 4, /* Gather load static, per_elt. */
4, 4, /* Gather store static, per_elt. */
8, /* size of l1 cache. */
8, /* size of l2 cache */
0, /* size of prefetch block */
0, /* number of parallel prefetches */
2, /* Branch cost */
COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
COSTS_N_INSNS (3), /* cost of FMUL instruction. */
COSTS_N_INSNS (39), /* cost of FDIV instruction. */
COSTS_N_INSNS (1), /* cost of FABS instruction. */
COSTS_N_INSNS (1), /* cost of FCHS instruction. */
COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
COSTS_N_INSNS (5), /* cost of ADDSS/SD SUBSS/SD insns. */
COSTS_N_INSNS (5), /* cost of MULSS instruction. */
COSTS_N_INSNS (5), /* cost of MULSD instruction. */
COSTS_N_INSNS (10), /* cost of FMA SS instruction. */
COSTS_N_INSNS (10), /* cost of FMA SD instruction. */
COSTS_N_INSNS (31), /* cost of DIVSS instruction. */
COSTS_N_INSNS (60), /* cost of DIVSD instruction. */
COSTS_N_INSNS (31), /* cost of SQRTSS instruction. */
COSTS_N_INSNS (63), /* cost of SQRTSD instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
pentium_memcpy,
pentium_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
"16:8:8", /* Loop alignment. */
"16:8:8", /* Jump alignment. */
"0:0:8", /* Label alignment. */
"16", /* Func alignment. */
};
/* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
(we ensure the alignment). For small blocks inline loop is still a
noticeable win, for bigger blocks either rep movsl or rep movsb is
way to go. Rep movsb has apparently more expensive startup time in CPU,
but after 4K the difference is down in the noise. */
static stringop_algs pentiumpro_memcpy[2] = {
{rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false},
{8192, rep_prefix_4_byte, false},
{-1, rep_prefix_1_byte, false}}},
DUMMY_STRINGOP_ALGS};
static stringop_algs pentiumpro_memset[2] = {
{rep_prefix_4_byte, {{1024, unrolled_loop, false},
{8192, rep_prefix_4_byte, false},
{-1, libcall, false}}},
DUMMY_STRINGOP_ALGS};
static const
struct processor_costs pentiumpro_cost = {
{
/* Start of register allocator costs. integer->integer move cost is 2. */
2, /* cost for loading QImode using movzbl */
{4, 4, 4}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{2, 2, 2}, /* cost of storing integer registers */
2, /* cost of reg,reg fld/fst */
{2, 2, 6}, /* cost of loading fp registers
in SFmode, DFmode and XFmode */
{4, 4, 6}, /* cost of storing fp registers
in SFmode, DFmode and XFmode */
2, /* cost of moving MMX register */
{2, 2}, /* cost of loading MMX registers
in SImode and DImode */
{2, 2}, /* cost of storing MMX registers
in SImode and DImode */
2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
{4, 8, 16, 32, 64}, /* cost of loading SSE registers
in 32,64,128,256 and 512-bit */
{4, 8, 16, 32, 64}, /* cost of storing SSE registers
in 32,64,128,256 and 512-bit */
3, 3, /* SSE->integer and integer->SSE moves */
3, 3, /* mask->integer and integer->mask moves */
{4, 4, 4}, /* cost of loading mask register
in QImode, HImode, SImode. */
{2, 2, 2}, /* cost if storing mask register
in QImode, HImode, SImode. */
2, /* cost of moving mask register. */
/* End of register allocator costs. */
},
COSTS_N_INSNS (1), /* cost of an add instruction */
COSTS_N_INSNS (1), /* cost of a lea instruction */
COSTS_N_INSNS (1), /* variable shift costs */
COSTS_N_INSNS (1), /* constant shift costs */
{COSTS_N_INSNS (4), /* cost of starting multiply for QI */
COSTS_N_INSNS (4), /* HI */
COSTS_N_INSNS (4), /* SI */
COSTS_N_INSNS (4), /* DI */
COSTS_N_INSNS (4)}, /* other */
0, /* cost of multiply per each bit set */
{COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
COSTS_N_INSNS (17), /* HI */
COSTS_N_INSNS (17), /* SI */
COSTS_N_INSNS (17), /* DI */
COSTS_N_INSNS (17)}, /* other */
COSTS_N_INSNS (1), /* cost of movsx */
COSTS_N_INSNS (1), /* cost of movzx */
8, /* "large" insn */
6, /* MOVE_RATIO */
6, /* CLEAR_RATIO */
{4, 4, 4}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{2, 2, 2}, /* cost of storing integer registers */
{4, 8, 16, 32, 64}, /* cost of loading SSE register
in 32bit, 64bit, 128bit, 256bit and 512bit */
{4, 8, 16, 32, 64}, /* cost of storing SSE register
in 32bit, 64bit, 128bit, 256bit and 512bit */
{4, 8, 16, 32, 64}, /* cost of unaligned loads. */
{4, 8, 16, 32, 64}, /* cost of unaligned stores. */
2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
3, /* cost of moving SSE register to integer. */
4, 4, /* Gather load static, per_elt. */
4, 4, /* Gather store static, per_elt. */
8, /* size of l1 cache. */
256, /* size of l2 cache */
32, /* size of prefetch block */
6, /* number of parallel prefetches */
2, /* Branch cost */
COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
COSTS_N_INSNS (5), /* cost of FMUL instruction. */
COSTS_N_INSNS (56), /* cost of FDIV instruction. */
COSTS_N_INSNS (2), /* cost of FABS instruction. */
COSTS_N_INSNS (2), /* cost of FCHS instruction. */
COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
COSTS_N_INSNS (4), /* cost of MULSS instruction. */
COSTS_N_INSNS (4), /* cost of MULSD instruction. */
COSTS_N_INSNS (7), /* cost of FMA SS instruction. */
COSTS_N_INSNS (7), /* cost of FMA SD instruction. */
COSTS_N_INSNS (18), /* cost of DIVSS instruction. */
COSTS_N_INSNS (18), /* cost of DIVSD instruction. */
COSTS_N_INSNS (31), /* cost of SQRTSS instruction. */
COSTS_N_INSNS (31), /* cost of SQRTSD instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
pentiumpro_memcpy,
pentiumpro_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
"16", /* Loop alignment. */
"16:11:8", /* Jump alignment. */
"0:0:8", /* Label alignment. */
"16", /* Func alignment. */
};
static stringop_algs geode_memcpy[2] = {
{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
DUMMY_STRINGOP_ALGS};
static stringop_algs geode_memset[2] = {
{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
DUMMY_STRINGOP_ALGS};
static const
struct processor_costs geode_cost = {
{
/* Start of register allocator costs. integer->integer move cost is 2. */
2, /* cost for loading QImode using movzbl */
{2, 2, 2}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{2, 2, 2}, /* cost of storing integer registers */
2, /* cost of reg,reg fld/fst */
{2, 2, 2}, /* cost of loading fp registers
in SFmode, DFmode and XFmode */
{4, 6, 6}, /* cost of storing fp registers
in SFmode, DFmode and XFmode */
2, /* cost of moving MMX register */
{2, 2}, /* cost of loading MMX registers
in SImode and DImode */
{2, 2}, /* cost of storing MMX registers
in SImode and DImode */
2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
{2, 2, 8, 16, 32}, /* cost of loading SSE registers
in 32,64,128,256 and 512-bit */
{2, 2, 8, 16, 32}, /* cost of storing SSE registers
in 32,64,128,256 and 512-bit */
6, 6, /* SSE->integer and integer->SSE moves */
6, 6, /* mask->integer and integer->mask moves */
{2, 2, 2}, /* cost of loading mask register
in QImode, HImode, SImode. */
{2, 2, 2}, /* cost if storing mask register
in QImode, HImode, SImode. */
2, /* cost of moving mask register. */
/* End of register allocator costs. */
},
COSTS_N_INSNS (1), /* cost of an add instruction */
COSTS_N_INSNS (1), /* cost of a lea instruction */
COSTS_N_INSNS (2), /* variable shift costs */
COSTS_N_INSNS (1), /* constant shift costs */
{COSTS_N_INSNS (3), /* cost of starting multiply for QI */
COSTS_N_INSNS (4), /* HI */
COSTS_N_INSNS (7), /* SI */
COSTS_N_INSNS (7), /* DI */
COSTS_N_INSNS (7)}, /* other */
0, /* cost of multiply per each bit set */
{COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
COSTS_N_INSNS (23), /* HI */
COSTS_N_INSNS (39), /* SI */
COSTS_N_INSNS (39), /* DI */
COSTS_N_INSNS (39)}, /* other */
COSTS_N_INSNS (1), /* cost of movsx */
COSTS_N_INSNS (1), /* cost of movzx */
8, /* "large" insn */
4, /* MOVE_RATIO */
4, /* CLEAR_RATIO */
{2, 2, 2}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{2, 2, 2}, /* cost of storing integer registers */
{2, 2, 8, 16, 32}, /* cost of loading SSE register
in 32bit, 64bit, 128bit, 256bit and 512bit */
{2, 2, 8, 16, 32}, /* cost of storing SSE register
in 32bit, 64bit, 128bit, 256bit and 512bit */
{2, 2, 8, 16, 32}, /* cost of unaligned loads. */
{2, 2, 8, 16, 32}, /* cost of unaligned stores. */
2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
6, /* cost of moving SSE register to integer. */
2, 2, /* Gather load static, per_elt. */
2, 2, /* Gather store static, per_elt. */
64, /* size of l1 cache. */
128, /* size of l2 cache. */
32, /* size of prefetch block */
1, /* number of parallel prefetches */
1, /* Branch cost */
COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
COSTS_N_INSNS (11), /* cost of FMUL instruction. */
COSTS_N_INSNS (47), /* cost of FDIV instruction. */
COSTS_N_INSNS (1), /* cost of FABS instruction. */
COSTS_N_INSNS (1), /* cost of FCHS instruction. */
COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */
COSTS_N_INSNS (11), /* cost of MULSS instruction. */
COSTS_N_INSNS (11), /* cost of MULSD instruction. */
COSTS_N_INSNS (17), /* cost of FMA SS instruction. */
COSTS_N_INSNS (17), /* cost of FMA SD instruction. */
COSTS_N_INSNS (47), /* cost of DIVSS instruction. */
COSTS_N_INSNS (47), /* cost of DIVSD instruction. */
COSTS_N_INSNS (54), /* cost of SQRTSS instruction. */
COSTS_N_INSNS (54), /* cost of SQRTSD instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
geode_memcpy,
geode_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
NULL, /* Loop alignment. */
NULL, /* Jump alignment. */
NULL, /* Label alignment. */
NULL, /* Func alignment. */
};
static stringop_algs k6_memcpy[2] = {
{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
DUMMY_STRINGOP_ALGS};
static stringop_algs k6_memset[2] = {
{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
DUMMY_STRINGOP_ALGS};
static const
struct processor_costs k6_cost = {
{
/* Start of register allocator costs. integer->integer move cost is 2. */
3, /* cost for loading QImode using movzbl */
{4, 5, 4}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{2, 3, 2}, /* cost of storing integer registers */
4, /* cost of reg,reg fld/fst */
{6, 6, 6}, /* cost of loading fp registers
in SFmode, DFmode and XFmode */
{4, 4, 4}, /* cost of storing fp registers
in SFmode, DFmode and XFmode */
2, /* cost of moving MMX register */
{2, 2}, /* cost of loading MMX registers
in SImode and DImode */
{2, 2}, /* cost of storing MMX registers
in SImode and DImode */
2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
{2, 2, 8, 16, 32}, /* cost of loading SSE registers
in 32,64,128,256 and 512-bit */
{2, 2, 8, 16, 32}, /* cost of storing SSE registers
in 32,64,128,256 and 512-bit */
6, 6, /* SSE->integer and integer->SSE moves */
6, 6, /* mask->integer and integer->mask moves */
{4, 5, 4}, /* cost of loading mask register
in QImode, HImode, SImode. */
{2, 3, 2}, /* cost if storing mask register
in QImode, HImode, SImode. */
2, /* cost of moving mask register. */
/* End of register allocator costs. */
},
COSTS_N_INSNS (1), /* cost of an add instruction */
COSTS_N_INSNS (2), /* cost of a lea instruction */
COSTS_N_INSNS (1), /* variable shift costs */
COSTS_N_INSNS (1), /* constant shift costs */
{COSTS_N_INSNS (3), /* cost of starting multiply for QI */
COSTS_N_INSNS (3), /* HI */
COSTS_N_INSNS (3), /* SI */
COSTS_N_INSNS (3), /* DI */
COSTS_N_INSNS (3)}, /* other */
0, /* cost of multiply per each bit set */
{COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
COSTS_N_INSNS (18), /* HI */
COSTS_N_INSNS (18), /* SI */
COSTS_N_INSNS (18), /* DI */
COSTS_N_INSNS (18)}, /* other */
COSTS_N_INSNS (2), /* cost of movsx */
COSTS_N_INSNS (2), /* cost of movzx */
8, /* "large" insn */
4, /* MOVE_RATIO */
4, /* CLEAR_RATIO */
{4, 5, 4}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{2, 3, 2}, /* cost of storing integer registers */
{2, 2, 8, 16, 32}, /* cost of loading SSE register
in 32bit, 64bit, 128bit, 256bit and 512bit */
{2, 2, 8, 16, 32}, /* cost of storing SSE register
in 32bit, 64bit, 128bit, 256bit and 512bit */
{2, 2, 8, 16, 32}, /* cost of unaligned loads. */
{2, 2, 8, 16, 32}, /* cost of unaligned stores. */
2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
6, /* cost of moving SSE register to integer. */
2, 2, /* Gather load static, per_elt. */
2, 2, /* Gather store static, per_elt. */
32, /* size of l1 cache. */
32, /* size of l2 cache. Some models
have integrated l2 cache, but
optimizing for k6 is not important
enough to worry about that. */
32, /* size of prefetch block */
1, /* number of parallel prefetches */
1, /* Branch cost */
COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
COSTS_N_INSNS (2), /* cost of FMUL instruction. */
COSTS_N_INSNS (56), /* cost of FDIV instruction. */
COSTS_N_INSNS (2), /* cost of FABS instruction. */
COSTS_N_INSNS (2), /* cost of FCHS instruction. */
COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
COSTS_N_INSNS (2), /* cost of ADDSS/SD SUBSS/SD insns. */
COSTS_N_INSNS (2), /* cost of MULSS instruction. */
COSTS_N_INSNS (2), /* cost of MULSD instruction. */
COSTS_N_INSNS (4), /* cost of FMA SS instruction. */
COSTS_N_INSNS (4), /* cost of FMA SD instruction. */
COSTS_N_INSNS (56), /* cost of DIVSS instruction. */
COSTS_N_INSNS (56), /* cost of DIVSD instruction. */
COSTS_N_INSNS (56), /* cost of SQRTSS instruction. */
COSTS_N_INSNS (56), /* cost of SQRTSD instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
k6_memcpy,
k6_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
"32:8:8", /* Loop alignment. */
"32:8:8", /* Jump alignment. */
"0:0:8", /* Label alignment. */
"32", /* Func alignment. */
};
/* For some reason, Athlon deals better with REP prefix (relative to loops)
compared to K8. Alignment becomes important after 8 bytes for memcpy and
128 bytes for memset. */
static stringop_algs athlon_memcpy[2] = {
{libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
DUMMY_STRINGOP_ALGS};
static stringop_algs athlon_memset[2] = {
{libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
DUMMY_STRINGOP_ALGS};
static const
struct processor_costs athlon_cost = {
{
/* Start of register allocator costs. integer->integer move cost is 2. */
4, /* cost for loading QImode using movzbl */
{3, 4, 3}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{3, 4, 3}, /* cost of storing integer registers */
4, /* cost of reg,reg fld/fst */
{4, 4, 12}, /* cost of loading fp registers
in SFmode, DFmode and XFmode */
{6, 6, 8}, /* cost of storing fp registers
in SFmode, DFmode and XFmode */
2, /* cost of moving MMX register */
{4, 4}, /* cost of loading MMX registers
in SImode and DImode */
{4, 4}, /* cost of storing MMX registers
in SImode and DImode */
2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
{4, 4, 12, 12, 24}, /* cost of loading SSE registers
in 32,64,128,256 and 512-bit */
{4, 4, 10, 10, 20}, /* cost of storing SSE registers
in 32,64,128,256 and 512-bit */
5, 5, /* SSE->integer and integer->SSE moves */
5, 5, /* mask->integer and integer->mask moves */
{3, 4, 3}, /* cost of loading mask register
in QImode, HImode, SImode. */
{3, 4, 3}, /* cost if storing mask register
in QImode, HImode, SImode. */
2, /* cost of moving mask register. */
/* End of register allocator costs. */
},
COSTS_N_INSNS (1), /* cost of an add instruction */
COSTS_N_INSNS (2), /* cost of a lea instruction */
COSTS_N_INSNS (1), /* variable shift costs */
COSTS_N_INSNS (1), /* constant shift costs */
{COSTS_N_INSNS (5), /* cost of starting multiply for QI */
COSTS_N_INSNS (5), /* HI */
COSTS_N_INSNS (5), /* SI */
COSTS_N_INSNS (5), /* DI */
COSTS_N_INSNS (5)}, /* other */
0, /* cost of multiply per each bit set */
{COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
COSTS_N_INSNS (26), /* HI */
COSTS_N_INSNS (42), /* SI */
COSTS_N_INSNS (74), /* DI */
COSTS_N_INSNS (74)}, /* other */
COSTS_N_INSNS (1), /* cost of movsx */
COSTS_N_INSNS (1), /* cost of movzx */
8, /* "large" insn */
9, /* MOVE_RATIO */
6, /* CLEAR_RATIO */
{3, 4, 3}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{3, 4, 3}, /* cost of storing integer registers */
{4, 4, 12, 12, 24}, /* cost of loading SSE register
in 32bit, 64bit, 128bit, 256bit and 512bit */
{4, 4, 10, 10, 20}, /* cost of storing SSE register
in 32bit, 64bit, 128bit, 256bit and 512bit */
{4, 4, 12, 12, 24}, /* cost of unaligned loads. */
{4, 4, 10, 10, 20}, /* cost of unaligned stores. */
2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
5, /* cost of moving SSE register to integer. */
4, 4, /* Gather load static, per_elt. */
4, 4, /* Gather store static, per_elt. */
64, /* size of l1 cache. */
256, /* size of l2 cache. */
64, /* size of prefetch block */
6, /* number of parallel prefetches */
5, /* Branch cost */
COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
COSTS_N_INSNS (4), /* cost of FMUL instruction. */
COSTS_N_INSNS (24), /* cost of FDIV instruction. */
COSTS_N_INSNS (2), /* cost of FABS instruction. */
COSTS_N_INSNS (2), /* cost of FCHS instruction. */
COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
COSTS_N_INSNS (4), /* cost of MULSS instruction. */
COSTS_N_INSNS (4), /* cost of MULSD instruction. */
COSTS_N_INSNS (8), /* cost of FMA SS instruction. */
COSTS_N_INSNS (8), /* cost of FMA SD instruction. */
/* 11-16 */
COSTS_N_INSNS (16), /* cost of DIVSS instruction. */
COSTS_N_INSNS (24), /* cost of DIVSD instruction. */
COSTS_N_INSNS (19), /* cost of SQRTSS instruction. */
COSTS_N_INSNS (19), /* cost of SQRTSD instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
athlon_memcpy,
athlon_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
"16:8:8", /* Loop alignment. */
"16:8:8", /* Jump alignment. */
"0:0:8", /* Label alignment. */
"16", /* Func alignment. */
};
/* K8 has optimized REP instruction for medium sized blocks, but for very
small blocks it is better to use loop. For large blocks, libcall can
do nontemporary accesses and beat inline considerably. */
static stringop_algs k8_memcpy[2] = {
{libcall, {{6, loop, false}, {14, unrolled_loop, false},
{-1, rep_prefix_4_byte, false}}},
{libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
{-1, libcall, false}}}};
static stringop_algs k8_memset[2] = {
{libcall, {{8, loop, false}, {24, unrolled_loop, false},
{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
{libcall, {{48, unrolled_loop, false},
{8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
static const
struct processor_costs k8_cost = {
{
/* Start of register allocator costs. integer->integer move cost is 2. */
4, /* cost for loading QImode using movzbl */
{3, 4, 3}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{3, 4, 3}, /* cost of storing integer registers */
4, /* cost of reg,reg fld/fst */
{4, 4, 12}, /* cost of loading fp registers
in SFmode, DFmode and XFmode */
{6, 6, 8}, /* cost of storing fp registers
in SFmode, DFmode and XFmode */
2, /* cost of moving MMX register */
{3, 3}, /* cost of loading MMX registers
in SImode and DImode */
{4, 4}, /* cost of storing MMX registers
in SImode and DImode */
2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
{4, 3, 12, 12, 24}, /* cost of loading SSE registers
in 32,64,128,256 and 512-bit */
{4, 4, 10, 10, 20}, /* cost of storing SSE registers
in 32,64,128,256 and 512-bit */
5, 5, /* SSE->integer and integer->SSE moves */
5, 5, /* mask->integer and integer->mask moves */
{3, 4, 3}, /* cost of loading mask register
in QImode, HImode, SImode. */
{3, 4, 3}, /* cost if storing mask register
in QImode, HImode, SImode. */
2, /* cost of moving mask register. */
/* End of register allocator costs. */
},
COSTS_N_INSNS (1), /* cost of an add instruction */
COSTS_N_INSNS (2), /* cost of a lea instruction */
COSTS_N_INSNS (1), /* variable shift costs */
COSTS_N_INSNS (1), /* constant shift costs */
{COSTS_N_INSNS (3), /* cost of starting multiply for QI */
COSTS_N_INSNS (4), /* HI */
COSTS_N_INSNS (3), /* SI */
COSTS_N_INSNS (4), /* DI */
COSTS_N_INSNS (5)}, /* other */
0, /* cost of multiply per each bit set */
{COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
COSTS_N_INSNS (26), /* HI */
COSTS_N_INSNS (42), /* SI */
COSTS_N_INSNS (74), /* DI */
COSTS_N_INSNS (74)}, /* other */
COSTS_N_INSNS (1), /* cost of movsx */
COSTS_N_INSNS (1), /* cost of movzx */
8, /* "large" insn */
9, /* MOVE_RATIO */
6, /* CLEAR_RATIO */
{3, 4, 3}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{3, 4, 3}, /* cost of storing integer registers */
{4, 3, 12, 12, 24}, /* cost of loading SSE register
in 32bit, 64bit, 128bit, 256bit and 512bit */
{4, 4, 10, 10, 20}, /* cost of storing SSE register
in 32bit, 64bit, 128bit, 256bit and 512bit */
{4, 3, 12, 12, 24}, /* cost of unaligned loads. */
{4, 4, 10, 10, 20}, /* cost of unaligned stores. */
2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
5, /* cost of moving SSE register to integer. */
4, 4, /* Gather load static, per_elt. */
4, 4, /* Gather store static, per_elt. */
64, /* size of l1 cache. */
512, /* size of l2 cache. */
64, /* size of prefetch block */
/* New AMD processors never drop prefetches; if they cannot be performed
immediately, they are queued. We set number of simultaneous prefetches
to a large constant to reflect this (it probably is not a good idea not
to limit number of prefetches at all, as their execution also takes some
time). */
100, /* number of parallel prefetches */
3, /* Branch cost */
COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
COSTS_N_INSNS (4), /* cost of FMUL instruction. */
COSTS_N_INSNS (19), /* cost of FDIV instruction. */
COSTS_N_INSNS (2), /* cost of FABS instruction. */
COSTS_N_INSNS (2), /* cost of FCHS instruction. */
COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
COSTS_N_INSNS (4), /* cost of MULSS instruction. */
COSTS_N_INSNS (4), /* cost of MULSD instruction. */
COSTS_N_INSNS (8), /* cost of FMA SS instruction. */
COSTS_N_INSNS (8), /* cost of FMA SD instruction. */
/* 11-16 */
COSTS_N_INSNS (16), /* cost of DIVSS instruction. */
COSTS_N_INSNS (20), /* cost of DIVSD instruction. */
COSTS_N_INSNS (19), /* cost of SQRTSS instruction. */
COSTS_N_INSNS (27), /* cost of SQRTSD instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
k8_memcpy,
k8_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
"16:8:8", /* Loop alignment. */
"16:8:8", /* Jump alignment. */
"0:0:8", /* Label alignment. */
"16", /* Func alignment. */
};
/* AMDFAM10 has optimized REP instruction for medium sized blocks, but for
very small blocks it is better to use loop. For large blocks, libcall can
do nontemporary accesses and beat inline considerably. */
static stringop_algs amdfam10_memcpy[2] = {
{libcall, {{6, loop, false}, {14, unrolled_loop, false},
{-1, rep_prefix_4_byte, false}}},
{libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
{-1, libcall, false}}}};
static stringop_algs amdfam10_memset[2] = {
{libcall, {{8, loop, false}, {24, unrolled_loop, false},
{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
{libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
{-1, libcall, false}}}};
struct processor_costs amdfam10_cost = {
{
/* Start of register allocator costs. integer->integer move cost is 2. */
4, /* cost for loading QImode using movzbl */
{3, 4, 3}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{3, 4, 3}, /* cost of storing integer registers */
4, /* cost of reg,reg fld/fst */
{4, 4, 12}, /* cost of loading fp registers
in SFmode, DFmode and XFmode */
{6, 6, 8}, /* cost of storing fp registers
in SFmode, DFmode and XFmode */
2, /* cost of moving MMX register */
{3, 3}, /* cost of loading MMX registers
in SImode and DImode */
{4, 4}, /* cost of storing MMX registers
in SImode and DImode */
2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
{4, 4, 3, 6, 12}, /* cost of loading SSE registers
in 32,64,128,256 and 512-bit */
{4, 4, 5, 10, 20}, /* cost of storing SSE registers
in 32,64,128,256 and 512-bit */
3, 3, /* SSE->integer and integer->SSE moves */
3, 3, /* mask->integer and integer->mask moves */
{3, 4, 3}, /* cost of loading mask register
in QImode, HImode, SImode. */
{3, 4, 3}, /* cost if storing mask register
in QImode, HImode, SImode. */
2, /* cost of moving mask register. */
/* On K8:
MOVD reg64, xmmreg Double FSTORE 4
MOVD reg32, xmmreg Double FSTORE 4
On AMDFAM10:
MOVD reg64, xmmreg Double FADD 3
1/1 1/1
MOVD reg32, xmmreg Double FADD 3
1/1 1/1 */
/* End of register allocator costs. */
},
COSTS_N_INSNS (1), /* cost of an add instruction */
COSTS_N_INSNS (2), /* cost of a lea instruction */
COSTS_N_INSNS (1), /* variable shift costs */
COSTS_N_INSNS (1), /* constant shift costs */
{COSTS_N_INSNS (3), /* cost of starting multiply for QI */
COSTS_N_INSNS (4), /* HI */
COSTS_N_INSNS (3), /* SI */
COSTS_N_INSNS (4), /* DI */
COSTS_N_INSNS (5)}, /* other */
0, /* cost of multiply per each bit set */
{COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
COSTS_N_INSNS (35), /* HI */
COSTS_N_INSNS (51), /* SI */
COSTS_N_INSNS (83), /* DI */
COSTS_N_INSNS (83)}, /* other */
COSTS_N_INSNS (1), /* cost of movsx */
COSTS_N_INSNS (1), /* cost of movzx */
8, /* "large" insn */
9, /* MOVE_RATIO */
6, /* CLEAR_RATIO */
{3, 4, 3}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{3, 4, 3}, /* cost of storing integer registers */
{4, 4, 3, 6, 12}, /* cost of loading SSE register
in 32bit, 64bit, 128bit, 256bit and 512bit */
{4, 4, 5, 10, 20}, /* cost of storing SSE register
in 32bit, 64bit, 128bit, 256bit and 512bit */
{4, 4, 3, 7, 12}, /* cost of unaligned loads. */
{4, 4, 5, 10, 20}, /* cost of unaligned stores. */
2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
3, /* cost of moving SSE register to integer. */
4, 4, /* Gather load static, per_elt. */
4, 4, /* Gather store static, per_elt. */
64, /* size of l1 cache. */
512, /* size of l2 cache. */
64, /* size of prefetch block */
/* New AMD processors never drop prefetches; if they cannot be performed
immediately, they are queued. We set number of simultaneous prefetches
to a large constant to reflect this (it probably is not a good idea not
to limit number of prefetches at all, as their execution also takes some
time). */
100, /* number of parallel prefetches */
2, /* Branch cost */
COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
COSTS_N_INSNS (4), /* cost of FMUL instruction. */
COSTS_N_INSNS (19), /* cost of FDIV instruction. */
COSTS_N_INSNS (2), /* cost of FABS instruction. */
COSTS_N_INSNS (2), /* cost of FCHS instruction. */
COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
COSTS_N_INSNS (4), /* cost of MULSS instruction. */
COSTS_N_INSNS (4), /* cost of MULSD instruction. */
COSTS_N_INSNS (8), /* cost of FMA SS instruction. */
COSTS_N_INSNS (8), /* cost of FMA SD instruction. */
/* 11-16 */
COSTS_N_INSNS (16), /* cost of DIVSS instruction. */
COSTS_N_INSNS (20), /* cost of DIVSD instruction. */
COSTS_N_INSNS (19), /* cost of SQRTSS instruction. */
COSTS_N_INSNS (27), /* cost of SQRTSD instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
amdfam10_memcpy,
amdfam10_memset,
COSTS_N_INSNS (2), /* cond_taken_branch_cost. */
COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
"32:25:8", /* Loop alignment. */
"32:8:8", /* Jump alignment. */
"0:0:8", /* Label alignment. */
"32", /* Func alignment. */
};
/* BDVER has optimized REP instruction for medium sized blocks, but for
very small blocks it is better to use loop. For large blocks, libcall
can do nontemporary accesses and beat inline considerably. */
static stringop_algs bdver_memcpy[2] = {
{libcall, {{6, loop, false}, {14, unrolled_loop, false},
{-1, rep_prefix_4_byte, false}}},
{libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
{-1, libcall, false}}}};
static stringop_algs bdver_memset[2] = {
{libcall, {{8, loop, false}, {24, unrolled_loop, false},
{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
{libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
{-1, libcall, false}}}};
const struct processor_costs bdver_cost = {
{
/* Start of register allocator costs. integer->integer move cost is 2. */
8, /* cost for loading QImode using movzbl */
{8, 8, 8}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{8, 8, 8}, /* cost of storing integer registers */
4, /* cost of reg,reg fld/fst */
{12, 12, 28}, /* cost of loading fp registers
in SFmode, DFmode and XFmode */
{10, 10, 18}, /* cost of storing fp registers
in SFmode, DFmode and XFmode */
4, /* cost of moving MMX register */
{12, 12}, /* cost of loading MMX registers
in SImode and DImode */
{10, 10}, /* cost of storing MMX registers
in SImode and DImode */
2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
{12, 12, 10, 40, 60}, /* cost of loading SSE registers
in 32,64,128,256 and 512-bit */
{10, 10, 10, 40, 60}, /* cost of storing SSE registers
in 32,64,128,256 and 512-bit */
16, 20, /* SSE->integer and integer->SSE moves */
16, 20, /* mask->integer and integer->mask moves */
{8, 8, 8}, /* cost of loading mask register
in QImode, HImode, SImode. */
{8, 8, 8}, /* cost if storing mask register
in QImode, HImode, SImode. */
2, /* cost of moving mask register. */
/* End of register allocator costs. */
},
COSTS_N_INSNS (1), /* cost of an add instruction */
COSTS_N_INSNS (1), /* cost of a lea instruction */
COSTS_N_INSNS (1), /* variable shift costs */
COSTS_N_INSNS (1), /* constant shift costs */
{COSTS_N_INSNS (4), /* cost of starting multiply for QI */
COSTS_N_INSNS (4), /* HI */
COSTS_N_INSNS (4), /* SI */
COSTS_N_INSNS (6), /* DI */
COSTS_N_INSNS (6)}, /* other */
0, /* cost of multiply per each bit set */
{COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
COSTS_N_INSNS (35), /* HI */
COSTS_N_INSNS (51), /* SI */
COSTS_N_INSNS (83), /* DI */
COSTS_N_INSNS (83)}, /* other */
COSTS_N_INSNS (1), /* cost of movsx */
COSTS_N_INSNS (1), /* cost of movzx */
8, /* "large" insn */
9, /* MOVE_RATIO */
6, /* CLEAR_RATIO */
{8, 8, 8}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{8, 8, 8}, /* cost of storing integer registers */
{12, 12, 10, 40, 60}, /* cost of loading SSE register
in 32bit, 64bit, 128bit, 256bit and 512bit */
{10, 10, 10, 40, 60}, /* cost of storing SSE register
in 32bit, 64bit, 128bit, 256bit and 512bit */
{12, 12, 10, 40, 60}, /* cost of unaligned loads. */
{10, 10, 10, 40, 60}, /* cost of unaligned stores. */
2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
16, /* cost of moving SSE register to integer. */
12, 12, /* Gather load static, per_elt. */
10, 10, /* Gather store static, per_elt. */
16, /* size of l1 cache. */
2048, /* size of l2 cache. */
64, /* size of prefetch block */
/* New AMD processors never drop prefetches; if they cannot be performed
immediately, they are queued. We set number of simultaneous prefetches
to a large constant to reflect this (it probably is not a good idea not
to limit number of prefetches at all, as their execution also takes some
time). */
100, /* number of parallel prefetches */
2, /* Branch cost */
COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
COSTS_N_INSNS (6), /* cost of FMUL instruction. */
COSTS_N_INSNS (42), /* cost of FDIV instruction. */
COSTS_N_INSNS (2), /* cost of FABS instruction. */
COSTS_N_INSNS (2), /* cost of FCHS instruction. */
COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */
COSTS_N_INSNS (6), /* cost of MULSS instruction. */
COSTS_N_INSNS (6), /* cost of MULSD instruction. */
COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
/* 9-24 */
COSTS_N_INSNS (24), /* cost of DIVSS instruction. */
/* 9-27 */
COSTS_N_INSNS (27), /* cost of DIVSD instruction. */
COSTS_N_INSNS (15), /* cost of SQRTSS instruction. */
COSTS_N_INSNS (26), /* cost of SQRTSD instruction. */
1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
bdver_memcpy,
bdver_memset,
COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
"16:11:8", /* Loop alignment. */
"16:8:8", /* Jump alignment. */
"0:0:8", /* Label alignment. */
"11", /* Func alignment. */
};
/* ZNVER1 has optimized REP instruction for medium sized blocks, but for
very small blocks it is better to use loop. For large blocks, libcall
can do nontemporary accesses and beat inline considerably. */
static stringop_algs znver1_memcpy[2] = {
/* 32-bit tuning. */
{libcall, {{6, loop, false},
{14, unrolled_loop, false},
{-1, libcall, false}}},
/* 64-bit tuning. */
{libcall, {{16, loop, false},
{128, rep_prefix_8_byte, false},
{-1, libcall, false}}}};
static stringop_algs znver1_memset[2] = {
/* 32-bit tuning. */
{libcall, {{8, loop, false},
{24, unrolled_loop, false},
{128, rep_prefix_4_byte, false},
{-1, libcall, false}}},
/* 64-bit tuning. */
{libcall, {{48, unrolled_loop, false},
{128, rep_prefix_8_byte, false},
{-1, libcall, false}}}};
struct processor_costs znver1_cost = {
{
/* Start of register allocator costs. integer->integer move cost is 2. */
/* reg-reg moves are done by renaming and thus they are even cheaper than
1 cycle. Becuase reg-reg move cost is 2 and the following tables correspond
to doubles of latencies, we do not model this correctly. It does not
seem to make practical difference to bump prices up even more. */
6, /* cost for loading QImode using
movzbl. */
{6, 6, 6}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{8, 8, 8}, /* cost of storing integer
registers. */
2, /* cost of reg,reg fld/fst. */
{6, 6, 16}, /* cost of loading fp registers
in SFmode, DFmode and XFmode. */
{8, 8, 16}, /* cost of storing fp registers
in SFmode, DFmode and XFmode. */
2, /* cost of moving MMX register. */
{6, 6}, /* cost of loading MMX registers
in SImode and DImode. */
{8, 8}, /* cost of storing MMX registers
in SImode and DImode. */
2, 3, 6, /* cost of moving XMM,YMM,ZMM register. */
{6, 6, 6, 12, 24}, /* cost of loading SSE registers
in 32,64,128,256 and 512-bit. */
{8, 8, 8, 16, 32}, /* cost of storing SSE registers
in 32,64,128,256 and 512-bit. */
6, 6, /* SSE->integer and integer->SSE moves. */
8, 8, /* mask->integer and integer->mask moves */
{6, 6, 6}, /* cost of loading mask register
in QImode, HImode, SImode. */
{8, 8, 8}, /* cost if storing mask register
in QImode, HImode, SImode. */
2, /* cost of moving mask register. */
/* End of register allocator costs. */
},
COSTS_N_INSNS (1), /* cost of an add instruction. */
COSTS_N_INSNS (1), /* cost of a lea instruction. */
COSTS_N_INSNS (1), /* variable shift costs. */
COSTS_N_INSNS (1), /* constant shift costs. */
{COSTS_N_INSNS (3), /* cost of starting multiply for QI. */
COSTS_N_INSNS (3), /* HI. */
COSTS_N_INSNS (3), /* SI. */
COSTS_N_INSNS (3), /* DI. */
COSTS_N_INSNS (3)}, /* other. */
0, /* cost of multiply per each bit
set. */
/* Depending on parameters, idiv can get faster on ryzen. This is upper
bound. */
{COSTS_N_INSNS (16), /* cost of a divide/mod for QI. */
COSTS_N_INSNS (22), /* HI. */
COSTS_N_INSNS (30), /* SI. */
COSTS_N_INSNS (45), /* DI. */
COSTS_N_INSNS (45)}, /* other. */
COSTS_N_INSNS (1), /* cost of movsx. */
COSTS_N_INSNS (1), /* cost of movzx. */
8, /* "large" insn. */
9, /* MOVE_RATIO. */
6, /* CLEAR_RATIO */
{6, 6, 6}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{8, 8, 8}, /* cost of storing integer
registers. */
{6, 6, 6, 12, 24}, /* cost of loading SSE register
in 32bit, 64bit, 128bit, 256bit and 512bit */
{8, 8, 8, 16, 32}, /* cost of storing SSE register
in 32bit, 64bit, 128bit, 256bit and 512bit */
{6, 6, 6, 12, 24}, /* cost of unaligned loads. */
{8, 8, 8, 16, 32}, /* cost of unaligned stores. */
2, 3, 6, /* cost of moving XMM,YMM,ZMM register. */
6, /* cost of moving SSE register to integer. */
/* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
throughput 12. Approx 9 uops do not depend on vector size and every load
is 7 uops. */
18, 8, /* Gather load static, per_elt. */
18, 10, /* Gather store static, per_elt. */
32, /* size of l1 cache. */
512, /* size of l2 cache. */
64, /* size of prefetch block. */
/* New AMD processors never drop prefetches; if they cannot be performed
immediately, they are queued. We set number of simultaneous prefetches
to a large constant to reflect this (it probably is not a good idea not
to limit number of prefetches at all, as their execution also takes some
time). */
100, /* number of parallel prefetches. */
3, /* Branch cost. */
COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
COSTS_N_INSNS (5), /* cost of FMUL instruction. */
/* Latency of fdiv is 8-15. */
COSTS_N_INSNS (15), /* cost of FDIV instruction. */
COSTS_N_INSNS (1), /* cost of FABS instruction. */
COSTS_N_INSNS (1), /* cost of FCHS instruction. */
/* Latency of fsqrt is 4-10. */
COSTS_N_INSNS (10), /* cost of FSQRT instruction. */
COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
COSTS_N_INSNS (3), /* cost of MULSS instruction. */
COSTS_N_INSNS (4), /* cost of MULSD instruction. */
COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
COSTS_N_INSNS (10), /* cost of DIVSS instruction. */
/* 9-13 */
COSTS_N_INSNS (13), /* cost of DIVSD instruction. */
COSTS_N_INSNS (10), /* cost of SQRTSS instruction. */
COSTS_N_INSNS (15), /* cost of SQRTSD instruction. */
/* Zen can execute 4 integer operations per cycle. FP operations take 3 cycles
and it can execute 2 integer additions and 2 multiplications thus
reassociation may make sense up to with of 6. SPEC2k6 bencharks suggests
that 4 works better than 6 probably due to register pressure.
Integer vector operations are taken by FP unit and execute 3 vector
plus/minus operations per cycle but only one multiply. This is adjusted
in ix86_reassociation_width. */
4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */
znver1_memcpy,
znver1_memset,
COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
"16", /* Loop alignment. */
"16", /* Jump alignment. */
"0:0:8", /* Label alignment. */
"16", /* Func alignment. */
};
/* ZNVER2 has optimized REP instruction for medium sized blocks, but for
very small blocks it is better to use loop. For large blocks, libcall
can do nontemporary accesses and beat inline considerably. */
static stringop_algs znver2_memcpy[2] = {
/* 32-bit tuning. */
{libcall, {{6, loop, false},
{14, unrolled_loop, false},
{-1, libcall, false}}},
/* 64-bit tuning. */
{libcall, {{16, loop, false},
{64, rep_prefix_4_byte, false},
{-1, libcall, false}}}};
static stringop_algs znver2_memset[2] = {
/* 32-bit tuning. */
{libcall, {{8, loop, false},
{24, unrolled_loop, false},
{128, rep_prefix_4_byte, false},
{-1, libcall, false}}},
/* 64-bit tuning. */
{libcall, {{24, rep_prefix_4_byte, false},
{128, rep_prefix_8_byte, false},
{-1, libcall, false}}}};
struct processor_costs znver2_cost = {
{
/* Start of register allocator costs. integer->integer move cost is 2. */
/* reg-reg moves are done by renaming and thus they are even cheaper than
1 cycle. Because reg-reg move cost is 2 and following tables correspond
to doubles of latencies, we do not model this correctly. It does not
seem to make practical difference to bump prices up even more. */
6, /* cost for loading QImode using
movzbl. */
{6, 6, 6}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{8, 8, 8}, /* cost of storing integer
registers. */
2, /* cost of reg,reg fld/fst. */
{6, 6, 16}, /* cost of loading fp registers
in SFmode, DFmode and XFmode. */
{8, 8, 16}, /* cost of storing fp registers
in SFmode, DFmode and XFmode. */
2, /* cost of moving MMX register. */
{6, 6}, /* cost of loading MMX registers
in SImode and DImode. */
{8, 8}, /* cost of storing MMX registers
in SImode and DImode. */
2, 2, 3, /* cost of moving XMM,YMM,ZMM
register. */
{6, 6, 6, 6, 12}, /* cost of loading SSE registers
in 32,64,128,256 and 512-bit. */
{8, 8, 8, 8, 16}, /* cost of storing SSE registers
in 32,64,128,256 and 512-bit. */
6, 6, /* SSE->integer and integer->SSE
moves. */
8, 8, /* mask->integer and integer->mask moves */
{6, 6, 6}, /* cost of loading mask register
in QImode, HImode, SImode. */
{8, 8, 8}, /* cost if storing mask register
in QImode, HImode, SImode. */
2, /* cost of moving mask register. */
/* End of register allocator costs. */
},
COSTS_N_INSNS (1), /* cost of an add instruction. */
COSTS_N_INSNS (1), /* cost of a lea instruction. */
COSTS_N_INSNS (1), /* variable shift costs. */
COSTS_N_INSNS (1), /* constant shift costs. */
{COSTS_N_INSNS (3), /* cost of starting multiply for QI. */
COSTS_N_INSNS (3), /* HI. */
COSTS_N_INSNS (3), /* SI. */
COSTS_N_INSNS (3), /* DI. */
COSTS_N_INSNS (3)}, /* other. */
0, /* cost of multiply per each bit
set. */
/* Depending on parameters, idiv can get faster on ryzen. This is upper
bound. */
{COSTS_N_INSNS (16), /* cost of a divide/mod for QI. */
COSTS_N_INSNS (22), /* HI. */
COSTS_N_INSNS (30), /* SI. */
COSTS_N_INSNS (45), /* DI. */
COSTS_N_INSNS (45)}, /* other. */
COSTS_N_INSNS (1), /* cost of movsx. */
COSTS_N_INSNS (1), /* cost of movzx. */
8, /* "large" insn. */
9, /* MOVE_RATIO. */
6, /* CLEAR_RATIO */
{6, 6, 6}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{8, 8, 8}, /* cost of storing integer
registers. */
{6, 6, 6, 6, 12}, /* cost of loading SSE registers
in 32bit, 64bit, 128bit, 256bit and 512bit */
{8, 8, 8, 8, 16}, /* cost of storing SSE register
in 32bit, 64bit, 128bit, 256bit and 512bit */
{6, 6, 6, 6, 12}, /* cost of unaligned loads. */
{8, 8, 8, 8, 16}, /* cost of unaligned stores. */
2, 2, 3, /* cost of moving XMM,YMM,ZMM
register. */
6, /* cost of moving SSE register to integer. */
/* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
throughput 12. Approx 9 uops do not depend on vector size and every load
is 7 uops. */
18, 8, /* Gather load static, per_elt. */
18, 10, /* Gather store static, per_elt. */
32, /* size of l1 cache. */
512, /* size of l2 cache. */
64, /* size of prefetch block. */
/* New AMD processors never drop prefetches; if they cannot be performed
immediately, they are queued. We set number of simultaneous prefetches
to a large constant to reflect this (it probably is not a good idea not
to limit number of prefetches at all, as their execution also takes some
time). */
100, /* number of parallel prefetches. */
3, /* Branch cost. */
COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
COSTS_N_INSNS (5), /* cost of FMUL instruction. */
/* Latency of fdiv is 8-15. */
COSTS_N_INSNS (15), /* cost of FDIV instruction. */
COSTS_N_INSNS (1), /* cost of FABS instruction. */
COSTS_N_INSNS (1), /* cost of FCHS instruction. */
/* Latency of fsqrt is 4-10. */
COSTS_N_INSNS (10), /* cost of FSQRT instruction. */
COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
COSTS_N_INSNS (3), /* cost of MULSS instruction. */
COSTS_N_INSNS (3), /* cost of MULSD instruction. */
COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
COSTS_N_INSNS (10), /* cost of DIVSS instruction. */
/* 9-13. */
COSTS_N_INSNS (13), /* cost of DIVSD instruction. */
COSTS_N_INSNS (10), /* cost of SQRTSS instruction. */
COSTS_N_INSNS (15), /* cost of SQRTSD instruction. */
/* Zen can execute 4 integer operations per cycle. FP operations
take 3 cycles and it can execute 2 integer additions and 2
multiplications thus reassociation may make sense up to with of 6.
SPEC2k6 bencharks suggests
that 4 works better than 6 probably due to register pressure.
Integer vector operations are taken by FP unit and execute 3 vector
plus/minus operations per cycle but only one multiply. This is adjusted
in ix86_reassociation_width. */
4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */
znver2_memcpy,
znver2_memset,
COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
"16", /* Loop alignment. */
"16", /* Jump alignment. */
"0:0:8", /* Label alignment. */
"16", /* Func alignment. */
};
struct processor_costs znver3_cost = {
{
/* Start of register allocator costs. integer->integer move cost is 2. */
/* reg-reg moves are done by renaming and thus they are even cheaper than
1 cycle. Because reg-reg move cost is 2 and following tables correspond
to doubles of latencies, we do not model this correctly. It does not
seem to make practical difference to bump prices up even more. */
6, /* cost for loading QImode using
movzbl. */
{6, 6, 6}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{8, 8, 8}, /* cost of storing integer
registers. */
2, /* cost of reg,reg fld/fst. */
{6, 6, 16}, /* cost of loading fp registers
in SFmode, DFmode and XFmode. */
{8, 8, 16}, /* cost of storing fp registers
in SFmode, DFmode and XFmode. */
2, /* cost of moving MMX register. */
{6, 6}, /* cost of loading MMX registers
in SImode and DImode. */
{8, 8}, /* cost of storing MMX registers
in SImode and DImode. */
2, 2, 3, /* cost of moving XMM,YMM,ZMM
register. */
{6, 6, 6, 6, 12}, /* cost of loading SSE registers
in 32,64,128,256 and 512-bit. */
{8, 8, 8, 8, 16}, /* cost of storing SSE registers
in 32,64,128,256 and 512-bit. */
6, 6, /* SSE->integer and integer->SSE
moves. */
8, 8, /* mask->integer and integer->mask moves */
{6, 6, 6}, /* cost of loading mask register
in QImode, HImode, SImode. */
{8, 8, 8}, /* cost if storing mask register
in QImode, HImode, SImode. */
2, /* cost of moving mask register. */
/* End of register allocator costs. */
},
COSTS_N_INSNS (1), /* cost of an add instruction. */
COSTS_N_INSNS (1), /* cost of a lea instruction. */
COSTS_N_INSNS (1), /* variable shift costs. */
COSTS_N_INSNS (1), /* constant shift costs. */
{COSTS_N_INSNS (3), /* cost of starting multiply for QI. */
COSTS_N_INSNS (3), /* HI. */
COSTS_N_INSNS (3), /* SI. */
COSTS_N_INSNS (3), /* DI. */
COSTS_N_INSNS (3)}, /* other. */
0, /* cost of multiply per each bit
set. */
{COSTS_N_INSNS (9), /* cost of a divide/mod for QI. */
COSTS_N_INSNS (10), /* HI. */
COSTS_N_INSNS (12), /* SI. */
COSTS_N_INSNS (17), /* DI. */
COSTS_N_INSNS (17)}, /* other. */
COSTS_N_INSNS (1), /* cost of movsx. */
COSTS_N_INSNS (1), /* cost of movzx. */
8, /* "large" insn. */
9, /* MOVE_RATIO. */
6, /* CLEAR_RATIO */
{6, 6, 6}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{8, 8, 8}, /* cost of storing integer
registers. */
{6, 6, 6, 6, 12}, /* cost of loading SSE registers
in 32bit, 64bit, 128bit, 256bit and 512bit */
{8, 8, 8, 8, 16}, /* cost of storing SSE register
in 32bit, 64bit, 128bit, 256bit and 512bit */
{6, 6, 6, 6, 12}, /* cost of unaligned loads. */
{8, 8, 8, 8, 16}, /* cost of unaligned stores. */
2, 2, 3, /* cost of moving XMM,YMM,ZMM
register. */
6, /* cost of moving SSE register to integer. */
/* VGATHERDPD is 15 uops and throughput is 4, VGATHERDPS is 23 uops,
throughput 9. Approx 7 uops do not depend on vector size and every load
is 4 uops. */
14, 8, /* Gather load static, per_elt. */
14, 10, /* Gather store static, per_elt. */
32, /* size of l1 cache. */
512, /* size of l2 cache. */
64, /* size of prefetch block. */
/* New AMD processors never drop prefetches; if they cannot be performed
immediately, they are queued. We set number of simultaneous prefetches
to a large constant to reflect this (it probably is not a good idea not
to limit number of prefetches at all, as their execution also takes some
time). */
100, /* number of parallel prefetches. */
3, /* Branch cost. */
COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
COSTS_N_INSNS (5), /* cost of FMUL instruction. */
/* Latency of fdiv is 8-15. */
COSTS_N_INSNS (15), /* cost of FDIV instruction. */
COSTS_N_INSNS (1), /* cost of FABS instruction. */
COSTS_N_INSNS (1), /* cost of FCHS instruction. */
/* Latency of fsqrt is 4-10. */
COSTS_N_INSNS (10), /* cost of FSQRT instruction. */
COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
COSTS_N_INSNS (3), /* cost of MULSS instruction. */
COSTS_N_INSNS (3), /* cost of MULSD instruction. */
COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
COSTS_N_INSNS (10), /* cost of DIVSS instruction. */
/* 9-13. */
COSTS_N_INSNS (13), /* cost of DIVSD instruction. */
COSTS_N_INSNS (10), /* cost of SQRTSS instruction. */
COSTS_N_INSNS (15), /* cost of SQRTSD instruction. */
/* Zen can execute 4 integer operations per cycle. FP operations
take 3 cycles and it can execute 2 integer additions and 2
multiplications thus reassociation may make sense up to with of 6.
SPEC2k6 bencharks suggests
that 4 works better than 6 probably due to register pressure.
Integer vector operations are taken by FP unit and execute 3 vector
plus/minus operations per cycle but only one multiply. This is adjusted
in ix86_reassociation_width. */
4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */
znver2_memcpy,
znver2_memset,
COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
"16", /* Loop alignment. */
"16", /* Jump alignment. */
"0:0:8", /* Label alignment. */
"16", /* Func alignment. */
};
/* skylake_cost should produce code tuned for Skylake familly of CPUs. */
static stringop_algs skylake_memcpy[2] = {
{libcall,
{{256, rep_prefix_1_byte, true},
{256, loop, false},
{-1, libcall, false}}},
{libcall,
{{256, rep_prefix_1_byte, true},
{256, loop, false},
{-1, libcall, false}}}};
static stringop_algs skylake_memset[2] = {
{libcall,
{{256, rep_prefix_1_byte, true},
{256, loop, false},
{-1, libcall, false}}},
{libcall,
{{256, rep_prefix_1_byte, true},
{256, loop, false},
{-1, libcall, false}}}};
static const
struct processor_costs skylake_cost = {
{
/* Start of register allocator costs. integer->integer move cost is 2. */
6, /* cost for loading QImode using movzbl */
{4, 4, 4}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{6, 6, 6}, /* cost of storing integer registers */
2, /* cost of reg,reg fld/fst */
{6, 6, 8}, /* cost of loading fp registers
in SFmode, DFmode and XFmode */
{6, 6, 10}, /* cost of storing fp registers
in SFmode, DFmode and XFmode */
2, /* cost of moving MMX register */
{6, 6}, /* cost of loading MMX registers
in SImode and DImode */
{6, 6}, /* cost of storing MMX registers
in SImode and DImode */
2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
{6, 6, 6, 10, 20}, /* cost of loading SSE registers
in 32,64,128,256 and 512-bit */
{8, 8, 8, 12, 24}, /* cost of storing SSE registers
in 32,64,128,256 and 512-bit */
6, 6, /* SSE->integer and integer->SSE moves */
5, 5, /* mask->integer and integer->mask moves */
{8, 8, 8}, /* cost of loading mask register
in QImode, HImode, SImode. */
{6, 6, 6}, /* cost if storing mask register
in QImode, HImode, SImode. */
3, /* cost of moving mask register. */
/* End of register allocator costs. */
},
COSTS_N_INSNS (1), /* cost of an add instruction */
COSTS_N_INSNS (1)+1, /* cost of a lea instruction */
COSTS_N_INSNS (1), /* variable shift costs */
COSTS_N_INSNS (1), /* constant shift costs */
{COSTS_N_INSNS (3), /* cost of starting multiply for QI */
COSTS_N_INSNS (4), /* HI */
COSTS_N_INSNS (3), /* SI */
COSTS_N_INSNS (3), /* DI */
COSTS_N_INSNS (3)}, /* other */
0, /* cost of multiply per each bit set */
/* Expanding div/mod currently doesn't consider parallelism. So the cost
model is not realistic. We compensate by increasing the latencies a bit. */
{COSTS_N_INSNS (11), /* cost of a divide/mod for QI */
COSTS_N_INSNS (11), /* HI */
COSTS_N_INSNS (14), /* SI */
COSTS_N_INSNS (76), /* DI */
COSTS_N_INSNS (76)}, /* other */
COSTS_N_INSNS (1), /* cost of movsx */
COSTS_N_INSNS (0), /* cost of movzx */
8, /* "large" insn */
17, /* MOVE_RATIO */
17, /* CLEAR_RATIO */
{4, 4, 4}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{6, 6, 6}, /* cost of storing integer registers */
{6, 6, 6, 10, 20}, /* cost of loading SSE register
in 32bit, 64bit, 128bit, 256bit and 512bit */
{8, 8, 8, 12, 24}, /* cost of storing SSE register
in 32bit, 64bit, 128bit, 256bit and 512bit */
{6, 6, 6, 10, 20}, /* cost of unaligned loads. */
{8, 8, 8, 8, 16}, /* cost of unaligned stores. */
2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
6, /* cost of moving SSE register to integer. */
20, 8, /* Gather load static, per_elt. */
22, 10, /* Gather store static, per_elt. */
64, /* size of l1 cache. */
512, /* size of l2 cache. */
64, /* size of prefetch block */
6, /* number of parallel prefetches */
3, /* Branch cost */
COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
COSTS_N_INSNS (4), /* cost of FMUL instruction. */
COSTS_N_INSNS (20), /* cost of FDIV instruction. */
COSTS_N_INSNS (1), /* cost of FABS instruction. */
COSTS_N_INSNS (1), /* cost of FCHS instruction. */
COSTS_N_INSNS (20), /* cost of FSQRT instruction. */
COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
COSTS_N_INSNS (4), /* cost of MULSS instruction. */
COSTS_N_INSNS (4), /* cost of MULSD instruction. */
COSTS_N_INSNS (4), /* cost of FMA SS instruction. */
COSTS_N_INSNS (4), /* cost of FMA SD instruction. */
COSTS_N_INSNS (11), /* cost of DIVSS instruction. */
COSTS_N_INSNS (14), /* cost of DIVSD instruction. */
COSTS_N_INSNS (12), /* cost of SQRTSS instruction. */
COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */
1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
skylake_memcpy,
skylake_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
"16:11:8", /* Loop alignment. */
"16:11:8", /* Jump alignment. */
"0:0:8", /* Label alignment. */
"16", /* Func alignment. */
};
/* icelake_cost should produce code tuned for Icelake family of CPUs.
NB: rep_prefix_1_byte is used only for known size. */
static stringop_algs icelake_memcpy[2] = {
{libcall,
{{256, rep_prefix_1_byte, true},
{256, loop, false},
{-1, libcall, false}}},
{libcall,
{{256, rep_prefix_1_byte, true},
{256, loop, false},
{-1, libcall, false}}}};
static stringop_algs icelake_memset[2] = {
{libcall,
{{256, rep_prefix_1_byte, true},
{256, loop, false},
{-1, libcall, false}}},
{libcall,
{{256, rep_prefix_1_byte, true},
{256, loop, false},
{-1, libcall, false}}}};
static const
struct processor_costs icelake_cost = {
{
/* Start of register allocator costs. integer->integer move cost is 2. */
6, /* cost for loading QImode using movzbl */
{4, 4, 4}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{6, 6, 6}, /* cost of storing integer registers */
2, /* cost of reg,reg fld/fst */
{6, 6, 8}, /* cost of loading fp registers
in SFmode, DFmode and XFmode */
{6, 6, 10}, /* cost of storing fp registers
in SFmode, DFmode and XFmode */
2, /* cost of moving MMX register */
{6, 6}, /* cost of loading MMX registers
in SImode and DImode */
{6, 6}, /* cost of storing MMX registers
in SImode and DImode */
2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
{6, 6, 6, 10, 20}, /* cost of loading SSE registers
in 32,64,128,256 and 512-bit */
{8, 8, 8, 12, 24}, /* cost of storing SSE registers
in 32,64,128,256 and 512-bit */
6, 6, /* SSE->integer and integer->SSE moves */
5, 5, /* mask->integer and integer->mask moves */
{8, 8, 8}, /* cost of loading mask register
in QImode, HImode, SImode. */
{6, 6, 6}, /* cost if storing mask register
in QImode, HImode, SImode. */
3, /* cost of moving mask register. */
/* End of register allocator costs. */
},
COSTS_N_INSNS (1), /* cost of an add instruction */
COSTS_N_INSNS (1)+1, /* cost of a lea instruction */
COSTS_N_INSNS (1), /* variable shift costs */
COSTS_N_INSNS (1), /* constant shift costs */
{COSTS_N_INSNS (3), /* cost of starting multiply for QI */
COSTS_N_INSNS (4), /* HI */
COSTS_N_INSNS (3), /* SI */
COSTS_N_INSNS (3), /* DI */
COSTS_N_INSNS (3)}, /* other */
0, /* cost of multiply per each bit set */
/* Expanding div/mod currently doesn't consider parallelism. So the cost
model is not realistic. We compensate by increasing the latencies a bit. */
{COSTS_N_INSNS (11), /* cost of a divide/mod for QI */
COSTS_N_INSNS (11), /* HI */
COSTS_N_INSNS (14), /* SI */
COSTS_N_INSNS (76), /* DI */
COSTS_N_INSNS (76)}, /* other */
COSTS_N_INSNS (1), /* cost of movsx */
COSTS_N_INSNS (0), /* cost of movzx */
8, /* "large" insn */
17, /* MOVE_RATIO */
17, /* CLEAR_RATIO */
{4, 4, 4}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{6, 6, 6}, /* cost of storing integer registers */
{6, 6, 6, 10, 20}, /* cost of loading SSE register
in 32bit, 64bit, 128bit, 256bit and 512bit */
{8, 8, 8, 12, 24}, /* cost of storing SSE register
in 32bit, 64bit, 128bit, 256bit and 512bit */
{6, 6, 6, 10, 20}, /* cost of unaligned loads. */
{8, 8, 8, 8, 16}, /* cost of unaligned stores. */
2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
6, /* cost of moving SSE register to integer. */
20, 8, /* Gather load static, per_elt. */
22, 10, /* Gather store static, per_elt. */
64, /* size of l1 cache. */
512, /* size of l2 cache. */
64, /* size of prefetch block */
6, /* number of parallel prefetches */
3, /* Branch cost */
COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
COSTS_N_INSNS (4), /* cost of FMUL instruction. */
COSTS_N_INSNS (20), /* cost of FDIV instruction. */
COSTS_N_INSNS (1), /* cost of FABS instruction. */
COSTS_N_INSNS (1), /* cost of FCHS instruction. */
COSTS_N_INSNS (20), /* cost of FSQRT instruction. */
COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
COSTS_N_INSNS (4), /* cost of MULSS instruction. */
COSTS_N_INSNS (4), /* cost of MULSD instruction. */
COSTS_N_INSNS (4), /* cost of FMA SS instruction. */
COSTS_N_INSNS (4), /* cost of FMA SD instruction. */
COSTS_N_INSNS (11), /* cost of DIVSS instruction. */
COSTS_N_INSNS (14), /* cost of DIVSD instruction. */
COSTS_N_INSNS (12), /* cost of SQRTSS instruction. */
COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */
1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
icelake_memcpy,
icelake_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
"16:11:8", /* Loop alignment. */
"16:11:8", /* Jump alignment. */
"0:0:8", /* Label alignment. */
"16", /* Func alignment. */
};
/* BTVER1 has optimized REP instruction for medium sized blocks, but for
very small blocks it is better to use loop. For large blocks, libcall can
do nontemporary accesses and beat inline considerably. */
static stringop_algs btver1_memcpy[2] = {
{libcall, {{6, loop, false}, {14, unrolled_loop, false},
{-1, rep_prefix_4_byte, false}}},
{libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
{-1, libcall, false}}}};
static stringop_algs btver1_memset[2] = {
{libcall, {{8, loop, false}, {24, unrolled_loop, false},
{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
{libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
{-1, libcall, false}}}};
const struct processor_costs btver1_cost = {
{
/* Start of register allocator costs. integer->integer move cost is 2. */
8, /* cost for loading QImode using movzbl */
{6, 8, 6}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{6, 8, 6}, /* cost of storing integer registers */
4, /* cost of reg,reg fld/fst */
{12, 12, 28}, /* cost of loading fp registers
in SFmode, DFmode and XFmode */
{12, 12, 38}, /* cost of storing fp registers
in SFmode, DFmode and XFmode */
4, /* cost of moving MMX register */
{10, 10}, /* cost of loading MMX registers
in SImode and DImode */
{12, 12}, /* cost of storing MMX registers
in SImode and DImode */
2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
{10, 10, 12, 48, 96}, /* cost of loading SSE registers
in 32,64,128,256 and 512-bit */
{10, 10, 12, 48, 96}, /* cost of storing SSE registers
in 32,64,128,256 and 512-bit */
14, 14, /* SSE->integer and integer->SSE moves */
14, 14, /* mask->integer and integer->mask moves */
{6, 8, 6}, /* cost of loading mask register
in QImode, HImode, SImode. */
{6, 8, 6}, /* cost if storing mask register
in QImode, HImode, SImode. */
2, /* cost of moving mask register. */
/* End of register allocator costs. */
},
COSTS_N_INSNS (1), /* cost of an add instruction */
COSTS_N_INSNS (2), /* cost of a lea instruction */
COSTS_N_INSNS (1), /* variable shift costs */
COSTS_N_INSNS (1), /* constant shift costs */
{COSTS_N_INSNS (3), /* cost of starting multiply for QI */
COSTS_N_INSNS (4), /* HI */
COSTS_N_INSNS (3), /* SI */
COSTS_N_INSNS (4), /* DI */
COSTS_N_INSNS (5)}, /* other */
0, /* cost of multiply per each bit set */
{COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
COSTS_N_INSNS (35), /* HI */
COSTS_N_INSNS (51), /* SI */
COSTS_N_INSNS (83), /* DI */
COSTS_N_INSNS (83)}, /* other */
COSTS_N_INSNS (1), /* cost of movsx */
COSTS_N_INSNS (1), /* cost of movzx */
8, /* "large" insn */
9, /* MOVE_RATIO */
6, /* CLEAR_RATIO */
{6, 8, 6}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{6, 8, 6}, /* cost of storing integer registers */
{10, 10, 12, 48, 96}, /* cost of loading SSE register
in 32bit, 64bit, 128bit, 256bit and 512bit */
{10, 10, 12, 48, 96}, /* cost of storing SSE register
in 32bit, 64bit, 128bit, 256bit and 512bit */
{10, 10, 12, 48, 96}, /* cost of unaligned loads. */
{10, 10, 12, 48, 96}, /* cost of unaligned stores. */
2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
14, /* cost of moving SSE register to integer. */
10, 10, /* Gather load static, per_elt. */
10, 10, /* Gather store static, per_elt. */
32, /* size of l1 cache. */
512, /* size of l2 cache. */
64, /* size of prefetch block */
100, /* number of parallel prefetches */
2, /* Branch cost */
COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
COSTS_N_INSNS (4), /* cost of FMUL instruction. */
COSTS_N_INSNS (19), /* cost of FDIV instruction. */
COSTS_N_INSNS (2), /* cost of FABS instruction. */
COSTS_N_INSNS (2), /* cost of FCHS instruction. */
COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
COSTS_N_INSNS (2), /* cost of MULSS instruction. */
COSTS_N_INSNS (4), /* cost of MULSD instruction. */
COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
COSTS_N_INSNS (13), /* cost of DIVSS instruction. */
COSTS_N_INSNS (17), /* cost of DIVSD instruction. */
COSTS_N_INSNS (14), /* cost of SQRTSS instruction. */
COSTS_N_INSNS (48), /* cost of SQRTSD instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
btver1_memcpy,
btver1_memset,
COSTS_N_INSNS (2), /* cond_taken_branch_cost. */
COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
"16:11:8", /* Loop alignment. */
"16:8:8", /* Jump alignment. */
"0:0:8", /* Label alignment. */
"11", /* Func alignment. */
};
static stringop_algs btver2_memcpy[2] = {
{libcall, {{6, loop, false}, {14, unrolled_loop, false},
{-1, rep_prefix_4_byte, false}}},
{libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
{-1, libcall, false}}}};
static stringop_algs btver2_memset[2] = {
{libcall, {{8, loop, false}, {24, unrolled_loop, false},
{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
{libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
{-1, libcall, false}}}};
const struct processor_costs btver2_cost = {
{
/* Start of register allocator costs. integer->integer move cost is 2. */
8, /* cost for loading QImode using movzbl */
{8, 8, 6}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{8, 8, 6}, /* cost of storing integer registers */
4, /* cost of reg,reg fld/fst */
{12, 12, 28}, /* cost of loading fp registers
in SFmode, DFmode and XFmode */
{12, 12, 38}, /* cost of storing fp registers
in SFmode, DFmode and XFmode */
4, /* cost of moving MMX register */
{10, 10}, /* cost of loading MMX registers
in SImode and DImode */
{12, 12}, /* cost of storing MMX registers
in SImode and DImode */
2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
{10, 10, 12, 48, 96}, /* cost of loading SSE registers
in 32,64,128,256 and 512-bit */
{10, 10, 12, 48, 96}, /* cost of storing SSE registers
in 32,64,128,256 and 512-bit */
14, 14, /* SSE->integer and integer->SSE moves */
14, 14, /* mask->integer and integer->mask moves */
{8, 8, 6}, /* cost of loading mask register
in QImode, HImode, SImode. */
{8, 8, 6}, /* cost if storing mask register
in QImode, HImode, SImode. */
2, /* cost of moving mask register. */
/* End of register allocator costs. */
},
COSTS_N_INSNS (1), /* cost of an add instruction */
COSTS_N_INSNS (2), /* cost of a lea instruction */
COSTS_N_INSNS (1), /* variable shift costs */
COSTS_N_INSNS (1), /* constant shift costs */
{COSTS_N_INSNS (3), /* cost of starting multiply for QI */
COSTS_N_INSNS (4), /* HI */
COSTS_N_INSNS (3), /* SI */
COSTS_N_INSNS (4), /* DI */
COSTS_N_INSNS (5)}, /* other */
0, /* cost of multiply per each bit set */
{COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
COSTS_N_INSNS (35), /* HI */
COSTS_N_INSNS (51), /* SI */
COSTS_N_INSNS (83), /* DI */
COSTS_N_INSNS (83)}, /* other */
COSTS_N_INSNS (1), /* cost of movsx */
COSTS_N_INSNS (1), /* cost of movzx */
8, /* "large" insn */
9, /* MOVE_RATIO */
6, /* CLEAR_RATIO */
{8, 8, 6}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{8, 8, 6}, /* cost of storing integer registers */
{10, 10, 12, 48, 96}, /* cost of loading SSE register
in 32bit, 64bit, 128bit, 256bit and 512bit */
{10, 10, 12, 48, 96}, /* cost of storing SSE register
in 32bit, 64bit, 128bit, 256bit and 512bit */
{10, 10, 12, 48, 96}, /* cost of unaligned loads. */
{10, 10, 12, 48, 96}, /* cost of unaligned stores. */
2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
14, /* cost of moving SSE register to integer. */
10, 10, /* Gather load static, per_elt. */
10, 10, /* Gather store static, per_elt. */
32, /* size of l1 cache. */
2048, /* size of l2 cache. */
64, /* size of prefetch block */
100, /* number of parallel prefetches */
2, /* Branch cost */
COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
COSTS_N_INSNS (4), /* cost of FMUL instruction. */
COSTS_N_INSNS (19), /* cost of FDIV instruction. */
COSTS_N_INSNS (2), /* cost of FABS instruction. */
COSTS_N_INSNS (2), /* cost of FCHS instruction. */
COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
COSTS_N_INSNS (2), /* cost of MULSS instruction. */
COSTS_N_INSNS (4), /* cost of MULSD instruction. */
COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
COSTS_N_INSNS (13), /* cost of DIVSS instruction. */
COSTS_N_INSNS (19), /* cost of DIVSD instruction. */
COSTS_N_INSNS (16), /* cost of SQRTSS instruction. */
COSTS_N_INSNS (21), /* cost of SQRTSD instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
btver2_memcpy,
btver2_memset,
COSTS_N_INSNS (2), /* cond_taken_branch_cost. */
COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
"16:11:8", /* Loop alignment. */
"16:8:8", /* Jump alignment. */
"0:0:8", /* Label alignment. */
"11", /* Func alignment. */
};
static stringop_algs pentium4_memcpy[2] = {
{libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
DUMMY_STRINGOP_ALGS};
static stringop_algs pentium4_memset[2] = {
{libcall, {{6, loop_1_byte, false}, {48, loop, false},
{20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
DUMMY_STRINGOP_ALGS};
static const
struct processor_costs pentium4_cost = {
{
/* Start of register allocator costs. integer->integer move cost is 2. */
5, /* cost for loading QImode using movzbl */
{4, 5, 4}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{2, 3, 2}, /* cost of storing integer registers */
12, /* cost of reg,reg fld/fst */
{14, 14, 14}, /* cost of loading fp registers
in SFmode, DFmode and XFmode */
{14, 14, 14}, /* cost of storing fp registers
in SFmode, DFmode and XFmode */
12, /* cost of moving MMX register */
{16, 16}, /* cost of loading MMX registers
in SImode and DImode */
{16, 16}, /* cost of storing MMX registers
in SImode and DImode */
12, 24, 48, /* cost of moving XMM,YMM,ZMM register */
{16, 16, 16, 32, 64}, /* cost of loading SSE registers
in 32,64,128,256 and 512-bit */
{16, 16, 16, 32, 64}, /* cost of storing SSE registers
in 32,64,128,256 and 512-bit */
20, 12, /* SSE->integer and integer->SSE moves */
20, 12, /* mask->integer and integer->mask moves */
{4, 5, 4}, /* cost of loading mask register
in QImode, HImode, SImode. */
{2, 3, 2}, /* cost if storing mask register
in QImode, HImode, SImode. */
2, /* cost of moving mask register. */
/* End of register allocator costs. */
},
COSTS_N_INSNS (1), /* cost of an add instruction */
COSTS_N_INSNS (3), /* cost of a lea instruction */
COSTS_N_INSNS (4), /* variable shift costs */
COSTS_N_INSNS (4), /* constant shift costs */
{COSTS_N_INSNS (15), /* cost of starting multiply for QI */
COSTS_N_INSNS (15), /* HI */
COSTS_N_INSNS (15), /* SI */
COSTS_N_INSNS (15), /* DI */
COSTS_N_INSNS (15)}, /* other */
0, /* cost of multiply per each bit set */
{COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
COSTS_N_INSNS (56), /* HI */
COSTS_N_INSNS (56), /* SI */
COSTS_N_INSNS (56), /* DI */
COSTS_N_INSNS (56)}, /* other */
COSTS_N_INSNS (1), /* cost of movsx */
COSTS_N_INSNS (1), /* cost of movzx */
16, /* "large" insn */
6, /* MOVE_RATIO */
6, /* CLEAR_RATIO */
{4, 5, 4}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{2, 3, 2}, /* cost of storing integer registers */
{16, 16, 16, 32, 64}, /* cost of loading SSE register
in 32bit, 64bit, 128bit, 256bit and 512bit */
{16, 16, 16, 32, 64}, /* cost of storing SSE register
in 32bit, 64bit, 128bit, 256bit and 512bit */
{32, 32, 32, 64, 128}, /* cost of unaligned loads. */
{32, 32, 32, 64, 128}, /* cost of unaligned stores. */
12, 24, 48, /* cost of moving XMM,YMM,ZMM register */
20, /* cost of moving SSE register to integer. */
16, 16, /* Gather load static, per_elt. */
16, 16, /* Gather store static, per_elt. */
8, /* size of l1 cache. */
256, /* size of l2 cache. */
64, /* size of prefetch block */
6, /* number of parallel prefetches */
2, /* Branch cost */
COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
COSTS_N_INSNS (7), /* cost of FMUL instruction. */
COSTS_N_INSNS (43), /* cost of FDIV instruction. */
COSTS_N_INSNS (2), /* cost of FABS instruction. */
COSTS_N_INSNS (2), /* cost of FCHS instruction. */
COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
COSTS_N_INSNS (6), /* cost of MULSS instruction. */
COSTS_N_INSNS (6), /* cost of MULSD instruction. */
COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
COSTS_N_INSNS (23), /* cost of DIVSS instruction. */
COSTS_N_INSNS (38), /* cost of DIVSD instruction. */
COSTS_N_INSNS (23), /* cost of SQRTSS instruction. */
COSTS_N_INSNS (38), /* cost of SQRTSD instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
pentium4_memcpy,
pentium4_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
NULL, /* Loop alignment. */
NULL, /* Jump alignment. */
NULL, /* Label alignment. */
NULL, /* Func alignment. */
};
static stringop_algs nocona_memcpy[2] = {
{libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
{libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false},
{100000, unrolled_loop, false}, {-1, libcall, false}}}};
static stringop_algs nocona_memset[2] = {
{libcall, {{6, loop_1_byte, false}, {48, loop, false},
{20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
{libcall, {{24, loop, false}, {64, unrolled_loop, false},
{8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
static const
struct processor_costs nocona_cost = {
{
/* Start of register allocator costs. integer->integer move cost is 2. */
4, /* cost for loading QImode using movzbl */
{4, 4, 4}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{4, 4, 4}, /* cost of storing integer registers */
12, /* cost of reg,reg fld/fst */
{14, 14, 14}, /* cost of loading fp registers
in SFmode, DFmode and XFmode */
{14, 14, 14}, /* cost of storing fp registers
in SFmode, DFmode and XFmode */
14, /* cost of moving MMX register */
{12, 12}, /* cost of loading MMX registers
in SImode and DImode */
{12, 12}, /* cost of storing MMX registers
in SImode and DImode */
6, 12, 24, /* cost of moving XMM,YMM,ZMM register */
{12, 12, 12, 24, 48}, /* cost of loading SSE registers
in 32,64,128,256 and 512-bit */
{12, 12, 12, 24, 48}, /* cost of storing SSE registers
in 32,64,128,256 and 512-bit */
20, 12, /* SSE->integer and integer->SSE moves */
20, 12, /* mask->integer and integer->mask moves */
{4, 4, 4}, /* cost of loading mask register
in QImode, HImode, SImode. */
{4, 4, 4}, /* cost if storing mask register
in QImode, HImode, SImode. */
2, /* cost of moving mask register. */
/* End of register allocator costs. */
},
COSTS_N_INSNS (1), /* cost of an add instruction */
COSTS_N_INSNS (1), /* cost of a lea instruction */
COSTS_N_INSNS (1), /* variable shift costs */
COSTS_N_INSNS (1), /* constant shift costs */
{COSTS_N_INSNS (10), /* cost of starting multiply for QI */
COSTS_N_INSNS (10), /* HI */
COSTS_N_INSNS (10), /* SI */
COSTS_N_INSNS (10), /* DI */
COSTS_N_INSNS (10)}, /* other */
0, /* cost of multiply per each bit set */
{COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
COSTS_N_INSNS (66), /* HI */
COSTS_N_INSNS (66), /* SI */
COSTS_N_INSNS (66), /* DI */
COSTS_N_INSNS (66)}, /* other */
COSTS_N_INSNS (1), /* cost of movsx */
COSTS_N_INSNS (1), /* cost of movzx */
16, /* "large" insn */
17, /* MOVE_RATIO */