;; DFA-based pipeline description for the VR1x000.
;;   Copyright (C) 2005-2021 Free Software Foundation, Inc.
;;
;; This file is part of GCC.

;; GCC is free software; you can redistribute it and/or modify it
;; under the terms of the GNU General Public License as published
;; by the Free Software Foundation; either version 3, or (at your
;; option) any later version.

;; GCC is distributed in the hope that it will be useful, but WITHOUT
;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
;; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
;; License for more details.

;; You should have received a copy of the GNU General Public License
;; along with GCC; see the file COPYING3.  If not see
;; <http://www.gnu.org/licenses/>.

;; R12K/R14K/R16K are derivatives of R10K, thus copy its description
;; until specific tuning for each is added.

;; R10000 has an int queue, fp queue, address queue.
;; The int queue feeds ALU1 and ALU2.
;; The fp queue feeds the fp-adder and fp-multiplier.
;; The addr queue feeds the Load/Store unit.
;;
;; However, we define the fp-adder and fp-multiplier as
;; separate automatons, because the fp-multiplier is
;; divided into fp-multiplier, fp-division, and
;; fp-squareroot units, all of which share the same
;; issue and completion logic, yet can operate in
;; parallel.
;;
;; This is based on the model described in the R10K Manual
;; and it helps to reduce the size of the automata.
;;
;; The declaration must start on its own line: a ';' comment runs to
;; the end of the physical line and would otherwise swallow it.
(define_automaton
  "r10k_a_int, r10k_a_fpadder, r10k_a_addr, r10k_a_fpmpy, r10k_a_fpdiv, r10k_a_fpsqrt")

;; Functional units.  Both integer ALUs live in the integer automaton;
;; every other unit is the sole member of its own automaton.
(define_cpu_unit "r10k_alu1" "r10k_a_int")
(define_cpu_unit "r10k_alu2" "r10k_a_int")
(define_cpu_unit "r10k_fpadd" "r10k_a_fpadder")
(define_cpu_unit "r10k_fpmpy" "r10k_a_fpmpy")
(define_cpu_unit "r10k_fpdiv" "r10k_a_fpdiv")
(define_cpu_unit "r10k_fpsqrt" "r10k_a_fpsqrt")
(define_cpu_unit "r10k_loadstore" "r10k_a_addr")

;; R10k Loads and Stores.
;;
;; Integer loads and prefetches: latency 2, one cycle in the
;; load/store unit.
(define_insn_reservation "r10k_load" 2
  (and (eq_attr "cpu" "r10000")
       (eq_attr "type" "load,prefetch,prefetchx"))
  "r10k_loadstore")

;; Integer, FP and FP-indexed stores: latency 0, one cycle in the
;; load/store unit.
(define_insn_reservation "r10k_store" 0
  (and (eq_attr "cpu" "r10000")
       (eq_attr "type" "store,fpstore,fpidxstore"))
  "r10k_loadstore")

;; FP loads (plain and indexed): latency 3, one cycle in the
;; load/store unit.
(define_insn_reservation "r10k_fpload" 3
  (and (eq_attr "cpu" "r10000")
       (eq_attr "type" "fpload,fpidxload"))
  "r10k_loadstore")

;; Integer add/sub + logic ops, and mt hi/lo can be done by alu1 or alu2.
;; Miscellaneous arith goes here too (this is a guess).
;;
;; Latency 1; the insn occupies whichever of the two ALUs is free.
(define_insn_reservation "r10k_arith" 1
  (and (eq_attr "cpu" "r10000")
       (eq_attr "type" "arith,mthi,mtlo,slt,clz,const,nop,trap,logical"))
  "r10k_alu1 | r10k_alu2")

;; We treat mfhilo differently, because we need to know when
;; it's HI and when it's LO.
;;
;; Move from HI: latency 1, issued to either ALU.
(define_insn_reservation "r10k_mfhi" 1
  (and (eq_attr "cpu" "r10000")
       (eq_attr "type" "mfhi"))
  "r10k_alu1 | r10k_alu2")

;; Move from LO: latency 1, issued to either ALU.
(define_insn_reservation "r10k_mflo" 1
  (and (eq_attr "cpu" "r10000")
       (eq_attr "type" "mflo"))
  "r10k_alu1 | r10k_alu2")

;; ALU1 handles shifts, branch eval, and condmove.
;;
;; Brancher is separate, but part of ALU1, but can only
;; do one branch per cycle (is this even implementable?).
;;
;; Unsure if the brancher handles jumps and calls as well, but since
;; they're related, we'll add them here for now.
(define_insn_reservation "r10k_brancher" 1
  (and (eq_attr "cpu" "r10000")
       (eq_attr "type" "shift,branch,jump,call"))
  "r10k_alu1")

;; Integer (SI/DI mode) conditional moves: latency 1, ALU1 only.
(define_insn_reservation "r10k_int_cmove" 1
  (and (eq_attr "cpu" "r10000")
       (and (eq_attr "type" "condmove")
            (eq_attr "mode" "SI,DI")))
  "r10k_alu1")

;; Coprocessor Moves.
;; mtc1/dmtc1 are handled by ALU1.
;; mfc1/dmfc1 are handled by the fp-multiplier.
;;
;; Move to coprocessor: latency 3, one cycle on ALU1.
(define_insn_reservation "r10k_mt_xfer" 3
  (and (eq_attr "cpu" "r10000")
       (eq_attr "type" "mtc"))
  "r10k_alu1")

;; Move from coprocessor: latency 2, one cycle on the fp-multiplier.
(define_insn_reservation "r10k_mf_xfer" 2
  (and (eq_attr "cpu" "r10000")
       (eq_attr "type" "mfc"))
  "r10k_fpmpy")

;; Only ALU2 does int multiplications and divisions.
;;
;; According to the Vr10000 series user manual,
;; integer mult and div insns can be issued one
;; cycle earlier if using register Lo.  We model
;; this by using the Lo value by default, as it
;; is the more common value, and use a bypass
;; for the Hi value when needed.
;;
;; Also of note, there are different latencies
;; for MULT/DMULT (Lo 5/Hi 6) and MULTU/DMULTU (Lo 6/Hi 7).
;; However, gcc does not have separate types
;; for these insns.  Thus to strike a balance,
;; we use the Hi latency value for imul
;; operations until the imul type can be split.
;;
;; 32-bit (SI mode) multiplies: latency 6, ALU2 busy for 6 cycles.
(define_insn_reservation "r10k_imul_single" 6
  (and (eq_attr "cpu" "r10000")
       (and (eq_attr "type" "imul,imul3")
            (eq_attr "mode" "SI")))
  "r10k_alu2 * 6")

;; 64-bit (DI mode) multiplies: latency 10, ALU2 busy for 10 cycles.
(define_insn_reservation "r10k_imul_double" 10
  (and (eq_attr "cpu" "r10000")
       (and (eq_attr "type" "imul,imul3")
            (eq_attr "mode" "DI")))
  "r10k_alu2 * 10")

;; Divides keep ALU2 busy.
;;
;; 32-bit (SI mode) divides: default latency 34, while ALU2 stays
;; reserved for 35 cycles.
(define_insn_reservation "r10k_idiv_single" 34
  (and (eq_attr "cpu" "r10000")
       (and (eq_attr "type" "idiv")
            (eq_attr "mode" "SI")))
  "r10k_alu2 * 35")

;; 64-bit (DI mode) divides: default latency 66, while ALU2 stays
;; reserved for 67 cycles.
(define_insn_reservation "r10k_idiv_double" 66
  (and (eq_attr "cpu" "r10000")
       (and (eq_attr "type" "idiv")
            (eq_attr "mode" "DI")))
  "r10k_alu2 * 67")

;; A move from HI that consumes a divide result sees one cycle more
;; than the divide's default latency (35 vs. 34, 67 vs. 66).
(define_bypass 35 "r10k_idiv_single" "r10k_mfhi")
(define_bypass 67 "r10k_idiv_double" "r10k_mfhi")

;; Floating point add/sub, mul, abs value, neg, comp, & moves.
;;
;; fadd/fabs/fneg/fcmp: latency 2, one cycle on the fp-adder.
(define_insn_reservation "r10k_fp_miscadd" 2
  (and (eq_attr "cpu" "r10000")
       (eq_attr "type" "fadd,fabs,fneg,fcmp"))
  "r10k_fpadd")

;; FP multiplies and FP moves: latency 2, one cycle on the
;; fp-multiplier.
(define_insn_reservation "r10k_fp_miscmul" 2
  (and (eq_attr "cpu" "r10000")
       (eq_attr "type" "fmul,fmove"))
  "r10k_fpmpy")

;; FP (SF/DF mode) conditional moves: latency 2, one cycle on the
;; fp-multiplier.
(define_insn_reservation "r10k_fp_cmove" 2
  (and (eq_attr "cpu" "r10000")
       (and (eq_attr "type" "condmove")
            (eq_attr "mode" "SF,DF")))
  "r10k_fpmpy")

;; The fcvt.s.[wl] insn has latency 4, repeat 2.
;; All other fcvt insns have latency 2, repeat 1.
;;
;; The repeat rate of 2 is modelled by holding the fp-adder for
;; two cycles.
(define_insn_reservation "r10k_fcvt_single" 4
  (and (eq_attr "cpu" "r10000")
       (and (eq_attr "type" "fcvt")
            (eq_attr "cnv_mode" "I2S")))
  "r10k_fpadd * 2")

;; Every fcvt whose cnv_mode is not I2S: latency 2, one cycle on
;; the fp-adder.
(define_insn_reservation "r10k_fcvt_other" 2
  (and (eq_attr "cpu" "r10000")
       (and (eq_attr "type" "fcvt")
            (eq_attr "cnv_mode" "!I2S")))
  "r10k_fpadd")

;; Run the fmadd insn through fp-adder first, then fp-multiplier.
;;
;; The latency for fmadd is 2 cycles if the result is used
;; by another fmadd instruction.
;;
;; Default latency 4.  The "," in the reservation advances one cycle:
;; fp-adder in the first cycle, fp-multiplier in the next.
(define_insn_reservation "r10k_fmadd" 4
  (and (eq_attr "cpu" "r10000")
       (eq_attr "type" "fmadd"))
  "r10k_fpadd, r10k_fpmpy")

;; An fmadd feeding another fmadd sees a latency of 2 instead of the
;; default fmadd latency.
(define_bypass 2 "r10k_fmadd" "r10k_fmadd")

;; Floating point Divisions & square roots.
;;
;; Single-precision divide / reciprocal: latency 12, fp-divider
;; reserved for 14 cycles.
(define_insn_reservation "r10k_fdiv_single" 12
  (and (eq_attr "cpu" "r10000")
       (and (eq_attr "type" "fdiv,frdiv")
            (eq_attr "mode" "SF")))
  "r10k_fpdiv * 14")

;; Double-precision divide / reciprocal: latency 19, fp-divider
;; reserved for 21 cycles.
(define_insn_reservation "r10k_fdiv_double" 19
  (and (eq_attr "cpu" "r10000")
       (and (eq_attr "type" "fdiv,frdiv")
            (eq_attr "mode" "DF")))
  "r10k_fpdiv * 21")

;; Single-precision square root: latency 18, fp-squareroot unit
;; reserved for 20 cycles.
(define_insn_reservation "r10k_fsqrt_single" 18
  (and (eq_attr "cpu" "r10000")
       (and (eq_attr "type" "fsqrt")
            (eq_attr "mode" "SF")))
  "r10k_fpsqrt * 20")

;; Double-precision square root: latency 33, fp-squareroot unit
;; reserved for 35 cycles.
(define_insn_reservation "r10k_fsqrt_double" 33
  (and (eq_attr "cpu" "r10000")
       (and (eq_attr "type" "fsqrt")
            (eq_attr "mode" "DF")))
  "r10k_fpsqrt * 35")

;; Single-precision reciprocal square root: latency 30, fp-squareroot
;; unit reserved for 20 cycles.
(define_insn_reservation "r10k_frsqrt_single" 30
  (and (eq_attr "cpu" "r10000")
       (and (eq_attr "type" "frsqrt")
            (eq_attr "mode" "SF")))
  "r10k_fpsqrt * 20")

;; Double-precision reciprocal square root: latency 52, fp-squareroot
;; unit reserved for 35 cycles.
(define_insn_reservation "r10k_frsqrt_double" 52
  (and (eq_attr "cpu" "r10000")
       (and (eq_attr "type" "frsqrt")
            (eq_attr "mode" "DF")))
  "r10k_fpsqrt * 35")

;; Handle unknown/multi insns here (this is a guess).
;;
;; Latency 1; "+" reserves both integer ALUs in the same cycle.
(define_insn_reservation "r10k_unknown" 1
  (and (eq_attr "cpu" "r10000")
       (eq_attr "type" "unknown,multi,atomic,syncloop"))
  "r10k_alu1 + r10k_alu2")