;; Scheduling description for the SPARC M8. ;; Copyright (C) 2017-2021 Free Software Foundation, Inc. ;; ;; This file is part of GCC. ;; ;; GCC is free software; you can redistribute it and/or modify ;; it under the terms of the GNU General Public License as published by ;; the Free Software Foundation; either version 3, or (at your option) ;; any later version. ;; ;; GCC is distributed in the hope that it will be useful, ;; but WITHOUT ANY WARRANTY; without even the implied warranty of ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ;; GNU General Public License for more details. ;; ;; You should have received a copy of the GNU General Public License ;; along with GCC; see the file COPYING3. If not see ;; http://www.gnu.org/licenses/.

;; Things to improve:
;;
;; - Store instructions are implemented by micro-ops, one of which
;;   generates the store address and is executed in the store address
;;   generation unit in the slot0.  We need to model that.
;;
;; - There are two V3 pipes connected to different slots.  The current
;;   implementation assumes that all the instructions executing in a
;;   V3 pipe are issued to the unit in slot3.
;;
;; - Single-issue ALU operations incur an additional cycle of latency to
;;   slot 0 and slot 1 instructions.  This is not currently reflected
;;   in the DFA.

;; Automaton to which the m8_slot* CPU units are assigned.
(define_automaton "m8_0")

;; The S5 core has two dual-issue queues, PQLS and PQEX.  Each queue
;; is divided into two slots: PQLS corresponds to slots 0 and 1, and
;; PQEX corresponds to slots 2 and 3.  The core can issue 4
;; instructions per-cycle, and up to 4 instructions are committed each
;; cycle.
;;
;;
;;                   m8_slot0  - Load Unit.
;;                             - Store address gen. Unit.
;;
;;
;;   === PQLS ==>    m8_slot1  - Store data unit.
;;                             - Branch unit.
;;
;;
;;   === PQEX ==>    m8_slot2  - Integer Unit (EXU2).
;;                             - 3-cycles Crypto Unit (SPU2).
;;
;;                   m8_slot3  - Integer Unit (EXU3).
;;                             - 3-cycles Crypto Unit (SPU3).
;;                             - Floating-point and graphics unit (FPG).
;;                             - Long-latency Crypto Unit.
;;                             - Oracle Numbers Unit (ONU).

;; One CPU unit per issue slot, all belonging to the m8_0 automaton.
(define_cpu_unit "m8_slot0,m8_slot1,m8_slot2,m8_slot3" "m8_0")

;; Some instructions stall the pipeline and prevent any other
;; instruction from being issued in the same cycle.  We assume the same
;; for multi-instruction insns.

;; Reserves all four slots in the same cycle, so no slot is left free
;; for another instruction while this reservation is active.
(define_reservation "m8_single_issue"
  "m8_slot0 + m8_slot1 + m8_slot2 + m8_slot3")

;; Instruction types that take the whole issue group for themselves,
;; with a latency of 1.
(define_insn_reservation "m8_single" 1
  (and (eq_attr "cpu" "m8")
       (eq_attr "type" "multi,savew,flushw,trap,bmask"))
  "m8_single_issue")

;; Most of the instructions executing in the integer units have a ;; latency of 1.

;; Single-cycle integer operations; they may issue to either slot2 or
;; slot3.
;; NOTE(review): "bmask" is also listed in the m8_single reservation;
;; confirm which of the two reservations is meant to apply to it.
(define_insn_reservation "m8_integer" 1
  (and (eq_attr "cpu" "m8")
       (eq_attr "type" "ialu,ialuX,shift,cmove,compare,bmask"))
  "(m8_slot2 | m8_slot3)")

;; Flushing the instruction memory takes 27 cycles.

;; One issue cycle in slot2 or slot3 followed by 26 cycles in which no
;; unit is reserved, 27 cycles in total.
(define_insn_reservation "m8_iflush" 27
  (and (eq_attr "cpu" "m8")
       (eq_attr "type" "iflush"))
  "(m8_slot2 | m8_slot3), nothing*26")

;; The integer multiplication instructions have a latency of 10 cycles ;; and execute in integer units. ;; ;; Likewise for array*, edge* and pdistn instructions. ;; ;; However, the latency is only 9 cycles if the consumer of the ;; operation is also capable of 9 cycles latency. We model this with ;; a bypass.

;; 10-cycle integer-unit operations.  The reservation covers the issue
;; cycle plus nothing*9, i.e. 10 cycles, so that its length agrees with
;; the declared latency as it does for the other reservations in this
;; file.
(define_insn_reservation "m8_imul" 10
  (and (eq_attr "cpu" "m8")
       (eq_attr "type" "imul,array,edge,edgen,pdistn"))
  "(m8_slot2 | m8_slot3), nothing*9")

;; When both producer and consumer are m8_imul insns the latency is 9.
(define_bypass 9 "m8_imul" "m8_imul")

;; The integer division instructions `sdiv' and `udivx' have a latency
;; of 30 cycles and execute in integer units.

;; Issue in slot2 or slot3, then 29 cycles with no unit reserved.
(define_insn_reservation "m8_idiv" 30
  (and (eq_attr "cpu" "m8")
       (eq_attr "type" "idiv"))
  "(m8_slot2 | m8_slot3), nothing*29")

;; Both integer and floating-point load instructions have a latency of
;; only 3 cycles, and execute in the slot0.
;;
;; Misaligned load instructions feature a latency of 11 cycles.
;;
;; The prefetch instruction also executes in the load unit, but its
;; latency is only 1 cycle.

;; Matches fpload and sload insns, plus load insns whose subtype is
;; "regular".  Issues in slot0.
(define_insn_reservation "m8_load" 3
  (and (eq_attr "cpu" "m8")
       (ior (eq_attr "type" "fpload,sload")
            (and (eq_attr "type" "load")
                 (eq_attr "subtype" "regular"))))
  "m8_slot0, nothing*2")

;; (define_insn_reservation “m8_load_misalign” 11 ;; (and (eq_attr “cpu” “m8”) ;; (eq_attr “type” “load_mis,fpload_mis”)) ;; “m8_slot0, nothing*10”)

;; Load insns whose subtype is "prefetch": one cycle in slot0.
;; The tests are nested because `and' takes exactly two operands.
(define_insn_reservation "m8_prefetch" 1
  (and (eq_attr "cpu" "m8")
       (and (eq_attr "type" "load")
            (eq_attr "subtype" "prefetch")))
  "m8_slot0")

;; Both integer and floating-point store instructions have a latency ;; of 1 cycle, and execute in the store data unit in slot1. ;; ;; However, misaligned store instructions feature a latency of 3 ;; cycles.

;; Integer and floating-point stores: one cycle in slot1.
(define_insn_reservation "m8_store" 1
  (and (eq_attr "cpu" "m8")
       (eq_attr "type" "store,fpstore"))
  "m8_slot1")

;; (define_insn_reservation “m8_store_misalign” 3 ;; (and (eq_attr “cpu” “m8”) ;; (eq_attr “type” “store_mis,fpstore_mis”)) ;; “m8_slot1, nothing*2”)

;; Control-transfer instructions execute in the Branch Unit in the ;; slot1.

;; All control-transfer instruction types: one cycle in slot1.
(define_insn_reservation "m8_cti" 1
  (and (eq_attr "cpu" "m8")
       (eq_attr "type" "cbcond,uncond_cbcond,branch,call,sibcall,call_no_delay_slot,uncond_branch,return"))
  "m8_slot1")

;; Many instructions executing in the Floating-point and Graphics Unit ;; (FGU) serving slot3 feature a default latency of 9 cycles.

;; Matches the listed floating-point types, plus fga insns whose
;; subtype is "fpu".  Issues in slot3, then 8 cycles with no unit
;; reserved.
(define_insn_reservation "m8_fp" 9
  (and (eq_attr "cpu" "m8")
       (ior (eq_attr "type" "fpmove,fpcmove,fpcrmove,fp,fpcmp,fpmul,fgm_pack,fgm_mul,pdist")
            (and (eq_attr "type" "fga")
                 (eq_attr "subtype" "fpu"))))
  "m8_slot3, nothing*8")

;; Floating-point division and floating-point square-root instructions ;; have high latencies. They execute in the FGU.

;; fpdivs insns: latency 26, issued in slot3.
(define_insn_reservation "m8_fpdivs" 26
  (and (eq_attr "cpu" "m8")
       (eq_attr "type" "fpdivs"))
  "m8_slot3, nothing*25")

;; fpsqrts insns: latency 33, issued in slot3.
(define_insn_reservation "m8_fpsqrts" 33
  (and (eq_attr "cpu" "m8")
       (eq_attr "type" "fpsqrts"))
  "m8_slot3, nothing*32")

;; fpdivd insns: latency 30, issued in slot3.
(define_insn_reservation "m8_fpdivd" 30
  (and (eq_attr "cpu" "m8")
       (eq_attr "type" "fpdivd"))
  "m8_slot3, nothing*29")

;; fpsqrtd insns: latency 41, issued in slot3.
(define_insn_reservation "m8_fpsqrtd" 41
  (and (eq_attr "cpu" "m8")
       (eq_attr "type" "fpsqrtd"))
  "m8_slot3, nothing*40")

;; SIMD VIS instructions executing in the Floating-point and graphics ;; unit (FPG) in slot3 usually have a latency of 5 cycles. ;; ;; However, the latency for many instructions is only 3 cycles if the ;; consumer can also be executed in 3 cycles. We model this with a ;; bypass. In these cases the instructions are executed in one of the ;; two 3-cycle crypto units (SPU, also known as “v3-pipes”) in slots 2 ;; and 3.

;; 5-cycle VIS operations issued in slot3.  Matches:
;;   - type viscmp or lzd;
;;   - type fga with subtype maxmin, cmask or other;
;;   - type vismv with subtype single or movstouw;
;;   - type visl with subtype single.
;; The alternatives are chained through nested `ior' forms because
;; `ior' takes exactly two operands.
(define_insn_reservation "m8_vis" 5
  (and (eq_attr "cpu" "m8")
       (ior (eq_attr "type" "viscmp,lzd")
            (ior (and (eq_attr "type" "fga")
                      (eq_attr "subtype" "maxmin,cmask,other"))
                 (ior (and (eq_attr "type" "vismv")
                           (eq_attr "subtype" "single,movstouw"))
                      (and (eq_attr "type" "visl")
                           (eq_attr "subtype" "single"))))))
  "m8_slot3, nothing*4")

;; When both producer and consumer are m8_vis insns the latency is 3.
(define_bypass 3 "m8_vis" "m8_vis")

;; gsr insns whose subtype is "alignaddr": latency 5, issued in slot3.
;; The tests are nested because `and' takes exactly two operands.
(define_insn_reservation "m8_gsr" 5
  (and (eq_attr "cpu" "m8")
       (and (eq_attr "type" "gsr")
            (eq_attr "subtype" "alignaddr")))
  "m8_slot3, nothing*4")

;; A few VIS instructions have a latency of 1.

;; Single-cycle VIS operations issued in slot3.  Matches:
;;   - type vismv with subtype double, movxtod or movdtox;
;;   - type visl with subtype double;
;;   - type fga with subtype addsub64.
;; The alternatives are chained through nested `ior' forms because
;; `ior' takes exactly two operands.
(define_insn_reservation "m8_vis_1cycle" 1
  (and (eq_attr "cpu" "m8")
       (ior (and (eq_attr "type" "vismv")
                 (eq_attr "subtype" "double,movxtod,movdtox"))
            (ior (and (eq_attr "type" "visl")
                      (eq_attr "subtype" "double"))
                 (and (eq_attr "type" "fga")
                      (eq_attr "subtype" "addsub64")))))
  "m8_slot3")

;; Reading and writing to the gsr register takes more than 70 cycles.

;; gsr insns whose subtype is "reg": latency 70, issued in slot3.
;; The tests are nested because `and' takes exactly two operands.
(define_insn_reservation "m8_gsr_reg" 70
  (and (eq_attr "cpu" "m8")
       (and (eq_attr "type" "gsr")
            (eq_attr "subtype" "reg")))
  "m8_slot3, nothing*69")