| ;; Scheduling description for cell processor. |
| ;; Copyright (C) 2001-2026 Free Software Foundation, Inc. |
| ;; Contributed by Sony Computer Entertainment, Inc., |
| |
| |
| ;; This file is free software; you can redistribute it and/or modify it under |
| ;; the terms of the GNU General Public License as published by the Free |
| ;; Software Foundation; either version 3 of the License, or (at your option) |
| ;; any later version. |
| |
| ;; This file is distributed in the hope that it will be useful, but WITHOUT |
| ;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
| ;; FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
| ;; for more details. |
| |
| ;; You should have received a copy of the GNU General Public License |
| ;; along with GCC; see the file COPYING3. If not see |
| ;; <http://www.gnu.org/licenses/>. |
| |
| ;; Sources: BE BOOK4 (/sfs/enc/doc/PPU_BookIV_DD3.0_latest.pdf) |
| |
| ;; BE Architecture *DD3.0 and DD3.1* |
| ;; This file simulates the PPU processor unit backend of pipeline, manual P24. |
| ;; manual P27, stall and flush points |
| ;; IU, XU, VSU, dispatcher decodes and dispatch 2 insns per cycle in program |
| ;; order, the grouped address are aligned by 8 |
| ;; This file only simulates the one-thread situation |
| ;; XU executes all fixed point insns(3 units, a simple alu, a complex unit, |
| ;; and load/store unit) |
| ;; VSU executes all scalar floating points insn(a float unit), |
| ;; VMX insns(VMX unit, 4 sub units, simple, permute, complex, floating point) |
| |
| ;; Dual issue combination |
| |
| ;; FXU LSU BR VMX VMX |
| ;; (sx,cx,vsu_fp,fp_arith) (perm,vsu_ls,fp_ls) |
| ;;FXU X |
| ;;LSU X X X |
| ;;BR X |
| ;;VMX(sx,cx,vsu_fp,fp_arith) X |
| ;;VMX(perm,vsu_ls, fp_ls) X |
| ;; X are illegal combination. |
| |
| ;; Dual issue exceptions: |
| ;;(1) non-pipelined FXU instr in slot 0 |
| ;;(2) non-pipelined FPU inst in slot 0 |
| ;; CSI instr(context-synchronizing insn) |
| ;; Microcode insn |
| |
| ;; BRU unit: bru(none register stall), bru_cr(cr register stall) |
| ;; VSU unit: vus(vmx simple), vup(vmx permute), vuc(vmx complex), |
| ;; vuf(vmx float), fpu(floats). fpu_div is hypothetical, it is for |
| ;; nonpipelined simulation |
| ;; micro insns will stall at least 7 cycles to get the first instr from ROM, |
| ;; micro instructions are not dual issued. |
| |
| ;; slot0 is older than slot1 |
| ;; non-pipelined insn need to be in slot1 to avoid 1cycle stall |
| |
| ;; There are different stall points |
| ;; IB2, only stall one thread if stall here, so try to stall here as much as |
| ;; we can |
| ;; condition(1) insert nop, OR and ORI instruction form |
| ;; condition(2) flush happens, in case of: RAW, WAW, D-ERAT miss, or |
| ;; CR0-access while stdcx, or stwcx |
| ;; IS2 stall ;; Page91 for details |
| ;; VQ8 stall |
| ;; IS2 stall can be activated by VQ8 stall and trying to issue a vsu instr to |
| ;; the vsu issue queue |
| |
| ;;(define_automaton "cellxu") |
| |
| ;;(define_cpu_unit "fxu_cell,lsu_cell,bru_cell,vsu1_cell,vsu2_cell" "cellxu") |
| |
| ;; ndfa |
| ;; Declare the four automata used by this description: cellxu, cellvsu, |
| ;; cellbru and cell_mis. |
| (define_automaton "cellxu,cellvsu,cellbru,cell_mis") |
| |
| ;; Execution units, grouped by the automaton that tracks them: |
| ;; fxu_cell and lsu_cell in cellxu, bru_cell in cellbru, |
| ;; vsu1_cell and vsu2_cell in cellvsu. |
| (define_cpu_unit "fxu_cell,lsu_cell" "cellxu") |
| (define_cpu_unit "bru_cell" "cellbru") |
| (define_cpu_unit "vsu1_cell,vsu2_cell" "cellvsu") |
| |
| ;; The two issue slots of a dispatch group, tracked in cell_mis. |
| (define_cpu_unit "slot0,slot1" "cell_mis") |
| |
| ;; slot0 may be reserved only while slot1 is not reserved. |
| (absence_set "slot0" "slot1") |
| |
| ;; "nonpipeline" reserves fxu_cell, lsu_cell, vsu1_cell and vsu2_cell together |
| ;; in the same cycle; it is used by the non-pipelined insn reservations. |
| (define_reservation "nonpipeline" "fxu_cell+lsu_cell+vsu1_cell+vsu2_cell") |
| ;; "slot01" means either issue slot: slot0 or slot1. |
| (define_reservation "slot01" "slot0|slot1") |
| |
| |
| ;; Load/store |
| ;; lmw, lswi and lswx are only generated when optimizing for space (MC); |
| ;; these instrs are not simulated. |
| ;; Load without sign extension and without update: latency 2, one issue |
| ;; slot in the first cycle, lsu_cell in the following cycle. |
| (define_insn_reservation "cell-load" 2 |
|   (and (eq_attr "cpu" "cell") |
|        (eq_attr "type" "load") |
|        (eq_attr "update" "no") |
|        (eq_attr "sign_extend" "no")) |
|   "slot01,lsu_cell") |
| |
| ;; ldux, ldu, lbzux, lbzu: hardware breaks these down into two instrs; |
| ;; if with 32bytes alignment, CMC. |
| ;; Update-form load without sign extension: latency 2, one issue slot, then |
| ;; fxu_cell and lsu_cell together in the following cycle. |
| (define_insn_reservation "cell-load-ux" 2 |
|   (and (eq_attr "cpu" "cell") |
|        (eq_attr "type" "load") |
|        (eq_attr "update" "yes") |
|        (eq_attr "sign_extend" "no")) |
|   "slot01,fxu_cell+lsu_cell") |
| |
| ;; lha, lhax, lhau, lhaux, lwa, lwax, lwaux: MC, latency unknown |
| ;; 11/7, 11/8, 11/12 |
| ;; Sign-extending load (any update form): modelled with latency 2, one issue |
| ;; slot, then fxu_cell and lsu_cell together in the following cycle. |
| (define_insn_reservation "cell-load-ext" 2 |
|   (and (eq_attr "cpu" "cell") |
|        (eq_attr "type" "load") |
|        (eq_attr "sign_extend" "yes")) |
|   "slot01,fxu_cell+lsu_cell") |
| |
| ;; lfs, lfsx, lfd, lfdx: 1 cycle. |
| ;; Reserves vsu2_cell, lsu_cell and one issue slot in the same cycle. |
| (define_insn_reservation "cell-fpload" 1 |
|   (and (eq_attr "cpu" "cell") |
|        (eq_attr "type" "fpload") |
|        (eq_attr "update" "no")) |
|   "vsu2_cell+lsu_cell+slot01") |
| |
| ;; lfsu, lfsux, lfdu, lfdux: 1 cycle (fpr), 2 cycles (gpr). |
| ;; Modelled with latency 1; reserves fxu_cell, vsu2_cell, lsu_cell and one |
| ;; issue slot in the same cycle. |
| (define_insn_reservation "cell-fpload-update" 1 |
|   (and (eq_attr "cpu" "cell") |
|        (eq_attr "type" "fpload") |
|        (eq_attr "update" "yes")) |
|   "fxu_cell+vsu2_cell+lsu_cell+slot01") |
| |
| ;; Vector load: latency 2; one issue slot, then vsu2_cell and lsu_cell |
| ;; together in the following cycle. |
| (define_insn_reservation "cell-vecload" 2 |
|   (and (eq_attr "cpu" "cell") |
|        (eq_attr "type" "vecload")) |
|   "slot01,vsu2_cell+lsu_cell") |
| |
| ;; st? stw(MC) |
| ;; Store without update: latency 1; lsu_cell and one issue slot in the |
| ;; same cycle. |
| (define_insn_reservation "cell-store" 1 |
|   (and (eq_attr "cpu" "cell") |
|        (eq_attr "type" "store") |
|        (eq_attr "update" "no")) |
|   "lsu_cell+slot01") |
| |
| ;; stdux, stdu (hardware breaks into store and add); 2 for update reg. |
| ;; Modelled with latency 1; fxu_cell, lsu_cell and one issue slot in the |
| ;; same cycle. |
| (define_insn_reservation "cell-store-update" 1 |
|   (and (eq_attr "cpu" "cell") |
|        (eq_attr "type" "store") |
|        (eq_attr "update" "yes")) |
|   "fxu_cell+lsu_cell+slot01") |
| |
| ;; FP store without update: latency 1; vsu2_cell, lsu_cell and one issue |
| ;; slot in the same cycle. |
| (define_insn_reservation "cell-fpstore" 1 |
|   (and (eq_attr "cpu" "cell") |
|        (eq_attr "type" "fpstore") |
|        (eq_attr "update" "no")) |
|   "vsu2_cell+lsu_cell+slot01") |
| |
| ;; FP store with update: latency 1; vsu2_cell, fxu_cell, lsu_cell and one |
| ;; issue slot in the same cycle. |
| (define_insn_reservation "cell-fpstore-update" 1 |
|   (and (eq_attr "cpu" "cell") |
|        (eq_attr "type" "fpstore") |
|        (eq_attr "update" "yes")) |
|   "vsu2_cell+fxu_cell+lsu_cell+slot01") |
| |
| ;; Vector store: latency 1; vsu2_cell, lsu_cell and one issue slot in the |
| ;; same cycle. |
| (define_insn_reservation "cell-vecstore" 1 |
|   (and (eq_attr "cpu" "cell") |
|        (eq_attr "type" "vecstore")) |
|   "vsu2_cell+lsu_cell+slot01") |
| |
| ;; Integer latency is 2 cycles. |
| ;; Matches types integer, trap, cntlz and isel; types add, logical, shift |
| ;; and exts when dot is "no"; and type insert when size is "64". |
| ;; One issue slot, then fxu_cell in the following cycle. |
| (define_insn_reservation "cell-integer" 2 |
|   (and (eq_attr "cpu" "cell") |
|        (ior (and (eq_attr "size" "64") |
|                  (eq_attr "type" "insert")) |
|             (and (eq_attr "dot" "no") |
|                  (eq_attr "type" "add,logical,shift,exts")) |
|             (eq_attr "type" "integer,trap,cntlz,isel"))) |
|   "slot01,fxu_cell") |
| |
| ;; Two integer: latency is 4 cycles. |
| ;; One issue slot, then fxu_cell for the next three cycles. |
| (define_insn_reservation "cell-two" 4 |
|   (and (eq_attr "cpu" "cell") |
|        (eq_attr "type" "two")) |
|   "slot01,fxu_cell,fxu_cell*2") |
| |
| ;; Three integer: latency is 6 cycles. |
| ;; One issue slot, then fxu_cell for the next five cycles. |
| (define_insn_reservation "cell-three" 6 |
|   (and (eq_attr "cpu" "cell") |
|        (eq_attr "type" "three")) |
|   "slot01,fxu_cell,fxu_cell*4") |
| |
| ;; rlwimi, alter cr0. |
| ;; 32-bit insert: latency 2; one issue slot, then fxu_cell in the |
| ;; following cycle. |
| (define_insn_reservation "cell-insert" 2 |
|   (and (eq_attr "cpu" "cell") |
|        (eq_attr "type" "insert") |
|        (eq_attr "size" "32")) |
|   "slot01,fxu_cell") |
| |
| ;; cmpi, cmpli, cmpla, add, addo, sub, subo, alter cr0. |
| ;; Compare: latency 1; fxu_cell and one issue slot in the same cycle. |
| (define_insn_reservation "cell-cmp" 1 |
|   (and (eq_attr "cpu" "cell") |
|        (eq_attr "type" "cmp")) |
|   "fxu_cell+slot01") |
| |
| ;; add, addo, sub, subo, alter cr0, rldcli, rlwinm. |
| ;; add/logical/shift/exts insns with dot "yes" whose cell_micro is "not": |
| ;; latency 2; one issue slot, then fxu_cell in the following cycle. |
| ;; NOTE(review): no reservation here matches these types with dot "yes" and |
| ;; a cell_micro value other than "not" or "always" -- confirm intended. |
| (define_insn_reservation "cell-fast-cmp" 2 |
|   (and (eq_attr "cpu" "cell") |
|        (eq_attr "cell_micro" "not") |
|        (eq_attr "dot" "yes") |
|        (eq_attr "type" "add,logical,shift,exts")) |
|   "slot01,fxu_cell") |
| |
| ;; add/logical/shift/exts insns with dot "yes" whose cell_micro is "always": |
| ;; latency 9.  Takes both issue slots in the first cycle (so it is not dual |
| ;; issued), then fxu_cell for the next eight cycles. |
| (define_insn_reservation "cell-cmp-microcoded" 9 |
|   (and (eq_attr "cpu" "cell") |
|        (eq_attr "cell_micro" "always") |
|        (eq_attr "dot" "yes") |
|        (eq_attr "type" "add,logical,shift,exts")) |
|   "slot0+slot1,fxu_cell,fxu_cell*7") |
| |
| ;; mulld |
| ;; 64-bit multiply with dot "no": latency 15.  Issues in slot1 only, then |
| ;; holds the "nonpipeline" reservation for the next 14 cycles. |
| (define_insn_reservation "cell-lmul" 15 |
|   (and (eq_attr "cpu" "cell") |
|        (eq_attr "type" "mul") |
|        (eq_attr "size" "64") |
|        (eq_attr "dot" "no")) |
|   "slot1,nonpipeline,nonpipeline*13") |
| |
| ;; mulld. is microcoded. |
| ;; 64-bit multiply with dot "yes": latency 22.  Takes both issue slots in |
| ;; the first cycle, then holds "nonpipeline" for the next 21 cycles. |
| (define_insn_reservation "cell-lmul-cmp" 22 |
|   (and (eq_attr "cpu" "cell") |
|        (eq_attr "type" "mul") |
|        (eq_attr "size" "64") |
|        (eq_attr "dot" "yes")) |
|   "slot0+slot1,nonpipeline,nonpipeline*20") |
| |
| ;; mulli, 6 cycles. |
| ;; Multiply with size "8" or "16" (any dot value): issues in slot1 only, |
| ;; then holds "nonpipeline" for the next 5 cycles. |
| (define_insn_reservation "cell-imul23" 6 |
|   (and (eq_attr "cpu" "cell") |
|        (eq_attr "type" "mul") |
|        (eq_attr "size" "8,16")) |
|   "slot1,nonpipeline,nonpipeline*4") |
| |
| ;; mullw, 9 |
| ;; 32-bit multiply with dot "no": latency 9.  Issues in slot1 only, then |
| ;; holds "nonpipeline" for the next 8 cycles. |
| ;; NOTE(review): no reservation here matches type "mul", size "32" with |
| ;; dot "yes" -- confirm whether that is intended. |
| (define_insn_reservation "cell-imul" 9 |
|   (and (eq_attr "cpu" "cell") |
|        (eq_attr "type" "mul") |
|        (eq_attr "size" "32") |
|        (eq_attr "dot" "no")) |
|   "slot1,nonpipeline,nonpipeline*7") |
| |
| ;; divide |
| ;; 32-bit divide: latency 32.  Issues in slot1 only, then holds |
| ;; "nonpipeline" for the next 31 cycles. |
| (define_insn_reservation "cell-idiv" 32 |
|   (and (eq_attr "cpu" "cell") |
|        (eq_attr "type" "div") |
|        (eq_attr "size" "32")) |
|   "slot1,nonpipeline,nonpipeline*30") |
| |
| ;; 64-bit divide: latency 64.  Issues in slot1 only, then holds |
| ;; "nonpipeline" for the next 63 cycles. |
| (define_insn_reservation "cell-ldiv" 64 |
|   (and (eq_attr "cpu" "cell") |
|        (eq_attr "type" "div") |
|        (eq_attr "size" "64")) |
|   "slot1,nonpipeline,nonpipeline*62") |
| |
| ;; mflr and mfctr are pipelined. |
| ;; Latency 1; one issue slot and bru_cell in the same cycle. |
| (define_insn_reservation "cell-mfjmpr" 1 |
|   (and (eq_attr "cpu" "cell") |
|        (eq_attr "type" "mfjmpr")) |
|   "slot01+bru_cell") |
| |
| ;; mtlr and mtctr, |
| ;; mtspr fully pipelined. |
| ;; Latency 1; bru_cell and one issue slot in the same cycle. |
| (define_insn_reservation "cell-mtjmpr" 1 |
|   (and (eq_attr "cpu" "cell") |
|        (eq_attr "type" "mtjmpr")) |
|   "bru_cell+slot01") |
| |
| ;; Branches |
| ;; b, ba, bl, bla: unconditional branch always predicts correctly, n/a latency |
| ;; bcctr, bcctrl: latency 2, actually adjust by be to 4 |
| ;; Modelled with latency 1; bru_cell together with slot1 (never slot0). |
| (define_insn_reservation "cell-branch" 1 |
|   (and (eq_attr "cpu" "cell") |
|        (eq_attr "type" "branch")) |
|   "bru_cell+slot1") |
| |
| ;; Insns of type jmpreg: latency 1; bru_cell together with slot1 |
| ;; (never slot0). |
| (define_insn_reservation "cell-branchreg" 1 |
|   (and (eq_attr "cpu" "cell") |
|        (eq_attr "type" "jmpreg")) |
|   "bru_cell+slot1") |
| |
| ;; cr hazard |
| ;; page 90, special cases for CR hazard: only one instr can access cr per |
| ;; cycle; if an insn reads CR following a stwcx, the pipeline stalls till |
| ;; the stwcx finishes. |
| ;; CR logical: latency 1; bru_cell and one issue slot in the same cycle. |
| (define_insn_reservation "cell-crlogical" 1 |
|   (and (eq_attr "cpu" "cell") |
|        (eq_attr "type" "cr_logical")) |
|   "bru_cell+slot01") |
| |
| ;; mfcrf and mfcr is about 34 cycles and nonpipelined. |
| ;; Issues in slot1 only, then holds "nonpipeline" for the next 33 cycles. |
| (define_insn_reservation "cell-mfcr" 34 |
|   (and (eq_attr "cpu" "cell") |
|        (eq_attr "type" "mfcrf,mfcr")) |
|   "slot1,nonpipeline,nonpipeline*32") |
| |
| ;; mtcrf (1 field) |
| ;; Latency 1; fxu_cell and one issue slot in the same cycle. |
| (define_insn_reservation "cell-mtcrf" 1 |
|   (and (eq_attr "cpu" "cell") |
|        (eq_attr "type" "mtcr")) |
|   "fxu_cell+slot01") |
| |
| ;; Basic FP latency is 10 cycles, throughput is 1/cycle. |
| ;; Types fp, fpsimple and dmul: one issue slot, then vsu1_cell for the |
| ;; next nine cycles. |
| (define_insn_reservation "cell-fp" 10 |
|   (and (eq_attr "cpu" "cell") |
|        (eq_attr "type" "fp,fpsimple,dmul")) |
|   "slot01,vsu1_cell,vsu1_cell*8") |
| |
| ;; FP compare: latency 1; vsu1_cell and one issue slot in the same cycle. |
| (define_insn_reservation "cell-fpcompare" 1 |
|   (and (eq_attr "cpu" "cell") |
|        (eq_attr "type" "fpcompare")) |
|   "vsu1_cell+slot01") |
| |
| ;; sdiv throughput 1/74, not pipelined but only in the FPU. |
| ;; Types sdiv and ddiv: issues in slot1 only, then holds "nonpipeline" for |
| ;; the next 73 cycles. |
| (define_insn_reservation "cell-sdiv" 74 |
|   (and (eq_attr "cpu" "cell") |
|        (eq_attr "type" "sdiv,ddiv")) |
|   "slot1,nonpipeline,nonpipeline*72") |
| |
| ;; fsqrt throughput 1/84, not pipelined but only in the FPU. |
| ;; Types ssqrt and dsqrt: issues in slot1 only, then holds "nonpipeline" |
| ;; for the next 83 cycles. |
| (define_insn_reservation "cell-sqrt" 84 |
|   (and (eq_attr "cpu" "cell") |
|        (eq_attr "type" "ssqrt,dsqrt")) |
|   "slot1,nonpipeline,nonpipeline*82") |
| |
| ;; VMX |
| ;; Types vecsimple, veclogical and vecmove: latency 4; one issue slot, then |
| ;; vsu1_cell for the next three cycles. |
| (define_insn_reservation "cell-vecsimple" 4 |
|   (and (eq_attr "cpu" "cell") |
|        (eq_attr "type" "vecsimple,veclogical,vecmove")) |
|   "slot01,vsu1_cell,vsu1_cell*2") |
| |
| ;; mult, div, madd |
| ;; Type veccomplex: latency 10; one issue slot, then vsu1_cell for the |
| ;; next nine cycles. |
| (define_insn_reservation "cell-veccomplex" 10 |
|   (and (eq_attr "cpu" "cell") |
|        (eq_attr "type" "veccomplex")) |
|   "slot01,vsu1_cell,vsu1_cell*8") |
| |
| ;; TODO: add support for recording instructions |
| ;; Types veccmp and veccmpfx: latency 4; one issue slot, then vsu1_cell |
| ;; for the next three cycles. |
| (define_insn_reservation "cell-veccmp" 4 |
|   (and (eq_attr "cpu" "cell") |
|        (eq_attr "type" "veccmp,veccmpfx")) |
|   "slot01,vsu1_cell,vsu1_cell*2") |
| |
| ;; Type vecfloat: latency 12; one issue slot, then vsu1_cell for the next |
| ;; eleven cycles. |
| (define_insn_reservation "cell-vecfloat" 12 |
|   (and (eq_attr "cpu" "cell") |
|        (eq_attr "type" "vecfloat")) |
|   "slot01,vsu1_cell,vsu1_cell*10") |
| |
| ;; Type vecperm: latency 4; one issue slot, then vsu2_cell for the next |
| ;; three cycles. |
| (define_insn_reservation "cell-vecperm" 4 |
|   (and (eq_attr "cpu" "cell") |
|        (eq_attr "type" "vecperm")) |
|   "slot01,vsu2_cell,vsu2_cell*2") |
| |
| ;; New for 4.2, syncs |
| |
| ;; Type sync: latency 11; one issue slot, then lsu_cell for the next ten |
| ;; cycles. |
| (define_insn_reservation "cell-sync" 11 |
|   (and (eq_attr "cpu" "cell") |
|        (eq_attr "type" "sync")) |
|   "slot01,lsu_cell,lsu_cell*9") |
| |
| ;; Type isync: latency 11; one issue slot, then lsu_cell for the next ten |
| ;; cycles. |
| (define_insn_reservation "cell-isync" 11 |
|   (and (eq_attr "cpu" "cell") |
|        (eq_attr "type" "isync")) |
|   "slot01,lsu_cell,lsu_cell*9") |
| |
| ;; Type load_l: latency 11; one issue slot, then lsu_cell for the next ten |
| ;; cycles. |
| (define_insn_reservation "cell-load_l" 11 |
|   (and (eq_attr "cpu" "cell") |
|        (eq_attr "type" "load_l")) |
|   "slot01,lsu_cell,lsu_cell*9") |
| |
| ;; Type store_c: latency 11; one issue slot, then lsu_cell for the next ten |
| ;; cycles. |
| (define_insn_reservation "cell-store_c" 11 |
|   (and (eq_attr "cpu" "cell") |
|        (eq_attr "type" "store_c")) |
|   "slot01,lsu_cell,lsu_cell*9") |
| |
| ;; RAW register dependency |
| |
| ;; addi r3, r3, 1 |
| ;; lw r4,offset(r3) |
| ;; there is a 5 cycle delay for r3 bypassing |
| ;; there is a 5 cycle delay for a dependent load after a load |
| ;; Each bypass below overrides the producer's default latency with 5 when |
| ;; the consumer is one of the listed load reservations. |
| ;; NOTE(review): cell-load-ux is not listed as producer or consumer here -- |
| ;; confirm whether that is intended. |
| (define_bypass 5 "cell-integer" "cell-load") |
| (define_bypass 5 "cell-integer" "cell-load-ext") |
| (define_bypass 5 "cell-load,cell-load-ext" "cell-load,cell-load-ext") |
| |
| ;; There is a 6 cycle delay after a fp compare until you can use the cr. |
| ;; Applies when the consumer is a branch, jmpreg, mfcr/mfcrf or CR-logical |
| ;; reservation. |
| (define_bypass 6 "cell-fpcompare" "cell-branch,cell-branchreg,cell-mfcr,cell-crlogical") |
| |
| ;; VXU float RAW |
| ;; A cell-vecfloat result consumed by another cell-vecfloat insn uses |
| ;; latency 11 instead of the reservation's default. |
| (define_bypass 11 "cell-vecfloat" "cell-vecfloat") |
| |
| ;; VXU and FPU |
| ;; cell-veccomplex result consumed by cell-vecsimple: latency 6. |
| (define_bypass 6 "cell-veccomplex" "cell-vecsimple") |
| ;;(define_bypass 6 "cell-veccompare" "cell-branch,cell-branchreg") |
| ;; cell-vecfloat result consumed by cell-veccomplex: latency 3. |
| (define_bypass 3 "cell-vecfloat" "cell-veccomplex") |
| ;; This is not correct: |
| ;; this is a stall in general and not dependent on result |
| (define_bypass 13 "cell-vecstore" "cell-fpstore") |
| ;; This is not correct: this can never be true, not dependent on result |
| (define_bypass 7 "cell-fp" "cell-fpload") |
| ;; vsu1 should avoid writing to the same target register as vsu2 insn |
| ;; within 12 cycles. |
| |
| ;; WAW hazard |
| |
| ;; the target of VSU estimate should not be reused within 10 dispatch groups |
| ;; the target of VSU float should not be reused within 8 dispatch groups |
| ;; the target of VSU complex should not be reused within 5 dispatch groups |
| ;; FP LOAD should not reuse an FPU Arithmetic target within 6 dispatch groups |
| |
| ;; mtctr-bcctr/bcctrl: the branch target ctr register shadow is updated at |
| ;; the ex4 stage (10 cycles), so a cell-branchreg insn depending on a |
| ;; cell-mtjmpr result uses latency 10. |
| (define_bypass 10 "cell-mtjmpr" "cell-branchreg") |
| |
| ;;Things are not simulated: |
| ;; update instruction, update address gpr are not simulated |
| ;; vrefp, vrsqrtefp have latency(14), currently simulated as 12 cycle float |
| ;; insns |
| |