;; Sources: BE BOOK4 (/sfs/enc/doc/PPU_BookIV_DD3.0_latest.pdf)

;; BE Architecture DD3.0 and DD3.1
;; This file simulates the PPU processor unit backend of the pipeline,
;; manual P24.
;; manual P27, stall and flush points
;; IU, XU, VSU, dispatcher decodes and dispatches 2 insns per cycle in program
;;  order, the grouped addresses are aligned by 8
;; This file only simulates the single-thread situation
;; XU executes all fixed point insns (3 units, a simple alu, a complex unit,
;;   and load/store unit)
;; VSU executes all scalar floating point insns (a float unit),
;;   VMX insns (VMX unit, 4 sub units, simple, permute, complex, floating point)

;; Dual issue combination

;; FXU LSU BR VMX VMX
;; (sx,cx,vsu_fp,fp_arith) (perm,vsu_ls,fp_ls)
;;FXU X
;;LSU X X X
;;BR X
;;VMX(sx,cx,vsu_fp,fp_arith) X
;;VMX(perm,vsu_ls,fp_ls) X
;; X marks an illegal combination.

;; Dual issue exceptions:
;;(1) non-pipelined FXU instr in slot 0
;;(2) non-pipelined FPU instr in slot 0
;; CSI instr (context-synchronizing insn)
;; Microcode insn

;; BRU unit: bru(none register stall), bru_cr(cr register stall)
;; VSU unit: vus(vmx simple), vup(vmx permute), vuc(vmx complex),
;;   vuf(vmx float), fpu(floats).  fpu_div is hypothetical, it is for
;;   nonpipelined simulation
;; microcoded insns will stall at least 7 cycles to get the first instr from
;;   ROM; micro instructions are not dual issued.

;; slot0 is older than slot1 ;; non-pipelined insn need to be in slot1 to avoid 1cycle stall

;; There are different stall points
;; IB2, only stalls one thread if the stall is here, so try to stall here as
;;   much as we can
;; condition(1) insert nop, OR and ORI instruction form
;; condition(2) flush happens, in case of: RAW, WAW, D-ERAT miss, or
;;   CR0-access while stdcx, or stwcx
;; IS2 stall ;; Page91 for details
;; VQ8 stall
;; IS2 stall can be activated by VQ8 stall and trying to issue a vsu instr to
;;   the vsu issue queue

;;(define_automaton “cellxu”)

;;(define_cpu_unit “fxu_cell,lsu_cell,bru_cell,vsu1_cell,vsu2_cell” “cellxu”)

;; ndfa
;; Declare the four automata that the CPU units of this description are
;; assigned to.  This form must sit on its own line: placed after ";;" it is
;; part of the comment and the automata are never declared.
(define_automaton "cellxu,cellvsu,cellbru,cell_mis")

;; Execution units, grouped by automaton:
;;   cellxu  - fxu_cell, lsu_cell
;;   cellbru - bru_cell
;;   cellvsu - vsu1_cell, vsu2_cell
(define_cpu_unit "fxu_cell,lsu_cell" "cellxu")
(define_cpu_unit "bru_cell" "cellbru")
(define_cpu_unit "vsu1_cell,vsu2_cell" "cellvsu")

;; The two issue slots of a dispatch group, kept in their own automaton.
(define_cpu_unit "slot0,slot1" "cell_mis")

;; slot0 may only be reserved on a cycle where slot1 is not already reserved.
(absence_set "slot0" "slot1")

;; "nonpipeline" occupies the FXU, LSU and both VSU units at once; it is used
;; by the reservations of non-pipelined insns.
(define_reservation "nonpipeline" "fxu_cell+lsu_cell+vsu1_cell+vsu2_cell")

;; "slot01" means either issue slot.
(define_reservation "slot01" "slot0|slot1")

;; Load/store
;; lmw, lswi, lswx are only generated when optimizing for space, MC,
;; these instrs are not simulated

;; Plain load (no sign extension, no update): latency 2; an issue slot on
;; the first cycle, then the LSU.
(define_insn_reservation "cell-load" 2
  (and (eq_attr "type" "load")
       (eq_attr "sign_extend" "no")
       (eq_attr "update" "no")
       (eq_attr "cpu" "cell"))
  "slot01,lsu_cell")

;; ldux, ldu, lbzux, lbzu, hardware breaks it down to two instrs,
;;  if with 32bytes alignment, CMC
;; Update-form load without sign extension: latency 2; an issue slot, then
;; FXU and LSU together.
(define_insn_reservation "cell-load-ux" 2
  (and (eq_attr "type" "load")
       (eq_attr "sign_extend" "no")
       (eq_attr "update" "yes")
       (eq_attr "cpu" "cell"))
  "slot01,fxu_cell+lsu_cell")

;; lha, lhax, lhau, lhaux, lwa, lwax, lwaux, MC, latency unknown
;;   11/7, 11/8, 11/12
;; Sign-extending load (any update form): modelled with latency 2; an issue
;; slot, then FXU and LSU together.
(define_insn_reservation "cell-load-ext" 2
  (and (eq_attr "type" "load")
       (eq_attr "sign_extend" "yes")
       (eq_attr "cpu" "cell"))
  "slot01,fxu_cell+lsu_cell")

;; lfs, lfsx, lfd, lfdx, 1 cycle
;; Non-update FP load: latency 1; vsu2, LSU and an issue slot on the same
;; cycle.
(define_insn_reservation "cell-fpload" 1
  (and (eq_attr "type" "fpload")
       (eq_attr "update" "no")
       (eq_attr "cpu" "cell"))
  "vsu2_cell+lsu_cell+slot01")

;; lfsu, lfsux, lfdu, lfdux: 1 cycle (fpr), 2 cycles (gpr)
;; Update-form FP load: latency 1; additionally occupies the FXU on the
;; issue cycle.
(define_insn_reservation "cell-fpload-update" 1
  (and (eq_attr "type" "fpload")
       (eq_attr "update" "yes")
       (eq_attr "cpu" "cell"))
  "fxu_cell+vsu2_cell+lsu_cell+slot01")

;; Vector load: latency 2; an issue slot, then vsu2 and LSU together.
(define_insn_reservation "cell-vecload" 2
  (and (eq_attr "type" "vecload")
       (eq_attr "cpu" "cell"))
  "slot01,vsu2_cell+lsu_cell")

;; st? stw(MC)
;; Non-update store: latency 1; LSU and an issue slot on the same cycle.
(define_insn_reservation "cell-store" 1
  (and (eq_attr "type" "store")
       (eq_attr "update" "no")
       (eq_attr "cpu" "cell"))
  "lsu_cell+slot01")

;; stdux, stdu, (hardware breaks into store and add) 2 for update reg
;; Update-form store: latency 1; FXU, LSU and an issue slot on the same
;; cycle.
(define_insn_reservation "cell-store-update" 1
  (and (eq_attr "type" "store")
       (eq_attr "update" "yes")
       (eq_attr "cpu" "cell"))
  "fxu_cell+lsu_cell+slot01")

;; Non-update FP store: latency 1; vsu2, LSU and an issue slot on the same
;; cycle.
(define_insn_reservation "cell-fpstore" 1
  (and (eq_attr "type" "fpstore")
       (eq_attr "update" "no")
       (eq_attr "cpu" "cell"))
  "vsu2_cell+lsu_cell+slot01")

;; Update-form FP store: latency 1; additionally occupies the FXU on the
;; issue cycle.
(define_insn_reservation "cell-fpstore-update" 1
  (and (eq_attr "type" "fpstore")
       (eq_attr "update" "yes")
       (eq_attr "cpu" "cell"))
  "vsu2_cell+fxu_cell+lsu_cell+slot01")

;; Vector store: latency 1; vsu2, LSU and an issue slot on the same cycle.
(define_insn_reservation "cell-vecstore" 1
  (and (eq_attr "type" "vecstore")
       (eq_attr "cpu" "cell"))
  "vsu2_cell+lsu_cell+slot01")

;; Integer latency is 2 cycles
;; Covers integer/trap/cntlz/isel, the non-recording ("dot" = "no") forms of
;; add/logical/shift/exts, and 64-bit insert.
(define_insn_reservation "cell-integer" 2
  (and (ior (eq_attr "type" "integer,trap,cntlz,isel")
            (and (eq_attr "type" "add,logical,shift,exts")
                 (eq_attr "dot" "no"))
            (and (eq_attr "type" "insert")
                 (eq_attr "size" "64")))
       (eq_attr "cpu" "cell"))
  "slot01,fxu_cell")

;; Two integer latency is 4 cycles
;; An issue slot, then the FXU for three consecutive cycles.
(define_insn_reservation "cell-two" 4
  (and (eq_attr "type" "two")
       (eq_attr "cpu" "cell"))
  "slot01,fxu_cell,fxu_cell*2")

;; Three integer latency is 6 cycles
;; An issue slot, then the FXU for five consecutive cycles.
(define_insn_reservation "cell-three" 6
  (and (eq_attr "type" "three")
       (eq_attr "cpu" "cell"))
  "slot01,fxu_cell,fxu_cell*4")

;; rlwimi, alter cr0
;; 32-bit insert: latency 2; an issue slot, then the FXU.
(define_insn_reservation "cell-insert" 2
  (and (eq_attr "type" "insert")
       (eq_attr "size" "32")
       (eq_attr "cpu" "cell"))
  "slot01,fxu_cell")

;; cmpi, cmpli, cmpla, add, addo, sub, subo, alter cr0
;; Compare: latency 1; FXU and an issue slot on the same cycle.
(define_insn_reservation "cell-cmp" 1
  (and (eq_attr "type" "cmp")
       (eq_attr "cpu" "cell"))
  "fxu_cell+slot01")

;; add, addo, sub, subo, alter cr0, rldcli, rlwinm
;; Recording ("dot" = "yes") add/logical/shift/exts that is not microcoded
;; ("cell_micro" = "not"): latency 2; an issue slot, then the FXU.
(define_insn_reservation "cell-fast-cmp" 2
  (and (eq_attr "type" "add,logical,shift,exts")
       (eq_attr "dot" "yes")
       (eq_attr "cpu" "cell")
       (eq_attr "cell_micro" "not"))
  "slot01,fxu_cell")

;; Recording add/logical/shift/exts that is always microcoded
;; ("cell_micro" = "always"): latency 9.  Takes both issue slots (so it is
;; not dual issued), then the FXU for eight consecutive cycles.
(define_insn_reservation "cell-cmp-microcoded" 9
  (and (eq_attr "type" "add,logical,shift,exts")
       (eq_attr "dot" "yes")
       (eq_attr "cpu" "cell")
       (eq_attr "cell_micro" "always"))
  "slot0+slot1,fxu_cell,fxu_cell*7")

;; mulld
;; 64-bit non-recording multiply: latency 15; issued in slot1, then holds
;; the "nonpipeline" units for 14 cycles.
(define_insn_reservation "cell-lmul" 15
  (and (eq_attr "type" "mul")
       (eq_attr "dot" "no")
       (eq_attr "size" "64")
       (eq_attr "cpu" "cell"))
  "slot1,nonpipeline,nonpipeline*13")

;; mulld. is microcoded
;; 64-bit recording multiply: latency 22; takes both issue slots, then holds
;; the "nonpipeline" units for 21 cycles.
(define_insn_reservation "cell-lmul-cmp" 22
  (and (eq_attr "type" "mul")
       (eq_attr "dot" "yes")
       (eq_attr "size" "64")
       (eq_attr "cpu" "cell"))
  "slot0+slot1,nonpipeline,nonpipeline*20")

;; mulli, 6 cycles
;; Multiply with "size" 8 or 16: issued in slot1, then holds the
;; "nonpipeline" units for 5 cycles.
(define_insn_reservation "cell-imul23" 6
  (and (eq_attr "type" "mul")
       (eq_attr "size" "8,16")
       (eq_attr "cpu" "cell"))
  "slot1,nonpipeline,nonpipeline*4")

;; mullw, 9
;; 32-bit non-recording multiply: issued in slot1, then holds the
;; "nonpipeline" units for 8 cycles.
(define_insn_reservation "cell-imul" 9
  (and (eq_attr "type" "mul")
       (eq_attr "dot" "no")
       (eq_attr "size" "32")
       (eq_attr "cpu" "cell"))
  "slot1,nonpipeline,nonpipeline*7")

;; divide
;; 32-bit divide: latency 32; issued in slot1, then holds the "nonpipeline"
;; units for 31 cycles.
(define_insn_reservation "cell-idiv" 32
  (and (eq_attr "type" "div")
       (eq_attr "size" "32")
       (eq_attr "cpu" "cell"))
  "slot1,nonpipeline,nonpipeline*30")

;; 64-bit divide: latency 64; issued in slot1, then holds the "nonpipeline"
;; units for 63 cycles.
(define_insn_reservation "cell-ldiv" 64
  (and (eq_attr "type" "div")
       (eq_attr "size" "64")
       (eq_attr "cpu" "cell"))
  "slot1,nonpipeline,nonpipeline*62")

;; mflr and mfctr are pipelined
;; Latency 1; an issue slot and the branch unit on the same cycle.
(define_insn_reservation "cell-mfjmpr" 1
  (and (eq_attr "type" "mfjmpr")
       (eq_attr "cpu" "cell"))
  "slot01+bru_cell")

;; mtlr and mtctr,
;; mtspr fully pipelined
;; Latency 1; the branch unit and an issue slot on the same cycle.
(define_insn_reservation "cell-mtjmpr" 1
  (and (eq_attr "type" "mtjmpr")
       (eq_attr "cpu" "cell"))
  "bru_cell+slot01")

;; Branches
;; b, ba, bl, bla, unconditional branch always predicts correctly n/a latency
;; bcctr, bcctrl, latency 2, actually adjust by be to 4
;; Branches are only issued in slot1, together with the branch unit.
(define_insn_reservation "cell-branch" 1
  (and (eq_attr "type" "branch")
       (eq_attr "cpu" "cell"))
  "bru_cell+slot1")

;; Branch through a register (type "jmpreg"): latency 1; only issued in
;; slot1, together with the branch unit.
(define_insn_reservation "cell-branchreg" 1
  (and (eq_attr "type" "jmpreg")
       (eq_attr "cpu" "cell"))
  "bru_cell+slot1")

;; cr hazard
;; page 90, special cases for CR hazard, only one instr can access cr per cycle
;; if insn reads CR following a stwcx, pipeline stall till stwcx finish
;; CR logical: latency 1; the branch unit and an issue slot on the same
;; cycle.
(define_insn_reservation "cell-crlogical" 1
  (and (eq_attr "type" "cr_logical")
       (eq_attr "cpu" "cell"))
  "bru_cell+slot01")

;; mfcrf and mfcr is about 34 cycles and nonpipelined
;; Issued in slot1, then holds the "nonpipeline" units for 33 cycles.
(define_insn_reservation "cell-mfcr" 34
  (and (eq_attr "type" "mfcrf,mfcr")
       (eq_attr "cpu" "cell"))
  "slot1,nonpipeline,nonpipeline*32")

;; mtcrf (1 field)
;; Latency 1; FXU and an issue slot on the same cycle.
(define_insn_reservation "cell-mtcrf" 1
  (and (eq_attr "type" "mtcr")
       (eq_attr "cpu" "cell"))
  "fxu_cell+slot01")

; Basic FP latency is 10 cycles, throughput is 1/cycle
;; An issue slot, then vsu1 for nine consecutive cycles.
(define_insn_reservation "cell-fp" 10
  (and (eq_attr "type" "fp,fpsimple,dmul")
       (eq_attr "cpu" "cell"))
  "slot01,vsu1_cell,vsu1_cell*8")

;; FP compare: latency 1; vsu1 and an issue slot on the same cycle.
(define_insn_reservation "cell-fpcompare" 1
  (and (eq_attr "type" "fpcompare")
       (eq_attr "cpu" "cell"))
  "vsu1_cell+slot01")

;; sdiv throughput 1/74, not pipelined but only in the FPU
;; Modelled for both sdiv and ddiv: issued in slot1, then holds the
;; "nonpipeline" units for 73 cycles.
(define_insn_reservation "cell-sdiv" 74
  (and (eq_attr "type" "sdiv,ddiv")
       (eq_attr "cpu" "cell"))
  "slot1,nonpipeline,nonpipeline*72")

;; fsqrt throughput 1/84, not pipelined but only in the FPU
;; Modelled for both ssqrt and dsqrt: issued in slot1, then holds the
;; "nonpipeline" units for 83 cycles.
(define_insn_reservation "cell-sqrt" 84
  (and (eq_attr "type" "ssqrt,dsqrt")
       (eq_attr "cpu" "cell"))
  "slot1,nonpipeline,nonpipeline*82")

; VMX
;; Simple/logical/move vector insns: latency 4; an issue slot, then vsu1 for
;; three consecutive cycles.
(define_insn_reservation "cell-vecsimple" 4
  (and (eq_attr "type" "vecsimple,veclogical,vecmove")
       (eq_attr "cpu" "cell"))
  "slot01,vsu1_cell,vsu1_cell*2")

;; mult, div, madd
;; Complex vector insns: latency 10; an issue slot, then vsu1 for nine
;; consecutive cycles.
(define_insn_reservation "cell-veccomplex" 10
  (and (eq_attr "type" "veccomplex")
       (eq_attr "cpu" "cell"))
  "slot01,vsu1_cell,vsu1_cell*8")

;; TODO: add support for recording instructions
;; Vector compares: latency 4; an issue slot, then vsu1 for three
;; consecutive cycles.
(define_insn_reservation "cell-veccmp" 4
  (and (eq_attr "type" "veccmp,veccmpfx")
       (eq_attr "cpu" "cell"))
  "slot01,vsu1_cell,vsu1_cell*2")

;; Vector float: latency 12; an issue slot, then vsu1 for eleven consecutive
;; cycles.
(define_insn_reservation "cell-vecfloat" 12
  (and (eq_attr "type" "vecfloat")
       (eq_attr "cpu" "cell"))
  "slot01,vsu1_cell,vsu1_cell*10")

;; Vector permute: latency 4; an issue slot, then vsu2 for three consecutive
;; cycles.
(define_insn_reservation "cell-vecperm" 4
  (and (eq_attr "type" "vecperm")
       (eq_attr "cpu" "cell"))
  "slot01,vsu2_cell,vsu2_cell*2")

;; New for 4.2, syncs

;; sync: latency 11; an issue slot, then the LSU for ten consecutive cycles.
(define_insn_reservation "cell-sync" 11
  (and (eq_attr "type" "sync")
       (eq_attr "cpu" "cell"))
  "slot01,lsu_cell,lsu_cell*9")

;; isync: latency 11; an issue slot, then the LSU for ten consecutive cycles.
(define_insn_reservation "cell-isync" 11
  (and (eq_attr "type" "isync")
       (eq_attr "cpu" "cell"))
  "slot01,lsu_cell,lsu_cell*9")

;; Insns of type "load_l": latency 11; an issue slot, then the LSU for ten
;; consecutive cycles.
(define_insn_reservation "cell-load_l" 11
  (and (eq_attr "type" "load_l")
       (eq_attr "cpu" "cell"))
  "slot01,lsu_cell,lsu_cell*9")

;; Insns of type "store_c": latency 11; an issue slot, then the LSU for ten
;; consecutive cycles.
(define_insn_reservation "cell-store_c" 11
  (and (eq_attr "type" "store_c")
       (eq_attr "cpu" "cell"))
  "slot01,lsu_cell,lsu_cell*9")

;; RAW register dependency

;; addi r3, r3, 1
;; lw r4,offset(r3)
;; there is a 5 cycle delay for r3 bypassing
;; there is a 5 cycle delay for a dependent load after a load
(define_bypass 5 "cell-integer" "cell-load")
(define_bypass 5 "cell-integer" "cell-load-ext")
(define_bypass 5 "cell-load,cell-load-ext" "cell-load,cell-load-ext")

;; there is a 6 cycle delay after a fp compare until you can use the cr.
(define_bypass 6 "cell-fpcompare"
	       "cell-branch,cell-branchreg,cell-mfcr,cell-crlogical")

;; VXU float RAW
;; A vecfloat result consumed by another vecfloat is available after 11
;; cycles instead of the default 12.
(define_bypass 11 "cell-vecfloat" "cell-vecfloat")

;; VXU and FPU
(define_bypass 6 "cell-veccomplex" "cell-vecsimple")
;; Disabled; note that no reservation named "cell-veccompare" exists in this
;; file (the vector compare reservation is "cell-veccmp").
;;(define_bypass 6 "cell-veccompare" "cell-branch,cell-branchreg")
(define_bypass 3 "cell-vecfloat" "cell-veccomplex")
; this is not correct,
;;  this is a stall in general and not dependent on result
(define_bypass 13 "cell-vecstore" "cell-fpstore")
; this is not correct, this can never be true, not dependent on result
(define_bypass 7 "cell-fp" "cell-fpload")
;; vsu1 should avoid writing to the same target register as vsu2 insn
;;   within 12 cycles.

;; WAW hazard

;; the target of VSU estimate should not be reused within 10 dispatch groups
;; the target of VSU float should not be reused within 8 dispatch groups
;; the target of VSU complex should not be reused within 5 dispatch groups
;; FP LOAD should not reuse an FPU Arithmetic target within 6 dispatch groups

;; mtctr-bcctr/bcctrl, branch target ctr register shadow update at
;;  ex4 stage(10 cycles)
(define_bypass 10 "cell-mtjmpr" "cell-branchreg")

;;Things are not simulated: ;; update instruction, update address gpr are not simulated ;; vrefp, vrsqrtefp have latency(14), currently simulated as 12 cycle float ;; insns