| /* Copyright (C) 2006-2022 Free Software Foundation, Inc. |
| |
| This file is free software; you can redistribute it and/or modify it |
| under the terms of the GNU General Public License as published by the |
| Free Software Foundation; either version 3, or (at your option) any |
| later version. |
| |
| This file is distributed in the hope that it will be useful, but |
| WITHOUT ANY WARRANTY; without even the implied warranty of |
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| General Public License for more details. |
| |
| Under Section 7 of GPL version 3, you are granted additional |
| permissions described in the GCC Runtime Library Exception, version |
| 3.1, as published by the Free Software Foundation. |
| |
| You should have received a copy of the GNU General Public License and |
| a copy of the GCC Runtime Library Exception along with this program; |
| see the files COPYING3 and COPYING.RUNTIME respectively. If not, see |
| <http://www.gnu.org/licenses/>. */ |
| |
| /* Moderately Space-optimized libgcc routines for the Renesas SH / |
| STMicroelectronics ST40 CPUs. |
| Contributed by J"orn Rennecke joern.rennecke@st.com. */ |
| |
| #include "lib1funcs.h" |
| |
| #ifdef L_udivsi3_i4i |
| |
| /* 88 bytes; sh4-200 cycle counts: |
| divisor >= 2G: 11 cycles |
| dividend < 2G: 48 cycles |
| dividend >= 2G: divisor != 1: 54 cycles |
| dividend >= 2G, divisor == 1: 22 cycles */ |
| #if defined (__SH_FPU_DOUBLE__) || defined (__SH4_SINGLE_ONLY__) |
| !! args in r4 and r5, result in r0, clobber r1 |
| |
| .global GLOBAL(udivsi3_i4i) |
| FUNC(GLOBAL(udivsi3_i4i)) |
| GLOBAL(udivsi3_i4i): |
| mova L1,r0 |
| cmp/pz r5 |
| sts fpscr,r1 |
| lds.l @r0+,fpscr |
| sts.l fpul,@-r15 |
| bf LOCAL(huge_divisor) |
| mov.l r1,@-r15 |
| lds r4,fpul |
| cmp/pz r4 |
| #ifdef FMOVD_WORKS |
| fmov.d dr0,@-r15 |
| float fpul,dr0 |
| fmov.d dr2,@-r15 |
| bt LOCAL(dividend_adjusted) |
| mov #1,r1 |
| fmov.d @r0,dr2 |
| cmp/eq r1,r5 |
| bt LOCAL(div_by_1) |
| fadd dr2,dr0 |
| LOCAL(dividend_adjusted): |
| lds r5,fpul |
| float fpul,dr2 |
| fdiv dr2,dr0 |
| LOCAL(div_by_1): |
| fmov.d @r15+,dr2 |
| ftrc dr0,fpul |
| fmov.d @r15+,dr0 |
| #else /* !FMOVD_WORKS */ |
| fmov.s DR01,@-r15 |
| mov #1,r1 |
| fmov.s DR00,@-r15 |
| float fpul,dr0 |
| fmov.s DR21,@-r15 |
| bt/s LOCAL(dividend_adjusted) |
| fmov.s DR20,@-r15 |
| cmp/eq r1,r5 |
| bt LOCAL(div_by_1) |
| fmov.s @r0+,DR20 |
| fmov.s @r0,DR21 |
| fadd dr2,dr0 |
| LOCAL(dividend_adjusted): |
| lds r5,fpul |
| float fpul,dr2 |
| fdiv dr2,dr0 |
| LOCAL(div_by_1): |
| fmov.s @r15+,DR20 |
| fmov.s @r15+,DR21 |
| ftrc dr0,fpul |
| fmov.s @r15+,DR00 |
| fmov.s @r15+,DR01 |
| #endif /* !FMOVD_WORKS */ |
| lds.l @r15+,fpscr |
| sts fpul,r0 |
| rts |
| lds.l @r15+,fpul |
| |
| #ifdef FMOVD_WORKS |
| .p2align 3 ! make double below 8 byte aligned. |
| #endif |
| LOCAL(huge_divisor): |
| lds r1,fpscr |
| add #4,r15 |
| cmp/hs r5,r4 |
| rts |
| movt r0 |
| |
| .p2align 2 |
| L1: |
| #ifndef FMOVD_WORKS |
| .long 0x80000 |
| #else |
| .long 0x180000 |
| #endif |
| .double 4294967296 |
| |
| ENDFUNC(GLOBAL(udivsi3_i4i)) |
| #elif !defined (__sh1__) /* !__SH_FPU_DOUBLE__ */ |
| |
| #if 0 |
| /* With 36 bytes, the following would probably be the most compact |
| implementation, but with 139 cycles on an sh4-200, it is extremely slow. */ |
| GLOBAL(udivsi3_i4i): |
| mov.l r2,@-r15 |
| mov #0,r1 |
| div0u |
| mov r1,r2 |
| mov.l r3,@-r15 |
| mov r1,r3 |
| sett |
| mov r4,r0 |
| LOCAL(loop): |
| rotcr r2 |
| ; |
| bt/s LOCAL(end) |
| cmp/gt r2,r3 |
| rotcl r0 |
| bra LOCAL(loop) |
| div1 r5,r1 |
| LOCAL(end): |
| rotcl r0 |
| mov.l @r15+,r3 |
| rts |
| mov.l @r15+,r2 |
| #endif /* 0 */ |
| |
| /* Size: 186 bytes jointly for udivsi3_i4i and sdivsi3_i4i |
| sh4-200 run times: |
| udiv small divisor: 55 cycles |
| udiv large divisor: 52 cycles |
| sdiv small divisor, positive result: 59 cycles |
| sdiv large divisor, positive result: 56 cycles |
| sdiv small divisor, negative result: 65 cycles (*) |
| sdiv large divisor, negative result: 62 cycles (*) |
| (*): r2 is restored in the rts delay slot and has a lingering latency |
| of two more cycles. */ |
| .balign 4 |
| .global GLOBAL(udivsi3_i4i) |
| FUNC(GLOBAL(udivsi3_i4i)) |
| FUNC(GLOBAL(sdivsi3_i4i)) |
| GLOBAL(udivsi3_i4i): |
| sts pr,r1 |
| mov.l r4,@-r15 |
| extu.w r5,r0 |
| cmp/eq r5,r0 |
| swap.w r4,r0 |
| shlr16 r4 |
| bf/s LOCAL(large_divisor) |
| div0u |
| mov.l r5,@-r15 |
| shll16 r5 |
| LOCAL(sdiv_small_divisor): |
| div1 r5,r4 |
| bsr LOCAL(div6) |
| div1 r5,r4 |
| div1 r5,r4 |
| bsr LOCAL(div6) |
| div1 r5,r4 |
| xtrct r4,r0 |
| xtrct r0,r4 |
| bsr LOCAL(div7) |
| swap.w r4,r4 |
| div1 r5,r4 |
| bsr LOCAL(div7) |
| div1 r5,r4 |
| xtrct r4,r0 |
| mov.l @r15+,r5 |
| swap.w r0,r0 |
| mov.l @r15+,r4 |
| jmp @r1 |
| rotcl r0 |
| LOCAL(div7): |
| div1 r5,r4 |
| LOCAL(div6): |
| div1 r5,r4; div1 r5,r4; div1 r5,r4 |
| div1 r5,r4; div1 r5,r4; rts; div1 r5,r4 |
| |
| LOCAL(divx3): |
| rotcl r0 |
| div1 r5,r4 |
| rotcl r0 |
| div1 r5,r4 |
| rotcl r0 |
| rts |
| div1 r5,r4 |
| |
| LOCAL(large_divisor): |
| mov.l r5,@-r15 |
| LOCAL(sdiv_large_divisor): |
| xor r4,r0 |
| .rept 4 |
| rotcl r0 |
| bsr LOCAL(divx3) |
| div1 r5,r4 |
| .endr |
| mov.l @r15+,r5 |
| mov.l @r15+,r4 |
| jmp @r1 |
| rotcl r0 |
| ENDFUNC(GLOBAL(udivsi3_i4i)) |
| |
| .global GLOBAL(sdivsi3_i4i) |
| GLOBAL(sdivsi3_i4i): |
| mov.l r4,@-r15 |
| cmp/pz r5 |
| mov.l r5,@-r15 |
| bt/s LOCAL(pos_divisor) |
| cmp/pz r4 |
| neg r5,r5 |
| extu.w r5,r0 |
| bt/s LOCAL(neg_result) |
| cmp/eq r5,r0 |
| neg r4,r4 |
| LOCAL(pos_result): |
| swap.w r4,r0 |
| bra LOCAL(sdiv_check_divisor) |
| sts pr,r1 |
| LOCAL(pos_divisor): |
| extu.w r5,r0 |
| bt/s LOCAL(pos_result) |
| cmp/eq r5,r0 |
| neg r4,r4 |
| LOCAL(neg_result): |
| mova LOCAL(negate_result),r0 |
| ; |
| mov r0,r1 |
| swap.w r4,r0 |
| lds r2,macl |
| sts pr,r2 |
| LOCAL(sdiv_check_divisor): |
| shlr16 r4 |
| bf/s LOCAL(sdiv_large_divisor) |
| div0u |
| bra LOCAL(sdiv_small_divisor) |
| shll16 r5 |
| .balign 4 |
| LOCAL(negate_result): |
| neg r0,r0 |
| jmp @r2 |
| sts macl,r2 |
| ENDFUNC(GLOBAL(sdivsi3_i4i)) |
| #endif /* !__SH_FPU_DOUBLE__ */ |
| #endif /* L_udivsi3_i4i */ |
| |
| #ifdef L_sdivsi3_i4i |
| #if defined (__SH_FPU_DOUBLE__) || defined (__SH4_SINGLE_ONLY__) |
| /* 48 bytes, 45 cycles on sh4-200 */ |
| !! args in r4 and r5, result in r0, clobber r1 |
| |
| .global GLOBAL(sdivsi3_i4i) |
| FUNC(GLOBAL(sdivsi3_i4i)) |
| GLOBAL(sdivsi3_i4i): |
| sts.l fpscr,@-r15 |
| sts fpul,r1 |
| mova L1,r0 |
| lds.l @r0+,fpscr |
| lds r4,fpul |
| #ifdef FMOVD_WORKS |
| fmov.d dr0,@-r15 |
| float fpul,dr0 |
| lds r5,fpul |
| fmov.d dr2,@-r15 |
| #else |
| fmov.s DR01,@-r15 |
| fmov.s DR00,@-r15 |
| float fpul,dr0 |
| lds r5,fpul |
| fmov.s DR21,@-r15 |
| fmov.s DR20,@-r15 |
| #endif |
| float fpul,dr2 |
| fdiv dr2,dr0 |
| #ifdef FMOVD_WORKS |
| fmov.d @r15+,dr2 |
| #else |
| fmov.s @r15+,DR20 |
| fmov.s @r15+,DR21 |
| #endif |
| ftrc dr0,fpul |
| #ifdef FMOVD_WORKS |
| fmov.d @r15+,dr0 |
| #else |
| fmov.s @r15+,DR00 |
| fmov.s @r15+,DR01 |
| #endif |
| lds.l @r15+,fpscr |
| sts fpul,r0 |
| rts |
| lds r1,fpul |
| |
| .p2align 2 |
| L1: |
| #ifndef FMOVD_WORKS |
| .long 0x80000 |
| #else |
| .long 0x180000 |
| #endif |
| |
| ENDFUNC(GLOBAL(sdivsi3_i4i)) |
| #endif /* __SH_FPU_DOUBLE__ */ |
| #endif /* L_sdivsi3_i4i */ |