| /* Copyright (C) 2008-2021 Free Software Foundation, Inc. |
| Contributor: Joern Rennecke <joern.rennecke@embecosm.com> |
| on behalf of Synopsys Inc. |
| |
| This file is part of GCC. |
| |
| GCC is free software; you can redistribute it and/or modify it under |
| the terms of the GNU General Public License as published by the Free |
| Software Foundation; either version 3, or (at your option) any later |
| version. |
| |
| GCC is distributed in the hope that it will be useful, but WITHOUT ANY |
| WARRANTY; without even the implied warranty of MERCHANTABILITY or |
| FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
| for more details. |
| |
| Under Section 7 of GPL version 3, you are granted additional |
| permissions described in the GCC Runtime Library Exception, version |
| 3.1, as published by the Free Software Foundation. |
| |
| You should have received a copy of the GNU General Public License and |
| a copy of the GCC Runtime Library Exception along with this program; |
| see the files COPYING3 and COPYING.RUNTIME respectively. If not, see |
| <http://www.gnu.org/licenses/>. */ |
| |
| /* |
| to calculate a := b/x as b*y, with y := 1/x: |
| - x is in the range [1..2) |
| - calculate 15..18 bit inverse y0 using a table of approximating polynoms. |
| Precision is higher for polynoms used to evaluate input with larger |
| value. |
| - Do one newton-raphson iteration step to double the precision, |
| then multiply this with the divisor |
| -> more time to decide if dividend is subnormal |
| - the worst error propagation is on the side of the value range |
| with the least initial defect, thus giving us about 30 bits precision. |
| The truncation error for the either is less than 1 + x/2 ulp. |
| A 31 bit inverse can be simply calculated by using x with implicit 1 |
| and chaining the multiplies. For a 32 bit inverse, we multiply y0^2 |
| with the bare fraction part of x, then add in y0^2 for the implicit |
| 1 of x. |
| - If calculating a 31 bit inverse, the systematic error is less than |
| -1 ulp; likewise, for 32 bit, it is less than -2 ulp. |
| - If we calculate our seed with a 32 bit fraction, we can archive a |
| tentative result strictly better than -2 / +2.5 (1) ulp/128, i.e. we |
| only need to take the step to calculate the 2nd stage rest and |
| rounding adjust 1/32th of the time. However, if we use a 20 bit |
| fraction for the seed, the negative error can exceed -2 ulp/128, (2) |
| thus for a simple add / tst check, we need to do the 2nd stage |
| rest calculation/ rounding adjust 1/16th of the time. |
| (1): The inexactness of the 32 bit inverse contributes an error in the |
| range of (-1 .. +(1+x/2) ) ulp/128. Leaving out the low word of the |
| rest contributes an error < +1/x ulp/128 . In the interval [1,2), |
| x/2 + 1/x <= 1.5 . |
| (2): Unless proven otherwise. I have not actually looked for an |
| example where -2 ulp/128 is exceeded, and my calculations indicate |
| that the excess, if existent, is less than -1/512 ulp. |
| ??? The algorithm is still based on the ARC700 optimized code. |
| Maybe we could make better use of 32x16 bit multiply, or 64 bit multiply |
| results. |
| */ |
| #include "../arc-ieee-754.h" |
| #define mlo acc2 |
| #define mhi acc1 |
| #define mul64(b,c) mullw 0,b,c` machlw 0,b,c |
| #define mulu64(b,c) mululw 0,b,c` machulw 0,b,c |
| |
| /* N.B. fp-bit.c does double rounding on denormal numbers. */ |
| #if 0 /* DEBUG */ |
| .global __divdf3 |
| FUNC(__divdf3) |
| .balign 4 |
| __divdf3: |
| push_s blink |
| push_s r2 |
| push_s r3 |
| push_s r0 |
| bl.d __divdf3_c |
| push_s r1 |
| ld_s r2,[sp,12] |
| ld_s r3,[sp,8] |
| st_s r0,[sp,12] |
| st_s r1,[sp,8] |
| pop_s r1 |
| bl.d __divdf3_asm |
| pop_s r0 |
| pop_s r3 |
| pop_s r2 |
| pop_s blink |
| cmp r0,r2 |
| cmp.eq r1,r3 |
| jeq_s [blink] |
| and r12,DBL0H,DBL1H |
| bic.f 0,0x7ff80000,r12 ; both NaN -> OK |
| jeq_s [blink] |
| bl abort |
| ENDFUNC(__divdf3) |
| #define __divdf3 __divdf3_asm |
| #endif /* DEBUG */ |
| |
| FUNC(__divdf3) |
| .balign 4 |
| .L7ff00000: |
| .long 0x7ff00000 |
| .Ldivtab: |
| .long 0xfc0fffe1 |
| .long 0xf46ffdfb |
| .long 0xed1ffa54 |
| .long 0xe61ff515 |
| .long 0xdf7fee75 |
| .long 0xd91fe680 |
| .long 0xd2ffdd52 |
| .long 0xcd1fd30c |
| .long 0xc77fc7cd |
| .long 0xc21fbbb6 |
| .long 0xbcefaec0 |
| .long 0xb7efa100 |
| .long 0xb32f92bf |
| .long 0xae8f83b7 |
| .long 0xaa2f7467 |
| .long 0xa5ef6479 |
| .long 0xa1cf53fa |
| .long 0x9ddf433e |
| .long 0x9a0f3216 |
| .long 0x965f2091 |
| .long 0x92df0f11 |
| .long 0x8f6efd05 |
| .long 0x8c1eeacc |
| .long 0x88eed876 |
| .long 0x85dec615 |
| .long 0x82eeb3b9 |
| .long 0x800ea10b |
| .long 0x7d3e8e0f |
| .long 0x7a8e7b3f |
| .long 0x77ee6836 |
| .long 0x756e5576 |
| .long 0x72fe4293 |
| .long 0x709e2f93 |
| .long 0x6e4e1c7f |
| .long 0x6c0e095e |
| .long 0x69edf6c5 |
| .long 0x67cde3a5 |
| .long 0x65cdd125 |
| .long 0x63cdbe25 |
| .long 0x61ddab3f |
| .long 0x600d991f |
| .long 0x5e3d868c |
| .long 0x5c6d7384 |
| .long 0x5abd615f |
| .long 0x590d4ecd |
| .long 0x576d3c83 |
| .long 0x55dd2a89 |
| .long 0x545d18e9 |
| .long 0x52dd06e9 |
| .long 0x516cf54e |
| .long 0x4ffce356 |
| .long 0x4e9cd1ce |
| .long 0x4d3cbfec |
| .long 0x4becae86 |
| .long 0x4aac9da4 |
| .long 0x496c8c73 |
| .long 0x483c7bd3 |
| .long 0x470c6ae8 |
| .long 0x45dc59af |
| .long 0x44bc4915 |
| .long 0x43ac3924 |
| .long 0x428c27fb |
| .long 0x418c187a |
| .long 0x407c07bd |
| |
| __divdf3_support: /* This label makes debugger output saner. */ |
| .balign 4 |
| .Ldenorm_dbl1: |
| brge r6, \ |
| 0x43500000,.Linf_NaN ; large number / denorm -> Inf |
| bmsk.f r12,DBL1H,19 |
| mov.eq r12,DBL1L |
| mov.eq DBL1L,0 |
| sub.eq r7,r7,32 |
| norm.f r11,r12 ; flag for x/0 -> Inf check |
| beq_s .Linf_NaN |
| mov.mi r11,0 |
| add.pl r11,r11,1 |
| add_s r12,r12,r12 |
| asl r8,r12,r11 |
| rsub r12,r11,31 |
| lsr r12,DBL1L,r12 |
| tst_s DBL1H,DBL1H |
| or r8,r8,r12 |
| lsr r4,r8,26 |
| lsr DBL1H,r8,12 |
| ld.as r4,[r10,r4] |
| bxor.mi DBL1H,DBL1H,31 |
| sub r11,r11,11 |
| asl DBL1L,DBL1L,r11 |
| sub r11,r11,1 |
| mulu64 (r4,r8) |
| sub r7,r7,r11 |
| b.d .Lpast_denorm_dbl1 |
| asl r7,r7,20 |
| |
| .Linf_NaN: |
| tst_s DBL0L,DBL0L ; 0/0 -> NaN |
| xor_s DBL1H,DBL1H,DBL0H |
| bclr.eq.f DBL0H,DBL0H,31 |
| bmsk DBL0H,DBL1H,30 |
| xor_s DBL0H,DBL0H,DBL1H |
| sub.eq DBL0H,DBL0H,1 |
| mov_s DBL0L,0 |
| j_s.d [blink] |
| or DBL0H,DBL0H,r9 |
| .balign 4 |
| .Lret0_2: |
| xor_s DBL1H,DBL1H,DBL0H |
| mov_s DBL0L,0 |
| bmsk DBL0H,DBL1H,30 |
| j_s.d [blink] |
| xor_s DBL0H,DBL0H,DBL1H |
| .balign 4 |
| .global __divdf3 |
| /* N.B. the spacing between divtab and the sub3 to get its address must |
| be a multiple of 8. */ |
| __divdf3: |
| asl r8,DBL1H,12 |
| lsr r4,r8,26 |
| sub3 r10,pcl,51;(.-.Ldivtab) >> 3 |
| ld.as r9,[pcl,-104]; [pcl,(-((.-.L7ff00000) >> 2))] ; 0x7ff00000 |
| ld.as r4,[r10,r4] |
| lsr r12,DBL1L,20 |
| and.f r7,DBL1H,r9 |
| or r8,r8,r12 |
| mulu64 (r4,r8) |
| beq.d .Ldenorm_dbl1 |
| .Lpast_denorm_dbl1: |
| and.f r6,DBL0H,r9 |
| breq.d r7,r9,.Linf_nan_dbl1 |
| asl r4,r4,12 |
| sub r4,r4,mhi |
| mululw 0,r4,r4 |
| machulw r5,r4,r4 |
| bne.d .Lnormal_dbl0 |
| lsr r8,r8,1 |
| |
| .balign 4 |
| .Ldenorm_dbl0: |
| bmsk.f r12,DBL0H,19 |
| ; wb stall |
| mov.eq r12,DBL0L |
| sub.eq r6,r6,32 |
| norm.f r11,r12 ; flag for 0/x -> 0 check |
| brge r7, \ |
| 0x43500000, .Lret0_2 ; denorm/large number -> 0 |
| beq_s .Lret0_2 |
| mov.mi r11,0 |
| add.pl r11,r11,1 |
| asl r12,r12,r11 |
| sub r6,r6,r11 |
| add.f 0,r6,31 |
| lsr r10,DBL0L,r6 |
| mov.mi r10,0 |
| add r6,r6,11+32 |
| neg.f r11,r6 |
| asl DBL0L,DBL0L,r11 |
| mov.pl DBL0L,0 |
| sub r6,r6,32-1 |
| b.d .Lpast_denorm_dbl0 |
| asl r6,r6,20 |
| |
| .balign 4 |
| .Linf_nan_dbl1: ; 0/Inf -> NaN Inf/Inf -> NaN x/Inf-> 0 x/NaN -> NaN |
| or.f 0,r6,DBL0L |
| cmp.ne r6,r9 |
| not_s DBL0L,DBL1H |
| sub_s.ne DBL0L,DBL0L,DBL0L |
| tst_s DBL0H,DBL0H |
| add_s DBL0H,DBL1H,DBL0L |
| j_s.d [blink] |
| bxor.mi DBL0H,DBL0H,31 |
| |
| .balign 4 |
| .Lnormal_dbl0: |
| breq.d r6,r9,.Linf_nan_dbl0 |
| asl r12,DBL0H,11 |
| lsr r10,DBL0L,21 |
| .Lpast_denorm_dbl0: |
| bset r8,r8,31 |
| mulu64 (r5,r8) |
| add_s r12,r12,r10 |
| bset r5,r12,31 |
| cmp r5,r8 |
| cmp.eq DBL0L,DBL1L |
| lsr.cc r5,r5,1 |
| sub r4,r4,mhi ; u1.31 inverse, about 30 bit |
| mululw 0,r5,r4 |
| machulw r11,r5,r4 ; result fraction highpart |
| lsr r8,r8,2 ; u3.29 |
| add r5,r6, /* wait for immediate */ \ |
| 0x3fe00000 |
| mulu64 (r11,r8) ; u-28.31 |
| asl_s DBL1L,DBL1L,9 ; u-29.23:9 |
| sbc r6,r5,r7 |
| mov r12,mlo ; u-28.31 |
| mulu64 (r11,DBL1L) ; mhi: u-28.23:9 |
| add.cs DBL0L,DBL0L,DBL0L |
| asl_s DBL0L,DBL0L,6 ; u-26.25:7 |
| asl r10,r11,23 |
| sub_l DBL0L,DBL0L,r12 |
| lsr r7,r11,9 |
| sub r5,DBL0L,mhi ; rest msw ; u-26.31:0 |
| mul64 (r5,r4) ; mhi: result fraction lowpart |
| xor.f 0,DBL0H,DBL1H |
| and DBL0H,r6,r9 |
| add_s DBL0H,DBL0H,r7 |
| bclr r12,r9,20 ; 0x7fe00000 |
| brhs.d r6,r12,.Linf_denorm |
| bxor.mi DBL0H,DBL0H,31 |
| add.f r12,mhi,0x11 |
| asr r9,r12,5 |
| sub.mi DBL0H,DBL0H,1 |
| add.f DBL0L,r9,r10 |
| tst r12,0x1c |
| jne.d [blink] |
| add.cs DBL0H,DBL0H,1 |
| /* work out exact rounding if we fall through here. */ |
| /* We know that the exact result cannot be represented in double |
| precision. Find the mid-point between the two nearest |
| representable values, multiply with the divisor, and check if |
| the result is larger than the dividend. Since we want to know |
| only the sign bit, it is sufficient to calculate only the |
| highpart of the lower 64 bits. */ |
| mulu64 (r11,DBL1L) ; rest before considering r12 in r5 : -mlo |
| sub.f DBL0L,DBL0L,1 |
| asl r12,r9,2 ; u-22.30:2 |
| sub.cs DBL0H,DBL0H,1 |
| sub.f r12,r12,2 |
| mov r10,mlo ; rest before considering r12 in r5 : -r10 |
| mululw 0,r12,DBL1L |
| machulw r7,r12,DBL1L ; mhi: u-51.32 |
| asl r5,r5,25 ; s-51.7:25 |
| lsr r10,r10,7 ; u-51.30:2 |
| mulu64 (r12,r8) ; mlo: u-51.31:1 |
| sub r5,r5,r10 |
| add.mi r5,r5,DBL1L ; signed multiply adjust for r12*DBL1L |
| bset r7,r7,0 ; make sure that the result is not zero, and that |
| sub r5,r5,r7 ; a highpart zero appears negative |
| sub.f r5,r5,mlo ; rest msw |
| add.pl.f DBL0L,DBL0L,1 |
| j_s.d [blink] |
| add.eq DBL0H,DBL0H,1 |
| |
| .Linf_nan_dbl0: |
| tst_s DBL1H,DBL1H |
| j_s.d [blink] |
| bxor.mi DBL0H,DBL0H,31 |
| .balign 4 |
| .Linf_denorm: |
| lsr r12,r6,28 |
| brlo.d r12,0xc,.Linf |
| .Ldenorm: |
| asr r6,r6,20 |
| neg r9,r6 |
| mov_s DBL0H,0 |
| brhs.d r9,54,.Lret0 |
| bxor.mi DBL0H,DBL0H,31 |
| add r12,mhi,1 |
| and r12,r12,-4 |
| rsub r7,r6,5 |
| asr r10,r12,28 |
| bmsk r4,r12,27 |
| min r7,r7,31 |
| asr DBL0L,r4,r7 |
| add DBL1H,r11,r10 |
| abs.f r10,r4 |
| sub.mi r10,r10,1 |
| add.f r7,r6,32-5 |
| asl r4,r4,r7 |
| mov.mi r4,r10 |
| add.f r10,r6,23 |
| rsub r7,r6,9 |
| lsr r7,DBL1H,r7 |
| asl r10,DBL1H,r10 |
| or.pnz DBL0H,DBL0H,r7 |
| or.mi r4,r4,r10 |
| mov.mi r10,r7 |
| add.f DBL0L,r10,DBL0L |
| add.cs.f DBL0H,DBL0H,1 ; carry clear after this point |
| bxor.f 0,r4,31 |
| add.pnz.f DBL0L,DBL0L,1 |
| add.cs.f DBL0H,DBL0H,1 |
| jne_s [blink] |
| /* Calculation so far was not conclusive; calculate further rest. */ |
| mulu64 (r11,DBL1L) ; rest before considering r12 in r5 : -mlo |
| asr.f r12,r12,3 |
| asl r5,r5,25 ; s-51.7:25 |
| mov r11,mlo ; rest before considering r12 in r5 : -r11 |
| mulu64 (r12,r8) ; u-51.31:1 |
| and r9,DBL0L,1 ; tie-breaker: round to even |
| lsr r11,r11,7 ; u-51.30:2 |
| mov DBL1H,mlo ; u-51.31:1 |
| mulu64 (r12,DBL1L) ; u-51.62:2 |
| sub.mi r11,r11,DBL1L ; signed multiply adjust for r12*DBL1L |
| add_s DBL1H,DBL1H,r11 |
| sub DBL1H,DBL1H,r5 ; -rest msw |
| add_s DBL1H,DBL1H,mhi ; -rest msw |
| add.f 0,DBL1H,DBL1H ; can't ror.f by 32 :-( |
| tst_s DBL1H,DBL1H |
| cmp.eq mlo,r9 |
| add.cs.f DBL0L,DBL0L,1 |
| j_s.d [blink] |
| add.cs DBL0H,DBL0H,1 |
| |
| .Lret0: |
| /* return +- 0 */ |
| j_s.d [blink] |
| mov_s DBL0L,0 |
| .Linf: |
| mov_s DBL0H,r9 |
| mov_s DBL0L,0 |
| j_s.d [blink] |
| bxor.mi DBL0H,DBL0H,31 |
| ENDFUNC(__divdf3) |