libgcc/config/arc/ieee-754/arc600-dsp/divdf3.S - gcc - Git at Google

 /* Copyright (C) 2008-2023 Free Software Foundation, Inc.
    Contributor: Joern Rennecke <joern.rennecke@embecosm.com>
 		on behalf of Synopsys Inc.

 This file is part of GCC.

 GCC is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free
 Software Foundation; either version 3, or (at your option) any later
 version.

 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
 WARRANTY; without even the implied warranty of MERCHANTABILITY or
 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 for more details.

 Under Section 7 of GPL version 3, you are granted additional
 permissions described in the GCC Runtime Library Exception, version
 3.1, as published by the Free Software Foundation.

 You should have received a copy of the GNU General Public License and
 a copy of the GCC Runtime Library Exception along with this program;
 see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
 <http://www.gnu.org/licenses/>.  */

 /*
    to calculate a := b/x as b*y, with y := 1/x:
    - x is in the range [1..2)
    - calculate 15..18 bit inverse y0 using a table of approximating polynoms.
      Precision is higher for polynoms used to evaluate input with larger
      value.
    - Do one newton-raphson iteration step to double the precision,
      then multiply this with the divisor
 	-> more time to decide if dividend is subnormal
      - the worst error propagation is on the side of the value range
        with the least initial defect, thus giving us about 30 bits precision.
       The truncation error for the either is less than 1 + x/2 ulp.
       A 31 bit inverse can be simply calculated by using x with implicit 1
       and chaining the multiplies.  For a 32 bit inverse, we multiply y0^2
       with the bare fraction part of x, then add in y0^2 for the implicit
       1 of x.
     - If calculating a 31 bit inverse, the systematic error is less than
       -1 ulp; likewise, for 32 bit, it is less than -2 ulp.
     - If we calculate our seed with a 32 bit fraction, we can archive a
       tentative result strictly better than -2 / +2.5 (1) ulp/128, i.e. we
       only need to take the step to calculate the 2nd stage rest and
       rounding adjust 1/32th of the time.  However, if we use a 20 bit
       fraction for the seed, the negative error can exceed -2 ulp/128, (2)
       thus for a simple add / tst check, we need to do the 2nd stage
       rest calculation/ rounding adjust 1/16th of the time.
       (1): The inexactness of the 32 bit inverse contributes an error in the
       range of (-1 .. +(1+x/2) ) ulp/128.  Leaving out the low word of the
       rest contributes an error < +1/x ulp/128 .  In the interval [1,2),
       x/2 + 1/x <= 1.5 .
       (2): Unless proven otherwise.  I have not actually looked for an
       example where -2 ulp/128 is exceeded, and my calculations indicate
       that the excess, if existent, is less than -1/512 ulp.
     ??? The algorithm is still based on the ARC700 optimized code.
     Maybe we could make better use of 32x16 bit multiply, or 64 bit multiply
     results.
  */
 #include "../arc-ieee-754.h"
 #define mlo acc2
 #define mhi acc1
 #define mul64(b,c) mullw 0,b,c` machlw 0,b,c
 #define mulu64(b,c) mululw 0,b,c` machulw 0,b,c

 /* N.B. fp-bit.c does double rounding on denormal numbers.  */
 #if 0 /* DEBUG */
 	.global __divdf3
 	FUNC(__divdf3)
 	.balign 4
 __divdf3:
 	push_s blink
 	push_s r2
 	push_s r3
 	push_s r0
 	bl.d __divdf3_c
 	push_s r1
 	ld_s r2,[sp,12]
 	ld_s r3,[sp,8]
 	st_s r0,[sp,12]
 	st_s r1,[sp,8]
 	pop_s r1
 	bl.d __divdf3_asm
 	pop_s r0
 	pop_s r3
 	pop_s r2
 	pop_s blink
 	cmp r0,r2
 	cmp.eq r1,r3
 	jeq_s [blink]
 	and r12,DBL0H,DBL1H
 	bic.f 0,0x7ff80000,r12 ; both NaN -> OK
 	jeq_s [blink]
 	bl abort
 	ENDFUNC(__divdf3)
 #define __divdf3 __divdf3_asm
 #endif /* DEBUG */

 	FUNC(__divdf3)
 	.balign 4
 .L7ff00000:
 	.long 0x7ff00000
 .Ldivtab:
 	.long 0xfc0fffe1
 	.long 0xf46ffdfb
 	.long 0xed1ffa54
 	.long 0xe61ff515
 	.long 0xdf7fee75
 	.long 0xd91fe680
 	.long 0xd2ffdd52
 	.long 0xcd1fd30c
 	.long 0xc77fc7cd
 	.long 0xc21fbbb6
 	.long 0xbcefaec0
 	.long 0xb7efa100
 	.long 0xb32f92bf
 	.long 0xae8f83b7
 	.long 0xaa2f7467
 	.long 0xa5ef6479
 	.long 0xa1cf53fa
 	.long 0x9ddf433e
 	.long 0x9a0f3216
 	.long 0x965f2091
 	.long 0x92df0f11
 	.long 0x8f6efd05
 	.long 0x8c1eeacc
 	.long 0x88eed876
 	.long 0x85dec615
 	.long 0x82eeb3b9
 	.long 0x800ea10b
 	.long 0x7d3e8e0f
 	.long 0x7a8e7b3f
 	.long 0x77ee6836
 	.long 0x756e5576
 	.long 0x72fe4293
 	.long 0x709e2f93
 	.long 0x6e4e1c7f
 	.long 0x6c0e095e
 	.long 0x69edf6c5
 	.long 0x67cde3a5
 	.long 0x65cdd125
 	.long 0x63cdbe25
 	.long 0x61ddab3f
 	.long 0x600d991f
 	.long 0x5e3d868c
 	.long 0x5c6d7384
 	.long 0x5abd615f
 	.long 0x590d4ecd
 	.long 0x576d3c83
 	.long 0x55dd2a89
 	.long 0x545d18e9
 	.long 0x52dd06e9
 	.long 0x516cf54e
 	.long 0x4ffce356
 	.long 0x4e9cd1ce
 	.long 0x4d3cbfec
 	.long 0x4becae86
 	.long 0x4aac9da4
 	.long 0x496c8c73
 	.long 0x483c7bd3
 	.long 0x470c6ae8
 	.long 0x45dc59af
 	.long 0x44bc4915
 	.long 0x43ac3924
 	.long 0x428c27fb
 	.long 0x418c187a
 	.long 0x407c07bd

 __divdf3_support: /* This label makes debugger output saner.  */
 	.balign 4
 .Ldenorm_dbl1:
 	brge r6, \
 		0x43500000,.Linf_NaN ; large number / denorm -> Inf
 	bmsk.f r12,DBL1H,19
 	mov.eq r12,DBL1L
 	mov.eq DBL1L,0
 	sub.eq r7,r7,32
 	norm.f r11,r12 ; flag for x/0 -> Inf check
 	beq_s .Linf_NaN
 	mov.mi r11,0
 	add.pl r11,r11,1
 	add_s r12,r12,r12
 	asl r8,r12,r11
 	rsub r12,r11,31
 	lsr r12,DBL1L,r12
 	tst_s DBL1H,DBL1H
 	or r8,r8,r12
 	lsr r4,r8,26
 	lsr DBL1H,r8,12
 	ld.as r4,[r10,r4]
 	bxor.mi DBL1H,DBL1H,31
 	sub r11,r11,11
 	asl DBL1L,DBL1L,r11
 	sub r11,r11,1
 	mulu64 (r4,r8)
 	sub r7,r7,r11
 	b.d .Lpast_denorm_dbl1
 	asl r7,r7,20

 .Linf_NaN:
 	tst_s DBL0L,DBL0L ; 0/0 -> NaN
 	xor_s DBL1H,DBL1H,DBL0H
 	bclr.eq.f DBL0H,DBL0H,31
 	bmsk DBL0H,DBL1H,30
 	xor_s DBL0H,DBL0H,DBL1H
 	sub.eq DBL0H,DBL0H,1
 	mov_s DBL0L,0
 	j_s.d [blink]
 	or DBL0H,DBL0H,r9
 	.balign 4
 .Lret0_2:
 	xor_s DBL1H,DBL1H,DBL0H
 	mov_s DBL0L,0
 	bmsk DBL0H,DBL1H,30
 	j_s.d [blink]
 	xor_s DBL0H,DBL0H,DBL1H
 	.balign 4
 	.global __divdf3
 /* N.B. the spacing between divtab and the sub3 to get its address must
    be a multiple of 8.  */
 __divdf3:
 	asl r8,DBL1H,12
 	lsr r4,r8,26
 	sub3 r10,pcl,51;(.-.Ldivtab) >> 3
 	ld.as r9,[pcl,-104]; [pcl,(-((.-.L7ff00000) >> 2))] ; 0x7ff00000
 	ld.as r4,[r10,r4]
 	lsr r12,DBL1L,20
 	and.f r7,DBL1H,r9
 	or r8,r8,r12
 	mulu64 (r4,r8)
 	beq.d .Ldenorm_dbl1
 .Lpast_denorm_dbl1:
 	and.f r6,DBL0H,r9
 	breq.d r7,r9,.Linf_nan_dbl1
 	asl r4,r4,12
 	sub r4,r4,mhi
 	mululw 0,r4,r4
 	machulw r5,r4,r4
 	bne.d .Lnormal_dbl0
 	lsr r8,r8,1

 	.balign 4
 .Ldenorm_dbl0:
 	bmsk.f r12,DBL0H,19
 	; wb stall
 	mov.eq r12,DBL0L
 	sub.eq r6,r6,32
 	norm.f r11,r12 ; flag for 0/x -> 0 check
 	brge r7, \
 		0x43500000, .Lret0_2 ; denorm/large number -> 0
 	beq_s .Lret0_2
 	mov.mi r11,0
 	add.pl r11,r11,1
 	asl r12,r12,r11
 	sub r6,r6,r11
 	add.f 0,r6,31
 	lsr r10,DBL0L,r6
 	mov.mi r10,0
 	add r6,r6,11+32
 	neg.f r11,r6
 	asl DBL0L,DBL0L,r11
 	mov.pl DBL0L,0
 	sub r6,r6,32-1
 	b.d .Lpast_denorm_dbl0
 	asl r6,r6,20

 	.balign 4
 .Linf_nan_dbl1: ; 0/Inf -> NaN Inf/Inf -> NaN x/Inf-> 0 x/NaN -> NaN
 	or.f 0,r6,DBL0L
 	cmp.ne r6,r9
 	not_s DBL0L,DBL1H
 	sub_s.ne DBL0L,DBL0L,DBL0L
 	tst_s DBL0H,DBL0H
 	add_s DBL0H,DBL1H,DBL0L
 	j_s.d [blink]
 	bxor.mi DBL0H,DBL0H,31

 	.balign 4
 .Lnormal_dbl0:
 	breq.d r6,r9,.Linf_nan_dbl0
 	asl r12,DBL0H,11
 	lsr r10,DBL0L,21
 .Lpast_denorm_dbl0:
 	bset r8,r8,31
 	mulu64 (r5,r8)
 	add_s r12,r12,r10
 	bset r5,r12,31
 	cmp r5,r8
 	cmp.eq DBL0L,DBL1L
 	lsr.cc r5,r5,1
 	sub r4,r4,mhi ; u1.31 inverse, about 30 bit
 	mululw 0,r5,r4
 	machulw r11,r5,r4 ; result fraction highpart
 	lsr r8,r8,2 ; u3.29
 	add r5,r6, /* wait for immediate */ \
 		0x3fe00000
 	mulu64 (r11,r8) ; u-28.31
 	asl_s DBL1L,DBL1L,9 ; u-29.23:9
 	sbc r6,r5,r7
 	mov r12,mlo ; u-28.31
 	mulu64 (r11,DBL1L) ; mhi: u-28.23:9
 	add.cs DBL0L,DBL0L,DBL0L
 	asl_s DBL0L,DBL0L,6 ; u-26.25:7
 	asl r10,r11,23
 	sub_l DBL0L,DBL0L,r12
 	lsr r7,r11,9
 	sub r5,DBL0L,mhi ; rest msw ; u-26.31:0
 	mul64 (r5,r4) ; mhi: result fraction lowpart
 	xor.f 0,DBL0H,DBL1H
 	and DBL0H,r6,r9
 	add_s DBL0H,DBL0H,r7
 	bclr r12,r9,20 ; 0x7fe00000
 	brhs.d r6,r12,.Linf_denorm
 	bxor.mi DBL0H,DBL0H,31
 	add.f r12,mhi,0x11
 	asr r9,r12,5
 	sub.mi DBL0H,DBL0H,1
 	add.f DBL0L,r9,r10
 	tst r12,0x1c
 	jne.d [blink]
 	add.cs DBL0H,DBL0H,1
         /* work out exact rounding if we fall through here.  */
         /* We know that the exact result cannot be represented in double
            precision.  Find the mid-point between the two nearest
            representable values, multiply with the divisor, and check if
            the result is larger than the dividend.  Since we want to know
 	   only the sign bit, it is sufficient to calculate only the
 	   highpart of the lower 64 bits.  */
 	mulu64 (r11,DBL1L) ; rest before considering r12 in r5 : -mlo
 	sub.f DBL0L,DBL0L,1
 	asl r12,r9,2 ; u-22.30:2
 	sub.cs DBL0H,DBL0H,1
 	sub.f r12,r12,2
 	mov r10,mlo ; rest before considering r12 in r5 : -r10
 	mululw 0,r12,DBL1L
 	machulw r7,r12,DBL1L ; mhi: u-51.32
 	asl r5,r5,25 ; s-51.7:25
 	lsr r10,r10,7 ; u-51.30:2
 	mulu64 (r12,r8) ; mlo: u-51.31:1
 	sub r5,r5,r10
 	add.mi r5,r5,DBL1L ; signed multiply adjust for r12*DBL1L
 	bset r7,r7,0 ; make sure that the result is not zero, and that
 	sub r5,r5,r7 ; a highpart zero appears negative
 	sub.f r5,r5,mlo ; rest msw
 	add.pl.f DBL0L,DBL0L,1
 	j_s.d [blink]
 	add.eq DBL0H,DBL0H,1

 .Linf_nan_dbl0:
 	tst_s DBL1H,DBL1H
 	j_s.d [blink]
 	bxor.mi DBL0H,DBL0H,31
 	.balign 4
 .Linf_denorm:
 	lsr r12,r6,28
 	brlo.d r12,0xc,.Linf
 .Ldenorm:
 	asr r6,r6,20
 	neg r9,r6
 	mov_s DBL0H,0
 	brhs.d r9,54,.Lret0
 	bxor.mi DBL0H,DBL0H,31
 	add r12,mhi,1
 	and r12,r12,-4
 	rsub r7,r6,5
 	asr r10,r12,28
 	bmsk r4,r12,27
 	min r7,r7,31
 	asr DBL0L,r4,r7
 	add DBL1H,r11,r10
 	abs.f r10,r4
 	sub.mi r10,r10,1
 	add.f r7,r6,32-5
 	asl r4,r4,r7
 	mov.mi r4,r10
 	add.f r10,r6,23
 	rsub r7,r6,9
 	lsr r7,DBL1H,r7
 	asl r10,DBL1H,r10
 	or.pnz DBL0H,DBL0H,r7
 	or.mi r4,r4,r10
 	mov.mi r10,r7
 	add.f DBL0L,r10,DBL0L
 	add.cs.f DBL0H,DBL0H,1 ; carry clear after this point
 	bxor.f 0,r4,31
 	add.pnz.f DBL0L,DBL0L,1
 	add.cs.f DBL0H,DBL0H,1
 	jne_s [blink]
 	/* Calculation so far was not conclusive; calculate further rest.  */
 	mulu64 (r11,DBL1L) ; rest before considering r12 in r5 : -mlo
 	asr.f r12,r12,3
 	asl r5,r5,25 ; s-51.7:25
 	mov r11,mlo ; rest before considering r12 in r5 : -r11
 	mulu64 (r12,r8) ; u-51.31:1
 	and r9,DBL0L,1 ; tie-breaker: round to even
 	lsr r11,r11,7 ; u-51.30:2
 	mov DBL1H,mlo ; u-51.31:1
 	mulu64 (r12,DBL1L) ; u-51.62:2
 	sub.mi r11,r11,DBL1L ; signed multiply adjust for r12*DBL1L
 	add_s DBL1H,DBL1H,r11
 	sub DBL1H,DBL1H,r5 ; -rest msw
 	add_s DBL1H,DBL1H,mhi ; -rest msw
 	add.f 0,DBL1H,DBL1H ; can't ror.f by 32 :-(
 	tst_s DBL1H,DBL1H
 	cmp.eq mlo,r9
 	add.cs.f DBL0L,DBL0L,1
 	j_s.d [blink]
 	add.cs DBL0H,DBL0H,1

 .Lret0:
 	/* return +- 0 */
 	j_s.d [blink]
 	mov_s DBL0L,0
 .Linf:
 	mov_s DBL0H,r9
 	mov_s DBL0L,0
 	j_s.d [blink]
 	bxor.mi DBL0H,DBL0H,31
 	ENDFUNC(__divdf3)
	/* Copyright (C) 2008-2023 Free Software Foundation, Inc.
	Contributor: Joern Rennecke <joern.rennecke@embecosm.com>
	on behalf of Synopsys Inc.

	This file is part of GCC.

	GCC is free software; you can redistribute it and/or modify it under
	the terms of the GNU General Public License as published by the Free
	Software Foundation; either version 3, or (at your option) any later
	version.

	GCC is distributed in the hope that it will be useful, but WITHOUT ANY
	WARRANTY; without even the implied warranty of MERCHANTABILITY or
	FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
	for more details.

	Under Section 7 of GPL version 3, you are granted additional
	permissions described in the GCC Runtime Library Exception, version
	3.1, as published by the Free Software Foundation.

	You should have received a copy of the GNU General Public License and
	a copy of the GCC Runtime Library Exception along with this program;
	see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
	<http://www.gnu.org/licenses/>. */

	/*
	to calculate a := b/x as b*y, with y := 1/x:
	- x is in the range [1..2)
	- calculate 15..18 bit inverse y0 using a table of approximating polynoms.
	Precision is higher for polynoms used to evaluate input with larger
	value.
	- Do one newton-raphson iteration step to double the precision,
	then multiply this with the divisor
	-> more time to decide if dividend is subnormal
	- the worst error propagation is on the side of the value range
	with the least initial defect, thus giving us about 30 bits precision.
	The truncation error for the either is less than 1 + x/2 ulp.
	A 31 bit inverse can be simply calculated by using x with implicit 1
	and chaining the multiplies. For a 32 bit inverse, we multiply y0^2
	with the bare fraction part of x, then add in y0^2 for the implicit
	1 of x.
	- If calculating a 31 bit inverse, the systematic error is less than
	-1 ulp; likewise, for 32 bit, it is less than -2 ulp.
	- If we calculate our seed with a 32 bit fraction, we can archive a
	tentative result strictly better than -2 / +2.5 (1) ulp/128, i.e. we
	only need to take the step to calculate the 2nd stage rest and
	rounding adjust 1/32th of the time. However, if we use a 20 bit
	fraction for the seed, the negative error can exceed -2 ulp/128, (2)
	thus for a simple add / tst check, we need to do the 2nd stage
	rest calculation/ rounding adjust 1/16th of the time.
	(1): The inexactness of the 32 bit inverse contributes an error in the
	range of (-1 .. +(1+x/2) ) ulp/128. Leaving out the low word of the
	rest contributes an error < +1/x ulp/128 . In the interval [1,2),
	x/2 + 1/x <= 1.5 .
	(2): Unless proven otherwise. I have not actually looked for an
	example where -2 ulp/128 is exceeded, and my calculations indicate
	that the excess, if existent, is less than -1/512 ulp.
	??? The algorithm is still based on the ARC700 optimized code.
	Maybe we could make better use of 32x16 bit multiply, or 64 bit multiply
	results.
	*/
	#include "../arc-ieee-754.h"
	#define mlo acc2
	#define mhi acc1
	#define mul64(b,c) mullw 0,b,c` machlw 0,b,c
	#define mulu64(b,c) mululw 0,b,c` machulw 0,b,c

	/* N.B. fp-bit.c does double rounding on denormal numbers. */
	#if 0 /* DEBUG */
	.global __divdf3
	FUNC(__divdf3)
	.balign 4
	__divdf3:
	push_s blink
	push_s r2
	push_s r3
	push_s r0
	bl.d __divdf3_c
	push_s r1
	ld_s r2,[sp,12]
	ld_s r3,[sp,8]
	st_s r0,[sp,12]
	st_s r1,[sp,8]
	pop_s r1
	bl.d __divdf3_asm
	pop_s r0
	pop_s r3
	pop_s r2
	pop_s blink
	cmp r0,r2
	cmp.eq r1,r3
	jeq_s [blink]
	and r12,DBL0H,DBL1H
	bic.f 0,0x7ff80000,r12 ; both NaN -> OK
	jeq_s [blink]
	bl abort
	ENDFUNC(__divdf3)
	#define __divdf3 __divdf3_asm
	#endif /* DEBUG */

	FUNC(__divdf3)
	.balign 4
	.L7ff00000:
	.long 0x7ff00000
	.Ldivtab:
	.long 0xfc0fffe1
	.long 0xf46ffdfb
	.long 0xed1ffa54
	.long 0xe61ff515
	.long 0xdf7fee75
	.long 0xd91fe680
	.long 0xd2ffdd52
	.long 0xcd1fd30c
	.long 0xc77fc7cd
	.long 0xc21fbbb6
	.long 0xbcefaec0
	.long 0xb7efa100
	.long 0xb32f92bf
	.long 0xae8f83b7
	.long 0xaa2f7467
	.long 0xa5ef6479
	.long 0xa1cf53fa
	.long 0x9ddf433e
	.long 0x9a0f3216
	.long 0x965f2091
	.long 0x92df0f11
	.long 0x8f6efd05
	.long 0x8c1eeacc
	.long 0x88eed876
	.long 0x85dec615
	.long 0x82eeb3b9
	.long 0x800ea10b
	.long 0x7d3e8e0f
	.long 0x7a8e7b3f
	.long 0x77ee6836
	.long 0x756e5576
	.long 0x72fe4293
	.long 0x709e2f93
	.long 0x6e4e1c7f
	.long 0x6c0e095e
	.long 0x69edf6c5
	.long 0x67cde3a5
	.long 0x65cdd125
	.long 0x63cdbe25
	.long 0x61ddab3f
	.long 0x600d991f
	.long 0x5e3d868c
	.long 0x5c6d7384
	.long 0x5abd615f
	.long 0x590d4ecd
	.long 0x576d3c83
	.long 0x55dd2a89
	.long 0x545d18e9
	.long 0x52dd06e9
	.long 0x516cf54e
	.long 0x4ffce356
	.long 0x4e9cd1ce
	.long 0x4d3cbfec
	.long 0x4becae86
	.long 0x4aac9da4
	.long 0x496c8c73
	.long 0x483c7bd3
	.long 0x470c6ae8
	.long 0x45dc59af
	.long 0x44bc4915
	.long 0x43ac3924
	.long 0x428c27fb
	.long 0x418c187a
	.long 0x407c07bd

	__divdf3_support: /* This label makes debugger output saner. */
	.balign 4
	.Ldenorm_dbl1:
	brge r6, \
	0x43500000,.Linf_NaN ; large number / denorm -> Inf
	bmsk.f r12,DBL1H,19
	mov.eq r12,DBL1L
	mov.eq DBL1L,0
	sub.eq r7,r7,32
	norm.f r11,r12 ; flag for x/0 -> Inf check
	beq_s .Linf_NaN
	mov.mi r11,0
	add.pl r11,r11,1
	add_s r12,r12,r12
	asl r8,r12,r11
	rsub r12,r11,31
	lsr r12,DBL1L,r12
	tst_s DBL1H,DBL1H
	or r8,r8,r12
	lsr r4,r8,26
	lsr DBL1H,r8,12
	ld.as r4,[r10,r4]
	bxor.mi DBL1H,DBL1H,31
	sub r11,r11,11
	asl DBL1L,DBL1L,r11
	sub r11,r11,1
	mulu64 (r4,r8)
	sub r7,r7,r11
	b.d .Lpast_denorm_dbl1
	asl r7,r7,20

	.Linf_NaN:
	tst_s DBL0L,DBL0L ; 0/0 -> NaN
	xor_s DBL1H,DBL1H,DBL0H
	bclr.eq.f DBL0H,DBL0H,31
	bmsk DBL0H,DBL1H,30
	xor_s DBL0H,DBL0H,DBL1H
	sub.eq DBL0H,DBL0H,1
	mov_s DBL0L,0
	j_s.d [blink]
	or DBL0H,DBL0H,r9
	.balign 4
	.Lret0_2:
	xor_s DBL1H,DBL1H,DBL0H
	mov_s DBL0L,0
	bmsk DBL0H,DBL1H,30
	j_s.d [blink]
	xor_s DBL0H,DBL0H,DBL1H
	.balign 4
	.global __divdf3
	/* N.B. the spacing between divtab and the sub3 to get its address must
	be a multiple of 8. */
	__divdf3:
	asl r8,DBL1H,12
	lsr r4,r8,26
	sub3 r10,pcl,51;(.-.Ldivtab) >> 3
	ld.as r9,[pcl,-104]; [pcl,(-((.-.L7ff00000) >> 2))] ; 0x7ff00000
	ld.as r4,[r10,r4]
	lsr r12,DBL1L,20
	and.f r7,DBL1H,r9
	or r8,r8,r12
	mulu64 (r4,r8)
	beq.d .Ldenorm_dbl1
	.Lpast_denorm_dbl1:
	and.f r6,DBL0H,r9
	breq.d r7,r9,.Linf_nan_dbl1
	asl r4,r4,12
	sub r4,r4,mhi
	mululw 0,r4,r4
	machulw r5,r4,r4
	bne.d .Lnormal_dbl0
	lsr r8,r8,1

	.balign 4
	.Ldenorm_dbl0:
	bmsk.f r12,DBL0H,19
	; wb stall
	mov.eq r12,DBL0L
	sub.eq r6,r6,32
	norm.f r11,r12 ; flag for 0/x -> 0 check
	brge r7, \
	0x43500000, .Lret0_2 ; denorm/large number -> 0
	beq_s .Lret0_2
	mov.mi r11,0
	add.pl r11,r11,1
	asl r12,r12,r11
	sub r6,r6,r11
	add.f 0,r6,31
	lsr r10,DBL0L,r6
	mov.mi r10,0
	add r6,r6,11+32
	neg.f r11,r6
	asl DBL0L,DBL0L,r11
	mov.pl DBL0L,0
	sub r6,r6,32-1
	b.d .Lpast_denorm_dbl0
	asl r6,r6,20

	.balign 4
	.Linf_nan_dbl1: ; 0/Inf -> NaN Inf/Inf -> NaN x/Inf-> 0 x/NaN -> NaN
	or.f 0,r6,DBL0L
	cmp.ne r6,r9
	not_s DBL0L,DBL1H
	sub_s.ne DBL0L,DBL0L,DBL0L
	tst_s DBL0H,DBL0H
	add_s DBL0H,DBL1H,DBL0L
	j_s.d [blink]
	bxor.mi DBL0H,DBL0H,31

	.balign 4
	.Lnormal_dbl0:
	breq.d r6,r9,.Linf_nan_dbl0
	asl r12,DBL0H,11
	lsr r10,DBL0L,21
	.Lpast_denorm_dbl0:
	bset r8,r8,31
	mulu64 (r5,r8)
	add_s r12,r12,r10
	bset r5,r12,31
	cmp r5,r8
	cmp.eq DBL0L,DBL1L
	lsr.cc r5,r5,1
	sub r4,r4,mhi ; u1.31 inverse, about 30 bit
	mululw 0,r5,r4
	machulw r11,r5,r4 ; result fraction highpart
	lsr r8,r8,2 ; u3.29
	add r5,r6, /* wait for immediate */ \
	0x3fe00000
	mulu64 (r11,r8) ; u-28.31
	asl_s DBL1L,DBL1L,9 ; u-29.23:9
	sbc r6,r5,r7
	mov r12,mlo ; u-28.31
	mulu64 (r11,DBL1L) ; mhi: u-28.23:9
	add.cs DBL0L,DBL0L,DBL0L
	asl_s DBL0L,DBL0L,6 ; u-26.25:7
	asl r10,r11,23
	sub_l DBL0L,DBL0L,r12
	lsr r7,r11,9
	sub r5,DBL0L,mhi ; rest msw ; u-26.31:0
	mul64 (r5,r4) ; mhi: result fraction lowpart
	xor.f 0,DBL0H,DBL1H
	and DBL0H,r6,r9
	add_s DBL0H,DBL0H,r7
	bclr r12,r9,20 ; 0x7fe00000
	brhs.d r6,r12,.Linf_denorm
	bxor.mi DBL0H,DBL0H,31
	add.f r12,mhi,0x11
	asr r9,r12,5
	sub.mi DBL0H,DBL0H,1
	add.f DBL0L,r9,r10
	tst r12,0x1c
	jne.d [blink]
	add.cs DBL0H,DBL0H,1
	/* work out exact rounding if we fall through here. */
	/* We know that the exact result cannot be represented in double
	precision. Find the mid-point between the two nearest
	representable values, multiply with the divisor, and check if
	the result is larger than the dividend. Since we want to know
	only the sign bit, it is sufficient to calculate only the
	highpart of the lower 64 bits. */
	mulu64 (r11,DBL1L) ; rest before considering r12 in r5 : -mlo
	sub.f DBL0L,DBL0L,1
	asl r12,r9,2 ; u-22.30:2
	sub.cs DBL0H,DBL0H,1
	sub.f r12,r12,2
	mov r10,mlo ; rest before considering r12 in r5 : -r10
	mululw 0,r12,DBL1L
	machulw r7,r12,DBL1L ; mhi: u-51.32
	asl r5,r5,25 ; s-51.7:25
	lsr r10,r10,7 ; u-51.30:2
	mulu64 (r12,r8) ; mlo: u-51.31:1
	sub r5,r5,r10
	add.mi r5,r5,DBL1L ; signed multiply adjust for r12*DBL1L
	bset r7,r7,0 ; make sure that the result is not zero, and that
	sub r5,r5,r7 ; a highpart zero appears negative
	sub.f r5,r5,mlo ; rest msw
	add.pl.f DBL0L,DBL0L,1
	j_s.d [blink]
	add.eq DBL0H,DBL0H,1

	.Linf_nan_dbl0:
	tst_s DBL1H,DBL1H
	j_s.d [blink]
	bxor.mi DBL0H,DBL0H,31
	.balign 4
	.Linf_denorm:
	lsr r12,r6,28
	brlo.d r12,0xc,.Linf
	.Ldenorm:
	asr r6,r6,20
	neg r9,r6
	mov_s DBL0H,0
	brhs.d r9,54,.Lret0
	bxor.mi DBL0H,DBL0H,31
	add r12,mhi,1
	and r12,r12,-4
	rsub r7,r6,5
	asr r10,r12,28
	bmsk r4,r12,27
	min r7,r7,31
	asr DBL0L,r4,r7
	add DBL1H,r11,r10
	abs.f r10,r4
	sub.mi r10,r10,1
	add.f r7,r6,32-5
	asl r4,r4,r7
	mov.mi r4,r10
	add.f r10,r6,23
	rsub r7,r6,9
	lsr r7,DBL1H,r7
	asl r10,DBL1H,r10
	or.pnz DBL0H,DBL0H,r7
	or.mi r4,r4,r10
	mov.mi r10,r7
	add.f DBL0L,r10,DBL0L
	add.cs.f DBL0H,DBL0H,1 ; carry clear after this point
	bxor.f 0,r4,31
	add.pnz.f DBL0L,DBL0L,1
	add.cs.f DBL0H,DBL0H,1
	jne_s [blink]
	/* Calculation so far was not conclusive; calculate further rest. */
	mulu64 (r11,DBL1L) ; rest before considering r12 in r5 : -mlo
	asr.f r12,r12,3
	asl r5,r5,25 ; s-51.7:25
	mov r11,mlo ; rest before considering r12 in r5 : -r11
	mulu64 (r12,r8) ; u-51.31:1
	and r9,DBL0L,1 ; tie-breaker: round to even
	lsr r11,r11,7 ; u-51.30:2
	mov DBL1H,mlo ; u-51.31:1
	mulu64 (r12,DBL1L) ; u-51.62:2
	sub.mi r11,r11,DBL1L ; signed multiply adjust for r12*DBL1L
	add_s DBL1H,DBL1H,r11
	sub DBL1H,DBL1H,r5 ; -rest msw
	add_s DBL1H,DBL1H,mhi ; -rest msw
	add.f 0,DBL1H,DBL1H ; can't ror.f by 32 :-(
	tst_s DBL1H,DBL1H
	cmp.eq mlo,r9
	add.cs.f DBL0L,DBL0L,1
	j_s.d [blink]
	add.cs DBL0H,DBL0H,1

	.Lret0:
	/* return +- 0 */
	j_s.d [blink]
	mov_s DBL0L,0
	.Linf:
	mov_s DBL0H,r9
	mov_s DBL0L,0
	j_s.d [blink]
	bxor.mi DBL0H,DBL0H,31
	ENDFUNC(__divdf3)