 // libgcc/config/ia64/lib1funcs.S - gcc - Git at Google

 /* Copyright (C) 2000-2023 Free Software Foundation, Inc.
    Contributed by James E. Wilson <wilson@cygnus.com>.

    This file is part of GCC.

    GCC is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 3, or (at your option)
    any later version.

    GCC is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    Under Section 7 of GPL version 3, you are granted additional
    permissions described in the GCC Runtime Library Exception, version
    3.1, as published by the Free Software Foundation.

    You should have received a copy of the GNU General Public License and
    a copy of the GCC Runtime Library Exception along with this program;
    see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
    <http://www.gnu.org/licenses/>.  */

 #ifdef L__divxf3
 // Compute a 80-bit IEEE double-extended quotient.
 //
 // From the Intel IA-64 Optimization Guide, choose the minimum latency
 // alternative.
 //
 // farg0 holds the dividend.  farg1 holds the divisor.
 //
 // __divtf3 is an alternate symbol name for backward compatibility.
 //
 // Notation used in the comments below: a = farg0, b = farg1, yN is an
 // approximation of 1/b, qN an approximation of a/b, rN = a - b*qN.
 // "fma d = r, s, t" computes r*s + t; "fnma d = r, s, t" computes
 // -(r*s) + t.  f0 reads as 0.0 and f1 as 1.0.
 // Intermediate steps use status field s1; the first and last FP
 // operations use s0.  Registers written: f10-f14, p6, p7, fret0.

 	.text
 	.align 16
 	.global __divxf3
 	.proc __divxf3
 __divxf3:
 #ifdef SHARED
 	.global __divtf3
 __divtf3:
 #endif
 	cmp.eq p7, p0 = r0, r0			// p7 = 1 (r0 == r0 always holds)
 	// f10 = y0.  p6 is set when the refinement below is required;
 	// otherwise f10 is returned unchanged (see the p7 path).
 	frcpa.s0 f10, p6 = farg0, farg1
 	;;
 (p6)	cmp.ne p7, p0 = r0, r0			// p6 set: clear p7, so p7 == !p6
 	.pred.rel.mutex p6, p7
 (p6)	fnma.s1 f11 = farg1, f10, f1		// e  = 1 - b*y0
 (p6)	fma.s1 f12 = farg0, f10, f0		// q0 = a*y0
 	;;
 (p6)	fma.s1 f13 = f11, f11, f0		// f13 = e^2
 (p6)	fma.s1 f14 = f11, f11, f11		// f14 = e^2 + e
 	;;
 (p6)	fma.s1 f11 = f13, f13, f11		// f11 = e^4 + e
 (p6)	fma.s1 f13 = f14, f10, f10		// y1 = y0*(e^2 + e) + y0
 	;;
 (p6)	fma.s1 f10 = f13, f11, f10		// y2 = y1*(e^4 + e) + y0
 (p6)	fnma.s1 f11 = farg1, f12, farg0		// r0 = a - b*q0
 	;;
 (p6)	fma.s1 f11 = f11, f10, f12		// q1 = r0*y2 + q0
 (p6)	fnma.s1 f12 = farg1, f10, f1		// e2 = 1 - b*y2
 	;;
 (p6)	fma.s1 f10 = f12, f10, f10		// y3 = e2*y2 + y2
 (p6)	fnma.s1 f12 = farg1, f11, farg0		// r1 = a - b*q1
 	;;
 (p6)	fma.s0 fret0 = f12, f10, f11		// result = r1*y3 + q1
 (p7)	mov fret0 = f10				// p6 clear: return the frcpa result
 	br.ret.sptk rp
 	.endp __divxf3
 #endif

 #ifdef L__divdf3
 // Compute a 64-bit IEEE double quotient.
 //
 // From the Intel IA-64 Optimization Guide, choose the minimum latency
 // alternative.
 //
 // farg0 holds the dividend.  farg1 holds the divisor.
 //
 // Notation used in the comments below: a = farg0, b = farg1, yN is an
 // approximation of 1/b, qN an approximation of a/b, e = 1 - b*y0.
 // "fma d = r, s, t" computes r*s + t; "fnma d = r, s, t" computes
 // -(r*s) + t; "fmpy d = r, s" computes r*s.  f1 reads as 1.0.
 // Registers written: f8, f10-f13, p6, p7, fret0.

 	.text
 	.align 16
 	.global __divdf3
 	.proc __divdf3
 __divdf3:
 	cmp.eq p7, p0 = r0, r0			// p7 = 1 (r0 == r0 always holds)
 	// f10 = y0.  p6 is set when the refinement below is required;
 	// otherwise f10 is returned unchanged (see the p7 path).
 	frcpa.s0 f10, p6 = farg0, farg1
 	;;
 (p6)	cmp.ne p7, p0 = r0, r0			// p6 set: clear p7, so p7 == !p6
 	.pred.rel.mutex p6, p7
 (p6)	fmpy.s1 f11 = farg0, f10		// q0 = a*y0
 (p6)	fnma.s1 f12 = farg1, f10, f1		// e  = 1 - b*y0
 	;;
 (p6)	fma.s1 f11 = f12, f11, f11		// q1 = e*q0 + q0
 (p6)	fmpy.s1 f13 = f12, f12			// f13 = e^2
 	;;
 (p6)	fma.s1 f10 = f12, f10, f10		// y1 = e*y0 + y0
 (p6)	fma.s1 f11 = f13, f11, f11		// q2 = e^2*q1 + q1
 	;;
 (p6)	fmpy.s1 f12 = f13, f13			// f12 = e^4
 (p6)	fma.s1 f10 = f13, f10, f10		// y2 = e^2*y1 + y1
 	;;
 (p6)	fma.d.s1 f11 = f12, f11, f11		// q3 = e^4*q2 + q2, double precision
 (p6)	fma.s1 f10 = f12, f10, f10		// y3 = e^4*y2 + y2
 	;;
 (p6)	fnma.d.s1 f8 = farg1, f11, farg0	// r = a - b*q3, double precision
 	;;
 (p6)	fma.d fret0 = f8, f10, f11		// result = r*y3 + q3, double precision
 (p7)	mov fret0 = f10				// p6 clear: return the frcpa result
 	br.ret.sptk rp
 	;;
 	.endp __divdf3
 #endif

 #ifdef L__divsf3
 // Compute a 32-bit IEEE float quotient.
 //
 // From the Intel IA-64 Optimization Guide, choose the minimum latency
 // alternative.
 //
 // farg0 holds the dividend.  farg1 holds the divisor.
 //
 // Notation used in the comments below: a = farg0, b = farg1, y0 is the
 // frcpa approximation of 1/b, qN an approximation of a/b, e = 1 - b*y0.
 // "fma d = r, s, t" computes r*s + t; "fnma d = r, s, t" computes
 // -(r*s) + t; "fmpy d = r, s" computes r*s.  f1 reads as 1.0.
 // f8 and f9 are reused as temporaries once the arguments have been
 // consumed.  Registers written: f8-f10, p6, p7, fret0.

 	.text
 	.align 16
 	.global __divsf3
 	.proc __divsf3
 __divsf3:
 	cmp.eq p7, p0 = r0, r0			// p7 = 1 (r0 == r0 always holds)
 	// f10 = y0.  p6 is set when the refinement below is required;
 	// otherwise f10 is returned unchanged (see the p7 path).
 	frcpa.s0 f10, p6 = farg0, farg1
 	;;
 (p6)	cmp.ne p7, p0 = r0, r0			// p6 set: clear p7, so p7 == !p6
 	.pred.rel.mutex p6, p7
 (p6)	fmpy.s1 f8 = farg0, f10			// q0 = a*y0
 (p6)	fnma.s1 f9 = farg1, f10, f1		// e  = 1 - b*y0
 	;;
 (p6)	fma.s1 f8 = f9, f8, f8			// q1 = e*q0 + q0
 (p6)	fmpy.s1 f9 = f9, f9			// f9 = e^2
 	;;
 (p6)	fma.s1 f8 = f9, f8, f8			// q2 = e^2*q1 + q1
 (p6)	fmpy.s1 f9 = f9, f9			// f9 = e^4
 	;;
 (p6)	fma.d.s1 f10 = f9, f8, f8		// q3 = e^4*q2 + q2, double precision
 	;;
 (p6)	fnorm.s.s0 fret0 = f10			// round q3 to single precision
 (p7)	mov fret0 = f10				// p6 clear: return the frcpa result
 	br.ret.sptk rp
 	;;
 	.endp __divsf3
 #endif

 #ifdef L__divdi3
 // Compute a 64-bit integer quotient.
 //
 // From the Intel IA-64 Optimization Guide, choose the minimum latency
 // alternative.
 //
 // in0 holds the dividend.  in1 holds the divisor.
 //
 // The division is carried out in floating point: both operands are
 // converted, a/b is refined from the frcpa estimate, and the quotient is
 // truncated back to a signed integer returned in ret0.  A zero divisor
 // executes "break 1".
 // Notation: a = f8, b = f9, yN approximates 1/b, qN approximates a/b.
 // Registers written: f8-f13, p6, p7, ret0.

 	.text
 	.align 16
 	.global __divdi3
 	.proc __divdi3
 __divdi3:
 	.regstk 2,0,0,0
 	// Transfer inputs to FP registers.
 	setf.sig f8 = in0
 	setf.sig f9 = in1
 	// Check divide by zero.
 	cmp.ne.unc p0,p7=0,in1			// p7 = (in1 == 0)
 	;;
 	// Convert the inputs to FP, so that they won't be treated as unsigned.
 	fcvt.xf f8 = f8
 	fcvt.xf f9 = f9
 (p7)	break 1
 	;;
 	// Compute the reciprocal approximation.
 	frcpa.s1 f10, p6 = f8, f9		// f10 = y0; p6 gates the refinement
 	;;
 	// 3 Newton-Raphson iterations.
 (p6)	fnma.s1 f11 = f9, f10, f1		// e  = 1 - b*y0
 (p6)	fmpy.s1 f12 = f8, f10			// q0 = a*y0
 	;;
 (p6)	fmpy.s1 f13 = f11, f11			// f13 = e^2
 (p6)	fma.s1 f12 = f11, f12, f12		// q1 = e*q0 + q0
 	;;
 (p6)	fma.s1 f10 = f11, f10, f10		// y1 = e*y0 + y0
 (p6)	fma.s1 f11 = f13, f12, f12		// q2 = e^2*q1 + q1
 	;;
 (p6)	fma.s1 f10 = f13, f10, f10		// y2 = e^2*y1 + y1
 (p6)	fnma.s1 f12 = f9, f11, f8		// r  = a - b*q2
 	;;
 (p6)	fma.s1 f10 = f12, f10, f11		// q3 = r*y2 + q2
 	;;
 	// Round quotient to an integer.
 	// Not predicated: if p6 was clear, f10 is still the value frcpa wrote.
 	fcvt.fx.trunc.s1 f10 = f10
 	;;
 	// Transfer result to GP registers.
 	getf.sig ret0 = f10
 	br.ret.sptk rp
 	;;
 	.endp __divdi3
 #endif

 #ifdef L__moddi3
 // Compute a 64-bit integer modulus.
 //
 // From the Intel IA-64 Optimization Guide, choose the minimum latency
 // alternative.
 //
 // in0 holds the dividend (a).  in1 holds the divisor (b).
 //
 // The truncated quotient q is computed in floating point as in a
 // division, then the remainder is formed with one integer
 // multiply-add: r = q * (-b) + a.  A zero divisor executes "break 1".
 // f14 keeps the integer dividend for the final xma; in1 is negated in
 // place and reloaded into f9 once the FP divisor is no longer needed.
 // Registers written: f8-f14, p6, p7, in1, ret0.

 	.text
 	.align 16
 	.global __moddi3
 	.proc __moddi3
 __moddi3:
 	.regstk 2,0,0,0
 	// Transfer inputs to FP registers.
 	setf.sig f14 = in0
 	setf.sig f9 = in1
 	// Check divide by zero.
 	cmp.ne.unc p0,p7=0,in1			// p7 = (in1 == 0)
 	;;
 	// Convert the inputs to FP, so that they won't be treated as unsigned.
 	fcvt.xf f8 = f14
 	fcvt.xf f9 = f9
 (p7)	break 1
 	;;
 	// Compute the reciprocal approximation.
 	frcpa.s1 f10, p6 = f8, f9		// f10 = y0; p6 gates the refinement
 	;;
 	// 3 Newton-Raphson iterations.
 (p6)	fmpy.s1 f12 = f8, f10			// q0 = a*y0
 (p6)	fnma.s1 f11 = f9, f10, f1		// e  = 1 - b*y0
 	;;
 (p6)	fma.s1 f12 = f11, f12, f12		// q1 = e*q0 + q0
 (p6)	fmpy.s1 f13 = f11, f11			// f13 = e^2
 	;;
 (p6)	fma.s1 f10 = f11, f10, f10		// y1 = e*y0 + y0
 (p6)	fma.s1 f11 = f13, f12, f12		// q2 = e^2*q1 + q1
 	;;
 	sub in1 = r0, in1			// in1 = -b (integer)
 (p6)	fma.s1 f10 = f13, f10, f10		// y2 = e^2*y1 + y1
 (p6)	fnma.s1 f12 = f9, f11, f8		// rem = a - b*q2 (last FP use of f9)
 	;;
 	setf.sig f9 = in1			// f9 = -b as an integer significand
 (p6)	fma.s1 f10 = f12, f10, f11		// q3 = rem*y2 + q2
 	;;
 	// Truncate the quotient to a signed integer.  Not predicated: if p6
 	// was clear, f10 is still the value frcpa wrote.
 	fcvt.fx.trunc.s1 f10 = f10
 	;;
 	// r = q * (-b) + a
 	xma.l f10 = f10, f9, f14
 	;;
 	// Transfer result to GP registers.
 	getf.sig ret0 = f10
 	br.ret.sptk rp
 	;;
 	.endp __moddi3
 #endif

 #ifdef L__udivdi3
 // Compute a 64-bit unsigned integer quotient.
 //
 // From the Intel IA-64 Optimization Guide, choose the minimum latency
 // alternative.
 //
 // in0 holds the dividend.  in1 holds the divisor.
 //
 // Same scheme as the signed 64-bit divide, but using the unsigned
 // conversions fcvt.xuf / fcvt.fxu.  A zero divisor executes "break 1".
 // Notation: a = f8, b = f9, yN approximates 1/b, qN approximates a/b.
 // Registers written: f8-f13, p6, p7, ret0.

 	.text
 	.align 16
 	.global __udivdi3
 	.proc __udivdi3
 __udivdi3:
 	.regstk 2,0,0,0
 	// Transfer inputs to FP registers.
 	setf.sig f8 = in0
 	setf.sig f9 = in1
 	// Check divide by zero.
 	cmp.ne.unc p0,p7=0,in1			// p7 = (in1 == 0)
 	;;
 	// Convert the inputs to FP, to avoid FP software-assist faults.
 	fcvt.xuf.s1 f8 = f8
 	fcvt.xuf.s1 f9 = f9
 (p7)	break 1
 	;;
 	// Compute the reciprocal approximation.
 	frcpa.s1 f10, p6 = f8, f9		// f10 = y0; p6 gates the refinement
 	;;
 	// 3 Newton-Raphson iterations.
 (p6)	fnma.s1 f11 = f9, f10, f1		// e  = 1 - b*y0
 (p6)	fmpy.s1 f12 = f8, f10			// q0 = a*y0
 	;;
 (p6)	fmpy.s1 f13 = f11, f11			// f13 = e^2
 (p6)	fma.s1 f12 = f11, f12, f12		// q1 = e*q0 + q0
 	;;
 (p6)	fma.s1 f10 = f11, f10, f10		// y1 = e*y0 + y0
 (p6)	fma.s1 f11 = f13, f12, f12		// q2 = e^2*q1 + q1
 	;;
 (p6)	fma.s1 f10 = f13, f10, f10		// y2 = e^2*y1 + y1
 (p6)	fnma.s1 f12 = f9, f11, f8		// r  = a - b*q2
 	;;
 (p6)	fma.s1 f10 = f12, f10, f11		// q3 = r*y2 + q2
 	;;
 	// Round quotient to an unsigned integer.
 	// Not predicated: if p6 was clear, f10 is still the value frcpa wrote.
 	fcvt.fxu.trunc.s1 f10 = f10
 	;;
 	// Transfer result to GP registers.
 	getf.sig ret0 = f10
 	br.ret.sptk rp
 	;;
 	.endp __udivdi3
 #endif

 #ifdef L__umoddi3
 // Compute a 64-bit unsigned integer modulus.
 //
 // From the Intel IA-64 Optimization Guide, choose the minimum latency
 // alternative.
 //
 // in0 holds the dividend (a).  in1 holds the divisor (b).
 //
 // The truncated unsigned quotient q is computed in floating point,
 // then the remainder is formed with one integer multiply-add:
 // r = q * (-b) + a.  A zero divisor executes "break 1".
 // f14 keeps the integer dividend for the final xma; in1 is negated in
 // place and reloaded into f9 once the FP divisor is no longer needed.
 // Registers written: f8-f14, p6, p7, in1, ret0.

 	.text
 	.align 16
 	.global __umoddi3
 	.proc __umoddi3
 __umoddi3:
 	.regstk 2,0,0,0
 	// Transfer inputs to FP registers.
 	setf.sig f14 = in0
 	setf.sig f9 = in1
 	// Check divide by zero.
 	cmp.ne.unc p0,p7=0,in1			// p7 = (in1 == 0)
 	;;
 	// Convert the inputs to FP, to avoid FP software assist faults.
 	fcvt.xuf.s1 f8 = f14
 	fcvt.xuf.s1 f9 = f9
 (p7)	break 1;
 	;;
 	// Compute the reciprocal approximation.
 	frcpa.s1 f10, p6 = f8, f9		// f10 = y0; p6 gates the refinement
 	;;
 	// 3 Newton-Raphson iterations.
 (p6)	fmpy.s1 f12 = f8, f10			// q0 = a*y0
 (p6)	fnma.s1 f11 = f9, f10, f1		// e  = 1 - b*y0
 	;;
 (p6)	fma.s1 f12 = f11, f12, f12		// q1 = e*q0 + q0
 (p6)	fmpy.s1 f13 = f11, f11			// f13 = e^2
 	;;
 (p6)	fma.s1 f10 = f11, f10, f10		// y1 = e*y0 + y0
 (p6)	fma.s1 f11 = f13, f12, f12		// q2 = e^2*q1 + q1
 	;;
 	sub in1 = r0, in1			// in1 = -b (integer, modulo 2^64)
 (p6)	fma.s1 f10 = f13, f10, f10		// y2 = e^2*y1 + y1
 (p6)	fnma.s1 f12 = f9, f11, f8		// rem = a - b*q2 (last FP use of f9)
 	;;
 	setf.sig f9 = in1			// f9 = -b as an integer significand
 (p6)	fma.s1 f10 = f12, f10, f11		// q3 = rem*y2 + q2
 	;;
 	// Round quotient to an unsigned integer.
 	// Not predicated: if p6 was clear, f10 is still the value frcpa wrote.
 	fcvt.fxu.trunc.s1 f10 = f10
 	;;
 	// r = q * (-b) + a
 	xma.l f10 = f10, f9, f14
 	;;
 	// Transfer result to GP registers.
 	getf.sig ret0 = f10
 	br.ret.sptk rp
 	;;
 	.endp __umoddi3
 #endif

 #ifdef L__divsi3
 // Compute a 32-bit integer quotient.
 //
 // From the Intel IA-64 Optimization Guide, choose the minimum latency
 // alternative.
 //
 // in0 holds the dividend.  in1 holds the divisor.
 //
 // Only the low 32 bits of in0 and in1 are used: both are sign-extended
 // with sxt4 before the division.  The quotient is computed in floating
 // point from the frcpa estimate and truncated to a signed integer
 // returned in ret0.  A zero divisor executes "break 1".
 // Notation: a = f8, b = f9, y0 approximates 1/b, qN approximates a/b.
 // Registers written: r2, f8-f11, p6, p7, in0, in1, ret0.

 	.text
 	.align 16
 	.global __divsi3
 	.proc __divsi3
 __divsi3:
 	.regstk 2,0,0,0
 	// Check divide by zero.  This compare is in the same instruction
 	// group as the sxt4 of in1, so it reads in1 before sign extension.
 	// Use the 4-byte compare so that only the low 32 bits (the value
 	// actually divided by) are tested; a 64-bit compare would miss a
 	// zero divisor whose upper 32 bits happen to be nonzero.
 	cmp4.ne.unc p0,p7=0,in1			// p7 = (low 32 bits of in1 == 0)
 	sxt4 in0 = in0
 	sxt4 in1 = in1
 	;;
 	// Transfer the sign-extended inputs to FP registers.
 	setf.sig f8 = in0
 	setf.sig f9 = in1
 (p7)	break 1
 	;;
 	// 0x0ffdd is the biased exponent (bias 0xffff) of 2^-34.
 	mov r2 = 0x0ffdd
 	fcvt.xf f8 = f8
 	fcvt.xf f9 = f9
 	;;
 	setf.exp f11 = r2			// f11 = 2^-34
 	frcpa.s1 f10, p6 = f8, f9		// f10 = y0; p6 gates the refinement
 	;;
 (p6)	fmpy.s1 f8 = f8, f10			// q0 = a*y0
 (p6)	fnma.s1 f9 = f9, f10, f1		// e  = 1 - b*y0
 	;;
 (p6)	fma.s1 f8 = f9, f8, f8			// q1 = e*q0 + q0
 (p6)	fma.s1 f9 = f9, f9, f11			// e1 = e^2 + 2^-34
 	;;
 (p6)	fma.s1 f10 = f9, f8, f8			// q2 = e1*q1 + q1
 	;;
 	// Truncate the quotient to a signed integer.  Not predicated: if p6
 	// was clear, f10 is still the value frcpa wrote.
 	fcvt.fx.trunc.s1 f10 = f10
 	;;
 	getf.sig ret0 = f10
 	br.ret.sptk rp
 	;;
 	.endp __divsi3
 #endif

 #ifdef L__modsi3
 // Compute a 32-bit integer modulus.
 //
 // From the Intel IA-64 Optimization Guide, choose the minimum latency
 // alternative.
 //
 // in0 holds the dividend.  in1 holds the divisor.
 //
 // Both inputs are sign-extended from 32 bits first.  The truncated
 // quotient q is computed in floating point, then the remainder is
 // formed with one integer multiply-add: r = q * (-b) + a.
 // A zero divisor executes "break 1".
 // f13 keeps the integer dividend for the final xma; in1 is negated in
 // place and reloaded into f9 once the FP divisor is no longer needed.
 // Registers written: r2, f8-f13, p6, p7, in0, in1, ret0.

 	.text
 	.align 16
 	.global __modsi3
 	.proc __modsi3
 __modsi3:
 	.regstk 2,0,0,0
 	// 0x0ffdd is the biased exponent (bias 0xffff) of 2^-34.
 	mov r2 = 0x0ffdd
 	sxt4 in0 = in0
 	sxt4 in1 = in1
 	;;
 	// r32 and r33 are the first two stacked registers, i.e. in0 and in1.
 	setf.sig f13 = r32
 	setf.sig f9 = r33
 	// Check divide by zero.
 	cmp.ne.unc p0,p7=0,in1			// p7 = (sign-extended in1 == 0)
 	;;
 	sub in1 = r0, in1			// in1 = -b (integer)
 	fcvt.xf f8 = f13
 	fcvt.xf f9 = f9
 	;;
 	setf.exp f11 = r2			// f11 = 2^-34
 	frcpa.s1 f10, p6 = f8, f9		// f10 = y0; p6 gates the refinement
 (p7)	break 1
 	;;
 (p6)	fmpy.s1 f12 = f8, f10			// q0 = a*y0
 (p6)	fnma.s1 f10 = f9, f10, f1		// e  = 1 - b*y0 (last FP use of f9)
 	;;
 	setf.sig f9 = in1			// f9 = -b as an integer significand
 (p6)	fma.s1 f12 = f10, f12, f12		// q1 = e*q0 + q0
 (p6)	fma.s1 f10 = f10, f10, f11		// e1 = e^2 + 2^-34
 	;;
 (p6)	fma.s1 f10 = f10, f12, f12		// q2 = e1*q1 + q1
 	;;
 	// Truncate the quotient to a signed integer.  Not predicated: if p6
 	// was clear, f10 is still the value frcpa wrote.
 	fcvt.fx.trunc.s1 f10 = f10
 	;;
 	xma.l f10 = f10, f9, f13		// r = q * (-b) + a
 	;;
 	getf.sig ret0 = f10
 	br.ret.sptk rp
 	;;
 	.endp __modsi3
 #endif

 #ifdef L__udivsi3
 // Compute a 32-bit unsigned integer quotient.
 //
 // From the Intel IA-64 Optimization Guide, choose the minimum latency
 // alternative.
 //
 // in0 holds the dividend.  in1 holds the divisor.
 //
 // Both inputs are zero-extended from 32 bits first.  The quotient is
 // computed in floating point from the frcpa estimate and truncated to
 // an unsigned integer returned in ret0.  A zero divisor executes
 // "break 1".
 // Notation: a = f8, b = f9, y0 approximates 1/b, qN approximates a/b.
 // Registers written: r2, f8-f11, p6, p7, in0, in1, ret0.

 	.text
 	.align 16
 	.global __udivsi3
 	.proc __udivsi3
 __udivsi3:
 	.regstk 2,0,0,0
 	// 0x0ffdd is the biased exponent (bias 0xffff) of 2^-34.
 	mov r2 = 0x0ffdd
 	zxt4 in0 = in0
 	zxt4 in1 = in1
 	;;
 	setf.sig f8 = in0
 	setf.sig f9 = in1
 	// Check divide by zero.
 	cmp.ne.unc p0,p7=0,in1			// p7 = (zero-extended in1 == 0)
 	;;
 	// The zero-extended values are non-negative as 64-bit signed
 	// integers, so the signed conversion is used here.
 	fcvt.xf f8 = f8
 	fcvt.xf f9 = f9
 (p7)	break 1
 	;;
 	setf.exp f11 = r2			// f11 = 2^-34
 	frcpa.s1 f10, p6 = f8, f9		// f10 = y0; p6 gates the refinement
 	;;
 (p6)	fmpy.s1 f8 = f8, f10			// q0 = a*y0
 (p6)	fnma.s1 f9 = f9, f10, f1		// e  = 1 - b*y0
 	;;
 (p6)	fma.s1 f8 = f9, f8, f8			// q1 = e*q0 + q0
 (p6)	fma.s1 f9 = f9, f9, f11			// e1 = e^2 + 2^-34
 	;;
 (p6)	fma.s1 f10 = f9, f8, f8			// q2 = e1*q1 + q1
 	;;
 	// Truncate the quotient to an unsigned integer.  Not predicated: if
 	// p6 was clear, f10 is still the value frcpa wrote.
 	fcvt.fxu.trunc.s1 f10 = f10
 	;;
 	getf.sig ret0 = f10
 	br.ret.sptk rp
 	;;
 	.endp __udivsi3
 #endif

 #ifdef L__umodsi3
 // Compute a 32-bit unsigned integer modulus.
 //
 // From the Intel IA-64 Optimization Guide, choose the minimum latency
 // alternative.
 //
 // in0 holds the dividend.  in1 holds the divisor.
 //
 // Both inputs are zero-extended from 32 bits first.  The truncated
 // quotient q is computed in floating point, then the remainder is
 // formed with one integer multiply-add: r = q * (-b) + a.
 // A zero divisor executes "break 1".
 // f13 keeps the integer dividend for the final xma; in1 is negated in
 // place and reloaded into f9 once the FP divisor is no longer needed.
 // Registers written: r2, f8-f13, p6, p7, in0, in1, ret0.

 	.text
 	.align 16
 	.global __umodsi3
 	.proc __umodsi3
 __umodsi3:
 	.regstk 2,0,0,0
 	// 0x0ffdd is the biased exponent (bias 0xffff) of 2^-34.
 	mov r2 = 0x0ffdd
 	zxt4 in0 = in0
 	zxt4 in1 = in1
 	;;
 	setf.sig f13 = in0
 	setf.sig f9 = in1
 	// Check divide by zero.
 	cmp.ne.unc p0,p7=0,in1			// p7 = (zero-extended in1 == 0)
 	;;
 	sub in1 = r0, in1			// in1 = -b (integer, modulo 2^64)
 	// The zero-extended values are non-negative as 64-bit signed
 	// integers, so the signed conversion is used here.
 	fcvt.xf f8 = f13
 	fcvt.xf f9 = f9
 	;;
 	setf.exp f11 = r2			// f11 = 2^-34
 	frcpa.s1 f10, p6 = f8, f9		// f10 = y0; p6 gates the refinement
 (p7)	break 1;
 	;;
 (p6)	fmpy.s1 f12 = f8, f10			// q0 = a*y0
 (p6)	fnma.s1 f10 = f9, f10, f1		// e  = 1 - b*y0 (last FP use of f9)
 	;;
 	setf.sig f9 = in1			// f9 = -b as an integer significand
 (p6)	fma.s1 f12 = f10, f12, f12		// q1 = e*q0 + q0
 (p6)	fma.s1 f10 = f10, f10, f11		// e1 = e^2 + 2^-34
 	;;
 (p6)	fma.s1 f10 = f10, f12, f12		// q2 = e1*q1 + q1
 	;;
 	// Truncate the quotient to an unsigned integer.  Not predicated: if
 	// p6 was clear, f10 is still the value frcpa wrote.
 	fcvt.fxu.trunc.s1 f10 = f10
 	;;
 	xma.l f10 = f10, f9, f13		// r = q * (-b) + a
 	;;
 	getf.sig ret0 = f10
 	br.ret.sptk rp
 	;;
 	.endp __umodsi3
 #endif

 #ifdef L__save_stack_nonlocal
 // Notes on save/restore stack nonlocal: We read ar.bsp but write
 // ar.bspstore.  This is because ar.bsp can be read at all times
 // (independent of the RSE mode) but since it's read-only we need to
 // restore the value via ar.bspstore.  This is OK because
 // ar.bsp==ar.bspstore after executing "flushrs".

 // void __ia64_save_stack_nonlocal(void *save_area, void *stack_pointer)
 //
 // in0 = save_area, in1 = stack_pointer.
 // Layout written to the save area (byte offsets from save_area):
 //    +0  stack_pointer (in1)
 //    +8  ar.bsp
 //   +16  ar.rnat
 //   +24  the ar.pfs value returned by alloc
 // Registers written: r2, r16-r19, in0, ar.rsc.  On return the two low
 // bits of ar.rsc are set to 3, whatever they were on entry.

 	.text
 	.align 16
 	.global __ia64_save_stack_nonlocal
 	.proc __ia64_save_stack_nonlocal
 __ia64_save_stack_nonlocal:
 	{ .mmf
 	  alloc r18 = ar.pfs, 2, 0, 0, 0	// r18 = previous ar.pfs
 	  mov r19 = ar.rsc
 	  ;;
 	}
 	{ .mmi
 	  flushrs
 	  st8 [in0] = in1, 24			// save_area+0 = sp; in0 -> save_area+24
 	  and r19 = 0x1c, r19			// keep only bits 2..4 (low mode bits = 0)
 	  ;;
 	}
 	{ .mmi
 	  st8 [in0] = r18, -16			// save_area+24 = pfs; in0 -> save_area+8
 	  mov ar.rsc = r19			// switch to the masked ar.rsc value
 	  or r19 = 0x3, r19			// value to install before returning
 	  ;;
 	}
 	{ .mmi
 	  mov r16 = ar.bsp
 	  mov r17 = ar.rnat
 	  adds r2 = 8, in0			// r2 -> save_area+16
 	  ;;
 	}
 	{ .mmi
 	  st8 [in0] = r16			// save_area+8  = ar.bsp
 	  st8 [r2] = r17			// save_area+16 = ar.rnat
 	}
 	{ .mib
 	  mov ar.rsc = r19			// ar.rsc with the two low bits set
 	  br.ret.sptk.few rp
 	  ;;
 	}
 	.endp __ia64_save_stack_nonlocal
 #endif

 #ifdef L__nonlocal_goto
 // void __ia64_nonlocal_goto(void *target_label, void *save_area,
 //			     void *static_chain);
 //
 // in0 = target_label, in1 = save_area, in2 = static_chain.
 // Reads four 8-byte words from the save area (byte offsets):
 //    +0 -> r12 (the stack pointer register)
 //    +8 -> ar.bspstore
 //   +16 -> ar.rnat
 //   +24 -> ar.pfs
 // then "returns" to target_label with r15 = static_chain.
 // Registers written: r12, r15-r20, rp, in1, ar.rsc, ar.bspstore,
 // ar.rnat, ar.pfs.

 	.text
 	.align 16
 	.global __ia64_nonlocal_goto
 	.proc __ia64_nonlocal_goto
 __ia64_nonlocal_goto:
 	{ .mmi
 	  alloc r20 = ar.pfs, 3, 0, 0, 0	// r20 is not read again below
 	  ld8 r12 = [in1], 8			// r12 = saved stack pointer
 	  mov.ret.sptk rp = in0, .L0		// rp = target_label, used by the br.ret at .L0
 	  ;;
 	}
 	{ .mmf
 	  ld8 r16 = [in1], 8			// r16 = saved ar.bsp
 	  mov r19 = ar.rsc
 	  ;;
 	}
 	{ .mmi
 	  flushrs
 	  ld8 r17 = [in1], 8			// r17 = saved ar.rnat
 	  and r19 = 0x1c, r19			// keep only bits 2..4 (low mode bits = 0)
 	  ;;
 	}
 	{ .mmi
 	  ld8 r18 = [in1]			// r18 = saved ar.pfs
 	  mov ar.rsc = r19			// switch to the masked ar.rsc value
 	  or r19 = 0x3, r19			// value to install before returning
 	  ;;
 	}
 	{ .mmi
 	  mov ar.bspstore = r16
 	  ;;
 	  mov ar.rnat = r17
 	  ;;
 	}
 	{ .mmi
 	  loadrs
 	  invala
 	  mov r15 = in2				// pass static_chain in r15
 	  ;;
 	}
 .L0:	{ .mib
 	  mov ar.rsc = r19			// ar.rsc with the two low bits set
 	  mov ar.pfs = r18			// restore the saved ar.pfs
 	  br.ret.sptk.few rp			// continue at target_label
 	  ;;
 	}
 	.endp __ia64_nonlocal_goto
 #endif

 #ifdef L__restore_stack_nonlocal
 // This is mostly the same as nonlocal_goto above.
 // ??? This has not been tested yet.

 // void __ia64_restore_stack_nonlocal(void *save_area)
 //
 // in0 = save_area.  Reads four 8-byte words from it (byte offsets):
 //    +0 -> r12 (the stack pointer register)
 //    +8 -> ar.bspstore
 //   +16 -> ar.rnat
 //   +24 -> ar.pfs
 // and then returns through rp, which this routine does not modify.
 // Registers written: r12, r16-r20, in0, ar.rsc, ar.bspstore, ar.rnat,
 // ar.pfs.
 // NOTE(review): alloc declares 4 input registers although only in0 is
 // read here, and the .L0 label is not referenced within this procedure.

 	.text
 	.align 16
 	.global __ia64_restore_stack_nonlocal
 	.proc __ia64_restore_stack_nonlocal
 __ia64_restore_stack_nonlocal:
 	{ .mmf
 	  alloc r20 = ar.pfs, 4, 0, 0, 0	// r20 is not read again below
 	  ld8 r12 = [in0], 8			// r12 = saved stack pointer
 	  ;;
 	}
 	{ .mmb
 	  ld8 r16=[in0], 8			// r16 = saved ar.bsp
 	  mov r19 = ar.rsc
 	  ;;
 	}
 	{ .mmi
 	  flushrs
 	  ld8 r17 = [in0], 8			// r17 = saved ar.rnat
 	  and r19 = 0x1c, r19			// keep only bits 2..4 (low mode bits = 0)
 	  ;;
 	}
 	{ .mmf
 	  ld8 r18 = [in0]			// r18 = saved ar.pfs
 	  mov ar.rsc = r19			// switch to the masked ar.rsc value
 	  ;;
 	}
 	{ .mmi
 	  mov ar.bspstore = r16
 	  ;;
 	  mov ar.rnat = r17
 	  or r19 = 0x3, r19			// value to install before returning
 	  ;;
 	}
 	{ .mmf
 	  loadrs
 	  invala
 	  ;;
 	}
 .L0:	{ .mib
 	  mov ar.rsc = r19			// ar.rsc with the two low bits set
 	  mov ar.pfs = r18			// restore the saved ar.pfs
 	  br.ret.sptk.few rp
 	  ;;
 	}
 	.endp __ia64_restore_stack_nonlocal
 #endif

 #ifdef L__trampoline
 // Implement the nested function trampoline.  This is out of line
 // so that we don't have to bother with flushing the icache, as
 // well as making the on-stack trampoline smaller.
 //
 // The trampoline has the following form:
 //
 //		+-------------------+ >
 //	TRAMP:	| __ia64_trampoline | |
 //		+-------------------+  > fake function descriptor
 //		| TRAMP+16          | |
 //		+-------------------+ >
 //		| target descriptor |
 //		+-------------------+
 //		| static link	    |
 //		+-------------------+
 //
 // On entry r1 is expected to hold the second word of the fake
 // descriptor, i.e. TRAMP+16, so [r1] is the target descriptor pointer
 // and [r1+8] the static link.  The code loads the static link into
 // r15, loads the two words of the target descriptor into b6 (branch
 // address) and r1, and branches to b6.
 // Registers written: r1, r2, r3, r15, b6.

 	.text
 	.align 16
 	.global __ia64_trampoline
 	.proc __ia64_trampoline
 __ia64_trampoline:
 	{ .mmi
 	  ld8 r2 = [r1], 8			// r2 = target descriptor; r1 -> static link
 	  ;;
 	  ld8 r15 = [r1]			// r15 = static link
 	}
 	{ .mmi
 	  ld8 r3 = [r2], 8			// r3 = first word of the target descriptor
 	  ;;
 	  ld8 r1 = [r2]				// r1 = second word of the target descriptor
 	  mov b6 = r3
 	}
 	{ .bbb
 	  br.sptk.many b6			// jump to the target; rp is left untouched
 	  ;;
 	}
 	.endp __ia64_trampoline
 #endif

 #ifdef SHARED
 // Thunks for backward compatibility.
 //
 // Each thunk below is a single unconditional branch from an old
 // "tf"-named entry point to the corresponding "xf"-named routine, which
 // is defined outside this file.  No registers are touched, so arguments
 // and the return address pass straight through to the target.
 // NOTE(review): only the first thunk has its own .text directive; the
 // other two rely on the section already in effect.
 #ifdef L_fixtfdi
 	.text
 	.align 16
 	.global __fixtfti
 	.proc __fixtfti
 __fixtfti:
 	{ .bbb
 	  br.sptk.many __fixxfti		// tail-branch to the xf variant
 	  ;;
 	}
 	.endp __fixtfti
 #endif
 #ifdef L_fixunstfdi
 	.align 16
 	.global __fixunstfti
 	.proc __fixunstfti
 __fixunstfti:
 	{ .bbb
 	  br.sptk.many __fixunsxfti		// tail-branch to the xf variant
 	  ;;
 	}
 	.endp __fixunstfti
 #endif
 #ifdef L_floatditf
 	.align 16
 	.global __floattitf
 	.proc __floattitf
 __floattitf:
 	{ .bbb
 	  br.sptk.many __floattixf		// tail-branch to the xf variant
 	  ;;
 	}
 	.endp __floattitf
 #endif
 #endif
	/* Copyright (C) 2000-2023 Free Software Foundation, Inc.
	Contributed by James E. Wilson <wilson@cygnus.com>.

	This file is part of GCC.

	GCC is free software; you can redistribute it and/or modify
	it under the terms of the GNU General Public License as published by
	the Free Software Foundation; either version 3, or (at your option)
	any later version.

	GCC is distributed in the hope that it will be useful,
	but WITHOUT ANY WARRANTY; without even the implied warranty of
	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	GNU General Public License for more details.

	Under Section 7 of GPL version 3, you are granted additional
	permissions described in the GCC Runtime Library Exception, version
	3.1, as published by the Free Software Foundation.

	You should have received a copy of the GNU General Public License and
	a copy of the GCC Runtime Library Exception along with this program;
	see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
	<http://www.gnu.org/licenses/>. */

	#ifdef L__divxf3
	// Compute a 80-bit IEEE double-extended quotient.
	//
	// From the Intel IA-64 Optimization Guide, choose the minimum latency
	// alternative.
	//
	// farg0 holds the dividend. farg1 holds the divisor.
	//
	// __divtf3 is an alternate symbol name for backward compatibility.
	//
	// Notation used in the comments below: a = farg0, b = farg1, yN is an
	// approximation of 1/b, qN an approximation of a/b, rN = a - b*qN.
	// "fma d = r, s, t" computes r*s + t; "fnma d = r, s, t" computes
	// -(r*s) + t. f0 reads as 0.0 and f1 as 1.0.
	// Intermediate steps use status field s1; the first and last FP
	// operations use s0. Registers written: f10-f14, p6, p7, fret0.

	.text
	.align 16
	.global __divxf3
	.proc __divxf3
	__divxf3:
	#ifdef SHARED
	.global __divtf3
	__divtf3:
	#endif
	cmp.eq p7, p0 = r0, r0			// p7 = 1 (r0 == r0 always holds)
	// f10 = y0. p6 is set when the refinement below is required;
	// otherwise f10 is returned unchanged (see the p7 path).
	frcpa.s0 f10, p6 = farg0, farg1
	;;
	(p6) cmp.ne p7, p0 = r0, r0		// p6 set: clear p7, so p7 == !p6
	.pred.rel.mutex p6, p7
	(p6) fnma.s1 f11 = farg1, f10, f1	// e  = 1 - b*y0
	(p6) fma.s1 f12 = farg0, f10, f0	// q0 = a*y0
	;;
	(p6) fma.s1 f13 = f11, f11, f0		// f13 = e^2
	(p6) fma.s1 f14 = f11, f11, f11		// f14 = e^2 + e
	;;
	(p6) fma.s1 f11 = f13, f13, f11		// f11 = e^4 + e
	(p6) fma.s1 f13 = f14, f10, f10		// y1 = y0*(e^2 + e) + y0
	;;
	(p6) fma.s1 f10 = f13, f11, f10		// y2 = y1*(e^4 + e) + y0
	(p6) fnma.s1 f11 = farg1, f12, farg0	// r0 = a - b*q0
	;;
	(p6) fma.s1 f11 = f11, f10, f12		// q1 = r0*y2 + q0
	(p6) fnma.s1 f12 = farg1, f10, f1	// e2 = 1 - b*y2
	;;
	(p6) fma.s1 f10 = f12, f10, f10		// y3 = e2*y2 + y2
	(p6) fnma.s1 f12 = farg1, f11, farg0	// r1 = a - b*q1
	;;
	(p6) fma.s0 fret0 = f12, f10, f11	// result = r1*y3 + q1
	(p7) mov fret0 = f10			// p6 clear: return the frcpa result
	br.ret.sptk rp
	.endp __divxf3
	#endif

	#ifdef L__divdf3
	// Compute a 64-bit IEEE double quotient.
	//
	// From the Intel IA-64 Optimization Guide, choose the minimum latency
	// alternative.
	//
	// farg0 holds the dividend. farg1 holds the divisor.
	//
	// Notation used in the comments below: a = farg0, b = farg1, yN is an
	// approximation of 1/b, qN an approximation of a/b, e = 1 - b*y0.
	// "fma d = r, s, t" computes r*s + t; "fnma d = r, s, t" computes
	// -(r*s) + t; "fmpy d = r, s" computes r*s. f1 reads as 1.0.
	// Registers written: f8, f10-f13, p6, p7, fret0.

	.text
	.align 16
	.global __divdf3
	.proc __divdf3
	__divdf3:
	cmp.eq p7, p0 = r0, r0			// p7 = 1 (r0 == r0 always holds)
	// f10 = y0. p6 is set when the refinement below is required;
	// otherwise f10 is returned unchanged (see the p7 path).
	frcpa.s0 f10, p6 = farg0, farg1
	;;
	(p6) cmp.ne p7, p0 = r0, r0		// p6 set: clear p7, so p7 == !p6
	.pred.rel.mutex p6, p7
	(p6) fmpy.s1 f11 = farg0, f10		// q0 = a*y0
	(p6) fnma.s1 f12 = farg1, f10, f1	// e  = 1 - b*y0
	;;
	(p6) fma.s1 f11 = f12, f11, f11		// q1 = e*q0 + q0
	(p6) fmpy.s1 f13 = f12, f12		// f13 = e^2
	;;
	(p6) fma.s1 f10 = f12, f10, f10		// y1 = e*y0 + y0
	(p6) fma.s1 f11 = f13, f11, f11		// q2 = e^2*q1 + q1
	;;
	(p6) fmpy.s1 f12 = f13, f13		// f12 = e^4
	(p6) fma.s1 f10 = f13, f10, f10		// y2 = e^2*y1 + y1
	;;
	(p6) fma.d.s1 f11 = f12, f11, f11	// q3 = e^4*q2 + q2, double precision
	(p6) fma.s1 f10 = f12, f10, f10		// y3 = e^4*y2 + y2
	;;
	(p6) fnma.d.s1 f8 = farg1, f11, farg0	// r = a - b*q3, double precision
	;;
	(p6) fma.d fret0 = f8, f10, f11		// result = r*y3 + q3, double precision
	(p7) mov fret0 = f10			// p6 clear: return the frcpa result
	br.ret.sptk rp
	;;
	.endp __divdf3
	#endif

	#ifdef L__divsf3
	// Compute a 32-bit IEEE float quotient.
	//
	// From the Intel IA-64 Optimization Guide, choose the minimum latency
	// alternative.
	//
	// farg0 holds the dividend. farg1 holds the divisor.
	//
	// Notation used in the comments below: a = farg0, b = farg1, y0 is the
	// frcpa approximation of 1/b, qN an approximation of a/b, e = 1 - b*y0.
	// "fma d = r, s, t" computes r*s + t; "fnma d = r, s, t" computes
	// -(r*s) + t; "fmpy d = r, s" computes r*s. f1 reads as 1.0.
	// f8 and f9 are reused as temporaries once the arguments have been
	// consumed. Registers written: f8-f10, p6, p7, fret0.

	.text
	.align 16
	.global __divsf3
	.proc __divsf3
	__divsf3:
	cmp.eq p7, p0 = r0, r0			// p7 = 1 (r0 == r0 always holds)
	// f10 = y0. p6 is set when the refinement below is required;
	// otherwise f10 is returned unchanged (see the p7 path).
	frcpa.s0 f10, p6 = farg0, farg1
	;;
	(p6) cmp.ne p7, p0 = r0, r0		// p6 set: clear p7, so p7 == !p6
	.pred.rel.mutex p6, p7
	(p6) fmpy.s1 f8 = farg0, f10		// q0 = a*y0
	(p6) fnma.s1 f9 = farg1, f10, f1	// e  = 1 - b*y0
	;;
	(p6) fma.s1 f8 = f9, f8, f8		// q1 = e*q0 + q0
	(p6) fmpy.s1 f9 = f9, f9		// f9 = e^2
	;;
	(p6) fma.s1 f8 = f9, f8, f8		// q2 = e^2*q1 + q1
	(p6) fmpy.s1 f9 = f9, f9		// f9 = e^4
	;;
	(p6) fma.d.s1 f10 = f9, f8, f8		// q3 = e^4*q2 + q2, double precision
	;;
	(p6) fnorm.s.s0 fret0 = f10		// round q3 to single precision
	(p7) mov fret0 = f10			// p6 clear: return the frcpa result
	br.ret.sptk rp
	;;
	.endp __divsf3
	#endif

	#ifdef L__divdi3
	// Compute a 64-bit integer quotient.
	//
	// From the Intel IA-64 Optimization Guide, choose the minimum latency
	// alternative.
	//
	// in0 holds the dividend. in1 holds the divisor.
	//
	// The division is carried out in floating point: both operands are
	// converted, a/b is refined from the frcpa estimate, and the quotient is
	// truncated back to a signed integer returned in ret0. A zero divisor
	// executes "break 1".
	// Notation: a = f8, b = f9, yN approximates 1/b, qN approximates a/b.
	// Registers written: f8-f13, p6, p7, ret0.

	.text
	.align 16
	.global __divdi3
	.proc __divdi3
	__divdi3:
	.regstk 2,0,0,0
	// Transfer inputs to FP registers.
	setf.sig f8 = in0
	setf.sig f9 = in1
	// Check divide by zero.
	cmp.ne.unc p0,p7=0,in1			// p7 = (in1 == 0)
	;;
	// Convert the inputs to FP, so that they won't be treated as unsigned.
	fcvt.xf f8 = f8
	fcvt.xf f9 = f9
	(p7) break 1
	;;
	// Compute the reciprocal approximation.
	frcpa.s1 f10, p6 = f8, f9		// f10 = y0; p6 gates the refinement
	;;
	// 3 Newton-Raphson iterations.
	(p6) fnma.s1 f11 = f9, f10, f1		// e  = 1 - b*y0
	(p6) fmpy.s1 f12 = f8, f10		// q0 = a*y0
	;;
	(p6) fmpy.s1 f13 = f11, f11		// f13 = e^2
	(p6) fma.s1 f12 = f11, f12, f12		// q1 = e*q0 + q0
	;;
	(p6) fma.s1 f10 = f11, f10, f10		// y1 = e*y0 + y0
	(p6) fma.s1 f11 = f13, f12, f12		// q2 = e^2*q1 + q1
	;;
	(p6) fma.s1 f10 = f13, f10, f10		// y2 = e^2*y1 + y1
	(p6) fnma.s1 f12 = f9, f11, f8		// r  = a - b*q2
	;;
	(p6) fma.s1 f10 = f12, f10, f11		// q3 = r*y2 + q2
	;;
	// Round quotient to an integer.
	// Not predicated: if p6 was clear, f10 is still the value frcpa wrote.
	fcvt.fx.trunc.s1 f10 = f10
	;;
	// Transfer result to GP registers.
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __divdi3
	#endif

	#ifdef L__moddi3
	// Compute a 64-bit integer modulus.
	//
	// From the Intel IA-64 Optimization Guide, choose the minimum latency
	// alternative.
	//
	// in0 holds the dividend (a). in1 holds the divisor (b).
	//
	// The truncated quotient q is computed in floating point as in a
	// division, then the remainder is formed with one integer
	// multiply-add: r = q * (-b) + a. A zero divisor executes "break 1".
	// f14 keeps the integer dividend for the final xma; in1 is negated in
	// place and reloaded into f9 once the FP divisor is no longer needed.
	// Registers written: f8-f14, p6, p7, in1, ret0.

	.text
	.align 16
	.global __moddi3
	.proc __moddi3
	__moddi3:
	.regstk 2,0,0,0
	// Transfer inputs to FP registers.
	setf.sig f14 = in0
	setf.sig f9 = in1
	// Check divide by zero.
	cmp.ne.unc p0,p7=0,in1			// p7 = (in1 == 0)
	;;
	// Convert the inputs to FP, so that they won't be treated as unsigned.
	fcvt.xf f8 = f14
	fcvt.xf f9 = f9
	(p7) break 1
	;;
	// Compute the reciprocal approximation.
	frcpa.s1 f10, p6 = f8, f9		// f10 = y0; p6 gates the refinement
	;;
	// 3 Newton-Raphson iterations.
	(p6) fmpy.s1 f12 = f8, f10		// q0 = a*y0
	(p6) fnma.s1 f11 = f9, f10, f1		// e  = 1 - b*y0
	;;
	(p6) fma.s1 f12 = f11, f12, f12		// q1 = e*q0 + q0
	(p6) fmpy.s1 f13 = f11, f11		// f13 = e^2
	;;
	(p6) fma.s1 f10 = f11, f10, f10		// y1 = e*y0 + y0
	(p6) fma.s1 f11 = f13, f12, f12		// q2 = e^2*q1 + q1
	;;
	sub in1 = r0, in1			// in1 = -b (integer)
	(p6) fma.s1 f10 = f13, f10, f10		// y2 = e^2*y1 + y1
	(p6) fnma.s1 f12 = f9, f11, f8		// rem = a - b*q2 (last FP use of f9)
	;;
	setf.sig f9 = in1			// f9 = -b as an integer significand
	(p6) fma.s1 f10 = f12, f10, f11		// q3 = rem*y2 + q2
	;;
	// Truncate the quotient to a signed integer. Not predicated: if p6
	// was clear, f10 is still the value frcpa wrote.
	fcvt.fx.trunc.s1 f10 = f10
	;;
	// r = q * (-b) + a
	xma.l f10 = f10, f9, f14
	;;
	// Transfer result to GP registers.
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __moddi3
	#endif

	#ifdef L__udivdi3
	// Compute a 64-bit unsigned integer quotient.
	//
	// From the Intel IA-64 Optimization Guide, choose the minimum latency
	// alternative.
	//
	// in0 holds the dividend. in1 holds the divisor.
	//
	// Same scheme as the signed 64-bit divide, but using the unsigned
	// conversions fcvt.xuf / fcvt.fxu. A zero divisor executes "break 1".
	// Notation: a = f8, b = f9, yN approximates 1/b, qN approximates a/b.
	// Registers written: f8-f13, p6, p7, ret0.

	.text
	.align 16
	.global __udivdi3
	.proc __udivdi3
	__udivdi3:
	.regstk 2,0,0,0
	// Transfer inputs to FP registers.
	setf.sig f8 = in0
	setf.sig f9 = in1
	// Check divide by zero.
	cmp.ne.unc p0,p7=0,in1			// p7 = (in1 == 0)
	;;
	// Convert the inputs to FP, to avoid FP software-assist faults.
	fcvt.xuf.s1 f8 = f8
	fcvt.xuf.s1 f9 = f9
	(p7) break 1
	;;
	// Compute the reciprocal approximation.
	frcpa.s1 f10, p6 = f8, f9		// f10 = y0; p6 gates the refinement
	;;
	// 3 Newton-Raphson iterations.
	(p6) fnma.s1 f11 = f9, f10, f1		// e  = 1 - b*y0
	(p6) fmpy.s1 f12 = f8, f10		// q0 = a*y0
	;;
	(p6) fmpy.s1 f13 = f11, f11		// f13 = e^2
	(p6) fma.s1 f12 = f11, f12, f12		// q1 = e*q0 + q0
	;;
	(p6) fma.s1 f10 = f11, f10, f10		// y1 = e*y0 + y0
	(p6) fma.s1 f11 = f13, f12, f12		// q2 = e^2*q1 + q1
	;;
	(p6) fma.s1 f10 = f13, f10, f10		// y2 = e^2*y1 + y1
	(p6) fnma.s1 f12 = f9, f11, f8		// r  = a - b*q2
	;;
	(p6) fma.s1 f10 = f12, f10, f11		// q3 = r*y2 + q2
	;;
	// Round quotient to an unsigned integer.
	// Not predicated: if p6 was clear, f10 is still the value frcpa wrote.
	fcvt.fxu.trunc.s1 f10 = f10
	;;
	// Transfer result to GP registers.
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __udivdi3
	#endif

	#ifdef L__umoddi3
	// Compute a 64-bit unsigned integer modulus.
	//
	// From the Intel IA-64 Optimization Guide, choose the minimum latency
	// alternative.
	//
	// in0 holds the dividend (a). in1 holds the divisor (b).
	//
	// The truncated unsigned quotient q is computed in floating point,
	// then the remainder is formed with one integer multiply-add:
	// r = q * (-b) + a. A zero divisor executes "break 1".
	// f14 keeps the integer dividend for the final xma; in1 is negated in
	// place and reloaded into f9 once the FP divisor is no longer needed.
	// Registers written: f8-f14, p6, p7, in1, ret0.

	.text
	.align 16
	.global __umoddi3
	.proc __umoddi3
	__umoddi3:
	.regstk 2,0,0,0
	// Transfer inputs to FP registers.
	setf.sig f14 = in0
	setf.sig f9 = in1
	// Check divide by zero.
	cmp.ne.unc p0,p7=0,in1			// p7 = (in1 == 0)
	;;
	// Convert the inputs to FP, to avoid FP software assist faults.
	fcvt.xuf.s1 f8 = f14
	fcvt.xuf.s1 f9 = f9
	(p7) break 1;
	;;
	// Compute the reciprocal approximation.
	frcpa.s1 f10, p6 = f8, f9		// f10 = y0; p6 gates the refinement
	;;
	// 3 Newton-Raphson iterations.
	(p6) fmpy.s1 f12 = f8, f10		// q0 = a*y0
	(p6) fnma.s1 f11 = f9, f10, f1		// e  = 1 - b*y0
	;;
	(p6) fma.s1 f12 = f11, f12, f12		// q1 = e*q0 + q0
	(p6) fmpy.s1 f13 = f11, f11		// f13 = e^2
	;;
	(p6) fma.s1 f10 = f11, f10, f10		// y1 = e*y0 + y0
	(p6) fma.s1 f11 = f13, f12, f12		// q2 = e^2*q1 + q1
	;;
	sub in1 = r0, in1			// in1 = -b (integer, modulo 2^64)
	(p6) fma.s1 f10 = f13, f10, f10		// y2 = e^2*y1 + y1
	(p6) fnma.s1 f12 = f9, f11, f8		// rem = a - b*q2 (last FP use of f9)
	;;
	setf.sig f9 = in1			// f9 = -b as an integer significand
	(p6) fma.s1 f10 = f12, f10, f11		// q3 = rem*y2 + q2
	;;
	// Round quotient to an unsigned integer.
	// Not predicated: if p6 was clear, f10 is still the value frcpa wrote.
	fcvt.fxu.trunc.s1 f10 = f10
	;;
	// r = q * (-b) + a
	xma.l f10 = f10, f9, f14
	;;
	// Transfer result to GP registers.
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __umoddi3
	#endif

	#ifdef L__divsi3
	// Compute a 32-bit integer quotient.
	//
	// From the Intel IA-64 Optimization Guide, choose the minimum latency
	// alternative.
	//
	// in0 holds the dividend. in1 holds the divisor.
	//
	// Only the low 32 bits of in0 and in1 are used: both are sign-extended
	// with sxt4 before the division. The quotient is computed in floating
	// point from the frcpa estimate and truncated to a signed integer
	// returned in ret0. A zero divisor executes "break 1".
	// Notation: a = f8, b = f9, y0 approximates 1/b, qN approximates a/b.
	// Registers written: r2, f8-f11, p6, p7, in0, in1, ret0.

	.text
	.align 16
	.global __divsi3
	.proc __divsi3
	__divsi3:
	.regstk 2,0,0,0
	// Check divide by zero. This compare is in the same instruction
	// group as the sxt4 of in1, so it reads in1 before sign extension.
	// Use the 4-byte compare so that only the low 32 bits (the value
	// actually divided by) are tested; a 64-bit compare would miss a
	// zero divisor whose upper 32 bits happen to be nonzero.
	cmp4.ne.unc p0,p7=0,in1			// p7 = (low 32 bits of in1 == 0)
	sxt4 in0 = in0
	sxt4 in1 = in1
	;;
	// Transfer the sign-extended inputs to FP registers.
	setf.sig f8 = in0
	setf.sig f9 = in1
	(p7) break 1
	;;
	// 0x0ffdd is the biased exponent (bias 0xffff) of 2^-34.
	mov r2 = 0x0ffdd
	fcvt.xf f8 = f8
	fcvt.xf f9 = f9
	;;
	setf.exp f11 = r2			// f11 = 2^-34
	frcpa.s1 f10, p6 = f8, f9		// f10 = y0; p6 gates the refinement
	;;
	(p6) fmpy.s1 f8 = f8, f10		// q0 = a*y0
	(p6) fnma.s1 f9 = f9, f10, f1		// e  = 1 - b*y0
	;;
	(p6) fma.s1 f8 = f9, f8, f8		// q1 = e*q0 + q0
	(p6) fma.s1 f9 = f9, f9, f11		// e1 = e^2 + 2^-34
	;;
	(p6) fma.s1 f10 = f9, f8, f8		// q2 = e1*q1 + q1
	;;
	// Truncate the quotient to a signed integer. Not predicated: if p6
	// was clear, f10 is still the value frcpa wrote.
	fcvt.fx.trunc.s1 f10 = f10
	;;
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __divsi3
	#endif

	#ifdef L__modsi3
	// Compute a 32-bit integer modulus.
	//
	// From the Intel IA-64 Optimization Guide, choose the minimum latency
	// alternative.
	//
	// in0 holds the dividend. in1 holds the divisor.
	//
	// Both inputs are sign-extended from 32 bits first. The truncated
	// quotient q is computed in floating point, then the remainder is
	// formed with one integer multiply-add: r = q * (-b) + a.
	// A zero divisor executes "break 1".
	// f13 keeps the integer dividend for the final xma; in1 is negated in
	// place and reloaded into f9 once the FP divisor is no longer needed.
	// Registers written: r2, f8-f13, p6, p7, in0, in1, ret0.

	.text
	.align 16
	.global __modsi3
	.proc __modsi3
	__modsi3:
	.regstk 2,0,0,0
	// 0x0ffdd is the biased exponent (bias 0xffff) of 2^-34.
	mov r2 = 0x0ffdd
	sxt4 in0 = in0
	sxt4 in1 = in1
	;;
	// r32 and r33 are the first two stacked registers, i.e. in0 and in1.
	setf.sig f13 = r32
	setf.sig f9 = r33
	// Check divide by zero.
	cmp.ne.unc p0,p7=0,in1			// p7 = (sign-extended in1 == 0)
	;;
	sub in1 = r0, in1			// in1 = -b (integer)
	fcvt.xf f8 = f13
	fcvt.xf f9 = f9
	;;
	setf.exp f11 = r2			// f11 = 2^-34
	frcpa.s1 f10, p6 = f8, f9		// f10 = y0; p6 gates the refinement
	(p7) break 1
	;;
	(p6) fmpy.s1 f12 = f8, f10		// q0 = a*y0
	(p6) fnma.s1 f10 = f9, f10, f1		// e  = 1 - b*y0 (last FP use of f9)
	;;
	setf.sig f9 = in1			// f9 = -b as an integer significand
	(p6) fma.s1 f12 = f10, f12, f12		// q1 = e*q0 + q0
	(p6) fma.s1 f10 = f10, f10, f11		// e1 = e^2 + 2^-34
	;;
	(p6) fma.s1 f10 = f10, f12, f12		// q2 = e1*q1 + q1
	;;
	// Truncate the quotient to a signed integer. Not predicated: if p6
	// was clear, f10 is still the value frcpa wrote.
	fcvt.fx.trunc.s1 f10 = f10
	;;
	xma.l f10 = f10, f9, f13		// r = q * (-b) + a
	;;
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __modsi3
	#endif

	#ifdef L__udivsi3
	// Compute a 32-bit unsigned integer quotient.
	//
	// From the Intel IA-64 Optimization Guide, choose the minimum latency
	// alternative.
	//
	// in0 holds the dividend. in1 holds the divisor.
	//
	// Both inputs are zero-extended from 32 bits first. The quotient is
	// computed in floating point from the frcpa estimate and truncated to
	// an unsigned integer returned in ret0. A zero divisor executes
	// "break 1".
	// Notation: a = f8, b = f9, y0 approximates 1/b, qN approximates a/b.
	// Registers written: r2, f8-f11, p6, p7, in0, in1, ret0.

	.text
	.align 16
	.global __udivsi3
	.proc __udivsi3
	__udivsi3:
	.regstk 2,0,0,0
	// 0x0ffdd is the biased exponent (bias 0xffff) of 2^-34.
	mov r2 = 0x0ffdd
	zxt4 in0 = in0
	zxt4 in1 = in1
	;;
	setf.sig f8 = in0
	setf.sig f9 = in1
	// Check divide by zero.
	cmp.ne.unc p0,p7=0,in1			// p7 = (zero-extended in1 == 0)
	;;
	// The zero-extended values are non-negative as 64-bit signed
	// integers, so the signed conversion is used here.
	fcvt.xf f8 = f8
	fcvt.xf f9 = f9
	(p7) break 1
	;;
	setf.exp f11 = r2			// f11 = 2^-34
	frcpa.s1 f10, p6 = f8, f9		// f10 = y0; p6 gates the refinement
	;;
	(p6) fmpy.s1 f8 = f8, f10		// q0 = a*y0
	(p6) fnma.s1 f9 = f9, f10, f1		// e  = 1 - b*y0
	;;
	(p6) fma.s1 f8 = f9, f8, f8		// q1 = e*q0 + q0
	(p6) fma.s1 f9 = f9, f9, f11		// e1 = e^2 + 2^-34
	;;
	(p6) fma.s1 f10 = f9, f8, f8		// q2 = e1*q1 + q1
	;;
	// Truncate the quotient to an unsigned integer. Not predicated: if
	// p6 was clear, f10 is still the value frcpa wrote.
	fcvt.fxu.trunc.s1 f10 = f10
	;;
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __udivsi3
	#endif

	#ifdef L__umodsi3
	// Compute a 32-bit unsigned integer modulus.
	//
	// From the Intel IA-64 Optimization Guide, choose the minimum latency
	// alternative.
	//
	// in0 holds the dividend. in1 holds the divisor.
	//
	// Both inputs are zero-extended from 32 bits first. The truncated
	// quotient q is computed in floating point, then the remainder is
	// formed with one integer multiply-add: r = q * (-b) + a.
	// A zero divisor executes "break 1".
	// f13 keeps the integer dividend for the final xma; in1 is negated in
	// place and reloaded into f9 once the FP divisor is no longer needed.
	// Registers written: r2, f8-f13, p6, p7, in0, in1, ret0.

	.text
	.align 16
	.global __umodsi3
	.proc __umodsi3
	__umodsi3:
	.regstk 2,0,0,0
	// 0x0ffdd is the biased exponent (bias 0xffff) of 2^-34.
	mov r2 = 0x0ffdd
	zxt4 in0 = in0
	zxt4 in1 = in1
	;;
	setf.sig f13 = in0
	setf.sig f9 = in1
	// Check divide by zero.
	cmp.ne.unc p0,p7=0,in1			// p7 = (zero-extended in1 == 0)
	;;
	sub in1 = r0, in1			// in1 = -b (integer, modulo 2^64)
	// The zero-extended values are non-negative as 64-bit signed
	// integers, so the signed conversion is used here.
	fcvt.xf f8 = f13
	fcvt.xf f9 = f9
	;;
	setf.exp f11 = r2			// f11 = 2^-34
	frcpa.s1 f10, p6 = f8, f9		// f10 = y0; p6 gates the refinement
	(p7) break 1;
	;;
	(p6) fmpy.s1 f12 = f8, f10		// q0 = a*y0
	(p6) fnma.s1 f10 = f9, f10, f1		// e  = 1 - b*y0 (last FP use of f9)
	;;
	setf.sig f9 = in1			// f9 = -b as an integer significand
	(p6) fma.s1 f12 = f10, f12, f12		// q1 = e*q0 + q0
	(p6) fma.s1 f10 = f10, f10, f11		// e1 = e^2 + 2^-34
	;;
	(p6) fma.s1 f10 = f10, f12, f12		// q2 = e1*q1 + q1
	;;
	// Truncate the quotient to an unsigned integer. Not predicated: if
	// p6 was clear, f10 is still the value frcpa wrote.
	fcvt.fxu.trunc.s1 f10 = f10
	;;
	xma.l f10 = f10, f9, f13		// r = q * (-b) + a
	;;
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __umodsi3
	#endif

	#ifdef L__save_stack_nonlocal
	// Notes on save/restore stack nonlocal: We read ar.bsp but write
	// ar.bspstore. This is because ar.bsp can be read at all times
	// (independent of the RSE mode) but since it's read-only we need to
	// restore the value via ar.bspstore. This is OK because
	// ar.bsp==ar.bspstore after executing "flushrs".

	// void __ia64_save_stack_nonlocal(void *save_area, void *stack_pointer)
	//
	// in0 = save_area, in1 = stack_pointer.
	// Layout written to the save area (byte offsets from save_area):
	//    +0  stack_pointer (in1)
	//    +8  ar.bsp
	//   +16  ar.rnat
	//   +24  the ar.pfs value returned by alloc
	// Registers written: r2, r16-r19, in0, ar.rsc. On return the two low
	// bits of ar.rsc are set to 3, whatever they were on entry.

	.text
	.align 16
	.global __ia64_save_stack_nonlocal
	.proc __ia64_save_stack_nonlocal
	__ia64_save_stack_nonlocal:
	{ .mmf
	alloc r18 = ar.pfs, 2, 0, 0, 0		// r18 = previous ar.pfs
	mov r19 = ar.rsc
	;;
	}
	{ .mmi
	flushrs
	st8 [in0] = in1, 24			// save_area+0 = sp; in0 -> save_area+24
	and r19 = 0x1c, r19			// keep only bits 2..4 (low mode bits = 0)
	;;
	}
	{ .mmi
	st8 [in0] = r18, -16			// save_area+24 = pfs; in0 -> save_area+8
	mov ar.rsc = r19			// switch to the masked ar.rsc value
	or r19 = 0x3, r19			// value to install before returning
	;;
	}
	{ .mmi
	mov r16 = ar.bsp
	mov r17 = ar.rnat
	adds r2 = 8, in0			// r2 -> save_area+16
	;;
	}
	{ .mmi
	st8 [in0] = r16				// save_area+8  = ar.bsp
	st8 [r2] = r17				// save_area+16 = ar.rnat
	}
	{ .mib
	mov ar.rsc = r19			// ar.rsc with the two low bits set
	br.ret.sptk.few rp
	;;
	}
	.endp __ia64_save_stack_nonlocal
	#endif

	#ifdef L__nonlocal_goto
	// void __ia64_nonlocal_goto(void *target_label, void *save_area,
	//			     void *static_chain);
	//
	// in0 = target_label, in1 = save_area, in2 = static_chain.
	// Reads four 8-byte words from the save area (byte offsets):
	//    +0 -> r12 (the stack pointer register)
	//    +8 -> ar.bspstore
	//   +16 -> ar.rnat
	//   +24 -> ar.pfs
	// then "returns" to target_label with r15 = static_chain.
	// Registers written: r12, r15-r20, rp, in1, ar.rsc, ar.bspstore,
	// ar.rnat, ar.pfs.

	.text
	.align 16
	.global __ia64_nonlocal_goto
	.proc __ia64_nonlocal_goto
	__ia64_nonlocal_goto:
	{ .mmi
	alloc r20 = ar.pfs, 3, 0, 0, 0		// r20 is not read again below
	ld8 r12 = [in1], 8			// r12 = saved stack pointer
	mov.ret.sptk rp = in0, .L0		// rp = target_label, used by the br.ret at .L0
	;;
	}
	{ .mmf
	ld8 r16 = [in1], 8			// r16 = saved ar.bsp
	mov r19 = ar.rsc
	;;
	}
	{ .mmi
	flushrs
	ld8 r17 = [in1], 8			// r17 = saved ar.rnat
	and r19 = 0x1c, r19			// keep only bits 2..4 (low mode bits = 0)
	;;
	}
	{ .mmi
	ld8 r18 = [in1]				// r18 = saved ar.pfs
	mov ar.rsc = r19			// switch to the masked ar.rsc value
	or r19 = 0x3, r19			// value to install before returning
	;;
	}
	{ .mmi
	mov ar.bspstore = r16
	;;
	mov ar.rnat = r17
	;;
	}
	{ .mmi
	loadrs
	invala
	mov r15 = in2				// pass static_chain in r15
	;;
	}
	.L0: { .mib
	mov ar.rsc = r19			// ar.rsc with the two low bits set
	mov ar.pfs = r18			// restore the saved ar.pfs
	br.ret.sptk.few rp			// continue at target_label
	;;
	}
	.endp __ia64_nonlocal_goto
	#endif

	#ifdef L__restore_stack_nonlocal
	// This is mostly the same as nonlocal_goto above.
	// ??? This has not been tested yet.

	// void __ia64_restore_stack_nonlocal(void *save_area)
	//
	// in0 = save_area. Reads four 8-byte words from it (byte offsets):
	//    +0 -> r12 (the stack pointer register)
	//    +8 -> ar.bspstore
	//   +16 -> ar.rnat
	//   +24 -> ar.pfs
	// and then returns through rp, which this routine does not modify.
	// Registers written: r12, r16-r20, in0, ar.rsc, ar.bspstore, ar.rnat,
	// ar.pfs.
	// NOTE(review): alloc declares 4 input registers although only in0 is
	// read here, and the .L0 label is not referenced within this procedure.

	.text
	.align 16
	.global __ia64_restore_stack_nonlocal
	.proc __ia64_restore_stack_nonlocal
	__ia64_restore_stack_nonlocal:
	{ .mmf
	alloc r20 = ar.pfs, 4, 0, 0, 0		// r20 is not read again below
	ld8 r12 = [in0], 8			// r12 = saved stack pointer
	;;
	}
	{ .mmb
	ld8 r16=[in0], 8			// r16 = saved ar.bsp
	mov r19 = ar.rsc
	;;
	}
	{ .mmi
	flushrs
	ld8 r17 = [in0], 8			// r17 = saved ar.rnat
	and r19 = 0x1c, r19			// keep only bits 2..4 (low mode bits = 0)
	;;
	}
	{ .mmf
	ld8 r18 = [in0]				// r18 = saved ar.pfs
	mov ar.rsc = r19			// switch to the masked ar.rsc value
	;;
	}
	{ .mmi
	mov ar.bspstore = r16
	;;
	mov ar.rnat = r17
	or r19 = 0x3, r19			// value to install before returning
	;;
	}
	{ .mmf
	loadrs
	invala
	;;
	}
	.L0: { .mib
	mov ar.rsc = r19			// ar.rsc with the two low bits set
	mov ar.pfs = r18			// restore the saved ar.pfs
	br.ret.sptk.few rp
	;;
	}
	.endp __ia64_restore_stack_nonlocal
	#endif

	#ifdef L__trampoline
	// Implement the nested function trampoline. This is out of line
	// so that we don't have to bother with flushing the icache, as
	// well as making the on-stack trampoline smaller.
	//
	// The trampoline has the following form:
	//
	//		+-------------------+ >
	//	TRAMP:	| __ia64_trampoline | |
	//		+-------------------+  > fake function descriptor
	//		| TRAMP+16          | |
	//		+-------------------+ >
	//		| target descriptor |
	//		+-------------------+
	//		| static link       |
	//		+-------------------+
	//
	// On entry r1 is expected to hold the second word of the fake
	// descriptor, i.e. TRAMP+16, so [r1] is the target descriptor pointer
	// and [r1+8] the static link. The code loads the static link into
	// r15, loads the two words of the target descriptor into b6 (branch
	// address) and r1, and branches to b6.
	// Registers written: r1, r2, r3, r15, b6.

	.text
	.align 16
	.global __ia64_trampoline
	.proc __ia64_trampoline
	__ia64_trampoline:
	{ .mmi
	ld8 r2 = [r1], 8			// r2 = target descriptor; r1 -> static link
	;;
	ld8 r15 = [r1]				// r15 = static link
	}
	{ .mmi
	ld8 r3 = [r2], 8			// r3 = first word of the target descriptor
	;;
	ld8 r1 = [r2]				// r1 = second word of the target descriptor
	mov b6 = r3
	}
	{ .bbb
	br.sptk.many b6				// jump to the target; rp is left untouched
	;;
	}
	.endp __ia64_trampoline
	#endif

	#ifdef SHARED
	// Thunks for backward compatibility.
	//
	// Each thunk below is a single unconditional branch from an old
	// "tf"-named entry point to the corresponding "xf"-named routine, which
	// is defined outside this file. No registers are touched, so arguments
	// and the return address pass straight through to the target.
	// NOTE(review): only the first thunk has its own .text directive; the
	// other two rely on the section already in effect.
	#ifdef L_fixtfdi
	.text
	.align 16
	.global __fixtfti
	.proc __fixtfti
	__fixtfti:
	{ .bbb
	br.sptk.many __fixxfti			// tail-branch to the xf variant
	;;
	}
	.endp __fixtfti
	#endif
	#ifdef L_fixunstfdi
	.align 16
	.global __fixunstfti
	.proc __fixunstfti
	__fixunstfti:
	{ .bbb
	br.sptk.many __fixunsxfti		// tail-branch to the xf variant
	;;
	}
	.endp __fixunstfti
	#endif
	#ifdef L_floatditf
	.align 16
	.global __floattitf
	.proc __floattitf
	__floattitf:
	{ .bbb
	br.sptk.many __floattixf		// tail-branch to the xf variant
	;;
	}
	.endp __floattitf
	#endif
	#endif