/* gcc/config/score/mul-div.S - gcc - Git at Google */

 /* Copyright (C) 2005, 2007 Free Software Foundation, Inc.
    Contributed by Sunnorth

    This file is part of GCC.

    GCC is free software; you can redistribute it and/or modify it
    under the terms of the GNU General Public License as published
    by the Free Software Foundation; either version 3, or (at your
    option) any later version.

    GCC is distributed in the hope that it will be useful, but WITHOUT
    ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
    License for more details.

    You should have received a copy of the GNU General Public License
    along with GCC; see the file COPYING3.  If not see
    <http://www.gnu.org/licenses/>.  */

 /* Register aliases used by the routines below.
    ra is the register the routines return through (br ra).
    a0 and a1 carry the two operands; the routines also leave their
    result in r4, which is the register a0 names.
    a2, a3 and v0 are defined here but not referenced by any instruction
    in the code shown in this file.  */
 #define ra r3
 #define a0 r4
 #define a1 r5
 #define a2 r6
 #define a3 r7
 #define v0 r23

 /* Scratch registers used by the multiply and divide routines.  */
 #define t0 r8
 #define t1 r9
 #define t2 r10
 #define t3 r11
 #define t4 r22

 #ifndef __pic__
 #if !defined(L_mulsi3) && !defined(L_divsi3)
        .text
        .global _flush_cache
 #ifdef __score3__
 /* _flush_cache, __score3__ build: no cache maintenance is performed,
    the routine returns to its caller at once.  */
 _flush_cache:
         br      r3
 #else
 /* _flush_cache
    In:    r4, r5.  r4 is used as the starting address and r5 >> 4 as the
           number of 16-byte steps, so r5 is presumably a byte count --
           TODO confirm against the callers.
    Uses:  r8 (address cursor, then the cr4 value), r9 (step count),
           r10, sr0.
    Walks the range once with cache op 0xe and once with cache op 0x2;
    in between, cr4 bits 3 and 2 select the optional 0xc / 0x4 cache ops.
    r10 is zeroed before either bit is tested, so the 0x4 op never sees
    a stale r10 when the 0xc path is skipped.  */
 _flush_cache:
         srli    r9, r5, 4               # r9 = r5 / 16
         mv      r8, r4                  # r8 = address cursor
         mtsr    r9, sr0                 # sr0 = step count (presumably what bcnz counts down)
 1:
         cache   0xe, [r8, 0]            # write back invalid dcache
         addi    r8, 16
         bcnz    1b
         mfcr    r8, cr4
         ldi     r10, 0                  # address operand for both cache ops below
         bittst! r8, 0x3                 # if LDM is enabled, write back LDM
         beq!    6f
         cache   0xc, [r10, 0]
 6:
         bittst! r8, 0x2                 # if LIM is enabled, refill it
         beq!    7f
         cache   0x4, [r10, 0]
 7:
         #nop!
         #nop!
         #nop!
         #nop!
         #nop!
         mv      r8, r4                  # restart at the first address
         mtsr    r9, sr0                 # reload the step count
 2:
         cache   0x2, [r8, 0]            # invalid unlock icache
         #nop!
         #nop!
         #nop!
         #nop!
         #nop!
         addi    r8, 16
         bcnz    2b
         br      r3
 #endif
 #endif

 /* FUNCTION
    (U) INT32 __mulsi3 ((U) INT32 a0, (U) INT32 a1);
    __umulsi3 labels the same entry point.
    The product is moved into r4 (the register a0 names) before the
    routine returns through ra.
    REGISTERS:
         use     t0      low bit of the multiplier for this step
         use     t1      running sum (the product)
         modify  a0      multiplicand, shifted left once per step
         a1      -> become 0 (shifted right once per step)
    NOTE:
    a plain shift-and-add loop; the original author found this to
    perform better.  The loop body always runs at least once.  */

 #ifdef L_mulsi3
         .text
         .global __umulsi3
         .global __mulsi3
         /* signed multiplication (32x32); also used for unsigned  */
         .ent    __mulsi3
 __umulsi3:
 __mulsi3:
         li      t1, 0                   # product = 0
 __mulsi3_loop:
         andri.c t0, a1, 1               # t0 = multiplier[0]
         srli    a1, a1, 1               # a1 /= 2
         beq     __mulsi3_loop2          # skip if (t0 == 0)
         add     t1, t1, a0              # add multiplicand
 __mulsi3_loop2:
         slli    a0, a0, 1               # multiplicand mul 2
         cmpi.c  a1, 0
         bne     __mulsi3_loop           # more multiplier bits left
         mv      r4, t1                  # result goes back in r4
         br      ra
         .end    __mulsi3
 #endif /* L_mulsi3 */

 /* FUNCTION
    UINT32 (v0) = __udivsi3 (UINT32 (a0), UINT32 (a1));
    INT32 (v0) = __divsi3 (INT32 (a0),  INT32 (a1));
    UINT32 (v0) = __umodsi3 (UINT32 (a0), UINT32 (a1));
    INT32 (v0) = __modsi3 (INT32 (a0),  INT32 (a1));
    DESCRIPTION
    performs 32-bit division/modulo.  The code leaves the result in r4.
    REGISTERS
    used t0      bit-index
         t1
         t4      quotient
    modify a0    becomes remainder  */
 #ifdef L_divsi3
         .text
         .global __udivsi3
         .global __umodsi3
         .global __divsi3
         .global __modsi3

         /* unsigned division
            In:   a0 = dividend, a1 = divisor.
            Out:  r4 = quotient (built up in t4),
                  a1 = remainder (copied from a0 at __uds_exit).
            Uses: t0 as the one-bit mask for the quotient bit being tried.
            A zero divisor branches straight to __uds_exit, so the
            quotient is 0 and the remainder is the untouched dividend.
            NOTE(review): the blt after "li t0, 1" presumably still tests
            the flags set by the preceding cmpi.c -- confirm that li
            leaves the flags alone.
            NOTE(review): bcc after "cmp.c a0, a1" is presumably taken
            when a0 < a1 unsigned -- confirm against the ISA manual.  */
         .ent    __udivsi3
 __udivsi3:
         li      t4, 0                   # quotient = 0
         cmpi.c  a1, 0
         beq     __uds_exit              # divisor is zero
         li      t0, 1                   # quotient bit mask
         blt     __uds_ok                # divisor top bit already set: no shifting
 __uds_normalize:
         cmp.c   a0, a1
         bcc     __uds_ok
         slli    a1, a1, 1               # divisor <<= 1
         slli    t0, t0, 1               # mask moves with the divisor
         cmpi.c  a1, 0
         bge     __uds_normalize         # keep going while a1 is non-negative
 __uds_ok:
 __uds_loop2:
         cmp.c   a0, a1
         bcc     __uds_loop3             # skip the subtract for this bit
         sub     a0, a0, a1              # take the shifted divisor out of a0
         or      t4, t4, t0              # record the quotient bit
 __uds_loop3:
         srli    t0, t0, 1
         srli    a1, a1, 1
         cmpi.c  t0, 0
         bne     __uds_loop2             # until the mask is shifted out
 __uds_exit:
         mv      a1, a0                  # a1 = remainder
         mv      r4, t4                  # r4 = quotient
         br      ra
         .end    __udivsi3

         /* unsigned modulus
            Calls __udivsi3 and returns the remainder it leaves in a1.
            ra is parked in t3 across the call (jl presumably overwrites
            ra with the return address -- confirm); the routine returns
            through t3.  */
         .ent    __umodsi3
 __umodsi3:
         mv      t3, ra                  # keep our own return address
         jl      __udivsi3
         mv      r4, a1                  # result = remainder
         br      t3
         .end    __umodsi3

         /* abs and div
            Negates a0 and/or a1 when they compare below zero, then
            branches (does not call) to __udivsi3.  The br ra at the end
            of __udivsi3 therefore returns to whoever called __orgsi3.
            Not exported with .global; used by __divsi3 and __modsi3.  */
         .ent    __orgsi3
 __orgsi3:
         cmpi.c  a0, 0
         bge     __orgsi3_a0p            # a0 already non-negative
         neg     a0, a0
 __orgsi3_a0p:
         cmpi.c  a1, 0
         bge     __udivsi3               # a1 already non-negative
         neg     a1, a1
         b       __udivsi3               # goto udivsi3
         .end    __orgsi3

         /* signed division
            t3 keeps the return address, t2 = a0 ^ a1 so that t2 compares
            below zero exactly when the operand signs differ.  After
            __orgsi3 has produced the quotient of the magnitudes in r4,
            the quotient is negated when t2 is negative.
            __divsi3_adjust is also the tail of __modsi3, which arrives
            with its own t2 and t3 already set up.  */
         .ent    __divsi3
 __divsi3:
         mv      t3, ra                  # keep our own return address
         xor     t2, a0, a1              # sign of the result
         jl      __orgsi3
 __divsi3_adjust:
         cmpi.c  t2, 0
         bge     __divsi3_exit           # result stays as it is
         neg     r4, r4
 __divsi3_exit:
         br      t3
         .end    __divsi3

         /* signed modulus
            t3 keeps the return address and t2 a copy of the dividend, so
            the remainder takes the sign of the dividend.  __orgsi3
            divides the magnitudes; the remainder it leaves in a1 is moved
            to r4 and the shared tail __divsi3_adjust (inside __divsi3)
            negates it when t2 is negative and returns through t3.  */
         .ent    __modsi3
 __modsi3:
         mv      t3, ra                  # keep our own return address
         mv      t2, a0                  # sign of the result = sign of dividend
         jl      __orgsi3
         mv      r4, a1                  # result = remainder
         b       __divsi3_adjust
         .end    __modsi3

 #endif /* L_divsi3 */
 #else /* -fPIC */
 #if !defined(L_mulsi3) && !defined(L_divsi3)
         .set pic
         .text
         .global _flush_cache
 #ifdef __score3__
 /* _flush_cache, __score3__ build: no cache maintenance is performed,
    the routine returns to its caller at once.  */
 _flush_cache:
         br      r3
 #else
 /* _flush_cache (PIC build)
    In:    r4, r5.  r4 is used as the starting address and r5 >> 4 as the
           number of 16-byte steps, so r5 is presumably a byte count --
           TODO confirm against the callers.
    Uses:  r8 (address cursor, then the cr4 value), r9 (step count),
           r10, sr0.
    Walks the range once with cache op 0xe and once with cache op 0x2;
    in between, cr4 bits 3 and 2 select the optional 0xc / 0x4 cache ops.
    r10 is zeroed before either bit is tested, so the 0x4 op never sees
    a stale r10 when the 0xc path is skipped.
    The lines marked "pic used" adjust r0 by -8 on entry and +8 on exit
    around the .cpload / .cprestore directives.  */
 _flush_cache:
         addi    r0, -8                  # pic used
         .cpload r29                     # pic used
         srli    r9, r5, 4               # r9 = r5 / 16
         mv      r8, r4                  # r8 = address cursor
         mtsr    r9, sr0                 # sr0 = step count (presumably what bcnz counts down)
 1:
         cache   0xe, [r8, 0]            # write back invalid dcache
         addi    r8, 16
         bcnz    1b
         mfcr    r8, cr4
         ldi     r10, 0                  # address operand for both cache ops below
         bittst! r8, 0x3                 # if LDM is enabled, write back LDM
         beq!    6f
         cache   0xc, [r10, 0]
 6:
         bittst! r8, 0x2                 # if LIM is enabled, refill it
         beq!    7f
         cache   0x4, [r10, 0]
 7:
         #nop!
         #nop!
         #nop!
         #nop!
         #nop!
         mv      r8, r4                  # restart at the first address
         mtsr    r9, sr0                 # reload the step count
 2:
         cache   0x2, [r8, 0]            # invalid unlock icache
         #nop!
         #nop!
         #nop!
         #nop!
         #nop!
         addi    r8, 16
         bcnz    2b
         .cprestore r0, 12               # pic used
         addi    r0, 8                   # pic used
         br      r3
 #endif
 #endif

 /* FUNCTION
    (U) INT32 __mulsi3 ((U) INT32 a0, (U) INT32 a1);
    __umulsi3 labels the same entry point.  PIC build.
    The product is moved into r4 (the register a0 names) before the
    routine returns through ra.
    REGISTERS:
         use     t0      low bit of the multiplier for this step
         use     t1      running sum (the product)
         modify  a0      multiplicand, shifted left once per step
         a1      -> become 0 (shifted right once per step)
    NOTE:
    a plain shift-and-add loop; the original author found this to
    perform better.  The loop body always runs at least once.
    The lines marked "pic used" adjust r0 by -8 on entry and +8 on exit
    around the .cpload / .cprestore directives.  */

 #ifdef L_mulsi3
         .set pic
         .text
         .global __umulsi3
         .global __mulsi3
         /* signed multiplication (32x32); also used for unsigned  */
         .ent    __mulsi3
 __umulsi3:
 __mulsi3:
         addi    r0, -8                  # pic used
         .cpload r29                     # pic used
         li      t1, 0                   # product = 0
 __mulsi3_loop:
         andri.c t0, a1, 1               # t0 = multiplier[0]
         srli    a1, a1, 1               # a1 /= 2
         beq     __mulsi3_loop2          # skip if (t0 == 0)
         add     t1, t1, a0              # add multiplicand
 __mulsi3_loop2:
         slli    a0, a0, 1               # multiplicand mul 2
         cmpi.c  a1, 0
         bne     __mulsi3_loop           # more multiplier bits left
         mv      r4, t1                  # result goes back in r4
         .cprestore r0, 12               # pic used
         addi    r0, 8                   # pic used
         br      ra
         .end    __mulsi3
 #endif /* L_mulsi3 */

 /* FUNCTION
    UINT32 (v0) = __udivsi3 (UINT32 (a0), UINT32 (a1));
    INT32 (v0) = __divsi3 (INT32 (a0),  INT32 (a1));
    UINT32 (v0) = __umodsi3 (UINT32 (a0), UINT32 (a1));
    INT32 (v0) = __modsi3 (INT32 (a0),  INT32 (a1));
    DESCRIPTION
    performs 32-bit division/modulo.  The code leaves the result in r4.
    REGISTERS
    used t0      bit-index
         t1
         t4      quotient
    modify a0    becomes remainder  */
 #ifdef L_divsi3
         .set pic
         .text
         .global __udivsi3
         .global __umodsi3
         .global __divsi3
         .global __modsi3

         /* unsigned division (PIC build)
            In:   a0 = dividend, a1 = divisor.
            Out:  r4 = quotient (built up in t4),
                  a1 = remainder (copied from a0 at __uds_exit).
            Uses: t0 as the one-bit mask for the quotient bit being tried.
            A zero divisor branches straight to __uds_exit, so the
            quotient is 0 and the remainder is the untouched dividend.
            The lines marked "pic used" adjust r0 by -8 on entry and +8
            on exit; every path leaves through __uds_exit, so the two
            adjustments always pair up.
            NOTE(review): the blt after "li t0, 1" presumably still tests
            the flags set by the preceding cmpi.c -- confirm that li
            leaves the flags alone.
            NOTE(review): bcc after "cmp.c a0, a1" is presumably taken
            when a0 < a1 unsigned -- confirm against the ISA manual.  */
         .ent    __udivsi3
 __udivsi3:
         addi    r0, -8                  # pic used
         .cpload r29                     # pic used
         li      t4, 0                   # quotient = 0
         cmpi.c  a1, 0
         beq     __uds_exit              # divisor is zero
         li      t0, 1                   # quotient bit mask
         blt     __uds_ok                # divisor top bit already set: no shifting
 __uds_normalize:
         cmp.c   a0, a1
         bcc     __uds_ok
         slli    a1, a1, 1               # divisor <<= 1
         slli    t0, t0, 1               # mask moves with the divisor
         cmpi.c  a1, 0
         bge     __uds_normalize         # keep going while a1 is non-negative
 __uds_ok:
 __uds_loop2:
         cmp.c   a0, a1
         bcc     __uds_loop3             # skip the subtract for this bit
         sub     a0, a0, a1              # take the shifted divisor out of a0
         or      t4, t4, t0              # record the quotient bit
 __uds_loop3:
         srli    t0, t0, 1
         srli    a1, a1, 1
         cmpi.c  t0, 0
         bne     __uds_loop2             # until the mask is shifted out
 __uds_exit:
         mv      a1, a0                  # a1 = remainder
         mv      r4, t4                  # r4 = quotient
         .cprestore r0, 12               # pic used
         addi    r0, 8                   # pic used
         br      ra
         .end    __udivsi3

         /* unsigned modulus (PIC build)
            Calls __udivsi3 through r29 and returns the remainder that
            __udivsi3 leaves in a1.  ra is parked in t3 across the call
            (brl presumably overwrites ra with the return address --
            confirm); the routine returns through t3.
            No scratch register is cleared here: __udivsi3 initialises
            the registers it uses (t0, t4) itself, and the non-PIC
            variant of this routine does not clear one either.
            The lines marked "pic used" adjust r0 by -8 on entry and +8
            on exit around the .cpload / .cprestore directives.  */
         .ent    __umodsi3
 __umodsi3:
         addi    r0, -8                  # pic used
         .cpload r29                     # pic used
         mv      t3, ra                  # keep our own return address
         la      r29, __udivsi3
         brl     r29
         mv      r4, a1                  # result = remainder
         .cprestore r0, 12               # pic used
         addi    r0, 8                   # pic used
         br      t3
         .end    __umodsi3

         /* abs and div (PIC build)
            Negates a0 and/or a1 when they compare below zero, then
            branches (does not call) to __udivsi3.  The br ra at the end
            of __udivsi3 therefore returns to whoever called __orgsi3.
            Not exported with .global; used by __divsi3 and __modsi3.
            This routine has no "pic used" prologue of its own; the one
            in __udivsi3 runs after the branch.
            NOTE(review): callers reach this routine with r29 loaded by
            "la r29, __orgsi3", and __udivsi3 then executes .cpload r29
            with that value -- confirm this is acceptable.  */
         .ent    __orgsi3
 __orgsi3:
         cmpi.c  a0, 0
         bge     __orgsi3_a0p            # a0 already non-negative
         neg     a0, a0
 __orgsi3_a0p:
         cmpi.c  a1, 0
         bge     __udivsi3               # a1 already non-negative
         neg     a1, a1
         b       __udivsi3               # goto udivsi3
         .end    __orgsi3

         /* signed division (PIC build)
            t3 keeps the return address, t2 = a0 ^ a1 so that t2 compares
            below zero exactly when the operand signs differ.  After
            __orgsi3 (called through r29) has produced the quotient of
            the magnitudes in r4, the quotient is negated when t2 is
            negative.
            __divsi3_adjust is also the tail of __modsi3, which arrives
            with its own t2 and t3 set up and its own "addi r0, -8"
            already done; the "addi r0, 8" at __divsi3_exit undoes it for
            both entry points.  */
         .ent    __divsi3
 __divsi3:
         addi    r0, -8                  # pic used
         .cpload r29                     # pic used
         mv      t3, ra                  # keep our own return address
         xor     t2, a0, a1              # sign of the result
         la      r29, __orgsi3
         brl     r29
 __divsi3_adjust:
         cmpi.c  t2, 0
         bge     __divsi3_exit           # result stays as it is
         neg     r4, r4
 __divsi3_exit:
         .cprestore r0, 12               # pic used
         addi    r0, 8                   # pic used
         br      t3
         .end    __divsi3

         /* signed modulus (PIC build)
            t3 keeps the return address and t2 a copy of the dividend, so
            the remainder takes the sign of the dividend.  __orgsi3
            (called through r29) divides the magnitudes; the remainder it
            leaves in a1 is moved to r4 and the shared tail
            __divsi3_adjust (inside __divsi3) negates it when t2 is
            negative.
            There is no epilogue here: the "addi r0, -8" below is undone
            by the "addi r0, 8" at __divsi3_exit, which also performs the
            return through t3.  */
         .ent    __modsi3
 __modsi3:
         addi    r0, -8                  # pic used
         .cpload r29                     # pic used
         mv      t3, ra                  # keep our own return address
         mv      t2, a0                  # sign of the result = sign of dividend
         la      r29, __orgsi3
         brl     r29
         mv      r4, a1                  # result = remainder
         b       __divsi3_adjust
         .end    __modsi3

 #endif /*L_divsi3 */
 #endif
	/* Copyright (C) 2005, 2007 Free Software Foundation, Inc.
	Contributed by Sunnorth

	This file is part of GCC.

	GCC is free software; you can redistribute it and/or modify it
	under the terms of the GNU General Public License as published
	by the Free Software Foundation; either version 3, or (at your
	option) any later version.

	GCC is distributed in the hope that it will be useful, but WITHOUT
	ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
	or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
	License for more details.

	You should have received a copy of the GNU General Public License
	along with GCC; see the file COPYING3. If not see
	<http://www.gnu.org/licenses/>. */

	/* Register aliases used by the routines below.
	ra is the register the routines return through (br ra).
	a0 and a1 carry the two operands; the routines also leave their
	result in r4, which is the register a0 names.
	a2, a3 and v0 are defined here but not referenced by any instruction
	in the code shown in this file. */
	#define ra r3
	#define a0 r4
	#define a1 r5
	#define a2 r6
	#define a3 r7
	#define v0 r23

	/* Scratch registers used by the multiply and divide routines. */
	#define t0 r8
	#define t1 r9
	#define t2 r10
	#define t3 r11
	#define t4 r22

	#ifndef __pic__
	#if !defined(L_mulsi3) && !defined(L_divsi3)
	.text
	.global _flush_cache
	#ifdef __score3__
	/* _flush_cache, __score3__ build: no cache maintenance is performed,
	the routine returns to its caller at once. */
	_flush_cache:
	br r3
	#else
	/* _flush_cache
	In: r4, r5. r4 is used as the starting address and r5 >> 4 as the
	number of 16-byte steps, so r5 is presumably a byte count --
	TODO confirm against the callers.
	Uses: r8 (address cursor, then the cr4 value), r9 (step count),
	r10, sr0.
	Walks the range once with cache op 0xe and once with cache op 0x2;
	in between, cr4 bits 3 and 2 select the optional 0xc / 0x4 cache ops.
	r10 is zeroed before either bit is tested, so the 0x4 op never sees
	a stale r10 when the 0xc path is skipped. */
	_flush_cache:
	srli r9, r5, 4 # r9 = r5 / 16
	mv r8, r4 # r8 = address cursor
	mtsr r9, sr0 # sr0 = step count (presumably what bcnz counts down)
	1:
	cache 0xe, [r8, 0] # write back invalid dcache
	addi r8, 16
	bcnz 1b
	mfcr r8, cr4
	ldi r10, 0 # address operand for both cache ops below
	bittst! r8, 0x3 # if LDM is enabled, write back LDM
	beq! 6f
	cache 0xc, [r10, 0]
	6:
	bittst! r8, 0x2 # if LIM is enabled, refill it
	beq! 7f
	cache 0x4, [r10, 0]
	7:
	#nop!
	#nop!
	#nop!
	#nop!
	#nop!
	mv r8, r4 # restart at the first address
	mtsr r9, sr0 # reload the step count
	2:
	cache 0x2, [r8, 0] # invalid unlock icache
	#nop!
	#nop!
	#nop!
	#nop!
	#nop!
	addi r8, 16
	bcnz 2b
	br r3
	#endif
	#endif

	/* FUNCTION
	(U) INT32 __mulsi3 ((U) INT32 a0, (U) INT32 a1);
	__umulsi3 labels the same entry point.
	The product is moved into r4 (the register a0 names) before the
	routine returns through ra.
	REGISTERS:
	use t0 low bit of the multiplier for this step
	use t1 running sum (the product)
	modify a0 multiplicand, shifted left once per step
	a1 -> become 0 (shifted right once per step)
	NOTE:
	a plain shift-and-add loop; the original author found this to
	perform better. The loop body always runs at least once. */

	#ifdef L_mulsi3
	.text
	.global __umulsi3
	.global __mulsi3
	/* signed multiplication (32x32); also used for unsigned */
	.ent __mulsi3
	__umulsi3:
	__mulsi3:
	li t1, 0 # product = 0
	__mulsi3_loop:
	andri.c t0, a1, 1 # t0 = multiplier[0]
	srli a1, a1, 1 # a1 /= 2
	beq __mulsi3_loop2 # skip if (t0 == 0)
	add t1, t1, a0 # add multiplicand
	__mulsi3_loop2:
	slli a0, a0, 1 # multiplicand mul 2
	cmpi.c a1, 0
	bne __mulsi3_loop # more multiplier bits left
	mv r4, t1 # result goes back in r4
	br ra
	.end __mulsi3
	#endif /* L_mulsi3 */

	/* FUNCTION
	UINT32 (v0) = __udivsi3 (UINT32 (a0), UINT32 (a1));
	INT32 (v0) = __divsi3 (INT32 (a0), INT32 (a1));
	UINT32 (v0) = __umodsi3 (UINT32 (a0), UINT32 (a1));
	INT32 (v0) = __modsi3 (INT32 (a0), INT32 (a1));
	DESCRIPTION
	performs 32-bit division/modulo. The code leaves the result in r4.
	REGISTERS
	used t0 bit-index
	t1
	t4 quotient
	modify a0 becomes remainder */
	#ifdef L_divsi3
	.text
	.global __udivsi3
	.global __umodsi3
	.global __divsi3
	.global __modsi3

	/* unsigned division
	In: a0 = dividend, a1 = divisor.
	Out: r4 = quotient (built up in t4),
	a1 = remainder (copied from a0 at __uds_exit).
	Uses: t0 as the one-bit mask for the quotient bit being tried.
	A zero divisor branches straight to __uds_exit, so the
	quotient is 0 and the remainder is the untouched dividend.
	NOTE(review): the blt after "li t0, 1" presumably still tests
	the flags set by the preceding cmpi.c -- confirm that li
	leaves the flags alone.
	NOTE(review): bcc after "cmp.c a0, a1" is presumably taken
	when a0 < a1 unsigned -- confirm against the ISA manual. */
	.ent __udivsi3
	__udivsi3:
	li t4, 0 # quotient = 0
	cmpi.c a1, 0
	beq __uds_exit # divisor is zero
	li t0, 1 # quotient bit mask
	blt __uds_ok # divisor top bit already set: no shifting
	__uds_normalize:
	cmp.c a0, a1
	bcc __uds_ok
	slli a1, a1, 1 # divisor <<= 1
	slli t0, t0, 1 # mask moves with the divisor
	cmpi.c a1, 0
	bge __uds_normalize # keep going while a1 is non-negative
	__uds_ok:
	__uds_loop2:
	cmp.c a0, a1
	bcc __uds_loop3 # skip the subtract for this bit
	sub a0, a0, a1 # take the shifted divisor out of a0
	or t4, t4, t0 # record the quotient bit
	__uds_loop3:
	srli t0, t0, 1
	srli a1, a1, 1
	cmpi.c t0, 0
	bne __uds_loop2 # until the mask is shifted out
	__uds_exit:
	mv a1, a0 # a1 = remainder
	mv r4, t4 # r4 = quotient
	br ra
	.end __udivsi3

	/* unsigned modulus
	Calls __udivsi3 and returns the remainder it leaves in a1.
	ra is parked in t3 across the call (jl presumably overwrites
	ra with the return address -- confirm); the routine returns
	through t3. */
	.ent __umodsi3
	__umodsi3:
	mv t3, ra # keep our own return address
	jl __udivsi3
	mv r4, a1 # result = remainder
	br t3
	.end __umodsi3

	/* abs and div
	Negates a0 and/or a1 when they compare below zero, then
	branches (does not call) to __udivsi3. The br ra at the end
	of __udivsi3 therefore returns to whoever called __orgsi3.
	Not exported with .global; used by __divsi3 and __modsi3. */
	.ent __orgsi3
	__orgsi3:
	cmpi.c a0, 0
	bge __orgsi3_a0p # a0 already non-negative
	neg a0, a0
	__orgsi3_a0p:
	cmpi.c a1, 0
	bge __udivsi3 # a1 already non-negative
	neg a1, a1
	b __udivsi3 # goto udivsi3
	.end __orgsi3

	/* signed division
	t3 keeps the return address, t2 = a0 ^ a1 so that t2 compares
	below zero exactly when the operand signs differ. After
	__orgsi3 has produced the quotient of the magnitudes in r4,
	the quotient is negated when t2 is negative.
	__divsi3_adjust is also the tail of __modsi3, which arrives
	with its own t2 and t3 already set up. */
	.ent __divsi3
	__divsi3:
	mv t3, ra # keep our own return address
	xor t2, a0, a1 # sign of the result
	jl __orgsi3
	__divsi3_adjust:
	cmpi.c t2, 0
	bge __divsi3_exit # result stays as it is
	neg r4, r4
	__divsi3_exit:
	br t3
	.end __divsi3

	/* signed modulus
	t3 keeps the return address and t2 a copy of the dividend, so
	the remainder takes the sign of the dividend. __orgsi3
	divides the magnitudes; the remainder it leaves in a1 is moved
	to r4 and the shared tail __divsi3_adjust (inside __divsi3)
	negates it when t2 is negative and returns through t3. */
	.ent __modsi3
	__modsi3:
	mv t3, ra # keep our own return address
	mv t2, a0 # sign of the result = sign of dividend
	jl __orgsi3
	mv r4, a1 # result = remainder
	b __divsi3_adjust
	.end __modsi3

	#endif /* L_divsi3 */
	#else /* -fPIC */
	#if !defined(L_mulsi3) && !defined(L_divsi3)
	.set pic
	.text
	.global _flush_cache
	#ifdef __score3__
	/* _flush_cache, __score3__ build: no cache maintenance is performed,
	the routine returns to its caller at once. */
	_flush_cache:
	br r3
	#else
	/* _flush_cache (PIC build)
	In: r4, r5. r4 is used as the starting address and r5 >> 4 as the
	number of 16-byte steps, so r5 is presumably a byte count --
	TODO confirm against the callers.
	Uses: r8 (address cursor, then the cr4 value), r9 (step count),
	r10, sr0.
	Walks the range once with cache op 0xe and once with cache op 0x2;
	in between, cr4 bits 3 and 2 select the optional 0xc / 0x4 cache ops.
	r10 is zeroed before either bit is tested, so the 0x4 op never sees
	a stale r10 when the 0xc path is skipped.
	The lines marked "pic used" adjust r0 by -8 on entry and +8 on exit
	around the .cpload / .cprestore directives. */
	_flush_cache:
	addi r0, -8 # pic used
	.cpload r29 # pic used
	srli r9, r5, 4 # r9 = r5 / 16
	mv r8, r4 # r8 = address cursor
	mtsr r9, sr0 # sr0 = step count (presumably what bcnz counts down)
	1:
	cache 0xe, [r8, 0] # write back invalid dcache
	addi r8, 16
	bcnz 1b
	mfcr r8, cr4
	ldi r10, 0 # address operand for both cache ops below
	bittst! r8, 0x3 # if LDM is enabled, write back LDM
	beq! 6f
	cache 0xc, [r10, 0]
	6:
	bittst! r8, 0x2 # if LIM is enabled, refill it
	beq! 7f
	cache 0x4, [r10, 0]
	7:
	#nop!
	#nop!
	#nop!
	#nop!
	#nop!
	mv r8, r4 # restart at the first address
	mtsr r9, sr0 # reload the step count
	2:
	cache 0x2, [r8, 0] # invalid unlock icache
	#nop!
	#nop!
	#nop!
	#nop!
	#nop!
	addi r8, 16
	bcnz 2b
	.cprestore r0, 12 # pic used
	addi r0, 8 # pic used
	br r3
	#endif
	#endif

	/* FUNCTION
	(U) INT32 __mulsi3 ((U) INT32 a0, (U) INT32 a1);
	__umulsi3 labels the same entry point. PIC build.
	The product is moved into r4 (the register a0 names) before the
	routine returns through ra.
	REGISTERS:
	use t0 low bit of the multiplier for this step
	use t1 running sum (the product)
	modify a0 multiplicand, shifted left once per step
	a1 -> become 0 (shifted right once per step)
	NOTE:
	a plain shift-and-add loop; the original author found this to
	perform better. The loop body always runs at least once.
	The lines marked "pic used" adjust r0 by -8 on entry and +8 on exit
	around the .cpload / .cprestore directives. */

	#ifdef L_mulsi3
	.set pic
	.text
	.global __umulsi3
	.global __mulsi3
	/* signed multiplication (32x32); also used for unsigned */
	.ent __mulsi3
	__umulsi3:
	__mulsi3:
	addi r0, -8 # pic used
	.cpload r29 # pic used
	li t1, 0 # product = 0
	__mulsi3_loop:
	andri.c t0, a1, 1 # t0 = multiplier[0]
	srli a1, a1, 1 # a1 /= 2
	beq __mulsi3_loop2 # skip if (t0 == 0)
	add t1, t1, a0 # add multiplicand
	__mulsi3_loop2:
	slli a0, a0, 1 # multiplicand mul 2
	cmpi.c a1, 0
	bne __mulsi3_loop # more multiplier bits left
	mv r4, t1 # result goes back in r4
	.cprestore r0, 12 # pic used
	addi r0, 8 # pic used
	br ra
	.end __mulsi3
	#endif /* L_mulsi3 */

	/* FUNCTION
	UINT32 (v0) = __udivsi3 (UINT32 (a0), UINT32 (a1));
	INT32 (v0) = __divsi3 (INT32 (a0), INT32 (a1));
	UINT32 (v0) = __umodsi3 (UINT32 (a0), UINT32 (a1));
	INT32 (v0) = __modsi3 (INT32 (a0), INT32 (a1));
	DESCRIPTION
	performs 32-bit division/modulo. The code leaves the result in r4.
	REGISTERS
	used t0 bit-index
	t1
	t4 quotient
	modify a0 becomes remainder */
	#ifdef L_divsi3
	.set pic
	.text
	.global __udivsi3
	.global __umodsi3
	.global __divsi3
	.global __modsi3

	/* unsigned division (PIC build)
	In: a0 = dividend, a1 = divisor.
	Out: r4 = quotient (built up in t4),
	a1 = remainder (copied from a0 at __uds_exit).
	Uses: t0 as the one-bit mask for the quotient bit being tried.
	A zero divisor branches straight to __uds_exit, so the
	quotient is 0 and the remainder is the untouched dividend.
	The lines marked "pic used" adjust r0 by -8 on entry and +8
	on exit; every path leaves through __uds_exit, so the two
	adjustments always pair up.
	NOTE(review): the blt after "li t0, 1" presumably still tests
	the flags set by the preceding cmpi.c -- confirm that li
	leaves the flags alone.
	NOTE(review): bcc after "cmp.c a0, a1" is presumably taken
	when a0 < a1 unsigned -- confirm against the ISA manual. */
	.ent __udivsi3
	__udivsi3:
	addi r0, -8 # pic used
	.cpload r29 # pic used
	li t4, 0 # quotient = 0
	cmpi.c a1, 0
	beq __uds_exit # divisor is zero
	li t0, 1 # quotient bit mask
	blt __uds_ok # divisor top bit already set: no shifting
	__uds_normalize:
	cmp.c a0, a1
	bcc __uds_ok
	slli a1, a1, 1 # divisor <<= 1
	slli t0, t0, 1 # mask moves with the divisor
	cmpi.c a1, 0
	bge __uds_normalize # keep going while a1 is non-negative
	__uds_ok:
	__uds_loop2:
	cmp.c a0, a1
	bcc __uds_loop3 # skip the subtract for this bit
	sub a0, a0, a1 # take the shifted divisor out of a0
	or t4, t4, t0 # record the quotient bit
	__uds_loop3:
	srli t0, t0, 1
	srli a1, a1, 1
	cmpi.c t0, 0
	bne __uds_loop2 # until the mask is shifted out
	__uds_exit:
	mv a1, a0 # a1 = remainder
	mv r4, t4 # r4 = quotient
	.cprestore r0, 12 # pic used
	addi r0, 8 # pic used
	br ra
	.end __udivsi3

	/* unsigned modulus (PIC build)
	Calls __udivsi3 through r29 and returns the remainder that
	__udivsi3 leaves in a1. ra is parked in t3 across the call
	(brl presumably overwrites ra with the return address --
	confirm); the routine returns through t3.
	No scratch register is cleared here: __udivsi3 initialises
	the registers it uses (t0, t4) itself, and the non-PIC
	variant of this routine does not clear one either.
	The lines marked "pic used" adjust r0 by -8 on entry and +8
	on exit around the .cpload / .cprestore directives. */
	.ent __umodsi3
	__umodsi3:
	addi r0, -8 # pic used
	.cpload r29 # pic used
	mv t3, ra # keep our own return address
	la r29, __udivsi3
	brl r29
	mv r4, a1 # result = remainder
	.cprestore r0, 12 # pic used
	addi r0, 8 # pic used
	br t3
	.end __umodsi3

	/* abs and div (PIC build)
	Negates a0 and/or a1 when they compare below zero, then
	branches (does not call) to __udivsi3. The br ra at the end
	of __udivsi3 therefore returns to whoever called __orgsi3.
	Not exported with .global; used by __divsi3 and __modsi3.
	This routine has no "pic used" prologue of its own; the one
	in __udivsi3 runs after the branch.
	NOTE(review): callers reach this routine with r29 loaded by
	"la r29, __orgsi3", and __udivsi3 then executes .cpload r29
	with that value -- confirm this is acceptable. */
	.ent __orgsi3
	__orgsi3:
	cmpi.c a0, 0
	bge __orgsi3_a0p # a0 already non-negative
	neg a0, a0
	__orgsi3_a0p:
	cmpi.c a1, 0
	bge __udivsi3 # a1 already non-negative
	neg a1, a1
	b __udivsi3 # goto udivsi3
	.end __orgsi3

	/* signed division (PIC build)
	t3 keeps the return address, t2 = a0 ^ a1 so that t2 compares
	below zero exactly when the operand signs differ. After
	__orgsi3 (called through r29) has produced the quotient of
	the magnitudes in r4, the quotient is negated when t2 is
	negative.
	__divsi3_adjust is also the tail of __modsi3, which arrives
	with its own t2 and t3 set up and its own "addi r0, -8"
	already done; the "addi r0, 8" at __divsi3_exit undoes it for
	both entry points. */
	.ent __divsi3
	__divsi3:
	addi r0, -8 # pic used
	.cpload r29 # pic used
	mv t3, ra # keep our own return address
	xor t2, a0, a1 # sign of the result
	la r29, __orgsi3
	brl r29
	__divsi3_adjust:
	cmpi.c t2, 0
	bge __divsi3_exit # result stays as it is
	neg r4, r4
	__divsi3_exit:
	.cprestore r0, 12 # pic used
	addi r0, 8 # pic used
	br t3
	.end __divsi3

	/* signed modulus (PIC build)
	t3 keeps the return address and t2 a copy of the dividend, so
	the remainder takes the sign of the dividend. __orgsi3
	(called through r29) divides the magnitudes; the remainder it
	leaves in a1 is moved to r4 and the shared tail
	__divsi3_adjust (inside __divsi3) negates it when t2 is
	negative.
	There is no epilogue here: the "addi r0, -8" below is undone
	by the "addi r0, 8" at __divsi3_exit, which also performs the
	return through t3. */
	.ent __modsi3
	__modsi3:
	addi r0, -8 # pic used
	.cpload r29 # pic used
	mv t3, ra # keep our own return address
	mv t2, a0 # sign of the result = sign of dividend
	la r29, __orgsi3
	brl r29
	mv r4, a1 # result = remainder
	b __divsi3_adjust
	.end __modsi3

	#endif /* L_divsi3 */
	#endif