/* libgcc functions for Blackfin.
   Copyright (C) 2005-2021 Free Software Foundation, Inc.
   Contributed by Analog Devices.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.

GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.

You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
<http://www.gnu.org/licenses/>.  */

#ifdef L_divsi3
.text
.align 2
.global ___divsi3;
.type ___divsi3, STT_FUNC;

/* ___divsi3 -- signed 32-bit division.

   In:   R0 = dividend, R1 = divisor.
   Out:  R0 = quotient.  The magnitudes are divided unsigned and the
	 result is negated when the operand signs differ, so the
	 quotient is truncated toward zero.

   R7 and RETS are saved on the stack and restored before returning.
   R1, R2 and CC are overwritten here, in addition to everything
   ___udivsi3 overwrites.  No divide-by-zero check is made.  */

___divsi3:
	[--SP]= RETS;
	[--SP] = R7;

	/* R0 = |dividend|; R7 = 1 if the dividend was negative.  */
	R2 = -R0;
	CC = R0 < 0;
	IF CC R0 = R2;
	R7 = CC;

	/* R1 = |divisor|; R2 = 1 if the divisor was negative.  */
	R2 = -R1;
	CC = R1 < 0;
	IF CC R1 = R2;
	R2 = CC;

	/* R7 = 1 exactly when the operand signs differ.  R7 is used
	   because it is still needed after the call.  */
	R7 = R7 ^ R2;

	CALL ___udivsi3;

	/* Negate the unsigned quotient if the signs differed.  */
	CC = R7;
	R1 = -R0;
	IF CC R0 = R1;

	R7 = [SP++];
	RETS = [SP++];
	RTS;

.size ___divsi3, .-___divsi3
#endif

#ifdef L_modsi3
.text
.align 2
.global ___modsi3;
.type ___modsi3, STT_FUNC;

/* ___modsi3 -- signed 32-bit remainder.

   In:   R0 = dividend, R1 = divisor.
   Out:  R0 = dividend - divisor * ___divsi3 (dividend, divisor).

   RETS is saved on the stack and restored.  R1 and R2 are
   overwritten here, in addition to everything ___divsi3
   overwrites.  */

___modsi3:
	[--SP] = RETS;
	/* Keep both operands across the call.  */
	[--SP] = R0;
	[--SP] = R1;
	CALL ___divsi3;
	/* Pop in reverse order: R2 = divisor, R1 = dividend.  */
	R2 = [SP++];
	R1 = [SP++];
	/* R2 = divisor * quotient.  */
	R2 *= R0;
	/* remainder = dividend - divisor * quotient.  */
	R0 = R1 - R2;
	RETS = [SP++];
	RTS;

.size ___modsi3, .-___modsi3
#endif

#ifdef L_udivsi3
.text
.align 2
.global ___udivsi3;
.type ___udivsi3, STT_FUNC;

/* ___udivsi3 -- unsigned 32-bit division by shift and subtract.

   In:   R0 = dividend, R1 = divisor.
   Out:  R0 = quotient, R3 = remainder (___umodsi3 relies on the
	 remainder being left in R3).

   Overwrites R2, R3, P0, CC and the loop-0 registers programmed by
   LSETUP.  Nothing is saved on the stack.  No divide-by-zero check
   is made.

   R3:R0 is treated as a 64-bit value that is shifted left one bit
   per iteration through CC.  After each shift the divisor is
   subtracted from R3 when it fits.  CC is left set when it does
   NOT fit, so the bit rotated into R0 on the following shift is the
   inverse of the quotient bit; the final complement corrects that.  */

___udivsi3:
	P0 = 32;
	LSETUP (0f, 1f) LC0 = P0;
	/* upper half of dividend */
	R3 = 0;
0:
	/* The first time round in the loop we shift in garbage, but since we
	   perform 33 shifts, it doesn't matter.  */
	R0 = ROT R0 BY 1;
	/* The bit rotated out of R0 enters R3 through CC.  */
	R3 = ROT R3 BY 1;
	R2 = R3 - R1;
	/* CC = 1 when the divisor does not fit (unsigned compare).  */
	CC = R3 < R1 (IU);
1:
	/* Last instruction of the loop.  */
	IF ! CC R3 = R2;

	/* Shift in the last bit.  */
	R0 = ROT R0 BY 1;
	/* R0 is the result, R3 contains the remainder.  The quotient
	   bits were collected inverted, so complement them.  */
	R0 = ~ R0;
	RTS;

.size ___udivsi3, .-___udivsi3
#endif

#ifdef L_umodsi3
.text
.align 2
.global ___umodsi3;
.type ___umodsi3, STT_FUNC;

/* ___umodsi3 -- unsigned 32-bit remainder.

   In:   R0 = dividend, R1 = divisor.
   Out:  R0 = remainder.

   This is a thin wrapper: ___udivsi3 leaves the remainder in R3,
   which is copied to R0.  RETS is saved on the stack and restored;
   everything ___udivsi3 overwrites is overwritten here too.  */

___umodsi3:
	[--SP] = RETS;
	CALL ___udivsi3;
	/* Return the remainder instead of the quotient.  */
	R0 = R3;
	RETS = [SP++];
	RTS;

.size ___umodsi3, .-___umodsi3
#endif

#ifdef L_umulsi3_highpart
.text
.align 2
.global ___umulsi3_highpart;
.type ___umulsi3_highpart, STT_FUNC;

/* ___umulsi3_highpart -- upper 32 bits of an unsigned 32x32 multiply.

   In:   R0, R1 = the two 32-bit factors.
   Out:  R0 = bits 63..32 of the unsigned product R0 * R1.

   The product is assembled from four 16x16 partial products, all
   computed in unsigned (FU) mode:

	A1 = ((R0.L * R1.L) >> 16) + R1.L * R0.H + R0.L * R1.H
	A0 = R0.H * R1.H + (A1 >> 16)

   Overwrites the accumulators A0 and A1.  Leaf routine; the stack
   is not used.  */

___umulsi3_highpart:
	/* Only the carry out of the low 16 bits of low*low matters.  */
	A1 = R1.L * R0.L (FU);
	A1 = A1 >> 16;
	/* High*high in A0; accumulate both cross products in A1.  */
	A0 = R1.H * R0.H, A1 += R1.L * R0.H (FU);
	A1 += R0.L * R1.H (FU);
	/* Carry the upper part of the cross-product sum into A0.  */
	A1 = A1 >> 16;
	A0 += A1;
	R0 = A0 (FU);
	RTS;

.size ___umulsi3_highpart, .-___umulsi3_highpart
#endif

#ifdef L_smulsi3_highpart
.text
.align 2
.global ___smulsi3_highpart;
.type ___smulsi3_highpart, STT_FUNC;

/* ___smulsi3_highpart -- upper 32 bits of a signed 32x32 multiply.

   In:   R0, R1 = the two 32-bit factors.
   Out:  R0 = bits 63..32 of the signed product R0 * R1.

   Same partial-product scheme as the unsigned variant, except that
   the high halves carry the sign:

	A1 = ((R0.L * R1.L) >> 16) + R0.H * R1.L + R1.H * R0.L
	A0 = R0.H * R1.H + (A1 >>> 16)

   NOTE(review): (IS,M) is taken to mean signed-integer mode with the
   A1 products multiplying a signed high half by an unsigned low half
   (mixed mode) -- confirm against the Blackfin programming reference.

   Overwrites the accumulators A0 and A1.  Leaf routine; the stack
   is not used.  */

___smulsi3_highpart:
	/* Low*low is unsigned; only its upper 16 bits are needed.  */
	A1 = R1.L * R0.L (FU);
	A1 = A1 >> 16;
	/* High*high in A0; accumulate both cross products in A1.  */
	A0 = R0.H * R1.H, A1 += R0.H * R1.L (IS,M);
	A1 += R1.H * R0.L (IS,M);
	/* Arithmetic shift: the cross-product sum may be negative.  */
	A1 = A1 >>> 16;
	R0 = (A0 += A1);
	RTS;

.size ___smulsi3_highpart, .-___smulsi3_highpart
#endif

#ifdef L_muldi3
.text
.align 2
.global ___muldi3;
.type ___muldi3, STT_FUNC;

/* ___muldi3 -- 64x64 -> 64 bit multiply.

   In:   first operand in R1:R0; low word of the second operand in R2,
	 its high word is loaded into R3 from [SP + 12].
   Out:  R1:R0 = low 64 bits of the product.

   Overwrites R3, A0 and A1.  R4 is used as a temporary; it is stored
   at [SP] on entry and reloaded from there before returning.

   R1:R0 * R3:R2
   = R1.h:R1.l:R0.h:R0.l * R3.h:R3.l:R2.h:R2.l
   [X]  = (R1.h * R3.h) * 2^96
   [X]  + (R1.h * R3.l + R1.l * R3.h) * 2^80
   [X]  + (R1.h * R2.h + R1.l * R3.l + R3.h * R0.h) * 2^64
   [T1] + (R1.h * R2.l + R3.h * R0.l + R1.l * R2.h + R3.l * R0.h) * 2^48
   [T2] + (R1.l * R2.l + R3.l * R0.l + R0.h * R2.h) * 2^32
   [T3] + (R0.l * R2.h + R2.l * R0.h) * 2^16
   [T4] + (R0.l * R2.l)

   We can discard the first three lines marked "X" since we produce
   only a 64 bit result.  So, we need ten 16-bit multiplies.

   Individual mul-acc results (the sums of the lines tagged T1..T4
   above):
   [E1] = R1.h * R2.l + R3.h * R0.l + R1.l * R2.h + R3.l * R0.h
   [E2] = R1.l * R2.l + R3.l * R0.l + R0.h * R2.h
   [E3] = R0.l * R2.h + R2.l * R0.h
   [E4] = R0.l * R2.l

   We also need to add high parts from lower-level results to higher ones:
   E[n]c = E[n] + (E[n+1]c >> 16), where E4c := E4

   One interesting property is that all parts of the result that depend
   on the sign of the multiplication are discarded.  Those would be the
   multiplications involving R1.h and R3.h, but only the top 16 bit of
   the 32 bit result depend on the sign, and since R1.h and R3.h only
   occur in E1, the top half of these results is cut off.
   So, we can just use FU mode for all of the 16-bit multiplies, and
   ignore questions of when to use mixed mode.  */

___muldi3:
	/* [SP] technically is part of the caller's frame, but we can
	   use it as scratch space.  */
	A0 = R2.H * R1.L, A1 = R2.L * R1.H (FU) || R3 = [SP + 12];	/* E1 */
	A0 += R3.H * R0.L, A1 += R3.L * R0.H (FU) || [SP] = R4;		/* E1 */
	A0 += A1;							/* E1 */
	R4 = A0.w;
	A0 = R0.l * R3.l (FU);						/* E2 */
	A0 += R2.l * R1.l (FU);						/* E2 */

	A1 = R2.L * R0.L (FU);						/* E4 */
	R3 = A1.w;
	A1 = A1 >> 16;							/* E3c */
	A0 += R2.H * R0.H, A1 += R2.L * R0.H (FU);			/* E2, E3c */
	A1 += R0.L * R2.H (FU);						/* E3c */
	R0 = A1.w;
	A1 = A1 >> 16;							/* E2c */
	A0 += A1;							/* E2c */
	R1 = A0.w;

	/* low(result) = low(E3c):low(E4) */
	R0 = PACK (R0.l, R3.l);
	/* high(result) = E2c + (E1 << 16) */
	R1.h = R1.h + R4.l (NS) || R4 = [SP];
	RTS;

.size ___muldi3, .-___muldi3
#endif