*
*   $VER: exp.s 34.1 (19.1.97)
*
*   calculates the result of e to the power of x.
*
*   Version history:
*
*   33.1    19.1.97 laukkanen
*
*           - created (using MacLaurin series approximation with five iterations)
*
*   34.1    19.1.97 (c) Motorola
*
*           - unfortunately the result was only close to the current in the immediate
*           - positive vicinity of zero, scrapped.
*           - cut'n pasted from Motorola M68060SP
*
*   34.2    22.1.97 laukkanen
*
*           - trashed a6, fixed
*
*   34.3    23.1.97 (c) Motorola
*
*           - added expm1()
*
*   34.4    24.1.97 laukkanen
*
*           - in the converting process, I seem to have broken
*             expm1(), fixed.
*
*   34.4    24.1.97 laukkanen
*
*           - exmp1() fixed again.
*

    machine 68040
    fpu     1

    XDEF    _exp
    XDEF    @exp
    XDEF    _expm1
    XDEF    @expm1

* ALGORITHM and IMPLEMENTATION **************************************** *
*                                                                       *
*       exp()                                                           *
*       -----                                                           *
*                                                                       *
*       Step 1. Filter out extreme cases of input argument.             *
*               1.1     If |X| >= 2^(-65), go to Step 1.3.              *
*               1.2     Go to Step 7.                                   *
*               1.3     If |X| < 16380 log(2), go to Step 2.            *
*               1.4     Go to Step 8.                                   *
*       Notes:  The usual case should take the branches 1.1 -> 1.3 -> 2.*
*               To avoid the use of floating-point comparisons, a       *
*               compact representation of |X| is used. This format is a *
*               32-bit integer, the upper (more significant) 16 bits    *
*               are the sign and biased exponent field of |X|; the      *
*               lower 16 bits are the 16 most significant fraction      *
*               (including the explicit bit) bits of |X|. Consequently, *
*               the comparisons in Steps 1.1 and 1.3 can be performed   *
*               by integer comparison. Note also that the constant      *
*               16380 log(2) used in Step 1.3 is also in the compact    *
*               form. Thus taking the branch to Step 2 guarantees       *
*               |X| < 16380 log(2). There is no harm to have a small    *
*               number of cases where |X| is less than, but close to,   *
*               16380 log(2) and the branch to Step 9 is taken.         *
*                                                                       *
*       Step 2. Calculate N = round-to-nearest-int( X * 64/log2 ).      *
*               2.1     Set AdjFlag := 0 (indicates the branch 1.3 -> 2 *
*                       was taken)                                      *
*               2.2     N := round-to-nearest-integer( X * 64/log2 ).   *
*               2.3     Calculate       J = N mod 64; so J = 0,1,2,..., *
*                       or 63.                                          *
*               2.4     Calculate       M = (N - J)/64; so N = 64M + J. *
*               2.5     Calculate the address of the stored value of    *
*                       2^(J/64).                                       *
*               2.6     Create the value Scale = 2^M.                   *
*       Notes:  The calculation in 2.2 is really performed by           *
*                       Z := X * constant                               *
*                       N := round-to-nearest-integer(Z)                *
*               where                                                   *
*                       constant := single-precision( 64/log 2 ).       *
*                                                                       *
*               Using a single-precision constant avoids memory         *
*               access. Another effect of using a single-precision      *
*               "constant" is that the calculated value Z is            *
*                                                                       *
*                       Z = X*(64/log2)*(1+eps), |eps| <= 2^(-24).      *
*                                                                       *
*               This error has to be considered later in Steps 3 and 4. *
*                                                                       *
*       Step 3. Calculate X - N*log2/64.                                *
*               3.1     R := X + N*L1,                                  *
*                               where L1 := single-precision(-log2/64). *
*               3.2     R := R + N*L2,                                  *
*                               L2 := extended-precision(-log2/64 - L1).*
*       Notes:  a) The way L1 and L2 are chosen ensures L1+L2           *
*               approximate the value -log2/64 to 88 bits of accuracy.  *
*               b) N*L1 is exact because N is no dc.ler than 22 bits    *
*               and L1 is no dc.ler than 24 bits.                       *
*               c) The calculation X+N*L1 is also exact due to          *
*               cancellation. Thus, R is practically X+N(L1+L2) to full *
*               64 bits.                                                *
*               d) It is important to estimate how large can |R| be     *
*               after Step 3.2.                                         *
*                                                                       *
*               N = rnd-to-int( X*64/log2 (1+eps) ), |eps|<=2^(-24)     *
*               X*64/log2 (1+eps)       =       N + f,  |f| <= 0.5      *
*               X*64/log2 - N   =       f - eps*X 64/log2               *
*               X - N*log2/64   =       f*log2/64 - eps*X               *
*                                                                       *
*                                                                       *
*               Now |X| <= 16446 log2, thus                             *
*                                                                       *
*                       |X - N*log2/64| <= (0.5 + 16446/2^(18))*log2/64 *
*                                       <= 0.57 log2/64.                *
*                This bound will be used in Step 4.                     *
*                                                                       *
*       Step 4. Approximate exp(R)-1 by a polynomial                    *
*               p = R + R*R*(A1 + R*(A2 + R*(A3 + R*(A4 + R*A5))))      *
*       Notes:  a) In order to reduce memory access, the coefficients   *
*               are made as "short" as possible: A1 (which is 1/2), A4  *
*               and A5 are single precision; A2 and A3 are double       *
*               precision.                                              *
*               b) Even with the restrictions above,                    *
*                  |p - (exp(R)-1)| < 2^(-68.8) for all |R| <= 0.0062.  *
*               Note that 0.0062 is slightly bigger than 0.57 log2/64.  *
*               c) To fully utilize the pipeline, p is separated into   *
*               two independent pieces of roughly equal complexities    *
*                       p = [ R + R*S*(A2 + S*A4) ]     +               *
*                               [ S*(A1 + S*(A3 + S*A5)) ]              *
*               where S = R*R.                                          *
*                                                                       *
*       Step 5. Compute 2^(J/64)*exp(R) = 2^(J/64)*(1+p) by             *
*                               ans := T + ( T*p + t)                   *
*               where T and t are the stored values for 2^(J/64).       *
*       Notes:  2^(J/64) is stored as T and t where T+t approximates    *
*               2^(J/64) to roughly 85 bits; T is in extended precision *
*               and t is in single precision. Note also that T is       *
*               rounded to 62 bits so that the last two bits of T are   *
*               zero. The reason for such a special form is that T-1,   *
*               T-2, and T-8 will all be exact --- a property that will *
*               give much more accurate computation of the function     *
*               EXPM1.                                                  *
*                                                                       *
*       Step 6. Reconstruction of exp(X)                                *
*                       exp(X) = 2^M * 2^(J/64) * exp(R).               *
*               6.1     If AdjFlag = 0, go to 6.3                       *
*               6.2     ans := ans * AdjScale                           *
*               6.3     Restore the user FPCR                           *
*               6.4     Return ans := ans * Scale. Exit.                *
*       Notes:  If AdjFlag = 0, we have X = Mlog2 + Jlog2/64 + R,       *
*               |M| <= 16380, and Scale = 2^M. Moreover, exp(X) will    *
*               neither overflow nor underflow. If AdjFlag = 1, that    *
*               means that                                              *
*                       X = (M1+M)log2 + Jlog2/64 + R, |M1+M| >= 16380. *
*               Hence, exp(X) may overflow or underflow or neither.     *
*               When that is the case, AdjScale = 2^(M1) where M1 is    *
*               approximately M. Thus 6.2 will never cause              *
*               over/underflow. Possible exception in 6.4 is overflow   *
*               or underflow. The inexact exception is not generated in *
*               6.4. Although one can argue that the inexact flag       *
*               should always be raised, to simulate that exception     *
*               cost to much than the flag is worth in practical uses.  *
*                                                                       *
*       Step 7. Return 1 + X.                                           *
*               7.1     ans := X                                        *
*               7.2     Restore user FPCR.                              *
*               7.3     Return ans := 1 + ans. Exit                     *
*       Notes:  For non-zero X, the inexact exception will always be    *
*               raised by 7.3. That is the only exception raised by 7.3.*
*               Note also that we use the FMOVEM instruction to move X  *
*               in Step 7.1 to avoid unnecessary trapping. (Although    *
*               the FMOVEM may not seem relevant since X is normalized, *
*               the precaution will be useful in the library version of *
*               this code where the separate entry for denormalized     *
*               inputs will be done away with.)                         *
*                                                                       *
*       Step 8. Handle exp(X) where |X| >= 16380log2.                   *
*               8.1     If |X| > 16480 log2, go to Step 9.              *
*               (mimic 2.2 - 2.6)                                       *
*               8.2     N := round-to-integer( X * 64/log2 )            *
*               8.3     Calculate J = N mod 64, J = 0,1,...,63          *
*               8.4     K := (N-J)/64, M1 := truncate(K/2), M = K-M1,   *
*                       AdjFlag := 1.                                   *
*               8.5     Calculate the address of the stored value       *
*                       2^(J/64).                                       *
*               8.6     Create the values Scale = 2^M, AdjScale = 2^M1. *
*               8.7     Go to Step 3.                                   *
*       Notes:  Refer to notes for 2.2 - 2.6.                           *
*                                                                       *
*       Step 9. Handle exp(X), |X| > 16480 log2.                        *
*               9.1     If X < 0, go to 9.3                             *
*               9.2     ans := Huge, go to 9.4                          *
*               9.3     ans := Tiny.                                    *
*               9.4     Restore user FPCR.                              *
*               9.5     Return ans := ans * ans. Exit.                  *
*       Notes:  Exp(X) will surely overflow or underflow, depending on  *
*               X's sign. "Huge" and "Tiny" are respectively large/tiny *
*               extended-precision numbers whose square over/underflow  *
*               with an inexact result. Thus, 9.5 always raises the     *
*               inexact together with either overflow or underflow.     *
*************************************************************************

        cnop            0,4

L2      dc.l            $3FDC0000,$82E30865,$4361C4C6,$00000000

EEXPA3  dc.l            $3FA55555,$55554CC1
EEXPA2  dc.l            $3FC55555,$55554A54

EM1A4   dc.l            $3F811111,$11174385
EM1A3   dc.l            $3FA55555,$55554F5A

EM1A2   dc.l            $3FC55555,$55555555,$00000000,$00000000

EM1B8   dc.l            $3EC71DE3,$A5774682
EM1B7   dc.l            $3EFA01A0,$19D7CB68

EM1B6   dc.l            $3F2A01A0,$1A019DF3
EM1B5   dc.l            $3F56C16C,$16C170E2

EM1B4   dc.l            $3F811111,$11111111
EM1B3   dc.l            $3FA55555,$55555555

EM1B2   dc.l            $3FFC0000,$AAAAAAAA,$AAAAAAAB
        dc.l            $00000000

TWO140  dc.l            $48B00000,$00000000
TWON140
        dc.l            $37300000,$00000000

EEXPTBL
        dc.l            $3FFF0000,$80000000,$00000000,$00000000
        dc.l            $3FFF0000,$8164D1F3,$BC030774,$9F841A9B
        dc.l            $3FFF0000,$82CD8698,$AC2BA1D8,$9FC1D5B9
        dc.l            $3FFF0000,$843A28C3,$ACDE4048,$A0728369
        dc.l            $3FFF0000,$85AAC367,$CC487B14,$1FC5C95C
        dc.l            $3FFF0000,$871F6196,$9E8D1010,$1EE85C9F
        dc.l            $3FFF0000,$88980E80,$92DA8528,$9FA20729
        dc.l            $3FFF0000,$8A14D575,$496EFD9C,$A07BF9AF
        dc.l            $3FFF0000,$8B95C1E3,$EA8BD6E8,$A0020DCF
        dc.l            $3FFF0000,$8D1ADF5B,$7E5BA9E4,$205A63DA
        dc.l            $3FFF0000,$8EA4398B,$45CD53C0,$1EB70051
        dc.l            $3FFF0000,$9031DC43,$1466B1DC,$1F6EB029
        dc.l            $3FFF0000,$91C3D373,$AB11C338,$A0781494
        dc.l            $3FFF0000,$935A2B2F,$13E6E92C,$9EB319B0
        dc.l            $3FFF0000,$94F4EFA8,$FEF70960,$2017457D
        dc.l            $3FFF0000,$96942D37,$20185A00,$1F11D537
        dc.l            $3FFF0000,$9837F051,$8DB8A970,$9FB952DD
        dc.l            $3FFF0000,$99E04593,$20B7FA64,$1FE43087
        dc.l            $3FFF0000,$9B8D39B9,$D54E5538,$1FA2A818
        dc.l            $3FFF0000,$9D3ED9A7,$2CFFB750,$1FDE494D
        dc.l            $3FFF0000,$9EF53260,$91A111AC,$20504890
        dc.l            $3FFF0000,$A0B0510F,$B9714FC4,$A073691C
        dc.l            $3FFF0000,$A2704303,$0C496818,$1F9B7A05
        dc.l            $3FFF0000,$A43515AE,$09E680A0,$A0797126
        dc.l            $3FFF0000,$A5FED6A9,$B15138EC,$A071A140
        dc.l            $3FFF0000,$A7CD93B4,$E9653568,$204F62DA
        dc.l            $3FFF0000,$A9A15AB4,$EA7C0EF8,$1F283C4A
        dc.l            $3FFF0000,$AB7A39B5,$A93ED338,$9F9A7FDC
        dc.l            $3FFF0000,$AD583EEA,$42A14AC8,$A05B3FAC
        dc.l            $3FFF0000,$AF3B78AD,$690A4374,$1FDF2610
        dc.l            $3FFF0000,$B123F581,$D2AC2590,$9F705F90
        dc.l            $3FFF0000,$B311C412,$A9112488,$201F678A
        dc.l            $3FFF0000,$B504F333,$F9DE6484,$1F32FB13
        dc.l            $3FFF0000,$B6FD91E3,$28D17790,$20038B30
        dc.l            $3FFF0000,$B8FBAF47,$62FB9EE8,$200DC3CC
        dc.l            $3FFF0000,$BAFF5AB2,$133E45FC,$9F8B2AE6
        dc.l            $3FFF0000,$BD08A39F,$580C36C0,$A02BBF70
        dc.l            $3FFF0000,$BF1799B6,$7A731084,$A00BF518
        dc.l            $3FFF0000,$C12C4CCA,$66709458,$A041DD41
        dc.l            $3FFF0000,$C346CCDA,$24976408,$9FDF137B
        dc.l            $3FFF0000,$C5672A11,$5506DADC,$201F1568
        dc.l            $3FFF0000,$C78D74C8,$ABB9B15C,$1FC13A2E
        dc.l            $3FFF0000,$C9B9BD86,$6E2F27A4,$A03F8F03
        dc.l            $3FFF0000,$CBEC14FE,$F2727C5C,$1FF4907D
        dc.l            $3FFF0000,$CE248C15,$1F8480E4,$9E6E53E4
        dc.l            $3FFF0000,$D06333DA,$EF2B2594,$1FD6D45C
        dc.l            $3FFF0000,$D2A81D91,$F12AE45C,$A076EDB9
        dc.l            $3FFF0000,$D4F35AAB,$CFEDFA20,$9FA6DE21
        dc.l            $3FFF0000,$D744FCCA,$D69D6AF4,$1EE69A2F
        dc.l            $3FFF0000,$D99D15C2,$78AFD7B4,$207F439F
        dc.l            $3FFF0000,$DBFBB797,$DAF23754,$201EC207
        dc.l            $3FFF0000,$DE60F482,$5E0E9124,$9E8BE175
        dc.l            $3FFF0000,$E0CCDEEC,$2A94E110,$20032C4B
        dc.l            $3FFF0000,$E33F8972,$BE8A5A50,$2004DFF5
        dc.l            $3FFF0000,$E5B906E7,$7C8348A8,$1E72F47A
        dc.l            $3FFF0000,$E8396A50,$3C4BDC68,$1F722F22
        dc.l            $3FFF0000,$EAC0C6E7,$DD243930,$A017E945
        dc.l            $3FFF0000,$ED4F301E,$D9942B84,$1F401A5B
        dc.l            $3FFF0000,$EFE4B99B,$DCDAF5CC,$9FB9A9E3
        dc.l            $3FFF0000,$F281773C,$59FFB138,$20744C05
        dc.l            $3FFF0000,$F5257D15,$2486CC2C,$1F773A19
        dc.l            $3FFF0000,$F7D0DF73,$0AD13BB8,$1FFE90D5
        dc.l            $3FFF0000,$FA83B2DB,$722A033C,$A041ED22
        dc.l            $3FFF0000,$FD3E0C0C,$F486C174,$1F853F3A


SCALE       EQU     -16
ADJSCALE    EQU     -28
SC          EQU     -16
ONEBYSC     EQU     -28
L_SCR1      EQU     -4
TEMP_SIZE   EQU     28


_exp
        fmove.d         (4,sp),fp0
@exp

;--Step 2.
;--This is the normal branch:   2^(-65) <= |X| < 16380 log2.
        fmove.s         fp0,-(sp)
        move.w          (sp),d0
        addq.l          #4,sp
        and.w           #$7f80,d0
        beq             .zero

        fmove.x         fp0,fp1
        fmul.s          #$42B8AA3B,fp0          ; 64/log2 * X
        link            a0,#-TEMP_SIZE
        fmovem.x        fp2/fp3,-(sp)           ; save fp2 {fp2/fp3}
        fmove.l         fp0,d1                  ; N = int( X * 64/log2 )
        lea             (EEXPTBL,pc),a1
        fmove.l         d1,fp0                  ; convert to floating-format

        move.l          d1,d0                   ; save N temporarily
        and.l           #$3F,d1                 ; D0 is J = N mod 64
        asr.l           #6,d0                   ; D0 is M
        fmove.x         fp0,fp2
        lsl.l           #4,d1
        fmul.s          #$BC317218,fp0          ; N * L1, L1 = lead(-log2/64)
        add.w           #$3FFF,d0               ; biased expo. of 2^(M)
        add.l           d1,a1                   ; address of 2^(J/64)
        move.w          (L2,pc),(L_SCR1,a0)     ; prefetch L2, no need in CB

.EXPCONT1
;--Step 3.
;--fp1,fp2 saved on the stack. fp0 is N, fp1 is X,
;--a0 points to 2^(J/64), D0 is biased expo. of 2^(M)
        fmul.x          (L2,pc),fp2             ; N * L2, L1+L2 = -log2/64
        fadd.x          fp1,fp0                 ; X + N*L1
        fadd.x          fp2,fp0                 ; fp0 is R, reduced arg.

;--Step 4.
;--WE NOW COMPUTE EXP(R)-1 BY A POLYNOMIAL
;-- R + R*R*(A1 + R*(A2 + R*(A3 + R*(A4 + R*A5))))
;--TO FULLY UTILIZE THE PIPELINE, WE COMPUTE S = R*R
;--[R+R*S*(A2+S*A4)] + [S*(A1+S*(A3+S*A5))]

        fmove.x         fp0,fp1
        fmul.x          fp1,fp1                 ; fp1 IS S = R*R

        fmove.s         #$3AB60B70,fp2          ; fp2 IS A5

        fmul.x          fp1,fp2                 ; fp2 IS S*A5
        fmove.x         fp1,fp3
        fmul.s          #$3C088895,fp3          ; fp3 IS S*A4

        fadd.d          (EEXPA3,pc),fp2         ; fp2 IS A3+S*A5
        fadd.d          (EEXPA2,pc),fp3         ; fp3 IS A2+S*A4

        fmul.x          fp1,fp2                 ; fp2 IS S*(A3+S*A5)
        move.w          d0,(SCALE,a0)           ; SCALE is 2^(M) in extended
        move.l          #$80000000,(SCALE+4,a0)
        clr.l           (SCALE+8,a0)

        fmul.x          fp1,fp3                 ; fp3 IS S*(A2+S*A4)

        fadd.s          #$3F000000,fp2          ; fp2 IS A1+S*(A3+S*A5)
        fmul.x          fp0,fp3                 ; fp3 IS R*S*(A2+S*A4)

        fmul.x          fp1,fp2                 ; fp2 IS S*(A1+S*(A3+S*A5))
        fadd.x          fp3,fp0                 ; fp0 IS R+R*S*(A2+S*A4),

        fmove.x         (a1)+,fp1               ; fp1 is lead. pt. of 2^(J/64)
        fadd.x          fp2,fp0                 ; fp0 is EXP(R) - 1

;--Step 5
;--final reconstruction process
;--EXP(X) = 2^M * ( 2^(J/64) + 2^(J/64)*(EXP(R)-1) )

        fmul.x          fp1,fp0                 ; 2^(J/64)*(Exp(R)-1)
        fmovem.x        (sp)+,fp2/fp3           ; fp2 restored {fp2/fp3}
        fadd.s          (a1),fp0                ; accurate 2^(J/64)

        fadd.x          fp1,fp0                 ; 2^(J/64) + 2^(J/64)*...

.NORMAL
        fmul.x          (SCALE,a0),fp0          ; multiply 2^(M)
        unlk            a0
        rts
.zero
        fmove.s         #1,fp0
        rts

*************************************************************************
* expm1():  computes the exponential minus 1 for a normalized input     *
*                                                                       *
* INPUT *************************************************************** *
*       fp0 = extended precision input                                  *
*                                                                       *
* OUTPUT ************************************************************** *
*       fp0 = exp(X) or exp(X)-1                                        *
*                                                                       *
* ACCURACY and MONOTONICITY ******************************************* *
*       The returned result is within 0.85 ulps in 64 significant bit,  *
*       i.e. within 0.5001 ulp to 53 bits if the result is subsequently *
*       rounded to double precision. The result is provably monotonic   *
*       in double precision.                                            *
*                                                                       *
* ALGORITHM and IMPLEMENTATION **************************************** *
*                                                                       *
*       Step 1. Check |X|                                               *
*               1.1     If |X| >= 1/4, go to Step 1.3.                  *
*               1.2     Go to Step 7.                                   *
*               1.3     If |X| < 70 log(2), go to Step 2.               *
*               1.4     Go to Step 10.                                  *
*       Notes:  The usual case should take the branches 1.1 -> 1.3 -> 2.*
*               However, it is conceivable |X| can be small very often  *
*               because EXPM1 is intended to evaluate exp(X)-1          *
*               accurately when |X| is small. For further details on    *
*               the comparisons, see the notes on Step 1 of setox.      *
*                                                                       *
*       Step 2. Calculate N = round-to-nearest-int( X * 64/log2 ).      *
*               2.1     N := round-to-nearest-integer( X * 64/log2 ).   *
*               2.2     Calculate       J = N mod 64; so J = 0,1,2,..., *
*                       or 63.                                          *
*               2.3     Calculate       M = (N - J)/64; so N = 64M + J. *
*               2.4     Calculate the address of the stored value of    *
*                       2^(J/64).                                       *
*               2.5     Create the values Sc = 2^M and                  *
*                       OnebySc := -2^(-M).                             *
*       Notes:  See the notes on Step 2 of setox.                       *
*                                                                       *
*       Step 3. Calculate X - N*log2/64.                                *
*               3.1     R := X + N*L1,                                  *
*                               where L1 := single-precision(-log2/64). *
*               3.2     R := R + N*L2,                                  *
*                               L2 := extended-precision(-log2/64 - L1).*
*       Notes:  Applying the analysis of Step 3 of setox in this case   *
*               shows that |R| <= 0.0055 (note that |X| <= 70 log2 in   *
*               this case).                                             *
*                                                                       *
*       Step 4. Approximate exp(R)-1 by a polynomial                    *
*                       p = R+R*R*(A1+R*(A2+R*(A3+R*(A4+R*(A5+R*A6))))) *
*       Notes:  a) In order to reduce memory access, the coefficients   *
*               are made as "short" as possible: A1 (which is 1/2), A5  *
*               and A6 are single precision; A2, A3 and A4 are double   *
*               precision.                                              *
*               b) Even with the restriction above,                     *
*                       |p - (exp(R)-1)| <      |R| * 2^(-72.7)         *
*               for all |R| <= 0.0055.                                  *
*               c) To fully utilize the pipeline, p is separated into   *
*               two independent pieces of roughly equal complexity      *
*                       p = [ R*S*(A2 + S*(A4 + S*A6)) ]        +       *
*                               [ R + S*(A1 + S*(A3 + S*A5)) ]          *
*               where S = R*R.                                          *
*                                                                       *
*       Step 5. Compute 2^(J/64)*p by                                   *
*                               p := T*p                                *
*               where T and t are the stored values for 2^(J/64).       *
*       Notes:  2^(J/64) is stored as T and t where T+t approximates    *
*               2^(J/64) to roughly 85 bits; T is in extended precision *
*               and t is in single precision. Note also that T is       *
*               rounded to 62 bits so that the last two bits of T are   *
*               zero. The reason for such a special form is that T-1,   *
*               T-2, and T-8 will all be exact --- a property that will *
*               be exploited in Step 6 below. The total relative error  *
*               in p is no bigger than 2^(-67.7) compared to the final  *
*               result.                                                 *
*                                                                       *
*       Step 6. Reconstruction of exp(X)-1                              *
*                       exp(X)-1 = 2^M * ( 2^(J/64) + p - 2^(-M) ).     *
*               6.1     If M <= 63, go to Step 6.3.                     *
*               6.2     ans := T + (p + (t + OnebySc)). Go to 6.6       *
*               6.3     If M >= -3, go to 6.5.                          *
*               6.4     ans := (T + (p + t)) + OnebySc. Go to 6.6       *
*               6.5     ans := (T + OnebySc) + (p + t).                 *
*               6.6     Restore user FPCR.                              *
*               6.7     Return ans := Sc * ans. Exit.                   *
*       Notes:  The various arrangements of the expressions give        *
*               accurate evaluations.                                   *
*                                                                       *
*       Step 7. exp(X)-1 for |X| < 1/4.                                 *
*               7.1     If |X| >= 2^(-65), go to Step 9.                *
*               7.2     Go to Step 8.                                   *
*                                                                       *
*       Step 8. Calculate exp(X)-1, |X| < 2^(-65).                      *
*               8.1     If |X| < 2^(-16312), goto 8.3                   *
*               8.2     Restore FPCR; return ans := X - 2^(-16382).     *
*                       Exit.                                           *
*               8.3     X := X * 2^(140).                               *
*               8.4     Restore FPCR; ans := ans - 2^(-16382).          *
*                Return ans := ans*2^(140). Exit                        *
*       Notes:  The idea is to return "X - tiny" under the user         *
*               precision and rounding modes. To avoid unnecessary      *
*               inefficiency, we stay away from denormalized numbers    *
*               the best we can. For |X| >= 2^(-16312), the             *
*               straightforward 8.2 generates the inexact exception as  *
*               the case warrants.                                      *
*                                                                       *
*       Step 9. Calculate exp(X)-1, |X| < 1/4, by a polynomial          *
*                       p = X + X*X*(B1 + X*(B2 + ... + X*B12))         *
*       Notes:  a) In order to reduce memory access, the coefficients   *
*               are made as "short" as possible: B1 (which is 1/2), B9  *
*               to B12 are single precision; B3 to B8 are double        *
*               precision; and B2 is double extended.                   *
*               b) Even with the restriction above,                     *
*                       |p - (exp(X)-1)| < |X| 2^(-70.6)                *
*               for all |X| <= 0.251.                                   *
*               Note that 0.251 is slightly bigger than 1/4.            *
*               c) To fully preserve accuracy, the polynomial is        *
*               computed as                                             *
*                       X + ( S*B1 +    Q ) where S = X*X and           *
*                       Q       =       X*S*(B2 + X*(B3 + ... + X*B12)) *
*               d) To fully utilize the pipeline, Q is separated into   *
*               two independent pieces of roughly equal complexity      *
*                       Q = [ X*S*(B2 + S*(B4 + ... + S*B12)) ] +       *
*                               [ S*S*(B3 + S*(B5 + ... + S*B11)) ]     *
*                                                                       *
*       Step 10. Calculate exp(X)-1 for |X| >= 70 log 2.                *
*               10.1 If X >= 70log2 , exp(X) - 1 = exp(X) for all       *
*               practical purposes. Therefore, go to Step 1 of setox.   *
*               10.2 If X <= -70log2, exp(X) - 1 = -1 for all practical *
*               purposes.                                               *
*               ans := -1                                               *
*               Restore user FPCR                                       *
*               Return ans := ans + 2^(-126). Exit.                     *
*       Notes:  10.2 will always create an inexact and return -1 + tiny *
*               in the user rounding precision and mode.                *
*                                                                       *
*************************************************************************

_expm1
        fmove.d         (4,sp),fp0
@expm1
;--entry point for EXPM1(X), here X is finite, non-zero, non-NaN

;--Step 1.
;--Step 1.1
        link            a0,#-TEMP_SIZE
        fmove.x         fp0,(SC,a0)
        move.l          (SC,a0),d1              ; load part of input X
        and.l           #$7FFF0000,d1           ; biased expo. of X
        cmp.l           #$3FFD0000,d1           ; 1/4
        bge.b           .EM1CON1                ; |X| >= 1/4
        bra             .EM1SM

.EM1CON1
;--Step 1.3
;--The case |X| >= 1/4
        move.w          (SC+4,a0),d1            ; expo. and partial sig. of |X|
        cmp.l           #$4004C215,d1           ; 70log2 rounded up to 16 bits
        ble.b           .EM1MAIN                ; 1/4 <= |X| <= 70log2
        bra             .EM1BIG

.EM1MAIN
;--Step 2.
;--This is the case:    1/4 <= |X| <= 70 log2.

        fmove.x         fp0,fp1
        fmul.s          #$42B8AA3B,fp0          ; 64/log2 * X
        fmovem.x        fp2/fp3,-(sp)           ; save fp2 {fp2/fp3}
        fmove.l         fp0,d1                  ; N = int( X * 64/log2 )
        lea             (EEXPTBL,pc),a1
        fmove.l         d1,fp0                  ; convert to floating-format
        move.l          d1,(L_SCR1,a0)          ; save N temporarily
        and.l           #$3F,d1                 ; D0 is J = N mod 64
        lsl.l           #4,d1
        add.l           d1,a1                   ; address of 2^(J/64)
        move.l          (L_SCR1,a0),d1
        asr.l           #6,d1                   ; D0 is M
        move.l          d1,(L_SCR1,a0)          ; save a copy of M

;--Step 3.
;--fp1,fp2 saved on the stack. fp0 is N, fp1 is X,
;--a0 points to 2^(J/64), D0 and a1 both contain M
        fmove.x         fp0,fp2
        fmul.s          #$BC317218,fp0          ; N * L1, L1 = lead(-log2/64)
        fmul.x          (L2,pc),fp2             ; N * L2, L1+L2 = -log2/64
        fadd.x          fp1,fp0                 ; X + N*L1
        fadd.x          fp2,fp0                 ; fp0 is R, reduced arg.
        add.w           #$3FFF,d1               ; D0 is biased expo. of 2^M

;--Step 4.
;--WE NOW COMPUTE EXP(R)-1 BY A POLYNOMIAL
;-- R + R*R*(A1 + R*(A2 + R*(A3 + R*(A4 + R*(A5 + R*A6)))))
;--TO FULLY UTILIZE THE PIPELINE, WE COMPUTE S = R*R
;--[R*S*(A2+S*(A4+S*A6))] + [R+S*(A1+S*(A3+S*A5))]

        fmove.x         fp0,fp1
        fmul.x          fp1,fp1                 ; fp1 IS S = R*R

        fmove.s         #$3950097B,fp2          ; fp2 IS a6

        fmul.x          fp1,fp2                 ; fp2 IS S*A6
        fmove.x         fp1,fp3
        fmul.s          #$3AB60B6A,fp3          ; fp3 IS S*A5

        fadd.d          (EM1A4,pc),fp2          ; fp2 IS A4+S*A6
        fadd.d          (EM1A3,pc),fp3          ; fp3 IS A3+S*A5
        move.w          d1,(SC,a0)              ; SC is 2^(M) in extended
        move.l          #$80000000,(SC+4,a0)
        clr.l           (SC+8,a0)

        fmul.x          fp1,fp2                 ; fp2 IS S*(A4+S*A6)
        move.l          (L_SCR1,a0),d1          ; D0 is M
        neg.w           d1                      ; D0 is -M
        fmul.x          fp1,fp3                 ; fp3 IS S*(A3+S*A5)
        add.w           #$3FFF,d1               ; biased expo. of 2^(-M)
        fadd.d          (EM1A2,pc),fp2          ; fp2 IS A2+S*(A4+S*A6)
        fadd.s          #$3F000000,fp3          ; fp3 IS A1+S*(A3+S*A5)
        fmul.x          fp1,fp2                 ; fp2 IS S*(A2+S*(A4+S*A6))
        or.w            #$8000,d1               ; signed/expo. of -2^(-M)
        move.w          d1,(ONEBYSC,a0)         ; OnebySc is -2^(-M)
        move.l          #$80000000,(ONEBYSC+4,a0)
        clr.l           (ONEBYSC+8,a0)
        fmul.x          fp3,fp1                 ; fp1 IS S*(A1+S*(A3+S*A5))

        fmul.x          fp0,fp2                 ; fp2 IS R*S*(A2+S*(A4+S*A6))
        fadd.x          fp1,fp0                 ; fp0 IS R+S*(A1+S*(A3+S*A5))

        fadd.x          fp2,fp0                 ; fp0 IS EXP(R)-1

        fmovem.x        (sp)+,fp2/fp3           ; fp2 restored {fp2/fp3}

;--Step 5
;--Compute 2^(J/64)*p

        fmul.x          (a1),fp0                ; 2^(J/64)*(Exp(R)-1)

;--Step 6
;--Step 6.1
        move.l          (L_SCR1,a0),d1          ; retrieve M
        cmp.l           #63,d1
        ble.b           .MLE63
;--Step 6.2     M >= 64
        fmove.s         (12,a1),fp1             ; fp1 is t
        fadd.x          (ONEBYSC,a0),fp1        ; fp1 is t+OnebySc
        fadd.x          fp1,fp0                 ; p+(t+OnebySc), fp1 released
        fadd.x          (a1),fp0                ; T+(p+(t+OnebySc))
        bra             .EM1SCALE
.MLE63:
;--Step 6.3     M <= 63
        cmp.l           #-3,d1
        bge.b           .MGEN3
.MLTN3:
;--Step 6.4     M <= -4
        fadd.s          (12,a1),fp0             ; p+t
        fadd.x          (a1),fp0                ; T+(p+t)
        fadd.x          (ONEBYSC,a0),fp0        ; OnebySc + (T+(p+t))
        bra             .EM1SCALE
.MGEN3
;--Step 6.5     -3 <= M <= 63
        fmove.x         (a1)+,fp1               ; fp1 is T
        fadd.s          (a1),fp0                ; fp0 is p+t
        fadd.x          (ONEBYSC,a0),fp1        ; fp1 is T+OnebySc
        fadd.x          fp1,fp0                 ; (T+OnebySc)+(p+t)
.EM1SCALE
;--Step 6.6
        fmul.x          (SC,a0),fp0
        unlk            a0
        rts

.EM1SM
;--Step 7       |X| < 1/4.
        cmp.l           #$3FBE0000,d1           ; 2^(-65)
        bge.b           .EM1POLY

.EM1TINY
;--Step 8       |X| < 2^(-65)
        cmp.l           #$00330000,d1           ; 2^(-16312)
        blt.b           .EM12TINY
;--Step 8.2
        move.l          #$80010000,(SC,a0)      ; SC is -2^(-16382)
        move.l          #$80000000,(SC+4,a0)
        clr.l           (SC+8,a0)

        fadd.x          (SC,a0),fp0
        unlk            a0
        rts

.EM12TINY
;--Step 8.3
        fmul.d          (TWO140,pc),fp0
        move.l          #$80010000,(SC,a0)
        move.l          #$80000000,(SC+4,a0)
        clr.l           (SC+8,a0)
        fadd.x          (SC,a0),fp0
        fmul.d          (TWON140,pc),fp0
        unlk            a0
        rts

.EM1POLY
;--Step 9       exp(X)-1 by a simple polynomial
        fmul.x          fp0,fp0                 ; fp0 is S := X*X
        fmovem.x        fp2/fp3,-(sp)           ; save fp2 {fp2/fp3}
        fmove.s         #$2F30CAA8,fp1          ; fp1 is B12
        fmul.x          fp0,fp1                 ; fp1 is S*B12
        fmove.s         #$310F8290,fp2          ; fp2 is B11
        fadd.s          #$32D73220,fp1          ; fp1 is B10+S*B12
        fmul.x          fp0,fp2                 ; fp2 is S*B11
        fmul.x          fp0,fp1                 ; fp1 is S*(B10 + ...

        fadd.s          #$3493F281,fp2          ; fp2 is B9+S*...
        fadd.d          (EM1B8,pc),fp1          ; fp1 is B8+S*...

        fmul.x          fp0,fp2                 ; fp2 is S*(B9+...
        fmul.x          fp0,fp1                 ; fp1 is S*(B8+...

        fadd.d          (EM1B7,pc),fp2          ; fp2 is B7+S*...
        fadd.d          (EM1B6,pc),fp1          ; fp1 is B6+S*...

        fmul.x          fp0,fp2                 ; fp2 is S*(B7+...
        fmul.x          fp0,fp1                 ; fp1 is S*(B6+...

        fadd.d          (EM1B5,pc),fp2          ; fp2 is B5+S*...
        fadd.d          (EM1B4,pc),fp1          ; fp1 is B4+S*...

        fmul.x          fp0,fp2                 ; fp2 is S*(B5+...
        fmul.x          fp0,fp1                 ; fp1 is S*(B4+...

        fadd.d          (EM1B3,pc),fp2          ; fp2 is B3+S*...
        fadd.x          (EM1B2,pc),fp1          ; fp1 is B2+S*...

        fmul.x          fp0,fp2                 ; fp2 is S*(B3+...
        fmul.x          fp0,fp1                 ; fp1 is S*(B2+...

        fmul.x          fp0,fp2                 ; fp2 is S*S*(B3+...)
        fmul.x          (SC,a0),fp1             ; fp1 is X*S*(B2...

        fmul.s          #$3F000000,fp0          ; fp0 is S*B1
        fadd.x          fp2,fp1                 ; fp1 is Q

        fmovem.x        (sp)+,fp2/fp3           ; fp2 restored {fp2/fp3}

        fadd.x          fp1,fp0                 ; fp0 is S*B1+Q

        fadd.x          (SC,a0),fp0
        unlk            a0
        rts

.EM1BIG
;--Step 10      |X| > 70 log2
        move.l          (SC,a0),d1
        unlk            a0
        tst.l           d1
        bgt.w           @exp

;--Step 10.2
        fmove.s         #$BF800000,fp0          ; fp0 is -1
        fadd.s          #$00800000,fp0          ; -1 + 2^(-126)
        unlk            a0
        rts