******************************************************************************* * * * 128/64 BIT DIVISION (UNSIGNED) * * 01/30/07 (dkc) * * * * This C64 subroutine does 128/64 bit division. The calling sequence of the * * subroutine is; * * * * dividend (A[0], A[1], A[2], A[3]) => a4, b4, a6, b6 * * address of quotient => a8 * * divisor (D[0], D[1]) => b8, a10 * * * ******************************************************************************* .global _div128_64 .text _div128_64: mv.l1 a4, a1 ; load A[0] || mv.s1x b4, a4 ; load A[1] || addab.d1 a6, 0, a13 ; load A[2] || mv.l2x a8, b4 ; load address of quotient || mvk.s2 32, b9 ; load 32 || stw.d2 a13, *b15-- ; save a13 lmbd.l2 1, b8, b5 ; left-most bit detection || lmbd.l1 1, a10, a5 ; left-most bit detection || mv.s1x b6, a3 ; load A[3] || mv.s2x a13, b2 ; load A[2] || addab.d1 a10, 0, a6 ; load D[3] || addab.d2 b8, 0, b1 ; load D[2] [!b1] add.l2x b5, a5, b5 ; divisor left-most bit detection || cmpltu.l1x a13, b8, a2 ; compare A[2] to D[2] || stw.d2 a12, *b15-- ; save a12 || or.s1 a1, a4, a9 ; A[0] | A[1] || addab.d1 a1, 0, a5 ; load A[0] lmbd.l1 1, a5, a0 ; left-most bit detection (A[0]) || lmbd.l2x 1, a4, b0 ; left-most bit detection (A[1]) || sub.s2 b2, b8, b2 ; compare A[2] to D[2] || mv.s1x b5, a7 ; load divisor left-most bit detection || addab.d1 a3, 0, a12 ; load A[3] || stw.d2 b11, *b15-- ; save b11 || mpy.m2 b11, 0, b11 ; zero D[0] [!a1] add.s1x b0, a0, a0 ; left-most bit detection (A[0] and A[1]) ||[!b2] cmpltu.l1 a12, a6, a2 ; compare A[3] to D[3] || lmbd.l2x 1, a13, b5 ; left-most bit detection (A[2]) || addab.d1 a9, 0, a1 ; load A[0] | A[1] || stw.d2 b10, *b15-- ; save b10 || mpy.m2 b10, 0, b10 ; zero D[1] [!a1] add.s1x a0, b5, a0 ; left-most bit detection (A[0], A[1], and A[2]) || or.l1 a1, a13, a1 ; A[0] | A[1] | A[2] || lmbd.l2x 1, a12, b5 ; left-most bit detection (A[3]) || stw.d2 b12, *b15-- ; save b12 || [a1] subab.d1 a2, a2, a2 ; load zero [!a1] add.l1x a0, b5, a0 ; dividend left-most bit detection || addk.s1 64, a7 ; divisor left-most bit detection + 64 || [a2] subab.d1 a4, a4, a4 ; load 0 || [a2] mpy.m1 a13, 0, a13 ; load 0 subab.d1 a7, a0, a8 ; shift = lmbd(1,x2) - lmbd(1,x1) || mv.l1x b8, a7 ; load D[2] || [a2] b.s2 zskip ; return zero || [a2] mpy.m1 a5, 0, a5 ; load 0 || mvk.s1 32, a3 ; load 32 [!a2] cmplt.l1 a8, a3, a2 ; compare shift to 32 || [a2] subab.d1 a2, a2, a2 ; load zero || sub.l2x b9, a8, b0 ; 32 - shift || sub.s1 a8, a3, a9 ; shift - 32 || mvk.s2 32, b5 ; load 32 || [a2] mpy.m1 a12, 0, a12 ; load 0 [!a2] addab.d1 a6, 0, a7 ; D[2] = D[3] ||[!a2] mpy.m1 a6, 0, a6 ; clear D[3] ||[!a2] addab.d2 b10, 0, b11 ; D[0] = D[1] ||[!a2] mv.l2 b8, b10 ; D[1] = D[2] || [a2] shru.s2x a6, b0, b2 ; D[2] = D[3] >> (32-shift) || [a2] b.s1 askip ; branch if shift < 32 [a2] shl.s1 a7, a8, a7 ; D[2] << shift || [a2] shru.s2 b10, b0, b1 ; D[0] = D[1] >> (32-shift) || mv.l2x a8, b8 ; load shift || [a2] addab.d1 a7, 0, a0 ; save D[2] || [a2] mv.l1x b0, a1 ; load 32-shift [a2] or.l1x a7, b2, a7 ; load D[2] || [a2] shl.s2 b11, b8, b11 ; D[0] << shift || [a2] shru.s1 a0, a1, a0 ; D[1] = D[2] >> (32-shift) [a2] or.l2 b11, b1, b11 ; load D[0] || [a2] shl.s2 b10, b8, b10 ; D[1] << shift || [a2] shl.s1 a6, a8, a6 ; D[3] << shift [a2] or.l2x b10, a0, b10 ; load D[1] || cmplt.l1 a9, a3, a2 ; compare shift-32 to 32 ||[!a2] add.s2 b0, b5, b0 ; 64 - shift ||[!a2] sub.s1 a9, a3, a9 ; shift - 64 ||[!a2] subab.d2 b8, b5, b8 ; shift - 32 mvk.s2 3, b12 ; load 3 * [!a2] mpy.m1 a7, 0, a7 ; clear D[2] ||[!a2] addab.d2 b10, 0, b11 ; D[0] = D[1] ||[!a2] mv.l2x a7, b10 ; D[1] = D[2] || [a2] shru.s2 b10, b0, b1 ; D[0] = D[1] >> (64-shift) || [a2] b.s1 askip ; branch if shift < 64 || [a2] mv.l1x b0, a1 ; load 64-shift [a2] shl.s2 b11, b8, b11 ; D[0] << (shift-32) || [a2] shru.s1 a7, a1, a0 ; D[1] = D[2] >> (64-shift) || [a2] mv.l1x b8, a1 ; load shift - 32 [a2] or.l2 b11, b1, b11 ; load D[0] || [a2] shl.s2 b10, b8, b10 ; D[1] << (shift-32) || [a2] shl.s1 a7, a1, a7 ; D[2] << (shift-32) [a2] or.l2x b10, a0, b10 ; load D[1] || cmplt.l1 a9, a3, a2 ; compare shift-64 to 32 ||[!a2] add.s2 b0, b5, b0 ; 96 - shift ||[!a2] sub.s1 a9, a3, a9 ; shift - 96 ||[!a2] subab.d2 b8, b5, b8 ; shift - 64 mv.l1x b8, a0 ; load shift - 64 || subab.d2 b12, 1, b12 ; load 2 nop * [!a2] addab.d2 b10, 0, b11 ; D[0] = D[1] ||[!a2] mpy.m2 b10, 0, b10 ; D[1] = 0 || [a2] shru.s2 b10, b0, b1 ; D[0] = D[1] >> (96-shift) || sub.l2 b8, b5, b2 ; shift - 96 [a2] shl.s1x b11, a0, a0 ; D[0] << (shift-64) ||[!a2] shl.s2 b11, b2, b11 ; D[0] << (shift-96) || subab.d2 b12, 1, b12 ; load 1 ||[!a2] add.l2 b0, b5, b0 ; 128 - shift [a2] or.l2x b1, a0, b11 ; load D[0] || [a2] shl.s2 b10, b8, b10 ; D[1] << (shift-64) ||[!a2] subab.d2 b12, 1, b12 ; load 0 askip not.l1 a7, a0 ; invert D[2] || not.s1 a6, a8 ; invert D[3] || subab.d1 a9, a9, a9 ; load 0 || mpy.m1 a1, 0, a1 ; load 0 || mv.l2x a8, b2 ; load shift || sub.s2 b0, 1, b0 ; 127-shift add.l1 a9:a8, 1, a9:a8 ; -D[3] || mpy.m1 a3, 0, a3 ; load 0 || shl.s2 b0, 5, b9 ; (127-shift) << 5 add.l1 a1:a0, a9, a1:a0 ; -D[2] || not.s1x b10, a2 ; invert D[1] || or.l2 b9, b0, b9 ; (127-shift)::(127-shift) add.l1 a3:a2, a1, a3:a2 ; -D[1] || addab.d1 a0, 0, a9 ; load -D[2] || not.s1x b11, a0 ; invert D[0] add.l1 a0, a3, a0 ; -D[0] || mv.l2x a2, b10 ; load -D[1] || mv.s1 a13, a14 ; save A[2] || mpy.m1 a13, 0, a13 ; load 0 || stw.d2 a14, *b15-- ; save a14 mv.l2x a0, b11 ; load -D[0] || zero.s1 a1 ; load 0 || stw.d2 a15, *b15-- ; save a15 ***************** * begin loop * ***************** aloop addu.l1 a13:a12, a8, a1:a0 ; A[3] - D[3] || subab.d1 a15, a15, a15 ; load 0 || shru.s2x a14, 31, b7 ; isolate MSB of A[2] || shru.s1 a12, 31, a3 ; isolate MSB of A[3] addu.l1 a15:a14, a1, a15:a14 ; A[2] + carry || shru.s2x a0, 31, b0 ; isolate MSB of delta[3] || addab.d1 a0, a0, a0 ; (A[3]-D[3]) << 1 || shl.s1 a14, 1, a13 ; A[2] << 1 || mpy.m1 a7, 0, a7 ; load 0 addu.l1 a15:a14, a9, a15:a14 ; A[2] - D[2] || addab.d1 a4, 0, a6 ; load A[1] || or.s1 a0, 1, a0 ; (A[3]-D[3])<<1 | 1 || shru.s2x a4, 31, b5 ; isolate MSB of A[1] addu.l1 a7:a6, a15, a7:a6 ; A[1] + carry || addab.d1 a14, a14, a1 ; (A[2]-D[2]) << 1 || shru.s2x a14, 31, b1 ; isolate MSB of delta[2] || mv.s1x b10, a14 ; load D[1] addu.l1 a7:a6, a14, a7:a6 ; A[1] - D[1] || or.s1x a1, b0, a1 ; (A[2]-D[2])<<1 | LSB || addab.d1 a12, a12, a12 ; A[3] << 1 || [b2] b.s2 aloop ; conditional branch to loop beginning add.l1 a5, a7, a7 ; A[0] + carry || shru.s1 a6, 31, a5 ; isolate MSB of delta[1] || addab.d1 a6, a6, a6 ; (A[1]-D[1]) << 1 || shl.s2x a5, 1, b6 ; A[0] << 1 add.l1x a7, b11, a7 ; A[0] - D[0] || or.s1 a13, a3, a14 ; (A[2]<<1) | LSB || addab.d1 a4, a4, a4 ; A[1] << 1 || or.l2 b6, b5, b6 ; (A[0]<<1) | LSB cmplt.l1 a7, 0, a2 ; compare delta to 0 || addab.d1 a7, a7, a7 ; (A[0]-D[0]) << 1 || or.s1x a6, b1, a6 ; (A[1]-D[1])<<1 | LSB || or.l2x b7, a4, b7 ; (A[1]<<1) | LSB || mpy.m1 a1, 0, a1 ; load 0 [!a2] or.l1 a7, a5, a5 ; A[0] = (A[0]-D[0])<<1 | LSB || [a2] mv.s1x b7, a4 ; A[1] = (A[1]<<1) | LSB ||[!a2] addab.d1 a1, 0, a14 ; A[2] = (A[2]-D[2])<<1 | LSB || mpy.m1 a13, 0, a13 ; load 0 [!a2] addab.d1 a0, 0, a12 ; A[3] = (A[3]-D[2])<<1 ||[!a2] mv.l1 a6, a4 ; A[1] = (A[1]-D[1])<<1 | LSB || [a2] mv.s1x b6, a5 ; A[0] = (A[0]<<1) | LSB || [b2] sub.l2 b2, 1, b2 ; decrement loop count ***************** * end loop * ***************** cmpeq.l2 b12, 3, b1 ; compare flag to 3 || mv.l1x b9, a0 ; load shift || mv.s1 a14, a13 ; load A[2] [b1] extu.s1 a12, a0, a12 ; A[3] << shift || [b1] zero.l1 a13 ; A[2] = 0 || [b1] subab.d1 a4, a4, a4 ; A[1] = 0 || [b1] mpy.m1 a5, 0, a5 ; A[0] = 0 || cmpeq.l2 b12, 2, b1 ; compare flag to 2 [b1] extu.s1 a13, a0, a13 ; A[2] << shift || [b1] zero.l1 a4 ; A[1] = 0 || [b1] subab.d1 a5, a5, a5 ; A[0] = 0 || cmpeq.l2 b12, 1, b1 ; compare flag to 1 [b1] extu.s1 a4, a0, a4 ; A[1] << shift || [b1] zero.l1 a5 ; A[0] = 0 || cmpeq.l2 b12, 0, b1 ; compare flag to 0 || ldw.d2 *++b15[1], a15 ; restore a15 [b1] extu.s1 a5, a0, a5 ; A[0] << shift || ldw.d2 *++b15[1], a14 ; restore a14 zskip ldw.d2 *++b15[1], b12 ; restore b12 ldw.d2 *++b15[1], b10 ; restore b10 ldw.d2 *++b15[1], b11 ; restore b11 ldw.d2 *++b15[1], a12 ; restore a12 || b.s2 b3 ldw.d2 *++b15[1], a13 ; restore a13 stw.d2 a5, *b4 ; store quotient stw.d2 a4, *+b4[1] ; store quotient stw.d2 a13, *+b4[2] ; store quotient || mv.l1 a12, a0 ; save quotient stw.d2 a0, *+b4[3] ; store quotient .end