*******************************************************************************
*                                                                             *
*  64/32 BIT DIVISION (UNSIGNED)                                              *
*  01/30/07 (dkc)                                                             *
*                                                                             *
*  This C64 subroutine does 64/32 bit division.  The calling sequence of the  *
*  subroutine is:                                                             *
*                                                                             *
*     address of dividend (A[0], A[1]) => a4                                  *
*     address of quotient (B[0], B[1]) => b4                                  *
*     divisor => a6                                                           *
*                                                                             *
*******************************************************************************
*
*  Word order: A[0] is the most-significant word of the dividend and A[1] the
*  least-significant word (the code adds 32 to lmbd(A[1]) when A[0] == 0).
*  The quotient is stored the same way: B[0] = high word, B[1] = low word.
*  The remainder is not returned.
*
*  Method: the divisor is aligned under the leading 1 bit of the dividend
*  (shift = leading zeros of the 64-bit divisor - leading zeros of the
*  dividend), negated, and then shift+1 conditional-subtract/shift-left steps
*  are run on the 64-bit working value.  Quotient bits are shifted in at bit 0;
*  after the loop the bits above the quotient are masked off with extu.
*
*  Registers written: a0-a9, b0-b2, b5-b7, b9.  a10 and a11 are pushed on the
*  stack (b15) and restored.  Return is through b3.
*
*  Timing facts the schedule relies on: a branch has 5 delay slots, ldw has 4,
*  mpy has 1; within one execute packet all sources are read before any
*  destination is written.
*
*  NOTE(review): addab/subab are used as plain add/subtract, including with
*  a4 and b6 as the base operand.  This assumes AMR selects linear addressing
*  for those registers -- confirm against the callers.
*
*  NOTE(review): the "A >= D" decision in the loop is taken from the sign of
*  the 64-bit difference (cmplt a1, 0), not from a carry.  That looks valid
*  only while the aligned divisor is below 2^63, i.e. while bit 63 of the
*  dividend is clear; e.g. 0xC0000000_00000000 / 0xC0000000 appears to take
*  the wrong branch on the second iteration.  Confirm the intended operand
*  range before relying on this routine for dividends with bit 63 set.
*
*  NOTE(review): no explicit check for a zero divisor is visible.
*
        .global _div64_32
        .text
_div64_32:
        ldw.d1 *a4, a1              ; a1 = A[0] (high word), valid after 4 delay slots
||      lmbd.l1 1, a6, a5           ; a5 = leading-zero count of divisor (32 if no 1 bit)
||      mvk.s1 32, a3               ; a3 = 32

        ldw.d1 *+a4[1], a4          ; a4 = A[1] (low word); the pointer in a4 is consumed here
||      add.l1 a5, a3, a5           ; a5 = 32 + lmbd(divisor): leading zeros as a 64-bit value

        nop 4                       ; wait for both loads to land

        lmbd.l1 1, a1, a0           ; a0 = leading zeros of A[0]
||      lmbd.l2x 1, a4, b0          ; b0 = leading zeros of A[1]
||      zero.s1 a2                  ; clear the "quotient is zero" flag

  [!a1] add.s1x b0, a3, a0          ; A[0] == 0: a0 = 32 + leading zeros of A[1]
||[!a1] cmpltu.l1 a4, a6, a2        ; A[0] == 0: a2 = (A[1] < divisor), unsigned

        sub.l1 a5, a0, a8           ; a8 = shift = lz64(divisor) - lz64(dividend)
||      mv.l2x a3, b0               ; b0 = 32
||      mpy.m1 a7, 0, a7            ; D[0] = 0 (result lands one cycle later)
|| [a2] b.s2 askip                  ; dividend < divisor: return zero; the next 5
                                    ;   packets are delay slots and still execute
||      mv.s1 a1, a5                ; a5 = A[0]

        cmplt.l1 a8, a3, a2         ; a2 = (shift < 32); the [a2] predicates in this
                                    ;   packet still see the old "return zero" flag
||      sub.l2x b0, a8, b0          ; b0 = 32 - shift
||      sub.s1 a8, a3, a9           ; a9 = shift - 32
|| [a2] mpy.m1 a5, 0, a5            ; return-zero path: quotient high word = 0
|| [a2] subab.d1 a4, a4, a4         ; return-zero path: quotient low word = 0
||      mvk.s2 32, b5               ; b5 = 32

  [!a2] mv.l1 a6, a7                ; shift >= 32: D[0] = divisor
||[!a2] mpy.m1 a6, 0, a6            ; shift >= 32: D[1] = 0
|| [a2] shl.s1 a6, a8, a6           ; shift <  32: D[1] = divisor << shift
|| [a2] shru.s2x a6, b0, b0         ; shift <  32: b0 = divisor >> (32-shift)

  [!a2] shl.s1 a7, a9, a7           ; shift >= 32: D[0] = divisor << (shift-32)
|| [a2] mv.l1x b0, a7               ; shift <  32: D[0] = divisor >> (32-shift)
||      mvk.s2 63, b0               ; b0 = 63
||      mv.l2x a8, b2               ; b2 = shift (loop counter)
||      subab.d1 a9, a9, a9         ; a9 = 0

        not.l1 a7, a0               ; a0 = ~D[0]
||      not.s1 a6, a8               ; a8 = ~D[1]
||      mv.l2x a8, b2               ; b2 = shift again (a8 is read before not.s1 overwrites it)
||      subab.d2 b0, b2, b0         ; b0 = 63 - shift
||      mvk.s2 31, b6               ; b6 = 31
||      addab.d1 a9, 1, a1          ; a1 = 1

        cmplt.l2 b0, b5, b1         ; b1 = (63-shift < 32), i.e. shift >= 32
||      shl.s2 b0, 5, b9            ; b9 = (63-shift) << 5
||      subab.d2 b6, b2, b6         ; b6 = 31 - shift
||      addu.l1 a9:a8, a1, a9:a8    ; a9:a8 = ~D[1] + 1; carry lands in bit 0 of a9
* A taken askip branch takes effect here, so the stack pushes below are skipped.
        and.l1 a9, 1, a9            ; a9 = carry out of ~D[1] + 1
||      mv.s1 a5, a10               ; a10 = A[0] (working high word); stw below still
                                    ;   stores the caller's a10
||      mpy.m1 a5, 0, a5            ; a5 = 0, so a5:a4 is A[1] zero-extended for addu
||      or.l2 b9, b0, b9            ; b9 = ((63-shift) << 5) | (63-shift): extu field pair
||      shl.s2 b6, 5, b7            ; b7 = (31-shift) << 5
||      stw.d2 a10, *b15--          ; push caller's a10

        add.l1 a0, a9, a9           ; a9 = ~D[0] + carry = high word of -(D[0]:D[1])
||      cmpgt.l2 b0, b5, b0         ; b0 = (63-shift > 32), i.e. shift < 31
||      or.s2 b6, b7, b6            ; b6 = ((31-shift) << 5) | (31-shift): extu field pair
||      zero.s1 a1                  ; a1 = 0
||      stw.d2 a11, *b15--          ; push caller's a11
*****************
*  begin loop   *
*****************
* Loop state: a10:a4 = working value A[0]:A[1], a9:a8 = -(D[0]:D[1]), a5 = 0,
* b2 = iterations still to start.  The branch is issued in the first packet and
* the remaining five packets are its delay slots, so the body runs shift+1 times.
aloop   addu.l1 a5:a4, a8, a1:a0    ; a1:a0 = A[1] + low word of -D; carry in bit 0 of a1
||      shru.s1 a4, 31, a3          ; a3 = MSB of A[1]
||      addab.d1 a10, a10, a6       ; a6 = A[0] << 1
|| [b2] b.s2 aloop                  ; run another iteration while b2 != 0
|| [b2] sub.l2 b2, 1, b2            ; b2--

        and.l1 a1, 1, a7            ; a7 = carry from the low-word add
||      addab.d1 a10, a9, a1        ; a1 = A[0] + high word of -D
||      shl.s1 a4, 1, a11           ; a11 = A[1] << 1

        add.l1 a1, a7, a1           ; a1 = high word of (A - D)
||      or.s1 a3, a6, a3            ; a3 = (A[0] << 1) | MSB of A[1]

        cmplt.l1 a1, 0, a2          ; a2 = (A - D < 0): signed test on the high word
||      shl.s1 a1:a0, 1, a1:a0      ; a0 = low diff << 1; MSB of low diff moves to bit 0 of a1
||      addab.d1 a1, a1, a7         ; a7 = high diff << 1

   [a2] addab.d1 a3, 0, a10         ; A <  D: A[0] = (A[0] << 1) | MSB of A[1]
|| [a2] mv.s1 a11, a4               ; A <  D: A[1] = A[1] << 1 (quotient bit 0)
||      or.l1 a0, 1, a0             ; a0 = (low diff << 1) | 1 (quotient bit 1)
||      and.l2x a1, 1, b7           ; b7 = MSB of low diff (bit carried into the high word)
||      mpy.m1x b9, 1, a0           ; a0 = b9 one cycle later, after mv.s1 below has read a0;
                                    ;   only used once the loop exits

  [!a2] or.l1x a7, b7, a10          ; A >= D: A[0] = (high diff << 1) | MSB of low diff
||[!a2] mv.s1 a0, a4                ; A >= D: A[1] = (low diff << 1) | 1
||      subab.d1 a1, a1, a1         ; a1 = 0
*****************
*  end loop     *
*****************
* The low shift+1 bits of a10:a4 now hold the quotient; extu with a (n<<5)|n
* field pair shifts left by n then right by n, clearing the top n bits.
   [b1] extu.s1 a10, a0, a5         ; shift >= 32: B[0] = A[0] with top (63-shift) bits cleared
||      mv.l1x b6, a0               ; a0 = field pair for A[1]
||      ldw.d2 *++b15[1], a11       ; pop a11

  [!b1] zero.l1 a5                  ; shift <  32: quotient high word = 0
|| [b0] extu.s1 a4, a0, a4          ; shift <  31: clear top (31-shift) bits of A[1]
||      ldw.d2 *++b15[1], a10       ; pop a10

askip   b.s2 b3                     ; return to caller (5 delay slots follow)
||      stw.d2 a5, *b4              ; B[0] = quotient high word

        stw.d2 a4, *+b4[1]          ; B[1] = quotient low word

        nop 4                       ; remaining delay slots; the a10 load completes here
        .end