******************************************************************************* * * * 64X64 BIT MULTIPLICATION (UNSIGNED) * * 01/30/07 (dkc) * * * * This C64 subroutine does 64x64 bit multiplication. The calling sequence * * of the subroutine is; * * * * multiplicand (d[0], d[1]) => a4, b4 * * multiplier (m[0], m[1]) => a6, b6 * * address of product => a8 * * * ******************************************************************************* .def _mul64_64 .text _mul64_64: sub.s1x b15, 8, a3 ; load sp-2 mpyhlu.m1x a4, b6, a0 ; d3 * m0 || mpyhlu.m2 b4, b6, b0 ; d1 * m0 || mv.l1 a8, a7 ; save address of product || mv.l2x a6, b12 ; load m[0] || stw.d2 b12, *b15-- ; save b12 mpyu.m1x a4, b6, a8 ; d2 * m0 || mpyu.m2 b4, b6, b8 ; d0 * m0 || stw.d2 a10, *b15--[2] ; save a10 || stw.d1 b10, *a3--[2] ; save b10 shl.s1 a0, 16, a1 ; d3*m0 << 16 || shl.s2 b0, 16, b1 ; d1*m0 << 16 || zero.l1 a9 ; zero odd register of pair || zero.l2 b9 ; zero odd register of pair || mpyhu.m1x a4, b6, a2 ; d3 * m1 || mpyhu.m2 b4, b6, b2 ; d1 * m1 || stw.d2 a11, *b15--[2] ; save a11 || stw.d1 b11, *a3--[2] ; save b11 addu.l1 a9:a8, a1, a9:a8 ; d3*m0<<16 + d2*m0 || addu.l2 b9:b8, b1, b9:b8 ; p0 = d1*m0<<16 + d0*m0 || shru.s1 a0, 16, a0 ; p2 = d3*m0 >> 16 || shru.s2 b0, 16, b0 ; d1*m0 >> 16 || subab.d2 b1, b1, b1 ; load 0 || mpylhu.m1x a4, b6, a10 ; d2 * m1 || mpylhu.m2 b4, b6, b10 ; d0 * m1 addab.d1 a0, a9, a0 ; p2 = p2 + carry || addu.l2 b1:b0, b9, b1:b0 ; d1*m0>>16 + carry || mv.s2x a8, b9 ; load d3*m0<<16 + d2*m0 || shl.s1 a2, 16, a3 ; d3*m1 << 16 || mpy.m2 b11, 0, b11 ; zero odd register of pair || zero.l1 a11 ; zero odd register of pair || addab.d2 b12, 0, b6 ; load m[0] addu.l2 b1:b0, b9, b1:b0 ; p1 = d1*m0>>16+carry+d3*m0<<16+d2*mp || shl.s2 b2, 16, b3 ; d1*m1 << 16 || shru.s1 a2, 16, a2 ; p2' = d3*m1 >> 16 || addab.d2 b3, 0, b7 ; save return address add.s1x a0, b1, a0 ; p2 = p2 + carry || addu.l1 a11:a10, a3, a11:a10 ; d3*m1<<16 + d2*m1 || addu.l2 b11:b10, b3, b11:b10 ; p0' = d1*m1<<16 + d0*m1 || shru.s2 b2, 16, b2 ; d1*m1 >> 16 || subab.d2 b3, b3, b3 ; load 0 addab.d1 a2, a11, a2 ; p2' = p2' + carry || addu.l2 b3:b2, b11, b3:b2 ; d1*m1>>16 + carry || mv.s2x a10, b11 ; load d3*m1<<16 + d2*m1 || mpy.m2 b9, 0, b9 ; load 0 addu.l2 b3:b2, b11, b3:b2 ; p1' = d1*m0>>16+carry+d3*m0<<16+d2*mp || shl.s2 b10, 16, b11 ; p0' << 16 || mpy.m2 b1, 0, b1 ; load 0 add.s1x a2, b3, a2 ; p2' = p2' + carry || addu.l2 b9:b8, b11, b9:b8 ; P0 = p0 + p0'<<16 || shru.s2 b10, 16, b10 ; p0' >> 16 addu.l2 b1:b0, b9, b1:b0 ; p1 + carry || shl.s2 b2, 16, b9 ; p1' << 16 || shl.s1 a2, 16, a2 ; p2' << 16 addu.l2 b1:b0, b10, b1:b0 ; p1 + carry + p0'>>16 || shru.s2 b2, 16, b2 ; p1' >> 16 || add.l1 a0, a2, a0 ; p2 + p2'<<16 addu.l2 b1:b0, b9, b1:b0 ; P1 = p1'<<16 + p1 + carry + p0'>>16 || add.s2x a0, b2, b2 ; P2 = p2 + p2'<<16 + p1'>>16 || mpyhlu.m1 a4, a6, a0 ; d3 * m2 || mpyhlu.m2 b4, b6, b0 ; d1 * m2 add.s2 b2, b1, b13 ; P2 = P2 + carry || stw.d2 b13, *b15-- ; save b13 || mpyu.m1 a4, a6, a8 ; d2 * m2 || mpyu.m2x b4, a6, b8 ; d0 * m2 || mv.l2 b0, b12 ; save P1 shl.s1 a0, 16, a1 ; d3*m2 << 16 || shl.s2 b0, 16, b1 ; d1*m2 << 16 || zero.l1 a9 ; zero odd register of pair || zero.l2 b9 ; zero odd register of pair || mpyhu.m1 a4, a6, a2 ; d3 * m3 || mpyhu.m2x b4, a6, b2 ; d1 * m3 || stw.d1 b8, *+a7[3] ; store P0 addu.l1 a9:a8, a1, a9:a8 ; d3*m2<<16 + d2*m2 || addu.l2 b9:b8, b1, b9:b8 ; p0 = d1*m2<<16 + d0*m2 || shru.s1 a0, 16, a0 ; p2 = d3*m2 >> 16 || shru.s2 b0, 16, b0 ; d1*m2 >> 16 || subab.d2 b1, b1, b1 ; load 0 || mpylhu.m1 a4, a6, a10 ; d2 * m3 || mpylhu.m2x b4, a6, b10 ; d0 * m3 addab.d1 a0, a9, a0 ; p2 = p2 + carry || addu.l2 b1:b0, b9, b1:b0 ; d1*m2>>16 + carry || mv.s2x a8, b9 ; load d3*m2<<16 + d2*m2 || shl.s1 a2, 16, a3 ; d3*m3 << 16 || mpy.m2 b11, 0, b11 ; zero odd register of pair || zero.l1 a11 ; zero odd register of pair addu.l2 b1:b0, b9, b1:b0 ; p1 = d1*m2>>16+carry+d3*m2<<16+d2*mp || shl.s2 b2, 16, b3 ; d1*m3 << 16 || shru.s1 a2, 16, a2 ; p2' = d3*m3 >> 16 add.s1x a0, b1, a0 ; p2 = p2 + carry || addu.l1 a11:a10, a3, a11:a10 ; d3*m3<<16 + d2*m3 || addu.l2 b11:b10, b3, b11:b10 ; p0' = d1*m3<<16 + d0*m3 || shru.s2 b2, 16, b2 ; d1*m3 >> 16 || subab.d2 b3, b3, b3 ; load 0 addab.d1 a2, a11, a2 ; p2' = p2' + carry || addu.l2 b3:b2, b11, b3:b2 ; d1*m3>>16 + carry || mv.s2x a10, b11 ; load d3*m3<<16 + d2*m3 || mpy.m2 b9, 0, b9 ; load 0 addu.l2 b3:b2, b11, b3:b2 ; p1' = d1*m2>>16+carry+d3*m2<<16+d2*mp || shl.s2 b10, 16, b11 ; p0' << 16 || mpy.m2 b1, 0, b1 ; load 0 add.s1x a2, b3, a2 ; p2' = p2' + carry || addu.l2 b9:b8, b11, b9:b8 ; P0 = p0 + p0'<<16 || shru.s2 b10, 16, b10 ; p0' >> 16 addu.l2 b1:b0, b9, b1:b0 ; p1 + carry || shl.s2 b2, 16, b9 ; p1' << 16 || shl.s1 a2, 16, a2 ; p2' << 16 addu.l2 b1:b0, b10, b1:b0 ; p1 + carry + p0'>>16 || shru.s2 b2, 16, b2 ; p1' >> 16 || add.l1 a0, a2, a0 ; p2 + p2'<<16 || mpy.m2 b9, 0, b9 ; load 0 addu.l2 b1:b0, b9, b1:b0 ; P1 = p1'<<16 + p1 + carry + p0'>>16 || add.l1x a0, b2, a0 ; P2 = p2 + p2'<<16 + p1'>>16 || mpy.m2 b1, 0, b1 ; load 0 add.s1x a0, b1, a0 ; P2 = P2 + carry || addu.l2 b9:b8, b12, b9:b8 ; P0 + old P1 addu.l2 b1:b0, b9, b1:b0 ; P1 + carry addu.l2 b1:b0, b13, b1:b0 ; P1 + carry + old P2 || add.l1x b15, 4, a3 ; load sp + 1 add.l2x a0, b1, b1 ; P2 + carry || ldw.d2 *++b15[1], b13 ; restore b13 || ldw.d1 *++a3[1], a0 ; restore b11 ldw.d2 *++b15[2], a11 ; restore a11 || ldw.d1 *++a3[2], b10 ; restore b10 || b.s2 b7 ; return ldw.d2 *++b15[2], a10 ; restore a10 || ldw.d1 *++a3[2], b12 ; restore b12 stw.d1 b8, *+a7[2] ; store P0 || addaw.d2 b15, 1, b15 ; pop stack stw.d1 b0, *+a7[1] ; store P1 stw.d1 b1, *a7 ; store P2 || mv.l2x a0, b11 ; load b11 nop .end