*******************************************************************************
*									      *
*  64X64 BIT MULTIPLICATION (UNSIGNED)					      *
*  01/30/07 (dkc)							      *
*									      *
*  This C64x subroutine performs unsigned 64x64-bit multiplication.  The      *
*  calling sequence of the subroutine is:				      *
*									      *
*     multiplicand (d[0], d[1]) => a4, b4				      *
*     multiplier (m[0], m[1]) => a6, b6 				      *
*     address of product => a8						      *
*									      *
*******************************************************************************
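;
;  Overview: the operands are handled as four 16-bit limbs each (d3:d2 in
;  a4, d1:d0 in b4; m3:m2 in a6, m1:m0 in b6).  The first pass multiplies d
;  by the low multiplier word (b6), the second pass by the high word (a6),
;  and the partial words P0/P1/P2 of the two passes are combined into the
;  128-bit product stored at *a8, most significant word first.
;
;  A rough C model of what the routine computes (a hedged sketch only: it
;  uses 32-bit halves instead of the 16-bit limbs scheduled below, and the
;  prototype merely assumes the register assignments above match the C6000
;  convention for five word-sized arguments; names are illustrative and not
;  part of this file):
;
;      void mul64_64_ref(unsigned d0, unsigned d1, unsigned m0, unsigned m1,
;                        unsigned product[4])
;      {
;          /* d0/m0 are taken as the more significant input words */
;          unsigned long long lo, t1, t2, hi;
;          lo = (unsigned long long)d1 * m1;                 /* d_lo * m_lo */
;          t1 = (unsigned long long)d0 * m1 + (lo >> 32);    /* d_hi * m_lo */
;          t2 = (unsigned long long)d1 * m0 + (t1 & 0xffffffffULL);
;          hi = (unsigned long long)d0 * m0 + (t1 >> 32) + (t2 >> 32);
;          product[3] = (unsigned)lo;                        /* least significant */
;          product[2] = (unsigned)t2;
;          product[1] = (unsigned)hi;
;          product[0] = (unsigned)(hi >> 32);                /* most significant  */
;      }
;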
	.def _mul64_64
	.text
_mul64_64:
	sub.s1x b15, 8, a3		;  load sp-2

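;  First pass: multiply d (d3:d2:d1:d0) by the low multiplier word b6
;  (m1:m0), building the partial words P0/P1/P2; the callee-saved registers
;  are pushed in parallel with the multiplies.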
	mpyhlu.m1x a4, b6, a0		;  d3 * m0
||	mpyhlu.m2 b4, b6, b0		;  d1 * m0
||	mv.l1 a8, a7			;  save address of product
||	mv.l2x a6, b12			;  load m[0]
||	stw.d2 b12, *b15--		;  save b12

	mpyu.m1x a4, b6, a8		;  d2 * m0
||	mpyu.m2 b4, b6, b8		;  d0 * m0
||	stw.d2 a10, *b15--[2]		;  save a10
||	stw.d1 b10, *a3--[2]		;  save b10

	shl.s1 a0, 16, a1		;  d3*m0 << 16
||	shl.s2 b0, 16, b1		;  d1*m0 << 16
||	zero.l1 a9			;  zero odd register of pair
||	zero.l2 b9			;  zero odd register of pair
||	mpyhu.m1x a4, b6, a2		;  d3 * m1
||	mpyhu.m2 b4, b6, b2		;  d1 * m1
||	stw.d2 a11, *b15--[2]		;  save a11
||	stw.d1 b11, *a3--[2]		;  save b11

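;  The addu instructions below write 40-bit results into register pairs
;  (a9:a8, b9:b8, ...); the odd register, zeroed in the previous packet,
;  catches the carry out of each 32-bit add so it can be folded into the
;  next higher partial word.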
	addu.l1 a9:a8, a1, a9:a8	;  d3*m0<<16 + d2*m0
||	addu.l2 b9:b8, b1, b9:b8	;  p0 = d1*m0<<16 + d0*m0
||	shru.s1 a0, 16, a0		;  p2 = d3*m0 >> 16
||	shru.s2 b0, 16, b0		;  d1*m0 >> 16
||	subab.d2 b1, b1, b1		;  load 0
||	mpylhu.m1x a4, b6, a10		;  d2 * m1
||	mpylhu.m2 b4, b6, b10		;  d0 * m1

	addab.d1 a0, a9, a0		;  p2 = p2 + carry
||	addu.l2 b1:b0, b9, b1:b0	;  d1*m0>>16 + carry
||	mv.s2x a8, b9			;  load d3*m0<<16 + d2*m0
||	shl.s1 a2, 16, a3		;  d3*m1 << 16
||	mpy.m2 b11, 0, b11		;  zero odd register of pair
||	zero.l1 a11			;  zero odd register of pair
||	addab.d2 b12, 0, b6		;  move m[0] to b6 for the second pass

	addu.l2 b1:b0, b9, b1:b0	;  p1 = d1*m0>>16+carry+d3*m0<<16+d2*m0
||	shl.s2 b2, 16, b3		;  d1*m1 << 16
||	shru.s1 a2, 16, a2		;  p2' = d3*m1 >> 16
||	addab.d2 b3, 0, b7		;  save return address

	add.s1x a0, b1, a0		;  p2 = p2 + carry
||	addu.l1 a11:a10, a3, a11:a10	;  d3*m1<<16 + d2*m1
||	addu.l2 b11:b10, b3, b11:b10	;  p0' = d1*m1<<16 + d0*m1
||	shru.s2 b2, 16, b2		;  d1*m1 >> 16
||	subab.d2 b3, b3, b3		;  load 0

	addab.d1 a2, a11, a2		;  p2' = p2' + carry
||	addu.l2 b3:b2, b11, b3:b2	;  d1*m1>>16 + carry
||	mv.s2x a10, b11 		;  load d3*m1<<16 + d2*m1
||	mpy.m2 b9, 0, b9		;  load 0

	addu.l2 b3:b2, b11, b3:b2	;  p1' = d1*m1>>16+carry+d3*m1<<16+d2*m1
||	shl.s2 b10, 16, b11		;  p0' << 16
||	mpy.m2 b1, 0, b1		;  load 0

	add.s1x a2, b3, a2		;  p2' = p2' + carry
||	addu.l2 b9:b8, b11, b9:b8	;  P0 = p0 + p0'<<16
||	shru.s2 b10, 16, b10		;  p0' >> 16

	addu.l2 b1:b0, b9, b1:b0	;  p1 + carry
||	shl.s2 b2, 16, b9		;  p1' << 16
||	shl.s1 a2, 16, a2		;  p2' << 16

	addu.l2 b1:b0, b10, b1:b0	;  p1 + carry + p0'>>16
||	shru.s2 b2, 16, b2		;  p1' >> 16
||	add.l1 a0, a2, a0		;  p2 + p2'<<16

	addu.l2 b1:b0, b9, b1:b0	;  P1 = p1'<<16 + p1 + carry + p0'>>16
||	add.s2x a0, b2, b2		;  P2 = p2 + p2'<<16 + p1'>>16
||	mpyhlu.m1 a4, a6, a0		;  d3 * m2
||	mpyhlu.m2 b4, b6, b0		;  d1 * m2

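;  Second pass (its first multiplies were issued in the packet above):
;  multiply d by the high multiplier word (m3:m2), saving the first-pass
;  P1/P2 in b12/b13 and storing the least significant product word.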
	add.s2 b2, b1, b13		;  P2 = P2 + carry
||	stw.d2 b13, *b15--		;  save b13
||	mpyu.m1 a4, a6, a8		;  d2 * m2
||	mpyu.m2x b4, a6, b8		;  d0 * m2
||	mv.l2 b0, b12			;  save P1

	shl.s1 a0, 16, a1		;  d3*m2 << 16
||	shl.s2 b0, 16, b1		;  d1*m2 << 16
||	zero.l1 a9			;  zero odd register of pair
||	zero.l2 b9			;  zero odd register of pair
||	mpyhu.m1 a4, a6, a2		;  d3 * m3
||	mpyhu.m2x b4, a6, b2		;  d1 * m3
||	stw.d1 b8, *+a7[3]		;  store first-pass P0 (product[3], LSW)

	addu.l1 a9:a8, a1, a9:a8	;  d3*m2<<16 + d2*m2
||	addu.l2 b9:b8, b1, b9:b8	;  p0 = d1*m2<<16 + d0*m2
||	shru.s1 a0, 16, a0		;  p2 = d3*m2 >> 16
||	shru.s2 b0, 16, b0		;  d1*m2 >> 16
||	subab.d2 b1, b1, b1		;  load 0
||	mpylhu.m1 a4, a6, a10		;  d2 * m3
||	mpylhu.m2x b4, a6, b10		;  d0 * m3

	addab.d1 a0, a9, a0		;  p2 = p2 + carry
||	addu.l2 b1:b0, b9, b1:b0	;  d1*m2>>16 + carry
||	mv.s2x a8, b9			;  load d3*m2<<16 + d2*m2
||	shl.s1 a2, 16, a3		;  d3*m3 << 16
||	mpy.m2 b11, 0, b11		;  zero odd register of pair
||	zero.l1 a11			;  zero odd register of pair

	addu.l2 b1:b0, b9, b1:b0	;  p1 = d1*m2>>16+carry+d3*m2<<16+d2*m2
||	shl.s2 b2, 16, b3		;  d1*m3 << 16
||	shru.s1 a2, 16, a2		;  p2' = d3*m3 >> 16

	add.s1x a0, b1, a0		;  p2 = p2 + carry
||	addu.l1 a11:a10, a3, a11:a10	;  d3*m3<<16 + d2*m3
||	addu.l2 b11:b10, b3, b11:b10	;  p0' = d1*m3<<16 + d0*m3
||	shru.s2 b2, 16, b2		;  d1*m3 >> 16
||	subab.d2 b3, b3, b3		;  load 0

	addab.d1 a2, a11, a2		;  p2' = p2' + carry
||	addu.l2 b3:b2, b11, b3:b2	;  d1*m3>>16 + carry
||	mv.s2x a10, b11 		;  load d3*m3<<16 + d2*m3
||	mpy.m2 b9, 0, b9		;  load 0

	addu.l2 b3:b2, b11, b3:b2	;  p1' = d1*m3>>16+carry+d3*m3<<16+d2*m3
||	shl.s2 b10, 16, b11		;  p0' << 16
||	mpy.m2 b1, 0, b1		;  load 0

	add.s1x a2, b3, a2		;  p2' = p2' + carry
||	addu.l2 b9:b8, b11, b9:b8	;  P0 = p0 + p0'<<16
||	shru.s2 b10, 16, b10		;  p0' >> 16

	addu.l2 b1:b0, b9, b1:b0	;  p1 + carry
||	shl.s2 b2, 16, b9		;  p1' << 16
||	shl.s1 a2, 16, a2		;  p2' << 16

	addu.l2 b1:b0, b10, b1:b0	;  p1 + carry + p0'>>16
||	shru.s2 b2, 16, b2		;  p1' >> 16
||	add.l1 a0, a2, a0		;  p2 + p2'<<16
||	mpy.m2 b9, 0, b9		;  load 0

	addu.l2 b1:b0, b9, b1:b0	;  P1 = p1'<<16 + p1 + carry + p0'>>16
||	add.l1x a0, b2, a0		;  P2 = p2 + p2'<<16 + p1'>>16
||	mpy.m2 b1, 0, b1		;  load 0

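;  Combine the two passes: the second-pass words sit 32 bits above the
;  first-pass words, so fold the saved first-pass P1 (b12) and P2 (b13)
;  into them, propagating carries.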
	add.s1x a0, b1, a0		;  P2 = P2 + carry
||	addu.l2 b9:b8, b12, b9:b8	;  P0 + old P1

	addu.l2 b1:b0, b9, b1:b0	;  P1 + carry

	addu.l2 b1:b0, b13, b1:b0	;  P1 + carry + old P2
||	add.l1x b15, 4, a3		;  load sp + 1

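;  Epilogue: restore the saved registers, return through b7, and store the
;  upper three product words in the branch delay slots.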
	add.l2x a0, b1, b1		;  P2 + carry
||	ldw.d2 *++b15[1], b13		;  restore b13
||	ldw.d1 *++a3[1], a0		;  restore b11 (via a0, moved below)

	ldw.d2 *++b15[2], a11		;  restore a11
||	ldw.d1 *++a3[2], b10		;  restore b10
||	b.s2 b7 			;  return

	ldw.d2 *++b15[2], a10		;  restore a10
||	ldw.d1 *++a3[2], b12		;  restore b12

	stw.d1 b8, *+a7[2]		;  store P0 + old P1 (product[2])
||	addaw.d2 b15, 1, b15		;  pop stack

	stw.d1 b0, *+a7[1]		;  store P1 (product[1])

	stw.d1 b1, *a7			;  store P2 (product[0], MSW)
||	mv.l2x a0, b11			;  load b11

	nop
	.end