*******************************************************************************
*									      *
*  32x32 BIT MULTIPLY (SIGNED)						      *
*  01/30/07 (dkc)							      *
*									      *
*  This C64 subroutine does 32x32 bit signed multiplication.  The calling     *
*  sequence of the subroutine is:					      *
*									      *
*     multiplicand => a4						      *
*     multiplier => b4							      *
*     address of product (two words: MSW, then LSW) => a6		      *
*									      *
*******************************************************************************
	.def _mul32_32
	.text
;------------------------------------------------------------------------------
; _mul32_32 -- signed 32x32 -> 64-bit multiply
;
;   In:      a4 = multiplicand (A)
;            b4 = multiplier (B)
;            a6 = address of the two-word product
;            b3 = return address
;   Out:     *a6      = most significant word of the product
;            *+a6[1]  = least significant word of the product
;   Writes:  a0, a1, a4, a5, b0, b1, b2
;
;   Each operand is split into a signed high half and an unsigned low half:
;
;       A*B = (A_hi*B_hi << 32) + ((A_hi*B_lo + B_hi*A_lo) << 16) + A_lo*B_lo
;
;   The two cross terms are signed 32-bit values.  Their lower 16 bits are
;   shifted up and added into the low word; their upper bits (arithmetic
;   shift right by 16, so the sign is kept) are added into the high word.
;   Carries out of the low word are collected in a5 through the a5:a4
;   register pair and added to the high word last.
;
;   Instructions joined by "||" execute in the same cycle.  The schedule
;   relies on C6000 pipeline timing (16x16 multiply results are usable two
;   packets after issue; a branch takes effect after five delay slots), so
;   the order of the packets below must not be changed.
;------------------------------------------------------------------------------
_mul32_32:
; Packet 1: the two signed-high x unsigned-low cross products.
	mpyhslu.m1x a4, b4, a0		;  a0 = A_hi * B_lo (signed x unsigned)
||	mpyhslu.m2x b4, a4, b0		;  b0 = B_hi * A_lo (signed x unsigned)

; Packet 2: unsigned low x low and signed high x high products.
	mpyu.m1x a4, b4, a1		;  a1 = A_lo * B_lo (unsigned)
||	mpyh.m2x b4, a4, b1		;  b1 = A_hi * B_hi (signed)

; Packet 3: a0/b0 from packet 1 are now available.  a4 (the multiplicand) is
; no longer needed and becomes the low-word accumulator; a5 is cleared so the
; pair a5:a4 starts as a zero-extended value.  The return branch is issued
; here; the five packets that follow execute in its delay slots.
	shl.s1 a0, 16, a4		;  a4 = A_hi*B_lo << 16
||	zero.l1 a5			;  zero odd register of pair (carry = 0)
||	b.s2 b3 			;  return (takes effect after packet 8)

; Packet 4: a1/b1 from packet 2 are now available.  The addu reads a1 while
; it still holds A_lo*B_lo; the new a1 written by the parallel shl is only
; seen by the next packet.
	shl.s1x b0, 16, a1		;  a1 = B_hi*A_lo << 16 (for packet 5)
||	shr.s2x a0, 16, b2		;  b2 = A_hi*B_lo >> 16 (arithmetic)
||	addu.l1 a5:a4, a1, a5:a4	;  sum_lo = A_hi*B_lo<<16 + A_lo*B_lo

; Packet 5: a1 now holds B_hi*A_lo << 16.
	add.l2 b1, b2, b1		;  sum_hi = A_hi*B_lo>>16 + A_hi*B_hi
||	shr.s2 b0, 16, b0		;  b0 = B_hi*A_lo >> 16 (arithmetic)
||	addu.l1 a5:a4, a1, a5:a4	;  sum_lo = sum_lo + B_hi*A_lo<<16

; Packet 6: the low word is final; a5 holds the carries from both addu's.
	add.l2 b1, b0, b1		;  sum_hi = sum_hi + B_hi*A_lo>>16
||	stw.d1 a4, *+a6[1]		;  store LSW of product

; Packet 7: fold the low-word carries into the high word.
	add.l2x b1, a5, b1		;  sum_hi = sum_hi + carry

; Packet 8: last delay slot of the branch; control returns to b3 afterwards.
	stw.d1 b1, *a6			;  store MSW of product
	.end