*******************************************************************************
*									      *
*  SUBROUTINE #2							      *
*  03/06/09 (dkc)							      *
*									      *
*  This C64 subroutine performs the following function: 		      *
*									      *
*  div64_32(K, T, 3);			 // k/3 			      *
*  U[0]=T[0];								      *
*  U[1]=T[1];								      *
*  add64(T, U); 							      *
*  add64(T, U); 			 // (k/3)*3			      *
*  sub64(K, U); 			 // k-(k/3)*3			      *
*  if ((U[0]==0)&&(U[1]==0))						      *
*     goto askip;							      *
*									      *
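*  The calling sequence of the subroutine is:				      *
*									      *
*     K[0]=>a4 (most significant word)					      *
*     K[1]=>b4 (least significant word)					      *
*     address of product=>a6						      *
*									      *
*  On return, a4 is 1 if U[0]==0 and U[1]==0, otherwise 0.		      *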
*									      *
*******************************************************************************
	.global _subr2
	.global _mul64_64
	.text
*
*  _subr2 - test whether the 64-bit value K (a4 = K[0], high word;
*  b4 = K[1], low word) leaves a zero remainder on division by 3.
*  a4 returns 1 when the remainder is zero, otherwise 0.
*
*  Outline:
*    1. Branch to _mul64_64 with a6:b6 = 0x55555555:0x55555556
*	(3 * 0x5555555555555556 = 2**64 + 2) and b3 = askip, so the
*	multiply routine returns to askip.
*    2. At askip, read two words T[0], T[1] back from the product buffer.
*    3. Form B = 3*T by two carry-propagating additions.
*    4. Form -B (two's complement) and add it to K.
*    5. Return (both words of K - B are zero) in a4.
*
*  Registers written here: a0-a4, a6, a8, a9, a16, a17, b3, b6, b7, b16,
*  b17 (plus whatever _mul64_64 modifies).
*
*  NOTE(review): _mul64_64 is defined elsewhere.  This code presumes that it
*  takes its operands in a4:b4 and a6:b6, stores the product at the address
*  in a8, returns through b3 and leaves a16, a17, b16 and b17 untouched -
*  confirm against that routine.
*
*  The branch below is issued first; the five execute packets that follow
*  it occupy its delay slots and finish the call set-up.  All sources in an
*  execute packet are read before any result is written, so a8/a17 receive
*  the incoming a6 and b17 receives the incoming b3.
*
_subr2:
	b.s2 _mul64_64
||	mvkl.s1 0x55555555, a6	   ;  multiplier high word, lower half
||	mv.l1 a6, a8		   ;  copy buffer address to a8 (presumably the _mul64_64 argument)
||	addab.d1 a6, 0, a17	   ;  keep buffer address in a17 for use at askip

	mvkh.s1 0x55555555, a6	   ;  multiplier high word, upper half
||	mvkl.s2 0x55555556, b6	   ;  multiplier low word, lower half
||	mv.l1 a4, a16		   ;  save K[0]
||	mv.l2 b4, b16		   ;  save K[1]

	mvkh.s2 0x55555556, b6	   ;  multiplier low word, upper half

	mvkl.s2 askip, b3
||	mv.l2 b3, b17		   ;  save caller's return address; b3 now becomes askip

	mvkh.s2 askip, b3

	nop
*
*  Return point from _mul64_64.  The two words read here are treated as the
*  quotient T = K/3 of the header comment (presumably the upper 64 bits of
*  the product - confirm against _mul64_64).  Each load has four delay
*  slots: a0 is first used in the addu packet below, b6 one packet later.
*
askip	ldw.d1 *+a17[1], a0	   ;  a0 = T[1] (low word)

	ldw.d1 *a17, b6 	   ;  b6 = T[0] (high word)

	nop 2

	zero.s1 a1		   ;  clear upper part of long a1:a0

	addu.l1 a1:a0, a0, a1:a0   ;  a1:a0 = 2*T[1], carry in a1
||	mv.s1 a0, a2		   ;  a2 = T[1]

	and.l1 a1, 1, a3	   ;  a3 = carry out of 2*T[1]
||	zero.s1 a1		   ;  clear upper part of long again
||	add.l2 b6, b6, b7	   ;  b7 = 2*T[0]

	addu.l1 a1:a0, a2, a1:a0   ;  a1:a0 = low(2*T[1]) + T[1], carry in a1
||	add.l2 b7, b6, b7	   ;  b7 = 3*T[0]
||	mpy.m1 a9, 0, a9	   ;  a9 = 0 (ready after one delay slot)

	add.s2x b7, a3, b6	   ;  b6 = 3*T[0] + first carry
||	and.l1 a1, 1, a1	   ;  a1 = second carry
||	not.s1 a0, a8		   ;  a8 = not(B[1]), B[1] = low word of 3*T
||	mv.d1x b16, a0		   ;  a0 = K[1] (old a0 was read by the "not")

	add.l2x b6, a1, b6	   ;  b6 = B[0], high word of 3*T
||	mv.s1 a16, a6		   ;  a6 = K[0]
||	add.l1 a9:a8, 1, a9:a8	   ;  a8 = -B[1], carry out in a9

*
*  The return branch is issued here; the five execute packets after it are
*  its delay slots and complete the subtraction and the zero test.
*
	not.l2 b6, b6		   ;  b6 = not(B[0])
||	and.l1 a9, 1, a9	   ;  a9 = carry from negating the low word
||	zero.s1 a1		   ;  clear upper part of long a1:a0
||	b.s2 b17		   ;  return to caller (address saved in b17)

	addu.l1 a1:a0, a8, a1:a0   ;  a0 = K[1] - B[1], carry in a1
||	add.l2x b6, a9, b6	   ;  b6 = high word of -B

	and.s1 a1, 1, a1	   ;  a1 = carry out of the low-word sum
||	add.l2x b6, a6, b6	   ;  b6 = K[0] + high word of -B
||	cmpeq.l1 a0, 0, a2	   ;  a2 = (low word of K - B == 0)

	add.l1x b6, a1, a1	   ;  a1 = high word of K - B

	cmpeq.l1 a1, 0, a1	   ;  a1 = (high word of K - B == 0)

	and.l1 a1, a2, a4	   ;  a4 = 1 if both words are zero, else 0

       .end