*******************************************************************************
*									      *
*  N-WORD ADD								      *
*  04/05/10 (dkc)							      *
*									      *
*  This C64 subroutine does n-word addition.  The calling sequence of the     *
*  subroutine is;							      *
*									      *
*     address of augend -> a4						      *
*     address of addend -> b4						      *
*     n -> a6								      *
*									      *
*  n must be even and greater than 2.  (The "B" unit is not used yet, so the  *
*  subroutine is not very efficient.)					      *
*									      *
*******************************************************************************
	.global _addn
	.text
;  Adds the n-word augend to the n-word addend, two words per loop pass,
;  working from index n-1 down toward index 0.  The upper word of each
;  register-pair sum is the carry that is added into the next lower-indexed
;  word.  The store pointer (a6) is initialised from the addend pointer, so
;  the sum is written over the addend array.
;
;  Register use:
;     a4 = read pointer into the augend (A)
;     a5 = read pointer into the addend (B)
;     a6 = write pointer (n on entry, then the address of B[n-1])
;     a0 = loop counter used by bdec
;     a17:a16, a19:a18 = sums of the first/second word of each pair
;     a24, a26 = addend words of the first/second word of each pair
;
;  NOTE(review): the loads are issued ahead of the adds that consume them, so
;  it looks like a few words below index 0 of both arrays are read (and never
;  used) -- confirm that this over-read is acceptable to callers.
_addn:	mv.l1x b4, a5			;  a5 = address of B (addend)
||	sub.s1 a6, 1, a0		;  a0 = n-1

	addaw.d1 a4, a0, a4		;  a4 = address of A[n-1]

	addaw.d1 a5, a0, a5		;  a5 = address of B[n-1]
||	shru.s1 a6, 1, a0		;  a0 = n/2 (two words per loop pass)

	ldw.d1 *a4--, a16		;  load A[n-1]
||	mv.l1 a5, a6			;  a6 = store pointer, starts at B[n-1]
|| [a0] sub.s1 a0, 2, a0		;  decremented loop count
**
	ldw.d1 *a5--, a24		;  load B[n-1]

	ldw.d1 *a4--, a18		;  load A[n-2]

	ldw.d1 *a5--, a26		;  load B[n-2]
||	mpy.m1 a17, 0, a17		;  clear upper word of a17:a16

	nop				;  wait for the loads to arrive

	nop

	addu.l1 a17:a16, a24, a17:a16	;  A[n-1]+B[n-1], carry in a17
||	mpy.m1 a19, 0, a19		;  clear upper word of a19:a18
||	ldw.d1 *a4--, a16		;  load next A word (first of next pair)
*****************
*  begin loop	*
*****************
aloop:	bdec.s1 aloop, a0		;  decrement count, conditional branch to loop beginning
||	ldw.d1 *a5--, a24		;  load next B word (first of next pair)

	addu.l1 a19:a18, a26, a19:a18	;  second word of pair: A+B
||	ldw.d1 *a4--, a18		;  load next A word (second of next pair)

	mpy.m1 a17, 0, a17		;  clear carry (addu in this packet still reads the old a17)
||	addu.l1 a19:a18, a17, a19:a18	;  second word: add carry from the first word's sum
||	ldw.d1 *a5--, a26		;  load next B word (second of next pair)

	stw.d1 a16, *a6--		;  store sum of the first word of the pair

	addu.l1 a17:a16, a19, a17:a16	;  newly loaded A word + carry from the second word
||	stw.d1 a18, *a6--		;  store sum of the second word of the pair

	addu.l1 a17:a16, a24, a17:a16	;  add the matching B word, carry in a17
||	mpy.m1 a19, 0, a19		;  clear carry
||	ldw.d1 *a4--, a16		;  load next A word (first of next pair)
*****************
*  end loop	*
*****************
	b.s2 b3 			;  return

	nop 5				;  fill the branch delay slots
	.end
*******************************************************************************
*									      *
*  N-WORD NEGATE							      *
*  04/09/10 (dkc)							      *
*									      *
*  This C64 subroutine does n-word negation.  The calling sequence of the     *
*  subroutine is;							      *
*									      *
*     address of operand -> a4						      *
*     n -> b4								      *
*									      *
*  n must be greater than 1.  (The "B" unit is not used yet, so the subroutine*
*  is not very efficient.)						      *
*									      *
*******************************************************************************
	.global _negn
	.text
;  Negates the n-word operand in place (two's complement): every word is
;  inverted, 1 is added to the word at index n-1, and the carry is propagated
;  toward index 0.  The loop is two execute packets long, so several bdec
;  branches are issued before the loop label is reached.
;
;  Register use:
;     a4 = read pointer, a6 = write pointer (both start at A[n-1])
;     a0 = loop counter used by bdec
;     a8 = word just loaded
;     a24 = value added to the first word (1, cleared to 0 afterwards)
;     a17:a16, a19:a18, a21:a20 = register pairs carrying sums and carries
_negn:	sub.s1x b4, 1, a0		;  load n-1
||	mvk.d1 1, a24			;  load 1 (the +1 of the two's complement)
||	mpy.m1 a17, 0, a17		;  load 0 (upper word of a17:a16)
||	mvk.l1 0, a21			;  load 0 (upper word of a21:a20)

	addaw.d1 a4, a0, a4		;  load address of A[n-1]
|| [a0] sub.l1 a0, 1, a0		;  decremented loop count

	ldw.d1 *a4--, a8		;  load A[n-1]
||	mv.l1 a4, a6			;  save address of A[n-1] (store pointer)

	ldw.d1 *a4--, a8		;  load A[n-2]

	nop

	ldw.d1 *a4--, a8		;  load A[n-3]
||	bdec.s1 aloop, a0		;  conditional branch to loop beginning

	nop

	not.l1 a8, a16			;  invert A[n-1]
||	ldw.d1 *a4--, a8		;  load A[n-4]
||	bdec.s1 aloop, a0		;  conditional branch to loop beginning

	not.s1 a8, a20			;  invert A[n-2]
||	mpy.m1 a24, 0, a24		;  a24 = 0 for later passes (presumably lands after the first add has read the 1)
*****************
*  begin loop	*
*****************
aloop:	bdec.s1 aloop, a0		;  conditional branch to loop beginning
||	addu.l1 a17:a16, a24, a19:a18	;  result word = inverted word (+carry) + a24; a19 = carry out
||	ldw.d1 *a4--, a8		;  load next word

	addu.l1 a21:a20, a19, a17:a16	;  next inverted word + carry
||	not.s1 a8, a20			;  invert the word that has just arrived
||	stw.d1 a18, *a6--		;  store result word
*****************
*  end loop	*
*****************
	b.s2 b3 			;  return

	nop 5				;  fill the branch delay slots
	.end
*******************************************************************************
*									      *
*  N-WORD RIGHT SHIFT							      *
*  04/10/10 (dkc)							      *
*									      *
*  This C64 subroutine does an n-word right shift.  The calling sequence of   *
*  the subroutine is;							      *
*									      *
*     address of input data -> a4					      *
*     address of output data -> b4					      *
*     shift amount -> a6						      *
*     n -> b6								      *
*									      *
*  The shift amount must be less than or equal to 32*n.  The input and output *
*  addresses cannot be the same.					      *
*									      *
*******************************************************************************
	.global _rshiftn
	.text
;  Works in two stages, with r = shift mod 32:
;    1. aloop/askip: store shift/64 zero doublewords at the start of the
;       output, plus one more zero word when shift/32 is odd.
;    2. bloop: for the remaining n - shift/32 words, store
;       (input word >> r) | (previous input word << (32-r)).
;
;  Register use:
;     a4 = input pointer, b4 = output pointer (stage 1), a7 = output pointer
;          (stage 2, copied from b4)
;     b0 = zero-doubleword loop count, b2 = word loop count
;     a1 = shift/32, a2 = 1 when shift/32 is odd
;     a0 = r, a3 = 32-r
;     b17:b16 = 0
;     a17 = current word, a16 = unshifted copy, a18 = word being assembled
_rshiftn:
	shru.s2x a6, 6, b0		;  shift/64
||	mpy.m2 b17, 0, b17		;  load 0
||	shru.s1 a6, 5, a1		;  shift/32

	mpy.m2 b16, 0, b16		;  load 0
||	and.l1 a1, 1, a2		;  check if odd
||	shl.s1 a1, 5, a5		;  (shift/32)*32

  [!b0] b.s2 askip			;  no double-word shifts
|| [b0] subab.d2 b0, 1, b0		;  decremented loop count
||	sub.l2x b6, a1, b2		;  load n-shift/32
||	sub.l1 a6, a5, a0		;  remainder (shift mod 32)
||	mvk.s1 32, a3			;  load 32

;  The five packets below each issue one conditional branch for the
;  one-packet zero-store loop; they also follow the askip branch above, and
;  when b0 is 0 only the unconditional "32-remainder" subtract does anything.
   [b0] b.s2 aloop			;  conditional branch to loop beginning
|| [b0] sub.l2 b0, 1, b0		;  decrement loop count

   [b0] b.s2 aloop			;  conditional branch to loop beginning
|| [b0] sub.l2 b0, 1, b0		;  decrement loop count

   [b0] b.s2 aloop			;  conditional branch to loop beginning
|| [b0] sub.l2 b0, 1, b0		;  decrement loop count
||	sub.l1 a3, a0, a3		;  32-remainder

   [b0] b.s2 aloop			;  conditional branch to loop beginning
|| [b0] sub.l2 b0, 1, b0		;  decrement loop count

   [b0] b.s2 aloop			;  conditional branch to loop beginning
|| [b0] sub.l2 b0, 1, b0		;  decrement loop count
*****************
*  begin loop	*
*****************
aloop	stndw.d2 b17:b16, *b4++ 	;  store a zero doubleword
|| [b0] b.s2 aloop			;  conditional branch to loop beginning
|| [b0] sub.l2 b0, 1, b0		;  decrement loop count
*****************
*  end loop	*
*****************
askip:
   [b2] ldw.d1 *a4++, a17		;  load first input word
|| [a2] stw.d2 b16, *b4++		;  store one more zero word if shift/32 is odd
||[!b2] b.s2 b3 			;  return if no words remain to be shifted
|| [b2] sub.l2 b2, 1, b2		;  decrement loop count

	nop 5				;  branch delay slots / wait for the load

   [b2] b.s2 bloop			;  conditional branch to loop beginning
|| [b2] sub.l2 b2, 1, b2		;  decrement loop count

	ldw.d1 *a4++, a17		;  load second input word
||	mv.l1x b4, a7			;  a7 = output address (continues after the zeros)

	mvk.s1 0, a18			;  no bits carried into the first shifted word
*****************
*  begin loop	*
*****************
bloop	shru.s1 a17, a0, a17		;  right-shift data by the remainder
||	mv.l1 a17, a16			;  save unshifted data
|| [b2] b.s2 bloop			;  conditional branch to loop beginning
|| [b2] sub.l2 b2, 1, b2		;  decrement loop count

	or.l1 a18, a17, a18		;  merge with the bits carried from the previous word
||	ldw.d1 *a4++, a17		;  load next input word

	stw.d1 a18, *a7++		;  store output word
||	shl.s1 a16, a3, a18		;  bits shifted out of this word, carried into the next
*****************
*  end loop	*
*****************
	b.s2 b3 			;  return

	nop 5				;  fill the branch delay slots
	.end
*******************************************************************************
*									      *
*  N-WORD LEFT SHIFT							      *
*  04/10/10 (dkc)							      *
*									      *
*  This C64 subroutine does an n-word left shift.  The calling sequence of    *
*  the subroutine is;							      *
*									      *
*     address of input data -> a4					      *
*     address of output data -> b4					      *
*     shift amount -> a6						      *
*     n -> b6								      *
*									      *
*  The shift amount must be less than or equal to 32*n.  The input and output *
*  addresses cannot be the same.					      *
*									      *
*******************************************************************************
	.global _lshiftn
	.text
;  Works from the high-index end of both arrays downward, in two stages, with
;  r = shift mod 32:
;    1. aloop/askip: store shift/64 zero doublewords at the end of the
;       output (pre-decrementing from the address of B[n]), plus one more
;       zero word when shift/32 is odd.
;    2. bloop: for the remaining n - shift/32 words, starting at A[n-1],
;       store (input word << r) | (previous input word >> (32-r)).
;
;  Register use:
;     a4 = input pointer, b4 = output pointer (stage 1), a7 = output pointer
;          (stage 2, copied from b4)
;     b0 = zero-doubleword loop count, b2 = word loop count
;     a1 = shift/32, a2 = 1 when shift/32 is odd
;     a0 = r, a3 = n-1 at first, then 32-r
;     b17:b16 = 0
;     a17 = current word, a16 = unshifted copy, a18 = word being assembled
_lshiftn:
	shru.s2x a6, 6, b0		;  shift/64
||	mpy.m2 b17, 0, b17		;  load 0
||	shru.s1 a6, 5, a1		;  shift/32
||	addaw.d2 b4, b6, b4		;  address of B[n]
||	sub.l1x b6, 1, a3		;  load n-1

	mpy.m2 b16, 0, b16		;  load 0
||	and.l1 a1, 1, a2		;  check if odd
||	shl.s1 a1, 5, a5		;  (shift/32)*32
||	addaw.d1 a4, a3, a4		;  address of A[n-1]

  [!b0] b.s2 askip			;  no double-word shifts
|| [b0] subab.d2 b0, 1, b0		;  decremented loop count
||	sub.l2x b6, a1, b2		;  load n-shift/32
||	sub.l1 a6, a5, a0		;  remainder (shift mod 32)
||	mvk.s1 32, a3			;  load 32

;  The five packets below each issue one conditional branch for the
;  one-packet zero-store loop; they also follow the askip branch above, and
;  when b0 is 0 only the unconditional "32-remainder" subtract does anything.
   [b0] b.s2 aloop			;  conditional branch to loop beginning
|| [b0] sub.l2 b0, 1, b0		;  decrement loop count

   [b0] b.s2 aloop			;  conditional branch to loop beginning
|| [b0] sub.l2 b0, 1, b0		;  decrement loop count

   [b0] b.s2 aloop			;  conditional branch to loop beginning
|| [b0] sub.l2 b0, 1, b0		;  decrement loop count
||	sub.l1 a3, a0, a3		;  32-remainder

   [b0] b.s2 aloop			;  conditional branch to loop beginning
|| [b0] sub.l2 b0, 1, b0		;  decrement loop count

   [b0] b.s2 aloop			;  conditional branch to loop beginning
|| [b0] sub.l2 b0, 1, b0		;  decrement loop count
*****************
*  begin loop	*
*****************
aloop	stndw.d2 b17:b16, *--b4 	;  store a zero doubleword
|| [b0] b.s2 aloop			;  conditional branch to loop beginning
|| [b0] sub.l2 b0, 1, b0		;  decrement loop count
*****************
*  end loop	*
*****************
askip:
   [b2] ldw.d1 *a4--, a17		;  load first input word (A[n-1])
|| [a2] stw.d2 b16, *--b4		;  store one more zero word if shift/32 is odd
||[!b2] b.s2 b3 			;  return if no words remain to be shifted
|| [b2] sub.l2 b2, 1, b2		;  decrement loop count

	nop 5				;  branch delay slots / wait for the load

   [b2] b.s2 bloop			;  conditional branch to loop beginning
|| [b2] sub.l2 b2, 1, b2		;  decrement loop count

	ldw.d1 *a4--, a17		;  load second input word
||	mv.l1x b4, a7			;  a7 = output address (continues below the zeros)

	mvk.s1 0, a18			;  no bits carried into the first shifted word
*****************
*  begin loop	*
*****************
bloop	shl.s1 a17, a0, a17		;  left-shift data by the remainder
||	mv.l1 a17, a16			;  save unshifted data
|| [b2] b.s2 bloop			;  conditional branch to loop beginning
|| [b2] sub.l2 b2, 1, b2		;  decrement loop count

	or.l1 a18, a17, a18		;  merge with the bits carried from the previous word
||	ldw.d1 *a4--, a17		;  load next input word

	stw.d1 a18, *--a7		;  store output word
||	shru.s1 a16, a3, a18		;  bits shifted out of this word, carried into the next
*****************
*  end loop	*
*****************
	b.s2 b3 			;  return

	nop 5				;  fill the branch delay slots
	.end
*******************************************************************************
*									      *
*  N-WORD NORMALIZATION 						      *
*  04/09/10 (dkc)							      *
*									      *
*  This C64 subroutine does an n-word normalization.  The calling sequence of *
*  the subroutine is;							      *
*									      *
*     address of operand -> a4						      *
*     n -> b4								      *
*									      *
*  The n-word must be non-zero. 					      *
*									      *
*******************************************************************************
	.global _normn
	.text
;  Scans the operand one doubleword at a time, starting at the given address,
;  until a non-zero doubleword is found.  That doubleword is then reloaded
;  and a4 is left holding
;      64*(index of the doubleword)
;    + 32, if a0 (the even register of the reloaded pair) is zero
;    + the lmbd result for a1 (when a0 is zero) or for a0 (otherwise).
;
;  NOTE: n (b4) is never read; b4 is overwritten with the operand address in
;  the first packet.  The scan is ended only by a non-zero doubleword, which
;  is why the operand must be non-zero.
;  NOTE(review): treating a0 as the more significant word of the pair
;  presumably depends on the device's endian mode -- confirm.
;
;  Register use:
;     a4 = read pointer, b4 = saved operand address
;     a2 = "found" flag (OR of the pair under test), later the constant 32
;     b5 = running doubleword index, b6 = index of the pair under test
;     a1:a0 = loaded doubleword, a7 = lmbd result
_normn: zero.l1 a2		     ;	clear "found" flag
||	mv.l2x a4, b4		     ;	save address of input (overwrites n)
||	mvk.s2 0, b5		     ;	clear doubleword index

	ldndw.d1 *a4++, a1:a0	     ;	load data
||[!a2] b.s1 aloop		     ;	conditional branch to loop beginning

	ldndw.d1 *a4++, a1:a0	     ;	load data
||[!a2] b.s1 aloop		     ;	conditional branch to loop beginning

	ldndw.d1 *a4++, a1:a0	     ;	load data
||[!a2] b.s1 aloop		     ;	conditional branch to loop beginning

	ldndw.d1 *a4++, a1:a0	     ;	load data
||[!a2] b.s1 aloop		     ;	conditional branch to loop beginning

	ldndw.d1 *a4++, a1:a0	     ;	load data
||[!a2] b.s1 aloop		     ;	conditional branch to loop beginning
*****************
*  begin loop	*
*****************
;  Every instruction in the loop packet is predicated on !a2, so once a
;  non-zero doubleword has set a2 the packet no longer changes b6.
aloop:
  [!a2] ldndw.d1 *a4++, a1:a0	     ;	load data
||[!a2] b.s1 aloop		     ;	conditional branch to loop beginning
||[!a2] or.l1 a0, a1, a2	     ;	a2 = non-zero when the pair is non-zero
||[!a2] mv.s2 b5, b6		     ;	save index of the pair under test
||[!a2] add.l2 b5, 1, b5	     ;	increment index
*****************
*  end loop	*
*****************
	ldndw.d2 *+b4[b6], a1:a0     ;	reload the doubleword at the saved index
||	shl.s2 b6, 6, b8	     ;	index*64
||	mvk.s1 32, a2		     ;	load 32

	nop

	b.s2 b3 		     ;	return (the packets below follow in its delay slots)
||	mv.s1x b8, a4		     ;	a4 = index*64

	nop 2			     ;	wait for the reloaded doubleword

  [!a0] lmbd.l1 1, a1, a7	     ;	a0 is zero: left-most bit detection on a1
||[!a0] add.s1 a4, a2, a4	     ;	a0 is zero: add 32 to the count

   [a0] lmbd.l1 1, a0, a7	     ;	a0 is non-zero: left-most bit detection on a0

	add.l1 a4, a7, a4	     ;	a4 = total count
	.end
*******************************************************************************
*									      *
*  N-WORD "OR"                                                                *
*  04/09/10 (dkc)							      *
*									      *
*  This C64 subroutine does an n-word "or".  The calling sequence of the      *
*  subroutine is;							      *
*									      *
*     address of operand -> a4						      *
*     n -> b4								      *
*									      *
*  n must be even and greater than 2.  (The "B" unit is not used yet, so the  *
*  subroutine is not very efficient.)					      *
*									      *
*******************************************************************************
	.global _orn
	.text
;  ORs the words of the operand together, one doubleword per loop pass, and
;  leaves the result in a4 on return (zero only if every word that was ORed
;  is zero).  The loop is a single execute packet, so five load/bdec packets
;  are issued before the loop label is reached.
;
;  Register use:
;     a4 = read pointer (overwritten with the result on return)
;     b0 = loop counter used by bdec (n/2, pre-decremented by 2)
;     a17:a16 = loaded doubleword
;     a19 = running OR of the a17 values, a18 = running OR of the a16 values
_orn:	shru.s2 b4, 1, b0		;  load n/2
||	mvk.l1 0, a19			;  clear running OR of the odd registers
||	mvk.d1 0, a18			;  clear running OR of the even registers

   [b0] sub.l2 b0, 2, b0		;  decremented loop count

	ldndw.d1 *a4++, a17:a16 	;  load data
||	bdec.s2 aloop, b0		;  conditional branch to loop beginning

	ldndw.d1 *a4++, a17:a16 	;  load data
||	bdec.s2 aloop, b0		;  conditional branch to loop beginning

	ldndw.d1 *a4++, a17:a16 	;  load data
||	bdec.s2 aloop, b0		;  conditional branch to loop beginning

	ldndw.d1 *a4++, a17:a16 	;  load data
||	bdec.s2 aloop, b0		;  conditional branch to loop beginning

	ldndw.d1 *a4++, a17:a16 	;  load data
||	bdec.s2 aloop, b0		;  conditional branch to loop beginning
*****************
*  begin loop	*
*****************
aloop:	bdec.s2 aloop, b0		;  conditional branch to loop beginning
||	or.l1 a19, a17, a19		;  fold a17 into the running OR
||	or.s1 a18, a16, a18		;  fold a16 into the running OR
||	ldndw.d1 *a4++, a17:a16 	;  load next doubleword
*****************
*  end loop	*
*****************
	b.s2 b3 			;  return
||	or.l1 a19, a18, a4		;  a4 = OR of the two partial ORs

	nop 5				;  fill the branch delay slots
	.end
*******************************************************************************
*									      *
*  N-WORD COPY								      *
*  04/09/10 (dkc)							      *
*									      *
*  This C64 subroutine does an n-word copy.  The calling sequence of the      *
*  subroutine is;							      *
*									      *
*     address of input data -> a4					      *
*     address of output data -> b4					      *
*     n -> a6								      *
*									      *
*  n must be even and greater than 2.					      *
*									      *
*******************************************************************************
	.global _copyn
	.text
;  Copies the input to the output one doubleword at a time using
;  non-aligned doubleword loads and stores.  The loop is two execute packets
;  long (store + branch, then load); three loads and two bdec branches are
;  issued before the loop label is reached.
;
;  Register use:
;     a4 = read pointer
;     a6 = n on entry, then the write pointer (copied from b4)
;     a0 = loop counter used by bdec (n/2, pre-decremented by 2)
;     a17:a16 = doubleword in flight
_copyn: shru.s1 a6, 1, a0		;  load n/2
||	mv.l1x b4, a6			;  a6 = address of output (n has been read in this packet)

	ldndw.d1 *a4++, a17:a16 	;  load data
|| [a0] sub.l1 a0, 2, a0		;  decremented loop count

	bdec.s1 aloop, a0		;  conditional branch to loop beginning

	ldndw.d1 *a4++, a17:a16 	;  load data

	bdec.s1 aloop, a0		;  conditional branch to loop beginning

	ldndw.d1 *a4++, a17:a16 	;  load data
*****************
*  begin loop	*
*****************
aloop:	stndw.d1 a17:a16, *a6++ 	;  store the doubleword that has arrived
||	bdec.s1 aloop, a0		;  conditional branch to loop beginning

	ldndw.d1 *a4++, a17:a16 	;  load a later doubleword
*****************
*  end loop	*
*****************
	b.s2 b3 			;  return

	nop 5				;  fill the branch delay slots
	.end
*******************************************************************************
*									      *
*  N-WORD CLEAR (AND SET OF LAST WORD)					      *
*  04/09/10 (dkc)							      *
*									      *
*  This C64 subroutine clears n words and then stores the given data in the   *
*  last word.  The calling sequence of the subroutine is;		      *
*									      *
*     address of operand -> a4						      *
*     data for last word -> b4						      *
*     n -> a6								      *
*									      *
*  n must be even and greater than 2.					      *
*									      *
*******************************************************************************
	.global _setn
	.text
;  Stores zero doublewords through a4, then overwrites the last word written
;  (the word at a4[-1] once the loop has finished) with the data passed in
;  b4.  The loop is a single execute packet, so five bdec branches are
;  issued before the loop label is reached.
;
;  Register use:
;     a4 = write pointer
;     a5 = data for the last word
;     a0 = loop counter used by bdec (n/2, pre-decremented by 2)
;     a17:a16 = 0
_setn:	shru.s1 a6, 1, a0		;  load n/2
||	mv.l1x b4, a5			;  load data for the last word
||	mpy.m1 a17, 0, a17		;  load 0

   [a0] sub.l1 a0, 2, a0		;  decremented loop count
||	mvk.d1 0, a16			;  load 0

	bdec.s1 aloop, a0		;  conditional branch to loop beginning

	bdec.s1 aloop, a0		;  conditional branch to loop beginning

	bdec.s1 aloop, a0		;  conditional branch to loop beginning

	bdec.s1 aloop, a0		;  conditional branch to loop beginning

	bdec.s1 aloop, a0		;  conditional branch to loop beginning
*****************
*  begin loop	*
*****************
aloop:	bdec.s1 aloop, a0		;  conditional branch to loop beginning
||	stndw.d1 a17:a16, *a4++ 	;  store a zero doubleword
*****************
*  end loop	*
*****************
	b.s2 b3 			;  return
||	stw.d1 a5, *-a4[1]		;  store data in the word just below a4 (the last word)

	nop 5				;  fill the branch delay slots
	.end
*******************************************************************************
*									      *
*  N-WORD TRIPLE							      *
*  04/09/10 (dkc)							      *
*									      *
*  This C64 subroutine triples an n-word.  The calling sequence of the	      *
*  subroutine is;							      *
*									      *
*     address of operand -> a4						      *
*     n -> b4								      *
*									      *
*  n must be greater than 1.  (The "B" unit is not used yet, so the subroutine*
*  is not very efficient.)						      *
*									      *
*******************************************************************************
	.global _mult3
	.text
;  Triples the n-word operand in place.  Each word's 3*A is formed as
;  A+A+A in the register pair a19:a18, working from index n-1 toward
;  index 0; the upper word of each finished sum is saved in a20 and added
;  into the next lower-indexed word.
;
;  The first pass through the loop differs from later passes:
;     first pass:   a18 = A, a20 = A   ->  A+A, then +A
;     later passes: a19:a18 = A+A, a20 = carry  ->  +A, then +carry
;
;  Register use:
;     a4 = read pointer, a6 = write pointer (both start at A[n-1])
;     a0 = loop counter used by bdec
;     a17:a16 = loaded word with a cleared upper word
_mult3: sub.s1x b4, 1, a0		;  load n-1
||	mpy.m1 a17, 0, a17		;  load 0 (upper word of a17:a16)

	addaw.d1 a4, a0, a4		;  load address of A[n-1]
|| [a0] sub.l1 a0, 1, a0		;  decremented loop count

	ldw.d1 *a4--, a16		;  load A[n-1]
||	mv.l1 a4, a6			;  save address of A[n-1] (store pointer)

	nop 2

	ldw.d1 *a4--, a16		;  load A[n-2]
||	bdec.s1 aloop, a0		;  conditional branch to loop beginning

	mpy.m1 a19, 0, a19		;  clear upper word

	mv.l1 a16, a18			;  a18 = A[n-1] (first term)
||	mv.s1 a16, a20			;  a20 = A[n-1] (third term on the first pass)
*****************
*  begin loop	*
*****************
aloop	addu.l1 a19:a18, a16, a19:a18	;  add the word again (A+A first pass, A+A+A later)
||	mpy.m1 a17, 0, a17		;  clear upper word
||	ldw.d1 *a4--, a16		;  load a later word
||	bdec.s1 aloop, a0		;  conditional branch to loop beginning

	addu.l1 a19:a18, a20, a19:a18	;  add a20: third term (first pass) or carry (later)

	addu.l1 a17:a16, a16, a19:a18	;  start the next word: A+A
||	rotl.m1 a19, 0, a20		;  save carry (upper word of the finished sum)
||	stw.d1 a18, *a6--		;  store the finished word
*****************
*  end loop	*
*****************
	b.s2 b3 			;  return

	nop 5				;  fill the branch delay slots
	.end
*******************************************************************************
*									      *
*  N-WORD TRIPLE AND RIGHT-SHIFT					      *
*  04/09/10 (dkc)							      *
*									      *
*  This C64 subroutine triples and right-shifts n-words.  The calling	      *
*  sequence of the subroutine is;					      *
*									      *
*     address of operand -> a4						      *
*     shift amount -> b4						      *
*     n -> a6								      *
*									      *
*  n must be even and greater than 2.  The shift amount must be less than or  *
*  equal to 32.  (The "B" unit is not used yet, so the subroutine is not very *
*  efficient.)								      *
*									      *
*******************************************************************************
	.global _mul3shft
	.text
;  Triples the n-word operand and right-shifts the result, in place, two
;  words per loop pass, working from index n-1 toward index 0.
;
;  With T[k] = low word of (3*A[k] + carry from T[k+1]) and s = shift, the
;  word stored at index k is
;      (T[k] >> s) | (T[k-1] << (32-s))
;  and the word stored at index 0 is T[0] >> s (the carry out of T[0] is
;  not kept).
;
;  The two stores inside the loop are predicated on the flag a2 so that
;  nothing is stored during the first pass, when no finished words exist
;  yet; the last two words are stored after the loop.
;
;  Register use:
;     a4 = read pointer, a8 = write pointer (both start at A[n-1])
;     a6 = n on entry, then the shift; a7 = 32-shift
;     b0 = loop counter used by bdec (n/2, pre-decremented by 2)
;     a2 = first-pass flag
;     a19:a18 = sum being formed, a17:a16 = loaded word with cleared upper word
;     a30 = third addend for the very first word, thereafter the saved carry
;     a20, a21, a24, a25 = shifted pieces being merged
_mul3shft:
	sub.l1 a6, 1, a0		;  load n-1
||	mvk.s2 32, b7			;  load 32
||	mvk.d1 1, a2			;  set first-pass flag
||	shru.s1 a6, 1, a1		;  load n/2
||	mpy.m1 a17, 0, a17		;  load 0

	addaw.d1 a4, a0, a4		;  load address of A[n-1]
||	sub.s2 b7, b4, b7		;  32-shift
||	mv.l1x b4, a6			;  load shift
|| [a1] sub.s1 a1, 2, a1		;  decremented loop count

	ldw.d1 *a4--, a16		;  load A[n-1]
||	mv.l1 a4, a8			;  save address of A[n-1] (store pointer)

	mv.l2x a1, b0			;  load decremented loop count
||	mv.l1x b7, a7			;  load 32-shift

	nop

	ldw.d1 *a4--, a16		;  load A[n-2]

	mpy.m1 a19, 0, a19		;  load 0

	mv.l1 a16, a18			;  a18 = A[n-1] (first term)
||	mv.s1 a16, a30			;  a30 = A[n-1] (third term on the first pass)
*****************
*  begin loop	*
*****************
aloop	addu.l1 a19:a18, a16, a19:a18	;  first word: add A again
||	or.s1 a21, a20, a21		;  merge pieces of the previous pass's first output word
||	mpy.m1 a17, 0, a17		;  clear upper word
||	ldw.d1 *a4--, a16		;  load a later word
||	bdec.s2 aloop, b0		;  conditional branch to loop beginning

	addu.l1 a19:a18, a30, a19:a18	;  first word: add a30 (third term on first pass, carry later)
||	shru.s1 a24, a6, a24		;  right-shift the previous pass's second tripled word
||[!a2] stw.d1 a21, *a8--		;  store first output word (skipped on the first pass)

	addu.l1 a17:a16, a16, a19:a18	;  second word: A+A
||	rotl.m1 a18, 0, a20		;  save the finished first tripled word
||	shl.s1 a18, a7, a25		;  left-shift the finished first tripled word by 32-shift
||	mv.d1 a19, a30			;  save carry

	addu.l1 a19:a18, a16, a19:a18	;  second word: A+A+A
||	or.s1 a25, a24, a25		;  merge pieces of the previous pass's second output word
||	mpy.m1 a17, 0, a17		;  clear upper word
||	ldw.d1 *a4--, a16		;  load a later word

	addu.l1 a19:a18, a30, a19:a18	;  second word: add carry
||[!a2] stw.d1 a25, *a8--		;  store second output word (skipped on the first pass)
||	shru.s1 a20, a6, a20		;  right-shift the saved first tripled word
||	mpy.m1 a2, 0, a2		;  clear first-pass flag

	addu.l1 a17:a16, a16, a19:a18	;  next pass's first word: A+A
||	shl.s1 a18, a7, a21		;  left-shift the finished second tripled word by 32-shift
||	rotl.m1 a18, 0, a24		;  save the finished second tripled word
||	mv.d1 a19, a30			;  save carry
*****************
*  end loop	*
*****************
;  The last two output words are finished in the delay slots of the return.
	b.s2 b3 			;  return
||	or.s1 a21, a20, a21		;  merge pieces of the next-to-last output word

	shru.s1 a24, a6, a24		;  right-shift the last tripled word
||	stw.d1 a21, *a8--		;  store next-to-last output word

	stw.d1 a24, *a8--		;  store last output word

	nop 3				;  remaining branch delay slots
	.end
*******************************************************************************
*									      *
*  N-WORD TRIPLE AND RIGHT-SHIFT (with carry-in and carry-out)		      *
*  04/09/10 (dkc)							      *
*									      *
*  This C64 subroutine triples and right-shifts n-words.  The calling	      *
*  sequence of the subroutine is;					      *
*									      *
*     address of operand -> a4						      *
*     shift amount -> b4						      *
*     carry-in -> a6							      *
*     n -> b6								      *
*									      *
*  n must be even and greater than 2.  The shift amount must be less than or  *
*  equal to 32.  The carry-out is returned.  (The "B" unit is not used yet,   *
*  so the subroutine is not very efficient.)				      *
*									      *
*******************************************************************************
	.global _mul3shfc
	.text
;  Triples the n-word operand, adds the carry-in to the word at index n-1,
;  and right-shifts the result, in place, two words per loop pass, working
;  from index n-1 toward index 0.
;
;  With T[k] = low word of (3*A[k] + carry from T[k+1]) and s = shift, the
;  word stored at index k is
;      (T[k] >> s) | (T[k-1] << (32-s)).
;  On return a4 holds the value saved in a30: the upper word of the last
;  sum completed inside the loop.
;
;  NOTE(review): the epilogue triples one more word, loaded from just below
;  the operand's first word, and shifts its low bits into the last word
;  stored.  Presumably the operand is a slice of a longer number and that
;  word belongs to the next slice -- confirm with the callers.
;
;  Register use:
;     a4 = read pointer, a8 = write pointer (both start at A[n-1])
;     a29 = carry-in
;     a6 = carry-in on entry, then n, then the shift; a7 = 32-shift
;     b0 = loop counter used by bdec (n/2, pre-decremented by 2)
;     a2 = first-pass flag (suppresses the two stores in the first pass)
;     a19:a18 = sum being formed, a17:a16 = loaded word with cleared upper word
;     a30 = third addend for the very first word, thereafter the saved carry
;     a20, a21, a24, a25 = shifted pieces being merged
_mul3shfc:
	mv.l1x b6, a6			;  load n
||	mv.s1 a6, a29			;  save carry-in

	sub.l1 a6, 1, a0		;  load n-1
||	mvk.s2 32, b7			;  load 32
||	mvk.d1 1, a2			;  set first-pass flag
||	shru.s1 a6, 1, a1		;  load n/2
||	mpy.m1 a17, 0, a17		;  load 0

	addaw.d1 a4, a0, a4		;  load address of A[n-1]
||	sub.s2 b7, b4, b7		;  32-shift
||	mv.l1x b4, a6			;  load shift
|| [a1] sub.s1 a1, 2, a1		;  decremented loop count

	ldw.d1 *a4--, a16		;  load A[n-1]
||	mv.l1 a4, a8			;  save address of A[n-1] (store pointer)

	mv.l2x a1, b0			;  load decremented loop count
||	mv.l1x b7, a7			;  load 32-shift

	nop 2

	ldw.d1 *a4--, a16		;  load A[n-2]
||	mpy.m1 a19, 0, a19		;  load 0

	mv.l1 a16, a18			;  a18 = A[n-1] (first term)
||	mv.s1 a16, a30			;  a30 = A[n-1] (third term on the first pass)

	addu.l1 a19:a18, a29, a19:a18	;  A[n-1]+carry-in
*****************
*  begin loop	*
*****************
aloop	addu.l1 a19:a18, a16, a19:a18	;  first word: add A again
||	or.s1 a21, a20, a21		;  merge pieces of the previous pass's first output word
||	mpy.m1 a17, 0, a17		;  clear upper word
||	ldw.d1 *a4--, a16		;  load a later word
||	bdec.s2 aloop, b0		;  conditional branch to loop beginning

	addu.l1 a19:a18, a30, a19:a18	;  first word: add a30 (third term on first pass, carry later)
||	shru.s1 a24, a6, a24		;  right-shift the previous pass's second tripled word
||[!a2] stw.d1 a21, *a8--		;  store first output word (skipped on the first pass)

	addu.l1 a17:a16, a16, a19:a18	;  second word: A+A
||	rotl.m1 a18, 0, a20		;  save the finished first tripled word
||	shl.s1 a18, a7, a25		;  left-shift the finished first tripled word by 32-shift
||	mv.d1 a19, a30			;  save carry

	addu.l1 a19:a18, a16, a19:a18	;  second word: A+A+A
||	or.s1 a25, a24, a25		;  merge pieces of the previous pass's second output word
||	mpy.m1 a17, 0, a17		;  clear upper word
||	ldw.d1 *a4--, a16		;  load a later word

	addu.l1 a19:a18, a30, a19:a18	;  second word: add carry
||[!a2] stw.d1 a25, *a8--		;  store second output word (skipped on the first pass)
||	shru.s1 a20, a6, a20		;  right-shift the saved first tripled word
||	mpy.m1 a2, 0, a2		;  clear first-pass flag

	addu.l1 a17:a16, a16, a19:a18	;  next first word: A+A
||	shl.s1 a18, a7, a21		;  left-shift the finished second tripled word by 32-shift
||	rotl.m1 a18, 0, a24		;  save the finished second tripled word
||	mv.d1 a19, a30			;  save carry
*****************
*  end loop	*
*****************
;  The last two output words are finished in the delay slots of the return.
	b.s2 b3 			;  return
||	addu.l1 a19:a18, a16, a19:a18	;  extra word: A+A+A
||	or.s1 a21, a20, a21		;  merge pieces of the next-to-last output word

	addu.l1 a19:a18, a30, a19:a18	;  extra word: add carry
||	shru.s1 a24, a6, a24		;  right-shift the last tripled word of the operand
||	stw.d1 a21, *a8--		;  store next-to-last output word

	shl.s1 a18, a7, a25		;  left-shift the tripled extra word by 32-shift
||	mv.l1 a30, a4			;  return carry

	or.s1 a25, a24, a25		;  merge pieces of the last output word

	stw.d1 a25, *a8--		;  store last output word

	nop				;  remaining branch delay slot
	.end
*******************************************************************************
*									      *
*  N-WORD TRIPLE AND RIGHT-SHIFT (with carry-in, carry-out, and copy)	      *
*  04/09/10 (dkc)							      *
*									      *
*  This C64 subroutine triples and right-shifts n-words.  The calling	      *
*  sequence of the subroutine is;					      *
*									      *
*     address of operand -> a4						      *
*     shift amount -> b4						      *
*     carry-in -> a6							      *
*     source address (for copy) -> b6					      *
*     destination address (for copy) -> a8				      *
*     address of output -> b8						      *
*     n -> a10								      *
*									      *
*  n must be even and greater than 2.  The shift amount must be less than or  *
*  equal to 32.  The carry-out is returned.				      *
*									      *
*******************************************************************************
	.global _mul3scpy
	.text
;  Triples the n-word operand, adds the carry-in to the word at index n-1,
;  right-shifts the result and writes it to the separate output array with
;  doubleword stores.  In the same loop the B side copies doublewords from
;  the copy source (b6) to the copy destination (a8 on entry, moved to b8).
;  On return a4 holds the value saved in a30 (the carry).
;
;  With T[k] = low word of (3*A[k] + carry from T[k+1]) and s = shift, the
;  output word at index k is (T[k] >> s) | (T[k-1] << (32-s)).
;
;  NOTE(review): as the loads run one word past the operand's first word,
;  the epilogue triples a word loaded from just below A[0] and shifts its
;  low bits into the last output word.  Presumably the operand is a slice
;  of a longer number -- confirm with the callers.
;  NOTE(review): which register of a27:a26 receives the higher-indexed word
;  depends on the device's endian mode; the code uses a27 for the second
;  word of a pass and a26 for the first word of the following pass.
;
;  Register use:
;     a4 = read pointer into the operand
;     a8 = output write pointer: (output address - 4) + 4*(n-1), i.e. the
;          doubleword holding output words n-2 and n-1
;     b6 = copy read pointer, b8 = copy write pointer
;     a29 = carry-in (a29:a28 is reused as a temporary inside the loop)
;     a6 = carry-in on entry, then n, then the shift; a7 = 32-shift
;     b0 = loop counter used by bdec (n/2, pre-decremented by 2)
;     a2 = first-pass flag (suppresses the output store in the first pass)
;     a23:a22 = pair of output words, b17:b16 = doubleword being copied
_mul3scpy:
	mv.d1 a10, a6			;  load n
||	mv.s1 a6, a29			;  save carry-in
||	mv.l2x a8, b8			;  b8 = destination address (for copy)
||	sub.l1x b8, 4, a8		;  a8 = output address - 4

	sub.l1 a6, 1, a0		;  load n-1
||	mvk.s2 32, b7			;  load 32
||	mvk.d1 1, a2			;  set first-pass flag
||	shru.s1 a6, 1, a1		;  load n/2
||	mpy.m1 a17, 0, a17		;  load 0

	addaw.d1 a4, a0, a4		;  load address of A[n-1]
||	sub.s2 b7, b4, b7		;  32-shift
||	mv.l1x b4, a6			;  load shift
|| [a1] sub.s1 a1, 2, a1		;  decremented loop count

	ldw.d1 *a4--, a26		;  load A[n-1]

	mv.l2x a1, b0			;  load decremented loop count
||	mv.l1x b7, a7			;  load 32-shift
||	addaw.d1 a8, a0, a8		;  a8 = address of output words n-2/n-1

	nop 2

	ldw.d1 *a4--, a16		;  load A[n-2]
||	mpy.m1 a19, 0, a19		;  load 0

	mv.l1 a26, a18			;  a18 = A[n-1] (first term)
||	mv.s1 a26, a30			;  a30 = A[n-1] (third term on the first pass)
||	ldw.d1 *a4--[2], a26		;  load A[n-3], then step back two words

	addu.l1 a19:a18, a29, a19:a18	;  A[n-1]+carry-in
||	ldndw.d2 *b6++, b17:b16 	;  load data to be copied
*****************
*  begin loop	*
*****************
aloop	addu.l1 a19:a18, a26, a19:a18	;  first word: add A again
||	or.s1 a21, a20, a23		;  a23 = first output word of the previous pass
||	mpy.m1 a17, 0, a17		;  clear upper word
||	bdec.s2 aloop, b0		;  conditional branch to loop beginning

;  NOTE(review): on the first pass this mv appears to write a16 in the same
;  cycle in which the A[n-2] load issued before the loop returns its data,
;  and a27 has not been loaded yet -- verify against the pipeline timing.
	addu.l1 a19:a18, a30, a19:a18	;  first word: add a30 (third term on first pass, carry later)
||	shru.s1 a24, a6, a24		;  right-shift the previous pass's second tripled word
||	mv.d1 a27, a16			;  a16 = second word of this pass (from the loaded pair)
||	ldndw.d2 *b6++, b17:b16 	;  load data to be copied

	addu.l1 a17:a16, a16, a29:a28	;  second word: A+A (into a29:a28)
||	rotl.m1 a18, 0, a20		;  save the finished first tripled word
||	shl.s1 a18, a7, a25		;  left-shift the finished first tripled word by 32-shift
||	ldndw.d1 *a4--, a27:a26 	;  load the next two operand words

	addu.l1 a29:a28, a16, a19:a18	;  second word: A+A+A
||	or.s1 a25, a24, a22		;  a22 = second output word of the previous pass
||	mpy.m1 a27, 0, a27		;  clear upper word of a27:a26
||	mv.d1 a19, a30			;  save carry of the first word

	addu.l1 a19:a18, a30, a19:a18	;  second word: add carry
||[!a2] stndw.d1 a23:a22, *a8-- 	;  store two output words (skipped on the first pass)
||	shru.s1 a20, a6, a20		;  right-shift the saved first tripled word
||	mpy.m1 a2, 0, a2		;  clear first-pass flag

	addu.l1 a27:a26, a26, a19:a18	;  next first word: A+A
||	shl.s1 a18, a7, a21		;  left-shift the finished second tripled word by 32-shift
||	rotl.m1 a18, 0, a24		;  save the finished second tripled word
||	mv.d1 a19, a30			;  save carry
||	stndw.d2 b17:b16, *b8++ 	;  store copied data
*****************
*  end loop	*
*****************
;  The last two output words are finished in the delay slots of the return.
	b.s2 b3 			;  return
||	addu.l1 a19:a18, a26, a19:a18	;  extra word: A+A+A
||	or.s1 a21, a20, a23		;  a23 = next-to-last output word

	addu.l1 a19:a18, a30, a19:a18	;  extra word: add carry
||	shru.s1 a24, a6, a24		;  right-shift the last tripled word of the operand

	shl.s1 a18, a7, a25		;  left-shift the tripled extra word by 32-shift
||	mv.l1 a30, a4			;  return carry

	or.s1 a25, a24, a22		;  a22 = last output word

	stndw.d1 a23:a22, *a8-- 	;  store the last two output words

	nop				;  remaining branch delay slot
	.end