*******************************************************************************
*
*  N-WORD ADD
*  04/05/10 (dkc)
*
*  This C64 subroutine does n-word addition.  The calling sequence of the
*  subroutine is;
*
*     address of augend -> a4
*     address of addend -> b4
*     n -> a6
*
*  n must be even and greater than 2.  (The "B" unit is not used yet, so the
*  subroutine is not very efficient.)
*
*  Notes:
*    - Words are processed from index n-1 down toward index 0; the carry out
*      of each word (upper register of the addu result pair) is added into
*      the next lower-indexed word.
*    - The sum is stored through a6, which is loaded with the address of
*      B[n-1], so the result overwrites the addend array.
*    - Each pass of the loop stores two result words.
*    - Registers written: a0, a4, a5, a6, a16-a19, a24, a26.  Return is
*      through b3.
*    - The code is hand-scheduled: "||" marks an instruction issued in the
*      same packet as the one above it, and the prologue starts loads for
*      the first loop pass.  Do not reorder instructions.  The "A[n-1]" /
*      "B[n-1]" names in the line comments are relative to the pair of
*      words currently being handled, not absolute indices.
*    - NOTE(review): the label "aloop" and the ".end" directive are repeated
*      by the routines that follow; presumably each routine was originally
*      its own source file -- confirm before assembling them together.
*
*******************************************************************************
        .global _addn
        .text
_addn:
        mv.l1x  b4, a5                  ; load address of B
||      sub.s1  a6, 1, a0               ; load n-1

        addaw.d1 a4, a0, a4             ; load address of A[n-1]

        addaw.d1 a5, a0, a5             ; load address of B[n-1]
||      shru.s1 a6, 1, a0               ; load loop count (n/2)

        ldw.d1  *a4--, a16              ; load A[n-1]
||      mv.l1   a5, a6                  ; save address of B[n-1] (store pointer)
|| [a0] sub.s1  a0, 2, a0               ; decremented loop count
**
        ldw.d1  *a5--, a24              ; load B[n-1]

        ldw.d1  *a4--, a18              ; load A[n-2]

        ldw.d1  *a5--, a26              ; load B[n-2]
||      mpy.m1  a17, 0, a17             ; clear upper word of a17:a16

        nop

        nop

        addu.l1 a17:a16, a24, a17:a16   ; A[n-1]+B[n-1]
||      mpy.m1  a19, 0, a19             ; clear upper word of a19:a18
||      ldw.d1  *a4--, a16              ; load A[n-1]
*****************
*  begin loop   *
*****************
aloop:
        bdec.s1 aloop, a0               ; conditional branch to loop beginning
||      ldw.d1  *a5--, a24              ; load B[n-1]

        addu.l1 a19:a18, a26, a19:a18   ; A[n-2]+B[n-2]-carry
||      ldw.d1  *a4--, a18              ; load A[n-2]

        mpy.m1  a17, 0, a17             ; clear carry bit
||      addu.l1 a19:a18, a17, a19:a18   ; A[n-2]+B[n-2]
||      ldw.d1  *a5--, a26              ; load B[n-2]

        stw.d1  a16, *a6--              ; store A[n-1]+B[n-1]

        addu.l1 a17:a16, a19, a17:a16   ; A[n-3]+carry
||      stw.d1  a18, *a6--              ; store A[n-2]+B[n-2]

        addu.l1 a17:a16, a24, a17:a16   ; A[n-1]+B[n-1]
||      mpy.m1  a19, 0, a19             ; clear carry
||      ldw.d1  *a4--, a16              ; load A[n-1]
*****************
*  end loop     *
*****************
        b.s2    b3                      ; return
        nop     5
        .end
*******************************************************************************
*
*  N-WORD NEGATE
*  04/09/10 (dkc)
*
*  This C64 subroutine does n-word negation.
*  The calling sequence of the subroutine is;
*
*     address of operand -> a4
*     n -> b4
*
*  n must be greater than 1.  (The "B" unit is not used yet, so the subroutine
*  is not very efficient.)
*
*  Notes:
*    - The operand is negated in place: results are stored through a6, a
*      copy of the address of A[n-1].
*    - Every word is inverted (not); a24 supplies the "+1" for the word at
*      index n-1 and is cleared just before the loop, after which only the
*      carry (upper register of the addu result pair) is propagated toward
*      index 0.
*    - One result word is stored per loop pass.
*    - Registers written: a0, a4, a6, a8, a16-a21, a24.  Return is through
*      b3.
*    - The code is hand-scheduled ("||" = same packet as the line above);
*      do not reorder instructions.
*
*******************************************************************************
        .global _negn
        .text
_negn:
        sub.s1x b4, 1, a0               ; load n-1
||      mvk.d1  1, a24                  ; load 1 (the +1 of two's complement)
||      mpy.m1  a17, 0, a17             ; load 0
||      mvk.l1  0, a21                  ; load 0

        addaw.d1 a4, a0, a4             ; load address of A[n-1]
|| [a0] sub.l1  a0, 1, a0               ; decremented loop count

        ldw.d1  *a4--, a8               ; load A[n-1]
||      mv.l1   a4, a6                  ; save address of A[n-1] (store pointer)

        ldw.d1  *a4--, a8               ; load A[n-2]

        nop

        ldw.d1  *a4--, a8               ; load A[n-3]
||      bdec.s1 aloop, a0               ; conditional branch to loop beginning

        nop

        not.l1  a8, a16                 ; invert A[n-1]
||      ldw.d1  *a4--, a8               ; load A[n-4]
||      bdec.s1 aloop, a0               ; conditional branch to loop beginning

        not.s1  a8, a20                 ; invert A[n-2]
||      mpy.m1  a24, 0, a24             ; load 0 (a24 is 1 only for the first add)
*****************
*  begin loop   *
*****************
aloop:
        bdec.s1 aloop, a0               ; conditional branch to loop beginning
||      addu.l1 a17:a16, a24, a19:a18   ; ~A[n-1]+1,0
||      ldw.d1  *a4--, a8               ; load A[n-1]

        addu.l1 a21:a20, a19, a17:a16   ; ~A[n-2]+carry
||      not.s1  a8, a20                 ; ~A[n-2]
||      stw.d1  a18, *a6--              ; store ~A[n-1]+1
*****************
*  end loop     *
*****************
        b.s2    b3                      ; return
        nop     5
        .end
*******************************************************************************
*
*  N-WORD RIGHT SHIFT
*  04/10/10 (dkc)
*
*  This C64 subroutine does an n-word right shift.  The calling sequence of
*  the subroutine is;
*
*     address of input data -> a4
*     address of output data -> b4
*     shift amount -> a6
*     n -> b6
*
*  The shift amount must be less than or equal to 32*n.  The input and output
*  addresses cannot be the same.
*
*******************************************************************************
*  _rshiftn working registers (entry: a4 = input address, b4 = output
*  address, a6 = shift amount in bits, b6 = n):
*
*     b0 = shift/64        zero double-words stored at the start of the output
*     a2 = (shift/32) & 1  non-zero -> one more zero word is stored
*     a0 = shift - 32*(shift/32)   bit shift applied inside each word
*     a3 = 32 - a0                 complementary shift for the carried bits
*     b2 = n - shift/32    words still to produce (routine returns if zero)
*
*  Each word stored in "bloop" is (input word >> a0) OR'd with
*  (previous input word << a3); the carried part (a18) starts at 0.  Both
*  pointers move upward, so data moves toward higher word indices.
*
*  NOTE(review): when the bit remainder a0 is 0, a3 is 32; confirm that a
*  register shift by 32 yields 0 on the target.
*  The code is hand-scheduled ("||" = same packet as the line above); do not
*  reorder instructions.
*******************************************************************************
        .global _rshiftn
        .text
_rshiftn:
        shru.s2x a6, 6, b0              ; shift/64
||      mpy.m2  b17, 0, b17             ; load 0
||      shru.s1 a6, 5, a1               ; shift/32

        mpy.m2  b16, 0, b16             ; load 0
||      and.l1  a1, 1, a2               ; check if odd
||      shl.s1  a1, 5, a5               ; (shift/32)*32

  [!b0] b.s2    askip                   ; no double-word shifts
|| [b0] subab.d2 b0, 1, b0              ; decremented loop count
||      sub.l2x b6, a1, b2              ; load n-shift/32
||      sub.l1  a6, a5, a0              ; remainder
||      mvk.s1  32, a3                  ; load 32

   [b0] b.s2    aloop                   ; conditional branch to loop beginning
|| [b0] sub.l2  b0, 1, b0               ; decrement loop count

   [b0] b.s2    aloop                   ; conditional branch to loop beginning
|| [b0] sub.l2  b0, 1, b0               ; decrement loop count

   [b0] b.s2    aloop                   ; conditional branch to loop beginning
|| [b0] sub.l2  b0, 1, b0               ; decrement loop count
||      sub.l1  a3, a0, a3              ; 32-remainder

   [b0] b.s2    aloop                   ; conditional branch to loop beginning
|| [b0] sub.l2  b0, 1, b0               ; decrement loop count

   [b0] b.s2    aloop                   ; conditional branch to loop beginning
|| [b0] sub.l2  b0, 1, b0               ; decrement loop count
*****************
*  begin loop   *
*****************
aloop   stndw.d2 b17:b16, *b4++         ; store 0 (one zero double-word)
|| [b0] b.s2    aloop                   ; conditional branch to loop beginning
|| [b0] sub.l2  b0, 1, b0               ; decrement loop count
*****************
*  end loop     *
*****************
askip:
   [b2] ldw.d1  *a4++, a17              ; load input data
|| [a2] stw.d2  b16, *b4++              ; store 0 (odd zero word)
||[!b2] b.s2    b3                      ; return (nothing left to shift)
|| [b2] sub.l2  b2, 1, b2               ; decrement loop count

        nop     5

   [b2] b.s2    bloop                   ; conditional branch to loop beginning
|| [b2] sub.l2  b2, 1, b2               ; decrement loop count

        ldw.d1  *a4++, a17              ; load input data
||      mv.l1x  b4, a7                  ; load address (output pointer on A side)

        mvk.s1  0, a18                  ; load 0 (no carried bits yet)
*****************
*  begin loop   *
*****************
bloop   shru.s1 a17, a0, a17            ; right-shift data
||      mv.l1   a17, a16                ; save data (unshifted copy)
|| [b2] b.s2    bloop                   ; conditional branch to loop beginning
|| [b2] sub.l2  b2, 1, b2               ; decrement loop count

        or.l1   a18, a17, a18           ; "or" data
||      ldw.d1  *a4++, a17              ; load input data

        stw.d1  a18, *a7++              ; store data
||      shl.s1  a16, a3, a18            ; left-shift data (bits carried to next word)
*****************
*  end loop     *
*****************
        b.s2    b3                      ; return
        nop     5
        .end
*******************************************************************************
*
*  N-WORD LEFT SHIFT
*  04/10/10 (dkc)
*
*  This C64 subroutine does an n-word left shift.  The calling sequence of
*  the subroutine is;
*
*     address of input data -> a4
*     address of output data -> b4
*     shift amount -> a6
*     n -> b6
*
*  The shift amount must be less than or equal to 32*n.  The input and output
*  addresses cannot be the same.
*
*  Working registers:
*
*     b0 = shift/64        zero double-words stored at the end of the output
*     a2 = (shift/32) & 1  non-zero -> one more zero word is stored
*     a0 = shift - 32*(shift/32)   bit shift applied inside each word
*     a3 = 32 - a0                 complementary shift for the carried bits
*     b2 = n - shift/32    words still to produce (routine returns if zero)
*
*  b4 is first advanced to the address of B[n] and a4 to the address of
*  A[n-1]; both pointers then move downward (pre-decrement stores), so data
*  moves toward lower word indices.  Each word stored in "bloop" is
*  (input word << a0) OR'd with (previous input word >> a3).
*
*  NOTE(review): when the bit remainder a0 is 0, a3 is 32; confirm that a
*  register shift by 32 yields 0 on the target.
*  The code is hand-scheduled ("||" = same packet as the line above); do not
*  reorder instructions.
*
*******************************************************************************
        .global _lshiftn
        .text
_lshiftn:
        shru.s2x a6, 6, b0              ; shift/64
||      mpy.m2  b17, 0, b17             ; load 0
||      shru.s1 a6, 5, a1               ; shift/32
||      addaw.d2 b4, b6, b4             ; address of B[n]
||      sub.l1x b6, 1, a3               ; load n-1

        mpy.m2  b16, 0, b16             ; load 0
||      and.l1  a1, 1, a2               ; check if odd
||      shl.s1  a1, 5, a5               ; (shift/32)*32
||      addaw.d1 a4, a3, a4             ; address of A[n-1]

  [!b0] b.s2    askip                   ; no double-word shifts
|| [b0] subab.d2 b0, 1, b0              ; decremented loop count
||      sub.l2x b6, a1, b2              ; load n-shift/32
||      sub.l1  a6, a5, a0              ; remainder
||      mvk.s1  32, a3                  ; load 32

   [b0] b.s2    aloop                   ; conditional branch to loop beginning
|| [b0] sub.l2  b0, 1, b0               ; decrement loop count

   [b0] b.s2    aloop                   ; conditional branch to loop beginning
|| [b0] sub.l2  b0, 1, b0               ; decrement loop count

   [b0] b.s2    aloop                   ; conditional branch to loop beginning
|| [b0] sub.l2  b0, 1, b0               ; decrement loop count
||      sub.l1  a3, a0, a3              ; 32-remainder

   [b0] b.s2    aloop                   ; conditional branch to loop beginning
|| [b0] sub.l2  b0, 1, b0               ; decrement loop count

   [b0] b.s2    aloop                   ; conditional branch to loop beginning
|| [b0] sub.l2  b0, 1, b0               ; decrement loop count
*****************
*  begin loop   *
*****************
aloop   stndw.d2 b17:b16, *--b4         ; store 0 (one zero double-word)
|| [b0] b.s2    aloop                   ; conditional branch to loop beginning
|| [b0] sub.l2  b0, 1, b0               ; decrement loop count
*****************
*  end loop     *
*****************
askip:
   [b2] ldw.d1  *a4--, a17              ; load input data
|| [a2] stw.d2  b16, *--b4              ; store 0 (odd zero word)
||[!b2] b.s2    b3                      ; return (nothing left to shift)
|| [b2] sub.l2  b2, 1, b2               ; decrement loop count

        nop     5

   [b2] b.s2    bloop                   ; conditional branch to loop beginning
|| [b2] sub.l2  b2, 1, b2               ; decrement loop count

        ldw.d1  *a4--, a17              ; load input data
||      mv.l1x  b4, a7                  ; load address (output pointer on A side)

        mvk.s1  0, a18                  ; load 0 (no carried bits yet)
*****************
*  begin loop   *
*****************
bloop   shl.s1  a17, a0, a17            ; left-shift data
||      mv.l1   a17, a16                ; save data (unshifted copy)
|| [b2] b.s2    bloop                   ; conditional branch to loop beginning
|| [b2] sub.l2  b2, 1, b2               ; decrement loop count

        or.l1   a18, a17, a18           ; "or" data
||      ldw.d1  *a4--, a17              ; load input data

        stw.d1  a18, *--a7              ; store data
||      shru.s1 a16, a3, a18            ; right-shift data (bits carried to next word)
*****************
*  end loop     *
*****************
        b.s2    b3                      ; return
        nop     5
        .end
*******************************************************************************
*
*  N-WORD NORMALIZATION
*  04/09/10 (dkc)
*
*  This C64 subroutine does an n-word normalization.  The calling sequence of
*  the subroutine is;
*
*     address of operand -> a4
*     n -> b4
*
*  The n-word must be non-zero.
*
*******************************************************************************
*  _normn notes (entry: a4 = address of operand):
*
*    - The value passed in b4 is never read: the first packet overwrites b4
*      with the operand address.  The scan therefore has no length limit and
*      stops only when a non-zero double-word is seen.
*    - The scan reads double-words upward from the operand address.  The
*      five prologue loads are unconditional, so at least five double-words
*      are read regardless of the operand contents.
*    - a2 = a0 | a1 of the loaded pair; while a2 is zero, b6 is updated with
*      the running index b5.  After the loop the double-word at index b6 is
*      reloaded.
*    - Result, left in a4: 64*b6, plus 32 when a0 is zero, plus the lmbd
*      (left-most bit detection) result for a1 (when a0 is zero) or a0
*      (otherwise).  Presumably this is the left-shift count that would
*      normalize the operand -- confirm against the callers.
*    - The instructions after "b.s2 b3" rely on the branch taking effect
*      later; do not reorder instructions.
*******************************************************************************
        .global _normn
        .text
_normn:
        zero.l1 a2                      ; load 0 (no non-zero data seen yet)
||      mv.l2x  a4, b4                  ; save address of input
||      mvk.s2  0, b5                   ; clear counter

        ldndw.d1 *a4++, a1:a0           ; load data
||[!a2] b.s1    aloop                   ; conditional branch to loop beginning

        ldndw.d1 *a4++, a1:a0           ; load data
||[!a2] b.s1    aloop                   ; conditional branch to loop beginning

        ldndw.d1 *a4++, a1:a0           ; load data
||[!a2] b.s1    aloop                   ; conditional branch to loop beginning

        ldndw.d1 *a4++, a1:a0           ; load data
||[!a2] b.s1    aloop                   ; conditional branch to loop beginning

        ldndw.d1 *a4++, a1:a0           ; load data
||[!a2] b.s1    aloop                   ; conditional branch to loop beginning
*****************
*  begin loop   *
*****************
aloop:
  [!a2] ldndw.d1 *a4++, a1:a0           ; load data
||[!a2] b.s1    aloop                   ; conditional branch to loop beginning
||[!a2] or.l1   a0, a1, a2              ; "or" data (non-zero ends the scan)
||[!a2] mv.s2   b5, b6                  ; save index
||[!a2] add.l2  b5, 1, b5               ; increment index
*****************
*  end loop     *
*****************
        ldndw.d2 *+b4[b6], a1:a0        ; load data (reload double-word b6)
||      shl.s2  b6, 6, b8               ; count*64
||      mvk.s1  32, a2                  ; load 32

        nop

        b.s2    b3                      ; return
||      mv.s1x  b8, a4                  ; count*64

        nop     2

  [!a0] lmbd.l1 1, a1, a7               ; left-most bit detection
||[!a0] add.s1  a4, a2, a4              ; increment count (a0 is all zero: +32)

   [a0] lmbd.l1 1, a0, a7               ; left-most bit detection

        add.l1  a4, a7, a4              ; count
        .end
*******************************************************************************
*
*  N-WORD "OR"
*  04/09/10 (dkc)
*
*  This C64 subroutine does an n-word "or".  The calling sequence of the
*  subroutine is;
*
*     address of operand -> a4
*     n -> b4
*
*  n must be even and greater than 2.  (The "B" unit is not used yet, so the
*  subroutine is not very efficient.)
*
*******************************************************************************
*  _orn notes (entry: a4 = address of operand, b4 = n):
*
*    - b0 = n/2, reduced by 2 when non-zero, is the counter used by bdec.
*    - The operand is read as double-words into a17:a16; a19 accumulates
*      the "or" of the a17 halves and a18 the "or" of the a16 halves.
*    - The combined value a19 | a18 is left in a4 on return.
*    - The five prologue loads are unconditional, and the loads run ahead of
*      the "or" instructions that consume them; do not reorder
*      instructions.
*******************************************************************************
        .global _orn
        .text
_orn:
        shru.s2 b4, 1, b0               ; load n/2
||      mvk.l1  0, a19                  ; load 0
||      mvk.d1  0, a18                  ; load 0

   [b0] sub.l2  b0, 2, b0               ; decremented loop count

        ldndw.d1 *a4++, a17:a16         ; load data
||      bdec.s2 aloop, b0               ; conditional branch to loop beginning

        ldndw.d1 *a4++, a17:a16         ; load data
||      bdec.s2 aloop, b0               ; conditional branch to loop beginning

        ldndw.d1 *a4++, a17:a16         ; load data
||      bdec.s2 aloop, b0               ; conditional branch to loop beginning

        ldndw.d1 *a4++, a17:a16         ; load data
||      bdec.s2 aloop, b0               ; conditional branch to loop beginning

        ldndw.d1 *a4++, a17:a16         ; load data
||      bdec.s2 aloop, b0               ; conditional branch to loop beginning
*****************
*  begin loop   *
*****************
aloop:
        bdec.s2 aloop, b0               ; conditional branch to loop beginning
||      or.l1   a19, a17, a19           ; "or" words
||      or.s1   a18, a16, a18           ; "or" words
||      ldndw.d1 *a4++, a17:a16         ; load data
*****************
*  end loop     *
*****************
        b.s2    b3                      ; return
||      or.l1   a19, a18, a4            ; "or" partial "ors"
        nop     5
        .end
*******************************************************************************
*
*  N-WORD COPY
*  04/09/10 (dkc)
*
*  This C64 subroutine does an n-word copy.  The calling sequence of the
*  subroutine is;
*
*     address of input data -> a4
*     address of output data -> b4
*     n -> a6
*
*  n must be even and greater than 2.
*
*******************************************************************************
*  _copyn notes (entry: a4 = input address, b4 = output address, a6 = n):
*
*    - a0 = n/2, reduced by 2 when non-zero, is the counter used by bdec.
*    - a6 is reused as the output pointer: it is overwritten with b4 in the
*      same packet that reads n from it.
*    - Data is moved one double-word at a time through a17:a16 with
*      ldndw/stndw; each loop pass stores one double-word and starts the
*      next load.
*    - The loads and branches in the prologue are interleaved to feed the
*      loop; do not reorder instructions.
*******************************************************************************
        .global _copyn
        .text
_copyn:
        shru.s1 a6, 1, a0               ; load n/2
||      mv.l1x  b4, a6                  ; load address of output

        ldndw.d1 *a4++, a17:a16         ; load data
|| [a0] sub.l1  a0, 2, a0               ; decremented loop count

        bdec.s1 aloop, a0               ; conditional branch to loop beginning

        ldndw.d1 *a4++, a17:a16         ; load data

        bdec.s1 aloop, a0               ; conditional branch to loop beginning

        ldndw.d1 *a4++, a17:a16         ; load data
*****************
*  begin loop   *
*****************
aloop:
        stndw.d1 a17:a16, *a6++         ; store data
||      bdec.s1 aloop, a0               ; conditional branch to loop beginning

        ldndw.d1 *a4++, a17:a16         ; load data
*****************
*  end loop     *
*****************
        b.s2    b3                      ; return
        nop     5
        .end
*******************************************************************************
*
*  N-WORD CLEAR (AND SET OF LAST WORD)
*  04/09/10 (dkc)
*
*  This C64 subroutine does an n-word clear.  The calling sequence of the
*  subroutine is;
*
*     address of operand -> a4
*     data for last word -> b4
*     n -> a6
*
*  n must be even and greater than 2.
*
*******************************************************************************
*  _setn notes (entry: a4 = address of operand, b4 = data for last word,
*  a6 = n):
*
*    - a0 = n/2, reduced by 2 when non-zero, is the counter used by bdec.
*    - a17:a16 is cleared and stored as a double-word, upward from a4, once
*      per loop pass.
*    - After the loop the data word (copied from b4 into a5) is stored at
*      the word just below the final value of a4 ("*-a4[1]").
*    - The six bdec instructions are issued back to back to keep the
*      one-packet loop running; do not reorder instructions.
*******************************************************************************
        .global _setn
        .text
_setn:
        shru.s1 a6, 1, a0               ; load n/2
||      mv.l1x  b4, a5                  ; load data (value for the last word)
||      mpy.m1  a17, 0, a17             ; load 0

   [a0] sub.l1  a0, 2, a0               ; decremented loop count
||      mvk.d1  0, a16                  ; load 0

        bdec.s1 aloop, a0               ; conditional branch to loop beginning

        bdec.s1 aloop, a0               ; conditional branch to loop beginning

        bdec.s1 aloop, a0               ; conditional branch to loop beginning

        bdec.s1 aloop, a0               ; conditional branch to loop beginning

        bdec.s1 aloop, a0               ; conditional branch to loop beginning
*****************
*  begin loop   *
*****************
aloop:
        bdec.s1 aloop, a0               ; conditional branch to loop beginning
||      stndw.d1 a17:a16, *a4++         ; store 0
*****************
*  end loop     *
*****************
        b.s2    b3                      ; return
||      stw.d1  a5, *-a4[1]             ; store data (word just below a4)
        nop     5
        .end
*******************************************************************************
*
*  N-WORD TRIPLE
*  04/09/10 (dkc)
*
*  This C64 subroutine triples an n-word.  The calling sequence of the
*  subroutine is;
*
*     address of operand -> a4
*     n -> b4
*
*  n must be greater than 1.  (The "B" unit is not used yet, so the subroutine
*  is not very efficient.)
*
*******************************************************************************
*  _mult3 notes (entry: a4 = address of operand, b4 = n):
*
*    - The operand is tripled in place: results are stored through a6, a
*      copy of the address of A[n-1], moving down toward index 0.
*    - Each word is tripled by repeated addu (word + word + word) into the
*      register pair a19:a18; the upper register a19 is the carry out, which
*      is copied to a20 and added into the next lower-indexed word.
*    - "rotl.m1 a19, 0, a20" rotates by 0, i.e. it is a register copy done
*      on the .M unit.
*    - One result word is stored per loop pass.
*    - The code is hand-scheduled ("||" = same packet as the line above);
*      do not reorder instructions.
*******************************************************************************
        .global _mult3
        .text
_mult3:
        sub.s1x b4, 1, a0               ; load n-1
||      mpy.m1  a17, 0, a17             ; load 0

        addaw.d1 a4, a0, a4             ; load address of A[n-1]
|| [a0] sub.l1  a0, 1, a0               ; decremented loop count

        ldw.d1  *a4--, a16              ; load A[n-1]
||      mv.l1   a4, a6                  ; save address of A[n-1] (store pointer)

        nop     2

        ldw.d1  *a4--, a16              ; load A[n-2]
||      bdec.s1 aloop, a0               ; conditional branch to loop beginning

        mpy.m1  a19, 0, a19             ; clear upper word

        mv.l1   a16, a18                ; load A[n-1]
||      mv.s1   a16, a20                ; load A[n-1]
*****************
*  begin loop   *
*****************
aloop   addu.l1 a19:a18, a16, a19:a18   ; A[n-1]+A[n-1]+A[n-1]
||      mpy.m1  a17, 0, a17             ; clear upper word
||      ldw.d1  *a4--, a16              ; load A[n-3]
||      bdec.s1 aloop, a0               ; conditional branch to loop beginning

        addu.l1 a19:a18, a20, a19:a18   ; A[n-1]+A[n-1]+A[n-1]+carry

        addu.l1 a17:a16, a16, a19:a18   ; A[n-2]+A[n-2]
||      rotl.m1 a19, 0, a20             ; save carry
||      stw.d1  a18, *a6--              ; store A[n-1]
*****************
*  end loop     *
*****************
        b.s2    b3                      ; return
        nop     5
        .end
*******************************************************************************
*
*  N-WORD TRIPLE AND RIGHT-SHIFT
*  04/09/10 (dkc)
*
*  This C64 subroutine triples and right-shifts n-words.  The calling
*  sequence of the subroutine is;
*
*     address of operand -> a4
*     shift amount -> b4
*     n -> a6
*
*  n must be even and greater than 2.  The shift amount must be less than or
*  equal to 32.  (The "B" unit is not used yet, so the subroutine is not very
*  efficient.)
*
*******************************************************************************
*  _mul3shft notes (entry: a4 = address of operand, b4 = shift amount,
*  a6 = n):
*
*    - The operand is updated in place: results are stored through a8, a
*      copy of the address of A[n-1], moving down toward index 0.
*    - a6 is reloaded with the shift amount and a7 with 32 - shift.  Each
*      stored word is (3*word >> a6) OR'd with (3*next lower-indexed word
*      << a7).
*    - Tripling is done by repeated addu into a19:a18; the carry out of
*      each word is kept in a30 and added into the next word.
*    - a2 is a first-pass flag: while it is non-zero the two "[!a2]" stores
*      are suppressed; it is cleared inside the first loop pass.
*    - b0 = n/2, reduced by 2 when non-zero, is the counter used by bdec;
*      each loop pass handles two words.
*    - "rotl.m1 x, 0, y" rotates by 0, i.e. it is a register copy done on
*      the .M unit.
*    - The instructions after "b.s2 b3" finish the last two stores while
*      the return branch is pending; do not reorder instructions.
*******************************************************************************
        .global _mul3shft
        .text
_mul3shft:
        sub.l1  a6, 1, a0               ; load n-1
||      mvk.s2  32, b7                  ; load 32
||      mvk.d1  1, a2                   ; set flag (first pass: no stores)
||      shru.s1 a6, 1, a1               ; load n/2
||      mpy.m1  a17, 0, a17             ; load 0

        addaw.d1 a4, a0, a4             ; load address of A[n-1]
||      sub.s2  b7, b4, b7              ; 32-shift
||      mv.l1x  b4, a6                  ; load shift
|| [a1] sub.s1  a1, 2, a1               ; decremented loop count

        ldw.d1  *a4--, a16              ; load A[n-1]
||      mv.l1   a4, a8                  ; save address of A[n-1] (store pointer)

        mv.l2x  a1, b0                  ; load decremented loop count
||      mv.l1x  b7, a7                  ; load 32-shift

        nop

        ldw.d1  *a4--, a16              ; load A[n-2]

        mpy.m1  a19, 0, a19             ; load 0

        mv.l1   a16, a18                ; load A[n-1]
||      mv.s1   a16, a30                ; load A[n-1]
*****************
*  begin loop   *
*****************
aloop   addu.l1 a19:a18, a16, a19:a18   ; A[n-1]+A[n-1]+A[n-1]
||      or.s1   a21, a20, a21           ; "or" words
||      mpy.m1  a17, 0, a17             ; clear upper word
||      ldw.d1  *a4--, a16              ; load A[n-3]
||      bdec.s2 aloop, b0               ; conditional branch to loop beginning

        addu.l1 a19:a18, a30, a19:a18   ; A[n-1]+A[n-1]+A[n-1]+carry
||      shru.s1 a24, a6, a24            ; right-shift A[n-2]
||[!a2] stw.d1  a21, *a8--              ; store A[n-1]

        addu.l1 a17:a16, a16, a19:a18   ; A[n-2]+A[n-2]
||      rotl.m1 a18, 0, a20             ; save 3*A[n-1]
||      shl.s1  a18, a7, a25            ; left-shift 3*A[n-1]
||      mv.d1   a19, a30                ; save carry

        addu.l1 a19:a18, a16, a19:a18   ; A[n-2]+A[n-2]+A[n-2]
||      or.s1   a25, a24, a25           ; "or" words
||      mpy.m1  a17, 0, a17             ; clear upper word
||      ldw.d1  *a4--, a16              ; load A[n-3]

        addu.l1 a19:a18, a30, a19:a18   ; A[n-2]+A[n-2]+A[n-2]+carry
||[!a2] stw.d1  a25, *a8--              ; store A[n-2]
||      shru.s1 a20, a6, a20            ; right-shift 3*A[n-1]
||      mpy.m1  a2, 0, a2               ; clear flag

        addu.l1 a17:a16, a16, a19:a18   ; A[n-2]+A[n-2]
||      shl.s1  a18, a7, a21            ; left-shift 3*A[n-2]
||      rotl.m1 a18, 0, a24             ; save 3*A[n-2]
||      mv.d1   a19, a30                ; save carry
*****************
*  end loop     *
*****************
        b.s2    b3                      ; return
||      or.s1   a21, a20, a21           ; "or" words

        shru.s1 a24, a6, a24            ; right-shift A[n-2]
||      stw.d1  a21, *a8--              ; store A[n-1]

        stw.d1  a24, *a8--              ; store A[n-2]

        nop     3
        .end
*******************************************************************************
*
*  N-WORD TRIPLE AND RIGHT-SHIFT (with carry-in and carry-out)
*  04/09/10 (dkc)
*
*  This C64 subroutine triples and right-shifts n-words.  The calling
*  sequence of the subroutine is;
*
*     address of operand -> a4
*     shift amount -> b4
*     carry-in -> a6
*     n -> b6
*
*  n must be even and greater than 2.  The shift amount must be less than or
*  equal to 32.  The carry-out is returned.  (The "B" unit is not used yet,
*  so the subroutine is not very efficient.)
*
*  Notes:
*    - The first packet saves the carry-in in a29 and moves n into a6.
*      The next packet matches the first packet of _mul3shft, and the loop
*      body is the same as _mul3shft's.
*    - The carry-in is added to the word at index n-1 before the loop.
*    - The carry-out is copied from a30 to a4 while the return branch is
*      pending.
*    - The operand is updated in place through a8; a2 suppresses the
*      stores of the first loop pass.
*    - The code is hand-scheduled ("||" = same packet as the line above);
*      do not reorder instructions.
*
*******************************************************************************
        .global _mul3shfc
        .text
_mul3shfc:
        mv.l1x  b6, a6                  ; load n
||      mv.s1   a6, a29                 ; save carry-in

        sub.l1  a6, 1, a0               ; load n-1
||      mvk.s2  32, b7                  ; load 32
||      mvk.d1  1, a2                   ; set flag (first pass: no stores)
||      shru.s1 a6, 1, a1               ; load n/2
||      mpy.m1  a17, 0, a17             ; load 0

        addaw.d1 a4, a0, a4             ; load address of A[n-1]
||      sub.s2  b7, b4, b7              ; 32-shift
||      mv.l1x  b4, a6                  ; load shift
|| [a1] sub.s1  a1, 2, a1               ; decremented loop count

        ldw.d1  *a4--, a16              ; load A[n-1]
||      mv.l1   a4, a8                  ; save address of A[n-1] (store pointer)

        mv.l2x  a1, b0                  ; load decremented loop count
||      mv.l1x  b7, a7                  ; load 32-shift

        nop     2

        ldw.d1  *a4--, a16              ; load A[n-2]
||      mpy.m1  a19, 0, a19             ; load 0

        mv.l1   a16, a18                ; load A[n-1]
||      mv.s1   a16, a30                ; load A[n-1]

        addu.l1 a19:a18, a29, a19:a18   ; A[n-1]+carry
*****************
*  begin loop   *
*****************
aloop   addu.l1 a19:a18, a16, a19:a18   ; A[n-1]+A[n-1]+A[n-1]
||      or.s1   a21, a20, a21           ; "or" words
||      mpy.m1  a17, 0, a17             ; clear upper word
||      ldw.d1  *a4--, a16              ; load A[n-3]
||      bdec.s2 aloop, b0               ; conditional branch to loop beginning

        addu.l1 a19:a18, a30, a19:a18   ; A[n-1]+A[n-1]+A[n-1]+carry
||      shru.s1 a24, a6, a24            ; right-shift A[n-2]
||[!a2] stw.d1  a21, *a8--              ; store A[n-1]

        addu.l1 a17:a16, a16, a19:a18   ; A[n-2]+A[n-2]
||      rotl.m1 a18, 0, a20             ; save 3*A[n-1]
||      shl.s1  a18, a7, a25            ; left-shift 3*A[n-1]
||      mv.d1   a19, a30                ; save carry

        addu.l1 a19:a18, a16, a19:a18   ; A[n-2]+A[n-2]+A[n-2]
||      or.s1   a25, a24, a25           ; "or" words
||      mpy.m1  a17, 0, a17             ; clear upper word
||      ldw.d1  *a4--, a16              ; load A[n-3]

        addu.l1 a19:a18, a30, a19:a18   ; A[n-2]+A[n-2]+A[n-2]+carry
||[!a2] stw.d1  a25, *a8--              ; store A[n-2]
||      shru.s1 a20, a6, a20            ; right-shift 3*A[n-1]
||      mpy.m1  a2, 0, a2               ; clear flag

        addu.l1 a17:a16, a16, a19:a18   ; A[n-2]+A[n-2]
||      shl.s1  a18, a7, a21            ; left-shift 3*A[n-2]
||      rotl.m1 a18, 0, a24             ; save 3*A[n-2]
||      mv.d1   a19, a30                ; save carry
*****************
*  end loop     *
*****************
        b.s2    b3                      ; return
||      addu.l1 a19:a18, a16, a19:a18   ; A[n-1]+A[n-1]+A[n-1]
||      or.s1   a21, a20, a21           ; "or" words

        addu.l1 a19:a18, a30, a19:a18   ; A[n-1]+A[n-1]+A[n-1]+carry
||      shru.s1 a24, a6, a24            ; right-shift A[n-2]
||      stw.d1  a21, *a8--              ; store A[n-1]

        shl.s1  a18, a7, a25            ; left-shift 3*A[n-1]
||      mv.l1   a30, a4                 ; return carry

        or.s1   a25, a24, a25           ; "or" words

        stw.d1  a25, *a8--              ; store A[n-2]

        nop
        .end
*******************************************************************************
*
*  N-WORD TRIPLE AND RIGHT-SHIFT (with carry-in, carry-out, and copy)
*  04/09/10 (dkc)
*
*  This C64 subroutine triples and right-shifts n-words.  The calling
*  sequence of the subroutine is;
*
*     address of operand -> a4
*     shift amount -> b4
*     carry-in -> a6
*     source address (for copy) -> b6
*     destination address (for copy) -> a8
*     address of output -> b8
*     n -> a10
*
*  n must be even and greater than 2.  The shift amount must be less than or
*  equal to 32.  The carry-out is returned.
*
*  Notes:
*    - The first packet moves n into a6, saves the carry-in in a29, and
*      swaps the roles of a8/b8: b8 becomes the copy destination pointer
*      and a8 becomes (output address - 4), later advanced by n-1 words.
*    - Unlike _mul3shft/_mul3shfc the operand is not overwritten: result
*      words are paired in a23:a22 and written as double-words through a8,
*      moving downward.
*    - The copy runs alongside on the "B" side: one double-word per loop
*      pass is loaded from b6 (upward) and stored through b8 (upward).
*    - Operand words are loaded partly as single words and partly as
*      double-words (a27:a26); a2 suppresses the store of the first loop
*      pass.
*    - The carry-out is copied from a30 to a4 while the return branch is
*      pending.
*    - The code is hand-scheduled ("||" = same packet as the line above);
*      do not reorder instructions.
*
*******************************************************************************
        .global _mul3scpy
        .text
_mul3scpy:
        mv.d1   a10, a6                 ; load n
||      mv.s1   a6, a29                 ; save carry-in
||      mv.l2x  a8, b8                  ; load destination address
||      sub.l1x b8, 4, a8               ; load output address (minus one word)

        sub.l1  a6, 1, a0               ; load n-1
||      mvk.s2  32, b7                  ; load 32
||      mvk.d1  1, a2                   ; set flag (first pass: no store)
||      shru.s1 a6, 1, a1               ; load n/2
||      mpy.m1  a17, 0, a17             ; load 0

        addaw.d1 a4, a0, a4             ; load address of A[n-1]
||      sub.s2  b7, b4, b7              ; 32-shift
||      mv.l1x  b4, a6                  ; load shift
|| [a1] sub.s1  a1, 2, a1               ; decremented loop count

        ldw.d1  *a4--, a26              ; load A[n-1]

        mv.l2x  a1, b0                  ; load decremented loop count
||      mv.l1x  b7, a7                  ; load 32-shift
||      addaw.d1 a8, a0, a8             ; load address of A[n-1] (output pointer)

        nop     2

        ldw.d1  *a4--, a16              ; load A[n-2]
||      mpy.m1  a19, 0, a19             ; load 0

        mv.l1   a26, a18                ; load A[n-1]
||      mv.s1   a26, a30                ; load A[n-1]
||      ldw.d1  *a4--[2], a26           ; load next word, step back two words

        addu.l1 a19:a18, a29, a19:a18   ; A[n-1]+carry
||      ldndw.d2 *b6++, b17:b16         ; load data (copy source)
*****************
*  begin loop   *
*****************
aloop   addu.l1 a19:a18, a26, a19:a18   ; A[n-1]+A[n-1]+A[n-1]
||      or.s1   a21, a20, a23           ; "or" words
||      mpy.m1  a17, 0, a17             ; clear upper word
||      bdec.s2 aloop, b0               ; conditional branch to loop beginning

        addu.l1 a19:a18, a30, a19:a18   ; A[n-1]+A[n-1]+A[n-1]+carry
||      shru.s1 a24, a6, a24            ; right-shift A[n-2]
||      mv.d1   a27, a16                ; move upper loaded word to a16
||      ldndw.d2 *b6++, b17:b16         ; load data (copy source)

        addu.l1 a17:a16, a16, a29:a28   ; A[n-2]+A[n-2]
||      rotl.m1 a18, 0, a20             ; save 3*A[n-1]
||      shl.s1  a18, a7, a25            ; left-shift 3*A[n-1]
||      ldndw.d1 *a4--, a27:a26         ; load A[n-3]

        addu.l1 a29:a28, a16, a19:a18   ; A[n-2]+A[n-2]+A[n-2]
||      or.s1   a25, a24, a22           ; "or" words
||      mpy.m1  a27, 0, a27             ; clear upper word
||      mv.d1   a19, a30                ; save carry

        addu.l1 a19:a18, a30, a19:a18   ; A[n-2]+A[n-2]+A[n-2]+carry
||[!a2] stndw.d1 a23:a22, *a8--         ; store A[n-2]
||      shru.s1 a20, a6, a20            ; right-shift 3*A[n-1]
||      mpy.m1  a2, 0, a2               ; clear flag

        addu.l1 a27:a26, a26, a19:a18   ; A[n-2]+A[n-2]
||      shl.s1  a18, a7, a21            ; left-shift 3*A[n-2]
||      rotl.m1 a18, 0, a24             ; save 3*A[n-2]
||      mv.d1   a19, a30                ; save carry
||      stndw.d2 b17:b16, *b8++         ; store data (copy destination)
*****************
*  end loop     *
*****************
        b.s2    b3                      ; return
||      addu.l1 a19:a18, a26, a19:a18   ; A[n-1]+A[n-1]+A[n-1]
||      or.s1   a21, a20, a23           ; "or" words

        addu.l1 a19:a18, a30, a19:a18   ; A[n-1]+A[n-1]+A[n-1]+carry
||      shru.s1 a24, a6, a24            ; right-shift A[n-2]

        shl.s1  a18, a7, a25            ; left-shift 3*A[n-1]
||      mv.l1   a30, a4                 ; return carry

        or.s1   a25, a24, a22           ; "or" words

        stndw.d1 a23:a22, *a8--         ; store A[n-2]

        nop
        .end