* FIND 3N+C CYCLES * * 04/15/13 (dkc) * * * * This C64 subroutine finds 3n+c cycles. The calling sequence of the sub- * * routine is; * * * * K[0] => a4 * * K[1] => b4 * * maximum allowable value of first word of an odd element => a6 * * c => b6 * * address to store output n value (one in the cycle) => a8 * * * * (K[0], K[1]) is an input 64-bit odd n value (positive or negative). The * * input n and c values must be in the range so that |3*n+c| is less than or * * equal to 0x7fffffffffffffff. If the maximum allowable first word in a * * subsequent even n value in the sequence is exceeded, the subroutine * * terminates and 0 is returned (c must be in the range so this even n value * * can be computed without overflowing the signed 64-bit words). Every other * * initial even n value in the sequence is checked to determine if the maximum* * allowable value has been exceeded, so c must be in the range so that * * 3*[(3*n+c)/2]+c can be computed without overflowing the signed 64-bit * * words. If the second word of an initial even element in the 3n+c sequence * * is 0, the subroutine terminates and an error code of 2 is returned. (To * * reduce execution time, the algorithm is coded on the assumption that the * * second word is not 0. The probability that the second word is 0 is very * * small.) Otherwise, a value of 1 is returned (indicating that a cycle has * * been found). An element in the cycle is returned in the output array. * * Robert Floyd's cycle-finding algorithm is used. Each iteration of the * * loop takes 22 instruction cycles to execute, so the subroutine is much * * faster than most C code (depending on what compiler and chip the code is * * being run on). * * * ******************************************************************************* .global _cycle .text _cycle:zero.l2 b19 ; load 0 || mvk.s2 1, b31 ; load 1 || mv.d2 b4, b18 ; load K[1] || mvk.d1 2, a22 ; load error code addu.l2 b19:b18, b18, b21:b20 ; K[1]*2 || rotl.m2x a4, 0, b16 ; load K[0] || shl.s2 b31, 31, b31 ; load mask || mv.l1x b6, a24 ; load c addu.l2 b21:b20, b18, b21:b20 ; K[1]*3 || mvk.s2 32, b29 ; load 32 || mvk.s1 32, a29 ; load 32 addu.l2 b21:b20, b6, b21:b20 ; K[1]=K[1]*3+c || mv.l1x b31, a31 ; load mask addab.d2 b16, b16, b17 ; K[0]*2 || shr.s2 b20, 31, b23 ; load MSB of K[1] || bitr.m2 b20, b0 ; bit-reverse K[1] addab.d2 b17, b16, b16 ; K[0]*3 [!b0] b.s2 b3 ; return || addab.d2 b16, b21, b16 ; K[0]=K[0]*3+carry || lmbd.l2 1, b0, b30 ; count bits || mpy.m2 b2, 0, b2 ; load 0 [!b0] mvk.s1 2, a4 ; return error code || and.d2 b16, b31, b1 ; isolate sign || sub.s2 b29, b30, b28 ; 32-count || neg.l2 b16, b17 ; -K[0]-carry [!b1] addab.d2 b16, 0, b17 ; load K[0] ||[b1] add.l2 b17, b23, b17 ; -K[0] || shru.s2 b20, b30, b20 ; K[1]>>count [b0] cmpltu.l2x a6, b17, b2 ; compare to maximum || shl.s2 b16, b28, b28 ; K[0]<<(32-count) [b2] mpy.m1 a4, 0, a4 ; return 0 || shr.s2 b16, b30, b16 ; K[0]=K[0]>>count || or.l2 b20, b28, b20 ; K[1]=K[1]|(K[0]<<(32-count)) [b2] b.s2 b3 ; return || mpy.m1 a19, 0, a19 ; load 0 *** mv.l1x b20, a18 ; load K[1] mv.s1x b16, a16 ; load K[0] || addu.l1 a19:a18, a18, a21:a20 ; K[1]*2 addu.l1 a21:a20, a18, a21:a20 ; K[1]*3 addu.l1 a21:a20, a24, a21:a20 ; K[1]=K[1]*3+c addab.d1 a16, a16, a17 ; K[0]*2 || shr.s1 a20, 31, a23 ; load MSB of K[1] || bitr.m1 a20, a0 ; bit-reverse K[1] addab.d1 a17, a16, a16 ; K[0]*3 [!a0] b.s2 b3 ; return || addab.d1 a16, a21, a16 ; K[0]=K[0]*3+carry || lmbd.l1 1, a0, a30 ; count bits || mpy.m1 a2, 0, a2 ; load 0 [!a0] rotl.m1 a22, 0, a4 ; return error code || sub.s1 a29, a30, a28 ; 32-count || and.d1 a16, a31, a1 ; isolate sign || neg.l1 a16, a17 ; -K[0]-carry [!a1] addab.d1 a16, 0, a17 ; load K[0] ||[a1] add.l1 a17, a23, a17 ; -K[0] || shru.s1 a20, a30, a20 ; K[1]>>count [a0] cmpltu.l1 a6, a17, a2 ; compare to maximum || shl.s1 a16, a28, a28 ; K[0]<<(32-count) || mpy.m1 a19, 0, a19 ; load 0 [a2] mpy.m1 a4, 0, a4 ; return 0 || shr.s1 a16, a30, a16 ; K[0]=K[0]>>count || or.l1 a20, a28, a20 ; K[1]=K[1]|(K[0]<<(32-count)) || mpy.m2 b19, 0, b19 ; load 0 [a2] b.s2 b3 ; return || mv.s1 a20, a18 ; load K[1] || mv.l2 b20, b18 ; load K[1] *** addu.l1 a19:a18, a18, a21:a20 ; K[1]*2 || addu.l2 b19:b18, b18, b21:b20 ; K[1]*2 addu.l1 a21:a20, a18, a21:a20 ; K[1]*3 || addu.l2 b21:b20, b18, b21:b20 ; K[1]*3 addu.l1 a21:a20, a24, a21:a20 ; K[1]=K[1]*3+c || addu.l2 b21:b20, b6, b21:b20 ; K[1]=K[1]*3+c addab.d1 a16, a16, a17 ; K[0]*2 || bitr.m1 a20, a0 ; bit-reverse K[1] || addab.d2 b16, b16, b17 ; K[0]*2 || shr.s2 b20, 31, b23 ; load MSB of K[1] || bitr.m2 b20, b0 ; bit-reverse K[1] addab.d1 a17, a16, a16 ; K[0]*3 || mvk.s1 1, a2 ; load 1 || addab.d2 b17, b16, b16 ; K[0]*3 || mvk.s2 1, b2 ; load 1 [!a0] sub.s2 b2, b2, b2 ; clear flag || addab.d1 a16, a21, a16 ; K[0]=K[0]*3+carry || lmbd.l1 1, a0, a30 ; count bits ||[!b0] mpy.m2 b2, 0, b2 ; clear flag || addab.d2 b16, b21, b16 ; K[0]=K[0]*3+carry || lmbd.l2 1, b0, b30 ; count bits *************** * begin loop * *************** aloop: [!a0] rotl.m1 a22, 0, a4 ; return error code || sub.l1 a29, a30, a28 ; 32-count || shru.s1 a20, a30, a20 ; K[1]>>count ||[!b0] mvk.d1 2, a4 ; return error code || and.d2 b16, b31, b1 ; isolate sign || sub.s2 b29, b30, b28 ; 32-count || neg.l2 b16, b17 ; -K[0]-carry || mpy.m2 b1, 0, b1 ; load 0 shl.s1 a16, a28, a28 ; K[0]<<(32-count) ||[!b1] addab.d2 b16, 0, b17 ; load K[0] ||[b1] add.l2 b17, b23, b17 ; -K[0] || shru.s2 b20, b30, b20 ; K[1]>>count shr.s1 a16, a30, a16 ; K[0]=K[0]>>count || or.l1 a20, a28, a20 ; K[1]=K[1]|(K[0]<<(32-count)) || mpy.m1 a19, 0, a19 ; load 0 ||[b2] cmpltu.l2x a6, b17, b1 ; compare to maximum || shl.s2 b16, b28, b28 ; K[0]<<(32-count) addab.d1 a20, 0, a18 ; load K[1] ||[b1] mpy.m1 a4, 0, a4 ; return 0 || shr.s2 b16, b30, b16 ; K[0]=K[0]>>count || or.l2 b20, b28, b20 ; K[1]=K[1]|(K[0]<<(32-count)) || mpy.m2 b19, 0, b19 ; load 0 ||[b1] subab.d2 b2, b2, b2 ; load 0 [!b2] b.s2 b3 ; return || addu.l1 a19:a18, a18, a21:a20 ; K[1]*2 || mv.l2 b20, b18 ; load K[1] || mvk.s1 1, a0 ; load 1 addu.l1 a21:a20, a18, a21:a20 ; K[1]*3 addu.l1 a21:a20, a24, a21:a20 ; K[1]=K[1]*3+c addab.d1 a16, a16, a17 ; K[0]*2 || shr.s1 a20, 31, a23 ; load MSB of K[1] || bitr.m1 a20, a0 ; bit-reverse K[1] ||[b2] mv.l1 a20, a0 ; load K[1] [!a0] b.s2 b3 ; return || addab.d1 a17, a16, a16 ; K[0]*3 addab.d1 a16, a21, a16 ; K[0]=K[0]*3+carry || lmbd.l1 1, a0, a30 ; count bits [!a0] rotl.m1 a22, 0, a4 ; return error code || sub.s1 a29, a30, a28 ; 32-count || and.d1 a16, a31, a1 ; isolate sign || neg.l1 a16, a17 ; -K[0]-carry [!a1] addab.d1 a16, 0, a17 ; load K[0] ||[a1] add.l1 a17, a23, a17 ; -K[0] || shru.s1 a20, a30, a20 ; K[1]>>count cmpltu.l1 a6, a17, a2 ; compare to maximum || shl.s1 a16, a28, a28 ; K[0]<<(32-count) || mvk.d1 1, a1 ; load 1 shr.s1 a16, a30, a16 ; K[0]=K[0]>>count || or.l1 a20, a28, a20 ; K[1]=K[1]|(K[0]<<(32-count)) || subab.d1 a19, a19, a19 ; load 0 ||[a2] rotl.m1x b16, 0, a16 ; load K[0] [!a2] cmpeq.l1x a20, b20, a1 ; compare K[1] values ||[a2] b.s2 b3 ; return ||[a2] mpy.m1 a4, 0, a4 ; return 0 || addab.d1 a20, 0, a18 ; load K[1] [a1] cmpeq.l1x a16, b16, a1 ; compare K[0] values [!a1] b.s2 aloop ; branch if not equal || addu.l1 a19:a18, a18, a21:a20 ; K[1]*2 || addu.l2 b19:b18, b18, b21:b20 ; K[1]*2 ||[a1] stw.d1 a20, *+a8[1]; ; save K[1] ||[a1] mvk.s1 1, a4 ; return 1 ||[a2] mpy.m1 a4, 0, a4 ; return 0 addu.l1 a21:a20, a18, a21:a20 ; K[1]*3 || addu.l2 b21:b20, b18, b21:b20 ; K[1]*3 ||[a1] stw.d1 a16, *a8 ; save K[0] ||[a1] b.s2 b3 ; return [!a1] addu.l1 a21:a20, a24, a21:a20 ; K[1]=K[1]*3+c || addu.l2 b21:b20, b6, b21:b20 ; K[1]=K[1]*3+c addab.d1 a16, a16, a17 ; K[0]*2 || bitr.m1 a20, a0 ; bit-reverse K[1] || addab.d2 b16, b16, b17 ; K[0]*2 || shr.s2 b20, 31, b23 ; load MSB of K[1] || bitr.m2 b20, b0 ; bit-reverse K[1] [!a1] addab.d1 a17, a16, a16 ; K[0]*3 || mvk.s1 1, a2 ; load 1 || addab.d2 b17, b16, b16 ; K[0]*3 || mvk.s2 1, b2 ; load 1 [!a0] sub.s2 b2, b2, b2 ; clear flag || addab.d1 a16, a21, a16 ; K[0]=K[0]*3+carry || lmbd.l1 1, a0, a30 ; count bits ||[!b0] mpy.m2 b2, 0, b2 ; clear flag || addab.d2 b16, b21, b16 ; K[0]=K[0]*3+carry || lmbd.l2 1, b0, b30 ; count bits **************** * end loop * **************** nop .end