/*CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C                                                                             C
C  COMPUTE MERTENS FUNCTION (j(x) where x is a highly composite number, uses  C
C  01/11/16 (DKC)	     OpenMP and CUDA C) 			      C
C                                                                             C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC*/
#include <stdio.h>
#include <math.h>
#include <omp.h>
#include <cuda_runtime.h>
#include "table.h"
extern char *malloc();
// add up partial sums of Mertens function
__global__ void mertsum(double *A, double B, unsigned int e) {
   unsigned int i=blockIdx.x*blockDim.x+threadIdx.x;
   if ((e==(e/(i+1))*(i+1))&&(e>(i+1)))
      A[i]=A[i]-B;
   }
// compute Mertens function
int newmert(unsigned int s, unsigned long long x, int *M) {
unsigned int t;
int i;
long long sum;
t=(unsigned int)sqrt((double)x);
t=t+2;
if (s>((unsigned int)(x/(unsigned long long)t)))
   return(0x7fffffff);
sum=0;
#pragma omp parallel for reduction (+:sum)
for (i=s; i<=(int)(x/(unsigned long long)t); i++)
   sum=sum+M[x/i-1];
#pragma omp parallel for reduction (+:sum)
for (i=1; i<(int)t; i++) {
   sum=sum+(long long)M[i-1]*(x/(unsigned long long)i-x/(unsigned long long)(i+1));
   }
return((int)sum);
}
// compute Mobius function
int newmobl(unsigned long long a, unsigned long long b, long long *out, unsigned int *table,
	    unsigned int tsize) {
unsigned int i,p,count;
unsigned long long beg,ps,rem;
long long t;
count=(unsigned int)(b-a);
for (i=0; i<count; i++)
   out[i]=1;
for (i=0; i<tsize; i++) {
   p=table[i];
   ps=(unsigned long long)p*(unsigned long long)p;
   if (ps>b)
      goto askip;
   rem=a-(a/ps)*ps;
   if (rem!=0)
      beg=ps-rem;
   else
      beg=0;
   while (beg<count) {
      out[beg]=0;
      beg=beg+ps;
      }
   rem=a-(a/p)*p;
   if (rem!=0)
      beg=p-rem;
   else
      beg=0;
   while (beg<count) {
      out[beg]=-out[beg]*p;
      beg=beg+p;
      }
   }
return(p);
askip:
for (i=0; i<count; i++) {
   if (out[i]==0)
      continue;
   t=out[i];
   if (t<0)
      t=-t;
   if ((unsigned long long)t<(i+a))
      out[i]=-out[i];
   }
for (i=0; i<count; i++) {
   if (out[i]==0)
      continue;
   if (out[i]>0)
      out[i]=1;
   else
      out[i]=-1;
   }
return(1);
}
// generate prime look-up table
unsigned int primed(unsigned int *out, unsigned int tsize,
		    unsigned int *table,unsigned int limit) {
unsigned int d;
unsigned int i,j,k,l,flag,count;
count=tsize;
for (i=0; i<tsize; i++)
   out[i]=table[i];
j=table[tsize-1]+1;
for (d=j; d<=limit; d++) {
   if(d==(d/2)*2) continue;
   if(d==(d/3)*3) continue;
   if(d==(d/5)*5) continue;
   if(d==(d/7)*7) continue;
   if(d==(d/11)*11) continue;
   l=(unsigned int)(10.0+sqrt((double)d));
   k=0;
   if (l>table[tsize-1])
      return(0);
   else {
      for (i=0; i<tsize; i++) {
	 if (table[i]<=l)
	    k=i;
	 else
	    break;
	 }
      }
   flag=1;
   l=k;
   for (i=0; i<=l; i++) {
      k=table[i];
      if ((d/k)*k==d) {
	 flag=0;
	 break;
	 }
      }
   if (flag==1)
      out[count]=d;
   count=count+flag;
   }
return(count);
}
unsigned long long Msize=15000000000;  // 64 GB of RAM, 64-bit OS
unsigned int tsize=1230;	       // prime look-up table size
unsigned int tmpsiz=10000;	       // used to compute Mobius function
unsigned int t2size=200000;	       // prime look-up table size
unsigned int Tsize=100000000;	       // partial sums of Mertens function
unsigned int oldnews=100000;	       // used to compute factors of N
unsigned int prsize=1000;	       // used to factor N
//
unsigned int incnt=167;
unsigned long long in[167*2]={ // highly composite numbers and number of factors
		   2,		2,
		   4,		3,
		   6,		4,
		  12,		6,
		  24,		8,
		  36,		9,
		  48,	       10,
		  60,	       12,
		 120,	       16,
		 180,	       18,
		 240,	       20,
		 360,	       24,
		 720,	       30,
		 840,	       32,
		1260,	       36,
		1680,	       40,
		2520,	       48,
		5040,	       60,
		7560,	       64,
	       10080,	       72,
	       15120,	       80,
	       20160,	       84,
	       25200,	       90,
	       27720,	       96,
	       45360,	      100,
	       50400,	      108,
	       55440,	      120,
	       83160,	      128,
	      110880,	      144,
	      166320,	      160,
	      221760,	      168,
	      277200,	      180,
	      332640,	      192,
	      498960,	      200,
	      554400,	      216,
	      665280,	      224,
	      720720,	      240,
	     1081080,	      256,
	     1441440,	      288,
	     2162160,	      320,
	     2882880,	      336,
	     3603600,	      360,
	     4324320,	      384,
	     6486480,	      400,
	     7207200,	      432,
	     8648640,	      448,
	    10810800,	      480,
	    14414400,	      504,
	    17297280,	      512,
	    21621600,	      576,
	    32432400,	      600,
	    36756720,	      640,
	    43243200,	      672,
	    61261200,	      720,
	    73513440,	      768,
	   110270160,	      800,
	   122522400,	      864,
	   147026880,	      896,
	   183783600,	      960,
	   245044800,	     1008,
	   294053760,	     1024,
	   367567200,	     1152,
	   551350800,	     1200,
	   698377680,	     1280,
	   735134400,	     1344,
	  1102701600,	     1440,
	  1396755360,	     1536,
	  2095133040,	     1600,
	  2205403200,	     1680,
	  2327925600,	     1728,
	  2793510720,	     1792,
	  3491888400,	     1920,
	  4655851200,	     2016,
	  5587021440,	     2048,
	  6983776800,	     2304,
	 10475665200,	     2400,
	 13967553600,	     2688,
	 20951330400,	     2880,
	 27935107200,	     3072,
	 41902660800,	     3360,
	 48886437600,	     3456,
	 64250746560,	     3584,
	 73329656400,	     3600,
	 80313433200,	     3840,
	 97772875200,	     4032,
	128501493120,	     4096,
	146659312800,	     4320,
	160626866400,	     4608,
	240940299600,	     4800,
	293318625600,	     5040,
	321253732800,	     5376,
	481880599200,	     5760,
	642507465600,	     6144,
	963761198400,	     6720,
       1124388064800,	     6912, 
       1606268664000,	     7168, 
       1686582097200,	     7200,
       1927522396800,	     7680,
       2248776129600,	     8064, 
       3212537328000,	     8192,
       3373164194400,	     8640,
       4497552259200,	     9216,
       6746328388800,	    10080,
       8995104518400,	    10368,
       9316358251200,	    10752,
      13492656777600,	    11520,
      18632716502400,	    12288,
      26985313555200,	    12960,
      27949074753600,	    13440,
      32607253879200,	    13824,
      46581791256000,	    14336,
      48910880818800,	    14400,
      55898149507200,	    15360,
      65214507758400,	    16128,
      93163582512000,	    16384,
      97821761637600,	    17280,
     130429015516800,	    18432,
     195643523275200,	    20160,
     260858031033600,	    20736,
     288807105787200,	    21504,
     391287046550400,	    23040,
     577614211574400,	    24576,
     782574093100800,	    25920,
     866421317361600,	    26880,
    1010824870255200,	    27648,
    1444035528936000,	    28672, 
    1516237305382800,	    28800,
    1732842634723200,	    30720,
    2021649740510400,	    32256,
    2888071057872000,	    32768,
    3032474610765600,	    34560, 
    4043299481020800,	    36864,
    6064949221531200,	    40320, 
    8086598962041600,	    41472,
   10108248702552000,	    43008, 
   12129898443062400,	    46080,
   18194847664593600,	    48384,
   20216497405104000,	    49152,
   24259796886124800,	    51840,
   30324746107656000,	    53760, 
   36389695329187200,	    55296,
   48519593772249600,	    57600,
   60649492215312000,	    61440,
   72779390658374400,	    62208,
   74801040398884800,	    64512, 
  106858629141264000,	    65536,
  112201560598327200,	    69120,
  149602080797769600,	    73728,
  224403121196654400,	    80640,
  299204161595539200,	    82944,
  374005201994424000,	    86016,
  448806242393308800,	    92160,
  673209363589963200,	    96768,
  748010403988848000,	    98304,
  897612484786617600,	   103680,
 1122015605983272000,	   107520,
 1346418727179926400,	   110592,
 1795224969573235200,	   115200,
 2244031211966544000,	   122880,
 2692837454359852800,	   124416,
 3066842656354276800,	   129024,
 4381203794791824000,	   131072,
 4488062423933088000,	   138240,
 6133685312708553600,	   147456,
 8976124847866176000,	   153600,
 9200527969062830400,	   161280,
 12267370625417107200,	   165888};

int main(void)
{
unsigned long long *oldtmp,*newtmp;
long long *temp,sum,sump;
int *M;
double *T,*d_A,*B;
unsigned int *ntable;
unsigned int *pritab;
unsigned int p,tindex,prind,delta,joff,newind,count,total,h,k,ntsize;
unsigned int g,f,L,start;
int e;
unsigned long long index,ta,tb,j,N,ut,pz,tz,mcount;
long long i;
int savet,t,ID;
int dev=0;
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, dev);
printf("Using Device %d: %s\n",dev,deviceProp.name);
cudaSetDevice(dev);
cudaMalloc((double**)&d_A, (Tsize+1)*8);
dim3 block(1024);
//
omp_set_dynamic(0);
omp_set_num_threads(4);
#pragma omp parallel  
{
   ID=omp_get_thread_num();
   printf(" ID=%d \n",ID);
}
B=(double*) malloc(8);
if (B==NULL) {
   printf("not enough memory \n");
   return(0);
   }
ntable=(unsigned int*) malloc((t2size+1)*4);
if (ntable==NULL) {
   printf("not enough memory \n");
   return(0);
   }
pritab=(unsigned int*) malloc((prsize+1)*4);
if (pritab==NULL) {
   printf("not enough memory \n");
   return(0);
   }
temp=(long long*) malloc((tmpsiz+1)*8);
if (temp==NULL) {
   printf("not enough memory \n");
   return(0);
   }
oldtmp=(unsigned long long*) malloc((oldnews+1)*8);
if (oldtmp==NULL) {
   printf("not enough memory \n");
   return(0);
   }
newtmp=(unsigned long long *) malloc((oldnews+1)*8);
if (newtmp==NULL) {
   printf("not enough memory \n");
   return(0);
   }
M=(int*) malloc((Msize+1)*4);
if (M==NULL) {
   printf("not enough memory \n");
   return(0);
   }
T=(double*)malloc((Tsize+1)*8);
if (T==NULL) {
    printf("not enough memory: 5 \n");
    return(0);
    }
ntsize=primed(ntable,tsize,table,2000000);
printf("prime look-up table size=%d, largest prime=%d \n",ntsize,ntable[ntsize-1]);
printf("computing Mobius function \n");
index=0;
ta=1;
tb=(unsigned long long)(tmpsiz+1);
mcount=Msize/(unsigned long long)tmpsiz;
for (i=0; i<(long long)mcount; i++) {
   t=newmobl(ta,tb,temp,ntable,ntsize);
   if (t!=1) {
      printf("error computing Mobius function \n");
      return(0);
      }
   ta=tb;
   tb=tb+(unsigned long long)tmpsiz;
   for (j=0; j<tmpsiz; j++)
      M[j+index]=(int)temp[j];
   index=index+(unsigned long long)tmpsiz;
   }
printf("computing Mertens function \n");
for (i=1; i<=(long long)Msize; i++) {
    M[i]=M[i-1]+M[i];
   }
printf("computing j(x) \n");
L=0;
for (g=0; g<incnt; g++) {
   N=in[2*g];
// compute factors of N
   ut=N;
   prind=0;
   tindex=0;
   total=1;
   h=(unsigned int)(sqrt((double)ut)+0.01);
   for (f=0; f<ntsize; f++) {
      p=table[f];
      if (p>h)
	 goto fskip;
      count=0;
      while (ut==(ut/p)*p) {
	 if (count==0)
	    oldtmp[tindex]=p;
	 else
	    oldtmp[tindex]=oldtmp[tindex-1]*p;
	 tindex=tindex+1;
	 if (tindex>oldnews) {
	    printf("divisor table not big enough (1): N=%d \n",N);
	    return(0);
	    }
	 ut=ut/p;
	 count=count+1;
	 }
      if (count!=0) {
	 total=total*(count+1);
	 pritab[prind]=count;
	 prind=prind+1;
	 if (prind>prsize) {
	    printf("prime table not big enough: N=%d \n",N);
	    return(0);
	    }
	 }
      if (ut==1)
	 goto askip;
      }
   printf("error: prime look-up table not big enough \n");
   return(0);
fskip:
   oldtmp[tindex]=ut;
   tindex=tindex+1;
   if (tindex>oldnews) {
      printf("divisor table not big enough (2): N=%d \n",N);
      return(0);
      }
   count=1;
   total=total*(count+1);
   pritab[prind]=count;
   prind=prind+1;
   if (prind>prsize) {
       printf("prime table not big enough: N=%d \n",N);
       return(0);
       }
askip:
   if (total!=(unsigned int)in[2*g+1]) {
      printf("error: total=%d %d \n",total,(unsigned int)in[2*g+1]);
      return(0);
      }
   if (prind==1) {
      newind=tindex;
      goto cskip;
      }
   delta=0;
   pritab[prind]=0;
   pritab[prind+1]=0;
bskip:
   joff=0;
   delta=0;
   newind=0;
   for (f=0; f<(prind+1)/2; f++) {
      count=pritab[2*f];
      for (j=0; j<count; j++) {
	 newtmp[newind]=oldtmp[j+joff];
	 newind=newind+1;
	 }
      for (j=0; j<pritab[2*f+1]; j++) {
	 newtmp[newind]=oldtmp[j+joff+count];
	 newind=newind+1;
	 }
      for (j=0; j<count; j++) {
	 tz=oldtmp[j+joff];
	 for (k=0; k<pritab[2*f+1]; k++) {
	    pz=tz*oldtmp[k+count+joff];
	    newtmp[newind]=pz;
	    newind=newind+1;
	    if (newind>oldnews) {
	       printf("divisor table not big enough (3): N=%d \n",N);
	       return(0);
	       }
	    }
	 }
      joff=joff+pritab[2*f]+pritab[2*f+1];
      pritab[delta]=pritab[2*f]*pritab[2*f+1]+pritab[2*f]+pritab[2*f+1];
      delta=delta+1;
      }
   for (f=0; f<newind; f++)
      oldtmp[f]=newtmp[f];
   pritab[delta]=0;
   pritab[delta+1]=0;
   prind=delta;
   if (delta>1)
      goto bskip;
cskip:
   if ((newind+1)!=(unsigned int)in[2*g+1]) {
      printf("error: newind=%d %d \n",newind,(unsigned int)in[2*g+1]);
      return(0);
      }
// compute j(x)
   sum=0;
   if (N>Msize) {
      L=(unsigned int)(N/(unsigned long long)Msize);
      if (L>(unsigned long long)Tsize) {
	 printf("error: not enough memory for T array \n");
	 return(0);
	 }
      for (e=1; e<=(int)L; e++) {
	 start=(unsigned int)((N/(unsigned long long)e)/(unsigned long long)Msize)+1;
	 T[e-1]=(double)(1-newmert(start, N/e, M));
	 }
      cudaMemcpy(d_A,T,L*8,cudaMemcpyHostToDevice);
      for (e=L; e>1; e--) {
	 dim3 grid((e+block.x-1)/block.x);
	 cudaMemcpy(B,&d_A[e-1],8,cudaMemcpyDeviceToHost);
	 mertsum<<< grid, block >>>(d_A, B[0], e);
	 cudaDeviceSynchronize();
	 }
      cudaMemcpy(T,d_A,L*8,cudaMemcpyDeviceToHost);
      savet = (int)T[0];
#pragma omp parallel for reduction (+:sum)
	 for (e=1; e<=(int)L; e++)
	    if (N==(N/(unsigned long long)e)*(unsigned long long)e)
	       sum=sum+(long long)T[e-1]*(long long)T[e-1];
      }
   sump=1;
   for (f=0; f<newind; f++) {
     tz=oldtmp[f];
     if (tz<=Msize) {
	t=M[tz-1];
	sump=sump+(long long)t*(long long)t;
	if (tz==N)
	   savet=t;
	}
     }
   sum=sum+sump;
   printf(" %I64x %I64x %d %d %d \n",N,sum,newind+1,savet,L);
   }
return(0);
}