#include <stdio.h>
#include <math.h>
#include <R.h>
/*algorithms for MLE of mixture proportions, including NPMLE for interval censored data*/
#define TINY 1e-100
#define SMALL 1e-6
#define LARGE 1e+100

void em(int *n, int *p, double **X, double *wt, double *theta, double *fitted, int *conv){
  /*EM iteration for MLE of mixture proportions:
    l(theta) = sum_i=1^n wt_i*log(sum_j=1^p X_ij*theta_j), sum theta_j=1
    X (n by p): non-negative matrix, each row has AT LEAST ONE POSITIVE ENTRY
    fitted (n by 1): fitted values; fitted_i=sum_j X_ij*theta_j
    conv=1 upon convergence
  */
  int i, j;
  double swt, tmp, upper;
  for(swt=0.0, j=0; j<*n; j++)
    swt+=wt[j];
  for(upper=swt, j=0; j<*p; j++){
    for(tmp=0.0, i=0; i<*n; i++)
      tmp+=wt[i]*X[i][j]/fitted[i];
    theta[j]*=tmp/swt;
    if(upper<tmp)
      upper=tmp;
  }
  /*redundant (numerical safeguard)
  for(tmp=0.0, j=0; j<*p; j++)
    tmp+=theta[j];
  for(j=0; j<*p; j++)
    theta[j]/=tmp;
  */
  for(i=0; i<*n; i++)
    for(fitted[i]=0.0, j=0; j<*p; j++)
      fitted[i]+=X[i][j]*theta[j];
  if(upper<=SMALL+swt)
    *conv=1;
  else *conv=0;
} 

void effcm(int *n, double *diff, double *wt, double *theta1, double *theta2, double *fitted){
  /*efficient EM for maximizing the function sum_i wt_i*log(X1_i*theta1+X2_i*theta2+r_i)
    r_i=fitted_i-X1_i*theta1-X2_i*theta2
    diff=X1-X2
    subject to theta1+theta2=const., theta1>=0, theta2>=0
  */
  int i;
  double new, tmp1, tmp2, c1, c2, upper;
  for(tmp1=tmp2=0.0, c1=c2=LARGE, i=0; i<*n; i++){
    if(diff[i]>TINY){
      new=fitted[i]/diff[i];
      if(c1>new)
        c1=new;
      tmp1+=wt[i]/new;
    }
    else if(diff[i]<-TINY){
      new=-fitted[i]/diff[i];
      if(c2>new)
        c2=new;
      tmp2+=wt[i]/new;
    }
  }
  tmp1*=c1;
  tmp2*=c2;
  if(tmp1+tmp2<TINY)
    return;
  new=tmp1/(tmp1+tmp2)*(c1+c2)-c1+*theta1;
  upper=*theta1+*theta2;
  if(new<0.0)
    new=0.0;
  if(new>upper)
    new=upper;
  tmp1=new-*theta1;
  for(i=0; i<*n; i++)
    fitted[i]+=tmp1*diff[i];
  *theta1=new;
  *theta2=upper-new; 
}

void vdm(int *n, int *p, double **X, double *wt, double *theta, double *fitted, int *conv){
  /*VDM iteration for MLE of mixture proportions*/
  int i, j;
  double swt, upper, t1=0.0, t2=1.0, *der;
  der=(double *)malloc(sizeof(double)*(*n+*p));
  for(swt=0.0, i=0; i<*n; i++)
    swt+=wt[i];
  
  for(j=0; j<*p; j++)
    for(der[j]=0.0, i=0; i<*n; i++)
      der[j]+=wt[i]*X[i][j]/fitted[i];
  for(upper=swt, i=j=0; j<*p; j++)
    if(upper<der[j]){
      upper=der[j];
      i=j;
    }

  for(j=0; j<*n; j++)
    der[j]=X[j][i]-fitted[j];
  effcm(n, der, wt, &t1, &t2, fitted);
  for(j=0; j<*p; j++) 
    theta[j]*=t2;
  theta[i]+=t1;
  free(der);
  if(upper<=SMALL+swt)
    *conv=1;
  else *conv=0;
}

void vem(int *n, int *p, double **X, double *wt, double *theta, double *fitted, int *conv){
  /*VEM iteration for MLE of mixture proportions*/
  int i, j, k;
  double swt, lower, upper, *der;
  der=(double *)malloc(sizeof(double)*(*n+*p));
  for(swt=0.0, i=0; i<*n; i++)
    swt+=wt[i];
  
  for(j=0; j<*p; j++)
    for(der[j]=0.0, i=0; i<*n; i++)
      der[j]+=wt[i]*X[i][j]/fitted[i];
  for(lower=LARGE, upper=swt, i=k=j=0; j<*p; j++){
    if(theta[j]>0.0&&lower>der[j]){
      lower=der[j];
      i=j;
    }
    if(upper<der[j]){
      upper=der[j];
      k=j;
    }
  }
  for(j=0; j<*n; j++)
    der[j]=X[j][i]-X[j][k];
  effcm(n, der, wt, theta+i, theta+k, fitted);
    
  free(der);
  if(upper<=SMALL+swt)
    *conv=1;
  else *conv=0;
}

void vdm_nne(int *n, int *p, double **X, double *wt, double *theta, double *fitted, int *conv){
  /*VDM + nearest neighbor exchanges*/
  int i, j, k;
  double *diff;   
  diff=(double *)malloc(sizeof(double)*(*n));
  vdm(n, p, X, wt, theta, fitted, conv);
  for(i=0; i<*p; i++)
    if(theta[i]>TINY)
      break;
  do{
    for(j=i+1; j<*p; j++)
      if(theta[j]>TINY)
        break;
    if(j>=*p) break;
    for(k=0; k<*n; k++)
      diff[k]=X[k][i]-X[k][j];
    effcm(n, diff, wt, theta+i, theta+j, fitted);
    i=j;
  } while(1);
  free(diff);
}

void alg_mix(int *n, int *p, double *Xvec, double *wt, double *theta, int *iter, int *alg){
  /*algorithms for maximizing sum_i wt_i*log(sum_j X_ij*theta_j)
    X (n by p) stored row-wise as Xvec 
    wt[i]: number of replica of observation i
    iter: maximum number of iterations
    alg=1: EM
    alg=2: VEM
    alg=3: VDM+NNE
    alg=4: cocktail algorithm VDM + nearest neighbor exchanges + EM (recommended)
      essential: a good ordering of the columns of X (mixture components) 
    theta: NPMLE (output)
  */
  int i, j, conv=0;
  double *fitted, **X;
  fitted=(double *)malloc(sizeof(double)*(*n));
  X=(double **)malloc(sizeof(double *)*(*n));
  for(i=0; i<*n; i++)
    X[i]=Xvec+i*(*p);
  for(i=0; i<*n; i++){
    for(fitted[i]=0.0, j=0; j<*p; j++){
      if(X[i][j]<0.0){
        conv=-1;
        break;
      }
      fitted[i]+=X[i][j]*theta[j];
    }
    if(conv==-1||fitted[i]<TINY){
      Rprintf("ERROR: data badly prepared\n");
      free(fitted);
      free(X);
      return; 
    }
  }
  for(j=1; j<=*iter; j++){
    if(*alg==1)
      em(n, p, X, wt, theta, fitted, &conv);
    else if(*alg==2)
      vem(n, p, X, wt, theta, fitted, &conv);
    else if(*alg==3)
      vdm_nne(n, p, X, wt, theta, fitted, &conv);
    else if(*alg==4){
      vdm_nne(n, p, X, wt, theta, fitted, &conv); 
      em(n, p, X, wt, theta, fitted, &i);
    }
    if(conv) break;
  }
  *iter=j;
  free(fitted);
  free(X);
}

void sp_set(int *n, int *p, int *l, int *u, int **ll, int **uu){
  /*set up the sparse representation ll and uu:
    n observations, p time points, observation i is in [l[i], u[i]]
    ll[j][0]: #{i: l[i]=j}, ll[j][1:ll[j][0]]: indices {i: l[i]=j}
    uu[j][0]: #{i: u[i]=j-1}, uu[j][1:uu[j][0]]: indices {i: u[i]=j-1}
  */
  int i, j;
  for(j=0; j<*p; j++){
    ll[j][0]=0;
    uu[j][0]=0;
  }
  for(i=0; i<*n; i++){
    j=l[i];
    ll[j][0]++;
    ll[j][ll[j][0]]=i;
    j=u[i]+1;
    if(j<*p){
      uu[j][0]++;
      uu[j][uu[j][0]]=i;
    }
  }
}

void F_fitted(int *n, int *l, int *u, double *F, double *fitted){
  /*calculate fitted values
    n observations, p time points, observation i is in [l[i], u[i]]
  */
  int i, j; 
  for(i=0; i<*n; i++){ 
    fitted[i]=F[u[i]];
    j=l[i]-1;
    if(j>=0)
      fitted[i]-=F[j];
  }
}  

void sp_fitted(int *n, int *p, int *l, int *u, double *theta, double *fitted){
  /*calculate fitted values*/
  int i, j;
  double *F;
  F=(double *)malloc(sizeof(double)*(*p));
  for(F[0]=theta[0], i=1; i<*p; i++)
    F[i]=F[i-1]+theta[i];
  F_fitted(n, l, u, F, fitted);
  free(F);
}

void calc_der(int *p, double *wt, int **ll, int **uu, double *fitted, double *der){
  /*computes the directional derivative + sum_i wt_i*/
  int i, j, k;
  double tmp;
  for(tmp=0.0, j=0; j<*p; j++){ 
    for(i=1; i<=ll[j][0]; i++){
      k=ll[j][i];
      tmp+=wt[k]/fitted[k];
    }
    for(i=1; i<=uu[j][0]; i++){
      k=uu[j][i];
      tmp-=wt[k]/fitted[k];
    }
    der[j]=tmp;
  }
}

void sp_em(int *n, int *p, int *l, int *u, double *wt, int **ll, int **uu, double *theta, double *fitted, int *conv){
  /*EM iteration for NPMLE for interval censored data*/
  int j;
  double swt, tmp, upper, *der;
  der=(double *)malloc(sizeof(double)*(*p));  
  for(swt=0.0, j=0; j<*n; j++)
    swt+=wt[j]; 
  calc_der(p, wt, ll, uu, fitted, der);
  for(upper=swt, j=0; j<*p; j++){
    if(upper<der[j]) 
      upper=der[j];
    theta[j]*=der[j]/swt;
  } 
  /*redundant (numerical safeguard)
  for(tmp=0.0, j=0; j<*p; j++)
    tmp+=theta[j];
  for(j=0; j<*p; j++)         
    theta[j]/=tmp;
  */
  sp_fitted(n, p, l, u, theta, fitted);
  free(der);
  if(upper<=SMALL+swt)
    *conv=1;
  else *conv=0;
}

void sp_cm(int *n, int *l, int *u, double *wt, int **ll, int **uu, double *theta, int *ind1, int *ind2, double *fitted){
  /*exchange between theta[ind1] and theta[ind2]*/
  int i, j, k1, k2, len, *inds;
  double *diff, *spwt, *spfitted;
  inds=(int *)malloc(sizeof(int)*(*n+1));
  diff=(double *)malloc(sizeof(double)*(*n));
  spwt=(double *)malloc(sizeof(double)*(*n));
  spfitted=(double *)malloc(sizeof(double)*(*n));

  k1=*ind1;
  k2=*ind2;
  if(k1>k2){
    j=k2;
    k2=k1;
    k1=j;
  }
  for(len=0, i=k1+1; i<=k2; i++){
    for(j=1; j<=ll[i][0]; j++)
      if(u[ll[i][j]]>=k2){
        inds[len]=ll[i][j];
        diff[len]=-1.0;
        len++;
      } 
    for(j=1; j<=uu[i][0]; j++) 
      if(l[uu[i][j]]<=k1){
        inds[len]=uu[i][j];  
        diff[len]=1.0;
        len++;
      } 
  }
  for(i=0; i<len; i++){
    j=inds[i];
    spwt[i]=wt[j];
    spfitted[i]=fitted[j];
  }
  effcm(&len, diff, spwt, theta+k1, theta+k2, spfitted);
  for(i=0; i<len; i++)
    fitted[inds[i]]=spfitted[i];

  free(inds);
  free(diff);
  free(spwt);
  free(spfitted);
}

void sp_vdm(int *n, int *p, int *l, int *u, double *wt, int **ll, int **uu, double *theta, double *fitted, int *conv){
  /*VDM iteration for NPMLE for interval censored data*/
  int i, j; 
  double swt, upper, t1=0.0, t2=1.0, *der;
  der=(double *)malloc(sizeof(double)*(*p));
  for(swt=0.0, i=0; i<*n; i++)
    swt+=wt[i];

  calc_der(p, wt, ll, uu, fitted, der);
  for(upper=swt, i=j=0; j<*p; j++)
    if(upper<der[j]){
      upper=der[j];
      i=j;
    } 

  for(j=0; j<*n; j++)
    der[j]=(i>=l[j]&&i<=u[j])-fitted[j];
  effcm(n, der, wt, &t1, &t2, fitted);
  for(j=0; j<*p; j++)
    theta[j]*=t2;
  theta[i]+=t1;
  free(der);
  if(upper<=SMALL+swt)
    *conv=1;
  else *conv=0;
}

void sp_vem(int *n, int *p, int *l, int *u, double *wt, int **ll, int **uu, double *theta, double *fitted, int *conv){
  /*VEM iteration for NPMLE for interval censored data*/
  int i, j, k;
  double swt, lower, upper, *der;
  der=(double *)malloc(sizeof(double)*(*p));
  for(swt=0.0, i=0; i<*n; i++)
    swt+=wt[i];

  calc_der(p, wt, ll, uu, fitted, der);
  for(lower=LARGE, upper=swt, i=k=j=0; j<*p; j++){
    if(theta[j]>0.0&&lower>der[j]){
      lower=der[j];
      i=j;
    }
    if(upper<der[j]){
      upper=der[j];
      k=j;
    }
  }
  sp_cm(n, l, u, wt, ll, uu, theta, &i, &k, fitted);

  free(der);
  if(upper<=SMALL+swt)
    *conv=1;
  else *conv=0;
}

void sp_vdm_nne(int *n, int *p, int *l, int *u, double *wt, int **ll, int **uu, double *theta, double *fitted, int *conv){
  /*VDM + nearest neighbor exchanges*/
  int i, j;
  sp_vdm(n, p, l, u, wt, ll, uu, theta, fitted, conv);
  for(i=0; i<*p; i++)
    if(theta[i]>TINY)
      break;
  do{
    for(j=i+1; j<*p; j++)
      if(theta[j]>TINY)
        break;
    if(j>=*p) break;
    sp_cm(n, l, u, wt, ll, uu, theta, &i, &j, fitted);
    i=j;
  } while(1);
}

void alg_npmle(int *n, int *p, int *l, int *u, double *wt, double *theta, int *iter, int *alg){
  /*algorithms for maximizing sum_i wt_i*log(sum_j X_ij*theta_j)
    X_ij=1 if l_i<=j<=u_i, and 0 otherwise (caution: j goes from 0 to p-1)
    n: number of distinct observations
    p: number of time points
    wt[i]: number of replica of observation i
    iter: maximum number of iterations
    alg=1: EM
    alg=2: VEM
    alg=3: VDM+NNE
    alg=4: cocktail algorithm VDM + nearest neighbor exchanges + EM (recommended)
    theta: NPMLE (output) 
  */
  int i, j, nrow, ncol, conv=0, **ll, **uu; 
  double *fitted; 
  for(i=0; i<*n; i++)
    if(l[i]<0||l[i]>u[i]||u[i]>=*p){
      Rprintf("ERROR: data badly prepared\n");
      return;
    }
  fitted=(double *)malloc(sizeof(double)*(*n));
  sp_fitted(n, p, l, u, theta, fitted); 
  for(i=0; i<*n; i++)
    if(fitted[i]<TINY){
      Rprintf("ERROR: data badly prepared\n");
      free(fitted);
      return;
    }
  nrow=*p;
  ncol=*n+1;
  ll=(int **)malloc(sizeof(int *)*nrow);
  ll[0]=(int *)malloc(sizeof(int)*nrow*ncol);
  for (i=1; i<nrow; i++)
    ll[i]=ll[i-1]+ncol;
  uu=(int **)malloc(sizeof(int *)*nrow);
  uu[0]=(int *)malloc(sizeof(int)*nrow*ncol);
  for (i=1; i<nrow; i++)
    uu[i]=uu[i-1]+ncol;
  sp_set(n, p, l, u, ll, uu);
  for(j=1; j<=*iter; j++){
    if(*alg==1)
      sp_em(n, p, l, u, wt, ll, uu, theta, fitted, &conv);
    else if (*alg==2)
      sp_vem(n, p, l, u, wt, ll, uu, theta, fitted, &conv);
    else if(*alg==3)
      sp_vdm_nne(n, p, l, u, wt, ll, uu, theta, fitted, &conv);
    else if(*alg==4){
      sp_vdm_nne(n, p, l, u, wt, ll, uu, theta, fitted, &conv); 
      sp_em(n, p, l, u, wt, ll, uu, theta, fitted, &i);
    }
    if(conv) break; 
  }
  *iter=j;
  free(fitted);
  free(ll[0]);
  free(ll);
  free(uu[0]);
  free(uu);
}