#include #include #include /*algorithms for MLE of mixture proportions, including NPMLE for interval censored data*/ #define TINY 1e-100 #define SMALL 1e-6 #define LARGE 1e+100 void em(int *n, int *p, double **X, double *wt, double *theta, double *fitted, int *conv){ /*EM iteration for MLE of mixture proportions: l(theta) = sum_i=1^n wt_i*log(sum_j=1^p X_ij*theta_j), sum theta_j=1 X (n by p): non-negative matrix, each row has AT LEAST ONE POSITIVE ENTRY fitted (n by 1): fitted values; fitted_i=sum_j X_ij*theta_j conv=1 upon convergence */ int i, j; double swt, tmp, upper; for(swt=0.0, j=0; j<*n; j++) swt+=wt[j]; for(upper=swt, j=0; j<*p; j++){ for(tmp=0.0, i=0; i<*n; i++) tmp+=wt[i]*X[i][j]/fitted[i]; theta[j]*=tmp/swt; if(upper=0, theta2>=0 */ int i; double new, tmp1, tmp2, c1, c2, upper; for(tmp1=tmp2=0.0, c1=c2=LARGE, i=0; i<*n; i++){ if(diff[i]>TINY){ new=fitted[i]/diff[i]; if(c1>new) c1=new; tmp1+=wt[i]/new; } else if(diff[i]<-TINY){ new=-fitted[i]/diff[i]; if(c2>new) c2=new; tmp2+=wt[i]/new; } } tmp1*=c1; tmp2*=c2; if(tmp1+tmp2upper) new=upper; tmp1=new-*theta1; for(i=0; i<*n; i++) fitted[i]+=tmp1*diff[i]; *theta1=new; *theta2=upper-new; } void vdm(int *n, int *p, double **X, double *wt, double *theta, double *fitted, int *conv){ /*VDM iteration for MLE of mixture proportions*/ int i, j; double swt, upper, t1=0.0, t2=1.0, *der; der=(double *)malloc(sizeof(double)*(*n+*p)); for(swt=0.0, i=0; i<*n; i++) swt+=wt[i]; for(j=0; j<*p; j++) for(der[j]=0.0, i=0; i<*n; i++) der[j]+=wt[i]*X[i][j]/fitted[i]; for(upper=swt, i=j=0; j<*p; j++) if(upper0.0&&lower>der[j]){ lower=der[j]; i=j; } if(upperTINY) break; do{ for(j=i+1; j<*p; j++) if(theta[j]>TINY) break; if(j>=*p) break; for(k=0; k<*n; k++) diff[k]=X[k][i]-X[k][j]; effcm(n, diff, wt, theta+i, theta+j, fitted); i=j; } while(1); free(diff); } void alg_mix(int *n, int *p, double *Xvec, double *wt, double *theta, int *iter, int *alg){ /*algorithms for maximizing sum_i wt_i*log(sum_j X_ij*theta_j) X (n by p) stored row-wise as Xvec wt[i]: number of replica of observation i iter: maximum number of iterations alg=1: EM alg=2: VEM alg=3: VDM+NNE alg=4: cocktail algorithm VDM + nearest neighbor exchanges + EM (recommended) essential: a good ordering of the columns of X (mixture components) theta: NPMLE (output) */ int i, j, conv=0; double *fitted, **X; fitted=(double *)malloc(sizeof(double)*(*n)); X=(double **)malloc(sizeof(double *)*(*n)); for(i=0; i<*n; i++) X[i]=Xvec+i*(*p); for(i=0; i<*n; i++){ for(fitted[i]=0.0, j=0; j<*p; j++){ if(X[i][j]<0.0){ conv=-1; break; } fitted[i]+=X[i][j]*theta[j]; } if(conv==-1||fitted[i]=0) fitted[i]-=F[j]; } } void sp_fitted(int *n, int *p, int *l, int *u, double *theta, double *fitted){ /*calculate fitted values*/ int i, j; double *F; F=(double *)malloc(sizeof(double)*(*p)); for(F[0]=theta[0], i=1; i<*p; i++) F[i]=F[i-1]+theta[i]; F_fitted(n, l, u, F, fitted); free(F); } void calc_der(int *p, double *wt, int **ll, int **uu, double *fitted, double *der){ /*computes the directional derivative + sum_i wt_i*/ int i, j, k; double tmp; for(tmp=0.0, j=0; j<*p; j++){ for(i=1; i<=ll[j][0]; i++){ k=ll[j][i]; tmp+=wt[k]/fitted[k]; } for(i=1; i<=uu[j][0]; i++){ k=uu[j][i]; tmp-=wt[k]/fitted[k]; } der[j]=tmp; } } void sp_em(int *n, int *p, int *l, int *u, double *wt, int **ll, int **uu, double *theta, double *fitted, int *conv){ /*EM iteration for NPMLE for interval censored data*/ int j; double swt, tmp, upper, *der; der=(double *)malloc(sizeof(double)*(*p)); for(swt=0.0, j=0; j<*n; j++) swt+=wt[j]; calc_der(p, wt, ll, uu, fitted, der); for(upper=swt, j=0; j<*p; j++){ if(upperk2){ j=k2; k2=k1; k1=j; } for(len=0, i=k1+1; i<=k2; i++){ for(j=1; j<=ll[i][0]; j++) if(u[ll[i][j]]>=k2){ inds[len]=ll[i][j]; diff[len]=-1.0; len++; } for(j=1; j<=uu[i][0]; j++) if(l[uu[i][j]]<=k1){ inds[len]=uu[i][j]; diff[len]=1.0; len++; } } for(i=0; i=l[j]&&i<=u[j])-fitted[j]; effcm(n, der, wt, &t1, &t2, fitted); for(j=0; j<*p; j++) theta[j]*=t2; theta[i]+=t1; free(der); if(upper<=SMALL+swt) *conv=1; else *conv=0; } void sp_vem(int *n, int *p, int *l, int *u, double *wt, int **ll, int **uu, double *theta, double *fitted, int *conv){ /*VEM iteration for NPMLE for interval censored data*/ int i, j, k; double swt, lower, upper, *der; der=(double *)malloc(sizeof(double)*(*p)); for(swt=0.0, i=0; i<*n; i++) swt+=wt[i]; calc_der(p, wt, ll, uu, fitted, der); for(lower=LARGE, upper=swt, i=k=j=0; j<*p; j++){ if(theta[j]>0.0&&lower>der[j]){ lower=der[j]; i=j; } if(upperTINY) break; do{ for(j=i+1; j<*p; j++) if(theta[j]>TINY) break; if(j>=*p) break; sp_cm(n, l, u, wt, ll, uu, theta, &i, &j, fitted); i=j; } while(1); } void alg_npmle(int *n, int *p, int *l, int *u, double *wt, double *theta, int *iter, int *alg){ /*algorithms for maximizing sum_i wt_i*log(sum_j X_ij*theta_j) X_ij=1 if l_i<=j<=u_i, and 0 otherwise (caution: j goes from 0 to p-1) n: number of distinct observations p: number of time points wt[i]: number of replica of observation i iter: maximum number of iterations alg=1: EM alg=2: VEM alg=3: VDM+NNE alg=4: cocktail algorithm VDM + nearest neighbor exchanges + EM (recommended) theta: NPMLE (output) */ int i, j, nrow, ncol, conv=0, **ll, **uu; double *fitted; for(i=0; i<*n; i++) if(l[i]<0||l[i]>u[i]||u[i]>=*p){ Rprintf("ERROR: data badly prepared\n"); return; } fitted=(double *)malloc(sizeof(double)*(*n)); sp_fitted(n, p, l, u, theta, fitted); for(i=0; i<*n; i++) if(fitted[i]