/*--------------------------------------------------- * file: topicbomb * purpose: run standard topic model * inputs: T (number of topics), iter (number of iterations), docword.txt * outputs: wp.txt, dp.txt, z.txt * version: 1.0 * author: newman@uci.edu * date: 2009 module load gcc/4.4.0 gcc -DNOOMP -Wall -O3 -o topicbomb1 topicbomb.c gcc -DNOOMP -DALPHA -I. -L. -Wall -O3 -o topicbomb1a topicbomb.c -lgsl -lm gcc -Wall -O3 -fopenmp -o topicbomb4 topicbomb.c gcc -DALPHA -I. -L. -Wall -O3 -fopenmp -o topicbomb4a topicbomb.c -lgsl -lm export OMP_NUM_THREADS=4 *-------------------------------------------------*/ #include #include #include #include #include #ifdef ALPHA #include #endif typedef short int myint; #define TMAX 2100 #define BETA 0.01 #define MAX(a,b) ((a) > (b) ? (a) : (b)) #define MIN(a,b) ((a) < (b) ? (a) : (b)) int *ivec(int n); double *dvec(int n); myint **mymat(int nr, int nc); double **dmat(int nr, int nc); int countntot(char *fname); void read_dw(char *fname, int *d, int *w, int *D, int *W); void write_ivec (int n, int *x, char *fname); void write_dvec (int n, double *x, char *fname); void chkdsum (int n, int T, double **x, double *sumx); void chkmysum(int n, int T, myint **x, double *sumx); void read_ivec (int n, int *x, char *fname); void read_dvec (int n, double *x, char *fname); int imax(int n, int *x); void alpha_fixed_point(int T, int D, myint **Ndt, int *Nd, double *alpha); double dmax(int n, double *x); double dmin(int n, double *x); double dsum(int n, double *x); /*========================================== * main *========================================== */ int main(int argc, char* argv[]) { int N; // number of words in corpus int D; // number of docs int W; // number of unique words int T; // number of topics int i, t, iter, istart, idump, iend, seed, learnalpha, ichk; int *w, *d, *z, *Nd; double **Nwt, *Nt, *alpha, WBETA; myint **Ndt; // majority of storage if T>>L char fname[128]; if (argc == 1) { fprintf(stderr, "usage: %s istart idump iend T seed learnalpha\n", argv[0]); exit(-1); } istart=atoi(argv[1]); assert(istart>-1); idump =atoi(argv[2]); assert(idump>0); iend =atoi(argv[3]); assert(iend>istart); T =atoi(argv[4]); assert(T>0); assert(T0); srand48(seed); learnalpha=atoi(argv[6]); // if learnalpha==0 alpha=0.05*N/(D*T) // allocate memory N = countntot("docword.txt"); d = ivec(N); w = ivec(N); z = ivec(N); read_dw("docword.txt", d, w, &D, &W); Ndt = mymat(D,T); Nwt = dmat(W,T); Nt = dvec(T); Nd = ivec(D); alpha = dvec(T); WBETA = W*BETA; // read in z.txt alpha.txt, or randomly initialize if (istart>0) { sprintf(fname,"z.iter%d.txt",istart); read_ivec(N,z,fname); ichk=imax(N,z)+1; assert(T==ichk); sprintf(fname,"alpha.iter%d.txt",istart); read_dvec(T,alpha,fname); for(t=0;t0); } else { for(i=0;icumprob) { t++; cumprob+=p[t]; } z[i]=t; Ndt[did][t]++; Nwt[wid][t]++; Nt[t]++; } #else // over NOOMP #pragma omp parallel default(none) shared(N,T,w,d,z,Ndt,Nwt,Nt,alpha) { #pragma omp sections { #pragma omp section { int i,wid,did,t; double Z,p[TMAX],u,cumprob,tot[TMAX]; memcpy(tot,Nt,T*sizeof(double)); for (i=0*N/4; i<1*N/4; i++) { wid=w[i]; did=d[i]; t=z[i]; Ndt[did][t]--; Nwt[wid][t]--; tot[t]--; for (t=0, Z=0; tcumprob) { t++; cumprob+=p[t]; } z[i]=t; Ndt[did][t]++; Nwt[wid][t]++; tot[t]++; } } #pragma omp section { int i,wid,did,t; double Z,p[TMAX],u,cumprob,tot[TMAX]; memcpy(tot,Nt,T*sizeof(double)); for (i=1*N/4; i<2*N/4; i++) { wid=w[i]; did=d[i]; t=z[i]; Ndt[did][t]--; Nwt[wid][t]--; tot[t]--; for (t=0, Z=0; tcumprob) { t++; cumprob+=p[t]; } z[i]=t; Ndt[did][t]++; Nwt[wid][t]++; tot[t]++; } } #pragma omp section { int i,wid,did,t; double Z,p[TMAX],u,cumprob,tot[TMAX]; memcpy(tot,Nt,T*sizeof(double)); for (i=2*N/4; i<3*N/4; i++) { wid=w[i]; did=d[i]; t=z[i]; Ndt[did][t]--; Nwt[wid][t]--; tot[t]--; for (t=0, Z=0; tcumprob) { t++; cumprob+=p[t]; } z[i]=t; Ndt[did][t]++; Nwt[wid][t]++; tot[t]++; } } #pragma omp section { int i,wid,did,t; double Z,p[TMAX],u,cumprob,tot[TMAX]; memcpy(tot,Nt,T*sizeof(double)); for (i=3*N/4; i<4*N/4; i++) { wid=w[i]; did=d[i]; t=z[i]; Ndt[did][t]--; Nwt[wid][t]--; tot[t]--; for (t=0, Z=0; tcumprob) { t++; cumprob+=p[t]; } z[i]=t; Ndt[did][t]++; Nwt[wid][t]++; tot[t]++; } } } // over sections } // over omp parallel // initialize smoothing for(t=0;t99 && iter%10==0 && learnalpha>0) alpha_fixed_point(T,D,Ndt,Nd,alpha); // dump z.iter.txt and alpha.iter.txt if (iter>istart && iter%idump==0) { sprintf(fname,"z.iter%d.txt",iter); write_ivec(N,z,fname); sprintf(fname,"alpha.iter%d.txt",iter); write_dvec(T,alpha,fname); } printf("iter %d \n", iter); } // over iter // final dump z.iter.txt and alpha.iter.txt sprintf(fname,"z.iter%d.txt",iter); write_ivec(N,z,fname); sprintf(fname,"alpha.iter%d.txt",iter); write_dvec(T,alpha,fname); return 0; } /***********************************************************/ /***********************************************************/ /***********************************************************/ int *ivec(int n) // { int *x = (int*)calloc(n,sizeof(int)); assert(x); return x; } double *dvec(int n) // { double *x = (double*)calloc(n,sizeof(double)); assert(x); return x; } myint **mymat(int nr, int nc) // { int ntot = nr*nc; myint *tmp = (myint*) calloc(ntot,sizeof(myint)); myint **x = (myint**)calloc(nr,sizeof(myint*)); int r; assert(tmp); assert(x); for (r = 0; r < nr; r++) x[r] = tmp + nc*r; return x; } double **dmat(int nr, int nc) // { int ntot = nr*nc; double *tmp = (double*) calloc(ntot,sizeof(double)); double **x = (double**)calloc(nr,sizeof(double*)); int r; assert(tmp); assert(x); for (r = 0; r < nr; r++) x[r] = tmp + nc*r; return x; } int countntot(char *fname) // { int i, count, ntot = 0; char buf[BUFSIZ]; FILE *fp = fopen(fname ,"r"); assert(fp); for (i = 0; i < 3; i++) fgets(buf, BUFSIZ, fp); // skip 3 header lines while (fscanf(fp, "%*d%*d%d", &count) != EOF) ntot += count; fclose(fp); assert(ntot>0); return ntot; } void read_dw(char *fname, int *d, int *w, int *D, int *W) // { int i,wt,dt,ct,count,nnz; FILE *fp = fopen(fname ,"r"); assert(fp); count = 0; fscanf(fp,"%d", D); assert(*D>0); fscanf(fp,"%d", W); assert(*W>0); fscanf(fp,"%d", &nnz); assert(nnz>0); while (fscanf(fp, "%d%d%d", &dt, &wt, &ct) != EOF) { for (i = count; i < count+ct; i++) { w[i] = wt-1; d[i] = dt-1; } count += ct; } fclose(fp); } void write_ivec (int n, int *x, char *fname) // { FILE *fp = fopen(fname,"w"); int i; assert(fp); for (i = 0; i < n; i++) fprintf(fp, "%d\n", x[i]+1 ); fclose(fp); } void write_dvec (int n, double *x, char *fname) // { FILE *fp = fopen(fname,"w"); int i; assert(fp); for (i = 0; i < n; i++) fprintf(fp, "%.9e\n", x[i] ); fclose(fp); } void chkdsum (int n, int T, double **x, double *sumx) // { int i, t; double sum; for (t = 0; t < T; t++) { sum = 0.0; for (i = 0; i < n; i++) sum += x[i][t]; assert(fabs(sum-sumx[t])<1e-6); } } void chkmysum (int n, int T, myint **x, double *sumx) // { int i, t; double sum; for (t = 0; t < T; t++) { sum = 0.0; for (i = 0; i < n; i++) sum += x[i][t]; assert(fabs(sum-sumx[t])<1e-6); } } void read_ivec (int n, int *x, char *fname) // { FILE *fp = fopen(fname,"r"); int i; assert(fp); for (i = 0; i < n; i++) { fscanf(fp, "%d", x+i ); x[i]--; } fclose(fp); } void read_dvec (int n, double *x, char *fname) // { FILE *fp = fopen(fname,"r"); int i; assert(fp); for (i = 0; i < n; i++) { fscanf(fp, "%lf", x+i ); } fclose(fp); } int imax(int n, int *x) // { int i, xmax=x[0]; for(i=0;ixmax) xmax=x[i]; return xmax; } double dmax(int n, double *x) // { int i; double xmax=x[0]; for (i = 0; i < n; i++) xmax = MAX(xmax,x[i]); return xmax; } double dmin(int n, double *x) // { int i; double xmin=x[0]; for (i = 0; i < n; i++) xmin = MIN(xmin,x[i]); return xmin; } double dsum(int n, double *x) // { int i; double xsum=0; for (i = 0; i < n; i++) xsum += x[i]; return xsum; } void alpha_fixed_point(int T, int D, myint **Ndt, int *Nd, double *alpha) { #ifdef ALPHA double err=1,tmpnum,tmpden,sumalpha=0,psi_alpha_t; double *alpha_last = dvec(T); int t, d, loop=0; while(err>1e-6 && loop++<100) { memcpy(alpha_last,alpha,T*sizeof(double)); for(t=0;t