#include #include #include #define MAX_SEGMENTS 40 /* Check sd as follows gcc gmm-bic.c -lm -o gmm-bic gmm-bic 3 749 oil.dat Segment, Mu, Orig-Mu, SD, Mix.Prob. Card. 1 , 21.003319 , 17.599658 , 6.629804, 0.119210, 105 2 , 78.266766 , 19.807108 , 33.210456, 0.679207, 502 3 , 190.457096 , 24.131935 , 32.736168, 0.201583, 142 In S-Plus (oil.dat = input data, yyyy = cut and paste of cluster assignments from standard output device): a <- scan('oil.dat') yyyy <- scan('yyyy') (Labels in yyyy) mean(a[yyyy==1]) 17.58229 mean(a[yyyy==2]) 19.92508 mean(a[yyyy==3]) 24.34845 sqrt(var(a[yyyy==1])) 0.2079167 sqrt(var(a[yyyy==2])) 1.21213 sqrt(var(a[yyyy==3])) 1.069975 For BIC: 1 -8282 2 -8150 3 -8097 <--- 4 -8101 5 -8118 6 -8136 */ /* ************************************************************ gmm-bic.c Mixture model fitting, for a given number of Gaussian clusters. Output including BIC value to standard output. Algorithm: determine a 256-length histogram of input data. Use EM (expectation-maximization) to find MLE solution (maximum likelihood estimators of mixture model parameters). Usage gcc gmm-bic.c -lm -o gmm-bic gcc-bic [writes out syntax] gmm-bic 4 749 oil.dat FM, 2003/4 ******************************************************************* */ main(argc, argv) int argc; char *argv[]; { FILE *stream; void free_vector(), erhand(); float Normi(), Normr(), *in_vector, *vector(), *labs, datamin, datamax; float *orig_vector, p, pmax, ival, mlikedenom, in_value; double mu[MAX_SEGMENTS], sd[MAX_SEGMENTS]; double loglik_old, loglik_new, conv_crit, segment_probs[MAX_SEGMENTS]; double dsegment_number, segment_probs_total, denom[256]; double mu_numer, mu_denom, sd_numer, sd_denom, loglik_sum; double dnorm_lookup[256][MAX_SEGMENTS], delta_lookup[256][MAX_SEGMENTS]; int i, j, k, iseg, segment_number, n; int iter_counter, max_iterations, current_breakpoint, current_total, done; int greyscale_histogram[256], initial_breakpoints[(MAX_SEGMENTS+1)]; int sd_counter[MAX_SEGMENTS], mu_counter[MAX_SEGMENTS]; int empty_levels, initialization_multiplier, initialization_error; int cardinality[MAX_SEGMENTS]; if (argc != 4) { printf("Syntax: gmm-bic no_of_clusters no_of_vals input_file\n"); printf("Max no. of clusters currently supported: 40\n"); exit(1); } segment_number = atoi(argv[1]); printf("Number of clusters wanted: %d.\n",segment_number); n = atoi(argv[2]); printf("Number of input data values: %d.\n",n); if ((stream = fopen(argv[3],"r")) == NULL) { fprintf(stderr, "Program %s : cannot open file %s\n", argv[0], argv[3]); fprintf(stderr, "Exiting to system."); exit(1); } orig_vector = vector(0,n-1); /* Input data */ in_vector = vector(0,n-1); /* Input data */ labs = vector(0,n-1); /* Used later for cluster labels */ for (i = 0; i < n; i++) { fscanf(stream, "%f", &in_value); orig_vector[i] = in_value; } printf("Check on first 10 values of input data:\n"); for (i = 0; i < 10; i++) printf("%5.1f", orig_vector[i]); /* Transform data */ for (i = 0; i < n-1; i++) in_vector[i] = log(orig_vector[i+1])-log(orig_vector[i]); /* in_vector[i] = log(orig_vector[i+1] - orig_vector[i]); */ n = n - 1; /* One value less if we are working on differences */ datamin = 1.0E30; datamax = -1.0E30; for (i = 0; i < n; i++) { if (in_vector[i] < datamin) datamin = in_vector[i]; if (in_vector[i] > datamax) datamax = in_vector[i]; } printf("\nMin and max values = %f, %f\n", datamin, datamax); /* delta_lookup is prob of a datum (we're using histogram) with value j being in segment i. mu, sd, and segment_probs are the estimated mean, standard deviation, and mixture probability for each segment. */ loglik_old = 0; /* Some initializations */ loglik_new = 1; max_iterations = 200; dsegment_number = segment_number; /* explicit type conversion */ for (i=0; i= (n/segment_number) ) { initial_breakpoints[i] = current_breakpoint; done = 1; } } if (current_total==0) initialization_error+=1; } initialization_multiplier = 1; while (initialization_error > 0) { initial_breakpoints[0] = -1; initial_breakpoints[segment_number] = 256; current_breakpoint = -1; initialization_error = 0; initialization_multiplier = initialization_multiplier*2; for (i=1; i<(segment_number); i++) { current_total = 0; done = 0; while (done==0) { current_breakpoint += 1; if (current_breakpoint==255) initialization_error+=1; current_total += greyscale_histogram[current_breakpoint]; if (initialization_multiplier == 0) { printf("Pathological case in histogram. Stopping.\n"); exit(1); } if (initialization_multiplier >= 4096) { printf("Pathological case in histogram. Stopping.\n"); exit(1); } if (current_total >= (int)((float)n/ ( (float)initialization_multiplier * (float)segment_number ))) { initial_breakpoints[i] = current_breakpoint; done = 1; } } if (current_total==0) initialization_error+=1; } } /* Check here that the init breaks are valid. If not, probably trying to fit more segments than unique greyscale values (or non-zero histogram bins). */ for (i=0; i<(segment_number); i++) { for (j=0; j<256; j++) { if ( (j>initial_breakpoints[i]) && (j<= initial_breakpoints[(i+1)])) { delta_lookup[j][i] = 1.0; } else delta_lookup[j][i] = 0.0; } } /* End of initializing delta_lookup*/ /* Initialize mu to the starting guess */ for (i=0; i1.0) conv_crit=1.0; printf("Convergence criterion %f \n",conv_crit); /* Main while loop ************* */ while ( (fabs(loglik_new-loglik_old) > (float)conv_crit) || (iter_counter==0) ) { iter_counter += 1; /* E-step */ /* Only need to calculate the Gaussian probs. for values 0-255 for each segment*/ for (i=0; i0) delta_lookup[j][i] = dnorm_lookup[j][i]/denom[j]; else delta_lookup[j][i] = 0; } } /* M-step */ if (fmod(iter_counter,1000)==0) printf("Another 1000 iterations...\n"); /* update segment_probs */ for (i=0; i0) loglik_new += greyscale_histogram[j]*log(loglik_sum); /* Otherwise, greyscale_histogram will be zero, so add zero. */ } /* printf("Loglikelihood: %24f \n",loglik_new); printf("Change: %24f \n",(loglik_new-loglik_old)); printf("BIC: %14f \n",(2*loglik_new-((3*segment_number)-1)*log(w*h))); */ } /* End of while loop. EM has converged (or diverged beyond precision)*/ /* Classify each pixel into its most likely segment */ printf("\n"); printf("Loglikelihood: %24f \n",loglik_new); /* printf("Change: %24f \n",(loglik_new-loglik_old)); */ printf("BIC: %14.0f No.clusters: %d \n", (2*loglik_new - ((3*segment_number)-1)*log((float)n)), segment_number); printf("\n"); printf("Now determining posteriors, for each pixel.\n"); /* Determine posteriors. By Bayes: prob(seg-k | ival) = f(ival | seg-k) . prob(seg-k) / sum_r f(ival | seg-r) . prob(seg-r) where LHS is posterior probability of seg-k given the data, ival, f(ival | seg-k) is the integrated or marginal likelihood of seg k prob(seg-k) is prior probability */ for (k = 0; k < segment_number; k++) cardinality[k] = 0; for (i=0; i < n; i++) { labs[i] = 0; ival = in_vector[i]; pmax = 0.0; iseg = 0; mlikedenom = 0.0; for (k=0; k pmax) { pmax = p; iseg = k; } } /* Increment cluster numbers by one so sequence is 1, 2, ... */ labs[i] = iseg + 1; cardinality[iseg] += 1; } /* For convenience, redo stdevs from original data */ for (j=0; j