import java.io.*; import java.util.*; import java.text.*; import Jama.*; /** * Partition - k-means partitioning using exchange method

* Uses: PCAcorr - Principal Components Analysis on correlations

* Jama Matrix class package, "JAMA: A Java Matrix Package" is used.
* See: http://math.nist.gov/javanumerics/jama

* Example of use:

* javac partition.java
* java partition iris0.dat > * partoutput.txt

* Format of input data set: *

* Outputs produced: * * Changes in 2003 Nov. * * * Version: 2003 Nov. 14
* Author: F. Murtagh, f.murtagh@qub.ac.uk * @version 2003 Nov. 14 * @author F. Murtagh, f.murtagh@qub.ac.uk */ public class partition { public static final double MAXVAL = 1.0e12; public static final double R = 0.999; // See Spaeth, p. 103 public static void main (String argv[]) { PrintStream out = System.out; try{ //------------------------------------------------------------------- if (argv.length == 0) { System.out.println(" Syntax: java partition infile.dat "); System.out.println(" Input file format: "); System.out.println(" Line 1: integer no. rows, no. cols., no clus."); System.out.println(" Successive lines: matrix values, floating"); System.out.println(" Read in row-wise"); System.exit (1); } System.out.println (" K-Means using Spaeth's exchange algorithm,"); System.out.println (" initialized using first PC projections."); String filname = argv[0]; System.out.println (" Input file name: " + filname); // Open the matrix file FileInputStream is = new FileInputStream(filname); BufferedReader bis = new BufferedReader(new InputStreamReader(is)); StreamTokenizer st = new StreamTokenizer(bis); // Row and column sizes, read in first st.nextToken(); int n = (int)st.nval; st.nextToken(); int m = (int)st.nval; st.nextToken(); int k = (int)st.nval; System.out.println(" No. of rows, n = " + n); System.out.println(" No. of columns, m = " + m); System.out.println(" No. of clusters, k = " + k); // Input array, values to be read in successively, float double[][] indat = new double[n][m]; double inval; // New read in input array values, successively for (int i = 0; i < n; i++) { for (int j = 0; j < m; j++) { st.nextToken(); inval = (double)st.nval; indat[i][j] = inval; } } // Data preprocessing - standardization and determining correlations double[][] indatstd = Standardize(n, m, indat); // use Jama matrix class Matrix X = new Matrix(indatstd); // Sums of squares and cross-products matrix Matrix Xprime = X.transpose(); Matrix SSCP = Xprime.times(X); // Eigen decomposition EigenvalueDecomposition evaldec = SSCP.eig(); Matrix evecs = evaldec.getV(); // evecs contains the cols. ordered right to left // Evecs is the more natural order with cols. ordered left to right // So to repeat: leftmost col. of Evecs is assoc with largest Evals // Evecs ordered from left to right // reverse order of Matrix evecs into Matrix Evecs double[][] tempold = evecs.getArray(); double[][] tempnew = new double[m][m]; for (int j1 = 0; j1 < m; j1++) { for (int j2 = 0; j2 < m; j2++) { tempnew[j1][j2] = tempold[j1][m - j2 - 1]; } } Matrix Evecs = new Matrix(tempnew); // Evecs.print(10,4); // Projections - for rows // Row projections in new space, X U Dims: (n x m) x (m x m) Matrix rowproj = X.times(Evecs); // rowproj.print(10,4); //------------------------------------------------------------------- // Now k-means partitioning using exchange method double[] tempvec = new double[n]; double[] rproj = new double[n]; tempold = rowproj.getArray(); for (int i = 0; i < n; i++) { tempvec[i] = tempold[i][0]; rproj[i] = tempold[i][0]; } inSort(tempvec); // Sort in place, modifying values. // printVect(tempvec, 4, 10); // First choose breakpoints int[] breakindexes = new int[k+1]; double[] breakpoints = new double[k+1]; breakpoints[0] = tempvec[0] - 0.1; //Offset so that strict ">" used later breakpoints[k] = tempvec[n-1]; for (int i = 1; i < k; i++) { breakindexes[i] = i*n/k; breakpoints[i] = tempvec[breakindexes[i]]; } // printVect(breakpoints, 4, 10); double[][] gpmeans = new double[k][m]; double[] cardinality = new double[k]; int[] assignment = new int[n]; for (int i = 0; i < k-1; i++) { cardinality[i] = 0.0; for (int j = 0; j < m-1; j++) { gpmeans[i][j] = 0.0; } } for (int icl = 0; icl < k; icl++) { // lo, hi (resp.) are breakpoints[i], breakpoints[i+1] // cluster = icl, with values 0, 1, 2, ... k-1 for (int i = 0; i < n; i++) { // Not terribly efficient - fix later if ( (rproj[i] > breakpoints[icl]) && (rproj[i] <= breakpoints[icl+1]) ) { assignment[i] = icl; cardinality[icl] += 1.0; for (int j = 0; j < m; j++) { gpmeans[icl][j] += indat[i][j]; } } } for (int j=0; j * Based on ER Harold, "Java I/O", O'Reilly, around p. 473. * @param n1 row dimension of matrix * @param n2 column dimension of matrix * @param m input matrix values * @param d display precision, number of decimal places * @param w display precision, total width of floating value */ public static void printMatrix(int n1, int n2, double[][] m, int d, int w) { // Some definitions for handling output formating NumberFormat myFormat = NumberFormat.getNumberInstance(); FieldPosition fp = new FieldPosition(NumberFormat.INTEGER_FIELD); myFormat.setMaximumIntegerDigits(d); myFormat.setMaximumFractionDigits(d); myFormat.setMinimumFractionDigits(d); for (int i=0; i * Based on ER Harold, "Java I/O", O'Reilly, around p. 473. * @param m input vector of length m.length * @param d display precision, number of decimal places * @param w display precision, total width of floating value */ public static void printVect(double[] m, int d, int w) { // Some definitions for handling output formating NumberFormat myFormat = NumberFormat.getNumberInstance(); FieldPosition fp = new FieldPosition(NumberFormat.INTEGER_FIELD); myFormat.setMaximumIntegerDigits(d); myFormat.setMaximumFractionDigits(d); myFormat.setMinimumFractionDigits(d); int len = m.length; for (int i=0; i * Based on ER Harold, "Java I/O", O'Reilly, around p. 473. * @param m input vector of length m.length * @param d display precision, number of decimal places * @param w display precision, total width of floating value */ public static void printVect(int[] m, int d, int w) { // Some definitions for handling output formating NumberFormat myFormat = NumberFormat.getNumberInstance(); FieldPosition fp = new FieldPosition(NumberFormat.INTEGER_FIELD); myFormat.setMaximumIntegerDigits(d); int len = m.length; for (int i=0; i * Note the formalas used (since these vary between implementations):
* reduction: (vect - meanvect)/sqrt(nrow)*colstdev
* colstdev: sum_cols ((vect - meanvect)^2/nrow)
* if colstdev is close to 0, then set it to 1. * @param nrow number of rows in input matrix * @param ncol number of columns in input matrix * @param A input matrix values */ public static double[][] Standardize(int nrow, int ncol, double[][] A) { double[] colmeans = new double[ncol]; double[] colstdevs = new double[ncol]; // Adat will contain the standardized data and will be returned double[][] Adat = new double[nrow][ncol]; double[] tempcol = new double[nrow]; double tot; // Determine means and standard deviations of variables/columns for (int j=0; j 0 && invect[i] > a) { CORRECTED! while (i >= 0 && invect[i] > a) { invect[i+1] = invect[i]; i--; } invect[i+1] = a; } // Return type void } // inSort } // PCAcorr