import java.io.*; import java.util.*; import java.text.*; import Jama.*; /** * PCAcorr - Principal Components Analysis on correlations

* Jama Matrix class package, "JAMA: A Java Matrix Package" is used.
* See: http://math.nist.gov/javanumerics/jama

* Example of use:

* javac PCAcorr.java
* java PCAcorr iris.dat > * pcaoutput.txt

* Format of input data set: *

integer row and column dimensions, *
followed by floating values *
which are read row-wise. *

* Outputs produced: *

Echo of input file name, input dimensions, sample of data *
Variable means and standard deviations *
Correlation matrix to be diagonalized *
Eigenvectors *
Eigenvalues and as cumulative percentages *
Row projections in new principal component space *
Column projections in new principal component space *

* Version: 2002 Aug. 15
* Author: F. Murtagh, f.murtagh@qub.ac.uk * @version 2002 Aug 15 * @author F. Murtagh, f.murtagh@qub.ac.uk */ public class PCAcorr{ public static void main (String argv[]) { PrintStream out = System.out; try{ //------------------------------------------------------------------- if (argv.length == 0) { System.out.println(" Syntax: java PCAcorr infile.dat "); System.out.println(" Input file format: "); System.out.println(" Line 1: integer no. rows, no. cols."); System.out.println(" Successive lines: matrix values, floating"); System.out.println(" Read in row-wise"); System.exit (1); } String filname = argv[0]; System.out.println (" Input file name: " + filname); // Open the matrix file FileInputStream is = new FileInputStream(filname); BufferedReader bis = new BufferedReader(new InputStreamReader(is)); StreamTokenizer st = new StreamTokenizer(bis); // Row and column sizes, read in first st.nextToken(); int n = (int)st.nval; st.nextToken(); int m = (int)st.nval; System.out.println(" No. of rows, n = " + n); System.out.println(" No. of cols, m = " + m); // Input array, values to be read in successively, float double[][] indat = new double[n][m]; double inval; // New read in input array values, successively System.out.println (" Input data sample follows as a check, first 4 values."); for (int i = 0; i < n; i++) { for (int j = 0; j < m; j++) { st.nextToken(); inval = (double)st.nval; indat[i][j] = inval; if (i < 2 && j < 2) { System.out.println(" value = " + inval); } } } System.out.println(); //------------------------------------------------------------------- // Data preprocessing - standardization and determining correlations double[][] indatstd = Standardize(n, m, indat); // use Jama matrix class Matrix X = new Matrix(indatstd); // Sums of squares and cross-products matrix Matrix Xprime = X.transpose(); Matrix SSCP = Xprime.times(X); // Note the following: // - with no preprocessing of the input data, we have an SSCP matrix // - with centering of columns (i.e. each col. has col. mean // [vector in row-space] subtracted) we have variances/covariances // - with centering and reduction to unit variance [i.e. centered // cols. are divided by std. dev.] we have correlations // Note: the current version supports correlations only // Print out SSCP matrix // Parameters: floating value width, and no. of decimal places // Comment out this line for large data sets, and/or diff. precision System.out.println(); System.out.println (" SSCP or sums-of-squares and cross-products matrix:"); System.out.println (" (Note: correlations in this implementation)"); SSCP.print(6,2); //------------------------------------------------------------------- // Eigen decomposition EigenvalueDecomposition evaldec = SSCP.eig(); Matrix evecs = evaldec.getV(); double[] evals = evaldec.getRealEigenvalues(); // print out eigenvectors System.out.println (" Eigenvectors (leftmost col <--> largest eval):"); // evecs contains the cols. ordered right to left // Evecs is the more natural order with cols. ordered left to right // So to repeat: leftmost col. of Evecs is assoc with largest Evals // Evals and Evecs ordered from left to right double tot = 0.0; for (int j = 0; j < evals.length; j++) { tot += evals[j]; } // reverse order of evals into Evals double[] Evals = new double[m]; for (int j = 0; j < m; j++) { Evals[j] = evals[m - j - 1]; } // reverse order of Matrix evecs into Matrix Evecs double[][] tempold = evecs.getArray(); double[][] tempnew = new double[m][m]; for (int j1 = 0; j1 < m; j1++) { for (int j2 = 0; j2 < m; j2++) { tempnew[j1][j2] = tempold[j1][m - j2 - 1]; } } Matrix Evecs = new Matrix(tempnew); Evecs.print(10,4); System.out.println(); System.out.println (" Eigenvalues and as cumulative percentages:"); double runningtotal = 0.0; double[] percentevals = new double[m]; for (int j = 0; j < Evals.length; j++) { percentevals[j] = runningtotal + 100.0*Evals[j]/tot; runningtotal = percentevals[j]; } printVect(Evals, 4, 10); printVect(percentevals, 4, 10); //------------------------------------------------------------------- // Projections - row, and col // Row projections in new space, X U Dims: (n x m) x (m x m) System.out.println(); System.out.println(" Row projections in new principal component space:"); Matrix rowproj = X.times(Evecs); rowproj.print(10,4); // Col projections (X'X) U (4x4) x4 And col-wise div. by sqrt(evals) Matrix colproj = SSCP.times(Evecs); // We need to leave colproj Matrix class and instead use double array double[][] ynew = colproj.getArray(); for (int j1 = 0; j1 < m; j1++) { for (int j2 = 0; j2 < m; j2++) { if (Evals[j2] > 0.00005) { ynew[j1][j2] = ynew[j1][j2]/Math.sqrt(Evals[j2]); } if (Evals[j2] <= 0.00005) { ynew[j1][j2] = 0.0; } } } System.out.println (); System.out.println (" Column projections in new princ. comp. space."); printMatrix(m, m, ynew, 4, 8); //------------------------------------------------------------------- // That's it. } catch (IOException e) { out.println("error: " + e); System.exit(1); } } // end of main //------------------------------------------------------------------- // Little method for helping in output formating public static String getSpaces(int n) { StringBuffer sb = new StringBuffer(n); for (int i = 0; i < n; i++) sb.append(' '); return sb.toString(); } // getSpaces //------------------------------------------------------------------- /** * Method for printing a matrix
* Based on ER Harold, "Java I/O", O'Reilly, around p. 473. * @param n1 row dimension of matrix * @param n2 column dimension of matrix * @param m input matrix values * @param d display precision, number of decimal places * @param w display precision, total width of floating value */ public static void printMatrix(int n1, int n2, double[][] m, int d, int w) { // Some definitions for handling output formating NumberFormat myFormat = NumberFormat.getNumberInstance(); FieldPosition fp = new FieldPosition(NumberFormat.INTEGER_FIELD); myFormat.setMaximumIntegerDigits(d); myFormat.setMaximumFractionDigits(d); myFormat.setMinimumFractionDigits(d); for (int i=0; i * Based on ER Harold, "Java I/O", O'Reilly, around p. 473. * @param m input vector of length m.length * @param d display precision, number of decimal places * @param w display precision, total width of floating value */ public static void printVect(double[] m, int d, int w) { // Some definitions for handling output formating NumberFormat myFormat = NumberFormat.getNumberInstance(); FieldPosition fp = new FieldPosition(NumberFormat.INTEGER_FIELD); myFormat.setMaximumIntegerDigits(d); myFormat.setMaximumFractionDigits(d); myFormat.setMinimumFractionDigits(d); int len = m.length; for (int i=0; i * Note the formalas used (since these vary between implementations):
* reduction: (vect - meanvect)/sqrt(nrow)*colstdev
* colstdev: sum_cols ((vect - meanvect)^2/nrow)
* if colstdev is close to 0, then set it to 1. * @param nrow number of rows in input matrix * @param ncol number of columns in input matrix * @param A input matrix values */ public static double[][] Standardize(int nrow, int ncol, double[][] A) { double[] colmeans = new double[ncol]; double[] colstdevs = new double[ncol]; // Adat will contain the standardized data and will be returned double[][] Adat = new double[nrow][ncol]; double[] tempcol = new double[nrow]; double tot; // Determine means and standard deviations of variables/columns for (int j=0; j