import java.text.*; import java.io.*; /** * Carry out pairwise agglomerations. For n items, therefore there * are n-1 agglomerations. Represent the cluster labels is an nxn * cluster label matrix. Column no. n will be the singleton labels, * 1 to n. Column no. n-1 will have n-1 unique values (or label sequence * numbers). Column no. n-2 will have n-2 unique values. Column no. 1 * will have the value 1 only, implying that all n items are in one * cluster. *

* ClustMat is our agglomeration "engine". It looks after labeling * only, and is independent of any agglomerative clustering criterion. *

* Other utility methods: *

* Dissim ... calculate dissimilarity matrix
* getNNs ... get nearest neighbors and associated * nearest neighbor dissimilarities
* getSpaces ... helping in output formating
* printMatrix ... print matrix of doubles, or integers
* printVect ... print vector of doubles, or integers *

* main does the following: *

    *
  1. Reads data. (Format: integer row, column dimensions, followed * by matrix values, read row-wise.) No preprocessing on input. *
  2. Calculate pairwise dissimilarities, and determines nearest neighbors * and corresponding dissimilarities. (Squared Euclidean distance * used.) *
  3. Determines the closest nearest neighbors. *
  4. Carries out an agglomeration in ClustMat. *
  5. Updates the pairwise dissimilarity matrix, and then, on the basis * of this, the nearest neighbors, and the nearest neighbor * dissimilarities. *
  6. Repeats while no. of clusters is greater than 2. *
* Constant MAXVAL is used in, resp., dissimilarities and * nearest neighbor dissimilarities, to indicate when items are processed and * no longer exist as singletons.
* Note also how flag = 1 denotes an active observation, and flag = 0 * denotes an inactive one (since it has been agglomerated). It is not * necessary to use the flag since exceptionally high dissimilarities * will signify inactive observations. However the use of flag is * helpful computationally. *

* Step 5 here determines the agglomerative clustering criterion. We are * currently using the minimum variance method. It is indicated * in the code where to change to use other agglomerative criteria.

* Output cluster labels using original sequence numbers. The ordering * of observations is not such that a dendrogram can be directly * constructed. However the cluster labels do allow selections and * further inter and intra cluster processing of the input data.

* Example of use:

* javac HCL.java
* java HCL iris.dat > * hcloutput.txt

* First version: 1999 Nov.
* Version: 2002 Oct. 26
* Author: F. Murtagh, f.murtagh@qub.ac.uk * @version 2002 Oct. 26 * @author F. Murtagh, f.murtagh@qub.ac.uk */ public class HCL { public static final double MAXVAL = 1.0e12; /** * Method Dissim, calculates dissimilarity n x n array * @param nrow integer row dimension * @param ncol integer column dimension * @param A floating row/column matrix * @return Adiss floating n x n dissimilarity array */ public static double[][] Dissim(int nrow, int ncol, double[] mass, double[][] A) { double[][] Adiss = new double[nrow][nrow]; // Initialize to 0. Caters for the diagonal which stays 0. for (int i1 = 0; i1 < nrow; i1++) { for (int i2 = 0; i2 < nrow; i2++) Adiss[i1][i2] = 0.0; } for (int i1 = 0; i1 < nrow; i1++) { // All dissimilarity we are dealing with are symmetric // so just calculate for half the array and fill in later for (int i2 = 0; i2 < i1; i2++) { for (int j = 0; j < ncol; j++) { // We are happy with the squared dissimilarity // 0.5 term since this equals // (clcard[i1]*clcard[i2])/(clcard[i1]+clcard[i2]) // for minimum variance criterion Adiss[i1][i2] += 0.5*Math.pow(A[i1][j] - A[i2][j], 2.0); } // Adiss[i1][i2] += ((mass[i1]*mass[i2])/(mass[i1]+mass[i2]))* // Math.pow( A[i1][j] - A[i2][j], 2.0 ); // Fill the other half of the array Adiss[i2][i1] = Adiss[i1][i2]; } } return Adiss; } // Dissim /** * Method getNNs, determine NNs and NN dissimilarities * @param nrow row dimension or number of observations (input) * @param flag =1 for active observation, = 0 for inactive one (input) * @param diss dissimilarity matrix (input) * @param nn nearest neighbor sequence number (calculated) * @param nndiss nearest neigbor dissimilarity (calculated) */ public static void getNNs(int nrow, int[] flag, double[][] diss, int[] nn, double[]nndiss) { int minobs; double mindist; for (int i1 = 0; i1 < nrow; i1++) { if (flag[i1] == 1) { minobs = -1; mindist = MAXVAL; for (int i2 = 0; i2 < nrow; i2++) { if ((diss[i1][i2] < mindist) && (i1 != i2)) { mindist = diss[i1][i2]; minobs = i2; } } nn[i1] = minobs + 1; nndiss[i1] = mindist; } } // Return type void => no return stmt } // getNNs /** * Method ClustMat, updates cluster structure matrix following * an agglomeration * @param nrow row dimension or number of observations (input) * @param clusters list of agglomerations, stored as array of * pairs of cluster sequence numbers (input, and updated) * @param clust1 first agglomerand (input) * @param clust2 second agglomerand (input) * @param ncl number of clusters remaining (input) */ public static void ClustMat(int nrow, int[][] clusters, int clust1, int clust2, int ncl) { // If either clust* is not initialized, then we must init. clusters if ((clust1 == 0) || (clust2 == 0)) { for (int j = 0; j < nrow; j++) { for (int i = 0; i < nrow; i++) { clusters[i][j] = 0; } } // Adjust for 0-sequencing in extreme right-hand col values. for (int i = 0; i < nrow; i++) clusters[i][ncl-1] = i + 1; return; } // For some agglomeration, we are told that label clust1 and // label clust2, among all labels in col. ncl-1 (ncl ranges over // 0 thru nrow-1) are to be agglomerated // We have arranged that order is such that clust1 < clust2 // Note: clust1 and clust2 are 1-sequenced and not 0-sequenced int ncl1; ncl1 = ncl - 1; for (int i = 0; i < nrow; i++) { clusters[i][ncl1] = clusters[i][ncl]; if (clusters[i][ncl1] == clust2) clusters[i][ncl1] = clust1; } // Return type void => no return stmt } // ClustMat // Little method for helping in output formating public static String getSpaces(int n) { StringBuffer sb = new StringBuffer(n); for (int i = 0; i < n; i++) sb.append(' '); return sb.toString(); } /** * Method for printing a double float matrix
* Based on ER Harold, "Java I/O", O'Reilly, around p. 473. * @param n1 row dimension of matrix * @param n2 column dimension of matrix * @param m input matrix values, double * @param d display precision, number of decimal places * @param w display precision, total width of floating value */ public static void printMatrix(int n1, int n2, double[][] m, int d, int w) { // Some definitions for handling output formating NumberFormat myFormat = NumberFormat.getNumberInstance(); FieldPosition fp = new FieldPosition(NumberFormat.INTEGER_FIELD); myFormat.setMaximumIntegerDigits(d); myFormat.setMaximumFractionDigits(d); myFormat.setMinimumFractionDigits(d); for (int i=0; i * Based on ER Harold, "Java I/O", O'Reilly, around p. 473. * @param n1 row dimension of matrix * @param n2 column dimension of matrix * @param m input matrix values * @param d display precision, number of decimal places * @param w display precision, total width of floating value */ public static void printMatrix(int n1, int n2, int[][] m, int d, int w) { // Some definitions for handling output formating NumberFormat myFormat = NumberFormat.getNumberInstance(); FieldPosition fp = new FieldPosition(NumberFormat.INTEGER_FIELD); myFormat.setMaximumIntegerDigits(d); for (int i=0; i * Based on ER Harold, "Java I/O", O'Reilly, around p. 473. * @param m input vector of length m.length * @param d display precision, number of decimal places * @param w display precision, total width of floating value */ public static void printVect(double[] m, int d, int w) { // Some definitions for handling output formating NumberFormat myFormat = NumberFormat.getNumberInstance(); FieldPosition fp = new FieldPosition(NumberFormat.INTEGER_FIELD); myFormat.setMaximumIntegerDigits(d); myFormat.setMaximumFractionDigits(d); myFormat.setMinimumFractionDigits(d); int len = m.length; for (int i=0; i * Based on ER Harold, "Java I/O", O'Reilly, around p. 473. * @param m input vector of length m.length * @param d display precision, number of decimal places * @param w display precision, total width of floating value */ public static void printVect(int[] m, int d, int w) { // Some definitions for handling output formating NumberFormat myFormat = NumberFormat.getNumberInstance(); FieldPosition fp = new FieldPosition(NumberFormat.INTEGER_FIELD); myFormat.setMaximumIntegerDigits(d); int len = m.length; for (int i=0; i nn[minobs]) { clust2 = minobs + 1; clust1 = nn[minobs]; } // Now we have clust1 < clust2, and we'll agglomerate System.out.println(" clus#1: " + clust1 + "; clus#2: " + clust2 + "; new card: " + (clcard[clust1-1]+ clcard[clust2-1]) + "; # clus left: " + ncl + "; mindiss: " + mindist); // Now we will carry out an agglomeration ncl = ncl - 1; ClustMat(nrow, clusters, clust1, clust2, ncl); // System.out.println("#clusters left: " + ncl + // "; cluster label matrix: "); // printMatrix(nrow, nrow, clusters, 4, 10); // Update the following: diss, nndiss // Strategy: // nn[clust2] ceases to exist; similarly nndiss[clust2] // ... for all occurrences of clust2 // nn[clust1] must be updated, as must nndiss[clust1] // Only other change is for any nn[i] such that nn[i] = // ... clust1 or clust2; this must be updated. // First, update diss cl1 = clust1 - 1; cl2 = clust2 - 1; for (int i = 0; i < nrow; i++) { // The following test is important: if ( (i != cl1) && (i != cl2) && (flag[i] == 1) ) { // Minimum variance criterion diss[cl1][i] = (mass[cl1]+mass[i])/(mass[cl1]+mass[cl2]+mass[i]) *diss[cl1][i] + (mass[cl2]+mass[i])/(mass[cl1]+mass[cl2]+mass[i]) *diss[cl2][i] - (mass[i])/(mass[cl1]+mass[cl2]+mass[i]) *diss[cl1][cl2] ; // For other alternative agglomeration criteria, // e.g. single or complete linkage, see the update // formulas in F. Murtagh, "Multidimensional // Clustering Algorithms", Physica-Verlag, 1985, p. 68. // ( (clcard[cl1]*clcard[cl2])/(clcard[cl1]+clcard[cl2]) )* // (diss[cl1][i] - diss[cl2][i]); diss[i][cl1] = diss[cl1][i]; } } clcard[cl1] = clcard[cl1] + clcard[cl2]; mass[cl1] = mass[cl1] + mass[cl2]; // Cluster label clust2 is knocked out for (int i = 0; i < nrow; i++) { diss[cl2][i] = MAXVAL; diss[i][cl2] = diss[cl2][i]; flag[cl2] = 0; nndiss[cl2] = MAXVAL; mass[cl2] = 0; } // Get nearest neighbors and nearest neighbor dissimilarities getNNs(nrow, flag, diss, nn, nndiss); // System.out.println("Nearest neighbors of items 1, 2, ... n:"); // printVect(nn, 4, 10); // System.out.println // ("Corresponding nearest neighbors dissimilarities:"); // printVect(nndiss, 4, 10); } // End of agglomerations loop while (ncl > 1); // For convenience, transpose the cluster labels int[][] tclusters = new int[nrow][nrow]; for (int i1 = 0; i1 < nrow; i1++) { for (int i2 = 0; i2 < nrow; i2++) { tclusters[i2][i1] = clusters[i1][i2]; } } // Row 1: just one cluster // Row 2: two clusters // Row 3: three clusters // ... // Row nrow: nrow clusters printMatrix(nrow, nrow, tclusters, 4, 4); } // End of try catch (IOException e) { out.println("error: " + e); System.exit(1); } } // End of main } // End of class HCL