import java.text.*; import java.io.*; /** * Carry out pairwise agglomerations. For n items, therefore there * are n-1 agglomerations. Represent the cluster labels is an nxn * cluster label matrix. Column no. n will be the singleton labels, * 1 to n. Column no. n-1 will have n-1 unique values (or label sequence * numbers). Column no. n-2 will have n-2 unique values. Column no. 1 * will have the value 1 only, implying that all n items are in one * cluster. *
* ClustMat is our agglomeration "engine". It looks after labeling * only, and is independent of any agglomerative clustering criterion. *
* Other utility methods: *
* Dissim ... calculate dissimilarity matrix
* getNNs ... get nearest neighbors and associated
* nearest neighbor dissimilarities
* getSpaces ... helping in output formating
* printMatrix ... print matrix of doubles, or integers
* printVect ... print vector of doubles, or integers
*
* main does the following: *
* Step 5 here determines the agglomerative clustering criterion. We are * currently using the minimum variance method. It is indicated * in the code where to change to use other agglomerative criteria.
* Output cluster labels using original sequence numbers. The ordering * of observations is not such that a dendrogram can be directly * constructed. However the cluster labels do allow selections and * further inter and intra cluster processing of the input data.
* Example of use:
* javac HCL.java
* java HCL iris.dat >
* hcloutput.txt
* First version: 1999 Nov.
* Version: 2002 Oct. 26
* Author: F. Murtagh, f.murtagh@qub.ac.uk
* @version 2002 Oct. 26
* @author F. Murtagh, f.murtagh@qub.ac.uk
*/
public class HCL
{
public static final double MAXVAL = 1.0e12;
/**
* Method Dissim, calculates dissimilarity n x n array
* @param nrow integer row dimension
* @param ncol integer column dimension
* @param A floating row/column matrix
* @return Adiss floating n x n dissimilarity array
*/
public static double[][] Dissim(int nrow, int ncol,
double[] mass, double[][] A)
{
double[][] Adiss = new double[nrow][nrow];
// Initialize to 0. Caters for the diagonal which stays 0.
for (int i1 = 0; i1 < nrow; i1++) {
for (int i2 = 0; i2 < nrow; i2++) Adiss[i1][i2] = 0.0;
}
for (int i1 = 0; i1 < nrow; i1++) {
// All dissimilarity we are dealing with are symmetric
// so just calculate for half the array and fill in later
for (int i2 = 0; i2 < i1; i2++) {
for (int j = 0; j < ncol; j++) {
// We are happy with the squared dissimilarity
// 0.5 term since this equals
// (clcard[i1]*clcard[i2])/(clcard[i1]+clcard[i2])
// for minimum variance criterion
Adiss[i1][i2] += 0.5*Math.pow(A[i1][j] - A[i2][j], 2.0); }
// Adiss[i1][i2] += ((mass[i1]*mass[i2])/(mass[i1]+mass[i2]))*
// Math.pow( A[i1][j] - A[i2][j], 2.0 );
// Fill the other half of the array
Adiss[i2][i1] = Adiss[i1][i2];
}
}
return Adiss;
} // Dissim
/**
* Method getNNs, determine NNs and NN dissimilarities
* @param nrow row dimension or number of observations (input)
* @param flag =1 for active observation, = 0 for inactive one (input)
* @param diss dissimilarity matrix (input)
* @param nn nearest neighbor sequence number (calculated)
* @param nndiss nearest neigbor dissimilarity (calculated)
*/
public static void getNNs(int nrow, int[] flag, double[][] diss,
int[] nn, double[]nndiss)
{
int minobs;
double mindist;
for (int i1 = 0; i1 < nrow; i1++) {
if (flag[i1] == 1) {
minobs = -1;
mindist = MAXVAL;
for (int i2 = 0; i2 < nrow; i2++) {
if ((diss[i1][i2] < mindist) && (i1 != i2)) {
mindist = diss[i1][i2];
minobs = i2;
}
}
nn[i1] = minobs + 1;
nndiss[i1] = mindist;
}
}
// Return type void => no return stmt
} // getNNs
/**
* Method ClustMat, updates cluster structure matrix following
* an agglomeration
* @param nrow row dimension or number of observations (input)
* @param clusters list of agglomerations, stored as array of
* pairs of cluster sequence numbers (input, and updated)
* @param clust1 first agglomerand (input)
* @param clust2 second agglomerand (input)
* @param ncl number of clusters remaining (input)
*/
public static void ClustMat(int nrow, int[][] clusters,
int clust1, int clust2, int ncl)
{
// If either clust* is not initialized, then we must init. clusters
if ((clust1 == 0) || (clust2 == 0)) {
for (int j = 0; j < nrow; j++) {
for (int i = 0; i < nrow; i++) {
clusters[i][j] = 0;
}
}
// Adjust for 0-sequencing in extreme right-hand col values.
for (int i = 0; i < nrow; i++) clusters[i][ncl-1] = i + 1;
return;
}
// For some agglomeration, we are told that label clust1 and
// label clust2, among all labels in col. ncl-1 (ncl ranges over
// 0 thru nrow-1) are to be agglomerated
// We have arranged that order is such that clust1 < clust2
// Note: clust1 and clust2 are 1-sequenced and not 0-sequenced
int ncl1;
ncl1 = ncl - 1;
for (int i = 0; i < nrow; i++) {
clusters[i][ncl1] = clusters[i][ncl];
if (clusters[i][ncl1] == clust2) clusters[i][ncl1] = clust1;
}
// Return type void => no return stmt
} // ClustMat
// Little method for helping in output formating
public static String getSpaces(int n) {
StringBuffer sb = new StringBuffer(n);
for (int i = 0; i < n; i++) sb.append(' ');
return sb.toString();
}
/**
* Method for printing a double float matrix
* Based on ER Harold, "Java I/O", O'Reilly, around p. 473.
* @param n1 row dimension of matrix
* @param n2 column dimension of matrix
* @param m input matrix values, double
* @param d display precision, number of decimal places
* @param w display precision, total width of floating value
*/
public static void printMatrix(int n1, int n2, double[][] m, int d, int w)
{
// Some definitions for handling output formating
NumberFormat myFormat = NumberFormat.getNumberInstance();
FieldPosition fp = new FieldPosition(NumberFormat.INTEGER_FIELD);
myFormat.setMaximumIntegerDigits(d);
myFormat.setMaximumFractionDigits(d);
myFormat.setMinimumFractionDigits(d);
for (int i=0; i