facor <- function(a, nhier=4, nf=2) { # R PROGRAM facor PURPOSE # Carry out a Correspondence Analysis of input matrix, a. # Consider the best factor space of dimensionality, nf. # Carry out a hierarchical clustering on row projections, and on # column projections in the factor space of dimensionaly nf. # In both cases, use the Correspondence Analysis row and column # masses as weights. # Choose a partition for both hierarchical clusterings, with nhier clusters. # Determine the coordinates of the clusters (on either rows, or columns) # in the factor space of dimensionality nf. # Author: F. Murtagh, 2010 Jan. 17. f m u r t a g h _at_ a c m _dot_ o r g # INPUT DATASET # File casa2.prn characterizes the 77 successive scenes of the film # Casablanca by 13 characterics related to the characters and location, # defined in the following labels: # collabels <- c("No","Int","Ext","Day","Night","Action","Dialog","Rick", # "Ilsa","Renault","Strasser","Laszlo","Minor") # "No" is sequence number of scene. "Int", "Ext": interior, exterior. # "Minor" is minor character. # EXAMPLE OF USE # cas <- matrix(scan("~/casablanca/casa2.prn"),nrow=77,ncol=13,byrow=T) # casc <- ca(cas[,-1]) # Col 1 is just seq nos. # casfacor <- facor(cas[,-1], nhier=8) # Same CA is repeated -note same evals. # Set up plot: # plot(casc$rproj[,1], casc$rproj[,2], type="n") # Plot the row cluster centers: # nhier <- 8 # text(casfacor$centersr[,1], casfacor$centersr[,2],1:nhier) # Plot rows/observations: # text(casc$rproj[,1], casc$rproj[,2], 1:77, col="red") # Plot the col. cluster centers: # text(casfacor$centersc[,1], casfacor$centersc[,2],1:10,col="blue") # Plot cols./attributes: # text(casc$cproj[,1], casc$cproj[,2], 1:13, col="green") # PARAMETERS # nf is number of factor to use for the clustering. # If there are linear dependencies, then one cannot use all # factors which are, resulting from the dependency, undefined. # 0 eigenvalues express this. Recommend: use nf = 2 or other low number. # ASSUMPTIONS # Not checked: a, input data matrix, must contain non-negative values. # Marginal sums of a must not contain a 0 value. Else zero-divides occur. # REQUIRED # Require: ca.r Correspondence Analysis. # Require: hcluswtd.r Hierarchical Clustering, using weights. # Require: facor-new.r facor, Factor coordinates of clusters. cat("nhier =", nhier, "\n") k <- sum(a) fIJ <- a/k fI <- apply(fIJ, 1, sum) fJ <- apply(fIJ, 2, sum) cac <- ca(fIJ) nclr <- nrow(fIJ) nclc <- ncol(fIJ) rproj <- cac$rproj[,1:nf] cproj <- cac$cproj[,1:nf] hclr <- hierclust(rproj, fI) hclc <- hierclust(cproj, fJ) labsr <- 1:nclr labsallr <- dimnames(fIJ)[[1]] # membersr <- cutree(hclr, nclr) # I.e. fine partition; reproduces data exactly. membersr <- cutree(hclr, nhier) # NEW labsc <- 1:nclc labsallc <- dimnames(fIJ)[[2]] # membersc <- cutree(hclc, nclc) # I.e. fine partition; reproduces data exactly. membersc <- cutree(hclc, nhier) # NEW centersr <- NULL #for (k in 1:nclr) { for (k in 1:nhier) { # NEW if (length(fI[membersr==k]) > 1) centersr <- rbind(centersr, apply(rproj[membersr==k,],2,sum)) if (length(fI[membersr==k]) == 1) # Card of cluster = 1 => bypass summing. centersr <- rbind(centersr, rproj[membersr==k,]) labsr[k] <- list(labsallr[membersr==k]) } # NEW Averaging (in equiwtd. Eucl. factor space) the cluster coords. for (k in 1:nhier) { centersr[k,] <- centersr[k,]/length(fI[membersr==k]) } centersc <- NULL #for (k in 1:nclc) { for (k in 1:nhier) { # NEW if (length(fJ[membersc==k]) > 1) centersc <- rbind(centersc, apply(cproj[membersc==k,],2,sum)) if (length(fJ[membersc==k]) == 1) # Card of cluster = 1 => bypass summing. centersc <- rbind(centersc, cproj[membersc==k,]) labsc[k] <- list(labsallc[membersc==k]) } # NEW Averaging (in equiwtd. Eucl. factor space) the cluster coords. for (k in 1:nhier) { centersc[k,] <- centersc[k,]/length(fJ[membersr==k]) } list(centersr=centersr, cluslabrow=labsr, centersc=centersc, cluslabcol=labsc) }