# Chapter 9: Principal Component Analysis (PCA)

####################
# Example 1: Correlation matrices
######################
library(mvtnorm)
library(MASS)
library(corrplot)
library(lattice)

# Exchangeable (compound-symmetry) correlation matrix
n <- 1000
p <- 10
mu <- rep(1, p)
rho <- 0.5
corrmat <- diag(rep(1 - rho, p)) + matrix(rho, p, p)
X <- rmvnorm(n, mu, corrmat)
corrplot(cor(X), type = "lower", diag = FALSE, tl.col = "black", tl.srt = 45)
rgb.palette <- colorRampPalette(c("white", "black"), space = "rgb")
levelplot(cor(X), las = 1, main = "Correlation matrix", xlab = "", ylab = "",
          col.regions = rgb.palette(120), cuts = 100, at = seq(0, 1, 0.01))

# AR(1) correlation matrix
n <- 1000
p <- 10
mu <- rep(1, p)
rho <- 0.5
ar1mat <- rho^outer(1:p, 1:p, function(x, y) abs(x - y))
X <- rmvnorm(n, mu, ar1mat)
corrplot(cor(X), type = "lower", diag = FALSE, tl.col = "black", tl.srt = 45)
rgb.palette <- colorRampPalette(c("white", "black"), space = "rgb")
levelplot(cor(X), las = 1, main = "Correlation matrix", xlab = "", ylab = "",
          col.regions = rgb.palette(120), cuts = 100, at = seq(0, 1, 0.01))

####################
# Example 2: Eigen-decomposition of a covariance matrix by hand
######################
Sig <- matrix(c(1, -2, 0, -2, 6, 0, 0, 0, 2), ncol = 3, nrow = 3)
ev  <- eigen(Sig)
val <- round(ev$values, 2)
vec <- round(ev$vectors, 3)

# Contribution rates and cumulative contribution rates
round(val / sum(val), 4)           ### contribution rates
round(cumsum(val) / sum(val), 4)   ### cumulative contribution rates

# Correlations between the first PC and the original variables
rho11 <- (vec[1, 1] * sqrt(val[1])) / sqrt(Sig[1, 1])
rho12 <- (vec[2, 1] * sqrt(val[1])) / sqrt(Sig[2, 2])
rho13 <- (vec[3, 1] * sqrt(val[1])) / sqrt(Sig[3, 3])
rho1  <- c(round(rho11, 3), round(rho12, 3), round(rho13, 3))
v1    <- round(rho1^2, 3)

# Correlations between the second PC and the original variables
rho21 <- (vec[1, 2] * sqrt(val[2])) / sqrt(Sig[1, 1])
rho22 <- (vec[2, 2] * sqrt(val[2])) / sqrt(Sig[2, 2])
rho23 <- (vec[3, 2] * sqrt(val[2])) / sqrt(Sig[3, 3])
rho2  <- c(round(rho21, 3), round(rho22, 3), round(rho23, 3))
v2    <- round(rho2^2, 3)

####################
# Example 3: PCA based on the covariance matrix Sigma
######################
Sig <- matrix(c(1, 4, 4, 100), ncol = 2, nrow = 2)
ev.Sig  <- eigen(Sig)
val.Sig <- round(ev.Sig$values, 2)
vec.Sig <- round(ev.Sig$vectors, 3)
f1.Sig <- round(val.Sig / sum(val.Sig), 4)          ### contribution rates
fm.Sig <- round(cumsum(val.Sig) / sum(val.Sig), 4)  ### cumulative contribution rates
rho11.Sig <- (vec.Sig[1, 1] * sqrt(val.Sig[1])) / sqrt(Sig[1, 1])
rho12.Sig <- (vec.Sig[2, 1] * sqrt(val.Sig[1])) / sqrt(Sig[2, 2])
rho1.Sig  <- c(round(rho11.Sig, 3), round(rho12.Sig, 3))
v1.Sig    <- round(rho1.Sig^2, 3)

####################
# Example 4: PCA based on the correlation matrix R
######################
Corr <- matrix(c(1, 0.4, 0.4, 1), ncol = 2, nrow = 2)
ev.Corr  <- eigen(Corr)
val.Corr <- round(ev.Corr$values, 2)
vec.Corr <- round(ev.Corr$vectors, 3)
f1.Corr <- round(val.Corr[1] / sum(val.Corr), 4)    ### contribution rate of PC1
rho11.Corr <- (vec.Corr[1, 1] * sqrt(val.Corr[1])) / sqrt(Corr[1, 1])
rho12.Corr <- (vec.Corr[2, 1] * sqrt(val.Corr[1])) / sqrt(Corr[2, 2])
rho1.Corr  <- c(round(rho11.Corr, 3), round(rho12.Corr, 3))
v1.Corr    <- round(rho1.Corr^2, 3)

####################
# Example 5: Analysis of the student data
######################
student <- data.frame(
  X1 = c(148,139,160,149,159,142,153,150,151,139,140,161,158,140,137,
         152,149,145,160,156,151,147,157,147,157,151,144,141,139,148),
  X2 = c(41, 34, 49, 36, 45, 31, 43, 43, 42, 31, 29, 47, 49, 33, 31,
         35, 47, 35, 47, 44, 42, 38, 39, 30, 48, 36, 36, 30, 32, 38),
  X3 = c(72, 71, 77, 67, 80, 66, 76, 77, 77, 68, 64, 78, 78, 67, 66,
         73, 82, 70, 74, 78, 73, 73, 68, 65, 80, 74, 68, 67, 68, 70),
  X4 = c(78, 76, 86, 79, 86, 76, 83, 79, 80, 74, 74, 84, 83, 77, 73,
         79, 79, 77, 87, 85, 82, 78, 80, 75, 88, 80, 76, 76, 73, 78)
)
student.pca <- princomp(student, cor = TRUE)
summary(student.pca, loadings = TRUE)
predict(student.pca)    # principal component scores
par(mfrow = c(1, 2))
screeplot(student.pca, type = "lines", main = "Scree Plot")
biplot(student.pca)
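
## Cross-check sketch (not part of the original example): the princomp() fit in
## Example 5 can be reproduced from an explicit eigen-decomposition of the
## correlation matrix, mirroring the hand calculations in Examples 2-4.
## The eigenvalues match exactly; the scores agree up to column signs and a small
## constant scale factor, because princomp() standardises with the divisor-n
## standard deviation while scale() uses the divisor n-1.
R.student  <- cor(student)
eig.student <- eigen(R.student)
round(eig.student$values, 6)           # equals round(student.pca$sdev^2, 6)
round(student.pca$sdev^2, 6)
Z <- scale(student)                    # standardised data (divisor n-1)
head(Z %*% eig.student$vectors)        # manual scores
head(student.pca$scores)               # princomp scores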

####################
# Example 6: Body measurement data
######################
# Lower triangle of the 16 x 16 correlation matrix, stored row by row
x <- c(1.00,
       0.79, 1.00,
       0.36, 0.31, 1.00,
       0.96, 0.74, 0.38, 1.00,
       0.89, 0.58, 0.31, 0.90, 1.00,
       0.79, 0.58, 0.30, 0.78, 0.79, 1.00,
       0.76, 0.55, 0.35, 0.75, 0.74, 0.73, 1.00,
       0.26, 0.19, 0.58, 0.25, 0.25, 0.18, 0.24, 1.00,
       0.21, 0.07, 0.28, 0.20, 0.18, 0.18, 0.29,-0.04, 1.00,
       0.26, 0.16, 0.33, 0.22, 0.23, 0.23, 0.25, 0.49,-0.34, 1.00,
       0.07, 0.21, 0.38, 0.08,-0.02, 0.00, 0.10, 0.44,-0.16, 0.23, 1.00,
       0.52, 0.41, 0.35, 0.53, 0.48, 0.38, 0.44, 0.30,-0.05, 0.50, 0.24, 1.00,
       0.77, 0.47, 0.41, 0.79, 0.79, 0.69, 0.67, 0.32, 0.23, 0.31, 0.10, 0.62, 1.00,
       0.25, 0.17, 0.64, 0.27, 0.27, 0.14, 0.16, 0.51, 0.21, 0.15, 0.31, 0.17, 0.26, 1.00,
       0.51, 0.35, 0.58, 0.57, 0.51, 0.26, 0.38, 0.51, 0.15, 0.29, 0.28, 0.41, 0.50, 0.63, 1.00,
       0.21, 0.16, 0.51, 0.26, 0.23, 0.00, 0.12, 0.38, 0.18, 0.14, 0.31, 0.18, 0.24, 0.50, 0.65, 1.00)
names <- c("X1", "X2", "X3", "X4", "X5", "X6", "X7", "X8",
           "X9", "X10", "X11", "X12", "X13", "X14", "X15", "X16")
# Build the full symmetric correlation matrix from the lower-triangular vector
R <- matrix(0, nrow = 16, ncol = 16, dimnames = list(names, names))
for (i in 1:16) {
  for (j in 1:i) {
    R[i, j] <- x[(i - 1) * i / 2 + j]
    R[j, i] <- R[i, j]
  }
}
pr <- princomp(covmat = R)
load <- loadings(pr)
pr
load
plot(load[, 1:2])
text(load[, 1], load[, 2], adj = c(-0.4, 0.3))

####################
# Example 7: Analysis of the built-in state data
######################
help(state)
state.x77
# Convert the matrix into a data frame
state.x77.df <- data.frame(state.x77)
names(state.x77.df) <- c("Popul", "Income", "Illit", "LifeExp",
                         "Murder", "HSGrad", "Frost", "Area")
attach(state.x77.df)
state.pc <- princomp(state.x77.df, cor = T)
# Show the principal component coefficients (loadings)
summary(state.pc, loadings = T)
# Eigenvalues of the correlation matrix
(state.pc$sdev)^2
# Scree plot
plot(1:(length(state.pc$sdev)), (state.pc$sdev)^2, type = 'b',
     main = "Scree Plot", xlab = "Number of Components", ylab = "Eigenvalue")
# First two PC scores, labelled by state abbreviation
# (y-axis forced to the range of PC 1 so both axes are on a comparable scale)
par(pty = "s")
plot(state.pc$scores[, 1], state.pc$scores[, 2],
     ylim = range(state.pc$scores[, 1]),
     xlab = "PC 1", ylab = "PC 2", type = 'n', lwd = 2)
text(state.pc$scores[, 1], state.pc$scores[, 2],
     labels = state.abb, cex = 0.7, lwd = 2)
# Same plot without forcing a common range on the two axes
par(pty = "s")
plot(state.pc$scores[, 1], state.pc$scores[, 2],
     xlab = "PC 1", ylab = "PC 2", type = 'n', lwd = 2)
text(state.pc$scores[, 1], state.pc$scores[, 2],
     labels = state.abb, cex = 0.7, lwd = 2)
# Biplot of the first two principal component scores
biplot(state.pc, xlabs = state.abb)
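
## A small follow-up sketch (not in the original example): choosing how many
## components to keep for the state data, using the cumulative contribution
## rates and the common "eigenvalue > 1" (Kaiser) rule of thumb for
## correlation-based PCA. The variable name ev.state is introduced here only
## for this sketch.
ev.state <- state.pc$sdev^2
round(cumsum(ev.state) / sum(ev.state), 4)   # cumulative contribution rates
which(ev.state > 1)                          # components retained under the Kaiser rule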

####################
# Example 8: Inference about principal components
######################
# Confidence intervals for eigenvalue(s) of Sigma.
# (Note: these eigenvalues differ from the eigenvalues in the PCA done above,
#  which was based on the *correlation* matrix R, not on S.)
lambda.CI <- function(X, label.vec, conf.level = 0.95) {
  alpha <- 1 - conf.level
  n <- nrow(X)
  m <- length(label.vec)
  z <- qnorm(alpha / (2 * m), lower = F)
  lambdas <- princomp(X)$sdev^2
  print(lambdas)
  LCL.M1 <- lambdas[label.vec] / (1 + z * sqrt(2 / n))
  UCL.M1 <- lambdas[label.vec] / (1 - z * sqrt(2 / n))
  LCL.M2 <- lambdas[label.vec] / exp(z * sqrt(2 / n))
  UCL.M2 <- lambdas[label.vec] * exp(z * sqrt(2 / n))
  CI.M1 <- cbind(LCL.M1, UCL.M1)
  CI.M2 <- cbind(LCL.M2, UCL.M2)
  return(list(Method1 = CI.M1, Method2 = CI.M2))
}

# Use the student data (same data as in Example 5)
student <- data.frame(
  X1 = c(148,139,160,149,159,142,153,150,151,139,140,161,158,140,137,
         152,149,145,160,156,151,147,157,147,157,151,144,141,139,148),
  X2 = c(41, 34, 49, 36, 45, 31, 43, 43, 42, 31, 29, 47, 49, 33, 31,
         35, 47, 35, 47, 44, 42, 38, 39, 30, 48, 36, 36, 30, 32, 38),
  X3 = c(72, 71, 77, 67, 80, 66, 76, 77, 77, 68, 64, 78, 78, 67, 66,
         73, 82, 70, 74, 78, 73, 73, 68, 65, 80, 74, 68, 67, 68, 70),
  X4 = c(78, 76, 86, 79, 86, 76, 83, 79, 80, 74, 74, 84, 83, 77, 73,
         79, 79, 77, 87, 85, 82, 78, 80, 75, 88, 80, 76, 76, 73, 78)
)

# 95% CI for the variance of the first population PC
lambda.CI(student, 1)
# Joint 95% CIs for the variances of the first two population PCs
lambda.CI(student, c(1, 2))
# Joint 95% CIs for the variances of the first four population PCs
lambda.CI(student, c(1, 2, 3, 4))
# Note: the first interval is wider here because the FAMILYWISE confidence
# coefficient is 95%.
# 99% CI for the variance of the first population PC
lambda.CI(student, 1, conf.level = 0.99)

####################
# Example 9: An example with uncorrelated variables
######################
library(mvtnorm)
library(MASS)

pca <- function(n, p, rho, mu) {
  corrmat <- diag(rep(1 - rho, p)) + matrix(rho, p, p)
  X <- rmvnorm(n, mu, corrmat)
  pca <- princomp(X, cor = TRUE)
  return(pca)
}

# Scree plots
n <- 10000
p <- 5
mu <- rep(1, p)
pca.r1 <- pca(n, p, rho = 0, mu)
pca.r2 <- pca(n, p, rho = 0.5, mu)
par(mfrow = c(1, 2))
screeplot(pca.r1, type = "lines", ylim = c(0, 3), main = "Meaningless Scree Plot")
legend("topright", legend = c("rho=0"))
screeplot(pca.r2, type = "lines", ylim = c(0, 3), main = "Scree Plot")
legend("topright", legend = c("rho=0.5"))

plot(1:(length(pca.r1$sdev)), (pca.r1$sdev)^2, type = 'b', ylim = c(0, 3),
     main = "Meaningless Scree Plot", xlab = "Number of Components", ylab = "Eigenvalue")
legend("topright", legend = c("rho=0"))
plot(1:(length(pca.r2$sdev)), (pca.r2$sdev)^2, type = 'b',
     main = "Scree Plot", xlab = "Number of Components", ylab = "Eigenvalue")
legend("topright", legend = c("rho=0.5"))

####################
# Example 10: PCA application - data visualization
#             Fisher iris data
######################
X <- iris[, 1:4]
groupid <- iris[, 5]
X.pca <- princomp(~., data = X)
summary(X.pca, loadings = TRUE)
plot(X.pca$scores[, 1], X.pca$scores[, 2], xlab = "PC1", ylab = "PC2",
     pch = rep(1:3, each = 50), col = groupid, main = "Iris data")
legend(2, 1.1, legend = levels(groupid), pch = 1:3, col = 1:3)

####################
# Example 11: Image compression with PCA
######################
# The image file is not included with the script; substitute any JPEG of your own.
library(jpeg)
Disney <- readJPEG('disney.jpg')
dim(Disney)
# PCA of each of the three colour channels separately (no centering)
Disney.pca <- list()
for (i in 1:3) Disney.pca[[i]] <- prcomp(Disney[, , i], center = FALSE)
# Reconstruct the image from the first 3, 10 and 100 components
imge <- list()
for (i in c(3, 10, 100)) {
  imge[[i]] <- sapply(Disney.pca, function(y) {
    comimge <- y$x[, 1:i] %*% t(y$rotation[, 1:i])
  }, simplify = 'array')
  writeJPEG(imge[[i]], paste('compDisney', round(i, 0), '.jpg', sep = ''))
}
# Compressed file sizes relative to the original
FS <- file.info('disney.jpg')$size
FS
z <- NULL
for (i in c(3, 10, 100)) z <- c(z, file.info(paste("compDisney", i, ".jpg", sep = ""))$size / FS)
z
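
## Side note (a minimal sketch, not part of the original examples): with
## center = FALSE, prcomp() of a channel matrix A is essentially the SVD
## A = U D V' used in Example 12, since the scores are U %*% D and the rotation
## is V. The relative error of a rank-k reconstruction can be checked directly;
## A is any numeric matrix, e.g. one colour channel such as Disney[, , 1].
rank.k.error <- function(A, k) {
  s  <- svd(A)
  Ak <- s$u[, 1:k, drop = FALSE] %*% diag(s$d[1:k], k, k) %*% t(s$v[, 1:k, drop = FALSE])
  sqrt(sum((A - Ak)^2)) / sqrt(sum(A^2))   # relative Frobenius-norm error
}
# Example: rank.k.error(Disney[, , 1], 10)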

####################
# Example 12: Image compression with the SVD
######################
library(jpeg)
PIC <- readJPEG('flower.jpg')
# SVD of each of the three colour channels
PICsvd <- list()
for (i in 1:3) PICsvd[[i]] <- svd(PIC[, , i])
comPIC <- PIC
cPIC <- list()
PS <- file.info('flower.jpg')$size
PS
# Rank-i reconstruction of every channel; write the image once all three
# channels have been updated
for (i in c(5, 20, 100)) {
  for (k in 1:3) {
    comPIC[, , k] <- PICsvd[[k]]$u[, 1:i] %*% diag(PICsvd[[k]]$d[1:i]) %*% t(PICsvd[[k]]$v[, 1:i])
  }
  writeJPEG(comPIC, paste('comPIC', round(i, 0), '.jpg', sep = ''))
  cPIC[[i]] <- comPIC
}
# Compressed file sizes relative to the original
z <- NULL
for (i in c(5, 20, 100)) z <- c(z, file.info(paste("comPIC", i, ".jpg", sep = ""))$size / PS)
z

####################
# Example 13: Face recognition with PCA (eigenfaces)
######################
library(RColorBrewer)
library(rtiff)   # Note: rtiff may fail to install on newer R versions (e.g. R 4.0.3);
                 # if so, install the rtiff package from a local source.
showMatrix <- function(x)
  image(t(x[nrow(x):1, ]), xaxt = 'none', yaxt = 'none',
        col = rev(colorRampPalette(brewer.pal(7, 'Greys'))(100)))

# Read the face images (one TIFF per face) from the data directory
path <- "E:/face"
fileNames <- dir(path)
filePath <- sapply(fileNames, function(x) paste(path, x, sep = '/'))
data <- lapply(filePath, function(x) as.vector(readTiff(x)@red))
data <- data.frame(data)
X <- t(data)

# Display the 40 original face images
par(mfrow = c(8, 5), mar = c(0, 0, 0, 0))
for (i in 1:40) {
  face.mat <- matrix(X[i, ], 538, 438)
  showMatrix(face.mat)
}

n <- nrow(X); p <- ncol(X)
X.C <- scale(X, scale = FALSE)    ## centre the data
Y <- t(X.C) / sqrt(n - 1)         ## transform so that t(Y) %*% Y is the small n x n matrix
Y.egn <- eigen(t(Y) %*% Y)
Phi <- Y %*% Y.egn$vectors
Phi <- apply(Phi, 2, function(i) i / sqrt(sum(i * i)))   ## normalise the eigenvectors
m <- 25                           ## keep 25 PCs; cumulative contribution rate reaches 89.2%
sum(Y.egn$values[1:m]) / sum(Y.egn$values)
Phi.m <- Phi[, 1:m]
Y.hat <- X.C %*% Phi.m            ## scores of each face on the eigenfaces
mu <- apply(X, 2, mean)

# Reconstruct the images from the first m eigenfaces
X.R <- Y.hat %*% t(Phi.m) + matrix(mu, nrow = n, ncol = p, byrow = TRUE)

# Display the eigenfaces
par(mfrow = c(5, 5), mar = c(0, 0, 1, 0), bty = 'n')
for (i in 1:25) {
  showMatrix(matrix(Phi.m[, i], 538, 438))
}

# Reconstruct all 40 face images
par(mfrow = c(8, 5), mar = c(0, 0, 0, 0), bty = 'n')
for (i in 1:40) {
  showMatrix(matrix(X.R[i, ], 538, 438))
}

# Compare the 3rd original face image with its reconstruction
face.3 <- matrix(X[3, ], 538, 438)
face.3.R <- matrix(X.R[3, ], 538, 438)
par(mfrow = c(1, 2), mar = c(0, 0, 0, 0), bty = 'n')
showMatrix(face.3); showMatrix(face.3.R)
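
## A minimal recognition sketch (not in the original script): project a new face
## onto the m eigenfaces and report the closest training face in score space.
## The file name "newface.tif" and its 538 x 438 size are assumptions made only
## for illustration; the new image must have the same dimensions as the training faces.
recognize <- function(newfile, Phi.m, Y.hat, mu) {
  x.new <- as.vector(readTiff(newfile)@red)   # read and vectorise, as above
  y.new <- (x.new - mu) %*% Phi.m             # centre and project onto the eigenfaces
  d <- apply(Y.hat, 1, function(y) sqrt(sum((y - y.new)^2)))
  which.min(d)                                # index of the best-matching training face
}
# recognize("newface.tif", Phi.m, Y.hat, mu)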