# Chapter 10-因子分析 # Data names <- c('30年','20年','10年','5年','1年') R <- matrix(c(1.0,0.98,0.92,0.85,0.66,0.98,1,0.92,0.86,0.67,0.92,0.92,1,0.90,0.71,0.85,0.86,0.90,1,0.84,0.66,0.67,0.71,0.84,1), nrow=5, ncol=5, dimnames=list(names,names), byrow=T) #################### # Exam 1: 主成分法(Principal Component Method) ###################### fa.pcm = function(S, k){ p = nrow(S); diag_S = diag(S); sum_rank = sum(diag_S) rowname = rownames(S); colname = paste('Factor',1:k,sep='') A = matrix(0,nrow=p,ncol=k,dimnames=list(rowname,colname)) eig = eigen(S) for (i in 1:k) A[,i] = sqrt(eig$values[i])*eig$vectors[,i] h = diag(A%*%t(A)) rowname = c('SS loadings','Proportion Var','Cumulative Var') B = matrix(0,nrow=3,ncol=k,dimnames=list(rowname,colname)) for (i in 1:k){ B[1,i] = sum(A[,i]^2) B[2,i] = B[1,i]/sum_rank B[3,i] = sum(B[1,1:i])/sum_rank } A = round(A,3); B = round(B,3); h = round(h,3) diag_S = round(diag_S,3) method = c('Principal Component Method') list(method=method, loadings=A, var=cbind(common=h,spcific=diag_S-h), B=B) } fa.pcm.fit <- fa.pcm(R, k=2) fa.pcm.fit vm.pcm <- varimax(fa.pcm.fit$loadings, normalize=F) ####(方差最大化旋转) vm.pcm # 主成分法的因子得分 z <- c(1.4,-0.2,0.5,-1.0,0.8) A <- fa.pcm.fit$loadings D <- diag(fa.pcm.fit$var[,2]) I <- diag(c(1,1)) B.score <- solve(t(A)%*%solve(D)%*%A)%*%t(A)%*%solve(D)%*%z #### Bartlett因子得分 T.score <- solve(I+t(A)%*%solve(D)%*%A)%*%t(A)%*%solve(D)%*%z #### Thompsom因子得分 B.score #################### # Exam 2: 主因子法(Iterative Principal Factor Method)(给初始值迭代法) ###################### fa.ipfm = function(R, k, d){ p = nrow(R); diag_R = diag(R); sum_rank = sum(diag_R) rowname = rownames(R) #paste('X',1:p,sep='') colname = paste('Factor',1:k,sep='') A = matrix(0,nrow=p,ncol=k,dimnames=list(rowname,colname)) mmax = 200; m = 1; h = diag_R-d repeat{ diag(R) = h; h1 = h; eig = eigen(R) for (i in 1:k) A[,i] = sqrt(eig$values[i])*eig$vectors[,i] h = diag(A%*%t(A)) if ((sqrt(sum((h-h1)^2))<1e-4)|m==mmax) break m = m+1 } rowname = c('SS loadings','Proportion Var','Cumulative Var') B = matrix(0,nrow=3,ncol=k,dimnames=list(rowname,colname)) for (i in 1:k){ B[1,i] = sum(A[,i]^2) B[2,i] = B[1,i]/sum_rank B[3,i] = sum(B[1,1:i])/sum_rank } A = round(A,3) B = round(B,3) h = round(h,3) diag_R = round(diag_R,3) method = c('Iterative Principal Factor Method') list(method=method,loadings=A,var=cbind(common=h,spcific=diag_R-h),B=B,iterative=m) } d = c(0.12,0.15,0.09,0.05,0.07) fa.pfm.fit = fa.ipfm(R, k=2, d) fa.pfm.fit fa.ipfm = function(R, k, d){ p = nrow(R); diag_R = diag(R); sum_rank = sum(diag_R) rowname = rownames(R) #paste('X',1:p,sep='') colname = paste('Factor',1:k,sep='') A = matrix(0,nrow=p,ncol=k,dimnames=list(rowname,colname)) mmax = 200; m = 1; h = diag_R-d repeat{ diag(R) = h; h1 = h; eig = eigen(R) for (i in 1:k) A[,i] = sqrt(eig$values[i])*eig$vectors[,i] h = diag(A%*%t(A)) if ((sqrt(sum((h-h1)^2))<1e-4)|m==mmax) break m = m+1 } rowname = c('SS loadings','Proportion Var','Cumulative Var') B = matrix(0,nrow=3,ncol=k,dimnames=list(rowname,colname)) for (i in 1:k){ B[1,i] = sum(A[,i]^2) B[2,i] = B[1,i]/sum_rank B[3,i] = sum(B[1,1:i])/sum_rank } A = round(A,3) B = round(B,3) h = round(h,3) diag_R = round(diag_R,3) method = c('Iterative Principal Factor Method') list(method=method,loadings=A,var=cbind(common=h,spcific=diag_R-h),B=B,iterative=m) } d <- c(0.12,0.15,0.09,0.05,0.07) fa.pfm.fit <- fa.ipfm(R, k=2, d) fa.pfm.fit # 主因子法的因子得分 z <- c(1.4,-0.2,0.5,-1.0,0.8) A <- fa.pfm.fit$loadings D <- diag(fa.pfm.fit$var[,2]) I <- diag(c(1,1)) B.score <- solve(t(A)%*%solve(D)%*%A)%*%t(A)%*%solve(D)%*%z T.score <- solve(I+t(A)%*%solve(D)%*%A)%*%t(A)%*%solve(D)%*%z B.score T.score #################### # Exam 3: 无迭代主因子法(Non-iterative Principal Factor Method) # 初始值最大值相关系数 ###################### fa.nipfm = function(R, k){ p = nrow(R); diag_R = diag(R); sum_rank = sum(diag_R) rowname = rownames(R) # paste('X',1:p,sep='') colname = paste('Factor',1:k,sep='') A = matrix(0,nrow=p,ncol=k,dimnames=list(rowname,colname)) h = apply(abs(R-diag(1,nrow=p,ncol=p)),2,max) ##初始值 diag(R) = h; h1 = h; eig = eigen(R) for (i in 1:k){ A[,i] = sqrt(eig$values[i])*eig$vectors[,i] h = diag(A%*%t(A))} rowname = c('SS loadings','Proportion Var','Cumulative Var') B = matrix(0,nrow=3,ncol=k,dimnames=list(rowname,colname)) for (i in 1:k){ B[1,i] = sum(A[,i]^2) B[2,i] = B[1,i]/sum_rank B[3,i] = sum(B[1,1:i])/sum_rank } A = round(A,3) B = round(B,3) h = round(h,3) diag_R = round(diag_R,3) method = c('Non-iterative Principal Factor Method') list(method=method,loadings=A,var=cbind(common=h,spcific=diag_R-h),B=B) } fb.fit <- fa.nipfm(R, k=2) fb.fit vm.nipfm <- varimax(fb.fit$loadings, normalize=F) ####(方差最大化旋转) vm.nipfm #################### # Exam 4: 极大似然法(Maximum likelihood method) ###################### fa.mlm = function(S, k, d){ p = nrow(S); diag_S = diag(S); sum_rank = sum(diag_S) rowname = paste("X", 1:p, sep="") colname = paste("Factor", 1:k, sep="") A = matrix(0, nrow=p, ncol=k, dimnames=list(rowname, colname)) itmax = 20; it = 1 repeat{ d1 = d; d2 = 1/sqrt(d); eig = eigen(S * (d2 %o% d2)) for (i in 1:k) A[,i] = sqrt(eig$values[i]-1)*eig$vectors[,i] A = diag(sqrt(d)) %*% A d = diag(S-A%*%t(A)) if ((sqrt(sum((d-d1)^2))<1e-4)|it==itmax) break it = it + 1 } rowname = c("SS loadings", "Proportion Var", "Cumulative Var") B = matrix(0, nrow=3, ncol=k, dimnames=list(rowname, colname)) for (i in 1:k){ B[1,i] = sum(A[,i]^2) B[2,i] = B[1,i]/sum_rank B[3,i] = sum(B[1,1:i])/sum_rank } method = c("Maximum Likelihood Method") list(method=method, loadings=A, var=cbind(common=diag_S-d, spcific=d), B=B, iterative=it) } d = c(0.12,0.15,0.09,0.05,0.07) fa.mlm.fit = fa.mlm(R, k=2, d) fa.mlm.fit vm.mlm = varimax(fa.mlm.fit$loadings, normalize=F) ####(方差最大化旋转) vm.mlm #################### # Exam 5: Maximum likelihood method using function factanal() ###################### fa.mle.fit2 <- factanal(factors=2, covmat=R, rotation="none", n.obs=696) fa.mle.fit2 vm.mlm2 = factanal(covmat=R, factors=2, rotation="varimax", n.obs=696)#(方差最大化旋转) vm.mlm2 # 极大似然法因子得分 z = c(1.4,-0.2,0.5,-1.0,0.8) A1 = fa.mle.fit2$loadings A = A1[1:5,] D = diag(diag(R-A%*%t(A))) I = diag(c(1,1)) B.score = solve(t(A)%*%solve(D)%*%A)%*%t(A)%*%solve(D)%*%z T.score = solve(I+t(A)%*%solve(D)%*%A)%*%t(A)%*%solve(D)%*%z B.score #################### # Exam 6: The example in Lawley and Maxwell (1979) ###################### names = c('盖尔语','英语','历史','算数','代数','几何') R.score = matrix(c(1.0, .439, .410, .288, .329, .248, .439, 1.0, .351, .354, .320, .329, .410, .351, 1.0, .164, .190, .181, .288, .354, .164, 1.0, .595, .470, .329, .320, .190, .595, 1.0, .464, .248, .329, .181, .470, .464, 1.0), nrow=6, ncol=6, dimnames=list(names,names), byrow=T) fa.fit = factanal(factors=2, covmat=R.score, rotation="none", n.obs=220) fa.fit #旋转20度后的载荷矩阵 A = fa.fit$loadings[1:6, ] phi = 20/180*pi G = matrix(c(cos(phi), -sin(phi), sin(phi), cos(phi)), nrow=2, ncol=2) round(A%*%G, 3) # 旋转20度直角坐标图10.1 library(ggplot2) # 这个包装不上,需要高版本R x = A[,1]; y = A[,2] data = data.frame(x, y, ID=1:6) ggplot(data, aes(x=x, y=y)) + geom_point(size=2)+ coord_cartesian(xlim=c(0, 1.2),ylim=c(-0.6,0.6))+ geom_segment(mapping=aes(x=0, y=-0.6, xend=0, yend=0.55), arrow=arrow(length=unit(0.4,"cm")), size=.5, color="black",linetype=1)+ geom_segment(mapping=aes(x=0, y=0, xend=1.1, yend=0), arrow=arrow(length=unit(0.4,"cm")), size=.5, color="black",linetype=1)+ annotate('text',label='F1',x = 1.15,y=0)+ annotate('text',label='F2',x =0, y=0.6)+ geom_segment(mapping=aes(x=0, y=0*tan(pi*70/180), xend=.2, yend=.2*tan(pi*70/180)), arrow=arrow(length=unit(0.4,"cm")), size=.5, color="black",linetype=2)+ geom_segment(mapping=aes(x=0, y=-0*tan(pi*20/180), xend=0.85, yend=-0.85*tan(pi*20/180)), arrow=arrow(length=unit(0.4,"cm")), size=.5, color="black",linetype=2)+ annotate('text',label='1',x =0.553+0.05,y=0.429)+ annotate('text',label='2',x =0.568+0.05,y=0.288)+ annotate('text',label='3',x =0.392+0.05,y=0.450)+ annotate('text',label='4',x =0.740+0.05,y=-0.273)+ annotate('text',label='5',x =0.724+0.05,y=-0.211)+ annotate('text',label='6',x =0.595+0.05,y=-0.132)+ annotate('text',label='F2*',x =0.2+.05,y=0.2*tan(pi*70/180))+ annotate('text',label='F1*',x =0.85+.05,y=-0.85*tan(pi*20/180))+ annotate('text',label='0',x =0,y=-0.03)+ annotate('text',label='1',x =1,y=-0.03)+ annotate('text',label='0.5',x =0.5 ,y=-0.03)+ annotate('text',label='0.5',x =.03,y=.5)+ annotate('text',label='-0.5',x =.03,y=-.5)+ theme(axis.title.x = element_blank(),axis.text.y = element_blank(),axis.title.y = element_blank(),axis.text.x = element_blank(),panel.grid.major = element_blank(),panel.grid.minor = element_blank(),panel.background = element_blank()) # 方差最大化的旋转: varimax() T.fit = varimax(A, normalize=TRUE) theta = acos(T.fit$rotmat[1,1])*180/pi ## 旋转角度 fcv = factanal(factors = 2,covmat = R.score, rotation="varimax", n.obs=220) ## 方差最大化旋转 #方差最大旋转直角坐标图,最大旋转角度:32.6度 x = A[,1]; y = A[,2] data = data.frame(x, y, ID=1:6) ggplot(data, aes(x=x, y=y)) + geom_point(size=2)+ coord_cartesian(xlim=c(0, 1.2),ylim=c(-0.6,0.6))+ geom_segment(mapping=aes(x=0, y=-0.6, xend=0, yend=0.55), arrow=arrow(length=unit(0.4,"cm")), size=.5, color="black",linetype=1)+ geom_segment(mapping=aes(x=0, y=0, xend=1.1, yend=0), arrow=arrow(length=unit(0.4,"cm")), size=.5, color="black",linetype=1)+ annotate('text',label='F1',x = 1.15,y=0)+ annotate('text',label='F2',x =0, y=0.6)+ geom_segment(mapping=aes(x=0, y=0*tan(pi*57/180), xend=.35, yend=.35*tan(pi*57/180)), arrow=arrow(length=unit(0.4,"cm")), size=.5, color="black",linetype=2)+ geom_segment(mapping=aes(x=0, y=-0*tan(pi*33/180), xend=0.8, yend=-0.8*tan(pi*33/180)), arrow=arrow(length=unit(0.4,"cm")), size=.5, color="black",linetype=2)+ annotate('text',label='1',x =0.553+0.05,y=0.429)+ annotate('text',label='2',x =0.568+0.05,y=0.288)+ annotate('text',label='3',x =0.392+0.05,y=0.450)+ annotate('text',label='4',x =0.740+0.05,y=-0.273)+ annotate('text',label='5',x =0.724+0.05,y=-0.211)+ annotate('text',label='6',x =0.595+0.05,y=-0.132)+ annotate('text',label='F2*',x =0.35+.05,y=0.35*tan(pi*57/180))+ annotate('text',label='F1*',x =0.8+.05,y=-0.8*tan(pi*33/180))+ annotate('text',label='0',x =0,y=-0.03)+ annotate('text',label='1',x =1,y=-0.03)+ annotate('text',label='0.5',x =0.5 ,y=-0.03)+ annotate('text',label='0.5',x =.03,y=.5)+ annotate('text',label='-0.5',x =.03,y=-.5)+ theme(axis.title.x = element_blank(),axis.text.y = element_blank(),axis.title.y = element_blank(),axis.text.x = element_blank(),panel.grid.major = element_blank(),panel.grid.minor = element_blank(),panel.background = element_blank()) #################### # Exam 7: 因子个数确定 # Likelihood ratio test 检验 ###################### factanal(covmat=R, factors=1, rotation="none", n.obs=696) factanal(covmat=R, factors=2, rotation="none", n.obs=696) #平行分析方法(Parallel Analysis) library(psych) fa.parallel(R, n.obs=696) fa.parallel(R, n.obs=696, fa=“fa”) fa.parallel(R, n.obs=696, fa=“pc”) #################### # Exam 8:31个国家/地区预期寿命数据分析 ###################### library(ggplot2) library(GGally) library(psych) life.data <- read.table("life.txt") life <- structure(.Data=life.data[,2:9], class = "data.frame", names = c("m0", "m25", "m50", "m75", "w0", "w25", "w50", "w75"), row.names = life.data[,1]) # 计算相关系数矩阵 R <- cor(life) R pairs(life) # 绘制碎石图,确定因子个数k x <- scale(life, center = T, scale = T) # k = fa.parallel(x, fa = "fa") x factanal(life, factors=2, rotation="varimax") factanal(life, factors=3, rotation="varimax") #因子得分 life.Thompson <- factanal(life, factors=3, rotation="varimax", scores="regression")$scores life.Thompson life.Bartlett <- factanal(life, factors=3, rotation="varimax", scores="Bartlett")$scores life.Bartlett #使用3-D图绘制此数据集的3个因子得分 # 使用原始数据和分数创建数据框 library(lattice) life.df <- data.frame(life, life.Thompson) attach(life.df) cloud(Factor3 ~ Factor1 * Factor2, xlim=range(Factor1), ylim=range(Factor2), zlim=range(Factor3),pch=row.names(life.df),scales = list(distance = rep(1, 3), arrows = FALSE)) row.names(life.df)[order(Factor1)] row.names(life.df)[order(Factor2)] row.names(life.df)[order(Factor3)] # 进行因子得分的二维散点图 # Factor 2 vs. Factor 1 plot(Factor1, Factor2, type='n', xlab='Factor 1 scores', ylab='Factor 2 scores') text(Factor1, Factor2, labels = row.names(life.df), cex = 0.7) abline(h=0,lty=2) abline(v=0,lty=2) #Factor 3 vs. Factor 1 plot(Factor1, Factor3, type='n', xlab='Factor 1 scores', ylab='Factor 3 scores') text(Factor1, Factor3, labels = row.names(life.df), cex = 0.7) abline(h=0,lty=2) abline(v=0,lty=2) # Factor 3 vs. Factor 2 plot(Factor2, Factor3, type='n', xlab='Factor 2 scores', ylab='Factor 3 scores') text(Factor2, Factor3, labels = row.names(life.df), cex = 0.7) abline(h=0,lty=2) abline(v=0,lty=2)