--- title: "Multivariate Stats in R" author: "Mike Herrmann" date: "Thursday, October 29, 2015" output: pdf_document: toc: yes --- # Statistical tests that we will be covering 1. ANOVA and LM 2. Principal Component Analysis 3. Discriminant Analysis 4. Non-metric Multidimentional Scaling (NMDS) ```{r eval=FALSE} # Install required packages install.packages("MASS") install.packages("ggplot2") install.packages("vegan") install.packages("Ecdat") install.packages("ellipse") install.packages("rgl") ``` ```{r warning=FALSE, message=FALSE} # Load packages library(MASS) library(ggplot2) library(vegan) library(Ecdat) library(ellipse) ``` # ANOVA, ANCOVA, and LM First example, basic ANOVA. We will use a diamond dataset in the package Ecdat. It compares the price of diamonds to several measurements. ```{r} attach(Diamond) View(Diamond) summary(Diamond) #test for effects of transmission type, starting with a simple one-way test diamond_test1 <- aov(price ~ clarity) summary(diamond_test1) #add in more factors, using * between factors will give you a full-factorial design #A WARNING! R uses a type-I sequencial SS for their analysis. THIS ANALYSIS WILL GIVE DIFFERENT RESULTS BASED ON VARIABLE ORDER FOR NON-ORTHOGONAL DESIGNS! Watch for this if you do not have equal observations in all of your variables! diamond_test3 <- aov(price ~ colour*clarity*certification) summary(diamond_test3) #To look at diagnostic plots, we can use the plot function plot(diamond_test3) #to see effect directions, boxplots work well boxplot(price~clarity) plot(price~carat) #For an ANCOVA, replace * with +. This is also useful if you have a random variable, such as with a blocking design, that you want to try to account for. diamond_ancova<-aov(price ~ clarity + colour) summary(diamond_ancova) #linear models work essentially the same way, but instead of "aov", we use "lm" diamond_lm <- lm(price ~ carat) summary(diamond_lm) plot(diamond_lm) detach(Diamond) ``` # Principal Component Analysis (PCA) Principal component analysis- to look at this, we will use a different sample data set that comes with R, the Iris dataset ```{r} attach(iris) summary(iris) par(mfrow=c(1,1)) PC <- princomp(iris[,1:4], scores=TRUE) summary(PC) PC$loadings PC$loadings[,1:3] plot(PC, type="lines") biplot(PC) biplot(PC, xlabs=Species) biplot(PC) #set up a 3D plot library(rgl) plot3d(PC$scores[,1:3], col=as.integer(Species)) #We can perform unsupervised clustering using k-means cl <- kmeans(iris[,1:4],3) iris$cluster <- as.factor(cl$cluster) #Now lets check how our clustering performed table(iris$cluster, Species) ``` # Linear Discriminant Analysis ```{r} # Check for collinear values cor <- cor(iris[,1:4]) dissimilarity <- 1-abs(cor) distance <- as.dist(dissimilarity) hc <- hclust(distance) clusterV=cutree(hc, h=0.05) clusterV cor(Petal.Length, Petal.Width) Petal.Width <- NULL # The LDA irisLDA <- lda(iris[,1:4], Species, CV=FALSE) irisCV <- lda(iris[,1:4], Species, CV=TRUE) plot(irisLDA) irisLDA$counts #CV table(Species, irisCV$class) pred <- predict(irisLDA) #Percent explained by each DA prop <- irisLDA$svd^2/sum(irisLDA$svd^2) prop #use ggplot to make a much prettier version of the LDA plot pred<-data.frame(Species=predict(irisLDA)$class,predict(irisLDA)$x) library(ellipse) dat_ell <- data.frame() for(g in levels(pred$Species)){ dat_ell <- rbind(dat_ell, cbind(as.data.frame(with(pred[pred$Species==g,], ellipse(cor(LD1, LD2), scale=c(sd(LD1),sd(LD2)),centre=c(mean(LD1),mean(LD2))))),Species=g))} ggplot(pred, aes(x=LD1, y=LD2, col=Species) ) + geom_point( size = 4, aes(color = Species))+theme_bw()+geom_path(data=dat_ell,aes(x=x,y=y,color=Species),size=1,linetype=2) ``` # Non-metric Multidimentional Scaling (NMDS) ```{r} set.seed(2) #set up random species data community_matrix=matrix( sample(1:100,300,replace=T),nrow=10, dimnames=list(paste("community",1:10,sep=""),paste("sp",1:30,sep=""))) example_NMDS=metaMDS(community_matrix, # Our community-by-species matrix k=2) # k = number of dimentions #stressplot - if points remain close to the line, our data retains its original differences despite the dimention reduction stressplot(example_NMDS) #plot nmds- communities ("sites") are open circles, species are red crosses plot(example_NMDS) #we can use ordiplot to create a plot with labels ordiplot(example_NMDS,type="n") orditorp(example_NMDS,display="species",col="red",air=0.01) orditorp(example_NMDS,display="sites",cex=1.25,air=0.01) treat=c(rep("Treatment1",5),rep("Treatment2",5)) ordiplot(example_NMDS,type="n") ordiellipse(example_NMDS,groups=treat,draw="polygon",col="grey90",label=F) orditorp(example_NMDS,display="species",col="red",air=0.01) orditorp(example_NMDS,display="sites",col=c(rep("green",5),rep("blue",5)), air=0.01,cex=1.25) ```