###############################################
#
#     INTRO TO R INTEGRATED ASSIGNMENT: ggplot2
#
###############################################

## library in ggplot2 (if you haven't installed it yet, use the command install.packages("ggplot2"))
library('ggplot2')

## First set your working directory
my.wd <- "C:/Users/larun/Desktop/CBW/Intro to R/Data/"
setwd(my.wd)

## READ IN DATA FOR YOUR PLOT
myplot.df <- readRDS("MYC-let-7-ggplot-data.rds")

## This is the normalized expression of MYC 
##  and a member of the miRNA let-7 family from the TCGA HNSC data set 
##  This data frame also contains gender and 
##  "tumor_nuclei_percent" to indicate sample purity
head(myplot.df)

## 
##   GGPLOT2 SCATTER PLOT WITH REGRESSION LINE
##

## Use ggplot to make a simple scatter plot of y=MYC vs x=let-7
(scatter1 <- ggplot(data = myplot.df, aes(y=MYC,x=hsa.let.7e.3p)) + 
  geom_point(aes(col=gender)) + 
  geom_smooth(method="lm") + 
  xlab("hsa-let-7a-3p") + 
  ylab("MYC"))

## Now change the colors and the legend
(scatter2 <- scatter1 + scale_color_manual(values=c("male"="blue","female"="red"),name="Legend"))

## Now make your points bigger
(scatter3 <- scatter2 + geom_point(aes(col=gender),size=3))

## Now change the background and make the legend title larger and bold
(scatter4  <-  scatter3 +
  theme_bw() +
  theme(legend.title = element_text(size=16, face="bold"))) 

## Now chnage the size of the points based on the variable: tumor_nuclei_percent
(scatter5 <- scatter4 + geom_point(aes(col=gender,size=tumor_nuclei_percent)))

## It's super cluttered so it's hard to see a pattern. 
##  Let's create a variable that separates our variable 'tumor_nuclei_percent'
##    into 4 categories based their values 
myplot.df$tum.nucl.cl <- NA
myplot.df$tum.nucl.cl[myplot.df$tumor_nuclei_percent < summary(myplot.df$tumor_nuclei_percent)["Median"]] <- "Low"
myplot.df$tum.nucl.cl[myplot.df$tumor_nuclei_percent >= summary(myplot.df$tumor_nuclei_percent)["Median"]] <- "High"

## create a factor variable of our "Low" and "High" categories so they are ordered in our graphs
myplot.df$tum.ncl.fac <- factor(myplot.df$tum.nucl.cl,levels = c("Low","High"))

## Check the distribution of indivdiuals who fall into "Low" and "High" using the table() function
table(myplot.df$tum.nucl.cl)

## First let's use facet_grid to look at the histograms of these groups by their membership
##    using facet_wrap
full_hist <- ggplot(myplot.df,aes(x=tumor_nuclei_percent)) + geom_histogram()
full_hist + facet_grid(. ~ tum.ncl.fac)

## Now facet our scatterplot we made based on membership in the groups we created
(facet_plot1 <- ggplot(data = myplot.df, aes(y=MYC,x=hsa.let.7e.3p)) + 
    geom_smooth(method="lm") + 
    xlab("hsa-let-7a-3p") + 
    ylab("MYC") +
    geom_point(aes(col=gender,size=tumor_nuclei_percent)) +
    theme_bw() +
    scale_color_manual(values=c("male"="blue","female"="red"),name="Legend") +
    theme(legend.title = element_text(size=16, face="bold")) + 
    facet_grid(. ~ tum.ncl.fac))

## Try splitting by gender
facet_plot1 + facet_grid(. ~ gender)


## 
##   GGPLOT2 OVERLAYED HISTOGRAMS
##

## First, use ggplot to make a basic histogram of let-7's expression

## plotting histogram for let-7's normalized expression
(hist1 <- ggplot(data = myplot.df, aes(x=hsa.let.7e.3p)) + geom_histogram())

## Now make a plot with the two histograms of each gender on top of each other 
## with different colors for each
(hist2 <- hist1 + 
  geom_histogram(aes(fill = gender)))

## Now change the colors within each histogram
## (for example: "forestgreen" and "darkorchid4")
(hist3 <- hist2 +  
  scale_fill_manual(values = c("male" = "forestgreen","female" = "darkorchid4")))

## The histogram looks really blocky, let's increase the number of bins
(hist4 <- ggplot(data = myplot.df, aes(x=hsa.let.7e.3p)) +  
    scale_fill_manual(values = c("male" = "forestgreen","female" = "darkorchid4")) + 
  geom_histogram(bins = 100,aes(fill = gender)))

## Now make the background different from the default (e.g. make it black and white themed)
(hist5 <- hist4 + 
  theme_minimal())

## Change the legend title from "gender" to "legend" and
##  the x-axis label from "hsa.let.7e.3p" to "hsa-let-7e-3p"
(hist6 <- hist5  +  
  scale_fill_manual(values = c("male" = "forestgreen","female" = "darkorchid4"),name="Legend") + 
    xlab("hsa-let-7e-3p")) 

## What if we do facet_wrap instead of overlaying the histograms? 
hist6 + facet_grid(. ~ gender)

## 
##   GGPLOT2 BOXPLOTS
##

## First, let's make boxplots of normalized MYC expression
##  split by our "low" and "high" tumour_nuclei_percent groups
##  recall, the variable is "tum.ncl.fac"

## Name the axes, change the colors, make the lengend title blank, 
##    and change the background from the default
(bp1 <- ggplot(myplot.df,aes(x = tum.ncl.fac,y=MYC)) + 
  geom_boxplot(aes(fill = tum.ncl.fac)) + 
    xlab("Tumour Nuclei Percentage") + 
    scale_fill_manual(values = c("Low" = "seagreen","High"="red4"),name = "") + 
    theme_light())

## Now let's annotate the outliers in the "High" group as "outliers" 
(bp2 <- bp1 + annotate("text", x = 1.8, y = 4, label = "Outliers",cex = 8))

## Finally, let's split these graphs into facets based on sex: 
bp2 + facet_grid(. ~ gender)
  ## Note that facet_grid will apply everything that has been done in the original graph
  ## to both graphs (i.e 2 labes of outliers)