######################## SURVIVAL LAB OUTLINE ########################## #The goal of the lab is to to learn to run Survival analysis using high #resolution data. The lab is based on the MSKCC prostate cancer study #(Taylor et al, Cell 2010; pdf file was provided prior to the workshop and #was printed out for students), which published integrated genome and #transcriptome analysis of ~200 tumour samples with rich clinical #annotation. Using this data we will: # (1) Get an understanding of how to work with clinical data and learn # basic R functions for survival analysis. For this we will reproduce the # survival analysis of reported genomically distinct subgroups of PCa (as # in Figure 4 of the paper) using MSKCC clinical info table. (2) To learn # how to discover novel subgroups of tumours that might be associated with # outcome. For this we will use processed whole genome copy number data of # tumours for unsupervised hirarchical clustering. The observed subclasses # of tumours with different genomic landscape will be tested for # difference in survival. (3) To learn how to associate aberrations in # multiple genes (gene signatures) with outcome. For this, we will use the # list of 100 neuroendocrine associated genes and their CNA data that was # adapted form original CNA MSKCC data for the purposes of the lab. We # will explore how the aberrations in multiple genes effect outcome of # patients. (4) To learn how to use different data types (such as copy # number and gene expression) to associate aberrations within individual # candidate biomarker genes with outcome. For this we will use MSKCC # processed genome copy number (Agilent aCGH) and gene expression data # (Affymetrix Exon Array) to explore the impact of aberrations in PTEN, # and neuroendocrine markers (CHGA, CHGB, SYP). PTEN is the known tumour # supressor in PCa, which is frequently deleted/mutated/downregulated. # Neuroendocrine markers presense identifies an agressive phenotype of PCa # (neuroendocrine prostate cancer), which is associated with poor outcome. # There are many NE markers, of which the mentioned 3 genes are commonly # used for histological classification. # # ############ (1) Get an understanding of how to work with clinical data ############ and learn basic R functions for survival analysis. ############ ########################## ############ ######### Load the package ############# library(survival) #read the functions descriptions: Read about the syntax of the Kaplan-Meyer #estimator function: ?survfit #Read about the syntax of the log-rank test ?survdiff ######### read the clinical data in ############# ######### clinData<-read.table("TaylorClinicalData_for_CNA_data.txt", header = TRUE, sep = "\t", na.strings = "NA", quote = "", comment.char = "") ######### Run Kaplan-Meyer analysis ############# ######### #fit and plot Kaplan-Meyer curves for Primary vs Mets: pFit <- survfit(Surv(BCR_FreeTime, BCR_Event) ~ Type, data = clinData) plot(pFit, col = c("green", "purple")) legend("topright", levels(clinData$Type), lty = 1, col = c("green", "purple")) ######### Run Log-rank test ############# ######### survdiff(Surv(BCR_FreeTime, BCR_Event) ~ Type, data = clinData) #define a custom function to pull the p-value out of the Log-rank test getPval <- function(x) { if( is.matrix(x$obs)) etmp <- apply(x$exp, 1, sum) else etmp <- x$exp df<- (sum(1 * (etmp > 0))) - 1 pv <- 1 - pchisq(x$chisq, df) format(pv, digits = 3) } } pValue<-getPval(survdiff(Surv(BCR_FreeTime, BCR_Event) ~ Type, data = clinData)) #add p-value on the plot legend("bottomright", paste("p-value=", pValue), col = "black") ######### Subset the data to look at primary tumours only. Plot survival ######### according to the tumour grade ############# ######### # subset the data to look at primary tumours only and re-construct the data # object: #the factor has to be rebuilt since the column still knows about the #"NA" #values; Use columns "Type" and "PathGGS" - Pathological Gleason Grade #Score. subsetData<-subset(clinData, Type=="PRIMARY") subsetData$Type<-factor(subsetData$Type) pFit <- survfit(Surv(BCR_FreeTime, BCR_Event) ~ PathGGS, data = subsetData) plot(pFit, col = c("green", "purple", "magenta", "blue", "orange")) legend("topright", levels(subsetData$PathGGS), lty = 1, col = c("green", "purple", "magenta", "blue", "orange")) pValue<-getPval(survdiff(Surv(BCR_FreeTime, BCR_Event) ~ PathGGS, data = subsetData)) legend("bottomright", paste("p-value=", pValue), col = "black") # The following will compare only two subgroups: grade 6 and 9 getPval(survdiff(Surv(BCR_FreeTime, BCR_Event) ~ PathGGS, data = subset(subsetData, PathGGS == "6" | PathGGS == "9"))) ######### Exercise on your own: Plot survival according to the Copy ######### number Cluster reported in Taylor et al (Figure 4) ######### ############# # Tip: the column containing cluster assignment is "Copy.number.Cluster" # Run survfit function for groups in that column. Run log-rank test and # report the p-value on the plot. # # # ############ (2) To learn how to discover novel subgroups of tumours that ############ might be associated with outcome. ############ ############################## ############ # a) Using MSKCC_PCa_RAE_gene_calls.txt file select top 500-1000 most # variable genes. # Tip: RAE algorithm identifies genes that are copy number changed (0 - # normal; -1 - deleted heterozygously; -2 - deleted homozygously; >1 - # amplification). Use these scores to select most frequently aberrant # genes. # b) Cluster RAE scores of selected genes in GenePattern. Login to # GenePattern server first using your own login. # Tip: create gct input file; see example here: # MSKCC_PCa_RAE_gene_calls_top1000CNA_genes.gct; Tip: use unsupervised # clustering module of GenePattern with the following parameters: # city-block distance, complete linkage method. Check: compare your # clusering result with the Taylor et al. For this, label samples in # GenePattern with regard to the discovered clusters (1 through 6, # figure 4 from the paper). The sample label file to use is here: # MSKCC_sample_label_GenePattern.txt # c) Inspect the sample dendrogram and define the number of clusters # according to the genomic landscape (2,3,4 clusters). Save the image as # png file # and the sample legend as well. Export cluster labels from GenePattern. # Tip: select the sample cluster dendrogram and one of the genes. Click # Menu->File-> Save Dataset. Save the file on the local drive. This file # will contain the sample list from the selected cluster. Tip: add # cluster information to the clinical info file using R functions or # Excel (VLOOKUP function). Note: several outlier samples will not be # assigned to major clusters. Assign NA to those samples. # d) Run Kaplan-Meyer analysis to see the survival experience of discovered # clusters. # # Help: after discovering clusters and compiling a file with cluster # assignment following code can be used to run KM. Use # discovery_cluster_example.txt as an example. # discoverClusters<-read.table("discovery_cluster_example.txt", header = TRUE, sep = "\t", na.strings = "NA", quote = "", comment.char = "") clusterClinData<-merge(clinData, discoverClusters, by.x="PCA.ID", by.y="PCA.ID", all.x=TRUE) pFit <- survfit(Surv(BCR_FreeTime, BCR_Event) ~ cluster, data = clusterClinData) plot(pFit, col = c("green", "purple", "magenta", "blue", "orange")) legend("topright", levels(clusterClinData$cluster), lty = 1, col = c("green", "purple", "magenta", "blue", "orange")) pValue<-getPval(survdiff(Surv(BCR_FreeTime, BCR_Event) ~ cluster, data = clusterClinData)) legend("bottomright", paste("p-value=", pValue), col = "black") # Which cluster has the worst outcome? Check the survival difference # between most aberrant tumours and most "normal" (flat) ones. Report the # p-value. # # ############ (3) To learn how to associate aberrations in multiple genes ############ (gene signatures) with outcome. #For this, we will use the list of 100 neuroendocrine associated genes and #their CNA data that was adapted form original CNA MSKCC data for the #purposes of the lab. Neuroendocrine genes up-regulation/copy number gain #is associated with aggressive NEPC. Therefore, only gains of these genes #will be of interest. # # d<-clinData # BAALC is the last gene. the next # code summarizes the number of # samples with a certain NE gene # aberated. sink("summaryPerGeneNE.txt") for( colNum in 2:108 ) { print(names(d)[colNum]) print(summary(d[[colNum]])) } sink() # just look at the gains; ignore # instances of losses. these are # all the call types # # CN Gain CN Gain, High Copy Gain # CN Loss CN Loss, Homozygous Copy # Loss High Copy Gain Homozygous # Copy Loss # # data frame copy for gains only g <- d write.csv(g, file = "gNE.csv") # translate the CN calls into # numbers for( colNum in 2:108 ) { # losses are assigned 0; neglected ## colNum <- 10 g[[colNum]] <- sub("CN Loss, Homozygous Copy Loss", "0", g[[colNum]]) g[[colNum]] <- sub("Homozygous Copy Loss", "0", g[[colNum]]) g[[colNum]] <- sub("CN Loss", "0", g[[colNum]]) # gains are assigned 1, counted in. g[[colNum]] <- sub("CN Gain, High Copy Gain", "1", g[[colNum]]) g[[colNum]] <- sub("High Copy Gain", "1", g[[colNum]]) g[[colNum]] <- sub("CN Gain", "1", g[[colNum]]) g[[colNum]][ g[[colNum]]==""] <- 0 g[[colNum]] <- as.numeric( g[[colNum]] ) } } # gives the number of samples that # have a gain aberation for a # certain gene. sink("number_of_samples_with_amplification_per_gene.txt") for( colNum in 2:108 ) { print(names(g)[colNum]) print(sum(g[[colNum]])) } sink() # sum the amplified genes per sample # g$NEnumber <- apply(subset(g, select = 2:108), 1, sum) # test if this NEnumber is # predictive for outcome. Maybe # there needs to be two groups # based on the NEnumber # # only can do KM on those samples # that have a time to recurrence. # g <- subset(g, ! is.na(BCR_FreeTime)) pdf(file="KMplotForAllNEnumberCutoffs.pdf", width=20, height=20) # list all the NE numbers within # the data set to then loop through # them all allNEnums <- sort(unique(g$NEnumber)) # loop through all the NE numbers # and do a KM plot with p-value for( NEnumCutOff in allNEnums[-1]) { ## NEnumCutOff <- 14 g$NEnumGroup <- "low" g$NEnumGroup[ g$NEnumber >= NEnumCutOff ] <- "high" g$NEnumGroup <- factor(g$NEnumGroup) gSurFitGemcap <- survfit(Surv(time = BCR_FreeTime, event = BCR_Event) ~ NEnumGroup , data = g) plot(gSurFitGemcap, col = c("green", "purple"), main = paste("NEnumber cutoff: >=", NEnumCutOff, ", p-value: ", getPval( survdiff(Surv(time = BCR_FreeTime, event = BCR_Event) ~ NEnumGroup, data = g) ) ) ) abline(v = c(36, 60), col = c("blue", "red"), lty = 2) abline(h = 0.5, col = "gray", lty = 2) legend("topright", paste("NE number", levels(g$NEnumGroup), summary(g$NEnumGroup)[1:2]), lty = 1, col = c("green", "purple")) legend("bottomright", c("3 years", "5 years"), lty = 2, col = c("blue", "red")) par(xpd=TRUE) } dev.off() ############ (4) To learn how to use different data types (such as copy ############ number and gene expression) ############ to associate ############ aberrations within individual candidate biomarker with ############ outcome. ############ # read the clinical data, used to be clinData cd <- read.table("TaylorClinicalData_for_CNA_data.txt", header = TRUE, sep = "\t", na.strings = "NA", quote = "", comment.char = "") # copy number data cn <- read.table("MSKCC_PCa_RAE_gene_calls.txt", header=TRUE, sep="\t", na.strings = "NA", quote = "", comment.char = "") # gene expression data ge <- read.table("MSKCC_PCa_mRNA_Zscores.txt", header=TRUE, sep="\t", na.strings = "NA", quote = "", comment.char = "") # toss the duplicated gene symbols in cn cn <- cn[! duplicated(cn$GeneSymbol), ] # only use samples that have ge cn and cd information cnSamples <- names(cn)[3:length(cn)] geSamples <- names(ge)[3:length(ge)] cdSamples <- cd$PCA.ID # copy number samples that also exist in gene expression cnGeSamples <- cnSamples[cnSamples %in% geSamples] # above samples also found in clinical information cnGeCdSample <- cnGeSamples[cnGeSamples %in% cdSamples] # add the gene id and gene symbol to the shared sample so they kept in the # next step cnGeCdSample <- c("GeneID", "GeneSymbol", cnGeCdSample) # keep only rows of clinical data that belong to the samples that have data # in all three columns, plus the columsn GeneID and GeneSymbol. cd <- subset(cd, PCA.ID %in% cnGeCdSample) # keep only the columns of those samples that are shared cn <- subset(cn, select= names(cn) %in% cnGeCdSample ) ge <- subset(ge, select= names(ge) %in% cnGeCdSample ) # set gene of interest goi <- "PTEN" # get the cn data for goi cnGoi <- subset(cn, GeneSymbol==goi, select=grep("Gene",names(cn), invert=TRUE )) # cn samples of interest, where the gene of interest has the aberration of # interest (PTEN - loss => the cut is <0) cnSoi <- names(cnGoi)[cnGoi<0] # get the ge data for goi geGoi <- subset(ge, GeneSymbol==goi, select=grep("Gene",names(ge), invert=TRUE )) # ge samples of interest, the cut is <= -2 geSoi <- names(geGoi)[geGoi<=-2] # un-comment the option that you are interested in below. For PTEN I am # using the last one to get more samples for the Kaplan Meier graph. # # sample of interest is that which came up in ge as well as in cn data soi # <- cnSoi[cnSoi %in% geSoi] # # if you only want to run this on gene expression data un-comment the next # line comment three lines above soi <- geSoi # # of if you only care about copy number results un-comment soi <- cnSoi # # if it has to just turn up in one or the other soi <- unique(geSoi, cnSoi) ######### Run Kaplan-Meyer analysis ############# ######### ######### cdSoi <- rep("others", length(cd[[1]])) cdSoi[cd$PCA.ID %in% soi] <- "samples of interest" cdSoi <- factor(cdSoi) pFit <- survfit(Surv(BCR_FreeTime, BCR_Event) ~ cdSoi, data = cd) plot(pFit, col = c("green", "purple"), main = goi) legend("topright", levels(cdSoi), lty = 1, col = c("green", "purple")) ######### Run Log-rank test ############# ######### survdiff(Surv(BCR_FreeTime, BCR_Event) ~ cdSoi, data = cd) #define a custom function to pull the p-value out of the Log-rank test # pValue<-getPval(survdiff(Surv(BCR_FreeTime, BCR_Event) ~ cdSoi, data = cd)) #add p-value on the plot legend("bottomright", paste("p-value=", pValue), col = "black") ######### Exercise on your own ############# # plot the survival of tumours that have over-expression of at least one of # the neuroendocrine markers CHGA, CHGB or SYP. #Tip: modify the above code to use only expression data for multiple genes #instead of multiple data sets for a single gene (as with PTEN). # Over-expression cut-off z-score >=2. # #