# download KnockoffZoom summary statistics download_KZ_data = function(data_dir){ discoveries_dir=sprintf("%s/ukb_discoveries", data_dir) statistics_dir=sprintf("%s/ukb_stats", data_dir) discoveries_file="ukb_discoveries.zip" statistics_file="ukb_stats.zip" remote="https://msesia.github.io/knockoffzoom/data" if(!dir.exists(discoveries_dir)){ cat(sprintf("Downloading KnockoffZoom discoveries...\n")) url = sprintf("%s/%s", remote, discoveries_file) dest = sprintf("%s/%s", data_dir, discoveries_file) download(url, dest, mode="wb") unzip(dest, exdir = data_dir) file.remove(dest) } if(!dir.exists(statistics_dir)){ cat(sprintf("Downloading KnockoffZoom statistics...\n")) url = sprintf("%s/%s", remote, statistics_file) dest = sprintf("%s/%s", data_dir, statistics_file) download(url, dest, mode="wb") unzip(dest, exdir = data_dir) file.remove(dest) } } # helper function to read KZ data for one phenotype read_phenotype = function(phenotype, data_dir){ statistics = read_delim(file = sprintf("%s/ukb_stats/ukb_%s_kz_res2.txt", data_dir, phenotype), delim = " ", col_types = "iiciiddc") discoveries = read_delim(file = sprintf("%s/ukb_discoveries/ukb_%s_kz_res2.txt", data_dir, phenotype), delim = " ", col_types = "icidiiiii") R2.max = 0.99 # maximum allowable correlation between knockoff and original variable statistics = statistics %>% filter(is.na(R2)|R2<=R2.max) %>% select(CHR, Group, W) discoveries = discoveries %>% select(CHR, Group, BP.min, BP.max) df = left_join(statistics, discoveries, by = c("CHR", "Group")) # sort by decreasing order of magnitude mag_order = order(abs(df$W), decreasing = TRUE) df = df[mag_order,] df$Phenotype = phenotype return(df) } # read and process low-resolution phenotype data read_KZ_data = function(data_dir, phenotypes){ # read all phenotypes dfs_list = lapply(phenotypes, function(phenotype)(read_phenotype(phenotype, data_dir))) # combine into one data frame df = do.call("rbind", dfs_list) return(df) } # definition of informative points get_informative_points = function(values, min_val_improvement = 0.01, min_k_improvement = 5){ k_vals = c() m = length(values) while(TRUE){ still_eligible = !logical(m) if(length(k_vals) > 0){ ineligible_k = max(k + min_k_improvement - 1, max(which(values <= min_val + min_val_improvement))) still_eligible[1:ineligible_k] = FALSE } if(!any(still_eligible)){ break } min_val = min(values[still_eligible]) k = which(still_eligible)[which.min(values[still_eligible])] k_vals = c(k_vals, k) } return(k_vals) } # compute FDP-hat based on knockoff statistics W get_FDP_hat = function(W){ return((1+cumsum(W<0))/cumsum(W>0)) } # compute FDP-bar based on knockoff statistics W get_FDP_bar = function(W, C){ num_discoveries = cumsum(W>0) # initial bound on V max_false_positives = pmin(num_discoveries, floor(C*(1+cumsum(W<0)))) min_true_positives = num_discoveries - max_false_positives # interpolate max_false_positives = rev(cummin(rev(max_false_positives))) min_true_positives = cummax(min_true_positives) max_false_positives = pmin(max_false_positives, num_discoveries - min_true_positives) return(max_false_positives/num_discoveries) } # find number of rejections made for given phenotype # when controlling FDR at level q get_FDR_thresh = function(df, phenotype, q){ df_phenotype = df %>% filter(Phenotype == phenotype) eligible = df_phenotype$num_rejections[df_phenotype$FDP_hat <= q] if(length(eligible) > 0){ return(max(eligible)) } else{ return(0) } } # find number of rejections made for given phenotype # when controlling FDP at level q get_FDP_thresh = function(df, phenotype, q){ df_phenotype = df %>% filter(Phenotype == phenotype) eligible = df_phenotype$num_rejections[df_phenotype$FDP_bar <= q] if(length(eligible) > 0){ return(max(eligible)) } else{ return(0) } } # Read GREAT results read_great_results = function(data_dir, k){ great_results = read_tsv(sprintf("%s/GREAT_output/greatExportAll_%d.tsv", data_dir, k), skip = 3, n_max = 500, col_types = "-cc-d--d----------------") # great_results = great_results %>% filter(Desc %in% GO_terms) %>% select("ID", "Desc", "BinomP", "RegionFoldEnrich") great_results = great_results %>% select("ID", "Desc", "BinomP", "RegionFoldEnrich") great_results$set = k return(great_results) }