# Preprocessing datasets
# 2021/02/22

# This script will serve to process datasets (Asthma, RA, T1D-noHLA, T1D, T2D, BRCA, PRCA, MDD, CAD, BMI and Height).
# Also, we'll apply QC recommended by Florian Privé in previous versions.


library(data.table)
library(bigsnpr)
library(ggplot2)
setDTthreads(8)

## Helper function for quant traits

sdY.est <- function(vbeta, maf, n) {
  warning("estimating sdY from maf and varbeta, please directly supply sdY if known")
  oneover <- 1/vbeta
  nvx <- 2 * n * maf * (1-maf)
  m <- lm(nvx ~ oneover - 1)
  cf <- coef(m)[['oneover']]
  if(cf < 0)
    stop("estimated sdY is negative - this can happen with small datasets, or those with errors.  A reasonable estimate of sdY is required to continue.")
  return(sqrt(cf))
}

## QC variants
# For consistency across methods, we'll filter our summary statistic datasets by HapMap3 variants.
hm3 <- readRDS(url("https://ndownloader.figshare.com/files/25503788"))
hm3 <- hm3[,1:4]
# Additionally, we'll create another set of datasets, filtered by QC'd (ie. maf > 0.01 and hwe p-value > 1e-10) 1000 Genomes variants, for RápidoPGS comparison across different SNP sets.
#varlist <- fread("../datasets/RapidoPGS_panel_variants.txt", header=FALSE)
#varlist <- varlist[,V1]


## Asthma dataset

asthma <- fread("ftp://ftp.ebi.ac.uk/pub/databases/gwas/summary_statistics/DemenaisF_29273806_GCST006862/harmonised/29273806-GCST006862-EFO_0000270-build37.f.tsv.gz",
	select = c(1,  3:8),
	col.names = c("CHR", "BP", "REF", "ALT", "BETA", "SE", "P"))# European dataset Demenais et al., 2018

asthma$n_eff <- 4 / (1 / 19954 + 1 / 107715)

# We need the frequencies
fwrite(asthma, "../datasets/Asthma-NOFREQS.tsv.gz", sep="\t")

# Let's compute the frequencies using an external tool and the RápidoPGS panel
system("Compute_freqs.sh -f ../datasets/Asthma-NOFREQS.tsv.gz -p CEU")

# Recover the file
asthma <- fread("../datasets/Asthma-withfreqs.tsv.gz")

## Apply QC recommended by LDpred2 authors.

sd_ss <- with(asthma, 2 / sqrt(n_eff * SE^2))
sd_val <- with(asthma, sqrt(2*ALT_FREQ*(1-ALT_FREQ)))

is_bad <- sd_ss < (0.5 * sd_val) | sd_ss > (sd_val + 0.1) | sd_ss < 0.1 | sd_val < 0.05 
table(is_bad)
# is_bad
#   FALSE    TRUE 
# 1987447    5685 

qplot(sd_val, sd_ss, color = is_bad, alpha = I(0.5)) +
  theme_bigstatsr() +
  coord_equal() +
  scale_color_viridis_d(direction = -1) +
  geom_abline(linetype = 2, color = "red") +
  labs(x = "Standard deviations in the validation set",
       y = "Standard deviations derived from the summary statistics",
       color = "Removed?")

ggsave("../figures/sd-approx-Asthma.png", width = 10, height = 7)

asthma <- asthma[!is_bad,]

asthma[,SNPID:=paste(CHR,BP,sep=":")]
# Filter by 1kG QC variants
#invarlist <- asthma$SNPID %in% varlist
#asthma.1kg <- asthma[invarlist,]

# Save file with 1kG variants
#fwrite(asthma.1kg, "../datasets/Asthma-1kg.tsv.gz", sep="\t")

# Filter (and align) to hm3 variants manifest
setnames(asthma, c("CHR","BP","REF","ALT", "BETA"), c("chr","pos","a0","a1","beta"))
asthma.hm3 <- as.data.table(snp_match(asthma, hm3, strand_flip = FALSE))
#1,987,447 variants to be matched.
#0 ambiguous SNPs have been removed.
#932,300 variants have been matched; 0 were flipped and 0 were reversed.
setnames(asthma.hm3,c("chr","pos","a0","a1","beta"), c("CHR","BP","REF","ALT", "BETA"))
asthma.hm3[,c("_NUM_ID_.ss","_NUM_ID_"):=NULL]

# Save file with HapMap3 variants
fwrite(asthma.hm3, "../datasets/Asthma-hm3.tsv.gz", sep="\t")

rm(asthma, asthma.1kg, asthma.hm3, sd_ss, sd_val, is_bad, invarlist)


## RA (Rheumatoid Arthritis) dataset

# Now I chose to analyse the European-only meta-analysis instead. (RA_1 in our repo).
ra <- fread("http://plaza.umin.ac.jp/~yokada/datasource/files/GWASMetaResults/RA_GWASmeta_European_v2.txt.gz", 
	select= c(2:6,9),
	col.names = c("CHR", "BP", "ALT", "REF", "OR", "P"))
ra[,BETA:=log(OR)][,chi2:= qchisq(P, df = 1, lower.tail = FALSE)][,SE:= ifelse(chi2 > 1e-4, abs(BETA) / sqrt(chi2), NA)][, n_eff:= 4 / (1 / 14361 + 1 / 43923)] # Updated numbers here too


## This file didn't have MAF available, so we'll need to compute them using an external tool.
fwrite(ra, "../datasets/RA-NOFREQS.tsv.gz", sep="\t")
system("Compute_freqs.sh -f ../datasets/RA-NOFREQS.tsv.gz -p CEU")

## Recover the file
ra <- fread("../datasets/RA-withfreqs.tsv.gz")

## Apply QC recommended by LDpred2 authors.

sd_ss <- with(ra, 2 / sqrt(n_eff * SE^2))
sd_val <- with(ra, sqrt(2*ALT_FREQ*(1-ALT_FREQ)))

is_bad <- sd_ss < (0.5 * sd_val) | sd_ss > (sd_val + 0.1) | sd_ss < 0.1 | sd_val < 0.05 
table(is_bad)
# is_bad
#   FALSE    TRUE 
# 5443957 2617608 

qplot(sd_val, sd_ss, color = is_bad, alpha = I(0.5)) +
  theme_bigstatsr() +
  coord_equal() +
  scale_color_viridis_d(direction = -1) +
  geom_abline(linetype = 2, color = "red") +
  labs(x = "Standard deviations in the validation set",
       y = "Standard deviations derived from the summary statistics",
       color = "Removed?")

ggsave("../figures/sd-approx-RA.png", width = 10, height = 7)

ra <- ra[!is_bad,]

ra[,SNPID:=paste(CHR,BP,sep=":")]
ra[,c("OR","chi2"):=NULL]

# Filter by 1000G QC variants
#invarlist <- ra$SNPID %in% varlist
#ra.1kg <- ra[invarlist,]

# Write 1000G qc-filtered file
#fwrite(ra.1kg, "../datasets/RA-1kg.tsv.gz", sep ="\t")

# Filter (and align) to hm3 variants manifest
setnames(ra, c("CHR","BP","REF","ALT", "BETA"), c("chr","pos","a0","a1","beta"))
ra.hm3 <- as.data.table(snp_match(ra, hm3, strand_flip=FALSE))
#5,443,957 variants to be matched.
#0 ambiguous SNPs have been removed.
#781,276 variants have been matched; 0 were flipped and 379,961 were reversed.
setnames(ra.hm3,c("chr","pos","a0","a1","beta"), c("CHR","BP","REF","ALT", "BETA"))
ra.hm3[,c("_NUM_ID_.ss","_NUM_ID_"):=NULL]

# Save file with HapMap3 variants
fwrite(ra.hm3, "../datasets/RA-hm3.tsv.gz", sep="\t")

# little test
#hm3f <- readRDS(url("https://ndownloader.figshare.com/files/25503788"))
#test <- as.data.table(snp_match(ra, hm3f))
#plot(test$ALT_FREQ, test$af_UKBB) 
# I wanted to test correlation between CEU allele freqs and UKBB. Despite the fact that half of SNPs are flipped (thus giving me a nice X plot), the correlation is quite high.
# Let's see what happens if we consider the MAF
#test$ALT_FREQ  <- ifelse(test$ALT_FREQ > 0.5, 1 - test$ALT_FREQ, test$ALT_FREQ)
#test$af_UKBB  <- ifelse(test$af_UKBB > 0.5, 1 - test$af_UKBB, test$af_UKBB)
#plot(test$ALT_FREQ, test$af_UKBB) 

rm(ra, ra.hm3, ra.1kg, sd_ss, sd_val, is_bad, invarlist)


## T1D (Type 1 diabetes) dataset

## We had access to the original Cooper dataset, which comes in one file.
t1d <- fread("../../../02-Processed/T1D_Cooper_doi101101120022_1-hg38.tsv.gz",
		select=c(2:6,16,17,19,28),
		col.names=c("CHR", "BP","REF", "ALT","ALT_FREQ","BETA","SE","P","qc.check"))

# NOTE: This file comes with allele frequencies, which we'll use in this case for QC.

## We apply QC check as in LDpred2 paper, including removing alleles with qc.check different than PASS, and those with ALT_FREQ < 1 / sqrt(n_eff)
t1d <- t1d[qc.check == "PASS",][,n_eff:=4 / (1 / 5913 + 1 / 8828)][ALT_FREQ > 1/sqrt(n_eff),][,qc.check:=NULL]
t1d <- na.omit(t1d)

## Apply QC recommended by LDpred2 authors.

sd_ss <- with(t1d, 2 / sqrt(n_eff * SE^2))
sd_val <- with(t1d, sqrt(2*ALT_FREQ*(1-ALT_FREQ)))

is_bad <- sd_ss < (0.5 * sd_val) | sd_ss > (sd_val + 0.1) | sd_ss < 0.1 | sd_val < 0.05 
table(is_bad)
# is_bad
#   FALSE    TRUE 
# 4182567    5023 

qplot(sd_val, sd_ss, color = is_bad, alpha = I(0.5)) +
  theme_bigstatsr() +
  coord_equal() +
  scale_color_viridis_d(direction = -1) +
  geom_abline(linetype = 2, color = "red") +
  labs(x = "Standard deviations in the validation set",
       y = "Standard deviations derived from the summary statistics",
       color = "Removed?")
ggsave("../figures/sd-approx-T1D.png", width = 10, height = 7)

t1d <- t1d[!is_bad,]
t1d[,SNPID:=paste(CHR,BP,sep=":")]

# Filter by 1000G QC variants
#invarlist <- t1d$SNPID %in% varlist
#t1d.1kg <- t1d[invarlist,]

# Write 1000G qc-filtered file
#fwrite(t1d.1kg, "../datasets/T1D-1kg.tsv.gz", sep ="\t")

# Filter (and align) to hm3 variants manifest
setnames(t1d, c("CHR","BP","REF","ALT", "BETA"), c("chr","pos","a0","a1","beta"))
t1d.hm3 <- as.data.table(snp_match(t1d, hm3, strand_flip=FALSE,  match.min.prop=0.4))
#4,182,567 variants to be matched.
#487,732 variants have been matched; 0 were flipped and 0 were reversed.
setnames(t1d.hm3,c("chr","pos","a0","a1","beta"), c("CHR","BP","REF","ALT", "BETA"))
t1d.hm3[,c("_NUM_ID_.ss","_NUM_ID_"):=NULL]

# Save file with HapMap3 variants
fwrite(t1d.hm3, "../datasets/T1D-hm3.tsv.gz", sep="\t")

rm(t1d, t1d.hm3, t1d.1kg, sd_ss,sd_val, is_bad, invarlist)


## T2D (Type 2 diabetes) dataset

t2d <- fread("ftp://ftp.ebi.ac.uk/pub/databases/gwas/summary_statistics/ScottRA_28566273_GCST004773/METAANALYSIS_DIAGRAM_SE1.txt",
	col.names=c("SNPID","ALT","REF","BETA","SE","P","N"))
t2d[,c("CHR","BP"):=tstrsplit(SNPID, ":", fixed = TRUE)]
t2d[,n_eff:=4 / (1 / 26676 + 1 / 132532)][,CHR:=as.numeric(CHR)][,BP:=as.numeric(BP)][,N:=NULL]

# This file doesn't have ALT_FREQ, so we'll compute it using an external tool
fwrite(t2d, "../datasets/T2D-NOFREQS.tsv.gz", sep="\t")
system("Compute_freqs.sh -f ../datasets/T2D-NOFREQS.tsv.gz -p CEU")

# Let's recover the file
t2d <- fread("../datasets/T2D-withfreqs.tsv.gz")

## Apply QC recommended by LDpred2 authors.

sd_ss <- with(t2d, 2 / sqrt(n_eff * SE^2))
sd_val <- with(t2d, sqrt(2*ALT_FREQ*(1-ALT_FREQ)))

is_bad <- sd_ss < (0.5 * sd_val) | sd_ss > (sd_val + 0.1) | sd_ss < 0.1 | sd_val < 0.05 
table(is_bad)
# is_bad
#   FALSE    TRUE 
# 6550140 1514988 

qplot(sd_val, sd_ss, color = is_bad, alpha = I(0.5)) +
  theme_bigstatsr() +
  coord_equal() +
  scale_color_viridis_d(direction = -1) +
  geom_abline(linetype = 2, color = "red") +
  labs(x = "Standard deviations in the validation set",
       y = "Standard deviations derived from the summary statistics",
       color = "Removed?")
ggsave("../figures/sd-approx-T2D.png", width = 10, height = 7)

t2d <- t2d[!is_bad,]
# Reordering
t2d <- t2d[,c("SNPID","CHR","BP", "REF","ALT","BETA","SE","P","n_eff")]

# Filter by 1000G QC variants
#invarlist <- t2d$SNPID %in% varlist
#t2d.1kg <- t2d[invarlist,]

# Write 1000G qc-filtered file
#fwrite(t2d.1kg, "../datasets/T2D-1kg.tsv.gz", sep ="\t")

# Filter (and align) to hm3 variants manifest
setnames(t2d, c("CHR","BP","REF","ALT", "BETA"), c("chr","pos","a0","a1","beta"))
t2d.hm3 <- as.data.table(snp_match(t2d, hm3, strand_flip=FALSE,  match.min.prop=0.4))
#6,550,140 variants to be matched.
#1,046,689 variants have been matched; 0 were flipped and 508,688 were reversed.
setnames(t2d.hm3,c("chr","pos","a0","a1","beta"), c("CHR","BP","REF","ALT", "BETA"))
t2d.hm3[,c("_NUM_ID_.ss","_NUM_ID_"):=NULL]

# Save file with HapMap3 variants
fwrite(t2d.hm3, "../datasets/T2D-hm3.tsv.gz", sep="\t")

rm(t2d, t2d.hm3, t2d.1kg, sd_ss,sd_val, is_bad, invarlist)


## BRCA (Breast Cancer) dataset

brca <- fread("ftp://ftp.ebi.ac.uk/pub/databases/gwas/summary_statistics/MichailidouK_29059683_GCST004988/oncoarray_bcac_public_release_oct17%20(1).txt.gz",
	      select = c("chr", "position_b37", "a0", "a1", "bcac_onco_icogs_gwas_eaf_controls", "bcac_onco_icogs_gwas_beta", "bcac_onco_icogs_gwas_se","bcac_onco_icogs_gwas_P1df"),
             col.names = c("CHR", "BP", "REF", "ALT", "ALT_FREQ",
                                 "BETA", "SE", "P"),
	     na.strings="NULL")
# NOTE: This dataset includes a meta-analysis from multiple European and East Asian patients and controls.
# It includes ALT_FREQ in the controls, but it also has imputed SNPs in source GWAS.

brca[,n_eff:=4 / (1 / 137045 + 1 / 119078)]

## Apply QC recommended by LDpred2 authors.

## NOTE: The following line wasn't present in the LDpred2 paper code, but I include it here for consistency.
brca <- brca[pmin(ALT_FREQ, 1-ALT_FREQ) > (1 / sqrt(n_eff)),]


sd_ss <- with(brca, 2 / sqrt(n_eff * SE^2))
sd_val <- with(brca, sqrt(2*ALT_FREQ*(1-ALT_FREQ)))

is_bad <- sd_ss < (0.5 * sd_val) | sd_ss > (sd_val + 0.1) | sd_ss < 0.1 | sd_val < 0.05 
table(is_bad)
# is_bad
#    FALSE     TRUE 
# 10234809  1557549 

qplot(sd_val, sd_ss, color = is_bad, alpha = I(0.5)) +
  theme_bigstatsr() +
  coord_equal() +
  scale_color_viridis_d(direction = -1) +
  geom_abline(linetype = 2, color = "red") +
  labs(x = "Standard deviations in the validation set",
       y = "Standard deviations derived from the summary statistics",
       color = "Removed?")

ggsave("../figures/sd-approx-BRCA.png", width = 10, height = 7)

brca <- brca[!is_bad,]

# Filter by 1000G QC variants
# This dataset didn't have SNPID, so let's create it
brca[,SNPID:=paste(CHR,BP,sep=":")]
#invarlist <- brca$SNPID %in% varlist
#brca.1kg <- brca[invarlist,]

# Write 1000G qc-filtered file
#fwrite(brca.1kg, "../datasets/BRCA-1kg.tsv.gz", sep ="\t")

# Filter (and align) to hm3 variants manifest
setnames(brca, c("CHR","BP","REF","ALT", "BETA"), c("chr","pos","a0","a1","beta"))
brca.hm3 <- as.data.table(snp_match(brca, hm3, strand_flip=FALSE))
#10,234,809 variants to be matched.
#1,053,580 variants have been matched; 0 were flipped and 0 were reversed.
setnames(brca.hm3,c("chr","pos","a0","a1","beta"), c("CHR","BP","REF","ALT", "BETA"))
brca.hm3[,c("_NUM_ID_.ss","_NUM_ID_"):=NULL]

# Save file with HapMap3 variants
fwrite(brca.hm3, "../datasets/BRCA-hm3.tsv.gz", sep="\t")

rm(brca, brca.hm3, brca.1kg, sd_ss,sd_val, is_bad, invarlist)


## PRCA (Prostate Cancer) dataset

prca <- fread("ftp://ftp.ebi.ac.uk/pub/databases/gwas/summary_statistics/SchumacherFR_29892016_GCST006085/meta_v3_onco_euro_overall_ChrAll_1_release.txt", 
	       select = c("Chr", "position", "Allele1", "Allele2", "Effect", "StdErr",
             "Pvalue", "Freq1"),
  col.names = c("CHR", "BP", "ALT", "REF", "BETA", "SE", "P", "ALT_FREQ"))
prca[,c("REF","ALT"):=list(toupper(REF),toupper(ALT))][,n_eff:= 4 / (1 / 79194 + 1 / 61112)]
# NOTE: I updated the numbers here. After checking the paper, they slightly differed from what Privé et al., used

## Apply QC recommended by LDpred2 authors.

prca <- prca[pmin(ALT_FREQ, 1-ALT_FREQ) > (1 / sqrt(n_eff)),]

sd_ss <- with(prca, 2 / sqrt(n_eff * SE^2))
sd_val <- with(prca, sqrt(2*ALT_FREQ*(1-ALT_FREQ)))

is_bad <- sd_ss < (0.5 * sd_val) | sd_ss > (sd_val + 0.1) | sd_ss < 0.1 | sd_val < 0.05 
table(is_bad)
# is_bad
#    FALSE     TRUE 
# 10554634  3056442 

qplot(sd_val, sd_ss, color = is_bad, alpha = I(0.5)) +
  theme_bigstatsr() +
  #coord_equal() +
  scale_color_viridis_d(direction = -1) +
  geom_abline(linetype = 2, color = "red") +
  labs(x = "Standard deviations in the validation set",
       y = "Standard deviations derived from the summary statistics",
       color = "Removed?")

ggsave("../figures/sd-approx-PRCA.png", width = 10, height = 7)

prca <- prca[!is_bad,]

# Filter by 1000G QC variants
# This dataset didn't have SNPID, so let's create it
prca[,SNPID:=paste(CHR,BP,sep=":")]
#invarlist <- prca$SNPID %in% varlist
#prca.1kg <- prca[invarlist,]

# Write 1000G qc-filtered file
#fwrite(prca.1kg, "../datasets/PRCA-1kg.tsv.gz", sep ="\t")

# Filter (and align) to hm3 variants manifest
setnames(prca, c("CHR","BP","REF","ALT", "BETA"), c("chr","pos","a0","a1","beta"))
prca.hm3 <- as.data.table(snp_match(prca, hm3, strand_flip=FALSE))
#10,234,809 variants to be matched.
#1,053,580 variants have been matched; 0 were flipped and 0 were reversed.
setnames(prca.hm3,c("chr","pos","a0","a1","beta"), c("CHR","BP","REF","ALT", "BETA"))
prca.hm3[,c("_NUM_ID_.ss","_NUM_ID_"):=NULL]

# Save file with HapMap3 variants
fwrite(prca.hm3, "../datasets/PRCA-hm3.tsv.gz", sep="\t")

rm(prca, prca.hm3, prca.1kg, sd_ss,sd_val, is_bad, invarlist)


## MDD (Major depression) dataset

mdd <- fread("ftp://ftp.ebi.ac.uk/pub/databases/gwas/summary_statistics/WrayNR_29700475_GCST005839/MDD2018_ex23andMe.gz",
	     fill = TRUE,
                   select = c("CHR", "BP", "A1", "A2", "OR", "SE", "P", "Nca", "Nco"),
                   col.names = c("CHR", "BP", "ALT", "REF", "OR", "SE", "P", "N1", "N0"))
mdd  <- na.omit(mdd)
mdd[,BETA:=log(OR)][,OR:=NULL][,chr:=as.numeric(CHR)][,BP:=as.numeric(BP)][,n_eff:=4 / (1 / N1 + 1 / N0)]
mdd  <- mdd[n_eff > (0.5 * max(n_eff)),]

## NOTE: This file didn't have MAF available, so we'll need to compute them using an external tool.
fwrite(mdd, "../datasets/MDD-NOFREQS.tsv.gz", sep="\t")
system("Compute_freqs.sh -f ../datasets/MDD-NOFREQS.tsv.gz -p CEU")
# Recover the file
mdd <- fread("../datasets/MDD-withfreqs.tsv.gz")

## Apply QC recommended by LDpred2 authors.

sd_ss <- with(mdd, 2 / sqrt(n_eff * SE^2))
sd_val <- with(mdd, sqrt(2*ALT_FREQ*(1-ALT_FREQ)))

is_bad <- sd_ss < (0.5 * sd_val) | sd_ss > (sd_val + 0.1) | sd_ss < 0.1 | sd_val < 0.05 
table(is_bad)
# is_bad
#   FALSE    TRUE
# 6617035   29400

qplot(sd_val, sd_ss, color = is_bad, alpha = I(0.5)) +
  theme_bigstatsr() +
  coord_equal() +
  scale_color_viridis_d(direction = -1) +
  geom_abline(linetype = 2, color = "red") +
  labs(x = "Standard deviations in the validation set",
       y = "Standard deviations derived from the summary statistics",
       color = "Removed?")
ggsave("../figures/sd-approx-MDD.png", width = 10, height = 7)

mdd <- mdd[!is_bad,]
mdd[,chr:=NULL]

# Filter by 1000G QC variants
# This dataset didn't have SNPID, so let's create it
mdd[,SNPID:=paste(CHR,BP,sep=":")]
#invarlist <- mdd$SNPID %in% varlist
#mdd.1kg <- mdd[invarlist,]

# Write 1000G qc-filtered file
#fwrite(mdd.1kg, "../datasets/MDD-1kg.tsv.gz", sep ="\t")

# Filter (and align) to hm3 variants manifest
setnames(mdd, c("CHR","BP","REF","ALT", "BETA"), c("chr","pos","a0","a1","beta"))
mdd.hm3 <- as.data.table(snp_match(mdd, hm3, strand_flip=FALSE))
# 6,617,035 variants to be matched.
# 1,043,246 variants have been matched; 0 were flipped and 506,944 were reversed.
setnames(mdd.hm3,c("chr","pos","a0","a1","beta"), c("CHR","BP","REF","ALT", "BETA"))
mdd.hm3[,c("_NUM_ID_.ss","_NUM_ID_"):=NULL]

# Save file with HapMap3 variants
fwrite(mdd.hm3, "../datasets/MDD-hm3.tsv.gz", sep="\t")

rm(mdd, mdd.hm3, mdd.1kg, sd_ss,sd_val, is_bad, invarlist)


## CAD (Coronary artery disease) dataset 

cad <- fread("ftp://ftp.ebi.ac.uk/pub/databases/gwas/summary_statistics/NikpayM_26343387_GCST003116/cad.add.160614.website.txt",
	select = c("chr", "bp_hg19", "noneffect_allele",
                              "effect_allele", "beta", "se_dgc", "p_dgc"),
        col.names = c("CHR", "BP", "REF", "ALT", "BETA", "SE", "P"))
cad[,n_eff:=4 / (1 / 60801 + 1 / 123504)]

## NOTE: This file didn't have MAF available, so we'll need to compute them using an external tool.
fwrite(cad, "../datasets/CAD-NOFREQS.tsv.gz", sep="\t")
system("Compute_freqs.sh -f ../datasets/CAD-NOFREQS.tsv.gz -p CEU")
# Recover the file
cad <- fread("../datasets/CAD-withfreqs.tsv.gz")

## Apply QC recommended by LDpred2 authors.

sd_ss <- with(cad, 2 / sqrt(n_eff * SE^2))
sd_val <- with(cad, sqrt(2*ALT_FREQ*(1-ALT_FREQ)))

is_bad <- sd_ss < (0.5 * sd_val) | sd_ss > (sd_val + 0.1) | sd_ss < 0.1 | sd_val < 0.05 
table(is_bad)
# is_bad
#   FALSE    TRUE 
# 6278405  670834 

qplot(sd_val, sd_ss, color = is_bad, alpha = I(0.5)) +
  theme_bigstatsr() +
  coord_equal() +
  scale_color_viridis_d(direction = -1) +
  geom_abline(linetype = 2, color = "red") +
  labs(x = "Standard deviations in the validation set",
       y = "Standard deviations derived from the summary statistics",
       color = "Removed?")
ggsave("../figures/sd-approx-CAD.png", width = 10, height = 7)

cad <- cad[!is_bad,]

# Filter by 1000G QC variants
# This dataset didn't have SNPID, so let's create it
cad[,SNPID:=paste(CHR,BP,sep=":")]
#invarlist <- cad$SNPID %in% varlist
#cad.1kg <- cad[invarlist,]

# Write 1000G qc-filtered file
fwrite(cad.1kg, "../datasets/CAD-1kg.tsv.gz", sep ="\t")

# Filter (and align) to hm3 variants manifest
setnames(cad, c("CHR","BP","REF","ALT", "BETA"), c("chr","pos","a0","a1","beta"))
cad.hm3 <- as.data.table(snp_match(cad, hm3, strand_flip=FALSE))
# 6,278,405 variants to be matched.
# 1,044,311 variants have been matched; 0 were flipped and 725,285 were reversed.
setnames(cad.hm3,c("chr","pos","a0","a1","beta"), c("CHR","BP","REF","ALT", "BETA"))
cad.hm3[,c("_NUM_ID_.ss","_NUM_ID_"):=NULL]

# Save file with HapMap3 variants
fwrite(cad.hm3, "../datasets/CAD-hm3.tsv.gz", sep="\t")

rm(cad, cad.hm3, cad.1kg, sd_ss,sd_val, is_bad, invarlist)


## BMI (Body mass Index) dataset

bmi <- fread("https://portals.broadinstitute.org/collaboration/giant/images/1/15/SNP_gwas_mc_merge_nogc.tbl.uniq.gz",
	col.names=c("SNPID", "ALT", "REF", "ALT_FREQ", "BETA", "SE", "P","N"))

# Let's compute n_eff
bmi[,sdy:=sdY.est(SE^2,ALT_FREQ, N)][,n_eff:=4 * N /sdy^2]

## NOTE: This dataset doesn't have coordinates, we'll need to compute them from panel, using 00a-Missing_coordinates. Then we'll reimport it and replace SNPID
# Rscript ../../../GWAS_tools/00a-Missing_coordinates/Fetch_coordinates.R BMI_Locke_25673413_1-nofiltNOCOORDS.tsv.gz
fwrite(bmi, "../datasets/BMI-NOCOORDS.tsv.gz", sep="\t")

# Retrieve coordinates 
system("Rscript --vanilla Fetch_coordinates.R ../datasets/BMI-NOCOORDS.tsv.gz")

bmi <- fread("../datasets/BMI-withcoords.tsv.gz")
bmi[,SNPID:=paste(CHR,BP, sep = ":")]
bmi <- bmi[,c("SNPID","CHR","BP", "REF","ALT","ALT_FREQ","BETA","SE","P","N", "n_eff", "sdy")]
bmi <- na.omit(bmi)

## Apply QC recommended by LDpred2 authors.
# In this particular case, we'll apply a bit different QC, since it's a continuous trait.
# I computed sdy already, so I'll filter SNPs following advice found at https://github.com/privefl/bigsnpr/issues/168
sd_ss <- with(bmi, sdy / sqrt(n_eff * SE^2))
sd_val <- with(bmi, sqrt(2*ALT_FREQ*(1-ALT_FREQ)))

is_bad <- sd_ss < (0.5 * sd_val) | sd_ss > (sd_val + 0.1) | sd_ss < 0.1 | sd_val < 0.05 
table(is_bad)
# is_bad
#   FALSE    TRUE 
# 1878199   97509 

qplot(sd_val, sd_ss, color = is_bad, alpha = I(0.5)) +
  theme_bigstatsr() +
  coord_equal() +
  scale_color_viridis_d(direction = -1) +
  geom_abline(linetype = 2, color = "red") +
  labs(x = "Standard deviations in the validation set",
       y = "Standard deviations derived from the summary statistics",
       color = "Removed?")
ggsave("../figures/sd-approx-BMI.png", width = 10, height = 7)

bmi <- bmi[!is_bad,]

# Filter by 1000G QC variants
#invarlist <- bmi$SNPID %in% varlist
#bmi.1kg <- bmi[invarlist,]

#fwrite(bmi.1kg, "../datasets/BMI-1kg.tsv.gz", sep="\t")

# Filter (and align) to hm3 variants manifest
setnames(bmi, c("CHR","BP","REF","ALT", "BETA"), c("chr","pos","a0","a1","beta"))
bmi.hm3 <- as.data.table(snp_match(bmi, hm3, strand_flip=FALSE))
# 1,878,199 variants to be matched.
# Some duplicates were removed.  
# 763,023 variants have been matched; 0 were flipped and 376,261 were reversed.
setnames(bmi.hm3,c("chr","pos","a0","a1","beta"), c("CHR","BP","REF","ALT", "BETA"))
bmi.hm3[,c("_NUM_ID_.ss","_NUM_ID_"):=NULL]

# Save file with HapMap3 variants
fwrite(bmi.hm3, "../datasets/BMI-hm3.tsv.gz", sep="\t")


rm(bmi, bmi.hm3, bmi.1kg, invarlist)


## Height dataset

height <- fread("https://portals.broadinstitute.org/collaboration/giant/images/0/01/GIANT_HEIGHT_Wood_et_al_2014_publicrelease_HapMapCeuFreq.txt.gz",
	col.names=c("SNPID", "ALT", "REF", "ALT_FREQ", "BETA", "SE", "P","N"))

### Let's compute n_eff
height[,sdy:=sdY.est(SE^2,ALT_FREQ, N)][,n_eff:=4 * N /sdy^2]

## NOTE: This dataset doesn't have coordinates, we'll need to compute them from panel, using 00a-Missing_coordinates. Then we'll reimport it and replace SNPID
fwrite(height, "../datasets/Height-NOCOORDS.tsv.gz", sep="\t")

# Retrieve coordinates 
system("Rscript --vanilla Fetch_coordinates.R ../datasets/Height-NOCOORDS.tsv.gz")


height <- fread("../datasets/Height-withcoords.tsv.gz")
height[,SNPID:=paste(CHR,BP, sep = ":")]
height <- height[,c("SNPID","CHR","BP", "REF","ALT","ALT_FREQ","BETA","SE","P","N", "n_eff", "sdy")]
height <- na.omit(height)

## Apply QC recommended by LDpred2 authors.
# In this particular case, we'll apply a bit different QC, since it's a continuous trait.
# I computed sdy already, so I'll filter SNPs following advice found at https://github.com/privefl/bigsnpr/issues/168
sd_ss <- with(height, sdy / sqrt(n_eff * SE^2))
sd_val <- with(height, sqrt(2*ALT_FREQ*(1-ALT_FREQ)))

is_bad <- sd_ss < (0.5 * sd_val) | sd_ss > (sd_val + 0.1) | sd_ss < 0.1 | sd_val < 0.05 
table(is_bad)
# is_bad
#   FALSE    TRUE 
# 1443721  567305 

qplot(sd_val, sd_ss, color = is_bad, alpha = I(0.5)) +
  theme_bigstatsr() +
  coord_equal() +
  scale_color_viridis_d(direction = -1) +
  geom_abline(linetype = 2, color = "red") +
  labs(x = "Standard deviations in the validation set",
       y = "Standard deviations derived from the summary statistics",
       color = "Removed?")
ggsave("../figures/sd-approx-HEIGHT.png", width = 10, height = 7)

height <- height[!is_bad,]

# Filter by 1000G QC variants
#invarlist <- height$SNPID %in% varlist
#height.1kg <- height[invarlist,]

#fwrite(height.1kg, "../datasets/Height-1kg.tsv.gz", sep="\t")

# Filter (and align) to hm3 variants manifest
setnames(height, c("CHR","BP","REF","ALT", "BETA"), c("chr","pos","a0","a1","beta"))
height.hm3 <- as.data.table(snp_match(height, hm3, strand_flip=FALSE))
# 1,443,721 variants to be matched.
# 581,721 variants have been matched; 0 were flipped and 282,777 were reversed.

setnames(height.hm3,c("chr","pos","a0","a1","beta"), c("CHR","BP","REF","ALT", "BETA"))
height.hm3[,c("_NUM_ID_.ss","_NUM_ID_"):=NULL]

# Save file with HapMap3 variants
fwrite(height.hm3, "../datasets/Height-hm3.tsv.gz", sep="\t")


rm(height, height.hm3, height.1kg, invarlist)


# RA (International)

# RA (European) failed for LDpred2-auto, so we'll fall back to the transethnic dataset (RA_Okada_3) we used in previous versions (and that LDpred2 authors used in their own paper).
raint <- fread("ftp://ftp.ebi.ac.uk/pub/databases/gwas/summary_statistics/OkadaY_24390342_GCST002318/RA_GWASmeta_TransEthnic_v2.txt.gz", select = 2:7,
                   col.names = c("CHR", "BP", "ALT", "REF", "OR", "P"))
raint[,BETA:=log(OR)][,chi2:= qchisq(P, df = 1, lower.tail = FALSE)][,SE:= ifelse(chi2 > 1e-4, abs(BETA) / sqrt(chi2), NA)][, n_eff:= 4 / (1 / 19234 + 1 / 61565)] # Updated numbers here too


## This file didn't have MAF available, so we'll need to compute them using an external tool.
fwrite(raint, "../datasets/RAint-NOFREQS.tsv.gz", sep="\t")
system("Compute_freqs.sh -f ../datasets/RAint-NOFREQS.tsv.gz -p CEU")

## Recover the file
raint <- fread("../datasets/RAint-withfreqs.tsv.gz")

## Apply QC recommended by LDpred2 authors.

sd_ss <- with(raint, 2 / sqrt(n_eff * SE^2))
sd_val <- with(raint, sqrt(2*ALT_FREQ*(1-ALT_FREQ)))

is_bad <- sd_ss < (0.5 * sd_val) | sd_ss > (sd_val + 0.1) | sd_ss < 0.1 | sd_val < 0.05 
table(is_bad)
# is_bad
#   FALSE    TRUE 
# 4992972 3183373 

qplot(sd_val, sd_ss, color = is_bad, alpha = I(0.5)) +
  theme_bigstatsr() +
  coord_equal() +
  scale_color_viridis_d(direction = -1) +
  geom_abline(linetype = 2, color = "red") +
  labs(x = "Standard deviations in the validation set",
       y = "Standard deviations derived from the summary statistics",
       color = "Removed?")

ggsave("../figures/sd-approx-RAint.png", width = 10, height = 7)

raint <- raint[!is_bad,]

raint[,SNPID:=paste(CHR,BP,sep=":")]
raint[,c("OR","chi2"):=NULL]

# Filter by 1000G QC variants
#invarlist <- raint$SNPID %in% varlist
#raint.1kg <- raint[invarlist,]

# Write 1000G qc-filtered file
#fwrite(raint.1kg, "../datasets/RAint-1kg.tsv.gz", sep ="\t")

# Filter (and align) to hm3 variants manifest
setnames(raint, c("CHR","BP","REF","ALT", "BETA"), c("chr","pos","a0","a1","beta"))
raint.hm3 <- as.data.table(snp_match(raint, hm3, strand_flip=FALSE))
# 4,992,972 variants to be matched.
# 728,597 variants have been matched; 0 were flipped and 354,315 were reversed.
setnames(raint.hm3,c("chr","pos","a0","a1","beta"), c("CHR","BP","REF","ALT", "BETA"))
raint.hm3[,c("_NUM_ID_.ss","_NUM_ID_"):=NULL]

# Save file with HapMap3 variants
fwrite(raint.hm3, "../datasets/RAint-hm3.tsv.gz", sep="\t")

rm(raint, raint.hm3, raint.1kg, sd_ss, sd_val, is_bad, invarlist)